00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018 #include "assert.h"
00019 #include "ctype.h"
00020 #include "stdio.h"
00021 #include "zlib.h"
00022 #include "Error.h"
00023
00024
00025 #include "Generic.h"
00026 #include "GenomeSequence.h"
00027
00028 #include <algorithm>
00029 #include <istream>
00030 #include <fstream>
00031 #include <sstream>
00032 #include <stdexcept>
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043
00044
00045
00046
00047
00048
00049
00050 unsigned char GenomeSequence::base2int[256+1] =
00051 "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005"
00052 "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005"
00053 "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\004\005"
00054 "\000\001\002\003\005\005\005\005\005\005\005\005\005\005\005\005"
00055 "\005\000\005\001\005\005\005\002\005\005\005\005\005\005\004\005"
00056 "\005\005\005\005\003\005\005\005\005\005\005\005\005\005\005\005"
00057 "\005\000\005\001\005\005\005\002\005\005\005\005\005\005\004\005"
00058 "\005\005\005\005\003\005\005\005\005\005\005\005\005\005\005\005"
00059
00060 "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005"
00061 "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005"
00062 "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005"
00063 "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005"
00064 "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005"
00065 "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005"
00066 "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005"
00067 "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005"
00068 ;
00069
00070
00071
00072
00073 const char GenomeSequence::int2base[] = "ACGTNMXXXXXXXXXX";
00074
00075
00076
00077 const char GenomeSequence::int2colorSpace[] = "0123NXXXXXXXXXXX";
00078
00079
00080
00081
00082
00083
00084
00085
00086
00087
00088 unsigned char GenomeSequence::base2complement[256+1 ] =
00089 "NNNNNNNNNNNNNNNN"
00090 "NNNNNNNNNNNNNNNN"
00091 "NNNNNNNNNNNNNNNN"
00092 "0123NNNNNNNNNNNN"
00093 "NTNGNNNCNNNNNNNN"
00094 "NNNNANNNNNNNNNNN"
00095 "NTNGNNNCNNNNNNNN"
00096 "NNNNANNNNNNNNNNN"
00097
00098 "NNNNNNNNNNNNNNNN"
00099 "NNNNNNNNNNNNNNNN"
00100 "NNNNNNNNNNNNNNNN"
00101 "NNNNNNNNNNNNNNNN"
00102 "NNNNNNNNNNNNNNNN"
00103 "NNNNNNNNNNNNNNNN"
00104 "NNNNNNNNNNNNNNNN"
00105 "NNNNNNNNNNNNNNNN"
00106 ;
00107
00108
00109
00110
00111
00112
00113
00114
00115
00116
00117
00118
00119
00120
00121 void PackedRead::set(const char *rhs, int padWithNCount)
00122 {
00123 clear();
00124
00125
00126 while (padWithNCount>1)
00127 {
00128 packedBases.push_back(
00129 GenomeSequence::base2int[(int) 'N'] << 4 |
00130 GenomeSequence::base2int[(int) 'N']
00131 );
00132 padWithNCount -= 2;
00133 length+=2;
00134 }
00135
00136
00137
00138 if (padWithNCount)
00139 {
00140
00141
00142 packedBases.push_back(
00143 GenomeSequence::base2int[(int) *rhs] << 4 |
00144 GenomeSequence::base2int[(int) 'N']
00145 );
00146
00147 if (*rhs)
00148 {
00149 length+=2;
00150 rhs++;
00151 }
00152 else
00153 {
00154 length++;
00155 }
00156 padWithNCount--;
00157 assert(padWithNCount==0);
00158 }
00159
00160
00161 while (*rhs && *(rhs+1))
00162 {
00163 packedBases.push_back(
00164 GenomeSequence::base2int[(int) *(rhs+1)] << 4 |
00165 GenomeSequence::base2int[(int) *(rhs+0)]
00166 );
00167 rhs+=2;
00168 length+=2;
00169 }
00170
00171
00172
00173 if (*rhs)
00174 {
00175 packedBases.push_back(
00176 GenomeSequence::base2int[(int) *(rhs+0)]
00177 );
00178 length++;
00179 }
00180 return;
00181 }
00182
00183 std::string GenomeSequence::IntegerToSeq(unsigned int n, unsigned int wordsize)
00184 {
00185 std::string sequence("");
00186 for (unsigned int i = 0; i < wordsize; i ++)
00187 sequence += "N";
00188
00189 unsigned int clearHigherBits = ~(3U << (wordsize<<1));
00190
00191 if (n > clearHigherBits)
00192 error("%d needs to be a non-negative integer < clearHigherBits\n", n);
00193
00194 for (unsigned int i = 0; i < wordsize; i++)
00195 {
00196 sequence[wordsize-1-i] = int2base[n & 3];
00197 n >>= 2;
00198 }
00199 return sequence;
00200 }
00201
00202
00203
00204 GenomeSequence::GenomeSequence()
00205 {
00206 constructorClear();
00207 }
00208
00209 void GenomeSequence::constructorClear()
00210 {
00211 _debugFlag = 0;
00212 _progressStream = NULL;
00213 _colorSpace = false;
00214 }
00215
00216 void GenomeSequence::setup(const char *referenceFilename)
00217 {
00218 setReferenceName(referenceFilename);
00219
00220 if (_progressStream) *_progressStream << "open and prefetch reference genome " << referenceFilename << ": " << std::flush;
00221
00222 if (open(false))
00223 {
00224 std::cerr << "Failed to open reference genome " << referenceFilename << std::endl;
00225 std::cerr << errorStr << std::endl;
00226 exit(1);
00227 }
00228
00229 prefetch();
00230 if (_progressStream) *_progressStream << "done." << std::endl << std::flush;
00231 }
00232
00233 GenomeSequence::~GenomeSequence()
00234 {
00235
00236 _umfaFile.close();
00237 }
00238
00239
00240
00241
00242
00243
00244
00245
00246
00247
00248
00249 bool GenomeSequence::open(bool isColorSpace, int flags)
00250 {
00251 bool rc;
00252
00253 if (isColorSpace)
00254 {
00255 _umfaFilename = _baseFilename + "-cs.umfa";
00256 }
00257 else
00258 {
00259 _umfaFilename = _baseFilename + "-bs.umfa";
00260 }
00261
00262 rc = genomeSequenceArray::open(_umfaFilename.c_str(), flags);
00263 if (rc)
00264 {
00265 std::cerr << "GenomeSequence::open: failed to open file "
00266 << _umfaFilename
00267 << std::endl;
00268 return true;
00269 }
00270
00271 _colorSpace = header->_colorSpace;
00272
00273 return false;
00274 }
00275
00276 void GenomeSequence::sanityCheck(MemoryMap &fasta)
00277 {
00278 unsigned int i;
00279
00280 unsigned int genomeIndex = 0;
00281 for (i=0; i<fasta.length(); i++)
00282 {
00283 switch (fasta[i])
00284 {
00285 case '>':
00286 while (fasta[i]!='\n' && fasta[i]!='\r') i++;
00287 break;
00288 case '\n':
00289 case '\r':
00290 break;
00291 default:
00292 assert(base2int[(int)(*this)[genomeIndex]] == base2int[(int) fasta[i]]);
00293 #if 0
00294 if (base2int[(*this)[genomeIndex]] != base2int[(int) fasta[i]])
00295 {
00296 int bp1 = base2int[(*this)[genomeIndex]];
00297 int bp2 = base2int[fasta[i]];
00298 printf("failing at genome index = %u, fasta index = %u.\n",
00299 genomeIndex,i);
00300 }
00301 #endif
00302 genomeIndex++;
00303 break;
00304 }
00305 }
00306 }
00307
00308 #define HAS_SUFFIX(str, suffix) ((strlen(suffix) < str.size()) && (str.substr(str.size() - strlen(suffix)) == suffix))
00309
00310
00311
00312
00313
00314 bool GenomeSequence::setReferenceName(std::string referenceFilename)
00315 {
00316
00317 if (HAS_SUFFIX(referenceFilename, ".fa"))
00318 {
00319 _referenceFilename = referenceFilename;
00320 _baseFilename = _referenceFilename.substr(0, referenceFilename.size() - 3);
00321 }
00322 else if (HAS_SUFFIX(referenceFilename, ".umfa"))
00323 {
00324 _baseFilename = referenceFilename.substr(0, referenceFilename.size() - 5);
00325 }
00326 else if (HAS_SUFFIX(referenceFilename, "-cs.umfa"))
00327 {
00328 _baseFilename = referenceFilename.substr(0, referenceFilename.size() - 8);
00329 }
00330 else if (HAS_SUFFIX(referenceFilename, "-bs.umfa"))
00331 {
00332 _baseFilename = referenceFilename.substr(0, referenceFilename.size() - 8);
00333 }
00334 else
00335 {
00336 _baseFilename = referenceFilename;
00337 }
00338 _fastaFilename = _baseFilename + ".fa";
00339
00340 return false;
00341 }
00342
00343
00344
00345
00346
00347
00348
00349
00350
00351
00352 bool GenomeSequence::setChromosomeMD5andLength(uint32_t whichChromosome)
00353 {
00354 if (whichChromosome>=header->_chromosomeCount) return true;
00355
00356 ChromosomeInfo *c = &header->_chromosomes[whichChromosome];
00357 c->size = header->elementCount - c->start;
00358
00359 #if !defined(WIN32)
00360 MD5_CTX md5Context;
00361 uint8_t md5Signature[MD5_DIGEST_LENGTH];
00362
00363
00364
00365
00366
00367 char *md5Buffer = (char *) malloc(c->size);
00368
00369 MD5_Init(&md5Context);
00370
00371 for (genomeIndex_t i = 0; i < c->size; i ++)
00372 {
00373 md5Buffer[i] = (*this)[c->start + i];
00374 }
00375 MD5_Update(&md5Context, md5Buffer, c->size);
00376 MD5_Final((unsigned char *) &md5Signature, &md5Context);
00377 free(md5Buffer);
00378 for (int i=0; i<MD5_DIGEST_LENGTH; i++)
00379 {
00380
00381 sprintf(c->md5+2*i, "%02x", md5Signature[i]);
00382 }
00383
00384
00385 c->md5[2*MD5_DIGEST_LENGTH] = '\0';
00386 #else
00387 memset(c->md5, 0, 2*MD5_DIGEST_LENGTH);
00388 #endif
00389 return false;
00390 }
00391
00392
00393
00394
00395
00396 static bool getFastaStats(const char *fastaData, size_t fastaDataSize, uint32_t &chromosomeCount, uint64_t &baseCount)
00397 {
00398 chromosomeCount = 0;
00399 baseCount = 0;
00400 bool atLineStart = true;
00401
00402
00403
00404
00405
00406 for (size_t fastaIndex = 0; fastaIndex < fastaDataSize; fastaIndex++)
00407 {
00408 switch (fastaData[fastaIndex])
00409 {
00410 case '\n':
00411 case '\r':
00412 atLineStart = true;
00413 break;
00414 case '>':
00415 {
00416 if (!atLineStart) break;
00417 chromosomeCount++;
00418
00419
00420
00421 while (fastaIndex < fastaDataSize && fastaData[fastaIndex]!='\n' && fastaData[fastaIndex]!='\r')
00422 {
00423 fastaIndex++;
00424 }
00425 break;
00426 }
00427 default:
00428 baseCount++;
00429 atLineStart = false;
00430 break;
00431 }
00432
00433 }
00434 return false;
00435 }
00436
00437
00438
00439
00440
00441
00442
00443
00444
00445
00446 bool GenomeSequence::create()
00447 {
00448
00449 assert(int2base[base2int[(int) 'A']]=='A');
00450 assert(int2base[base2int[(int) 'a']]=='A');
00451 assert(int2base[base2int[(int) 'T']]=='T');
00452 assert(int2base[base2int[(int) 't']]=='T');
00453 assert(int2base[base2int[(int) 'C']]=='C');
00454 assert(int2base[base2int[(int) 'c']]=='C');
00455 assert(int2base[base2int[(int) 'G']]=='G');
00456 assert(int2base[base2int[(int) 'g']]=='G');
00457 assert(int2base[base2int[(int) 'N']]=='N');
00458 assert(int2base[base2int[(int) 'n']]=='N');
00459 assert(int2base[base2int[(int) 'M']]=='M');
00460 assert(int2base[base2int[(int) 'm']]=='M');
00461
00462 assert(base2int[(int) 'N']==4);
00463 assert(base2int[(int) 'n']==4);
00464 assert(base2int[(int) 'A']==0);
00465 assert(base2int[(int) 'a']==0);
00466 assert(base2int[(int) 'T']==3);
00467 assert(base2int[(int) 't']==3);
00468 assert(base2int[(int) 'C']==1);
00469 assert(base2int[(int) 'c']==1);
00470 assert(base2int[(int) 'G']==2);
00471 assert(base2int[(int) 'g']==2);
00472
00473 if (_baseFilename=="")
00474 {
00475 std::cerr << "Base reference filename is empty." << std::endl;
00476 return true;
00477 }
00478
00479 if (isColorSpace())
00480 {
00481 _umfaFilename = _baseFilename + "-cs.umfa";
00482 }
00483 else
00484 {
00485 _umfaFilename = _baseFilename + "-bs.umfa";
00486 }
00487
00488 if (access(_umfaFilename.c_str(), R_OK) == 0)
00489 {
00490 std::cerr << "Output file '" << _umfaFilename << "' exists or is not writable - please remove." << std::endl;
00491 return true;
00492 }
00493
00494 MemoryMap fastaFile;
00495
00496 if (fastaFile.open(_fastaFilename.c_str()))
00497 {
00498 std::cerr << "failed to open input fasta file '" << _fastaFilename << "'." << std::endl;
00499 return true;
00500 }
00501
00502 std::cerr << "Creating FASTA "
00503 << (isColorSpace() ? "color space " : "")
00504 << "binary cache file '"
00505 << _umfaFilename
00506 << "'."
00507 << std::endl;
00508
00509 std::cerr << std::flush;
00510
00511
00512
00513
00514
00515 const char *fasta = (const char *) fastaFile.data;
00516 size_t fastaDataSize = fastaFile.length();
00517
00518 uint32_t chromosomeCount = 0;
00519 uint64_t baseCount = 0;
00520 getFastaStats(fasta, fastaDataSize, chromosomeCount, baseCount);
00521
00522 if (genomeSequenceArray::create(_umfaFilename.c_str(), baseCount, chromosomeCount))
00523 {
00524 std::cerr << "failed to create '"
00525 << _umfaFilename
00526 << "'."
00527 << std::endl;
00528 perror("");
00529 return true;
00530 }
00531 header->elementCount = 0;
00532 header->_colorSpace = isColorSpace();
00533 header->setApplication(_application.c_str());
00534 header->_chromosomeCount = chromosomeCount;
00535
00536
00537
00538 for (uint32_t i=0; i<header->_chromosomeCount; i++) header->_chromosomes[i].constructorClear();
00539
00540 std::string chromosomeName;
00541
00542
00543
00544 signed char lastBase = base2int[(int) 'N'];
00545 bool terminateLoad = false;
00546 int percent = -1, newPercent;
00547 uint32_t whichChromosome = 0;
00548 for (uint64_t fastaIndex = 0; fastaIndex < fastaDataSize; fastaIndex++)
00549 {
00550 if (_progressStream)
00551 {
00552 newPercent = (1.0 * fastaIndex / fastaDataSize) * 100;
00553 if (newPercent>percent)
00554 {
00555 std::cerr << "\r" << newPercent << "% ";
00556 std::cerr << std::flush;
00557 percent = newPercent;
00558 }
00559 }
00560 switch (fasta[fastaIndex])
00561 {
00562 case '\n':
00563 case '\r':
00564 break;
00565 case '>':
00566 {
00567 chromosomeName = "";
00568 fastaIndex++;
00569
00570
00571
00572 while (!isspace(fasta[fastaIndex]))
00573 {
00574 chromosomeName += fasta[fastaIndex++];
00575 }
00576
00577
00578
00579 while (fasta[fastaIndex]!='\n' && fasta[fastaIndex]!='\r')
00580 {
00581 fastaIndex++;
00582 }
00583
00584
00585
00586
00587 ChromosomeInfo *c = &header->_chromosomes[whichChromosome];
00588 c->setChromosomeName(chromosomeName.c_str());
00589 c->start = header->elementCount;
00590
00591
00592 if (whichChromosome>0)
00593 {
00594
00595
00596
00597
00598
00599
00600 setChromosomeMD5andLength(whichChromosome - 1);
00601 }
00602 whichChromosome++;
00603 if (whichChromosome > header->_chromosomeCount)
00604 {
00605 std::cerr << "BUG: Exceeded computed chromosome count ("
00606 << header->_chromosomeCount
00607 << ") - genome is now truncated at chromosome "
00608 << header->_chromosomes[header->_chromosomeCount-1].name
00609 << " (index "
00610 << header->_chromosomeCount
00611 << ")."
00612 << std::endl;
00613 terminateLoad = true;
00614 }
00615 break;
00616 }
00617 default:
00618
00619
00620
00621 if (isColorSpace())
00622 {
00623
00624
00625
00626
00627
00628 const char fromBase2CS[] =
00629 {
00630 0,
00631 1,
00632 2,
00633 3,
00634 1,
00635 0,
00636 3,
00637 2,
00638 2,
00639 3,
00640 0,
00641 1,
00642 3,
00643 2,
00644 1,
00645 0,
00646 };
00647
00648
00649
00650
00651
00652
00653
00654
00655 char thisBase = base2int[(int)(fasta[fastaIndex])];
00656 if (lastBase>=0)
00657 {
00658 char color;
00659 if (lastBase>3 || thisBase>3) color=4;
00660 else color = fromBase2CS[(int)(lastBase<<2 | thisBase)];
00661
00662
00663 set(header->elementCount++, int2base[(int) color]);
00664 }
00665 lastBase = thisBase;
00666 }
00667 else
00668 {
00669 set(header->elementCount++, toupper(fasta[fastaIndex]));
00670 }
00671 break;
00672 }
00673
00674
00675
00676
00677
00678 if (terminateLoad) break;
00679 }
00680
00681
00682
00683
00684
00685 if (whichChromosome==0)
00686 {
00687 fastaFile.close();
00688 throw std::runtime_error("No chromosomes found - aborting!");
00689 }
00690 else
00691 {
00692 setChromosomeMD5andLength(whichChromosome-1);
00693 }
00694
00695 #if 0
00696 sanityCheck(fastaFile);
00697 #endif
00698 fastaFile.close();
00699
00700 if (_progressStream) *_progressStream << "\r";
00701
00702 std::cerr << "FASTA binary cache file '"
00703 << _umfaFilename
00704 << "' created."
00705 << std::endl;
00706
00707
00708
00709
00710 return false;
00711 }
00712
00713 int GenomeSequence::getChromosomeCount()
00714 {
00715 return header->_chromosomeCount;
00716 }
00717
00718
00719 int GenomeSequence::getChromosome(genomeIndex_t position)
00720 {
00721 if (position == INVALID_GENOME_INDEX) return INVALID_CHROMOSOME_INDEX;
00722
00723 if (header->_chromosomeCount == 0)
00724 return -1;
00725
00726 int start = 0;
00727 int stop = header->_chromosomeCount - 1;
00728
00729
00730
00731
00732 if (position > header->_chromosomes[stop].start)
00733 return (stop);
00734
00735 while (start <= stop)
00736 {
00737 int middle = (start + stop) / 2;
00738
00739 if (position >= header->_chromosomes[middle].start && position < header->_chromosomes[middle + 1].start)
00740 return middle;
00741
00742 if (position == header->_chromosomes[middle + 1].start)
00743 return (middle + 1);
00744
00745 if (position > header->_chromosomes[middle + 1].start)
00746 start = middle + 1;
00747
00748 if (position < header->_chromosomes[middle].start)
00749 stop = middle - 1;
00750 }
00751
00752 return -1;
00753 }
00754
00755
00756
00757
00758
00759
00760
00761 genomeIndex_t GenomeSequence::getGenomePosition(
00762 const char *chromosomeName,
00763 unsigned int chromosomeIndex)
00764 {
00765 genomeIndex_t i = getGenomePosition(chromosomeName);
00766 if (i == INVALID_GENOME_INDEX) return INVALID_GENOME_INDEX;
00767 return i + chromosomeIndex - 1;
00768 }
00769
00770 genomeIndex_t GenomeSequence::getGenomePosition(
00771 int chromosome,
00772 unsigned int chromosomeIndex)
00773 {
00774 if (chromosome<0 || chromosome >= (int) header->_chromosomeCount) return INVALID_GENOME_INDEX;
00775
00776 genomeIndex_t i = header->_chromosomes[chromosome].start;
00777 if (i == INVALID_GENOME_INDEX) return INVALID_GENOME_INDEX;
00778 return i + chromosomeIndex - 1;
00779 }
00780
00781
00782
00783
00784
00785
00786
00787
00788
00789 genomeIndex_t GenomeSequence::getGenomePosition(const char *chromosomeName)
00790 {
00791 int chromosome = getChromosome(chromosomeName);
00792 if (chromosome==INVALID_CHROMOSOME_INDEX) return INVALID_GENOME_INDEX;
00793 return header->_chromosomes[chromosome].start;
00794 }
00795
00796 int GenomeSequence::getChromosome(const char *chromosomeName)
00797 {
00798 unsigned int i;
00799 for (i=0; i<header->_chromosomeCount; i++)
00800 {
00801 if (strcmp(header->_chromosomes[i].name, chromosomeName)==0)
00802 {
00803 return i;
00804 }
00805 }
00806 return INVALID_CHROMOSOME_INDEX;
00807 }
00808
00809
00810
00811
00812
00813 void GenomeSequence::getReverseRead(std::string &read)
00814 {
00815 std::string newRead;
00816 if (read.size()) for (int32_t i=(int) read.size() - 1; i>=0; i--)
00817 {
00818 newRead.push_back(BasePair(read[i]));
00819 }
00820 read = newRead;
00821 }
00822
00823 void GenomeSequence::getReverseRead(String& read)
00824 {
00825 int i = 0;
00826 int j = read.Length()-1;
00827 char temp;
00828 while (i < j)
00829 {
00830 temp = read[j];
00831 read[j] = read[i];
00832 read[i] = temp;
00833 }
00834 }
00835
00836 #define ABS(x) ( (x) > 0 ? (x) : -(x) )
00837 int GenomeSequence::debugPrintReadValidation(
00838 std::string &read,
00839 std::string &quality,
00840 char direction,
00841 genomeIndex_t readLocation,
00842 int sumQuality,
00843 int mismatchCount,
00844 bool recurse
00845 )
00846 {
00847 int validateSumQ = 0;
00848 int validateMismatchCount = 0;
00849 int rc = 0;
00850 std::string genomeData;
00851
00852 for (uint32_t i=0; i<read.size(); i++)
00853 {
00854 if (tolower(read[i]) != tolower((*this)[readLocation + i]))
00855 {
00856 validateSumQ += quality[i] - '!';
00857
00858 if (direction=='F' ? i<24 : (i >= (read.size() - 24))) validateMismatchCount++;
00859 genomeData.push_back(tolower((*this)[readLocation + i]));
00860 }
00861 else
00862 {
00863 genomeData.push_back(toupper((*this)[readLocation + i]));
00864 }
00865 }
00866 assert(validateSumQ>=0);
00867 if (validateSumQ != sumQuality && validateMismatchCount == mismatchCount)
00868 {
00869 printf("SUMQ: Original Genome: %s test read: %s : actual sumQ = %d, test sumQ = %d\n",
00870 genomeData.c_str(),
00871 read.c_str(),
00872 validateSumQ,
00873 sumQuality
00874 );
00875 rc++;
00876 }
00877 else if (validateSumQ == sumQuality && validateMismatchCount != mismatchCount)
00878 {
00879 printf("MISM: Original Genome: %s test read: %s : actual mismatch %d test mismatches %d\n",
00880 genomeData.c_str(),
00881 read.c_str(),
00882 validateMismatchCount,
00883 mismatchCount
00884 );
00885 rc++;
00886 }
00887 else if (validateSumQ != sumQuality && validateMismatchCount != mismatchCount)
00888 {
00889 printf("BOTH: Original Genome: %s test read: %s : actual sumQ = %d, test sumQ = %d, actual mismatch %d test mismatches %d\n",
00890 genomeData.c_str(),
00891 read.c_str(),
00892 validateSumQ,
00893 sumQuality,
00894 validateMismatchCount,
00895 mismatchCount
00896 );
00897 rc++;
00898 }
00899
00900 if (recurse && ABS(validateMismatchCount - mismatchCount) > (int) read.size()/2)
00901 {
00902 printf("large mismatch difference, trying reverse strand: ");
00903 std::string reverseRead = read;
00904 std::string reverseQuality = quality;
00905 getReverseRead(reverseRead);
00906 reverse(reverseQuality.begin(), reverseQuality.end());
00907 rc = debugPrintReadValidation(reverseRead, reverseQuality, readLocation, sumQuality, mismatchCount, false);
00908 }
00909 return rc;
00910 }
00911 #undef ABS
00912
00913
00914 bool GenomeSequence::wordMatch(unsigned int index, std::string &word)
00915 {
00916 for (uint32_t i = 0; i<word.size(); i++)
00917 {
00918 if ((*this)[index + i] != word[i]) return false;
00919 }
00920 return true;
00921 }
00922
00923 bool GenomeSequence::printNearbyWords(unsigned int index, unsigned int deviation, std::string &word)
00924 {
00925 for (unsigned int i = index - deviation; i < index + deviation; i++)
00926 {
00927 if (wordMatch(i, word))
00928 {
00929 std::cerr << "word '"
00930 << word
00931 << "' found "
00932 << index - i
00933 << " away from position "
00934 << index
00935 << "."
00936 << std::endl;
00937 }
00938 }
00939 return false;
00940 }
00941
00942 void GenomeSequence::dumpSequenceSAMDictionary(std::ostream &file)
00943 {
00944 for (unsigned int i=0; i<header->_chromosomeCount; i++)
00945 {
00946 file
00947 << "@SQ"
00948 << "\tSN:" << header->_chromosomes[i].name
00949 << "\tLN:" << header->_chromosomes[i].size
00950 << "\tAS:" << header->_chromosomes[i].assemblyID
00951 << "\tM5:" << header->_chromosomes[i].md5
00952 << "\tUR:" << header->_chromosomes[i].uri
00953 << "\tSP:" << header->_chromosomes[i].species
00954 << std::endl;
00955 }
00956 }
00957
00958 void GenomeSequence::dumpHeaderTSV(std::ostream &file)
00959 {
00960 file << "# Reference: " << _baseFilename << std::endl;
00961 file << "# SN: sample name - must be unique" << std::endl;
00962 file << "# AS: assembly name" << std::endl;
00963 file << "# SP: species" << std::endl;
00964 file << "# LN: chromosome/contig length" << std::endl;
00965 file << "# M5: chromosome/contig MD5 checksum" << std::endl;
00966 file << "# LN and M5 are only printed for informational purposes." << std::endl;
00967 file << "# Karma will only set those values when creating the index." << std::endl;
00968 file << "SN" << "\t" << "AS" << "\t" << "SP" << "\t" << "UR" << "\t" << "LN" << "\t" << "M5" << std::endl;
00969 for (unsigned int i=0; i<header->_chromosomeCount; i++)
00970 {
00971 file
00972 << header->_chromosomes[i].name
00973 << "\t" << header->_chromosomes[i].assemblyID
00974 << "\t" << header->_chromosomes[i].uri
00975 << "\t" << header->_chromosomes[i].species
00976 << "\t" << header->_chromosomes[i].size
00977 << "\t" << header->_chromosomes[i].md5
00978 << std::endl;
00979 }
00980 }
00981
00982 bool loadHeaderFromTSV(std::istream &input)
00983 {
00984 return false;
00985 }
00986
00987
00988 void GenomeSequence::getString(std::string &str, int chromosome, genomeIndex_t index, int baseCount)
00989 {
00990
00991
00992
00993 genomeIndex_t genomeIndex = header->_chromosomes[chromosome].start + index - 1;
00994
00995 getString(str, genomeIndex, baseCount);
00996 }
00997
00998 void GenomeSequence::getString(String &str, int chromosome, genomeIndex_t index, int baseCount)
00999 {
01000 std::string string;
01001 this-> getString(string, chromosome, index, baseCount);
01002 str = string.c_str();
01003 }
01004
01005 void GenomeSequence::getString(std::string &str, genomeIndex_t index, int baseCount)
01006 {
01007 str.clear();
01008 if (baseCount > 0)
01009 {
01010 for (int i=0; i<baseCount; i++)
01011 {
01012 str.push_back((*this)[index + i]);
01013 }
01014 }
01015 else
01016 {
01017
01018
01019 for (int i=0; i< -baseCount; i++)
01020 {
01021 str.push_back(base2complement[(int)(*this)[index + i]]);
01022 }
01023 }
01024 }
01025
01026 void GenomeSequence::getString(String &str, genomeIndex_t index, int baseCount)
01027 {
01028 std::string string;
01029 getString(string, index, baseCount);
01030 str = string.c_str();
01031 }
01032
01033 void GenomeSequence::getHighLightedString(std::string &str, genomeIndex_t index, int baseCount, genomeIndex_t highLightStart, genomeIndex_t highLightEnd)
01034 {
01035 str.clear();
01036 if (baseCount > 0)
01037 {
01038 for (int i=0; i<baseCount; i++)
01039 {
01040 char base = (*this)[index + i];
01041 if (in(index+i, highLightStart, highLightEnd))
01042 base = tolower(base);
01043 str.push_back(base);
01044 }
01045 }
01046 else
01047 {
01048
01049
01050 for (int i=0; i< -baseCount; i++)
01051 {
01052 char base = base2complement[(int)(*this)[index + i]];
01053 if (in(index+i, highLightStart, highLightEnd))
01054 base = tolower(base);
01055 str.push_back(base);
01056 }
01057 }
01058 }
01059
01060 void GenomeSequence::print30(genomeIndex_t index)
01061 {
01062 std::cout << "index: " << index << "\n";
01063 for (genomeIndex_t i=index-30; i<index+30; i++)
01064 std::cout << (*this)[i];
01065 std::cout << "\n";
01066 for (genomeIndex_t i=index-30; i<index; i++)
01067 std::cout << " ";
01068 std::cout << "^";
01069 std::cout << std::endl;
01070 }
01071
01072 void GenomeSequence::getMismatchHatString(std::string &result, const std::string &read, genomeIndex_t location)
01073 {
01074 result = "";
01075 for (uint32_t i=0; i < read.size(); i++)
01076 {
01077 if (read[i] == (*this)[location+i])
01078 result.push_back(' ');
01079 else
01080 result.push_back('^');
01081 }
01082 }
01083
01084 void GenomeSequence::getMismatchString(std::string &result, const std::string &read, genomeIndex_t location)
01085 {
01086 result = "";
01087 for (uint32_t i=0; i < read.size(); i++)
01088 {
01089 if (read[i] == (*this)[location+i])
01090 result.push_back(toupper(read[i]));
01091 else
01092 result.push_back(tolower(read[i]));
01093 }
01094 }
01095
01096 genomeIndex_t GenomeSequence::simpleLocalAligner(std::string &read, std::string &quality, genomeIndex_t index, int windowSize)
01097 {
01098 int bestScore = 1000000;
01099 genomeIndex_t bestMatchLocation = INVALID_GENOME_INDEX;
01100 for (int i=-windowSize; i<windowSize; i++)
01101 {
01102 int newScore;
01103
01104 if (i<0 && ((uint32_t) -i) > index) continue;
01105 if (index + i + read.size() >= getNumberBases()) continue;
01106
01107 if (quality=="")
01108 {
01109 newScore = this->getMismatchCount(read, index + i);
01110 }
01111 else
01112 {
01113 newScore = this->getSumQ(read, quality, index + i);
01114 }
01115 if (newScore < bestScore)
01116 {
01117 bestScore = newScore;
01118 bestMatchLocation = index + i;
01119 }
01120 }
01121 return bestMatchLocation;
01122 }
01123
01124 std::ostream &operator << (std::ostream &stream, genomeSequenceMmapHeader &h)
01125 {
01126 stream << (MemoryMapArrayHeader &) h;
01127 stream << "chromosomeCount: " << h._chromosomeCount << std::endl;
01128 stream << "isColorSpace: " << h._colorSpace << std::endl;
01129 stream << "chromosomeCount: " << h._chromosomeCount << std::endl;
01130 uint64_t totalSize = 0;
01131 for (uint32_t i=0; i < h._chromosomeCount; i++)
01132 {
01133 totalSize += h._chromosomes[i].size;
01134 stream << "Chromosome Index " << i << " name: " << h._chromosomes[i].name << std::endl;
01135 stream << "Chromosome Index " << i << " whole genome start: " << h._chromosomes[i].start << std::endl;
01136 stream << "Chromosome Index " << i << " whole genome size: " << h._chromosomes[i].size << std::endl;
01137 stream << "Chromosome Index " << i << " md5 checksum: " << h._chromosomes[i].md5 << std::endl;
01138 stream << "Chromosome Index " << i << " assemblyID: " << h._chromosomes[i].assemblyID << std::endl;
01139 stream << "Chromosome Index " << i << " species: " << h._chromosomes[i].species << std::endl;
01140 stream << "Chromosome Index " << i << " URI: " << h._chromosomes[i].uri << std::endl;
01141 }
01142 stream << "Total Genome Size: " << totalSize << " bases."<< std::endl;
01143 if (totalSize != h.elementCount)
01144 {
01145 stream << "Total Genome Size: does not match elementCount!\n";
01146 }
01147
01148 stream << std::endl;
01149 return stream;
01150 }
01151
01152 void GenomeSequence::getChromosomeAndIndex(std::string &s, genomeIndex_t i)
01153 {
01154 int whichChromosome = 0;
01155
01156 whichChromosome = getChromosome(i);
01157
01158 if (whichChromosome == INVALID_CHROMOSOME_INDEX)
01159 {
01160 s = "invalid genome index";
01161 }
01162 else
01163 {
01164 std::ostringstream buf;
01165 genomeIndex_t chromosomeIndex = i - getChromosomeStart(whichChromosome) + 1;
01166 buf << header->_chromosomes[whichChromosome].name << ":" << chromosomeIndex;
01167 #if 0
01168 buf << " (GenomeIndex " << i << ")";
01169 #endif
01170 s = buf.str();
01171 }
01172 return;
01173 }
01174
01175
01176 void GenomeSequence::getChromosomeAndIndex(String& s, genomeIndex_t i)
01177 {
01178 std::string ss;
01179 getChromosomeAndIndex(ss, i);
01180 s = ss.c_str();
01181 return;
01182 }
01183
01184
01185
01186
01187
01188
01189
01190
01191
01192
01193
01194
01195
01196
01197
01198
01199
01200 bool GenomeSequence::populateDBSNP(
01201 mmapArrayBool_t &dbSNP,
01202 std::ifstream &inputFile)
01203 {
01204 std::string inputLine;
01205
01206 assert(dbSNP.getElementCount() == getNumberBases());
01207
01208 std::string chromosomeName;
01209 genomeIndex_t chromosomePosition1;
01210 uint64_t ignoredLineCount = 0;
01211
01212 while (getline(inputFile, inputLine))
01213 {
01214 std::istringstream inputLine2(inputLine);
01215 genomeIndex_t genomeIndex;
01216
01217 inputLine2 >> chromosomeName;
01218 inputLine2 >> chromosomePosition1;
01219
01220
01221 if (chromosomeName.size()>0 && chromosomeName[0]=='#')
01222 {
01223 continue;
01224 }
01225
01226 #if 0
01227
01228
01229
01230
01231
01232 if (!isdigit(chromosomeName[0])) continue;
01233 #endif
01234
01235 genomeIndex = getGenomePosition(chromosomeName.c_str(), chromosomePosition1);
01236
01237
01238 if (genomeIndex == INVALID_GENOME_INDEX)
01239 {
01240 ignoredLineCount++;
01241 continue;
01242 }
01243
01244 dbSNP.set(genomeIndex, true);
01245 }
01246
01247 inputFile.close();
01248
01249 if (ignoredLineCount > 0)
01250 {
01251 std::cerr << "GenomeSequence::populateDBSNP: ignored " << ignoredLineCount << " SNP positions due to invalid format of line." << std::endl;
01252 return false;
01253 }
01254 return true;
01255 }
01256
01257 bool GenomeSequence::loadDBSNP(
01258 mmapArrayBool_t &dbSNP,
01259 const char *inputFileName)
01260 {
01261
01262
01263
01264
01265
01266
01267
01268
01269
01270
01271
01272
01273
01274
01275 if (strlen(inputFileName)!=0)
01276 {
01277 std::cerr << "Load dbSNP file '" << inputFileName << "': " << std::flush;
01278
01279 if (dbSNP.open(inputFileName, O_RDONLY))
01280 {
01281
01282
01283
01284
01285
01286
01287 if (dbSNP.getErrorString().find("wrong type of file")==std::string::npos)
01288 {
01289 std::cerr << "Error: " << dbSNP.getErrorString() << std::endl;
01290 exit(1);
01291 }
01292
01293
01294
01295 std::ifstream inputFile;
01296 inputFile.open(inputFileName);
01297 if (inputFile.fail())
01298 {
01299 std::cerr << "Error: failed to open " << inputFileName << std::endl;
01300 exit(1);
01301 }
01302
01303 std::cerr << "(as text file) ";
01304
01305
01306 dbSNP.create(getNumberBases());
01307
01308
01309 populateDBSNP(dbSNP, inputFile);
01310 inputFile.close();
01311
01312 }
01313 else
01314 {
01315 std::cerr << "(as binary mapped file) ";
01316 }
01317
01318 std::cerr << "DONE!" << std::endl;
01319 return false;
01320 }
01321 else
01322 {
01323 return true;
01324 }
01325 }
01326
01327
01328 #if defined(TEST)
01329
01330 void simplestExample(void)
01331 {
01332 GenomeSequence reference;
01333 genomeIndex_t index;
01334
01335
01336
01337
01338
01339
01340
01341 if (reference.open())
01342 {
01343 perror("GenomeSequence::open");
01344 exit(1);
01345 }
01346
01347
01348 index = 1000000000;
01349
01350
01351
01352
01353
01354
01355
01356
01357
01358 std::cout << "base[" << index << "] = " << reference[index] << std::endl;
01359
01360
01361
01362
01363
01364 int chr = reference.getChromosome(index);
01365 genomeIndex_t chrIndex = index - reference.getChromosomeStart(chr) + 1;
01366
01367 std::cout << "genome index " << index << " corresponds to chromosome " << chr << " position " << chrIndex << std::endl;
01368
01369
01370
01371
01372
01373 const char *chromosomeName = "5";
01374 chr = reference.getChromosome(chromosomeName);
01375 chrIndex = 100000;
01376
01377 index = reference.getChromosomeStart(chr) + chrIndex - 1;
01378
01379 std::cout << "Chromosome '" << chromosomeName << "' position " << chrIndex << " corresponds to genome index position " << index << std::endl;
01380
01381 reference.close();
01382 }
01383
01384 void testGenomeSequence(void)
01385 {
01386 GenomeSequence reference;
01387
01388 #if 0
01389 std::string referenceName = "someotherreference";
01390 if (reference.setFastaName(referenceName))
01391 {
01392 std::cerr << "failed to open reference file "
01393 << referenceName
01394 << std::endl;
01395 exit(1);
01396 }
01397 #endif
01398
01399 std::cerr << "open and prefetch the reference genome: ";
01400
01401
01402 if (reference.open())
01403 {
01404 exit(1);
01405 }
01406 std::cerr << "done!" << std::endl;
01407
01408
01409
01410
01411 genomeIndex_t genomeIndex;
01412 unsigned int chromosomeIndex;
01413 unsigned int chromosome;
01414 std::string chromosomeName;
01415
01416
01417
01418
01419
01420 chromosomeName = "2";
01421 chromosomeIndex = 1234567;
01422
01423 genomeIndex = reference.getGenomePosition(chromosomeName.c_str(), chromosomeIndex);
01424 assert(genomeIndex!=INVALID_GENOME_INDEX);
01425 std::cout << "Chromosome " << chromosomeName << ", index ";
01426 std::cout << chromosomeIndex << " contains base " << reference[genomeIndex];
01427 std::cout << " at genome index position " << genomeIndex << std::endl;
01428
01429
01430
01431
01432
01433
01434
01435 chromosome = reference.getChromosome(genomeIndex);
01436 unsigned int newChromosomeIndex;
01437
01438 newChromosomeIndex = genomeIndex - reference.getChromosomeStart(chromosome) + 1;
01439
01440 assert(chromosomeIndex == newChromosomeIndex);
01441
01442
01443
01444 PackedRead pr;
01445
01446 pr.set("ATCGATCG", 0);
01447 assert(pr.size()==8);
01448 assert(pr[0]==GenomeSequence::base2int[(int) 'A']);
01449 assert(pr[1]==GenomeSequence::base2int[(int) 'T']);
01450 assert(pr[2]==GenomeSequence::base2int[(int) 'C']);
01451 assert(pr[3]==GenomeSequence::base2int[(int) 'G']);
01452 pr.set("ATCGATCG", 1);
01453 assert(pr.size()==9);
01454 pr.set("", 0);
01455 assert(pr.size()==0);
01456 pr.set("", 1);
01457 assert(pr.size()==1);
01458 pr.set("", 2);
01459 assert(pr.size()==2);
01460 pr.set("", 3);
01461 assert(pr.size()==3);
01462 assert(pr[0]==GenomeSequence::base2int[(int) 'N']);
01463 assert(pr[1]==GenomeSequence::base2int[(int) 'N']);
01464 assert(pr[2]==GenomeSequence::base2int[(int) 'N']);
01465 pr.set("C", 1);
01466 assert(pr.size()==2);
01467 assert(pr[0]==GenomeSequence::base2int[(int) 'N']);
01468 assert(pr[1]==GenomeSequence::base2int[(int) 'C']);
01469
01470 }
01471
01472
01473
01474
01475
01476
01477
01478
01479 int main(int argc, const char **argv)
01480 {
01481 #if 1
01482 simplestExample();
01483 #else
01484 testGenomeSequence();
01485 #endif
01486 exit(0);
01487 }
01488
01489 #endif