00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018 #include "assert.h"
00019 #include "ctype.h"
00020 #include "stdio.h"
00021 #include "zlib.h"
00022 #include "Error.h"
00023
00024
00025 #include "Generic.h"
00026 #include "GenomeSequence.h"
00027
00028 #include <algorithm>
00029 #include <istream>
00030 #include <fstream>
00031 #include <sstream>
00032 #include <stdexcept>
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043
00044
00045
00046
00047
00048
00049
00050 unsigned char GenomeSequence::base2int[256+1] =
00051 "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005"
00052 "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005"
00053 "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\004\005"
00054 "\000\001\002\003\005\005\005\005\005\005\005\005\005\005\005\005"
00055 "\005\000\005\001\005\005\005\002\005\005\005\005\005\005\004\005"
00056 "\005\005\005\005\003\005\005\005\005\005\005\005\005\005\005\005"
00057 "\005\000\005\001\005\005\005\002\005\005\005\005\005\005\004\005"
00058 "\005\005\005\005\003\005\005\005\005\005\005\005\005\005\005\005"
00059
00060 "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005"
00061 "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005"
00062 "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005"
00063 "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005"
00064 "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005"
00065 "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005"
00066 "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005"
00067 "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005"
00068 ;
00069
00070
00071
00072
00073 const char GenomeSequence::int2base[] = "ACGTNMXXXXXXXXXX";
00074
00075
00076
00077 const char GenomeSequence::int2colorSpace[] = "0123NXXXXXXXXXXX";
00078
00079
00080
00081
00082
00083
00084
00085
00086
00087
00088 unsigned char GenomeSequence::base2complement[256+1 ] =
00089 "NNNNNNNNNNNNNNNN"
00090 "NNNNNNNNNNNNNNNN"
00091 "NNNNNNNNNNNNNNNN"
00092 "0123NNNNNNNNNNNN"
00093 "NTNGNNNCNNNNNNNN"
00094 "NNNNANNNNNNNNNNN"
00095 "NTNGNNNCNNNNNNNN"
00096 "NNNNANNNNNNNNNNN"
00097
00098 "NNNNNNNNNNNNNNNN"
00099 "NNNNNNNNNNNNNNNN"
00100 "NNNNNNNNNNNNNNNN"
00101 "NNNNNNNNNNNNNNNN"
00102 "NNNNNNNNNNNNNNNN"
00103 "NNNNNNNNNNNNNNNN"
00104 "NNNNNNNNNNNNNNNN"
00105 "NNNNNNNNNNNNNNNN"
00106 ;
00107
00108
00109
00110
00111
00112
00113
00114
00115
00116
00117
00118
00119
00120
00121 void PackedRead::set(const char *rhs, int padWithNCount)
00122 {
00123 clear();
00124
00125
00126 while (padWithNCount>1)
00127 {
00128 packedBases.push_back(
00129 GenomeSequence::base2int[(int) 'N'] << 4 |
00130 GenomeSequence::base2int[(int) 'N']
00131 );
00132 padWithNCount -= 2;
00133 length+=2;
00134 }
00135
00136
00137
00138 if (padWithNCount)
00139 {
00140
00141
00142 packedBases.push_back(
00143 GenomeSequence::base2int[(int) *rhs] << 4 |
00144 GenomeSequence::base2int[(int) 'N']
00145 );
00146
00147 if (*rhs)
00148 {
00149 length+=2;
00150 rhs++;
00151 }
00152 else
00153 {
00154 length++;
00155 }
00156 padWithNCount--;
00157 assert(padWithNCount==0);
00158 }
00159
00160
00161 while (*rhs && *(rhs+1))
00162 {
00163 packedBases.push_back(
00164 GenomeSequence::base2int[(int) *(rhs+1)] << 4 |
00165 GenomeSequence::base2int[(int) *(rhs+0)]
00166 );
00167 rhs+=2;
00168 length+=2;
00169 }
00170
00171
00172
00173 if (*rhs)
00174 {
00175 packedBases.push_back(
00176 GenomeSequence::base2int[(int) *(rhs+0)]
00177 );
00178 length++;
00179 }
00180 return;
00181 }
00182
00183 std::string GenomeSequence::IntegerToSeq(unsigned int n, unsigned int wordsize) const
00184 {
00185 std::string sequence("");
00186 for (unsigned int i = 0; i < wordsize; i ++)
00187 sequence += "N";
00188
00189 unsigned int clearHigherBits = ~(3U << (wordsize<<1));
00190
00191 if (n > clearHigherBits)
00192 error("%d needs to be a non-negative integer < clearHigherBits\n", n);
00193
00194 for (unsigned int i = 0; i < wordsize; i++)
00195 {
00196 sequence[wordsize-1-i] = int2base[n & 3];
00197 n >>= 2;
00198 }
00199 return sequence;
00200 }
00201
00202
00203
00204 GenomeSequence::GenomeSequence()
00205 {
00206 constructorClear();
00207 }
00208
00209 void GenomeSequence::constructorClear()
00210 {
00211 _debugFlag = 0;
00212 _progressStream = NULL;
00213 _colorSpace = false;
00214 _createOverwrite = false;
00215 }
00216
00217 void GenomeSequence::setup(const char *referenceFilename)
00218 {
00219 setReferenceName(referenceFilename);
00220
00221 if (_progressStream) *_progressStream << "open and prefetch reference genome " << referenceFilename << ": " << std::flush;
00222
00223 if (open(false))
00224 {
00225 std::cerr << "Failed to open reference genome " << referenceFilename << std::endl;
00226 std::cerr << errorStr << std::endl;
00227 exit(1);
00228 }
00229
00230 prefetch();
00231 if (_progressStream) *_progressStream << "done." << std::endl << std::flush;
00232 }
00233
00234 GenomeSequence::~GenomeSequence()
00235 {
00236
00237 _umfaFile.close();
00238 }
00239
00240
00241
00242
00243
00244
00245
00246
00247
00248
00249
00250 bool GenomeSequence::open(bool isColorSpace, int flags)
00251 {
00252 bool rc;
00253
00254 if (isColorSpace)
00255 {
00256 _umfaFilename = _baseFilename + "-cs.umfa";
00257 }
00258 else
00259 {
00260 _umfaFilename = _baseFilename + "-bs.umfa";
00261 }
00262
00263 rc = genomeSequenceArray::open(_umfaFilename.c_str(), flags);
00264 if (rc)
00265 {
00266 std::cerr << "GenomeSequence::open: failed to open file "
00267 << _umfaFilename
00268 << std::endl;
00269 return true;
00270 }
00271
00272 _colorSpace = header->_colorSpace;
00273
00274 return false;
00275 }
00276
00277 void GenomeSequence::sanityCheck(MemoryMap &fasta) const
00278 {
00279 unsigned int i;
00280
00281 unsigned int genomeIndex = 0;
00282 for (i=0; i<fasta.length(); i++)
00283 {
00284 switch (fasta[i])
00285 {
00286 case '>':
00287 while (fasta[i]!='\n' && fasta[i]!='\r') i++;
00288 break;
00289 case '\n':
00290 case '\r':
00291 break;
00292 default:
00293 assert(base2int[(int)(*this)[genomeIndex]] == base2int[(int) fasta[i]]);
00294 #if 0
00295 if (base2int[(*this)[genomeIndex]] != base2int[(int) fasta[i]])
00296 {
00297 int bp1 = base2int[(*this)[genomeIndex]];
00298 int bp2 = base2int[fasta[i]];
00299 printf("failing at genome index = %u, fasta index = %u.\n",
00300 genomeIndex,i);
00301 }
00302 #endif
00303 genomeIndex++;
00304 break;
00305 }
00306 }
00307 }
00308
00309 #define HAS_SUFFIX(str, suffix) ((strlen(suffix) < str.size()) && (str.substr(str.size() - strlen(suffix)) == suffix))
00310
00311
00312
00313
00314
00315 bool GenomeSequence::setReferenceName(std::string referenceFilename)
00316 {
00317
00318 if (HAS_SUFFIX(referenceFilename, ".fa"))
00319 {
00320 _referenceFilename = referenceFilename;
00321 _baseFilename = _referenceFilename.substr(0, referenceFilename.size() - 3);
00322 }
00323 else if (HAS_SUFFIX(referenceFilename, ".umfa"))
00324 {
00325 _baseFilename = referenceFilename.substr(0, referenceFilename.size() - 5);
00326 }
00327 else if (HAS_SUFFIX(referenceFilename, "-cs.umfa"))
00328 {
00329 _baseFilename = referenceFilename.substr(0, referenceFilename.size() - 8);
00330 }
00331 else if (HAS_SUFFIX(referenceFilename, "-bs.umfa"))
00332 {
00333 _baseFilename = referenceFilename.substr(0, referenceFilename.size() - 8);
00334 }
00335 else
00336 {
00337 _baseFilename = referenceFilename;
00338 }
00339 _fastaFilename = _baseFilename + ".fa";
00340
00341 return false;
00342 }
00343
00344
00345
00346
00347
00348
00349
00350
00351
00352
00353 bool GenomeSequence::setChromosomeMD5andLength(uint32_t whichChromosome)
00354 {
00355 if (whichChromosome>=header->_chromosomeCount) return true;
00356
00357 ChromosomeInfo *c = &header->_chromosomes[whichChromosome];
00358 c->size = header->elementCount - c->start;
00359
00360 #if !defined(WIN32)
00361 MD5_CTX md5Context;
00362 uint8_t md5Signature[MD5_DIGEST_LENGTH];
00363
00364
00365
00366
00367
00368 char *md5Buffer = (char *) malloc(c->size);
00369
00370 MD5_Init(&md5Context);
00371
00372 for (genomeIndex_t i = 0; i < c->size; i ++)
00373 {
00374 md5Buffer[i] = (*this)[c->start + i];
00375 }
00376 MD5_Update(&md5Context, md5Buffer, c->size);
00377 MD5_Final((unsigned char *) &md5Signature, &md5Context);
00378 free(md5Buffer);
00379 for (int i=0; i<MD5_DIGEST_LENGTH; i++)
00380 {
00381
00382 sprintf(c->md5+2*i, "%02x", md5Signature[i]);
00383 }
00384
00385
00386 c->md5[2*MD5_DIGEST_LENGTH] = '\0';
00387 #else
00388 memset(c->md5, 0, 2*MD5_DIGEST_LENGTH);
00389 #endif
00390 return false;
00391 }
00392
00393
00394
00395
00396
00397 static bool getFastaStats(const char *fastaData, size_t fastaDataSize, uint32_t &chromosomeCount, uint64_t &baseCount)
00398 {
00399 chromosomeCount = 0;
00400 baseCount = 0;
00401 bool atLineStart = true;
00402
00403
00404
00405
00406
00407 for (size_t fastaIndex = 0; fastaIndex < fastaDataSize; fastaIndex++)
00408 {
00409 switch (fastaData[fastaIndex])
00410 {
00411 case '\n':
00412 case '\r':
00413 atLineStart = true;
00414 break;
00415 case '>':
00416 {
00417 if (!atLineStart) break;
00418 chromosomeCount++;
00419
00420
00421
00422 while (fastaIndex < fastaDataSize && fastaData[fastaIndex]!='\n' && fastaData[fastaIndex]!='\r')
00423 {
00424 fastaIndex++;
00425 }
00426 break;
00427 }
00428 default:
00429 baseCount++;
00430 atLineStart = false;
00431 break;
00432 }
00433
00434 }
00435 return false;
00436 }
00437
00438
00439
00440
00441
00442
00443
00444
00445
00446
00447 bool GenomeSequence::create()
00448 {
00449
00450 assert(int2base[base2int[(int) 'A']]=='A');
00451 assert(int2base[base2int[(int) 'a']]=='A');
00452 assert(int2base[base2int[(int) 'T']]=='T');
00453 assert(int2base[base2int[(int) 't']]=='T');
00454 assert(int2base[base2int[(int) 'C']]=='C');
00455 assert(int2base[base2int[(int) 'c']]=='C');
00456 assert(int2base[base2int[(int) 'G']]=='G');
00457 assert(int2base[base2int[(int) 'g']]=='G');
00458 assert(int2base[base2int[(int) 'N']]=='N');
00459 assert(int2base[base2int[(int) 'n']]=='N');
00460 assert(int2base[base2int[(int) 'M']]=='M');
00461 assert(int2base[base2int[(int) 'm']]=='M');
00462
00463 assert(base2int[(int) 'N']==4);
00464 assert(base2int[(int) 'n']==4);
00465 assert(base2int[(int) 'A']==0);
00466 assert(base2int[(int) 'a']==0);
00467 assert(base2int[(int) 'T']==3);
00468 assert(base2int[(int) 't']==3);
00469 assert(base2int[(int) 'C']==1);
00470 assert(base2int[(int) 'c']==1);
00471 assert(base2int[(int) 'G']==2);
00472 assert(base2int[(int) 'g']==2);
00473
00474 if (_baseFilename=="")
00475 {
00476 std::cerr << "Base reference filename is empty." << std::endl;
00477 return true;
00478 }
00479
00480 if (isColorSpace())
00481 {
00482 _umfaFilename = _baseFilename + "-cs.umfa";
00483 }
00484 else
00485 {
00486 _umfaFilename = _baseFilename + "-bs.umfa";
00487 }
00488
00489 if (!_createOverwrite && access(_umfaFilename.c_str(), R_OK) == 0)
00490 {
00491 std::cerr << "Output file '" << _umfaFilename << "' exists or is not writable - please remove." << std::endl;
00492 return true;
00493 }
00494
00495 MemoryMap fastaFile;
00496
00497 if (fastaFile.open(_fastaFilename.c_str()))
00498 {
00499 std::cerr << "failed to open input fasta file '" << _fastaFilename << "'." << std::endl;
00500 return true;
00501 }
00502
00503 std::cerr << "Creating FASTA "
00504 << (isColorSpace() ? "color space " : "")
00505 << "binary cache file '"
00506 << _umfaFilename
00507 << "'."
00508 << std::endl;
00509
00510 std::cerr << std::flush;
00511
00512
00513
00514
00515
00516 const char *fasta = (const char *) fastaFile.data;
00517 size_t fastaDataSize = fastaFile.length();
00518
00519 uint32_t chromosomeCount = 0;
00520 uint64_t baseCount = 0;
00521 getFastaStats(fasta, fastaDataSize, chromosomeCount, baseCount);
00522
00523 if (genomeSequenceArray::create(_umfaFilename.c_str(), baseCount, chromosomeCount))
00524 {
00525 std::cerr << "failed to create '"
00526 << _umfaFilename
00527 << "'."
00528 << std::endl;
00529 perror("");
00530 return true;
00531 }
00532 header->elementCount = 0;
00533 header->_colorSpace = isColorSpace();
00534 header->setApplication(_application.c_str());
00535 header->_chromosomeCount = chromosomeCount;
00536
00537
00538
00539 for (uint32_t i=0; i<header->_chromosomeCount; i++) header->_chromosomes[i].constructorClear();
00540
00541 std::string chromosomeName;
00542
00543
00544
00545 signed char lastBase = base2int[(int) 'N'];
00546 bool terminateLoad = false;
00547 int percent = -1, newPercent;
00548 uint32_t whichChromosome = 0;
00549 for (uint64_t fastaIndex = 0; fastaIndex < fastaDataSize; fastaIndex++)
00550 {
00551 if (_progressStream)
00552 {
00553 newPercent = (int) (1.0 * fastaIndex / fastaDataSize) * 100;
00554 if (newPercent>percent)
00555 {
00556 std::cerr << "\r" << newPercent << "% ";
00557 std::cerr << std::flush;
00558 percent = newPercent;
00559 }
00560 }
00561 switch (fasta[fastaIndex])
00562 {
00563 case '\n':
00564 case '\r':
00565 break;
00566 case '>':
00567 {
00568 chromosomeName = "";
00569 fastaIndex++;
00570
00571
00572
00573 while (!isspace(fasta[fastaIndex]))
00574 {
00575 chromosomeName += fasta[fastaIndex++];
00576 }
00577
00578
00579
00580 while (fasta[fastaIndex]!='\n' && fasta[fastaIndex]!='\r')
00581 {
00582 fastaIndex++;
00583 }
00584
00585
00586
00587
00588 ChromosomeInfo *c = &header->_chromosomes[whichChromosome];
00589 c->setChromosomeName(chromosomeName.c_str());
00590 c->start = header->elementCount;
00591
00592
00593 if (whichChromosome>0)
00594 {
00595
00596
00597
00598
00599
00600
00601 setChromosomeMD5andLength(whichChromosome - 1);
00602 }
00603 whichChromosome++;
00604 if (whichChromosome > header->_chromosomeCount)
00605 {
00606 std::cerr << "BUG: Exceeded computed chromosome count ("
00607 << header->_chromosomeCount
00608 << ") - genome is now truncated at chromosome "
00609 << header->_chromosomes[header->_chromosomeCount-1].name
00610 << " (index "
00611 << header->_chromosomeCount
00612 << ")."
00613 << std::endl;
00614 terminateLoad = true;
00615 }
00616 break;
00617 }
00618 default:
00619
00620
00621
00622 if (isColorSpace())
00623 {
00624
00625
00626
00627
00628
00629 const char fromBase2CS[] =
00630 {
00631 0,
00632 1,
00633 2,
00634 3,
00635 1,
00636 0,
00637 3,
00638 2,
00639 2,
00640 3,
00641 0,
00642 1,
00643 3,
00644 2,
00645 1,
00646 0,
00647 };
00648
00649
00650
00651
00652
00653
00654
00655
00656 char thisBase = base2int[(int)(fasta[fastaIndex])];
00657 if (lastBase>=0)
00658 {
00659 char color;
00660 if (lastBase>3 || thisBase>3) color=4;
00661 else color = fromBase2CS[(int)(lastBase<<2 | thisBase)];
00662
00663
00664 set(header->elementCount++, int2base[(int) color]);
00665 }
00666 lastBase = thisBase;
00667 }
00668 else
00669 {
00670 set(header->elementCount++, toupper(fasta[fastaIndex]));
00671 }
00672 break;
00673 }
00674
00675
00676
00677
00678
00679 if (terminateLoad) break;
00680 }
00681
00682
00683
00684
00685
00686 if (whichChromosome==0)
00687 {
00688 fastaFile.close();
00689 throw std::runtime_error("No chromosomes found - aborting!");
00690 }
00691 else
00692 {
00693 setChromosomeMD5andLength(whichChromosome-1);
00694 }
00695
00696 #if 0
00697 sanityCheck(fastaFile);
00698 #endif
00699 fastaFile.close();
00700
00701 if (_progressStream) *_progressStream << "\r";
00702
00703 std::cerr << "FASTA binary cache file '"
00704 << _umfaFilename
00705 << "' created."
00706 << std::endl;
00707
00708
00709
00710
00711 return false;
00712 }
00713
00714 int GenomeSequence::getChromosomeCount() const
00715 {
00716 return header->_chromosomeCount;
00717 }
00718
00719
00720 int GenomeSequence::getChromosome(genomeIndex_t position) const
00721 {
00722 if (position == INVALID_GENOME_INDEX) return INVALID_CHROMOSOME_INDEX;
00723
00724 if (header->_chromosomeCount == 0)
00725 return INVALID_CHROMOSOME_INDEX;
00726
00727 int start = 0;
00728 int stop = header->_chromosomeCount - 1;
00729
00730
00731
00732
00733 if (position > header->_chromosomes[stop].start)
00734 return (stop);
00735
00736 while (start <= stop)
00737 {
00738 int middle = (start + stop) / 2;
00739
00740 if (position >= header->_chromosomes[middle].start && position < header->_chromosomes[middle + 1].start)
00741 return middle;
00742
00743 if (position == header->_chromosomes[middle + 1].start)
00744 return (middle + 1);
00745
00746 if (position > header->_chromosomes[middle + 1].start)
00747 start = middle + 1;
00748
00749 if (position < header->_chromosomes[middle].start)
00750 stop = middle - 1;
00751 }
00752
00753 return -1;
00754 }
00755
00756
00757
00758
00759
00760
00761
00762 genomeIndex_t GenomeSequence::getGenomePosition(
00763 const char *chromosomeName,
00764 unsigned int chromosomeIndex) const
00765 {
00766 genomeIndex_t i = getGenomePosition(chromosomeName);
00767 if (i == INVALID_GENOME_INDEX) return INVALID_GENOME_INDEX;
00768 return i + chromosomeIndex - 1;
00769 }
00770
00771 genomeIndex_t GenomeSequence::getGenomePosition(
00772 int chromosome,
00773 unsigned int chromosomeIndex) const
00774 {
00775 if (chromosome<0 || chromosome >= (int) header->_chromosomeCount) return INVALID_GENOME_INDEX;
00776
00777 genomeIndex_t i = header->_chromosomes[chromosome].start;
00778 if (i == INVALID_GENOME_INDEX) return INVALID_GENOME_INDEX;
00779 return i + chromosomeIndex - 1;
00780 }
00781
00782
00783
00784
00785
00786
00787
00788
00789
00790 genomeIndex_t GenomeSequence::getGenomePosition(const char *chromosomeName) const
00791 {
00792 int chromosome = getChromosome(chromosomeName);
00793 if (chromosome==INVALID_CHROMOSOME_INDEX) return INVALID_GENOME_INDEX;
00794 return header->_chromosomes[chromosome].start;
00795 }
00796
00797 int GenomeSequence::getChromosome(const char *chromosomeName) const
00798 {
00799 unsigned int i;
00800 for (i=0; i<header->_chromosomeCount; i++)
00801 {
00802 if (strcmp(header->_chromosomes[i].name, chromosomeName)==0)
00803 {
00804 return i;
00805 }
00806 }
00807 return INVALID_CHROMOSOME_INDEX;
00808 }
00809
00810
00811
00812
00813
00814 void GenomeSequence::getReverseRead(std::string &read)
00815 {
00816 std::string newRead;
00817 if (read.size()) for (int32_t i=(int) read.size() - 1; i>=0; i--)
00818 {
00819 newRead.push_back(BasePair(read[i]));
00820 }
00821 read = newRead;
00822 }
00823
00824 void GenomeSequence::getReverseRead(String& read)
00825 {
00826 int i = 0;
00827 int j = read.Length()-1;
00828 char temp;
00829 while (i < j)
00830 {
00831 temp = read[j];
00832 read[j] = read[i];
00833 read[i] = temp;
00834 }
00835 }
00836
00837 #define ABS(x) ( (x) > 0 ? (x) : -(x) )
00838 int GenomeSequence::debugPrintReadValidation(
00839 std::string &read,
00840 std::string &quality,
00841 char direction,
00842 genomeIndex_t readLocation,
00843 int sumQuality,
00844 int mismatchCount,
00845 bool recurse
00846 )
00847 {
00848 int validateSumQ = 0;
00849 int validateMismatchCount = 0;
00850 int rc = 0;
00851 std::string genomeData;
00852
00853 for (uint32_t i=0; i<read.size(); i++)
00854 {
00855 if (tolower(read[i]) != tolower((*this)[readLocation + i]))
00856 {
00857 validateSumQ += quality[i] - '!';
00858
00859 if (direction=='F' ? i<24 : (i >= (read.size() - 24))) validateMismatchCount++;
00860 genomeData.push_back(tolower((*this)[readLocation + i]));
00861 }
00862 else
00863 {
00864 genomeData.push_back(toupper((*this)[readLocation + i]));
00865 }
00866 }
00867 assert(validateSumQ>=0);
00868 if (validateSumQ != sumQuality && validateMismatchCount == mismatchCount)
00869 {
00870 printf("SUMQ: Original Genome: %s test read: %s : actual sumQ = %d, test sumQ = %d\n",
00871 genomeData.c_str(),
00872 read.c_str(),
00873 validateSumQ,
00874 sumQuality
00875 );
00876 rc++;
00877 }
00878 else if (validateSumQ == sumQuality && validateMismatchCount != mismatchCount)
00879 {
00880 printf("MISM: Original Genome: %s test read: %s : actual mismatch %d test mismatches %d\n",
00881 genomeData.c_str(),
00882 read.c_str(),
00883 validateMismatchCount,
00884 mismatchCount
00885 );
00886 rc++;
00887 }
00888 else if (validateSumQ != sumQuality && validateMismatchCount != mismatchCount)
00889 {
00890 printf("BOTH: Original Genome: %s test read: %s : actual sumQ = %d, test sumQ = %d, actual mismatch %d test mismatches %d\n",
00891 genomeData.c_str(),
00892 read.c_str(),
00893 validateSumQ,
00894 sumQuality,
00895 validateMismatchCount,
00896 mismatchCount
00897 );
00898 rc++;
00899 }
00900
00901 if (recurse && ABS(validateMismatchCount - mismatchCount) > (int) read.size()/2)
00902 {
00903 printf("large mismatch difference, trying reverse strand: ");
00904 std::string reverseRead = read;
00905 std::string reverseQuality = quality;
00906 getReverseRead(reverseRead);
00907 reverse(reverseQuality.begin(), reverseQuality.end());
00908 rc = debugPrintReadValidation(reverseRead, reverseQuality, readLocation, sumQuality, mismatchCount, false);
00909 }
00910 return rc;
00911 }
00912 #undef ABS
00913
00914
00915 bool GenomeSequence::wordMatch(unsigned int index, std::string &word) const
00916 {
00917 for (uint32_t i = 0; i<word.size(); i++)
00918 {
00919 if ((*this)[index + i] != word[i]) return false;
00920 }
00921 return true;
00922 }
00923
00924 bool GenomeSequence::printNearbyWords(unsigned int index, unsigned int deviation, std::string &word) const
00925 {
00926 for (unsigned int i = index - deviation; i < index + deviation; i++)
00927 {
00928 if (wordMatch(i, word))
00929 {
00930 std::cerr << "word '"
00931 << word
00932 << "' found "
00933 << index - i
00934 << " away from position "
00935 << index
00936 << "."
00937 << std::endl;
00938 }
00939 }
00940 return false;
00941 }
00942
00943 void GenomeSequence::dumpSequenceSAMDictionary(std::ostream &file) const
00944 {
00945 for (unsigned int i=0; i<header->_chromosomeCount; i++)
00946 {
00947 file
00948 << "@SQ"
00949 << "\tSN:" << header->_chromosomes[i].name
00950 << "\tLN:" << header->_chromosomes[i].size
00951 << "\tAS:" << header->_chromosomes[i].assemblyID
00952 << "\tM5:" << header->_chromosomes[i].md5
00953 << "\tUR:" << header->_chromosomes[i].uri
00954 << "\tSP:" << header->_chromosomes[i].species
00955 << std::endl;
00956 }
00957 }
00958
00959 void GenomeSequence::dumpHeaderTSV(std::ostream &file) const
00960 {
00961 file << "# Reference: " << _baseFilename << std::endl;
00962 file << "# SN: sample name - must be unique" << std::endl;
00963 file << "# AS: assembly name" << std::endl;
00964 file << "# SP: species" << std::endl;
00965 file << "# LN: chromosome/contig length" << std::endl;
00966 file << "# M5: chromosome/contig MD5 checksum" << std::endl;
00967 file << "# LN and M5 are only printed for informational purposes." << std::endl;
00968 file << "# Karma will only set those values when creating the index." << std::endl;
00969 file << "SN" << "\t" << "AS" << "\t" << "SP" << "\t" << "UR" << "\t" << "LN" << "\t" << "M5" << std::endl;
00970 for (unsigned int i=0; i<header->_chromosomeCount; i++)
00971 {
00972 file
00973 << header->_chromosomes[i].name
00974 << "\t" << header->_chromosomes[i].assemblyID
00975 << "\t" << header->_chromosomes[i].uri
00976 << "\t" << header->_chromosomes[i].species
00977 << "\t" << header->_chromosomes[i].size
00978 << "\t" << header->_chromosomes[i].md5
00979 << std::endl;
00980 }
00981 }
00982
00983
00984 void GenomeSequence::getString(std::string &str, int chromosome, genomeIndex_t index, int baseCount) const
00985 {
00986
00987
00988
00989 genomeIndex_t genomeIndex = header->_chromosomes[chromosome].start + index - 1;
00990
00991 getString(str, genomeIndex, baseCount);
00992 }
00993
00994 void GenomeSequence::getString(String &str, int chromosome, genomeIndex_t index, int baseCount) const
00995 {
00996 std::string string;
00997 this-> getString(string, chromosome, index, baseCount);
00998 str = string.c_str();
00999 }
01000
01001 void GenomeSequence::getString(std::string &str, genomeIndex_t index, int baseCount) const
01002 {
01003 str.clear();
01004 if (baseCount > 0)
01005 {
01006 for (int i=0; i<baseCount; i++)
01007 {
01008 str.push_back((*this)[index + i]);
01009 }
01010 }
01011 else
01012 {
01013
01014
01015 for (int i=0; i< -baseCount; i++)
01016 {
01017 str.push_back(base2complement[(int)(*this)[index + i]]);
01018 }
01019 }
01020 }
01021
01022 void GenomeSequence::getString(String &str, genomeIndex_t index, int baseCount) const
01023 {
01024 std::string string;
01025 getString(string, index, baseCount);
01026 str = string.c_str();
01027 }
01028
01029 void GenomeSequence::getHighLightedString(std::string &str, genomeIndex_t index, int baseCount, genomeIndex_t highLightStart, genomeIndex_t highLightEnd) const
01030 {
01031 str.clear();
01032 if (baseCount > 0)
01033 {
01034 for (int i=0; i<baseCount; i++)
01035 {
01036 char base = (*this)[index + i];
01037 if (in(index+i, highLightStart, highLightEnd))
01038 base = tolower(base);
01039 str.push_back(base);
01040 }
01041 }
01042 else
01043 {
01044
01045
01046 for (int i=0; i< -baseCount; i++)
01047 {
01048 char base = base2complement[(int)(*this)[index + i]];
01049 if (in(index+i, highLightStart, highLightEnd))
01050 base = tolower(base);
01051 str.push_back(base);
01052 }
01053 }
01054 }
01055
01056 void GenomeSequence::print30(genomeIndex_t index) const
01057 {
01058 std::cout << "index: " << index << "\n";
01059 for (genomeIndex_t i=index-30; i<index+30; i++)
01060 std::cout << (*this)[i];
01061 std::cout << "\n";
01062 for (genomeIndex_t i=index-30; i<index; i++)
01063 std::cout << " ";
01064 std::cout << "^";
01065 std::cout << std::endl;
01066 }
01067
01068 void GenomeSequence::getMismatchHatString(std::string &result, const std::string &read, genomeIndex_t location) const
01069 {
01070 result = "";
01071 for (uint32_t i=0; i < read.size(); i++)
01072 {
01073 if (read[i] == (*this)[location+i])
01074 result.push_back(' ');
01075 else
01076 result.push_back('^');
01077 }
01078 }
01079
01080 void GenomeSequence::getMismatchString(std::string &result, const std::string &read, genomeIndex_t location) const
01081 {
01082 result = "";
01083 for (uint32_t i=0; i < read.size(); i++)
01084 {
01085 if (read[i] == (*this)[location+i])
01086 result.push_back(toupper(read[i]));
01087 else
01088 result.push_back(tolower(read[i]));
01089 }
01090 }
01091
01092 genomeIndex_t GenomeSequence::simpleLocalAligner(std::string &read, std::string &quality, genomeIndex_t index, int windowSize) const
01093 {
01094 int bestScore = 1000000;
01095 genomeIndex_t bestMatchLocation = INVALID_GENOME_INDEX;
01096 for (int i=-windowSize; i<windowSize; i++)
01097 {
01098 int newScore;
01099
01100 if (i<0 && ((uint32_t) -i) > index) continue;
01101 if (index + i + read.size() >= getNumberBases()) continue;
01102
01103 if (quality=="")
01104 {
01105 newScore = this->getMismatchCount(read, index + i);
01106 }
01107 else
01108 {
01109 newScore = this->getSumQ(read, quality, index + i);
01110 }
01111 if (newScore < bestScore)
01112 {
01113 bestScore = newScore;
01114 bestMatchLocation = index + i;
01115 }
01116 }
01117 return bestMatchLocation;
01118 }
01119
01120 std::ostream &operator << (std::ostream &stream, genomeSequenceMmapHeader &h)
01121 {
01122 stream << (MemoryMapArrayHeader &) h;
01123 stream << "chromosomeCount: " << h._chromosomeCount << std::endl;
01124 stream << "isColorSpace: " << h._colorSpace << std::endl;
01125 stream << "chromosomeCount: " << h._chromosomeCount << std::endl;
01126 uint64_t totalSize = 0;
01127 for (uint32_t i=0; i < h._chromosomeCount; i++)
01128 {
01129 totalSize += h._chromosomes[i].size;
01130 stream << "Chromosome Index " << i << " name: " << h._chromosomes[i].name << std::endl;
01131 stream << "Chromosome Index " << i << " whole genome start: " << h._chromosomes[i].start << std::endl;
01132 stream << "Chromosome Index " << i << " whole genome size: " << h._chromosomes[i].size << std::endl;
01133 stream << "Chromosome Index " << i << " md5 checksum: " << h._chromosomes[i].md5 << std::endl;
01134 stream << "Chromosome Index " << i << " assemblyID: " << h._chromosomes[i].assemblyID << std::endl;
01135 stream << "Chromosome Index " << i << " species: " << h._chromosomes[i].species << std::endl;
01136 stream << "Chromosome Index " << i << " URI: " << h._chromosomes[i].uri << std::endl;
01137 }
01138 stream << "Total Genome Size: " << totalSize << " bases."<< std::endl;
01139 if (totalSize != h.elementCount)
01140 {
01141 stream << "Total Genome Size: does not match elementCount!\n";
01142 }
01143
01144 stream << std::endl;
01145 return stream;
01146 }
01147
01148 void GenomeSequence::getChromosomeAndIndex(std::string &s, genomeIndex_t i) const
01149 {
01150 int whichChromosome = 0;
01151
01152 whichChromosome = getChromosome(i);
01153
01154 if (whichChromosome == INVALID_CHROMOSOME_INDEX)
01155 {
01156 s = "invalid genome index";
01157 }
01158 else
01159 {
01160 std::ostringstream buf;
01161 genomeIndex_t chromosomeIndex = i - getChromosomeStart(whichChromosome) + 1;
01162 buf << header->_chromosomes[whichChromosome].name << ":" << chromosomeIndex;
01163 #if 0
01164 buf << " (GenomeIndex " << i << ")";
01165 #endif
01166 s = buf.str();
01167 }
01168 return;
01169 }
01170
01171
01172 void GenomeSequence::getChromosomeAndIndex(String& s, genomeIndex_t i) const
01173 {
01174 std::string ss;
01175 getChromosomeAndIndex(ss, i);
01176 s = ss.c_str();
01177 return;
01178 }
01179
01180
01181
01182
01183
01184
01185
01186
01187
01188
01189
01190
01191
01192
01193
01194
01195
01196 bool GenomeSequence::populateDBSNP(
01197 mmapArrayBool_t &dbSNP,
01198 std::ifstream &inputFile) const
01199 {
01200 std::string inputLine;
01201
01202 assert(dbSNP.getElementCount() == getNumberBases());
01203
01204 std::string chromosomeName;
01205 genomeIndex_t chromosomePosition1;
01206 uint64_t ignoredLineCount = 0;
01207
01208 while (getline(inputFile, inputLine))
01209 {
01210 std::istringstream inputLine2(inputLine);
01211 genomeIndex_t genomeIndex;
01212
01213 inputLine2 >> chromosomeName;
01214 inputLine2 >> chromosomePosition1;
01215
01216
01217 if (chromosomeName.size()>0 && chromosomeName[0]=='#')
01218 {
01219 continue;
01220 }
01221
01222 #if 0
01223
01224
01225
01226
01227
01228 if (!isdigit(chromosomeName[0])) continue;
01229 #endif
01230
01231 genomeIndex = getGenomePosition(chromosomeName.c_str(), chromosomePosition1);
01232
01233
01234 if (genomeIndex == INVALID_GENOME_INDEX)
01235 {
01236 ignoredLineCount++;
01237 continue;
01238 }
01239
01240 dbSNP.set(genomeIndex, true);
01241 }
01242
01243 inputFile.close();
01244
01245 if (ignoredLineCount > 0)
01246 {
01247 std::cerr << "GenomeSequence::populateDBSNP: ignored " << ignoredLineCount << " SNP positions due to invalid format of line." << std::endl;
01248 return false;
01249 }
01250 return true;
01251 }
01252
01253 bool GenomeSequence::loadDBSNP(
01254 mmapArrayBool_t &dbSNP,
01255 const char *inputFileName) const
01256 {
01257
01258
01259
01260
01261
01262
01263
01264
01265
01266
01267
01268
01269
01270
01271 if (strlen(inputFileName)!=0)
01272 {
01273 std::cerr << "Load dbSNP file '" << inputFileName << "': " << std::flush;
01274
01275 if (dbSNP.open(inputFileName, O_RDONLY))
01276 {
01277
01278
01279
01280
01281
01282
01283 if (dbSNP.getErrorString().find("wrong type of file")==std::string::npos)
01284 {
01285 std::cerr << "Error: " << dbSNP.getErrorString() << std::endl;
01286 exit(1);
01287 }
01288
01289
01290
01291 std::ifstream inputFile;
01292 inputFile.open(inputFileName);
01293 if (inputFile.fail())
01294 {
01295 std::cerr << "Error: failed to open " << inputFileName << std::endl;
01296 exit(1);
01297 }
01298
01299 std::cerr << "(as text file) ";
01300
01301
01302 dbSNP.create(getNumberBases());
01303
01304
01305 populateDBSNP(dbSNP, inputFile);
01306 inputFile.close();
01307
01308 }
01309 else
01310 {
01311 std::cerr << "(as binary mapped file) ";
01312 }
01313
01314 std::cerr << "DONE!" << std::endl;
01315 return false;
01316 }
01317 else
01318 {
01319 return true;
01320 }
01321 }
01322
01323
01324 #if defined(TEST)
01325
01326 void simplestExample(void)
01327 {
01328 GenomeSequence reference;
01329 genomeIndex_t index;
01330
01331
01332
01333
01334
01335
01336
01337 if (reference.open())
01338 {
01339 perror("GenomeSequence::open");
01340 exit(1);
01341 }
01342
01343
01344 index = 1000000000;
01345
01346
01347
01348
01349
01350
01351
01352
01353
01354 std::cout << "base[" << index << "] = " << reference[index] << std::endl;
01355
01356
01357
01358
01359
01360 int chr = reference.getChromosome(index);
01361 genomeIndex_t chrIndex = index - reference.getChromosomeStart(chr) + 1;
01362
01363 std::cout << "genome index " << index << " corresponds to chromosome " << chr << " position " << chrIndex << std::endl;
01364
01365
01366
01367
01368
01369 const char *chromosomeName = "5";
01370 chr = reference.getChromosome(chromosomeName);
01371 chrIndex = 100000;
01372
01373 index = reference.getChromosomeStart(chr) + chrIndex - 1;
01374
01375 std::cout << "Chromosome '" << chromosomeName << "' position " << chrIndex << " corresponds to genome index position " << index << std::endl;
01376
01377 reference.close();
01378 }
01379
01380 void testGenomeSequence(void)
01381 {
01382 GenomeSequence reference;
01383
01384 #if 0
01385 std::string referenceName = "someotherreference";
01386 if (reference.setFastaName(referenceName))
01387 {
01388 std::cerr << "failed to open reference file "
01389 << referenceName
01390 << std::endl;
01391 exit(1);
01392 }
01393 #endif
01394
01395 std::cerr << "open and prefetch the reference genome: ";
01396
01397
01398 if (reference.open())
01399 {
01400 exit(1);
01401 }
01402 std::cerr << "done!" << std::endl;
01403
01404
01405
01406
01407 genomeIndex_t genomeIndex;
01408 unsigned int chromosomeIndex;
01409 unsigned int chromosome;
01410 std::string chromosomeName;
01411
01412
01413
01414
01415
01416 chromosomeName = "2";
01417 chromosomeIndex = 1234567;
01418
01419 genomeIndex = reference.getGenomePosition(chromosomeName.c_str(), chromosomeIndex);
01420 assert(genomeIndex!=INVALID_GENOME_INDEX);
01421 std::cout << "Chromosome " << chromosomeName << ", index ";
01422 std::cout << chromosomeIndex << " contains base " << reference[genomeIndex];
01423 std::cout << " at genome index position " << genomeIndex << std::endl;
01424
01425
01426
01427
01428
01429
01430
01431 chromosome = reference.getChromosome(genomeIndex);
01432 unsigned int newChromosomeIndex;
01433
01434 newChromosomeIndex = genomeIndex - reference.getChromosomeStart(chromosome) + 1;
01435
01436 assert(chromosomeIndex == newChromosomeIndex);
01437
01438
01439
01440 PackedRead pr;
01441
01442 pr.set("ATCGATCG", 0);
01443 assert(pr.size()==8);
01444 assert(pr[0]==GenomeSequence::base2int[(int) 'A']);
01445 assert(pr[1]==GenomeSequence::base2int[(int) 'T']);
01446 assert(pr[2]==GenomeSequence::base2int[(int) 'C']);
01447 assert(pr[3]==GenomeSequence::base2int[(int) 'G']);
01448 pr.set("ATCGATCG", 1);
01449 assert(pr.size()==9);
01450 pr.set("", 0);
01451 assert(pr.size()==0);
01452 pr.set("", 1);
01453 assert(pr.size()==1);
01454 pr.set("", 2);
01455 assert(pr.size()==2);
01456 pr.set("", 3);
01457 assert(pr.size()==3);
01458 assert(pr[0]==GenomeSequence::base2int[(int) 'N']);
01459 assert(pr[1]==GenomeSequence::base2int[(int) 'N']);
01460 assert(pr[2]==GenomeSequence::base2int[(int) 'N']);
01461 pr.set("C", 1);
01462 assert(pr.size()==2);
01463 assert(pr[0]==GenomeSequence::base2int[(int) 'N']);
01464 assert(pr[1]==GenomeSequence::base2int[(int) 'C']);
01465
01466 }
01467
01468
01469
01470
01471
01472
01473
01474
01475 int main(int argc, const char **argv)
01476 {
01477 #if 1
01478 simplestExample();
01479 #else
01480 testGenomeSequence();
01481 #endif
01482 exit(0);
01483 }
01484
01485 #endif