00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018 #include "assert.h"
00019 #include "ctype.h"
00020 #include "stdio.h"
00021 #include "zlib.h"
00022 #include "Error.h"
00023
00024
00025 #include "Generic.h"
00026 #include "GenomeSequence.h"
00027
00028 #include <algorithm>
00029 #include <istream>
00030 #include <fstream>
00031 #include <sstream>
00032 #include <stdexcept>
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043
00044
00045
00046
00047
00048
00049
00050 unsigned char GenomeSequence::base2int[256+1] =
00051 "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005"
00052 "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005"
00053 "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\004\005"
00054 "\000\001\002\003\005\005\005\005\005\005\005\005\005\005\005\005"
00055 "\005\000\005\001\005\005\005\002\005\005\005\005\005\005\004\005"
00056 "\005\005\005\005\003\005\005\005\005\005\005\005\005\005\005\005"
00057 "\005\000\005\001\005\005\005\002\005\005\005\005\005\005\004\005"
00058 "\005\005\005\005\003\005\005\005\005\005\005\005\005\005\005\005"
00059
00060 "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005"
00061 "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005"
00062 "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005"
00063 "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005"
00064 "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005"
00065 "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005"
00066 "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005"
00067 "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005"
00068 ;
00069
00070
00071
00072
00073 const char GenomeSequence::int2base[] = "ACGTNMXXXXXXXXXX";
00074
00075
00076
00077 const char GenomeSequence::int2colorSpace[] = "0123NXXXXXXXXXXX";
00078
00079
00080
00081
00082
00083
00084
00085
00086
00087
00088 unsigned char GenomeSequence::base2complement[256+1 ] =
00089 "NNNNNNNNNNNNNNNN"
00090 "NNNNNNNNNNNNNNNN"
00091 "NNNNNNNNNNNNNNNN"
00092 "0123NNNNNNNNNNNN"
00093 "NTNGNNNCNNNNNNNN"
00094 "NNNNANNNNNNNNNNN"
00095 "NTNGNNNCNNNNNNNN"
00096 "NNNNANNNNNNNNNNN"
00097
00098 "NNNNNNNNNNNNNNNN"
00099 "NNNNNNNNNNNNNNNN"
00100 "NNNNNNNNNNNNNNNN"
00101 "NNNNNNNNNNNNNNNN"
00102 "NNNNNNNNNNNNNNNN"
00103 "NNNNNNNNNNNNNNNN"
00104 "NNNNNNNNNNNNNNNN"
00105 "NNNNNNNNNNNNNNNN"
00106 ;
00107
00108
00109
00110
00111
00112
00113
00114
00115
00116
00117
00118
00119
00120
00121 void PackedRead::set(const char *rhs, int padWithNCount)
00122 {
00123 clear();
00124
00125
00126 while (padWithNCount>1)
00127 {
00128 packedBases.push_back(
00129 GenomeSequence::base2int[(int) 'N'] << 4 |
00130 GenomeSequence::base2int[(int) 'N']
00131 );
00132 padWithNCount -= 2;
00133 length+=2;
00134 }
00135
00136
00137
00138 if (padWithNCount)
00139 {
00140
00141
00142 packedBases.push_back(
00143 GenomeSequence::base2int[(int) *rhs] << 4 |
00144 GenomeSequence::base2int[(int) 'N']
00145 );
00146
00147 if (*rhs)
00148 {
00149 length+=2;
00150 rhs++;
00151 }
00152 else
00153 {
00154 length++;
00155 }
00156 padWithNCount--;
00157 assert(padWithNCount==0);
00158 }
00159
00160
00161 while (*rhs && *(rhs+1))
00162 {
00163 packedBases.push_back(
00164 GenomeSequence::base2int[(int) *(rhs+1)] << 4 |
00165 GenomeSequence::base2int[(int) *(rhs+0)]
00166 );
00167 rhs+=2;
00168 length+=2;
00169 }
00170
00171
00172
00173 if (*rhs)
00174 {
00175 packedBases.push_back(
00176 GenomeSequence::base2int[(int) *(rhs+0)]
00177 );
00178 length++;
00179 }
00180 return;
00181 }
00182
00183 std::string GenomeSequence::IntegerToSeq(unsigned int n, unsigned int wordsize) const
00184 {
00185 std::string sequence("");
00186 for (unsigned int i = 0; i < wordsize; i ++)
00187 sequence += "N";
00188
00189 unsigned int clearHigherBits = ~(3U << (wordsize<<1));
00190
00191 if (n > clearHigherBits)
00192 error("%d needs to be a non-negative integer < clearHigherBits\n", n);
00193
00194 for (unsigned int i = 0; i < wordsize; i++)
00195 {
00196 sequence[wordsize-1-i] = int2base[n & 3];
00197 n >>= 2;
00198 }
00199 return sequence;
00200 }
00201
00202
00203
00204 GenomeSequence::GenomeSequence()
00205 {
00206 constructorClear();
00207 }
00208
00209 void GenomeSequence::constructorClear()
00210 {
00211 _debugFlag = 0;
00212 _progressStream = NULL;
00213 _colorSpace = false;
00214 }
00215
00216 void GenomeSequence::setup(const char *referenceFilename)
00217 {
00218 setReferenceName(referenceFilename);
00219
00220 if (_progressStream) *_progressStream << "open and prefetch reference genome " << referenceFilename << ": " << std::flush;
00221
00222 if (open(false))
00223 {
00224 std::cerr << "Failed to open reference genome " << referenceFilename << std::endl;
00225 std::cerr << errorStr << std::endl;
00226 exit(1);
00227 }
00228
00229 prefetch();
00230 if (_progressStream) *_progressStream << "done." << std::endl << std::flush;
00231 }
00232
00233 GenomeSequence::~GenomeSequence()
00234 {
00235
00236 _umfaFile.close();
00237 }
00238
00239
00240
00241
00242
00243
00244
00245
00246
00247
00248
00249 bool GenomeSequence::open(bool isColorSpace, int flags)
00250 {
00251 bool rc;
00252
00253 if (isColorSpace)
00254 {
00255 _umfaFilename = _baseFilename + "-cs.umfa";
00256 }
00257 else
00258 {
00259 _umfaFilename = _baseFilename + "-bs.umfa";
00260 }
00261
00262 rc = genomeSequenceArray::open(_umfaFilename.c_str(), flags);
00263 if (rc)
00264 {
00265 std::cerr << "GenomeSequence::open: failed to open file "
00266 << _umfaFilename
00267 << std::endl;
00268 return true;
00269 }
00270
00271 _colorSpace = header->_colorSpace;
00272
00273 return false;
00274 }
00275
00276 void GenomeSequence::sanityCheck(MemoryMap &fasta) const
00277 {
00278 unsigned int i;
00279
00280 unsigned int genomeIndex = 0;
00281 for (i=0; i<fasta.length(); i++)
00282 {
00283 switch (fasta[i])
00284 {
00285 case '>':
00286 while (fasta[i]!='\n' && fasta[i]!='\r') i++;
00287 break;
00288 case '\n':
00289 case '\r':
00290 break;
00291 default:
00292 assert(base2int[(int)(*this)[genomeIndex]] == base2int[(int) fasta[i]]);
00293 #if 0
00294 if (base2int[(*this)[genomeIndex]] != base2int[(int) fasta[i]])
00295 {
00296 int bp1 = base2int[(*this)[genomeIndex]];
00297 int bp2 = base2int[fasta[i]];
00298 printf("failing at genome index = %u, fasta index = %u.\n",
00299 genomeIndex,i);
00300 }
00301 #endif
00302 genomeIndex++;
00303 break;
00304 }
00305 }
00306 }
00307
00308 #define HAS_SUFFIX(str, suffix) ((strlen(suffix) < str.size()) && (str.substr(str.size() - strlen(suffix)) == suffix))
00309
00310
00311
00312
00313
00314 bool GenomeSequence::setReferenceName(std::string referenceFilename)
00315 {
00316
00317 if (HAS_SUFFIX(referenceFilename, ".fa"))
00318 {
00319 _referenceFilename = referenceFilename;
00320 _baseFilename = _referenceFilename.substr(0, referenceFilename.size() - 3);
00321 }
00322 else if (HAS_SUFFIX(referenceFilename, ".umfa"))
00323 {
00324 _baseFilename = referenceFilename.substr(0, referenceFilename.size() - 5);
00325 }
00326 else if (HAS_SUFFIX(referenceFilename, "-cs.umfa"))
00327 {
00328 _baseFilename = referenceFilename.substr(0, referenceFilename.size() - 8);
00329 }
00330 else if (HAS_SUFFIX(referenceFilename, "-bs.umfa"))
00331 {
00332 _baseFilename = referenceFilename.substr(0, referenceFilename.size() - 8);
00333 }
00334 else
00335 {
00336 _baseFilename = referenceFilename;
00337 }
00338 _fastaFilename = _baseFilename + ".fa";
00339
00340 return false;
00341 }
00342
00343
00344
00345
00346
00347
00348
00349
00350
00351
00352 bool GenomeSequence::setChromosomeMD5andLength(uint32_t whichChromosome)
00353 {
00354 if (whichChromosome>=header->_chromosomeCount) return true;
00355
00356 ChromosomeInfo *c = &header->_chromosomes[whichChromosome];
00357 c->size = header->elementCount - c->start;
00358
00359 #if !defined(WIN32)
00360 MD5_CTX md5Context;
00361 uint8_t md5Signature[MD5_DIGEST_LENGTH];
00362
00363
00364
00365
00366
00367 char *md5Buffer = (char *) malloc(c->size);
00368
00369 MD5_Init(&md5Context);
00370
00371 for (genomeIndex_t i = 0; i < c->size; i ++)
00372 {
00373 md5Buffer[i] = (*this)[c->start + i];
00374 }
00375 MD5_Update(&md5Context, md5Buffer, c->size);
00376 MD5_Final((unsigned char *) &md5Signature, &md5Context);
00377 free(md5Buffer);
00378 for (int i=0; i<MD5_DIGEST_LENGTH; i++)
00379 {
00380
00381 sprintf(c->md5+2*i, "%02x", md5Signature[i]);
00382 }
00383
00384
00385 c->md5[2*MD5_DIGEST_LENGTH] = '\0';
00386 #else
00387 memset(c->md5, 0, 2*MD5_DIGEST_LENGTH);
00388 #endif
00389 return false;
00390 }
00391
00392
00393
00394
00395
00396 static bool getFastaStats(const char *fastaData, size_t fastaDataSize, uint32_t &chromosomeCount, uint64_t &baseCount)
00397 {
00398 chromosomeCount = 0;
00399 baseCount = 0;
00400 bool atLineStart = true;
00401
00402
00403
00404
00405
00406 for (size_t fastaIndex = 0; fastaIndex < fastaDataSize; fastaIndex++)
00407 {
00408 switch (fastaData[fastaIndex])
00409 {
00410 case '\n':
00411 case '\r':
00412 atLineStart = true;
00413 break;
00414 case '>':
00415 {
00416 if (!atLineStart) break;
00417 chromosomeCount++;
00418
00419
00420
00421 while (fastaIndex < fastaDataSize && fastaData[fastaIndex]!='\n' && fastaData[fastaIndex]!='\r')
00422 {
00423 fastaIndex++;
00424 }
00425 break;
00426 }
00427 default:
00428 baseCount++;
00429 atLineStart = false;
00430 break;
00431 }
00432
00433 }
00434 return false;
00435 }
00436
00437
00438
00439
00440
00441
00442
00443
00444
00445
00446 bool GenomeSequence::create()
00447 {
00448
00449 assert(int2base[base2int[(int) 'A']]=='A');
00450 assert(int2base[base2int[(int) 'a']]=='A');
00451 assert(int2base[base2int[(int) 'T']]=='T');
00452 assert(int2base[base2int[(int) 't']]=='T');
00453 assert(int2base[base2int[(int) 'C']]=='C');
00454 assert(int2base[base2int[(int) 'c']]=='C');
00455 assert(int2base[base2int[(int) 'G']]=='G');
00456 assert(int2base[base2int[(int) 'g']]=='G');
00457 assert(int2base[base2int[(int) 'N']]=='N');
00458 assert(int2base[base2int[(int) 'n']]=='N');
00459 assert(int2base[base2int[(int) 'M']]=='M');
00460 assert(int2base[base2int[(int) 'm']]=='M');
00461
00462 assert(base2int[(int) 'N']==4);
00463 assert(base2int[(int) 'n']==4);
00464 assert(base2int[(int) 'A']==0);
00465 assert(base2int[(int) 'a']==0);
00466 assert(base2int[(int) 'T']==3);
00467 assert(base2int[(int) 't']==3);
00468 assert(base2int[(int) 'C']==1);
00469 assert(base2int[(int) 'c']==1);
00470 assert(base2int[(int) 'G']==2);
00471 assert(base2int[(int) 'g']==2);
00472
00473 if (_baseFilename=="")
00474 {
00475 std::cerr << "Base reference filename is empty." << std::endl;
00476 return true;
00477 }
00478
00479 if (isColorSpace())
00480 {
00481 _umfaFilename = _baseFilename + "-cs.umfa";
00482 }
00483 else
00484 {
00485 _umfaFilename = _baseFilename + "-bs.umfa";
00486 }
00487
00488 if (access(_umfaFilename.c_str(), R_OK) == 0)
00489 {
00490 std::cerr << "Output file '" << _umfaFilename << "' exists or is not writable - please remove." << std::endl;
00491 return true;
00492 }
00493
00494 MemoryMap fastaFile;
00495
00496 if (fastaFile.open(_fastaFilename.c_str()))
00497 {
00498 std::cerr << "failed to open input fasta file '" << _fastaFilename << "'." << std::endl;
00499 return true;
00500 }
00501
00502 std::cerr << "Creating FASTA "
00503 << (isColorSpace() ? "color space " : "")
00504 << "binary cache file '"
00505 << _umfaFilename
00506 << "'."
00507 << std::endl;
00508
00509 std::cerr << std::flush;
00510
00511
00512
00513
00514
00515 const char *fasta = (const char *) fastaFile.data;
00516 size_t fastaDataSize = fastaFile.length();
00517
00518 uint32_t chromosomeCount = 0;
00519 uint64_t baseCount = 0;
00520 getFastaStats(fasta, fastaDataSize, chromosomeCount, baseCount);
00521
00522 if (genomeSequenceArray::create(_umfaFilename.c_str(), baseCount, chromosomeCount))
00523 {
00524 std::cerr << "failed to create '"
00525 << _umfaFilename
00526 << "'."
00527 << std::endl;
00528 perror("");
00529 return true;
00530 }
00531 header->elementCount = 0;
00532 header->_colorSpace = isColorSpace();
00533 header->setApplication(_application.c_str());
00534 header->_chromosomeCount = chromosomeCount;
00535
00536
00537
00538 for (uint32_t i=0; i<header->_chromosomeCount; i++) header->_chromosomes[i].constructorClear();
00539
00540 std::string chromosomeName;
00541
00542
00543
00544 signed char lastBase = base2int[(int) 'N'];
00545 bool terminateLoad = false;
00546 int percent = -1, newPercent;
00547 uint32_t whichChromosome = 0;
00548 for (uint64_t fastaIndex = 0; fastaIndex < fastaDataSize; fastaIndex++)
00549 {
00550 if (_progressStream)
00551 {
00552 newPercent = (int) (1.0 * fastaIndex / fastaDataSize) * 100;
00553 if (newPercent>percent)
00554 {
00555 std::cerr << "\r" << newPercent << "% ";
00556 std::cerr << std::flush;
00557 percent = newPercent;
00558 }
00559 }
00560 switch (fasta[fastaIndex])
00561 {
00562 case '\n':
00563 case '\r':
00564 break;
00565 case '>':
00566 {
00567 chromosomeName = "";
00568 fastaIndex++;
00569
00570
00571
00572 while (!isspace(fasta[fastaIndex]))
00573 {
00574 chromosomeName += fasta[fastaIndex++];
00575 }
00576
00577
00578
00579 while (fasta[fastaIndex]!='\n' && fasta[fastaIndex]!='\r')
00580 {
00581 fastaIndex++;
00582 }
00583
00584
00585
00586
00587 ChromosomeInfo *c = &header->_chromosomes[whichChromosome];
00588 c->setChromosomeName(chromosomeName.c_str());
00589 c->start = header->elementCount;
00590
00591
00592 if (whichChromosome>0)
00593 {
00594
00595
00596
00597
00598
00599
00600 setChromosomeMD5andLength(whichChromosome - 1);
00601 }
00602 whichChromosome++;
00603 if (whichChromosome > header->_chromosomeCount)
00604 {
00605 std::cerr << "BUG: Exceeded computed chromosome count ("
00606 << header->_chromosomeCount
00607 << ") - genome is now truncated at chromosome "
00608 << header->_chromosomes[header->_chromosomeCount-1].name
00609 << " (index "
00610 << header->_chromosomeCount
00611 << ")."
00612 << std::endl;
00613 terminateLoad = true;
00614 }
00615 break;
00616 }
00617 default:
00618
00619
00620
00621 if (isColorSpace())
00622 {
00623
00624
00625
00626
00627
00628 const char fromBase2CS[] =
00629 {
00630 0,
00631 1,
00632 2,
00633 3,
00634 1,
00635 0,
00636 3,
00637 2,
00638 2,
00639 3,
00640 0,
00641 1,
00642 3,
00643 2,
00644 1,
00645 0,
00646 };
00647
00648
00649
00650
00651
00652
00653
00654
00655 char thisBase = base2int[(int)(fasta[fastaIndex])];
00656 if (lastBase>=0)
00657 {
00658 char color;
00659 if (lastBase>3 || thisBase>3) color=4;
00660 else color = fromBase2CS[(int)(lastBase<<2 | thisBase)];
00661
00662
00663 set(header->elementCount++, int2base[(int) color]);
00664 }
00665 lastBase = thisBase;
00666 }
00667 else
00668 {
00669 set(header->elementCount++, toupper(fasta[fastaIndex]));
00670 }
00671 break;
00672 }
00673
00674
00675
00676
00677
00678 if (terminateLoad) break;
00679 }
00680
00681
00682
00683
00684
00685 if (whichChromosome==0)
00686 {
00687 fastaFile.close();
00688 throw std::runtime_error("No chromosomes found - aborting!");
00689 }
00690 else
00691 {
00692 setChromosomeMD5andLength(whichChromosome-1);
00693 }
00694
00695 #if 0
00696 sanityCheck(fastaFile);
00697 #endif
00698 fastaFile.close();
00699
00700 if (_progressStream) *_progressStream << "\r";
00701
00702 std::cerr << "FASTA binary cache file '"
00703 << _umfaFilename
00704 << "' created."
00705 << std::endl;
00706
00707
00708
00709
00710 return false;
00711 }
00712
00713 int GenomeSequence::getChromosomeCount() const
00714 {
00715 return header->_chromosomeCount;
00716 }
00717
00718
00719 int GenomeSequence::getChromosome(genomeIndex_t position) const
00720 {
00721 if (position == INVALID_GENOME_INDEX) return INVALID_CHROMOSOME_INDEX;
00722
00723 if (header->_chromosomeCount == 0)
00724 return INVALID_CHROMOSOME_INDEX;
00725
00726 int start = 0;
00727 int stop = header->_chromosomeCount - 1;
00728
00729
00730
00731
00732 if (position > header->_chromosomes[stop].start)
00733 return (stop);
00734
00735 while (start <= stop)
00736 {
00737 int middle = (start + stop) / 2;
00738
00739 if (position >= header->_chromosomes[middle].start && position < header->_chromosomes[middle + 1].start)
00740 return middle;
00741
00742 if (position == header->_chromosomes[middle + 1].start)
00743 return (middle + 1);
00744
00745 if (position > header->_chromosomes[middle + 1].start)
00746 start = middle + 1;
00747
00748 if (position < header->_chromosomes[middle].start)
00749 stop = middle - 1;
00750 }
00751
00752 return -1;
00753 }
00754
00755
00756
00757
00758
00759
00760
00761 genomeIndex_t GenomeSequence::getGenomePosition(
00762 const char *chromosomeName,
00763 unsigned int chromosomeIndex) const
00764 {
00765 genomeIndex_t i = getGenomePosition(chromosomeName);
00766 if (i == INVALID_GENOME_INDEX) return INVALID_GENOME_INDEX;
00767 return i + chromosomeIndex - 1;
00768 }
00769
00770 genomeIndex_t GenomeSequence::getGenomePosition(
00771 int chromosome,
00772 unsigned int chromosomeIndex) const
00773 {
00774 if (chromosome<0 || chromosome >= (int) header->_chromosomeCount) return INVALID_GENOME_INDEX;
00775
00776 genomeIndex_t i = header->_chromosomes[chromosome].start;
00777 if (i == INVALID_GENOME_INDEX) return INVALID_GENOME_INDEX;
00778 return i + chromosomeIndex - 1;
00779 }
00780
00781
00782
00783
00784
00785
00786
00787
00788
00789 genomeIndex_t GenomeSequence::getGenomePosition(const char *chromosomeName) const
00790 {
00791 int chromosome = getChromosome(chromosomeName);
00792 if (chromosome==INVALID_CHROMOSOME_INDEX) return INVALID_GENOME_INDEX;
00793 return header->_chromosomes[chromosome].start;
00794 }
00795
00796 int GenomeSequence::getChromosome(const char *chromosomeName) const
00797 {
00798 unsigned int i;
00799 for (i=0; i<header->_chromosomeCount; i++)
00800 {
00801 if (strcmp(header->_chromosomes[i].name, chromosomeName)==0)
00802 {
00803 return i;
00804 }
00805 }
00806 return INVALID_CHROMOSOME_INDEX;
00807 }
00808
00809
00810
00811
00812
00813 void GenomeSequence::getReverseRead(std::string &read)
00814 {
00815 std::string newRead;
00816 if (read.size()) for (int32_t i=(int) read.size() - 1; i>=0; i--)
00817 {
00818 newRead.push_back(BasePair(read[i]));
00819 }
00820 read = newRead;
00821 }
00822
00823 void GenomeSequence::getReverseRead(String& read)
00824 {
00825 int i = 0;
00826 int j = read.Length()-1;
00827 char temp;
00828 while (i < j)
00829 {
00830 temp = read[j];
00831 read[j] = read[i];
00832 read[i] = temp;
00833 }
00834 }
00835
00836 #define ABS(x) ( (x) > 0 ? (x) : -(x) )
00837 int GenomeSequence::debugPrintReadValidation(
00838 std::string &read,
00839 std::string &quality,
00840 char direction,
00841 genomeIndex_t readLocation,
00842 int sumQuality,
00843 int mismatchCount,
00844 bool recurse
00845 )
00846 {
00847 int validateSumQ = 0;
00848 int validateMismatchCount = 0;
00849 int rc = 0;
00850 std::string genomeData;
00851
00852 for (uint32_t i=0; i<read.size(); i++)
00853 {
00854 if (tolower(read[i]) != tolower((*this)[readLocation + i]))
00855 {
00856 validateSumQ += quality[i] - '!';
00857
00858 if (direction=='F' ? i<24 : (i >= (read.size() - 24))) validateMismatchCount++;
00859 genomeData.push_back(tolower((*this)[readLocation + i]));
00860 }
00861 else
00862 {
00863 genomeData.push_back(toupper((*this)[readLocation + i]));
00864 }
00865 }
00866 assert(validateSumQ>=0);
00867 if (validateSumQ != sumQuality && validateMismatchCount == mismatchCount)
00868 {
00869 printf("SUMQ: Original Genome: %s test read: %s : actual sumQ = %d, test sumQ = %d\n",
00870 genomeData.c_str(),
00871 read.c_str(),
00872 validateSumQ,
00873 sumQuality
00874 );
00875 rc++;
00876 }
00877 else if (validateSumQ == sumQuality && validateMismatchCount != mismatchCount)
00878 {
00879 printf("MISM: Original Genome: %s test read: %s : actual mismatch %d test mismatches %d\n",
00880 genomeData.c_str(),
00881 read.c_str(),
00882 validateMismatchCount,
00883 mismatchCount
00884 );
00885 rc++;
00886 }
00887 else if (validateSumQ != sumQuality && validateMismatchCount != mismatchCount)
00888 {
00889 printf("BOTH: Original Genome: %s test read: %s : actual sumQ = %d, test sumQ = %d, actual mismatch %d test mismatches %d\n",
00890 genomeData.c_str(),
00891 read.c_str(),
00892 validateSumQ,
00893 sumQuality,
00894 validateMismatchCount,
00895 mismatchCount
00896 );
00897 rc++;
00898 }
00899
00900 if (recurse && ABS(validateMismatchCount - mismatchCount) > (int) read.size()/2)
00901 {
00902 printf("large mismatch difference, trying reverse strand: ");
00903 std::string reverseRead = read;
00904 std::string reverseQuality = quality;
00905 getReverseRead(reverseRead);
00906 reverse(reverseQuality.begin(), reverseQuality.end());
00907 rc = debugPrintReadValidation(reverseRead, reverseQuality, readLocation, sumQuality, mismatchCount, false);
00908 }
00909 return rc;
00910 }
00911 #undef ABS
00912
00913
00914 bool GenomeSequence::wordMatch(unsigned int index, std::string &word) const
00915 {
00916 for (uint32_t i = 0; i<word.size(); i++)
00917 {
00918 if ((*this)[index + i] != word[i]) return false;
00919 }
00920 return true;
00921 }
00922
00923 bool GenomeSequence::printNearbyWords(unsigned int index, unsigned int deviation, std::string &word) const
00924 {
00925 for (unsigned int i = index - deviation; i < index + deviation; i++)
00926 {
00927 if (wordMatch(i, word))
00928 {
00929 std::cerr << "word '"
00930 << word
00931 << "' found "
00932 << index - i
00933 << " away from position "
00934 << index
00935 << "."
00936 << std::endl;
00937 }
00938 }
00939 return false;
00940 }
00941
00942 void GenomeSequence::dumpSequenceSAMDictionary(std::ostream &file) const
00943 {
00944 for (unsigned int i=0; i<header->_chromosomeCount; i++)
00945 {
00946 file
00947 << "@SQ"
00948 << "\tSN:" << header->_chromosomes[i].name
00949 << "\tLN:" << header->_chromosomes[i].size
00950 << "\tAS:" << header->_chromosomes[i].assemblyID
00951 << "\tM5:" << header->_chromosomes[i].md5
00952 << "\tUR:" << header->_chromosomes[i].uri
00953 << "\tSP:" << header->_chromosomes[i].species
00954 << std::endl;
00955 }
00956 }
00957
00958 void GenomeSequence::dumpHeaderTSV(std::ostream &file) const
00959 {
00960 file << "# Reference: " << _baseFilename << std::endl;
00961 file << "# SN: sample name - must be unique" << std::endl;
00962 file << "# AS: assembly name" << std::endl;
00963 file << "# SP: species" << std::endl;
00964 file << "# LN: chromosome/contig length" << std::endl;
00965 file << "# M5: chromosome/contig MD5 checksum" << std::endl;
00966 file << "# LN and M5 are only printed for informational purposes." << std::endl;
00967 file << "# Karma will only set those values when creating the index." << std::endl;
00968 file << "SN" << "\t" << "AS" << "\t" << "SP" << "\t" << "UR" << "\t" << "LN" << "\t" << "M5" << std::endl;
00969 for (unsigned int i=0; i<header->_chromosomeCount; i++)
00970 {
00971 file
00972 << header->_chromosomes[i].name
00973 << "\t" << header->_chromosomes[i].assemblyID
00974 << "\t" << header->_chromosomes[i].uri
00975 << "\t" << header->_chromosomes[i].species
00976 << "\t" << header->_chromosomes[i].size
00977 << "\t" << header->_chromosomes[i].md5
00978 << std::endl;
00979 }
00980 }
00981
00982
00983 void GenomeSequence::getString(std::string &str, int chromosome, genomeIndex_t index, int baseCount) const
00984 {
00985
00986
00987
00988 genomeIndex_t genomeIndex = header->_chromosomes[chromosome].start + index - 1;
00989
00990 getString(str, genomeIndex, baseCount);
00991 }
00992
00993 void GenomeSequence::getString(String &str, int chromosome, genomeIndex_t index, int baseCount) const
00994 {
00995 std::string string;
00996 this-> getString(string, chromosome, index, baseCount);
00997 str = string.c_str();
00998 }
00999
01000 void GenomeSequence::getString(std::string &str, genomeIndex_t index, int baseCount) const
01001 {
01002 str.clear();
01003 if (baseCount > 0)
01004 {
01005 for (int i=0; i<baseCount; i++)
01006 {
01007 str.push_back((*this)[index + i]);
01008 }
01009 }
01010 else
01011 {
01012
01013
01014 for (int i=0; i< -baseCount; i++)
01015 {
01016 str.push_back(base2complement[(int)(*this)[index + i]]);
01017 }
01018 }
01019 }
01020
01021 void GenomeSequence::getString(String &str, genomeIndex_t index, int baseCount) const
01022 {
01023 std::string string;
01024 getString(string, index, baseCount);
01025 str = string.c_str();
01026 }
01027
01028 void GenomeSequence::getHighLightedString(std::string &str, genomeIndex_t index, int baseCount, genomeIndex_t highLightStart, genomeIndex_t highLightEnd) const
01029 {
01030 str.clear();
01031 if (baseCount > 0)
01032 {
01033 for (int i=0; i<baseCount; i++)
01034 {
01035 char base = (*this)[index + i];
01036 if (in(index+i, highLightStart, highLightEnd))
01037 base = tolower(base);
01038 str.push_back(base);
01039 }
01040 }
01041 else
01042 {
01043
01044
01045 for (int i=0; i< -baseCount; i++)
01046 {
01047 char base = base2complement[(int)(*this)[index + i]];
01048 if (in(index+i, highLightStart, highLightEnd))
01049 base = tolower(base);
01050 str.push_back(base);
01051 }
01052 }
01053 }
01054
01055 void GenomeSequence::print30(genomeIndex_t index) const
01056 {
01057 std::cout << "index: " << index << "\n";
01058 for (genomeIndex_t i=index-30; i<index+30; i++)
01059 std::cout << (*this)[i];
01060 std::cout << "\n";
01061 for (genomeIndex_t i=index-30; i<index; i++)
01062 std::cout << " ";
01063 std::cout << "^";
01064 std::cout << std::endl;
01065 }
01066
01067 void GenomeSequence::getMismatchHatString(std::string &result, const std::string &read, genomeIndex_t location) const
01068 {
01069 result = "";
01070 for (uint32_t i=0; i < read.size(); i++)
01071 {
01072 if (read[i] == (*this)[location+i])
01073 result.push_back(' ');
01074 else
01075 result.push_back('^');
01076 }
01077 }
01078
01079 void GenomeSequence::getMismatchString(std::string &result, const std::string &read, genomeIndex_t location) const
01080 {
01081 result = "";
01082 for (uint32_t i=0; i < read.size(); i++)
01083 {
01084 if (read[i] == (*this)[location+i])
01085 result.push_back(toupper(read[i]));
01086 else
01087 result.push_back(tolower(read[i]));
01088 }
01089 }
01090
01091 genomeIndex_t GenomeSequence::simpleLocalAligner(std::string &read, std::string &quality, genomeIndex_t index, int windowSize) const
01092 {
01093 int bestScore = 1000000;
01094 genomeIndex_t bestMatchLocation = INVALID_GENOME_INDEX;
01095 for (int i=-windowSize; i<windowSize; i++)
01096 {
01097 int newScore;
01098
01099 if (i<0 && ((uint32_t) -i) > index) continue;
01100 if (index + i + read.size() >= getNumberBases()) continue;
01101
01102 if (quality=="")
01103 {
01104 newScore = this->getMismatchCount(read, index + i);
01105 }
01106 else
01107 {
01108 newScore = this->getSumQ(read, quality, index + i);
01109 }
01110 if (newScore < bestScore)
01111 {
01112 bestScore = newScore;
01113 bestMatchLocation = index + i;
01114 }
01115 }
01116 return bestMatchLocation;
01117 }
01118
01119 std::ostream &operator << (std::ostream &stream, genomeSequenceMmapHeader &h)
01120 {
01121 stream << (MemoryMapArrayHeader &) h;
01122 stream << "chromosomeCount: " << h._chromosomeCount << std::endl;
01123 stream << "isColorSpace: " << h._colorSpace << std::endl;
01124 stream << "chromosomeCount: " << h._chromosomeCount << std::endl;
01125 uint64_t totalSize = 0;
01126 for (uint32_t i=0; i < h._chromosomeCount; i++)
01127 {
01128 totalSize += h._chromosomes[i].size;
01129 stream << "Chromosome Index " << i << " name: " << h._chromosomes[i].name << std::endl;
01130 stream << "Chromosome Index " << i << " whole genome start: " << h._chromosomes[i].start << std::endl;
01131 stream << "Chromosome Index " << i << " whole genome size: " << h._chromosomes[i].size << std::endl;
01132 stream << "Chromosome Index " << i << " md5 checksum: " << h._chromosomes[i].md5 << std::endl;
01133 stream << "Chromosome Index " << i << " assemblyID: " << h._chromosomes[i].assemblyID << std::endl;
01134 stream << "Chromosome Index " << i << " species: " << h._chromosomes[i].species << std::endl;
01135 stream << "Chromosome Index " << i << " URI: " << h._chromosomes[i].uri << std::endl;
01136 }
01137 stream << "Total Genome Size: " << totalSize << " bases."<< std::endl;
01138 if (totalSize != h.elementCount)
01139 {
01140 stream << "Total Genome Size: does not match elementCount!\n";
01141 }
01142
01143 stream << std::endl;
01144 return stream;
01145 }
01146
01147 void GenomeSequence::getChromosomeAndIndex(std::string &s, genomeIndex_t i) const
01148 {
01149 int whichChromosome = 0;
01150
01151 whichChromosome = getChromosome(i);
01152
01153 if (whichChromosome == INVALID_CHROMOSOME_INDEX)
01154 {
01155 s = "invalid genome index";
01156 }
01157 else
01158 {
01159 std::ostringstream buf;
01160 genomeIndex_t chromosomeIndex = i - getChromosomeStart(whichChromosome) + 1;
01161 buf << header->_chromosomes[whichChromosome].name << ":" << chromosomeIndex;
01162 #if 0
01163 buf << " (GenomeIndex " << i << ")";
01164 #endif
01165 s = buf.str();
01166 }
01167 return;
01168 }
01169
01170
01171 void GenomeSequence::getChromosomeAndIndex(String& s, genomeIndex_t i) const
01172 {
01173 std::string ss;
01174 getChromosomeAndIndex(ss, i);
01175 s = ss.c_str();
01176 return;
01177 }
01178
01179
01180
01181
01182
01183
01184
01185
01186
01187
01188
01189
01190
01191
01192
01193
01194
01195 bool GenomeSequence::populateDBSNP(
01196 mmapArrayBool_t &dbSNP,
01197 std::ifstream &inputFile) const
01198 {
01199 std::string inputLine;
01200
01201 assert(dbSNP.getElementCount() == getNumberBases());
01202
01203 std::string chromosomeName;
01204 genomeIndex_t chromosomePosition1;
01205 uint64_t ignoredLineCount = 0;
01206
01207 while (getline(inputFile, inputLine))
01208 {
01209 std::istringstream inputLine2(inputLine);
01210 genomeIndex_t genomeIndex;
01211
01212 inputLine2 >> chromosomeName;
01213 inputLine2 >> chromosomePosition1;
01214
01215
01216 if (chromosomeName.size()>0 && chromosomeName[0]=='#')
01217 {
01218 continue;
01219 }
01220
01221 #if 0
01222
01223
01224
01225
01226
01227 if (!isdigit(chromosomeName[0])) continue;
01228 #endif
01229
01230 genomeIndex = getGenomePosition(chromosomeName.c_str(), chromosomePosition1);
01231
01232
01233 if (genomeIndex == INVALID_GENOME_INDEX)
01234 {
01235 ignoredLineCount++;
01236 continue;
01237 }
01238
01239 dbSNP.set(genomeIndex, true);
01240 }
01241
01242 inputFile.close();
01243
01244 if (ignoredLineCount > 0)
01245 {
01246 std::cerr << "GenomeSequence::populateDBSNP: ignored " << ignoredLineCount << " SNP positions due to invalid format of line." << std::endl;
01247 return false;
01248 }
01249 return true;
01250 }
01251
01252 bool GenomeSequence::loadDBSNP(
01253 mmapArrayBool_t &dbSNP,
01254 const char *inputFileName) const
01255 {
01256
01257
01258
01259
01260
01261
01262
01263
01264
01265
01266
01267
01268
01269
01270 if (strlen(inputFileName)!=0)
01271 {
01272 std::cerr << "Load dbSNP file '" << inputFileName << "': " << std::flush;
01273
01274 if (dbSNP.open(inputFileName, O_RDONLY))
01275 {
01276
01277
01278
01279
01280
01281
01282 if (dbSNP.getErrorString().find("wrong type of file")==std::string::npos)
01283 {
01284 std::cerr << "Error: " << dbSNP.getErrorString() << std::endl;
01285 exit(1);
01286 }
01287
01288
01289
01290 std::ifstream inputFile;
01291 inputFile.open(inputFileName);
01292 if (inputFile.fail())
01293 {
01294 std::cerr << "Error: failed to open " << inputFileName << std::endl;
01295 exit(1);
01296 }
01297
01298 std::cerr << "(as text file) ";
01299
01300
01301 dbSNP.create(getNumberBases());
01302
01303
01304 populateDBSNP(dbSNP, inputFile);
01305 inputFile.close();
01306
01307 }
01308 else
01309 {
01310 std::cerr << "(as binary mapped file) ";
01311 }
01312
01313 std::cerr << "DONE!" << std::endl;
01314 return false;
01315 }
01316 else
01317 {
01318 return true;
01319 }
01320 }
01321
01322
01323 #if defined(TEST)
01324
01325 void simplestExample(void)
01326 {
01327 GenomeSequence reference;
01328 genomeIndex_t index;
01329
01330
01331
01332
01333
01334
01335
01336 if (reference.open())
01337 {
01338 perror("GenomeSequence::open");
01339 exit(1);
01340 }
01341
01342
01343 index = 1000000000;
01344
01345
01346
01347
01348
01349
01350
01351
01352
01353 std::cout << "base[" << index << "] = " << reference[index] << std::endl;
01354
01355
01356
01357
01358
01359 int chr = reference.getChromosome(index);
01360 genomeIndex_t chrIndex = index - reference.getChromosomeStart(chr) + 1;
01361
01362 std::cout << "genome index " << index << " corresponds to chromosome " << chr << " position " << chrIndex << std::endl;
01363
01364
01365
01366
01367
01368 const char *chromosomeName = "5";
01369 chr = reference.getChromosome(chromosomeName);
01370 chrIndex = 100000;
01371
01372 index = reference.getChromosomeStart(chr) + chrIndex - 1;
01373
01374 std::cout << "Chromosome '" << chromosomeName << "' position " << chrIndex << " corresponds to genome index position " << index << std::endl;
01375
01376 reference.close();
01377 }
01378
01379 void testGenomeSequence(void)
01380 {
01381 GenomeSequence reference;
01382
01383 #if 0
01384 std::string referenceName = "someotherreference";
01385 if (reference.setFastaName(referenceName))
01386 {
01387 std::cerr << "failed to open reference file "
01388 << referenceName
01389 << std::endl;
01390 exit(1);
01391 }
01392 #endif
01393
01394 std::cerr << "open and prefetch the reference genome: ";
01395
01396
01397 if (reference.open())
01398 {
01399 exit(1);
01400 }
01401 std::cerr << "done!" << std::endl;
01402
01403
01404
01405
01406 genomeIndex_t genomeIndex;
01407 unsigned int chromosomeIndex;
01408 unsigned int chromosome;
01409 std::string chromosomeName;
01410
01411
01412
01413
01414
01415 chromosomeName = "2";
01416 chromosomeIndex = 1234567;
01417
01418 genomeIndex = reference.getGenomePosition(chromosomeName.c_str(), chromosomeIndex);
01419 assert(genomeIndex!=INVALID_GENOME_INDEX);
01420 std::cout << "Chromosome " << chromosomeName << ", index ";
01421 std::cout << chromosomeIndex << " contains base " << reference[genomeIndex];
01422 std::cout << " at genome index position " << genomeIndex << std::endl;
01423
01424
01425
01426
01427
01428
01429
01430 chromosome = reference.getChromosome(genomeIndex);
01431 unsigned int newChromosomeIndex;
01432
01433 newChromosomeIndex = genomeIndex - reference.getChromosomeStart(chromosome) + 1;
01434
01435 assert(chromosomeIndex == newChromosomeIndex);
01436
01437
01438
01439 PackedRead pr;
01440
01441 pr.set("ATCGATCG", 0);
01442 assert(pr.size()==8);
01443 assert(pr[0]==GenomeSequence::base2int[(int) 'A']);
01444 assert(pr[1]==GenomeSequence::base2int[(int) 'T']);
01445 assert(pr[2]==GenomeSequence::base2int[(int) 'C']);
01446 assert(pr[3]==GenomeSequence::base2int[(int) 'G']);
01447 pr.set("ATCGATCG", 1);
01448 assert(pr.size()==9);
01449 pr.set("", 0);
01450 assert(pr.size()==0);
01451 pr.set("", 1);
01452 assert(pr.size()==1);
01453 pr.set("", 2);
01454 assert(pr.size()==2);
01455 pr.set("", 3);
01456 assert(pr.size()==3);
01457 assert(pr[0]==GenomeSequence::base2int[(int) 'N']);
01458 assert(pr[1]==GenomeSequence::base2int[(int) 'N']);
01459 assert(pr[2]==GenomeSequence::base2int[(int) 'N']);
01460 pr.set("C", 1);
01461 assert(pr.size()==2);
01462 assert(pr[0]==GenomeSequence::base2int[(int) 'N']);
01463 assert(pr[1]==GenomeSequence::base2int[(int) 'C']);
01464
01465 }
01466
01467
01468
01469
01470
01471
01472
01473
01474 int main(int argc, const char **argv)
01475 {
01476 #if 1
01477 simplestExample();
01478 #else
01479 testGenomeSequence();
01480 #endif
01481 exit(0);
01482 }
01483
01484 #endif