00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018 #include "assert.h"
00019 #include "ctype.h"
00020 #include "stdio.h"
00021 #include "Error.h"
00022
00023
00024 #include "Generic.h"
00025 #include "GenomeSequence.h"
00026
00027 #include <algorithm>
00028 #include <istream>
00029 #include <fstream>
00030 #include <sstream>
00031 #include <stdexcept>
00032
00033
00034 #include "CSG_MD5.h"
00035
00036
00037
00038
00039
00040
00041
00042
00043
00044
00045
00046
00047
00048
00049 void PackedRead::set(const char *rhs, int padWithNCount)
00050 {
00051 clear();
00052
00053
00054 while (padWithNCount>1)
00055 {
00056 packedBases.push_back(
00057 BaseAsciiMap::base2int[(int) 'N'] << 4 |
00058 BaseAsciiMap::base2int[(int) 'N']
00059 );
00060 padWithNCount -= 2;
00061 length+=2;
00062 }
00063
00064
00065
00066 if (padWithNCount)
00067 {
00068
00069
00070 packedBases.push_back(
00071 BaseAsciiMap::base2int[(int) *rhs] << 4 |
00072 BaseAsciiMap::base2int[(int) 'N']
00073 );
00074
00075 if (*rhs)
00076 {
00077 length+=2;
00078 rhs++;
00079 }
00080 else
00081 {
00082 length++;
00083 }
00084 padWithNCount--;
00085 assert(padWithNCount==0);
00086 }
00087
00088
00089 while (*rhs && *(rhs+1))
00090 {
00091 packedBases.push_back(
00092 BaseAsciiMap::base2int[(int) *(rhs+1)] << 4 |
00093 BaseAsciiMap::base2int[(int) *(rhs+0)]
00094 );
00095 rhs+=2;
00096 length+=2;
00097 }
00098
00099
00100
00101 if (*rhs)
00102 {
00103 packedBases.push_back(
00104 BaseAsciiMap::base2int[(int) *(rhs+0)]
00105 );
00106 length++;
00107 }
00108 return;
00109 }
00110
00111 std::string GenomeSequence::IntegerToSeq(unsigned int n, unsigned int wordsize) const
00112 {
00113 std::string sequence("");
00114 for (unsigned int i = 0; i < wordsize; i ++)
00115 sequence += "N";
00116
00117 unsigned int clearHigherBits = ~(3U << (wordsize<<1));
00118
00119 if (n > clearHigherBits)
00120 error("%d needs to be a non-negative integer < clearHigherBits\n", n);
00121
00122 for (unsigned int i = 0; i < wordsize; i++)
00123 {
00124 sequence[wordsize-1-i] = BaseAsciiMap::int2base[n & 3];
00125 n >>= 2;
00126 }
00127 return sequence;
00128 }
00129
00130
00131
00132 GenomeSequence::GenomeSequence()
00133 {
00134 constructorClear();
00135 }
00136
00137 void GenomeSequence::constructorClear()
00138 {
00139 _debugFlag = 0;
00140 _progressStream = NULL;
00141 _colorSpace = false;
00142 _createOverwrite = false;
00143 }
00144
00145 void GenomeSequence::setup(const char *referenceFilename)
00146 {
00147 setReferenceName(referenceFilename);
00148
00149 if (_progressStream) *_progressStream << "open and prefetch reference genome " << referenceFilename << ": " << std::flush;
00150
00151 if (open(false))
00152 {
00153 std::cerr << "Failed to open reference genome " << referenceFilename << std::endl;
00154 std::cerr << errorStr << std::endl;
00155 exit(1);
00156 }
00157
00158 prefetch();
00159 if (_progressStream) *_progressStream << "done." << std::endl << std::flush;
00160 }
00161
00162 GenomeSequence::~GenomeSequence()
00163 {
00164
00165 _umfaFile.close();
00166 }
00167
00168
00169
00170
00171
00172
00173
00174
00175
00176
00177
00178 bool GenomeSequence::open(bool isColorSpace, int flags)
00179 {
00180 bool rc;
00181
00182 if (isColorSpace)
00183 {
00184 _umfaFilename = _baseFilename + "-cs.umfa";
00185 }
00186 else
00187 {
00188 _umfaFilename = _baseFilename + "-bs.umfa";
00189 }
00190
00191 if(access(_umfaFilename.c_str(), R_OK) != 0)
00192 {
00193
00194 if(create(isColorSpace))
00195 {
00196
00197 std::cerr << "GenomeSequence::open: failed to open file "
00198 << _umfaFilename
00199 << " also failed creating it."
00200 << std::endl;
00201 return true;
00202 }
00203 }
00204
00205 rc = genomeSequenceArray::open(_umfaFilename.c_str(), flags);
00206 if (rc)
00207 {
00208 std::cerr << "GenomeSequence::open: failed to open file "
00209 << _umfaFilename
00210 << std::endl;
00211 return true;
00212 }
00213
00214 _colorSpace = header->_colorSpace;
00215
00216 return false;
00217 }
00218
00219 void GenomeSequence::sanityCheck(MemoryMap &fasta) const
00220 {
00221 unsigned int i;
00222
00223 unsigned int genomeIndex = 0;
00224 for (i=0; i<fasta.length(); i++)
00225 {
00226 switch (fasta[i])
00227 {
00228 case '>':
00229 while (fasta[i]!='\n' && fasta[i]!='\r') i++;
00230 break;
00231 case '\n':
00232 case '\r':
00233 break;
00234 default:
00235 assert(BaseAsciiMap::base2int[(int)(*this)[genomeIndex]] ==
00236 BaseAsciiMap::base2int[(int) fasta[i]]);
00237 #if 0
00238 if(BaseAsciiMap::base2int[(*this)[genomeIndex]] !=
00239 BaseAsciiMap::base2int[(int) fasta[i]])
00240 {
00241 int bp1 = BaseAsciiMap::base2int[(*this)[genomeIndex]];
00242 int bp2 = BaseAsciiMap::base2int[fasta[i]];
00243 printf("failing at genome index = %u, fasta index = %u.\n",
00244 genomeIndex,i);
00245 }
00246 #endif
00247 genomeIndex++;
00248 break;
00249 }
00250 }
00251 }
00252
00253 #define HAS_SUFFIX(str, suffix) ((strlen(suffix) < str.size()) && (str.substr(str.size() - strlen(suffix)) == suffix))
00254
00255
00256
00257
00258
00259
00260 bool GenomeSequence::setReferenceName(std::string referenceFilename)
00261 {
00262
00263 if (HAS_SUFFIX(referenceFilename, ".fa"))
00264 {
00265 _referenceFilename = referenceFilename;
00266 _baseFilename = _referenceFilename.substr(0, referenceFilename.size() - 3);
00267 }
00268 else if (HAS_SUFFIX(referenceFilename, ".umfa"))
00269 {
00270 _baseFilename = referenceFilename.substr(0, referenceFilename.size() - 5);
00271 }
00272 else if (HAS_SUFFIX(referenceFilename, "-cs.umfa"))
00273 {
00274 _baseFilename = referenceFilename.substr(0, referenceFilename.size() - 8);
00275 }
00276 else if (HAS_SUFFIX(referenceFilename, "-bs.umfa"))
00277 {
00278 _baseFilename = referenceFilename.substr(0, referenceFilename.size() - 8);
00279 }
00280 else
00281 {
00282 _baseFilename = referenceFilename;
00283 }
00284 _fastaFilename = _baseFilename + ".fa";
00285
00286 if (HAS_SUFFIX(referenceFilename, ".fasta"))
00287 {
00288 _referenceFilename = referenceFilename;
00289 _baseFilename = _referenceFilename.substr(0, referenceFilename.size() - 6);
00290 _fastaFilename = _baseFilename + ".fasta";
00291 }
00292
00293 return false;
00294 }
00295
00296
00297
00298
00299
00300
00301
00302
00303
00304
00305 bool GenomeSequence::setChromosomeMD5andLength(uint32_t whichChromosome)
00306 {
00307 if (whichChromosome>=header->_chromosomeCount) return true;
00308
00309 ChromosomeInfo *c = &header->_chromosomes[whichChromosome];
00310 c->size = header->elementCount - c->start;
00311
00312 MD5_CTX md5Context;
00313 uint8_t md5Signature[MD5_DIGEST_LENGTH];
00314
00315
00316
00317
00318
00319 char *md5Buffer = (char *) malloc(c->size);
00320
00321 MD5Init(&md5Context);
00322
00323 for (genomeIndex_t i = 0; i < c->size; i ++)
00324 {
00325 md5Buffer[i] = (*this)[c->start + i];
00326 }
00327 MD5Update(&md5Context, (unsigned char *) md5Buffer, c->size);
00328 MD5Final((unsigned char *) &md5Signature, &md5Context);
00329 free(md5Buffer);
00330 for (int i=0; i<MD5_DIGEST_LENGTH; i++)
00331 {
00332
00333 sprintf(c->md5+2*i, "%02x", md5Signature[i]);
00334 }
00335
00336
00337 c->md5[2*MD5_DIGEST_LENGTH] = '\0';
00338
00339 return false;
00340 }
00341
00342
00343
00344
00345
00346 static bool getFastaStats(const char *fastaData, size_t fastaDataSize, uint32_t &chromosomeCount, uint64_t &baseCount)
00347 {
00348 chromosomeCount = 0;
00349 baseCount = 0;
00350 bool atLineStart = true;
00351
00352
00353
00354
00355
00356 for (size_t fastaIndex = 0; fastaIndex < fastaDataSize; fastaIndex++)
00357 {
00358 switch (fastaData[fastaIndex])
00359 {
00360 case '\n':
00361 case '\r':
00362 atLineStart = true;
00363 break;
00364 case '>':
00365 {
00366 if (!atLineStart) break;
00367 chromosomeCount++;
00368
00369
00370
00371 while (fastaIndex < fastaDataSize && fastaData[fastaIndex]!='\n' && fastaData[fastaIndex]!='\r')
00372 {
00373 fastaIndex++;
00374 }
00375 break;
00376 }
00377 default:
00378 baseCount++;
00379 atLineStart = false;
00380 break;
00381 }
00382
00383 }
00384 return false;
00385 }
00386
00387 class PackedSequenceData : public std::vector<uint8_t>
00388 {
00389 std::vector<uint8_t> m_packedBases;
00390
00391 size_t m_baseCount;
00392
00393 void set(size_t index, uint8_t value) {
00394 m_packedBases[index>>1] =
00395 (m_packedBases[index>>1]
00396 & ~(7<<((index&0x01)<<2)))
00397 | ((value&0x0f)<<((index&0x1)<<2));
00398 }
00399
00400 public:
00401
00402 void reserve(size_t baseCount) {m_packedBases.reserve(baseCount/2);}
00403 size_t size() {return m_baseCount;}
00404 void clear() {m_packedBases.clear(); m_baseCount = 0;}
00405 uint8_t operator [](size_t index)
00406 {
00407 return (m_packedBases[index>>1] >> ((index&0x1)<<2)) & 0xf;
00408 }
00409 void push_back(uint8_t base);
00410 };
00411
00412
00413
00414
00415
00416
00417
00418
00419
00420
00421
00422
00423
00424
00425
00426 bool loadFastaFile(const char *filename,
00427 std::vector<PackedSequenceData> &sequenceData,
00428 std::vector<std::string> &chromosomeNames)
00429 {
00430 InputFile inputStream(filename, "r", InputFile::DEFAULT);
00431
00432 if(!inputStream.isOpen()) {
00433 std::cerr << "Failed to open file " << filename << "\n";
00434 return true;
00435 }
00436
00437 int whichChromosome = -1;
00438 chromosomeNames.clear();
00439
00440 char ch;
00441 while((ch = inputStream.ifgetc()) != EOF) {
00442 switch (ch)
00443 {
00444 case '\n':
00445 case '\r':
00446 break;
00447 case '>':
00448 {
00449 std::string chromosomeName = "";
00450
00451
00452
00453 while (!isspace((ch = inputStream.ifgetc())) && ch != EOF)
00454 {
00455 chromosomeName += ch;
00456 }
00457
00458
00459
00460 do {
00461 ch = inputStream.ifgetc();
00462 } while(ch != EOF && ch != '\n' && ch != '\r');
00463
00464
00465
00466
00467 chromosomeNames.push_back(chromosomeName);
00468
00469 whichChromosome++;
00470
00471 break;
00472 }
00473 default:
00474
00475
00476
00477
00478
00479 #if 0
00480 if (isColorSpace())
00481 {
00482
00483
00484
00485
00486
00487 const char fromBase2CS[] =
00488 {
00489 0,
00490 1,
00491 2,
00492 3,
00493 1,
00494 0,
00495 3,
00496 2,
00497 2,
00498 3,
00499 0,
00500 1,
00501 3,
00502 2,
00503 1,
00504 0,
00505 };
00506
00507
00508
00509
00510
00511
00512
00513
00514 char thisBase =
00515 BaseAsciiMap::base2int[(int)(fasta[fastaIndex])];
00516 if (lastBase>=0)
00517 {
00518 char color;
00519 if (lastBase>3 || thisBase>3) color=4;
00520 else color = fromBase2CS[(int)(lastBase<<2 | thisBase)];
00521
00522
00523 set(header->elementCount++,
00524 BaseAsciiMap::int2base[(int) color]);
00525 }
00526 lastBase = thisBase;
00527 }
00528 else
00529 {
00530 set(header->elementCount++, toupper(fasta[fastaIndex]));
00531 }
00532 #endif
00533 break;
00534 }
00535 }
00536 return false;
00537 }
00538
00539
00540
00541
00542
00543
00544
00545
00546
00547
00548 bool GenomeSequence::create(bool isColor)
00549 {
00550 setColorSpace(isColor);
00551
00552 if (_baseFilename=="")
00553 {
00554 std::cerr << "Base reference filename is empty." << std::endl;
00555 return true;
00556 }
00557
00558 if (isColorSpace())
00559 {
00560 _umfaFilename = _baseFilename + "-cs.umfa";
00561 }
00562 else
00563 {
00564 _umfaFilename = _baseFilename + "-bs.umfa";
00565 }
00566
00567 if (!_createOverwrite && access(_umfaFilename.c_str(), R_OK) == 0)
00568 {
00569 std::cerr << "Output file '" << _umfaFilename << "' exists or is not writable - please remove." << std::endl;
00570 return true;
00571 }
00572
00573 MemoryMap fastaFile;
00574
00575 if (fastaFile.open(_fastaFilename.c_str()))
00576 {
00577 std::cerr << "failed to open input fasta file '" << _fastaFilename << "'." << std::endl;
00578 return true;
00579 }
00580
00581 std::cerr << "Creating FASTA "
00582 << (isColorSpace() ? "color space " : "")
00583 << "binary cache file '"
00584 << _umfaFilename
00585 << "'."
00586 << std::endl;
00587
00588 std::cerr << std::flush;
00589
00590
00591
00592
00593
00594 const char *fasta = (const char *) fastaFile.data;
00595 size_t fastaDataSize = fastaFile.length();
00596
00597 uint32_t chromosomeCount = 0;
00598 uint64_t baseCount = 0;
00599 getFastaStats(fasta, fastaDataSize, chromosomeCount, baseCount);
00600
00601 if (genomeSequenceArray::create(_umfaFilename.c_str(), baseCount, chromosomeCount))
00602 {
00603 std::cerr << "failed to create '"
00604 << _umfaFilename
00605 << "'."
00606 << std::endl;
00607 perror("");
00608 return true;
00609 }
00610 header->elementCount = 0;
00611 header->_colorSpace = isColorSpace();
00612 header->setApplication(_application.c_str());
00613 header->_chromosomeCount = chromosomeCount;
00614
00615
00616
00617 for (uint32_t i=0; i<header->_chromosomeCount; i++) header->_chromosomes[i].constructorClear();
00618
00619 std::string chromosomeName;
00620
00621
00622
00623 signed char lastBase = BaseAsciiMap::base2int[(int) 'N'];
00624 bool terminateLoad = false;
00625 int percent = -1, newPercent;
00626 uint32_t whichChromosome = 0;
00627 for (uint64_t fastaIndex = 0; fastaIndex < fastaDataSize; fastaIndex++)
00628 {
00629 if (_progressStream)
00630 {
00631 newPercent = (int) (1.0 * fastaIndex / fastaDataSize) * 100;
00632 if (newPercent>percent)
00633 {
00634 *_progressStream << "\r" << newPercent << "% ";
00635 *_progressStream << std::flush;
00636 percent = newPercent;
00637 }
00638 }
00639 switch (fasta[fastaIndex])
00640 {
00641 case '\n':
00642 case '\r':
00643 break;
00644 case '>':
00645 {
00646 chromosomeName = "";
00647 fastaIndex++;
00648
00649
00650
00651 while (!isspace(fasta[fastaIndex]))
00652 {
00653 chromosomeName += fasta[fastaIndex++];
00654 }
00655
00656
00657
00658 while (fasta[fastaIndex]!='\n' && fasta[fastaIndex]!='\r')
00659 {
00660 fastaIndex++;
00661 }
00662
00663
00664
00665
00666 ChromosomeInfo *c = &header->_chromosomes[whichChromosome];
00667 c->setChromosomeName(chromosomeName.c_str());
00668 c->start = header->elementCount;
00669
00670
00671 if (whichChromosome>0)
00672 {
00673
00674
00675
00676
00677
00678
00679 setChromosomeMD5andLength(whichChromosome - 1);
00680 }
00681 whichChromosome++;
00682 if (whichChromosome > header->_chromosomeCount)
00683 {
00684 std::cerr << "BUG: Exceeded computed chromosome count ("
00685 << header->_chromosomeCount
00686 << ") - genome is now truncated at chromosome "
00687 << header->_chromosomes[header->_chromosomeCount-1].name
00688 << " (index "
00689 << header->_chromosomeCount
00690 << ")."
00691 << std::endl;
00692 terminateLoad = true;
00693 }
00694 break;
00695 }
00696 default:
00697
00698
00699
00700 if (isColorSpace())
00701 {
00702
00703
00704
00705
00706
00707 const char fromBase2CS[] =
00708 {
00709 0,
00710 1,
00711 2,
00712 3,
00713 1,
00714 0,
00715 3,
00716 2,
00717 2,
00718 3,
00719 0,
00720 1,
00721 3,
00722 2,
00723 1,
00724 0,
00725 };
00726
00727
00728
00729
00730
00731
00732
00733
00734 char thisBase =
00735 BaseAsciiMap::base2int[(int)(fasta[fastaIndex])];
00736 if (lastBase>=0)
00737 {
00738 char color;
00739 if (lastBase>3 || thisBase>3) color=4;
00740 else color = fromBase2CS[(int)(lastBase<<2 | thisBase)];
00741
00742
00743 set(header->elementCount++,
00744 BaseAsciiMap::int2base[(int) color]);
00745 }
00746 lastBase = thisBase;
00747 }
00748 else
00749 {
00750 set(header->elementCount++, toupper(fasta[fastaIndex]));
00751 }
00752 break;
00753 }
00754
00755
00756
00757
00758
00759 if (terminateLoad) break;
00760 }
00761
00762
00763
00764
00765
00766 if (whichChromosome==0)
00767 {
00768 fastaFile.close();
00769 throw std::runtime_error("No chromosomes found - aborting!");
00770 }
00771 else
00772 {
00773 setChromosomeMD5andLength(whichChromosome-1);
00774 }
00775
00776 #if 0
00777 sanityCheck(fastaFile);
00778 #endif
00779 fastaFile.close();
00780
00781 if (_progressStream) *_progressStream << "\r";
00782
00783 std::cerr << "FASTA binary cache file '"
00784 << _umfaFilename
00785 << "' created."
00786 << std::endl;
00787
00788
00789
00790
00791 return false;
00792 }
00793
00794 int GenomeSequence::getChromosomeCount() const
00795 {
00796 return header->_chromosomeCount;
00797 }
00798
00799
00800 int GenomeSequence::getChromosome(genomeIndex_t position) const
00801 {
00802 if (position == INVALID_GENOME_INDEX) return INVALID_CHROMOSOME_INDEX;
00803
00804 if (header->_chromosomeCount == 0)
00805 return INVALID_CHROMOSOME_INDEX;
00806
00807 int start = 0;
00808 int stop = header->_chromosomeCount - 1;
00809
00810
00811
00812
00813 if (position > header->_chromosomes[stop].start)
00814 return (stop);
00815
00816 while (start <= stop)
00817 {
00818 int middle = (start + stop) / 2;
00819
00820 if (position >= header->_chromosomes[middle].start && position < header->_chromosomes[middle + 1].start)
00821 return middle;
00822
00823 if (position == header->_chromosomes[middle + 1].start)
00824 return (middle + 1);
00825
00826 if (position > header->_chromosomes[middle + 1].start)
00827 start = middle + 1;
00828
00829 if (position < header->_chromosomes[middle].start)
00830 stop = middle - 1;
00831 }
00832
00833 return -1;
00834 }
00835
00836
00837
00838
00839
00840
00841
00842 genomeIndex_t GenomeSequence::getGenomePosition(
00843 const char *chromosomeName,
00844 unsigned int chromosomeIndex) const
00845 {
00846 genomeIndex_t i = getGenomePosition(chromosomeName);
00847 if (i == INVALID_GENOME_INDEX) return INVALID_GENOME_INDEX;
00848 return i + chromosomeIndex - 1;
00849 }
00850
00851 genomeIndex_t GenomeSequence::getGenomePosition(
00852 int chromosome,
00853 unsigned int chromosomeIndex) const
00854 {
00855 if (chromosome<0 || chromosome >= (int) header->_chromosomeCount) return INVALID_GENOME_INDEX;
00856
00857 genomeIndex_t i = header->_chromosomes[chromosome].start;
00858 if (i == INVALID_GENOME_INDEX) return INVALID_GENOME_INDEX;
00859 return i + chromosomeIndex - 1;
00860 }
00861
00862
00863
00864
00865
00866
00867
00868
00869
00870 genomeIndex_t GenomeSequence::getGenomePosition(const char *chromosomeName) const
00871 {
00872 int chromosome = getChromosome(chromosomeName);
00873 if (chromosome==INVALID_CHROMOSOME_INDEX) return INVALID_GENOME_INDEX;
00874 return header->_chromosomes[chromosome].start;
00875 }
00876
00877 int GenomeSequence::getChromosome(const char *chromosomeName) const
00878 {
00879 unsigned int i;
00880 for (i=0; i<header->_chromosomeCount; i++)
00881 {
00882 if (strcmp(header->_chromosomes[i].name, chromosomeName)==0)
00883 {
00884 return i;
00885 }
00886 }
00887 return INVALID_CHROMOSOME_INDEX;
00888 }
00889
00890
00891
00892
00893
00894 void GenomeSequence::getReverseRead(std::string &read)
00895 {
00896 std::string newRead;
00897 if (read.size()) for (int32_t i=(int) read.size() - 1; i>=0; i--)
00898 {
00899 newRead.push_back(BasePair(read[i]));
00900 }
00901 read = newRead;
00902 }
00903
00904 void GenomeSequence::getReverseRead(String& read)
00905 {
00906 int i = 0;
00907 int j = read.Length()-1;
00908 char temp;
00909 while (i < j)
00910 {
00911 temp = read[j];
00912 read[j] = read[i];
00913 read[i] = temp;
00914 }
00915 }
00916
00917 #define ABS(x) ( (x) > 0 ? (x) : -(x) )
00918 int GenomeSequence::debugPrintReadValidation(
00919 std::string &read,
00920 std::string &quality,
00921 char direction,
00922 genomeIndex_t readLocation,
00923 int sumQuality,
00924 int mismatchCount,
00925 bool recurse
00926 )
00927 {
00928 int validateSumQ = 0;
00929 int validateMismatchCount = 0;
00930 int rc = 0;
00931 std::string genomeData;
00932
00933 for (uint32_t i=0; i<read.size(); i++)
00934 {
00935 if (tolower(read[i]) != tolower((*this)[readLocation + i]))
00936 {
00937 validateSumQ += quality[i] - '!';
00938
00939 if (direction=='F' ? i<24 : (i >= (read.size() - 24))) validateMismatchCount++;
00940 genomeData.push_back(tolower((*this)[readLocation + i]));
00941 }
00942 else
00943 {
00944 genomeData.push_back(toupper((*this)[readLocation + i]));
00945 }
00946 }
00947 assert(validateSumQ>=0);
00948 if (validateSumQ != sumQuality && validateMismatchCount == mismatchCount)
00949 {
00950 printf("SUMQ: Original Genome: %s test read: %s : actual sumQ = %d, test sumQ = %d\n",
00951 genomeData.c_str(),
00952 read.c_str(),
00953 validateSumQ,
00954 sumQuality
00955 );
00956 rc++;
00957 }
00958 else if (validateSumQ == sumQuality && validateMismatchCount != mismatchCount)
00959 {
00960 printf("MISM: Original Genome: %s test read: %s : actual mismatch %d test mismatches %d\n",
00961 genomeData.c_str(),
00962 read.c_str(),
00963 validateMismatchCount,
00964 mismatchCount
00965 );
00966 rc++;
00967 }
00968 else if (validateSumQ != sumQuality && validateMismatchCount != mismatchCount)
00969 {
00970 printf("BOTH: Original Genome: %s test read: %s : actual sumQ = %d, test sumQ = %d, actual mismatch %d test mismatches %d\n",
00971 genomeData.c_str(),
00972 read.c_str(),
00973 validateSumQ,
00974 sumQuality,
00975 validateMismatchCount,
00976 mismatchCount
00977 );
00978 rc++;
00979 }
00980
00981 if (recurse && ABS(validateMismatchCount - mismatchCount) > (int) read.size()/2)
00982 {
00983 printf("large mismatch difference, trying reverse strand: ");
00984 std::string reverseRead = read;
00985 std::string reverseQuality = quality;
00986 getReverseRead(reverseRead);
00987 reverse(reverseQuality.begin(), reverseQuality.end());
00988 rc = debugPrintReadValidation(reverseRead, reverseQuality, readLocation, sumQuality, mismatchCount, false);
00989 }
00990 return rc;
00991 }
00992 #undef ABS
00993
00994
00995 bool GenomeSequence::wordMatch(unsigned int index, std::string &word) const
00996 {
00997 for (uint32_t i = 0; i<word.size(); i++)
00998 {
00999 if ((*this)[index + i] != word[i]) return false;
01000 }
01001 return true;
01002 }
01003
01004 bool GenomeSequence::printNearbyWords(unsigned int index, unsigned int deviation, std::string &word) const
01005 {
01006 for (unsigned int i = index - deviation; i < index + deviation; i++)
01007 {
01008 if (wordMatch(i, word))
01009 {
01010 std::cerr << "word '"
01011 << word
01012 << "' found "
01013 << i - index
01014 << " away from position "
01015 << index
01016 << "."
01017 << std::endl;
01018 }
01019 }
01020 return false;
01021 }
01022
01023 void GenomeSequence::dumpSequenceSAMDictionary(std::ostream &file) const
01024 {
01025 for (unsigned int i=0; i<header->_chromosomeCount; i++)
01026 {
01027 file
01028 << "@SQ"
01029 << "\tSN:" << header->_chromosomes[i].name
01030 << "\tLN:" << header->_chromosomes[i].size
01031 << "\tAS:" << header->_chromosomes[i].assemblyID
01032 << "\tM5:" << header->_chromosomes[i].md5
01033 << "\tUR:" << header->_chromosomes[i].uri
01034 << "\tSP:" << header->_chromosomes[i].species
01035 << std::endl;
01036 }
01037 }
01038
01039 void GenomeSequence::dumpHeaderTSV(std::ostream &file) const
01040 {
01041 file << "# Reference: " << _baseFilename << std::endl;
01042 file << "# SN: sample name - must be unique" << std::endl;
01043 file << "# AS: assembly name" << std::endl;
01044 file << "# SP: species" << std::endl;
01045 file << "# LN: chromosome/contig length" << std::endl;
01046 file << "# M5: chromosome/contig MD5 checksum" << std::endl;
01047 file << "# LN and M5 are only printed for informational purposes." << std::endl;
01048 file << "# Karma will only set those values when creating the index." << std::endl;
01049 file << "SN" << "\t" << "AS" << "\t" << "SP" << "\t" << "UR" << "\t" << "LN" << "\t" << "M5" << std::endl;
01050 for (unsigned int i=0; i<header->_chromosomeCount; i++)
01051 {
01052 file
01053 << header->_chromosomes[i].name
01054 << "\t" << header->_chromosomes[i].assemblyID
01055 << "\t" << header->_chromosomes[i].uri
01056 << "\t" << header->_chromosomes[i].species
01057 << "\t" << header->_chromosomes[i].size
01058 << "\t" << header->_chromosomes[i].md5
01059 << std::endl;
01060 }
01061 }
01062
01063
01064 void GenomeSequence::getString(std::string &str, int chromosome, uint32_t index, int baseCount) const
01065 {
01066
01067
01068
01069 genomeIndex_t genomeIndex = header->_chromosomes[chromosome].start + index - 1;
01070
01071 getString(str, genomeIndex, baseCount);
01072 }
01073
01074 void GenomeSequence::getString(String &str, int chromosome, uint32_t index, int baseCount) const
01075 {
01076 std::string string;
01077 this-> getString(string, chromosome, index, baseCount);
01078 str = string.c_str();
01079 }
01080
01081 void GenomeSequence::getString(std::string &str, genomeIndex_t index, int baseCount) const
01082 {
01083 str.clear();
01084 if (baseCount > 0)
01085 {
01086 for (int i=0; i<baseCount; i++)
01087 {
01088 str.push_back((*this)[index + i]);
01089 }
01090 }
01091 else
01092 {
01093
01094
01095 for (int i=0; i< -baseCount; i++)
01096 {
01097 str.push_back(BaseAsciiMap::base2complement[(int)(*this)[index + i]]);
01098 }
01099 }
01100 }
01101
01102 void GenomeSequence::getString(String &str, genomeIndex_t index, int baseCount) const
01103 {
01104 std::string string;
01105 getString(string, index, baseCount);
01106 str = string.c_str();
01107 }
01108
01109 void GenomeSequence::getHighLightedString(std::string &str, genomeIndex_t index, int baseCount, genomeIndex_t highLightStart, genomeIndex_t highLightEnd) const
01110 {
01111 str.clear();
01112 if (baseCount > 0)
01113 {
01114 for (int i=0; i<baseCount; i++)
01115 {
01116 char base = (*this)[index + i];
01117 if (in(index+i, highLightStart, highLightEnd))
01118 base = tolower(base);
01119 str.push_back(base);
01120 }
01121 }
01122 else
01123 {
01124
01125
01126 for (int i=0; i< -baseCount; i++)
01127 {
01128 char base = BaseAsciiMap::base2complement[(int)(*this)[index + i]];
01129 if (in(index+i, highLightStart, highLightEnd))
01130 base = tolower(base);
01131 str.push_back(base);
01132 }
01133 }
01134 }
01135
01136 void GenomeSequence::print30(genomeIndex_t index) const
01137 {
01138 std::cout << "index: " << index << "\n";
01139 for (genomeIndex_t i=index-30; i<index+30; i++)
01140 std::cout << (*this)[i];
01141 std::cout << "\n";
01142 for (genomeIndex_t i=index-30; i<index; i++)
01143 std::cout << " ";
01144 std::cout << "^";
01145 std::cout << std::endl;
01146 }
01147
01148 void GenomeSequence::getMismatchHatString(std::string &result, const std::string &read, genomeIndex_t location) const
01149 {
01150 result = "";
01151 for (uint32_t i=0; i < read.size(); i++)
01152 {
01153 if (read[i] == (*this)[location+i])
01154 result.push_back(' ');
01155 else
01156 result.push_back('^');
01157 }
01158 }
01159
01160 void GenomeSequence::getMismatchString(std::string &result, const std::string &read, genomeIndex_t location) const
01161 {
01162 result = "";
01163 for (uint32_t i=0; i < read.size(); i++)
01164 {
01165 if (read[i] == (*this)[location+i])
01166 result.push_back(toupper(read[i]));
01167 else
01168 result.push_back(tolower(read[i]));
01169 }
01170 }
01171
01172 genomeIndex_t GenomeSequence::simpleLocalAligner(std::string &read, std::string &quality, genomeIndex_t index, int windowSize) const
01173 {
01174 int bestScore = 1000000;
01175 genomeIndex_t bestMatchLocation = INVALID_GENOME_INDEX;
01176 for (int i=-windowSize; i<windowSize; i++)
01177 {
01178 int newScore;
01179
01180 if (i<0 && ((uint32_t) -i) > index) continue;
01181 if (index + i + read.size() >= getNumberBases()) continue;
01182
01183 if (quality=="")
01184 {
01185 newScore = this->getMismatchCount(read, index + i);
01186 }
01187 else
01188 {
01189 newScore = this->getSumQ(read, quality, index + i);
01190 }
01191 if (newScore < bestScore)
01192 {
01193 bestScore = newScore;
01194 bestMatchLocation = index + i;
01195 }
01196 }
01197 return bestMatchLocation;
01198 }
01199
01200 std::ostream &operator << (std::ostream &stream, genomeSequenceMmapHeader &h)
01201 {
01202 stream << (MemoryMapArrayHeader &) h;
01203 stream << "chromosomeCount: " << h._chromosomeCount << std::endl;
01204 stream << "isColorSpace: " << h._colorSpace << std::endl;
01205 stream << "chromosomeCount: " << h._chromosomeCount << std::endl;
01206 uint64_t totalSize = 0;
01207 for (uint32_t i=0; i < h._chromosomeCount; i++)
01208 {
01209 totalSize += h._chromosomes[i].size;
01210 stream << "Chromosome Index " << i << " name: " << h._chromosomes[i].name << std::endl;
01211 stream << "Chromosome Index " << i << " whole genome start: " << h._chromosomes[i].start << std::endl;
01212 stream << "Chromosome Index " << i << " whole genome size: " << h._chromosomes[i].size << std::endl;
01213 stream << "Chromosome Index " << i << " md5 checksum: " << h._chromosomes[i].md5 << std::endl;
01214 stream << "Chromosome Index " << i << " assemblyID: " << h._chromosomes[i].assemblyID << std::endl;
01215 stream << "Chromosome Index " << i << " species: " << h._chromosomes[i].species << std::endl;
01216 stream << "Chromosome Index " << i << " URI: " << h._chromosomes[i].uri << std::endl;
01217 }
01218 stream << "Total Genome Size: " << totalSize << " bases."<< std::endl;
01219 if (totalSize != h.elementCount)
01220 {
01221 stream << "Total Genome Size: does not match elementCount!\n";
01222 }
01223
01224 stream << std::endl;
01225 return stream;
01226 }
01227
01228 void GenomeSequence::getChromosomeAndIndex(std::string &s, genomeIndex_t i) const
01229 {
01230 int whichChromosome = 0;
01231
01232 whichChromosome = getChromosome(i);
01233
01234 if (whichChromosome == INVALID_CHROMOSOME_INDEX)
01235 {
01236 s = "invalid genome index";
01237 }
01238 else
01239 {
01240 std::ostringstream buf;
01241 genomeIndex_t chromosomeIndex = i - getChromosomeStart(whichChromosome) + 1;
01242 buf << header->_chromosomes[whichChromosome].name << ":" << chromosomeIndex;
01243 #if 0
01244 buf << " (GenomeIndex " << i << ")";
01245 #endif
01246 s = buf.str();
01247 }
01248 return;
01249 }
01250
01251
01252 void GenomeSequence::getChromosomeAndIndex(String& s, genomeIndex_t i) const
01253 {
01254 std::string ss;
01255 getChromosomeAndIndex(ss, i);
01256 s = ss.c_str();
01257 return;
01258 }
01259
01260
01261
01262
01263
01264
01265
01266
01267
01268
01269
01270
01271
01272
01273
01274
01275
01276 bool GenomeSequence::populateDBSNP(
01277 mmapArrayBool_t &dbSNP,
01278 IFILE inputFile) const
01279 {
01280 assert(dbSNP.getElementCount() == getNumberBases());
01281
01282 if(inputFile == NULL)
01283 {
01284
01285 return(false);
01286 }
01287
01288 std::string chromosomeName;
01289 std::string position;
01290 genomeIndex_t chromosomePosition1;
01291 uint64_t ignoredLineCount = 0;
01292
01293
01294 char* postPosPtr = NULL;
01295 while(!inputFile->ifeof())
01296 {
01297 chromosomeName.clear();
01298 position.clear();
01299
01300 if(inputFile->readTilTab(chromosomeName) <= 0)
01301 {
01302
01303
01304 if(chromosomeName.size()>0 && chromosomeName[0]=='#')
01305 {
01306
01307 continue;
01308 }
01309
01310 ++ignoredLineCount;
01311
01312 continue;
01313 }
01314
01315
01316 if(chromosomeName.size()>0 && chromosomeName[0]=='#')
01317 {
01318
01319
01320 inputFile->discardLine();
01321 continue;
01322 }
01323
01324
01325 if(inputFile->readTilTab(position) > 0)
01326 {
01327
01328 inputFile->discardLine();
01329 }
01330
01331
01332 chromosomePosition1 = strtoul(position.c_str(), &postPosPtr, 0);
01333
01334
01335 if(postPosPtr == position.c_str())
01336 {
01337 ++ignoredLineCount;
01338 continue;
01339 }
01340
01341
01342 genomeIndex_t genomeIndex =
01343 getGenomePosition(chromosomeName.c_str(), chromosomePosition1);
01344
01345
01346 if((genomeIndex == INVALID_GENOME_INDEX) ||
01347 (genomeIndex > getNumberBases()))
01348 {
01349 ignoredLineCount++;
01350 continue;
01351 }
01352
01353 dbSNP.set(genomeIndex, true);
01354 }
01355
01356 if (ignoredLineCount > 0)
01357 {
01358 std::cerr << "GenomeSequence::populateDBSNP: ignored " << ignoredLineCount << " SNP positions due to invalid format of line." << std::endl;
01359 return false;
01360 }
01361 return true;
01362 }
01363
01364 bool GenomeSequence::loadDBSNP(
01365 mmapArrayBool_t &dbSNP,
01366 const char *inputFileName) const
01367 {
01368
01369
01370
01371
01372
01373
01374
01375
01376
01377
01378
01379
01380
01381
01382 if (strlen(inputFileName)!=0)
01383 {
01384 std::cerr << "Load dbSNP file '" << inputFileName << "': " << std::flush;
01385
01386 if (dbSNP.open(inputFileName, O_RDONLY))
01387 {
01388
01389
01390
01391
01392
01393
01394 if (dbSNP.getErrorString().find("wrong type of file")==std::string::npos)
01395 {
01396 std::cerr << "Error: " << dbSNP.getErrorString() << std::endl;
01397 exit(1);
01398 }
01399
01400
01401
01402 IFILE inputFile = ifopen(inputFileName, "r");
01403 if(inputFile == NULL)
01404 {
01405 std::cerr << "Error: failed to open " << inputFileName << std::endl;
01406 exit(1);
01407 }
01408
01409 std::cerr << "(as text file) ";
01410
01411
01412 dbSNP.create(getNumberBases());
01413
01414
01415 populateDBSNP(dbSNP, inputFile);
01416 ifclose(inputFile);
01417
01418 }
01419 else
01420 {
01421 std::cerr << "(as binary mapped file) ";
01422 }
01423
01424 std::cerr << "DONE!" << std::endl;
01425 return false;
01426 }
01427 else
01428 {
01429 return true;
01430 }
01431 }
01432
01433
01434 #if defined(TEST)
01435
01436 void simplestExample(void)
01437 {
01438 GenomeSequence reference;
01439 genomeIndex_t index;
01440
01441
01442
01443
01444
01445
01446
01447 if (reference.open())
01448 {
01449 perror("GenomeSequence::open");
01450 exit(1);
01451 }
01452
01453
01454 index = 1000000000;
01455
01456
01457
01458
01459
01460
01461
01462
01463
01464 std::cout << "base[" << index << "] = " << reference[index] << std::endl;
01465
01466
01467
01468
01469
01470 int chr = reference.getChromosome(index);
01471 genomeIndex_t chrIndex = index - reference.getChromosomeStart(chr) + 1;
01472
01473 std::cout << "genome index " << index << " corresponds to chromosome " << chr << " position " << chrIndex << std::endl;
01474
01475
01476
01477
01478
01479 const char *chromosomeName = "5";
01480 chr = reference.getChromosome(chromosomeName);
01481 chrIndex = 100000;
01482
01483 index = reference.getChromosomeStart(chr) + chrIndex - 1;
01484
01485 std::cout << "Chromosome '" << chromosomeName << "' position " << chrIndex << " corresponds to genome index position " << index << std::endl;
01486
01487 reference.close();
01488 }
01489
01490 void testGenomeSequence(void)
01491 {
01492 GenomeSequence reference;
01493
01494 #if 0
01495 std::string referenceName = "someotherreference";
01496 if (reference.setFastaName(referenceName))
01497 {
01498 std::cerr << "failed to open reference file "
01499 << referenceName
01500 << std::endl;
01501 exit(1);
01502 }
01503 #endif
01504
01505 std::cerr << "open and prefetch the reference genome: ";
01506
01507
01508 if (reference.open())
01509 {
01510 exit(1);
01511 }
01512 std::cerr << "done!" << std::endl;
01513
01514
01515
01516
01517 genomeIndex_t genomeIndex;
01518 unsigned int chromosomeIndex;
01519 unsigned int chromosome;
01520 std::string chromosomeName;
01521
01522
01523
01524
01525
01526 chromosomeName = "2";
01527 chromosomeIndex = 1234567;
01528
01529 genomeIndex = reference.getGenomePosition(chromosomeName.c_str(), chromosomeIndex);
01530 assert(genomeIndex!=INVALID_GENOME_INDEX);
01531 std::cout << "Chromosome " << chromosomeName << ", index ";
01532 std::cout << chromosomeIndex << " contains base " << reference[genomeIndex];
01533 std::cout << " at genome index position " << genomeIndex << std::endl;
01534
01535
01536
01537
01538
01539
01540
01541 chromosome = reference.getChromosome(genomeIndex);
01542 unsigned int newChromosomeIndex;
01543
01544 newChromosomeIndex = genomeIndex - reference.getChromosomeStart(chromosome) + 1;
01545
01546 assert(chromosomeIndex == newChromosomeIndex);
01547
01548
01549
01550 PackedRead pr;
01551
01552 pr.set("ATCGATCG", 0);
01553 assert(pr.size()==8);
01554 assert(pr[0]==BaseAsciiMap::base2int[(int) 'A']);
01555 assert(pr[1]==BaseAsciiMap::base2int[(int) 'T']);
01556 assert(pr[2]==BaseAsciiMap::base2int[(int) 'C']);
01557 assert(pr[3]==BaseAsciiMap::base2int[(int) 'G']);
01558 pr.set("ATCGATCG", 1);
01559 assert(pr.size()==9);
01560 pr.set("", 0);
01561 assert(pr.size()==0);
01562 pr.set("", 1);
01563 assert(pr.size()==1);
01564 pr.set("", 2);
01565 assert(pr.size()==2);
01566 pr.set("", 3);
01567 assert(pr.size()==3);
01568 assert(pr[0]==BaseAsciiMap::base2int[(int) 'N']);
01569 assert(pr[1]==BaseAsciiMap::base2int[(int) 'N']);
01570 assert(pr[2]==BaseAsciiMap::base2int[(int) 'N']);
01571 pr.set("C", 1);
01572 assert(pr.size()==2);
01573 assert(pr[0]==BaseAsciiMap::base2int[(int) 'N']);
01574 assert(pr[1]==BaseAsciiMap::base2int[(int) 'C']);
01575
01576 }
01577
01578
01579
01580
01581
01582
01583
01584
01585 int main(int argc, const char **argv)
01586 {
01587 #if 1
01588 simplestExample();
01589 #else
01590 testGenomeSequence();
01591 #endif
01592 exit(0);
01593 }
01594
01595 #endif