libStatGen Software
1
|
00001 /* 00002 * Copyright (C) 2010-2012 Regents of the University of Michigan 00003 * 00004 * This program is free software: you can redistribute it and/or modify 00005 * it under the terms of the GNU General Public License as published by 00006 * the Free Software Foundation, either version 3 of the License, or 00007 * (at your option) any later version. 00008 * 00009 * This program is distributed in the hope that it will be useful, 00010 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 * GNU General Public License for more details. 00013 * 00014 * You should have received a copy of the GNU General Public License 00015 * along with this program. If not, see <http://www.gnu.org/licenses/>. 00016 */ 00017 00018 #include "assert.h" 00019 #include "ctype.h" 00020 #include "stdio.h" 00021 #include "Error.h" 00022 00023 00024 #include "Generic.h" 00025 #include "GenomeSequence.h" 00026 00027 #include <algorithm> 00028 #include <istream> 00029 #include <fstream> 00030 #include <sstream> 00031 #include <stdexcept> 00032 00033 #if defined(_WIN32) 00034 #include <io.h> 00035 #ifndef R_OK 00036 #define R_OK 4 00037 #endif 00038 #endif 00039 00040 // not general use: 00041 #include "CSG_MD5.h" 00042 00043 // 00044 // given a read in a string, pack it into the vector of 00045 // bytes coded as two bases per byte. 00046 // 00047 // The goal is to allow us to more rapidly compare against 00048 // the genome, which is itself packed 2 bases per byte. 00049 // 00050 // Unfortunately, the match position may be odd or even, 00051 // so as a result, we also need to be able to prepad 00052 // with 'N' bases to get the byte alignment the same, so 00053 // padWithNCount may be a positive number indicating how 00054 // many N bases to prepend. 00055 // 00056 void PackedRead::set(const char *rhs, int padWithNCount) 00057 { 00058 clear(); 00059 00060 // pad this packed read with 'N' bases two at a time 00061 while (padWithNCount>1) 00062 { 00063 packedBases.push_back( 00064 BaseAsciiMap::base2int[(int) 'N'] << 4 | 00065 BaseAsciiMap::base2int[(int) 'N'] 00066 ); 00067 padWithNCount -= 2; 00068 length+=2; 00069 } 00070 00071 // when we have only one base, pack one 'N' base with 00072 // the first base in rhs if there is one. 00073 if (padWithNCount) 00074 { 00075 // NB: *rhs could be NUL, which is ok here - just keep 00076 // the length straight. 00077 packedBases.push_back( 00078 BaseAsciiMap::base2int[(int) *rhs] << 4 | 00079 BaseAsciiMap::base2int[(int) 'N'] 00080 ); 00081 // two cases - have characters in rhs or we don't: 00082 if (*rhs) 00083 { 00084 length+=2; // pad byte plus one byte from rhs 00085 rhs++; 00086 } 00087 else 00088 { 00089 length++; 00090 } 00091 padWithNCount--; // should now be zero, so superfluous. 00092 assert(padWithNCount==0); 00093 } 00094 00095 // pad pairs of bases from rhs, two at a time: 00096 while (*rhs && *(rhs+1)) 00097 { 00098 packedBases.push_back( 00099 BaseAsciiMap::base2int[(int) *(rhs+1)] << 4 | 00100 BaseAsciiMap::base2int[(int) *(rhs+0)] 00101 ); 00102 rhs+=2; 00103 length+=2; 00104 } 00105 00106 // if there is an odd base left at the end, put it 00107 // in a byte all its own (low 4 bits == 0): 00108 if (*rhs) 00109 { 00110 packedBases.push_back( 00111 BaseAsciiMap::base2int[(int) *(rhs+0)] 00112 ); 00113 length++; 00114 } 00115 return; 00116 } 00117 00118 std::string GenomeSequence::IntegerToSeq(unsigned int n, unsigned int wordsize) const 00119 { 00120 std::string sequence(""); 00121 for (unsigned int i = 0; i < wordsize; i ++) 00122 sequence += "N"; 00123 00124 unsigned int clearHigherBits = ~(3U << (wordsize<<1)); // XXX refactor - this appears several places 00125 00126 if (n > clearHigherBits) 00127 error("%d needs to be a non-negative integer < clearHigherBits\n", n); 00128 00129 for (unsigned int i = 0; i < wordsize; i++) 00130 { 00131 sequence[wordsize-1-i] = BaseAsciiMap::int2base[n & 3]; 00132 n >>= 2; 00133 } 00134 return sequence; 00135 } 00136 00137 00138 00139 GenomeSequence::GenomeSequence() 00140 { 00141 constructorClear(); 00142 } 00143 00144 void GenomeSequence::constructorClear() 00145 { 00146 _debugFlag = 0; 00147 _progressStream = NULL; 00148 _colorSpace = false; 00149 _createOverwrite = false; 00150 } 00151 00152 void GenomeSequence::setup(const char *referenceFilename) 00153 { 00154 setReferenceName(referenceFilename); 00155 00156 if (_progressStream) *_progressStream << "open and prefetch reference genome " << referenceFilename << ": " << std::flush; 00157 00158 if (open(false)) 00159 { 00160 std::cerr << "Failed to open reference genome " << referenceFilename << std::endl; 00161 std::cerr << errorStr << std::endl; 00162 exit(1); 00163 } 00164 00165 prefetch(); 00166 if (_progressStream) *_progressStream << "done." << std::endl << std::flush; 00167 } 00168 00169 GenomeSequence::~GenomeSequence() 00170 { 00171 // free up resources: 00172 _umfaFile.close(); 00173 } 00174 00175 // 00176 // mapped open. 00177 // 00178 // if the file exists, map in into memory, and fill in a few useful 00179 // fields. 00180 // 00181 00182 bool GenomeSequence::open(bool isColorSpace, int flags) 00183 { 00184 bool rc; 00185 00186 if (isColorSpace) 00187 { 00188 _umfaFilename = _baseFilename + "-cs.umfa"; 00189 } 00190 else 00191 { 00192 _umfaFilename = _baseFilename + "-bs.umfa"; 00193 } 00194 00195 if(access(_umfaFilename.c_str(), R_OK) != 0) 00196 { 00197 // umfa file doesn't exist, so try to create it. 00198 if(create(isColorSpace)) 00199 { 00200 // Couldon't access or create the umfa. 00201 std::cerr << "GenomeSequence::open: failed to open file " 00202 << _umfaFilename 00203 << " also failed creating it." 00204 << std::endl; 00205 return true; 00206 } 00207 } 00208 00209 rc = genomeSequenceArray::open(_umfaFilename.c_str(), flags); 00210 if (rc) 00211 { 00212 std::cerr << "GenomeSequence::open: failed to open file " 00213 << _umfaFilename 00214 << std::endl; 00215 return true; 00216 } 00217 00218 _colorSpace = header->_colorSpace; 00219 00220 return false; 00221 } 00222 00223 void GenomeSequence::sanityCheck(MemoryMap &fasta) const 00224 { 00225 unsigned int i; 00226 00227 unsigned int genomeIndex = 0; 00228 for (i=0; i<fasta.length(); i++) 00229 { 00230 switch (fasta[i]) 00231 { 00232 case '>': 00233 while (fasta[i]!='\n' && fasta[i]!='\r') i++; 00234 break; 00235 case '\n': 00236 case '\r': 00237 break; 00238 default: 00239 assert(BaseAsciiMap::base2int[(int)(*this)[genomeIndex]] == 00240 BaseAsciiMap::base2int[(int) fasta[i]]); 00241 genomeIndex++; 00242 break; 00243 } 00244 } 00245 } 00246 00247 #define HAS_SUFFIX(str, suffix) ((strlen(suffix) < str.size()) && (str.substr(str.size() - strlen(suffix)) == suffix)) 00248 // 00249 // referenceFilename is either a fasta or a UM fasta (.fa or .umfa) 00250 // filename. In both cases, the suffix gets removed and the 00251 // base name is kept for later use depending on context. 00252 // @return always return false 00253 // 00254 bool GenomeSequence::setReferenceName(std::string referenceFilename) 00255 { 00256 00257 if (HAS_SUFFIX(referenceFilename, ".fa")) 00258 { 00259 _referenceFilename = referenceFilename; 00260 _baseFilename = _referenceFilename.substr(0, referenceFilename.size() - 3); 00261 } 00262 else if (HAS_SUFFIX(referenceFilename, ".umfa")) 00263 { 00264 _baseFilename = referenceFilename.substr(0, referenceFilename.size() - 5); 00265 } 00266 else if (HAS_SUFFIX(referenceFilename, "-cs.umfa")) 00267 { 00268 _baseFilename = referenceFilename.substr(0, referenceFilename.size() - 8); 00269 } 00270 else if (HAS_SUFFIX(referenceFilename, "-bs.umfa")) 00271 { 00272 _baseFilename = referenceFilename.substr(0, referenceFilename.size() - 8); 00273 } 00274 else 00275 { 00276 _baseFilename = referenceFilename; 00277 } 00278 _fastaFilename = _baseFilename + ".fa"; 00279 00280 if (HAS_SUFFIX(referenceFilename, ".fasta")) 00281 { 00282 _referenceFilename = referenceFilename; 00283 _baseFilename = _referenceFilename.substr(0, referenceFilename.size() - 6); 00284 _fastaFilename = _baseFilename + ".fasta"; 00285 } 00286 00287 return false; 00288 } 00289 00290 // 00291 // this works in lockstep with ::create to populate 00292 // the per chromosome header fields size and md5 00293 // checksum. 00294 // 00295 // It relies on header->elementCount being set to 00296 // the data length loaded so far ... not the ultimate 00297 // reference length. 00298 // 00299 bool GenomeSequence::setChromosomeMD5andLength(uint32_t whichChromosome) 00300 { 00301 if (whichChromosome>=header->_chromosomeCount) return true; 00302 00303 ChromosomeInfo *c = &header->_chromosomes[whichChromosome]; 00304 c->size = header->elementCount - c->start; 00305 00306 MD5_CTX md5Context; 00307 uint8_t md5Signature[MD5_DIGEST_LENGTH]; 00308 00309 // 00310 // it's easier to ensure we do this right if we just do it 00311 // in one big chunk: 00312 // 00313 char *md5Buffer = (char *) malloc(c->size); 00314 00315 MD5Init(&md5Context); 00316 00317 for (genomeIndex_t i = 0; i < c->size; i ++) 00318 { 00319 md5Buffer[i] = (*this)[c->start + i]; 00320 } 00321 MD5Update(&md5Context, (unsigned char *) md5Buffer, c->size); 00322 MD5Final((unsigned char *) &md5Signature, &md5Context); 00323 free(md5Buffer); 00324 for (int i=0; i<MD5_DIGEST_LENGTH; i++) 00325 { 00326 // cheesy but works: 00327 sprintf(c->md5+2*i, "%02x", md5Signature[i]); 00328 } 00329 // redundant, strictly speaking due to sprintf NUL terminating 00330 // it's output strings, but put it here anyway. 00331 c->md5[2*MD5_DIGEST_LENGTH] = '\0'; 00332 00333 return false; 00334 } 00335 00336 // 00337 // Given a buffer with a fasta format contents, count 00338 // the number of chromosomes in it and return that value. 00339 // 00340 static bool getFastaStats(const char *fastaData, size_t fastaDataSize, uint32_t &chromosomeCount, uint64_t &baseCount) 00341 { 00342 chromosomeCount = 0; 00343 baseCount = 0; 00344 bool atLineStart = true; 00345 00346 // 00347 // loop over the fasta file, essentially matching for the 00348 // pattern '^>.*$' and counting them. 00349 // 00350 for (size_t fastaIndex = 0; fastaIndex < fastaDataSize; fastaIndex++) 00351 { 00352 switch (fastaData[fastaIndex]) 00353 { 00354 case '\n': 00355 case '\r': 00356 atLineStart = true; 00357 break; 00358 case '>': 00359 { 00360 if (!atLineStart) break; 00361 chromosomeCount++; 00362 // 00363 // eat the rest of the line 00364 // 00365 while (fastaIndex < fastaDataSize && fastaData[fastaIndex]!='\n' && fastaData[fastaIndex]!='\r') 00366 { 00367 fastaIndex++; 00368 } 00369 break; 00370 } 00371 default: 00372 baseCount++; 00373 atLineStart = false; 00374 break; 00375 } 00376 00377 } 00378 return false; 00379 } 00380 00381 class PackedSequenceData : public std::vector<uint8_t> 00382 { 00383 std::vector<uint8_t> m_packedBases; 00384 00385 size_t m_baseCount; 00386 00387 void set(size_t index, uint8_t value) { 00388 m_packedBases[index>>1] = 00389 (m_packedBases[index>>1] // original value 00390 & ~(7<<((index&0x01)<<2))) // logical AND off the original value 00391 | ((value&0x0f)<<((index&0x1)<<2)); // logical OR in the new value 00392 } 00393 00394 public: 00395 00396 void reserve(size_t baseCount) {m_packedBases.reserve(baseCount/2);} 00397 size_t size() {return m_baseCount;} 00398 void clear() {m_packedBases.clear(); m_baseCount = 0;} 00399 uint8_t operator [](size_t index) 00400 { 00401 return (m_packedBases[index>>1] >> ((index&0x1)<<2)) & 0xf; 00402 } 00403 void push_back(uint8_t base); 00404 }; 00405 00406 // 00407 // Load a fasta format file from filename into the buffer 00408 // provided by the caller. 00409 // While parsing the fasta file, record each chromosome name, 00410 // its start location, and its size. 00411 // 00412 // NB: the caller must implement the logic to determine how 00413 // large the sequence data is. There is no correct way to do 00414 // this, because we can't reliably estimate here how much sequence 00415 // data is contained in a compressed file. 00416 // 00417 // To safely pre-allocate space in sequenceData, use the reserve() method 00418 // before calling this function. 00419 // 00420 bool loadFastaFile(const char *filename, 00421 std::vector<PackedSequenceData> &sequenceData, 00422 std::vector<std::string> &chromosomeNames) 00423 { 00424 InputFile inputStream(filename, "r", InputFile::DEFAULT); 00425 00426 if(!inputStream.isOpen()) { 00427 std::cerr << "Failed to open file " << filename << "\n"; 00428 return true; 00429 } 00430 00431 int whichChromosome = -1; 00432 chromosomeNames.clear(); 00433 00434 char ch; 00435 while((ch = inputStream.ifgetc()) != EOF) { 00436 switch (ch) 00437 { 00438 case '\n': 00439 case '\r': 00440 break; 00441 case '>': 00442 { 00443 std::string chromosomeName = ""; 00444 // 00445 // pull out the chromosome new name 00446 // 00447 while (!isspace((ch = inputStream.ifgetc())) && ch != EOF) 00448 { 00449 chromosomeName += ch; // slow, but who cares 00450 } 00451 // 00452 // eat the rest of the line 00453 // 00454 do { 00455 ch = inputStream.ifgetc(); 00456 } while(ch != EOF && ch != '\n' && ch != '\r'); 00457 // 00458 // save the Chromosome name and index into our 00459 // header so we can use them later. 00460 // 00461 chromosomeNames.push_back(chromosomeName); 00462 00463 whichChromosome++; 00464 00465 break; 00466 } 00467 default: 00468 // we get here for sequence data. 00469 // 00470 // save the base value 00471 // Note: invalid characters come here as well, but we 00472 // let ::set deal with mapping them. 00473 break; 00474 } 00475 } 00476 return false; 00477 } 00478 00479 // 00480 // recreate the umfa file from a reference fasta format file 00481 // 00482 // The general format of a FASTA file is best described 00483 // on wikipedia at http://en.wikipedia.org/wiki/FASTA_format 00484 // 00485 // The format parsed here is a simpler subset, and is 00486 // described here http://www.ncbi.nlm.nih.gov/blast/fasta.shtml 00487 // 00488 bool GenomeSequence::create(bool isColor) 00489 { 00490 setColorSpace(isColor); 00491 00492 if (_baseFilename=="") 00493 { 00494 std::cerr << "Base reference filename is empty." << std::endl; 00495 return true; 00496 } 00497 00498 if (isColorSpace()) 00499 { 00500 _umfaFilename = _baseFilename + "-cs.umfa"; 00501 } 00502 else 00503 { 00504 _umfaFilename = _baseFilename + "-bs.umfa"; 00505 } 00506 00507 if (!_createOverwrite && access(_umfaFilename.c_str(), R_OK) == 0) 00508 { 00509 std::cerr << "Output file '" << _umfaFilename << "' exists or is not writable - please remove." << std::endl; 00510 return true; 00511 } 00512 00513 MemoryMap fastaFile; 00514 00515 if (fastaFile.open(_fastaFilename.c_str())) 00516 { 00517 std::cerr << "failed to open input fasta file '" << _fastaFilename << "'." << std::endl; 00518 return true; 00519 } 00520 00521 std::cerr << "Creating FASTA " 00522 << (isColorSpace() ? "color space " : "") 00523 << "binary cache file '" 00524 << _umfaFilename 00525 << "'." 00526 << std::endl; 00527 00528 std::cerr << std::flush; 00529 00530 // 00531 // simple ptr to fasta data -- just treat the memory map 00532 // as an array of fastaDataSize characters... 00533 // 00534 const char *fasta = (const char *) fastaFile.data; 00535 size_t fastaDataSize = fastaFile.length(); 00536 00537 uint32_t chromosomeCount = 0; 00538 uint64_t baseCount = 0; 00539 getFastaStats(fasta, fastaDataSize, chromosomeCount, baseCount); 00540 00541 if (genomeSequenceArray::create(_umfaFilename.c_str(), baseCount, chromosomeCount)) 00542 { 00543 std::cerr << "failed to create '" 00544 << _umfaFilename 00545 << "'." 00546 << std::endl; 00547 perror(""); 00548 return true; 00549 } 00550 header->elementCount = 0; 00551 header->_colorSpace = isColorSpace(); 00552 header->setApplication(_application.c_str()); 00553 header->_chromosomeCount = chromosomeCount; 00554 // 00555 // clear out the variable length chromosome info array 00556 // 00557 for (uint32_t i=0; i<header->_chromosomeCount; i++) header->_chromosomes[i].constructorClear(); 00558 00559 std::string chromosomeName; 00560 00561 // 00562 // for converting the reference to colorspace, the first base is always 5 (in base space it is 'N') 00563 signed char lastBase = BaseAsciiMap::base2int[(int) 'N']; 00564 bool terminateLoad = false; 00565 int percent = -1, newPercent; 00566 uint32_t whichChromosome = 0; 00567 for (uint64_t fastaIndex = 0; fastaIndex < fastaDataSize; fastaIndex++) 00568 { 00569 if (_progressStream) 00570 { 00571 newPercent = (int) (1.0 * fastaIndex / fastaDataSize) * 100; 00572 if (newPercent>percent) 00573 { 00574 *_progressStream << "\r" << newPercent << "% "; 00575 *_progressStream << std::flush; 00576 percent = newPercent; 00577 } 00578 } 00579 switch (fasta[fastaIndex]) 00580 { 00581 case '\n': 00582 case '\r': 00583 break; 00584 case '>': 00585 { 00586 chromosomeName = ""; 00587 fastaIndex++; // skip the > char 00588 // 00589 // pull out the chromosome new name 00590 // 00591 while (!isspace(fasta[fastaIndex])) 00592 { 00593 chromosomeName += fasta[fastaIndex++]; // slow, but who cares 00594 } 00595 // 00596 // eat the rest of the line 00597 // 00598 while (fasta[fastaIndex]!='\n' && fasta[fastaIndex]!='\r') 00599 { 00600 fastaIndex++; 00601 } 00602 // 00603 // save the Chromosome name and index into our 00604 // header so we can use them later. 00605 // 00606 ChromosomeInfo *c = &header->_chromosomes[whichChromosome]; 00607 c->setChromosomeName(chromosomeName.c_str()); 00608 c->start = header->elementCount; 00609 // c->size gets computed at the next '>' line or at the EOF 00610 00611 if (whichChromosome>0) 00612 { 00613 // 00614 // compute md5 checksum for the chromosome that we just 00615 // loaded (if there was one) - note that on the last 00616 // chromosome, we have to duplicate this code after 00617 // the end of this loop 00618 // 00619 setChromosomeMD5andLength(whichChromosome - 1); 00620 } 00621 whichChromosome++; 00622 if (whichChromosome > header->_chromosomeCount) 00623 { 00624 std::cerr << "BUG: Exceeded computed chromosome count (" 00625 << header->_chromosomeCount 00626 << ") - genome is now truncated at chromosome " 00627 << header->_chromosomes[header->_chromosomeCount-1].name 00628 << " (index " 00629 << header->_chromosomeCount 00630 << ")." 00631 << std::endl; 00632 terminateLoad = true; 00633 } 00634 break; 00635 } 00636 default: 00637 // save the base pair value 00638 // Note: invalid characters come here as well, but we 00639 // let ::set deal with mapping them. 00640 if (isColorSpace()) 00641 { 00642 // 00643 // anything outside these values represents an invalid base 00644 // base codes: 0-> A, 1-> C, 2-> G, 3-> T 00645 // colorspace: 0-> blue, 1-> green, 2-> oragne, 3->red 00646 // 00647 const char fromBase2CS[] = 00648 { 00649 /* 0000 */ 0, // A->A 00650 /* 0001 */ 1, // A->C 00651 /* 0010 */ 2, // A->G 00652 /* 0011 */ 3, // A->T 00653 /* 0100 */ 1, // C->A 00654 /* 0101 */ 0, // C->C 00655 /* 0110 */ 3, // C->G 00656 /* 0111 */ 2, // C->T 00657 /* 1000 */ 2, // G->A 00658 /* 1001 */ 3, // G->C 00659 /* 1010 */ 0, // G->G 00660 /* 1011 */ 1, // G->T 00661 /* 1100 */ 3, // T->A 00662 /* 1101 */ 2, // T->C 00663 /* 1110 */ 1, // T->G 00664 /* 1111 */ 0, // T->T 00665 }; 00666 // 00667 // we are writing color space values on transitions, 00668 // so we don't write a colorspace value when we 00669 // get the first base value. 00670 // 00671 // On second and subsequent bases, write based on 00672 // the index table above 00673 // 00674 char thisBase = 00675 BaseAsciiMap::base2int[(int)(fasta[fastaIndex])]; 00676 if (lastBase>=0) 00677 { 00678 char color; 00679 if (lastBase>3 || thisBase>3) color=4; 00680 else color = fromBase2CS[(int)(lastBase<<2 | thisBase)]; 00681 // re-use the int to base, because ::set expects a base char (ATCG), not 00682 // a color code (0123). It should only matter on final output. 00683 set(header->elementCount++, 00684 BaseAsciiMap::int2base[(int) color]); 00685 } 00686 lastBase = thisBase; 00687 } 00688 else 00689 { 00690 set(header->elementCount++, toupper(fasta[fastaIndex])); 00691 } 00692 break; 00693 } 00694 00695 // 00696 // slightly awkward exit handling when we exceed the fixed 00697 // number of chromosomes 00698 // 00699 if (terminateLoad) break; 00700 } 00701 00702 // 00703 // also slightly awkward code to handle the last dangling chromosome... 00704 // all we should need to do is compute the md5 checksum 00705 // 00706 if (whichChromosome==0) 00707 { 00708 fastaFile.close(); 00709 throw std::runtime_error("No chromosomes found - aborting!"); 00710 } 00711 else 00712 { 00713 setChromosomeMD5andLength(whichChromosome-1); 00714 } 00715 00716 fastaFile.close(); 00717 00718 if (_progressStream) *_progressStream << "\r"; 00719 00720 std::cerr << "FASTA binary cache file '" 00721 << _umfaFilename 00722 << "' created." 00723 << std::endl; 00724 00725 // 00726 // leave the umfastaFile open in case caller wants to use it 00727 // 00728 return false; 00729 } 00730 00731 int GenomeSequence::getChromosomeCount() const 00732 { 00733 return header->_chromosomeCount; 00734 } 00735 00736 //return chromosome index: 0, 1, ... 24; 00737 int GenomeSequence::getChromosome(genomeIndex_t position) const 00738 { 00739 if (position == INVALID_GENOME_INDEX) return INVALID_CHROMOSOME_INDEX; 00740 00741 if (header->_chromosomeCount == 0) 00742 return INVALID_CHROMOSOME_INDEX; 00743 00744 int start = 0; 00745 int stop = header->_chromosomeCount - 1; 00746 00747 // eliminate case where position is in the last chromosome, since the loop 00748 // below falls off the end of the list if it in the last one. 00749 00750 if (position > header->_chromosomes[stop].start) 00751 return (stop); 00752 00753 while (start <= stop) 00754 { 00755 int middle = (start + stop) / 2; 00756 00757 if (position >= header->_chromosomes[middle].start && position < header->_chromosomes[middle + 1].start) 00758 return middle; 00759 00760 if (position == header->_chromosomes[middle + 1].start) 00761 return (middle + 1); 00762 00763 if (position > header->_chromosomes[middle + 1].start) 00764 start = middle + 1; 00765 00766 if (position < header->_chromosomes[middle].start) 00767 stop = middle - 1; 00768 } 00769 00770 return -1; 00771 } 00772 00773 // 00774 // Given a chromosome name and 1-based chromosome index, return the 00775 // genome index (0 based) into sequence for it. 00776 // 00777 // NB: the header->chromosomes array contains zero based genome positions 00778 // 00779 genomeIndex_t GenomeSequence::getGenomePosition( 00780 const char *chromosomeName, 00781 unsigned int chromosomeIndex) const 00782 { 00783 genomeIndex_t i = getGenomePosition(chromosomeName); 00784 if (i == INVALID_GENOME_INDEX) return INVALID_GENOME_INDEX; 00785 return i + chromosomeIndex - 1; 00786 } 00787 00788 genomeIndex_t GenomeSequence::getGenomePosition( 00789 int chromosome, 00790 unsigned int chromosomeIndex) const 00791 { 00792 if (chromosome<0 || chromosome >= (int) header->_chromosomeCount) return INVALID_GENOME_INDEX; 00793 00794 genomeIndex_t i = header->_chromosomes[chromosome].start; 00795 if (i == INVALID_GENOME_INDEX) return INVALID_GENOME_INDEX; 00796 return i + chromosomeIndex - 1; 00797 } 00798 00799 // 00800 // return the genome index (0 based) of the start of the named 00801 // chromosome. If none is found, INVALID_GENOME_INDEX is returned. 00802 // 00803 // XXX may need to speed this up - and smarten it up with some 00804 // modest chromosome name parsing.... e.g. '%d/X/Y' or 'chr%d/chrX/chrY' or 00805 // other schemes. 00806 // 00807 genomeIndex_t GenomeSequence::getGenomePosition(const char *chromosomeName) const 00808 { 00809 int chromosome = getChromosome(chromosomeName); 00810 if (chromosome==INVALID_CHROMOSOME_INDEX) return INVALID_GENOME_INDEX; 00811 return header->_chromosomes[chromosome].start; 00812 } 00813 00814 int GenomeSequence::getChromosome(const char *chromosomeName) const 00815 { 00816 unsigned int i; 00817 for (i=0; i<header->_chromosomeCount; i++) 00818 { 00819 if (strcmp(header->_chromosomes[i].name, chromosomeName)==0) 00820 { 00821 return i; 00822 } 00823 } 00824 return INVALID_CHROMOSOME_INDEX; 00825 } 00826 00827 // 00828 // Given a read, reverse the string and swap the base 00829 // pairs for the reverse strand equivalents. 00830 // 00831 void GenomeSequence::getReverseRead(std::string &read) 00832 { 00833 std::string newRead; 00834 if (read.size()) for (int32_t i=(int) read.size() - 1; i>=0; i--) 00835 { 00836 newRead.push_back(BasePair(read[i])); 00837 } 00838 read = newRead; 00839 } 00840 00841 void GenomeSequence::getReverseRead(String& read) 00842 { 00843 int i = 0; 00844 int j = read.Length()-1; 00845 char temp; 00846 while (i < j) 00847 { 00848 temp = read[j]; 00849 read[j] = read[i]; 00850 read[i] = temp; 00851 } 00852 } 00853 00854 #define ABS(x) ( (x) > 0 ? (x) : -(x) ) 00855 int GenomeSequence::debugPrintReadValidation( 00856 std::string &read, 00857 std::string &quality, 00858 char direction, 00859 genomeIndex_t readLocation, 00860 int sumQuality, 00861 int mismatchCount, 00862 bool recurse 00863 ) 00864 { 00865 int validateSumQ = 0; 00866 int validateMismatchCount = 0; 00867 int rc = 0; 00868 std::string genomeData; 00869 00870 for (uint32_t i=0; i<read.size(); i++) 00871 { 00872 if (tolower(read[i]) != tolower((*this)[readLocation + i])) 00873 { 00874 validateSumQ += quality[i] - '!'; 00875 // XXX no longer valid: 00876 if (direction=='F' ? i<24 : (i >= (read.size() - 24))) validateMismatchCount++; 00877 genomeData.push_back(tolower((*this)[readLocation + i])); 00878 } 00879 else 00880 { 00881 genomeData.push_back(toupper((*this)[readLocation + i])); 00882 } 00883 } 00884 assert(validateSumQ>=0); 00885 if (validateSumQ != sumQuality && validateMismatchCount == mismatchCount) 00886 { 00887 printf("SUMQ: Original Genome: %s test read: %s : actual sumQ = %d, test sumQ = %d\n", 00888 genomeData.c_str(), 00889 read.c_str(), 00890 validateSumQ, 00891 sumQuality 00892 ); 00893 rc++; 00894 } 00895 else if (validateSumQ == sumQuality && validateMismatchCount != mismatchCount) 00896 { 00897 printf("MISM: Original Genome: %s test read: %s : actual mismatch %d test mismatches %d\n", 00898 genomeData.c_str(), 00899 read.c_str(), 00900 validateMismatchCount, 00901 mismatchCount 00902 ); 00903 rc++; 00904 } 00905 else if (validateSumQ != sumQuality && validateMismatchCount != mismatchCount) 00906 { 00907 printf("BOTH: Original Genome: %s test read: %s : actual sumQ = %d, test sumQ = %d, actual mismatch %d test mismatches %d\n", 00908 genomeData.c_str(), 00909 read.c_str(), 00910 validateSumQ, 00911 sumQuality, 00912 validateMismatchCount, 00913 mismatchCount 00914 ); 00915 rc++; 00916 } 00917 00918 if (recurse && ABS(validateMismatchCount - mismatchCount) > (int) read.size()/2) 00919 { 00920 printf("large mismatch difference, trying reverse strand: "); 00921 std::string reverseRead = read; 00922 std::string reverseQuality = quality; 00923 getReverseRead(reverseRead); 00924 reverse(reverseQuality.begin(), reverseQuality.end()); 00925 rc = debugPrintReadValidation(reverseRead, reverseQuality, readLocation, sumQuality, mismatchCount, false); 00926 } 00927 return rc; 00928 } 00929 #undef ABS 00930 00931 00932 bool GenomeSequence::wordMatch(unsigned int index, std::string &word) const 00933 { 00934 for (uint32_t i = 0; i<word.size(); i++) 00935 { 00936 if ((*this)[index + i] != word[i]) return false; 00937 } 00938 return true; 00939 } 00940 00941 bool GenomeSequence::printNearbyWords(unsigned int index, unsigned int deviation, std::string &word) const 00942 { 00943 for (unsigned int i = index - deviation; i < index + deviation; i++) 00944 { 00945 if (wordMatch(i, word)) 00946 { 00947 std::cerr << "word '" 00948 << word 00949 << "' found " 00950 << i - index 00951 << " away from position " 00952 << index 00953 << "." 00954 << std::endl; 00955 } 00956 } 00957 return false; 00958 } 00959 00960 void GenomeSequence::dumpSequenceSAMDictionary(std::ostream &file) const 00961 { 00962 for (unsigned int i=0; i<header->_chromosomeCount; i++) 00963 { 00964 file 00965 << "@SQ" 00966 << "\tSN:" << header->_chromosomes[i].name // name 00967 << "\tLN:" << header->_chromosomes[i].size // number of bases 00968 << "\tAS:" << header->_chromosomes[i].assemblyID // e.g. NCBI36.3 00969 << "\tM5:" << header->_chromosomes[i].md5 00970 << "\tUR:" << header->_chromosomes[i].uri 00971 << "\tSP:" << header->_chromosomes[i].species // e.g. Homo_sapiens 00972 << std::endl; 00973 } 00974 } 00975 00976 void GenomeSequence::dumpHeaderTSV(std::ostream &file) const 00977 { 00978 file << "# Reference: " << _baseFilename << std::endl; 00979 file << "# SN: sample name - must be unique" << std::endl; 00980 file << "# AS: assembly name" << std::endl; 00981 file << "# SP: species" << std::endl; 00982 file << "# LN: chromosome/contig length" << std::endl; 00983 file << "# M5: chromosome/contig MD5 checksum" << std::endl; 00984 file << "# LN and M5 are only printed for informational purposes." << std::endl; 00985 file << "# Karma will only set those values when creating the index." << std::endl; 00986 file << "SN" << "\t" << "AS" << "\t" << "SP" << "\t" << "UR" << "\t" << "LN" << "\t" << "M5" << std::endl; 00987 for (unsigned int i=0; i<header->_chromosomeCount; i++) 00988 { 00989 file 00990 << header->_chromosomes[i].name // name 00991 << "\t" << header->_chromosomes[i].assemblyID // e.g. NCBI36.3 00992 << "\t" << header->_chromosomes[i].uri 00993 << "\t" << header->_chromosomes[i].species // e.g. Homo_sapiens 00994 << "\t" << header->_chromosomes[i].size // number of bases 00995 << "\t" << header->_chromosomes[i].md5 00996 << std::endl; 00997 } 00998 } 00999 01000 01001 void GenomeSequence::getString(std::string &str, int chromosome, uint32_t index, int baseCount) const 01002 { 01003 // 01004 // calculate the genome index for the lazy caller... 01005 // 01006 genomeIndex_t genomeIndex = header->_chromosomes[chromosome].start + index - 1; 01007 01008 getString(str, genomeIndex, baseCount); 01009 } 01010 01011 void GenomeSequence::getString(String &str, int chromosome, uint32_t index, int baseCount) const 01012 { 01013 std::string string; 01014 this-> getString(string, chromosome, index, baseCount); 01015 str = string.c_str(); 01016 } 01017 01018 void GenomeSequence::getString(std::string &str, genomeIndex_t index, int baseCount) const 01019 { 01020 str.clear(); 01021 if (baseCount > 0) 01022 { 01023 for (int i=0; i<baseCount; i++) 01024 { 01025 str.push_back((*this)[index + i]); 01026 } 01027 } 01028 else 01029 { 01030 // if caller passed negative basecount, give them 01031 // the read for the 3' end 01032 for (int i=0; i< -baseCount; i++) 01033 { 01034 str.push_back(BaseAsciiMap::base2complement[(int)(*this)[index + i]]); 01035 } 01036 } 01037 } 01038 01039 void GenomeSequence::getString(String &str, genomeIndex_t index, int baseCount) const 01040 { 01041 std::string string; 01042 getString(string, index, baseCount); 01043 str = string.c_str(); 01044 } 01045 01046 void GenomeSequence::getHighLightedString(std::string &str, genomeIndex_t index, int baseCount, genomeIndex_t highLightStart, genomeIndex_t highLightEnd) const 01047 { 01048 str.clear(); 01049 if (baseCount > 0) 01050 { 01051 for (int i=0; i<baseCount; i++) 01052 { 01053 char base = (*this)[index + i]; 01054 if (in(index+i, highLightStart, highLightEnd)) 01055 base = tolower(base); 01056 str.push_back(base); 01057 } 01058 } 01059 else 01060 { 01061 // if caller passed negative basecount, give them 01062 // the read for the 3' end 01063 for (int i=0; i< -baseCount; i++) 01064 { 01065 char base = BaseAsciiMap::base2complement[(int)(*this)[index + i]]; 01066 if (in(index+i, highLightStart, highLightEnd)) 01067 base = tolower(base); 01068 str.push_back(base); 01069 } 01070 } 01071 } 01072 01073 void GenomeSequence::print30(genomeIndex_t index) const 01074 { 01075 std::cout << "index: " << index << "\n"; 01076 for (genomeIndex_t i=index-30; i<index+30; i++) 01077 std::cout << (*this)[i]; 01078 std::cout << "\n"; 01079 for (genomeIndex_t i=index-30; i<index; i++) 01080 std::cout << " "; 01081 std::cout << "^"; 01082 std::cout << std::endl; 01083 } 01084 01085 void GenomeSequence::getMismatchHatString(std::string &result, const std::string &read, genomeIndex_t location) const 01086 { 01087 result = ""; 01088 for (uint32_t i=0; i < read.size(); i++) 01089 { 01090 if (read[i] == (*this)[location+i]) 01091 result.push_back(' '); 01092 else 01093 result.push_back('^'); 01094 } 01095 } 01096 01097 void GenomeSequence::getMismatchString(std::string &result, const std::string &read, genomeIndex_t location) const 01098 { 01099 result = ""; 01100 for (uint32_t i=0; i < read.size(); i++) 01101 { 01102 if (read[i] == (*this)[location+i]) 01103 result.push_back(toupper(read[i])); 01104 else 01105 result.push_back(tolower(read[i])); 01106 } 01107 } 01108 01109 genomeIndex_t GenomeSequence::simpleLocalAligner(std::string &read, std::string &quality, genomeIndex_t index, int windowSize) const 01110 { 01111 int bestScore = 1000000; // either mismatch count or sumQ 01112 genomeIndex_t bestMatchLocation = INVALID_GENOME_INDEX; 01113 for (int i=-windowSize; i<windowSize; i++) 01114 { 01115 int newScore; 01116 01117 if (i<0 && ((uint32_t) -i) > index) continue; 01118 if (index + i + read.size() >= getNumberBases()) continue; 01119 01120 if (quality=="") 01121 { 01122 newScore = this->getMismatchCount(read, index + i); 01123 } 01124 else 01125 { 01126 newScore = this->getSumQ(read, quality, index + i); 01127 } 01128 if (newScore < bestScore) 01129 { 01130 bestScore = newScore; 01131 bestMatchLocation = index + i; 01132 } 01133 } 01134 return bestMatchLocation; 01135 } 01136 01137 std::ostream &operator << (std::ostream &stream, genomeSequenceMmapHeader &h) 01138 { 01139 stream << (MemoryMapArrayHeader &) h; 01140 stream << "chromosomeCount: " << h._chromosomeCount << std::endl; 01141 stream << "isColorSpace: " << h._colorSpace << std::endl; 01142 stream << "chromosomeCount: " << h._chromosomeCount << std::endl; 01143 uint64_t totalSize = 0; 01144 for (uint32_t i=0; i < h._chromosomeCount; i++) 01145 { 01146 totalSize += h._chromosomes[i].size; 01147 stream << "Chromosome Index " << i << " name: " << h._chromosomes[i].name << std::endl; 01148 stream << "Chromosome Index " << i << " whole genome start: " << h._chromosomes[i].start << std::endl; 01149 stream << "Chromosome Index " << i << " whole genome size: " << h._chromosomes[i].size << std::endl; 01150 stream << "Chromosome Index " << i << " md5 checksum: " << h._chromosomes[i].md5 << std::endl; 01151 stream << "Chromosome Index " << i << " assemblyID: " << h._chromosomes[i].assemblyID << std::endl; 01152 stream << "Chromosome Index " << i << " species: " << h._chromosomes[i].species << std::endl; 01153 stream << "Chromosome Index " << i << " URI: " << h._chromosomes[i].uri << std::endl; 01154 } 01155 stream << "Total Genome Size: " << totalSize << " bases."<< std::endl; 01156 if (totalSize != h.elementCount) 01157 { 01158 stream << "Total Genome Size: does not match elementCount!\n"; 01159 } 01160 01161 stream << std::endl; 01162 return stream; 01163 } 01164 01165 void GenomeSequence::getChromosomeAndIndex(std::string &s, genomeIndex_t i) const 01166 { 01167 int whichChromosome = 0; 01168 01169 whichChromosome = getChromosome(i); 01170 01171 if (whichChromosome == INVALID_CHROMOSOME_INDEX) 01172 { 01173 s = "invalid genome index"; // TODO include the index in error 01174 } 01175 else 01176 { 01177 std::ostringstream buf; 01178 genomeIndex_t chromosomeIndex = i - getChromosomeStart(whichChromosome) + 1; 01179 buf << header->_chromosomes[whichChromosome].name << ":" << chromosomeIndex; 01180 #if 0 01181 buf << " (GenomeIndex " << i << ")"; 01182 #endif 01183 s = buf.str(); 01184 } 01185 return; 01186 } 01187 01188 01189 void GenomeSequence::getChromosomeAndIndex(String& s, genomeIndex_t i) const 01190 { 01191 std::string ss; 01192 getChromosomeAndIndex(ss, i); 01193 s = ss.c_str(); 01194 return; 01195 } 01196 01197 01198 // 01199 // This is intended to be a helper routine to get dbSNP files 01200 // loaded. In some cases, we will load into an mmap() file (ie 01201 // when we are creating it), in others, we will simply be loading 01202 // an existing dbSNP file into RAM (when the binary file does not 01203 // exist or when we are running with useMemoryMapFlag == false. 01204 // 01205 // Assume that dbSNP exists, is writable, and is the right size. 01206 // 01207 // Using the dbSNPFilename given, mark each dbSNP position 01208 // with a bool true. 01209 // 01210 // Return value: 01211 // True: if populateDBSNP() succeed 01212 // False: if not succeed 01213 bool GenomeSequence::populateDBSNP( 01214 mmapArrayBool_t &dbSNP, 01215 IFILE inputFile) const 01216 { 01217 assert(dbSNP.getElementCount() == getNumberBases()); 01218 01219 if(inputFile == NULL) 01220 { 01221 // FAIL, file not opened. 01222 return(false); 01223 } 01224 01225 std::string chromosomeName; 01226 std::string position; 01227 genomeIndex_t chromosomePosition1; // 1-based 01228 uint64_t ignoredLineCount = 0; 01229 01230 // Read til the end of the file. 01231 char* postPosPtr = NULL; 01232 while(!inputFile->ifeof()) 01233 { 01234 chromosomeName.clear(); 01235 position.clear(); 01236 // Read the chromosome 01237 if(inputFile->readTilTab(chromosomeName) <= 0) 01238 { 01239 // hit either eof or end of line, check if 01240 // it is a header. 01241 if(chromosomeName.size()>0 && chromosomeName[0]=='#') 01242 { 01243 // header, so just continue. 01244 continue; 01245 } 01246 // Not the header, so this line is poorly formatted. 01247 ++ignoredLineCount; 01248 // Continue to the next line. 01249 continue; 01250 } 01251 01252 // Check if it is a header line. 01253 if(chromosomeName.size()>0 && chromosomeName[0]=='#') 01254 { 01255 // did not hit eof or end of line, 01256 // so discard the rest of the line. 01257 inputFile->discardLine(); 01258 continue; 01259 } 01260 01261 // Not a header, so read the position. 01262 if(inputFile->readTilTab(position) > 0) 01263 { 01264 // Additional data on the line, so discard it. 01265 inputFile->discardLine(); 01266 } 01267 01268 // Convert the position to a string. 01269 chromosomePosition1 = strtoul(position.c_str(), &postPosPtr, 0); 01270 01271 01272 if(postPosPtr == position.c_str()) 01273 { 01274 ++ignoredLineCount; 01275 continue; 01276 } 01277 01278 // 1-based genome index. 01279 genomeIndex_t genomeIndex = 01280 getGenomePosition(chromosomeName.c_str(), chromosomePosition1); 01281 01282 // if the genome index is invalid, ignore it 01283 if((genomeIndex == INVALID_GENOME_INDEX) || 01284 (genomeIndex > getNumberBases())) 01285 { 01286 ignoredLineCount++; 01287 continue; 01288 } 01289 01290 dbSNP.set(genomeIndex, true); 01291 } 01292 01293 if (ignoredLineCount > 0) 01294 { 01295 std::cerr << "GenomeSequence::populateDBSNP: ignored " << ignoredLineCount << " SNP positions due to invalid format of line." << std::endl; 01296 return false; 01297 } 01298 return true; 01299 } 01300 01301 bool GenomeSequence::loadDBSNP( 01302 mmapArrayBool_t &dbSNP, 01303 const char *inputFileName) const 01304 { 01305 // 01306 // the goal in this section of code is to allow the user 01307 // to either specify a valid binary version of the SNP file, 01308 // or the original text file that it gets created from. 01309 // 01310 // To do this, we basically open, sniff the error message, 01311 // and if it claims it is not a binary version of the file, 01312 // we go ahead and treat it as the text file and use the 01313 // GenomeSequence::populateDBSNP method to load it. 01314 // 01315 // Further checking is really needed to ensure users don't 01316 // mix a dbSNP file for a different reference, since it is really 01317 // easy to do. 01318 // 01319 if (strlen(inputFileName)!=0) 01320 { 01321 std::cerr << "Load dbSNP file '" << inputFileName << "': " << std::flush; 01322 01323 if (dbSNP.open(inputFileName, O_RDONLY)) 01324 { 01325 // 01326 // failed to open, possibly due to bad magic. 01327 // 01328 // this is really awful ... need to have a return 01329 // code that is smart enough to avoid this ugliness: 01330 // 01331 if (dbSNP.getErrorString().find("wrong type of file")==std::string::npos) 01332 { 01333 std::cerr << "Error: " << dbSNP.getErrorString() << std::endl; 01334 exit(1); 01335 } 01336 // 01337 // we have a file, assume we can load it as a text file 01338 // 01339 IFILE inputFile = ifopen(inputFileName, "r"); 01340 if(inputFile == NULL) 01341 { 01342 std::cerr << "Error: failed to open " << inputFileName << std::endl; 01343 exit(1); 01344 } 01345 01346 std::cerr << "(as text file) "; 01347 01348 // anonymously (RAM resident only) create: 01349 dbSNP.create(getNumberBases()); 01350 01351 // now load it into RAM 01352 populateDBSNP(dbSNP, inputFile); 01353 ifclose(inputFile); 01354 01355 } 01356 else 01357 { 01358 std::cerr << "(as binary mapped file) "; 01359 } 01360 01361 std::cerr << "DONE!" << std::endl; 01362 return false; 01363 } 01364 else 01365 { 01366 return true; 01367 } 01368 } 01369 01370 01371 #if defined(TEST) 01372 01373 void simplestExample(void) 01374 { 01375 GenomeSequence reference; 01376 genomeIndex_t index; 01377 01378 // a particular reference is set by: 01379 // reference.setFastaName("/usr/cluster/share/karma/human_g1k_v37_12CS.fa") 01380 // 01381 // In the above example, the suffix .fa is stripped and replaced with .umfa, 01382 // which contains the actual file being opened. 01383 // 01384 if (reference.open()) 01385 { 01386 perror("GenomeSequence::open"); 01387 exit(1); 01388 } 01389 01390 01391 index = 1000000000; // 10^9 01392 // 01393 // Write the base at the given index. Here, index is 0 based, 01394 // and is across the whole genome, as all chromosomes are sequentially 01395 // concatenated, so the allowed range is 01396 // 01397 // 0.. (reference.getChromosomeStart(last) + reference.getChromosomeSize(last)) 01398 // 01399 // (where int last = reference.getChromosomeCount() - 1;) 01400 // 01401 std::cout << "base[" << index << "] = " << reference[index] << std::endl; 01402 01403 // 01404 // Example for finding chromosome and one based chromosome position given 01405 // and absolute position on the genome in 'index': 01406 // 01407 int chr = reference.getChromosome(index); 01408 genomeIndex_t chrIndex = index - reference.getChromosomeStart(chr) + 1; // 1-based 01409 01410 std::cout << "genome index " << index << " corresponds to chromosome " << chr << " position " << chrIndex << std::endl; 01411 01412 // 01413 // Example for finding an absolute genome index position when the 01414 // chromosome name and one based position are known: 01415 // 01416 const char *chromosomeName = "5"; 01417 chr = reference.getChromosome(chromosomeName); // 0-based 01418 chrIndex = 100000; // 1-based 01419 01420 index = reference.getChromosomeStart(chr) + chrIndex - 1; 01421 01422 std::cout << "Chromosome '" << chromosomeName << "' position " << chrIndex << " corresponds to genome index position " << index << std::endl; 01423 01424 reference.close(); 01425 } 01426 01427 void testGenomeSequence(void) 01428 { 01429 GenomeSequence reference; 01430 01431 #if 0 01432 std::string referenceName = "someotherreference"; 01433 if (reference.setFastaName(referenceName)) 01434 { 01435 std::cerr << "failed to open reference file " 01436 << referenceName 01437 << std::endl; 01438 exit(1); 01439 } 01440 #endif 01441 01442 std::cerr << "open and prefetch the reference genome: "; 01443 01444 // open it 01445 if (reference.open()) 01446 { 01447 exit(1); 01448 } 01449 std::cerr << "done!" << std::endl; 01450 01451 // 01452 // For the human genome, genomeIndex ranges from 0 to 3.2x10^9 01453 // 01454 genomeIndex_t genomeIndex; // 0 based 01455 unsigned int chromosomeIndex; // 1 based 01456 unsigned int chromosome; // 0..23 or so 01457 std::string chromosomeName; 01458 01459 // 01460 // Here we'll start with a chromosome name, then obtain the genome 01461 // index, and use it to find the base we want: 01462 // 01463 chromosomeName = "2"; 01464 chromosomeIndex = 1234567; 01465 // this call is slow (string search for chromsomeName): 01466 genomeIndex = reference.getGenomePosition(chromosomeName.c_str(), chromosomeIndex); 01467 assert(genomeIndex!=INVALID_GENOME_INDEX); 01468 std::cout << "Chromosome " << chromosomeName << ", index "; 01469 std::cout << chromosomeIndex << " contains base " << reference[genomeIndex]; 01470 std::cout << " at genome index position " << genomeIndex << std::endl; 01471 01472 // 01473 // now reverse it - given a genomeIndex from above, find the chromosome 01474 // name and index: 01475 // 01476 01477 // slow (binary search on genomeIndex): 01478 chromosome = reference.getChromosome(genomeIndex); 01479 unsigned int newChromosomeIndex; 01480 // not slow: 01481 newChromosomeIndex = genomeIndex - reference.getChromosomeStart(chromosome) + 1; 01482 01483 assert(chromosomeIndex == newChromosomeIndex); 01484 01485 // more testing... at least test and use PackedRead: 01486 // 01487 PackedRead pr; 01488 01489 pr.set("ATCGATCG", 0); 01490 assert(pr.size()==8); 01491 assert(pr[0]==BaseAsciiMap::base2int[(int) 'A']); 01492 assert(pr[1]==BaseAsciiMap::base2int[(int) 'T']); 01493 assert(pr[2]==BaseAsciiMap::base2int[(int) 'C']); 01494 assert(pr[3]==BaseAsciiMap::base2int[(int) 'G']); 01495 pr.set("ATCGATCG", 1); 01496 assert(pr.size()==9); 01497 pr.set("", 0); 01498 assert(pr.size()==0); 01499 pr.set("", 1); 01500 assert(pr.size()==1); 01501 pr.set("", 2); 01502 assert(pr.size()==2); 01503 pr.set("", 3); 01504 assert(pr.size()==3); 01505 assert(pr[0]==BaseAsciiMap::base2int[(int) 'N']); 01506 assert(pr[1]==BaseAsciiMap::base2int[(int) 'N']); 01507 assert(pr[2]==BaseAsciiMap::base2int[(int) 'N']); 01508 pr.set("C", 1); 01509 assert(pr.size()==2); 01510 assert(pr[0]==BaseAsciiMap::base2int[(int) 'N']); 01511 assert(pr[1]==BaseAsciiMap::base2int[(int) 'C']); 01512 01513 } 01514 01515 // 01516 // After I build libcsg, I compile and run this test code using: 01517 // 01518 // g++ -DTEST -o try GenomeSequence.cpp -L. -lcsg -lm -lz -lssl 01519 // you also may need -fno-rtti 01520 // ./try 01521 // 01522 int main(int argc, const char **argv) 01523 { 01524 #if 1 01525 simplestExample(); 01526 #else 01527 testGenomeSequence(); 01528 #endif 01529 exit(0); 01530 } 01531 01532 #endif