00001 /* 00002 * Copyright (C) 2010 Regents of the University of Michigan 00003 * 00004 * This program is free software: you can redistribute it and/or modify 00005 * it under the terms of the GNU General Public License as published by 00006 * the Free Software Foundation, either version 3 of the License, or 00007 * (at your option) any later version. 00008 * 00009 * This program is distributed in the hope that it will be useful, 00010 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 * GNU General Public License for more details. 00013 * 00014 * You should have received a copy of the GNU General Public License 00015 * along with this program. If not, see <http://www.gnu.org/licenses/>. 00016 */ 00017 00018 #ifndef _GENOME_SEQUENCE_H 00019 #define _GENOME_SEQUENCE_H 00020 00021 #include <sys/types.h> 00022 #include <sys/stat.h> 00023 #if !defined(MD5_DIGEST_LENGTH) 00024 #define MD5_DIGEST_LENGTH 16 00025 #endif 00026 #include <string> 00027 #include "MemoryMapArray.h" 00028 #include "BaseAsciiMap.h" 00029 00030 // Goncalo's String class 00031 #include "StringArray.h" 00032 00033 #include <stdint.h> 00034 00035 // stdint.h will define this, but only if __STDC_LIMIT_MACROS was 00036 // defined prior to the first include of stdint.h 00037 #ifndef UINT32_MAX 00038 #define UINT32_MAX 0xFFFFFFFF 00039 #endif 00040 00041 typedef uint32_t genomeIndex_t; 00042 #define INVALID_GENOME_INDEX UINT32_MAX 00043 00044 // chromosome index is just a signed int, so this is ok here: 00045 #define INVALID_CHROMOSOME_INDEX -1 00046 00047 #include "GenomeSequenceHelpers.h" 00048 00049 #define UMFA_COOKIE 0x1b7933a1 // unique cookie id 00050 #define UMFA_VERSION 20100401U // YYYYMMDD of last change to file layout 00051 00052 typedef MemoryMapArray< 00053 uint32_t, 00054 genomeIndex_t, 00055 UMFA_COOKIE, 00056 UMFA_VERSION, 00057 PackedAccess_4Bit, 00058 PackedAssign_4Bit, 00059 Packed4BitElementCount2Bytes, 00060 genomeSequenceMmapHeader 00061 > genomeSequenceArray; 00062 00063 00064 // std::string &operator = (std::string &lhs, const PackedRead &rhs); 00065 // 00066 00067 //! Create/Access/Modify/Load Genome Sequences stored as binary mapped files. 00068 /*! 00069 GenomeSequence is designed to be a high performance shared access 00070 reference object. 00071 00072 It is implemented as a MemoryMapArray template object with unsigned 8 00073 bit ints, each of which stores two bases. Although 2 bits could be used, 00074 most references have more than four symbols (usually at least including 00075 'N', indicating an unknown or masked out base). 00076 00077 Normal use of this class follows these steps: 00078 -# create the reference 00079 -# instantiate the GenomeSequence class object 00080 -# create the actual file (memory mapped) that is to hold the data 00081 -# populate the data using GenomeSequence::set 00082 -# use the reference 00083 -# use the reference by instantiating a GenomeSequence object 00084 -# either use the constructor with the reference filename 00085 -# or use GenomeSequence::setReferenceName() followed by ::open 00086 -# access the bases via the overloaded array operator [] 00087 -# check sequence length by using GenomeSequence::getNumberBases() 00088 -# accessing chromosomes in the reference 00089 -# you typically will need to know about the chromosomes in the sequence 00090 -# see methods and docs with prefix 'getChromosome' 00091 00092 Sharing is accomplished using the mmap() function via the MemoryMap 00093 base class. This allows a potentially large genome reference to be 00094 shared among a number of simultaneously executing instances of one or 00095 more programs sharing the same reference. 00096 */ 00097 00098 00099 class GenomeSequence : public genomeSequenceArray 00100 { 00101 private: 00102 int _debugFlag; 00103 00104 std::ostream *_progressStream; 00105 bool _colorSpace; 00106 00107 // Whether or not to overwrite an existing file when creating a umfa file (via create). 00108 bool _createOverwrite; 00109 00110 std::string _baseFilename; // for later use by WordIndex create and open 00111 std::string _referenceFilename; 00112 std::string _fastaFilename; 00113 std::string _umfaFilename; 00114 std::string _application; // only used in ::create() 00115 00116 MemoryMap _umfaFile; 00117 00118 void setup(const char *referenceFilename); 00119 00120 public: 00121 /// Simple constructor - no implicit file open 00122 GenomeSequence(); 00123 void constructorClear(); 00124 /// \brief attempt to open an existing sequence 00125 /// 00126 /// \param referenceFilename the name of the reference fasta file to open 00127 /// \param debug if true, additional debug information is printed 00128 GenomeSequence(std::string &referenceFilename) 00129 { 00130 constructorClear(); 00131 setup(referenceFilename.c_str()); 00132 } 00133 00134 /// Smarter constructor - attempt to open an existing sequence 00135 /// 00136 /// \param referenceFilename the name of the reference fasta file to open 00137 /// \param debug if true, additional debug information is printed 00138 GenomeSequence(const char *referenceFilename) 00139 { 00140 constructorClear(); 00141 setup(referenceFilename); 00142 } 00143 00144 /// Close the file if open and destroy the object 00145 ~GenomeSequence(); 00146 00147 /// open the reference specified using GenomeSequence::setReferenceName 00148 /// 00149 /// \param isColorSpace open the color space reference 00150 /// \param flags pass through to the ::open() call (O_RDWR lets you modify the contents) 00151 /// \return false for success, true otherwise 00152 bool open(bool isColorSpace = false, int flags = O_RDONLY); 00153 00154 /// open the given file as the genome (no filename munging occurs). 00155 /// 00156 /// \param filename the name of the file to open 00157 /// \param flags pass through to the ::open() call (O_RDWR lets you modify the contents) 00158 /// \return false for success, true otherwise 00159 bool open(const char *filename, int flags = O_RDONLY) 00160 { 00161 _umfaFilename = filename; 00162 #if 0 00163 if(genomeSequenceArray::open(filename, flags)) { 00164 // 00165 // if load of the umfa (binary encoded file) fails, attempt 00166 // to load it as a fasta file. 00167 // 00168 return loadFasta(); 00169 } 00170 #endif 00171 return false; 00172 } 00173 00174 private: 00175 bool _searchCommonFileSuffix; 00176 public: 00177 bool create(bool isColor = false); 00178 00179 // NEW API? 00180 00181 // load time modifiers: 00182 /// if set, then show progress when creating and pre-fetching 00183 void setProgressStream(std::ostream &progressStream) {_progressStream = &progressStream;} 00184 void setColorSpace(bool colorSpace) {_colorSpace = colorSpace; } 00185 void setSearchCommonFileSuffix(bool searchCommonFileSuffix) {_searchCommonFileSuffix = searchCommonFileSuffix;} 00186 00187 // Set whether or not to overwrite a umfa file when calling create. 00188 void setCreateOverwrite(bool createOverwrite) {_createOverwrite = createOverwrite;} 00189 00190 bool loadFastaData(const char *filename); 00191 00192 /// set the reference name that will be used in open() 00193 /// \param referenceFilename the name of the reference fasta file to open 00194 /// \return false for success, true otherwise 00195 /// 00196 /// \sa open() 00197 bool setReferenceName(std::string referenceFilename); 00198 00199 /// set the application name in the binary file header 00200 /// 00201 /// \param application name of the application 00202 void setApplication(std::string application) 00203 { 00204 _application = application; // used in ::create() to set application name 00205 } 00206 const std::string &getFastaName() const 00207 { 00208 return _fastaFilename; 00209 } 00210 const std::string &getReferenceName() const 00211 { 00212 return _referenceFilename; 00213 } 00214 00215 /// tell us if we are a color space reference or not 00216 /// \return true if colorspace, false otherwise 00217 bool isColorSpace() const 00218 { 00219 return _colorSpace; 00220 } 00221 00222 /// return the number of bases represented in this reference 00223 /// \return count of bases 00224 genomeIndex_t getNumberBases() const 00225 { 00226 return getElementCount(); 00227 } 00228 00229 /// given a whole genome index, get the chromosome it is located in 00230 /// 00231 /// This is done via a binary search of the chromosome table in the 00232 /// header of the mapped file, so it is O(log(N)) 00233 /// 00234 /// \param 0-based position the base in the genome 00235 /// \return 0-based index into chromosome table - INVALID_CHROMOSOME_INDEX if error 00236 int getChromosome(genomeIndex_t position) const; 00237 00238 /// given a chromosome name, return the chromosome index 00239 /// 00240 /// This is done via a linear search of the chromosome table in the 00241 /// header of the mapped file, so it is O(N) 00242 /// 00243 /// \param chromosomeName the name of the chromosome - exact match only 00244 /// \return 0-based index into chromosome table - INVALID_CHROMOSOME_INDEX if error 00245 int getChromosome(const char *chromosomeName) const; 00246 /// Return the number of chromosomes in the genome 00247 /// \return number of chromosomes in the genome 00248 int getChromosomeCount() const; 00249 00250 /// given a chromosome, return the genome base it starts in 00251 /// 00252 /// \param 0-based chromosome index 00253 /// \return 0-based genome index of the base that starts the chromosome 00254 genomeIndex_t getChromosomeStart(int chromosomeIndex) const 00255 { 00256 if (chromosomeIndex==INVALID_CHROMOSOME_INDEX) return INVALID_GENOME_INDEX; 00257 return header->_chromosomes[chromosomeIndex].start; 00258 } 00259 00260 /// given a chromosome, return its size in bases 00261 /// 00262 /// \param 0-based chromosome index 00263 /// \return size of the chromosome in bases 00264 genomeIndex_t getChromosomeSize(int chromosomeIndex) const 00265 { 00266 if (chromosomeIndex==INVALID_CHROMOSOME_INDEX) return 0; 00267 return header->_chromosomes[chromosomeIndex].size; 00268 } 00269 00270 /// given a chromosome name and position, return the genome position 00271 /// 00272 /// \param chromosomeName name of the chromosome - exact match only 00273 /// \param chromosomeIndex 1-based chromosome position 00274 /// \return genome index of the above chromosome position 00275 genomeIndex_t getGenomePosition( 00276 const char *chromosomeName, 00277 unsigned int chromosomeIndex) const; 00278 00279 /// given a chromosome index and position, return the genome position 00280 /// 00281 /// \param chromosome index of the chromosome 00282 /// \param chromosomeIndex 1-based chromosome position 00283 /// \return genome index of the above chromosome position 00284 genomeIndex_t getGenomePosition( 00285 int chromosome, 00286 unsigned int chromosomeIndex) const; 00287 00288 /// given the chromosome name, get the corresponding 0 based genome index 00289 /// for the start of that chromosome 00290 genomeIndex_t getGenomePosition(const char *chromosomeName) const; 00291 genomeIndex_t getGenomePosition(int chromosomeIndex) const; 00292 00293 const std::string &getBaseFilename() const 00294 { 00295 return _baseFilename; 00296 } 00297 00298 const char *getChromosomeName(int chromosomeIndex) const 00299 { 00300 return header->_chromosomes[chromosomeIndex].name; 00301 } 00302 00303 void setDebugFlag(bool d) 00304 { 00305 _debugFlag = d; 00306 } 00307 00308 genomeIndex_t sequenceLength() const 00309 { 00310 return (genomeIndex_t) header->elementCount; 00311 } 00312 00313 const char *chromosomeName(int chr) const 00314 { 00315 return header->_chromosomes[chr].name; 00316 } 00317 00318 void sanityCheck(MemoryMap &fasta) const; 00319 00320 // TODO - this will be moved somewhere else and be made a static method. 00321 std::string IntegerToSeq(unsigned int n, unsigned int wordsize) const; 00322 00323 bool wordMatch(unsigned int index, std::string &word) const; 00324 bool printNearbyWords(unsigned int index, unsigned int variance, std::string &word) const; 00325 00326 // TODO - this will be moved somewhere else and be made a static method. 00327 char BasePair(char c) const 00328 { 00329 return BaseAsciiMap::base2complement[(int) c]; 00330 } 00331 00332 void dumpSequenceSAMDictionary(std::ostream&) const; 00333 void dumpHeaderTSV(std::ostream&) const; 00334 00335 /// 00336 /// Return the bases in base space or color space for within range index, ot 00337 /// @param index the array-like index (0 based). 00338 /// @return ACTGN in base space; 0123N for color space; and 'N' for invalid. 00339 /// For color space, index i represents the transition of base at position (i-1) to base at position i 00340 /// 00341 /// NB: bounds checking here needs to be deprecated - do not assume it 00342 /// will exist - the call must clip reads so that this routine is never 00343 /// called with a index value larger than the genome. 00344 /// 00345 /// The reason for this is simply that this routine gets called hundreds 00346 /// of billions of time in one run of karma, which will absolutely kill 00347 /// performance. Every single instruction here matters a great, great deal. 00348 /// 00349 00350 // 00351 // 3.5% improvement for color space matching: 00352 // I guess the compiler isn't inlining 3 functions deep. 00353 // 00354 00355 #if 0 00356 // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 00357 // The following code does not work even in the base space, 00358 // since the memory layout has changed. USE IT WITH CAUTIOUS! 00359 // 00360 // This block of code is a functional duplicate of the following 00361 // code - leave this here for reference and possibly later 00362 // performance testing as well as compiler evaluation. 00363 inline char operator[](genomeIndex_t index) const 00364 { 00365 return BaseAsciiMap::int2base[(*((genomeSequenceArray*) this))[index]]; 00366 } 00367 #endif 00368 00369 inline char operator[](genomeIndex_t index) const 00370 { 00371 uint8_t val; 00372 if (index < getNumberBases()) 00373 { 00374 if ((index&1)==0) 00375 { 00376 val = ((uint8_t *) data)[index>>1] & 0xf; 00377 } 00378 else 00379 { 00380 val = (((uint8_t *) data)[index>>1] & 0xf0) >> 4; 00381 } 00382 } 00383 else 00384 { 00385 val = BaseAsciiMap::baseNIndex; 00386 } 00387 val = isColorSpace() ? BaseAsciiMap::int2colorSpace[val] : 00388 BaseAsciiMap::int2base[val]; 00389 return val; 00390 } 00391 00392 /// given a chromosome name and 1-based position, return the reference base. 00393 /// \param chromosomeName name of the chromosome - exact match only 00394 /// \param chromosomeIndex 1-based chromosome position 00395 /// \return reference base at the above chromosome position 00396 inline char getBase(const char *chromosomeName, 00397 unsigned int chromosomeIndex) const 00398 { 00399 genomeIndex_t index = 00400 getGenomePosition(chromosomeName, chromosomeIndex); 00401 if(index == INVALID_GENOME_INDEX) 00402 { 00403 // Invalid position, so return 'N' 00404 return('N'); 00405 } 00406 return((*this)[index]); 00407 } 00408 00409 00410 inline uint8_t getInteger(genomeIndex_t index) const 00411 { 00412 return (*((genomeSequenceArray*) this))[index]; 00413 } 00414 00415 inline void set(genomeIndex_t index, char value) 00416 { 00417 genomeSequenceArray::set(index, 00418 BaseAsciiMap::base2int[(uint8_t) value]); 00419 } 00420 00421 /// obtain the pointer to the raw data for other access methods 00422 /// 00423 /// this is a fairly ugly hack to reach into the 00424 /// raw genome vector, get the byte that encodes 00425 /// two bases, and return it. This is used by 00426 /// karma ReadIndexer::getSumQ to compare genome 00427 /// matchines by byte (two bases at a time) to speed 00428 /// it up. 00429 /// 00430 uint8_t *getDataPtr(genomeIndex_t index) 00431 { 00432 return ((uint8_t *) data + index/2); 00433 } 00434 private: 00435 /// 00436 /// when creating the genome mapped file, we call this to set 00437 /// the MD5 checksum of the chromosome sequence and length so that 00438 /// we can write the SAM SQ headers properly. 00439 /// NB: operates on the last fully loaded chromosome. 00440 bool setChromosomeMD5andLength(uint32_t whichChromosome); 00441 00442 public: 00443 00444 // TODO - this will be moved somewhere else and be made a static method. 00445 // replace read with the reversed one 00446 void getReverseRead(std::string &read); 00447 00448 // TODO - this will be moved somewhere else and be made a static method. 00449 void getReverseRead(String& read); 00450 00451 // debug the given read - print nice results 00452 int debugPrintReadValidation( 00453 std::string &read, 00454 std::string &quality, 00455 char direction, 00456 genomeIndex_t readLocation, 00457 int sumQuality, 00458 int mismatchCount, 00459 bool recurse = true 00460 ); 00461 00462 // 00463 // get the sequence from this GenomeSequence using the specified chromosome and 0-based position. 00464 // if baseCount < 0, get the reverse complement 00465 // that starts at index (but do not reverse the string?) 00466 void getString(std::string &str, int chromosome, uint32_t index, int baseCount) const; 00467 void getString(String &str, int chromosome, uint32_t index, int baseCount) const; 00468 // 00469 // get the sequence from this GenomeSequence. 00470 // if baseCount < 0, get the reverse complement 00471 // that starts at index (but do not reverse the string?) 00472 // 00473 void getString(std::string &str, genomeIndex_t index, int baseCount) const; 00474 void getString(String &str, genomeIndex_t index, int baseCount) const; 00475 00476 void getHighLightedString(std::string &str, genomeIndex_t index, int baseCount, genomeIndex_t highLightStart, genomeIndex_t highLightEnd) const; 00477 00478 void print30(genomeIndex_t) const; 00479 00480 // for debugging, not for speed: 00481 genomeIndex_t simpleLocalAligner(std::string &read, std::string &quality, genomeIndex_t index, int windowSize) const; 00482 00483 00484 // TODO - these methods do not handle a CIGAR string and do not handle '=' when a read matches the reference. 00485 // They are here for alignment and should be moved to the aligner (karma). 00486 // OR they should optionally take a CIGAR and use that if specified.... 00487 // maybe they should be helper methods that are found somewhere else 00488 00489 /// Return the mismatch count, disregarding CIGAR strings 00490 /// 00491 /// \param read is the sequence we're counting mismatches in 00492 /// \param location is where in the genmoe we start comparing 00493 /// \param exclude is a wildcard character (e.g. '.' or 'N') 00494 /// 00495 /// \return number of bases that don't match the reference, except those that match exclude 00496 int getMismatchCount(std::string &read, genomeIndex_t location, char exclude='\0') const 00497 { 00498 int mismatchCount = 0; 00499 for (uint32_t i=0; i<read.size(); i++) 00500 if (read[i]!=exclude) mismatchCount += read[i]!=(*this)[location + i]; 00501 return mismatchCount; 00502 }; 00503 00504 /// brute force sumQ - no sanity checking 00505 /// 00506 /// \param read shotgun sequencer read string 00507 /// \param qualities phred quality string of same length 00508 /// \param location the alignment location to check sumQ 00509 int getSumQ(std::string &read, std::string &qualities, genomeIndex_t location) const 00510 { 00511 int sumQ = 0; 00512 for (uint32_t i=0; i<read.size(); i++) 00513 sumQ += (read[i]!=(*this)[location + i] ? (qualities[i]-33) : 0); 00514 return sumQ; 00515 }; 00516 // return a string highlighting mismatch postions with '^' chars: 00517 void getMismatchHatString(std::string &result, const std::string &read, genomeIndex_t location) const; 00518 void getMismatchString(std::string &result, const std::string &read, genomeIndex_t location) const; 00519 00520 // END TODO 00521 00522 void getChromosomeAndIndex(std::string &, genomeIndex_t) const; 00523 void getChromosomeAndIndex(String &, genomeIndex_t) const; 00524 00525 /// check a SAM format read, using phred quality scores and 00526 /// the CIGAR string to determine if it is correct. 00527 /// 00528 /// 00529 /// \param read the read in base space 00530 /// \param qualities the phred encoded qualities (Sanger, not Illumina) 00531 /// \param cigar the SAM file CIGAR column 00532 /// \param sumQ if >0 on entry, is checked against the computed sumQ 00533 /// \param insertions count of insertions found in 00534 /// 00535 /// 00536 bool checkRead( 00537 std::string &read, 00538 std::string &qualities, 00539 std::string &cigar, 00540 int &sumQ, // input and output 00541 int &gapOpenCount, // output only 00542 int &gapExtendCount, // output only 00543 int &gapDeleteCount, // output only 00544 std::string &result 00545 ) const; 00546 00547 bool populateDBSNP(mmapArrayBool_t &dbSNP, IFILE inputFile) const; 00548 00549 /// user friendly dbSNP loader. 00550 /// 00551 /// \param inputFileName may be empty, point to a text file or a dbSNP vector file 00552 /// 00553 /// In all cases, dbSNP is returned the same length as this genome. 00554 /// 00555 /// When no SNPs are loaded, all values are false. 00556 /// 00557 /// When a text file is given, the file is parsed with two space 00558 /// separated columns - the first column is the chromosome name, and 00559 /// the second is the 1-based chromosome position of the SNP. 00560 /// 00561 /// \return false if a dbSNP file was correctly loaded, true otherwise 00562 /// 00563 bool loadDBSNP(mmapArrayBool_t &dbSNP, const char *inputFileName) const; 00564 }; 00565 00566 #endif