libStatGen Software
1
|
00001 /* 00002 * Copyright (C) 2010 Regents of the University of Michigan 00003 * 00004 * This program is free software: you can redistribute it and/or modify 00005 * it under the terms of the GNU General Public License as published by 00006 * the Free Software Foundation, either version 3 of the License, or 00007 * (at your option) any later version. 00008 * 00009 * This program is distributed in the hope that it will be useful, 00010 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 * GNU General Public License for more details. 00013 * 00014 * You should have received a copy of the GNU General Public License 00015 * along with this program. If not, see <http://www.gnu.org/licenses/>. 00016 */ 00017 00018 #ifndef _GENOME_SEQUENCE_H 00019 #define _GENOME_SEQUENCE_H 00020 00021 #include <sys/types.h> 00022 #include <sys/stat.h> 00023 #if !defined(MD5_DIGEST_LENGTH) 00024 #define MD5_DIGEST_LENGTH 16 00025 #endif 00026 #include <string> 00027 #include "MemoryMapArray.h" 00028 #include "BaseAsciiMap.h" 00029 00030 // Goncalo's String class 00031 #include "StringArray.h" 00032 00033 #include <stdint.h> 00034 00035 // stdint.h will define this, but only if __STDC_LIMIT_MACROS was 00036 // defined prior to the first include of stdint.h 00037 #ifndef UINT32_MAX 00038 #define UINT32_MAX 0xFFFFFFFF 00039 #endif 00040 00041 typedef uint32_t genomeIndex_t; 00042 #define INVALID_GENOME_INDEX UINT32_MAX 00043 00044 // chromosome index is just a signed int, so this is ok here: 00045 #define INVALID_CHROMOSOME_INDEX -1 00046 00047 #include "GenomeSequenceHelpers.h" 00048 00049 #define UMFA_COOKIE 0x1b7933a1 // unique cookie id 00050 #define UMFA_VERSION 20100401U // YYYYMMDD of last change to file layout 00051 00052 typedef MemoryMapArray< 00053 uint32_t, 00054 genomeIndex_t, 00055 UMFA_COOKIE, 00056 UMFA_VERSION, 00057 PackedAccess_4Bit, 00058 PackedAssign_4Bit, 00059 Packed4BitElementCount2Bytes, 00060 genomeSequenceMmapHeader 00061 > genomeSequenceArray; 00062 00063 00064 // std::string &operator = (std::string &lhs, const PackedRead &rhs); 00065 // 00066 00067 //! Create/Access/Modify/Load Genome Sequences stored as binary mapped files. 00068 /*! 00069 GenomeSequence is designed to be a high performance shared access 00070 reference object. 00071 00072 It is implemented as a MemoryMapArray template object with unsigned 8 00073 bit ints, each of which stores two bases. Although 2 bits could be used, 00074 most references have more than four symbols (usually at least including 00075 'N', indicating an unknown or masked out base). 00076 00077 Normal use of this class follows these steps: 00078 -# create the reference 00079 -# instantiate the GenomeSequence class object 00080 -# create the actual file (memory mapped) that is to hold the data 00081 -# populate the data using GenomeSequence::set 00082 -# use the reference 00083 -# use the reference by instantiating a GenomeSequence object 00084 -# either use the constructor with the reference filename 00085 -# or use GenomeSequence::setReferenceName() followed by ::open 00086 -# access the bases via the overloaded array operator [] 00087 -# check sequence length by using GenomeSequence::getNumberBases() 00088 -# accessing chromosomes in the reference 00089 -# you typically will need to know about the chromosomes in the sequence 00090 -# see methods and docs with prefix 'getChromosome' 00091 00092 Sharing is accomplished using the mmap() function via the MemoryMap 00093 base class. This allows a potentially large genome reference to be 00094 shared among a number of simultaneously executing instances of one or 00095 more programs sharing the same reference. 00096 */ 00097 00098 00099 class GenomeSequence : public genomeSequenceArray 00100 { 00101 private: 00102 int _debugFlag; 00103 00104 std::ostream *_progressStream; 00105 bool _colorSpace; 00106 00107 // Whether or not to overwrite an existing file when creating a umfa file (via create). 00108 bool _createOverwrite; 00109 00110 std::string _baseFilename; // for later use by WordIndex create and open 00111 std::string _referenceFilename; 00112 std::string _fastaFilename; 00113 std::string _umfaFilename; 00114 std::string _application; // only used in ::create() 00115 00116 MemoryMap _umfaFile; 00117 00118 void setup(const char *referenceFilename); 00119 00120 public: 00121 /// Simple constructor - no implicit file open 00122 GenomeSequence(); 00123 void constructorClear(); 00124 /// \brief attempt to open an existing sequence 00125 /// 00126 /// \param referenceFilename the name of the reference fasta file to open 00127 /// \param debug if true, additional debug information is printed 00128 GenomeSequence(std::string &referenceFilename) 00129 { 00130 constructorClear(); 00131 setup(referenceFilename.c_str()); 00132 } 00133 00134 /// Smarter constructor - attempt to open an existing sequence 00135 /// 00136 /// \param referenceFilename the name of the reference fasta file to open 00137 /// \param debug if true, additional debug information is printed 00138 GenomeSequence(const char *referenceFilename) 00139 { 00140 constructorClear(); 00141 setup(referenceFilename); 00142 } 00143 00144 /// Close the file if open and destroy the object 00145 ~GenomeSequence(); 00146 00147 /// open the reference specified using GenomeSequence::setReferenceName 00148 /// 00149 /// \param isColorSpace open the color space reference 00150 /// \param flags pass through to the ::open() call (O_RDWR lets you modify the contents) 00151 /// \return false for success, true otherwise 00152 bool open(bool isColorSpace = false, int flags = O_RDONLY); 00153 00154 /// open the given file as the genome (no filename munging occurs). 00155 /// 00156 /// \param filename the name of the file to open 00157 /// \param flags pass through to the ::open() call (O_RDWR lets you modify the contents) 00158 /// \return false for success, true otherwise 00159 bool open(const char *filename, int flags = O_RDONLY) 00160 { 00161 _umfaFilename = filename; 00162 // TODO - should this method be doing something??? 00163 return false; 00164 } 00165 00166 private: 00167 bool _searchCommonFileSuffix; 00168 public: 00169 bool create(bool isColor = false); 00170 00171 // NEW API? 00172 00173 // load time modifiers: 00174 /// if set, then show progress when creating and pre-fetching 00175 void setProgressStream(std::ostream &progressStream) {_progressStream = &progressStream;} 00176 void setColorSpace(bool colorSpace) {_colorSpace = colorSpace; } 00177 void setSearchCommonFileSuffix(bool searchCommonFileSuffix) {_searchCommonFileSuffix = searchCommonFileSuffix;} 00178 00179 // Set whether or not to overwrite a umfa file when calling create. 00180 void setCreateOverwrite(bool createOverwrite) {_createOverwrite = createOverwrite;} 00181 00182 bool loadFastaData(const char *filename); 00183 00184 /// set the reference name that will be used in open() 00185 /// \param referenceFilename the name of the reference fasta file to open 00186 /// \return false for success, true otherwise 00187 /// 00188 /// \sa open() 00189 bool setReferenceName(std::string referenceFilename); 00190 00191 /// set the application name in the binary file header 00192 /// 00193 /// \param application name of the application 00194 void setApplication(std::string application) 00195 { 00196 _application = application; // used in ::create() to set application name 00197 } 00198 const std::string &getFastaName() const 00199 { 00200 return _fastaFilename; 00201 } 00202 const std::string &getReferenceName() const 00203 { 00204 return _referenceFilename; 00205 } 00206 00207 /// tell us if we are a color space reference or not 00208 /// \return true if colorspace, false otherwise 00209 bool isColorSpace() const 00210 { 00211 return _colorSpace; 00212 } 00213 00214 /// return the number of bases represented in this reference 00215 /// \return count of bases 00216 genomeIndex_t getNumberBases() const 00217 { 00218 return getElementCount(); 00219 } 00220 00221 /// given a whole genome index, get the chromosome it is located in 00222 /// 00223 /// This is done via a binary search of the chromosome table in the 00224 /// header of the mapped file, so it is O(log(N)) 00225 /// 00226 /// \param 0-based position the base in the genome 00227 /// \return 0-based index into chromosome table - INVALID_CHROMOSOME_INDEX if error 00228 int getChromosome(genomeIndex_t position) const; 00229 00230 /// given a chromosome name, return the chromosome index 00231 /// 00232 /// This is done via a linear search of the chromosome table in the 00233 /// header of the mapped file, so it is O(N) 00234 /// 00235 /// \param chromosomeName the name of the chromosome - exact match only 00236 /// \return 0-based index into chromosome table - INVALID_CHROMOSOME_INDEX if error 00237 int getChromosome(const char *chromosomeName) const; 00238 /// Return the number of chromosomes in the genome 00239 /// \return number of chromosomes in the genome 00240 int getChromosomeCount() const; 00241 00242 /// given a chromosome, return the genome base it starts in 00243 /// 00244 /// \param 0-based chromosome index 00245 /// \return 0-based genome index of the base that starts the chromosome 00246 genomeIndex_t getChromosomeStart(int chromosomeIndex) const 00247 { 00248 if (chromosomeIndex==INVALID_CHROMOSOME_INDEX) return INVALID_GENOME_INDEX; 00249 return header->_chromosomes[chromosomeIndex].start; 00250 } 00251 00252 /// given a chromosome, return its size in bases 00253 /// 00254 /// \param 0-based chromosome index 00255 /// \return size of the chromosome in bases 00256 genomeIndex_t getChromosomeSize(int chromosomeIndex) const 00257 { 00258 if (chromosomeIndex==INVALID_CHROMOSOME_INDEX) return 0; 00259 return header->_chromosomes[chromosomeIndex].size; 00260 } 00261 00262 /// given a chromosome name and position, return the genome position 00263 /// 00264 /// \param chromosomeName name of the chromosome - exact match only 00265 /// \param chromosomeIndex 1-based chromosome position 00266 /// \return genome index of the above chromosome position 00267 genomeIndex_t getGenomePosition( 00268 const char *chromosomeName, 00269 unsigned int chromosomeIndex) const; 00270 00271 /// given a chromosome index and position, return the genome position 00272 /// 00273 /// \param chromosome index of the chromosome 00274 /// \param chromosomeIndex 1-based chromosome position 00275 /// \return genome index of the above chromosome position 00276 genomeIndex_t getGenomePosition( 00277 int chromosome, 00278 unsigned int chromosomeIndex) const; 00279 00280 /// given the chromosome name, get the corresponding 0 based genome index 00281 /// for the start of that chromosome 00282 genomeIndex_t getGenomePosition(const char *chromosomeName) const; 00283 genomeIndex_t getGenomePosition(int chromosomeIndex) const; 00284 00285 const std::string &getBaseFilename() const 00286 { 00287 return _baseFilename; 00288 } 00289 00290 const char *getChromosomeName(int chromosomeIndex) const 00291 { 00292 return header->_chromosomes[chromosomeIndex].name; 00293 } 00294 00295 void setDebugFlag(bool d) 00296 { 00297 _debugFlag = d; 00298 } 00299 00300 genomeIndex_t sequenceLength() const 00301 { 00302 return (genomeIndex_t) header->elementCount; 00303 } 00304 00305 const char *chromosomeName(int chr) const 00306 { 00307 return header->_chromosomes[chr].name; 00308 } 00309 00310 void sanityCheck(MemoryMap &fasta) const; 00311 00312 // TODO - this will be moved somewhere else and be made a static method. 00313 std::string IntegerToSeq(unsigned int n, unsigned int wordsize) const; 00314 00315 bool wordMatch(unsigned int index, std::string &word) const; 00316 bool printNearbyWords(unsigned int index, unsigned int variance, std::string &word) const; 00317 00318 // TODO - this will be moved somewhere else and be made a static method. 00319 char BasePair(char c) const 00320 { 00321 return BaseAsciiMap::base2complement[(int) c]; 00322 } 00323 00324 void dumpSequenceSAMDictionary(std::ostream&) const; 00325 void dumpHeaderTSV(std::ostream&) const; 00326 00327 /// 00328 /// Return the bases in base space or color space for within range index, ot 00329 /// @param index the array-like index (0 based). 00330 /// @return ACTGN in base space; 0123N for color space; and 'N' for invalid. 00331 /// For color space, index i represents the transition of base at position (i-1) to base at position i 00332 /// 00333 /// NB: bounds checking here needs to be deprecated - do not assume it 00334 /// will exist - the call must clip reads so that this routine is never 00335 /// called with a index value larger than the genome. 00336 /// 00337 /// The reason for this is simply that this routine gets called hundreds 00338 /// of billions of time in one run of karma, which will absolutely kill 00339 /// performance. Every single instruction here matters a great, great deal. 00340 /// 00341 00342 // 00343 // 3.5% improvement for color space matching: 00344 // I guess the compiler isn't inlining 3 functions deep. 00345 // 00346 00347 #if 0 00348 // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 00349 // The following code does not work even in the base space, 00350 // since the memory layout has changed. USE IT WITH CAUTIOUS! 00351 // 00352 // This block of code is a functional duplicate of the following 00353 // code - leave this here for reference and possibly later 00354 // performance testing as well as compiler evaluation. 00355 inline char operator[](genomeIndex_t index) const 00356 { 00357 return BaseAsciiMap::int2base[(*((genomeSequenceArray*) this))[index]]; 00358 } 00359 #endif 00360 00361 inline char operator[](genomeIndex_t index) const 00362 { 00363 uint8_t val; 00364 if (index < getNumberBases()) 00365 { 00366 if ((index&1)==0) 00367 { 00368 val = ((uint8_t *) data)[index>>1] & 0xf; 00369 } 00370 else 00371 { 00372 val = (((uint8_t *) data)[index>>1] & 0xf0) >> 4; 00373 } 00374 } 00375 else 00376 { 00377 val = BaseAsciiMap::baseNIndex; 00378 } 00379 val = isColorSpace() ? BaseAsciiMap::int2colorSpace[val] : 00380 BaseAsciiMap::int2base[val]; 00381 return val; 00382 } 00383 00384 /// given a chromosome name and 1-based position, return the reference base. 00385 /// \param chromosomeName name of the chromosome - exact match only 00386 /// \param chromosomeIndex 1-based chromosome position 00387 /// \return reference base at the above chromosome position 00388 inline char getBase(const char *chromosomeName, 00389 unsigned int chromosomeIndex) const 00390 { 00391 genomeIndex_t index = 00392 getGenomePosition(chromosomeName, chromosomeIndex); 00393 if(index == INVALID_GENOME_INDEX) 00394 { 00395 // Invalid position, so return 'N' 00396 return('N'); 00397 } 00398 return((*this)[index]); 00399 } 00400 00401 00402 inline uint8_t getInteger(genomeIndex_t index) const 00403 { 00404 return (*((genomeSequenceArray*) this))[index]; 00405 } 00406 00407 inline void set(genomeIndex_t index, char value) 00408 { 00409 genomeSequenceArray::set(index, 00410 BaseAsciiMap::base2int[(uint8_t) value]); 00411 } 00412 00413 /// obtain the pointer to the raw data for other access methods 00414 /// 00415 /// this is a fairly ugly hack to reach into the 00416 /// raw genome vector, get the byte that encodes 00417 /// two bases, and return it. This is used by 00418 /// karma ReadIndexer::getSumQ to compare genome 00419 /// matchines by byte (two bases at a time) to speed 00420 /// it up. 00421 /// 00422 uint8_t *getDataPtr(genomeIndex_t index) 00423 { 00424 return ((uint8_t *) data + index/2); 00425 } 00426 private: 00427 /// 00428 /// when creating the genome mapped file, we call this to set 00429 /// the MD5 checksum of the chromosome sequence and length so that 00430 /// we can write the SAM SQ headers properly. 00431 /// NB: operates on the last fully loaded chromosome. 00432 bool setChromosomeMD5andLength(uint32_t whichChromosome); 00433 00434 public: 00435 00436 // TODO - this will be moved somewhere else and be made a static method. 00437 // replace read with the reversed one 00438 void getReverseRead(std::string &read); 00439 00440 // TODO - this will be moved somewhere else and be made a static method. 00441 void getReverseRead(String& read); 00442 00443 // debug the given read - print nice results 00444 int debugPrintReadValidation( 00445 std::string &read, 00446 std::string &quality, 00447 char direction, 00448 genomeIndex_t readLocation, 00449 int sumQuality, 00450 int mismatchCount, 00451 bool recurse = true 00452 ); 00453 00454 // 00455 // get the sequence from this GenomeSequence using the specified chromosome and 0-based position. 00456 // if baseCount < 0, get the reverse complement 00457 // that starts at index (but do not reverse the string?) 00458 void getString(std::string &str, int chromosome, uint32_t index, int baseCount) const; 00459 void getString(String &str, int chromosome, uint32_t index, int baseCount) const; 00460 // 00461 // get the sequence from this GenomeSequence. 00462 // if baseCount < 0, get the reverse complement 00463 // that starts at index (but do not reverse the string?) 00464 // 00465 void getString(std::string &str, genomeIndex_t index, int baseCount) const; 00466 void getString(String &str, genomeIndex_t index, int baseCount) const; 00467 00468 void getHighLightedString(std::string &str, genomeIndex_t index, int baseCount, genomeIndex_t highLightStart, genomeIndex_t highLightEnd) const; 00469 00470 void print30(genomeIndex_t) const; 00471 00472 // for debugging, not for speed: 00473 genomeIndex_t simpleLocalAligner(std::string &read, std::string &quality, genomeIndex_t index, int windowSize) const; 00474 00475 00476 // TODO - these methods do not handle a CIGAR string and do not handle '=' when a read matches the reference. 00477 // They are here for alignment and should be moved to the aligner (karma). 00478 // OR they should optionally take a CIGAR and use that if specified.... 00479 // maybe they should be helper methods that are found somewhere else 00480 00481 /// Return the mismatch count, disregarding CIGAR strings 00482 /// 00483 /// \param read is the sequence we're counting mismatches in 00484 /// \param location is where in the genmoe we start comparing 00485 /// \param exclude is a wildcard character (e.g. '.' or 'N') 00486 /// 00487 /// \return number of bases that don't match the reference, except those that match exclude 00488 int getMismatchCount(std::string &read, genomeIndex_t location, char exclude='\0') const 00489 { 00490 int mismatchCount = 0; 00491 for (uint32_t i=0; i<read.size(); i++) 00492 if (read[i]!=exclude) mismatchCount += read[i]!=(*this)[location + i]; 00493 return mismatchCount; 00494 }; 00495 00496 /// brute force sumQ - no sanity checking 00497 /// 00498 /// \param read shotgun sequencer read string 00499 /// \param qualities phred quality string of same length 00500 /// \param location the alignment location to check sumQ 00501 int getSumQ(std::string &read, std::string &qualities, genomeIndex_t location) const 00502 { 00503 int sumQ = 0; 00504 for (uint32_t i=0; i<read.size(); i++) 00505 sumQ += (read[i]!=(*this)[location + i] ? (qualities[i]-33) : 0); 00506 return sumQ; 00507 }; 00508 // return a string highlighting mismatch postions with '^' chars: 00509 void getMismatchHatString(std::string &result, const std::string &read, genomeIndex_t location) const; 00510 void getMismatchString(std::string &result, const std::string &read, genomeIndex_t location) const; 00511 00512 // END TODO 00513 00514 void getChromosomeAndIndex(std::string &, genomeIndex_t) const; 00515 void getChromosomeAndIndex(String &, genomeIndex_t) const; 00516 00517 /// check a SAM format read, using phred quality scores and 00518 /// the CIGAR string to determine if it is correct. 00519 /// 00520 /// 00521 /// \param read the read in base space 00522 /// \param qualities the phred encoded qualities (Sanger, not Illumina) 00523 /// \param cigar the SAM file CIGAR column 00524 /// \param sumQ if >0 on entry, is checked against the computed sumQ 00525 /// \param insertions count of insertions found in 00526 /// 00527 /// 00528 bool checkRead( 00529 std::string &read, 00530 std::string &qualities, 00531 std::string &cigar, 00532 int &sumQ, // input and output 00533 int &gapOpenCount, // output only 00534 int &gapExtendCount, // output only 00535 int &gapDeleteCount, // output only 00536 std::string &result 00537 ) const; 00538 00539 bool populateDBSNP(mmapArrayBool_t &dbSNP, IFILE inputFile) const; 00540 00541 /// user friendly dbSNP loader. 00542 /// 00543 /// \param inputFileName may be empty, point to a text file or a dbSNP vector file 00544 /// 00545 /// In all cases, dbSNP is returned the same length as this genome. 00546 /// 00547 /// When no SNPs are loaded, all values are false. 00548 /// 00549 /// When a text file is given, the file is parsed with two space 00550 /// separated columns - the first column is the chromosome name, and 00551 /// the second is the 1-based chromosome position of the SNP. 00552 /// 00553 /// \return false if a dbSNP file was correctly loaded, true otherwise 00554 /// 00555 bool loadDBSNP(mmapArrayBool_t &dbSNP, const char *inputFileName) const; 00556 }; 00557 00558 #endif