libStatGen Software
1
|
00001 /* 00002 * Copyright (C) 2010 Regents of the University of Michigan 00003 * 00004 * This program is free software: you can redistribute it and/or modify 00005 * it under the terms of the GNU General Public License as published by 00006 * the Free Software Foundation, either version 3 of the License, or 00007 * (at your option) any later version. 00008 * 00009 * This program is distributed in the hope that it will be useful, 00010 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 * GNU General Public License for more details. 00013 * 00014 * You should have received a copy of the GNU General Public License 00015 * along with this program. If not, see <http://www.gnu.org/licenses/>. 00016 */ 00017 00018 #ifndef __REFERENCESEQUENCE_H 00019 #define __REFERENCESEQUENCE_H 00020 00021 #include "BaseAsciiMap.h" 00022 #include "Generic.h" 00023 #include "PackedVector.h" 00024 00025 #include <algorithm> 00026 #include <iostream> 00027 00028 // 00029 // The namespace Sequence is for templated algorithms that are by and large 00030 // independent of the underlying storage mechanism for the bases. 00031 // 00032 // They are written in such a way as to assume that the array operator [] 00033 // will return an ASCII representation of a base space nucleotide. 00034 // 00035 // In theory, this set of templates will work with a variety of combinations 00036 // of means for representing bases - String, std::string, and others. 00037 // 00038 // The containers are expected to allow for an overidden [] operator, 00039 // and provide a size() method to return the number of bases in the 00040 // container. 00041 // 00042 // In containers where sequence data is placed, they must in addition 00043 // have a clear() method as well as a push_back() method as done 00044 // in std::string containers. 00045 // 00046 namespace Sequence { 00047 00048 // 00049 // wordMatch(Sequnece, Index, Word) - compare a word to a sequence of bases 00050 // Sequence is a generic container with a large set of bases 00051 // Index is the starting point to start the comparison 00052 // Word is the small sequence of bases to match 00053 // 00054 template<typename sequenceType, typename sequenceIndexType, typename wordType> 00055 bool wordMatch(sequenceType &sequence, sequenceIndexType index, wordType &word) 00056 { 00057 if( (index + word.size()) >= sequence.size() ) return false; 00058 00059 for(size_t i = 0; i < word.size(); i++) { 00060 if( sequence[index + i] != word[i]) return false; 00061 } 00062 return true; 00063 } 00064 00065 // 00066 // printNearbyWords(output, sequence, index, word, deviation) searches 00067 // for 'deviation' bases on either side of the index into sequence 00068 // and prints all occurrences where word appears. 00069 // 00070 template<typename sequenceType, typename sequenceIndexType, typename wordType> 00071 void printNearbyWords(std::ostream &output, sequenceType &sequence, sequenceIndexType index, wordType &word, int deviation) 00072 { 00073 for (sequenceIndexType i = index - deviation; i < index + deviation; i++) 00074 { 00075 if (wordMatch(sequence, i, word)) 00076 { 00077 output << "word '" 00078 << word 00079 << "' found " 00080 << i - index 00081 << " away from position " 00082 << index 00083 << "." 00084 << std::endl; 00085 } 00086 } 00087 } 00088 00089 // 00090 // getString(sequence, index, baseCount, word) - populate word with the 'baseCount' 00091 // bases that occur at the 'index' starting position in sequence. 00092 // 00093 template<typename sequenceType, typename sequenceIndexType, typename wordType> 00094 void getString(sequenceType &sequence, sequenceIndexType index, int baseCount, wordType &word) 00095 { 00096 word.clear(); 00097 for (sequenceIndexType i=0; i < (sequenceIndexType) baseCount; i++) 00098 { 00099 word.push_back(sequence[index + i]); 00100 } 00101 } 00102 00103 // 00104 // getHighLightedString() is a debugging aid for printing "highlighted" 00105 // subsets of bases, where the highlighting is done via turning the 00106 // base into a lower case ASCII equivalent. 00107 // 00108 template<typename sequenceType, typename sequenceIndexType, typename wordType> 00109 void getHighLightedString( 00110 sequenceType &sequence, 00111 sequenceIndexType index, 00112 int baseCount, 00113 wordType &word, 00114 sequenceIndexType highLightStart, 00115 sequenceIndexType highLightEnd) 00116 { 00117 word.clear(); 00118 for (sequenceIndexType i=0; i < (sequenceIndexType) baseCount; i++) 00119 { 00120 char base = sequence[index+i]; 00121 00122 if(in(index+i, highLightStart, highLightEnd)) 00123 base = tolower(base); 00124 00125 word.push_back(base); 00126 } 00127 } 00128 00129 // 00130 // printBaseContext() outputs a base at location 'index' along with 'baseCount' 00131 // bases on either side of that base (default 30). 00132 // 00133 template<typename sequenceType, typename sequenceIndexType> 00134 void printBaseContext(std::ostream &output, sequenceType &sequence, sequenceIndexType index, int baseCount = 30) 00135 { 00136 output << "index: " << index << std::endl; 00137 for (sequenceIndexType i=index-baseCount; i<=index+baseCount; i++) 00138 output << sequence[i]; 00139 output << std::endl; 00140 for (sequenceIndexType i=index-baseCount; i<index; i++) 00141 std::cout << " "; 00142 output << "^"; 00143 output << std::endl; 00144 } 00145 00146 template<typename sequenceType, typename sequenceIndexType> 00147 void getMismatchHatString(sequenceType &sequence, sequenceIndexType location, std::string &result, std::string &read) 00148 { 00149 result = ""; 00150 for (uint32_t i=0; i < read.size(); i++) 00151 { 00152 if (read[i] == sequence[location+i]) 00153 result.push_back(' '); 00154 else 00155 result.push_back('^'); 00156 } 00157 } 00158 00159 template<typename sequenceType, typename sequenceIndexType> 00160 void getMismatchString(sequenceType &sequence, sequenceIndexType location, std::string &result, std::string &read) 00161 { 00162 result = ""; 00163 for (uint32_t i=0; i < read.size(); i++) 00164 { 00165 if (read[i] == sequence[location+i]) 00166 result.push_back(toupper(read[i])); 00167 else 00168 result.push_back(tolower(read[i])); 00169 } 00170 } 00171 00172 /// Return the mismatch count, disregarding CIGAR strings 00173 /// 00174 /// \param read is the sequence we're counting mismatches in 00175 /// \param location is where in the genmoe we start comparing 00176 /// \param exclude is a wildcard character (e.g. '.' or 'N') 00177 /// 00178 /// \return number of bases that don't match the reference, except those that match exclude 00179 00180 template<typename sequenceType, typename sequenceIndexType, typename readType> 00181 int getMismatchCount(sequenceType &sequence, sequenceIndexType location, readType &read, char exclude='\0') 00182 { 00183 int mismatchCount = 0; 00184 for (uint32_t i=0; i<read.size(); i++) 00185 if (read[i]!=exclude) mismatchCount += read[i]!=sequence[location + i]; 00186 return mismatchCount; 00187 } 00188 00189 /// brute force sumQ - no sanity checking 00190 /// 00191 /// \param read shotgun sequencer read string 00192 /// \param qualities phred quality string of same length 00193 /// \param location the alignment location to check sumQ 00194 template<typename sequenceType, typename sequenceIndexType, typename readType, typename qualityType> 00195 int getSumQ(sequenceType &sequence, sequenceIndexType location, readType &read, qualityType &qualities) 00196 { 00197 int sumQ = 0; 00198 for (uint32_t i=0; i<read.size(); i++) 00199 sumQ += (read[i]!=sequence[location + i] ? (qualities[i]-33) : 0); 00200 return sumQ; 00201 }; 00202 00203 00204 00205 template<typename sequenceType, typename sequenceIndexType, typename readType, typename qualityType> 00206 sequenceIndexType simpleLocalAligner( 00207 sequenceType &sequence, 00208 sequenceIndexType index, 00209 readType &read, 00210 qualityType &quality, 00211 int windowSize) 00212 { 00213 int bestScore = 1000000; // either mismatch count or sumQ 00214 sequenceIndexType bestMatchLocation = -1; 00215 for (int i=-windowSize; i<windowSize; i++) 00216 { 00217 int newScore; 00218 00219 // 'in' is a template, and types of all three args have to match 00220 if( not in((size_t) index + i, (size_t) 0, sequence.size())) continue; 00221 00222 if (quality.size() == 0) 00223 { 00224 newScore = getMismatchCount(sequence, index + i, read); 00225 } 00226 else 00227 { 00228 newScore = getSumQ(sequence, index + i, read, quality); 00229 } 00230 if (newScore < bestScore) 00231 { 00232 bestScore = newScore; 00233 bestMatchLocation = index + i; 00234 } 00235 } 00236 return bestMatchLocation; 00237 } 00238 00239 00240 } 00241 00242 typedef uint32_t PackedVectorIndex_t; 00243 00244 // 00245 // this is actually a base space implementation, since 00246 // the load/set methods do indirection from the symbol 00247 // value through a symbol value to symbol lookup table. 00248 // 00249 class PackedSequenceData : public PackedVector4Bit_t 00250 { 00251 00252 public: 00253 inline char operator[](PackedVectorIndex_t baseIndex) 00254 { 00255 return BaseAsciiMap::int2base[ (static_cast<PackedVector4Bit_t>(*this))[baseIndex]]; 00256 } 00257 inline void set(PackedVectorIndex_t baseIndex, char value) 00258 { 00259 this->PackedVector4Bit_t::set(baseIndex, BaseAsciiMap::base2int[(uint32_t) value]); 00260 } 00261 inline void push_back(char value) 00262 { 00263 this->PackedVector4Bit_t::push_back(BaseAsciiMap::base2int[(uint32_t) value]); 00264 } 00265 }; 00266 00267 std::ostream &operator << (std::ostream &stream, PackedSequenceData &v) 00268 { 00269 for(size_t i=0; i<v.size(); i++) { 00270 stream << i << ": " << v[i] << std::endl; 00271 } 00272 return stream; 00273 } 00274 00275 00276 // 00277 // Load a fasta format file from filename into the buffer 00278 // provided by the caller. 00279 // While parsing the fasta file, record each chromosome name, 00280 // its start location, and its size. 00281 // 00282 // NB: the caller must implement the logic to determine how 00283 // large the sequence data is. There is no correct way to do 00284 // this, because we can't reliably estimate here how much sequence 00285 // data is contained in a compressed file. 00286 // 00287 // To safely pre-allocate space in sequenceData, use the reserve() method 00288 // before calling this function. 00289 // 00290 template<typename SequenceDataType, typename ChromosomeNameType> 00291 bool loadFastaFile(const char *filename, 00292 std::vector<SequenceDataType> &sequenceData, 00293 std::vector<ChromosomeNameType> &chromosomeNames) 00294 { 00295 InputFile inputStream(filename, "r", InputFile::DEFAULT); 00296 00297 if(!inputStream.isOpen()) { 00298 std::cerr << "Failed to open file " << filename << "\n"; 00299 return true; 00300 } 00301 00302 int whichChromosome = -1; 00303 00304 // 00305 // chromosomeNames is cheap to clear, so do it here. 00306 // 00307 // NB: I explicitly choose not to clear the sequence data 00308 // container, this allows the caller to pre-allocate based 00309 // on their knowledge of the size of the expected genome. 00310 // 00311 00312 chromosomeNames.clear(); 00313 00314 char ch; 00315 while((ch = inputStream.ifgetc()) != EOF) { 00316 switch (ch) 00317 { 00318 case '\n': 00319 case '\r': 00320 break; 00321 case '>': 00322 { 00323 std::string chromosomeName = ""; 00324 // 00325 // pull out the chromosome new name 00326 // 00327 while (!isspace((ch = inputStream.ifgetc())) && ch != EOF) 00328 { 00329 chromosomeName += ch; // slow, but who cares 00330 } 00331 // 00332 // eat the rest of the line 00333 // 00334 do { 00335 ch = inputStream.ifgetc(); 00336 } while(ch != EOF && ch != '\n' && ch != '\r'); 00337 // 00338 // save the Chromosome name and index into our 00339 // header so we can use them later. 00340 // 00341 chromosomeNames.push_back(chromosomeName); 00342 00343 whichChromosome++; 00344 00345 sequenceData.resize(whichChromosome+1); 00346 00347 break; 00348 } 00349 default: 00350 // we get here for sequence data. 00351 // 00352 // save the base value 00353 // Note: invalid characters come here as well, but we 00354 // let ::set deal with mapping them. 00355 00356 sequenceData[whichChromosome].push_back(toupper(ch)); 00357 #if 0 00358 if (isColorSpace()) 00359 { 00360 // 00361 // anything outside these values represents an invalid base 00362 // base codes: 0-> A, 1-> C, 2-> G, 3-> T 00363 // colorspace: 0-> blue, 1-> green, 2-> oragne, 3->red 00364 // 00365 const char fromBase2CS[] = 00366 { 00367 /* 0000 */ 0, // A->A 00368 /* 0001 */ 1, // A->C 00369 /* 0010 */ 2, // A->G 00370 /* 0011 */ 3, // A->T 00371 /* 0100 */ 1, // C->A 00372 /* 0101 */ 0, // C->C 00373 /* 0110 */ 3, // C->G 00374 /* 0111 */ 2, // C->T 00375 /* 1000 */ 2, // G->A 00376 /* 1001 */ 3, // G->C 00377 /* 1010 */ 0, // G->G 00378 /* 1011 */ 1, // G->T 00379 /* 1100 */ 3, // T->A 00380 /* 1101 */ 2, // T->C 00381 /* 1110 */ 1, // T->G 00382 /* 1111 */ 0, // T->T 00383 }; 00384 // 00385 // we are writing color space values on transitions, 00386 // so we don't write a colorspace value when we 00387 // get the first base value. 00388 // 00389 // On second and subsequent bases, write based on 00390 // the index table above 00391 // 00392 char thisBase = base2int[(int)(fasta[fastaIndex])]; 00393 if (lastBase>=0) 00394 { 00395 char color; 00396 if (lastBase>3 || thisBase>3) color=4; 00397 else color = fromBase2CS[(int)(lastBase<<2 | thisBase)]; 00398 // re-use the int to base, because ::set expects a base char (ATCG), not 00399 // a color code (0123). It should only matter on final output. 00400 set(header->elementCount++, int2base[(int) color]); 00401 } 00402 lastBase = thisBase; 00403 } 00404 else 00405 { 00406 set(header->elementCount++, toupper(fasta[fastaIndex])); 00407 } 00408 #endif 00409 break; 00410 } 00411 } 00412 return false; 00413 } 00414 00415 #endif