libStatGen Software  1
ReferenceSequence.h
00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #ifndef __REFERENCESEQUENCE_H
00019 #define __REFERENCESEQUENCE_H
00020 
00021 #include "BaseAsciiMap.h"
00022 #include "Generic.h"
00023 #include "PackedVector.h"
00024 
00025 #include <algorithm>
00026 #include <iostream>
00027 
00028 //
00029 // The namespace Sequence is for templated algorithms that are by and large
00030 // independent of the underlying storage mechanism for the bases.
00031 //
00032 // They are written in such a way as to assume that the array operator []
00033 // will return an ASCII representation of a base space nucleotide.
00034 //
00035 // In theory, this set of templates will work with a variety of combinations
00036 // of means for representing bases - String, std::string, and others.
00037 //
00038 // The containers are expected to allow for an overidden [] operator,
00039 // and provide a size() method to return the number of bases in the
00040 // container.
00041 //
00042 // In containers where sequence data is placed, they must in addition
00043 // have a clear() method as well as a push_back() method as done
00044 // in std::string containers.
00045 //
00046 namespace Sequence {
00047 
00048 //
00049 // wordMatch(Sequnece, Index, Word) - compare a word to a sequence of bases
00050 //  Sequence is a generic container with a large set of bases
00051 //  Index is the starting point to start the comparison
00052 //  Word is the small sequence of bases to match
00053 //
00054 template<typename sequenceType, typename sequenceIndexType, typename wordType>
00055 bool wordMatch(sequenceType &sequence,  sequenceIndexType index, wordType &word) 
00056 {
00057     if( (index + word.size()) >= sequence.size() ) return false;
00058 
00059     for(size_t i = 0; i < word.size(); i++) {
00060         if( sequence[index + i] != word[i]) return false;
00061     }
00062     return true;
00063 }
00064 
00065 //
00066 // printNearbyWords(output, sequence, index, word, deviation) searches
00067 // for 'deviation' bases on either side of the index into sequence
00068 // and prints all occurrences where word appears.
00069 //
00070 template<typename sequenceType, typename sequenceIndexType, typename wordType>
00071 void printNearbyWords(std::ostream &output, sequenceType &sequence,  sequenceIndexType index, wordType &word, int deviation)
00072 {
00073     for (sequenceIndexType i = index - deviation; i < index + deviation; i++)
00074     {
00075         if (wordMatch(sequence, i, word))
00076         {
00077             output << "word '"
00078             << word
00079             << "' found "
00080             << i - index
00081             << " away from position "
00082             << index
00083             << "."
00084             << std::endl;
00085         }
00086     }
00087 }
00088 
00089 //
00090 // getString(sequence, index, baseCount, word) - populate word with the 'baseCount'
00091 // bases that occur at the 'index' starting position in sequence.
00092 //
00093 template<typename sequenceType, typename sequenceIndexType, typename wordType>
00094 void getString(sequenceType &sequence, sequenceIndexType index, int baseCount, wordType &word)
00095 {
00096     word.clear();
00097     for (sequenceIndexType i=0; i < (sequenceIndexType) baseCount; i++)
00098     {
00099         word.push_back(sequence[index + i]);
00100     }
00101 }
00102 
00103 //
00104 // getHighLightedString() is a debugging aid for printing "highlighted"
00105 // subsets of bases, where the highlighting is done via turning the
00106 // base into a lower case ASCII equivalent.
00107 //
00108 template<typename sequenceType, typename sequenceIndexType, typename wordType>
00109 void getHighLightedString(
00110         sequenceType &sequence,
00111         sequenceIndexType index,
00112         int baseCount,
00113         wordType &word,
00114         sequenceIndexType highLightStart,
00115         sequenceIndexType highLightEnd)
00116 {
00117     word.clear();
00118     for (sequenceIndexType i=0; i < (sequenceIndexType) baseCount; i++)
00119     {
00120         char base = sequence[index+i];
00121 
00122         if(in(index+i, highLightStart, highLightEnd))
00123             base = tolower(base);
00124 
00125         word.push_back(base);
00126     }
00127 }
00128 
00129 //
00130 // printBaseContext() outputs a base at location 'index' along with 'baseCount'
00131 // bases on either side of that base (default 30).
00132 //
00133 template<typename sequenceType, typename sequenceIndexType>
00134 void printBaseContext(std::ostream &output, sequenceType &sequence, sequenceIndexType index, int baseCount = 30)
00135 {
00136     output << "index: " << index << std::endl;
00137     for (sequenceIndexType i=index-baseCount; i<=index+baseCount; i++)
00138         output << sequence[i];
00139     output << std::endl;
00140     for (sequenceIndexType i=index-baseCount; i<index; i++)
00141         std::cout << " ";
00142     output << "^";
00143     output << std::endl;
00144 }
00145 
00146 template<typename sequenceType, typename sequenceIndexType>
00147 void getMismatchHatString(sequenceType &sequence, sequenceIndexType location, std::string &result, std::string &read)
00148 {
00149     result = "";
00150     for (uint32_t i=0; i < read.size(); i++)
00151     {
00152         if (read[i] == sequence[location+i])
00153             result.push_back(' ');
00154         else
00155             result.push_back('^');
00156     }
00157 }
00158 
00159 template<typename sequenceType, typename sequenceIndexType>
00160 void getMismatchString(sequenceType &sequence, sequenceIndexType location, std::string &result, std::string &read)
00161 {
00162     result = "";
00163     for (uint32_t i=0; i < read.size(); i++)
00164     {
00165         if (read[i] == sequence[location+i])
00166             result.push_back(toupper(read[i]));
00167         else
00168             result.push_back(tolower(read[i]));
00169     }
00170 }
00171 
00172 /// Return the mismatch count, disregarding CIGAR strings
00173 ///
00174 /// \param read is the sequence we're counting mismatches in
00175 /// \param location is where in the genmoe we start comparing
00176 /// \param exclude is a wildcard character (e.g. '.' or 'N')
00177 ///
00178 /// \return number of bases that don't match the reference, except those that match exclude
00179 
00180 template<typename sequenceType, typename sequenceIndexType, typename readType>
00181 int getMismatchCount(sequenceType &sequence, sequenceIndexType location, readType &read, char exclude='\0')
00182 {
00183     int mismatchCount = 0;
00184     for (uint32_t i=0; i<read.size(); i++)
00185         if (read[i]!=exclude) mismatchCount += read[i]!=sequence[location + i];
00186     return mismatchCount;
00187 }
00188 
00189 /// brute force sumQ - no sanity checking
00190 ///
00191 /// \param read shotgun sequencer read string
00192 /// \param qualities phred quality string of same length
00193 /// \param location the alignment location to check sumQ
00194 template<typename sequenceType, typename sequenceIndexType, typename readType, typename qualityType>
00195 int getSumQ(sequenceType &sequence, sequenceIndexType location, readType &read, qualityType &qualities)
00196 {
00197     int sumQ = 0;
00198     for (uint32_t i=0; i<read.size(); i++)
00199         sumQ += (read[i]!=sequence[location + i] ? (qualities[i]-33) : 0);
00200     return sumQ;
00201 };
00202 
00203 
00204 
00205 template<typename sequenceType, typename sequenceIndexType, typename readType, typename qualityType>
00206 sequenceIndexType simpleLocalAligner(
00207         sequenceType &sequence,
00208         sequenceIndexType index,
00209         readType &read,
00210         qualityType &quality,
00211         int windowSize)
00212 {
00213     int bestScore = 1000000; // either mismatch count or sumQ
00214     sequenceIndexType bestMatchLocation = -1;
00215     for (int i=-windowSize; i<windowSize; i++)
00216     {
00217         int newScore;
00218 
00219         // 'in' is a template, and types of all three args have to match
00220         if( not in((size_t) index + i, (size_t) 0, sequence.size())) continue;
00221 
00222         if (quality.size() == 0)
00223         {
00224             newScore = getMismatchCount(sequence, index + i, read);
00225         }
00226         else
00227         {
00228             newScore = getSumQ(sequence, index + i, read, quality);
00229         }
00230         if (newScore < bestScore)
00231         {
00232             bestScore = newScore;
00233             bestMatchLocation = index + i;
00234         }
00235     }
00236     return bestMatchLocation;
00237 }
00238 
00239 
00240 }
00241 
00242 typedef uint32_t PackedVectorIndex_t;
00243 
00244 //
00245 // this is actually a base space implementation, since
00246 // the load/set methods do indirection from the symbol
00247 // value through a symbol value to symbol lookup table.
00248 //
00249 class PackedSequenceData : public PackedVector4Bit_t
00250 {
00251 
00252     public:
00253     inline char operator[](PackedVectorIndex_t baseIndex)
00254     {
00255         return BaseAsciiMap::int2base[ (static_cast<PackedVector4Bit_t>(*this))[baseIndex]];
00256     }
00257     inline void set(PackedVectorIndex_t baseIndex, char value)
00258     {
00259         this->PackedVector4Bit_t::set(baseIndex, BaseAsciiMap::base2int[(uint32_t) value]);
00260     }
00261     inline void push_back(char value)
00262     {
00263         this->PackedVector4Bit_t::push_back(BaseAsciiMap::base2int[(uint32_t) value]);
00264     }
00265 };
00266 
00267 std::ostream &operator << (std::ostream &stream, PackedSequenceData &v)
00268 {
00269     for(size_t i=0; i<v.size(); i++) {
00270         stream << i << ": " << v[i] << std::endl;
00271     }
00272     return stream;
00273 }
00274 
00275 
00276 //
00277 // Load a fasta format file from filename into the buffer
00278 // provided by the caller.
00279 // While parsing the fasta file, record each chromosome name,
00280 // its start location, and its size.
00281 //
00282 // NB: the caller must implement the logic to determine how
00283 // large the sequence data is.  There is no correct way to do
00284 // this, because we can't reliably estimate here how much sequence
00285 // data is contained in a compressed file.
00286 //
00287 // To safely pre-allocate space in sequenceData, use the reserve() method
00288 // before calling this function.
00289 //
00290 template<typename SequenceDataType, typename ChromosomeNameType>
00291 bool loadFastaFile(const char *filename,
00292         std::vector<SequenceDataType> &sequenceData,
00293         std::vector<ChromosomeNameType> &chromosomeNames)
00294 {
00295     InputFile inputStream(filename, "r", InputFile::DEFAULT);
00296 
00297     if(!inputStream.isOpen()) {
00298         std::cerr << "Failed to open file " << filename << "\n";
00299         return true;
00300     }
00301 
00302     int whichChromosome = -1;
00303 
00304     //
00305     // chromosomeNames is cheap to clear, so do it here.
00306     //
00307     // NB: I explicitly choose not to clear the sequence data
00308     // container, this allows the caller to pre-allocate based
00309     // on their knowledge of the size of the expected genome.
00310     //
00311 
00312     chromosomeNames.clear();
00313 
00314     char ch;
00315     while((ch = inputStream.ifgetc()) != EOF) {
00316         switch (ch)
00317         {
00318             case '\n':
00319             case '\r':
00320                 break;
00321             case '>':
00322             {
00323                 std::string chromosomeName = "";
00324                 //
00325                 // pull out the chromosome new name
00326                 //
00327                 while (!isspace((ch = inputStream.ifgetc())) && ch != EOF)
00328                 {
00329                     chromosomeName += ch;  // slow, but who cares
00330                 }
00331                 //
00332                 // eat the rest of the line
00333                 //
00334                 do {
00335                     ch = inputStream.ifgetc();
00336                 } while(ch != EOF && ch != '\n' && ch != '\r');
00337                 //
00338                 // save the Chromosome name and index into our
00339                 // header so we can use them later.
00340                 //
00341                 chromosomeNames.push_back(chromosomeName);
00342 
00343                 whichChromosome++;
00344 
00345                 sequenceData.resize(whichChromosome+1);
00346 
00347                 break;
00348             }
00349             default:
00350                 // we get here for sequence data.
00351                 //
00352                 // save the base value
00353                 // Note: invalid characters come here as well, but we
00354                 // let ::set deal with mapping them.
00355 
00356                 sequenceData[whichChromosome].push_back(toupper(ch));
00357 #if 0
00358                 if (isColorSpace())
00359                 {
00360 //
00361 // anything outside these values represents an invalid base
00362 // base codes: 0-> A,    1-> C,     2-> G,      3-> T
00363 // colorspace: 0-> blue, 1-> green, 2-> oragne, 3->red
00364 //
00365                     const char fromBase2CS[] =
00366                     {
00367                         /* 0000 */ 0,   // A->A
00368                         /* 0001 */ 1,   // A->C
00369                         /* 0010 */ 2,   // A->G
00370                         /* 0011 */ 3,   // A->T
00371                         /* 0100 */ 1,   // C->A
00372                         /* 0101 */ 0,   // C->C
00373                         /* 0110 */ 3,   // C->G
00374                         /* 0111 */ 2,   // C->T
00375                         /* 1000 */ 2,   // G->A
00376                         /* 1001 */ 3,   // G->C
00377                         /* 1010 */ 0,   // G->G
00378                         /* 1011 */ 1,   // G->T
00379                         /* 1100 */ 3,   // T->A
00380                         /* 1101 */ 2,   // T->C
00381                         /* 1110 */ 1,   // T->G
00382                         /* 1111 */ 0,   // T->T
00383                     };
00384                     //
00385                     // we are writing color space values on transitions,
00386                     // so we don't write a colorspace value when we
00387                     // get the first base value.
00388                     //
00389                     // On second and subsequent bases, write based on
00390                     // the index table above
00391                     //
00392                     char thisBase = base2int[(int)(fasta[fastaIndex])];
00393                     if (lastBase>=0)
00394                     {
00395                         char color;
00396                         if (lastBase>3 || thisBase>3) color=4;
00397                         else color = fromBase2CS[(int)(lastBase<<2 | thisBase)];
00398                         // re-use the int to base, because ::set expects a base char (ATCG), not
00399                         // a color code (0123).  It should only matter on final output.
00400                         set(header->elementCount++, int2base[(int) color]);
00401                     }
00402                     lastBase = thisBase;
00403                 }
00404                 else
00405                 {
00406                     set(header->elementCount++, toupper(fasta[fastaIndex]));
00407                 }
00408 #endif
00409                 break;
00410         }
00411     }
00412     return false;
00413 }
00414 
00415 #endif
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends