00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018 #ifndef __REFERENCESEQUENCE_H
00019 #define __REFERENCESEQUENCE_H
00020
00021 #include "BaseAsciiMap.h"
00022 #include "Generic.h"
00023 #include "PackedVector.h"
00024
00025 #include <algorithm>
00026 #include <iostream>
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043
00044
00045
00046 namespace Sequence {
00047
00048
00049
00050
00051
00052
00053
00054 template<typename sequenceType, typename sequenceIndexType, typename wordType>
00055 bool wordMatch(sequenceType &sequence, sequenceIndexType index, wordType &word)
00056 {
00057 if( (index + word.size()) >= sequence.size() ) return false;
00058
00059 for(size_t i = 0; i < word.size(); i++) {
00060 if( sequence[index + i] != word[i]) return false;
00061 }
00062 return true;
00063 }
00064
00065
00066
00067
00068
00069
00070 template<typename sequenceType, typename sequenceIndexType, typename wordType>
00071 void printNearbyWords(std::ostream &output, sequenceType &sequence, sequenceIndexType index, wordType &word, int deviation)
00072 {
00073 for (sequenceIndexType i = index - deviation; i < index + deviation; i++)
00074 {
00075 if (wordMatch(sequence, i, word))
00076 {
00077 output << "word '"
00078 << word
00079 << "' found "
00080 << i - index
00081 << " away from position "
00082 << index
00083 << "."
00084 << std::endl;
00085 }
00086 }
00087 }
00088
00089
00090
00091
00092
00093 template<typename sequenceType, typename sequenceIndexType, typename wordType>
00094 void getString(sequenceType &sequence, sequenceIndexType index, int baseCount, wordType &word)
00095 {
00096 word.clear();
00097 for (sequenceIndexType i=0; i < (sequenceIndexType) baseCount; i++)
00098 {
00099 word.push_back(sequence[index + i]);
00100 }
00101 }
00102
00103
00104
00105
00106
00107
00108 template<typename sequenceType, typename sequenceIndexType, typename wordType>
00109 void getHighLightedString(
00110 sequenceType &sequence,
00111 sequenceIndexType index,
00112 int baseCount,
00113 wordType &word,
00114 sequenceIndexType highLightStart,
00115 sequenceIndexType highLightEnd)
00116 {
00117 word.clear();
00118 for (sequenceIndexType i=0; i < (sequenceIndexType) baseCount; i++)
00119 {
00120 char base = sequence[index+i];
00121
00122 if(in(index+i, highLightStart, highLightEnd))
00123 base = tolower(base);
00124
00125 word.push_back(base);
00126 }
00127 }
00128
00129
00130
00131
00132
00133 template<typename sequenceType, typename sequenceIndexType>
00134 void printBaseContext(std::ostream &output, sequenceType &sequence, sequenceIndexType index, int baseCount = 30)
00135 {
00136 output << "index: " << index << std::endl;
00137 for (sequenceIndexType i=index-baseCount; i<=index+baseCount; i++)
00138 output << sequence[i];
00139 output << std::endl;
00140 for (sequenceIndexType i=index-baseCount; i<index; i++)
00141 std::cout << " ";
00142 output << "^";
00143 output << std::endl;
00144 }
00145
00146 template<typename sequenceType, typename sequenceIndexType>
00147 void getMismatchHatString(sequenceType &sequence, sequenceIndexType location, std::string &result, std::string &read)
00148 {
00149 result = "";
00150 for (uint32_t i=0; i < read.size(); i++)
00151 {
00152 if (read[i] == sequence[location+i])
00153 result.push_back(' ');
00154 else
00155 result.push_back('^');
00156 }
00157 }
00158
00159 template<typename sequenceType, typename sequenceIndexType>
00160 void getMismatchString(sequenceType &sequence, sequenceIndexType location, std::string &result, std::string &read)
00161 {
00162 result = "";
00163 for (uint32_t i=0; i < read.size(); i++)
00164 {
00165 if (read[i] == sequence[location+i])
00166 result.push_back(toupper(read[i]));
00167 else
00168 result.push_back(tolower(read[i]));
00169 }
00170 }
00171
00172
00173
00174
00175
00176
00177
00178
00179
00180 template<typename sequenceType, typename sequenceIndexType, typename readType>
00181 int getMismatchCount(sequenceType &sequence, sequenceIndexType location, readType &read, char exclude='\0')
00182 {
00183 int mismatchCount = 0;
00184 for (uint32_t i=0; i<read.size(); i++)
00185 if (read[i]!=exclude) mismatchCount += read[i]!=sequence[location + i];
00186 return mismatchCount;
00187 }
00188
00189
00190
00191
00192
00193
00194 template<typename sequenceType, typename sequenceIndexType, typename readType, typename qualityType>
00195 int getSumQ(sequenceType &sequence, sequenceIndexType location, readType &read, qualityType &qualities)
00196 {
00197 int sumQ = 0;
00198 for (uint32_t i=0; i<read.size(); i++)
00199 sumQ += (read[i]!=sequence[location + i] ? (qualities[i]-33) : 0);
00200 return sumQ;
00201 };
00202
00203
00204
00205 template<typename sequenceType, typename sequenceIndexType, typename readType, typename qualityType>
00206 sequenceIndexType simpleLocalAligner(
00207 sequenceType &sequence,
00208 sequenceIndexType index,
00209 readType &read,
00210 qualityType &quality,
00211 int windowSize)
00212 {
00213 int bestScore = 1000000;
00214 sequenceIndexType bestMatchLocation = -1;
00215 for (int i=-windowSize; i<windowSize; i++)
00216 {
00217 int newScore;
00218
00219
00220 if( not in((size_t) index + i, (size_t) 0, sequence.size())) continue;
00221
00222 if (quality.size() == 0)
00223 {
00224 newScore = getMismatchCount(sequence, index + i, read);
00225 }
00226 else
00227 {
00228 newScore = getSumQ(sequence, index + i, read, quality);
00229 }
00230 if (newScore < bestScore)
00231 {
00232 bestScore = newScore;
00233 bestMatchLocation = index + i;
00234 }
00235 }
00236 return bestMatchLocation;
00237 }
00238
00239
00240 }
00241
00242 typedef uint32_t PackedVectorIndex_t;
00243
00244
00245
00246
00247
00248
00249 class PackedSequenceData : public PackedVector4Bit_t
00250 {
00251
00252 public:
00253 inline char operator[](PackedVectorIndex_t baseIndex)
00254 {
00255 return BaseAsciiMap::int2base[ (static_cast<PackedVector4Bit_t>(*this))[baseIndex]];
00256 }
00257 inline void set(PackedVectorIndex_t baseIndex, char value)
00258 {
00259 this->PackedVector4Bit_t::set(baseIndex, BaseAsciiMap::base2int[(uint32_t) value]);
00260 }
00261 inline void push_back(char value)
00262 {
00263 this->PackedVector4Bit_t::push_back(BaseAsciiMap::base2int[(uint32_t) value]);
00264 }
00265 };
00266
00267 std::ostream &operator << (std::ostream &stream, PackedSequenceData &v)
00268 {
00269 for(size_t i=0; i<v.size(); i++) {
00270 stream << i << ": " << v[i] << std::endl;
00271 }
00272 return stream;
00273 }
00274
00275
00276
00277
00278
00279
00280
00281
00282
00283
00284
00285
00286
00287
00288
00289
00290 template<typename SequenceDataType, typename ChromosomeNameType>
00291 bool loadFastaFile(const char *filename,
00292 std::vector<SequenceDataType> &sequenceData,
00293 std::vector<ChromosomeNameType> &chromosomeNames)
00294 {
00295 InputFile inputStream(filename, "r", InputFile::DEFAULT);
00296
00297 if(!inputStream.isOpen()) {
00298 std::cerr << "Failed to open file " << filename << "\n";
00299 return true;
00300 }
00301
00302 int whichChromosome = -1;
00303
00304
00305
00306
00307
00308
00309
00310
00311
00312 chromosomeNames.clear();
00313
00314 char ch;
00315 while((ch = inputStream.ifgetc()) != EOF) {
00316 switch (ch)
00317 {
00318 case '\n':
00319 case '\r':
00320 break;
00321 case '>':
00322 {
00323 std::string chromosomeName = "";
00324
00325
00326
00327 while (!isspace((ch = inputStream.ifgetc())) && ch != EOF)
00328 {
00329 chromosomeName += ch;
00330 }
00331
00332
00333
00334 do {
00335 ch = inputStream.ifgetc();
00336 } while(ch != EOF && ch != '\n' && ch != '\r');
00337
00338
00339
00340
00341 chromosomeNames.push_back(chromosomeName);
00342
00343 whichChromosome++;
00344
00345 sequenceData.resize(whichChromosome+1);
00346
00347 break;
00348 }
00349 default:
00350
00351
00352
00353
00354
00355
00356 sequenceData[whichChromosome].push_back(toupper(ch));
00357 #if 0
00358 if (isColorSpace())
00359 {
00360
00361
00362
00363
00364
00365 const char fromBase2CS[] =
00366 {
00367 0,
00368 1,
00369 2,
00370 3,
00371 1,
00372 0,
00373 3,
00374 2,
00375 2,
00376 3,
00377 0,
00378 1,
00379 3,
00380 2,
00381 1,
00382 0,
00383 };
00384
00385
00386
00387
00388
00389
00390
00391
00392 char thisBase = base2int[(int)(fasta[fastaIndex])];
00393 if (lastBase>=0)
00394 {
00395 char color;
00396 if (lastBase>3 || thisBase>3) color=4;
00397 else color = fromBase2CS[(int)(lastBase<<2 | thisBase)];
00398
00399
00400 set(header->elementCount++, int2base[(int) color]);
00401 }
00402 lastBase = thisBase;
00403 }
00404 else
00405 {
00406 set(header->elementCount++, toupper(fasta[fastaIndex]));
00407 }
00408 #endif
00409 break;
00410 }
00411 }
00412 return false;
00413 }
00414
00415 #endif