00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018 #ifndef _GENOME_SEQUENCE_H
00019 #define _GENOME_SEQUENCE_H
00020
00021 #include <assert.h>
00022 #include <fcntl.h>
00023 #include <stdio.h>
00024 #include <stddef.h>
00025 #include <sys/types.h>
00026 #include <sys/stat.h>
00027 #if defined(WIN32)
00028 #define MD5_DIGEST_LENGTH 16
00029 #else
00030 #include <openssl/md5.h>
00031 #endif
00032 #include <string>
00033 #include <vector>
00034 #include "MemoryMapArray.h"
00035
00036
00037 #include "StringArray.h"
00038
00039 #ifndef __STDC_LIMIT_MACROS
00040 #define __STDC_LIMIT_MACROS
00041 #endif
00042 #include <stdint.h>
00043
00044 typedef uint32_t genomeIndex_t;
00045 #define INVALID_GENOME_INDEX UINT32_MAX
00046
00047
00048 #define INVALID_CHROMOSOME_INDEX -1
00049
00050
00051
00052
00053
00054
00055
00056
00057
00058
00059
00060
00061 struct ChromosomeInfo
00062 {
00063 static const int MAX_GENOME_INFO_STRING=128;
00064
00065 void constructorClear()
00066 {
00067 memset(this,0, sizeof(*this));
00068 }
00069 void setChromosomeName(const char *n)
00070 {
00071 strncpy(name, n, sizeof(name)-1);
00072 }
00073 genomeIndex_t start;
00074 genomeIndex_t size;
00075 char md5[2*MD5_DIGEST_LENGTH + 1];
00076 char name[MAX_GENOME_INFO_STRING];
00077 char assemblyID[MAX_GENOME_INFO_STRING];
00078 char uri[MAX_GENOME_INFO_STRING];
00079 char species[MAX_GENOME_INFO_STRING];
00080
00081
00082 void setAssemblyID(const char *newID)
00083 {
00084 strncpy(assemblyID, newID, sizeof(assemblyID) - 1);
00085 }
00086 void setSpecies(const char *newSpecies)
00087 {
00088 strncpy(species, newSpecies, sizeof(newSpecies));
00089 }
00090 void setURI(const char *newURI)
00091 {
00092 strncpy(uri, newURI, sizeof(uri));
00093 }
00094 };
00095
00096 class genomeSequenceMmapHeader : public MemoryMapArrayHeader
00097 {
00098 friend class GenomeSequence;
00099 friend std::ostream &operator << (std::ostream &, genomeSequenceMmapHeader &);
00100 private:
00101 uint32_t _chromosomeCount;
00102 bool _colorSpace;
00103
00104 ChromosomeInfo _chromosomes[0];
00105
00106 public:
00107
00108
00109
00110
00111
00112 static size_t getHeaderSize(int chromosomeCount)
00113 {
00114 return sizeof(genomeSequenceMmapHeader) + sizeof(ChromosomeInfo[1]) * chromosomeCount;
00115 }
00116
00117
00118
00119
00120 };
00121
00122 std::ostream &operator << (std::ostream &stream, genomeSequenceMmapHeader &h);
00123
00124
00125
00126
00127
00128
00129
00130 inline uint8_t genomeSequenceAccess(void *base, genomeIndex_t index)
00131 {
00132 if ((index&1)==0)
00133 {
00134 return ((uint8_t *) base)[index>>1] & 0xf;
00135 }
00136 else
00137 {
00138 return (((uint8_t *) base)[index>>1] & 0xf0) >> 4;
00139 }
00140 };
00141 inline void genomeSequenceSet(void *base, genomeIndex_t index, uint8_t v)
00142 {
00143 if ((index&1)==0)
00144 {
00145 ((uint8_t *) base)[index>>1] = (((uint8_t *) base)[index>>1] & 0xf0) | v;
00146 }
00147 else
00148 {
00149 ((uint8_t *) base)[index>>1] = (((uint8_t *) base)[index>>1] & 0x0f) | v<<4;
00150 }
00151 }
00152
00153 inline size_t mmapGenomeSequenceElementCount2Bytes(genomeIndex_t i)
00154 {
00155 return sizeof(uint8_t) * i / 2;
00156 }
00157
00158 #define UMFA_COOKIE 0x1b7933a1 // unique cookie id
00159 #define UMFA_VERSION 20100401U // YYYYMMDD of last change to file layout
00160
00161 typedef MemoryMapArray<
00162 uint8_t,
00163 genomeIndex_t,
00164 UMFA_COOKIE,
00165 UMFA_VERSION,
00166 genomeSequenceAccess,
00167 genomeSequenceSet,
00168 mmapGenomeSequenceElementCount2Bytes,
00169 genomeSequenceMmapHeader
00170 > genomeSequenceArray;
00171
00172 class PackedRead
00173 {
00174 void set(int index, int val)
00175 {
00176 packedBases[index>>1] =
00177 (packedBases[index>>1]
00178 & ~(7<<((index&0x01)<<2)))
00179 | ((val&0x0f)<<((index&0x1)<<2));
00180 }
00181 public:
00182 std::vector<uint8_t> packedBases;
00183 uint32_t length;
00184 int size()
00185 {
00186 return length;
00187 }
00188 void clear()
00189 {
00190 packedBases.clear();
00191 length=0;
00192 }
00193 void set(const char *rhs, int padWithNCount = 0);
00194 uint8_t operator [](int index)
00195 {
00196 return (packedBases[index>>1] >> ((index&0x1)<<2)) & 0xf;
00197 }
00198 };
00199
00200
00201
00202
00203
00204
00205
00206
00207
00208
00209
00210
00211
00212
00213
00214
00215
00216
00217
00218
00219
00220
00221
00222
00223
00224
00225
00226
00227
00228
00229
00230
00231
00232
00233
00234
00235 class GenomeSequence : public genomeSequenceArray
00236 {
00237 public:
00238 static const int baseAIndex = 000;
00239 static const int baseTIndex = 001;
00240 static const int baseCIndex = 002;
00241 static const int baseGIndex = 003;
00242 static const int baseNIndex = 004;
00243 static const int baseXIndex = 005;
00244
00245
00246
00247
00248
00249 static unsigned char base2int[];
00250 static const char int2base[];
00251 static const char int2colorSpace[];
00252 static unsigned char base2complement[];
00253
00254 private:
00255 int _debugFlag;
00256
00257 std::ostream *_progressStream;
00258 bool _colorSpace;
00259
00260 std::string _baseFilename;
00261 std::string _referenceFilename;
00262 std::string _fastaFilename;
00263 std::string _umfaFilename;
00264 std::string _application;
00265
00266 MemoryMap _umfaFile;
00267
00268 void setup(const char *referenceFilename);
00269
00270 public:
00271
00272 GenomeSequence();
00273 void constructorClear();
00274
00275
00276
00277
00278 GenomeSequence(std::string &referenceFilename)
00279 {
00280 constructorClear();
00281 setup(referenceFilename.c_str());
00282 }
00283
00284
00285
00286
00287
00288 GenomeSequence(const char *referenceFilename)
00289 {
00290 constructorClear();
00291 setup(referenceFilename);
00292 }
00293
00294
00295 ~GenomeSequence();
00296
00297
00298
00299
00300
00301
00302 bool open(bool isColorSpace = false, int flags = O_RDONLY);
00303
00304
00305
00306
00307
00308
00309 bool open(const char *filename, int flags = O_RDONLY)
00310 {
00311 _umfaFilename = filename;
00312 return genomeSequenceArray::open(filename, flags);
00313 }
00314
00315
00316 private:
00317 bool _searchCommonFileSuffix;
00318 public:
00319 bool create();
00320
00321
00322 bool create(bool isColor) {setColorSpace(isColor); return create(); }
00323
00324
00325
00326
00327 void setProgressStream(std::ostream &progressStream) {_progressStream = &progressStream;}
00328 void setColorSpace(bool colorSpace) {_colorSpace = colorSpace; }
00329 void setSearchCommonFileSuffix(bool searchCommonFileSuffix) {_searchCommonFileSuffix = searchCommonFileSuffix;}
00330
00331 bool loadFastaData(const char *filename);
00332
00333
00334
00335
00336
00337
00338 bool setReferenceName(std::string referenceFilename);
00339
00340
00341
00342
00343 void setApplication(std::string application)
00344 {
00345 _application = application;
00346 }
00347 const std::string &getFastaName()
00348 {
00349 return _fastaFilename;
00350 }
00351 const std::string &getReferenceName()
00352 {
00353 return _referenceFilename;
00354 }
00355
00356
00357
00358 bool isColorSpace()
00359 {
00360 return _colorSpace;
00361 }
00362
00363
00364
00365 genomeIndex_t getNumberBases()
00366 {
00367 return getElementCount();
00368 }
00369
00370
00371
00372
00373
00374
00375
00376
00377 int getChromosome(genomeIndex_t position);
00378
00379
00380
00381
00382
00383
00384
00385 int getChromosome(const char *chromosomeName);
00386
00387
00388 int getChromosomeCount();
00389
00390
00391
00392
00393
00394 genomeIndex_t getChromosomeStart(int chromosomeIndex)
00395 {
00396 if (chromosomeIndex==INVALID_CHROMOSOME_INDEX) return INVALID_GENOME_INDEX;
00397 return header->_chromosomes[chromosomeIndex].start;
00398 }
00399
00400
00401
00402
00403
00404 genomeIndex_t getChromosomeSize(int chromosomeIndex)
00405 {
00406 if (chromosomeIndex==INVALID_CHROMOSOME_INDEX) return 0;
00407 return header->_chromosomes[chromosomeIndex].size;
00408 }
00409
00410
00411
00412
00413
00414
00415 genomeIndex_t getGenomePosition(
00416 const char *chromosomeName,
00417 unsigned int chromosomeIndex);
00418
00419
00420
00421
00422
00423
00424 genomeIndex_t getGenomePosition(
00425 int chromosome,
00426 unsigned int chromosomeIndex);
00427
00428
00429
00430 genomeIndex_t getGenomePosition(const char *chromosomeName);
00431 genomeIndex_t getGenomePosition(int chromosomeIndex);
00432
00433 const std::string &getBaseFilename()
00434 {
00435 return _baseFilename;
00436 }
00437 const char *getChromosomeName(int chromosomeIndex)
00438 {
00439 return header->_chromosomes[chromosomeIndex].name;
00440 }
00441
00442 void setDebugFlag(bool d)
00443 {
00444 _debugFlag = d;
00445 }
00446
00447 genomeIndex_t sequenceLength()
00448 {
00449 return (genomeIndex_t) header->elementCount;
00450 }
00451 const char *chromosomeName(int chr)
00452 {
00453 return header->_chromosomes[chr].name;
00454 }
00455
00456 void sanityCheck(MemoryMap &fasta);
00457 std::string IntegerToSeq(unsigned int n, unsigned int wordsize);
00458
00459 bool wordMatch(unsigned int index, std::string &word);
00460 bool printNearbyWords(unsigned int index, unsigned int variance, std::string &word);
00461 char BasePair(char c)
00462 {
00463 return base2complement[(int) c];
00464 }
00465 void dumpSequenceSAMDictionary(std::ostream&);
00466 void dumpHeaderTSV(std::ostream&);
00467 void loadHeaderFromTSV(std::istream&);
00468
00469
00470
00471
00472
00473
00474
00475
00476
00477
00478
00479
00480
00481
00482
00483
00484
00485
00486
00487
00488
00489 #if 0
00490
00491
00492
00493
00494
00495
00496
00497 inline char operator[](genomeIndex_t index)
00498 {
00499 return int2base[(*((genomeSequenceArray*) this))[index]];
00500 }
00501 #endif
00502
00503 inline char operator[](genomeIndex_t index)
00504 {
00505 uint8_t val;
00506 if (index < getNumberBases())
00507 {
00508 if ((index&1)==0)
00509 {
00510 val = ((uint8_t *) data)[index>>1] & 0xf;
00511 }
00512 else
00513 {
00514 val = (((uint8_t *) data)[index>>1] & 0xf0) >> 4;
00515 }
00516 }
00517 else
00518 {
00519 val = baseNIndex;
00520 }
00521 val = isColorSpace() ? int2colorSpace[val] : int2base[val];
00522 return val;
00523 }
00524
00525 inline uint8_t getInteger(genomeIndex_t index)
00526 {
00527 return (*((genomeSequenceArray*) this))[index];
00528 }
00529 inline void set(genomeIndex_t index, char value)
00530 {
00531 genomeSequenceArray::set(index, base2int[(uint8_t) value]);
00532 }
00533
00534
00535
00536
00537
00538
00539
00540
00541
00542 uint8_t *getDataPtr(genomeIndex_t index)
00543 {
00544 return ((uint8_t *) data + index/2);
00545 }
00546 private:
00547
00548
00549
00550
00551
00552 bool setChromosomeMD5andLength(uint32_t whichChromosome);
00553 public:
00554
00555 void getReverseRead(std::string &read);
00556 void getReverseRead(String& read);
00557
00558 int debugPrintReadValidation(
00559 std::string &read,
00560 std::string &quality,
00561 char direction,
00562 genomeIndex_t readLocation,
00563 int sumQuality,
00564 int mismatchCount,
00565 bool recurse = true
00566 );
00567
00568 void getString(std::string &str, int chromosome, genomeIndex_t index, int baseCount);
00569 void getString(String &str, int chromosome, genomeIndex_t index, int baseCount);
00570
00571
00572
00573
00574
00575 void getString(std::string &str, genomeIndex_t index, int baseCount);
00576 void getString(String &str, genomeIndex_t index, int baseCount);
00577
00578 void getHighLightedString(std::string &str, genomeIndex_t index, int baseCount, genomeIndex_t highLightStart, genomeIndex_t highLightEnd);
00579
00580 void print30(genomeIndex_t);
00581
00582
00583 genomeIndex_t simpleLocalAligner(std::string &read, std::string &quality, genomeIndex_t index, int windowSize);
00584
00585
00586
00587
00588
00589
00590
00591
00592 int getMismatchCount(std::string &read, genomeIndex_t location, char exclude='\0')
00593 {
00594 int mismatchCount = 0;
00595 for (uint32_t i=0; i<read.size(); i++)
00596 if (read[i]!=exclude) mismatchCount += read[i]!=(*this)[location + i];
00597 return mismatchCount;
00598 };
00599
00600
00601
00602
00603
00604
00605 int getSumQ(std::string &read, std::string &qualities, genomeIndex_t location)
00606 {
00607 int sumQ = 0;
00608 for (uint32_t i=0; i<read.size(); i++)
00609 sumQ += (read[i]!=(*this)[location + i] ? (qualities[i]-33) : 0);
00610 return sumQ;
00611 };
00612
00613 void getMismatchHatString(std::string &result, const std::string &read, genomeIndex_t location);
00614 void getMismatchString(std::string &result, const std::string &read, genomeIndex_t location);
00615
00616 void getChromosomeAndIndex(std::string &, genomeIndex_t);
00617 void getChromosomeAndIndex(String &, genomeIndex_t);
00618
00619
00620
00621
00622
00623
00624
00625
00626
00627
00628
00629
00630 bool checkRead(
00631 std::string &read,
00632 std::string &qualities,
00633 std::string &cigar,
00634 int &sumQ,
00635 int &gapOpenCount,
00636 int &gapExtendCount,
00637 int &gapDeleteCount,
00638 std::string &result
00639 );
00640
00641 bool populateDBSNP(mmapArrayBool_t &dbSNP, std::ifstream &inputFile);
00642
00643
00644
00645
00646
00647
00648
00649
00650
00651
00652
00653
00654
00655
00656
00657 bool loadDBSNP(mmapArrayBool_t &dbSNP, const char *inputFileName);
00658 };
00659
00660 #endif