00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018 #ifndef _GENOME_SEQUENCE_H
00019 #define _GENOME_SEQUENCE_H
00020
00021 #include <assert.h>
00022 #include <fcntl.h>
00023 #include <stdio.h>
00024 #include <stddef.h>
00025 #include <sys/types.h>
00026 #include <sys/stat.h>
00027 #if defined(WIN32)
00028 #define MD5_DIGEST_LENGTH 16
00029 #else
00030 #include <openssl/md5.h>
00031 #endif
00032 #include <string>
00033 #include <vector>
00034 #include "MemoryMapArray.h"
00035
00036
00037 #include "StringArray.h"
00038
00039 #include <stdint.h>
00040
00041
00042
00043 #ifndef UINT32_MAX
00044 #define UINT32_MAX 0xFFFFFFFF
00045 #endif
00046
00047 typedef uint32_t genomeIndex_t;
00048 #define INVALID_GENOME_INDEX UINT32_MAX
00049
00050
00051 #define INVALID_CHROMOSOME_INDEX -1
00052
00053
00054
00055
00056
00057
00058
00059
00060
00061
00062
00063
00064 struct ChromosomeInfo
00065 {
00066 static const int MAX_GENOME_INFO_STRING=128;
00067
00068 void constructorClear()
00069 {
00070 memset(this,0, sizeof(*this));
00071 }
00072 void setChromosomeName(const char *n)
00073 {
00074 strncpy(name, n, sizeof(name)-1);
00075 }
00076 genomeIndex_t start;
00077 genomeIndex_t size;
00078 char md5[2*MD5_DIGEST_LENGTH + 1];
00079 char name[MAX_GENOME_INFO_STRING];
00080 char assemblyID[MAX_GENOME_INFO_STRING];
00081 char uri[MAX_GENOME_INFO_STRING];
00082 char species[MAX_GENOME_INFO_STRING];
00083
00084
00085 void setAssemblyID(const char *newID)
00086 {
00087 strncpy(assemblyID, newID, sizeof(assemblyID) - 1);
00088 }
00089 void setSpecies(const char *newSpecies)
00090 {
00091 strncpy(species, newSpecies, sizeof(newSpecies));
00092 }
00093 void setURI(const char *newURI)
00094 {
00095 strncpy(uri, newURI, sizeof(uri));
00096 }
00097 };
00098
00099 class genomeSequenceMmapHeader : public MemoryMapArrayHeader
00100 {
00101 friend class GenomeSequence;
00102 friend std::ostream &operator << (std::ostream &, genomeSequenceMmapHeader &);
00103 private:
00104 uint32_t _chromosomeCount;
00105 bool _colorSpace;
00106
00107 ChromosomeInfo _chromosomes[0];
00108
00109 public:
00110
00111
00112
00113
00114
00115 static size_t getHeaderSize(int chromosomeCount)
00116 {
00117 return sizeof(genomeSequenceMmapHeader) + sizeof(ChromosomeInfo[1]) * chromosomeCount;
00118 }
00119
00120
00121
00122
00123 };
00124
00125 std::ostream &operator << (std::ostream &stream, genomeSequenceMmapHeader &h);
00126
00127
00128
00129
00130
00131
00132
00133 inline uint8_t genomeSequenceAccess(void *base, genomeIndex_t index)
00134 {
00135 if ((index&1)==0)
00136 {
00137 return ((uint8_t *) base)[index>>1] & 0xf;
00138 }
00139 else
00140 {
00141 return (((uint8_t *) base)[index>>1] & 0xf0) >> 4;
00142 }
00143 };
00144 inline void genomeSequenceSet(void *base, genomeIndex_t index, uint8_t v)
00145 {
00146 if ((index&1)==0)
00147 {
00148 ((uint8_t *) base)[index>>1] = (((uint8_t *) base)[index>>1] & 0xf0) | v;
00149 }
00150 else
00151 {
00152 ((uint8_t *) base)[index>>1] = (((uint8_t *) base)[index>>1] & 0x0f) | v<<4;
00153 }
00154 }
00155
00156 inline size_t mmapGenomeSequenceElementCount2Bytes(genomeIndex_t i)
00157 {
00158 return sizeof(uint8_t) * i / 2;
00159 }
00160
00161 #define UMFA_COOKIE 0x1b7933a1 // unique cookie id
00162 #define UMFA_VERSION 20100401U // YYYYMMDD of last change to file layout
00163
00164 typedef MemoryMapArray<
00165 uint8_t,
00166 genomeIndex_t,
00167 UMFA_COOKIE,
00168 UMFA_VERSION,
00169 genomeSequenceAccess,
00170 genomeSequenceSet,
00171 mmapGenomeSequenceElementCount2Bytes,
00172 genomeSequenceMmapHeader
00173 > genomeSequenceArray;
00174
00175 class PackedRead
00176 {
00177 void set(int index, int val)
00178 {
00179 packedBases[index>>1] =
00180 (packedBases[index>>1]
00181 & ~(7<<((index&0x01)<<2)))
00182 | ((val&0x0f)<<((index&0x1)<<2));
00183 }
00184 public:
00185 std::vector<uint8_t> packedBases;
00186 uint32_t length;
00187 int size()
00188 {
00189 return length;
00190 }
00191 void clear()
00192 {
00193 packedBases.clear();
00194 length=0;
00195 }
00196 void set(const char *rhs, int padWithNCount = 0);
00197 uint8_t operator [](int index)
00198 {
00199 return (packedBases[index>>1] >> ((index&0x1)<<2)) & 0xf;
00200 }
00201 };
00202
00203
00204
00205
00206
00207
00208
00209
00210
00211
00212
00213
00214
00215
00216
00217
00218
00219
00220
00221
00222
00223
00224
00225
00226
00227
00228
00229
00230
00231
00232
00233
00234
00235
00236
00237
00238 class GenomeSequence : public genomeSequenceArray
00239 {
00240 public:
00241
00242 static const int baseAIndex = 000;
00243 static const int baseTIndex = 001;
00244 static const int baseCIndex = 002;
00245 static const int baseGIndex = 003;
00246 static const int baseNIndex = 004;
00247 static const int baseXIndex = 005;
00248
00249
00250
00251
00252
00253 static unsigned char base2int[];
00254
00255
00256 static const char int2base[];
00257 static const char int2colorSpace[];
00258 static unsigned char base2complement[];
00259
00260 private:
00261 int _debugFlag;
00262
00263 std::ostream *_progressStream;
00264 bool _colorSpace;
00265
00266 std::string _baseFilename;
00267 std::string _referenceFilename;
00268 std::string _fastaFilename;
00269 std::string _umfaFilename;
00270 std::string _application;
00271
00272 MemoryMap _umfaFile;
00273
00274 void setup(const char *referenceFilename);
00275
00276 public:
00277
00278 GenomeSequence();
00279 void constructorClear();
00280
00281
00282
00283
00284 GenomeSequence(std::string &referenceFilename)
00285 {
00286 constructorClear();
00287 setup(referenceFilename.c_str());
00288 }
00289
00290
00291
00292
00293
00294 GenomeSequence(const char *referenceFilename)
00295 {
00296 constructorClear();
00297 setup(referenceFilename);
00298 }
00299
00300
00301 ~GenomeSequence();
00302
00303
00304
00305
00306
00307
00308 bool open(bool isColorSpace = false, int flags = O_RDONLY);
00309
00310
00311
00312
00313
00314
00315 bool open(const char *filename, int flags = O_RDONLY)
00316 {
00317 _umfaFilename = filename;
00318 return genomeSequenceArray::open(filename, flags);
00319 }
00320
00321
00322 private:
00323 bool _searchCommonFileSuffix;
00324 public:
00325 bool create();
00326
00327
00328 bool create(bool isColor) {setColorSpace(isColor); return create(); }
00329
00330
00331
00332
00333 void setProgressStream(std::ostream &progressStream) {_progressStream = &progressStream;}
00334 void setColorSpace(bool colorSpace) {_colorSpace = colorSpace; }
00335 void setSearchCommonFileSuffix(bool searchCommonFileSuffix) {_searchCommonFileSuffix = searchCommonFileSuffix;}
00336
00337 bool loadFastaData(const char *filename);
00338
00339
00340
00341
00342
00343
00344 bool setReferenceName(std::string referenceFilename);
00345
00346
00347
00348
00349 void setApplication(std::string application)
00350 {
00351 _application = application;
00352 }
00353 const std::string &getFastaName() const
00354 {
00355 return _fastaFilename;
00356 }
00357 const std::string &getReferenceName() const
00358 {
00359 return _referenceFilename;
00360 }
00361
00362
00363
00364 bool isColorSpace() const
00365 {
00366 return _colorSpace;
00367 }
00368
00369
00370
00371 genomeIndex_t getNumberBases() const
00372 {
00373 return getElementCount();
00374 }
00375
00376
00377
00378
00379
00380
00381
00382
00383 int getChromosome(genomeIndex_t position) const;
00384
00385
00386
00387
00388
00389
00390
00391
00392 int getChromosome(const char *chromosomeName) const;
00393
00394
00395 int getChromosomeCount() const;
00396
00397
00398
00399
00400
00401 genomeIndex_t getChromosomeStart(int chromosomeIndex) const
00402 {
00403 if (chromosomeIndex==INVALID_CHROMOSOME_INDEX) return INVALID_GENOME_INDEX;
00404 return header->_chromosomes[chromosomeIndex].start;
00405 }
00406
00407
00408
00409
00410
00411 genomeIndex_t getChromosomeSize(int chromosomeIndex) const
00412 {
00413 if (chromosomeIndex==INVALID_CHROMOSOME_INDEX) return 0;
00414 return header->_chromosomes[chromosomeIndex].size;
00415 }
00416
00417
00418
00419
00420
00421
00422 genomeIndex_t getGenomePosition(
00423 const char *chromosomeName,
00424 unsigned int chromosomeIndex) const;
00425
00426
00427
00428
00429
00430
00431 genomeIndex_t getGenomePosition(
00432 int chromosome,
00433 unsigned int chromosomeIndex) const;
00434
00435
00436
00437 genomeIndex_t getGenomePosition(const char *chromosomeName) const;
00438 genomeIndex_t getGenomePosition(int chromosomeIndex) const;
00439
00440 const std::string &getBaseFilename() const
00441 {
00442 return _baseFilename;
00443 }
00444
00445 const char *getChromosomeName(int chromosomeIndex) const
00446 {
00447 return header->_chromosomes[chromosomeIndex].name;
00448 }
00449
00450 void setDebugFlag(bool d)
00451 {
00452 _debugFlag = d;
00453 }
00454
00455 genomeIndex_t sequenceLength() const
00456 {
00457 return (genomeIndex_t) header->elementCount;
00458 }
00459
00460 const char *chromosomeName(int chr) const
00461 {
00462 return header->_chromosomes[chr].name;
00463 }
00464
00465 void sanityCheck(MemoryMap &fasta) const;
00466
00467
00468 std::string IntegerToSeq(unsigned int n, unsigned int wordsize) const;
00469
00470 bool wordMatch(unsigned int index, std::string &word) const;
00471 bool printNearbyWords(unsigned int index, unsigned int variance, std::string &word) const;
00472
00473
00474 char BasePair(char c) const
00475 {
00476 return base2complement[(int) c];
00477 }
00478
00479 void dumpSequenceSAMDictionary(std::ostream&) const;
00480 void dumpHeaderTSV(std::ostream&) const;
00481
00482
00483
00484
00485
00486
00487
00488
00489
00490
00491
00492
00493
00494
00495
00496
00497
00498
00499
00500
00501
00502 #if 0
00503
00504
00505
00506
00507
00508
00509
00510 inline char operator[](genomeIndex_t index) const
00511 {
00512 return int2base[(*((genomeSequenceArray*) this))[index]];
00513 }
00514 #endif
00515
00516 inline char operator[](genomeIndex_t index) const
00517 {
00518 uint8_t val;
00519 if (index < getNumberBases())
00520 {
00521 if ((index&1)==0)
00522 {
00523 val = ((uint8_t *) data)[index>>1] & 0xf;
00524 }
00525 else
00526 {
00527 val = (((uint8_t *) data)[index>>1] & 0xf0) >> 4;
00528 }
00529 }
00530 else
00531 {
00532 val = baseNIndex;
00533 }
00534 val = isColorSpace() ? int2colorSpace[val] : int2base[val];
00535 return val;
00536 }
00537
00538 inline uint8_t getInteger(genomeIndex_t index) const
00539 {
00540 return (*((genomeSequenceArray*) this))[index];
00541 }
00542
00543 inline void set(genomeIndex_t index, char value)
00544 {
00545 genomeSequenceArray::set(index, base2int[(uint8_t) value]);
00546 }
00547
00548
00549
00550
00551
00552
00553
00554
00555
00556
00557 uint8_t *getDataPtr(genomeIndex_t index)
00558 {
00559 return ((uint8_t *) data + index/2);
00560 }
00561 private:
00562
00563
00564
00565
00566
00567 bool setChromosomeMD5andLength(uint32_t whichChromosome);
00568
00569 public:
00570
00571
00572
00573 void getReverseRead(std::string &read);
00574
00575
00576 void getReverseRead(String& read);
00577
00578
00579 int debugPrintReadValidation(
00580 std::string &read,
00581 std::string &quality,
00582 char direction,
00583 genomeIndex_t readLocation,
00584 int sumQuality,
00585 int mismatchCount,
00586 bool recurse = true
00587 );
00588
00589 void getString(std::string &str, int chromosome, genomeIndex_t index, int baseCount) const;
00590 void getString(String &str, int chromosome, genomeIndex_t index, int baseCount) const;
00591
00592
00593
00594
00595
00596 void getString(std::string &str, genomeIndex_t index, int baseCount) const;
00597 void getString(String &str, genomeIndex_t index, int baseCount) const;
00598
00599 void getHighLightedString(std::string &str, genomeIndex_t index, int baseCount, genomeIndex_t highLightStart, genomeIndex_t highLightEnd) const;
00600
00601 void print30(genomeIndex_t) const;
00602
00603
00604 genomeIndex_t simpleLocalAligner(std::string &read, std::string &quality, genomeIndex_t index, int windowSize) const;
00605
00606
00607
00608
00609
00610
00611
00612
00613
00614
00615
00616
00617
00618
00619 int getMismatchCount(std::string &read, genomeIndex_t location, char exclude='\0') const
00620 {
00621 int mismatchCount = 0;
00622 for (uint32_t i=0; i<read.size(); i++)
00623 if (read[i]!=exclude) mismatchCount += read[i]!=(*this)[location + i];
00624 return mismatchCount;
00625 };
00626
00627
00628
00629
00630
00631
00632 int getSumQ(std::string &read, std::string &qualities, genomeIndex_t location) const
00633 {
00634 int sumQ = 0;
00635 for (uint32_t i=0; i<read.size(); i++)
00636 sumQ += (read[i]!=(*this)[location + i] ? (qualities[i]-33) : 0);
00637 return sumQ;
00638 };
00639
00640 void getMismatchHatString(std::string &result, const std::string &read, genomeIndex_t location) const;
00641 void getMismatchString(std::string &result, const std::string &read, genomeIndex_t location) const;
00642
00643
00644
00645 void getChromosomeAndIndex(std::string &, genomeIndex_t) const;
00646 void getChromosomeAndIndex(String &, genomeIndex_t) const;
00647
00648
00649
00650
00651
00652
00653
00654
00655
00656
00657
00658
00659 bool checkRead(
00660 std::string &read,
00661 std::string &qualities,
00662 std::string &cigar,
00663 int &sumQ,
00664 int &gapOpenCount,
00665 int &gapExtendCount,
00666 int &gapDeleteCount,
00667 std::string &result
00668 ) const;
00669
00670 bool populateDBSNP(mmapArrayBool_t &dbSNP, std::ifstream &inputFile) const;
00671
00672
00673
00674
00675
00676
00677
00678
00679
00680
00681
00682
00683
00684
00685
00686 bool loadDBSNP(mmapArrayBool_t &dbSNP, const char *inputFileName) const;
00687 };
00688
00689 #endif