00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018 #ifndef _GENOME_SEQUENCE_H
00019 #define _GENOME_SEQUENCE_H
00020
00021 #include <assert.h>
00022 #include <fcntl.h>
00023 #include <stdio.h>
00024 #include <stddef.h>
00025 #include <sys/types.h>
00026 #include <sys/stat.h>
00027 #if defined(WIN32)
00028 #define MD5_DIGEST_LENGTH 16
00029 #else
00030 #include <openssl/md5.h>
00031 #endif
00032 #include <string>
00033 #include <vector>
00034 #include "MemoryMapArray.h"
00035
00036
00037 #include "StringArray.h"
00038
00039 #include <stdint.h>
00040
00041
00042
00043 #ifndef UINT32_MAX
00044 #define UINT32_MAX 0xFFFFFFFF
00045 #endif
00046
00047 typedef uint32_t genomeIndex_t;
00048 #define INVALID_GENOME_INDEX UINT32_MAX
00049
00050
00051 #define INVALID_CHROMOSOME_INDEX -1
00052
00053
00054
00055
00056
00057
00058
00059
00060
00061
00062
00063
00064 struct ChromosomeInfo
00065 {
00066 static const int MAX_GENOME_INFO_STRING=128;
00067
00068 void constructorClear()
00069 {
00070 memset(this,0, sizeof(*this));
00071 }
00072 void setChromosomeName(const char *n)
00073 {
00074 strncpy(name, n, sizeof(name)-1);
00075 }
00076 genomeIndex_t start;
00077 genomeIndex_t size;
00078 char md5[2*MD5_DIGEST_LENGTH + 1];
00079 char name[MAX_GENOME_INFO_STRING];
00080 char assemblyID[MAX_GENOME_INFO_STRING];
00081 char uri[MAX_GENOME_INFO_STRING];
00082 char species[MAX_GENOME_INFO_STRING];
00083
00084
00085 void setAssemblyID(const char *newID)
00086 {
00087 strncpy(assemblyID, newID, sizeof(assemblyID) - 1);
00088 }
00089 void setSpecies(const char *newSpecies)
00090 {
00091 strncpy(species, newSpecies, sizeof(newSpecies));
00092 }
00093 void setURI(const char *newURI)
00094 {
00095 strncpy(uri, newURI, sizeof(uri));
00096 }
00097 };
00098
00099 class genomeSequenceMmapHeader : public MemoryMapArrayHeader
00100 {
00101 friend class GenomeSequence;
00102 friend std::ostream &operator << (std::ostream &, genomeSequenceMmapHeader &);
00103 private:
00104 uint32_t _chromosomeCount;
00105 bool _colorSpace;
00106
00107 ChromosomeInfo _chromosomes[0];
00108
00109 public:
00110
00111
00112
00113
00114
00115 static size_t getHeaderSize(int chromosomeCount)
00116 {
00117 return sizeof(genomeSequenceMmapHeader) + sizeof(ChromosomeInfo[1]) * chromosomeCount;
00118 }
00119
00120
00121
00122
00123 };
00124
00125 std::ostream &operator << (std::ostream &stream, genomeSequenceMmapHeader &h);
00126
00127
00128
00129
00130
00131
00132
00133 inline uint8_t genomeSequenceAccess(void *base, genomeIndex_t index)
00134 {
00135 if ((index&1)==0)
00136 {
00137 return ((uint8_t *) base)[index>>1] & 0xf;
00138 }
00139 else
00140 {
00141 return (((uint8_t *) base)[index>>1] & 0xf0) >> 4;
00142 }
00143 };
00144 inline void genomeSequenceSet(void *base, genomeIndex_t index, uint8_t v)
00145 {
00146 if ((index&1)==0)
00147 {
00148 ((uint8_t *) base)[index>>1] = (((uint8_t *) base)[index>>1] & 0xf0) | v;
00149 }
00150 else
00151 {
00152 ((uint8_t *) base)[index>>1] = (((uint8_t *) base)[index>>1] & 0x0f) | v<<4;
00153 }
00154 }
00155
00156 inline size_t mmapGenomeSequenceElementCount2Bytes(genomeIndex_t i)
00157 {
00158 return sizeof(uint8_t) * i / 2;
00159 }
00160
00161 #define UMFA_COOKIE 0x1b7933a1 // unique cookie id
00162 #define UMFA_VERSION 20100401U // YYYYMMDD of last change to file layout
00163
00164 typedef MemoryMapArray<
00165 uint8_t,
00166 genomeIndex_t,
00167 UMFA_COOKIE,
00168 UMFA_VERSION,
00169 genomeSequenceAccess,
00170 genomeSequenceSet,
00171 mmapGenomeSequenceElementCount2Bytes,
00172 genomeSequenceMmapHeader
00173 > genomeSequenceArray;
00174
00175 class PackedRead
00176 {
00177 void set(int index, int val)
00178 {
00179 packedBases[index>>1] =
00180 (packedBases[index>>1]
00181 & ~(7<<((index&0x01)<<2)))
00182 | ((val&0x0f)<<((index&0x1)<<2));
00183 }
00184 public:
00185 std::vector<uint8_t> packedBases;
00186 uint32_t length;
00187 int size()
00188 {
00189 return length;
00190 }
00191 void clear()
00192 {
00193 packedBases.clear();
00194 length=0;
00195 }
00196 void set(const char *rhs, int padWithNCount = 0);
00197 uint8_t operator [](int index)
00198 {
00199 return (packedBases[index>>1] >> ((index&0x1)<<2)) & 0xf;
00200 }
00201 };
00202
00203
00204
00205
00206
00207
00208
00209
00210
00211
00212
00213
00214
00215
00216
00217
00218
00219
00220
00221
00222
00223
00224
00225
00226
00227
00228
00229
00230
00231
00232
00233
00234
00235
00236
00237
00238 class GenomeSequence : public genomeSequenceArray
00239 {
00240 public:
00241
00242 static const int baseAIndex = 000;
00243 static const int baseTIndex = 001;
00244 static const int baseCIndex = 002;
00245 static const int baseGIndex = 003;
00246 static const int baseNIndex = 004;
00247 static const int baseXIndex = 005;
00248
00249
00250
00251
00252
00253 static unsigned char base2int[];
00254
00255
00256 static const char int2base[];
00257 static const char int2colorSpace[];
00258 static unsigned char base2complement[];
00259
00260 private:
00261 int _debugFlag;
00262
00263 std::ostream *_progressStream;
00264 bool _colorSpace;
00265
00266
00267 bool _createOverwrite;
00268
00269 std::string _baseFilename;
00270 std::string _referenceFilename;
00271 std::string _fastaFilename;
00272 std::string _umfaFilename;
00273 std::string _application;
00274
00275 MemoryMap _umfaFile;
00276
00277 void setup(const char *referenceFilename);
00278
00279 public:
00280
00281 GenomeSequence();
00282 void constructorClear();
00283
00284
00285
00286
00287 GenomeSequence(std::string &referenceFilename)
00288 {
00289 constructorClear();
00290 setup(referenceFilename.c_str());
00291 }
00292
00293
00294
00295
00296
00297 GenomeSequence(const char *referenceFilename)
00298 {
00299 constructorClear();
00300 setup(referenceFilename);
00301 }
00302
00303
00304 ~GenomeSequence();
00305
00306
00307
00308
00309
00310
00311 bool open(bool isColorSpace = false, int flags = O_RDONLY);
00312
00313
00314
00315
00316
00317
00318 bool open(const char *filename, int flags = O_RDONLY)
00319 {
00320 _umfaFilename = filename;
00321 return genomeSequenceArray::open(filename, flags);
00322 }
00323
00324
00325 private:
00326 bool _searchCommonFileSuffix;
00327 public:
00328 bool create();
00329
00330
00331 bool create(bool isColor) {setColorSpace(isColor); return create(); }
00332
00333
00334
00335
00336 void setProgressStream(std::ostream &progressStream) {_progressStream = &progressStream;}
00337 void setColorSpace(bool colorSpace) {_colorSpace = colorSpace; }
00338 void setSearchCommonFileSuffix(bool searchCommonFileSuffix) {_searchCommonFileSuffix = searchCommonFileSuffix;}
00339
00340
00341 void setCreateOverwrite(bool createOverwrite) {_createOverwrite = createOverwrite;}
00342
00343 bool loadFastaData(const char *filename);
00344
00345
00346
00347
00348
00349
00350 bool setReferenceName(std::string referenceFilename);
00351
00352
00353
00354
00355 void setApplication(std::string application)
00356 {
00357 _application = application;
00358 }
00359 const std::string &getFastaName() const
00360 {
00361 return _fastaFilename;
00362 }
00363 const std::string &getReferenceName() const
00364 {
00365 return _referenceFilename;
00366 }
00367
00368
00369
00370 bool isColorSpace() const
00371 {
00372 return _colorSpace;
00373 }
00374
00375
00376
00377 genomeIndex_t getNumberBases() const
00378 {
00379 return getElementCount();
00380 }
00381
00382
00383
00384
00385
00386
00387
00388
00389 int getChromosome(genomeIndex_t position) const;
00390
00391
00392
00393
00394
00395
00396
00397
00398 int getChromosome(const char *chromosomeName) const;
00399
00400
00401 int getChromosomeCount() const;
00402
00403
00404
00405
00406
00407 genomeIndex_t getChromosomeStart(int chromosomeIndex) const
00408 {
00409 if (chromosomeIndex==INVALID_CHROMOSOME_INDEX) return INVALID_GENOME_INDEX;
00410 return header->_chromosomes[chromosomeIndex].start;
00411 }
00412
00413
00414
00415
00416
00417 genomeIndex_t getChromosomeSize(int chromosomeIndex) const
00418 {
00419 if (chromosomeIndex==INVALID_CHROMOSOME_INDEX) return 0;
00420 return header->_chromosomes[chromosomeIndex].size;
00421 }
00422
00423
00424
00425
00426
00427
00428 genomeIndex_t getGenomePosition(
00429 const char *chromosomeName,
00430 unsigned int chromosomeIndex) const;
00431
00432
00433
00434
00435
00436
00437 genomeIndex_t getGenomePosition(
00438 int chromosome,
00439 unsigned int chromosomeIndex) const;
00440
00441
00442
00443 genomeIndex_t getGenomePosition(const char *chromosomeName) const;
00444 genomeIndex_t getGenomePosition(int chromosomeIndex) const;
00445
00446 const std::string &getBaseFilename() const
00447 {
00448 return _baseFilename;
00449 }
00450
00451 const char *getChromosomeName(int chromosomeIndex) const
00452 {
00453 return header->_chromosomes[chromosomeIndex].name;
00454 }
00455
00456 void setDebugFlag(bool d)
00457 {
00458 _debugFlag = d;
00459 }
00460
00461 genomeIndex_t sequenceLength() const
00462 {
00463 return (genomeIndex_t) header->elementCount;
00464 }
00465
00466 const char *chromosomeName(int chr) const
00467 {
00468 return header->_chromosomes[chr].name;
00469 }
00470
00471 void sanityCheck(MemoryMap &fasta) const;
00472
00473
00474 std::string IntegerToSeq(unsigned int n, unsigned int wordsize) const;
00475
00476 bool wordMatch(unsigned int index, std::string &word) const;
00477 bool printNearbyWords(unsigned int index, unsigned int variance, std::string &word) const;
00478
00479
00480 char BasePair(char c) const
00481 {
00482 return base2complement[(int) c];
00483 }
00484
00485 void dumpSequenceSAMDictionary(std::ostream&) const;
00486 void dumpHeaderTSV(std::ostream&) const;
00487
00488
00489
00490
00491
00492
00493
00494
00495
00496
00497
00498
00499
00500
00501
00502
00503
00504
00505
00506
00507
00508 #if 0
00509
00510
00511
00512
00513
00514
00515
00516 inline char operator[](genomeIndex_t index) const
00517 {
00518 return int2base[(*((genomeSequenceArray*) this))[index]];
00519 }
00520 #endif
00521
00522 inline char operator[](genomeIndex_t index) const
00523 {
00524 uint8_t val;
00525 if (index < getNumberBases())
00526 {
00527 if ((index&1)==0)
00528 {
00529 val = ((uint8_t *) data)[index>>1] & 0xf;
00530 }
00531 else
00532 {
00533 val = (((uint8_t *) data)[index>>1] & 0xf0) >> 4;
00534 }
00535 }
00536 else
00537 {
00538 val = baseNIndex;
00539 }
00540 val = isColorSpace() ? int2colorSpace[val] : int2base[val];
00541 return val;
00542 }
00543
00544 inline uint8_t getInteger(genomeIndex_t index) const
00545 {
00546 return (*((genomeSequenceArray*) this))[index];
00547 }
00548
00549 inline void set(genomeIndex_t index, char value)
00550 {
00551 genomeSequenceArray::set(index, base2int[(uint8_t) value]);
00552 }
00553
00554
00555
00556
00557
00558
00559
00560
00561
00562
00563 uint8_t *getDataPtr(genomeIndex_t index)
00564 {
00565 return ((uint8_t *) data + index/2);
00566 }
00567 private:
00568
00569
00570
00571
00572
00573 bool setChromosomeMD5andLength(uint32_t whichChromosome);
00574
00575 public:
00576
00577
00578
00579 void getReverseRead(std::string &read);
00580
00581
00582 void getReverseRead(String& read);
00583
00584
00585 int debugPrintReadValidation(
00586 std::string &read,
00587 std::string &quality,
00588 char direction,
00589 genomeIndex_t readLocation,
00590 int sumQuality,
00591 int mismatchCount,
00592 bool recurse = true
00593 );
00594
00595 void getString(std::string &str, int chromosome, genomeIndex_t index, int baseCount) const;
00596 void getString(String &str, int chromosome, genomeIndex_t index, int baseCount) const;
00597
00598
00599
00600
00601
00602 void getString(std::string &str, genomeIndex_t index, int baseCount) const;
00603 void getString(String &str, genomeIndex_t index, int baseCount) const;
00604
00605 void getHighLightedString(std::string &str, genomeIndex_t index, int baseCount, genomeIndex_t highLightStart, genomeIndex_t highLightEnd) const;
00606
00607 void print30(genomeIndex_t) const;
00608
00609
00610 genomeIndex_t simpleLocalAligner(std::string &read, std::string &quality, genomeIndex_t index, int windowSize) const;
00611
00612
00613
00614
00615
00616
00617
00618
00619
00620
00621
00622
00623
00624
00625 int getMismatchCount(std::string &read, genomeIndex_t location, char exclude='\0') const
00626 {
00627 int mismatchCount = 0;
00628 for (uint32_t i=0; i<read.size(); i++)
00629 if (read[i]!=exclude) mismatchCount += read[i]!=(*this)[location + i];
00630 return mismatchCount;
00631 };
00632
00633
00634
00635
00636
00637
00638 int getSumQ(std::string &read, std::string &qualities, genomeIndex_t location) const
00639 {
00640 int sumQ = 0;
00641 for (uint32_t i=0; i<read.size(); i++)
00642 sumQ += (read[i]!=(*this)[location + i] ? (qualities[i]-33) : 0);
00643 return sumQ;
00644 };
00645
00646 void getMismatchHatString(std::string &result, const std::string &read, genomeIndex_t location) const;
00647 void getMismatchString(std::string &result, const std::string &read, genomeIndex_t location) const;
00648
00649
00650
00651 void getChromosomeAndIndex(std::string &, genomeIndex_t) const;
00652 void getChromosomeAndIndex(String &, genomeIndex_t) const;
00653
00654
00655
00656
00657
00658
00659
00660
00661
00662
00663
00664
00665 bool checkRead(
00666 std::string &read,
00667 std::string &qualities,
00668 std::string &cigar,
00669 int &sumQ,
00670 int &gapOpenCount,
00671 int &gapExtendCount,
00672 int &gapDeleteCount,
00673 std::string &result
00674 ) const;
00675
00676 bool populateDBSNP(mmapArrayBool_t &dbSNP, std::ifstream &inputFile) const;
00677
00678
00679
00680
00681
00682
00683
00684
00685
00686
00687
00688
00689
00690
00691
00692 bool loadDBSNP(mmapArrayBool_t &dbSNP, const char *inputFileName) const;
00693 };
00694
00695 #endif