GenomeSequence.h

00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #ifndef _GENOME_SEQUENCE_H
00019 #define _GENOME_SEQUENCE_H
00020 
00021 #include <sys/types.h>
00022 #include <sys/stat.h>
00023 #if !defined(MD5_DIGEST_LENGTH)
00024 #define MD5_DIGEST_LENGTH 16
00025 #endif
00026 #include <string>
00027 #include "MemoryMapArray.h"
00028 #include "BaseAsciiMap.h"
00029 
00030 // Goncalo's String class
00031 #include "StringArray.h"
00032 
00033 #include <stdint.h>
00034 
00035 // stdint.h will define this, but only if __STDC_LIMIT_MACROS was
00036 // defined prior to the first include of stdint.h
00037 #ifndef UINT32_MAX
00038 #define UINT32_MAX 0xFFFFFFFF
00039 #endif
00040 
00041 typedef uint32_t    genomeIndex_t;
00042 #define INVALID_GENOME_INDEX UINT32_MAX
00043 
00044 // chromosome index is just a signed int, so this is ok here:
00045 #define INVALID_CHROMOSOME_INDEX -1
00046 
00047 #include "GenomeSequenceHelpers.h"
00048 
00049 #define UMFA_COOKIE 0x1b7933a1  // unique cookie id
00050 #define UMFA_VERSION 20100401U  // YYYYMMDD of last change to file layout
00051 
00052 typedef MemoryMapArray<
00053 uint32_t,
00054 genomeIndex_t,
00055 UMFA_COOKIE,
00056 UMFA_VERSION,
00057 PackedAccess_4Bit,
00058 PackedAssign_4Bit,
00059 Packed4BitElementCount2Bytes,
00060 genomeSequenceMmapHeader
00061 > genomeSequenceArray;
00062 
00063 
00064 // std::string &operator = (std::string &lhs, const PackedRead &rhs);
00065 //
00066 
00067 //!  Create/Access/Modify/Load Genome Sequences stored as binary mapped files.
00068 /*!
00069   GenomeSequence is designed to be a high performance shared access
00070   reference object.
00071 
00072   It is implemented as a MemoryMapArray template object with unsigned 8
00073   bit ints, each of which stores two bases.  Although 2 bits could be used,
00074   most references have more than four symbols (usually at least including
00075   'N', indicating an unknown or masked out base).
00076 
00077   Normal use of this class follows these steps:
00078    -# create the reference
00079     -# instantiate the GenomeSequence class object
00080     -# create the actual file (memory mapped) that is to hold the data
00081     -# populate the data using GenomeSequence::set
00082    -# use the reference
00083     -# use the reference by instantiating a GenomeSequence object
00084     -# either use the constructor with the reference filename
00085     -# or use GenomeSequence::setReferenceName() followed by ::open
00086     -# access the bases via the overloaded array operator []
00087     -# check sequence length by using GenomeSequence::getNumberBases()
00088    -# accessing chromosomes in the reference
00089     -# you typically will need to know about the chromosomes in the sequence
00090     -# see methods and docs with prefix 'getChromosome'
00091 
00092   Sharing is accomplished using the mmap() function via the MemoryMap
00093   base class.  This allows a potentially large genome reference to be
00094   shared among a number of simultaneously executing instances of one or
00095   more programs sharing the same reference.
00096  */
00097 
00098 
00099 class GenomeSequence : public genomeSequenceArray
00100 {
00101 private:
00102     int                     _debugFlag;
00103 
00104     std::ostream            *_progressStream;
00105     bool                    _colorSpace;
00106 
00107     // Whether or not to overwrite an existing file when creating a umfa file (via create).
00108     bool                    _createOverwrite;
00109 
00110     std::string             _baseFilename;   // for later use by WordIndex create and open
00111     std::string             _referenceFilename;
00112     std::string             _fastaFilename;
00113     std::string             _umfaFilename;
00114     std::string             _application;        // only used in ::create()
00115 
00116     MemoryMap               _umfaFile;
00117 
00118     void setup(const char *referenceFilename);
00119 
00120 public:
00121     /// Simple constructor - no implicit file open
00122     GenomeSequence();
00123     void constructorClear();
00124     /// \brief attempt to open an existing sequence
00125     ///
00126     /// \param referenceFilename the name of the reference fasta file to open
00127     /// \param debug if true, additional debug information is printed
00128     GenomeSequence(std::string &referenceFilename)
00129     {
00130         constructorClear();
00131         setup(referenceFilename.c_str());
00132     }
00133 
00134     /// Smarter constructor - attempt to open an existing sequence
00135     ///
00136     /// \param referenceFilename the name of the reference fasta file to open
00137     /// \param debug if true, additional debug information is printed
00138     GenomeSequence(const char *referenceFilename)
00139     {
00140         constructorClear();
00141         setup(referenceFilename);
00142     }
00143 
00144     /// Close the file if open and destroy the object
00145     ~GenomeSequence();
00146 
00147     /// open the reference specified using GenomeSequence::setReferenceName
00148     ///
00149     /// \param isColorSpace open the color space reference
00150     /// \param flags pass through to the ::open() call (O_RDWR lets you modify the contents)
00151     /// \return false for success, true otherwise
00152     bool open(bool isColorSpace = false, int flags = O_RDONLY);
00153 
00154     /// open the given file as the genome (no filename munging occurs).
00155     ///
00156     /// \param filename the name of the file to open
00157     /// \param flags pass through to the ::open() call (O_RDWR lets you modify the contents)
00158     /// \return false for success, true otherwise
00159     bool open(const char *filename, int flags = O_RDONLY)
00160     {
00161         _umfaFilename = filename;
00162 #if 0
00163         if(genomeSequenceArray::open(filename, flags)) {
00164             //
00165             // if load of the umfa (binary encoded file) fails, attempt
00166             // to load it as a fasta file.
00167             //
00168             return loadFasta();
00169         }
00170 #endif
00171         return false;
00172     }
00173 
00174 private:
00175     bool    _searchCommonFileSuffix;
00176 public:
00177     bool create(bool isColor = false);
00178 
00179     // NEW API?
00180 
00181     // load time modifiers:
00182     /// if set, then show progress when creating and pre-fetching
00183     void setProgressStream(std::ostream &progressStream) {_progressStream = &progressStream;}
00184     void setColorSpace(bool colorSpace) {_colorSpace = colorSpace; }
00185     void setSearchCommonFileSuffix(bool searchCommonFileSuffix) {_searchCommonFileSuffix = searchCommonFileSuffix;}
00186 
00187     // Set whether or not to overwrite a umfa file when calling create.
00188     void setCreateOverwrite(bool createOverwrite) {_createOverwrite = createOverwrite;}
00189 
00190     bool loadFastaData(const char *filename);
00191 
00192     /// set the reference name that will be used in open()
00193     /// \param referenceFilename the name of the reference fasta file to open
00194     /// \return false for success, true otherwise
00195     ///
00196     /// \sa open()
00197     bool setReferenceName(std::string referenceFilename);
00198 
00199     /// set the application name in the binary file header
00200     ///
00201     /// \param application name of the application
00202     void setApplication(std::string application)
00203     {
00204         _application = application;     // used in ::create() to set application name
00205     }
00206     const std::string &getFastaName() const
00207     {
00208         return _fastaFilename;
00209     }
00210     const std::string &getReferenceName() const
00211     {
00212         return _referenceFilename;
00213     }
00214 
00215     /// tell us if we are a color space reference or not
00216     /// \return true if colorspace, false otherwise
00217     bool isColorSpace() const
00218     {
00219         return _colorSpace;
00220     }
00221 
00222     /// return the number of bases represented in this reference
00223     /// \return count of bases
00224     genomeIndex_t   getNumberBases() const
00225     {
00226         return getElementCount();
00227     }
00228 
00229     /// given a whole genome index, get the chromosome it is located in
00230     ///
00231     /// This is done via a binary search of the chromosome table in the
00232     /// header of the mapped file, so it is O(log(N))
00233     ///
00234     /// \param 0-based position the base in the genome
00235     /// \return 0-based index into chromosome table - INVALID_CHROMOSOME_INDEX if error
00236     int getChromosome(genomeIndex_t position) const;
00237 
00238     /// given a chromosome name, return the chromosome index
00239     ///
00240     /// This is done via a linear search of the chromosome table in the
00241     /// header of the mapped file, so it is O(N)
00242     ///
00243     /// \param chromosomeName the name of the chromosome - exact match only
00244     /// \return 0-based index into chromosome table - INVALID_CHROMOSOME_INDEX if error
00245     int getChromosome(const char *chromosomeName) const;
00246     /// Return the number of chromosomes in the genome
00247     /// \return number of chromosomes in the genome
00248     int getChromosomeCount() const;
00249 
00250     /// given a chromosome, return the genome base it starts in
00251     ///
00252     /// \param 0-based chromosome index
00253     /// \return 0-based genome index of the base that starts the chromosome
00254     genomeIndex_t getChromosomeStart(int chromosomeIndex) const
00255     {
00256         if (chromosomeIndex==INVALID_CHROMOSOME_INDEX) return INVALID_GENOME_INDEX;
00257         return header->_chromosomes[chromosomeIndex].start;
00258     }
00259 
00260     /// given a chromosome, return its size in bases
00261     ///
00262     /// \param 0-based chromosome index
00263     /// \return size of the chromosome in bases
00264     genomeIndex_t getChromosomeSize(int chromosomeIndex) const
00265     {
00266         if (chromosomeIndex==INVALID_CHROMOSOME_INDEX) return 0;
00267         return header->_chromosomes[chromosomeIndex].size;
00268     }
00269 
00270     /// given a chromosome name and position, return the genome position
00271     ///
00272     /// \param chromosomeName name of the chromosome - exact match only
00273     /// \param chromosomeIndex 1-based chromosome position
00274     /// \return genome index of the above chromosome position
00275     genomeIndex_t getGenomePosition(
00276         const char *chromosomeName,
00277         unsigned int chromosomeIndex) const;
00278 
00279     /// given a chromosome index and position, return the genome position
00280     ///
00281     /// \param chromosome index of the chromosome
00282     /// \param chromosomeIndex 1-based chromosome position
00283     /// \return genome index of the above chromosome position
00284     genomeIndex_t getGenomePosition(
00285         int chromosome,
00286         unsigned int chromosomeIndex) const;
00287 
00288     /// given the chromosome name, get the corresponding 0 based genome index
00289     /// for the start of that chromosome
00290     genomeIndex_t getGenomePosition(const char *chromosomeName) const;
00291     genomeIndex_t getGenomePosition(int chromosomeIndex) const;
00292 
00293     const std::string &getBaseFilename() const
00294     {
00295         return _baseFilename;
00296     }
00297 
00298     const char *getChromosomeName(int chromosomeIndex) const
00299     {
00300         return header->_chromosomes[chromosomeIndex].name;
00301     }
00302 
00303     void setDebugFlag(bool d)
00304     {
00305         _debugFlag = d;
00306     }
00307 
00308     genomeIndex_t sequenceLength() const
00309     {
00310         return (genomeIndex_t) header->elementCount;
00311     }
00312 
00313     const char *chromosomeName(int chr) const
00314     {
00315         return header->_chromosomes[chr].name;
00316     }
00317 
00318     void sanityCheck(MemoryMap &fasta) const;
00319 
00320     // TODO - this will be moved somewhere else and be made a static method.
00321     std::string IntegerToSeq(unsigned int n, unsigned int wordsize) const;
00322 
00323     bool wordMatch(unsigned int index, std::string &word) const;
00324     bool printNearbyWords(unsigned int index, unsigned int variance, std::string &word) const;
00325 
00326     // TODO - this will be moved somewhere else and be made a static method.
00327     char BasePair(char c) const
00328     {
00329         return BaseAsciiMap::base2complement[(int) c];
00330     }
00331 
00332     void dumpSequenceSAMDictionary(std::ostream&) const;
00333     void dumpHeaderTSV(std::ostream&) const;
00334 
00335     ///
00336     /// Return the bases in base space or color space for within range index, ot
00337     /// @param index the array-like index (0 based).
00338     /// @return ACTGN in base space; 0123N for color space; and 'N' for invalid.
00339     /// For color space, index i represents the transition of base at position (i-1) to base at position i
00340     ///
00341     /// NB: bounds checking here needs to be deprecated - do not assume it
00342     /// will exist - the call must clip reads so that this routine is never
00343     /// called with a index value larger than the genome.
00344     ///
00345     /// The reason for this is simply that this routine gets called hundreds
00346     /// of billions of time in one run of karma, which will absolutely kill
00347     /// performance.  Every single instruction here matters a great, great deal.
00348     ///
00349 
00350     //
00351     // 3.5% improvement for color space matching:
00352     // I guess the compiler isn't inlining 3 functions deep.
00353     //
00354 
00355 #if 0
00356     // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
00357     // The following code does not work even in the base space,
00358     // since the memory layout has changed. USE IT WITH CAUTIOUS!
00359     //
00360     // This block of code is a functional duplicate of the following
00361     // code - leave this here for reference and possibly later
00362     // performance testing as well as compiler evaluation.
00363     inline char operator[](genomeIndex_t index) const
00364     {
00365         return BaseAsciiMap::int2base[(*((genomeSequenceArray*) this))[index]];
00366     }
00367 #endif
00368 
00369     inline char operator[](genomeIndex_t index) const
00370     {
00371         uint8_t val;
00372         if (index < getNumberBases())
00373         {
00374             if ((index&1)==0)
00375             {
00376                 val = ((uint8_t *) data)[index>>1] & 0xf;
00377             }
00378             else
00379             {
00380                 val = (((uint8_t *) data)[index>>1] & 0xf0) >> 4;
00381             }
00382         }
00383         else
00384         {
00385             val = BaseAsciiMap::baseNIndex;
00386         }
00387         val = isColorSpace() ? BaseAsciiMap::int2colorSpace[val] : 
00388             BaseAsciiMap::int2base[val];
00389         return val;
00390     }
00391 
00392     /// given a chromosome name and 1-based position, return the reference base.
00393     /// \param chromosomeName name of the chromosome - exact match only
00394     /// \param chromosomeIndex 1-based chromosome position
00395     /// \return reference base at the above chromosome position
00396     inline char getBase(const char *chromosomeName,
00397                         unsigned int chromosomeIndex) const
00398     {
00399         genomeIndex_t index = 
00400             getGenomePosition(chromosomeName, chromosomeIndex);
00401         if(index == INVALID_GENOME_INDEX)
00402         {
00403             // Invalid position, so return 'N'
00404             return('N');
00405         }
00406         return((*this)[index]);
00407     }
00408 
00409 
00410     inline uint8_t getInteger(genomeIndex_t index) const
00411     {
00412         return (*((genomeSequenceArray*) this))[index];
00413     }
00414 
00415     inline void set(genomeIndex_t index, char value)
00416     {
00417         genomeSequenceArray::set(index, 
00418                                  BaseAsciiMap::base2int[(uint8_t) value]);
00419     }
00420 
00421     /// obtain the pointer to the raw data for other access methods
00422     ///
00423     /// this is a fairly ugly hack to reach into the
00424     /// raw genome vector, get the byte that encodes
00425     /// two bases, and return it.  This is used by
00426     /// karma ReadIndexer::getSumQ to compare genome
00427     /// matchines by byte (two bases at a time) to speed
00428     /// it up.
00429     ///
00430     uint8_t *getDataPtr(genomeIndex_t index)
00431     {
00432         return ((uint8_t *) data + index/2);
00433     }
00434 private:
00435     ///
00436     /// when creating the genome mapped file, we call this to set
00437     /// the MD5 checksum of the chromosome sequence and length so that
00438     /// we can write the SAM SQ headers properly.
00439     /// NB: operates on the last fully loaded chromosome.
00440     bool setChromosomeMD5andLength(uint32_t whichChromosome);
00441 
00442 public:
00443 
00444     // TODO - this will be moved somewhere else and be made a static method.
00445     // replace read with the reversed one
00446     void getReverseRead(std::string &read);
00447 
00448     // TODO - this will be moved somewhere else and be made a static method.
00449     void getReverseRead(String& read);
00450 
00451     // debug the given read - print nice results
00452     int debugPrintReadValidation(
00453         std::string &read,
00454         std::string &quality,
00455         char   direction,
00456         genomeIndex_t   readLocation,
00457         int sumQuality,
00458         int mismatchCount,
00459         bool recurse = true
00460         );
00461 
00462     //
00463     // get the sequence from this GenomeSequence using the specified chromosome and 0-based position.
00464     // if baseCount < 0, get the reverse complement
00465     // that starts at index (but do not reverse the string?)
00466     void getString(std::string &str, int chromosome, uint32_t index, int baseCount) const;
00467     void getString(String &str, int chromosome, uint32_t index, int baseCount) const;
00468     //
00469     // get the sequence from this GenomeSequence.
00470     // if baseCount < 0, get the reverse complement
00471     // that starts at index (but do not reverse the string?)
00472     //
00473     void getString(std::string &str, genomeIndex_t index, int baseCount) const;
00474     void getString(String &str, genomeIndex_t index, int baseCount) const;
00475 
00476     void getHighLightedString(std::string &str, genomeIndex_t index, int baseCount, genomeIndex_t highLightStart, genomeIndex_t highLightEnd) const;
00477 
00478     void print30(genomeIndex_t) const;
00479 
00480     // for debugging, not for speed:
00481     genomeIndex_t simpleLocalAligner(std::string &read, std::string &quality, genomeIndex_t index, int windowSize) const;
00482 
00483 
00484     // TODO - these methods do not handle a CIGAR string and do not handle '=' when a read matches the reference.
00485     // They are here for alignment and should be moved to the aligner (karma).
00486     // OR they should optionally take a CIGAR and use that if specified....
00487     // maybe they should be helper methods that are found somewhere else
00488 
00489     /// Return the mismatch count, disregarding CIGAR strings
00490     ///
00491     /// \param read is the sequence we're counting mismatches in
00492     /// \param location is where in the genmoe we start comparing
00493     /// \param exclude is a wildcard character (e.g. '.' or 'N')
00494     ///
00495     /// \return number of bases that don't match the reference, except those that match exclude
00496     int getMismatchCount(std::string &read, genomeIndex_t location, char exclude='\0') const
00497     {
00498         int mismatchCount = 0;
00499         for (uint32_t i=0; i<read.size(); i++)
00500             if (read[i]!=exclude) mismatchCount += read[i]!=(*this)[location + i];
00501         return mismatchCount;
00502     };
00503 
00504     /// brute force sumQ - no sanity checking
00505     ///
00506     /// \param read shotgun sequencer read string
00507     /// \param qualities phred quality string of same length
00508     /// \param location the alignment location to check sumQ
00509     int getSumQ(std::string &read, std::string &qualities, genomeIndex_t location) const
00510     {
00511         int sumQ = 0;
00512         for (uint32_t i=0; i<read.size(); i++)
00513             sumQ += (read[i]!=(*this)[location + i] ? (qualities[i]-33) : 0);
00514         return sumQ;
00515     };
00516     // return a string highlighting mismatch postions with '^' chars:
00517     void getMismatchHatString(std::string &result, const std::string &read, genomeIndex_t location) const;
00518     void getMismatchString(std::string &result, const std::string &read, genomeIndex_t location) const;
00519 
00520     // END TODO
00521 
00522     void getChromosomeAndIndex(std::string &, genomeIndex_t) const;
00523     void getChromosomeAndIndex(String &, genomeIndex_t) const;
00524 
00525     /// check a SAM format read, using phred quality scores and
00526     /// the CIGAR string to determine if it is correct.
00527     ///
00528     ///
00529     /// \param read the read in base space
00530     /// \param qualities the phred encoded qualities (Sanger, not Illumina)
00531     /// \param cigar the SAM file CIGAR column
00532     /// \param sumQ if >0 on entry, is checked against the computed sumQ
00533     /// \param insertions count of insertions found in
00534     ///
00535     ///
00536     bool checkRead(
00537         std::string &read,
00538         std::string &qualities,
00539         std::string &cigar,
00540         int &sumQ,              // input and output
00541         int &gapOpenCount,      // output only
00542         int &gapExtendCount,    // output only
00543         int &gapDeleteCount,    // output only
00544         std::string &result
00545     ) const;
00546 
00547     bool populateDBSNP(mmapArrayBool_t &dbSNP, IFILE inputFile) const;
00548 
00549     /// user friendly dbSNP loader.
00550     ///
00551     /// \param inputFileName may be empty, point to a text file or a dbSNP vector file
00552     ///
00553     /// In all cases, dbSNP is returned the same length as this genome.
00554     ///
00555     /// When no SNPs are loaded, all values are false.
00556     ///
00557     /// When a text file is given, the file is parsed with two space
00558     /// separated columns - the first column is the chromosome name, and
00559     /// the second is the 1-based chromosome position of the SNP.
00560     ///
00561     /// \return false if a dbSNP file was correctly loaded, true otherwise
00562     ///
00563     bool loadDBSNP(mmapArrayBool_t &dbSNP, const char *inputFileName) const;
00564 };
00565 
00566 #endif