libStatGen Software  1
GenomeSequence.h
00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #ifndef _GENOME_SEQUENCE_H
00019 #define _GENOME_SEQUENCE_H
00020 
00021 #include <sys/types.h>
00022 #include <sys/stat.h>
00023 #if !defined(MD5_DIGEST_LENGTH)
00024 #define MD5_DIGEST_LENGTH 16
00025 #endif
00026 #include <string>
00027 #include "MemoryMapArray.h"
00028 #include "BaseAsciiMap.h"
00029 
00030 // Goncalo's String class
00031 #include "StringArray.h"
00032 
00033 #include <stdint.h>
00034 
00035 // stdint.h will define this, but only if __STDC_LIMIT_MACROS was
00036 // defined prior to the first include of stdint.h
00037 #ifndef UINT32_MAX
00038 #define UINT32_MAX 0xFFFFFFFF
00039 #endif
00040 
00041 typedef uint32_t    genomeIndex_t;
00042 #define INVALID_GENOME_INDEX UINT32_MAX
00043 
00044 // chromosome index is just a signed int, so this is ok here:
00045 #define INVALID_CHROMOSOME_INDEX -1
00046 
00047 #include "GenomeSequenceHelpers.h"
00048 
00049 #define UMFA_COOKIE 0x1b7933a1  // unique cookie id
00050 #define UMFA_VERSION 20100401U  // YYYYMMDD of last change to file layout
00051 
00052 typedef MemoryMapArray<
00053 uint32_t,
00054 genomeIndex_t,
00055 UMFA_COOKIE,
00056 UMFA_VERSION,
00057 PackedAccess_4Bit,
00058 PackedAssign_4Bit,
00059 Packed4BitElementCount2Bytes,
00060 genomeSequenceMmapHeader
00061 > genomeSequenceArray;
00062 
00063 
00064 // std::string &operator = (std::string &lhs, const PackedRead &rhs);
00065 //
00066 
00067 //!  Create/Access/Modify/Load Genome Sequences stored as binary mapped files.
00068 /*!
00069   GenomeSequence is designed to be a high performance shared access
00070   reference object.
00071 
00072   It is implemented as a MemoryMapArray template object with unsigned 8
00073   bit ints, each of which stores two bases.  Although 2 bits could be used,
00074   most references have more than four symbols (usually at least including
00075   'N', indicating an unknown or masked out base).
00076 
00077   Normal use of this class follows these steps:
00078    -# create the reference
00079     -# instantiate the GenomeSequence class object
00080     -# create the actual file (memory mapped) that is to hold the data
00081     -# populate the data using GenomeSequence::set
00082    -# use the reference
00083     -# use the reference by instantiating a GenomeSequence object
00084     -# either use the constructor with the reference filename
00085     -# or use GenomeSequence::setReferenceName() followed by ::open
00086     -# access the bases via the overloaded array operator []
00087     -# check sequence length by using GenomeSequence::getNumberBases()
00088    -# accessing chromosomes in the reference
00089     -# you typically will need to know about the chromosomes in the sequence
00090     -# see methods and docs with prefix 'getChromosome'
00091 
00092   Sharing is accomplished using the mmap() function via the MemoryMap
00093   base class.  This allows a potentially large genome reference to be
00094   shared among a number of simultaneously executing instances of one or
00095   more programs sharing the same reference.
00096  */
00097 
00098 
00099 class GenomeSequence : public genomeSequenceArray
00100 {
00101 private:
00102     int                     _debugFlag;
00103 
00104     std::ostream            *_progressStream;
00105     bool                    _colorSpace;
00106 
00107     // Whether or not to overwrite an existing file when creating a umfa file (via create).
00108     bool                    _createOverwrite;
00109 
00110     std::string             _baseFilename;   // for later use by WordIndex create and open
00111     std::string             _referenceFilename;
00112     std::string             _fastaFilename;
00113     std::string             _umfaFilename;
00114     std::string             _application;        // only used in ::create()
00115 
00116     MemoryMap               _umfaFile;
00117 
00118     void setup(const char *referenceFilename);
00119 
00120 public:
00121     /// Simple constructor - no implicit file open
00122     GenomeSequence();
00123     void constructorClear();
00124     /// \brief attempt to open an existing sequence
00125     ///
00126     /// \param referenceFilename the name of the reference fasta file to open
00127     /// \param debug if true, additional debug information is printed
00128     GenomeSequence(std::string &referenceFilename)
00129     {
00130         constructorClear();
00131         setup(referenceFilename.c_str());
00132     }
00133 
00134     /// Smarter constructor - attempt to open an existing sequence
00135     ///
00136     /// \param referenceFilename the name of the reference fasta file to open
00137     /// \param debug if true, additional debug information is printed
00138     GenomeSequence(const char *referenceFilename)
00139     {
00140         constructorClear();
00141         setup(referenceFilename);
00142     }
00143 
00144     /// Close the file if open and destroy the object
00145     ~GenomeSequence();
00146 
00147     /// open the reference specified using GenomeSequence::setReferenceName
00148     ///
00149     /// \param isColorSpace open the color space reference
00150     /// \param flags pass through to the ::open() call (O_RDWR lets you modify the contents)
00151     /// \return false for success, true otherwise
00152     bool open(bool isColorSpace = false, int flags = O_RDONLY);
00153 
00154     /// open the given file as the genome (no filename munging occurs).
00155     ///
00156     /// \param filename the name of the file to open
00157     /// \param flags pass through to the ::open() call (O_RDWR lets you modify the contents)
00158     /// \return false for success, true otherwise
00159     bool open(const char *filename, int flags = O_RDONLY)
00160     {
00161         _umfaFilename = filename;
00162         // TODO - should this method be doing something???
00163         return false;
00164     }
00165 
00166 private:
00167     bool    _searchCommonFileSuffix;
00168 public:
00169     bool create(bool isColor = false);
00170 
00171     // NEW API?
00172 
00173     // load time modifiers:
00174     /// if set, then show progress when creating and pre-fetching
00175     void setProgressStream(std::ostream &progressStream) {_progressStream = &progressStream;}
00176     void setColorSpace(bool colorSpace) {_colorSpace = colorSpace; }
00177     void setSearchCommonFileSuffix(bool searchCommonFileSuffix) {_searchCommonFileSuffix = searchCommonFileSuffix;}
00178 
00179     // Set whether or not to overwrite a umfa file when calling create.
00180     void setCreateOverwrite(bool createOverwrite) {_createOverwrite = createOverwrite;}
00181 
00182     bool loadFastaData(const char *filename);
00183 
00184     /// set the reference name that will be used in open()
00185     /// \param referenceFilename the name of the reference fasta file to open
00186     /// \return false for success, true otherwise
00187     ///
00188     /// \sa open()
00189     bool setReferenceName(std::string referenceFilename);
00190 
00191     /// set the application name in the binary file header
00192     ///
00193     /// \param application name of the application
00194     void setApplication(std::string application)
00195     {
00196         _application = application;     // used in ::create() to set application name
00197     }
00198     const std::string &getFastaName() const
00199     {
00200         return _fastaFilename;
00201     }
00202     const std::string &getReferenceName() const
00203     {
00204         return _referenceFilename;
00205     }
00206 
00207     /// tell us if we are a color space reference or not
00208     /// \return true if colorspace, false otherwise
00209     bool isColorSpace() const
00210     {
00211         return _colorSpace;
00212     }
00213 
00214     /// return the number of bases represented in this reference
00215     /// \return count of bases
00216     genomeIndex_t   getNumberBases() const
00217     {
00218         return getElementCount();
00219     }
00220 
00221     /// given a whole genome index, get the chromosome it is located in
00222     ///
00223     /// This is done via a binary search of the chromosome table in the
00224     /// header of the mapped file, so it is O(log(N))
00225     ///
00226     /// \param 0-based position the base in the genome
00227     /// \return 0-based index into chromosome table - INVALID_CHROMOSOME_INDEX if error
00228     int getChromosome(genomeIndex_t position) const;
00229 
00230     /// given a chromosome name, return the chromosome index
00231     ///
00232     /// This is done via a linear search of the chromosome table in the
00233     /// header of the mapped file, so it is O(N)
00234     ///
00235     /// \param chromosomeName the name of the chromosome - exact match only
00236     /// \return 0-based index into chromosome table - INVALID_CHROMOSOME_INDEX if error
00237     int getChromosome(const char *chromosomeName) const;
00238     /// Return the number of chromosomes in the genome
00239     /// \return number of chromosomes in the genome
00240     int getChromosomeCount() const;
00241 
00242     /// given a chromosome, return the genome base it starts in
00243     ///
00244     /// \param 0-based chromosome index
00245     /// \return 0-based genome index of the base that starts the chromosome
00246     genomeIndex_t getChromosomeStart(int chromosomeIndex) const
00247     {
00248         if (chromosomeIndex==INVALID_CHROMOSOME_INDEX) return INVALID_GENOME_INDEX;
00249         return header->_chromosomes[chromosomeIndex].start;
00250     }
00251 
00252     /// given a chromosome, return its size in bases
00253     ///
00254     /// \param 0-based chromosome index
00255     /// \return size of the chromosome in bases
00256     genomeIndex_t getChromosomeSize(int chromosomeIndex) const
00257     {
00258         if (chromosomeIndex==INVALID_CHROMOSOME_INDEX) return 0;
00259         return header->_chromosomes[chromosomeIndex].size;
00260     }
00261 
00262     /// given a chromosome name and position, return the genome position
00263     ///
00264     /// \param chromosomeName name of the chromosome - exact match only
00265     /// \param chromosomeIndex 1-based chromosome position
00266     /// \return genome index of the above chromosome position
00267     genomeIndex_t getGenomePosition(
00268         const char *chromosomeName,
00269         unsigned int chromosomeIndex) const;
00270 
00271     /// given a chromosome index and position, return the genome position
00272     ///
00273     /// \param chromosome index of the chromosome
00274     /// \param chromosomeIndex 1-based chromosome position
00275     /// \return genome index of the above chromosome position
00276     genomeIndex_t getGenomePosition(
00277         int chromosome,
00278         unsigned int chromosomeIndex) const;
00279 
00280     /// given the chromosome name, get the corresponding 0 based genome index
00281     /// for the start of that chromosome
00282     genomeIndex_t getGenomePosition(const char *chromosomeName) const;
00283     genomeIndex_t getGenomePosition(int chromosomeIndex) const;
00284 
00285     const std::string &getBaseFilename() const
00286     {
00287         return _baseFilename;
00288     }
00289 
00290     const char *getChromosomeName(int chromosomeIndex) const
00291     {
00292         return header->_chromosomes[chromosomeIndex].name;
00293     }
00294 
00295     void setDebugFlag(bool d)
00296     {
00297         _debugFlag = d;
00298     }
00299 
00300     genomeIndex_t sequenceLength() const
00301     {
00302         return (genomeIndex_t) header->elementCount;
00303     }
00304 
00305     const char *chromosomeName(int chr) const
00306     {
00307         return header->_chromosomes[chr].name;
00308     }
00309 
00310     void sanityCheck(MemoryMap &fasta) const;
00311 
00312     // TODO - this will be moved somewhere else and be made a static method.
00313     std::string IntegerToSeq(unsigned int n, unsigned int wordsize) const;
00314 
00315     bool wordMatch(unsigned int index, std::string &word) const;
00316     bool printNearbyWords(unsigned int index, unsigned int variance, std::string &word) const;
00317 
00318     // TODO - this will be moved somewhere else and be made a static method.
00319     char BasePair(char c) const
00320     {
00321         return BaseAsciiMap::base2complement[(int) c];
00322     }
00323 
00324     void dumpSequenceSAMDictionary(std::ostream&) const;
00325     void dumpHeaderTSV(std::ostream&) const;
00326 
00327     ///
00328     /// Return the bases in base space or color space for within range index, ot
00329     /// @param index the array-like index (0 based).
00330     /// @return ACTGN in base space; 0123N for color space; and 'N' for invalid.
00331     /// For color space, index i represents the transition of base at position (i-1) to base at position i
00332     ///
00333     /// NB: bounds checking here needs to be deprecated - do not assume it
00334     /// will exist - the call must clip reads so that this routine is never
00335     /// called with a index value larger than the genome.
00336     ///
00337     /// The reason for this is simply that this routine gets called hundreds
00338     /// of billions of time in one run of karma, which will absolutely kill
00339     /// performance.  Every single instruction here matters a great, great deal.
00340     ///
00341 
00342     //
00343     // 3.5% improvement for color space matching:
00344     // I guess the compiler isn't inlining 3 functions deep.
00345     //
00346 
00347 #if 0
00348     // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
00349     // The following code does not work even in the base space,
00350     // since the memory layout has changed. USE IT WITH CAUTIOUS!
00351     //
00352     // This block of code is a functional duplicate of the following
00353     // code - leave this here for reference and possibly later
00354     // performance testing as well as compiler evaluation.
00355     inline char operator[](genomeIndex_t index) const
00356     {
00357         return BaseAsciiMap::int2base[(*((genomeSequenceArray*) this))[index]];
00358     }
00359 #endif
00360 
00361     inline char operator[](genomeIndex_t index) const
00362     {
00363         uint8_t val;
00364         if (index < getNumberBases())
00365         {
00366             if ((index&1)==0)
00367             {
00368                 val = ((uint8_t *) data)[index>>1] & 0xf;
00369             }
00370             else
00371             {
00372                 val = (((uint8_t *) data)[index>>1] & 0xf0) >> 4;
00373             }
00374         }
00375         else
00376         {
00377             val = BaseAsciiMap::baseNIndex;
00378         }
00379         val = isColorSpace() ? BaseAsciiMap::int2colorSpace[val] : 
00380             BaseAsciiMap::int2base[val];
00381         return val;
00382     }
00383 
00384     /// given a chromosome name and 1-based position, return the reference base.
00385     /// \param chromosomeName name of the chromosome - exact match only
00386     /// \param chromosomeIndex 1-based chromosome position
00387     /// \return reference base at the above chromosome position
00388     inline char getBase(const char *chromosomeName,
00389                         unsigned int chromosomeIndex) const
00390     {
00391         genomeIndex_t index = 
00392             getGenomePosition(chromosomeName, chromosomeIndex);
00393         if(index == INVALID_GENOME_INDEX)
00394         {
00395             // Invalid position, so return 'N'
00396             return('N');
00397         }
00398         return((*this)[index]);
00399     }
00400 
00401 
00402     inline uint8_t getInteger(genomeIndex_t index) const
00403     {
00404         return (*((genomeSequenceArray*) this))[index];
00405     }
00406 
00407     inline void set(genomeIndex_t index, char value)
00408     {
00409         genomeSequenceArray::set(index, 
00410                                  BaseAsciiMap::base2int[(uint8_t) value]);
00411     }
00412 
00413     /// obtain the pointer to the raw data for other access methods
00414     ///
00415     /// this is a fairly ugly hack to reach into the
00416     /// raw genome vector, get the byte that encodes
00417     /// two bases, and return it.  This is used by
00418     /// karma ReadIndexer::getSumQ to compare genome
00419     /// matchines by byte (two bases at a time) to speed
00420     /// it up.
00421     ///
00422     uint8_t *getDataPtr(genomeIndex_t index)
00423     {
00424         return ((uint8_t *) data + index/2);
00425     }
00426 private:
00427     ///
00428     /// when creating the genome mapped file, we call this to set
00429     /// the MD5 checksum of the chromosome sequence and length so that
00430     /// we can write the SAM SQ headers properly.
00431     /// NB: operates on the last fully loaded chromosome.
00432     bool setChromosomeMD5andLength(uint32_t whichChromosome);
00433 
00434 public:
00435 
00436     // TODO - this will be moved somewhere else and be made a static method.
00437     // replace read with the reversed one
00438     void getReverseRead(std::string &read);
00439 
00440     // TODO - this will be moved somewhere else and be made a static method.
00441     void getReverseRead(String& read);
00442 
00443     // debug the given read - print nice results
00444     int debugPrintReadValidation(
00445         std::string &read,
00446         std::string &quality,
00447         char   direction,
00448         genomeIndex_t   readLocation,
00449         int sumQuality,
00450         int mismatchCount,
00451         bool recurse = true
00452         );
00453 
00454     //
00455     // get the sequence from this GenomeSequence using the specified chromosome and 0-based position.
00456     // if baseCount < 0, get the reverse complement
00457     // that starts at index (but do not reverse the string?)
00458     void getString(std::string &str, int chromosome, uint32_t index, int baseCount) const;
00459     void getString(String &str, int chromosome, uint32_t index, int baseCount) const;
00460     //
00461     // get the sequence from this GenomeSequence.
00462     // if baseCount < 0, get the reverse complement
00463     // that starts at index (but do not reverse the string?)
00464     //
00465     void getString(std::string &str, genomeIndex_t index, int baseCount) const;
00466     void getString(String &str, genomeIndex_t index, int baseCount) const;
00467 
00468     void getHighLightedString(std::string &str, genomeIndex_t index, int baseCount, genomeIndex_t highLightStart, genomeIndex_t highLightEnd) const;
00469 
00470     void print30(genomeIndex_t) const;
00471 
00472     // for debugging, not for speed:
00473     genomeIndex_t simpleLocalAligner(std::string &read, std::string &quality, genomeIndex_t index, int windowSize) const;
00474 
00475 
00476     // TODO - these methods do not handle a CIGAR string and do not handle '=' when a read matches the reference.
00477     // They are here for alignment and should be moved to the aligner (karma).
00478     // OR they should optionally take a CIGAR and use that if specified....
00479     // maybe they should be helper methods that are found somewhere else
00480 
00481     /// Return the mismatch count, disregarding CIGAR strings
00482     ///
00483     /// \param read is the sequence we're counting mismatches in
00484     /// \param location is where in the genmoe we start comparing
00485     /// \param exclude is a wildcard character (e.g. '.' or 'N')
00486     ///
00487     /// \return number of bases that don't match the reference, except those that match exclude
00488     int getMismatchCount(std::string &read, genomeIndex_t location, char exclude='\0') const
00489     {
00490         int mismatchCount = 0;
00491         for (uint32_t i=0; i<read.size(); i++)
00492             if (read[i]!=exclude) mismatchCount += read[i]!=(*this)[location + i];
00493         return mismatchCount;
00494     };
00495 
00496     /// brute force sumQ - no sanity checking
00497     ///
00498     /// \param read shotgun sequencer read string
00499     /// \param qualities phred quality string of same length
00500     /// \param location the alignment location to check sumQ
00501     int getSumQ(std::string &read, std::string &qualities, genomeIndex_t location) const
00502     {
00503         int sumQ = 0;
00504         for (uint32_t i=0; i<read.size(); i++)
00505             sumQ += (read[i]!=(*this)[location + i] ? (qualities[i]-33) : 0);
00506         return sumQ;
00507     };
00508     // return a string highlighting mismatch postions with '^' chars:
00509     void getMismatchHatString(std::string &result, const std::string &read, genomeIndex_t location) const;
00510     void getMismatchString(std::string &result, const std::string &read, genomeIndex_t location) const;
00511 
00512     // END TODO
00513 
00514     void getChromosomeAndIndex(std::string &, genomeIndex_t) const;
00515     void getChromosomeAndIndex(String &, genomeIndex_t) const;
00516 
00517     /// check a SAM format read, using phred quality scores and
00518     /// the CIGAR string to determine if it is correct.
00519     ///
00520     ///
00521     /// \param read the read in base space
00522     /// \param qualities the phred encoded qualities (Sanger, not Illumina)
00523     /// \param cigar the SAM file CIGAR column
00524     /// \param sumQ if >0 on entry, is checked against the computed sumQ
00525     /// \param insertions count of insertions found in
00526     ///
00527     ///
00528     bool checkRead(
00529         std::string &read,
00530         std::string &qualities,
00531         std::string &cigar,
00532         int &sumQ,              // input and output
00533         int &gapOpenCount,      // output only
00534         int &gapExtendCount,    // output only
00535         int &gapDeleteCount,    // output only
00536         std::string &result
00537     ) const;
00538 
00539     bool populateDBSNP(mmapArrayBool_t &dbSNP, IFILE inputFile) const;
00540 
00541     /// user friendly dbSNP loader.
00542     ///
00543     /// \param inputFileName may be empty, point to a text file or a dbSNP vector file
00544     ///
00545     /// In all cases, dbSNP is returned the same length as this genome.
00546     ///
00547     /// When no SNPs are loaded, all values are false.
00548     ///
00549     /// When a text file is given, the file is parsed with two space
00550     /// separated columns - the first column is the chromosome name, and
00551     /// the second is the 1-based chromosome position of the SNP.
00552     ///
00553     /// \return false if a dbSNP file was correctly loaded, true otherwise
00554     ///
00555     bool loadDBSNP(mmapArrayBool_t &dbSNP, const char *inputFileName) const;
00556 };
00557 
00558 #endif
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends