GenomeSequence.h

00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #ifndef _GENOME_SEQUENCE_H
00019 #define _GENOME_SEQUENCE_H
00020 
00021 #include <assert.h>
00022 #include <fcntl.h>
00023 #include <stdio.h>              // for size_t, which md5.h needs (for redhat)
00024 #include <stddef.h>             // for size_t, which md5.h needs (for correct systems)
00025 #include <sys/types.h>
00026 #include <sys/stat.h>
00027 #if defined(WIN32)
00028 #define MD5_DIGEST_LENGTH 16
00029 #else
00030 #include <openssl/md5.h>        // for MD5_DIGEST_LENGTH
00031 #endif
00032 #include <string>
00033 #include <vector>
00034 #include "MemoryMapArray.h"
00035 
00036 // Goncalo's String class
00037 #include "StringArray.h"
00038 
00039 #include <stdint.h>
00040 
00041 // stdint.h will define this, but only if __STDC_LIMIT_MACROS was
00042 // defined prior to the first include of stdint.h
00043 #ifndef UINT32_MAX
00044 #define UINT32_MAX 0xFFFFFFFF
00045 #endif
00046 
00047 typedef uint32_t    genomeIndex_t;
00048 #define INVALID_GENOME_INDEX UINT32_MAX
00049 
00050 // chromosome index is just a signed int, so this is ok here:
00051 #define INVALID_CHROMOSOME_INDEX -1
00052 
00053 
00054 //
00055 // ChromosomeInfo represents the per chromosome information
00056 // necessary to write out SAM/BAM records.  In addition, it
00057 // contains a single internal index used to point to the vector
00058 // offset where the chromosome bases start.
00059 //
00060 // This is mildly non-optimal for larger collections of chromosomes
00061 // or contigs - one use case described having millions of contigs,
00062 // in which case, this structure alone would take a gigabyte or more.
00063 //
00064 struct ChromosomeInfo
00065 {
00066     static const int  MAX_GENOME_INFO_STRING=128;
00067 
00068     void constructorClear()
00069     {
00070         memset(this,0, sizeof(*this));
00071     }
00072     void setChromosomeName(const char *n)
00073     {
00074         strncpy(name, n, sizeof(name)-1);
00075     }
00076     genomeIndex_t   start;                              // internal offset to combined genome vector
00077     genomeIndex_t   size;                               // SAM SQ:LN value
00078     char            md5[2*MD5_DIGEST_LENGTH + 1];       // 32 chars plus NUL, SAM SQ:M5 value
00079     char            name[MAX_GENOME_INFO_STRING];       // SAM SQ:SN value
00080     char            assemblyID[MAX_GENOME_INFO_STRING]; // SAM SQ:AS value
00081     char            uri[MAX_GENOME_INFO_STRING];        // SAM SQ:UR value
00082     char            species[MAX_GENOME_INFO_STRING];    // SAM SQ:SP value
00083 
00084     // handy setting methods:
00085     void setAssemblyID(const char *newID)
00086     {
00087         strncpy(assemblyID, newID, sizeof(assemblyID) - 1);
00088     }
00089     void setSpecies(const char *newSpecies)
00090     {
00091         strncpy(species, newSpecies, sizeof(newSpecies));
00092     }
00093     void setURI(const char *newURI)
00094     {
00095         strncpy(uri, newURI, sizeof(uri));
00096     }
00097 };
00098 
00099 class genomeSequenceMmapHeader : public MemoryMapArrayHeader
00100 {
00101     friend class GenomeSequence;
00102     friend std::ostream &operator << (std::ostream &, genomeSequenceMmapHeader &);
00103 private:
00104     uint32_t    _chromosomeCount;
00105     bool        _colorSpace;
00106 
00107     ChromosomeInfo  _chromosomes[0];
00108 
00109 public:
00110     //
00111     // getHeaderSize is special in that it must not access any
00112     // member variables, since it is called when the header has
00113     // not been created yet.
00114     //
00115     static size_t  getHeaderSize(int chromosomeCount)
00116     {
00117         return sizeof(genomeSequenceMmapHeader) + sizeof(ChromosomeInfo[1]) * chromosomeCount;
00118     }
00119     //
00120     // below methods return TRUE if it failed, false otherwise (primarily
00121     // a length check).
00122     //
00123 };
00124 
00125 std::ostream &operator << (std::ostream &stream, genomeSequenceMmapHeader &h);
00126 
00127 //
00128 // define the genomeSequence array type:
00129 //
00130 // NB the access/set routines use the encoded base values in the range
00131 // 0-15, not the corresponding base pair character.
00132 //
00133 inline uint8_t genomeSequenceAccess(void *base, genomeIndex_t index)
00134 {
00135     if ((index&1)==0)
00136     {
00137         return ((uint8_t *) base)[index>>1] & 0xf;
00138     }
00139     else
00140     {
00141         return (((uint8_t *) base)[index>>1] & 0xf0) >> 4;
00142     }
00143 };
00144 inline void genomeSequenceSet(void *base, genomeIndex_t index, uint8_t v)
00145 {
00146     if ((index&1)==0)
00147     {
00148         ((uint8_t *) base)[index>>1] = (((uint8_t *) base)[index>>1] & 0xf0) | v;
00149     }
00150     else
00151     {
00152         ((uint8_t *) base)[index>>1] = (((uint8_t *) base)[index>>1] & 0x0f) | v<<4;
00153     }
00154 }
00155 
00156 inline size_t mmapGenomeSequenceElementCount2Bytes(genomeIndex_t i)
00157 {
00158     return sizeof(uint8_t) * i / 2;
00159 }
00160 
00161 #define UMFA_COOKIE 0x1b7933a1  // unique cookie id
00162 #define UMFA_VERSION 20100401U  // YYYYMMDD of last change to file layout
00163 
00164 typedef MemoryMapArray<
00165 uint8_t,
00166 genomeIndex_t,
00167 UMFA_COOKIE,
00168 UMFA_VERSION,
00169 genomeSequenceAccess,
00170 genomeSequenceSet,
00171 mmapGenomeSequenceElementCount2Bytes,
00172 genomeSequenceMmapHeader
00173 > genomeSequenceArray;
00174 
00175 class PackedRead
00176 {
00177     void set(int index, int val)
00178     {
00179         packedBases[index>>1] =
00180             (packedBases[index>>1]             // original value
00181              & ~(7<<((index&0x01)<<2)))         // logical AND off the original value
00182             | ((val&0x0f)<<((index&0x1)<<2));  // logical OR in the new value
00183     }
00184 public:
00185     std::vector<uint8_t> packedBases;
00186     uint32_t    length;
00187     int size()
00188     {
00189         return length;
00190     }
00191     void clear()
00192     {
00193         packedBases.clear();
00194         length=0;
00195     }
00196     void set(const char *rhs, int padWithNCount = 0);
00197     uint8_t operator [](int index)
00198     {
00199         return (packedBases[index>>1] >> ((index&0x1)<<2)) & 0xf;
00200     }
00201 };
00202 
00203 // std::string &operator = (std::string &lhs, const PackedRead &rhs);
00204 //
00205 
00206 //!  Create/Access/Modify/Load Genome Sequences stored as binary mapped files.
00207 /*!
00208   GenomeSequence is designed to be a high performance shared access
00209   reference object.
00210 
00211   It is implemented as a MemoryMapArray template object with unsigned 8
00212   bit ints, each of which stores two bases.  Although 2 bits could be used,
00213   most references have more than four symbols (usually at least including
00214   'N', indicating an unknown or masked out base).
00215 
00216   Normal use of this class follows these steps:
00217    -# create the reference
00218     -# instantiate the GenomeSequence class object
00219     -# create the actual file (memory mapped) that is to hold the data
00220     -# populate the data using GenomeSequence::set
00221    -# use the reference
00222     -# use the reference by instantiating a GenomeSequence object
00223     -# either use the constructor with the reference filename
00224     -# or use GenomeSequence::setReferenceName() followed by ::open
00225     -# access the bases via the overloaded array operator []
00226     -# check sequence length by using GenomeSequence::getNumberBases()
00227    -# accessing chromosomes in the reference
00228     -# you typically will need to know about the chromosomes in the sequence
00229     -# see methods and docs with prefix 'getChromosome'
00230 
00231   Sharing is accomplished using the mmap() function via the MemoryMap
00232   base class.  This allows a potentially large genome reference to be
00233   shared among a number of simultaneously executing instances of one or
00234   more programs sharing the same reference.
00235  */
00236 
00237 
00238 class GenomeSequence : public genomeSequenceArray
00239 {
00240 public:
00241     // TODO - these are found in BaseAsciiMap so should be removed from here.
00242     static const int baseAIndex = 000;
00243     static const int baseTIndex = 001;
00244     static const int baseCIndex = 002;
00245     static const int baseGIndex = 003;
00246     static const int baseNIndex = 004; // baseN --> bad read
00247     static const int baseXIndex = 005; // baseX --> unknown (bad data)
00248     //
00249     // two arrays for converting back and forth between base pair character
00250     // value (ASCII) to a base integer in the range 0..3.  Note there is actually
00251     // a value 4 and 5, for 'N' (indelible) and 'M' (unknown to me).
00252     //
00253     static unsigned char base2int[];        // base space read (ATCG)
00254 
00255     // TODO - these are found in BaseAsciiMap so should be removed from here.
00256     static const char int2base[];
00257     static const char int2colorSpace[];
00258     static unsigned char base2complement[];
00259 
00260 private:
00261     int                     _debugFlag;
00262 
00263     std::ostream            *_progressStream;
00264     bool                    _colorSpace;
00265 
00266     std::string             _baseFilename;   // for later use by WordIndex create and open
00267     std::string             _referenceFilename;
00268     std::string             _fastaFilename;
00269     std::string             _umfaFilename;
00270     std::string             _application;        // only used in ::create()
00271 
00272     MemoryMap               _umfaFile;
00273 
00274     void setup(const char *referenceFilename);
00275 
00276 public:
00277     /// Simple constructor - no implicit file open
00278     GenomeSequence();
00279     void constructorClear();
00280     /// \brief attempt to open an existing sequence
00281     ///
00282     /// \param referenceFilename the name of the reference fasta file to open
00283     /// \param debug if true, additional debug information is printed
00284     GenomeSequence(std::string &referenceFilename)
00285     {
00286         constructorClear();
00287         setup(referenceFilename.c_str());
00288     }
00289 
00290     /// Smarter constructor - attempt to open an existing sequence
00291     ///
00292     /// \param referenceFilename the name of the reference fasta file to open
00293     /// \param debug if true, additional debug information is printed
00294     GenomeSequence(const char *referenceFilename)
00295     {
00296         constructorClear();
00297         setup(referenceFilename);
00298     }
00299 
00300     /// Close the file if open and destroy the object
00301     ~GenomeSequence();
00302 
00303     /// open the reference specified using GenomeSequence::setReferenceName
00304     ///
00305     /// \param isColorSpace open the color space reference
00306     /// \param flags pass through to the ::open() call (O_RDWR lets you modify the contents)
00307     /// \return false for success, true otherwise
00308     bool open(bool isColorSpace = false, int flags = O_RDONLY);
00309 
00310     /// open the given file as the genome (no filename munging occurs).
00311     ///
00312     /// \param filename the name of the file to open
00313     /// \param flags pass through to the ::open() call (O_RDWR lets you modify the contents)
00314     /// \return false for success, true otherwise
00315     bool open(const char *filename, int flags = O_RDONLY)
00316     {
00317         _umfaFilename = filename;
00318         return genomeSequenceArray::open(filename, flags);
00319     }
00320 
00321     /// if true, then show progress when creating and pre-fetching
00322 private:
00323     bool    _searchCommonFileSuffix;
00324 public:
00325     bool create();
00326     //
00327     // backwards compatability:
00328     bool create(bool isColor) {setColorSpace(isColor); return create(); }
00329 
00330     // NEW API?
00331 
00332     // load time modifiers:
00333     void setProgressStream(std::ostream &progressStream) {_progressStream = &progressStream;}
00334     void setColorSpace(bool colorSpace) {_colorSpace = colorSpace; }
00335     void setSearchCommonFileSuffix(bool searchCommonFileSuffix) {_searchCommonFileSuffix = searchCommonFileSuffix;}
00336 
00337     bool loadFastaData(const char *filename);
00338 
00339     /// set the reference name that will be used in open()
00340     /// \param referenceFilename the name of the reference fasta file to open
00341     /// \return false for success, true otherwise
00342     ///
00343     /// \sa open()
00344     bool setReferenceName(std::string referenceFilename);
00345 
00346     /// set the application name in the binary file header
00347     ///
00348     /// \param application name of the application
00349     void setApplication(std::string application)
00350     {
00351         _application = application;     // used in ::create() to set application name
00352     }
00353     const std::string &getFastaName() const
00354     {
00355         return _fastaFilename;
00356     }
00357     const std::string &getReferenceName() const
00358     {
00359         return _referenceFilename;
00360     }
00361 
00362     /// tell us if we are a color space reference or not
00363     /// \return true if colorspace, false otherwise
00364     bool isColorSpace() const
00365     {
00366         return _colorSpace;
00367     }
00368 
00369     /// return the number of bases represented in this reference
00370     /// \return count of bases
00371     genomeIndex_t   getNumberBases() const
00372     {
00373         return getElementCount();
00374     }
00375 
00376     /// given a whole genome index, get the chromosome it is located in
00377     ///
00378     /// This is done via a binary search of the chromosome table in the
00379     /// header of the mapped file, so it is O(log(N))
00380     ///
00381     /// \param 0-based position the base in the genome
00382     /// \return 0-based index into chromosome table - INVALID_CHROMOSOME_INDEX if error
00383     int getChromosome(genomeIndex_t position) const;
00384 
00385     /// given a chromosome name, return the chromosome index
00386     ///
00387     /// This is done via a linear search of the chromosome table in the
00388     /// header of the mapped file, so it is O(N)
00389     ///
00390     /// \param chromosomeName the name of the chromosome - exact match only
00391     /// \return 0-based index into chromosome table - INVALID_CHROMOSOME_INDEX if error
00392     int getChromosome(const char *chromosomeName) const;
00393     /// Return the number of chromosomes in the genome
00394     /// \return number of chromosomes in the genome
00395     int getChromosomeCount() const;
00396 
00397     /// given a chromosome, return the genome base it starts in
00398     ///
00399     /// \param 0-based chromosome index
00400     /// \return 0-based genome index of the base that starts the chromosome
00401     genomeIndex_t getChromosomeStart(int chromosomeIndex) const
00402     {
00403         if (chromosomeIndex==INVALID_CHROMOSOME_INDEX) return INVALID_GENOME_INDEX;
00404         return header->_chromosomes[chromosomeIndex].start;
00405     }
00406 
00407     /// given a chromosome, return its size in bases
00408     ///
00409     /// \param 0-based chromosome index
00410     /// \return size of the chromosome in bases
00411     genomeIndex_t getChromosomeSize(int chromosomeIndex) const
00412     {
00413         if (chromosomeIndex==INVALID_CHROMOSOME_INDEX) return 0;
00414         return header->_chromosomes[chromosomeIndex].size;
00415     }
00416 
00417     /// given a chromosome name and position, return the genome position
00418     ///
00419     /// \param chromosomeName name of the chromosome - exact match only
00420     /// \param chromosomeIndex 1-based chromosome position
00421     /// \return genome index of the above chromosome position
00422     genomeIndex_t getGenomePosition(
00423         const char *chromosomeName,
00424         unsigned int chromosomeIndex) const;
00425 
00426     /// given a chromosome index and position, return the genome position
00427     ///
00428     /// \param chromosome index of the chromosome
00429     /// \param chromosomeIndex 1-based chromosome position
00430     /// \return genome index of the above chromosome position
00431     genomeIndex_t getGenomePosition(
00432         int chromosome,
00433         unsigned int chromosomeIndex) const;
00434 
00435     /// given the chromosome name, get the corresponding 0 based genome index
00436     /// for the start of that chromosome
00437     genomeIndex_t getGenomePosition(const char *chromosomeName) const;
00438     genomeIndex_t getGenomePosition(int chromosomeIndex) const;
00439 
00440     const std::string &getBaseFilename() const
00441     {
00442         return _baseFilename;
00443     }
00444 
00445     const char *getChromosomeName(int chromosomeIndex) const
00446     {
00447         return header->_chromosomes[chromosomeIndex].name;
00448     }
00449 
00450     void setDebugFlag(bool d)
00451     {
00452         _debugFlag = d;
00453     }
00454 
00455     genomeIndex_t sequenceLength() const
00456     {
00457         return (genomeIndex_t) header->elementCount;
00458     }
00459 
00460     const char *chromosomeName(int chr) const
00461     {
00462         return header->_chromosomes[chr].name;
00463     }
00464 
00465     void sanityCheck(MemoryMap &fasta) const;
00466 
00467     // TODO - this will be moved somewhere else and be made a static method.
00468     std::string IntegerToSeq(unsigned int n, unsigned int wordsize) const;
00469 
00470     bool wordMatch(unsigned int index, std::string &word) const;
00471     bool printNearbyWords(unsigned int index, unsigned int variance, std::string &word) const;
00472 
00473     // TODO - this will be moved somewhere else and be made a static method.
00474     char BasePair(char c) const
00475     {
00476         return base2complement[(int) c];
00477     }
00478 
00479     void dumpSequenceSAMDictionary(std::ostream&) const;
00480     void dumpHeaderTSV(std::ostream&) const;
00481 
00482     ///
00483     /// Return the bases in base space or color space for within range index, ot
00484     /// @param index the array-like index (0 based).
00485     /// @return ACTGN in base space; 0123N for color space; and 'N' for invalid.
00486     /// For color space, index i represents the transition of base at position (i-1) to base at position i
00487     ///
00488     /// NB: bounds checking here needs to be deprecated - do not assume it
00489     /// will exist - the call must clip reads so that this routine is never
00490     /// called with a index value larger than the genome.
00491     ///
00492     /// The reason for this is simply that this routine gets called hundreds
00493     /// of billions of time in one run of karma, which will absolutely kill
00494     /// performance.  Every single instruction here matters a great, great deal.
00495     ///
00496 
00497     //
00498     // 3.5% improvement for color space matching:
00499     // I guess the compiler isn't inlining 3 functions deep.
00500     //
00501 
00502 #if 0
00503     // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
00504     // The following code does not work even in the base space,
00505     // since the memory layout has changed. USE IT WITH CAUTIOUS!
00506     //
00507     // This block of code is a functional duplicate of the following
00508     // code - leave this here for reference and possibly later
00509     // performance testing as well as compiler evaluation.
00510     inline char operator[](genomeIndex_t index) const
00511     {
00512         return int2base[(*((genomeSequenceArray*) this))[index]];
00513     }
00514 #endif
00515 
00516     inline char operator[](genomeIndex_t index) const
00517     {
00518         uint8_t val;
00519         if (index < getNumberBases())
00520         {
00521             if ((index&1)==0)
00522             {
00523                 val = ((uint8_t *) data)[index>>1] & 0xf;
00524             }
00525             else
00526             {
00527                 val = (((uint8_t *) data)[index>>1] & 0xf0) >> 4;
00528             }
00529         }
00530         else
00531         {
00532             val = baseNIndex;
00533         }
00534         val = isColorSpace() ? int2colorSpace[val] : int2base[val];
00535         return val;
00536     }
00537 
00538     inline uint8_t getInteger(genomeIndex_t index) const
00539     {
00540         return (*((genomeSequenceArray*) this))[index];
00541     }
00542 
00543     inline void set(genomeIndex_t index, char value)
00544     {
00545         genomeSequenceArray::set(index, base2int[(uint8_t) value]);
00546     }
00547 
00548     /// obtain the pointer to the raw data for other access methods
00549     ///
00550     /// this is a fairly ugly hack to reach into the
00551     /// raw genome vector, get the byte that encodes
00552     /// two bases, and return it.  This is used by
00553     /// karma ReadIndexer::getSumQ to compare genome
00554     /// matchines by byte (two bases at a time) to speed
00555     /// it up.
00556     ///
00557     uint8_t *getDataPtr(genomeIndex_t index)
00558     {
00559         return ((uint8_t *) data + index/2);
00560     }
00561 private:
00562     ///
00563     /// when creating the genome mapped file, we call this to set
00564     /// the MD5 checksum of the chromosome sequence and length so that
00565     /// we can write the SAM SQ headers properly.
00566     /// NB: operates on the last fully loaded chromosome.
00567     bool setChromosomeMD5andLength(uint32_t whichChromosome);
00568 
00569 public:
00570 
00571     // TODO - this will be moved somewhere else and be made a static method.
00572     // replace read with the reversed one
00573     void getReverseRead(std::string &read);
00574 
00575     // TODO - this will be moved somewhere else and be made a static method.
00576     void getReverseRead(String& read);
00577 
00578     // debug the given read - print nice results
00579     int debugPrintReadValidation(
00580         std::string &read,
00581         std::string &quality,
00582         char   direction,
00583         genomeIndex_t   readLocation,
00584         int sumQuality,
00585         int mismatchCount,
00586         bool recurse = true
00587         );
00588 
00589     void getString(std::string &str, int chromosome, genomeIndex_t index, int baseCount) const;
00590     void getString(String &str, int chromosome, genomeIndex_t index, int baseCount) const;
00591     //
00592     // get the sequence from this GenomeSequence.
00593     // if baseCount < 0, get the reverse complement
00594     // that starts at index (but do not reverse the string?)
00595     //
00596     void getString(std::string &str, genomeIndex_t index, int baseCount) const;
00597     void getString(String &str, genomeIndex_t index, int baseCount) const;
00598 
00599     void getHighLightedString(std::string &str, genomeIndex_t index, int baseCount, genomeIndex_t highLightStart, genomeIndex_t highLightEnd) const;
00600 
00601     void print30(genomeIndex_t) const;
00602 
00603     // for debugging, not for speed:
00604     genomeIndex_t simpleLocalAligner(std::string &read, std::string &quality, genomeIndex_t index, int windowSize) const;
00605 
00606 
00607     // TODO - these methods do not handle a CIGAR string and do not handle '=' when a read matches the reference.
00608     // They are here for alignment and should be moved to the aligner (karma).
00609     // OR they should optionally take a CIGAR and use that if specified....
00610     // maybe they should be helper methods that are found somewhere else
00611 
00612     /// Return the mismatch count, disregarding CIGAR strings
00613     ///
00614     /// \param read is the sequence we're counting mismatches in
00615     /// \param location is where in the genmoe we start comparing
00616     /// \param exclude is a wildcard character (e.g. '.' or 'N')
00617     ///
00618     /// \return number of bases that don't match the reference, except those that match exclude
00619     int getMismatchCount(std::string &read, genomeIndex_t location, char exclude='\0') const
00620     {
00621         int mismatchCount = 0;
00622         for (uint32_t i=0; i<read.size(); i++)
00623             if (read[i]!=exclude) mismatchCount += read[i]!=(*this)[location + i];
00624         return mismatchCount;
00625     };
00626 
00627     /// brute force sumQ - no sanity checking
00628     ///
00629     /// \param read shotgun sequencer read string
00630     /// \param qualities phred quality string of same length
00631     /// \param location the alignment location to check sumQ
00632     int getSumQ(std::string &read, std::string &qualities, genomeIndex_t location) const
00633     {
00634         int sumQ = 0;
00635         for (uint32_t i=0; i<read.size(); i++)
00636             sumQ += (read[i]!=(*this)[location + i] ? (qualities[i]-33) : 0);
00637         return sumQ;
00638     };
00639     // return a string highlighting mismatch postions with '^' chars:
00640     void getMismatchHatString(std::string &result, const std::string &read, genomeIndex_t location) const;
00641     void getMismatchString(std::string &result, const std::string &read, genomeIndex_t location) const;
00642 
00643     // END TODO
00644 
00645     void getChromosomeAndIndex(std::string &, genomeIndex_t) const;
00646     void getChromosomeAndIndex(String &, genomeIndex_t) const;
00647 
00648     /// check a SAM format read, using phred quality scores and
00649     /// the CIGAR string to determine if it is correct.
00650     ///
00651     ///
00652     /// \param read the read in base space
00653     /// \param qualities the phred encoded qualities (Sanger, not Illumina)
00654     /// \param cigar the SAM file CIGAR column
00655     /// \param sumQ if >0 on entry, is checked against the computed sumQ
00656     /// \param insertions count of insertions found in
00657     ///
00658     ///
00659     bool checkRead(
00660         std::string &read,
00661         std::string &qualities,
00662         std::string &cigar,
00663         int &sumQ,              // input and output
00664         int &gapOpenCount,      // output only
00665         int &gapExtendCount,    // output only
00666         int &gapDeleteCount,    // output only
00667         std::string &result
00668     ) const;
00669 
00670     bool populateDBSNP(mmapArrayBool_t &dbSNP, std::ifstream &inputFile) const;
00671 
00672     /// user friendly dbSNP loader.
00673     ///
00674     /// \param inputFileName may be empty, point to a text file or a dbSNP vector file
00675     ///
00676     /// In all cases, dbSNP is returned the same length as this genome.
00677     ///
00678     /// When no SNPs are loaded, all values are false.
00679     ///
00680     /// When a text file is given, the file is parsed with two space
00681     /// separated columns - the first column is the chromosome name, and
00682     /// the second is the 1-based chromosome position of the SNP.
00683     ///
00684     /// \return false if a dbSNP file was correctly loaded, true otherwise
00685     ///
00686     bool loadDBSNP(mmapArrayBool_t &dbSNP, const char *inputFileName) const;
00687 };
00688 
00689 #endif