GenomeSequence.h

00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #ifndef _GENOME_SEQUENCE_H
00019 #define _GENOME_SEQUENCE_H
00020 
00021 #include <assert.h>
00022 #include <fcntl.h>
00023 #include <stdio.h>              // for size_t, which md5.h needs (for redhat)
00024 #include <stddef.h>             // for size_t, which md5.h needs (for correct systems)
00025 #include <sys/types.h>
00026 #include <sys/stat.h>
00027 #if defined(WIN32)
00028 #define MD5_DIGEST_LENGTH 16
00029 #else
00030 #include <openssl/md5.h>        // for MD5_DIGEST_LENGTH
00031 #endif
00032 #include <string>
00033 #include <vector>
00034 #include "MemoryMapArray.h"
00035 
00036 // Goncalo's String class
00037 #include "StringArray.h"
00038 
00039 #include <stdint.h>
00040 
00041 // stdint.h will define this, but only if __STDC_LIMIT_MACROS was
00042 // defined prior to the first include of stdint.h
00043 #ifndef UINT32_MAX
00044 #define UINT32_MAX 0xFFFFFFFF
00045 #endif
00046 
00047 typedef uint32_t    genomeIndex_t;
00048 #define INVALID_GENOME_INDEX UINT32_MAX
00049 
00050 // chromosome index is just a signed int, so this is ok here:
00051 #define INVALID_CHROMOSOME_INDEX -1
00052 
00053 
00054 //
00055 // ChromosomeInfo represents the per chromosome information
00056 // necessary to write out SAM/BAM records.  In addition, it
00057 // contains a single internal index used to point to the vector
00058 // offset where the chromosome bases start.
00059 //
00060 // This is mildly non-optimal for larger collections of chromosomes
00061 // or contigs - one use case described having millions of contigs,
00062 // in which case, this structure alone would take a gigabyte or more.
00063 //
00064 struct ChromosomeInfo
00065 {
00066     static const int  MAX_GENOME_INFO_STRING=128;
00067 
00068     void constructorClear()
00069     {
00070         memset(this,0, sizeof(*this));
00071     }
00072     void setChromosomeName(const char *n)
00073     {
00074         strncpy(name, n, sizeof(name)-1);
00075     }
00076     genomeIndex_t   start;                              // internal offset to combined genome vector
00077     genomeIndex_t   size;                               // SAM SQ:LN value
00078     char            md5[2*MD5_DIGEST_LENGTH + 1];       // 32 chars plus NUL, SAM SQ:M5 value
00079     char            name[MAX_GENOME_INFO_STRING];       // SAM SQ:SN value
00080     char            assemblyID[MAX_GENOME_INFO_STRING]; // SAM SQ:AS value
00081     char            uri[MAX_GENOME_INFO_STRING];        // SAM SQ:UR value
00082     char            species[MAX_GENOME_INFO_STRING];    // SAM SQ:SP value
00083 
00084     // handy setting methods:
00085     void setAssemblyID(const char *newID)
00086     {
00087         strncpy(assemblyID, newID, sizeof(assemblyID) - 1);
00088     }
00089     void setSpecies(const char *newSpecies)
00090     {
00091         strncpy(species, newSpecies, sizeof(newSpecies));
00092     }
00093     void setURI(const char *newURI)
00094     {
00095         strncpy(uri, newURI, sizeof(uri));
00096     }
00097 };
00098 
00099 class genomeSequenceMmapHeader : public MemoryMapArrayHeader
00100 {
00101     friend class GenomeSequence;
00102     friend std::ostream &operator << (std::ostream &, genomeSequenceMmapHeader &);
00103 private:
00104     uint32_t    _chromosomeCount;
00105     bool        _colorSpace;
00106 
00107     ChromosomeInfo  _chromosomes[0];
00108 
00109 public:
00110     //
00111     // getHeaderSize is special in that it must not access any
00112     // member variables, since it is called when the header has
00113     // not been created yet.
00114     //
00115     static size_t  getHeaderSize(int chromosomeCount)
00116     {
00117         return sizeof(genomeSequenceMmapHeader) + sizeof(ChromosomeInfo[1]) * chromosomeCount;
00118     }
00119     //
00120     // below methods return TRUE if it failed, false otherwise (primarily
00121     // a length check).
00122     //
00123 };
00124 
00125 std::ostream &operator << (std::ostream &stream, genomeSequenceMmapHeader &h);
00126 
00127 //
00128 // define the genomeSequence array type:
00129 //
00130 // NB the access/set routines use the encoded base values in the range
00131 // 0-15, not the corresponding base pair character.
00132 //
00133 inline uint8_t genomeSequenceAccess(void *base, genomeIndex_t index)
00134 {
00135     if ((index&1)==0)
00136     {
00137         return ((uint8_t *) base)[index>>1] & 0xf;
00138     }
00139     else
00140     {
00141         return (((uint8_t *) base)[index>>1] & 0xf0) >> 4;
00142     }
00143 };
00144 inline void genomeSequenceSet(void *base, genomeIndex_t index, uint8_t v)
00145 {
00146     if ((index&1)==0)
00147     {
00148         ((uint8_t *) base)[index>>1] = (((uint8_t *) base)[index>>1] & 0xf0) | v;
00149     }
00150     else
00151     {
00152         ((uint8_t *) base)[index>>1] = (((uint8_t *) base)[index>>1] & 0x0f) | v<<4;
00153     }
00154 }
00155 
00156 inline size_t mmapGenomeSequenceElementCount2Bytes(genomeIndex_t i)
00157 {
00158     return sizeof(uint8_t) * i / 2;
00159 }
00160 
00161 #define UMFA_COOKIE 0x1b7933a1  // unique cookie id
00162 #define UMFA_VERSION 20100401U  // YYYYMMDD of last change to file layout
00163 
00164 typedef MemoryMapArray<
00165 uint8_t,
00166 genomeIndex_t,
00167 UMFA_COOKIE,
00168 UMFA_VERSION,
00169 genomeSequenceAccess,
00170 genomeSequenceSet,
00171 mmapGenomeSequenceElementCount2Bytes,
00172 genomeSequenceMmapHeader
00173 > genomeSequenceArray;
00174 
00175 class PackedRead
00176 {
00177     void set(int index, int val)
00178     {
00179         packedBases[index>>1] =
00180             (packedBases[index>>1]             // original value
00181              & ~(7<<((index&0x01)<<2)))         // logical AND off the original value
00182             | ((val&0x0f)<<((index&0x1)<<2));  // logical OR in the new value
00183     }
00184 public:
00185     std::vector<uint8_t> packedBases;
00186     uint32_t    length;
00187     int size()
00188     {
00189         return length;
00190     }
00191     void clear()
00192     {
00193         packedBases.clear();
00194         length=0;
00195     }
00196     void set(const char *rhs, int padWithNCount = 0);
00197     uint8_t operator [](int index)
00198     {
00199         return (packedBases[index>>1] >> ((index&0x1)<<2)) & 0xf;
00200     }
00201 };
00202 
00203 // std::string &operator = (std::string &lhs, const PackedRead &rhs);
00204 //
00205 
00206 //!  Create/Access/Modify/Load Genome Sequences stored as binary mapped files.
00207 /*!
00208   GenomeSequence is designed to be a high performance shared access
00209   reference object.
00210 
00211   It is implemented as a MemoryMapArray template object with unsigned 8
00212   bit ints, each of which stores two bases.  Although 2 bits could be used,
00213   most references have more than four symbols (usually at least including
00214   'N', indicating an unknown or masked out base).
00215 
00216   Normal use of this class follows these steps:
00217    -# create the reference
00218     -# instantiate the GenomeSequence class object
00219     -# create the actual file (memory mapped) that is to hold the data
00220     -# populate the data using GenomeSequence::set
00221    -# use the reference
00222     -# use the reference by instantiating a GenomeSequence object
00223     -# either use the constructor with the reference filename
00224     -# or use GenomeSequence::setReferenceName() followed by ::open
00225     -# access the bases via the overloaded array operator []
00226     -# check sequence length by using GenomeSequence::getNumberBases()
00227    -# accessing chromosomes in the reference
00228     -# you typically will need to know about the chromosomes in the sequence
00229     -# see methods and docs with prefix 'getChromosome'
00230 
00231   Sharing is accomplished using the mmap() function via the MemoryMap
00232   base class.  This allows a potentially large genome reference to be
00233   shared among a number of simultaneously executing instances of one or
00234   more programs sharing the same reference.
00235  */
00236 
00237 
00238 class GenomeSequence : public genomeSequenceArray
00239 {
00240 public:
00241     // TODO - these are found in BaseAsciiMap so should be removed from here.
00242     static const int baseAIndex = 000;
00243     static const int baseTIndex = 001;
00244     static const int baseCIndex = 002;
00245     static const int baseGIndex = 003;
00246     static const int baseNIndex = 004; // baseN --> bad read
00247     static const int baseXIndex = 005; // baseX --> unknown (bad data)
00248     //
00249     // two arrays for converting back and forth between base pair character
00250     // value (ASCII) to a base integer in the range 0..3.  Note there is actually
00251     // a value 4 and 5, for 'N' (indelible) and 'M' (unknown to me).
00252     //
00253     static unsigned char base2int[];        // base space read (ATCG)
00254 
00255     // TODO - these are found in BaseAsciiMap so should be removed from here.
00256     static const char int2base[];
00257     static const char int2colorSpace[];
00258     static unsigned char base2complement[];
00259 
00260 private:
00261     int                     _debugFlag;
00262 
00263     std::ostream            *_progressStream;
00264     bool                    _colorSpace;
00265 
00266     // Whether or not to overwrite an existing file when creating a umfa file (via create).
00267     bool                    _createOverwrite;
00268 
00269     std::string             _baseFilename;   // for later use by WordIndex create and open
00270     std::string             _referenceFilename;
00271     std::string             _fastaFilename;
00272     std::string             _umfaFilename;
00273     std::string             _application;        // only used in ::create()
00274 
00275     MemoryMap               _umfaFile;
00276 
00277     void setup(const char *referenceFilename);
00278 
00279 public:
00280     /// Simple constructor - no implicit file open
00281     GenomeSequence();
00282     void constructorClear();
00283     /// \brief attempt to open an existing sequence
00284     ///
00285     /// \param referenceFilename the name of the reference fasta file to open
00286     /// \param debug if true, additional debug information is printed
00287     GenomeSequence(std::string &referenceFilename)
00288     {
00289         constructorClear();
00290         setup(referenceFilename.c_str());
00291     }
00292 
00293     /// Smarter constructor - attempt to open an existing sequence
00294     ///
00295     /// \param referenceFilename the name of the reference fasta file to open
00296     /// \param debug if true, additional debug information is printed
00297     GenomeSequence(const char *referenceFilename)
00298     {
00299         constructorClear();
00300         setup(referenceFilename);
00301     }
00302 
00303     /// Close the file if open and destroy the object
00304     ~GenomeSequence();
00305 
00306     /// open the reference specified using GenomeSequence::setReferenceName
00307     ///
00308     /// \param isColorSpace open the color space reference
00309     /// \param flags pass through to the ::open() call (O_RDWR lets you modify the contents)
00310     /// \return false for success, true otherwise
00311     bool open(bool isColorSpace = false, int flags = O_RDONLY);
00312 
00313     /// open the given file as the genome (no filename munging occurs).
00314     ///
00315     /// \param filename the name of the file to open
00316     /// \param flags pass through to the ::open() call (O_RDWR lets you modify the contents)
00317     /// \return false for success, true otherwise
00318     bool open(const char *filename, int flags = O_RDONLY)
00319     {
00320         _umfaFilename = filename;
00321         return genomeSequenceArray::open(filename, flags);
00322     }
00323 
00324     /// if true, then show progress when creating and pre-fetching
00325 private:
00326     bool    _searchCommonFileSuffix;
00327 public:
00328     bool create();
00329     //
00330     // backwards compatability:
00331     bool create(bool isColor) {setColorSpace(isColor); return create(); }
00332 
00333     // NEW API?
00334 
00335     // load time modifiers:
00336     void setProgressStream(std::ostream &progressStream) {_progressStream = &progressStream;}
00337     void setColorSpace(bool colorSpace) {_colorSpace = colorSpace; }
00338     void setSearchCommonFileSuffix(bool searchCommonFileSuffix) {_searchCommonFileSuffix = searchCommonFileSuffix;}
00339 
00340     // Set whether or not to overwrite a umfa file when calling create.
00341     void setCreateOverwrite(bool createOverwrite) {_createOverwrite = createOverwrite;}
00342 
00343     bool loadFastaData(const char *filename);
00344 
00345     /// set the reference name that will be used in open()
00346     /// \param referenceFilename the name of the reference fasta file to open
00347     /// \return false for success, true otherwise
00348     ///
00349     /// \sa open()
00350     bool setReferenceName(std::string referenceFilename);
00351 
00352     /// set the application name in the binary file header
00353     ///
00354     /// \param application name of the application
00355     void setApplication(std::string application)
00356     {
00357         _application = application;     // used in ::create() to set application name
00358     }
00359     const std::string &getFastaName() const
00360     {
00361         return _fastaFilename;
00362     }
00363     const std::string &getReferenceName() const
00364     {
00365         return _referenceFilename;
00366     }
00367 
00368     /// tell us if we are a color space reference or not
00369     /// \return true if colorspace, false otherwise
00370     bool isColorSpace() const
00371     {
00372         return _colorSpace;
00373     }
00374 
00375     /// return the number of bases represented in this reference
00376     /// \return count of bases
00377     genomeIndex_t   getNumberBases() const
00378     {
00379         return getElementCount();
00380     }
00381 
00382     /// given a whole genome index, get the chromosome it is located in
00383     ///
00384     /// This is done via a binary search of the chromosome table in the
00385     /// header of the mapped file, so it is O(log(N))
00386     ///
00387     /// \param 0-based position the base in the genome
00388     /// \return 0-based index into chromosome table - INVALID_CHROMOSOME_INDEX if error
00389     int getChromosome(genomeIndex_t position) const;
00390 
00391     /// given a chromosome name, return the chromosome index
00392     ///
00393     /// This is done via a linear search of the chromosome table in the
00394     /// header of the mapped file, so it is O(N)
00395     ///
00396     /// \param chromosomeName the name of the chromosome - exact match only
00397     /// \return 0-based index into chromosome table - INVALID_CHROMOSOME_INDEX if error
00398     int getChromosome(const char *chromosomeName) const;
00399     /// Return the number of chromosomes in the genome
00400     /// \return number of chromosomes in the genome
00401     int getChromosomeCount() const;
00402 
00403     /// given a chromosome, return the genome base it starts in
00404     ///
00405     /// \param 0-based chromosome index
00406     /// \return 0-based genome index of the base that starts the chromosome
00407     genomeIndex_t getChromosomeStart(int chromosomeIndex) const
00408     {
00409         if (chromosomeIndex==INVALID_CHROMOSOME_INDEX) return INVALID_GENOME_INDEX;
00410         return header->_chromosomes[chromosomeIndex].start;
00411     }
00412 
00413     /// given a chromosome, return its size in bases
00414     ///
00415     /// \param 0-based chromosome index
00416     /// \return size of the chromosome in bases
00417     genomeIndex_t getChromosomeSize(int chromosomeIndex) const
00418     {
00419         if (chromosomeIndex==INVALID_CHROMOSOME_INDEX) return 0;
00420         return header->_chromosomes[chromosomeIndex].size;
00421     }
00422 
00423     /// given a chromosome name and position, return the genome position
00424     ///
00425     /// \param chromosomeName name of the chromosome - exact match only
00426     /// \param chromosomeIndex 1-based chromosome position
00427     /// \return genome index of the above chromosome position
00428     genomeIndex_t getGenomePosition(
00429         const char *chromosomeName,
00430         unsigned int chromosomeIndex) const;
00431 
00432     /// given a chromosome index and position, return the genome position
00433     ///
00434     /// \param chromosome index of the chromosome
00435     /// \param chromosomeIndex 1-based chromosome position
00436     /// \return genome index of the above chromosome position
00437     genomeIndex_t getGenomePosition(
00438         int chromosome,
00439         unsigned int chromosomeIndex) const;
00440 
00441     /// given the chromosome name, get the corresponding 0 based genome index
00442     /// for the start of that chromosome
00443     genomeIndex_t getGenomePosition(const char *chromosomeName) const;
00444     genomeIndex_t getGenomePosition(int chromosomeIndex) const;
00445 
00446     const std::string &getBaseFilename() const
00447     {
00448         return _baseFilename;
00449     }
00450 
00451     const char *getChromosomeName(int chromosomeIndex) const
00452     {
00453         return header->_chromosomes[chromosomeIndex].name;
00454     }
00455 
00456     void setDebugFlag(bool d)
00457     {
00458         _debugFlag = d;
00459     }
00460 
00461     genomeIndex_t sequenceLength() const
00462     {
00463         return (genomeIndex_t) header->elementCount;
00464     }
00465 
00466     const char *chromosomeName(int chr) const
00467     {
00468         return header->_chromosomes[chr].name;
00469     }
00470 
00471     void sanityCheck(MemoryMap &fasta) const;
00472 
00473     // TODO - this will be moved somewhere else and be made a static method.
00474     std::string IntegerToSeq(unsigned int n, unsigned int wordsize) const;
00475 
00476     bool wordMatch(unsigned int index, std::string &word) const;
00477     bool printNearbyWords(unsigned int index, unsigned int variance, std::string &word) const;
00478 
00479     // TODO - this will be moved somewhere else and be made a static method.
00480     char BasePair(char c) const
00481     {
00482         return base2complement[(int) c];
00483     }
00484 
00485     void dumpSequenceSAMDictionary(std::ostream&) const;
00486     void dumpHeaderTSV(std::ostream&) const;
00487 
00488     ///
00489     /// Return the bases in base space or color space for within range index, ot
00490     /// @param index the array-like index (0 based).
00491     /// @return ACTGN in base space; 0123N for color space; and 'N' for invalid.
00492     /// For color space, index i represents the transition of base at position (i-1) to base at position i
00493     ///
00494     /// NB: bounds checking here needs to be deprecated - do not assume it
00495     /// will exist - the call must clip reads so that this routine is never
00496     /// called with a index value larger than the genome.
00497     ///
00498     /// The reason for this is simply that this routine gets called hundreds
00499     /// of billions of time in one run of karma, which will absolutely kill
00500     /// performance.  Every single instruction here matters a great, great deal.
00501     ///
00502 
00503     //
00504     // 3.5% improvement for color space matching:
00505     // I guess the compiler isn't inlining 3 functions deep.
00506     //
00507 
00508 #if 0
00509     // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
00510     // The following code does not work even in the base space,
00511     // since the memory layout has changed. USE IT WITH CAUTIOUS!
00512     //
00513     // This block of code is a functional duplicate of the following
00514     // code - leave this here for reference and possibly later
00515     // performance testing as well as compiler evaluation.
00516     inline char operator[](genomeIndex_t index) const
00517     {
00518         return int2base[(*((genomeSequenceArray*) this))[index]];
00519     }
00520 #endif
00521 
00522     inline char operator[](genomeIndex_t index) const
00523     {
00524         uint8_t val;
00525         if (index < getNumberBases())
00526         {
00527             if ((index&1)==0)
00528             {
00529                 val = ((uint8_t *) data)[index>>1] & 0xf;
00530             }
00531             else
00532             {
00533                 val = (((uint8_t *) data)[index>>1] & 0xf0) >> 4;
00534             }
00535         }
00536         else
00537         {
00538             val = baseNIndex;
00539         }
00540         val = isColorSpace() ? int2colorSpace[val] : int2base[val];
00541         return val;
00542     }
00543 
00544     inline uint8_t getInteger(genomeIndex_t index) const
00545     {
00546         return (*((genomeSequenceArray*) this))[index];
00547     }
00548 
00549     inline void set(genomeIndex_t index, char value)
00550     {
00551         genomeSequenceArray::set(index, base2int[(uint8_t) value]);
00552     }
00553 
00554     /// obtain the pointer to the raw data for other access methods
00555     ///
00556     /// this is a fairly ugly hack to reach into the
00557     /// raw genome vector, get the byte that encodes
00558     /// two bases, and return it.  This is used by
00559     /// karma ReadIndexer::getSumQ to compare genome
00560     /// matchines by byte (two bases at a time) to speed
00561     /// it up.
00562     ///
00563     uint8_t *getDataPtr(genomeIndex_t index)
00564     {
00565         return ((uint8_t *) data + index/2);
00566     }
00567 private:
00568     ///
00569     /// when creating the genome mapped file, we call this to set
00570     /// the MD5 checksum of the chromosome sequence and length so that
00571     /// we can write the SAM SQ headers properly.
00572     /// NB: operates on the last fully loaded chromosome.
00573     bool setChromosomeMD5andLength(uint32_t whichChromosome);
00574 
00575 public:
00576 
00577     // TODO - this will be moved somewhere else and be made a static method.
00578     // replace read with the reversed one
00579     void getReverseRead(std::string &read);
00580 
00581     // TODO - this will be moved somewhere else and be made a static method.
00582     void getReverseRead(String& read);
00583 
00584     // debug the given read - print nice results
00585     int debugPrintReadValidation(
00586         std::string &read,
00587         std::string &quality,
00588         char   direction,
00589         genomeIndex_t   readLocation,
00590         int sumQuality,
00591         int mismatchCount,
00592         bool recurse = true
00593         );
00594 
00595     void getString(std::string &str, int chromosome, genomeIndex_t index, int baseCount) const;
00596     void getString(String &str, int chromosome, genomeIndex_t index, int baseCount) const;
00597     //
00598     // get the sequence from this GenomeSequence.
00599     // if baseCount < 0, get the reverse complement
00600     // that starts at index (but do not reverse the string?)
00601     //
00602     void getString(std::string &str, genomeIndex_t index, int baseCount) const;
00603     void getString(String &str, genomeIndex_t index, int baseCount) const;
00604 
00605     void getHighLightedString(std::string &str, genomeIndex_t index, int baseCount, genomeIndex_t highLightStart, genomeIndex_t highLightEnd) const;
00606 
00607     void print30(genomeIndex_t) const;
00608 
00609     // for debugging, not for speed:
00610     genomeIndex_t simpleLocalAligner(std::string &read, std::string &quality, genomeIndex_t index, int windowSize) const;
00611 
00612 
00613     // TODO - these methods do not handle a CIGAR string and do not handle '=' when a read matches the reference.
00614     // They are here for alignment and should be moved to the aligner (karma).
00615     // OR they should optionally take a CIGAR and use that if specified....
00616     // maybe they should be helper methods that are found somewhere else
00617 
00618     /// Return the mismatch count, disregarding CIGAR strings
00619     ///
00620     /// \param read is the sequence we're counting mismatches in
00621     /// \param location is where in the genmoe we start comparing
00622     /// \param exclude is a wildcard character (e.g. '.' or 'N')
00623     ///
00624     /// \return number of bases that don't match the reference, except those that match exclude
00625     int getMismatchCount(std::string &read, genomeIndex_t location, char exclude='\0') const
00626     {
00627         int mismatchCount = 0;
00628         for (uint32_t i=0; i<read.size(); i++)
00629             if (read[i]!=exclude) mismatchCount += read[i]!=(*this)[location + i];
00630         return mismatchCount;
00631     };
00632 
00633     /// brute force sumQ - no sanity checking
00634     ///
00635     /// \param read shotgun sequencer read string
00636     /// \param qualities phred quality string of same length
00637     /// \param location the alignment location to check sumQ
00638     int getSumQ(std::string &read, std::string &qualities, genomeIndex_t location) const
00639     {
00640         int sumQ = 0;
00641         for (uint32_t i=0; i<read.size(); i++)
00642             sumQ += (read[i]!=(*this)[location + i] ? (qualities[i]-33) : 0);
00643         return sumQ;
00644     };
00645     // return a string highlighting mismatch postions with '^' chars:
00646     void getMismatchHatString(std::string &result, const std::string &read, genomeIndex_t location) const;
00647     void getMismatchString(std::string &result, const std::string &read, genomeIndex_t location) const;
00648 
00649     // END TODO
00650 
00651     void getChromosomeAndIndex(std::string &, genomeIndex_t) const;
00652     void getChromosomeAndIndex(String &, genomeIndex_t) const;
00653 
00654     /// check a SAM format read, using phred quality scores and
00655     /// the CIGAR string to determine if it is correct.
00656     ///
00657     ///
00658     /// \param read the read in base space
00659     /// \param qualities the phred encoded qualities (Sanger, not Illumina)
00660     /// \param cigar the SAM file CIGAR column
00661     /// \param sumQ if >0 on entry, is checked against the computed sumQ
00662     /// \param insertions count of insertions found in
00663     ///
00664     ///
00665     bool checkRead(
00666         std::string &read,
00667         std::string &qualities,
00668         std::string &cigar,
00669         int &sumQ,              // input and output
00670         int &gapOpenCount,      // output only
00671         int &gapExtendCount,    // output only
00672         int &gapDeleteCount,    // output only
00673         std::string &result
00674     ) const;
00675 
00676     bool populateDBSNP(mmapArrayBool_t &dbSNP, std::ifstream &inputFile) const;
00677 
00678     /// user friendly dbSNP loader.
00679     ///
00680     /// \param inputFileName may be empty, point to a text file or a dbSNP vector file
00681     ///
00682     /// In all cases, dbSNP is returned the same length as this genome.
00683     ///
00684     /// When no SNPs are loaded, all values are false.
00685     ///
00686     /// When a text file is given, the file is parsed with two space
00687     /// separated columns - the first column is the chromosome name, and
00688     /// the second is the 1-based chromosome position of the SNP.
00689     ///
00690     /// \return false if a dbSNP file was correctly loaded, true otherwise
00691     ///
00692     bool loadDBSNP(mmapArrayBool_t &dbSNP, const char *inputFileName) const;
00693 };
00694 
00695 #endif