GenomeSequence.h

00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #ifndef _GENOME_SEQUENCE_H
00019 #define _GENOME_SEQUENCE_H
00020 
00021 #include <assert.h>
00022 #include <fcntl.h>
00023 #include <stdio.h>              // for size_t, which md5.h needs (for redhat)
00024 #include <stddef.h>             // for size_t, which md5.h needs (for correct systems)
00025 #include <sys/types.h>
00026 #include <sys/stat.h>
00027 #if defined(WIN32)
00028 #define MD5_DIGEST_LENGTH 16
00029 #else
00030 #include <openssl/md5.h>        // for MD5_DIGEST_LENGTH
00031 #endif
00032 #include <string>
00033 #include <vector>
00034 #include "MemoryMapArray.h"
00035 
00036 // Goncalo's String class
00037 #include "StringArray.h"
00038 
00039 #ifndef __STDC_LIMIT_MACROS
00040 #define __STDC_LIMIT_MACROS
00041 #endif
00042 #include <stdint.h>
00043 
00044 typedef uint32_t    genomeIndex_t;
00045 #define INVALID_GENOME_INDEX UINT32_MAX
00046 
00047 // chromosome index is just a signed int, so this is ok here:
00048 #define INVALID_CHROMOSOME_INDEX -1
00049 
00050 
00051 //
00052 // ChromosomeInfo represents the per chromosome information
00053 // necessary to write out SAM/BAM records.  In addition, it
00054 // contains a single internal index used to point to the vector
00055 // offset where the chromosome bases start.
00056 //
00057 // This is mildly non-optimal for larger collections of chromosomes
00058 // or contigs - one use case described having millions of contigs,
00059 // in which case, this structure alone would take a gigabyte or more.
00060 //
00061 struct ChromosomeInfo
00062 {
00063     static const int  MAX_GENOME_INFO_STRING=128;
00064 
00065     void constructorClear()
00066     {
00067         memset(this,0, sizeof(*this));
00068     }
00069     void setChromosomeName(const char *n)
00070     {
00071         strncpy(name, n, sizeof(name)-1);
00072     }
00073     genomeIndex_t   start;                              // internal offset to combined genome vector
00074     genomeIndex_t   size;                               // SAM SQ:LN value
00075     char            md5[2*MD5_DIGEST_LENGTH + 1];       // 32 chars plus NUL, SAM SQ:M5 value
00076     char            name[MAX_GENOME_INFO_STRING];       // SAM SQ:SN value
00077     char            assemblyID[MAX_GENOME_INFO_STRING]; // SAM SQ:AS value
00078     char            uri[MAX_GENOME_INFO_STRING];        // SAM SQ:UR value
00079     char            species[MAX_GENOME_INFO_STRING];    // SAM SQ:SP value
00080 
00081     // handy setting methods:
00082     void setAssemblyID(const char *newID)
00083     {
00084         strncpy(assemblyID, newID, sizeof(assemblyID) - 1);
00085     }
00086     void setSpecies(const char *newSpecies)
00087     {
00088         strncpy(species, newSpecies, sizeof(newSpecies));
00089     }
00090     void setURI(const char *newURI)
00091     {
00092         strncpy(uri, newURI, sizeof(uri));
00093     }
00094 };
00095 
00096 class genomeSequenceMmapHeader : public MemoryMapArrayHeader
00097 {
00098     friend class GenomeSequence;
00099     friend std::ostream &operator << (std::ostream &, genomeSequenceMmapHeader &);
00100 private:
00101     uint32_t    _chromosomeCount;
00102     bool        _colorSpace;
00103 
00104     ChromosomeInfo  _chromosomes[0];
00105 
00106 public:
00107     //
00108     // getHeaderSize is special in that it must not access any
00109     // member variables, since it is called when the header has
00110     // not been created yet.
00111     //
00112     static size_t  getHeaderSize(int chromosomeCount)
00113     {
00114         return sizeof(genomeSequenceMmapHeader) + sizeof(ChromosomeInfo[1]) * chromosomeCount;
00115     }
00116     //
00117     // below methods return TRUE if it failed, false otherwise (primarily
00118     // a length check).
00119     //
00120 };
00121 
00122 std::ostream &operator << (std::ostream &stream, genomeSequenceMmapHeader &h);
00123 
00124 //
00125 // define the genomeSequence array type:
00126 //
00127 // NB the access/set routines use the encoded base values in the range
00128 // 0-15, not the corresponding base pair character.
00129 //
00130 inline uint8_t genomeSequenceAccess(void *base, genomeIndex_t index)
00131 {
00132     if ((index&1)==0)
00133     {
00134         return ((uint8_t *) base)[index>>1] & 0xf;
00135     }
00136     else
00137     {
00138         return (((uint8_t *) base)[index>>1] & 0xf0) >> 4;
00139     }
00140 };
00141 inline void genomeSequenceSet(void *base, genomeIndex_t index, uint8_t v)
00142 {
00143     if ((index&1)==0)
00144     {
00145         ((uint8_t *) base)[index>>1] = (((uint8_t *) base)[index>>1] & 0xf0) | v;
00146     }
00147     else
00148     {
00149         ((uint8_t *) base)[index>>1] = (((uint8_t *) base)[index>>1] & 0x0f) | v<<4;
00150     }
00151 }
00152 
00153 inline size_t mmapGenomeSequenceElementCount2Bytes(genomeIndex_t i)
00154 {
00155     return sizeof(uint8_t) * i / 2;
00156 }
00157 
00158 #define UMFA_COOKIE 0x1b7933a1  // unique cookie id
00159 #define UMFA_VERSION 20100401U  // YYYYMMDD of last change to file layout
00160 
00161 typedef MemoryMapArray<
00162 uint8_t,
00163 genomeIndex_t,
00164 UMFA_COOKIE,
00165 UMFA_VERSION,
00166 genomeSequenceAccess,
00167 genomeSequenceSet,
00168 mmapGenomeSequenceElementCount2Bytes,
00169 genomeSequenceMmapHeader
00170 > genomeSequenceArray;
00171 
00172 class PackedRead
00173 {
00174     void set(int index, int val)
00175     {
00176         packedBases[index>>1] =
00177             (packedBases[index>>1]             // original value
00178              & ~(7<<((index&0x01)<<2)))         // logical AND off the original value
00179             | ((val&0x0f)<<((index&0x1)<<2));  // logical OR in the new value
00180     }
00181 public:
00182     std::vector<uint8_t> packedBases;
00183     uint32_t    length;
00184     int size()
00185     {
00186         return length;
00187     }
00188     void clear()
00189     {
00190         packedBases.clear();
00191         length=0;
00192     }
00193     void set(const char *rhs, int padWithNCount = 0);
00194     uint8_t operator [](int index)
00195     {
00196         return (packedBases[index>>1] >> ((index&0x1)<<2)) & 0xf;
00197     }
00198 };
00199 
00200 // std::string &operator = (std::string &lhs, const PackedRead &rhs);
00201 //
00202 
00203 //!  Create/Access/Modify/Load Genome Sequences stored as binary mapped files.
00204 /*!
00205   GenomeSequence is designed to be a high performance shared access
00206   reference object.
00207 
00208   It is implemented as a MemoryMapArray template object with unsigned 8
00209   bit ints, each of which stores two bases.  Although 2 bits could be used,
00210   most references have more than four symbols (usually at least including
00211   'N', indicating an unknown or masked out base).
00212 
00213   Normal use of this class follows these steps:
00214    -# create the reference
00215     -# instantiate the GenomeSequence class object
00216     -# create the actual file (memory mapped) that is to hold the data
00217     -# populate the data using GenomeSequence::set
00218    -# use the reference
00219     -# use the reference by instantiating a GenomeSequence object
00220     -# either use the constructor with the reference filename
00221     -# or use GenomeSequence::setReferenceName() followed by ::open
00222     -# access the bases via the overloaded array operator []
00223     -# check sequence length by using GenomeSequence::getNumberBases()
00224    -# accessing chromosomes in the reference
00225     -# you typically will need to know about the chromosomes in the sequence
00226     -# see methods and docs with prefix 'getChromosome'
00227 
00228   Sharing is accomplished using the mmap() function via the MemoryMap
00229   base class.  This allows a potentially large genome reference to be
00230   shared among a number of simultaneously executing instances of one or
00231   more programs sharing the same reference.
00232  */
00233 
00234 
00235 class GenomeSequence : public genomeSequenceArray
00236 {
00237 public:
00238     static const int baseAIndex = 000;
00239     static const int baseTIndex = 001;
00240     static const int baseCIndex = 002;
00241     static const int baseGIndex = 003;
00242     static const int baseNIndex = 004; // baseN --> bad read
00243     static const int baseXIndex = 005; // baseX --> unknown (bad data)
00244     //
00245     // two arrays for converting back and forth between base pair character
00246     // value (ASCII) to a base integer in the range 0..3.  Note there is actually
00247     // a value 4 and 5, for 'N' (indelible) and 'M' (unknown to me).
00248     //
00249     static unsigned char base2int[];        // base space read (ATCG)
00250     static const char int2base[];
00251     static const char int2colorSpace[];
00252     static unsigned char base2complement[];
00253 
00254 private:
00255     int                     _debugFlag;
00256 
00257     std::ostream            *_progressStream;
00258     bool                    _colorSpace;
00259 
00260     std::string             _baseFilename;   // for later use by WordIndex create and open
00261     std::string             _referenceFilename;
00262     std::string             _fastaFilename;
00263     std::string             _umfaFilename;
00264     std::string             _application;        // only used in ::create()
00265 
00266     MemoryMap               _umfaFile;
00267 
00268     void setup(const char *referenceFilename);
00269 
00270 public:
00271     /// Simple constructor - no implicit file open
00272     GenomeSequence();
00273     void constructorClear();
00274     /// \brief attempt to open an existing sequence
00275     ///
00276     /// \param referenceFilename the name of the reference fasta file to open
00277     /// \param debug if true, additional debug information is printed
00278     GenomeSequence(std::string &referenceFilename)
00279     {
00280         constructorClear();
00281         setup(referenceFilename.c_str());
00282     }
00283 
00284     /// Smarter constructor - attempt to open an existing sequence
00285     ///
00286     /// \param referenceFilename the name of the reference fasta file to open
00287     /// \param debug if true, additional debug information is printed
00288     GenomeSequence(const char *referenceFilename)
00289     {
00290         constructorClear();
00291         setup(referenceFilename);
00292     }
00293 
00294     /// Close the file if open and destroy the object
00295     ~GenomeSequence();
00296 
00297     /// open the reference specified using GenomeSequence::setReferenceName
00298     ///
00299     /// \param isColorSpace open the color space reference
00300     /// \param flags pass through to the ::open() call (O_RDWR lets you modify the contents)
00301     /// \return false for success, true otherwise
00302     bool open(bool isColorSpace = false, int flags = O_RDONLY);
00303 
00304     /// open the given file as the genome (no filename munging occurs).
00305     ///
00306     /// \param filename the name of the file to open
00307     /// \param flags pass through to the ::open() call (O_RDWR lets you modify the contents)
00308     /// \return false for success, true otherwise
00309     bool open(const char *filename, int flags = O_RDONLY)
00310     {
00311         _umfaFilename = filename;
00312         return genomeSequenceArray::open(filename, flags);
00313     }
00314 
00315     /// if true, then show progress when creating and pre-fetching
00316 private:
00317     bool    _searchCommonFileSuffix;
00318 public:
00319     bool create();
00320     //
00321     // backwards compatability:
00322     bool create(bool isColor) {setColorSpace(isColor); return create(); }
00323 
00324     // NEW API?
00325 
00326     // load time modifiers:
00327     void setProgressStream(std::ostream &progressStream) {_progressStream = &progressStream;}
00328     void setColorSpace(bool colorSpace) {_colorSpace = colorSpace; }
00329     void setSearchCommonFileSuffix(bool searchCommonFileSuffix) {_searchCommonFileSuffix = searchCommonFileSuffix;}
00330 
00331     bool loadFastaData(const char *filename);
00332 
00333     /// set the reference name that will be used in open()
00334     /// \param referenceFilename the name of the reference fasta file to open
00335     /// \return false for success, true otherwise
00336     ///
00337     /// \sa open()
00338     bool setReferenceName(std::string referenceFilename);
00339 
00340     /// set the application name in the binary file header
00341     ///
00342     /// \param application name of the application
00343     void setApplication(std::string application)
00344     {
00345         _application = application;     // used in ::create() to set application name
00346     }
00347     const std::string &getFastaName()
00348     {
00349         return _fastaFilename;
00350     }
00351     const std::string &getReferenceName()
00352     {
00353         return _referenceFilename;
00354     }
00355 
00356     /// tell us if we are a color space reference or not
00357     /// \return true if colorspace, false otherwise
00358     bool isColorSpace()
00359     {
00360         return _colorSpace;
00361     }
00362 
00363     /// return the number of bases represented in this reference
00364     /// \return count of bases
00365     genomeIndex_t   getNumberBases()
00366     {
00367         return getElementCount();
00368     }
00369 
00370     /// given a whole genome index, get the chromosome it is located in
00371     ///
00372     /// This is done via a binary search of the chromosome table in the
00373     /// header of the mapped file, so it is O(log(N))
00374     ///
00375     /// \param 0-based position the base in the genome
00376     /// \return 0-based index into chromosome table - INVALID_CHROMOSOME_INDEX if error
00377     int getChromosome(genomeIndex_t position);
00378     /// given a chromosome name, return the chromosome index
00379     ///
00380     /// This is done via a linear search of the chromosome table in the
00381     /// header of the mapped file, so it is O(N)
00382     ///
00383     /// \param chromosomeName the name of the chromosome - exact match only
00384     /// \return 0-based index into chromosome table - INVALID_CHROMOSOME_INDEX if error
00385     int getChromosome(const char *chromosomeName);
00386     /// Return the number of chromosomes in the genome
00387     /// \return number of chromosomes in the genome
00388     int getChromosomeCount();
00389 
00390     /// given a chromosome, return the genome base it starts in
00391     ///
00392     /// \param 0-based chromosome index
00393     /// \return 0-based genome index of the base that starts the chromosome
00394     genomeIndex_t getChromosomeStart(int chromosomeIndex)
00395     {
00396         if (chromosomeIndex==INVALID_CHROMOSOME_INDEX) return INVALID_GENOME_INDEX;
00397         return header->_chromosomes[chromosomeIndex].start;
00398     }
00399 
00400     /// given a chromosome, return its size in bases
00401     ///
00402     /// \param 0-based chromosome index
00403     /// \return size of the chromosome in bases
00404     genomeIndex_t getChromosomeSize(int chromosomeIndex)
00405     {
00406         if (chromosomeIndex==INVALID_CHROMOSOME_INDEX) return 0;
00407         return header->_chromosomes[chromosomeIndex].size;
00408     }
00409 
00410     /// given a chromosome name and position, return the genome position
00411     ///
00412     /// \param chromosomeName name of the chromosome - exact match only
00413     /// \param chromosomeIndex 1-based chromosome position
00414     /// \return genome index of the above chromosome position
00415     genomeIndex_t getGenomePosition(
00416         const char *chromosomeName,
00417         unsigned int chromosomeIndex);
00418 
00419     /// given a chromosome index and position, return the genome position
00420     ///
00421     /// \param chromosome index of the chromosome
00422     /// \param chromosomeIndex 1-based chromosome position
00423     /// \return genome index of the above chromosome position
00424     genomeIndex_t getGenomePosition(
00425         int chromosome,
00426         unsigned int chromosomeIndex);
00427 
00428     /// given the chromosome name, get the corresponding 0 based genome index
00429     /// for the start of that chromosome
00430     genomeIndex_t getGenomePosition(const char *chromosomeName);
00431     genomeIndex_t getGenomePosition(int chromosomeIndex);
00432 
00433     const std::string &getBaseFilename()
00434     {
00435         return _baseFilename;
00436     }
00437     const char *getChromosomeName(int chromosomeIndex)
00438     {
00439         return header->_chromosomes[chromosomeIndex].name;
00440     }
00441 
00442     void setDebugFlag(bool d)
00443     {
00444         _debugFlag = d;
00445     }
00446 
00447     genomeIndex_t sequenceLength()
00448     {
00449         return (genomeIndex_t) header->elementCount;
00450     }
00451     const char *chromosomeName(int chr)
00452     {
00453         return header->_chromosomes[chr].name;
00454     }
00455 
00456     void sanityCheck(MemoryMap &fasta);
00457     std::string IntegerToSeq(unsigned int n, unsigned int wordsize);
00458 
00459     bool wordMatch(unsigned int index, std::string &word);
00460     bool printNearbyWords(unsigned int index, unsigned int variance, std::string &word);
00461     char BasePair(char c)
00462     {
00463         return base2complement[(int) c];
00464     }
00465     void dumpSequenceSAMDictionary(std::ostream&);
00466     void dumpHeaderTSV(std::ostream&);
00467     void loadHeaderFromTSV(std::istream&);
00468 
00469     ///
00470     /// Return the bases in base space or color space for within range index, ot
00471     /// @param index the array-like index (0 based).
00472     /// @return ACTGN in base space; 0123N for color space; and 'N' for invalid.
00473     /// For color space, index i represents the transition of base at position (i-1) to base at position i
00474     ///
00475     /// NB: bounds checking here needs to be deprecated - do not assume it
00476     /// will exist - the call must clip reads so that this routine is never
00477     /// called with a index value larger than the genome.
00478     ///
00479     /// The reason for this is simply that this routine gets called hundreds
00480     /// of billions of time in one run of karma, which will absolutely kill
00481     /// performance.  Every single instruction here matters a great, great deal.
00482     ///
00483 
00484     //
00485     // 3.5% improvement for color space matching:
00486     // I guess the compiler isn't inlining 3 functions deep.
00487     //
00488 
00489 #if 0
00490     // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
00491     // The following code does not work even in the base space,
00492     // since the memory layout has changed. USE IT WITH CAUTIOUS!
00493     //
00494     // This block of code is a functional duplicate of the following
00495     // code - leave this here for reference and possibly later
00496     // performance testing as well as compiler evaluation.
00497     inline char operator[](genomeIndex_t index)
00498     {
00499         return int2base[(*((genomeSequenceArray*) this))[index]];
00500     }
00501 #endif
00502 
00503     inline char operator[](genomeIndex_t index)
00504     {
00505         uint8_t val;
00506         if (index < getNumberBases())
00507         {
00508             if ((index&1)==0)
00509             {
00510                 val = ((uint8_t *) data)[index>>1] & 0xf;
00511             }
00512             else
00513             {
00514                 val = (((uint8_t *) data)[index>>1] & 0xf0) >> 4;
00515             }
00516         }
00517         else
00518         {
00519             val = baseNIndex;
00520         }
00521         val = isColorSpace() ? int2colorSpace[val] : int2base[val];
00522         return val;
00523     }
00524 
00525     inline uint8_t getInteger(genomeIndex_t index)
00526     {
00527         return (*((genomeSequenceArray*) this))[index];
00528     }
00529     inline void set(genomeIndex_t index, char value)
00530     {
00531         genomeSequenceArray::set(index, base2int[(uint8_t) value]);
00532     }
00533     /// obtain the pointer to the raw data for other access methods
00534     ///
00535     /// this is a fairly ugly hack to reach into the
00536     /// raw genome vector, get the byte that encodes
00537     /// two bases, and return it.  This is used by
00538     /// karma ReadIndexer::getSumQ to compare genome
00539     /// matchines by byte (two bases at a time) to speed
00540     /// it up.
00541     ///
00542     uint8_t *getDataPtr(genomeIndex_t index)
00543     {
00544         return ((uint8_t *) data + index/2);
00545     }
00546 private:
00547     ///
00548     /// when creating the genome mapped file, we call this to set
00549     /// the MD5 checksum of the chromosome sequence and length so that
00550     /// we can write the SAM SQ headers properly.
00551     /// NB: operates on the last fully loaded chromosome.
00552     bool setChromosomeMD5andLength(uint32_t whichChromosome);
00553 public:
00554     // replace read with the reversed one
00555     void getReverseRead(std::string &read);
00556     void getReverseRead(String& read);
00557     // debug the given read - print nice results
00558     int debugPrintReadValidation(
00559         std::string &read,
00560         std::string &quality,
00561         char   direction,
00562         genomeIndex_t   readLocation,
00563         int sumQuality,
00564         int mismatchCount,
00565         bool recurse = true
00566     );
00567 
00568     void getString(std::string &str, int chromosome, genomeIndex_t index, int baseCount);
00569     void getString(String &str, int chromosome, genomeIndex_t index, int baseCount);
00570     //
00571     // get the sequence from this GenomeSequence.
00572     // if baseCount < 0, get the reverse complement
00573     // that starts at index (but do not reverse the string?)
00574     //
00575     void getString(std::string &str, genomeIndex_t index, int baseCount);
00576     void getString(String &str, genomeIndex_t index, int baseCount);
00577 
00578     void getHighLightedString(std::string &str, genomeIndex_t index, int baseCount, genomeIndex_t highLightStart, genomeIndex_t highLightEnd);
00579 
00580     void print30(genomeIndex_t);
00581 
00582     // for debugging, not for speed:
00583     genomeIndex_t simpleLocalAligner(std::string &read, std::string &quality, genomeIndex_t index, int windowSize);
00584 
00585     /// Return the mismatch count, disregarding CIGAR strings
00586     ///
00587     /// \param read is the sequence we're counting mismatches in
00588     /// \param location is where in the genmoe we start comparing
00589     /// \param exclude is a wildcard character (e.g. '.' or 'N')
00590     ///
00591     /// \return number of bases that don't match the reference, except those that match exclude
00592     int getMismatchCount(std::string &read, genomeIndex_t location, char exclude='\0')
00593     {
00594         int mismatchCount = 0;
00595         for (uint32_t i=0; i<read.size(); i++)
00596             if (read[i]!=exclude) mismatchCount += read[i]!=(*this)[location + i];
00597         return mismatchCount;
00598     };
00599 
00600     /// brute force sumQ - no sanity checking
00601     ///
00602     /// \param read shotgun sequencer read string
00603     /// \param qualities phred quality string of same length
00604     /// \param location the alignment location to check sumQ
00605     int getSumQ(std::string &read, std::string &qualities, genomeIndex_t location)
00606     {
00607         int sumQ = 0;
00608         for (uint32_t i=0; i<read.size(); i++)
00609             sumQ += (read[i]!=(*this)[location + i] ? (qualities[i]-33) : 0);
00610         return sumQ;
00611     };
00612     // return a string highlighting mismatch postions with '^' chars:
00613     void getMismatchHatString(std::string &result, const std::string &read, genomeIndex_t location);
00614     void getMismatchString(std::string &result, const std::string &read, genomeIndex_t location);
00615 
00616     void getChromosomeAndIndex(std::string &, genomeIndex_t);
00617     void getChromosomeAndIndex(String &, genomeIndex_t);
00618 
00619     /// check a SAM format read, using phred quality scores and
00620     /// the CIGAR string to determine if it is correct.
00621     ///
00622     ///
00623     /// \param read the read in base space
00624     /// \param qualities the phred encoded qualities (Sanger, not Illumina)
00625     /// \param cigar the SAM file CIGAR column
00626     /// \param sumQ if >0 on entry, is checked against the computed sumQ
00627     /// \param insertions count of insertions found in
00628     ///
00629     ///
00630     bool checkRead(
00631         std::string &read,
00632         std::string &qualities,
00633         std::string &cigar,
00634         int &sumQ,              // input and output
00635         int &gapOpenCount,      // output only
00636         int &gapExtendCount,    // output only
00637         int &gapDeleteCount,    // output only
00638         std::string &result
00639     );
00640 
00641     bool populateDBSNP(mmapArrayBool_t &dbSNP, std::ifstream &inputFile);
00642 
00643     /// user friendly dbSNP loader.
00644     ///
00645     /// \param inputFileName may be empty, point to a text file or a dbSNP vector file
00646     ///
00647     /// In all cases, dbSNP is returned the same length as this genome.
00648     ///
00649     /// When no SNPs are loaded, all values are false.
00650     ///
00651     /// When a text file is given, the file is parsed with two space
00652     /// separated columns - the first column is the chromosome name, and
00653     /// the second is the 1-based chromosome position of the SNP.
00654     ///
00655     /// \return false if a dbSNP file was correctly loaded, true otherwise
00656     ///
00657     bool loadDBSNP(mmapArrayBool_t &dbSNP, const char *inputFileName);
00658 };
00659 
00660 #endif