libStatGen Software  1
GenomeSequence.cpp
00001 /*
00002  *  Copyright (C) 2010-2012  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #include "assert.h"
00019 #include "ctype.h"
00020 #include "stdio.h"
00021 #include "Error.h"
00022 
00023 
00024 #include "Generic.h"
00025 #include "GenomeSequence.h"
00026 
00027 #include <algorithm>
00028 #include <istream>
00029 #include <fstream>
00030 #include <sstream>
00031 #include <stdexcept>
00032 
00033 #if defined(_WIN32)
00034 #include <io.h>
00035 #ifndef R_OK
00036 #define R_OK 4
00037 #endif
00038 #endif
00039 
00040 // not general use:
00041 #include "CSG_MD5.h"
00042 
00043 //
00044 // given a read in a string, pack it into the vector of
00045 // bytes coded as two bases per byte.
00046 //
00047 // The goal is to allow us to more rapidly compare against
00048 // the genome, which is itself packed 2 bases per byte.
00049 //
00050 // Unfortunately, the match position may be odd or even,
00051 // so as a result, we also need to be able to prepad
00052 // with 'N' bases to get the byte alignment the same, so
00053 // padWithNCount may be a positive number indicating how
00054 // many N bases to prepend.
00055 //
00056 void PackedRead::set(const char *rhs, int padWithNCount)
00057 {
00058     clear();
00059 
00060     // pad this packed read with 'N' bases two at a time
00061     while (padWithNCount>1)
00062     {
00063         packedBases.push_back(
00064             BaseAsciiMap::base2int[(int) 'N'] << 4 |
00065             BaseAsciiMap::base2int[(int) 'N']
00066         );
00067         padWithNCount -= 2;
00068         length+=2;
00069     }
00070 
00071     // when we have only one base, pack one 'N' base with
00072     // the first base in rhs if there is one.
00073     if (padWithNCount)
00074     {
00075         // NB: *rhs could be NUL, which is ok here - just keep
00076         // the length straight.
00077         packedBases.push_back(
00078             BaseAsciiMap::base2int[(int) *rhs] << 4 |
00079             BaseAsciiMap::base2int[(int) 'N']
00080         );
00081         // two cases - have characters in rhs or we don't:
00082         if (*rhs)
00083         {
00084             length+=2;  // pad byte plus one byte from rhs
00085             rhs++;
00086         }
00087         else
00088         {
00089             length++;
00090         }
00091         padWithNCount--;    // should now be zero, so superfluous.
00092         assert(padWithNCount==0);
00093     }
00094 
00095     // pad pairs of bases from rhs, two at a time:
00096     while (*rhs && *(rhs+1))
00097     {
00098         packedBases.push_back(
00099             BaseAsciiMap::base2int[(int) *(rhs+1)] << 4 |
00100             BaseAsciiMap::base2int[(int) *(rhs+0)]
00101         );
00102         rhs+=2;
00103         length+=2;
00104     }
00105 
00106     // if there is an odd base left at the end, put it
00107     // in a byte all its own (low 4 bits == 0):
00108     if (*rhs)
00109     {
00110         packedBases.push_back(
00111             BaseAsciiMap::base2int[(int) *(rhs+0)]
00112         );
00113         length++;
00114     }
00115     return;
00116 }
00117 
00118 std::string GenomeSequence::IntegerToSeq(unsigned int n, unsigned int wordsize) const
00119 {
00120     std::string sequence("");
00121     for (unsigned int i = 0; i < wordsize; i ++)
00122         sequence += "N";
00123 
00124     unsigned int clearHigherBits = ~(3U << (wordsize<<1));  // XXX refactor - this appears several places
00125 
00126     if (n > clearHigherBits)
00127         error("%d needs to be a non-negative integer < clearHigherBits\n", n);
00128 
00129     for (unsigned int i = 0; i < wordsize; i++)
00130     {
00131         sequence[wordsize-1-i] = BaseAsciiMap::int2base[n & 3];
00132         n >>= 2;
00133     }
00134     return sequence;
00135 }
00136 
00137 
00138 
00139 GenomeSequence::GenomeSequence()
00140 {
00141     constructorClear();
00142 }
00143 
00144 void GenomeSequence::constructorClear()
00145 {
00146     _debugFlag = 0;
00147     _progressStream = NULL;
00148     _colorSpace = false;
00149     _createOverwrite = false;
00150 }
00151 
00152 void GenomeSequence::setup(const char *referenceFilename)
00153 {
00154     setReferenceName(referenceFilename);
00155 
00156     if (_progressStream) *_progressStream << "open and prefetch reference genome " << referenceFilename << ": " << std::flush;
00157 
00158     if (open(false))
00159     {
00160         std::cerr << "Failed to open reference genome " << referenceFilename << std::endl;
00161         std::cerr << errorStr << std::endl;
00162         exit(1);
00163     }
00164 
00165     prefetch();
00166     if (_progressStream) *_progressStream << "done." << std::endl << std::flush;
00167 }
00168 
00169 GenomeSequence::~GenomeSequence()
00170 {
00171     // free up resources:
00172     _umfaFile.close();
00173 }
00174 
00175 //
00176 // mapped open.
00177 //
00178 // if the file exists, map in into memory, and fill in a few useful
00179 // fields.
00180 //
00181 
00182 bool GenomeSequence::open(bool isColorSpace, int flags)
00183 {
00184     bool rc;
00185 
00186     if (isColorSpace)
00187     {
00188         _umfaFilename = _baseFilename + "-cs.umfa";
00189     }
00190     else
00191     {
00192         _umfaFilename = _baseFilename + "-bs.umfa";
00193     }
00194 
00195     if(access(_umfaFilename.c_str(), R_OK) != 0)
00196     {
00197         // umfa file doesn't exist, so try to create it.
00198         if(create(isColorSpace))
00199         {
00200             // Couldon't access or create the umfa.
00201             std::cerr << "GenomeSequence::open: failed to open file "
00202                       << _umfaFilename
00203                       << " also failed creating it."
00204                       << std::endl;
00205             return true;
00206         }
00207     }
00208 
00209     rc = genomeSequenceArray::open(_umfaFilename.c_str(), flags);
00210     if (rc)
00211     {
00212         std::cerr << "GenomeSequence::open: failed to open file "
00213                   << _umfaFilename
00214                   << std::endl;
00215         return true;
00216     }
00217 
00218     _colorSpace = header->_colorSpace;
00219 
00220     return false;
00221 }
00222 
00223 void GenomeSequence::sanityCheck(MemoryMap &fasta) const
00224 {
00225     unsigned int i;
00226 
00227     unsigned int genomeIndex = 0;
00228     for (i=0; i<fasta.length(); i++)
00229     {
00230         switch (fasta[i])
00231         {
00232             case '>':
00233                 while (fasta[i]!='\n' && fasta[i]!='\r') i++;
00234                 break;
00235             case '\n':
00236             case '\r':
00237                 break;
00238             default:
00239                 assert(BaseAsciiMap::base2int[(int)(*this)[genomeIndex]] == 
00240                        BaseAsciiMap::base2int[(int) fasta[i]]);
00241                 genomeIndex++;
00242                 break;
00243         }
00244     }
00245 }
00246 
00247 #define HAS_SUFFIX(str, suffix) ((strlen(suffix) < str.size()) && (str.substr(str.size() - strlen(suffix)) == suffix))
00248 //
00249 // referenceFilename is  either a fasta or a UM fasta (.fa or .umfa)
00250 // filename.  In both cases, the suffix gets removed and the
00251 // base name is kept for later use depending on context.
00252 // @return always return false
00253 //
00254 bool GenomeSequence::setReferenceName(std::string referenceFilename)
00255 {
00256 
00257     if (HAS_SUFFIX(referenceFilename, ".fa"))
00258     {
00259         _referenceFilename = referenceFilename;
00260         _baseFilename = _referenceFilename.substr(0, referenceFilename.size() - 3);
00261     }
00262     else if (HAS_SUFFIX(referenceFilename, ".umfa"))
00263     {
00264         _baseFilename = referenceFilename.substr(0, referenceFilename.size() - 5);
00265     }
00266     else if (HAS_SUFFIX(referenceFilename, "-cs.umfa"))
00267     {
00268         _baseFilename = referenceFilename.substr(0, referenceFilename.size() - 8);
00269     }
00270     else if (HAS_SUFFIX(referenceFilename, "-bs.umfa"))
00271     {
00272         _baseFilename = referenceFilename.substr(0, referenceFilename.size() - 8);
00273     }
00274     else
00275     {
00276         _baseFilename = referenceFilename;
00277     }
00278     _fastaFilename = _baseFilename + ".fa";
00279 
00280     if (HAS_SUFFIX(referenceFilename, ".fasta"))
00281     {
00282         _referenceFilename = referenceFilename;
00283         _baseFilename = _referenceFilename.substr(0, referenceFilename.size() - 6);
00284         _fastaFilename = _baseFilename + ".fasta";        
00285     }
00286     
00287     return false;
00288 }
00289 
00290 //
00291 // this works in lockstep with ::create to populate
00292 // the per chromosome header fields size and md5
00293 // checksum.
00294 //
00295 // It relies on header->elementCount being set to
00296 // the data length loaded so far ... not the ultimate
00297 // reference length.
00298 //
00299 bool GenomeSequence::setChromosomeMD5andLength(uint32_t whichChromosome)
00300 {
00301     if (whichChromosome>=header->_chromosomeCount) return true;
00302 
00303     ChromosomeInfo *c = &header->_chromosomes[whichChromosome];
00304     c->size = header->elementCount - c->start;
00305 
00306     MD5_CTX md5Context;
00307     uint8_t md5Signature[MD5_DIGEST_LENGTH];
00308 
00309     //
00310     // it's easier to ensure we do this right if we just do it
00311     // in one big chunk:
00312     //
00313     char *md5Buffer = (char *) malloc(c->size);
00314 
00315     MD5Init(&md5Context);
00316 
00317     for (genomeIndex_t i = 0; i < c->size; i ++)
00318     {
00319         md5Buffer[i] = (*this)[c->start + i];
00320     }
00321     MD5Update(&md5Context, (unsigned char *) md5Buffer, c->size);
00322     MD5Final((unsigned char *) &md5Signature, &md5Context);
00323     free(md5Buffer);
00324     for (int i=0; i<MD5_DIGEST_LENGTH; i++)
00325     {
00326         // cheesy but works:
00327         sprintf(c->md5+2*i, "%02x", md5Signature[i]);
00328     }
00329     // redundant, strictly speaking due to sprintf NUL terminating
00330     // it's output strings, but put it here anyway.
00331     c->md5[2*MD5_DIGEST_LENGTH] = '\0';
00332 
00333     return false;
00334 }
00335 
00336 //
00337 // Given a buffer with a fasta format contents, count
00338 // the number of chromosomes in it and return that value.
00339 //
00340 static bool getFastaStats(const char *fastaData, size_t fastaDataSize, uint32_t &chromosomeCount, uint64_t &baseCount)
00341 {
00342     chromosomeCount = 0;
00343     baseCount = 0;
00344     bool atLineStart = true;
00345 
00346     //
00347     // loop over the fasta file, essentially matching for the
00348     // pattern '^>.*$' and counting them.
00349     //
00350     for (size_t fastaIndex = 0; fastaIndex < fastaDataSize; fastaIndex++)
00351     {
00352         switch (fastaData[fastaIndex])
00353         {
00354             case '\n':
00355             case '\r':
00356                 atLineStart = true;
00357                 break;
00358             case '>':
00359             {
00360                 if (!atLineStart) break;
00361                 chromosomeCount++;
00362                 //
00363                 // eat the rest of the line
00364                 //
00365                 while (fastaIndex < fastaDataSize && fastaData[fastaIndex]!='\n' && fastaData[fastaIndex]!='\r')
00366                 {
00367                     fastaIndex++;
00368                 }
00369                 break;
00370             }
00371             default:
00372                 baseCount++;
00373                 atLineStart = false;
00374                 break;
00375         }
00376 
00377     }
00378     return false;
00379 }
00380 
00381 class PackedSequenceData : public std::vector<uint8_t>
00382 {
00383     std::vector<uint8_t> m_packedBases;
00384 
00385     size_t    m_baseCount;
00386 
00387     void set(size_t index, uint8_t value) {
00388         m_packedBases[index>>1] =
00389             (m_packedBases[index>>1]            // original value
00390             & ~(7<<((index&0x01)<<2)))          // logical AND off the original value
00391             | ((value&0x0f)<<((index&0x1)<<2)); // logical OR in the new value
00392     }
00393 
00394 public:
00395 
00396     void reserve(size_t baseCount) {m_packedBases.reserve(baseCount/2);}
00397     size_t size() {return m_baseCount;}
00398     void clear() {m_packedBases.clear(); m_baseCount = 0;}
00399     uint8_t operator [](size_t index)
00400     {
00401         return (m_packedBases[index>>1] >> ((index&0x1)<<2)) & 0xf;
00402     }
00403     void push_back(uint8_t base);
00404 };
00405 
00406 //
00407 // Load a fasta format file from filename into the buffer
00408 // provided by the caller.
00409 // While parsing the fasta file, record each chromosome name,
00410 // its start location, and its size.
00411 //
00412 // NB: the caller must implement the logic to determine how
00413 // large the sequence data is.  There is no correct way to do
00414 // this, because we can't reliably estimate here how much sequence
00415 // data is contained in a compressed file.
00416 //
00417 // To safely pre-allocate space in sequenceData, use the reserve() method
00418 // before calling this function.
00419 //
00420 bool loadFastaFile(const char *filename,
00421         std::vector<PackedSequenceData> &sequenceData,
00422         std::vector<std::string> &chromosomeNames)
00423 {
00424     InputFile inputStream(filename, "r", InputFile::DEFAULT);
00425 
00426     if(!inputStream.isOpen()) {
00427         std::cerr << "Failed to open file " << filename << "\n";
00428         return true;
00429     }
00430 
00431     int whichChromosome = -1;
00432     chromosomeNames.clear();
00433 
00434     char ch;
00435     while((ch = inputStream.ifgetc()) != EOF) {
00436         switch (ch)
00437         {
00438             case '\n':
00439             case '\r':
00440                 break;
00441             case '>':
00442             {
00443                 std::string chromosomeName = "";
00444                 //
00445                 // pull out the chromosome new name
00446                 //
00447                 while (!isspace((ch = inputStream.ifgetc())) && ch != EOF)
00448                 {
00449                     chromosomeName += ch;  // slow, but who cares
00450                 }
00451                 //
00452                 // eat the rest of the line
00453                 //
00454                 do {
00455                     ch = inputStream.ifgetc();
00456                 } while(ch != EOF && ch != '\n' && ch != '\r');
00457                 //
00458                 // save the Chromosome name and index into our
00459                 // header so we can use them later.
00460                 //
00461                 chromosomeNames.push_back(chromosomeName);
00462 
00463                 whichChromosome++;
00464 
00465                 break;
00466             }
00467             default:
00468                 // we get here for sequence data.
00469                 //
00470                 // save the base value
00471                 // Note: invalid characters come here as well, but we
00472                 // let ::set deal with mapping them.
00473                 break;
00474         }
00475     }
00476     return false;
00477 }
00478 
00479 //
00480 // recreate the umfa file from a reference fasta format file
00481 //
00482 // The general format of a FASTA file is best described
00483 // on wikipedia at http://en.wikipedia.org/wiki/FASTA_format
00484 //
00485 // The format parsed here is a simpler subset, and is
00486 // described here http://www.ncbi.nlm.nih.gov/blast/fasta.shtml
00487 //
00488 bool GenomeSequence::create(bool isColor)
00489 {
00490     setColorSpace(isColor);
00491 
00492     if (_baseFilename=="")
00493     {
00494         std::cerr << "Base reference filename is empty." << std::endl;
00495         return true;
00496     }
00497 
00498     if (isColorSpace())
00499     {
00500         _umfaFilename = _baseFilename + "-cs.umfa";
00501     }
00502     else
00503     {
00504         _umfaFilename = _baseFilename + "-bs.umfa";
00505     }
00506 
00507     if (!_createOverwrite && access(_umfaFilename.c_str(), R_OK) == 0)
00508     {
00509         std::cerr << "Output file '" << _umfaFilename << "' exists or is not writable - please remove." << std::endl;
00510         return true;
00511     }
00512 
00513     MemoryMap fastaFile;
00514 
00515     if (fastaFile.open(_fastaFilename.c_str()))
00516     {
00517         std::cerr << "failed to open input fasta file '" << _fastaFilename << "'." << std::endl;
00518         return true;
00519     }
00520 
00521     std::cerr << "Creating FASTA "
00522               << (isColorSpace() ? "color space " : "")
00523               << "binary cache file '"
00524               << _umfaFilename
00525               << "'."
00526               << std::endl;
00527 
00528     std::cerr << std::flush;
00529 
00530     //
00531     // simple ptr to fasta data -- just treat the memory map
00532     // as an array of fastaDataSize characters...
00533     //
00534     const char *fasta = (const char *) fastaFile.data;
00535     size_t fastaDataSize = fastaFile.length();
00536 
00537     uint32_t    chromosomeCount = 0;
00538     uint64_t    baseCount = 0;
00539     getFastaStats(fasta, fastaDataSize, chromosomeCount, baseCount);
00540 
00541     if (genomeSequenceArray::create(_umfaFilename.c_str(), baseCount, chromosomeCount))
00542     {
00543         std::cerr << "failed to create '"
00544                   << _umfaFilename
00545                   << "'."
00546                   << std::endl;
00547         perror("");
00548         return true;
00549     }
00550     header->elementCount = 0;
00551     header->_colorSpace = isColorSpace();
00552     header->setApplication(_application.c_str());
00553     header->_chromosomeCount = chromosomeCount;
00554     //
00555     // clear out the variable length chromosome info array
00556     //
00557     for (uint32_t i=0; i<header->_chromosomeCount; i++) header->_chromosomes[i].constructorClear();
00558 
00559     std::string chromosomeName;
00560 
00561     //
00562     // for converting the reference to colorspace, the first base is always 5 (in base space it is 'N')
00563     signed char lastBase = BaseAsciiMap::base2int[(int) 'N'];
00564     bool terminateLoad = false;
00565     int percent = -1, newPercent;
00566     uint32_t whichChromosome = 0;
00567     for (uint64_t fastaIndex = 0; fastaIndex < fastaDataSize; fastaIndex++)
00568     {
00569         if (_progressStream)
00570         {
00571             newPercent = (int) (1.0 * fastaIndex / fastaDataSize) * 100;
00572             if (newPercent>percent)
00573             {
00574                 *_progressStream << "\r" << newPercent << "% ";
00575                 *_progressStream << std::flush;
00576                 percent = newPercent;
00577             }
00578         }
00579         switch (fasta[fastaIndex])
00580         {
00581             case '\n':
00582             case '\r':
00583                 break;
00584             case '>':
00585             {
00586                 chromosomeName = "";
00587                 fastaIndex++;       // skip the > char
00588                 //
00589                 // pull out the chromosome new name
00590                 //
00591                 while (!isspace(fasta[fastaIndex]))
00592                 {
00593                     chromosomeName += fasta[fastaIndex++];  // slow, but who cares
00594                 }
00595                 //
00596                 // eat the rest of the line
00597                 //
00598                 while (fasta[fastaIndex]!='\n' && fasta[fastaIndex]!='\r')
00599                 {
00600                     fastaIndex++;
00601                 }
00602                 //
00603                 // save the Chromosome name and index into our
00604                 // header so we can use them later.
00605                 //
00606                 ChromosomeInfo *c = &header->_chromosomes[whichChromosome];
00607                 c->setChromosomeName(chromosomeName.c_str());
00608                 c->start = header->elementCount;
00609                 // c->size gets computed at the next '>' line or at the EOF
00610 
00611                 if (whichChromosome>0)
00612                 {
00613                     //
00614                     // compute md5 checksum for the chromosome that we just
00615                     // loaded (if there was one) - note that on the last
00616                     // chromosome, we have to duplicate this code after
00617                     // the end of this loop
00618                     //
00619                     setChromosomeMD5andLength(whichChromosome - 1);
00620                 }
00621                 whichChromosome++;
00622                 if (whichChromosome > header->_chromosomeCount)
00623                 {
00624                     std::cerr << "BUG: Exceeded computed chromosome count ("
00625                               << header->_chromosomeCount
00626                               << ") - genome is now truncated at chromosome "
00627                               << header->_chromosomes[header->_chromosomeCount-1].name
00628                               << " (index "
00629                               << header->_chromosomeCount
00630                               << ")."
00631                               << std::endl;
00632                     terminateLoad = true;
00633                 }
00634                 break;
00635             }
00636             default:
00637                 // save the base pair value
00638                 // Note: invalid characters come here as well, but we
00639                 // let ::set deal with mapping them.
00640                 if (isColorSpace())
00641                 {
00642 //
00643 // anything outside these values represents an invalid base
00644 // base codes: 0-> A,    1-> C,     2-> G,      3-> T
00645 // colorspace: 0-> blue, 1-> green, 2-> oragne, 3->red
00646 //
00647                     const char fromBase2CS[] =
00648                     {
00649                         /* 0000 */ 0,   // A->A
00650                         /* 0001 */ 1,   // A->C
00651                         /* 0010 */ 2,   // A->G
00652                         /* 0011 */ 3,   // A->T
00653                         /* 0100 */ 1,   // C->A
00654                         /* 0101 */ 0,   // C->C
00655                         /* 0110 */ 3,   // C->G
00656                         /* 0111 */ 2,   // C->T
00657                         /* 1000 */ 2,   // G->A
00658                         /* 1001 */ 3,   // G->C
00659                         /* 1010 */ 0,   // G->G
00660                         /* 1011 */ 1,   // G->T
00661                         /* 1100 */ 3,   // T->A
00662                         /* 1101 */ 2,   // T->C
00663                         /* 1110 */ 1,   // T->G
00664                         /* 1111 */ 0,   // T->T
00665                     };
00666                     //
00667                     // we are writing color space values on transitions,
00668                     // so we don't write a colorspace value when we
00669                     // get the first base value.
00670                     //
00671                     // On second and subsequent bases, write based on
00672                     // the index table above
00673                     //
00674                     char thisBase = 
00675                         BaseAsciiMap::base2int[(int)(fasta[fastaIndex])];
00676                     if (lastBase>=0)
00677                     {
00678                         char color;
00679                         if (lastBase>3 || thisBase>3) color=4;
00680                         else color = fromBase2CS[(int)(lastBase<<2 | thisBase)];
00681                         // re-use the int to base, because ::set expects a base char (ATCG), not
00682                         // a color code (0123).  It should only matter on final output.
00683                         set(header->elementCount++, 
00684                             BaseAsciiMap::int2base[(int) color]);
00685                     }
00686                     lastBase = thisBase;
00687                 }
00688                 else
00689                 {
00690                     set(header->elementCount++, toupper(fasta[fastaIndex]));
00691                 }
00692                 break;
00693         }
00694 
00695         //
00696         // slightly awkward exit handling when we exceed the fixed
00697         // number of chromosomes
00698         //
00699         if (terminateLoad) break;
00700     }
00701 
00702     //
00703     // also slightly awkward code to handle the last dangling chromosome...
00704     // all we should need to do is compute the md5 checksum
00705     //
00706     if (whichChromosome==0)
00707     {
00708         fastaFile.close();
00709         throw std::runtime_error("No chromosomes found - aborting!");
00710     }
00711     else
00712     {
00713         setChromosomeMD5andLength(whichChromosome-1);
00714     }
00715 
00716     fastaFile.close();
00717 
00718     if (_progressStream) *_progressStream << "\r";
00719 
00720     std::cerr << "FASTA binary cache file '"
00721               << _umfaFilename
00722               << "' created."
00723               << std::endl;
00724 
00725     //
00726     // leave the umfastaFile open in case caller wants to use it
00727     //
00728     return false;
00729 }
00730 
00731 int GenomeSequence::getChromosomeCount() const
00732 {
00733     return header->_chromosomeCount;
00734 }
00735 
00736 //return chromosome index: 0, 1, ... 24;
00737 int GenomeSequence::getChromosome(genomeIndex_t position) const
00738 {
00739     if (position == INVALID_GENOME_INDEX) return INVALID_CHROMOSOME_INDEX;
00740 
00741     if (header->_chromosomeCount == 0)
00742         return INVALID_CHROMOSOME_INDEX;
00743 
00744     int start = 0;
00745     int stop = header->_chromosomeCount - 1;
00746 
00747     // eliminate case where position is in the last chromosome, since the loop
00748     // below falls off the end of the list if it in the last one.
00749 
00750     if (position > header->_chromosomes[stop].start)
00751         return (stop);
00752 
00753     while (start <= stop)
00754     {
00755         int middle = (start + stop) / 2;
00756 
00757         if (position >= header->_chromosomes[middle].start && position < header->_chromosomes[middle + 1].start)
00758             return middle;
00759 
00760         if (position == header->_chromosomes[middle + 1].start)
00761             return (middle + 1);
00762 
00763         if (position > header->_chromosomes[middle + 1].start)
00764             start = middle + 1;
00765 
00766         if (position < header->_chromosomes[middle].start)
00767             stop = middle - 1;
00768     }
00769 
00770     return -1;
00771 }
00772 
00773 //
00774 // Given a chromosome name and 1-based chromosome index, return the
00775 // genome index (0 based) into sequence for it.
00776 //
00777 // NB: the header->chromosomes array contains zero based genome positions
00778 //
00779 genomeIndex_t GenomeSequence::getGenomePosition(
00780     const char *chromosomeName,
00781     unsigned int chromosomeIndex) const
00782 {
00783     genomeIndex_t i = getGenomePosition(chromosomeName);
00784     if (i == INVALID_GENOME_INDEX) return INVALID_GENOME_INDEX;
00785     return i + chromosomeIndex - 1;
00786 }
00787 
00788 genomeIndex_t GenomeSequence::getGenomePosition(
00789     int chromosome,
00790     unsigned int chromosomeIndex) const
00791 {
00792     if (chromosome<0 || chromosome >= (int) header->_chromosomeCount) return INVALID_GENOME_INDEX;
00793 
00794     genomeIndex_t i = header->_chromosomes[chromosome].start;
00795     if (i == INVALID_GENOME_INDEX) return INVALID_GENOME_INDEX;
00796     return i + chromosomeIndex - 1;
00797 }
00798 
00799 //
00800 // return the genome index (0 based) of the start of the named
00801 // chromosome.  If none is found, INVALID_GENOME_INDEX is returned.
00802 //
00803 // XXX may need to speed this up - and smarten it up with some
00804 // modest chromosome name parsing.... e.g. '%d/X/Y' or 'chr%d/chrX/chrY' or
00805 // other schemes.
00806 //
00807 genomeIndex_t GenomeSequence::getGenomePosition(const char *chromosomeName) const
00808 {
00809     int chromosome = getChromosome(chromosomeName);
00810     if (chromosome==INVALID_CHROMOSOME_INDEX) return INVALID_GENOME_INDEX;
00811     return header->_chromosomes[chromosome].start;
00812 }
00813 
00814 int GenomeSequence::getChromosome(const char *chromosomeName) const
00815 {
00816     unsigned int i;
00817     for (i=0; i<header->_chromosomeCount; i++)
00818     {
00819         if (strcmp(header->_chromosomes[i].name, chromosomeName)==0)
00820         {
00821             return i;
00822         }
00823     }
00824     return INVALID_CHROMOSOME_INDEX;
00825 }
00826 
00827 //
00828 // Given a read, reverse the string and swap the base
00829 // pairs for the reverse strand equivalents.
00830 //
00831 void GenomeSequence::getReverseRead(std::string &read)
00832 {
00833     std::string newRead;
00834     if (read.size()) for (int32_t i=(int) read.size() - 1; i>=0; i--)
00835         {
00836             newRead.push_back(BasePair(read[i]));
00837         }
00838     read = newRead;
00839 }
00840 
00841 void GenomeSequence::getReverseRead(String& read)
00842 {
00843     int i = 0;
00844     int j = read.Length()-1;
00845     char temp;
00846     while (i < j)
00847     {
00848         temp = read[j];
00849         read[j] = read[i];
00850         read[i] = temp;
00851     }
00852 }
00853 
00854 #define ABS(x) ( (x) > 0 ? (x) : -(x) )
00855 int GenomeSequence::debugPrintReadValidation(
00856     std::string &read,
00857     std::string &quality,
00858     char   direction,
00859     genomeIndex_t   readLocation,
00860     int sumQuality,
00861     int mismatchCount,
00862     bool recurse
00863 ) 
00864 {
00865     int validateSumQ = 0;
00866     int validateMismatchCount = 0;
00867     int rc = 0;
00868     std::string genomeData;
00869 
00870     for (uint32_t i=0; i<read.size(); i++)
00871     {
00872         if (tolower(read[i]) != tolower((*this)[readLocation + i]))
00873         {
00874             validateSumQ += quality[i] - '!';
00875             // XXX no longer valid:
00876             if (direction=='F' ? i<24 : (i >= (read.size() - 24))) validateMismatchCount++;
00877             genomeData.push_back(tolower((*this)[readLocation + i]));
00878         }
00879         else
00880         {
00881             genomeData.push_back(toupper((*this)[readLocation + i]));
00882         }
00883     }
00884     assert(validateSumQ>=0);
00885     if (validateSumQ != sumQuality && validateMismatchCount == mismatchCount)
00886     {
00887         printf("SUMQ: Original Genome: %s  test read: %s : actual sumQ = %d, test sumQ = %d\n",
00888                genomeData.c_str(),
00889                read.c_str(),
00890                validateSumQ,
00891                sumQuality
00892               );
00893         rc++;
00894     }
00895     else if (validateSumQ == sumQuality && validateMismatchCount != mismatchCount)
00896     {
00897         printf("MISM: Original Genome: %s  test read: %s : actual mismatch %d test mismatches %d\n",
00898                genomeData.c_str(),
00899                read.c_str(),
00900                validateMismatchCount,
00901                mismatchCount
00902               );
00903         rc++;
00904     }
00905     else if (validateSumQ != sumQuality && validateMismatchCount != mismatchCount)
00906     {
00907         printf("BOTH: Original Genome: %s  test read: %s : actual sumQ = %d, test sumQ = %d, actual mismatch %d test mismatches %d\n",
00908                genomeData.c_str(),
00909                read.c_str(),
00910                validateSumQ,
00911                sumQuality,
00912                validateMismatchCount,
00913                mismatchCount
00914               );
00915         rc++;
00916     }
00917 
00918     if (recurse && ABS(validateMismatchCount - mismatchCount) > (int) read.size()/2)
00919     {
00920         printf("large mismatch difference, trying reverse strand: ");
00921         std::string reverseRead = read;
00922         std::string reverseQuality = quality;
00923         getReverseRead(reverseRead);
00924         reverse(reverseQuality.begin(), reverseQuality.end());
00925         rc = debugPrintReadValidation(reverseRead, reverseQuality, readLocation, sumQuality, mismatchCount, false);
00926     }
00927     return rc;
00928 }
00929 #undef ABS
00930 
00931 
00932 bool GenomeSequence::wordMatch(unsigned int index, std::string &word) const
00933 {
00934     for (uint32_t i = 0; i<word.size(); i++)
00935     {
00936         if ((*this)[index + i] != word[i]) return false;
00937     }
00938     return true;
00939 }
00940 
00941 bool GenomeSequence::printNearbyWords(unsigned int index, unsigned int deviation, std::string &word) const
00942 {
00943     for (unsigned int i = index - deviation; i < index + deviation; i++)
00944     {
00945         if (wordMatch(i, word))
00946         {
00947             std::cerr << "word '"
00948                       << word
00949                       << "' found "
00950                       << i - index
00951                       << " away from position "
00952                       << index
00953                       << "."
00954                       << std::endl;
00955         }
00956     }
00957     return false;
00958 }
00959 
00960 void GenomeSequence::dumpSequenceSAMDictionary(std::ostream &file) const
00961 {
00962     for (unsigned int i=0; i<header->_chromosomeCount; i++)
00963     {
00964         file
00965         <<  "@SQ"
00966         << "\tSN:" << header->_chromosomes[i].name  // name
00967         << "\tLN:" << header->_chromosomes[i].size  // number of bases
00968         << "\tAS:" << header->_chromosomes[i].assemblyID // e.g. NCBI36.3
00969         << "\tM5:" << header->_chromosomes[i].md5
00970         << "\tUR:" << header->_chromosomes[i].uri
00971         << "\tSP:" << header->_chromosomes[i].species // e.g. Homo_sapiens
00972         << std::endl;
00973     }
00974 }
00975 
00976 void GenomeSequence::dumpHeaderTSV(std::ostream &file) const
00977 {
00978     file << "# Reference: " << _baseFilename << std::endl;
00979     file << "# SN: sample name - must be unique" << std::endl;
00980     file << "# AS: assembly name" << std::endl;
00981     file << "# SP: species" << std::endl;
00982     file << "# LN: chromosome/contig length" << std::endl;
00983     file << "# M5: chromosome/contig MD5 checksum" << std::endl;
00984     file << "# LN and M5 are only printed for informational purposes." << std::endl;
00985     file << "# Karma will only set those values when creating the index." << std::endl;
00986     file << "SN" << "\t" << "AS" << "\t" << "SP" << "\t" << "UR" << "\t" << "LN" << "\t" << "M5" << std::endl;
00987     for (unsigned int i=0; i<header->_chromosomeCount; i++)
00988     {
00989         file
00990         << header->_chromosomes[i].name  // name
00991         << "\t" << header->_chromosomes[i].assemblyID // e.g. NCBI36.3
00992         << "\t" << header->_chromosomes[i].uri
00993         << "\t" << header->_chromosomes[i].species // e.g. Homo_sapiens
00994         << "\t" << header->_chromosomes[i].size  // number of bases
00995         << "\t" << header->_chromosomes[i].md5
00996         << std::endl;
00997     }
00998 }
00999 
01000 
01001 void GenomeSequence::getString(std::string &str, int chromosome, uint32_t index, int baseCount) const
01002 {
01003     //
01004     // calculate the genome index for the lazy caller...
01005     //
01006     genomeIndex_t genomeIndex = header->_chromosomes[chromosome].start + index - 1;
01007 
01008     getString(str, genomeIndex, baseCount);
01009 }
01010 
01011 void GenomeSequence::getString(String &str, int chromosome, uint32_t index, int baseCount) const
01012 {
01013     std::string string;
01014     this-> getString(string, chromosome, index, baseCount);
01015     str = string.c_str();
01016 }
01017 
01018 void GenomeSequence::getString(std::string &str, genomeIndex_t index, int baseCount) const
01019 {
01020     str.clear();
01021     if (baseCount > 0)
01022     {
01023         for (int i=0; i<baseCount; i++)
01024         {
01025             str.push_back((*this)[index + i]);
01026         }
01027     }
01028     else
01029     {
01030         // if caller passed negative basecount, give them
01031         // the read for the 3' end
01032         for (int i=0; i< -baseCount; i++)
01033         {
01034             str.push_back(BaseAsciiMap::base2complement[(int)(*this)[index + i]]);
01035         }
01036     }
01037 }
01038 
01039 void GenomeSequence::getString(String &str, genomeIndex_t index, int baseCount) const
01040 {
01041     std::string string;
01042     getString(string, index, baseCount);
01043     str = string.c_str();
01044 }
01045 
01046 void GenomeSequence::getHighLightedString(std::string &str, genomeIndex_t index, int baseCount, genomeIndex_t highLightStart, genomeIndex_t highLightEnd) const
01047 {
01048     str.clear();
01049     if (baseCount > 0)
01050     {
01051         for (int i=0; i<baseCount; i++)
01052         {
01053             char base = (*this)[index + i];
01054             if (in(index+i, highLightStart, highLightEnd))
01055                 base = tolower(base);
01056             str.push_back(base);
01057         }
01058     }
01059     else
01060     {
01061         // if caller passed negative basecount, give them
01062         // the read for the 3' end
01063         for (int i=0; i< -baseCount; i++)
01064         {
01065             char base = BaseAsciiMap::base2complement[(int)(*this)[index + i]];
01066             if (in(index+i, highLightStart, highLightEnd))
01067                 base = tolower(base);
01068             str.push_back(base);
01069         }
01070     }
01071 }
01072 
01073 void GenomeSequence::print30(genomeIndex_t index) const
01074 {
01075     std::cout << "index: " << index << "\n";
01076     for (genomeIndex_t i=index-30; i<index+30; i++)
01077         std::cout << (*this)[i];
01078     std::cout << "\n";
01079     for (genomeIndex_t i=index-30; i<index; i++)
01080         std::cout << " ";
01081     std::cout << "^";
01082     std::cout << std::endl;
01083 }
01084 
01085 void GenomeSequence::getMismatchHatString(std::string &result, const std::string &read, genomeIndex_t location) const
01086 {
01087     result = "";
01088     for (uint32_t i=0; i < read.size(); i++)
01089     {
01090         if (read[i] == (*this)[location+i])
01091             result.push_back(' ');
01092         else
01093             result.push_back('^');
01094     }
01095 }
01096 
01097 void GenomeSequence::getMismatchString(std::string &result, const std::string &read, genomeIndex_t location) const
01098 {
01099     result = "";
01100     for (uint32_t i=0; i < read.size(); i++)
01101     {
01102         if (read[i] == (*this)[location+i])
01103             result.push_back(toupper(read[i]));
01104         else
01105             result.push_back(tolower(read[i]));
01106     }
01107 }
01108 
01109 genomeIndex_t GenomeSequence::simpleLocalAligner(std::string &read, std::string &quality, genomeIndex_t index, int windowSize) const
01110 {
01111     int bestScore = 1000000; // either mismatch count or sumQ
01112     genomeIndex_t bestMatchLocation = INVALID_GENOME_INDEX;
01113     for (int i=-windowSize; i<windowSize; i++)
01114     {
01115         int newScore;
01116 
01117         if (i<0 && ((uint32_t) -i) > index) continue;
01118         if (index + i + read.size() >= getNumberBases()) continue;
01119 
01120         if (quality=="")
01121         {
01122             newScore = this->getMismatchCount(read, index + i);
01123         }
01124         else
01125         {
01126             newScore = this->getSumQ(read, quality, index + i);
01127         }
01128         if (newScore < bestScore)
01129         {
01130             bestScore = newScore;
01131             bestMatchLocation = index + i;
01132         }
01133     }
01134     return bestMatchLocation;
01135 }
01136 
01137 std::ostream &operator << (std::ostream &stream, genomeSequenceMmapHeader &h)
01138 {
01139     stream << (MemoryMapArrayHeader &) h;
01140     stream << "chromosomeCount: " << h._chromosomeCount << std::endl;
01141     stream << "isColorSpace: " << h._colorSpace << std::endl;
01142     stream << "chromosomeCount: " << h._chromosomeCount << std::endl;
01143     uint64_t totalSize = 0;
01144     for (uint32_t i=0; i < h._chromosomeCount; i++)
01145     {
01146         totalSize += h._chromosomes[i].size;
01147         stream << "Chromosome Index " << i << " name: " << h._chromosomes[i].name << std::endl;
01148         stream << "Chromosome Index " << i << " whole genome start: " << h._chromosomes[i].start << std::endl;
01149         stream << "Chromosome Index " << i << " whole genome size: " << h._chromosomes[i].size << std::endl;
01150         stream << "Chromosome Index " << i << " md5 checksum: " << h._chromosomes[i].md5 << std::endl;
01151         stream << "Chromosome Index " << i << " assemblyID: " << h._chromosomes[i].assemblyID << std::endl;
01152         stream << "Chromosome Index " << i << " species: " << h._chromosomes[i].species << std::endl;
01153         stream << "Chromosome Index " << i << " URI: " << h._chromosomes[i].uri << std::endl;
01154     }
01155     stream << "Total Genome Size: " << totalSize << " bases."<< std::endl;
01156     if (totalSize != h.elementCount)
01157     {
01158         stream << "Total Genome Size: does not match elementCount!\n";
01159     }
01160 
01161     stream << std::endl;
01162     return stream;
01163 }
01164 
01165 void GenomeSequence::getChromosomeAndIndex(std::string &s, genomeIndex_t i) const
01166 {
01167     int whichChromosome = 0;
01168 
01169     whichChromosome = getChromosome(i);
01170 
01171     if (whichChromosome == INVALID_CHROMOSOME_INDEX)
01172     {
01173         s = "invalid genome index";     // TODO include the index in error
01174     }
01175     else
01176     {
01177         std::ostringstream buf;
01178         genomeIndex_t   chromosomeIndex = i - getChromosomeStart(whichChromosome) + 1;
01179         buf << header->_chromosomes[whichChromosome].name  << ":"  << chromosomeIndex;
01180 #if 0
01181         buf << " (GenomeIndex " << i << ")";
01182 #endif
01183         s = buf.str();
01184     }
01185     return;
01186 }
01187 
01188 
01189 void GenomeSequence::getChromosomeAndIndex(String& s, genomeIndex_t i) const
01190 {
01191     std::string ss;
01192     getChromosomeAndIndex(ss, i);
01193     s = ss.c_str();
01194     return;
01195 }
01196 
01197 
01198 //
01199 // This is intended to be a helper routine to get dbSNP files
01200 // loaded.  In some cases, we will load into an mmap() file (ie
01201 // when we are creating it), in others, we will simply be loading
01202 // an existing dbSNP file into RAM (when the binary file does not
01203 // exist or when we are running with useMemoryMapFlag == false.
01204 //
01205 // Assume that dbSNP exists, is writable, and is the right size.
01206 //
01207 // Using the dbSNPFilename given, mark each dbSNP position
01208 // with a bool true.
01209 //
01210 // Return value:
01211 //   True: if populateDBSNP() succeed
01212 //   False: if not succeed
01213 bool GenomeSequence::populateDBSNP(
01214     mmapArrayBool_t &dbSNP,
01215     IFILE inputFile) const
01216 {
01217     assert(dbSNP.getElementCount() == getNumberBases());
01218 
01219     if(inputFile == NULL)
01220     {
01221         // FAIL, file not opened.
01222         return(false);
01223     }
01224 
01225     std::string chromosomeName;
01226     std::string position;
01227     genomeIndex_t chromosomePosition1;  // 1-based
01228     uint64_t    ignoredLineCount = 0;
01229 
01230     // Read til the end of the file.
01231     char* postPosPtr = NULL;
01232     while(!inputFile->ifeof())
01233     {
01234         chromosomeName.clear();
01235         position.clear();
01236         // Read the chromosome
01237         if(inputFile->readTilTab(chromosomeName) <= 0)
01238         {
01239             // hit either eof or end of line, check if
01240             // it is a header.
01241             if(chromosomeName.size()>0 && chromosomeName[0]=='#')
01242             {
01243                 // header, so just continue.
01244                 continue;
01245             }
01246             // Not the header, so this line is poorly formatted.
01247             ++ignoredLineCount;
01248             // Continue to the next line.
01249             continue;
01250         }
01251 
01252         // Check if it is a header line.
01253         if(chromosomeName.size()>0 && chromosomeName[0]=='#')
01254         {
01255             // did not hit eof or end of line, 
01256             // so discard the rest of the line.
01257             inputFile->discardLine();
01258             continue;
01259         }
01260 
01261         // Not a header, so read the position.
01262         if(inputFile->readTilTab(position) > 0)
01263         {
01264             // Additional data on the line, so discard it.
01265             inputFile->discardLine();
01266         }
01267 
01268         // Convert the position to a string.
01269         chromosomePosition1 = strtoul(position.c_str(), &postPosPtr, 0);
01270 
01271 
01272         if(postPosPtr == position.c_str())
01273         {
01274             ++ignoredLineCount;
01275             continue;
01276         }
01277 
01278         // 1-based genome index.
01279         genomeIndex_t genomeIndex = 
01280             getGenomePosition(chromosomeName.c_str(), chromosomePosition1);
01281 
01282         // if the genome index is invalid, ignore it
01283         if((genomeIndex == INVALID_GENOME_INDEX) || 
01284            (genomeIndex > getNumberBases()))
01285         {
01286             ignoredLineCount++;
01287             continue;
01288         }
01289 
01290         dbSNP.set(genomeIndex, true);
01291     }
01292 
01293     if (ignoredLineCount > 0)
01294     {
01295         std::cerr << "GenomeSequence::populateDBSNP: ignored " << ignoredLineCount << " SNP positions due to invalid format of line." << std::endl;
01296         return false;
01297     }
01298     return true;
01299 }
01300 
01301 bool GenomeSequence::loadDBSNP(
01302     mmapArrayBool_t &dbSNP,
01303     const char *inputFileName) const
01304 {
01305     //
01306     // the goal in this section of code is to allow the user
01307     // to either specify a valid binary version of the SNP file,
01308     // or the original text file that it gets created from.
01309     //
01310     // To do this, we basically open, sniff the error message,
01311     // and if it claims it is not a binary version of the file,
01312     // we go ahead and treat it as the text file and use the
01313     // GenomeSequence::populateDBSNP method to load it.
01314     //
01315     // Further checking is really needed to ensure users don't
01316     // mix a dbSNP file for a different reference, since it is really
01317     // easy to do.
01318     //
01319     if (strlen(inputFileName)!=0)
01320     {
01321         std::cerr << "Load dbSNP file '" << inputFileName << "': " << std::flush;
01322 
01323         if (dbSNP.open(inputFileName, O_RDONLY))
01324         {
01325             //
01326             // failed to open, possibly due to bad magic.
01327             //
01328             // this is really awful ... need to have a return
01329             // code that is smart enough to avoid this ugliness:
01330             //
01331             if (dbSNP.getErrorString().find("wrong type of file")==std::string::npos)
01332             {
01333                 std::cerr << "Error: " << dbSNP.getErrorString() << std::endl;
01334                 exit(1);
01335             }
01336             //
01337             // we have a file, assume we can load it as a text file
01338             //
01339             IFILE inputFile = ifopen(inputFileName, "r");
01340             if(inputFile == NULL)
01341             {
01342                 std::cerr << "Error: failed to open " << inputFileName << std::endl;
01343                 exit(1);
01344             }
01345 
01346             std::cerr << "(as text file) ";
01347 
01348             // anonymously (RAM resident only) create:
01349             dbSNP.create(getNumberBases());
01350 
01351             // now load it into RAM
01352             populateDBSNP(dbSNP, inputFile);
01353             ifclose(inputFile);
01354 
01355         }
01356         else
01357         {
01358             std::cerr << "(as binary mapped file) ";
01359         }
01360 
01361         std::cerr << "DONE!" << std::endl;
01362         return false;
01363     }
01364     else
01365     {
01366         return true;
01367     }
01368 }
01369 
01370 
01371 #if defined(TEST)
01372 
01373 void simplestExample(void)
01374 {
01375     GenomeSequence  reference;
01376     genomeIndex_t   index;
01377 
01378     // a particular reference is set by:
01379     // reference.setFastaName("/usr/cluster/share/karma/human_g1k_v37_12CS.fa")
01380     //
01381     // In the above example, the suffix .fa is stripped and replaced with .umfa,
01382     // which contains the actual file being opened.
01383     //
01384     if (reference.open())
01385     {
01386         perror("GenomeSequence::open");
01387         exit(1);
01388     }
01389 
01390 
01391     index = 1000000000; // 10^9
01392     //
01393     // Write the base at the given index.  Here, index is 0 based,
01394     // and is across the whole genome, as all chromosomes are sequentially
01395     // concatenated, so the allowed range is
01396     //
01397     // 0.. (reference.getChromosomeStart(last) + reference.getChromosomeSize(last))
01398     //
01399     // (where int last = reference.getChromosomeCount() - 1;)
01400     //
01401     std::cout << "base[" << index << "] = " << reference[index] << std::endl;
01402 
01403     //
01404     // Example for finding chromosome and one based chromosome position given
01405     // and absolute position on the genome in 'index':
01406     //
01407     int chr = reference.getChromosome(index);
01408     genomeIndex_t   chrIndex = index - reference.getChromosomeStart(chr) + 1;   // 1-based
01409 
01410     std::cout << "genome index " << index << " corresponds to chromosome " << chr << " position " << chrIndex << std::endl;
01411 
01412     //
01413     // Example for finding an absolute genome index position when the
01414     // chromosome name and one based position are known:
01415     //
01416     const char *chromosomeName = "5";
01417     chr = reference.getChromosome(chromosomeName);     // 0-based
01418     chrIndex = 100000;                      // 1-based
01419 
01420     index = reference.getChromosomeStart(chr) + chrIndex - 1;
01421 
01422     std::cout << "Chromosome '" << chromosomeName << "' position " << chrIndex << " corresponds to genome index position " << index << std::endl;
01423 
01424     reference.close();
01425 }
01426 
01427 void testGenomeSequence(void)
01428 {
01429     GenomeSequence reference;
01430 
01431 #if 0
01432     std::string referenceName = "someotherreference";
01433     if (reference.setFastaName(referenceName))
01434     {
01435         std::cerr << "failed to open reference file "
01436                   << referenceName
01437                   << std::endl;
01438         exit(1);
01439     }
01440 #endif
01441 
01442     std::cerr << "open and prefetch the reference genome: ";
01443 
01444     // open it
01445     if (reference.open())
01446     {
01447         exit(1);
01448     }
01449     std::cerr << "done!" << std::endl;
01450 
01451     //
01452     // For the human genome, genomeIndex ranges from 0 to 3.2x10^9
01453     //
01454     genomeIndex_t   genomeIndex;    // 0 based
01455     unsigned int chromosomeIndex;   // 1 based
01456     unsigned int chromosome;        // 0..23 or so
01457     std::string chromosomeName;
01458 
01459     //
01460     // Here we'll start with a chromosome name, then obtain the genome
01461     // index, and use it to find the base we want:
01462     //
01463     chromosomeName = "2";
01464     chromosomeIndex = 1234567;
01465     // this call is slow (string search for chromsomeName):
01466     genomeIndex = reference.getGenomePosition(chromosomeName.c_str(), chromosomeIndex);
01467     assert(genomeIndex!=INVALID_GENOME_INDEX);
01468     std::cout << "Chromosome " << chromosomeName << ", index ";
01469     std::cout << chromosomeIndex << " contains base " << reference[genomeIndex];
01470     std::cout << " at genome index position " << genomeIndex << std::endl;
01471 
01472     //
01473     // now reverse it - given a genomeIndex from above, find the chromosome
01474     // name and index:
01475     //
01476 
01477     // slow (binary search on genomeIndex):
01478     chromosome = reference.getChromosome(genomeIndex);
01479     unsigned int newChromosomeIndex;
01480     // not slow:
01481     newChromosomeIndex = genomeIndex - reference.getChromosomeStart(chromosome) + 1;
01482 
01483     assert(chromosomeIndex == newChromosomeIndex);
01484 
01485     // more testing... at least test and use PackedRead:
01486     //
01487     PackedRead pr;
01488 
01489     pr.set("ATCGATCG", 0);
01490     assert(pr.size()==8);
01491     assert(pr[0]==BaseAsciiMap::base2int[(int) 'A']);
01492     assert(pr[1]==BaseAsciiMap::base2int[(int) 'T']);
01493     assert(pr[2]==BaseAsciiMap::base2int[(int) 'C']);
01494     assert(pr[3]==BaseAsciiMap::base2int[(int) 'G']);
01495     pr.set("ATCGATCG", 1);
01496     assert(pr.size()==9);
01497     pr.set("", 0);
01498     assert(pr.size()==0);
01499     pr.set("", 1);
01500     assert(pr.size()==1);
01501     pr.set("", 2);
01502     assert(pr.size()==2);
01503     pr.set("", 3);
01504     assert(pr.size()==3);
01505     assert(pr[0]==BaseAsciiMap::base2int[(int) 'N']);
01506     assert(pr[1]==BaseAsciiMap::base2int[(int) 'N']);
01507     assert(pr[2]==BaseAsciiMap::base2int[(int) 'N']);
01508     pr.set("C", 1);
01509     assert(pr.size()==2);
01510     assert(pr[0]==BaseAsciiMap::base2int[(int) 'N']);
01511     assert(pr[1]==BaseAsciiMap::base2int[(int) 'C']);
01512 
01513 }
01514 
01515 //
01516 // After I build libcsg, I compile and run this test code using:
01517 //
01518 // g++ -DTEST -o try GenomeSequence.cpp -L. -lcsg -lm -lz -lssl
01519 // you also may need -fno-rtti
01520 // ./try
01521 //
01522 int main(int argc, const char **argv)
01523 {
01524 #if 1
01525     simplestExample();
01526 #else
01527     testGenomeSequence();
01528 #endif
01529     exit(0);
01530 }
01531 
01532 #endif
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends