GenomeSequence.cpp

00001 /*
00002  *  Copyright (C) 2010-2012  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #include "assert.h"
00019 #include "ctype.h"
00020 #include "stdio.h"
00021 #include "Error.h"
00022 
00023 
00024 #include "Generic.h"
00025 #include "GenomeSequence.h"
00026 
00027 #include <algorithm>
00028 #include <istream>
00029 #include <fstream>
00030 #include <sstream>
00031 #include <stdexcept>
00032 
00033 // not general use:
00034 #include "CSG_MD5.h"
00035 
00036 //
00037 // given a read in a string, pack it into the vector of
00038 // bytes coded as two bases per byte.
00039 //
00040 // The goal is to allow us to more rapidly compare against
00041 // the genome, which is itself packed 2 bases per byte.
00042 //
00043 // Unfortunately, the match position may be odd or even,
00044 // so as a result, we also need to be able to prepad
00045 // with 'N' bases to get the byte alignment the same, so
00046 // padWithNCount may be a positive number indicating how
00047 // many N bases to prepend.
00048 //
00049 void PackedRead::set(const char *rhs, int padWithNCount)
00050 {
00051     clear();
00052 
00053     // pad this packed read with 'N' bases two at a time
00054     while (padWithNCount>1)
00055     {
00056         packedBases.push_back(
00057             BaseAsciiMap::base2int[(int) 'N'] << 4 |
00058             BaseAsciiMap::base2int[(int) 'N']
00059         );
00060         padWithNCount -= 2;
00061         length+=2;
00062     }
00063 
00064     // when we have only one base, pack one 'N' base with
00065     // the first base in rhs if there is one.
00066     if (padWithNCount)
00067     {
00068         // NB: *rhs could be NUL, which is ok here - just keep
00069         // the length straight.
00070         packedBases.push_back(
00071             BaseAsciiMap::base2int[(int) *rhs] << 4 |
00072             BaseAsciiMap::base2int[(int) 'N']
00073         );
00074         // two cases - have characters in rhs or we don't:
00075         if (*rhs)
00076         {
00077             length+=2;  // pad byte plus one byte from rhs
00078             rhs++;
00079         }
00080         else
00081         {
00082             length++;
00083         }
00084         padWithNCount--;    // should now be zero, so superfluous.
00085         assert(padWithNCount==0);
00086     }
00087 
00088     // pad pairs of bases from rhs, two at a time:
00089     while (*rhs && *(rhs+1))
00090     {
00091         packedBases.push_back(
00092             BaseAsciiMap::base2int[(int) *(rhs+1)] << 4 |
00093             BaseAsciiMap::base2int[(int) *(rhs+0)]
00094         );
00095         rhs+=2;
00096         length+=2;
00097     }
00098 
00099     // if there is an odd base left at the end, put it
00100     // in a byte all its own (low 4 bits == 0):
00101     if (*rhs)
00102     {
00103         packedBases.push_back(
00104             BaseAsciiMap::base2int[(int) *(rhs+0)]
00105         );
00106         length++;
00107     }
00108     return;
00109 }
00110 
00111 std::string GenomeSequence::IntegerToSeq(unsigned int n, unsigned int wordsize) const
00112 {
00113     std::string sequence("");
00114     for (unsigned int i = 0; i < wordsize; i ++)
00115         sequence += "N";
00116 
00117     unsigned int clearHigherBits = ~(3U << (wordsize<<1));  // XXX refactor - this appears several places
00118 
00119     if (n > clearHigherBits)
00120         error("%d needs to be a non-negative integer < clearHigherBits\n", n);
00121 
00122     for (unsigned int i = 0; i < wordsize; i++)
00123     {
00124         sequence[wordsize-1-i] = BaseAsciiMap::int2base[n & 3];
00125         n >>= 2;
00126     }
00127     return sequence;
00128 }
00129 
00130 
00131 
00132 GenomeSequence::GenomeSequence()
00133 {
00134     constructorClear();
00135 }
00136 
00137 void GenomeSequence::constructorClear()
00138 {
00139     _debugFlag = 0;
00140     _progressStream = NULL;
00141     _colorSpace = false;
00142     _createOverwrite = false;
00143 }
00144 
00145 void GenomeSequence::setup(const char *referenceFilename)
00146 {
00147     setReferenceName(referenceFilename);
00148 
00149     if (_progressStream) *_progressStream << "open and prefetch reference genome " << referenceFilename << ": " << std::flush;
00150 
00151     if (open(false))
00152     {
00153         std::cerr << "Failed to open reference genome " << referenceFilename << std::endl;
00154         std::cerr << errorStr << std::endl;
00155         exit(1);
00156     }
00157 
00158     prefetch();
00159     if (_progressStream) *_progressStream << "done." << std::endl << std::flush;
00160 }
00161 
00162 GenomeSequence::~GenomeSequence()
00163 {
00164     // free up resources:
00165     _umfaFile.close();
00166 }
00167 
00168 //
00169 // mapped open.
00170 //
00171 // if the file exists, map in into memory, and fill in a few useful
00172 // fields.
00173 //
00174 // isColorSpace is used only when the baseFilename is empty, so
00175 // that we can choose an appropriate default reference file.
00176 //
00177 
00178 bool GenomeSequence::open(bool isColorSpace, int flags)
00179 {
00180     bool rc;
00181 
00182     if (isColorSpace)
00183     {
00184         _umfaFilename = _baseFilename + "-cs.umfa";
00185     }
00186     else
00187     {
00188         _umfaFilename = _baseFilename + "-bs.umfa";
00189     }
00190 
00191     if(access(_umfaFilename.c_str(), R_OK) != 0)
00192     {
00193         // umfa file doesn't exist, so try to create it.
00194         if(create(isColorSpace))
00195         {
00196             // Couldon't access or create the umfa.
00197             std::cerr << "GenomeSequence::open: failed to open file "
00198                       << _umfaFilename
00199                       << " also failed creating it."
00200                       << std::endl;
00201             return true;
00202         }
00203     }
00204 
00205     rc = genomeSequenceArray::open(_umfaFilename.c_str(), flags);
00206     if (rc)
00207     {
00208         std::cerr << "GenomeSequence::open: failed to open file "
00209                   << _umfaFilename
00210                   << std::endl;
00211         return true;
00212     }
00213 
00214     _colorSpace = header->_colorSpace;
00215 
00216     return false;
00217 }
00218 
00219 void GenomeSequence::sanityCheck(MemoryMap &fasta) const
00220 {
00221     unsigned int i;
00222 
00223     unsigned int genomeIndex = 0;
00224     for (i=0; i<fasta.length(); i++)
00225     {
00226         switch (fasta[i])
00227         {
00228             case '>':
00229                 while (fasta[i]!='\n' && fasta[i]!='\r') i++;
00230                 break;
00231             case '\n':
00232             case '\r':
00233                 break;
00234             default:
00235                 assert(BaseAsciiMap::base2int[(int)(*this)[genomeIndex]] == 
00236                        BaseAsciiMap::base2int[(int) fasta[i]]);
00237 #if 0
00238                 if(BaseAsciiMap::base2int[(*this)[genomeIndex]] != 
00239                    BaseAsciiMap::base2int[(int) fasta[i]])
00240                 {
00241                     int bp1 = BaseAsciiMap::base2int[(*this)[genomeIndex]];
00242                     int bp2 = BaseAsciiMap::base2int[fasta[i]];
00243                     printf("failing at genome index = %u, fasta index = %u.\n",
00244                            genomeIndex,i);
00245                 }
00246 #endif
00247                 genomeIndex++;
00248                 break;
00249         }
00250     }
00251 }
00252 
00253 #define HAS_SUFFIX(str, suffix) ((strlen(suffix) < str.size()) && (str.substr(str.size() - strlen(suffix)) == suffix))
00254 //
00255 // referenceFilename is  either a fasta or a UM fasta (.fa or .umfa)
00256 // filename.  In both cases, the suffix gets removed and the
00257 // base name is kept for later use depending on context.
00258 // @return always return false
00259 //
00260 bool GenomeSequence::setReferenceName(std::string referenceFilename)
00261 {
00262 
00263     if (HAS_SUFFIX(referenceFilename, ".fa"))
00264     {
00265         _referenceFilename = referenceFilename;
00266         _baseFilename = _referenceFilename.substr(0, referenceFilename.size() - 3);
00267     }
00268     else if (HAS_SUFFIX(referenceFilename, ".umfa"))
00269     {
00270         _baseFilename = referenceFilename.substr(0, referenceFilename.size() - 5);
00271     }
00272     else if (HAS_SUFFIX(referenceFilename, "-cs.umfa"))
00273     {
00274         _baseFilename = referenceFilename.substr(0, referenceFilename.size() - 8);
00275     }
00276     else if (HAS_SUFFIX(referenceFilename, "-bs.umfa"))
00277     {
00278         _baseFilename = referenceFilename.substr(0, referenceFilename.size() - 8);
00279     }
00280     else
00281     {
00282         _baseFilename = referenceFilename;
00283     }
00284     _fastaFilename = _baseFilename + ".fa";
00285 
00286     if (HAS_SUFFIX(referenceFilename, ".fasta"))
00287     {
00288         _referenceFilename = referenceFilename;
00289         _baseFilename = _referenceFilename.substr(0, referenceFilename.size() - 6);
00290         _fastaFilename = _baseFilename + ".fasta";        
00291     }
00292     
00293     return false;
00294 }
00295 
00296 //
00297 // this works in lockstep with ::create to populate
00298 // the per chromosome header fields size and md5
00299 // checksum.
00300 //
00301 // It relies on header->elementCount being set to
00302 // the data length loaded so far ... not the ultimate
00303 // reference length.
00304 //
00305 bool GenomeSequence::setChromosomeMD5andLength(uint32_t whichChromosome)
00306 {
00307     if (whichChromosome>=header->_chromosomeCount) return true;
00308 
00309     ChromosomeInfo *c = &header->_chromosomes[whichChromosome];
00310     c->size = header->elementCount - c->start;
00311 
00312     MD5_CTX md5Context;
00313     uint8_t md5Signature[MD5_DIGEST_LENGTH];
00314 
00315     //
00316     // it's easier to ensure we do this right if we just do it
00317     // in one big chunk:
00318     //
00319     char *md5Buffer = (char *) malloc(c->size);
00320 
00321     MD5Init(&md5Context);
00322 
00323     for (genomeIndex_t i = 0; i < c->size; i ++)
00324     {
00325         md5Buffer[i] = (*this)[c->start + i];
00326     }
00327     MD5Update(&md5Context, (unsigned char *) md5Buffer, c->size);
00328     MD5Final((unsigned char *) &md5Signature, &md5Context);
00329     free(md5Buffer);
00330     for (int i=0; i<MD5_DIGEST_LENGTH; i++)
00331     {
00332         // cheesy but works:
00333         sprintf(c->md5+2*i, "%02x", md5Signature[i]);
00334     }
00335     // redundant, strictly speaking due to sprintf NUL terminating
00336     // it's output strings, but put it here anyway.
00337     c->md5[2*MD5_DIGEST_LENGTH] = '\0';
00338 
00339     return false;
00340 }
00341 
00342 //
00343 // Given a buffer with a fasta format contents, count
00344 // the number of chromsomes in it and return that value.
00345 //
00346 static bool getFastaStats(const char *fastaData, size_t fastaDataSize, uint32_t &chromosomeCount, uint64_t &baseCount)
00347 {
00348     chromosomeCount = 0;
00349     baseCount = 0;
00350     bool atLineStart = true;
00351 
00352     //
00353     // loop over the fasta file, essentially matching for the
00354     // pattern '^>.*$' and counting them.
00355     //
00356     for (size_t fastaIndex = 0; fastaIndex < fastaDataSize; fastaIndex++)
00357     {
00358         switch (fastaData[fastaIndex])
00359         {
00360             case '\n':
00361             case '\r':
00362                 atLineStart = true;
00363                 break;
00364             case '>':
00365             {
00366                 if (!atLineStart) break;
00367                 chromosomeCount++;
00368                 //
00369                 // eat the rest of the line
00370                 //
00371                 while (fastaIndex < fastaDataSize && fastaData[fastaIndex]!='\n' && fastaData[fastaIndex]!='\r')
00372                 {
00373                     fastaIndex++;
00374                 }
00375                 break;
00376             }
00377             default:
00378                 baseCount++;
00379                 atLineStart = false;
00380                 break;
00381         }
00382 
00383     }
00384     return false;
00385 }
00386 
00387 class PackedSequenceData : public std::vector<uint8_t>
00388 {
00389     std::vector<uint8_t> m_packedBases;
00390 
00391     size_t    m_baseCount;
00392 
00393     void set(size_t index, uint8_t value) {
00394         m_packedBases[index>>1] =
00395             (m_packedBases[index>>1]            // original value
00396             & ~(7<<((index&0x01)<<2)))          // logical AND off the original value
00397             | ((value&0x0f)<<((index&0x1)<<2)); // logical OR in the new value
00398     }
00399 
00400 public:
00401 
00402     void reserve(size_t baseCount) {m_packedBases.reserve(baseCount/2);}
00403     size_t size() {return m_baseCount;}
00404     void clear() {m_packedBases.clear(); m_baseCount = 0;}
00405     uint8_t operator [](size_t index)
00406     {
00407         return (m_packedBases[index>>1] >> ((index&0x1)<<2)) & 0xf;
00408     }
00409     void push_back(uint8_t base);
00410 };
00411 
00412 //
00413 // Load a fasta format file from filename into the buffer
00414 // provided by the caller.
00415 // While parsing the fasta file, record each chromosome name,
00416 // its start location, and its size.
00417 //
00418 // NB: the caller must implement the logic to determine how
00419 // large the sequence data is.  There is no correct way to do
00420 // this, because we can't reliably estimate here how much sequence
00421 // data is contained in a compressed file.
00422 //
00423 // To safely pre-allocate space in sequenceData, use the reserve() method
00424 // before calling this function.
00425 //
00426 bool loadFastaFile(const char *filename,
00427         std::vector<PackedSequenceData> &sequenceData,
00428         std::vector<std::string> &chromosomeNames)
00429 {
00430     InputFile inputStream(filename, "r", InputFile::DEFAULT);
00431 
00432     if(!inputStream.isOpen()) {
00433         std::cerr << "Failed to open file " << filename << "\n";
00434         return true;
00435     }
00436 
00437     int whichChromosome = -1;
00438     chromosomeNames.clear();
00439 
00440     char ch;
00441     while((ch = inputStream.ifgetc()) != EOF) {
00442         switch (ch)
00443         {
00444             case '\n':
00445             case '\r':
00446                 break;
00447             case '>':
00448             {
00449                 std::string chromosomeName = "";
00450                 //
00451                 // pull out the chromosome new name
00452                 //
00453                 while (!isspace((ch = inputStream.ifgetc())) && ch != EOF)
00454                 {
00455                     chromosomeName += ch;  // slow, but who cares
00456                 }
00457                 //
00458                 // eat the rest of the line
00459                 //
00460                 do {
00461                     ch = inputStream.ifgetc();
00462                 } while(ch != EOF && ch != '\n' && ch != '\r');
00463                 //
00464                 // save the Chromosome name and index into our
00465                 // header so we can use them later.
00466                 //
00467                 chromosomeNames.push_back(chromosomeName);
00468 
00469                 whichChromosome++;
00470 
00471                 break;
00472             }
00473             default:
00474                 // we get here for sequence data.
00475                 //
00476                 // save the base value
00477                 // Note: invalid characters come here as well, but we
00478                 // let ::set deal with mapping them.
00479 #if 0
00480                 if (isColorSpace())
00481                 {
00482 //
00483 // anything outside these values represents an invalid base
00484 // base codes: 0-> A,    1-> C,     2-> G,      3-> T
00485 // colorspace: 0-> blue, 1-> green, 2-> oragne, 3->red
00486 //
00487                     const char fromBase2CS[] =
00488                     {
00489                         /* 0000 */ 0,   // A->A
00490                         /* 0001 */ 1,   // A->C
00491                         /* 0010 */ 2,   // A->G
00492                         /* 0011 */ 3,   // A->T
00493                         /* 0100 */ 1,   // C->A
00494                         /* 0101 */ 0,   // C->C
00495                         /* 0110 */ 3,   // C->G
00496                         /* 0111 */ 2,   // C->T
00497                         /* 1000 */ 2,   // G->A
00498                         /* 1001 */ 3,   // G->C
00499                         /* 1010 */ 0,   // G->G
00500                         /* 1011 */ 1,   // G->T
00501                         /* 1100 */ 3,   // T->A
00502                         /* 1101 */ 2,   // T->C
00503                         /* 1110 */ 1,   // T->G
00504                         /* 1111 */ 0,   // T->T
00505                     };
00506                     //
00507                     // we are writing color space values on transitions,
00508                     // so we don't write a colorspace value when we
00509                     // get the first base value.
00510                     //
00511                     // On second and subsequent bases, write based on
00512                     // the index table above
00513                     //
00514                     char thisBase = 
00515                         BaseAsciiMap::base2int[(int)(fasta[fastaIndex])];
00516                     if (lastBase>=0)
00517                     {
00518                         char color;
00519                         if (lastBase>3 || thisBase>3) color=4;
00520                         else color = fromBase2CS[(int)(lastBase<<2 | thisBase)];
00521                         // re-use the int to base, because ::set expects a base char (ATCG), not
00522                         // a color code (0123).  It should only matter on final output.
00523                         set(header->elementCount++, 
00524                             BaseAsciiMap::int2base[(int) color]);
00525                     }
00526                     lastBase = thisBase;
00527                 }
00528                 else
00529                 {
00530                     set(header->elementCount++, toupper(fasta[fastaIndex]));
00531                 }
00532 #endif
00533                 break;
00534         }
00535     }
00536     return false;
00537 }
00538 
00539 //
00540 // recreate the umfa file from a reference fasta format file
00541 //
00542 // The general format of a FASTA file is best described
00543 // on wikipedia at http://en.wikipedia.org/wiki/FASTA_format
00544 //
00545 // The format parsed here is a simpler subset, and is
00546 // described here http://www.ncbi.nlm.nih.gov/blast/fasta.shtml
00547 //
00548 bool GenomeSequence::create(bool isColor)
00549 {
00550     setColorSpace(isColor);
00551 
00552     if (_baseFilename=="")
00553     {
00554         std::cerr << "Base reference filename is empty." << std::endl;
00555         return true;
00556     }
00557 
00558     if (isColorSpace())
00559     {
00560         _umfaFilename = _baseFilename + "-cs.umfa";
00561     }
00562     else
00563     {
00564         _umfaFilename = _baseFilename + "-bs.umfa";
00565     }
00566 
00567     if (!_createOverwrite && access(_umfaFilename.c_str(), R_OK) == 0)
00568     {
00569         std::cerr << "Output file '" << _umfaFilename << "' exists or is not writable - please remove." << std::endl;
00570         return true;
00571     }
00572 
00573     MemoryMap fastaFile;
00574 
00575     if (fastaFile.open(_fastaFilename.c_str()))
00576     {
00577         std::cerr << "failed to open input fasta file '" << _fastaFilename << "'." << std::endl;
00578         return true;
00579     }
00580 
00581     std::cerr << "Creating FASTA "
00582               << (isColorSpace() ? "color space " : "")
00583               << "binary cache file '"
00584               << _umfaFilename
00585               << "'."
00586               << std::endl;
00587 
00588     std::cerr << std::flush;
00589 
00590     //
00591     // simple ptr to fasta data -- just treat the memory map
00592     // as an array of fastaDataSize characters...
00593     //
00594     const char *fasta = (const char *) fastaFile.data;
00595     size_t fastaDataSize = fastaFile.length();
00596 
00597     uint32_t    chromosomeCount = 0;
00598     uint64_t    baseCount = 0;
00599     getFastaStats(fasta, fastaDataSize, chromosomeCount, baseCount);
00600 
00601     if (genomeSequenceArray::create(_umfaFilename.c_str(), baseCount, chromosomeCount))
00602     {
00603         std::cerr << "failed to create '"
00604                   << _umfaFilename
00605                   << "'."
00606                   << std::endl;
00607         perror("");
00608         return true;
00609     }
00610     header->elementCount = 0;
00611     header->_colorSpace = isColorSpace();
00612     header->setApplication(_application.c_str());
00613     header->_chromosomeCount = chromosomeCount;
00614     //
00615     // clear out the variable length chromosome info array
00616     //
00617     for (uint32_t i=0; i<header->_chromosomeCount; i++) header->_chromosomes[i].constructorClear();
00618 
00619     std::string chromosomeName;
00620 
00621     //
00622     // for converting the reference to colorspace, the first base is always 5 (in base space it is 'N')
00623     signed char lastBase = BaseAsciiMap::base2int[(int) 'N'];
00624     bool terminateLoad = false;
00625     int percent = -1, newPercent;
00626     uint32_t whichChromosome = 0;
00627     for (uint64_t fastaIndex = 0; fastaIndex < fastaDataSize; fastaIndex++)
00628     {
00629         if (_progressStream)
00630         {
00631             newPercent = (int) (1.0 * fastaIndex / fastaDataSize) * 100;
00632             if (newPercent>percent)
00633             {
00634                 *_progressStream << "\r" << newPercent << "% ";
00635                 *_progressStream << std::flush;
00636                 percent = newPercent;
00637             }
00638         }
00639         switch (fasta[fastaIndex])
00640         {
00641             case '\n':
00642             case '\r':
00643                 break;
00644             case '>':
00645             {
00646                 chromosomeName = "";
00647                 fastaIndex++;       // skip the > char
00648                 //
00649                 // pull out the chromosome new name
00650                 //
00651                 while (!isspace(fasta[fastaIndex]))
00652                 {
00653                     chromosomeName += fasta[fastaIndex++];  // slow, but who cares
00654                 }
00655                 //
00656                 // eat the rest of the line
00657                 //
00658                 while (fasta[fastaIndex]!='\n' && fasta[fastaIndex]!='\r')
00659                 {
00660                     fastaIndex++;
00661                 }
00662                 //
00663                 // save the Chromosome name and index into our
00664                 // header so we can use them later.
00665                 //
00666                 ChromosomeInfo *c = &header->_chromosomes[whichChromosome];
00667                 c->setChromosomeName(chromosomeName.c_str());
00668                 c->start = header->elementCount;
00669                 // c->size gets computed at the next '>' line or at the EOF
00670 
00671                 if (whichChromosome>0)
00672                 {
00673                     //
00674                     // compute md5 checksum for the chromosome that we just
00675                     // loaded (if there was one) - note that on the last
00676                     // chromosome, we have to duplicate this code after
00677                     // the end of this loop
00678                     //
00679                     setChromosomeMD5andLength(whichChromosome - 1);
00680                 }
00681                 whichChromosome++;
00682                 if (whichChromosome > header->_chromosomeCount)
00683                 {
00684                     std::cerr << "BUG: Exceeded computed chromosome count ("
00685                               << header->_chromosomeCount
00686                               << ") - genome is now truncated at chromosome "
00687                               << header->_chromosomes[header->_chromosomeCount-1].name
00688                               << " (index "
00689                               << header->_chromosomeCount
00690                               << ")."
00691                               << std::endl;
00692                     terminateLoad = true;
00693                 }
00694                 break;
00695             }
00696             default:
00697                 // save the base pair value
00698                 // Note: invalid characters come here as well, but we
00699                 // let ::set deal with mapping them.
00700                 if (isColorSpace())
00701                 {
00702 //
00703 // anything outside these values represents an invalid base
00704 // base codes: 0-> A,    1-> C,     2-> G,      3-> T
00705 // colorspace: 0-> blue, 1-> green, 2-> oragne, 3->red
00706 //
00707                     const char fromBase2CS[] =
00708                     {
00709                         /* 0000 */ 0,   // A->A
00710                         /* 0001 */ 1,   // A->C
00711                         /* 0010 */ 2,   // A->G
00712                         /* 0011 */ 3,   // A->T
00713                         /* 0100 */ 1,   // C->A
00714                         /* 0101 */ 0,   // C->C
00715                         /* 0110 */ 3,   // C->G
00716                         /* 0111 */ 2,   // C->T
00717                         /* 1000 */ 2,   // G->A
00718                         /* 1001 */ 3,   // G->C
00719                         /* 1010 */ 0,   // G->G
00720                         /* 1011 */ 1,   // G->T
00721                         /* 1100 */ 3,   // T->A
00722                         /* 1101 */ 2,   // T->C
00723                         /* 1110 */ 1,   // T->G
00724                         /* 1111 */ 0,   // T->T
00725                     };
00726                     //
00727                     // we are writing color space values on transitions,
00728                     // so we don't write a colorspace value when we
00729                     // get the first base value.
00730                     //
00731                     // On second and subsequent bases, write based on
00732                     // the index table above
00733                     //
00734                     char thisBase = 
00735                         BaseAsciiMap::base2int[(int)(fasta[fastaIndex])];
00736                     if (lastBase>=0)
00737                     {
00738                         char color;
00739                         if (lastBase>3 || thisBase>3) color=4;
00740                         else color = fromBase2CS[(int)(lastBase<<2 | thisBase)];
00741                         // re-use the int to base, because ::set expects a base char (ATCG), not
00742                         // a color code (0123).  It should only matter on final output.
00743                         set(header->elementCount++, 
00744                             BaseAsciiMap::int2base[(int) color]);
00745                     }
00746                     lastBase = thisBase;
00747                 }
00748                 else
00749                 {
00750                     set(header->elementCount++, toupper(fasta[fastaIndex]));
00751                 }
00752                 break;
00753         }
00754 
00755         //
00756         // slightly awkward exit handling when we exceed the fixed
00757         // number of chromosomes
00758         //
00759         if (terminateLoad) break;
00760     }
00761 
00762     //
00763     // also slightly awkward code to handle the last dangling chromosome...
00764     // all we should need to do is compute the md5 checksum
00765     //
00766     if (whichChromosome==0)
00767     {
00768         fastaFile.close();
00769         throw std::runtime_error("No chromosomes found - aborting!");
00770     }
00771     else
00772     {
00773         setChromosomeMD5andLength(whichChromosome-1);
00774     }
00775 
00776 #if 0
00777     sanityCheck(fastaFile);
00778 #endif
00779     fastaFile.close();
00780 
00781     if (_progressStream) *_progressStream << "\r";
00782 
00783     std::cerr << "FASTA binary cache file '"
00784               << _umfaFilename
00785               << "' created."
00786               << std::endl;
00787 
00788     //
00789     // leave the umfastaFile open in case caller wants to use it
00790     //
00791     return false;
00792 }
00793 
00794 int GenomeSequence::getChromosomeCount() const
00795 {
00796     return header->_chromosomeCount;
00797 }
00798 
00799 //return chromosome index: 0, 1, ... 24;
00800 int GenomeSequence::getChromosome(genomeIndex_t position) const
00801 {
00802     if (position == INVALID_GENOME_INDEX) return INVALID_CHROMOSOME_INDEX;
00803 
00804     if (header->_chromosomeCount == 0)
00805         return INVALID_CHROMOSOME_INDEX;
00806 
00807     int start = 0;
00808     int stop = header->_chromosomeCount - 1;
00809 
00810     // eliminate case where position is in the last chromosome, since the loop
00811     // below falls off the end of the list if it in the last one.
00812 
00813     if (position > header->_chromosomes[stop].start)
00814         return (stop);
00815 
00816     while (start <= stop)
00817     {
00818         int middle = (start + stop) / 2;
00819 
00820         if (position >= header->_chromosomes[middle].start && position < header->_chromosomes[middle + 1].start)
00821             return middle;
00822 
00823         if (position == header->_chromosomes[middle + 1].start)
00824             return (middle + 1);
00825 
00826         if (position > header->_chromosomes[middle + 1].start)
00827             start = middle + 1;
00828 
00829         if (position < header->_chromosomes[middle].start)
00830             stop = middle - 1;
00831     }
00832 
00833     return -1;
00834 }
00835 
00836 //
00837 // Given a chromosome name and 1-based chromosome index, return the
00838 // genome index (0 based) into sequence for it.
00839 //
00840 // NB: the header->chromosomes array contains zero based genome positions
00841 //
00842 genomeIndex_t GenomeSequence::getGenomePosition(
00843     const char *chromosomeName,
00844     unsigned int chromosomeIndex) const
00845 {
00846     genomeIndex_t i = getGenomePosition(chromosomeName);
00847     if (i == INVALID_GENOME_INDEX) return INVALID_GENOME_INDEX;
00848     return i + chromosomeIndex - 1;
00849 }
00850 
00851 genomeIndex_t GenomeSequence::getGenomePosition(
00852     int chromosome,
00853     unsigned int chromosomeIndex) const
00854 {
00855     if (chromosome<0 || chromosome >= (int) header->_chromosomeCount) return INVALID_GENOME_INDEX;
00856 
00857     genomeIndex_t i = header->_chromosomes[chromosome].start;
00858     if (i == INVALID_GENOME_INDEX) return INVALID_GENOME_INDEX;
00859     return i + chromosomeIndex - 1;
00860 }
00861 
00862 //
00863 // return the genome index (0 based) of the start of the named
00864 // chromosome.  If none is found, INVALID_GENOME_INDEX is returned.
00865 //
00866 // XXX may need to speed this up - and smarten it up with some
00867 // modest chromosome name parsing.... e.g. '%d/X/Y' or 'chr%d/chrX/chrY' or
00868 // other schemes.
00869 //
00870 genomeIndex_t GenomeSequence::getGenomePosition(const char *chromosomeName) const
00871 {
00872     int chromosome = getChromosome(chromosomeName);
00873     if (chromosome==INVALID_CHROMOSOME_INDEX) return INVALID_GENOME_INDEX;
00874     return header->_chromosomes[chromosome].start;
00875 }
00876 
00877 int GenomeSequence::getChromosome(const char *chromosomeName) const
00878 {
00879     unsigned int i;
00880     for (i=0; i<header->_chromosomeCount; i++)
00881     {
00882         if (strcmp(header->_chromosomes[i].name, chromosomeName)==0)
00883         {
00884             return i;
00885         }
00886     }
00887     return INVALID_CHROMOSOME_INDEX;
00888 }
00889 
00890 //
00891 // Given a read, reverse the string and swap the base
00892 // pairs for the reverse strand equivalents.
00893 //
00894 void GenomeSequence::getReverseRead(std::string &read)
00895 {
00896     std::string newRead;
00897     if (read.size()) for (int32_t i=(int) read.size() - 1; i>=0; i--)
00898         {
00899             newRead.push_back(BasePair(read[i]));
00900         }
00901     read = newRead;
00902 }
00903 
00904 void GenomeSequence::getReverseRead(String& read)
00905 {
00906     int i = 0;
00907     int j = read.Length()-1;
00908     char temp;
00909     while (i < j)
00910     {
00911         temp = read[j];
00912         read[j] = read[i];
00913         read[i] = temp;
00914     }
00915 }
00916 
00917 #define ABS(x) ( (x) > 0 ? (x) : -(x) )
00918 int GenomeSequence::debugPrintReadValidation(
00919     std::string &read,
00920     std::string &quality,
00921     char   direction,
00922     genomeIndex_t   readLocation,
00923     int sumQuality,
00924     int mismatchCount,
00925     bool recurse
00926 ) 
00927 {
00928     int validateSumQ = 0;
00929     int validateMismatchCount = 0;
00930     int rc = 0;
00931     std::string genomeData;
00932 
00933     for (uint32_t i=0; i<read.size(); i++)
00934     {
00935         if (tolower(read[i]) != tolower((*this)[readLocation + i]))
00936         {
00937             validateSumQ += quality[i] - '!';
00938             // XXX no longer valid:
00939             if (direction=='F' ? i<24 : (i >= (read.size() - 24))) validateMismatchCount++;
00940             genomeData.push_back(tolower((*this)[readLocation + i]));
00941         }
00942         else
00943         {
00944             genomeData.push_back(toupper((*this)[readLocation + i]));
00945         }
00946     }
00947     assert(validateSumQ>=0);
00948     if (validateSumQ != sumQuality && validateMismatchCount == mismatchCount)
00949     {
00950         printf("SUMQ: Original Genome: %s  test read: %s : actual sumQ = %d, test sumQ = %d\n",
00951                genomeData.c_str(),
00952                read.c_str(),
00953                validateSumQ,
00954                sumQuality
00955               );
00956         rc++;
00957     }
00958     else if (validateSumQ == sumQuality && validateMismatchCount != mismatchCount)
00959     {
00960         printf("MISM: Original Genome: %s  test read: %s : actual mismatch %d test mismatches %d\n",
00961                genomeData.c_str(),
00962                read.c_str(),
00963                validateMismatchCount,
00964                mismatchCount
00965               );
00966         rc++;
00967     }
00968     else if (validateSumQ != sumQuality && validateMismatchCount != mismatchCount)
00969     {
00970         printf("BOTH: Original Genome: %s  test read: %s : actual sumQ = %d, test sumQ = %d, actual mismatch %d test mismatches %d\n",
00971                genomeData.c_str(),
00972                read.c_str(),
00973                validateSumQ,
00974                sumQuality,
00975                validateMismatchCount,
00976                mismatchCount
00977               );
00978         rc++;
00979     }
00980 
00981     if (recurse && ABS(validateMismatchCount - mismatchCount) > (int) read.size()/2)
00982     {
00983         printf("large mismatch difference, trying reverse strand: ");
00984         std::string reverseRead = read;
00985         std::string reverseQuality = quality;
00986         getReverseRead(reverseRead);
00987         reverse(reverseQuality.begin(), reverseQuality.end());
00988         rc = debugPrintReadValidation(reverseRead, reverseQuality, readLocation, sumQuality, mismatchCount, false);
00989     }
00990     return rc;
00991 }
00992 #undef ABS
00993 
00994 
00995 bool GenomeSequence::wordMatch(unsigned int index, std::string &word) const
00996 {
00997     for (uint32_t i = 0; i<word.size(); i++)
00998     {
00999         if ((*this)[index + i] != word[i]) return false;
01000     }
01001     return true;
01002 }
01003 
01004 bool GenomeSequence::printNearbyWords(unsigned int index, unsigned int deviation, std::string &word) const
01005 {
01006     for (unsigned int i = index - deviation; i < index + deviation; i++)
01007     {
01008         if (wordMatch(i, word))
01009         {
01010             std::cerr << "word '"
01011                       << word
01012                       << "' found "
01013                       << i - index
01014                       << " away from position "
01015                       << index
01016                       << "."
01017                       << std::endl;
01018         }
01019     }
01020     return false;
01021 }
01022 
01023 void GenomeSequence::dumpSequenceSAMDictionary(std::ostream &file) const
01024 {
01025     for (unsigned int i=0; i<header->_chromosomeCount; i++)
01026     {
01027         file
01028         <<  "@SQ"
01029         << "\tSN:" << header->_chromosomes[i].name  // name
01030         << "\tLN:" << header->_chromosomes[i].size  // number of bases
01031         << "\tAS:" << header->_chromosomes[i].assemblyID // e.g. NCBI36.3
01032         << "\tM5:" << header->_chromosomes[i].md5
01033         << "\tUR:" << header->_chromosomes[i].uri
01034         << "\tSP:" << header->_chromosomes[i].species // e.g. Homo_sapiens
01035         << std::endl;
01036     }
01037 }
01038 
01039 void GenomeSequence::dumpHeaderTSV(std::ostream &file) const
01040 {
01041     file << "# Reference: " << _baseFilename << std::endl;
01042     file << "# SN: sample name - must be unique" << std::endl;
01043     file << "# AS: assembly name" << std::endl;
01044     file << "# SP: species" << std::endl;
01045     file << "# LN: chromosome/contig length" << std::endl;
01046     file << "# M5: chromosome/contig MD5 checksum" << std::endl;
01047     file << "# LN and M5 are only printed for informational purposes." << std::endl;
01048     file << "# Karma will only set those values when creating the index." << std::endl;
01049     file << "SN" << "\t" << "AS" << "\t" << "SP" << "\t" << "UR" << "\t" << "LN" << "\t" << "M5" << std::endl;
01050     for (unsigned int i=0; i<header->_chromosomeCount; i++)
01051     {
01052         file
01053         << header->_chromosomes[i].name  // name
01054         << "\t" << header->_chromosomes[i].assemblyID // e.g. NCBI36.3
01055         << "\t" << header->_chromosomes[i].uri
01056         << "\t" << header->_chromosomes[i].species // e.g. Homo_sapiens
01057         << "\t" << header->_chromosomes[i].size  // number of bases
01058         << "\t" << header->_chromosomes[i].md5
01059         << std::endl;
01060     }
01061 }
01062 
01063 
01064 void GenomeSequence::getString(std::string &str, int chromosome, uint32_t index, int baseCount) const
01065 {
01066     //
01067     // calculate the genome index for the lazy caller...
01068     //
01069     genomeIndex_t genomeIndex = header->_chromosomes[chromosome].start + index - 1;
01070 
01071     getString(str, genomeIndex, baseCount);
01072 }
01073 
01074 void GenomeSequence::getString(String &str, int chromosome, uint32_t index, int baseCount) const
01075 {
01076     std::string string;
01077     this-> getString(string, chromosome, index, baseCount);
01078     str = string.c_str();
01079 }
01080 
01081 void GenomeSequence::getString(std::string &str, genomeIndex_t index, int baseCount) const
01082 {
01083     str.clear();
01084     if (baseCount > 0)
01085     {
01086         for (int i=0; i<baseCount; i++)
01087         {
01088             str.push_back((*this)[index + i]);
01089         }
01090     }
01091     else
01092     {
01093         // if caller passed negative basecount, give them
01094         // the read for the 3' end
01095         for (int i=0; i< -baseCount; i++)
01096         {
01097             str.push_back(BaseAsciiMap::base2complement[(int)(*this)[index + i]]);
01098         }
01099     }
01100 }
01101 
01102 void GenomeSequence::getString(String &str, genomeIndex_t index, int baseCount) const
01103 {
01104     std::string string;
01105     getString(string, index, baseCount);
01106     str = string.c_str();
01107 }
01108 
01109 void GenomeSequence::getHighLightedString(std::string &str, genomeIndex_t index, int baseCount, genomeIndex_t highLightStart, genomeIndex_t highLightEnd) const
01110 {
01111     str.clear();
01112     if (baseCount > 0)
01113     {
01114         for (int i=0; i<baseCount; i++)
01115         {
01116             char base = (*this)[index + i];
01117             if (in(index+i, highLightStart, highLightEnd))
01118                 base = tolower(base);
01119             str.push_back(base);
01120         }
01121     }
01122     else
01123     {
01124         // if caller passed negative basecount, give them
01125         // the read for the 3' end
01126         for (int i=0; i< -baseCount; i++)
01127         {
01128             char base = BaseAsciiMap::base2complement[(int)(*this)[index + i]];
01129             if (in(index+i, highLightStart, highLightEnd))
01130                 base = tolower(base);
01131             str.push_back(base);
01132         }
01133     }
01134 }
01135 
01136 void GenomeSequence::print30(genomeIndex_t index) const
01137 {
01138     std::cout << "index: " << index << "\n";
01139     for (genomeIndex_t i=index-30; i<index+30; i++)
01140         std::cout << (*this)[i];
01141     std::cout << "\n";
01142     for (genomeIndex_t i=index-30; i<index; i++)
01143         std::cout << " ";
01144     std::cout << "^";
01145     std::cout << std::endl;
01146 }
01147 
01148 void GenomeSequence::getMismatchHatString(std::string &result, const std::string &read, genomeIndex_t location) const
01149 {
01150     result = "";
01151     for (uint32_t i=0; i < read.size(); i++)
01152     {
01153         if (read[i] == (*this)[location+i])
01154             result.push_back(' ');
01155         else
01156             result.push_back('^');
01157     }
01158 }
01159 
01160 void GenomeSequence::getMismatchString(std::string &result, const std::string &read, genomeIndex_t location) const
01161 {
01162     result = "";
01163     for (uint32_t i=0; i < read.size(); i++)
01164     {
01165         if (read[i] == (*this)[location+i])
01166             result.push_back(toupper(read[i]));
01167         else
01168             result.push_back(tolower(read[i]));
01169     }
01170 }
01171 
01172 genomeIndex_t GenomeSequence::simpleLocalAligner(std::string &read, std::string &quality, genomeIndex_t index, int windowSize) const
01173 {
01174     int bestScore = 1000000; // either mismatch count or sumQ
01175     genomeIndex_t bestMatchLocation = INVALID_GENOME_INDEX;
01176     for (int i=-windowSize; i<windowSize; i++)
01177     {
01178         int newScore;
01179 
01180         if (i<0 && ((uint32_t) -i) > index) continue;
01181         if (index + i + read.size() >= getNumberBases()) continue;
01182 
01183         if (quality=="")
01184         {
01185             newScore = this->getMismatchCount(read, index + i);
01186         }
01187         else
01188         {
01189             newScore = this->getSumQ(read, quality, index + i);
01190         }
01191         if (newScore < bestScore)
01192         {
01193             bestScore = newScore;
01194             bestMatchLocation = index + i;
01195         }
01196     }
01197     return bestMatchLocation;
01198 }
01199 
01200 std::ostream &operator << (std::ostream &stream, genomeSequenceMmapHeader &h)
01201 {
01202     stream << (MemoryMapArrayHeader &) h;
01203     stream << "chromosomeCount: " << h._chromosomeCount << std::endl;
01204     stream << "isColorSpace: " << h._colorSpace << std::endl;
01205     stream << "chromosomeCount: " << h._chromosomeCount << std::endl;
01206     uint64_t totalSize = 0;
01207     for (uint32_t i=0; i < h._chromosomeCount; i++)
01208     {
01209         totalSize += h._chromosomes[i].size;
01210         stream << "Chromosome Index " << i << " name: " << h._chromosomes[i].name << std::endl;
01211         stream << "Chromosome Index " << i << " whole genome start: " << h._chromosomes[i].start << std::endl;
01212         stream << "Chromosome Index " << i << " whole genome size: " << h._chromosomes[i].size << std::endl;
01213         stream << "Chromosome Index " << i << " md5 checksum: " << h._chromosomes[i].md5 << std::endl;
01214         stream << "Chromosome Index " << i << " assemblyID: " << h._chromosomes[i].assemblyID << std::endl;
01215         stream << "Chromosome Index " << i << " species: " << h._chromosomes[i].species << std::endl;
01216         stream << "Chromosome Index " << i << " URI: " << h._chromosomes[i].uri << std::endl;
01217     }
01218     stream << "Total Genome Size: " << totalSize << " bases."<< std::endl;
01219     if (totalSize != h.elementCount)
01220     {
01221         stream << "Total Genome Size: does not match elementCount!\n";
01222     }
01223 
01224     stream << std::endl;
01225     return stream;
01226 }
01227 
01228 void GenomeSequence::getChromosomeAndIndex(std::string &s, genomeIndex_t i) const
01229 {
01230     int whichChromosome = 0;
01231 
01232     whichChromosome = getChromosome(i);
01233 
01234     if (whichChromosome == INVALID_CHROMOSOME_INDEX)
01235     {
01236         s = "invalid genome index";     // TODO include the index in error
01237     }
01238     else
01239     {
01240         std::ostringstream buf;
01241         genomeIndex_t   chromosomeIndex = i - getChromosomeStart(whichChromosome) + 1;
01242         buf << header->_chromosomes[whichChromosome].name  << ":"  << chromosomeIndex;
01243 #if 0
01244         buf << " (GenomeIndex " << i << ")";
01245 #endif
01246         s = buf.str();
01247     }
01248     return;
01249 }
01250 
01251 
01252 void GenomeSequence::getChromosomeAndIndex(String& s, genomeIndex_t i) const
01253 {
01254     std::string ss;
01255     getChromosomeAndIndex(ss, i);
01256     s = ss.c_str();
01257     return;
01258 }
01259 
01260 
01261 //
01262 // This is intended to be a helper routine to get dbSNP files
01263 // loaded.  In some cases, we will load into an mmap() file (ie
01264 // when we are creating it), in others, we will simply be loading
01265 // an existing dbSNP file into RAM (when the binary file does not
01266 // exist or when we are running with useMemoryMapFlag == false.
01267 //
01268 // Assume that dbSNP exists, is writable, and is the right size.
01269 //
01270 // Using the dbSNPFilename given, mark each dbSNP position
01271 // with a bool true.
01272 //
01273 // Return value:
01274 //   True: if populateDBSNP() succeed
01275 //   False: if not succeed
01276 bool GenomeSequence::populateDBSNP(
01277     mmapArrayBool_t &dbSNP,
01278     IFILE inputFile) const
01279 {
01280     assert(dbSNP.getElementCount() == getNumberBases());
01281 
01282     if(inputFile == NULL)
01283     {
01284         // FAIL, file not opened.
01285         return(false);
01286     }
01287 
01288     std::string chromosomeName;
01289     std::string position;
01290     genomeIndex_t chromosomePosition1;  // 1-based
01291     uint64_t    ignoredLineCount = 0;
01292 
01293     // Read til the end of the file.
01294     char* postPosPtr = NULL;
01295     while(!inputFile->ifeof())
01296     {
01297         chromosomeName.clear();
01298         position.clear();
01299         // Read the chromosome
01300         if(inputFile->readTilTab(chromosomeName) <= 0)
01301         {
01302             // hit either eof or end of line, check if
01303             // it is a header.
01304             if(chromosomeName.size()>0 && chromosomeName[0]=='#')
01305             {
01306                 // header, so just continue.
01307                 continue;
01308             }
01309             // Not the header, so this line is poorly formatted.
01310             ++ignoredLineCount;
01311             // Continue to the next line.
01312             continue;
01313         }
01314 
01315         // Check if it is a header line.
01316         if(chromosomeName.size()>0 && chromosomeName[0]=='#')
01317         {
01318             // did not hit eof or end of line, 
01319             // so discard the rest of the line.
01320             inputFile->discardLine();
01321             continue;
01322         }
01323 
01324         // Not a header, so read the position.
01325         if(inputFile->readTilTab(position) > 0)
01326         {
01327             // Additional data on the line, so discard it.
01328             inputFile->discardLine();
01329         }
01330 
01331         // Convert the position to a string.
01332         chromosomePosition1 = strtoul(position.c_str(), &postPosPtr, 0);
01333 
01334 
01335         if(postPosPtr == position.c_str())
01336         {
01337             ++ignoredLineCount;
01338             continue;
01339         }
01340 
01341         // 1-based genome index.
01342         genomeIndex_t genomeIndex = 
01343             getGenomePosition(chromosomeName.c_str(), chromosomePosition1);
01344 
01345         // if the genome index is invalid, ignore it
01346         if((genomeIndex == INVALID_GENOME_INDEX) || 
01347            (genomeIndex > getNumberBases()))
01348         {
01349             ignoredLineCount++;
01350             continue;
01351         }
01352 
01353         dbSNP.set(genomeIndex, true);
01354     }
01355 
01356     if (ignoredLineCount > 0)
01357     {
01358         std::cerr << "GenomeSequence::populateDBSNP: ignored " << ignoredLineCount << " SNP positions due to invalid format of line." << std::endl;
01359         return false;
01360     }
01361     return true;
01362 }
01363 
01364 bool GenomeSequence::loadDBSNP(
01365     mmapArrayBool_t &dbSNP,
01366     const char *inputFileName) const
01367 {
01368     //
01369     // the goal in this section of code is to allow the user
01370     // to either specify a valid binary version of the SNP file,
01371     // or the original text file that it gets created from.
01372     //
01373     // To do this, we basically open, sniff the error message,
01374     // and if it claims it is not a binary version of the file,
01375     // we go ahead and treat it as the text file and use the
01376     // GenomeSequence::populateDBSNP method to load it.
01377     //
01378     // Further checking is really needed to ensure users don't
01379     // mix a dbSNP file for a different reference, since it is really
01380     // easy to do.
01381     //
01382     if (strlen(inputFileName)!=0)
01383     {
01384         std::cerr << "Load dbSNP file '" << inputFileName << "': " << std::flush;
01385 
01386         if (dbSNP.open(inputFileName, O_RDONLY))
01387         {
01388             //
01389             // failed to open, possibly due to bad magic.
01390             //
01391             // this is really awful ... need to have a return
01392             // code that is smart enough to avoid this ugliness:
01393             //
01394             if (dbSNP.getErrorString().find("wrong type of file")==std::string::npos)
01395             {
01396                 std::cerr << "Error: " << dbSNP.getErrorString() << std::endl;
01397                 exit(1);
01398             }
01399             //
01400             // we have a file, assume we can load it as a text file
01401             //
01402             IFILE inputFile = ifopen(inputFileName, "r");
01403             if(inputFile == NULL)
01404             {
01405                 std::cerr << "Error: failed to open " << inputFileName << std::endl;
01406                 exit(1);
01407             }
01408 
01409             std::cerr << "(as text file) ";
01410 
01411             // anonymously (RAM resident only) create:
01412             dbSNP.create(getNumberBases());
01413 
01414             // now load it into RAM
01415             populateDBSNP(dbSNP, inputFile);
01416             ifclose(inputFile);
01417 
01418         }
01419         else
01420         {
01421             std::cerr << "(as binary mapped file) ";
01422         }
01423 
01424         std::cerr << "DONE!" << std::endl;
01425         return false;
01426     }
01427     else
01428     {
01429         return true;
01430     }
01431 }
01432 
01433 
01434 #if defined(TEST)
01435 
01436 void simplestExample(void)
01437 {
01438     GenomeSequence  reference;
01439     genomeIndex_t   index;
01440 
01441     // a particular reference is set by:
01442     // reference.setFastaName("/usr/cluster/share/karma/human_g1k_v37_12CS.fa")
01443     //
01444     // In the above example, the suffix .fa is stripped and replaced with .umfa,
01445     // which contains the actual file being opened.
01446     //
01447     if (reference.open())
01448     {
01449         perror("GenomeSequence::open");
01450         exit(1);
01451     }
01452 
01453 
01454     index = 1000000000; // 10^9
01455     //
01456     // Write the base at the given index.  Here, index is 0 based,
01457     // and is across the whole genome, as all chromosomes are sequentially
01458     // concatenated, so the allowed range is
01459     //
01460     // 0.. (reference.getChromosomeStart(last) + reference.getChromosomeSize(last))
01461     //
01462     // (where int last = reference.getChromosomeCount() - 1;)
01463     //
01464     std::cout << "base[" << index << "] = " << reference[index] << std::endl;
01465 
01466     //
01467     // Example for finding chromosome and one based chromosome position given
01468     // and absolute position on the genome in 'index':
01469     //
01470     int chr = reference.getChromosome(index);
01471     genomeIndex_t   chrIndex = index - reference.getChromosomeStart(chr) + 1;   // 1-based
01472 
01473     std::cout << "genome index " << index << " corresponds to chromosome " << chr << " position " << chrIndex << std::endl;
01474 
01475     //
01476     // Example for finding an absolute genome index position when the
01477     // chromosome name and one based position are known:
01478     //
01479     const char *chromosomeName = "5";
01480     chr = reference.getChromosome(chromosomeName);     // 0-based
01481     chrIndex = 100000;                      // 1-based
01482 
01483     index = reference.getChromosomeStart(chr) + chrIndex - 1;
01484 
01485     std::cout << "Chromosome '" << chromosomeName << "' position " << chrIndex << " corresponds to genome index position " << index << std::endl;
01486 
01487     reference.close();
01488 }
01489 
01490 void testGenomeSequence(void)
01491 {
01492     GenomeSequence reference;
01493 
01494 #if 0
01495     std::string referenceName = "someotherreference";
01496     if (reference.setFastaName(referenceName))
01497     {
01498         std::cerr << "failed to open reference file "
01499                   << referenceName
01500                   << std::endl;
01501         exit(1);
01502     }
01503 #endif
01504 
01505     std::cerr << "open and prefetch the reference genome: ";
01506 
01507     // open it
01508     if (reference.open())
01509     {
01510         exit(1);
01511     }
01512     std::cerr << "done!" << std::endl;
01513 
01514     //
01515     // For the human genome, genomeIndex ranges from 0 to 3.2x10^9
01516     //
01517     genomeIndex_t   genomeIndex;    // 0 based
01518     unsigned int chromosomeIndex;   // 1 based
01519     unsigned int chromosome;        // 0..23 or so
01520     std::string chromosomeName;
01521 
01522     //
01523     // Here we'll start with a chromosome name, then obtain the genome
01524     // index, and use it to find the base we want:
01525     //
01526     chromosomeName = "2";
01527     chromosomeIndex = 1234567;
01528     // this call is slow (string search for chromsomeName):
01529     genomeIndex = reference.getGenomePosition(chromosomeName.c_str(), chromosomeIndex);
01530     assert(genomeIndex!=INVALID_GENOME_INDEX);
01531     std::cout << "Chromosome " << chromosomeName << ", index ";
01532     std::cout << chromosomeIndex << " contains base " << reference[genomeIndex];
01533     std::cout << " at genome index position " << genomeIndex << std::endl;
01534 
01535     //
01536     // now reverse it - given a genomeIndex from above, find the chromosome
01537     // name and index:
01538     //
01539 
01540     // slow (binary search on genomeIndex):
01541     chromosome = reference.getChromosome(genomeIndex);
01542     unsigned int newChromosomeIndex;
01543     // not slow:
01544     newChromosomeIndex = genomeIndex - reference.getChromosomeStart(chromosome) + 1;
01545 
01546     assert(chromosomeIndex == newChromosomeIndex);
01547 
01548     // more testing... at least test and use PackedRead:
01549     //
01550     PackedRead pr;
01551 
01552     pr.set("ATCGATCG", 0);
01553     assert(pr.size()==8);
01554     assert(pr[0]==BaseAsciiMap::base2int[(int) 'A']);
01555     assert(pr[1]==BaseAsciiMap::base2int[(int) 'T']);
01556     assert(pr[2]==BaseAsciiMap::base2int[(int) 'C']);
01557     assert(pr[3]==BaseAsciiMap::base2int[(int) 'G']);
01558     pr.set("ATCGATCG", 1);
01559     assert(pr.size()==9);
01560     pr.set("", 0);
01561     assert(pr.size()==0);
01562     pr.set("", 1);
01563     assert(pr.size()==1);
01564     pr.set("", 2);
01565     assert(pr.size()==2);
01566     pr.set("", 3);
01567     assert(pr.size()==3);
01568     assert(pr[0]==BaseAsciiMap::base2int[(int) 'N']);
01569     assert(pr[1]==BaseAsciiMap::base2int[(int) 'N']);
01570     assert(pr[2]==BaseAsciiMap::base2int[(int) 'N']);
01571     pr.set("C", 1);
01572     assert(pr.size()==2);
01573     assert(pr[0]==BaseAsciiMap::base2int[(int) 'N']);
01574     assert(pr[1]==BaseAsciiMap::base2int[(int) 'C']);
01575 
01576 }
01577 
01578 //
01579 // After I build libcsg, I compile and run this test code using:
01580 //
01581 // g++ -DTEST -o try GenomeSequence.cpp -L. -lcsg -lm -lz -lssl
01582 // you also may need -fno-rtti
01583 // ./try
01584 //
01585 int main(int argc, const char **argv)
01586 {
01587 #if 1
01588     simplestExample();
01589 #else
01590     testGenomeSequence();
01591 #endif
01592     exit(0);
01593 }
01594 
01595 #endif