GenomeSequence.cpp

00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #include "assert.h"
00019 #include "ctype.h"
00020 #include "stdio.h"
00021 #include "zlib.h"
00022 #include "Error.h"
00023 
00024 
00025 #include "Generic.h"
00026 #include "GenomeSequence.h"
00027 
00028 #include <algorithm>
00029 #include <istream>
00030 #include <fstream>
00031 #include <sstream>
00032 #include <stdexcept>
00033 
00034 
00035 //
00036 // Map ASCII values to a 2 (or 3) bit encoding for the base pair value
00037 //  class 0 -> 'A' (Adenine - 0x41 and 0x61)
00038 //  class 1 -> 'C' (Cytosine - 0x43 and 0x63)
00039 //  class 2 -> 'G' (Guanine - 0x47 and 0x67)
00040 //  class 3 -> 'T' (Thymine - 0x54 and 0x74)
00041 //  class 4 -> 'N' (Unknown - read error or incomplete data - 0x4E and 0x6E)
00042 //  class 5 -> not a valid DNA base pair character
00043 //
00044 // Note: The +1 array size is for the terminating NUL character
00045 //
00046 // NB: This table also maps 0, 1, 2, and 3 to the corresponding integers,
00047 // and '.' to class 4.  This allows ABI SOLiD reads to be converted
00048 // to integers via ReadIndexer::Word2Integer.
00049 //
00050 unsigned char GenomeSequence::base2int[256+1] =
00051     "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005"  // 0x00-0x0F
00052     "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005"  // 0x10-0x1F
00053     "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\004\005"  // 0x20-0x2F
00054     "\000\001\002\003\005\005\005\005\005\005\005\005\005\005\005\005"  // 0x30-0x3F
00055     "\005\000\005\001\005\005\005\002\005\005\005\005\005\005\004\005"  // 0x40-0x4F
00056     "\005\005\005\005\003\005\005\005\005\005\005\005\005\005\005\005"  // 0x50-0x5F
00057     "\005\000\005\001\005\005\005\002\005\005\005\005\005\005\004\005"  // 0x60-0x6F
00058     "\005\005\005\005\003\005\005\005\005\005\005\005\005\005\005\005"  // 0x70-0x7F
00059 // not used, but included for completeness:
00060     "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005"  // 0x80-0x8F
00061     "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005"  // 0x90-0x9F
00062     "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005"  // 0xA0-0xAF
00063     "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005"  // 0xB0-0xBF
00064     "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005"  // 0xC0-0xCF
00065     "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005"  // 0xD0-0xDF
00066     "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005"  // 0xE0-0xEF
00067     "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005"  // 0xF0-0xFF
00068     ;
00069 
00070 //
00071 // This is obviously for base space use only:
00072 //
00073 const char GenomeSequence::int2base[] = "ACGTNMXXXXXXXXXX";
00074 //
00075 // convert int to color space value
00076 //
00077 const char GenomeSequence::int2colorSpace[] = "0123NXXXXXXXXXXX";
00078 
00079 //
00080 // This table maps 5' base space to the 3' complement base space
00081 // values, as well as 5' color space values to the corresponding
00082 // 3' complement color space values.
00083 //
00084 // In both cases, invalids are mapped to 'N', which isn't accurate
00085 // for ABI SOLiD, but internally it shouldn't matter (on output it
00086 // will).
00087 //
00088 unsigned char GenomeSequence::base2complement[256+1 /* for NUL char */] =
00089     "NNNNNNNNNNNNNNNN"  // 0x00-0x0F
00090     "NNNNNNNNNNNNNNNN"  // 0x10-0x1F
00091     "NNNNNNNNNNNNNNNN"  // 0x20-0x2F
00092     "0123NNNNNNNNNNNN"  // 0x30-0x3F
00093     "NTNGNNNCNNNNNNNN"  // 0x40-0x4F
00094     "NNNNANNNNNNNNNNN"  // 0x50-0x5F
00095     "NTNGNNNCNNNNNNNN"  // 0x60-0x6F
00096     "NNNNANNNNNNNNNNN"  // 0x70-0x7F
00097 // not used, but included for completeness:
00098     "NNNNNNNNNNNNNNNN"  // 0x80-0x8F
00099     "NNNNNNNNNNNNNNNN"  // 0x90-0x9F
00100     "NNNNNNNNNNNNNNNN"  // 0xA0-0xAF
00101     "NNNNNNNNNNNNNNNN"  // 0xB0-0xBF
00102     "NNNNNNNNNNNNNNNN"  // 0xC0-0xCF
00103     "NNNNNNNNNNNNNNNN"  // 0xD0-0xDF
00104     "NNNNNNNNNNNNNNNN"  // 0xE0-0xEF
00105     "NNNNNNNNNNNNNNNN"  // 0xF0-0xFF
00106     ;
00107 
00108 //
00109 // given a read in a string, pack it into the vector of
00110 // bytes coded as two bases per byte.
00111 //
00112 // The goal is to allow us to more rapidly compare against
00113 // the genome, which is itself packed 2 bases per byte.
00114 //
00115 // Unfortunately, the match position may be odd or even,
00116 // so as a result, we also need to be able to prepad
00117 // with 'N' bases to get the byte alignment the same, so
00118 // padWithNCount may be a positive number indicating how
00119 // many N bases to prepend.
00120 //
00121 void PackedRead::set(const char *rhs, int padWithNCount)
00122 {
00123     clear();
00124 
00125     // pad this packed read with 'N' bases two at a time
00126     while (padWithNCount>1)
00127     {
00128         packedBases.push_back(
00129             GenomeSequence::base2int[(int) 'N'] << 4 |
00130             GenomeSequence::base2int[(int) 'N']
00131         );
00132         padWithNCount -= 2;
00133         length+=2;
00134     }
00135 
00136     // when we have only one base, pack one 'N' base with
00137     // the first base in rhs if there is one.
00138     if (padWithNCount)
00139     {
00140         // NB: *rhs could be NUL, which is ok here - just keep
00141         // the length straight.
00142         packedBases.push_back(
00143             GenomeSequence::base2int[(int) *rhs] << 4 |
00144             GenomeSequence::base2int[(int) 'N']
00145         );
00146         // two cases - have characters in rhs or we don't:
00147         if (*rhs)
00148         {
00149             length+=2;  // pad byte plus one byte from rhs
00150             rhs++;
00151         }
00152         else
00153         {
00154             length++;
00155         }
00156         padWithNCount--;    // should now be zero, so superfluous.
00157         assert(padWithNCount==0);
00158     }
00159 
00160     // pad pairs of bases from rhs, two at a time:
00161     while (*rhs && *(rhs+1))
00162     {
00163         packedBases.push_back(
00164             GenomeSequence::base2int[(int) *(rhs+1)] << 4 |
00165             GenomeSequence::base2int[(int) *(rhs+0)]
00166         );
00167         rhs+=2;
00168         length+=2;
00169     }
00170 
00171     // if there is an odd base left at the end, put it
00172     // in a byte all its own (low 4 bits == 0):
00173     if (*rhs)
00174     {
00175         packedBases.push_back(
00176             GenomeSequence::base2int[(int) *(rhs+0)]
00177         );
00178         length++;
00179     }
00180     return;
00181 }
00182 
00183 std::string GenomeSequence::IntegerToSeq(unsigned int n, unsigned int wordsize) const
00184 {
00185     std::string sequence("");
00186     for (unsigned int i = 0; i < wordsize; i ++)
00187         sequence += "N";
00188 
00189     unsigned int clearHigherBits = ~(3U << (wordsize<<1));  // XXX refactor - this appears several places
00190 
00191     if (n > clearHigherBits)
00192         error("%d needs to be a non-negative integer < clearHigherBits\n", n);
00193 
00194     for (unsigned int i = 0; i < wordsize; i++)
00195     {
00196         sequence[wordsize-1-i] = int2base[n & 3];
00197         n >>= 2;
00198     }
00199     return sequence;
00200 }
00201 
00202 
00203 
00204 GenomeSequence::GenomeSequence()
00205 {
00206     constructorClear();
00207 }
00208 
00209 void GenomeSequence::constructorClear()
00210 {
00211     _debugFlag = 0;
00212     _progressStream = NULL;
00213     _colorSpace = false;
00214     _createOverwrite = false;
00215 }
00216 
00217 void GenomeSequence::setup(const char *referenceFilename)
00218 {
00219     setReferenceName(referenceFilename);
00220 
00221     if (_progressStream) *_progressStream << "open and prefetch reference genome " << referenceFilename << ": " << std::flush;
00222 
00223     if (open(false))
00224     {
00225         std::cerr << "Failed to open reference genome " << referenceFilename << std::endl;
00226         std::cerr << errorStr << std::endl;
00227         exit(1);
00228     }
00229 
00230     prefetch();
00231     if (_progressStream) *_progressStream << "done." << std::endl << std::flush;
00232 }
00233 
00234 GenomeSequence::~GenomeSequence()
00235 {
00236     // free up resources:
00237     _umfaFile.close();
00238 }
00239 
00240 //
00241 // mapped open.
00242 //
00243 // if the file exists, map in into memory, and fill in a few useful
00244 // fields.
00245 //
00246 // isColorSpace is used only when the baseFilename is empty, so
00247 // that we can choose an appropriate default reference file.
00248 //
00249 
00250 bool GenomeSequence::open(bool isColorSpace, int flags)
00251 {
00252     bool rc;
00253 
00254     if (isColorSpace)
00255     {
00256         _umfaFilename = _baseFilename + "-cs.umfa";
00257     }
00258     else
00259     {
00260         _umfaFilename = _baseFilename + "-bs.umfa";
00261     }
00262 
00263     rc = genomeSequenceArray::open(_umfaFilename.c_str(), flags);
00264     if (rc)
00265     {
00266         std::cerr << "GenomeSequence::open: failed to open file "
00267                   << _umfaFilename
00268                   << std::endl;
00269         return true;
00270     }
00271 
00272     _colorSpace = header->_colorSpace;
00273 
00274     return false;
00275 }
00276 
00277 void GenomeSequence::sanityCheck(MemoryMap &fasta) const
00278 {
00279     unsigned int i;
00280 
00281     unsigned int genomeIndex = 0;
00282     for (i=0; i<fasta.length(); i++)
00283     {
00284         switch (fasta[i])
00285         {
00286             case '>':
00287                 while (fasta[i]!='\n' && fasta[i]!='\r') i++;
00288                 break;
00289             case '\n':
00290             case '\r':
00291                 break;
00292             default:
00293                 assert(base2int[(int)(*this)[genomeIndex]] == base2int[(int) fasta[i]]);
00294 #if 0
00295                 if (base2int[(*this)[genomeIndex]] != base2int[(int) fasta[i]])
00296                 {
00297                     int bp1 = base2int[(*this)[genomeIndex]];
00298                     int bp2 = base2int[fasta[i]];
00299                     printf("failing at genome index = %u, fasta index = %u.\n",
00300                            genomeIndex,i);
00301                 }
00302 #endif
00303                 genomeIndex++;
00304                 break;
00305         }
00306     }
00307 }
00308 
00309 #define HAS_SUFFIX(str, suffix) ((strlen(suffix) < str.size()) && (str.substr(str.size() - strlen(suffix)) == suffix))
00310 //
00311 // referenceFilename is  either a fasta or a UM fasta (.fa or .umfa)
00312 // filename.  In both cases, the suffix gets removed and the
00313 // base name is kept for later use depending on context.
00314 //
00315 bool GenomeSequence::setReferenceName(std::string referenceFilename)
00316 {
00317 
00318     if (HAS_SUFFIX(referenceFilename, ".fa"))
00319     {
00320         _referenceFilename = referenceFilename;
00321         _baseFilename = _referenceFilename.substr(0, referenceFilename.size() - 3);
00322     }
00323     else if (HAS_SUFFIX(referenceFilename, ".umfa"))
00324     {
00325         _baseFilename = referenceFilename.substr(0, referenceFilename.size() - 5);
00326     }
00327     else if (HAS_SUFFIX(referenceFilename, "-cs.umfa"))
00328     {
00329         _baseFilename = referenceFilename.substr(0, referenceFilename.size() - 8);
00330     }
00331     else if (HAS_SUFFIX(referenceFilename, "-bs.umfa"))
00332     {
00333         _baseFilename = referenceFilename.substr(0, referenceFilename.size() - 8);
00334     }
00335     else
00336     {
00337         _baseFilename = referenceFilename;
00338     }
00339     _fastaFilename = _baseFilename + ".fa";
00340 
00341     return false;
00342 }
00343 
00344 //
00345 // this works in lockstep with ::create to populate
00346 // the per chromosome header fields size and md5
00347 // checksum.
00348 //
00349 // It relies on header->elementCount being set to
00350 // the data length loaded so far ... not the ultimate
00351 // reference length.
00352 //
00353 bool GenomeSequence::setChromosomeMD5andLength(uint32_t whichChromosome)
00354 {
00355     if (whichChromosome>=header->_chromosomeCount) return true;
00356 
00357     ChromosomeInfo *c = &header->_chromosomes[whichChromosome];
00358     c->size = header->elementCount - c->start;
00359 
00360 #if !defined(WIN32)
00361     MD5_CTX md5Context;
00362     uint8_t md5Signature[MD5_DIGEST_LENGTH];
00363 
00364     //
00365     // it's easier to ensure we do this right if we just do it
00366     // in one big chunk:
00367     //
00368     char *md5Buffer = (char *) malloc(c->size);
00369 
00370     MD5_Init(&md5Context);
00371 
00372     for (genomeIndex_t i = 0; i < c->size; i ++)
00373     {
00374         md5Buffer[i] = (*this)[c->start + i];
00375     }
00376     MD5_Update(&md5Context, md5Buffer, c->size);
00377     MD5_Final((unsigned char *) &md5Signature, &md5Context);
00378     free(md5Buffer);
00379     for (int i=0; i<MD5_DIGEST_LENGTH; i++)
00380     {
00381         // cheesy but works:
00382         sprintf(c->md5+2*i, "%02x", md5Signature[i]);
00383     }
00384     // redundant, strictly speaking due to sprintf NUL terminating
00385     // it's output strings, but put it here anyway.
00386     c->md5[2*MD5_DIGEST_LENGTH] = '\0';
00387 #else
00388     memset(c->md5, 0, 2*MD5_DIGEST_LENGTH);
00389 #endif
00390     return false;
00391 }
00392 
00393 //
00394 // Given a buffer with a fasta format contents, count
00395 // the number of chromsomes in it and return that value.
00396 //
00397 static bool getFastaStats(const char *fastaData, size_t fastaDataSize, uint32_t &chromosomeCount, uint64_t &baseCount)
00398 {
00399     chromosomeCount = 0;
00400     baseCount = 0;
00401     bool atLineStart = true;
00402 
00403     //
00404     // loop over the fasta file, essentially matching for the
00405     // pattern '^>.*$' and counting them.
00406     //
00407     for (size_t fastaIndex = 0; fastaIndex < fastaDataSize; fastaIndex++)
00408     {
00409         switch (fastaData[fastaIndex])
00410         {
00411             case '\n':
00412             case '\r':
00413                 atLineStart = true;
00414                 break;
00415             case '>':
00416             {
00417                 if (!atLineStart) break;
00418                 chromosomeCount++;
00419                 //
00420                 // eat the rest of the line
00421                 //
00422                 while (fastaIndex < fastaDataSize && fastaData[fastaIndex]!='\n' && fastaData[fastaIndex]!='\r')
00423                 {
00424                     fastaIndex++;
00425                 }
00426                 break;
00427             }
00428             default:
00429                 baseCount++;
00430                 atLineStart = false;
00431                 break;
00432         }
00433 
00434     }
00435     return false;
00436 }
00437 
00438 //
00439 // recreate the umfa file from a reference fasta format file
00440 //
00441 // The general format of a FASTA file is best described
00442 // on wikipedia at http://en.wikipedia.org/wiki/FASTA_format
00443 //
00444 // The format parsed here is a simpler subset, and is
00445 // described here http://www.ncbi.nlm.nih.gov/blast/fasta.shtml
00446 //
00447 bool GenomeSequence::create()
00448 {
00449     // quick sanity check...
00450     assert(int2base[base2int[(int) 'A']]=='A');
00451     assert(int2base[base2int[(int) 'a']]=='A');
00452     assert(int2base[base2int[(int) 'T']]=='T');
00453     assert(int2base[base2int[(int) 't']]=='T');
00454     assert(int2base[base2int[(int) 'C']]=='C');
00455     assert(int2base[base2int[(int) 'c']]=='C');
00456     assert(int2base[base2int[(int) 'G']]=='G');
00457     assert(int2base[base2int[(int) 'g']]=='G');
00458     assert(int2base[base2int[(int) 'N']]=='N');
00459     assert(int2base[base2int[(int) 'n']]=='N');
00460     assert(int2base[base2int[(int) 'M']]=='M');
00461     assert(int2base[base2int[(int) 'm']]=='M');
00462 
00463     assert(base2int[(int) 'N']==4);
00464     assert(base2int[(int) 'n']==4);
00465     assert(base2int[(int) 'A']==0);
00466     assert(base2int[(int) 'a']==0);
00467     assert(base2int[(int) 'T']==3);
00468     assert(base2int[(int) 't']==3);
00469     assert(base2int[(int) 'C']==1);
00470     assert(base2int[(int) 'c']==1);
00471     assert(base2int[(int) 'G']==2);
00472     assert(base2int[(int) 'g']==2);
00473 
00474     if (_baseFilename=="")
00475     {
00476         std::cerr << "Base reference filename is empty." << std::endl;
00477         return true;
00478     }
00479 
00480     if (isColorSpace())
00481     {
00482         _umfaFilename = _baseFilename + "-cs.umfa";
00483     }
00484     else
00485     {
00486         _umfaFilename = _baseFilename + "-bs.umfa";
00487     }
00488 
00489     if (!_createOverwrite && access(_umfaFilename.c_str(), R_OK) == 0)
00490     {
00491         std::cerr << "Output file '" << _umfaFilename << "' exists or is not writable - please remove." << std::endl;
00492         return true;
00493     }
00494 
00495     MemoryMap fastaFile;
00496 
00497     if (fastaFile.open(_fastaFilename.c_str()))
00498     {
00499         std::cerr << "failed to open input fasta file '" << _fastaFilename << "'." << std::endl;
00500         return true;
00501     }
00502 
00503     std::cerr << "Creating FASTA "
00504               << (isColorSpace() ? "color space " : "")
00505               << "binary cache file '"
00506               << _umfaFilename
00507               << "'."
00508               << std::endl;
00509 
00510     std::cerr << std::flush;
00511 
00512     //
00513     // simple ptr to fasta data -- just treat the memory map
00514     // as an array of fastaDataSize characters...
00515     //
00516     const char *fasta = (const char *) fastaFile.data;
00517     size_t fastaDataSize = fastaFile.length();
00518 
00519     uint32_t    chromosomeCount = 0;
00520     uint64_t    baseCount = 0;
00521     getFastaStats(fasta, fastaDataSize, chromosomeCount, baseCount);
00522 
00523     if (genomeSequenceArray::create(_umfaFilename.c_str(), baseCount, chromosomeCount))
00524     {
00525         std::cerr << "failed to create '"
00526                   << _umfaFilename
00527                   << "'."
00528                   << std::endl;
00529         perror("");
00530         return true;
00531     }
00532     header->elementCount = 0;
00533     header->_colorSpace = isColorSpace();
00534     header->setApplication(_application.c_str());
00535     header->_chromosomeCount = chromosomeCount;
00536     //
00537     // clear out the variable length chromosome info array
00538     //
00539     for (uint32_t i=0; i<header->_chromosomeCount; i++) header->_chromosomes[i].constructorClear();
00540 
00541     std::string chromosomeName;
00542 
00543     //
00544     // for converting the reference to colorspace, the first base is always 5 (in base space it is 'N')
00545     signed char lastBase = base2int[(int) 'N'];
00546     bool terminateLoad = false;
00547     int percent = -1, newPercent;
00548     uint32_t whichChromosome = 0;
00549     for (uint64_t fastaIndex = 0; fastaIndex < fastaDataSize; fastaIndex++)
00550     {
00551         if (_progressStream)
00552         {
00553             newPercent = (int) (1.0 * fastaIndex / fastaDataSize) * 100;
00554             if (newPercent>percent)
00555             {
00556                 std::cerr << "\r" << newPercent << "% ";
00557                 std::cerr << std::flush;
00558                 percent = newPercent;
00559             }
00560         }
00561         switch (fasta[fastaIndex])
00562         {
00563             case '\n':
00564             case '\r':
00565                 break;
00566             case '>':
00567             {
00568                 chromosomeName = "";
00569                 fastaIndex++;       // skip the > char
00570                 //
00571                 // pull out the chromosome new name
00572                 //
00573                 while (!isspace(fasta[fastaIndex]))
00574                 {
00575                     chromosomeName += fasta[fastaIndex++];  // slow, but who cares
00576                 }
00577                 //
00578                 // eat the rest of the line
00579                 //
00580                 while (fasta[fastaIndex]!='\n' && fasta[fastaIndex]!='\r')
00581                 {
00582                     fastaIndex++;
00583                 }
00584                 //
00585                 // save the Chromosome name and index into our
00586                 // header so we can use them later.
00587                 //
00588                 ChromosomeInfo *c = &header->_chromosomes[whichChromosome];
00589                 c->setChromosomeName(chromosomeName.c_str());
00590                 c->start = header->elementCount;
00591                 // c->size gets computed at the next '>' line or at the EOF
00592 
00593                 if (whichChromosome>0)
00594                 {
00595                     //
00596                     // compute md5 checksum for the chromosome that we just
00597                     // loaded (if there was one) - note that on the last
00598                     // chromosome, we have to duplicate this code after
00599                     // the end of this loop
00600                     //
00601                     setChromosomeMD5andLength(whichChromosome - 1);
00602                 }
00603                 whichChromosome++;
00604                 if (whichChromosome > header->_chromosomeCount)
00605                 {
00606                     std::cerr << "BUG: Exceeded computed chromosome count ("
00607                               << header->_chromosomeCount
00608                               << ") - genome is now truncated at chromosome "
00609                               << header->_chromosomes[header->_chromosomeCount-1].name
00610                               << " (index "
00611                               << header->_chromosomeCount
00612                               << ")."
00613                               << std::endl;
00614                     terminateLoad = true;
00615                 }
00616                 break;
00617             }
00618             default:
00619                 // save the base pair value
00620                 // Note: invalid characters come here as well, but we
00621                 // let ::set deal with mapping them.
00622                 if (isColorSpace())
00623                 {
00624 //
00625 // anything outside these values represents an invalid base
00626 // base codes: 0-> A,    1-> C,     2-> G,      3-> T
00627 // colorspace: 0-> blue, 1-> green, 2-> oragne, 3->red
00628 //
00629                     const char fromBase2CS[] =
00630                     {
00631                         /* 0000 */ 0,   // A->A
00632                         /* 0001 */ 1,   // A->C
00633                         /* 0010 */ 2,   // A->G
00634                         /* 0011 */ 3,   // A->T
00635                         /* 0100 */ 1,   // C->A
00636                         /* 0101 */ 0,   // C->C
00637                         /* 0110 */ 3,   // C->G
00638                         /* 0111 */ 2,   // C->T
00639                         /* 1000 */ 2,   // G->A
00640                         /* 1001 */ 3,   // G->C
00641                         /* 1010 */ 0,   // G->G
00642                         /* 1011 */ 1,   // G->T
00643                         /* 1100 */ 3,   // T->A
00644                         /* 1101 */ 2,   // T->C
00645                         /* 1110 */ 1,   // T->G
00646                         /* 1111 */ 0,   // T->T
00647                     };
00648                     //
00649                     // we are writing color space values on transitions,
00650                     // so we don't write a colorspace value when we
00651                     // get the first base value.
00652                     //
00653                     // On second and subsequent bases, write based on
00654                     // the index table above
00655                     //
00656                     char thisBase = base2int[(int)(fasta[fastaIndex])];
00657                     if (lastBase>=0)
00658                     {
00659                         char color;
00660                         if (lastBase>3 || thisBase>3) color=4;
00661                         else color = fromBase2CS[(int)(lastBase<<2 | thisBase)];
00662                         // re-use the int to base, because ::set expects a base char (ATCG), not
00663                         // a color code (0123).  It should only matter on final output.
00664                         set(header->elementCount++, int2base[(int) color]);
00665                     }
00666                     lastBase = thisBase;
00667                 }
00668                 else
00669                 {
00670                     set(header->elementCount++, toupper(fasta[fastaIndex]));
00671                 }
00672                 break;
00673         }
00674 
00675         //
00676         // slightly awkward exit handling when we exceed the fixed
00677         // number of chromosomes
00678         //
00679         if (terminateLoad) break;
00680     }
00681 
00682     //
00683     // also slightly awkward code to handle the last dangling chromosome...
00684     // all we should need to do is compute the md5 checksum
00685     //
00686     if (whichChromosome==0)
00687     {
00688         fastaFile.close();
00689         throw std::runtime_error("No chromosomes found - aborting!");
00690     }
00691     else
00692     {
00693         setChromosomeMD5andLength(whichChromosome-1);
00694     }
00695 
00696 #if 0
00697     sanityCheck(fastaFile);
00698 #endif
00699     fastaFile.close();
00700 
00701     if (_progressStream) *_progressStream << "\r";
00702 
00703     std::cerr << "FASTA binary cache file '"
00704               << _umfaFilename
00705               << "' created."
00706               << std::endl;
00707 
00708     //
00709     // leave the umfastaFile open in case caller wants to use it
00710     //
00711     return false;
00712 }
00713 
00714 int GenomeSequence::getChromosomeCount() const
00715 {
00716     return header->_chromosomeCount;
00717 }
00718 
00719 //return chromosome index: 0, 1, ... 24;
00720 int GenomeSequence::getChromosome(genomeIndex_t position) const
00721 {
00722     if (position == INVALID_GENOME_INDEX) return INVALID_CHROMOSOME_INDEX;
00723 
00724     if (header->_chromosomeCount == 0)
00725         return INVALID_CHROMOSOME_INDEX;
00726 
00727     int start = 0;
00728     int stop = header->_chromosomeCount - 1;
00729 
00730     // eliminate case where position is in the last chromosome, since the loop
00731     // below falls off the end of the list if it in the last one.
00732 
00733     if (position > header->_chromosomes[stop].start)
00734         return (stop);
00735 
00736     while (start <= stop)
00737     {
00738         int middle = (start + stop) / 2;
00739 
00740         if (position >= header->_chromosomes[middle].start && position < header->_chromosomes[middle + 1].start)
00741             return middle;
00742 
00743         if (position == header->_chromosomes[middle + 1].start)
00744             return (middle + 1);
00745 
00746         if (position > header->_chromosomes[middle + 1].start)
00747             start = middle + 1;
00748 
00749         if (position < header->_chromosomes[middle].start)
00750             stop = middle - 1;
00751     }
00752 
00753     return -1;
00754 }
00755 
00756 //
00757 // Given a chromosome name and 1-based chromosome index, return the
00758 // genome index (0 based) into sequence for it.
00759 //
00760 // NB: the header->chromosomes array contains zero based genome positions
00761 //
00762 genomeIndex_t GenomeSequence::getGenomePosition(
00763     const char *chromosomeName,
00764     unsigned int chromosomeIndex) const
00765 {
00766     genomeIndex_t i = getGenomePosition(chromosomeName);
00767     if (i == INVALID_GENOME_INDEX) return INVALID_GENOME_INDEX;
00768     return i + chromosomeIndex - 1;
00769 }
00770 
00771 genomeIndex_t GenomeSequence::getGenomePosition(
00772     int chromosome,
00773     unsigned int chromosomeIndex) const
00774 {
00775     if (chromosome<0 || chromosome >= (int) header->_chromosomeCount) return INVALID_GENOME_INDEX;
00776 
00777     genomeIndex_t i = header->_chromosomes[chromosome].start;
00778     if (i == INVALID_GENOME_INDEX) return INVALID_GENOME_INDEX;
00779     return i + chromosomeIndex - 1;
00780 }
00781 
00782 //
00783 // return the genome index (0 based) of the start of the named
00784 // chromosome.  If none is found, INVALID_GENOME_INDEX is returned.
00785 //
00786 // XXX may need to speed this up - and smarten it up with some
00787 // modest chromosome name parsing.... e.g. '%d/X/Y' or 'chr%d/chrX/chrY' or
00788 // other schemes.
00789 //
00790 genomeIndex_t GenomeSequence::getGenomePosition(const char *chromosomeName) const
00791 {
00792     int chromosome = getChromosome(chromosomeName);
00793     if (chromosome==INVALID_CHROMOSOME_INDEX) return INVALID_GENOME_INDEX;
00794     return header->_chromosomes[chromosome].start;
00795 }
00796 
00797 int GenomeSequence::getChromosome(const char *chromosomeName) const
00798 {
00799     unsigned int i;
00800     for (i=0; i<header->_chromosomeCount; i++)
00801     {
00802         if (strcmp(header->_chromosomes[i].name, chromosomeName)==0)
00803         {
00804             return i;
00805         }
00806     }
00807     return INVALID_CHROMOSOME_INDEX;
00808 }
00809 
00810 //
00811 // Given a read, reverse the string and swap the base
00812 // pairs for the reverse strand equivalents.
00813 //
00814 void GenomeSequence::getReverseRead(std::string &read)
00815 {
00816     std::string newRead;
00817     if (read.size()) for (int32_t i=(int) read.size() - 1; i>=0; i--)
00818         {
00819             newRead.push_back(BasePair(read[i]));
00820         }
00821     read = newRead;
00822 }
00823 
00824 void GenomeSequence::getReverseRead(String& read)
00825 {
00826     int i = 0;
00827     int j = read.Length()-1;
00828     char temp;
00829     while (i < j)
00830     {
00831         temp = read[j];
00832         read[j] = read[i];
00833         read[i] = temp;
00834     }
00835 }
00836 
00837 #define ABS(x) ( (x) > 0 ? (x) : -(x) )
00838 int GenomeSequence::debugPrintReadValidation(
00839     std::string &read,
00840     std::string &quality,
00841     char   direction,
00842     genomeIndex_t   readLocation,
00843     int sumQuality,
00844     int mismatchCount,
00845     bool recurse
00846 ) 
00847 {
00848     int validateSumQ = 0;
00849     int validateMismatchCount = 0;
00850     int rc = 0;
00851     std::string genomeData;
00852 
00853     for (uint32_t i=0; i<read.size(); i++)
00854     {
00855         if (tolower(read[i]) != tolower((*this)[readLocation + i]))
00856         {
00857             validateSumQ += quality[i] - '!';
00858             // XXX no longer valid:
00859             if (direction=='F' ? i<24 : (i >= (read.size() - 24))) validateMismatchCount++;
00860             genomeData.push_back(tolower((*this)[readLocation + i]));
00861         }
00862         else
00863         {
00864             genomeData.push_back(toupper((*this)[readLocation + i]));
00865         }
00866     }
00867     assert(validateSumQ>=0);
00868     if (validateSumQ != sumQuality && validateMismatchCount == mismatchCount)
00869     {
00870         printf("SUMQ: Original Genome: %s  test read: %s : actual sumQ = %d, test sumQ = %d\n",
00871                genomeData.c_str(),
00872                read.c_str(),
00873                validateSumQ,
00874                sumQuality
00875               );
00876         rc++;
00877     }
00878     else if (validateSumQ == sumQuality && validateMismatchCount != mismatchCount)
00879     {
00880         printf("MISM: Original Genome: %s  test read: %s : actual mismatch %d test mismatches %d\n",
00881                genomeData.c_str(),
00882                read.c_str(),
00883                validateMismatchCount,
00884                mismatchCount
00885               );
00886         rc++;
00887     }
00888     else if (validateSumQ != sumQuality && validateMismatchCount != mismatchCount)
00889     {
00890         printf("BOTH: Original Genome: %s  test read: %s : actual sumQ = %d, test sumQ = %d, actual mismatch %d test mismatches %d\n",
00891                genomeData.c_str(),
00892                read.c_str(),
00893                validateSumQ,
00894                sumQuality,
00895                validateMismatchCount,
00896                mismatchCount
00897               );
00898         rc++;
00899     }
00900 
00901     if (recurse && ABS(validateMismatchCount - mismatchCount) > (int) read.size()/2)
00902     {
00903         printf("large mismatch difference, trying reverse strand: ");
00904         std::string reverseRead = read;
00905         std::string reverseQuality = quality;
00906         getReverseRead(reverseRead);
00907         reverse(reverseQuality.begin(), reverseQuality.end());
00908         rc = debugPrintReadValidation(reverseRead, reverseQuality, readLocation, sumQuality, mismatchCount, false);
00909     }
00910     return rc;
00911 }
00912 #undef ABS
00913 
00914 
00915 bool GenomeSequence::wordMatch(unsigned int index, std::string &word) const
00916 {
00917     for (uint32_t i = 0; i<word.size(); i++)
00918     {
00919         if ((*this)[index + i] != word[i]) return false;
00920     }
00921     return true;
00922 }
00923 
00924 bool GenomeSequence::printNearbyWords(unsigned int index, unsigned int deviation, std::string &word) const
00925 {
00926     for (unsigned int i = index - deviation; i < index + deviation; i++)
00927     {
00928         if (wordMatch(i, word))
00929         {
00930             std::cerr << "word '"
00931                       << word
00932                       << "' found "
00933                       << index - i
00934                       << " away from position "
00935                       << index
00936                       << "."
00937                       << std::endl;
00938         }
00939     }
00940     return false;
00941 }
00942 
00943 void GenomeSequence::dumpSequenceSAMDictionary(std::ostream &file) const
00944 {
00945     for (unsigned int i=0; i<header->_chromosomeCount; i++)
00946     {
00947         file
00948         <<  "@SQ"
00949         << "\tSN:" << header->_chromosomes[i].name  // name
00950         << "\tLN:" << header->_chromosomes[i].size  // number of bases
00951         << "\tAS:" << header->_chromosomes[i].assemblyID // e.g. NCBI36.3
00952         << "\tM5:" << header->_chromosomes[i].md5
00953         << "\tUR:" << header->_chromosomes[i].uri
00954         << "\tSP:" << header->_chromosomes[i].species // e.g. Homo_sapiens
00955         << std::endl;
00956     }
00957 }
00958 
00959 void GenomeSequence::dumpHeaderTSV(std::ostream &file) const
00960 {
00961     file << "# Reference: " << _baseFilename << std::endl;
00962     file << "# SN: sample name - must be unique" << std::endl;
00963     file << "# AS: assembly name" << std::endl;
00964     file << "# SP: species" << std::endl;
00965     file << "# LN: chromosome/contig length" << std::endl;
00966     file << "# M5: chromosome/contig MD5 checksum" << std::endl;
00967     file << "# LN and M5 are only printed for informational purposes." << std::endl;
00968     file << "# Karma will only set those values when creating the index." << std::endl;
00969     file << "SN" << "\t" << "AS" << "\t" << "SP" << "\t" << "UR" << "\t" << "LN" << "\t" << "M5" << std::endl;
00970     for (unsigned int i=0; i<header->_chromosomeCount; i++)
00971     {
00972         file
00973         << header->_chromosomes[i].name  // name
00974         << "\t" << header->_chromosomes[i].assemblyID // e.g. NCBI36.3
00975         << "\t" << header->_chromosomes[i].uri
00976         << "\t" << header->_chromosomes[i].species // e.g. Homo_sapiens
00977         << "\t" << header->_chromosomes[i].size  // number of bases
00978         << "\t" << header->_chromosomes[i].md5
00979         << std::endl;
00980     }
00981 }
00982 
00983 
00984 void GenomeSequence::getString(std::string &str, int chromosome, genomeIndex_t index, int baseCount) const
00985 {
00986     //
00987     // calculate the genome index for the lazy caller...
00988     //
00989     genomeIndex_t genomeIndex = header->_chromosomes[chromosome].start + index - 1;
00990 
00991     getString(str, genomeIndex, baseCount);
00992 }
00993 
00994 void GenomeSequence::getString(String &str, int chromosome, genomeIndex_t index, int baseCount) const
00995 {
00996     std::string string;
00997     this-> getString(string, chromosome, index, baseCount);
00998     str = string.c_str();
00999 }
01000 
01001 void GenomeSequence::getString(std::string &str, genomeIndex_t index, int baseCount) const
01002 {
01003     str.clear();
01004     if (baseCount > 0)
01005     {
01006         for (int i=0; i<baseCount; i++)
01007         {
01008             str.push_back((*this)[index + i]);
01009         }
01010     }
01011     else
01012     {
01013         // if caller passed negative basecount, give them
01014         // the read for the 3' end
01015         for (int i=0; i< -baseCount; i++)
01016         {
01017             str.push_back(base2complement[(int)(*this)[index + i]]);
01018         }
01019     }
01020 }
01021 
01022 void GenomeSequence::getString(String &str, genomeIndex_t index, int baseCount) const
01023 {
01024     std::string string;
01025     getString(string, index, baseCount);
01026     str = string.c_str();
01027 }
01028 
01029 void GenomeSequence::getHighLightedString(std::string &str, genomeIndex_t index, int baseCount, genomeIndex_t highLightStart, genomeIndex_t highLightEnd) const
01030 {
01031     str.clear();
01032     if (baseCount > 0)
01033     {
01034         for (int i=0; i<baseCount; i++)
01035         {
01036             char base = (*this)[index + i];
01037             if (in(index+i, highLightStart, highLightEnd))
01038                 base = tolower(base);
01039             str.push_back(base);
01040         }
01041     }
01042     else
01043     {
01044         // if caller passed negative basecount, give them
01045         // the read for the 3' end
01046         for (int i=0; i< -baseCount; i++)
01047         {
01048             char base = base2complement[(int)(*this)[index + i]];
01049             if (in(index+i, highLightStart, highLightEnd))
01050                 base = tolower(base);
01051             str.push_back(base);
01052         }
01053     }
01054 }
01055 
01056 void GenomeSequence::print30(genomeIndex_t index) const
01057 {
01058     std::cout << "index: " << index << "\n";
01059     for (genomeIndex_t i=index-30; i<index+30; i++)
01060         std::cout << (*this)[i];
01061     std::cout << "\n";
01062     for (genomeIndex_t i=index-30; i<index; i++)
01063         std::cout << " ";
01064     std::cout << "^";
01065     std::cout << std::endl;
01066 }
01067 
01068 void GenomeSequence::getMismatchHatString(std::string &result, const std::string &read, genomeIndex_t location) const
01069 {
01070     result = "";
01071     for (uint32_t i=0; i < read.size(); i++)
01072     {
01073         if (read[i] == (*this)[location+i])
01074             result.push_back(' ');
01075         else
01076             result.push_back('^');
01077     }
01078 }
01079 
01080 void GenomeSequence::getMismatchString(std::string &result, const std::string &read, genomeIndex_t location) const
01081 {
01082     result = "";
01083     for (uint32_t i=0; i < read.size(); i++)
01084     {
01085         if (read[i] == (*this)[location+i])
01086             result.push_back(toupper(read[i]));
01087         else
01088             result.push_back(tolower(read[i]));
01089     }
01090 }
01091 
01092 genomeIndex_t GenomeSequence::simpleLocalAligner(std::string &read, std::string &quality, genomeIndex_t index, int windowSize) const
01093 {
01094     int bestScore = 1000000; // either mismatch count or sumQ
01095     genomeIndex_t bestMatchLocation = INVALID_GENOME_INDEX;
01096     for (int i=-windowSize; i<windowSize; i++)
01097     {
01098         int newScore;
01099 
01100         if (i<0 && ((uint32_t) -i) > index) continue;
01101         if (index + i + read.size() >= getNumberBases()) continue;
01102 
01103         if (quality=="")
01104         {
01105             newScore = this->getMismatchCount(read, index + i);
01106         }
01107         else
01108         {
01109             newScore = this->getSumQ(read, quality, index + i);
01110         }
01111         if (newScore < bestScore)
01112         {
01113             bestScore = newScore;
01114             bestMatchLocation = index + i;
01115         }
01116     }
01117     return bestMatchLocation;
01118 }
01119 
01120 std::ostream &operator << (std::ostream &stream, genomeSequenceMmapHeader &h)
01121 {
01122     stream << (MemoryMapArrayHeader &) h;
01123     stream << "chromosomeCount: " << h._chromosomeCount << std::endl;
01124     stream << "isColorSpace: " << h._colorSpace << std::endl;
01125     stream << "chromosomeCount: " << h._chromosomeCount << std::endl;
01126     uint64_t totalSize = 0;
01127     for (uint32_t i=0; i < h._chromosomeCount; i++)
01128     {
01129         totalSize += h._chromosomes[i].size;
01130         stream << "Chromosome Index " << i << " name: " << h._chromosomes[i].name << std::endl;
01131         stream << "Chromosome Index " << i << " whole genome start: " << h._chromosomes[i].start << std::endl;
01132         stream << "Chromosome Index " << i << " whole genome size: " << h._chromosomes[i].size << std::endl;
01133         stream << "Chromosome Index " << i << " md5 checksum: " << h._chromosomes[i].md5 << std::endl;
01134         stream << "Chromosome Index " << i << " assemblyID: " << h._chromosomes[i].assemblyID << std::endl;
01135         stream << "Chromosome Index " << i << " species: " << h._chromosomes[i].species << std::endl;
01136         stream << "Chromosome Index " << i << " URI: " << h._chromosomes[i].uri << std::endl;
01137     }
01138     stream << "Total Genome Size: " << totalSize << " bases."<< std::endl;
01139     if (totalSize != h.elementCount)
01140     {
01141         stream << "Total Genome Size: does not match elementCount!\n";
01142     }
01143 
01144     stream << std::endl;
01145     return stream;
01146 }
01147 
01148 void GenomeSequence::getChromosomeAndIndex(std::string &s, genomeIndex_t i) const
01149 {
01150     int whichChromosome = 0;
01151 
01152     whichChromosome = getChromosome(i);
01153 
01154     if (whichChromosome == INVALID_CHROMOSOME_INDEX)
01155     {
01156         s = "invalid genome index";     // TODO include the index in error
01157     }
01158     else
01159     {
01160         std::ostringstream buf;
01161         genomeIndex_t   chromosomeIndex = i - getChromosomeStart(whichChromosome) + 1;
01162         buf << header->_chromosomes[whichChromosome].name  << ":"  << chromosomeIndex;
01163 #if 0
01164         buf << " (GenomeIndex " << i << ")";
01165 #endif
01166         s = buf.str();
01167     }
01168     return;
01169 }
01170 
01171 
01172 void GenomeSequence::getChromosomeAndIndex(String& s, genomeIndex_t i) const
01173 {
01174     std::string ss;
01175     getChromosomeAndIndex(ss, i);
01176     s = ss.c_str();
01177     return;
01178 }
01179 
01180 
01181 //
01182 // This is intended to be a helper routine to get dbSNP files
01183 // loaded.  In some cases, we will load into an mmap() file (ie
01184 // when we are creating it), in others, we will simply be loading
01185 // an existing dbSNP file into RAM (when the binary file does not
01186 // exist or when we are running with useMemoryMapFlag == false.
01187 //
01188 // Assume that dbSNP exists, is writable, and is the right size.
01189 //
01190 // Using the dbSNPFilename given, mark each dbSNP position
01191 // with a bool true.
01192 //
01193 // Return value:
01194 //   True: if populateDBSNP() succeed
01195 //   False: if not succeed
01196 bool GenomeSequence::populateDBSNP(
01197     mmapArrayBool_t &dbSNP,
01198     std::ifstream &inputFile) const
01199 {
01200     std::string inputLine;
01201 
01202     assert(dbSNP.getElementCount() == getNumberBases());
01203 
01204     std::string chromosomeName;
01205     genomeIndex_t chromosomePosition1;  // 1-based
01206     uint64_t    ignoredLineCount = 0;
01207 
01208     while (getline(inputFile, inputLine))
01209     {
01210         std::istringstream inputLine2(inputLine);
01211         genomeIndex_t genomeIndex;  // 1-based
01212 
01213         inputLine2 >> chromosomeName;
01214         inputLine2 >> chromosomePosition1;
01215 
01216         // check for a comment line
01217         if (chromosomeName.size()>0 && chromosomeName[0]=='#')
01218         {
01219             continue;
01220         }
01221 
01222 #if 0
01223         //
01224         // enable this check if we want only chromosomes
01225         // 1-22.  Some code is sensitive to this, so we may
01226         // need to make it an option.
01227         //
01228         if (!isdigit(chromosomeName[0])) continue;
01229 #endif
01230 
01231         genomeIndex = getGenomePosition(chromosomeName.c_str(), chromosomePosition1);
01232 
01233         // if the genome index is invalid, ignore it
01234         if (genomeIndex == INVALID_GENOME_INDEX)
01235         {
01236             ignoredLineCount++;
01237             continue;
01238         }
01239 
01240         dbSNP.set(genomeIndex, true);
01241     }
01242 
01243     inputFile.close();
01244 
01245     if (ignoredLineCount > 0)
01246     {
01247         std::cerr << "GenomeSequence::populateDBSNP: ignored " << ignoredLineCount << " SNP positions due to invalid format of line." << std::endl;
01248         return false;
01249     }
01250     return true;
01251 }
01252 
01253 bool GenomeSequence::loadDBSNP(
01254     mmapArrayBool_t &dbSNP,
01255     const char *inputFileName) const
01256 {
01257     //
01258     // the goal in this section of code is to allow the user
01259     // to either specify a valid binary version of the SNP file,
01260     // or the original text file that it gets created from.
01261     //
01262     // To do this, we basically open, sniff the error message,
01263     // and if it claims it is not a binary version of the file,
01264     // we go ahead and treat it as the text file and use the
01265     // GenomeSequence::populateDBSNP method to load it.
01266     //
01267     // Further checking is really needed to ensure users don't
01268     // mix a dbSNP file for a different reference, since it is really
01269     // easy to do.
01270     //
01271     if (strlen(inputFileName)!=0)
01272     {
01273         std::cerr << "Load dbSNP file '" << inputFileName << "': " << std::flush;
01274 
01275         if (dbSNP.open(inputFileName, O_RDONLY))
01276         {
01277             //
01278             // failed to open, possibly due to bad magic.
01279             //
01280             // this is really awful ... need to have a return
01281             // code that is smart enough to avoid this ugliness:
01282             //
01283             if (dbSNP.getErrorString().find("wrong type of file")==std::string::npos)
01284             {
01285                 std::cerr << "Error: " << dbSNP.getErrorString() << std::endl;
01286                 exit(1);
01287             }
01288             //
01289             // we have a file, assume we can load it as a text file
01290             //
01291             std::ifstream inputFile;
01292             inputFile.open(inputFileName);
01293             if (inputFile.fail())
01294             {
01295                 std::cerr << "Error: failed to open " << inputFileName << std::endl;
01296                 exit(1);
01297             }
01298 
01299             std::cerr << "(as text file) ";
01300 
01301             // anonymously (RAM resident only) create:
01302             dbSNP.create(getNumberBases());
01303 
01304             // now load it into RAM
01305             populateDBSNP(dbSNP, inputFile);
01306             inputFile.close();
01307 
01308         }
01309         else
01310         {
01311             std::cerr << "(as binary mapped file) ";
01312         }
01313 
01314         std::cerr << "DONE!" << std::endl;
01315         return false;
01316     }
01317     else
01318     {
01319         return true;
01320     }
01321 }
01322 
01323 
01324 #if defined(TEST)
01325 
01326 void simplestExample(void)
01327 {
01328     GenomeSequence  reference;
01329     genomeIndex_t   index;
01330 
01331     // a particular reference is set by:
01332     // reference.setFastaName("/usr/cluster/share/karma/human_g1k_v37_12CS.fa")
01333     //
01334     // In the above example, the suffix .fa is stripped and replaced with .umfa,
01335     // which contains the actual file being opened.
01336     //
01337     if (reference.open())
01338     {
01339         perror("GenomeSequence::open");
01340         exit(1);
01341     }
01342 
01343 
01344     index = 1000000000; // 10^9
01345     //
01346     // Write the base at the given index.  Here, index is 0 based,
01347     // and is across the whole genome, as all chromosomes are sequentially
01348     // concatenated, so the allowed range is
01349     //
01350     // 0.. (reference.getChromosomeStart(last) + reference.getChromosomeSize(last))
01351     //
01352     // (where int last = reference.getChromosomeCount() - 1;)
01353     //
01354     std::cout << "base[" << index << "] = " << reference[index] << std::endl;
01355 
01356     //
01357     // Example for finding chromosome and one based chromosome position given
01358     // and absolute position on the genome in 'index':
01359     //
01360     int chr = reference.getChromosome(index);
01361     genomeIndex_t   chrIndex = index - reference.getChromosomeStart(chr) + 1;   // 1-based
01362 
01363     std::cout << "genome index " << index << " corresponds to chromosome " << chr << " position " << chrIndex << std::endl;
01364 
01365     //
01366     // Example for finding an absolute genome index position when the
01367     // chromosome name and one based position are known:
01368     //
01369     const char *chromosomeName = "5";
01370     chr = reference.getChromosome(chromosomeName);     // 0-based
01371     chrIndex = 100000;                      // 1-based
01372 
01373     index = reference.getChromosomeStart(chr) + chrIndex - 1;
01374 
01375     std::cout << "Chromosome '" << chromosomeName << "' position " << chrIndex << " corresponds to genome index position " << index << std::endl;
01376 
01377     reference.close();
01378 }
01379 
01380 void testGenomeSequence(void)
01381 {
01382     GenomeSequence reference;
01383 
01384 #if 0
01385     std::string referenceName = "someotherreference";
01386     if (reference.setFastaName(referenceName))
01387     {
01388         std::cerr << "failed to open reference file "
01389                   << referenceName
01390                   << std::endl;
01391         exit(1);
01392     }
01393 #endif
01394 
01395     std::cerr << "open and prefetch the reference genome: ";
01396 
01397     // open it
01398     if (reference.open())
01399     {
01400         exit(1);
01401     }
01402     std::cerr << "done!" << std::endl;
01403 
01404     //
01405     // For the human genome, genomeIndex ranges from 0 to 3.2x10^9
01406     //
01407     genomeIndex_t   genomeIndex;    // 0 based
01408     unsigned int chromosomeIndex;   // 1 based
01409     unsigned int chromosome;        // 0..23 or so
01410     std::string chromosomeName;
01411 
01412     //
01413     // Here we'll start with a chromosome name, then obtain the genome
01414     // index, and use it to find the base we want:
01415     //
01416     chromosomeName = "2";
01417     chromosomeIndex = 1234567;
01418     // this call is slow (string search for chromsomeName):
01419     genomeIndex = reference.getGenomePosition(chromosomeName.c_str(), chromosomeIndex);
01420     assert(genomeIndex!=INVALID_GENOME_INDEX);
01421     std::cout << "Chromosome " << chromosomeName << ", index ";
01422     std::cout << chromosomeIndex << " contains base " << reference[genomeIndex];
01423     std::cout << " at genome index position " << genomeIndex << std::endl;
01424 
01425     //
01426     // now reverse it - given a genomeIndex from above, find the chromosome
01427     // name and index:
01428     //
01429 
01430     // slow (binary search on genomeIndex):
01431     chromosome = reference.getChromosome(genomeIndex);
01432     unsigned int newChromosomeIndex;
01433     // not slow:
01434     newChromosomeIndex = genomeIndex - reference.getChromosomeStart(chromosome) + 1;
01435 
01436     assert(chromosomeIndex == newChromosomeIndex);
01437 
01438     // more testing... at least test and use PackedRead:
01439     //
01440     PackedRead pr;
01441 
01442     pr.set("ATCGATCG", 0);
01443     assert(pr.size()==8);
01444     assert(pr[0]==GenomeSequence::base2int[(int) 'A']);
01445     assert(pr[1]==GenomeSequence::base2int[(int) 'T']);
01446     assert(pr[2]==GenomeSequence::base2int[(int) 'C']);
01447     assert(pr[3]==GenomeSequence::base2int[(int) 'G']);
01448     pr.set("ATCGATCG", 1);
01449     assert(pr.size()==9);
01450     pr.set("", 0);
01451     assert(pr.size()==0);
01452     pr.set("", 1);
01453     assert(pr.size()==1);
01454     pr.set("", 2);
01455     assert(pr.size()==2);
01456     pr.set("", 3);
01457     assert(pr.size()==3);
01458     assert(pr[0]==GenomeSequence::base2int[(int) 'N']);
01459     assert(pr[1]==GenomeSequence::base2int[(int) 'N']);
01460     assert(pr[2]==GenomeSequence::base2int[(int) 'N']);
01461     pr.set("C", 1);
01462     assert(pr.size()==2);
01463     assert(pr[0]==GenomeSequence::base2int[(int) 'N']);
01464     assert(pr[1]==GenomeSequence::base2int[(int) 'C']);
01465 
01466 }
01467 
01468 //
01469 // After I build libcsg, I compile and run this test code using:
01470 //
01471 // g++ -DTEST -o try GenomeSequence.cpp -L. -lcsg -lm -lz -lssl
01472 // you also may need -fno-rtti
01473 // ./try
01474 //
01475 int main(int argc, const char **argv)
01476 {
01477 #if 1
01478     simplestExample();
01479 #else
01480     testGenomeSequence();
01481 #endif
01482     exit(0);
01483 }
01484 
01485 #endif