GenomeSequence.cpp

00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #include "assert.h"
00019 #include "ctype.h"
00020 #include "stdio.h"
00021 #include "zlib.h"
00022 #include "Error.h"
00023 
00024 
00025 #include "Generic.h"
00026 #include "GenomeSequence.h"
00027 
00028 #include <algorithm>
00029 #include <istream>
00030 #include <fstream>
00031 #include <sstream>
00032 #include <stdexcept>
00033 
00034 
00035 //
00036 // Map ASCII values to a 2 (or 3) bit encoding for the base pair value
00037 //  class 0 -> 'A' (Adenine - 0x41 and 0x61)
00038 //  class 1 -> 'C' (Cytosine - 0x43 and 0x63)
00039 //  class 2 -> 'G' (Guanine - 0x47 and 0x67)
00040 //  class 3 -> 'T' (Thymine - 0x54 and 0x74)
00041 //  class 4 -> 'N' (Unknown - read error or incomplete data - 0x4E and 0x6E)
00042 //  class 5 -> not a valid DNA base pair character
00043 //
00044 // Note: The +1 array size is for the terminating NUL character
00045 //
00046 // NB: This table also maps 0, 1, 2, and 3 to the corresponding integers,
00047 // and '.' to class 4.  This allows ABI SOLiD reads to be converted
00048 // to integers via ReadIndexer::Word2Integer.
00049 //
00050 unsigned char GenomeSequence::base2int[256+1] =
00051     "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005"  // 0x00-0x0F
00052     "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005"  // 0x10-0x1F
00053     "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\004\005"  // 0x20-0x2F
00054     "\000\001\002\003\005\005\005\005\005\005\005\005\005\005\005\005"  // 0x30-0x3F
00055     "\005\000\005\001\005\005\005\002\005\005\005\005\005\005\004\005"  // 0x40-0x4F
00056     "\005\005\005\005\003\005\005\005\005\005\005\005\005\005\005\005"  // 0x50-0x5F
00057     "\005\000\005\001\005\005\005\002\005\005\005\005\005\005\004\005"  // 0x60-0x6F
00058     "\005\005\005\005\003\005\005\005\005\005\005\005\005\005\005\005"  // 0x70-0x7F
00059 // not used, but included for completeness:
00060     "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005"  // 0x80-0x8F
00061     "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005"  // 0x90-0x9F
00062     "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005"  // 0xA0-0xAF
00063     "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005"  // 0xB0-0xBF
00064     "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005"  // 0xC0-0xCF
00065     "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005"  // 0xD0-0xDF
00066     "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005"  // 0xE0-0xEF
00067     "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005"  // 0xF0-0xFF
00068     ;
00069 
00070 //
00071 // This is obviously for base space use only:
00072 //
00073 const char GenomeSequence::int2base[] = "ACGTNMXXXXXXXXXX";
00074 //
00075 // convert int to color space value
00076 //
00077 const char GenomeSequence::int2colorSpace[] = "0123NXXXXXXXXXXX";
00078 
00079 //
00080 // This table maps 5' base space to the 3' complement base space
00081 // values, as well as 5' color space values to the corresponding
00082 // 3' complement color space values.
00083 //
00084 // In both cases, invalids are mapped to 'N', which isn't accurate
00085 // for ABI SOLiD, but internally it shouldn't matter (on output it
00086 // will).
00087 //
00088 unsigned char GenomeSequence::base2complement[256+1 /* for NUL char */] =
00089     "NNNNNNNNNNNNNNNN"  // 0x00-0x0F
00090     "NNNNNNNNNNNNNNNN"  // 0x10-0x1F
00091     "NNNNNNNNNNNNNNNN"  // 0x20-0x2F
00092     "0123NNNNNNNNNNNN"  // 0x30-0x3F
00093     "NTNGNNNCNNNNNNNN"  // 0x40-0x4F
00094     "NNNNANNNNNNNNNNN"  // 0x50-0x5F
00095     "NTNGNNNCNNNNNNNN"  // 0x60-0x6F
00096     "NNNNANNNNNNNNNNN"  // 0x70-0x7F
00097 // not used, but included for completeness:
00098     "NNNNNNNNNNNNNNNN"  // 0x80-0x8F
00099     "NNNNNNNNNNNNNNNN"  // 0x90-0x9F
00100     "NNNNNNNNNNNNNNNN"  // 0xA0-0xAF
00101     "NNNNNNNNNNNNNNNN"  // 0xB0-0xBF
00102     "NNNNNNNNNNNNNNNN"  // 0xC0-0xCF
00103     "NNNNNNNNNNNNNNNN"  // 0xD0-0xDF
00104     "NNNNNNNNNNNNNNNN"  // 0xE0-0xEF
00105     "NNNNNNNNNNNNNNNN"  // 0xF0-0xFF
00106     ;
00107 
00108 //
00109 // given a read in a string, pack it into the vector of
00110 // bytes coded as two bases per byte.
00111 //
00112 // The goal is to allow us to more rapidly compare against
00113 // the genome, which is itself packed 2 bases per byte.
00114 //
00115 // Unfortunately, the match position may be odd or even,
00116 // so as a result, we also need to be able to prepad
00117 // with 'N' bases to get the byte alignment the same, so
00118 // padWithNCount may be a positive number indicating how
00119 // many N bases to prepend.
00120 //
00121 void PackedRead::set(const char *rhs, int padWithNCount)
00122 {
00123     clear();
00124 
00125     // pad this packed read with 'N' bases two at a time
00126     while (padWithNCount>1)
00127     {
00128         packedBases.push_back(
00129             GenomeSequence::base2int[(int) 'N'] << 4 |
00130             GenomeSequence::base2int[(int) 'N']
00131         );
00132         padWithNCount -= 2;
00133         length+=2;
00134     }
00135 
00136     // when we have only one base, pack one 'N' base with
00137     // the first base in rhs if there is one.
00138     if (padWithNCount)
00139     {
00140         // NB: *rhs could be NUL, which is ok here - just keep
00141         // the length straight.
00142         packedBases.push_back(
00143             GenomeSequence::base2int[(int) *rhs] << 4 |
00144             GenomeSequence::base2int[(int) 'N']
00145         );
00146         // two cases - have characters in rhs or we don't:
00147         if (*rhs)
00148         {
00149             length+=2;  // pad byte plus one byte from rhs
00150             rhs++;
00151         }
00152         else
00153         {
00154             length++;
00155         }
00156         padWithNCount--;    // should now be zero, so superfluous.
00157         assert(padWithNCount==0);
00158     }
00159 
00160     // pad pairs of bases from rhs, two at a time:
00161     while (*rhs && *(rhs+1))
00162     {
00163         packedBases.push_back(
00164             GenomeSequence::base2int[(int) *(rhs+1)] << 4 |
00165             GenomeSequence::base2int[(int) *(rhs+0)]
00166         );
00167         rhs+=2;
00168         length+=2;
00169     }
00170 
00171     // if there is an odd base left at the end, put it
00172     // in a byte all its own (low 4 bits == 0):
00173     if (*rhs)
00174     {
00175         packedBases.push_back(
00176             GenomeSequence::base2int[(int) *(rhs+0)]
00177         );
00178         length++;
00179     }
00180     return;
00181 }
00182 
00183 std::string GenomeSequence::IntegerToSeq(unsigned int n, unsigned int wordsize)
00184 {
00185     std::string sequence("");
00186     for (unsigned int i = 0; i < wordsize; i ++)
00187         sequence += "N";
00188 
00189     unsigned int clearHigherBits = ~(3U << (wordsize<<1));  // XXX refactor - this appears several places
00190 
00191     if (n > clearHigherBits)
00192         error("%d needs to be a non-negative integer < clearHigherBits\n", n);
00193 
00194     for (unsigned int i = 0; i < wordsize; i++)
00195     {
00196         sequence[wordsize-1-i] = int2base[n & 3];
00197         n >>= 2;
00198     }
00199     return sequence;
00200 }
00201 
00202 
00203 
00204 GenomeSequence::GenomeSequence()
00205 {
00206     constructorClear();
00207 }
00208 
00209 void GenomeSequence::constructorClear()
00210 {
00211     _debugFlag = 0;
00212     _progressStream = NULL;
00213     _colorSpace = false;
00214 }
00215 
00216 void GenomeSequence::setup(const char *referenceFilename)
00217 {
00218     setReferenceName(referenceFilename);
00219 
00220     if (_progressStream) *_progressStream << "open and prefetch reference genome " << referenceFilename << ": " << std::flush;
00221 
00222     if (open(false))
00223     {
00224         std::cerr << "Failed to open reference genome " << referenceFilename << std::endl;
00225         std::cerr << errorStr << std::endl;
00226         exit(1);
00227     }
00228 
00229     prefetch();
00230     if (_progressStream) *_progressStream << "done." << std::endl << std::flush;
00231 }
00232 
00233 GenomeSequence::~GenomeSequence()
00234 {
00235     // free up resources:
00236     _umfaFile.close();
00237 }
00238 
00239 //
00240 // mapped open.
00241 //
00242 // if the file exists, map in into memory, and fill in a few useful
00243 // fields.
00244 //
00245 // isColorSpace is used only when the baseFilename is empty, so
00246 // that we can choose an appropriate default reference file.
00247 //
00248 
00249 bool GenomeSequence::open(bool isColorSpace, int flags)
00250 {
00251     bool rc;
00252 
00253     if (isColorSpace)
00254     {
00255         _umfaFilename = _baseFilename + "-cs.umfa";
00256     }
00257     else
00258     {
00259         _umfaFilename = _baseFilename + "-bs.umfa";
00260     }
00261 
00262     rc = genomeSequenceArray::open(_umfaFilename.c_str(), flags);
00263     if (rc)
00264     {
00265         std::cerr << "GenomeSequence::open: failed to open file "
00266                   << _umfaFilename
00267                   << std::endl;
00268         return true;
00269     }
00270 
00271     _colorSpace = header->_colorSpace;
00272 
00273     return false;
00274 }
00275 
00276 void GenomeSequence::sanityCheck(MemoryMap &fasta)
00277 {
00278     unsigned int i;
00279 
00280     unsigned int genomeIndex = 0;
00281     for (i=0; i<fasta.length(); i++)
00282     {
00283         switch (fasta[i])
00284         {
00285             case '>':
00286                 while (fasta[i]!='\n' && fasta[i]!='\r') i++;
00287                 break;
00288             case '\n':
00289             case '\r':
00290                 break;
00291             default:
00292                 assert(base2int[(int)(*this)[genomeIndex]] == base2int[(int) fasta[i]]);
00293 #if 0
00294                 if (base2int[(*this)[genomeIndex]] != base2int[(int) fasta[i]])
00295                 {
00296                     int bp1 = base2int[(*this)[genomeIndex]];
00297                     int bp2 = base2int[fasta[i]];
00298                     printf("failing at genome index = %u, fasta index = %u.\n",
00299                            genomeIndex,i);
00300                 }
00301 #endif
00302                 genomeIndex++;
00303                 break;
00304         }
00305     }
00306 }
00307 
00308 #define HAS_SUFFIX(str, suffix) ((strlen(suffix) < str.size()) && (str.substr(str.size() - strlen(suffix)) == suffix))
00309 //
00310 // referenceFilename is  either a fasta or a UM fasta (.fa or .umfa)
00311 // filename.  In both cases, the suffix gets removed and the
00312 // base name is kept for later use depending on context.
00313 //
00314 bool GenomeSequence::setReferenceName(std::string referenceFilename)
00315 {
00316 
00317     if (HAS_SUFFIX(referenceFilename, ".fa"))
00318     {
00319         _referenceFilename = referenceFilename;
00320         _baseFilename = _referenceFilename.substr(0, referenceFilename.size() - 3);
00321     }
00322     else if (HAS_SUFFIX(referenceFilename, ".umfa"))
00323     {
00324         _baseFilename = referenceFilename.substr(0, referenceFilename.size() - 5);
00325     }
00326     else if (HAS_SUFFIX(referenceFilename, "-cs.umfa"))
00327     {
00328         _baseFilename = referenceFilename.substr(0, referenceFilename.size() - 8);
00329     }
00330     else if (HAS_SUFFIX(referenceFilename, "-bs.umfa"))
00331     {
00332         _baseFilename = referenceFilename.substr(0, referenceFilename.size() - 8);
00333     }
00334     else
00335     {
00336         _baseFilename = referenceFilename;
00337     }
00338     _fastaFilename = _baseFilename + ".fa";
00339 
00340     return false;
00341 }
00342 
00343 //
00344 // this works in lockstep with ::create to populate
00345 // the per chromosome header fields size and md5
00346 // checksum.
00347 //
00348 // It relies on header->elementCount being set to
00349 // the data length loaded so far ... not the ultimate
00350 // reference length.
00351 //
00352 bool GenomeSequence::setChromosomeMD5andLength(uint32_t whichChromosome)
00353 {
00354     if (whichChromosome>=header->_chromosomeCount) return true;
00355 
00356     ChromosomeInfo *c = &header->_chromosomes[whichChromosome];
00357     c->size = header->elementCount - c->start;
00358 
00359 #if !defined(WIN32)
00360     MD5_CTX md5Context;
00361     uint8_t md5Signature[MD5_DIGEST_LENGTH];
00362 
00363     //
00364     // it's easier to ensure we do this right if we just do it
00365     // in one big chunk:
00366     //
00367     char *md5Buffer = (char *) malloc(c->size);
00368 
00369     MD5_Init(&md5Context);
00370 
00371     for (genomeIndex_t i = 0; i < c->size; i ++)
00372     {
00373         md5Buffer[i] = (*this)[c->start + i];
00374     }
00375     MD5_Update(&md5Context, md5Buffer, c->size);
00376     MD5_Final((unsigned char *) &md5Signature, &md5Context);
00377     free(md5Buffer);
00378     for (int i=0; i<MD5_DIGEST_LENGTH; i++)
00379     {
00380         // cheesy but works:
00381         sprintf(c->md5+2*i, "%02x", md5Signature[i]);
00382     }
00383     // redundant, strictly speaking due to sprintf NUL terminating
00384     // it's output strings, but put it here anyway.
00385     c->md5[2*MD5_DIGEST_LENGTH] = '\0';
00386 #else
00387     memset(c->md5, 0, 2*MD5_DIGEST_LENGTH);
00388 #endif
00389     return false;
00390 }
00391 
00392 //
00393 // Given a buffer with a fasta format contents, count
00394 // the number of chromsomes in it and return that value.
00395 //
00396 static bool getFastaStats(const char *fastaData, size_t fastaDataSize, uint32_t &chromosomeCount, uint64_t &baseCount)
00397 {
00398     chromosomeCount = 0;
00399     baseCount = 0;
00400     bool atLineStart = true;
00401 
00402     //
00403     // loop over the fasta file, essentially matching for the
00404     // pattern '^>.*$' and counting them.
00405     //
00406     for (size_t fastaIndex = 0; fastaIndex < fastaDataSize; fastaIndex++)
00407     {
00408         switch (fastaData[fastaIndex])
00409         {
00410             case '\n':
00411             case '\r':
00412                 atLineStart = true;
00413                 break;
00414             case '>':
00415             {
00416                 if (!atLineStart) break;
00417                 chromosomeCount++;
00418                 //
00419                 // eat the rest of the line
00420                 //
00421                 while (fastaIndex < fastaDataSize && fastaData[fastaIndex]!='\n' && fastaData[fastaIndex]!='\r')
00422                 {
00423                     fastaIndex++;
00424                 }
00425                 break;
00426             }
00427             default:
00428                 baseCount++;
00429                 atLineStart = false;
00430                 break;
00431         }
00432 
00433     }
00434     return false;
00435 }
00436 
00437 //
00438 // recreate the umfa file from a reference fasta format file
00439 //
00440 // The general format of a FASTA file is best described
00441 // on wikipedia at http://en.wikipedia.org/wiki/FASTA_format
00442 //
00443 // The format parsed here is a simpler subset, and is
00444 // described here http://www.ncbi.nlm.nih.gov/blast/fasta.shtml
00445 //
00446 bool GenomeSequence::create()
00447 {
00448     // quick sanity check...
00449     assert(int2base[base2int[(int) 'A']]=='A');
00450     assert(int2base[base2int[(int) 'a']]=='A');
00451     assert(int2base[base2int[(int) 'T']]=='T');
00452     assert(int2base[base2int[(int) 't']]=='T');
00453     assert(int2base[base2int[(int) 'C']]=='C');
00454     assert(int2base[base2int[(int) 'c']]=='C');
00455     assert(int2base[base2int[(int) 'G']]=='G');
00456     assert(int2base[base2int[(int) 'g']]=='G');
00457     assert(int2base[base2int[(int) 'N']]=='N');
00458     assert(int2base[base2int[(int) 'n']]=='N');
00459     assert(int2base[base2int[(int) 'M']]=='M');
00460     assert(int2base[base2int[(int) 'm']]=='M');
00461 
00462     assert(base2int[(int) 'N']==4);
00463     assert(base2int[(int) 'n']==4);
00464     assert(base2int[(int) 'A']==0);
00465     assert(base2int[(int) 'a']==0);
00466     assert(base2int[(int) 'T']==3);
00467     assert(base2int[(int) 't']==3);
00468     assert(base2int[(int) 'C']==1);
00469     assert(base2int[(int) 'c']==1);
00470     assert(base2int[(int) 'G']==2);
00471     assert(base2int[(int) 'g']==2);
00472 
00473     if (_baseFilename=="")
00474     {
00475         std::cerr << "Base reference filename is empty." << std::endl;
00476         return true;
00477     }
00478 
00479     if (isColorSpace())
00480     {
00481         _umfaFilename = _baseFilename + "-cs.umfa";
00482     }
00483     else
00484     {
00485         _umfaFilename = _baseFilename + "-bs.umfa";
00486     }
00487 
00488     if (access(_umfaFilename.c_str(), R_OK) == 0)
00489     {
00490         std::cerr << "Output file '" << _umfaFilename << "' exists or is not writable - please remove." << std::endl;
00491         return true;
00492     }
00493 
00494     MemoryMap fastaFile;
00495 
00496     if (fastaFile.open(_fastaFilename.c_str()))
00497     {
00498         std::cerr << "failed to open input fasta file '" << _fastaFilename << "'." << std::endl;
00499         return true;
00500     }
00501 
00502     std::cerr << "Creating FASTA "
00503               << (isColorSpace() ? "color space " : "")
00504               << "binary cache file '"
00505               << _umfaFilename
00506               << "'."
00507               << std::endl;
00508 
00509     std::cerr << std::flush;
00510 
00511     //
00512     // simple ptr to fasta data -- just treat the memory map
00513     // as an array of fastaDataSize characters...
00514     //
00515     const char *fasta = (const char *) fastaFile.data;
00516     size_t fastaDataSize = fastaFile.length();
00517 
00518     uint32_t    chromosomeCount = 0;
00519     uint64_t    baseCount = 0;
00520     getFastaStats(fasta, fastaDataSize, chromosomeCount, baseCount);
00521 
00522     if (genomeSequenceArray::create(_umfaFilename.c_str(), baseCount, chromosomeCount))
00523     {
00524         std::cerr << "failed to create '"
00525                   << _umfaFilename
00526                   << "'."
00527                   << std::endl;
00528         perror("");
00529         return true;
00530     }
00531     header->elementCount = 0;
00532     header->_colorSpace = isColorSpace();
00533     header->setApplication(_application.c_str());
00534     header->_chromosomeCount = chromosomeCount;
00535     //
00536     // clear out the variable length chromosome info array
00537     //
00538     for (uint32_t i=0; i<header->_chromosomeCount; i++) header->_chromosomes[i].constructorClear();
00539 
00540     std::string chromosomeName;
00541 
00542     //
00543     // for converting the reference to colorspace, the first base is always 5 (in base space it is 'N')
00544     signed char lastBase = base2int[(int) 'N'];
00545     bool terminateLoad = false;
00546     int percent = -1, newPercent;
00547     uint32_t whichChromosome = 0;
00548     for (uint64_t fastaIndex = 0; fastaIndex < fastaDataSize; fastaIndex++)
00549     {
00550         if (_progressStream)
00551         {
00552             newPercent = (1.0 * fastaIndex / fastaDataSize) * 100;
00553             if (newPercent>percent)
00554             {
00555                 std::cerr << "\r" << newPercent << "% ";
00556                 std::cerr << std::flush;
00557                 percent = newPercent;
00558             }
00559         }
00560         switch (fasta[fastaIndex])
00561         {
00562             case '\n':
00563             case '\r':
00564                 break;
00565             case '>':
00566             {
00567                 chromosomeName = "";
00568                 fastaIndex++;       // skip the > char
00569                 //
00570                 // pull out the chromosome new name
00571                 //
00572                 while (!isspace(fasta[fastaIndex]))
00573                 {
00574                     chromosomeName += fasta[fastaIndex++];  // slow, but who cares
00575                 }
00576                 //
00577                 // eat the rest of the line
00578                 //
00579                 while (fasta[fastaIndex]!='\n' && fasta[fastaIndex]!='\r')
00580                 {
00581                     fastaIndex++;
00582                 }
00583                 //
00584                 // save the Chromosome name and index into our
00585                 // header so we can use them later.
00586                 //
00587                 ChromosomeInfo *c = &header->_chromosomes[whichChromosome];
00588                 c->setChromosomeName(chromosomeName.c_str());
00589                 c->start = header->elementCount;
00590                 // c->size gets computed at the next '>' line or at the EOF
00591 
00592                 if (whichChromosome>0)
00593                 {
00594                     //
00595                     // compute md5 checksum for the chromosome that we just
00596                     // loaded (if there was one) - note that on the last
00597                     // chromosome, we have to duplicate this code after
00598                     // the end of this loop
00599                     //
00600                     setChromosomeMD5andLength(whichChromosome - 1);
00601                 }
00602                 whichChromosome++;
00603                 if (whichChromosome > header->_chromosomeCount)
00604                 {
00605                     std::cerr << "BUG: Exceeded computed chromosome count ("
00606                               << header->_chromosomeCount
00607                               << ") - genome is now truncated at chromosome "
00608                               << header->_chromosomes[header->_chromosomeCount-1].name
00609                               << " (index "
00610                               << header->_chromosomeCount
00611                               << ")."
00612                               << std::endl;
00613                     terminateLoad = true;
00614                 }
00615                 break;
00616             }
00617             default:
00618                 // save the base pair value
00619                 // Note: invalid characters come here as well, but we
00620                 // let ::set deal with mapping them.
00621                 if (isColorSpace())
00622                 {
00623 //
00624 // anything outside these values represents an invalid base
00625 // base codes: 0-> A,    1-> C,     2-> G,      3-> T
00626 // colorspace: 0-> blue, 1-> green, 2-> oragne, 3->red
00627 //
00628                     const char fromBase2CS[] =
00629                     {
00630                         /* 0000 */ 0,   // A->A
00631                         /* 0001 */ 1,   // A->C
00632                         /* 0010 */ 2,   // A->G
00633                         /* 0011 */ 3,   // A->T
00634                         /* 0100 */ 1,   // C->A
00635                         /* 0101 */ 0,   // C->C
00636                         /* 0110 */ 3,   // C->G
00637                         /* 0111 */ 2,   // C->T
00638                         /* 1000 */ 2,   // G->A
00639                         /* 1001 */ 3,   // G->C
00640                         /* 1010 */ 0,   // G->G
00641                         /* 1011 */ 1,   // G->T
00642                         /* 1100 */ 3,   // T->A
00643                         /* 1101 */ 2,   // T->C
00644                         /* 1110 */ 1,   // T->G
00645                         /* 1111 */ 0,   // T->T
00646                     };
00647                     //
00648                     // we are writing color space values on transitions,
00649                     // so we don't write a colorspace value when we
00650                     // get the first base value.
00651                     //
00652                     // On second and subsequent bases, write based on
00653                     // the index table above
00654                     //
00655                     char thisBase = base2int[(int)(fasta[fastaIndex])];
00656                     if (lastBase>=0)
00657                     {
00658                         char color;
00659                         if (lastBase>3 || thisBase>3) color=4;
00660                         else color = fromBase2CS[(int)(lastBase<<2 | thisBase)];
00661                         // re-use the int to base, because ::set expects a base char (ATCG), not
00662                         // a color code (0123).  It should only matter on final output.
00663                         set(header->elementCount++, int2base[(int) color]);
00664                     }
00665                     lastBase = thisBase;
00666                 }
00667                 else
00668                 {
00669                     set(header->elementCount++, toupper(fasta[fastaIndex]));
00670                 }
00671                 break;
00672         }
00673 
00674         //
00675         // slightly awkward exit handling when we exceed the fixed
00676         // number of chromosomes
00677         //
00678         if (terminateLoad) break;
00679     }
00680 
00681     //
00682     // also slightly awkward code to handle the last dangling chromosome...
00683     // all we should need to do is compute the md5 checksum
00684     //
00685     if (whichChromosome==0)
00686     {
00687         fastaFile.close();
00688         throw std::runtime_error("No chromosomes found - aborting!");
00689     }
00690     else
00691     {
00692         setChromosomeMD5andLength(whichChromosome-1);
00693     }
00694 
00695 #if 0
00696     sanityCheck(fastaFile);
00697 #endif
00698     fastaFile.close();
00699 
00700     if (_progressStream) *_progressStream << "\r";
00701 
00702     std::cerr << "FASTA binary cache file '"
00703               << _umfaFilename
00704               << "' created."
00705               << std::endl;
00706 
00707     //
00708     // leave the umfastaFile open in case caller wants to use it
00709     //
00710     return false;
00711 }
00712 
00713 int GenomeSequence::getChromosomeCount()
00714 {
00715     return header->_chromosomeCount;
00716 }
00717 
00718 //return chromosome index: 0, 1, ... 24;
00719 int GenomeSequence::getChromosome(genomeIndex_t position)
00720 {
00721     if (position == INVALID_GENOME_INDEX) return INVALID_CHROMOSOME_INDEX;
00722 
00723     if (header->_chromosomeCount == 0)
00724         return -1;
00725 
00726     int start = 0;
00727     int stop = header->_chromosomeCount - 1;
00728 
00729     // eliminate case where position is in the last chromosome, since the loop
00730     // below falls off the end of the list if it in the last one.
00731 
00732     if (position > header->_chromosomes[stop].start)
00733         return (stop);
00734 
00735     while (start <= stop)
00736     {
00737         int middle = (start + stop) / 2;
00738 
00739         if (position >= header->_chromosomes[middle].start && position < header->_chromosomes[middle + 1].start)
00740             return middle;
00741 
00742         if (position == header->_chromosomes[middle + 1].start)
00743             return (middle + 1);
00744 
00745         if (position > header->_chromosomes[middle + 1].start)
00746             start = middle + 1;
00747 
00748         if (position < header->_chromosomes[middle].start)
00749             stop = middle - 1;
00750     }
00751 
00752     return -1;
00753 }
00754 
00755 //
00756 // Given a chromosome name and 1-based chromosome index, return the
00757 // genome index (0 based) into sequence for it.
00758 //
00759 // NB: the header->chromosomes array contains zero based genome positions
00760 //
00761 genomeIndex_t GenomeSequence::getGenomePosition(
00762     const char *chromosomeName,
00763     unsigned int chromosomeIndex)
00764 {
00765     genomeIndex_t i = getGenomePosition(chromosomeName);
00766     if (i == INVALID_GENOME_INDEX) return INVALID_GENOME_INDEX;
00767     return i + chromosomeIndex - 1;
00768 }
00769 
00770 genomeIndex_t GenomeSequence::getGenomePosition(
00771     int chromosome,
00772     unsigned int chromosomeIndex)
00773 {
00774     if (chromosome<0 || chromosome >= (int) header->_chromosomeCount) return INVALID_GENOME_INDEX;
00775 
00776     genomeIndex_t i = header->_chromosomes[chromosome].start;
00777     if (i == INVALID_GENOME_INDEX) return INVALID_GENOME_INDEX;
00778     return i + chromosomeIndex - 1;
00779 }
00780 
00781 //
00782 // return the genome index (0 based) of the start of the named
00783 // chromosome.  If none is found, INVALID_GENOME_INDEX is returned.
00784 //
00785 // XXX may need to speed this up - and smarten it up with some
00786 // modest chromosome name parsing.... e.g. '%d/X/Y' or 'chr%d/chrX/chrY' or
00787 // other schemes.
00788 //
00789 genomeIndex_t GenomeSequence::getGenomePosition(const char *chromosomeName)
00790 {
00791     int chromosome = getChromosome(chromosomeName);
00792     if (chromosome==INVALID_CHROMOSOME_INDEX) return INVALID_GENOME_INDEX;
00793     return header->_chromosomes[chromosome].start;
00794 }
00795 
00796 int GenomeSequence::getChromosome(const char *chromosomeName)
00797 {
00798     unsigned int i;
00799     for (i=0; i<header->_chromosomeCount; i++)
00800     {
00801         if (strcmp(header->_chromosomes[i].name, chromosomeName)==0)
00802         {
00803             return i;
00804         }
00805     }
00806     return INVALID_CHROMOSOME_INDEX;
00807 }
00808 
00809 //
00810 // Given a read, reverse the string and swap the base
00811 // pairs for the reverse strand equivalents.
00812 //
00813 void GenomeSequence::getReverseRead(std::string &read)
00814 {
00815     std::string newRead;
00816     if (read.size()) for (int32_t i=(int) read.size() - 1; i>=0; i--)
00817         {
00818             newRead.push_back(BasePair(read[i]));
00819         }
00820     read = newRead;
00821 }
00822 
00823 void GenomeSequence::getReverseRead(String& read)
00824 {
00825     int i = 0;
00826     int j = read.Length()-1;
00827     char temp;
00828     while (i < j)
00829     {
00830         temp = read[j];
00831         read[j] = read[i];
00832         read[i] = temp;
00833     }
00834 }
00835 
00836 #define ABS(x) ( (x) > 0 ? (x) : -(x) )
00837 int GenomeSequence::debugPrintReadValidation(
00838     std::string &read,
00839     std::string &quality,
00840     char   direction,
00841     genomeIndex_t   readLocation,
00842     int sumQuality,
00843     int mismatchCount,
00844     bool recurse
00845 )
00846 {
00847     int validateSumQ = 0;
00848     int validateMismatchCount = 0;
00849     int rc = 0;
00850     std::string genomeData;
00851 
00852     for (uint32_t i=0; i<read.size(); i++)
00853     {
00854         if (tolower(read[i]) != tolower((*this)[readLocation + i]))
00855         {
00856             validateSumQ += quality[i] - '!';
00857             // XXX no longer valid:
00858             if (direction=='F' ? i<24 : (i >= (read.size() - 24))) validateMismatchCount++;
00859             genomeData.push_back(tolower((*this)[readLocation + i]));
00860         }
00861         else
00862         {
00863             genomeData.push_back(toupper((*this)[readLocation + i]));
00864         }
00865     }
00866     assert(validateSumQ>=0);
00867     if (validateSumQ != sumQuality && validateMismatchCount == mismatchCount)
00868     {
00869         printf("SUMQ: Original Genome: %s  test read: %s : actual sumQ = %d, test sumQ = %d\n",
00870                genomeData.c_str(),
00871                read.c_str(),
00872                validateSumQ,
00873                sumQuality
00874               );
00875         rc++;
00876     }
00877     else if (validateSumQ == sumQuality && validateMismatchCount != mismatchCount)
00878     {
00879         printf("MISM: Original Genome: %s  test read: %s : actual mismatch %d test mismatches %d\n",
00880                genomeData.c_str(),
00881                read.c_str(),
00882                validateMismatchCount,
00883                mismatchCount
00884               );
00885         rc++;
00886     }
00887     else if (validateSumQ != sumQuality && validateMismatchCount != mismatchCount)
00888     {
00889         printf("BOTH: Original Genome: %s  test read: %s : actual sumQ = %d, test sumQ = %d, actual mismatch %d test mismatches %d\n",
00890                genomeData.c_str(),
00891                read.c_str(),
00892                validateSumQ,
00893                sumQuality,
00894                validateMismatchCount,
00895                mismatchCount
00896               );
00897         rc++;
00898     }
00899 
00900     if (recurse && ABS(validateMismatchCount - mismatchCount) > (int) read.size()/2)
00901     {
00902         printf("large mismatch difference, trying reverse strand: ");
00903         std::string reverseRead = read;
00904         std::string reverseQuality = quality;
00905         getReverseRead(reverseRead);
00906         reverse(reverseQuality.begin(), reverseQuality.end());
00907         rc = debugPrintReadValidation(reverseRead, reverseQuality, readLocation, sumQuality, mismatchCount, false);
00908     }
00909     return rc;
00910 }
00911 #undef ABS
00912 
00913 
00914 bool GenomeSequence::wordMatch(unsigned int index, std::string &word)
00915 {
00916     for (uint32_t i = 0; i<word.size(); i++)
00917     {
00918         if ((*this)[index + i] != word[i]) return false;
00919     }
00920     return true;
00921 }
00922 
00923 bool GenomeSequence::printNearbyWords(unsigned int index, unsigned int deviation, std::string &word)
00924 {
00925     for (unsigned int i = index - deviation; i < index + deviation; i++)
00926     {
00927         if (wordMatch(i, word))
00928         {
00929             std::cerr << "word '"
00930                       << word
00931                       << "' found "
00932                       << index - i
00933                       << " away from position "
00934                       << index
00935                       << "."
00936                       << std::endl;
00937         }
00938     }
00939     return false;
00940 }
00941 
00942 void GenomeSequence::dumpSequenceSAMDictionary(std::ostream &file)
00943 {
00944     for (unsigned int i=0; i<header->_chromosomeCount; i++)
00945     {
00946         file
00947         <<  "@SQ"
00948         << "\tSN:" << header->_chromosomes[i].name  // name
00949         << "\tLN:" << header->_chromosomes[i].size  // number of bases
00950         << "\tAS:" << header->_chromosomes[i].assemblyID // e.g. NCBI36.3
00951         << "\tM5:" << header->_chromosomes[i].md5
00952         << "\tUR:" << header->_chromosomes[i].uri
00953         << "\tSP:" << header->_chromosomes[i].species // e.g. Homo_sapiens
00954         << std::endl;
00955     }
00956 }
00957 
00958 void GenomeSequence::dumpHeaderTSV(std::ostream &file)
00959 {
00960     file << "# Reference: " << _baseFilename << std::endl;
00961     file << "# SN: sample name - must be unique" << std::endl;
00962     file << "# AS: assembly name" << std::endl;
00963     file << "# SP: species" << std::endl;
00964     file << "# LN: chromosome/contig length" << std::endl;
00965     file << "# M5: chromosome/contig MD5 checksum" << std::endl;
00966     file << "# LN and M5 are only printed for informational purposes." << std::endl;
00967     file << "# Karma will only set those values when creating the index." << std::endl;
00968     file << "SN" << "\t" << "AS" << "\t" << "SP" << "\t" << "UR" << "\t" << "LN" << "\t" << "M5" << std::endl;
00969     for (unsigned int i=0; i<header->_chromosomeCount; i++)
00970     {
00971         file
00972         << header->_chromosomes[i].name  // name
00973         << "\t" << header->_chromosomes[i].assemblyID // e.g. NCBI36.3
00974         << "\t" << header->_chromosomes[i].uri
00975         << "\t" << header->_chromosomes[i].species // e.g. Homo_sapiens
00976         << "\t" << header->_chromosomes[i].size  // number of bases
00977         << "\t" << header->_chromosomes[i].md5
00978         << std::endl;
00979     }
00980 }
00981 
00982 bool loadHeaderFromTSV(std::istream &input)
00983 {
00984     return false;
00985 }
00986 
00987 
00988 void GenomeSequence::getString(std::string &str, int chromosome, genomeIndex_t index, int baseCount)
00989 {
00990     //
00991     // calculate the genome index for the lazy caller...
00992     //
00993     genomeIndex_t genomeIndex = header->_chromosomes[chromosome].start + index - 1;
00994 
00995     getString(str, genomeIndex, baseCount);
00996 }
00997 
00998 void GenomeSequence::getString(String &str, int chromosome, genomeIndex_t index, int baseCount)
00999 {
01000     std::string string;
01001     this-> getString(string, chromosome, index, baseCount);
01002     str = string.c_str();
01003 }
01004 
01005 void GenomeSequence::getString(std::string &str, genomeIndex_t index, int baseCount)
01006 {
01007     str.clear();
01008     if (baseCount > 0)
01009     {
01010         for (int i=0; i<baseCount; i++)
01011         {
01012             str.push_back((*this)[index + i]);
01013         }
01014     }
01015     else
01016     {
01017         // if caller passed negative basecount, give them
01018         // the read for the 3' end
01019         for (int i=0; i< -baseCount; i++)
01020         {
01021             str.push_back(base2complement[(int)(*this)[index + i]]);
01022         }
01023     }
01024 }
01025 
01026 void GenomeSequence::getString(String &str, genomeIndex_t index, int baseCount)
01027 {
01028     std::string string;
01029     getString(string, index, baseCount);
01030     str = string.c_str();
01031 }
01032 
01033 void GenomeSequence::getHighLightedString(std::string &str, genomeIndex_t index, int baseCount, genomeIndex_t highLightStart, genomeIndex_t highLightEnd)
01034 {
01035     str.clear();
01036     if (baseCount > 0)
01037     {
01038         for (int i=0; i<baseCount; i++)
01039         {
01040             char base = (*this)[index + i];
01041             if (in(index+i, highLightStart, highLightEnd))
01042                 base = tolower(base);
01043             str.push_back(base);
01044         }
01045     }
01046     else
01047     {
01048         // if caller passed negative basecount, give them
01049         // the read for the 3' end
01050         for (int i=0; i< -baseCount; i++)
01051         {
01052             char base = base2complement[(int)(*this)[index + i]];
01053             if (in(index+i, highLightStart, highLightEnd))
01054                 base = tolower(base);
01055             str.push_back(base);
01056         }
01057     }
01058 }
01059 
01060 void GenomeSequence::print30(genomeIndex_t index)
01061 {
01062     std::cout << "index: " << index << "\n";
01063     for (genomeIndex_t i=index-30; i<index+30; i++)
01064         std::cout << (*this)[i];
01065     std::cout << "\n";
01066     for (genomeIndex_t i=index-30; i<index; i++)
01067         std::cout << " ";
01068     std::cout << "^";
01069     std::cout << std::endl;
01070 }
01071 
01072 void GenomeSequence::getMismatchHatString(std::string &result, const std::string &read, genomeIndex_t location)
01073 {
01074     result = "";
01075     for (uint32_t i=0; i < read.size(); i++)
01076     {
01077         if (read[i] == (*this)[location+i])
01078             result.push_back(' ');
01079         else
01080             result.push_back('^');
01081     }
01082 }
01083 
01084 void GenomeSequence::getMismatchString(std::string &result, const std::string &read, genomeIndex_t location)
01085 {
01086     result = "";
01087     for (uint32_t i=0; i < read.size(); i++)
01088     {
01089         if (read[i] == (*this)[location+i])
01090             result.push_back(toupper(read[i]));
01091         else
01092             result.push_back(tolower(read[i]));
01093     }
01094 }
01095 
01096 genomeIndex_t GenomeSequence::simpleLocalAligner(std::string &read, std::string &quality, genomeIndex_t index, int windowSize)
01097 {
01098     int bestScore = 1000000; // either mismatch count or sumQ
01099     genomeIndex_t bestMatchLocation = INVALID_GENOME_INDEX;
01100     for (int i=-windowSize; i<windowSize; i++)
01101     {
01102         int newScore;
01103 
01104         if (i<0 && ((uint32_t) -i) > index) continue;
01105         if (index + i + read.size() >= getNumberBases()) continue;
01106 
01107         if (quality=="")
01108         {
01109             newScore = this->getMismatchCount(read, index + i);
01110         }
01111         else
01112         {
01113             newScore = this->getSumQ(read, quality, index + i);
01114         }
01115         if (newScore < bestScore)
01116         {
01117             bestScore = newScore;
01118             bestMatchLocation = index + i;
01119         }
01120     }
01121     return bestMatchLocation;
01122 }
01123 
01124 std::ostream &operator << (std::ostream &stream, genomeSequenceMmapHeader &h)
01125 {
01126     stream << (MemoryMapArrayHeader &) h;
01127     stream << "chromosomeCount: " << h._chromosomeCount << std::endl;
01128     stream << "isColorSpace: " << h._colorSpace << std::endl;
01129     stream << "chromosomeCount: " << h._chromosomeCount << std::endl;
01130     uint64_t totalSize = 0;
01131     for (uint32_t i=0; i < h._chromosomeCount; i++)
01132     {
01133         totalSize += h._chromosomes[i].size;
01134         stream << "Chromosome Index " << i << " name: " << h._chromosomes[i].name << std::endl;
01135         stream << "Chromosome Index " << i << " whole genome start: " << h._chromosomes[i].start << std::endl;
01136         stream << "Chromosome Index " << i << " whole genome size: " << h._chromosomes[i].size << std::endl;
01137         stream << "Chromosome Index " << i << " md5 checksum: " << h._chromosomes[i].md5 << std::endl;
01138         stream << "Chromosome Index " << i << " assemblyID: " << h._chromosomes[i].assemblyID << std::endl;
01139         stream << "Chromosome Index " << i << " species: " << h._chromosomes[i].species << std::endl;
01140         stream << "Chromosome Index " << i << " URI: " << h._chromosomes[i].uri << std::endl;
01141     }
01142     stream << "Total Genome Size: " << totalSize << " bases."<< std::endl;
01143     if (totalSize != h.elementCount)
01144     {
01145         stream << "Total Genome Size: does not match elementCount!\n";
01146     }
01147 
01148     stream << std::endl;
01149     return stream;
01150 }
01151 
01152 void GenomeSequence::getChromosomeAndIndex(std::string &s, genomeIndex_t i)
01153 {
01154     int whichChromosome = 0;
01155 
01156     whichChromosome = getChromosome(i);
01157 
01158     if (whichChromosome == INVALID_CHROMOSOME_INDEX)
01159     {
01160         s = "invalid genome index";     // TODO include the index in error
01161     }
01162     else
01163     {
01164         std::ostringstream buf;
01165         genomeIndex_t   chromosomeIndex = i - getChromosomeStart(whichChromosome) + 1;
01166         buf << header->_chromosomes[whichChromosome].name  << ":"  << chromosomeIndex;
01167 #if 0
01168         buf << " (GenomeIndex " << i << ")";
01169 #endif
01170         s = buf.str();
01171     }
01172     return;
01173 }
01174 
01175 
01176 void GenomeSequence::getChromosomeAndIndex(String& s, genomeIndex_t i)
01177 {
01178     std::string ss;
01179     getChromosomeAndIndex(ss, i);
01180     s = ss.c_str();
01181     return;
01182 }
01183 
01184 
01185 //
01186 // This is intended to be a helper routine to get dbSNP files
01187 // loaded.  In some cases, we will load into an mmap() file (ie
01188 // when we are creating it), in others, we will simply be loading
01189 // an existing dbSNP file into RAM (when the binary file does not
01190 // exist or when we are running with useMemoryMapFlag == false.
01191 //
01192 // Assume that dbSNP exists, is writable, and is the right size.
01193 //
01194 // Using the dbSNPFilename given, mark each dbSNP position
01195 // with a bool true.
01196 //
01197 // Return value:
01198 //   True: if populateDBSNP() succeed
01199 //   False: if not succeed
01200 bool GenomeSequence::populateDBSNP(
01201     mmapArrayBool_t &dbSNP,
01202     std::ifstream &inputFile)
01203 {
01204     std::string inputLine;
01205 
01206     assert(dbSNP.getElementCount() == getNumberBases());
01207 
01208     std::string chromosomeName;
01209     genomeIndex_t chromosomePosition1;  // 1-based
01210     uint64_t    ignoredLineCount = 0;
01211 
01212     while (getline(inputFile, inputLine))
01213     {
01214         std::istringstream inputLine2(inputLine);
01215         genomeIndex_t genomeIndex;  // 1-based
01216 
01217         inputLine2 >> chromosomeName;
01218         inputLine2 >> chromosomePosition1;
01219 
01220         // check for a comment line
01221         if (chromosomeName.size()>0 && chromosomeName[0]=='#')
01222         {
01223             continue;
01224         }
01225 
01226 #if 0
01227         //
01228         // enable this check if we want only chromosomes
01229         // 1-22.  Some code is sensitive to this, so we may
01230         // need to make it an option.
01231         //
01232         if (!isdigit(chromosomeName[0])) continue;
01233 #endif
01234 
01235         genomeIndex = getGenomePosition(chromosomeName.c_str(), chromosomePosition1);
01236 
01237         // if the genome index is invalid, ignore it
01238         if (genomeIndex == INVALID_GENOME_INDEX)
01239         {
01240             ignoredLineCount++;
01241             continue;
01242         }
01243 
01244         dbSNP.set(genomeIndex, true);
01245     }
01246 
01247     inputFile.close();
01248 
01249     if (ignoredLineCount > 0)
01250     {
01251         std::cerr << "GenomeSequence::populateDBSNP: ignored " << ignoredLineCount << " SNP positions due to invalid format of line." << std::endl;
01252         return false;
01253     }
01254     return true;
01255 }
01256 
01257 bool GenomeSequence::loadDBSNP(
01258     mmapArrayBool_t &dbSNP,
01259     const char *inputFileName)
01260 {
01261     //
01262     // the goal in this section of code is to allow the user
01263     // to either specify a valid binary version of the SNP file,
01264     // or the original text file that it gets created from.
01265     //
01266     // To do this, we basically open, sniff the error message,
01267     // and if it claims it is not a binary version of the file,
01268     // we go ahead and treat it as the text file and use the
01269     // GenomeSequence::populateDBSNP method to load it.
01270     //
01271     // Further checking is really needed to ensure users don't
01272     // mix a dbSNP file for a different reference, since it is really
01273     // easy to do.
01274     //
01275     if (strlen(inputFileName)!=0)
01276     {
01277         std::cerr << "Load dbSNP file '" << inputFileName << "': " << std::flush;
01278 
01279         if (dbSNP.open(inputFileName, O_RDONLY))
01280         {
01281             //
01282             // failed to open, possibly due to bad magic.
01283             //
01284             // this is really awful ... need to have a return
01285             // code that is smart enough to avoid this ugliness:
01286             //
01287             if (dbSNP.getErrorString().find("wrong type of file")==std::string::npos)
01288             {
01289                 std::cerr << "Error: " << dbSNP.getErrorString() << std::endl;
01290                 exit(1);
01291             }
01292             //
01293             // we have a file, assume we can load it as a text file
01294             //
01295             std::ifstream inputFile;
01296             inputFile.open(inputFileName);
01297             if (inputFile.fail())
01298             {
01299                 std::cerr << "Error: failed to open " << inputFileName << std::endl;
01300                 exit(1);
01301             }
01302 
01303             std::cerr << "(as text file) ";
01304 
01305             // anonymously (RAM resident only) create:
01306             dbSNP.create(getNumberBases());
01307 
01308             // now load it into RAM
01309             populateDBSNP(dbSNP, inputFile);
01310             inputFile.close();
01311 
01312         }
01313         else
01314         {
01315             std::cerr << "(as binary mapped file) ";
01316         }
01317 
01318         std::cerr << "DONE!" << std::endl;
01319         return false;
01320     }
01321     else
01322     {
01323         return true;
01324     }
01325 }
01326 
01327 
01328 #if defined(TEST)
01329 
01330 void simplestExample(void)
01331 {
01332     GenomeSequence  reference;
01333     genomeIndex_t   index;
01334 
01335     // a particular reference is set by:
01336     // reference.setFastaName("/usr/cluster/share/karma/human_g1k_v37_12CS.fa")
01337     //
01338     // In the above example, the suffix .fa is stripped and replaced with .umfa,
01339     // which contains the actual file being opened.
01340     //
01341     if (reference.open())
01342     {
01343         perror("GenomeSequence::open");
01344         exit(1);
01345     }
01346 
01347 
01348     index = 1000000000; // 10^9
01349     //
01350     // Write the base at the given index.  Here, index is 0 based,
01351     // and is across the whole genome, as all chromosomes are sequentially
01352     // concatenated, so the allowed range is
01353     //
01354     // 0.. (reference.getChromosomeStart(last) + reference.getChromosomeSize(last))
01355     //
01356     // (where int last = reference.getChromosomeCount() - 1;)
01357     //
01358     std::cout << "base[" << index << "] = " << reference[index] << std::endl;
01359 
01360     //
01361     // Example for finding chromosome and one based chromosome position given
01362     // and absolute position on the genome in 'index':
01363     //
01364     int chr = reference.getChromosome(index);
01365     genomeIndex_t   chrIndex = index - reference.getChromosomeStart(chr) + 1;   // 1-based
01366 
01367     std::cout << "genome index " << index << " corresponds to chromosome " << chr << " position " << chrIndex << std::endl;
01368 
01369     //
01370     // Example for finding an absolute genome index position when the
01371     // chromosome name and one based position are known:
01372     //
01373     const char *chromosomeName = "5";
01374     chr = reference.getChromosome(chromosomeName);     // 0-based
01375     chrIndex = 100000;                      // 1-based
01376 
01377     index = reference.getChromosomeStart(chr) + chrIndex - 1;
01378 
01379     std::cout << "Chromosome '" << chromosomeName << "' position " << chrIndex << " corresponds to genome index position " << index << std::endl;
01380 
01381     reference.close();
01382 }
01383 
01384 void testGenomeSequence(void)
01385 {
01386     GenomeSequence reference;
01387 
01388 #if 0
01389     std::string referenceName = "someotherreference";
01390     if (reference.setFastaName(referenceName))
01391     {
01392         std::cerr << "failed to open reference file "
01393                   << referenceName
01394                   << std::endl;
01395         exit(1);
01396     }
01397 #endif
01398 
01399     std::cerr << "open and prefetch the reference genome: ";
01400 
01401     // open it
01402     if (reference.open())
01403     {
01404         exit(1);
01405     }
01406     std::cerr << "done!" << std::endl;
01407 
01408     //
01409     // For the human genome, genomeIndex ranges from 0 to 3.2x10^9
01410     //
01411     genomeIndex_t   genomeIndex;    // 0 based
01412     unsigned int chromosomeIndex;   // 1 based
01413     unsigned int chromosome;        // 0..23 or so
01414     std::string chromosomeName;
01415 
01416     //
01417     // Here we'll start with a chromosome name, then obtain the genome
01418     // index, and use it to find the base we want:
01419     //
01420     chromosomeName = "2";
01421     chromosomeIndex = 1234567;
01422     // this call is slow (string search for chromsomeName):
01423     genomeIndex = reference.getGenomePosition(chromosomeName.c_str(), chromosomeIndex);
01424     assert(genomeIndex!=INVALID_GENOME_INDEX);
01425     std::cout << "Chromosome " << chromosomeName << ", index ";
01426     std::cout << chromosomeIndex << " contains base " << reference[genomeIndex];
01427     std::cout << " at genome index position " << genomeIndex << std::endl;
01428 
01429     //
01430     // now reverse it - given a genomeIndex from above, find the chromosome
01431     // name and index:
01432     //
01433 
01434     // slow (binary search on genomeIndex):
01435     chromosome = reference.getChromosome(genomeIndex);
01436     unsigned int newChromosomeIndex;
01437     // not slow:
01438     newChromosomeIndex = genomeIndex - reference.getChromosomeStart(chromosome) + 1;
01439 
01440     assert(chromosomeIndex == newChromosomeIndex);
01441 
01442     // more testing... at least test and use PackedRead:
01443     //
01444     PackedRead pr;
01445 
01446     pr.set("ATCGATCG", 0);
01447     assert(pr.size()==8);
01448     assert(pr[0]==GenomeSequence::base2int[(int) 'A']);
01449     assert(pr[1]==GenomeSequence::base2int[(int) 'T']);
01450     assert(pr[2]==GenomeSequence::base2int[(int) 'C']);
01451     assert(pr[3]==GenomeSequence::base2int[(int) 'G']);
01452     pr.set("ATCGATCG", 1);
01453     assert(pr.size()==9);
01454     pr.set("", 0);
01455     assert(pr.size()==0);
01456     pr.set("", 1);
01457     assert(pr.size()==1);
01458     pr.set("", 2);
01459     assert(pr.size()==2);
01460     pr.set("", 3);
01461     assert(pr.size()==3);
01462     assert(pr[0]==GenomeSequence::base2int[(int) 'N']);
01463     assert(pr[1]==GenomeSequence::base2int[(int) 'N']);
01464     assert(pr[2]==GenomeSequence::base2int[(int) 'N']);
01465     pr.set("C", 1);
01466     assert(pr.size()==2);
01467     assert(pr[0]==GenomeSequence::base2int[(int) 'N']);
01468     assert(pr[1]==GenomeSequence::base2int[(int) 'C']);
01469 
01470 }
01471 
01472 //
01473 // After I build libcsg, I compile and run this test code using:
01474 //
01475 // g++ -DTEST -o try GenomeSequence.cpp -L. -lcsg -lm -lz -lssl
01476 // you also may need -fno-rtti
01477 // ./try
01478 //
01479 int main(int argc, const char **argv)
01480 {
01481 #if 1
01482     simplestExample();
01483 #else
01484     testGenomeSequence();
01485 #endif
01486     exit(0);
01487 }
01488 
01489 #endif
Generated on Wed Nov 17 15:38:28 2010 for StatGen Software by  doxygen 1.6.3