Create/Access/Modify/Load Genome Sequences stored as binary mapped files. More...
#include <GenomeSequence.h>
Public Member Functions | |
GenomeSequence () | |
Simple constructor - no implicit file open. | |
void | constructorClear () |
GenomeSequence (std::string &referenceFilename) | |
attempt to open an existing sequence | |
GenomeSequence (const char *referenceFilename) | |
Smarter constructor - attempt to open an existing sequence. | |
~GenomeSequence () | |
Close the file if open and destroy the object. | |
bool | open (bool isColorSpace=false, int flags=O_RDONLY) |
open the reference specified using GenomeSequence::setReferenceName | |
bool | open (const char *filename, int flags=O_RDONLY) |
open the given file as the genome (no filename munging occurs). | |
bool | create () |
bool | create (bool isColor) |
void | setProgressStream (std::ostream &progressStream) |
void | setColorSpace (bool colorSpace) |
void | setSearchCommonFileSuffix (bool searchCommonFileSuffix) |
void | setCreateOverwrite (bool createOverwrite) |
bool | loadFastaData (const char *filename) |
bool | setReferenceName (std::string referenceFilename) |
set the reference name that will be used in open() | |
void | setApplication (std::string application) |
set the application name in the binary file header | |
const std::string & | getFastaName () const |
const std::string & | getReferenceName () const |
bool | isColorSpace () const |
tell us if we are a color space reference or not | |
genomeIndex_t | getNumberBases () const |
return the number of bases represented in this reference | |
int | getChromosome (genomeIndex_t position) const |
given a whole genome index, get the chromosome it is located in | |
int | getChromosome (const char *chromosomeName) const |
given a chromosome name, return the chromosome index | |
int | getChromosomeCount () const |
Return the number of chromosomes in the genome. | |
genomeIndex_t | getChromosomeStart (int chromosomeIndex) const |
given a chromosome, return the genome base it starts in | |
genomeIndex_t | getChromosomeSize (int chromosomeIndex) const |
given a chromosome, return its size in bases | |
genomeIndex_t | getGenomePosition (const char *chromosomeName, unsigned int chromosomeIndex) const |
given a chromosome name and position, return the genome position | |
genomeIndex_t | getGenomePosition (int chromosome, unsigned int chromosomeIndex) const |
given a chromosome index and position, return the genome position | |
genomeIndex_t | getGenomePosition (const char *chromosomeName) const |
given the chromosome name, get the corresponding 0 based genome index for the start of that chromosome | |
genomeIndex_t | getGenomePosition (int chromosomeIndex) const |
const std::string & | getBaseFilename () const |
const char * | getChromosomeName (int chromosomeIndex) const |
void | setDebugFlag (bool d) |
genomeIndex_t | sequenceLength () const |
const char * | chromosomeName (int chr) const |
void | sanityCheck (MemoryMap &fasta) const |
std::string | IntegerToSeq (unsigned int n, unsigned int wordsize) const |
bool | wordMatch (unsigned int index, std::string &word) const |
bool | printNearbyWords (unsigned int index, unsigned int variance, std::string &word) const |
char | BasePair (char c) const |
void | dumpSequenceSAMDictionary (std::ostream &) const |
void | dumpHeaderTSV (std::ostream &) const |
char | operator[] (genomeIndex_t index) const |
Return the bases in base space or color space for within range index, ot. | |
uint8_t | getInteger (genomeIndex_t index) const |
void | set (genomeIndex_t index, char value) |
uint8_t * | getDataPtr (genomeIndex_t index) |
obtain the pointer to the raw data for other access methods | |
void | getReverseRead (std::string &read) |
void | getReverseRead (String &read) |
int | debugPrintReadValidation (std::string &read, std::string &quality, char direction, genomeIndex_t readLocation, int sumQuality, int mismatchCount, bool recurse=true) |
void | getString (std::string &str, int chromosome, genomeIndex_t index, int baseCount) const |
void | getString (String &str, int chromosome, genomeIndex_t index, int baseCount) const |
void | getString (std::string &str, genomeIndex_t index, int baseCount) const |
void | getString (String &str, genomeIndex_t index, int baseCount) const |
void | getHighLightedString (std::string &str, genomeIndex_t index, int baseCount, genomeIndex_t highLightStart, genomeIndex_t highLightEnd) const |
void | print30 (genomeIndex_t) const |
genomeIndex_t | simpleLocalAligner (std::string &read, std::string &quality, genomeIndex_t index, int windowSize) const |
int | getMismatchCount (std::string &read, genomeIndex_t location, char exclude='\0') const |
Return the mismatch count, disregarding CIGAR strings. | |
int | getSumQ (std::string &read, std::string &qualities, genomeIndex_t location) const |
brute force sumQ - no sanity checking | |
void | getMismatchHatString (std::string &result, const std::string &read, genomeIndex_t location) const |
void | getMismatchString (std::string &result, const std::string &read, genomeIndex_t location) const |
void | getChromosomeAndIndex (std::string &, genomeIndex_t) const |
void | getChromosomeAndIndex (String &, genomeIndex_t) const |
bool | checkRead (std::string &read, std::string &qualities, std::string &cigar, int &sumQ, int &gapOpenCount, int &gapExtendCount, int &gapDeleteCount, std::string &result) const |
check a SAM format read, using phred quality scores and the CIGAR string to determine if it is correct. | |
bool | populateDBSNP (mmapArrayBool_t &dbSNP, std::ifstream &inputFile) const |
bool | loadDBSNP (mmapArrayBool_t &dbSNP, const char *inputFileName) const |
user friendly dbSNP loader. | |
Static Public Attributes | |
static const int | baseAIndex = 000 |
static const int | baseTIndex = 001 |
static const int | baseCIndex = 002 |
static const int | baseGIndex = 003 |
static const int | baseNIndex = 004 |
static const int | baseXIndex = 005 |
static unsigned char | base2int [] |
static const char | int2base [] = "ACGTNMXXXXXXXXXX" |
static const char | int2colorSpace [] = "0123NXXXXXXXXXXX" |
static unsigned char | base2complement [] |
Create/Access/Modify/Load Genome Sequences stored as binary mapped files.
GenomeSequence is designed to be a high performance shared access reference object.
It is implemented as a MemoryMapArray template object with unsigned 8 bit ints, each of which stores two bases. Although 2 bits could be used, most references have more than four symbols (usually at least including 'N', indicating an unknown or masked out base).
Normal use of this class follows these steps:
Sharing is accomplished using the mmap() function via the MemoryMap base class. This allows a potentially large genome reference to be shared among a number of simultaneously executing instances of one or more programs sharing the same reference.
Definition at line 238 of file GenomeSequence.h.
GenomeSequence::GenomeSequence | ( | std::string & | referenceFilename | ) | [inline] |
attempt to open an existing sequence
referenceFilename | the name of the reference fasta file to open | |
debug | if true, additional debug information is printed |
Definition at line 287 of file GenomeSequence.h.
GenomeSequence::GenomeSequence | ( | const char * | referenceFilename | ) | [inline] |
Smarter constructor - attempt to open an existing sequence.
referenceFilename | the name of the reference fasta file to open | |
debug | if true, additional debug information is printed |
Definition at line 297 of file GenomeSequence.h.
bool GenomeSequence::checkRead | ( | std::string & | read, | |
std::string & | qualities, | |||
std::string & | cigar, | |||
int & | sumQ, | |||
int & | gapOpenCount, | |||
int & | gapExtendCount, | |||
int & | gapDeleteCount, | |||
std::string & | result | |||
) | const |
check a SAM format read, using phred quality scores and the CIGAR string to determine if it is correct.
read | the read in base space | |
qualities | the phred encoded qualities (Sanger, not Illumina) | |
cigar | the SAM file CIGAR column | |
sumQ | if >0 on entry, is checked against the computed sumQ | |
insertions | count of insertions found in |
int GenomeSequence::getChromosome | ( | const char * | chromosomeName | ) | const |
given a chromosome name, return the chromosome index
This is done via a linear search of the chromosome table in the header of the mapped file, so it is O(N)
chromosomeName | the name of the chromosome - exact match only |
Definition at line 797 of file GenomeSequence.cpp.
int GenomeSequence::getChromosome | ( | genomeIndex_t | position | ) | const |
given a whole genome index, get the chromosome it is located in
This is done via a binary search of the chromosome table in the header of the mapped file, so it is O(log(N))
0-based | position the base in the genome |
Definition at line 720 of file GenomeSequence.cpp.
Referenced by getGenomePosition().
00721 { 00722 if (position == INVALID_GENOME_INDEX) return INVALID_CHROMOSOME_INDEX; 00723 00724 if (header->_chromosomeCount == 0) 00725 return INVALID_CHROMOSOME_INDEX; 00726 00727 int start = 0; 00728 int stop = header->_chromosomeCount - 1; 00729 00730 // eliminate case where position is in the last chromosome, since the loop 00731 // below falls off the end of the list if it in the last one. 00732 00733 if (position > header->_chromosomes[stop].start) 00734 return (stop); 00735 00736 while (start <= stop) 00737 { 00738 int middle = (start + stop) / 2; 00739 00740 if (position >= header->_chromosomes[middle].start && position < header->_chromosomes[middle + 1].start) 00741 return middle; 00742 00743 if (position == header->_chromosomes[middle + 1].start) 00744 return (middle + 1); 00745 00746 if (position > header->_chromosomes[middle + 1].start) 00747 start = middle + 1; 00748 00749 if (position < header->_chromosomes[middle].start) 00750 stop = middle - 1; 00751 } 00752 00753 return -1; 00754 }
int GenomeSequence::getChromosomeCount | ( | ) | const |
Return the number of chromosomes in the genome.
Definition at line 714 of file GenomeSequence.cpp.
genomeIndex_t GenomeSequence::getChromosomeSize | ( | int | chromosomeIndex | ) | const [inline] |
given a chromosome, return its size in bases
0-based | chromosome index |
Definition at line 417 of file GenomeSequence.h.
genomeIndex_t GenomeSequence::getChromosomeStart | ( | int | chromosomeIndex | ) | const [inline] |
given a chromosome, return the genome base it starts in
0-based | chromosome index |
Definition at line 407 of file GenomeSequence.h.
uint8_t* GenomeSequence::getDataPtr | ( | genomeIndex_t | index | ) | [inline] |
obtain the pointer to the raw data for other access methods
this is a fairly ugly hack to reach into the raw genome vector, get the byte that encodes two bases, and return it. This is used by karma ReadIndexer::getSumQ to compare genome matchines by byte (two bases at a time) to speed it up.
Definition at line 563 of file GenomeSequence.h.
genomeIndex_t GenomeSequence::getGenomePosition | ( | int | chromosome, | |
unsigned int | chromosomeIndex | |||
) | const |
given a chromosome index and position, return the genome position
chromosome | index of the chromosome | |
chromosomeIndex | 1-based chromosome position |
Definition at line 771 of file GenomeSequence.cpp.
genomeIndex_t GenomeSequence::getGenomePosition | ( | const char * | chromosomeName, | |
unsigned int | chromosomeIndex | |||
) | const |
given a chromosome name and position, return the genome position
chromosomeName | name of the chromosome - exact match only | |
chromosomeIndex | 1-based chromosome position |
Definition at line 762 of file GenomeSequence.cpp.
Referenced by SamTags::createMDTag(), SamQuerySeqWithRefIter::reset(), SamQuerySeqWithRef::seqWithEquals(), and SamQuerySeqWithRef::seqWithoutEquals().
00765 { 00766 genomeIndex_t i = getGenomePosition(chromosomeName); 00767 if (i == INVALID_GENOME_INDEX) return INVALID_GENOME_INDEX; 00768 return i + chromosomeIndex - 1; 00769 }
int GenomeSequence::getMismatchCount | ( | std::string & | read, | |
genomeIndex_t | location, | |||
char | exclude = '\0' | |||
) | const [inline] |
Return the mismatch count, disregarding CIGAR strings.
read | is the sequence we're counting mismatches in | |
location | is where in the genmoe we start comparing | |
exclude | is a wildcard character (e.g. '.' or 'N') |
Definition at line 625 of file GenomeSequence.h.
genomeIndex_t GenomeSequence::getNumberBases | ( | ) | const [inline] |
return the number of bases represented in this reference
Definition at line 377 of file GenomeSequence.h.
Referenced by loadDBSNP(), and operator[]().
int GenomeSequence::getSumQ | ( | std::string & | read, | |
std::string & | qualities, | |||
genomeIndex_t | location | |||
) | const [inline] |
brute force sumQ - no sanity checking
read | shotgun sequencer read string | |
qualities | phred quality string of same length | |
location | the alignment location to check sumQ |
Definition at line 638 of file GenomeSequence.h.
bool GenomeSequence::isColorSpace | ( | ) | const [inline] |
tell us if we are a color space reference or not
Definition at line 370 of file GenomeSequence.h.
Referenced by operator[]().
bool GenomeSequence::loadDBSNP | ( | mmapArrayBool_t & | dbSNP, | |
const char * | inputFileName | |||
) | const |
user friendly dbSNP loader.
inputFileName | may be empty, point to a text file or a dbSNP vector file |
In all cases, dbSNP is returned the same length as this genome.
When no SNPs are loaded, all values are false.
When a text file is given, the file is parsed with two space separated columns - the first column is the chromosome name, and the second is the 1-based chromosome position of the SNP.
Definition at line 1253 of file GenomeSequence.cpp.
References MemoryMapArray< elementT, indexT, cookieVal, versionVal, accessorFunc, setterFunc, elementCount2BytesFunc, arrayHeaderClass >::create(), getNumberBases(), and MemoryMapArray< elementT, indexT, cookieVal, versionVal, accessorFunc, setterFunc, elementCount2BytesFunc, arrayHeaderClass >::open().
01256 { 01257 // 01258 // the goal in this section of code is to allow the user 01259 // to either specify a valid binary version of the SNP file, 01260 // or the original text file that it gets created from. 01261 // 01262 // To do this, we basically open, sniff the error message, 01263 // and if it claims it is not a binary version of the file, 01264 // we go ahead and treat it as the text file and use the 01265 // GenomeSequence::populateDBSNP method to load it. 01266 // 01267 // Further checking is really needed to ensure users don't 01268 // mix a dbSNP file for a different reference, since it is really 01269 // easy to do. 01270 // 01271 if (strlen(inputFileName)!=0) 01272 { 01273 std::cerr << "Load dbSNP file '" << inputFileName << "': " << std::flush; 01274 01275 if (dbSNP.open(inputFileName, O_RDONLY)) 01276 { 01277 // 01278 // failed to open, possibly due to bad magic. 01279 // 01280 // this is really awful ... need to have a return 01281 // code that is smart enough to avoid this ugliness: 01282 // 01283 if (dbSNP.getErrorString().find("wrong type of file")==std::string::npos) 01284 { 01285 std::cerr << "Error: " << dbSNP.getErrorString() << std::endl; 01286 exit(1); 01287 } 01288 // 01289 // we have a file, assume we can load it as a text file 01290 // 01291 std::ifstream inputFile; 01292 inputFile.open(inputFileName); 01293 if (inputFile.fail()) 01294 { 01295 std::cerr << "Error: failed to open " << inputFileName << std::endl; 01296 exit(1); 01297 } 01298 01299 std::cerr << "(as text file) "; 01300 01301 // anonymously (RAM resident only) create: 01302 dbSNP.create(getNumberBases()); 01303 01304 // now load it into RAM 01305 populateDBSNP(dbSNP, inputFile); 01306 inputFile.close(); 01307 01308 } 01309 else 01310 { 01311 std::cerr << "(as binary mapped file) "; 01312 } 01313 01314 std::cerr << "DONE!" << std::endl; 01315 return false; 01316 } 01317 else 01318 { 01319 return true; 01320 } 01321 }
bool GenomeSequence::open | ( | const char * | filename, | |
int | flags = O_RDONLY | |||
) | [inline, virtual] |
open the given file as the genome (no filename munging occurs).
filename | the name of the file to open | |
flags | pass through to the open() call (O_RDWR lets you modify the contents) |
Reimplemented from MemoryMapArray< elementT, indexT, cookieVal, versionVal, accessorFunc, setterFunc, elementCount2BytesFunc, arrayHeaderClass >.
Definition at line 318 of file GenomeSequence.h.
References open().
00319 { 00320 _umfaFilename = filename; 00321 return genomeSequenceArray::open(filename, flags); 00322 }
bool GenomeSequence::open | ( | bool | isColorSpace = false , |
|
int | flags = O_RDONLY | |||
) |
open the reference specified using GenomeSequence::setReferenceName
isColorSpace | open the color space reference | |
flags | pass through to the open() call (O_RDWR lets you modify the contents) |
Definition at line 250 of file GenomeSequence.cpp.
Referenced by open().
00251 { 00252 bool rc; 00253 00254 if (isColorSpace) 00255 { 00256 _umfaFilename = _baseFilename + "-cs.umfa"; 00257 } 00258 else 00259 { 00260 _umfaFilename = _baseFilename + "-bs.umfa"; 00261 } 00262 00263 rc = genomeSequenceArray::open(_umfaFilename.c_str(), flags); 00264 if (rc) 00265 { 00266 std::cerr << "GenomeSequence::open: failed to open file " 00267 << _umfaFilename 00268 << std::endl; 00269 return true; 00270 } 00271 00272 _colorSpace = header->_colorSpace; 00273 00274 return false; 00275 }
char GenomeSequence::operator[] | ( | genomeIndex_t | index | ) | const [inline] |
Return the bases in base space or color space for within range index, ot.
index | the array-like index (0 based). |
NB: bounds checking here needs to be deprecated - do not assume it will exist - the call must clip reads so that this routine is never called with a index value larger than the genome.
The reason for this is simply that this routine gets called hundreds of billions of time in one run of karma, which will absolutely kill performance. Every single instruction here matters a great, great deal.
Definition at line 522 of file GenomeSequence.h.
References getNumberBases(), and isColorSpace().
00523 { 00524 uint8_t val; 00525 if (index < getNumberBases()) 00526 { 00527 if ((index&1)==0) 00528 { 00529 val = ((uint8_t *) data)[index>>1] & 0xf; 00530 } 00531 else 00532 { 00533 val = (((uint8_t *) data)[index>>1] & 0xf0) >> 4; 00534 } 00535 } 00536 else 00537 { 00538 val = baseNIndex; 00539 } 00540 val = isColorSpace() ? int2colorSpace[val] : int2base[val]; 00541 return val; 00542 }
void GenomeSequence::setApplication | ( | std::string | application | ) | [inline] |
set the application name in the binary file header
application | name of the application |
Definition at line 355 of file GenomeSequence.h.
bool GenomeSequence::setReferenceName | ( | std::string | referenceFilename | ) |
set the reference name that will be used in open()
referenceFilename | the name of the reference fasta file to open |
Definition at line 315 of file GenomeSequence.cpp.
00316 { 00317 00318 if (HAS_SUFFIX(referenceFilename, ".fa")) 00319 { 00320 _referenceFilename = referenceFilename; 00321 _baseFilename = _referenceFilename.substr(0, referenceFilename.size() - 3); 00322 } 00323 else if (HAS_SUFFIX(referenceFilename, ".umfa")) 00324 { 00325 _baseFilename = referenceFilename.substr(0, referenceFilename.size() - 5); 00326 } 00327 else if (HAS_SUFFIX(referenceFilename, "-cs.umfa")) 00328 { 00329 _baseFilename = referenceFilename.substr(0, referenceFilename.size() - 8); 00330 } 00331 else if (HAS_SUFFIX(referenceFilename, "-bs.umfa")) 00332 { 00333 _baseFilename = referenceFilename.substr(0, referenceFilename.size() - 8); 00334 } 00335 else 00336 { 00337 _baseFilename = referenceFilename; 00338 } 00339 _fastaFilename = _baseFilename + ".fa"; 00340 00341 return false; 00342 }
unsigned char GenomeSequence::base2complement [static] |
"NNNNNNNNNNNNNNNN" "NNNNNNNNNNNNNNNN" "NNNNNNNNNNNNNNNN" "0123NNNNNNNNNNNN" "NTNGNNNCNNNNNNNN" "NNNNANNNNNNNNNNN" "NTNGNNNCNNNNNNNN" "NNNNANNNNNNNNNNN" "NNNNNNNNNNNNNNNN" "NNNNNNNNNNNNNNNN" "NNNNNNNNNNNNNNNN" "NNNNNNNNNNNNNNNN" "NNNNNNNNNNNNNNNN" "NNNNNNNNNNNNNNNN" "NNNNNNNNNNNNNNNN" "NNNNNNNNNNNNNNNN"
Definition at line 258 of file GenomeSequence.h.
unsigned char GenomeSequence::base2int [static] |
"\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005" "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005" "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\004\005" "\000\001\002\003\005\005\005\005\005\005\005\005\005\005\005\005" "\005\000\005\001\005\005\005\002\005\005\005\005\005\005\004\005" "\005\005\005\005\003\005\005\005\005\005\005\005\005\005\005\005" "\005\000\005\001\005\005\005\002\005\005\005\005\005\005\004\005" "\005\005\005\005\003\005\005\005\005\005\005\005\005\005\005\005" "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005" "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005" "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005" "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005" "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005" "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005" "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005" "\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005\005"
Definition at line 253 of file GenomeSequence.h.