Classes | |
class | Bin |
class | Reference |
Public Member Functions | |
void | resetIndex () |
Reset the member data for a new index file. | |
SamStatus::Status | readIndex (const char *filename) |
bool | getChunksForRegion (int32_t refID, int32_t start, int32_t end, SortedChunkList &chunkList) |
Get the list of chunks associated with this region. | |
uint64_t | getMaxOffset () const |
bool | getReferenceMinMax (int32_t refID, uint64_t &minOffset, uint64_t &maxOffset) const |
Get the minimum and maximum file offsets for the specfied reference ID. | |
int32_t | getNumRefs () const |
Get the number of references in this index. | |
int32_t | getNumMappedReads (int32_t refID) |
Get the number of mapped reads for this reference id. | |
int32_t | getNumUnMappedReads (int32_t refID) |
Get the number of unmapped reads for this reference id. | |
void | printIndex (int32_t refID, bool summary=false) |
Print the index information. | |
uint64_t | getMinOffsetFromLinearIndex (int32_t refID, uint32_t position) const |
Static Public Attributes | |
static const int32_t | UNKNOWN_NUM_READS = -1 |
The number used for an unknown number of reads. | |
static const int32_t | REF_ID_UNMAPPED = -1 |
The number used for the reference id of unmapped reads. | |
static const int32_t | REF_ID_ALL = -2 |
The number used to indicate that all reference ids should be used. |
Definition at line 61 of file BamIndex.h.
bool BamIndex::getChunksForRegion | ( | int32_t | refID, | |
int32_t | start, | |||
int32_t | end, | |||
SortedChunkList & | chunkList | |||
) |
Get the list of chunks associated with this region.
For an entire reference ID, set start and end to -1. To start at the beginning of the region, set start to 0/-1. To go to the end of the region, set end to -1.
Definition at line 305 of file BamIndex.cpp.
References REF_ID_UNMAPPED.
00307 { 00308 chunkList.clear(); 00309 00310 // If start is >= to end, there will be no sections, return no 00311 // regions. 00312 if((start >= end) && (end != -1)) 00313 { 00314 std::cerr << "Warning, requesting region where start <= end, so " 00315 << "no values will be returned.\n"; 00316 return(false); 00317 } 00318 00319 // Handle REF_ID_UNMAPPED. This uses a default chunk which covers 00320 // from the max offset to the end of the file. 00321 if(refID == REF_ID_UNMAPPED) 00322 { 00323 Chunk refChunk; 00324 // The start of the unmapped region is the max offset found 00325 // in the index file. 00326 refChunk.chunk_beg = getMaxOffset(); 00327 // The end of the unmapped region is the end of the file, so 00328 // set chunk end to the max value. 00329 refChunk.chunk_end = Chunk::MAX_CHUNK_VALUE; 00330 return(chunkList.insert(refChunk)); 00331 } 00332 00333 if((refID < 0) || (refID >= n_ref)) 00334 { 00335 // The specified refID is out of range, return false. 00336 std::cerr << "Warning, requesting refID is out of range, so " 00337 << "no values will be returned.\n"; 00338 return(false); 00339 } 00340 00341 const Reference* ref = &(myRefs[refID]); 00342 00343 // Handle where start/end are defaults. 00344 if(start == -1) 00345 { 00346 if(end == -1) 00347 { 00348 // This is whole chromosome, so take a shortcut. 00349 if(ref->maxChunkOffset == 0) 00350 { 00351 // No chunks for this region, but this is not an error. 00352 return(true); 00353 } 00354 Chunk refChunk; 00355 refChunk.chunk_beg = ref->minChunkOffset; 00356 refChunk.chunk_end = ref->maxChunkOffset; 00357 return(chunkList.insert(refChunk)); 00358 } 00359 else 00360 { 00361 start = 0; 00362 } 00363 } 00364 if(end == -1) 00365 { 00366 // MAX_POSITION is inclusive, but end is exclusive, so add 1. 00367 end = MAX_POSITION + 1; 00368 } 00369 00370 // Determine the minimum offset for the given start position. This 00371 // is done by using the linear index for the specified start position. 00372 uint64_t minOffset = getMinOffsetFromLinearIndex(refID, start); 00373 00374 uint16_t binInRangeList[MAX_NUM_BINS + 1]; 00375 00376 int numBins = getBinsForRegion(start, end, binInRangeList); 00377 00378 // loop through the bins in the range and get the chunks. 00379 for(int i = 0; i < numBins; ++i) 00380 { 00381 int binNum = binInRangeList[i]; 00382 const Bin* bin = &(ref->bins[binNum]); 00383 00384 // Add each chunk in the bin to the map. 00385 for(int chunkIndex = 0; chunkIndex < bin->n_chunk; chunkIndex++) 00386 { 00387 // If the end of the chunk is less than the minimum offset 00388 // for the 16K block that starts our region, then no 00389 // records in this chunk will cross our region, so do 00390 // not add it to the chunks we need to use. 00391 if(bin->chunks[chunkIndex].chunk_end < minOffset) 00392 { 00393 continue; 00394 } 00395 // Add the chunk to the map. 00396 if(!chunkList.insert(bin->chunks[chunkIndex])) 00397 { 00398 // Failed to add to the map, return false. 00399 std::cerr << "Warning, Failed to add a chunk, so " 00400 << "no values will be returned.\n"; 00401 return(false); 00402 } 00403 } 00404 } 00405 00406 // Now that all chunks have been added to the list, 00407 // handle overlapping chunks. 00408 return(chunkList.mergeOverlapping()); 00409 }
int32_t BamIndex::getNumMappedReads | ( | int32_t | refID | ) |
Get the number of mapped reads for this reference id.
Returns -1 for out of range refIDs.
refID | reference ID for which to extract the number of mapped reads. |
Definition at line 445 of file BamIndex.cpp.
References REF_ID_UNMAPPED.
Referenced by SamFile::getNumMappedReadsFromIndex().
00446 { 00447 // If it is the reference id of unmapped reads, return 00448 // that there are no mapped reads. 00449 if(refID == REF_ID_UNMAPPED) 00450 { 00451 // These are by definition all unmapped reads. 00452 return(0); 00453 } 00454 00455 if((refID < 0) || (refID >= (int32_t)myRefs.size())) 00456 { 00457 // Reference ID is out of range for this index file. 00458 return(-1); 00459 } 00460 00461 // Get this reference. 00462 return(myRefs[refID].n_mapped); 00463 }
int32_t BamIndex::getNumRefs | ( | ) | const |
Get the number of references in this index.
Definition at line 437 of file BamIndex.cpp.
int32_t BamIndex::getNumUnMappedReads | ( | int32_t | refID | ) |
Get the number of unmapped reads for this reference id.
Returns -1 for out of range refIDs.
refID | reference ID for which to extract the number of unmapped reads. |
Definition at line 467 of file BamIndex.cpp.
References REF_ID_UNMAPPED.
Referenced by SamFile::getNumUnMappedReadsFromIndex().
00468 { 00469 // If it is the reference id of unmapped reads, return 00470 // that value. 00471 if(refID == REF_ID_UNMAPPED) 00472 { 00473 return(myUnMappedNumReads); 00474 } 00475 00476 if((refID < 0) || (refID >= (int32_t)myRefs.size())) 00477 { 00478 // Reference ID is out of range for this index file. 00479 return(-1); 00480 } 00481 00482 // Get this reference. 00483 return(myRefs[refID].n_unmapped); 00484 }
bool BamIndex::getReferenceMinMax | ( | int32_t | refID, | |
uint64_t & | minOffset, | |||
uint64_t & | maxOffset | |||
) | const |
Get the minimum and maximum file offsets for the specfied reference ID.
refID | the reference ID to locate in the file. | |
minOffset | returns the min file offset for the specified reference | |
maxOffset | returns the max file offset for the specified reference |
Definition at line 419 of file BamIndex.cpp.
00422 { 00423 if((refID < 0) || (refID >= (int32_t)myRefs.size())) 00424 { 00425 // Reference ID is out of range for this index file. 00426 return(false); 00427 } 00428 00429 // Get this reference. 00430 minOffset = myRefs[refID].minChunkOffset; 00431 maxOffset = myRefs[refID].maxChunkOffset; 00432 return(true); 00433 }
void BamIndex::printIndex | ( | int32_t | refID, | |
bool | summary = false | |||
) |
Print the index information.
refID | reference ID for which to print info for. -1 means print for all references. | |
summary | whether or not to just print a summary (defaults to false). The summary just contains summary info for each reference and not every bin/chunk. |
Definition at line 488 of file BamIndex.cpp.
00489 { 00490 std::cout << "BAM Index: " << std::endl; 00491 std::cout << "# Reference Sequences: " << n_ref << std::endl; 00492 00493 unsigned int startRef = 0; 00494 unsigned int endRef = myRefs.size() - 1; 00495 std::vector<Reference> refsToProcess; 00496 if(refID != -1) 00497 { 00498 // Set start and end ref to the specified reference id. 00499 startRef = refID; 00500 endRef = refID; 00501 } 00502 00503 // Print out the information for each bin. 00504 for(unsigned int i = startRef; i <= endRef; ++i) 00505 { 00506 std::cout << std::dec 00507 << "\tReference ID: " << std::setw(4) << i 00508 << "; #Bins: "<< std::setw(6) << myRefs[i].n_bin 00509 << "; #Linear Index Entries: " 00510 << std::setw(6) << myRefs[i].n_intv 00511 << "; Min Chunk Offset: " 00512 << std::setw(18) << std::hex << std::showbase << myRefs[i].minChunkOffset 00513 << "; Max Chunk Offset: " 00514 << std::setw(18) << myRefs[i].maxChunkOffset 00515 << std::dec; 00516 // Print the mapped/unmapped if set. 00517 if(myRefs[i].n_mapped != Reference::UNKNOWN_MAP_INFO) 00518 { 00519 std::cout << "; " << myRefs[i].n_mapped << " Mapped Reads"; 00520 } 00521 if(myRefs[i].n_mapped != Reference::UNKNOWN_MAP_INFO) 00522 { 00523 std::cout << "; " << myRefs[i].n_unmapped << " Unmapped Reads"; 00524 } 00525 std::cout << std::endl; 00526 00527 // Only print more details if not summary. 00528 if(!summary) 00529 { 00530 std::vector<Bin>::iterator binIter; 00531 for(binIter = myRefs[i].bins.begin(); 00532 binIter != myRefs[i].bins.end(); 00533 ++binIter) 00534 { 00535 Bin* binPtr = &(*binIter); 00536 if(binPtr->bin == Bin::NOT_USED_BIN) 00537 { 00538 // This bin is not used, continue. 00539 continue; 00540 } 00541 // Print the bin info. 00542 std::cout << "\t\t\tBin Name: " << binPtr->bin << std::endl; 00543 std::cout << "\t\t\t# Chunks: " << binPtr->n_chunk << std::endl; 00544 std::cout << std::hex << std::showbase; 00545 00546 for(int chunkIndex = 0; chunkIndex < binPtr->n_chunk; 00547 ++chunkIndex) 00548 { 00549 // If this is the last chunk of the MAX_NUM_BINS - it 00550 // contains a mapped/unmapped count rather than the regular 00551 // chunk addresses. 00552 if((binPtr->bin != MAX_NUM_BINS) || 00553 (chunkIndex != (binPtr->n_chunk - 1))) 00554 { 00555 std::cout << "\t\t\t\tchunk_beg: " 00556 << binPtr->chunks[chunkIndex].chunk_beg 00557 << std::endl; 00558 std::cout << "\t\t\t\tchunk_end: " 00559 << binPtr->chunks[chunkIndex].chunk_end 00560 << std::endl; 00561 } 00562 } 00563 } 00564 std::cout << std::dec; 00565 00566 // Print the linear index. 00567 for(int linearIndex = 0; linearIndex < myRefs[i].n_intv; 00568 ++linearIndex) 00569 { 00570 if(myRefs[i].ioffsets[linearIndex] != 0) 00571 { 00572 std::cout << "\t\t\tLinearIndex[" 00573 << std::dec << linearIndex << "] Offset: " 00574 << std::hex << myRefs[i].ioffsets[linearIndex] 00575 << std::endl; 00576 } 00577 } 00578 } 00579 } 00580 }
SamStatus::Status BamIndex::readIndex | ( | const char * | filename | ) |
filename | the bam index file to be read. |
Definition at line 142 of file BamIndex.cpp.
References SamStatus::FAIL_IO, SamStatus::FAIL_PARSE, ifopen(), ifread(), resetIndex(), and SamStatus::SUCCESS.
Referenced by SamFile::ReadBamIndex().
00143 { 00144 // Reset the index from anything that may previously be set. 00145 resetIndex(); 00146 00147 IFILE indexFile = ifopen(filename, "rb"); 00148 00149 // Failed to open the index file. 00150 if(indexFile == NULL) 00151 { 00152 return(SamStatus::FAIL_IO); 00153 } 00154 00155 // generate the bam index structure. 00156 00157 // Read the magic string. 00158 char magic[4]; 00159 if(ifread(indexFile, magic, 4) != 4) 00160 { 00161 // Failed to read the magic 00162 return(SamStatus::FAIL_IO); 00163 } 00164 00165 // If this is not an index file, set num references to 0. 00166 if (magic[0] != 'B' || magic[1] != 'A' || magic[2] != 'I' || magic[3] != 1) 00167 { 00168 // Not a BAM Index file. 00169 return(SamStatus::FAIL_PARSE); 00170 } 00171 00172 // It is a bam index file. 00173 // Read the number of reference sequences. 00174 if(ifread(indexFile, &n_ref, 4) != 4) 00175 { 00176 // Failed to read. 00177 return(SamStatus::FAIL_IO); 00178 } 00179 00180 // Size the references. 00181 myRefs.resize(n_ref); 00182 00183 for(int refIndex = 0; refIndex < n_ref; refIndex++) 00184 { 00185 // Read each reference. 00186 Reference* ref = &(myRefs[refIndex]); 00187 00188 // Resize the bins so they can be indexed by bin number. 00189 ref->bins.resize(MAX_NUM_BINS + 1); 00190 00191 // Read the number of bins. 00192 if(ifread(indexFile, &(ref->n_bin), 4) != 4) 00193 { 00194 // Failed to read the number of bins. 00195 // Return failure. 00196 return(SamStatus::FAIL_PARSE); 00197 } 00198 00199 // If there are no bins, then there are no 00200 // mapped/unmapped reads. 00201 if(ref->n_bin == 0) 00202 { 00203 ref->n_mapped = 0; 00204 ref->n_unmapped = 0; 00205 } 00206 00207 // Read each bin. 00208 for(int binIndex = 0; binIndex < ref->n_bin; binIndex++) 00209 { 00210 uint32_t binNumber; 00211 00212 // Read in the bin number. 00213 if(ifread(indexFile, &(binNumber), 4) != 4) 00214 { 00215 // Failed to read the bin number. 00216 // Return failure. 00217 return(SamStatus::FAIL_IO); 00218 } 00219 00220 // Add the bin to the reference and get the 00221 // pointer back so the values can be set in it. 00222 Bin* binPtr = &(ref->bins[binNumber]); 00223 binPtr->bin = binNumber; 00224 00225 // Read in the number of chunks. 00226 if(ifread(indexFile, &(binPtr->n_chunk), 4) != 4) 00227 { 00228 // Failed to read number of chunks. 00229 // Return failure. 00230 return(SamStatus::FAIL_IO); 00231 } 00232 00233 // Read in the chunks. 00234 // Allocate space for the chunks. 00235 uint32_t sizeOfChunkList = binPtr->n_chunk * sizeof(Chunk); 00236 binPtr->chunks = (Chunk*)malloc(sizeOfChunkList); 00237 if(ifread(indexFile, binPtr->chunks, sizeOfChunkList) != sizeOfChunkList) 00238 { 00239 // Failed to read the chunks. 00240 // Return failure. 00241 return(SamStatus::FAIL_IO); 00242 } 00243 00244 // Determine the min/max for this bin if it is not the max bin. 00245 if(binPtr->bin != MAX_NUM_BINS) 00246 { 00247 for(int i = 0; i < binPtr->n_chunk; i++) 00248 { 00249 if(binPtr->chunks[i].chunk_beg < ref->minChunkOffset) 00250 { 00251 ref->minChunkOffset = binPtr->chunks[i].chunk_beg; 00252 } 00253 if(binPtr->chunks[i].chunk_end > ref->maxChunkOffset) 00254 { 00255 ref->maxChunkOffset = binPtr->chunks[i].chunk_end; 00256 } 00257 if(binPtr->chunks[i].chunk_end > maxOverallOffset) 00258 { 00259 maxOverallOffset = binPtr->chunks[i].chunk_end; 00260 } 00261 } 00262 } 00263 else 00264 { 00265 // Mapped/unmapped are the last chunk of the 00266 // MAX BIN 00267 ref->n_mapped = binPtr->chunks[binPtr->n_chunk - 1].chunk_beg; 00268 ref->n_unmapped = binPtr->chunks[binPtr->n_chunk - 1].chunk_end; 00269 } 00270 } 00271 00272 // Read the number of intervals. 00273 if(ifread(indexFile, &(ref->n_intv), 4) != 4) 00274 { 00275 // Failed to read, set to 0. 00276 ref->n_intv = 0; 00277 // Return failure. 00278 return(SamStatus::FAIL_IO); 00279 } 00280 00281 // Allocate space for the intervals and read them. 00282 uint32_t linearIndexSize = ref->n_intv * sizeof(uint64_t); 00283 ref->ioffsets = (uint64_t*)malloc(linearIndexSize); 00284 if(ifread(indexFile, ref->ioffsets, linearIndexSize) != linearIndexSize) 00285 { 00286 // Failed to read the linear index. 00287 // Return failure. 00288 return(SamStatus::FAIL_IO); 00289 } 00290 } 00291 00292 int32_t numUnmapped = 0; 00293 if(ifread(indexFile, &numUnmapped, sizeof(int32_t)) == sizeof(int32_t)) 00294 { 00295 myUnMappedNumReads = numUnmapped; 00296 } 00297 00298 // Successfully read teh bam index file. 00299 return(SamStatus::SUCCESS); 00300 }