Classes | |
class | Bin |
class | Reference |
Public Member Functions | |
void | resetIndex () |
Reset the member data for a new index file. | |
SamStatus::Status | readIndex (const char *filename) |
bool | getChunksForRegion (int32_t refID, int32_t start, int32_t end, SortedChunkList &chunkList) |
Get the list of chunks associated with this region. | |
uint64_t | getMaxOffset () const |
bool | getReferenceMinMax (int32_t refID, uint64_t &minOffset, uint64_t &maxOffset) const |
Get the minimum and maximum file offsets for the specfied reference ID. | |
int32_t | getNumRefs () const |
Get the number of references in this index. | |
void | printIndex (int32_t refID, bool summary=false) |
Print the index information. | |
Static Public Attributes | |
static const int32_t | REF_ID_UNMAPPED = -1 |
The number used for the reference id of unmapped reads. | |
static const int32_t | REF_ID_ALL = -2 |
The number used to indicate that all reference ids should be used. |
Definition at line 61 of file BamIndex.h.
bool BamIndex::getChunksForRegion | ( | int32_t | refID, | |
int32_t | start, | |||
int32_t | end, | |||
SortedChunkList & | chunkList | |||
) |
Get the list of chunks associated with this region.
For an entire reference ID, set start and end to -1. To start at the beginning of the region, set start to 0/-1. To go to the end of the region, set end to -1.
Definition at line 289 of file BamIndex.cpp.
References REF_ID_UNMAPPED.
00291 { 00292 chunkList.clear(); 00293 00294 // If start is >= to end, there will be no sections, return no 00295 // regions. 00296 if((start >= end) && (end != -1)) 00297 { 00298 std::cerr << "Warning, requesting region where start <= end, so " 00299 << "no values will be returned.\n"; 00300 return(false); 00301 } 00302 00303 // Handle REF_ID_UNMAPPED. This uses a default chunk which covers 00304 // from the max offset to the end of the file. 00305 if(refID == REF_ID_UNMAPPED) 00306 { 00307 Chunk refChunk; 00308 // The start of the unmapped region is the max offset found 00309 // in the index file. 00310 refChunk.chunk_beg = getMaxOffset(); 00311 // The end of the unmapped region is the end of the file, so 00312 // set chunk end to the max value. 00313 refChunk.chunk_end = Chunk::MAX_CHUNK_VALUE; 00314 return(chunkList.insert(refChunk)); 00315 } 00316 00317 if((refID < 0) || (refID >= n_ref)) 00318 { 00319 // The specified refID is out of range, return false. 00320 return(false); 00321 } 00322 00323 const Reference* ref = &(myRefs[refID]); 00324 00325 // Handle where start/end are defaults. 00326 if(start == -1) 00327 { 00328 if(end == -1) 00329 { 00330 // This is whole chromosome, so take a shortcut. 00331 if(ref->maxChunkOffset == 0) 00332 { 00333 // No chunks for this region, but this is not an error. 00334 return(true); 00335 } 00336 Chunk refChunk; 00337 refChunk.chunk_beg = ref->minChunkOffset; 00338 refChunk.chunk_end = ref->maxChunkOffset; 00339 return(chunkList.insert(refChunk)); 00340 } 00341 else 00342 { 00343 start = 0; 00344 } 00345 } 00346 if(end == -1) 00347 { 00348 // MAX_POSITION is inclusive, but end is exclusive, so add 1. 00349 end = MAX_POSITION + 1; 00350 } 00351 00352 // Determine the minimum offset for the given start position. This 00353 // is done by using the linear index for the specified start position. 00354 uint64_t minOffset = getMinOffsetFromLinearIndex(refID, start); 00355 00356 uint16_t binInRangeList[MAX_NUM_BINS + 1]; 00357 00358 int numBins = getBinsForRegion(start, end, binInRangeList); 00359 00360 // loop through the bins in the range and get the chunks. 00361 for(int i = 0; i < numBins; ++i) 00362 { 00363 int binNum = binInRangeList[i]; 00364 const Bin* bin = &(ref->bins[binNum]); 00365 00366 // Add each chunk in the bin to the map. 00367 for(int chunkIndex = 0; chunkIndex < bin->n_chunk; chunkIndex++) 00368 { 00369 // If the end of the chunk is less than the minimum offset 00370 // for the 16K block that starts our region, then no 00371 // records in this chunk will cross our region, so do 00372 // not add it to the chunks we need to use. 00373 if(bin->chunks[chunkIndex].chunk_end < minOffset) 00374 { 00375 continue; 00376 } 00377 // Add the chunk to the map. 00378 if(!chunkList.insert(bin->chunks[chunkIndex])) 00379 { 00380 // Failed to add to the map, return false. 00381 return(false); 00382 } 00383 } 00384 } 00385 00386 // Now that all chunks have been added to the list, 00387 // handle overlapping chunks. 00388 return(chunkList.mergeOverlapping()); 00389 }
int32_t BamIndex::getNumRefs | ( | ) | const |
Get the number of references in this index.
Definition at line 417 of file BamIndex.cpp.
bool BamIndex::getReferenceMinMax | ( | int32_t | refID, | |
uint64_t & | minOffset, | |||
uint64_t & | maxOffset | |||
) | const |
Get the minimum and maximum file offsets for the specfied reference ID.
refID | the reference ID to locate in the file. | |
minOffset | returns the min file offset for the specified reference | |
maxOffset | returns the max file offset for the specified reference |
Definition at line 399 of file BamIndex.cpp.
00402 { 00403 if((refID < 0) || (refID >= (int32_t)myRefs.size())) 00404 { 00405 // Reference ID is out of range for this index file. 00406 return(false); 00407 } 00408 00409 // Get this reference. 00410 minOffset = myRefs[refID].minChunkOffset; 00411 maxOffset = myRefs[refID].maxChunkOffset; 00412 return(true); 00413 }
void BamIndex::printIndex | ( | int32_t | refID, | |
bool | summary = false | |||
) |
Print the index information.
refID | reference ID for which to print info for. -1 means print for all references. | |
summary | whether or not to just print a summary (defaults to false). The summary just contains summary info for each reference and not every bin/chunk. |
Definition at line 425 of file BamIndex.cpp.
00426 { 00427 std::cout << "BAM Index: " << std::endl; 00428 std::cout << "# Reference Sequences: " << n_ref << std::endl; 00429 00430 unsigned int startRef = 0; 00431 unsigned int endRef = myRefs.size() - 1; 00432 std::vector<Reference> refsToProcess; 00433 if(refID != -1) 00434 { 00435 // Set start and end ref to the specified reference id. 00436 startRef = refID; 00437 endRef = refID; 00438 } 00439 00440 // Print out the information for each bin. 00441 for(unsigned int i = startRef; i <= endRef; ++i) 00442 { 00443 std::cout << std::dec 00444 << "\tReference ID: " << std::setw(4) << i 00445 << "; #Bins: "<< std::setw(6) << myRefs[i].n_bin 00446 << "; #Linear Index Entries: " 00447 << std::setw(6) << myRefs[i].n_intv 00448 << "; Min Chunk Offset: " 00449 << std::setw(18) << std::hex << std::showbase << myRefs[i].minChunkOffset 00450 << "; Max Chunk Offset: " 00451 << std::setw(18) << myRefs[i].maxChunkOffset 00452 << std::dec; 00453 // Print the mapped/unmapped if set. 00454 if(myRefs[i].n_mapped != Reference::UNKNOWN_MAP_INFO) 00455 { 00456 std::cout << "; " << myRefs[i].n_mapped << " Mapped Reads"; 00457 } 00458 if(myRefs[i].n_mapped != Reference::UNKNOWN_MAP_INFO) 00459 { 00460 std::cout << "; " << myRefs[i].n_unmapped << " Unmapped Reads"; 00461 } 00462 std::cout << std::endl; 00463 00464 // Only print more details if not summary. 00465 if(!summary) 00466 { 00467 std::vector<Bin>::iterator binIter; 00468 for(binIter = myRefs[i].bins.begin(); 00469 binIter != myRefs[i].bins.end(); 00470 ++binIter) 00471 { 00472 Bin* binPtr = &(*binIter); 00473 if(binPtr->bin == Bin::NOT_USED_BIN) 00474 { 00475 // This bin is not used, continue. 00476 continue; 00477 } 00478 // Print the bin info. 00479 std::cout << "\t\t\tBin Name: " << binPtr->bin << std::endl; 00480 std::cout << "\t\t\t# Chunks: " << binPtr->n_chunk << std::endl; 00481 std::cout << std::hex << std::showbase; 00482 00483 for(int chunkIndex = 0; chunkIndex < binPtr->n_chunk; 00484 ++chunkIndex) 00485 { 00486 // If this is the last chunk of the MAX_NUM_BINS - it 00487 // contains a mapped/unmapped count rather than the regular 00488 // chunk addresses. 00489 if((binPtr->bin != MAX_NUM_BINS) || 00490 (chunkIndex != (binPtr->n_chunk - 1))) 00491 { 00492 std::cout << "\t\t\t\tchunk_beg: " 00493 << binPtr->chunks[chunkIndex].chunk_beg 00494 << std::endl; 00495 std::cout << "\t\t\t\tchunk_end: " 00496 << binPtr->chunks[chunkIndex].chunk_end 00497 << std::endl; 00498 } 00499 } 00500 } 00501 std::cout << std::dec; 00502 00503 // Print the linear index. 00504 for(int linearIndex = 0; linearIndex < myRefs[i].n_intv; 00505 ++linearIndex) 00506 { 00507 if(myRefs[i].ioffsets[linearIndex] != 0) 00508 { 00509 std::cout << "\t\t\tLinearIndex[" 00510 << std::dec << linearIndex << "] Offset: " 00511 << std::hex << myRefs[i].ioffsets[linearIndex] 00512 << std::endl; 00513 } 00514 } 00515 } 00516 } 00517 }
SamStatus::Status BamIndex::readIndex | ( | const char * | filename | ) |
filename | the bam index file to be read. |
Definition at line 140 of file BamIndex.cpp.
References resetIndex().
Referenced by SamFile::ReadBamIndex().
00141 { 00142 // Reset the index from anything that may previously be set. 00143 resetIndex(); 00144 00145 IFILE indexFile = ifopen(filename, "rb"); 00146 00147 // Failed to opren the index file. 00148 if(indexFile == NULL) 00149 { 00150 return(SamStatus::FAIL_IO); 00151 } 00152 00153 // generate the bam index structure. 00154 00155 // Read the magic string. 00156 char magic[4]; 00157 if(ifread(indexFile, magic, 4) != 4) 00158 { 00159 // Failed to read the magic 00160 return(SamStatus::FAIL_IO); 00161 } 00162 00163 // If this is not an index file, set num references to 0. 00164 if (magic[0] != 'B' || magic[1] != 'A' || magic[2] != 'I' || magic[3] != 1) 00165 { 00166 // Not a BAM Index file. 00167 return(SamStatus::FAIL_PARSE); 00168 } 00169 00170 // It is a bam index file. 00171 // Read the number of reference sequences. 00172 if(ifread(indexFile, &n_ref, 4) != 4) 00173 { 00174 // Failed to read. 00175 return(SamStatus::FAIL_IO); 00176 } 00177 00178 // Size the references. 00179 myRefs.resize(n_ref); 00180 00181 for(int refIndex = 0; refIndex < n_ref; refIndex++) 00182 { 00183 // Read each reference. 00184 Reference* ref = &(myRefs[refIndex]); 00185 00186 // Resize the bins so they can be indexed by bin number. 00187 ref->bins.resize(MAX_NUM_BINS + 1); 00188 00189 // Read the number of bins. 00190 if(ifread(indexFile, &(ref->n_bin), 4) != 4) 00191 { 00192 // Failed to read the number of bins. 00193 // Return failure. 00194 return(SamStatus::FAIL_PARSE); 00195 } 00196 00197 // Read each bin. 00198 for(int binIndex = 0; binIndex < ref->n_bin; binIndex++) 00199 { 00200 uint32_t binNumber; 00201 00202 // Read in the bin number. 00203 if(ifread(indexFile, &(binNumber), 4) != 4) 00204 { 00205 // Failed to read the bin number. 00206 // Return failure. 00207 return(SamStatus::FAIL_IO); 00208 } 00209 00210 // Add the bin to the reference and get the 00211 // pointer back so the values can be set in it. 00212 Bin* binPtr = &(ref->bins[binNumber]); 00213 binPtr->bin = binNumber; 00214 00215 // Read in the number of chunks. 00216 if(ifread(indexFile, &(binPtr->n_chunk), 4) != 4) 00217 { 00218 // Failed to read number of chunks. 00219 // Return failure. 00220 return(SamStatus::FAIL_IO); 00221 } 00222 00223 // Read in the chunks. 00224 // Allocate space for the chunks. 00225 uint32_t sizeOfChunkList = binPtr->n_chunk * sizeof(Chunk); 00226 binPtr->chunks = (Chunk*)malloc(sizeOfChunkList); 00227 if(ifread(indexFile, binPtr->chunks, sizeOfChunkList) != sizeOfChunkList) 00228 { 00229 // Failed to read the chunks. 00230 // Return failure. 00231 return(SamStatus::FAIL_IO); 00232 } 00233 00234 // Determine the min/max for this bin if it is not the max bin. 00235 if(binPtr->bin != MAX_NUM_BINS) 00236 { 00237 for(int i = 0; i < binPtr->n_chunk; i++) 00238 { 00239 if(binPtr->chunks[i].chunk_beg < ref->minChunkOffset) 00240 { 00241 ref->minChunkOffset = binPtr->chunks[i].chunk_beg; 00242 } 00243 if(binPtr->chunks[i].chunk_end > ref->maxChunkOffset) 00244 { 00245 ref->maxChunkOffset = binPtr->chunks[i].chunk_end; 00246 } 00247 if(binPtr->chunks[i].chunk_end > maxOverallOffset) 00248 { 00249 maxOverallOffset = binPtr->chunks[i].chunk_end; 00250 } 00251 } 00252 } 00253 else 00254 { 00255 // Mapped/unmapped are the last chunk of the 00256 // MAX BIN 00257 ref->n_mapped = binPtr->chunks[binPtr->n_chunk - 1].chunk_beg; 00258 ref->n_unmapped = binPtr->chunks[binPtr->n_chunk - 1].chunk_end; 00259 } 00260 } 00261 00262 // Read the number of intervals. 00263 if(ifread(indexFile, &(ref->n_intv), 4) != 4) 00264 { 00265 // Failed to read, set to 0. 00266 ref->n_intv = 0; 00267 // Return failure. 00268 return(SamStatus::FAIL_IO); 00269 } 00270 00271 // Allocate space for the intervals and read them. 00272 uint32_t linearIndexSize = ref->n_intv * sizeof(uint64_t); 00273 ref->ioffsets = (uint64_t*)malloc(linearIndexSize); 00274 if(ifread(indexFile, ref->ioffsets, linearIndexSize) != linearIndexSize) 00275 { 00276 // Failed to read the linear index. 00277 // Return failure. 00278 return(SamStatus::FAIL_IO); 00279 } 00280 } 00281 00282 // Successfully read teh bam index file. 00283 return(SamStatus::SUCCESS); 00284 }