Public Member Functions | |
virtual void | resetIndex () |
Reset the member data for a new index file. | |
SamStatus::Status | readIndex (const char *filename) |
bool | getChunksForRegion (int32_t refID, int32_t start, int32_t end, SortedChunkList &chunkList) |
Get the list of chunks associated with this region. | |
uint64_t | getMaxOffset () const |
bool | getReferenceMinMax (int32_t refID, uint64_t &minOffset, uint64_t &maxOffset) const |
Get the minimum and maximum file offsets for the specfied reference ID. | |
int32_t | getNumMappedReads (int32_t refID) |
Get the number of mapped reads for this reference id. | |
int32_t | getNumUnMappedReads (int32_t refID) |
Get the number of unmapped reads for this reference id. | |
void | printIndex (int32_t refID, bool summary=false) |
Print the index information. | |
Static Public Attributes | |
static const int32_t | UNKNOWN_NUM_READS = -1 |
The number used for an unknown number of reads. | |
static const int32_t | REF_ID_UNMAPPED = -1 |
The number used for the reference id of unmapped reads. | |
static const int32_t | REF_ID_ALL = -2 |
The number used to indicate that all reference ids should be used. |
Definition at line 31 of file BamIndex.h.
bool BamIndex::getChunksForRegion | ( | int32_t | refID, | |
int32_t | start, | |||
int32_t | end, | |||
SortedChunkList & | chunkList | |||
) |
Get the list of chunks associated with this region.
For an entire reference ID, set start and end to -1. To start at the beginning of the region, set start to 0/-1. To go to the end of the region, set end to -1.
Definition at line 218 of file BamIndex.cpp.
References REF_ID_UNMAPPED.
00220 { 00221 chunkList.clear(); 00222 00223 // If start is >= to end, there will be no sections, return no 00224 // regions. 00225 if((start >= end) && (end != -1)) 00226 { 00227 std::cerr << "Warning, requesting region where start <= end, so " 00228 << "no values will be returned.\n"; 00229 return(false); 00230 } 00231 00232 // Handle REF_ID_UNMAPPED. This uses a default chunk which covers 00233 // from the max offset to the end of the file. 00234 if(refID == REF_ID_UNMAPPED) 00235 { 00236 Chunk refChunk; 00237 // The start of the unmapped region is the max offset found 00238 // in the index file. 00239 refChunk.chunk_beg = getMaxOffset(); 00240 // The end of the unmapped region is the end of the file, so 00241 // set chunk end to the max value. 00242 refChunk.chunk_end = Chunk::MAX_CHUNK_VALUE; 00243 return(chunkList.insert(refChunk)); 00244 } 00245 00246 if((refID < 0) || (refID >= n_ref)) 00247 { 00248 // The specified refID is out of range, return false. 00249 std::cerr << "Warning, requesting refID is out of range, so " 00250 << "no values will be returned.\n"; 00251 return(false); 00252 } 00253 00254 const Reference* ref = &(myRefs[refID]); 00255 00256 // Handle where start/end are defaults. 00257 if(start == -1) 00258 { 00259 if(end == -1) 00260 { 00261 // This is whole chromosome, so take a shortcut. 00262 if(ref->maxChunkOffset == 0) 00263 { 00264 // No chunks for this region, but this is not an error. 00265 return(true); 00266 } 00267 Chunk refChunk; 00268 refChunk.chunk_beg = ref->minChunkOffset; 00269 refChunk.chunk_end = ref->maxChunkOffset; 00270 return(chunkList.insert(refChunk)); 00271 } 00272 else 00273 { 00274 start = 0; 00275 } 00276 } 00277 if(end == -1) 00278 { 00279 // MAX_POSITION is inclusive, but end is exclusive, so add 1. 00280 end = MAX_POSITION + 1; 00281 } 00282 00283 // Determine the minimum offset for the given start position. This 00284 // is done by using the linear index for the specified start position. 00285 uint64_t minOffset = 0; 00286 getMinOffsetFromLinearIndex(refID, start, minOffset); 00287 00288 uint16_t binInRangeList[MAX_NUM_BINS + 1]; 00289 00290 int numBins = getBinsForRegion(start, end, binInRangeList); 00291 00292 // loop through the bins in the range and get the chunks. 00293 for(int i = 0; i < numBins; ++i) 00294 { 00295 int binNum = binInRangeList[i]; 00296 const Bin* bin = &(ref->bins[binNum]); 00297 00298 // Add each chunk in the bin to the map. 00299 for(int chunkIndex = 0; chunkIndex < bin->n_chunk; chunkIndex++) 00300 { 00301 // If the end of the chunk is less than the minimum offset 00302 // for the 16K block that starts our region, then no 00303 // records in this chunk will cross our region, so do 00304 // not add it to the chunks we need to use. 00305 if(bin->chunks[chunkIndex].chunk_end < minOffset) 00306 { 00307 continue; 00308 } 00309 // Add the chunk to the map. 00310 if(!chunkList.insert(bin->chunks[chunkIndex])) 00311 { 00312 // Failed to add to the map, return false. 00313 std::cerr << "Warning, Failed to add a chunk, so " 00314 << "no values will be returned.\n"; 00315 return(false); 00316 } 00317 } 00318 } 00319 00320 // Now that all chunks have been added to the list, 00321 // handle overlapping chunks. 00322 return(chunkList.mergeOverlapping()); 00323 }
int32_t BamIndex::getNumMappedReads | ( | int32_t | refID | ) |
Get the number of mapped reads for this reference id.
Returns -1 for out of range refIDs.
refID | reference ID for which to extract the number of mapped reads. |
Definition at line 351 of file BamIndex.cpp.
References REF_ID_UNMAPPED.
Referenced by SamFile::getNumMappedReadsFromIndex().
00352 { 00353 // If it is the reference id of unmapped reads, return 00354 // that there are no mapped reads. 00355 if(refID == REF_ID_UNMAPPED) 00356 { 00357 // These are by definition all unmapped reads. 00358 return(0); 00359 } 00360 00361 if((refID < 0) || (refID >= (int32_t)myRefs.size())) 00362 { 00363 // Reference ID is out of range for this index file. 00364 return(-1); 00365 } 00366 00367 // Get this reference. 00368 return(myRefs[refID].n_mapped); 00369 }
int32_t BamIndex::getNumUnMappedReads | ( | int32_t | refID | ) |
Get the number of unmapped reads for this reference id.
Returns -1 for out of range refIDs.
refID | reference ID for which to extract the number of unmapped reads. |
Definition at line 373 of file BamIndex.cpp.
References REF_ID_UNMAPPED.
Referenced by SamFile::getNumUnMappedReadsFromIndex().
00374 { 00375 // If it is the reference id of unmapped reads, return 00376 // that value. 00377 if(refID == REF_ID_UNMAPPED) 00378 { 00379 return(myUnMappedNumReads); 00380 } 00381 00382 if((refID < 0) || (refID >= (int32_t)myRefs.size())) 00383 { 00384 // Reference ID is out of range for this index file. 00385 return(-1); 00386 } 00387 00388 // Get this reference. 00389 return(myRefs[refID].n_unmapped); 00390 }
bool BamIndex::getReferenceMinMax | ( | int32_t | refID, | |
uint64_t & | minOffset, | |||
uint64_t & | maxOffset | |||
) | const |
Get the minimum and maximum file offsets for the specfied reference ID.
refID | the reference ID to locate in the file. | |
minOffset | returns the min file offset for the specified reference | |
maxOffset | returns the max file offset for the specified reference |
Definition at line 333 of file BamIndex.cpp.
00336 { 00337 if((refID < 0) || (refID >= (int32_t)myRefs.size())) 00338 { 00339 // Reference ID is out of range for this index file. 00340 return(false); 00341 } 00342 00343 // Get this reference. 00344 minOffset = myRefs[refID].minChunkOffset; 00345 maxOffset = myRefs[refID].maxChunkOffset; 00346 return(true); 00347 }
void BamIndex::printIndex | ( | int32_t | refID, | |
bool | summary = false | |||
) |
Print the index information.
refID | reference ID for which to print info for. -1 means print for all references. | |
summary | whether or not to just print a summary (defaults to false). The summary just contains summary info for each reference and not every bin/chunk. |
Definition at line 394 of file BamIndex.cpp.
00395 { 00396 std::cout << "BAM Index: " << std::endl; 00397 std::cout << "# Reference Sequences: " << n_ref << std::endl; 00398 00399 unsigned int startRef = 0; 00400 unsigned int endRef = myRefs.size() - 1; 00401 std::vector<Reference> refsToProcess; 00402 if(refID != -1) 00403 { 00404 // Set start and end ref to the specified reference id. 00405 startRef = refID; 00406 endRef = refID; 00407 } 00408 00409 // Print out the information for each bin. 00410 for(unsigned int i = startRef; i <= endRef; ++i) 00411 { 00412 std::cout << std::dec 00413 << "\tReference ID: " << std::setw(4) << i 00414 << "; #Bins: "<< std::setw(6) << myRefs[i].n_bin 00415 << "; #Linear Index Entries: " 00416 << std::setw(6) << myRefs[i].n_intv 00417 << "; Min Chunk Offset: " 00418 << std::setw(18) << std::hex << std::showbase << myRefs[i].minChunkOffset 00419 << "; Max Chunk Offset: " 00420 << std::setw(18) << myRefs[i].maxChunkOffset 00421 << std::dec; 00422 // Print the mapped/unmapped if set. 00423 if(myRefs[i].n_mapped != Reference::UNKNOWN_MAP_INFO) 00424 { 00425 std::cout << "; " << myRefs[i].n_mapped << " Mapped Reads"; 00426 } 00427 if(myRefs[i].n_mapped != Reference::UNKNOWN_MAP_INFO) 00428 { 00429 std::cout << "; " << myRefs[i].n_unmapped << " Unmapped Reads"; 00430 } 00431 std::cout << std::endl; 00432 00433 // Only print more details if not summary. 00434 if(!summary) 00435 { 00436 std::vector<Bin>::iterator binIter; 00437 for(binIter = myRefs[i].bins.begin(); 00438 binIter != myRefs[i].bins.end(); 00439 ++binIter) 00440 { 00441 Bin* binPtr = &(*binIter); 00442 if(binPtr->bin == Bin::NOT_USED_BIN) 00443 { 00444 // This bin is not used, continue. 00445 continue; 00446 } 00447 // Print the bin info. 00448 std::cout << "\t\t\tBin Name: " << binPtr->bin << std::endl; 00449 std::cout << "\t\t\t# Chunks: " << binPtr->n_chunk << std::endl; 00450 std::cout << std::hex << std::showbase; 00451 00452 for(int chunkIndex = 0; chunkIndex < binPtr->n_chunk; 00453 ++chunkIndex) 00454 { 00455 // If this is the last chunk of the MAX_NUM_BINS - it 00456 // contains a mapped/unmapped count rather than the regular 00457 // chunk addresses. 00458 if((binPtr->bin != MAX_NUM_BINS) || 00459 (chunkIndex != (binPtr->n_chunk - 1))) 00460 { 00461 std::cout << "\t\t\t\tchunk_beg: " 00462 << binPtr->chunks[chunkIndex].chunk_beg 00463 << std::endl; 00464 std::cout << "\t\t\t\tchunk_end: " 00465 << binPtr->chunks[chunkIndex].chunk_end 00466 << std::endl; 00467 } 00468 } 00469 } 00470 std::cout << std::dec; 00471 00472 // Print the linear index. 00473 for(int linearIndex = 0; linearIndex < myRefs[i].n_intv; 00474 ++linearIndex) 00475 { 00476 if(myRefs[i].ioffsets[linearIndex] != 0) 00477 { 00478 std::cout << "\t\t\tLinearIndex[" 00479 << std::dec << linearIndex << "] Offset: " 00480 << std::hex << myRefs[i].ioffsets[linearIndex] 00481 << std::endl; 00482 } 00483 } 00484 } 00485 } 00486 }
SamStatus::Status BamIndex::readIndex | ( | const char * | filename | ) | [virtual] |
filename | the bam index file to be read. |
Implements IndexBase.
Definition at line 45 of file BamIndex.cpp.
References StatGenStatus::FAIL_IO, StatGenStatus::FAIL_PARSE, ifclose(), ifopen(), ifread(), resetIndex(), and StatGenStatus::SUCCESS.
Referenced by SamFile::ReadBamIndex().
00046 { 00047 // Reset the index from anything that may previously be set. 00048 resetIndex(); 00049 00050 IFILE indexFile = ifopen(filename, "rb"); 00051 00052 // Failed to open the index file. 00053 if(indexFile == NULL) 00054 { 00055 return(SamStatus::FAIL_IO); 00056 } 00057 00058 // generate the bam index structure. 00059 00060 // Read the magic string. 00061 char magic[4]; 00062 if(ifread(indexFile, magic, 4) != 4) 00063 { 00064 // Failed to read the magic 00065 ifclose(indexFile); 00066 return(SamStatus::FAIL_IO); 00067 } 00068 00069 // If this is not an index file, set num references to 0. 00070 if (magic[0] != 'B' || magic[1] != 'A' || magic[2] != 'I' || magic[3] != 1) 00071 { 00072 // Not a BAM Index file. 00073 ifclose(indexFile); 00074 return(SamStatus::FAIL_PARSE); 00075 } 00076 00077 // It is a bam index file. 00078 // Read the number of reference sequences. 00079 if(ifread(indexFile, &n_ref, 4) != 4) 00080 { 00081 // Failed to read. 00082 ifclose(indexFile); 00083 return(SamStatus::FAIL_IO); 00084 } 00085 00086 // Size the references. 00087 myRefs.resize(n_ref); 00088 00089 for(int refIndex = 0; refIndex < n_ref; refIndex++) 00090 { 00091 // Read each reference. 00092 Reference* ref = &(myRefs[refIndex]); 00093 00094 // Resize the bins so they can be indexed by bin number. 00095 ref->bins.resize(MAX_NUM_BINS + 1); 00096 00097 // Read the number of bins. 00098 if(ifread(indexFile, &(ref->n_bin), 4) != 4) 00099 { 00100 // Failed to read the number of bins. 00101 // Return failure. 00102 ifclose(indexFile); 00103 return(SamStatus::FAIL_PARSE); 00104 } 00105 00106 // If there are no bins, then there are no 00107 // mapped/unmapped reads. 00108 if(ref->n_bin == 0) 00109 { 00110 ref->n_mapped = 0; 00111 ref->n_unmapped = 0; 00112 } 00113 00114 // Read each bin. 00115 for(int binIndex = 0; binIndex < ref->n_bin; binIndex++) 00116 { 00117 uint32_t binNumber; 00118 00119 // Read in the bin number. 00120 if(ifread(indexFile, &(binNumber), 4) != 4) 00121 { 00122 // Failed to read the bin number. 00123 // Return failure. 00124 ifclose(indexFile); 00125 return(SamStatus::FAIL_IO); 00126 } 00127 00128 // Add the bin to the reference and get the 00129 // pointer back so the values can be set in it. 00130 Bin* binPtr = &(ref->bins[binNumber]); 00131 binPtr->bin = binNumber; 00132 00133 // Read in the number of chunks. 00134 if(ifread(indexFile, &(binPtr->n_chunk), 4) != 4) 00135 { 00136 // Failed to read number of chunks. 00137 // Return failure. 00138 ifclose(indexFile); 00139 return(SamStatus::FAIL_IO); 00140 } 00141 00142 // Read in the chunks. 00143 // Allocate space for the chunks. 00144 uint32_t sizeOfChunkList = binPtr->n_chunk * sizeof(Chunk); 00145 binPtr->chunks = (Chunk*)malloc(sizeOfChunkList); 00146 if(ifread(indexFile, binPtr->chunks, sizeOfChunkList) != sizeOfChunkList) 00147 { 00148 // Failed to read the chunks. 00149 // Return failure. 00150 ifclose(indexFile); 00151 return(SamStatus::FAIL_IO); 00152 } 00153 00154 // Determine the min/max for this bin if it is not the max bin. 00155 if(binPtr->bin != MAX_NUM_BINS) 00156 { 00157 for(int i = 0; i < binPtr->n_chunk; i++) 00158 { 00159 if(binPtr->chunks[i].chunk_beg < ref->minChunkOffset) 00160 { 00161 ref->minChunkOffset = binPtr->chunks[i].chunk_beg; 00162 } 00163 if(binPtr->chunks[i].chunk_end > ref->maxChunkOffset) 00164 { 00165 ref->maxChunkOffset = binPtr->chunks[i].chunk_end; 00166 } 00167 if(binPtr->chunks[i].chunk_end > maxOverallOffset) 00168 { 00169 maxOverallOffset = binPtr->chunks[i].chunk_end; 00170 } 00171 } 00172 } 00173 else 00174 { 00175 // Mapped/unmapped are the last chunk of the 00176 // MAX BIN 00177 ref->n_mapped = binPtr->chunks[binPtr->n_chunk - 1].chunk_beg; 00178 ref->n_unmapped = binPtr->chunks[binPtr->n_chunk - 1].chunk_end; 00179 } 00180 } 00181 00182 // Read the number of intervals. 00183 if(ifread(indexFile, &(ref->n_intv), 4) != 4) 00184 { 00185 // Failed to read, set to 0. 00186 ref->n_intv = 0; 00187 // Return failure. 00188 ifclose(indexFile); 00189 return(SamStatus::FAIL_IO); 00190 } 00191 00192 // Allocate space for the intervals and read them. 00193 uint32_t linearIndexSize = ref->n_intv * sizeof(uint64_t); 00194 ref->ioffsets = (uint64_t*)malloc(linearIndexSize); 00195 if(ifread(indexFile, ref->ioffsets, linearIndexSize) != linearIndexSize) 00196 { 00197 // Failed to read the linear index. 00198 // Return failure. 00199 ifclose(indexFile); 00200 return(SamStatus::FAIL_IO); 00201 } 00202 } 00203 00204 int32_t numUnmapped = 0; 00205 if(ifread(indexFile, &numUnmapped, sizeof(int32_t)) == sizeof(int32_t)) 00206 { 00207 myUnMappedNumReads = numUnmapped; 00208 } 00209 00210 // Successfully read the bam index file. 00211 ifclose(indexFile); 00212 return(SamStatus::SUCCESS); 00213 }