BamIndex.cpp
00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018 #include "BamIndex.h"
00019 #include <iomanip>
00020
00021 BamIndex::BamIndex()
00022 : IndexBase(),
00023 maxOverallOffset(0),
00024 myUnMappedNumReads(-1)
00025 {
00026 }
00027
00028
00029 BamIndex::~BamIndex()
00030 {
00031 }
00032
00033
00034
00035 void BamIndex::resetIndex()
00036 {
00037 IndexBase::resetIndex();
00038
00039 maxOverallOffset = 0;
00040 myUnMappedNumReads = -1;
00041 }
00042
00043
00044
00045 SamStatus::Status BamIndex::readIndex(const char* filename)
00046 {
00047
00048 resetIndex();
00049
00050 IFILE indexFile = ifopen(filename, "rb");
00051
00052
00053 if(indexFile == NULL)
00054 {
00055 return(SamStatus::FAIL_IO);
00056 }
00057
00058
00059
00060
00061 char magic[4];
00062 if(ifread(indexFile, magic, 4) != 4)
00063 {
00064
00065 ifclose(indexFile);
00066 return(SamStatus::FAIL_IO);
00067 }
00068
00069
00070 if (magic[0] != 'B' || magic[1] != 'A' || magic[2] != 'I' || magic[3] != 1)
00071 {
00072
00073 ifclose(indexFile);
00074 return(SamStatus::FAIL_PARSE);
00075 }
00076
00077
00078
00079 if(ifread(indexFile, &n_ref, 4) != 4)
00080 {
00081
00082 ifclose(indexFile);
00083 return(SamStatus::FAIL_IO);
00084 }
00085
00086
00087 myRefs.resize(n_ref);
00088
00089 for(int refIndex = 0; refIndex < n_ref; refIndex++)
00090 {
00091
00092 Reference* ref = &(myRefs[refIndex]);
00093
00094
00095 ref->bins.resize(MAX_NUM_BINS + 1);
00096
00097
00098 if(ifread(indexFile, &(ref->n_bin), 4) != 4)
00099 {
00100
00101
00102 ifclose(indexFile);
00103 return(SamStatus::FAIL_PARSE);
00104 }
00105
00106
00107
00108 if(ref->n_bin == 0)
00109 {
00110 ref->n_mapped = 0;
00111 ref->n_unmapped = 0;
00112 }
00113
00114
00115 for(int binIndex = 0; binIndex < ref->n_bin; binIndex++)
00116 {
00117 uint32_t binNumber;
00118
00119
00120 if(ifread(indexFile, &(binNumber), 4) != 4)
00121 {
00122
00123
00124 ifclose(indexFile);
00125 return(SamStatus::FAIL_IO);
00126 }
00127
00128
00129
00130 Bin* binPtr = &(ref->bins[binNumber]);
00131 binPtr->bin = binNumber;
00132
00133
00134 if(ifread(indexFile, &(binPtr->n_chunk), 4) != 4)
00135 {
00136
00137
00138 ifclose(indexFile);
00139 return(SamStatus::FAIL_IO);
00140 }
00141
00142
00143
00144 uint32_t sizeOfChunkList = binPtr->n_chunk * sizeof(Chunk);
00145 binPtr->chunks = (Chunk*)malloc(sizeOfChunkList);
00146 if(ifread(indexFile, binPtr->chunks, sizeOfChunkList) != sizeOfChunkList)
00147 {
00148
00149
00150 ifclose(indexFile);
00151 return(SamStatus::FAIL_IO);
00152 }
00153
00154
00155 if(binPtr->bin != MAX_NUM_BINS)
00156 {
00157 for(int i = 0; i < binPtr->n_chunk; i++)
00158 {
00159 if(binPtr->chunks[i].chunk_beg < ref->minChunkOffset)
00160 {
00161 ref->minChunkOffset = binPtr->chunks[i].chunk_beg;
00162 }
00163 if(binPtr->chunks[i].chunk_end > ref->maxChunkOffset)
00164 {
00165 ref->maxChunkOffset = binPtr->chunks[i].chunk_end;
00166 }
00167 if(binPtr->chunks[i].chunk_end > maxOverallOffset)
00168 {
00169 maxOverallOffset = binPtr->chunks[i].chunk_end;
00170 }
00171 }
00172 }
00173 else
00174 {
00175
00176
00177 ref->n_mapped = binPtr->chunks[binPtr->n_chunk - 1].chunk_beg;
00178 ref->n_unmapped = binPtr->chunks[binPtr->n_chunk - 1].chunk_end;
00179 }
00180 }
00181
00182
00183 if(ifread(indexFile, &(ref->n_intv), 4) != 4)
00184 {
00185
00186 ref->n_intv = 0;
00187
00188 ifclose(indexFile);
00189 return(SamStatus::FAIL_IO);
00190 }
00191
00192
00193 uint32_t linearIndexSize = ref->n_intv * sizeof(uint64_t);
00194 ref->ioffsets = (uint64_t*)malloc(linearIndexSize);
00195 if(ifread(indexFile, ref->ioffsets, linearIndexSize) != linearIndexSize)
00196 {
00197
00198
00199 ifclose(indexFile);
00200 return(SamStatus::FAIL_IO);
00201 }
00202 }
00203
00204 int32_t numUnmapped = 0;
00205 if(ifread(indexFile, &numUnmapped, sizeof(int32_t)) == sizeof(int32_t))
00206 {
00207 myUnMappedNumReads = numUnmapped;
00208 }
00209
00210
00211 ifclose(indexFile);
00212 return(SamStatus::SUCCESS);
00213 }
00214
00215
00216
00217
00218 bool BamIndex::getChunksForRegion(int32_t refID, int32_t start, int32_t end,
00219 SortedChunkList& chunkList)
00220 {
00221 chunkList.clear();
00222
00223
00224
00225 if((start >= end) && (end != -1))
00226 {
00227 std::cerr << "Warning, requesting region where start <= end, so "
00228 << "no values will be returned.\n";
00229 return(false);
00230 }
00231
00232
00233
00234 if(refID == REF_ID_UNMAPPED)
00235 {
00236 Chunk refChunk;
00237
00238
00239 refChunk.chunk_beg = getMaxOffset();
00240
00241
00242 refChunk.chunk_end = Chunk::MAX_CHUNK_VALUE;
00243 return(chunkList.insert(refChunk));
00244 }
00245
00246 if((refID < 0) || (refID >= n_ref))
00247 {
00248
00249 std::cerr << "Warning, requesting refID is out of range, so "
00250 << "no values will be returned.\n";
00251 return(false);
00252 }
00253
00254 const Reference* ref = &(myRefs[refID]);
00255
00256
00257 if(start == -1)
00258 {
00259 if(end == -1)
00260 {
00261
00262 if(ref->maxChunkOffset == 0)
00263 {
00264
00265 return(true);
00266 }
00267 Chunk refChunk;
00268 refChunk.chunk_beg = ref->minChunkOffset;
00269 refChunk.chunk_end = ref->maxChunkOffset;
00270 return(chunkList.insert(refChunk));
00271 }
00272 else
00273 {
00274 start = 0;
00275 }
00276 }
00277 if(end == -1)
00278 {
00279
00280 end = MAX_POSITION + 1;
00281 }
00282
00283
00284
00285 uint64_t minOffset = 0;
00286 getMinOffsetFromLinearIndex(refID, start, minOffset);
00287
00288 uint16_t binInRangeList[MAX_NUM_BINS + 1];
00289
00290 int numBins = getBinsForRegion(start, end, binInRangeList);
00291
00292
00293 for(int i = 0; i < numBins; ++i)
00294 {
00295 int binNum = binInRangeList[i];
00296 const Bin* bin = &(ref->bins[binNum]);
00297
00298
00299 for(int chunkIndex = 0; chunkIndex < bin->n_chunk; chunkIndex++)
00300 {
00301
00302
00303
00304
00305 if(bin->chunks[chunkIndex].chunk_end < minOffset)
00306 {
00307 continue;
00308 }
00309
00310 if(!chunkList.insert(bin->chunks[chunkIndex]))
00311 {
00312
00313 std::cerr << "Warning, Failed to add a chunk, so "
00314 << "no values will be returned.\n";
00315 return(false);
00316 }
00317 }
00318 }
00319
00320
00321
00322 return(chunkList.mergeOverlapping());
00323 }
00324
00325
00326
00327 uint64_t BamIndex::getMaxOffset() const
00328 {
00329 return(maxOverallOffset);
00330 }
00331
00332
00333 bool BamIndex::getReferenceMinMax(int32_t refID,
00334 uint64_t& minOffset,
00335 uint64_t& maxOffset) const
00336 {
00337 if((refID < 0) || (refID >= (int32_t)myRefs.size()))
00338 {
00339
00340 return(false);
00341 }
00342
00343
00344 minOffset = myRefs[refID].minChunkOffset;
00345 maxOffset = myRefs[refID].maxChunkOffset;
00346 return(true);
00347 }
00348
00349
00350
00351 int32_t BamIndex::getNumMappedReads(int32_t refID)
00352 {
00353
00354
00355 if(refID == REF_ID_UNMAPPED)
00356 {
00357
00358 return(0);
00359 }
00360
00361 if((refID < 0) || (refID >= (int32_t)myRefs.size()))
00362 {
00363
00364 return(-1);
00365 }
00366
00367
00368 return(myRefs[refID].n_mapped);
00369 }
00370
00371
00372
00373 int32_t BamIndex::getNumUnMappedReads(int32_t refID)
00374 {
00375
00376
00377 if(refID == REF_ID_UNMAPPED)
00378 {
00379 return(myUnMappedNumReads);
00380 }
00381
00382 if((refID < 0) || (refID >= (int32_t)myRefs.size()))
00383 {
00384
00385 return(-1);
00386 }
00387
00388
00389 return(myRefs[refID].n_unmapped);
00390 }
00391
00392
00393
00394 void BamIndex::printIndex(int32_t refID, bool summary)
00395 {
00396 std::cout << "BAM Index: " << std::endl;
00397 std::cout << "# Reference Sequences: " << n_ref << std::endl;
00398
00399 unsigned int startRef = 0;
00400 unsigned int endRef = myRefs.size() - 1;
00401 std::vector<Reference> refsToProcess;
00402 if(refID != -1)
00403 {
00404
00405 startRef = refID;
00406 endRef = refID;
00407 }
00408
00409
00410 for(unsigned int i = startRef; i <= endRef; ++i)
00411 {
00412 std::cout << std::dec
00413 << "\tReference ID: " << std::setw(4) << i
00414 << "; #Bins: "<< std::setw(6) << myRefs[i].n_bin
00415 << "; #Linear Index Entries: "
00416 << std::setw(6) << myRefs[i].n_intv
00417 << "; Min Chunk Offset: "
00418 << std::setw(18) << std::hex << std::showbase << myRefs[i].minChunkOffset
00419 << "; Max Chunk Offset: "
00420 << std::setw(18) << myRefs[i].maxChunkOffset
00421 << std::dec;
00422
00423 if(myRefs[i].n_mapped != Reference::UNKNOWN_MAP_INFO)
00424 {
00425 std::cout << "; " << myRefs[i].n_mapped << " Mapped Reads";
00426 }
00427 if(myRefs[i].n_mapped != Reference::UNKNOWN_MAP_INFO)
00428 {
00429 std::cout << "; " << myRefs[i].n_unmapped << " Unmapped Reads";
00430 }
00431 std::cout << std::endl;
00432
00433
00434 if(!summary)
00435 {
00436 std::vector<Bin>::iterator binIter;
00437 for(binIter = myRefs[i].bins.begin();
00438 binIter != myRefs[i].bins.end();
00439 ++binIter)
00440 {
00441 Bin* binPtr = &(*binIter);
00442 if(binPtr->bin == Bin::NOT_USED_BIN)
00443 {
00444
00445 continue;
00446 }
00447
00448 std::cout << "\t\t\tBin Name: " << binPtr->bin << std::endl;
00449 std::cout << "\t\t\t# Chunks: " << binPtr->n_chunk << std::endl;
00450 std::cout << std::hex << std::showbase;
00451
00452 for(int chunkIndex = 0; chunkIndex < binPtr->n_chunk;
00453 ++chunkIndex)
00454 {
00455
00456
00457
00458 if((binPtr->bin != MAX_NUM_BINS) ||
00459 (chunkIndex != (binPtr->n_chunk - 1)))
00460 {
00461 std::cout << "\t\t\t\tchunk_beg: "
00462 << binPtr->chunks[chunkIndex].chunk_beg
00463 << std::endl;
00464 std::cout << "\t\t\t\tchunk_end: "
00465 << binPtr->chunks[chunkIndex].chunk_end
00466 << std::endl;
00467 }
00468 }
00469 }
00470 std::cout << std::dec;
00471
00472
00473 for(int linearIndex = 0; linearIndex < myRefs[i].n_intv;
00474 ++linearIndex)
00475 {
00476 if(myRefs[i].ioffsets[linearIndex] != 0)
00477 {
00478 std::cout << "\t\t\tLinearIndex["
00479 << std::dec << linearIndex << "] Offset: "
00480 << std::hex << myRefs[i].ioffsets[linearIndex]
00481 << std::endl;
00482 }
00483 }
00484 }
00485 }
00486 }