BamIndex.cpp
00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018 #include "BamIndex.h"
00019 #include <iomanip>
00020
00021 Chunk SortedChunkList::pop()
00022 {
00023 Chunk newChunk = chunkList.begin()->second;
00024 chunkList.erase(chunkList.begin());
00025 return(newChunk);
00026 }
00027
00028
00029 bool SortedChunkList::insert(const Chunk& chunkToInsert)
00030 {
00031 std::pair<std::map<uint64_t, Chunk>::iterator, bool> insertRes;
00032
00033 insertRes =
00034 chunkList.insert(std::pair<uint64_t, Chunk>(chunkToInsert.chunk_beg,
00035 chunkToInsert));
00036
00037 if(!insertRes.second)
00038 {
00039
00040 std::cerr << "Failed to insert into the SortedChunkList.\n";
00041 std::cerr << "\tpreviously found chunk:\tbeg = " << std::hex
00042 << insertRes.first->second.chunk_beg
00043 << "\tend = "
00044 << insertRes.first->second.chunk_end
00045 << "\nnew chunk:\tbeg = " << std::hex
00046 << chunkToInsert.chunk_beg
00047 << "\tend = "
00048 << chunkToInsert.chunk_end
00049 << std::endl;
00050 }
00051
00052 return(insertRes.second);
00053 }
00054
00055 void SortedChunkList::clear()
00056 {
00057 chunkList.clear();
00058 }
00059
00060 bool SortedChunkList::empty()
00061 {
00062 return(chunkList.empty());
00063 }
00064
00065
00066
00067 bool SortedChunkList::mergeOverlapping()
00068 {
00069
00070 std::map<uint64_t, Chunk>::iterator currentPos = chunkList.begin();
00071 std::map<uint64_t, Chunk>::iterator nextPos = chunkList.begin();
00072 if(nextPos != chunkList.end())
00073 {
00074 ++nextPos;
00075 }
00076
00077
00078 while(nextPos != chunkList.end())
00079 {
00080
00081
00082
00083 if(nextPos->second.chunk_end < currentPos->second.chunk_end)
00084 {
00085 chunkList.erase(nextPos);
00086 nextPos = currentPos;
00087 ++nextPos;
00088 continue;
00089 }
00090
00091
00092
00093
00094 if((nextPos->second.chunk_beg >> 16) <=
00095 (currentPos->second.chunk_end >> 16))
00096 {
00097 currentPos->second.chunk_end = nextPos->second.chunk_end;
00098
00099
00100 chunkList.erase(nextPos);
00101 nextPos = currentPos;
00102 ++nextPos;
00103 continue;
00104 }
00105 else
00106 {
00107
00108 currentPos = nextPos;
00109 ++nextPos;
00110 }
00111 }
00112 return(true);
00113 }
00114
00115
00116 BamIndex::BamIndex()
00117 {
00118 resetIndex();
00119 }
00120
00121
00122
00123 BamIndex::~BamIndex()
00124 {
00125 }
00126
00127
00128
00129 void BamIndex::resetIndex()
00130 {
00131 n_ref = 0;
00132 maxOverallOffset = 0;
00133
00134 myUnMappedNumReads = -1;
00135
00136
00137 myRefs.clear();
00138 }
00139
00140
00141
00142 SamStatus::Status BamIndex::readIndex(const char* filename)
00143 {
00144
00145 resetIndex();
00146
00147 IFILE indexFile = ifopen(filename, "rb");
00148
00149
00150 if(indexFile == NULL)
00151 {
00152 return(SamStatus::FAIL_IO);
00153 }
00154
00155
00156
00157
00158 char magic[4];
00159 if(ifread(indexFile, magic, 4) != 4)
00160 {
00161
00162 return(SamStatus::FAIL_IO);
00163 }
00164
00165
00166 if (magic[0] != 'B' || magic[1] != 'A' || magic[2] != 'I' || magic[3] != 1)
00167 {
00168
00169 return(SamStatus::FAIL_PARSE);
00170 }
00171
00172
00173
00174 if(ifread(indexFile, &n_ref, 4) != 4)
00175 {
00176
00177 return(SamStatus::FAIL_IO);
00178 }
00179
00180
00181 myRefs.resize(n_ref);
00182
00183 for(int refIndex = 0; refIndex < n_ref; refIndex++)
00184 {
00185
00186 Reference* ref = &(myRefs[refIndex]);
00187
00188
00189 ref->bins.resize(MAX_NUM_BINS + 1);
00190
00191
00192 if(ifread(indexFile, &(ref->n_bin), 4) != 4)
00193 {
00194
00195
00196 return(SamStatus::FAIL_PARSE);
00197 }
00198
00199
00200
00201 if(ref->n_bin == 0)
00202 {
00203 ref->n_mapped = 0;
00204 ref->n_unmapped = 0;
00205 }
00206
00207
00208 for(int binIndex = 0; binIndex < ref->n_bin; binIndex++)
00209 {
00210 uint32_t binNumber;
00211
00212
00213 if(ifread(indexFile, &(binNumber), 4) != 4)
00214 {
00215
00216
00217 return(SamStatus::FAIL_IO);
00218 }
00219
00220
00221
00222 Bin* binPtr = &(ref->bins[binNumber]);
00223 binPtr->bin = binNumber;
00224
00225
00226 if(ifread(indexFile, &(binPtr->n_chunk), 4) != 4)
00227 {
00228
00229
00230 return(SamStatus::FAIL_IO);
00231 }
00232
00233
00234
00235 uint32_t sizeOfChunkList = binPtr->n_chunk * sizeof(Chunk);
00236 binPtr->chunks = (Chunk*)malloc(sizeOfChunkList);
00237 if(ifread(indexFile, binPtr->chunks, sizeOfChunkList) != sizeOfChunkList)
00238 {
00239
00240
00241 return(SamStatus::FAIL_IO);
00242 }
00243
00244
00245 if(binPtr->bin != MAX_NUM_BINS)
00246 {
00247 for(int i = 0; i < binPtr->n_chunk; i++)
00248 {
00249 if(binPtr->chunks[i].chunk_beg < ref->minChunkOffset)
00250 {
00251 ref->minChunkOffset = binPtr->chunks[i].chunk_beg;
00252 }
00253 if(binPtr->chunks[i].chunk_end > ref->maxChunkOffset)
00254 {
00255 ref->maxChunkOffset = binPtr->chunks[i].chunk_end;
00256 }
00257 if(binPtr->chunks[i].chunk_end > maxOverallOffset)
00258 {
00259 maxOverallOffset = binPtr->chunks[i].chunk_end;
00260 }
00261 }
00262 }
00263 else
00264 {
00265
00266
00267 ref->n_mapped = binPtr->chunks[binPtr->n_chunk - 1].chunk_beg;
00268 ref->n_unmapped = binPtr->chunks[binPtr->n_chunk - 1].chunk_end;
00269 }
00270 }
00271
00272
00273 if(ifread(indexFile, &(ref->n_intv), 4) != 4)
00274 {
00275
00276 ref->n_intv = 0;
00277
00278 return(SamStatus::FAIL_IO);
00279 }
00280
00281
00282 uint32_t linearIndexSize = ref->n_intv * sizeof(uint64_t);
00283 ref->ioffsets = (uint64_t*)malloc(linearIndexSize);
00284 if(ifread(indexFile, ref->ioffsets, linearIndexSize) != linearIndexSize)
00285 {
00286
00287
00288 return(SamStatus::FAIL_IO);
00289 }
00290 }
00291
00292 int32_t numUnmapped = 0;
00293 if(ifread(indexFile, &numUnmapped, sizeof(int32_t)) == sizeof(int32_t))
00294 {
00295 myUnMappedNumReads = numUnmapped;
00296 }
00297
00298
00299 return(SamStatus::SUCCESS);
00300 }
00301
00302
00303
00304
00305 bool BamIndex::getChunksForRegion(int32_t refID, int32_t start, int32_t end,
00306 SortedChunkList& chunkList)
00307 {
00308 chunkList.clear();
00309
00310
00311
00312 if((start >= end) && (end != -1))
00313 {
00314 std::cerr << "Warning, requesting region where start <= end, so "
00315 << "no values will be returned.\n";
00316 return(false);
00317 }
00318
00319
00320
00321 if(refID == REF_ID_UNMAPPED)
00322 {
00323 Chunk refChunk;
00324
00325
00326 refChunk.chunk_beg = getMaxOffset();
00327
00328
00329 refChunk.chunk_end = Chunk::MAX_CHUNK_VALUE;
00330 return(chunkList.insert(refChunk));
00331 }
00332
00333 if((refID < 0) || (refID >= n_ref))
00334 {
00335
00336 std::cerr << "Warning, requesting refID is out of range, so "
00337 << "no values will be returned.\n";
00338 return(false);
00339 }
00340
00341 const Reference* ref = &(myRefs[refID]);
00342
00343
00344 if(start == -1)
00345 {
00346 if(end == -1)
00347 {
00348
00349 if(ref->maxChunkOffset == 0)
00350 {
00351
00352 return(true);
00353 }
00354 Chunk refChunk;
00355 refChunk.chunk_beg = ref->minChunkOffset;
00356 refChunk.chunk_end = ref->maxChunkOffset;
00357 return(chunkList.insert(refChunk));
00358 }
00359 else
00360 {
00361 start = 0;
00362 }
00363 }
00364 if(end == -1)
00365 {
00366
00367 end = MAX_POSITION + 1;
00368 }
00369
00370
00371
00372 uint64_t minOffset = getMinOffsetFromLinearIndex(refID, start);
00373
00374 uint16_t binInRangeList[MAX_NUM_BINS + 1];
00375
00376 int numBins = getBinsForRegion(start, end, binInRangeList);
00377
00378
00379 for(int i = 0; i < numBins; ++i)
00380 {
00381 int binNum = binInRangeList[i];
00382 const Bin* bin = &(ref->bins[binNum]);
00383
00384
00385 for(int chunkIndex = 0; chunkIndex < bin->n_chunk; chunkIndex++)
00386 {
00387
00388
00389
00390
00391 if(bin->chunks[chunkIndex].chunk_end < minOffset)
00392 {
00393 continue;
00394 }
00395
00396 if(!chunkList.insert(bin->chunks[chunkIndex]))
00397 {
00398
00399 std::cerr << "Warning, Failed to add a chunk, so "
00400 << "no values will be returned.\n";
00401 return(false);
00402 }
00403 }
00404 }
00405
00406
00407
00408 return(chunkList.mergeOverlapping());
00409 }
00410
00411
00412
00413 uint64_t BamIndex::getMaxOffset() const
00414 {
00415 return(maxOverallOffset);
00416 }
00417
00418
00419 bool BamIndex::getReferenceMinMax(int32_t refID,
00420 uint64_t& minOffset,
00421 uint64_t& maxOffset) const
00422 {
00423 if((refID < 0) || (refID >= (int32_t)myRefs.size()))
00424 {
00425
00426 return(false);
00427 }
00428
00429
00430 minOffset = myRefs[refID].minChunkOffset;
00431 maxOffset = myRefs[refID].maxChunkOffset;
00432 return(true);
00433 }
00434
00435
00436
00437 int32_t BamIndex::getNumRefs() const
00438 {
00439
00440 return(myRefs.size());
00441 }
00442
00443
00444
00445 int32_t BamIndex::getNumMappedReads(int32_t refID)
00446 {
00447
00448
00449 if(refID == REF_ID_UNMAPPED)
00450 {
00451
00452 return(0);
00453 }
00454
00455 if((refID < 0) || (refID >= (int32_t)myRefs.size()))
00456 {
00457
00458 return(-1);
00459 }
00460
00461
00462 return(myRefs[refID].n_mapped);
00463 }
00464
00465
00466
00467 int32_t BamIndex::getNumUnMappedReads(int32_t refID)
00468 {
00469
00470
00471 if(refID == REF_ID_UNMAPPED)
00472 {
00473 return(myUnMappedNumReads);
00474 }
00475
00476 if((refID < 0) || (refID >= (int32_t)myRefs.size()))
00477 {
00478
00479 return(-1);
00480 }
00481
00482
00483 return(myRefs[refID].n_unmapped);
00484 }
00485
00486
00487
00488 void BamIndex::printIndex(int32_t refID, bool summary)
00489 {
00490 std::cout << "BAM Index: " << std::endl;
00491 std::cout << "# Reference Sequences: " << n_ref << std::endl;
00492
00493 unsigned int startRef = 0;
00494 unsigned int endRef = myRefs.size() - 1;
00495 std::vector<Reference> refsToProcess;
00496 if(refID != -1)
00497 {
00498
00499 startRef = refID;
00500 endRef = refID;
00501 }
00502
00503
00504 for(unsigned int i = startRef; i <= endRef; ++i)
00505 {
00506 std::cout << std::dec
00507 << "\tReference ID: " << std::setw(4) << i
00508 << "; #Bins: "<< std::setw(6) << myRefs[i].n_bin
00509 << "; #Linear Index Entries: "
00510 << std::setw(6) << myRefs[i].n_intv
00511 << "; Min Chunk Offset: "
00512 << std::setw(18) << std::hex << std::showbase << myRefs[i].minChunkOffset
00513 << "; Max Chunk Offset: "
00514 << std::setw(18) << myRefs[i].maxChunkOffset
00515 << std::dec;
00516
00517 if(myRefs[i].n_mapped != Reference::UNKNOWN_MAP_INFO)
00518 {
00519 std::cout << "; " << myRefs[i].n_mapped << " Mapped Reads";
00520 }
00521 if(myRefs[i].n_mapped != Reference::UNKNOWN_MAP_INFO)
00522 {
00523 std::cout << "; " << myRefs[i].n_unmapped << " Unmapped Reads";
00524 }
00525 std::cout << std::endl;
00526
00527
00528 if(!summary)
00529 {
00530 std::vector<Bin>::iterator binIter;
00531 for(binIter = myRefs[i].bins.begin();
00532 binIter != myRefs[i].bins.end();
00533 ++binIter)
00534 {
00535 Bin* binPtr = &(*binIter);
00536 if(binPtr->bin == Bin::NOT_USED_BIN)
00537 {
00538
00539 continue;
00540 }
00541
00542 std::cout << "\t\t\tBin Name: " << binPtr->bin << std::endl;
00543 std::cout << "\t\t\t# Chunks: " << binPtr->n_chunk << std::endl;
00544 std::cout << std::hex << std::showbase;
00545
00546 for(int chunkIndex = 0; chunkIndex < binPtr->n_chunk;
00547 ++chunkIndex)
00548 {
00549
00550
00551
00552 if((binPtr->bin != MAX_NUM_BINS) ||
00553 (chunkIndex != (binPtr->n_chunk - 1)))
00554 {
00555 std::cout << "\t\t\t\tchunk_beg: "
00556 << binPtr->chunks[chunkIndex].chunk_beg
00557 << std::endl;
00558 std::cout << "\t\t\t\tchunk_end: "
00559 << binPtr->chunks[chunkIndex].chunk_end
00560 << std::endl;
00561 }
00562 }
00563 }
00564 std::cout << std::dec;
00565
00566
00567 for(int linearIndex = 0; linearIndex < myRefs[i].n_intv;
00568 ++linearIndex)
00569 {
00570 if(myRefs[i].ioffsets[linearIndex] != 0)
00571 {
00572 std::cout << "\t\t\tLinearIndex["
00573 << std::dec << linearIndex << "] Offset: "
00574 << std::hex << myRefs[i].ioffsets[linearIndex]
00575 << std::endl;
00576 }
00577 }
00578 }
00579 }
00580 }
00581
00582
00583
00584
00585 uint64_t BamIndex::getMinOffsetFromLinearIndex(int32_t refID, uint32_t position) const
00586 {
00587 int32_t linearIndex = position >> LINEAR_INDEX_SHIFT;
00588 uint64_t minOffset = 0;
00589
00590 int32_t linearOffsetSize = myRefs[refID].n_intv;
00591
00592
00593
00594
00595
00596 if((linearOffsetSize == 0) || (linearIndex >= linearOffsetSize))
00597
00598 {
00599 return(0);
00600 }
00601 else
00602 {
00603
00604
00605 minOffset = myRefs[refID].ioffsets[linearIndex];
00606
00607
00608
00609
00610
00611
00612
00613 while((minOffset == 0) && (--linearIndex >= 0))
00614 {
00615 minOffset = myRefs[refID].ioffsets[linearIndex];
00616 }
00617 }
00618 return(minOffset);
00619 }
00620
00621
00622
00623
00624
00625 int BamIndex::getBinsForRegion(uint32_t start, uint32_t end, uint16_t binList[MAX_NUM_BINS])
00626 {
00627 uint32_t binListIndex = 0, binNum;
00628 --end;
00629
00630
00631 if(start > MAX_POSITION)
00632 {
00633 start = MAX_POSITION;
00634 }
00635 if(end > MAX_POSITION)
00636 {
00637 end = MAX_POSITION;
00638 }
00639
00640 binList[binListIndex++] = 0;
00641 for (binNum = 1 + (start>>26); binNum <= 1 + (end>>26); ++binNum)
00642 binList[binListIndex++] = binNum;
00643 for (binNum = 9 + (start>>23); binNum <= 9 + (end>>23); ++binNum)
00644 binList[binListIndex++] = binNum;
00645 for (binNum = 73 + (start>>20); binNum <= 73 + (end>>20); ++binNum)
00646 binList[binListIndex++] = binNum;
00647 for (binNum = 585 + (start>>17); binNum <= 585 + (end>>17); ++binNum)
00648 binList[binListIndex++] = binNum;
00649 for (binNum = 4681 + (start>>14); binNum <= 4681 + (end>>14); ++binNum)
00650 binList[binListIndex++] = binNum;
00651
00652
00653 return binListIndex;
00654 }
00655
00656