BamIndex.cpp
00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018 #include "BamIndex.h"
00019 #include <iomanip>
00020
00021 Chunk SortedChunkList::pop()
00022 {
00023 Chunk newChunk = chunkList.begin()->second;
00024 chunkList.erase(chunkList.begin());
00025 return(newChunk);
00026 }
00027
00028
00029 bool SortedChunkList::insert(const Chunk& chunkToInsert)
00030 {
00031 std::pair<std::map<uint64_t, Chunk>::iterator, bool> insertRes;
00032
00033 insertRes =
00034 chunkList.insert(std::pair<uint64_t, Chunk>(chunkToInsert.chunk_beg,
00035 chunkToInsert));
00036
00037 if(!insertRes.second)
00038 {
00039
00040 std::cerr << "Failed to insert into the SortedChunkList.\n";
00041 std::cerr << "\tpreviously found chunk:\tbeg = " << std::hex
00042 << insertRes.first->second.chunk_beg
00043 << "\tend = "
00044 << insertRes.first->second.chunk_end
00045 << "\nnew chunk:\tbeg = " << std::hex
00046 << chunkToInsert.chunk_beg
00047 << "\tend = "
00048 << chunkToInsert.chunk_end
00049 << std::endl;
00050 }
00051
00052 return(insertRes.second);
00053 }
00054
00055 void SortedChunkList::clear()
00056 {
00057 chunkList.clear();
00058 }
00059
00060 bool SortedChunkList::empty()
00061 {
00062 return(chunkList.empty());
00063 }
00064
00065
00066
00067 bool SortedChunkList::mergeOverlapping()
00068 {
00069
00070 std::map<uint64_t, Chunk>::iterator currentPos = chunkList.begin();
00071 std::map<uint64_t, Chunk>::iterator nextPos = chunkList.begin();
00072 if(nextPos != chunkList.end())
00073 {
00074 ++nextPos;
00075 }
00076
00077
00078 while(nextPos != chunkList.end())
00079 {
00080
00081
00082
00083 if(nextPos->second.chunk_end < currentPos->second.chunk_end)
00084 {
00085 chunkList.erase(nextPos);
00086 nextPos = currentPos;
00087 ++nextPos;
00088 continue;
00089 }
00090
00091
00092
00093
00094 if((nextPos->second.chunk_beg >> 16) <=
00095 (currentPos->second.chunk_end >> 16))
00096 {
00097 currentPos->second.chunk_end = nextPos->second.chunk_end;
00098
00099
00100 chunkList.erase(nextPos);
00101 nextPos = currentPos;
00102 ++nextPos;
00103 continue;
00104 }
00105 else
00106 {
00107
00108 currentPos = nextPos;
00109 ++nextPos;
00110 }
00111 }
00112 return(true);
00113 }
00114
00115
00116 BamIndex::BamIndex()
00117 {
00118 resetIndex();
00119 }
00120
00121
00122
00123 BamIndex::~BamIndex()
00124 {
00125 }
00126
00127
00128
00129 void BamIndex::resetIndex()
00130 {
00131 n_ref = 0;
00132 maxOverallOffset = 0;
00133
00134 myUnMappedNumReads = -1;
00135
00136
00137 myRefs.clear();
00138 }
00139
00140
00141
00142 SamStatus::Status BamIndex::readIndex(const char* filename)
00143 {
00144
00145 resetIndex();
00146
00147 IFILE indexFile = ifopen(filename, "rb");
00148
00149
00150 if(indexFile == NULL)
00151 {
00152 return(SamStatus::FAIL_IO);
00153 }
00154
00155
00156
00157
00158 char magic[4];
00159 if(ifread(indexFile, magic, 4) != 4)
00160 {
00161
00162 return(SamStatus::FAIL_IO);
00163 }
00164
00165
00166 if (magic[0] != 'B' || magic[1] != 'A' || magic[2] != 'I' || magic[3] != 1)
00167 {
00168
00169 return(SamStatus::FAIL_PARSE);
00170 }
00171
00172
00173
00174 if(ifread(indexFile, &n_ref, 4) != 4)
00175 {
00176
00177 return(SamStatus::FAIL_IO);
00178 }
00179
00180
00181 myRefs.resize(n_ref);
00182
00183 for(int refIndex = 0; refIndex < n_ref; refIndex++)
00184 {
00185
00186 Reference* ref = &(myRefs[refIndex]);
00187
00188
00189 ref->bins.resize(MAX_NUM_BINS + 1);
00190
00191
00192 if(ifread(indexFile, &(ref->n_bin), 4) != 4)
00193 {
00194
00195
00196 return(SamStatus::FAIL_PARSE);
00197 }
00198
00199
00200
00201 if(ref->n_bin == 0)
00202 {
00203 ref->n_mapped = 0;
00204 ref->n_unmapped = 0;
00205 }
00206
00207
00208 for(int binIndex = 0; binIndex < ref->n_bin; binIndex++)
00209 {
00210 uint32_t binNumber;
00211
00212
00213 if(ifread(indexFile, &(binNumber), 4) != 4)
00214 {
00215
00216
00217 return(SamStatus::FAIL_IO);
00218 }
00219
00220
00221
00222 Bin* binPtr = &(ref->bins[binNumber]);
00223 binPtr->bin = binNumber;
00224
00225
00226 if(ifread(indexFile, &(binPtr->n_chunk), 4) != 4)
00227 {
00228
00229
00230 return(SamStatus::FAIL_IO);
00231 }
00232
00233
00234
00235 uint32_t sizeOfChunkList = binPtr->n_chunk * sizeof(Chunk);
00236 binPtr->chunks = (Chunk*)malloc(sizeOfChunkList);
00237 if(ifread(indexFile, binPtr->chunks, sizeOfChunkList) != sizeOfChunkList)
00238 {
00239
00240
00241 return(SamStatus::FAIL_IO);
00242 }
00243
00244
00245 if(binPtr->bin != MAX_NUM_BINS)
00246 {
00247 for(int i = 0; i < binPtr->n_chunk; i++)
00248 {
00249 if(binPtr->chunks[i].chunk_beg < ref->minChunkOffset)
00250 {
00251 ref->minChunkOffset = binPtr->chunks[i].chunk_beg;
00252 }
00253 if(binPtr->chunks[i].chunk_end > ref->maxChunkOffset)
00254 {
00255 ref->maxChunkOffset = binPtr->chunks[i].chunk_end;
00256 }
00257 if(binPtr->chunks[i].chunk_end > maxOverallOffset)
00258 {
00259 maxOverallOffset = binPtr->chunks[i].chunk_end;
00260 }
00261 }
00262 }
00263 else
00264 {
00265
00266
00267 ref->n_mapped = binPtr->chunks[binPtr->n_chunk - 1].chunk_beg;
00268 ref->n_unmapped = binPtr->chunks[binPtr->n_chunk - 1].chunk_end;
00269 }
00270 }
00271
00272
00273 if(ifread(indexFile, &(ref->n_intv), 4) != 4)
00274 {
00275
00276 ref->n_intv = 0;
00277
00278 return(SamStatus::FAIL_IO);
00279 }
00280
00281
00282 uint32_t linearIndexSize = ref->n_intv * sizeof(uint64_t);
00283 ref->ioffsets = (uint64_t*)malloc(linearIndexSize);
00284 if(ifread(indexFile, ref->ioffsets, linearIndexSize) != linearIndexSize)
00285 {
00286
00287
00288 return(SamStatus::FAIL_IO);
00289 }
00290 }
00291
00292 int32_t numUnmapped = 0;
00293 if(ifread(indexFile, &numUnmapped, sizeof(int32_t)) == sizeof(int32_t))
00294 {
00295 myUnMappedNumReads = numUnmapped;
00296 }
00297
00298
00299 return(SamStatus::SUCCESS);
00300 }
00301
00302
00303
00304
00305 bool BamIndex::getChunksForRegion(int32_t refID, int32_t start, int32_t end,
00306 SortedChunkList& chunkList)
00307 {
00308 chunkList.clear();
00309
00310
00311
00312 if((start >= end) && (end != -1))
00313 {
00314 std::cerr << "Warning, requesting region where start <= end, so "
00315 << "no values will be returned.\n";
00316 return(false);
00317 }
00318
00319
00320
00321 if(refID == REF_ID_UNMAPPED)
00322 {
00323 Chunk refChunk;
00324
00325
00326 refChunk.chunk_beg = getMaxOffset();
00327
00328
00329 refChunk.chunk_end = Chunk::MAX_CHUNK_VALUE;
00330 return(chunkList.insert(refChunk));
00331 }
00332
00333 if((refID < 0) || (refID >= n_ref))
00334 {
00335
00336 return(false);
00337 }
00338
00339 const Reference* ref = &(myRefs[refID]);
00340
00341
00342 if(start == -1)
00343 {
00344 if(end == -1)
00345 {
00346
00347 if(ref->maxChunkOffset == 0)
00348 {
00349
00350 return(true);
00351 }
00352 Chunk refChunk;
00353 refChunk.chunk_beg = ref->minChunkOffset;
00354 refChunk.chunk_end = ref->maxChunkOffset;
00355 return(chunkList.insert(refChunk));
00356 }
00357 else
00358 {
00359 start = 0;
00360 }
00361 }
00362 if(end == -1)
00363 {
00364
00365 end = MAX_POSITION + 1;
00366 }
00367
00368
00369
00370 uint64_t minOffset = getMinOffsetFromLinearIndex(refID, start);
00371
00372 uint16_t binInRangeList[MAX_NUM_BINS + 1];
00373
00374 int numBins = getBinsForRegion(start, end, binInRangeList);
00375
00376
00377 for(int i = 0; i < numBins; ++i)
00378 {
00379 int binNum = binInRangeList[i];
00380 const Bin* bin = &(ref->bins[binNum]);
00381
00382
00383 for(int chunkIndex = 0; chunkIndex < bin->n_chunk; chunkIndex++)
00384 {
00385
00386
00387
00388
00389 if(bin->chunks[chunkIndex].chunk_end < minOffset)
00390 {
00391 continue;
00392 }
00393
00394 if(!chunkList.insert(bin->chunks[chunkIndex]))
00395 {
00396
00397 return(false);
00398 }
00399 }
00400 }
00401
00402
00403
00404 return(chunkList.mergeOverlapping());
00405 }
00406
00407
00408
00409 uint64_t BamIndex::getMaxOffset() const
00410 {
00411 return(maxOverallOffset);
00412 }
00413
00414
00415 bool BamIndex::getReferenceMinMax(int32_t refID,
00416 uint64_t& minOffset,
00417 uint64_t& maxOffset) const
00418 {
00419 if((refID < 0) || (refID >= (int32_t)myRefs.size()))
00420 {
00421
00422 return(false);
00423 }
00424
00425
00426 minOffset = myRefs[refID].minChunkOffset;
00427 maxOffset = myRefs[refID].maxChunkOffset;
00428 return(true);
00429 }
00430
00431
00432
00433 int32_t BamIndex::getNumRefs() const
00434 {
00435
00436 return(myRefs.size());
00437 }
00438
00439
00440
00441 int32_t BamIndex::getNumMappedReads(int32_t refID)
00442 {
00443
00444
00445 if(refID == REF_ID_UNMAPPED)
00446 {
00447
00448 return(0);
00449 }
00450
00451 if((refID < 0) || (refID >= (int32_t)myRefs.size()))
00452 {
00453
00454 return(-1);
00455 }
00456
00457
00458 return(myRefs[refID].n_mapped);
00459 }
00460
00461
00462
00463 int32_t BamIndex::getNumUnMappedReads(int32_t refID)
00464 {
00465
00466
00467 if(refID == REF_ID_UNMAPPED)
00468 {
00469 return(myUnMappedNumReads);
00470 }
00471
00472 if((refID < 0) || (refID >= (int32_t)myRefs.size()))
00473 {
00474
00475 return(-1);
00476 }
00477
00478
00479 return(myRefs[refID].n_unmapped);
00480 }
00481
00482
00483
00484 void BamIndex::printIndex(int32_t refID, bool summary)
00485 {
00486 std::cout << "BAM Index: " << std::endl;
00487 std::cout << "# Reference Sequences: " << n_ref << std::endl;
00488
00489 unsigned int startRef = 0;
00490 unsigned int endRef = myRefs.size() - 1;
00491 std::vector<Reference> refsToProcess;
00492 if(refID != -1)
00493 {
00494
00495 startRef = refID;
00496 endRef = refID;
00497 }
00498
00499
00500 for(unsigned int i = startRef; i <= endRef; ++i)
00501 {
00502 std::cout << std::dec
00503 << "\tReference ID: " << std::setw(4) << i
00504 << "; #Bins: "<< std::setw(6) << myRefs[i].n_bin
00505 << "; #Linear Index Entries: "
00506 << std::setw(6) << myRefs[i].n_intv
00507 << "; Min Chunk Offset: "
00508 << std::setw(18) << std::hex << std::showbase << myRefs[i].minChunkOffset
00509 << "; Max Chunk Offset: "
00510 << std::setw(18) << myRefs[i].maxChunkOffset
00511 << std::dec;
00512
00513 if(myRefs[i].n_mapped != Reference::UNKNOWN_MAP_INFO)
00514 {
00515 std::cout << "; " << myRefs[i].n_mapped << " Mapped Reads";
00516 }
00517 if(myRefs[i].n_mapped != Reference::UNKNOWN_MAP_INFO)
00518 {
00519 std::cout << "; " << myRefs[i].n_unmapped << " Unmapped Reads";
00520 }
00521 std::cout << std::endl;
00522
00523
00524 if(!summary)
00525 {
00526 std::vector<Bin>::iterator binIter;
00527 for(binIter = myRefs[i].bins.begin();
00528 binIter != myRefs[i].bins.end();
00529 ++binIter)
00530 {
00531 Bin* binPtr = &(*binIter);
00532 if(binPtr->bin == Bin::NOT_USED_BIN)
00533 {
00534
00535 continue;
00536 }
00537
00538 std::cout << "\t\t\tBin Name: " << binPtr->bin << std::endl;
00539 std::cout << "\t\t\t# Chunks: " << binPtr->n_chunk << std::endl;
00540 std::cout << std::hex << std::showbase;
00541
00542 for(int chunkIndex = 0; chunkIndex < binPtr->n_chunk;
00543 ++chunkIndex)
00544 {
00545
00546
00547
00548 if((binPtr->bin != MAX_NUM_BINS) ||
00549 (chunkIndex != (binPtr->n_chunk - 1)))
00550 {
00551 std::cout << "\t\t\t\tchunk_beg: "
00552 << binPtr->chunks[chunkIndex].chunk_beg
00553 << std::endl;
00554 std::cout << "\t\t\t\tchunk_end: "
00555 << binPtr->chunks[chunkIndex].chunk_end
00556 << std::endl;
00557 }
00558 }
00559 }
00560 std::cout << std::dec;
00561
00562
00563 for(int linearIndex = 0; linearIndex < myRefs[i].n_intv;
00564 ++linearIndex)
00565 {
00566 if(myRefs[i].ioffsets[linearIndex] != 0)
00567 {
00568 std::cout << "\t\t\tLinearIndex["
00569 << std::dec << linearIndex << "] Offset: "
00570 << std::hex << myRefs[i].ioffsets[linearIndex]
00571 << std::endl;
00572 }
00573 }
00574 }
00575 }
00576 }
00577
00578
00579
00580
00581
00582 int BamIndex::getBinsForRegion(uint32_t start, uint32_t end, uint16_t binList[MAX_NUM_BINS])
00583 {
00584 uint32_t binListIndex = 0, binNum;
00585 --end;
00586
00587
00588 if(start > MAX_POSITION)
00589 {
00590 start = MAX_POSITION;
00591 }
00592 if(end > MAX_POSITION)
00593 {
00594 end = MAX_POSITION;
00595 }
00596
00597 binList[binListIndex++] = 0;
00598 for (binNum = 1 + (start>>26); binNum <= 1 + (end>>26); ++binNum)
00599 binList[binListIndex++] = binNum;
00600 for (binNum = 9 + (start>>23); binNum <= 9 + (end>>23); ++binNum)
00601 binList[binListIndex++] = binNum;
00602 for (binNum = 73 + (start>>20); binNum <= 73 + (end>>20); ++binNum)
00603 binList[binListIndex++] = binNum;
00604 for (binNum = 585 + (start>>17); binNum <= 585 + (end>>17); ++binNum)
00605 binList[binListIndex++] = binNum;
00606 for (binNum = 4681 + (start>>14); binNum <= 4681 + (end>>14); ++binNum)
00607 binList[binListIndex++] = binNum;
00608
00609
00610 return binListIndex;
00611 }
00612
00613
00614
00615
00616 uint64_t BamIndex::getMinOffsetFromLinearIndex(int32_t refID, uint32_t position)
00617 {
00618 int32_t linearIndex = position >> LINEAR_INDEX_SHIFT;
00619 uint64_t minOffset = 0;
00620
00621 int32_t linearOffsetSize = myRefs[refID].n_intv;
00622
00623
00624
00625
00626
00627 if((linearOffsetSize == 0) || (linearIndex >= linearOffsetSize))
00628
00629 {
00630 return(0);
00631 }
00632 else
00633 {
00634
00635
00636 minOffset = myRefs[refID].ioffsets[linearIndex];
00637
00638
00639
00640
00641
00642
00643
00644 while((minOffset == 0) && (--linearIndex >= 0))
00645 {
00646 minOffset = myRefs[refID].ioffsets[linearIndex];
00647 }
00648 }
00649 return(minOffset);
00650 }
00651
00652