BamIndex.cpp
00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018 #include "BamIndex.h"
00019 #include <iomanip>
00020
00021 Chunk SortedChunkList::pop()
00022 {
00023 Chunk newChunk = chunkList.begin()->second;
00024 chunkList.erase(chunkList.begin());
00025 return(newChunk);
00026 }
00027
00028
00029 bool SortedChunkList::insert(const Chunk& chunkToInsert)
00030 {
00031 std::pair<std::map<uint64_t, Chunk>::iterator, bool> insertRes;
00032
00033 insertRes =
00034 chunkList.insert(std::pair<uint64_t, Chunk>(chunkToInsert.chunk_beg,
00035 chunkToInsert));
00036
00037 if(!insertRes.second)
00038 {
00039
00040 std::cerr << "Failed to insert into the SortedChunkList.\n";
00041 std::cerr << "\tpreviously found chunk:\tbeg = " << std::hex
00042 << insertRes.first->second.chunk_beg
00043 << "\tend = "
00044 << insertRes.first->second.chunk_end
00045 << "\nnew chunk:\tbeg = " << std::hex
00046 << chunkToInsert.chunk_beg
00047 << "\tend = "
00048 << chunkToInsert.chunk_end
00049 << std::endl;
00050 }
00051
00052 return(insertRes.second);
00053 }
00054
00055 void SortedChunkList::clear()
00056 {
00057 chunkList.clear();
00058 }
00059
00060 bool SortedChunkList::empty()
00061 {
00062 return(chunkList.empty());
00063 }
00064
00065
00066
00067 bool SortedChunkList::mergeOverlapping()
00068 {
00069
00070 std::map<uint64_t, Chunk>::iterator currentPos = chunkList.begin();
00071 std::map<uint64_t, Chunk>::iterator nextPos = chunkList.begin();
00072 if(nextPos != chunkList.end())
00073 {
00074 ++nextPos;
00075 }
00076
00077
00078 while(nextPos != chunkList.end())
00079 {
00080
00081
00082
00083 if(nextPos->second.chunk_end < currentPos->second.chunk_end)
00084 {
00085 chunkList.erase(nextPos);
00086 nextPos = currentPos;
00087 ++nextPos;
00088 continue;
00089 }
00090
00091
00092
00093
00094 if((nextPos->second.chunk_beg >> 16) <=
00095 (currentPos->second.chunk_end >> 16))
00096 {
00097 currentPos->second.chunk_end = nextPos->second.chunk_end;
00098
00099
00100 chunkList.erase(nextPos);
00101 nextPos = currentPos;
00102 ++nextPos;
00103 continue;
00104 }
00105 else
00106 {
00107
00108 currentPos = nextPos;
00109 ++nextPos;
00110 }
00111 }
00112 return(true);
00113 }
00114
00115
00116 BamIndex::BamIndex()
00117 {
00118 resetIndex();
00119 }
00120
00121
00122
00123 BamIndex::~BamIndex()
00124 {
00125 }
00126
00127
00128
00129 void BamIndex::resetIndex()
00130 {
00131 n_ref = 0;
00132 maxOverallOffset = 0;
00133
00134
00135 myRefs.clear();
00136 }
00137
00138
00139
00140 SamStatus::Status BamIndex::readIndex(const char* filename)
00141 {
00142
00143 resetIndex();
00144
00145 IFILE indexFile = ifopen(filename, "rb");
00146
00147
00148 if(indexFile == NULL)
00149 {
00150 return(SamStatus::FAIL_IO);
00151 }
00152
00153
00154
00155
00156 char magic[4];
00157 if(ifread(indexFile, magic, 4) != 4)
00158 {
00159
00160 return(SamStatus::FAIL_IO);
00161 }
00162
00163
00164 if (magic[0] != 'B' || magic[1] != 'A' || magic[2] != 'I' || magic[3] != 1)
00165 {
00166
00167 return(SamStatus::FAIL_PARSE);
00168 }
00169
00170
00171
00172 if(ifread(indexFile, &n_ref, 4) != 4)
00173 {
00174
00175 return(SamStatus::FAIL_IO);
00176 }
00177
00178
00179 myRefs.resize(n_ref);
00180
00181 for(int refIndex = 0; refIndex < n_ref; refIndex++)
00182 {
00183
00184 Reference* ref = &(myRefs[refIndex]);
00185
00186
00187 ref->bins.resize(MAX_NUM_BINS + 1);
00188
00189
00190 if(ifread(indexFile, &(ref->n_bin), 4) != 4)
00191 {
00192
00193
00194 return(SamStatus::FAIL_PARSE);
00195 }
00196
00197
00198 for(int binIndex = 0; binIndex < ref->n_bin; binIndex++)
00199 {
00200 uint32_t binNumber;
00201
00202
00203 if(ifread(indexFile, &(binNumber), 4) != 4)
00204 {
00205
00206
00207 return(SamStatus::FAIL_IO);
00208 }
00209
00210
00211
00212 Bin* binPtr = &(ref->bins[binNumber]);
00213 binPtr->bin = binNumber;
00214
00215
00216 if(ifread(indexFile, &(binPtr->n_chunk), 4) != 4)
00217 {
00218
00219
00220 return(SamStatus::FAIL_IO);
00221 }
00222
00223
00224
00225 uint32_t sizeOfChunkList = binPtr->n_chunk * sizeof(Chunk);
00226 binPtr->chunks = (Chunk*)malloc(sizeOfChunkList);
00227 if(ifread(indexFile, binPtr->chunks, sizeOfChunkList) != sizeOfChunkList)
00228 {
00229
00230
00231 return(SamStatus::FAIL_IO);
00232 }
00233
00234
00235 if(binPtr->bin != MAX_NUM_BINS)
00236 {
00237 for(int i = 0; i < binPtr->n_chunk; i++)
00238 {
00239 if(binPtr->chunks[i].chunk_beg < ref->minChunkOffset)
00240 {
00241 ref->minChunkOffset = binPtr->chunks[i].chunk_beg;
00242 }
00243 if(binPtr->chunks[i].chunk_end > ref->maxChunkOffset)
00244 {
00245 ref->maxChunkOffset = binPtr->chunks[i].chunk_end;
00246 }
00247 if(binPtr->chunks[i].chunk_end > maxOverallOffset)
00248 {
00249 maxOverallOffset = binPtr->chunks[i].chunk_end;
00250 }
00251 }
00252 }
00253 else
00254 {
00255
00256
00257 ref->n_mapped = binPtr->chunks[binPtr->n_chunk - 1].chunk_beg;
00258 ref->n_unmapped = binPtr->chunks[binPtr->n_chunk - 1].chunk_end;
00259 }
00260 }
00261
00262
00263 if(ifread(indexFile, &(ref->n_intv), 4) != 4)
00264 {
00265
00266 ref->n_intv = 0;
00267
00268 return(SamStatus::FAIL_IO);
00269 }
00270
00271
00272 uint32_t linearIndexSize = ref->n_intv * sizeof(uint64_t);
00273 ref->ioffsets = (uint64_t*)malloc(linearIndexSize);
00274 if(ifread(indexFile, ref->ioffsets, linearIndexSize) != linearIndexSize)
00275 {
00276
00277
00278 return(SamStatus::FAIL_IO);
00279 }
00280 }
00281
00282
00283 return(SamStatus::SUCCESS);
00284 }
00285
00286
00287
00288
00289 bool BamIndex::getChunksForRegion(int32_t refID, int32_t start, int32_t end,
00290 SortedChunkList& chunkList)
00291 {
00292 chunkList.clear();
00293
00294
00295
00296 if((start >= end) && (end != -1))
00297 {
00298 std::cerr << "Warning, requesting region where start <= end, so "
00299 << "no values will be returned.\n";
00300 return(false);
00301 }
00302
00303
00304
00305 if(refID == REF_ID_UNMAPPED)
00306 {
00307 Chunk refChunk;
00308
00309
00310 refChunk.chunk_beg = getMaxOffset();
00311
00312
00313 refChunk.chunk_end = Chunk::MAX_CHUNK_VALUE;
00314 return(chunkList.insert(refChunk));
00315 }
00316
00317 if((refID < 0) || (refID >= n_ref))
00318 {
00319
00320 return(false);
00321 }
00322
00323 const Reference* ref = &(myRefs[refID]);
00324
00325
00326 if(start == -1)
00327 {
00328 if(end == -1)
00329 {
00330
00331 if(ref->maxChunkOffset == 0)
00332 {
00333
00334 return(true);
00335 }
00336 Chunk refChunk;
00337 refChunk.chunk_beg = ref->minChunkOffset;
00338 refChunk.chunk_end = ref->maxChunkOffset;
00339 return(chunkList.insert(refChunk));
00340 }
00341 else
00342 {
00343 start = 0;
00344 }
00345 }
00346 if(end == -1)
00347 {
00348
00349 end = MAX_POSITION + 1;
00350 }
00351
00352
00353
00354 uint64_t minOffset = getMinOffsetFromLinearIndex(refID, start);
00355
00356 uint16_t binInRangeList[MAX_NUM_BINS + 1];
00357
00358 int numBins = getBinsForRegion(start, end, binInRangeList);
00359
00360
00361 for(int i = 0; i < numBins; ++i)
00362 {
00363 int binNum = binInRangeList[i];
00364 const Bin* bin = &(ref->bins[binNum]);
00365
00366
00367 for(int chunkIndex = 0; chunkIndex < bin->n_chunk; chunkIndex++)
00368 {
00369
00370
00371
00372
00373 if(bin->chunks[chunkIndex].chunk_end < minOffset)
00374 {
00375 continue;
00376 }
00377
00378 if(!chunkList.insert(bin->chunks[chunkIndex]))
00379 {
00380
00381 return(false);
00382 }
00383 }
00384 }
00385
00386
00387
00388 return(chunkList.mergeOverlapping());
00389 }
00390
00391
00392
00393 uint64_t BamIndex::getMaxOffset() const
00394 {
00395 return(maxOverallOffset);
00396 }
00397
00398
00399 bool BamIndex::getReferenceMinMax(int32_t refID,
00400 uint64_t& minOffset,
00401 uint64_t& maxOffset) const
00402 {
00403 if((refID < 0) || (refID >= (int32_t)myRefs.size()))
00404 {
00405
00406 return(false);
00407 }
00408
00409
00410 minOffset = myRefs[refID].minChunkOffset;
00411 maxOffset = myRefs[refID].maxChunkOffset;
00412 return(true);
00413 }
00414
00415
00416
00417 int32_t BamIndex::getNumRefs() const
00418 {
00419
00420 return(myRefs.size());
00421 }
00422
00423
00424
00425 void BamIndex::printIndex(int32_t refID, bool summary)
00426 {
00427 std::cout << "BAM Index: " << std::endl;
00428 std::cout << "# Reference Sequences: " << n_ref << std::endl;
00429
00430 unsigned int startRef = 0;
00431 unsigned int endRef = myRefs.size() - 1;
00432 std::vector<Reference> refsToProcess;
00433 if(refID != -1)
00434 {
00435
00436 startRef = refID;
00437 endRef = refID;
00438 }
00439
00440
00441 for(unsigned int i = startRef; i <= endRef; ++i)
00442 {
00443 std::cout << std::dec
00444 << "\tReference ID: " << std::setw(4) << i
00445 << "; #Bins: "<< std::setw(6) << myRefs[i].n_bin
00446 << "; #Linear Index Entries: "
00447 << std::setw(6) << myRefs[i].n_intv
00448 << "; Min Chunk Offset: "
00449 << std::setw(18) << std::hex << std::showbase << myRefs[i].minChunkOffset
00450 << "; Max Chunk Offset: "
00451 << std::setw(18) << myRefs[i].maxChunkOffset
00452 << std::dec;
00453
00454 if(myRefs[i].n_mapped != Reference::UNKNOWN_MAP_INFO)
00455 {
00456 std::cout << "; " << myRefs[i].n_mapped << " Mapped Reads";
00457 }
00458 if(myRefs[i].n_mapped != Reference::UNKNOWN_MAP_INFO)
00459 {
00460 std::cout << "; " << myRefs[i].n_unmapped << " Unmapped Reads";
00461 }
00462 std::cout << std::endl;
00463
00464
00465 if(!summary)
00466 {
00467 std::vector<Bin>::iterator binIter;
00468 for(binIter = myRefs[i].bins.begin();
00469 binIter != myRefs[i].bins.end();
00470 ++binIter)
00471 {
00472 Bin* binPtr = &(*binIter);
00473 if(binPtr->bin == Bin::NOT_USED_BIN)
00474 {
00475
00476 continue;
00477 }
00478
00479 std::cout << "\t\t\tBin Name: " << binPtr->bin << std::endl;
00480 std::cout << "\t\t\t# Chunks: " << binPtr->n_chunk << std::endl;
00481 std::cout << std::hex << std::showbase;
00482
00483 for(int chunkIndex = 0; chunkIndex < binPtr->n_chunk;
00484 ++chunkIndex)
00485 {
00486
00487
00488
00489 if((binPtr->bin != MAX_NUM_BINS) ||
00490 (chunkIndex != (binPtr->n_chunk - 1)))
00491 {
00492 std::cout << "\t\t\t\tchunk_beg: "
00493 << binPtr->chunks[chunkIndex].chunk_beg
00494 << std::endl;
00495 std::cout << "\t\t\t\tchunk_end: "
00496 << binPtr->chunks[chunkIndex].chunk_end
00497 << std::endl;
00498 }
00499 }
00500 }
00501 std::cout << std::dec;
00502
00503
00504 for(int linearIndex = 0; linearIndex < myRefs[i].n_intv;
00505 ++linearIndex)
00506 {
00507 if(myRefs[i].ioffsets[linearIndex] != 0)
00508 {
00509 std::cout << "\t\t\tLinearIndex["
00510 << std::dec << linearIndex << "] Offset: "
00511 << std::hex << myRefs[i].ioffsets[linearIndex]
00512 << std::endl;
00513 }
00514 }
00515 }
00516 }
00517 }
00518
00519
00520
00521
00522
00523 int BamIndex::getBinsForRegion(uint32_t start, uint32_t end, uint16_t binList[MAX_NUM_BINS])
00524 {
00525 uint32_t binListIndex = 0, binNum;
00526 --end;
00527
00528
00529 if(start > MAX_POSITION)
00530 {
00531 start = MAX_POSITION;
00532 }
00533 if(end > MAX_POSITION)
00534 {
00535 end = MAX_POSITION;
00536 }
00537
00538 binList[binListIndex++] = 0;
00539 for (binNum = 1 + (start>>26); binNum <= 1 + (end>>26); ++binNum)
00540 binList[binListIndex++] = binNum;
00541 for (binNum = 9 + (start>>23); binNum <= 9 + (end>>23); ++binNum)
00542 binList[binListIndex++] = binNum;
00543 for (binNum = 73 + (start>>20); binNum <= 73 + (end>>20); ++binNum)
00544 binList[binListIndex++] = binNum;
00545 for (binNum = 585 + (start>>17); binNum <= 585 + (end>>17); ++binNum)
00546 binList[binListIndex++] = binNum;
00547 for (binNum = 4681 + (start>>14); binNum <= 4681 + (end>>14); ++binNum)
00548 binList[binListIndex++] = binNum;
00549
00550
00551 return binListIndex;
00552 }
00553
00554
00555
00556
00557 uint64_t BamIndex::getMinOffsetFromLinearIndex(int32_t refID, uint32_t position)
00558 {
00559 int32_t linearIndex = position >> LINEAR_INDEX_SHIFT;
00560 uint64_t minOffset = 0;
00561
00562 int32_t linearOffsetSize = myRefs[refID].n_intv;
00563
00564
00565
00566
00567
00568 if((linearOffsetSize == 0) || (linearIndex >= linearOffsetSize))
00569
00570 {
00571 return(0);
00572 }
00573 else
00574 {
00575
00576
00577 minOffset = myRefs[refID].ioffsets[linearIndex];
00578
00579
00580
00581
00582
00583
00584
00585 while((minOffset == 0) && (--linearIndex >= 0))
00586 {
00587 minOffset = myRefs[refID].ioffsets[linearIndex];
00588 }
00589 }
00590 return(minOffset);
00591 }
00592
00593