BamIndex.cpp

00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #include "BamIndex.h"
00019 #include <iomanip>
00020 
00021 Chunk SortedChunkList::pop()
00022 {
00023     Chunk newChunk = chunkList.begin()->second;
00024     chunkList.erase(chunkList.begin());
00025     return(newChunk);
00026 }
00027 
00028 
00029 bool SortedChunkList::insert(const Chunk& chunkToInsert)
00030 {
00031     std::pair<std::map<uint64_t, Chunk>::iterator, bool> insertRes;
00032     // Insert the passed in chunk.
00033     insertRes = 
00034         chunkList.insert(std::pair<uint64_t, Chunk>(chunkToInsert.chunk_beg,
00035                                                     chunkToInsert));
00036 
00037     if(!insertRes.second)
00038     {
00039         // Failed to insert the chunk.
00040         std::cerr << "Failed to insert into the SortedChunkList.\n";
00041         std::cerr << "\tpreviously found chunk:\tbeg = " << std::hex
00042                   << insertRes.first->second.chunk_beg 
00043                   << "\tend = "
00044                   << insertRes.first->second.chunk_end
00045                   << "\nnew chunk:\tbeg = " << std::hex
00046                   << chunkToInsert.chunk_beg 
00047                   << "\tend = "
00048                   << chunkToInsert.chunk_end
00049                   << std::endl;
00050     }
00051     // return the result that comes from insertRes.
00052     return(insertRes.second);
00053 }
00054 
00055 void SortedChunkList::clear()
00056 {
00057     chunkList.clear();
00058 }
00059 
00060 bool SortedChunkList::empty()
00061 {
00062     return(chunkList.empty());
00063 }
00064 
00065 
00066 // Merge overlapping chunks found in this list.
00067 bool SortedChunkList::mergeOverlapping()
00068 {
00069     // Start at the beginning of the list and iterate through.
00070     std::map<uint64_t, Chunk>::iterator currentPos = chunkList.begin();
00071     std::map<uint64_t, Chunk>::iterator nextPos = chunkList.begin();
00072     if(nextPos != chunkList.end())
00073     {
00074         ++nextPos;
00075     }
00076     
00077     // Loop until the end is reached.
00078     while(nextPos != chunkList.end())
00079     {
00080         // If the next chunk is completely contained within the current 
00081         // chunk (its end is less than the current chunk's end), 
00082         // delete it since its position is already covered.
00083         if(nextPos->second.chunk_end < currentPos->second.chunk_end)
00084         {
00085             chunkList.erase(nextPos);
00086             nextPos = currentPos;
00087             ++nextPos;
00088             continue;
00089         }
00090         
00091         // If the next chunk's start position's BGZF block is less than or
00092         // equal to the BGZF block of the current chunk's end position,
00093         // combine the two chunks into the current chunk.
00094         if((nextPos->second.chunk_beg >> 16) <= 
00095            (currentPos->second.chunk_end >> 16))
00096         {
00097             currentPos->second.chunk_end = nextPos->second.chunk_end;
00098             // nextPos has now been included in the current pos, so
00099             // remove it.
00100             chunkList.erase(nextPos);
00101             nextPos = currentPos;
00102             ++nextPos;
00103             continue;
00104         }
00105         else
00106         {
00107             // Nothing to combine.  So try combining at the next 
00108             currentPos = nextPos;
00109             ++nextPos;
00110         }
00111     }
00112     return(true);
00113 }
00114 
00115 
00116 BamIndex::BamIndex()
00117 {
00118     resetIndex();
00119 }
00120 
00121 
00122 
00123 BamIndex::~BamIndex()
00124 {
00125 }
00126 
00127 
00128 // Reset the member data for a new index file.
00129 void BamIndex::resetIndex()
00130 {
00131     n_ref = 0;
00132     maxOverallOffset = 0;
00133     
00134     myUnMappedNumReads = -1;
00135 
00136     // Clear the references.
00137     myRefs.clear();
00138 }
00139 
00140 
00141 // Read & parse the specified index file.
00142 SamStatus::Status BamIndex::readIndex(const char* filename)
00143 {
00144     // Reset the index from anything that may previously be set.
00145     resetIndex();
00146 
00147     IFILE indexFile = ifopen(filename, "rb");
00148 
00149     // Failed to open the index file.
00150     if(indexFile == NULL)
00151     {
00152         return(SamStatus::FAIL_IO);
00153     }
00154 
00155     // generate the bam index structure.
00156 
00157     // Read the magic string.
00158     char magic[4];
00159     if(ifread(indexFile, magic, 4) != 4)
00160     {
00161         // Failed to read the magic
00162         return(SamStatus::FAIL_IO);
00163     }
00164 
00165     // If this is not an index file, set num references to 0. 
00166     if (magic[0] != 'B' || magic[1] != 'A' || magic[2] != 'I' || magic[3] != 1)
00167     {
00168         // Not a BAM Index file.
00169         return(SamStatus::FAIL_PARSE);
00170     }
00171 
00172     // It is a bam index file.
00173     // Read the number of reference sequences.
00174     if(ifread(indexFile, &n_ref, 4) != 4)
00175     {
00176         // Failed to read.
00177         return(SamStatus::FAIL_IO);
00178     }
00179 
00180     // Size the references.
00181     myRefs.resize(n_ref);
00182 
00183     for(int refIndex = 0; refIndex < n_ref; refIndex++)
00184     {
00185         // Read each reference.
00186         Reference* ref = &(myRefs[refIndex]);
00187         
00188         // Resize the bins so they can be indexed by bin number.
00189         ref->bins.resize(MAX_NUM_BINS + 1);
00190         
00191         // Read the number of bins.
00192         if(ifread(indexFile, &(ref->n_bin), 4) != 4)
00193         {
00194             // Failed to read the number of bins.
00195             // Return failure.
00196             return(SamStatus::FAIL_PARSE);
00197         }
00198 
00199         // If there are no bins, then there are no
00200         // mapped/unmapped reads.
00201         if(ref->n_bin == 0)
00202         {
00203             ref->n_mapped = 0;
00204             ref->n_unmapped = 0;
00205         }
00206 
00207         // Read each bin.
00208         for(int binIndex = 0; binIndex < ref->n_bin; binIndex++)
00209         {
00210             uint32_t binNumber;
00211 
00212             // Read in the bin number.
00213             if(ifread(indexFile, &(binNumber), 4) != 4)
00214             {
00215                 // Failed to read the bin number.
00216                 // Return failure.
00217                 return(SamStatus::FAIL_IO);
00218             }
00219 
00220             // Add the bin to the reference and get the
00221             // pointer back so the values can be set in it.
00222             Bin* binPtr = &(ref->bins[binNumber]);
00223             binPtr->bin = binNumber;
00224          
00225             // Read in the number of chunks.
00226             if(ifread(indexFile, &(binPtr->n_chunk), 4) != 4)
00227             {
00228                 // Failed to read number of chunks.
00229                 // Return failure.
00230                 return(SamStatus::FAIL_IO);
00231             }
00232 
00233             // Read in the chunks.
00234             // Allocate space for the chunks.
00235             uint32_t sizeOfChunkList = binPtr->n_chunk * sizeof(Chunk);
00236             binPtr->chunks = (Chunk*)malloc(sizeOfChunkList);
00237             if(ifread(indexFile, binPtr->chunks, sizeOfChunkList) != sizeOfChunkList)
00238             {
00239                 // Failed to read the chunks.
00240                 // Return failure.
00241                 return(SamStatus::FAIL_IO);
00242             }
00243 
00244             // Determine the min/max for this bin if it is not the max bin.
00245             if(binPtr->bin != MAX_NUM_BINS)
00246             {
00247                 for(int i = 0; i < binPtr->n_chunk; i++)
00248                 {
00249                     if(binPtr->chunks[i].chunk_beg < ref->minChunkOffset)
00250                     {
00251                         ref->minChunkOffset = binPtr->chunks[i].chunk_beg;
00252                     }
00253                     if(binPtr->chunks[i].chunk_end > ref->maxChunkOffset)
00254                     {
00255                         ref->maxChunkOffset = binPtr->chunks[i].chunk_end;
00256                     }
00257                     if(binPtr->chunks[i].chunk_end > maxOverallOffset)
00258                     {
00259                         maxOverallOffset = binPtr->chunks[i].chunk_end;
00260                     }
00261                 }
00262             }
00263             else
00264             {
00265                 // Mapped/unmapped are the last chunk of the
00266                 // MAX BIN
00267                 ref->n_mapped = binPtr->chunks[binPtr->n_chunk - 1].chunk_beg;
00268                 ref->n_unmapped = binPtr->chunks[binPtr->n_chunk - 1].chunk_end;
00269             }
00270         }
00271 
00272         // Read the number of intervals.
00273         if(ifread(indexFile, &(ref->n_intv), 4) != 4)
00274         {
00275             // Failed to read, set to 0.
00276             ref->n_intv = 0;
00277             // Return failure.
00278             return(SamStatus::FAIL_IO);
00279         }
00280 
00281         // Allocate space for the intervals and read them.
00282         uint32_t linearIndexSize = ref->n_intv * sizeof(uint64_t);
00283         ref->ioffsets = (uint64_t*)malloc(linearIndexSize);
00284         if(ifread(indexFile, ref->ioffsets, linearIndexSize) != linearIndexSize)
00285         {
00286             // Failed to read the linear index.
00287             // Return failure.
00288             return(SamStatus::FAIL_IO);
00289         }
00290     }
00291 
00292     int32_t numUnmapped = 0;
00293     if(ifread(indexFile, &numUnmapped, sizeof(int32_t)) == sizeof(int32_t))
00294     {
00295         myUnMappedNumReads = numUnmapped;
00296     }
00297 
00298     // Successfully read teh bam index file.
00299     return(SamStatus::SUCCESS);
00300 }
00301 
00302 
00303 // Get the chunks for the specified reference id and start/end 0-based
00304 // coordinates.
00305 bool BamIndex::getChunksForRegion(int32_t refID, int32_t start, int32_t end, 
00306                                   SortedChunkList& chunkList)
00307 {
00308     chunkList.clear();
00309 
00310     // If start is >= to end, there will be no sections, return no
00311     // regions.
00312     if((start >= end) && (end != -1))
00313     {
00314         std::cerr << "Warning, requesting region where start <= end, so "
00315                   << "no values will be returned.\n";
00316         return(false);
00317     }
00318 
00319     // Handle REF_ID_UNMAPPED.  This uses a default chunk which covers
00320     // from the max offset to the end of the file.
00321     if(refID == REF_ID_UNMAPPED)
00322     {
00323         Chunk refChunk;
00324         // The start of the unmapped region is the max offset found
00325         // in the index file.
00326         refChunk.chunk_beg = getMaxOffset();
00327         // The end of the unmapped region is the end of the file, so
00328         // set chunk end to the max value.
00329         refChunk.chunk_end = Chunk::MAX_CHUNK_VALUE;
00330         return(chunkList.insert(refChunk));
00331     }
00332 
00333     if((refID < 0) || (refID >= n_ref))
00334     {
00335         // The specified refID is out of range, return false.
00336         return(false);
00337     }
00338 
00339     const Reference* ref = &(myRefs[refID]);
00340 
00341     // Handle where start/end are defaults.    
00342     if(start == -1)
00343     {
00344         if(end == -1)
00345         {
00346             // This is whole chromosome, so take a shortcut.
00347             if(ref->maxChunkOffset == 0)
00348             {
00349                 // No chunks for this region, but this is not an error.
00350                 return(true);
00351             }
00352             Chunk refChunk;
00353             refChunk.chunk_beg = ref->minChunkOffset;
00354             refChunk.chunk_end = ref->maxChunkOffset;
00355             return(chunkList.insert(refChunk));
00356         }
00357         else
00358         {
00359             start = 0;
00360         }
00361     }
00362     if(end == -1)
00363     {
00364         // MAX_POSITION is inclusive, but end is exclusive, so add 1.
00365         end = MAX_POSITION + 1;
00366     }
00367 
00368     // Determine the minimum offset for the given start position.  This
00369     // is done by using the linear index for the specified start position.
00370     uint64_t minOffset = getMinOffsetFromLinearIndex(refID, start);
00371 
00372     uint16_t binInRangeList[MAX_NUM_BINS + 1];
00373     
00374     int numBins = getBinsForRegion(start, end, binInRangeList);
00375 
00376     // loop through the bins in the range and get the chunks.
00377     for(int i = 0; i < numBins; ++i)
00378     {
00379         int binNum = binInRangeList[i];
00380         const Bin* bin = &(ref->bins[binNum]);
00381 
00382         // Add each chunk in the bin to the map.
00383         for(int chunkIndex = 0; chunkIndex < bin->n_chunk; chunkIndex++)
00384         {
00385             // If the end of the chunk is less than the minimum offset
00386             // for the 16K block that starts our region, then no
00387             // records in this chunk will cross our region, so do
00388             // not add it to the chunks we need to use.
00389             if(bin->chunks[chunkIndex].chunk_end < minOffset)
00390             {
00391                 continue;
00392             }
00393             // Add the chunk to the map.
00394             if(!chunkList.insert(bin->chunks[chunkIndex]))
00395             {
00396                 // Failed to add to the map, return false.
00397                 return(false);
00398             }
00399         }
00400     }
00401 
00402     // Now that all chunks have been added to the list,
00403     // handle overlapping chunks.
00404     return(chunkList.mergeOverlapping());
00405 }
00406 
00407 
00408 // Get the max offset.
00409 uint64_t BamIndex::getMaxOffset() const
00410 {
00411     return(maxOverallOffset);
00412 }
00413 
00414 // Get the min & max file offsets for the reference ID.
00415 bool BamIndex::getReferenceMinMax(int32_t refID,
00416                                   uint64_t& minOffset,
00417                                   uint64_t& maxOffset) const
00418 {
00419     if((refID < 0) || (refID >= (int32_t)myRefs.size()))
00420     {
00421         // Reference ID is out of range for this index file.
00422         return(false);
00423     }
00424 
00425     // Get this reference.
00426     minOffset = myRefs[refID].minChunkOffset;
00427     maxOffset = myRefs[refID].maxChunkOffset;
00428     return(true);
00429 }
00430 
00431     
00432 // Get the number of references in this index.
00433 int32_t BamIndex::getNumRefs() const
00434 {
00435     // Return the number of references.
00436     return(myRefs.size());
00437 }
00438 
00439 
00440 // Get the number of mapped reads for this reference id.
00441 int32_t BamIndex::getNumMappedReads(int32_t refID)
00442 {
00443     // If it is the reference id of unmapped reads, return
00444     // that there are no mapped reads.
00445     if(refID == REF_ID_UNMAPPED)
00446    {
00447        // These are by definition all unmapped reads.
00448        return(0);
00449    }
00450 
00451     if((refID < 0) || (refID >= (int32_t)myRefs.size()))
00452     {
00453         // Reference ID is out of range for this index file.
00454         return(-1);
00455     }
00456 
00457     // Get this reference.
00458     return(myRefs[refID].n_mapped);
00459 }
00460 
00461 
00462 // Get the number of unmapped reads for this reference id.
00463 int32_t BamIndex::getNumUnMappedReads(int32_t refID)
00464 {
00465     // If it is the reference id of unmapped reads, return
00466     // that value.
00467     if(refID == REF_ID_UNMAPPED)
00468     {
00469         return(myUnMappedNumReads);
00470     }
00471 
00472     if((refID < 0) || (refID >= (int32_t)myRefs.size()))
00473     {
00474         // Reference ID is out of range for this index file.
00475         return(-1);
00476     }
00477 
00478     // Get this reference.
00479     return(myRefs[refID].n_unmapped);
00480 }
00481 
00482 
00483 // Print the bam index.
00484 void BamIndex::printIndex(int32_t refID, bool summary)
00485 {
00486     std::cout << "BAM Index: " << std::endl;
00487     std::cout << "# Reference Sequences: " << n_ref << std::endl;
00488 
00489     unsigned int startRef = 0;
00490     unsigned int endRef = myRefs.size() - 1;
00491     std::vector<Reference> refsToProcess;
00492     if(refID != -1)
00493     {
00494         // Set start and end ref to the specified reference id.
00495         startRef = refID;
00496         endRef = refID;
00497     }
00498 
00499     // Print out the information for each bin.
00500     for(unsigned int i = startRef; i <= endRef; ++i)
00501     {
00502         std::cout << std::dec 
00503                   << "\tReference ID: " << std::setw(4) << i
00504                   << ";  #Bins: "<< std::setw(6) << myRefs[i].n_bin 
00505                   << ";  #Linear Index Entries: " 
00506                   << std::setw(6) << myRefs[i].n_intv
00507                   << ";  Min Chunk Offset: " 
00508                   << std::setw(18) << std::hex << std::showbase << myRefs[i].minChunkOffset
00509                   << ";  Max Chunk Offset: "
00510                   << std::setw(18) << myRefs[i].maxChunkOffset
00511                   << std::dec;
00512         // Print the mapped/unmapped if set.
00513         if(myRefs[i].n_mapped != Reference::UNKNOWN_MAP_INFO)
00514         {            
00515             std::cout << ";  " << myRefs[i].n_mapped << " Mapped Reads";
00516         }
00517         if(myRefs[i].n_mapped != Reference::UNKNOWN_MAP_INFO)
00518         {            
00519             std::cout << ";  " << myRefs[i].n_unmapped << " Unmapped Reads";
00520         }
00521         std::cout << std::endl;
00522         
00523         // Only print more details if not summary.
00524         if(!summary)
00525         {
00526             std::vector<Bin>::iterator binIter;
00527             for(binIter = myRefs[i].bins.begin(); 
00528                 binIter != myRefs[i].bins.end();
00529                 ++binIter)
00530             {
00531                 Bin* binPtr = &(*binIter);
00532                 if(binPtr->bin == Bin::NOT_USED_BIN)
00533                 {
00534                     // This bin is not used, continue.
00535                     continue;
00536                 }
00537                 // Print the bin info.
00538                 std::cout << "\t\t\tBin Name: " << binPtr->bin << std::endl;
00539                 std::cout << "\t\t\t# Chunks: " << binPtr->n_chunk << std::endl;
00540                 std::cout << std::hex << std::showbase;
00541 
00542                 for(int chunkIndex = 0; chunkIndex < binPtr->n_chunk;
00543                     ++chunkIndex)
00544                 {
00545                     // If this is the last chunk of the MAX_NUM_BINS - it
00546                     // contains a mapped/unmapped count rather than the regular
00547                     // chunk addresses.
00548                     if((binPtr->bin != MAX_NUM_BINS) ||
00549                        (chunkIndex != (binPtr->n_chunk - 1)))
00550                     {
00551                         std::cout << "\t\t\t\tchunk_beg: "
00552                                   << binPtr->chunks[chunkIndex].chunk_beg 
00553                                   << std::endl;
00554                         std::cout << "\t\t\t\tchunk_end: "
00555                                   << binPtr->chunks[chunkIndex].chunk_end
00556                                   << std::endl;
00557                     }
00558                 }
00559             }
00560             std::cout << std::dec;
00561             
00562             // Print the linear index.
00563             for(int linearIndex = 0; linearIndex < myRefs[i].n_intv;
00564                 ++linearIndex)
00565             {
00566                 if(myRefs[i].ioffsets[linearIndex] != 0)
00567                 {
00568                     std::cout << "\t\t\tLinearIndex["
00569                               << std::dec << linearIndex << "] Offset: " 
00570                               << std::hex << myRefs[i].ioffsets[linearIndex]
00571                               << std::endl;
00572                 }
00573             }
00574         }
00575     }
00576 }
00577 
00578 
00579 // Returns the minimum offset of records that cross the 16K block that
00580 // contains the specified position for the given reference id.
00581 // The basic logic is from samtools reg2bins and the samtools format specification pdf.
00582 int BamIndex::getBinsForRegion(uint32_t start, uint32_t end, uint16_t binList[MAX_NUM_BINS])
00583 {
00584     uint32_t binListIndex = 0, binNum;
00585     --end;
00586 
00587         // Check if beg/end go too high, set to max position.
00588         if(start > MAX_POSITION)
00589         {
00590             start = MAX_POSITION;
00591         }
00592         if(end > MAX_POSITION)
00593         {
00594             end = MAX_POSITION;
00595         }
00596 
00597     binList[binListIndex++] = 0;
00598     for (binNum =    1 + (start>>26); binNum <=    1 + (end>>26); ++binNum) 
00599             binList[binListIndex++] = binNum;
00600     for (binNum =    9 + (start>>23); binNum <=    9 + (end>>23); ++binNum) 
00601             binList[binListIndex++] = binNum;
00602     for (binNum =   73 + (start>>20); binNum <=   73 + (end>>20); ++binNum)
00603             binList[binListIndex++] = binNum;
00604     for (binNum =  585 + (start>>17); binNum <=  585 + (end>>17); ++binNum)
00605             binList[binListIndex++] = binNum;
00606     for (binNum = 4681 + (start>>14); binNum <= 4681 + (end>>14); ++binNum)
00607             binList[binListIndex++] = binNum;
00608 
00609         // binListIndex contains the number of items added to the list.
00610     return binListIndex;
00611 }
00612 
00613 
00614 // Returns the minimum offset of records that cross the 16K block that
00615 // contains the specified position for the given reference id.
00616 uint64_t BamIndex::getMinOffsetFromLinearIndex(int32_t refID, uint32_t position)
00617 {
00618     int32_t linearIndex = position >> LINEAR_INDEX_SHIFT;
00619     uint64_t minOffset = 0;
00620     // Check to see if the position is out of range of the linear index.
00621     int32_t linearOffsetSize = myRefs[refID].n_intv;
00622 
00623     // If there are no entries in the linear index, return 0.
00624     // Or if the linear index is not large enough to include
00625     // the start block, then there can be no records that cross
00626     // our region, so return 0.
00627     if((linearOffsetSize == 0) || (linearIndex >= linearOffsetSize))
00628 
00629     {
00630         return(0);
00631     }
00632     else
00633     {
00634         // The linear index is specified for this block, so return that
00635         // value.
00636         minOffset = myRefs[refID].ioffsets[linearIndex];
00637 
00638         // If the offset is 0, go to the previous block that has an offset.
00639         // This is due to a couple of bugs in older sam tools indexes.
00640         // 1) they add one to the index location (so when reading those, you
00641         // may be starting earlier than necessary)
00642         // 2) (the bigger issue) They did not include bins 4681-37449 in
00643         // the linear index.
00644         while((minOffset == 0) && (--linearIndex >= 0))
00645         {
00646             minOffset = myRefs[refID].ioffsets[linearIndex];            
00647         }
00648     }
00649     return(minOffset);
00650 }
00651 
00652 
Generated on Tue Mar 22 22:50:13 2011 for StatGen Software by  doxygen 1.6.3