libStatGen Software  1
BamIndex.cpp
00001 /*
00002  *  Copyright (C) 2010-2012  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #include "BamIndex.h"
00019 #include <iomanip>
00020 
00021 BamIndex::BamIndex()
00022     : IndexBase(),
00023       maxOverallOffset(0),
00024       myUnMappedNumReads(-1)
00025 {
00026 }
00027 
00028 
00029 BamIndex::~BamIndex()
00030 {
00031 }
00032 
00033 
00034 // Reset the member data for a new index file.
00035 void BamIndex::resetIndex()
00036 {
00037     IndexBase::resetIndex();
00038 
00039     maxOverallOffset = 0;    
00040     myUnMappedNumReads = -1;
00041 }
00042 
00043 
00044 // Read & parse the specified index file.
00045 SamStatus::Status BamIndex::readIndex(const char* filename)
00046 {
00047     // Reset the index from anything that may previously be set.
00048     resetIndex();
00049 
00050     IFILE indexFile = ifopen(filename, "rb");
00051 
00052     // Failed to open the index file.
00053     if(indexFile == NULL)
00054     {
00055         return(SamStatus::FAIL_IO);
00056     }
00057 
00058     // generate the bam index structure.
00059 
00060     // Read the magic string.
00061     char magic[4];
00062     if(ifread(indexFile, magic, 4) != 4)
00063     {
00064         // Failed to read the magic
00065         ifclose(indexFile);
00066         return(SamStatus::FAIL_IO);
00067     }
00068 
00069     // If this is not an index file, set num references to 0. 
00070     if (magic[0] != 'B' || magic[1] != 'A' || magic[2] != 'I' || magic[3] != 1)
00071     {
00072         // Not a BAM Index file.
00073         ifclose(indexFile);
00074         return(SamStatus::FAIL_PARSE);
00075     }
00076 
00077     // It is a bam index file.
00078     // Read the number of reference sequences.
00079     if(ifread(indexFile, &n_ref, 4) != 4)
00080     {
00081         // Failed to read.
00082         ifclose(indexFile);
00083         return(SamStatus::FAIL_IO);
00084     }
00085 
00086     // Size the references.
00087     myRefs.resize(n_ref);
00088 
00089     for(int refIndex = 0; refIndex < n_ref; refIndex++)
00090     {
00091         // Read each reference.
00092         Reference* ref = &(myRefs[refIndex]);
00093         
00094         // Resize the bins so they can be indexed by bin number.
00095         ref->bins.resize(MAX_NUM_BINS + 1);
00096         
00097         // Read the number of bins.
00098         if(ifread(indexFile, &(ref->n_bin), 4) != 4)
00099         {
00100             // Failed to read the number of bins.
00101             // Return failure.
00102             ifclose(indexFile);
00103             return(SamStatus::FAIL_PARSE);
00104         }
00105 
00106         // If there are no bins, then there are no
00107         // mapped/unmapped reads.
00108         if(ref->n_bin == 0)
00109         {
00110             ref->n_mapped = 0;
00111             ref->n_unmapped = 0;
00112         }
00113 
00114         // Read each bin.
00115         for(int binIndex = 0; binIndex < ref->n_bin; binIndex++)
00116         {
00117             uint32_t binNumber;
00118 
00119             // Read in the bin number.
00120             if(ifread(indexFile, &(binNumber), 4) != 4)
00121             {
00122                 // Failed to read the bin number.
00123                 // Return failure.
00124                 ifclose(indexFile);
00125                 return(SamStatus::FAIL_IO);
00126             }
00127 
00128             // Add the bin to the reference and get the
00129             // pointer back so the values can be set in it.
00130             Bin* binPtr = &(ref->bins[binNumber]);
00131             binPtr->bin = binNumber;
00132          
00133             // Read in the number of chunks.
00134             if(ifread(indexFile, &(binPtr->n_chunk), 4) != 4)
00135             {
00136                 // Failed to read number of chunks.
00137                 // Return failure.
00138                 ifclose(indexFile);
00139                 return(SamStatus::FAIL_IO);
00140             }
00141 
00142             // Read in the chunks.
00143             // Allocate space for the chunks.
00144             uint32_t sizeOfChunkList = binPtr->n_chunk * sizeof(Chunk);
00145             binPtr->chunks = (Chunk*)malloc(sizeOfChunkList);
00146             if(ifread(indexFile, binPtr->chunks, sizeOfChunkList) != sizeOfChunkList)
00147             {
00148                 // Failed to read the chunks.
00149                 // Return failure.
00150                 ifclose(indexFile);
00151                 return(SamStatus::FAIL_IO);
00152             }
00153 
00154             // Determine the min/max for this bin if it is not the max bin.
00155             if(binPtr->bin != MAX_NUM_BINS)
00156             {
00157                 for(int i = 0; i < binPtr->n_chunk; i++)
00158                 {
00159                     if(binPtr->chunks[i].chunk_beg < ref->minChunkOffset)
00160                     {
00161                         ref->minChunkOffset = binPtr->chunks[i].chunk_beg;
00162                     }
00163                     if(binPtr->chunks[i].chunk_end > ref->maxChunkOffset)
00164                     {
00165                         ref->maxChunkOffset = binPtr->chunks[i].chunk_end;
00166                     }
00167                     if(binPtr->chunks[i].chunk_end > maxOverallOffset)
00168                     {
00169                         maxOverallOffset = binPtr->chunks[i].chunk_end;
00170                     }
00171                 }
00172             }
00173             else
00174             {
00175                 // Mapped/unmapped are the last chunk of the
00176                 // MAX BIN
00177                 ref->n_mapped = binPtr->chunks[binPtr->n_chunk - 1].chunk_beg;
00178                 ref->n_unmapped = binPtr->chunks[binPtr->n_chunk - 1].chunk_end;
00179             }
00180         }
00181 
00182         // Read the number of intervals.
00183         if(ifread(indexFile, &(ref->n_intv), 4) != 4)
00184         {
00185             // Failed to read, set to 0.
00186             ref->n_intv = 0;
00187             // Return failure.
00188             ifclose(indexFile);
00189             return(SamStatus::FAIL_IO);
00190         }
00191 
00192         // Allocate space for the intervals and read them.
00193         uint32_t linearIndexSize = ref->n_intv * sizeof(uint64_t);
00194         ref->ioffsets = (uint64_t*)malloc(linearIndexSize);
00195         if(ifread(indexFile, ref->ioffsets, linearIndexSize) != linearIndexSize)
00196         {
00197             // Failed to read the linear index.
00198             // Return failure.
00199             ifclose(indexFile);
00200             return(SamStatus::FAIL_IO);
00201         }
00202     }
00203 
00204     int32_t numUnmapped = 0;
00205     if(ifread(indexFile, &numUnmapped, sizeof(int32_t)) == sizeof(int32_t))
00206     {
00207         myUnMappedNumReads = numUnmapped;
00208     }
00209 
00210     // Successfully read the bam index file.
00211     ifclose(indexFile);
00212     return(SamStatus::SUCCESS);
00213 }
00214 
00215 
00216 // Get the chunks for the specified reference id and start/end 0-based
00217 // coordinates.
00218 bool BamIndex::getChunksForRegion(int32_t refID, int32_t start, int32_t end, 
00219                                   SortedChunkList& chunkList)
00220 {
00221     chunkList.clear();
00222 
00223     // If start is >= to end, there will be no sections, return no
00224     // regions.
00225     if((start >= end) && (end != -1))
00226     {
00227         std::cerr << "Warning, requesting region where start <= end, so "
00228                   << "no values will be returned.\n";
00229         return(false);
00230     }
00231 
00232     // Handle REF_ID_UNMAPPED.  This uses a default chunk which covers
00233     // from the max offset to the end of the file.
00234     if(refID == REF_ID_UNMAPPED)
00235     {
00236         Chunk refChunk;
00237         // The start of the unmapped region is the max offset found
00238         // in the index file.
00239         refChunk.chunk_beg = getMaxOffset();
00240         // The end of the unmapped region is the end of the file, so
00241         // set chunk end to the max value.
00242         refChunk.chunk_end = Chunk::MAX_CHUNK_VALUE;
00243         return(chunkList.insert(refChunk));
00244     }
00245 
00246     if((refID < 0) || (refID >= n_ref))
00247     {
00248         // The specified refID is out of range, return false.
00249         std::cerr << "Warning, requesting refID is out of range, so "
00250                   << "no values will be returned.\n";
00251         return(false);
00252     }
00253 
00254     const Reference* ref = &(myRefs[refID]);
00255 
00256     // Handle where start/end are defaults.    
00257     if(start == -1)
00258     {
00259         if(end == -1)
00260         {
00261             // This is whole chromosome, so take a shortcut.
00262             if(ref->maxChunkOffset == 0)
00263             {
00264                 // No chunks for this region, but this is not an error.
00265                 return(true);
00266             }
00267             Chunk refChunk;
00268             refChunk.chunk_beg = ref->minChunkOffset;
00269             refChunk.chunk_end = ref->maxChunkOffset;
00270             return(chunkList.insert(refChunk));
00271         }
00272         else
00273         {
00274             start = 0;
00275         }
00276     }
00277     if(end == -1)
00278     {
00279         // MAX_POSITION is inclusive, but end is exclusive, so add 1.
00280         end = MAX_POSITION + 1;
00281     }
00282 
00283     // Determine the minimum offset for the given start position.  This
00284     // is done by using the linear index for the specified start position.
00285     uint64_t minOffset = 0;
00286     getMinOffsetFromLinearIndex(refID, start, minOffset);
00287 
00288     uint16_t binInRangeList[MAX_NUM_BINS + 1];
00289     
00290     int numBins = getBinsForRegion(start, end, binInRangeList);
00291 
00292     // loop through the bins in the range and get the chunks.
00293     for(int i = 0; i < numBins; ++i)
00294     {
00295         int binNum = binInRangeList[i];
00296         const Bin* bin = &(ref->bins[binNum]);
00297 
00298         // Add each chunk in the bin to the map.
00299         for(int chunkIndex = 0; chunkIndex < bin->n_chunk; chunkIndex++)
00300         {
00301             // If the end of the chunk is less than the minimum offset
00302             // for the 16K block that starts our region, then no
00303             // records in this chunk will cross our region, so do
00304             // not add it to the chunks we need to use.
00305             if(bin->chunks[chunkIndex].chunk_end < minOffset)
00306             {
00307                 continue;
00308             }
00309             // Add the chunk to the map.
00310             if(!chunkList.insert(bin->chunks[chunkIndex]))
00311             {
00312                 // Failed to add to the map, return false.
00313                 std::cerr << "Warning, Failed to add a chunk, so "
00314                           << "no values will be returned.\n";
00315                 return(false);
00316             }
00317         }
00318     }
00319 
00320     // Now that all chunks have been added to the list,
00321     // handle overlapping chunks.
00322     return(chunkList.mergeOverlapping());
00323 }
00324 
00325 
00326 // Get the max offset.
00327 uint64_t BamIndex::getMaxOffset() const
00328 {
00329     return(maxOverallOffset);
00330 }
00331 
00332 // Get the min & max file offsets for the reference ID.
00333 bool BamIndex::getReferenceMinMax(int32_t refID,
00334                                   uint64_t& minOffset,
00335                                   uint64_t& maxOffset) const
00336 {
00337     if((refID < 0) || (refID >= (int32_t)myRefs.size()))
00338     {
00339         // Reference ID is out of range for this index file.
00340         return(false);
00341     }
00342 
00343     // Get this reference.
00344     minOffset = myRefs[refID].minChunkOffset;
00345     maxOffset = myRefs[refID].maxChunkOffset;
00346     return(true);
00347 }
00348 
00349 
00350 // Get the number of mapped reads for this reference id.
00351 int32_t BamIndex::getNumMappedReads(int32_t refID)
00352 {
00353     // If it is the reference id of unmapped reads, return
00354     // that there are no mapped reads.
00355     if(refID == REF_ID_UNMAPPED)
00356    {
00357        // These are by definition all unmapped reads.
00358        return(0);
00359    }
00360 
00361     if((refID < 0) || (refID >= (int32_t)myRefs.size()))
00362     {
00363         // Reference ID is out of range for this index file.
00364         return(-1);
00365     }
00366 
00367     // Get this reference.
00368     return(myRefs[refID].n_mapped);
00369 }
00370 
00371 
00372 // Get the number of unmapped reads for this reference id.
00373 int32_t BamIndex::getNumUnMappedReads(int32_t refID)
00374 {
00375     // If it is the reference id of unmapped reads, return
00376     // that value.
00377     if(refID == REF_ID_UNMAPPED)
00378     {
00379         return(myUnMappedNumReads);
00380     }
00381 
00382     if((refID < 0) || (refID >= (int32_t)myRefs.size()))
00383     {
00384         // Reference ID is out of range for this index file.
00385         return(-1);
00386     }
00387 
00388     // Get this reference.
00389     return(myRefs[refID].n_unmapped);
00390 }
00391 
00392 
00393 // Print the bam index.
00394 void BamIndex::printIndex(int32_t refID, bool summary)
00395 {
00396     std::cout << "BAM Index: " << std::endl;
00397     std::cout << "# Reference Sequences: " << n_ref << std::endl;
00398 
00399     unsigned int startRef = 0;
00400     unsigned int endRef = myRefs.size() - 1;
00401     std::vector<Reference> refsToProcess;
00402     if(refID != -1)
00403     {
00404         // Set start and end ref to the specified reference id.
00405         startRef = refID;
00406         endRef = refID;
00407     }
00408 
00409     // Print out the information for each bin.
00410     for(unsigned int i = startRef; i <= endRef; ++i)
00411     {
00412         std::cout << std::dec 
00413                   << "\tReference ID: " << std::setw(4) << i
00414                   << ";  #Bins: "<< std::setw(6) << myRefs[i].n_bin 
00415                   << ";  #Linear Index Entries: " 
00416                   << std::setw(6) << myRefs[i].n_intv
00417                   << ";  Min Chunk Offset: " 
00418                   << std::setw(18) << std::hex << std::showbase << myRefs[i].minChunkOffset
00419                   << ";  Max Chunk Offset: "
00420                   << std::setw(18) << myRefs[i].maxChunkOffset
00421                   << std::dec;
00422         // Print the mapped/unmapped if set.
00423         if(myRefs[i].n_mapped != Reference::UNKNOWN_MAP_INFO)
00424         {            
00425             std::cout << ";  " << myRefs[i].n_mapped << " Mapped Reads";
00426         }
00427         if(myRefs[i].n_mapped != Reference::UNKNOWN_MAP_INFO)
00428         {            
00429             std::cout << ";  " << myRefs[i].n_unmapped << " Unmapped Reads";
00430         }
00431         std::cout << std::endl;
00432         
00433         // Only print more details if not summary.
00434         if(!summary)
00435         {
00436             std::vector<Bin>::iterator binIter;
00437             for(binIter = myRefs[i].bins.begin(); 
00438                 binIter != myRefs[i].bins.end();
00439                 ++binIter)
00440             {
00441                 Bin* binPtr = &(*binIter);
00442                 if(binPtr->bin == Bin::NOT_USED_BIN)
00443                 {
00444                     // This bin is not used, continue.
00445                     continue;
00446                 }
00447                 // Print the bin info.
00448                 std::cout << "\t\t\tBin Name: " << binPtr->bin << std::endl;
00449                 std::cout << "\t\t\t# Chunks: " << binPtr->n_chunk << std::endl;
00450                 std::cout << std::hex << std::showbase;
00451 
00452                 for(int chunkIndex = 0; chunkIndex < binPtr->n_chunk;
00453                     ++chunkIndex)
00454                 {
00455                     // If this is the last chunk of the MAX_NUM_BINS - it
00456                     // contains a mapped/unmapped count rather than the regular
00457                     // chunk addresses.
00458                     if((binPtr->bin != MAX_NUM_BINS) ||
00459                        (chunkIndex != (binPtr->n_chunk - 1)))
00460                     {
00461                         std::cout << "\t\t\t\tchunk_beg: "
00462                                   << binPtr->chunks[chunkIndex].chunk_beg 
00463                                   << std::endl;
00464                         std::cout << "\t\t\t\tchunk_end: "
00465                                   << binPtr->chunks[chunkIndex].chunk_end
00466                                   << std::endl;
00467                     }
00468                 }
00469             }
00470             std::cout << std::dec;
00471             
00472             // Print the linear index.
00473             for(int linearIndex = 0; linearIndex < myRefs[i].n_intv;
00474                 ++linearIndex)
00475             {
00476                 if(myRefs[i].ioffsets[linearIndex] != 0)
00477                 {
00478                     std::cout << "\t\t\tLinearIndex["
00479                               << std::dec << linearIndex << "] Offset: " 
00480                               << std::hex << myRefs[i].ioffsets[linearIndex]
00481                               << std::endl;
00482                 }
00483             }
00484         }
00485     }
00486 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends