BamIndex Class Reference

Collaboration diagram for BamIndex:
Collaboration graph
[legend]

List of all members.

Classes

class  Bin
class  Reference

Public Member Functions

void resetIndex ()
 Reset the member data for a new index file.
SamStatus::Status readIndex (const char *filename)
bool getChunksForRegion (int32_t refID, int32_t start, int32_t end, SortedChunkList &chunkList)
 Get the list of chunks associated with this region.
uint64_t getMaxOffset () const
bool getReferenceMinMax (int32_t refID, uint64_t &minOffset, uint64_t &maxOffset) const
 Get the minimum and maximum file offsets for the specfied reference ID.
int32_t getNumRefs () const
 Get the number of references in this index.
int32_t getNumMappedReads (int32_t refID)
 Get the number of mapped reads for this reference id.
int32_t getNumUnMappedReads (int32_t refID)
 Get the number of unmapped reads for this reference id.
void printIndex (int32_t refID, bool summary=false)
 Print the index information.
uint64_t getMinOffsetFromLinearIndex (int32_t refID, uint32_t position) const

Static Public Attributes

static const int32_t UNKNOWN_NUM_READS = -1
 The number used for an unknown number of reads.
static const int32_t REF_ID_UNMAPPED = -1
 The number used for the reference id of unmapped reads.
static const int32_t REF_ID_ALL = -2
 The number used to indicate that all reference ids should be used.

Detailed Description

Definition at line 61 of file BamIndex.h.


Member Function Documentation

bool BamIndex::getChunksForRegion ( int32_t  refID,
int32_t  start,
int32_t  end,
SortedChunkList chunkList 
)

Get the list of chunks associated with this region.

For an entire reference ID, set start and end to -1. To start at the beginning of the region, set start to 0/-1. To go to the end of the region, set end to -1.

Definition at line 305 of file BamIndex.cpp.

References REF_ID_UNMAPPED.

00307 {
00308     chunkList.clear();
00309 
00310     // If start is >= to end, there will be no sections, return no
00311     // regions.
00312     if((start >= end) && (end != -1))
00313     {
00314         std::cerr << "Warning, requesting region where start <= end, so "
00315                   << "no values will be returned.\n";
00316         return(false);
00317     }
00318 
00319     // Handle REF_ID_UNMAPPED.  This uses a default chunk which covers
00320     // from the max offset to the end of the file.
00321     if(refID == REF_ID_UNMAPPED)
00322     {
00323         Chunk refChunk;
00324         // The start of the unmapped region is the max offset found
00325         // in the index file.
00326         refChunk.chunk_beg = getMaxOffset();
00327         // The end of the unmapped region is the end of the file, so
00328         // set chunk end to the max value.
00329         refChunk.chunk_end = Chunk::MAX_CHUNK_VALUE;
00330         return(chunkList.insert(refChunk));
00331     }
00332 
00333     if((refID < 0) || (refID >= n_ref))
00334     {
00335         // The specified refID is out of range, return false.
00336         std::cerr << "Warning, requesting refID is out of range, so "
00337                   << "no values will be returned.\n";
00338         return(false);
00339     }
00340 
00341     const Reference* ref = &(myRefs[refID]);
00342 
00343     // Handle where start/end are defaults.    
00344     if(start == -1)
00345     {
00346         if(end == -1)
00347         {
00348             // This is whole chromosome, so take a shortcut.
00349             if(ref->maxChunkOffset == 0)
00350             {
00351                 // No chunks for this region, but this is not an error.
00352                 return(true);
00353             }
00354             Chunk refChunk;
00355             refChunk.chunk_beg = ref->minChunkOffset;
00356             refChunk.chunk_end = ref->maxChunkOffset;
00357             return(chunkList.insert(refChunk));
00358         }
00359         else
00360         {
00361             start = 0;
00362         }
00363     }
00364     if(end == -1)
00365     {
00366         // MAX_POSITION is inclusive, but end is exclusive, so add 1.
00367         end = MAX_POSITION + 1;
00368     }
00369 
00370     // Determine the minimum offset for the given start position.  This
00371     // is done by using the linear index for the specified start position.
00372     uint64_t minOffset = getMinOffsetFromLinearIndex(refID, start);
00373 
00374     uint16_t binInRangeList[MAX_NUM_BINS + 1];
00375     
00376     int numBins = getBinsForRegion(start, end, binInRangeList);
00377 
00378     // loop through the bins in the range and get the chunks.
00379     for(int i = 0; i < numBins; ++i)
00380     {
00381         int binNum = binInRangeList[i];
00382         const Bin* bin = &(ref->bins[binNum]);
00383 
00384         // Add each chunk in the bin to the map.
00385         for(int chunkIndex = 0; chunkIndex < bin->n_chunk; chunkIndex++)
00386         {
00387             // If the end of the chunk is less than the minimum offset
00388             // for the 16K block that starts our region, then no
00389             // records in this chunk will cross our region, so do
00390             // not add it to the chunks we need to use.
00391             if(bin->chunks[chunkIndex].chunk_end < minOffset)
00392             {
00393                 continue;
00394             }
00395             // Add the chunk to the map.
00396             if(!chunkList.insert(bin->chunks[chunkIndex]))
00397             {
00398                 // Failed to add to the map, return false.
00399                 std::cerr << "Warning, Failed to add a chunk, so "
00400                           << "no values will be returned.\n";
00401                 return(false);
00402             }
00403         }
00404     }
00405 
00406     // Now that all chunks have been added to the list,
00407     // handle overlapping chunks.
00408     return(chunkList.mergeOverlapping());
00409 }

int32_t BamIndex::getNumMappedReads ( int32_t  refID  ) 

Get the number of mapped reads for this reference id.

Returns -1 for out of range refIDs.

Parameters:
refID reference ID for which to extract the number of mapped reads.
Returns:
number of mapped reads for the specified reference id.

Definition at line 445 of file BamIndex.cpp.

References REF_ID_UNMAPPED.

Referenced by SamFile::getNumMappedReadsFromIndex().

00446 {
00447     // If it is the reference id of unmapped reads, return
00448     // that there are no mapped reads.
00449     if(refID == REF_ID_UNMAPPED)
00450    {
00451        // These are by definition all unmapped reads.
00452        return(0);
00453    }
00454 
00455     if((refID < 0) || (refID >= (int32_t)myRefs.size()))
00456     {
00457         // Reference ID is out of range for this index file.
00458         return(-1);
00459     }
00460 
00461     // Get this reference.
00462     return(myRefs[refID].n_mapped);
00463 }

int32_t BamIndex::getNumRefs (  )  const

Get the number of references in this index.

Returns:
number of references

Definition at line 437 of file BamIndex.cpp.

00438 {
00439     // Return the number of references.
00440     return(myRefs.size());
00441 }

int32_t BamIndex::getNumUnMappedReads ( int32_t  refID  ) 

Get the number of unmapped reads for this reference id.

Returns -1 for out of range refIDs.

Parameters:
refID reference ID for which to extract the number of unmapped reads.
Returns:
number of unmapped reads for the specified reference id

Definition at line 467 of file BamIndex.cpp.

References REF_ID_UNMAPPED.

Referenced by SamFile::getNumUnMappedReadsFromIndex().

00468 {
00469     // If it is the reference id of unmapped reads, return
00470     // that value.
00471     if(refID == REF_ID_UNMAPPED)
00472     {
00473         return(myUnMappedNumReads);
00474     }
00475 
00476     if((refID < 0) || (refID >= (int32_t)myRefs.size()))
00477     {
00478         // Reference ID is out of range for this index file.
00479         return(-1);
00480     }
00481 
00482     // Get this reference.
00483     return(myRefs[refID].n_unmapped);
00484 }

bool BamIndex::getReferenceMinMax ( int32_t  refID,
uint64_t &  minOffset,
uint64_t &  maxOffset 
) const

Get the minimum and maximum file offsets for the specfied reference ID.

Parameters:
refID the reference ID to locate in the file.
minOffset returns the min file offset for the specified reference
maxOffset returns the max file offset for the specified reference
Returns:
whether or not the reference was found in the file

Definition at line 419 of file BamIndex.cpp.

00422 {
00423     if((refID < 0) || (refID >= (int32_t)myRefs.size()))
00424     {
00425         // Reference ID is out of range for this index file.
00426         return(false);
00427     }
00428 
00429     // Get this reference.
00430     minOffset = myRefs[refID].minChunkOffset;
00431     maxOffset = myRefs[refID].maxChunkOffset;
00432     return(true);
00433 }

void BamIndex::printIndex ( int32_t  refID,
bool  summary = false 
)

Print the index information.

Parameters:
refID reference ID for which to print info for. -1 means print for all references.
summary whether or not to just print a summary (defaults to false). The summary just contains summary info for each reference and not every bin/chunk.

Definition at line 488 of file BamIndex.cpp.

00489 {
00490     std::cout << "BAM Index: " << std::endl;
00491     std::cout << "# Reference Sequences: " << n_ref << std::endl;
00492 
00493     unsigned int startRef = 0;
00494     unsigned int endRef = myRefs.size() - 1;
00495     std::vector<Reference> refsToProcess;
00496     if(refID != -1)
00497     {
00498         // Set start and end ref to the specified reference id.
00499         startRef = refID;
00500         endRef = refID;
00501     }
00502 
00503     // Print out the information for each bin.
00504     for(unsigned int i = startRef; i <= endRef; ++i)
00505     {
00506         std::cout << std::dec 
00507                   << "\tReference ID: " << std::setw(4) << i
00508                   << ";  #Bins: "<< std::setw(6) << myRefs[i].n_bin 
00509                   << ";  #Linear Index Entries: " 
00510                   << std::setw(6) << myRefs[i].n_intv
00511                   << ";  Min Chunk Offset: " 
00512                   << std::setw(18) << std::hex << std::showbase << myRefs[i].minChunkOffset
00513                   << ";  Max Chunk Offset: "
00514                   << std::setw(18) << myRefs[i].maxChunkOffset
00515                   << std::dec;
00516         // Print the mapped/unmapped if set.
00517         if(myRefs[i].n_mapped != Reference::UNKNOWN_MAP_INFO)
00518         {            
00519             std::cout << ";  " << myRefs[i].n_mapped << " Mapped Reads";
00520         }
00521         if(myRefs[i].n_mapped != Reference::UNKNOWN_MAP_INFO)
00522         {            
00523             std::cout << ";  " << myRefs[i].n_unmapped << " Unmapped Reads";
00524         }
00525         std::cout << std::endl;
00526         
00527         // Only print more details if not summary.
00528         if(!summary)
00529         {
00530             std::vector<Bin>::iterator binIter;
00531             for(binIter = myRefs[i].bins.begin(); 
00532                 binIter != myRefs[i].bins.end();
00533                 ++binIter)
00534             {
00535                 Bin* binPtr = &(*binIter);
00536                 if(binPtr->bin == Bin::NOT_USED_BIN)
00537                 {
00538                     // This bin is not used, continue.
00539                     continue;
00540                 }
00541                 // Print the bin info.
00542                 std::cout << "\t\t\tBin Name: " << binPtr->bin << std::endl;
00543                 std::cout << "\t\t\t# Chunks: " << binPtr->n_chunk << std::endl;
00544                 std::cout << std::hex << std::showbase;
00545 
00546                 for(int chunkIndex = 0; chunkIndex < binPtr->n_chunk;
00547                     ++chunkIndex)
00548                 {
00549                     // If this is the last chunk of the MAX_NUM_BINS - it
00550                     // contains a mapped/unmapped count rather than the regular
00551                     // chunk addresses.
00552                     if((binPtr->bin != MAX_NUM_BINS) ||
00553                        (chunkIndex != (binPtr->n_chunk - 1)))
00554                     {
00555                         std::cout << "\t\t\t\tchunk_beg: "
00556                                   << binPtr->chunks[chunkIndex].chunk_beg 
00557                                   << std::endl;
00558                         std::cout << "\t\t\t\tchunk_end: "
00559                                   << binPtr->chunks[chunkIndex].chunk_end
00560                                   << std::endl;
00561                     }
00562                 }
00563             }
00564             std::cout << std::dec;
00565             
00566             // Print the linear index.
00567             for(int linearIndex = 0; linearIndex < myRefs[i].n_intv;
00568                 ++linearIndex)
00569             {
00570                 if(myRefs[i].ioffsets[linearIndex] != 0)
00571                 {
00572                     std::cout << "\t\t\tLinearIndex["
00573                               << std::dec << linearIndex << "] Offset: " 
00574                               << std::hex << myRefs[i].ioffsets[linearIndex]
00575                               << std::endl;
00576                 }
00577             }
00578         }
00579     }
00580 }

SamStatus::Status BamIndex::readIndex ( const char *  filename  ) 
Parameters:
filename the bam index file to be read.
Returns:
the status of the read.

Definition at line 142 of file BamIndex.cpp.

References SamStatus::FAIL_IO, SamStatus::FAIL_PARSE, ifopen(), ifread(), resetIndex(), and SamStatus::SUCCESS.

Referenced by SamFile::ReadBamIndex().

00143 {
00144     // Reset the index from anything that may previously be set.
00145     resetIndex();
00146 
00147     IFILE indexFile = ifopen(filename, "rb");
00148 
00149     // Failed to open the index file.
00150     if(indexFile == NULL)
00151     {
00152         return(SamStatus::FAIL_IO);
00153     }
00154 
00155     // generate the bam index structure.
00156 
00157     // Read the magic string.
00158     char magic[4];
00159     if(ifread(indexFile, magic, 4) != 4)
00160     {
00161         // Failed to read the magic
00162         return(SamStatus::FAIL_IO);
00163     }
00164 
00165     // If this is not an index file, set num references to 0. 
00166     if (magic[0] != 'B' || magic[1] != 'A' || magic[2] != 'I' || magic[3] != 1)
00167     {
00168         // Not a BAM Index file.
00169         return(SamStatus::FAIL_PARSE);
00170     }
00171 
00172     // It is a bam index file.
00173     // Read the number of reference sequences.
00174     if(ifread(indexFile, &n_ref, 4) != 4)
00175     {
00176         // Failed to read.
00177         return(SamStatus::FAIL_IO);
00178     }
00179 
00180     // Size the references.
00181     myRefs.resize(n_ref);
00182 
00183     for(int refIndex = 0; refIndex < n_ref; refIndex++)
00184     {
00185         // Read each reference.
00186         Reference* ref = &(myRefs[refIndex]);
00187         
00188         // Resize the bins so they can be indexed by bin number.
00189         ref->bins.resize(MAX_NUM_BINS + 1);
00190         
00191         // Read the number of bins.
00192         if(ifread(indexFile, &(ref->n_bin), 4) != 4)
00193         {
00194             // Failed to read the number of bins.
00195             // Return failure.
00196             return(SamStatus::FAIL_PARSE);
00197         }
00198 
00199         // If there are no bins, then there are no
00200         // mapped/unmapped reads.
00201         if(ref->n_bin == 0)
00202         {
00203             ref->n_mapped = 0;
00204             ref->n_unmapped = 0;
00205         }
00206 
00207         // Read each bin.
00208         for(int binIndex = 0; binIndex < ref->n_bin; binIndex++)
00209         {
00210             uint32_t binNumber;
00211 
00212             // Read in the bin number.
00213             if(ifread(indexFile, &(binNumber), 4) != 4)
00214             {
00215                 // Failed to read the bin number.
00216                 // Return failure.
00217                 return(SamStatus::FAIL_IO);
00218             }
00219 
00220             // Add the bin to the reference and get the
00221             // pointer back so the values can be set in it.
00222             Bin* binPtr = &(ref->bins[binNumber]);
00223             binPtr->bin = binNumber;
00224          
00225             // Read in the number of chunks.
00226             if(ifread(indexFile, &(binPtr->n_chunk), 4) != 4)
00227             {
00228                 // Failed to read number of chunks.
00229                 // Return failure.
00230                 return(SamStatus::FAIL_IO);
00231             }
00232 
00233             // Read in the chunks.
00234             // Allocate space for the chunks.
00235             uint32_t sizeOfChunkList = binPtr->n_chunk * sizeof(Chunk);
00236             binPtr->chunks = (Chunk*)malloc(sizeOfChunkList);
00237             if(ifread(indexFile, binPtr->chunks, sizeOfChunkList) != sizeOfChunkList)
00238             {
00239                 // Failed to read the chunks.
00240                 // Return failure.
00241                 return(SamStatus::FAIL_IO);
00242             }
00243 
00244             // Determine the min/max for this bin if it is not the max bin.
00245             if(binPtr->bin != MAX_NUM_BINS)
00246             {
00247                 for(int i = 0; i < binPtr->n_chunk; i++)
00248                 {
00249                     if(binPtr->chunks[i].chunk_beg < ref->minChunkOffset)
00250                     {
00251                         ref->minChunkOffset = binPtr->chunks[i].chunk_beg;
00252                     }
00253                     if(binPtr->chunks[i].chunk_end > ref->maxChunkOffset)
00254                     {
00255                         ref->maxChunkOffset = binPtr->chunks[i].chunk_end;
00256                     }
00257                     if(binPtr->chunks[i].chunk_end > maxOverallOffset)
00258                     {
00259                         maxOverallOffset = binPtr->chunks[i].chunk_end;
00260                     }
00261                 }
00262             }
00263             else
00264             {
00265                 // Mapped/unmapped are the last chunk of the
00266                 // MAX BIN
00267                 ref->n_mapped = binPtr->chunks[binPtr->n_chunk - 1].chunk_beg;
00268                 ref->n_unmapped = binPtr->chunks[binPtr->n_chunk - 1].chunk_end;
00269             }
00270         }
00271 
00272         // Read the number of intervals.
00273         if(ifread(indexFile, &(ref->n_intv), 4) != 4)
00274         {
00275             // Failed to read, set to 0.
00276             ref->n_intv = 0;
00277             // Return failure.
00278             return(SamStatus::FAIL_IO);
00279         }
00280 
00281         // Allocate space for the intervals and read them.
00282         uint32_t linearIndexSize = ref->n_intv * sizeof(uint64_t);
00283         ref->ioffsets = (uint64_t*)malloc(linearIndexSize);
00284         if(ifread(indexFile, ref->ioffsets, linearIndexSize) != linearIndexSize)
00285         {
00286             // Failed to read the linear index.
00287             // Return failure.
00288             return(SamStatus::FAIL_IO);
00289         }
00290     }
00291 
00292     int32_t numUnmapped = 0;
00293     if(ifread(indexFile, &numUnmapped, sizeof(int32_t)) == sizeof(int32_t))
00294     {
00295         myUnMappedNumReads = numUnmapped;
00296     }
00297 
00298     // Successfully read teh bam index file.
00299     return(SamStatus::SUCCESS);
00300 }


The documentation for this class was generated from the following files:
Generated on Tue Sep 6 17:52:01 2011 for libStatGen Software by  doxygen 1.6.3