BamIndex Class Reference

Collaboration diagram for BamIndex:
Collaboration graph
[legend]

List of all members.

Classes

class  Bin
class  Reference

Public Member Functions

void resetIndex ()
 Reset the member data for a new index file.
SamStatus::Status readIndex (const char *filename)
bool getChunksForRegion (int32_t refID, int32_t start, int32_t end, SortedChunkList &chunkList)
 Get the list of chunks associated with this region.
uint64_t getMaxOffset () const
bool getReferenceMinMax (int32_t refID, uint64_t &minOffset, uint64_t &maxOffset) const
 Get the minimum and maximum file offsets for the specfied reference ID.
int32_t getNumRefs () const
 Get the number of references in this index.
void printIndex (int32_t refID, bool summary=false)
 Print the index information.

Static Public Attributes

static const int32_t REF_ID_UNMAPPED = -1
 The number used for the reference id of unmapped reads.
static const int32_t REF_ID_ALL = -2
 The number used to indicate that all reference ids should be used.

Detailed Description

Definition at line 61 of file BamIndex.h.


Member Function Documentation

bool BamIndex::getChunksForRegion ( int32_t  refID,
int32_t  start,
int32_t  end,
SortedChunkList chunkList 
)

Get the list of chunks associated with this region.

For an entire reference ID, set start and end to -1. To start at the beginning of the region, set start to 0/-1. To go to the end of the region, set end to -1.

Definition at line 289 of file BamIndex.cpp.

References REF_ID_UNMAPPED.

00291 {
00292     chunkList.clear();
00293 
00294     // If start is >= to end, there will be no sections, return no
00295     // regions.
00296     if((start >= end) && (end != -1))
00297     {
00298         std::cerr << "Warning, requesting region where start <= end, so "
00299                   << "no values will be returned.\n";
00300         return(false);
00301     }
00302 
00303     // Handle REF_ID_UNMAPPED.  This uses a default chunk which covers
00304     // from the max offset to the end of the file.
00305     if(refID == REF_ID_UNMAPPED)
00306     {
00307         Chunk refChunk;
00308         // The start of the unmapped region is the max offset found
00309         // in the index file.
00310         refChunk.chunk_beg = getMaxOffset();
00311         // The end of the unmapped region is the end of the file, so
00312         // set chunk end to the max value.
00313         refChunk.chunk_end = Chunk::MAX_CHUNK_VALUE;
00314         return(chunkList.insert(refChunk));
00315     }
00316 
00317     if((refID < 0) || (refID >= n_ref))
00318     {
00319         // The specified refID is out of range, return false.
00320         return(false);
00321     }
00322 
00323     const Reference* ref = &(myRefs[refID]);
00324 
00325     // Handle where start/end are defaults.    
00326     if(start == -1)
00327     {
00328         if(end == -1)
00329         {
00330             // This is whole chromosome, so take a shortcut.
00331             if(ref->maxChunkOffset == 0)
00332             {
00333                 // No chunks for this region, but this is not an error.
00334                 return(true);
00335             }
00336             Chunk refChunk;
00337             refChunk.chunk_beg = ref->minChunkOffset;
00338             refChunk.chunk_end = ref->maxChunkOffset;
00339             return(chunkList.insert(refChunk));
00340         }
00341         else
00342         {
00343             start = 0;
00344         }
00345     }
00346     if(end == -1)
00347     {
00348         // MAX_POSITION is inclusive, but end is exclusive, so add 1.
00349         end = MAX_POSITION + 1;
00350     }
00351 
00352     // Determine the minimum offset for the given start position.  This
00353     // is done by using the linear index for the specified start position.
00354     uint64_t minOffset = getMinOffsetFromLinearIndex(refID, start);
00355 
00356     uint16_t binInRangeList[MAX_NUM_BINS + 1];
00357     
00358     int numBins = getBinsForRegion(start, end, binInRangeList);
00359 
00360     // loop through the bins in the range and get the chunks.
00361     for(int i = 0; i < numBins; ++i)
00362     {
00363         int binNum = binInRangeList[i];
00364         const Bin* bin = &(ref->bins[binNum]);
00365 
00366         // Add each chunk in the bin to the map.
00367         for(int chunkIndex = 0; chunkIndex < bin->n_chunk; chunkIndex++)
00368         {
00369             // If the end of the chunk is less than the minimum offset
00370             // for the 16K block that starts our region, then no
00371             // records in this chunk will cross our region, so do
00372             // not add it to the chunks we need to use.
00373             if(bin->chunks[chunkIndex].chunk_end < minOffset)
00374             {
00375                 continue;
00376             }
00377             // Add the chunk to the map.
00378             if(!chunkList.insert(bin->chunks[chunkIndex]))
00379             {
00380                 // Failed to add to the map, return false.
00381                 return(false);
00382             }
00383         }
00384     }
00385 
00386     // Now that all chunks have been added to the list,
00387     // handle overlapping chunks.
00388     return(chunkList.mergeOverlapping());
00389 }

int32_t BamIndex::getNumRefs (  )  const

Get the number of references in this index.

Returns:
number of references

Definition at line 417 of file BamIndex.cpp.

00418 {
00419     // Return the number of references.
00420     return(myRefs.size());
00421 }

bool BamIndex::getReferenceMinMax ( int32_t  refID,
uint64_t &  minOffset,
uint64_t &  maxOffset 
) const

Get the minimum and maximum file offsets for the specfied reference ID.

Parameters:
refID the reference ID to locate in the file.
minOffset returns the min file offset for the specified reference
maxOffset returns the max file offset for the specified reference
Returns:
whether or not the reference was found in the file

Definition at line 399 of file BamIndex.cpp.

00402 {
00403     if((refID < 0) || (refID >= (int32_t)myRefs.size()))
00404     {
00405         // Reference ID is out of range for this index file.
00406         return(false);
00407     }
00408 
00409     // Get this reference.
00410     minOffset = myRefs[refID].minChunkOffset;
00411     maxOffset = myRefs[refID].maxChunkOffset;
00412     return(true);
00413 }

void BamIndex::printIndex ( int32_t  refID,
bool  summary = false 
)

Print the index information.

Parameters:
refID reference ID for which to print info for. -1 means print for all references.
summary whether or not to just print a summary (defaults to false). The summary just contains summary info for each reference and not every bin/chunk.

Definition at line 425 of file BamIndex.cpp.

00426 {
00427     std::cout << "BAM Index: " << std::endl;
00428     std::cout << "# Reference Sequences: " << n_ref << std::endl;
00429 
00430     unsigned int startRef = 0;
00431     unsigned int endRef = myRefs.size() - 1;
00432     std::vector<Reference> refsToProcess;
00433     if(refID != -1)
00434     {
00435         // Set start and end ref to the specified reference id.
00436         startRef = refID;
00437         endRef = refID;
00438     }
00439 
00440     // Print out the information for each bin.
00441     for(unsigned int i = startRef; i <= endRef; ++i)
00442     {
00443         std::cout << std::dec 
00444                   << "\tReference ID: " << std::setw(4) << i
00445                   << ";  #Bins: "<< std::setw(6) << myRefs[i].n_bin 
00446                   << ";  #Linear Index Entries: " 
00447                   << std::setw(6) << myRefs[i].n_intv
00448                   << ";  Min Chunk Offset: " 
00449                   << std::setw(18) << std::hex << std::showbase << myRefs[i].minChunkOffset
00450                   << ";  Max Chunk Offset: "
00451                   << std::setw(18) << myRefs[i].maxChunkOffset
00452                   << std::dec;
00453         // Print the mapped/unmapped if set.
00454         if(myRefs[i].n_mapped != Reference::UNKNOWN_MAP_INFO)
00455         {            
00456             std::cout << ";  " << myRefs[i].n_mapped << " Mapped Reads";
00457         }
00458         if(myRefs[i].n_mapped != Reference::UNKNOWN_MAP_INFO)
00459         {            
00460             std::cout << ";  " << myRefs[i].n_unmapped << " Unmapped Reads";
00461         }
00462         std::cout << std::endl;
00463         
00464         // Only print more details if not summary.
00465         if(!summary)
00466         {
00467             std::vector<Bin>::iterator binIter;
00468             for(binIter = myRefs[i].bins.begin(); 
00469                 binIter != myRefs[i].bins.end();
00470                 ++binIter)
00471             {
00472                 Bin* binPtr = &(*binIter);
00473                 if(binPtr->bin == Bin::NOT_USED_BIN)
00474                 {
00475                     // This bin is not used, continue.
00476                     continue;
00477                 }
00478                 // Print the bin info.
00479                 std::cout << "\t\t\tBin Name: " << binPtr->bin << std::endl;
00480                 std::cout << "\t\t\t# Chunks: " << binPtr->n_chunk << std::endl;
00481                 std::cout << std::hex << std::showbase;
00482 
00483                 for(int chunkIndex = 0; chunkIndex < binPtr->n_chunk;
00484                     ++chunkIndex)
00485                 {
00486                     // If this is the last chunk of the MAX_NUM_BINS - it
00487                     // contains a mapped/unmapped count rather than the regular
00488                     // chunk addresses.
00489                     if((binPtr->bin != MAX_NUM_BINS) ||
00490                        (chunkIndex != (binPtr->n_chunk - 1)))
00491                     {
00492                         std::cout << "\t\t\t\tchunk_beg: "
00493                                   << binPtr->chunks[chunkIndex].chunk_beg 
00494                                   << std::endl;
00495                         std::cout << "\t\t\t\tchunk_end: "
00496                                   << binPtr->chunks[chunkIndex].chunk_end
00497                                   << std::endl;
00498                     }
00499                 }
00500             }
00501             std::cout << std::dec;
00502             
00503             // Print the linear index.
00504             for(int linearIndex = 0; linearIndex < myRefs[i].n_intv;
00505                 ++linearIndex)
00506             {
00507                 if(myRefs[i].ioffsets[linearIndex] != 0)
00508                 {
00509                     std::cout << "\t\t\tLinearIndex["
00510                               << std::dec << linearIndex << "] Offset: " 
00511                               << std::hex << myRefs[i].ioffsets[linearIndex]
00512                               << std::endl;
00513                 }
00514             }
00515         }
00516     }
00517 }

SamStatus::Status BamIndex::readIndex ( const char *  filename  ) 
Parameters:
filename the bam index file to be read.
Returns:
the status of the read.

Definition at line 140 of file BamIndex.cpp.

References resetIndex().

Referenced by SamFile::ReadBamIndex().

00141 {
00142     // Reset the index from anything that may previously be set.
00143     resetIndex();
00144 
00145     IFILE indexFile = ifopen(filename, "rb");
00146 
00147     // Failed to opren the index file.
00148     if(indexFile == NULL)
00149     {
00150         return(SamStatus::FAIL_IO);
00151     }
00152 
00153     // generate the bam index structure.
00154 
00155     // Read the magic string.
00156     char magic[4];
00157     if(ifread(indexFile, magic, 4) != 4)
00158     {
00159         // Failed to read the magic
00160         return(SamStatus::FAIL_IO);
00161     }
00162 
00163     // If this is not an index file, set num references to 0. 
00164     if (magic[0] != 'B' || magic[1] != 'A' || magic[2] != 'I' || magic[3] != 1)
00165     {
00166         // Not a BAM Index file.
00167         return(SamStatus::FAIL_PARSE);
00168     }
00169 
00170     // It is a bam index file.
00171     // Read the number of reference sequences.
00172     if(ifread(indexFile, &n_ref, 4) != 4)
00173     {
00174         // Failed to read.
00175         return(SamStatus::FAIL_IO);
00176     }
00177 
00178     // Size the references.
00179     myRefs.resize(n_ref);
00180 
00181     for(int refIndex = 0; refIndex < n_ref; refIndex++)
00182     {
00183         // Read each reference.
00184         Reference* ref = &(myRefs[refIndex]);
00185         
00186         // Resize the bins so they can be indexed by bin number.
00187         ref->bins.resize(MAX_NUM_BINS + 1);
00188         
00189         // Read the number of bins.
00190         if(ifread(indexFile, &(ref->n_bin), 4) != 4)
00191         {
00192             // Failed to read the number of bins.
00193             // Return failure.
00194             return(SamStatus::FAIL_PARSE);
00195         }
00196 
00197         // Read each bin.
00198         for(int binIndex = 0; binIndex < ref->n_bin; binIndex++)
00199         {
00200             uint32_t binNumber;
00201 
00202             // Read in the bin number.
00203             if(ifread(indexFile, &(binNumber), 4) != 4)
00204             {
00205                 // Failed to read the bin number.
00206                 // Return failure.
00207                 return(SamStatus::FAIL_IO);
00208             }
00209 
00210             // Add the bin to the reference and get the
00211             // pointer back so the values can be set in it.
00212             Bin* binPtr = &(ref->bins[binNumber]);
00213             binPtr->bin = binNumber;
00214          
00215             // Read in the number of chunks.
00216             if(ifread(indexFile, &(binPtr->n_chunk), 4) != 4)
00217             {
00218                 // Failed to read number of chunks.
00219                 // Return failure.
00220                 return(SamStatus::FAIL_IO);
00221             }
00222 
00223             // Read in the chunks.
00224             // Allocate space for the chunks.
00225             uint32_t sizeOfChunkList = binPtr->n_chunk * sizeof(Chunk);
00226             binPtr->chunks = (Chunk*)malloc(sizeOfChunkList);
00227             if(ifread(indexFile, binPtr->chunks, sizeOfChunkList) != sizeOfChunkList)
00228             {
00229                 // Failed to read the chunks.
00230                 // Return failure.
00231                 return(SamStatus::FAIL_IO);
00232             }
00233 
00234             // Determine the min/max for this bin if it is not the max bin.
00235             if(binPtr->bin != MAX_NUM_BINS)
00236             {
00237                 for(int i = 0; i < binPtr->n_chunk; i++)
00238                 {
00239                     if(binPtr->chunks[i].chunk_beg < ref->minChunkOffset)
00240                     {
00241                         ref->minChunkOffset = binPtr->chunks[i].chunk_beg;
00242                     }
00243                     if(binPtr->chunks[i].chunk_end > ref->maxChunkOffset)
00244                     {
00245                         ref->maxChunkOffset = binPtr->chunks[i].chunk_end;
00246                     }
00247                     if(binPtr->chunks[i].chunk_end > maxOverallOffset)
00248                     {
00249                         maxOverallOffset = binPtr->chunks[i].chunk_end;
00250                     }
00251                 }
00252             }
00253             else
00254             {
00255                 // Mapped/unmapped are the last chunk of the
00256                 // MAX BIN
00257                 ref->n_mapped = binPtr->chunks[binPtr->n_chunk - 1].chunk_beg;
00258                 ref->n_unmapped = binPtr->chunks[binPtr->n_chunk - 1].chunk_end;
00259             }
00260         }
00261 
00262         // Read the number of intervals.
00263         if(ifread(indexFile, &(ref->n_intv), 4) != 4)
00264         {
00265             // Failed to read, set to 0.
00266             ref->n_intv = 0;
00267             // Return failure.
00268             return(SamStatus::FAIL_IO);
00269         }
00270 
00271         // Allocate space for the intervals and read them.
00272         uint32_t linearIndexSize = ref->n_intv * sizeof(uint64_t);
00273         ref->ioffsets = (uint64_t*)malloc(linearIndexSize);
00274         if(ifread(indexFile, ref->ioffsets, linearIndexSize) != linearIndexSize)
00275         {
00276             // Failed to read the linear index.
00277             // Return failure.
00278             return(SamStatus::FAIL_IO);
00279         }
00280     }
00281 
00282     // Successfully read teh bam index file.
00283     return(SamStatus::SUCCESS);
00284 }


The documentation for this class was generated from the following files:
Generated on Wed Nov 17 15:38:30 2010 for StatGen Software by  doxygen 1.6.3