BamIndex.h

00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #ifndef __BAM_INDEX_H__
00019 #define __BAM_INDEX_H__
00020 
00021 #include <stdint.h>
00022 #include <vector>
00023 #include <map>
00024 #include <stdlib.h>
00025 
00026 #include "InputFile.h"
00027 #include "SamStatus.h"
00028 
00029 class Chunk
00030 {
00031 public:
00032     uint64_t chunk_beg; // offset of the start of the chunk
00033     uint64_t chunk_end; // offset of the end of the chunk
00034     
00035     static const uint64_t MAX_CHUNK_VALUE = 0xFFFFFFFFFFFFFFFFULL;
00036 
00037     bool operator< (const Chunk& otherChunk) const
00038     {
00039         return(this->chunk_beg < otherChunk.chunk_beg);
00040     }
00041 };
00042 
00043 
00044 // This class contains chunks that are sorted by the beginning position.
00045 // This class hides how the chunks are actually stored (map, list ,etc),
00046 // so they can be interchanged.
00047 class SortedChunkList
00048 {
00049 public:
00050     // Returns the first chunk in the list and  removes it.
00051     Chunk pop();
00052     bool insert(const Chunk& chunkToInsert);
00053     void clear();
00054     bool empty();
00055     bool mergeOverlapping();
00056 
00057 private:
00058     std::map<uint64_t, Chunk> chunkList;
00059 };
00060 
00061 class BamIndex
00062 {
00063 public:
00064 
00065     BamIndex();
00066     ~BamIndex();
00067 
00068     /// Reset the member data for a new index file.
00069     void resetIndex();
00070 
00071     // Read & parse the specified index file.
00072     /// \param filename the bam index file to be read.
00073     /// \return the status of the read.
00074     SamStatus::Status readIndex(const char* filename);
00075 
00076     /// Get the list of chunks associated with this region.
00077     /// For an entire reference ID, set start and end to -1.
00078     /// To start at the beginning of the region, set start to 0/-1.
00079     /// To go to the end of the region, set end to -1.
00080     bool getChunksForRegion(int32_t refID, int32_t start, int32_t end, 
00081                             SortedChunkList& chunkList);
00082 
00083     uint64_t getMaxOffset() const;
00084 
00085     /// Get the minimum and maximum file offsets for the specfied reference ID.
00086     /// \param refID the reference ID to locate in the file.
00087     /// \param minOffset returns the min file offset for the specified reference
00088     /// \param maxOffset returns the max file offset for the specified reference
00089     /// \return whether or not the reference was found in the file
00090     bool getReferenceMinMax(int32_t refID, 
00091                             uint64_t& minOffset, 
00092                             uint64_t& maxOffset) const;
00093     
00094     /// Get the number of references in this index.
00095     /// \return number of references
00096     int32_t getNumRefs() const;
00097 
00098     /// Get the number of mapped reads for this reference id.  Returns -1 for
00099     /// out of range refIDs.
00100     /// \param refID reference ID for which to extract the number of mapped reads.
00101     /// \return number of mapped reads for the specified reference id.
00102     int32_t getNumMappedReads(int32_t refID);
00103 
00104     /// Get the number of unmapped reads for this reference id.  Returns -1 for
00105     /// out of range refIDs.
00106     /// \param refID reference ID for which to extract the number of unmapped reads.
00107     /// \return number of unmapped reads for the specified reference id
00108     int32_t getNumUnMappedReads(int32_t refID);
00109 
00110     /// Print the index information.
00111     /// \param refID reference ID for which to print info for.  -1 means print for all references.
00112     /// \param summary whether or not to just print a summary (defaults to false).  The summary just contains summary info for each reference and not every bin/chunk.
00113     void printIndex(int32_t refID, bool summary = false);
00114 
00115     // Returns the minimum offset of records that cross the 16K block that
00116     // contains the specified position for the given reference id.
00117     uint64_t getMinOffsetFromLinearIndex(int32_t refID, uint32_t position) const;
00118 
00119     // Number of reference sequences.
00120     /// The number used for an unknown number of reads.
00121     static const int32_t UNKNOWN_NUM_READS = -1;
00122 
00123     /// The number used for the reference id of unmapped reads.
00124     static const int32_t REF_ID_UNMAPPED = -1;
00125 
00126     /// The number used to indicate that all reference ids should be used.
00127     static const int32_t REF_ID_ALL = -2;
00128 
00129 private:
00130 
00131     const static uint32_t MAX_NUM_BINS = 37450; // per specs, at most 37450 bins.
00132     // Maximum allowed position (inclusive 512MB - 1)
00133     const static uint32_t MAX_POSITION = 536870911;
00134 
00135     // Number of bits in 1 linear index - how much to shift a position by
00136     // to determine which offset into the linear index to look for it.
00137     const static uint32_t LINEAR_INDEX_SHIFT = 14;
00138 
00139     class Bin
00140     {
00141     public:
00142         Bin(){chunks = NULL; reset();}
00143         ~Bin() {reset();}
00144         void reset()
00145         {
00146             if(chunks != NULL)
00147             {
00148                 free(chunks);
00149                 chunks = NULL;
00150             }
00151             n_chunk = 0; 
00152             bin = NOT_USED_BIN;
00153         }
00154         uint32_t bin; // The bin id.
00155         int32_t n_chunk; // The number of chunks.
00156         Chunk* chunks; // The chunks for this bin.
00157         static const uint32_t NOT_USED_BIN = 0xFFFFFFFF;
00158     };
00159 
00160     class Reference
00161     {
00162         // Add one to the max since there may now be an extra bin containing
00163         // the mapped/unmapped counts.
00164     public:
00165         static const int32_t UNKNOWN_MAP_INFO = -1;
00166         Reference(){ioffsets = NULL; reset();}
00167         ~Reference(){reset();}
00168         void reset()
00169         { 
00170             bins.clear(); 
00171             if(ioffsets != NULL)
00172             {
00173                 free(ioffsets);
00174                 ioffsets = NULL;
00175             }
00176             n_bin = 0; 
00177             n_intv = 0;
00178             minChunkOffset = -1;
00179             maxChunkOffset = 0;
00180             n_mapped = UNKNOWN_MAP_INFO;
00181             n_unmapped = UNKNOWN_MAP_INFO;
00182         }
00183         int32_t n_bin; // The number of bins.
00184         int32_t n_intv; // Number of intervals.
00185         std::vector<Bin> bins;  // The bins for this reference.
00186         uint64_t* ioffsets; // Offsets of intervals first alignments
00187         uint64_t minChunkOffset;
00188         uint64_t maxChunkOffset;
00189         int32_t n_mapped; // Number of mapped reads.
00190         int32_t n_unmapped; // Number of unmapped reads.
00191     };
00192 
00193     // Add the bins associated with the specified region to the passed in list.
00194     // start is incluive, end is exclusive.
00195     static int getBinsForRegion(uint32_t start, uint32_t end, uint16_t binList[MAX_NUM_BINS]);
00196 
00197     // Number of reference sequences.
00198     int32_t n_ref;
00199 
00200     uint64_t maxOverallOffset;
00201 
00202     int32_t myUnMappedNumReads;
00203 
00204     // The references.
00205     std::vector<Reference> myRefs;
00206 };
00207 
00208 
00209 #endif
Generated on Tue Sep 6 17:51:59 2011 for libStatGen Software by  doxygen 1.6.3