BamIndex.h

00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #ifndef __BAM_INDEX_H__
00019 #define __BAM_INDEX_H__
00020 
00021 #include <stdint.h>
00022 #include <vector>
00023 #include <map>
00024 #include <stdlib.h>
00025 
00026 #include "InputFile.h"
00027 #include "SamStatus.h"
00028 
00029 class Chunk
00030 {
00031 public:
00032     uint64_t chunk_beg; // offset of the start of the chunk
00033     uint64_t chunk_end; // offset of the end of the chunk
00034     
00035     static const uint64_t MAX_CHUNK_VALUE = 0xFFFFFFFFFFFFFFFFULL;
00036 
00037     bool operator< (const Chunk& otherChunk) const
00038     {
00039         return(this->chunk_beg < otherChunk.chunk_beg);
00040     }
00041 };
00042 
00043 
00044 // This class contains chunks that are sorted by the beginning position.
00045 // This class hides how the chunks are actually stored (map, list ,etc),
00046 // so they can be interchanged.
00047 class SortedChunkList
00048 {
00049 public:
00050     // Returns the first chunk in the list and  removes it.
00051     Chunk pop();
00052     bool insert(const Chunk& chunkToInsert);
00053     void clear();
00054     bool empty();
00055     bool mergeOverlapping();
00056 
00057 private:
00058     std::map<uint64_t, Chunk> chunkList;
00059 };
00060 
00061 class BamIndex
00062 {
00063 public:
00064 
00065     BamIndex();
00066     ~BamIndex();
00067 
00068     /// Reset the member data for a new index file.
00069     void resetIndex();
00070 
00071     // Read & parse the specified index file.
00072     /// \param filename the bam index file to be read.
00073     /// \return the status of the read.
00074     SamStatus::Status readIndex(const char* filename);
00075 
00076     /// Get the list of chunks associated with this region.
00077     /// For an entire reference ID, set start and end to -1.
00078     /// To start at the beginning of the region, set start to 0/-1.
00079     /// To go to the end of the region, set end to -1.
00080     bool getChunksForRegion(int32_t refID, int32_t start, int32_t end, 
00081                             SortedChunkList& chunkList);
00082 
00083     uint64_t getMaxOffset() const;
00084 
00085     /// Get the minimum and maximum file offsets for the specfied reference ID.
00086     /// \param refID the reference ID to locate in the file.
00087     /// \param minOffset returns the min file offset for the specified reference
00088     /// \param maxOffset returns the max file offset for the specified reference
00089     /// \return whether or not the reference was found in the file
00090     bool getReferenceMinMax(int32_t refID, 
00091                             uint64_t& minOffset, 
00092                             uint64_t& maxOffset) const;
00093     
00094     /// Get the number of references in this index.
00095     /// \return number of references
00096     int32_t getNumRefs() const;
00097 
00098     /// Print the index information.
00099     /// \param refID reference ID for which to print info for.  -1 means print for all references.
00100     /// \param summary whether or not to just print a summary (defaults to false).  The summary just contains summary info for each reference and not every bin/chunk.
00101     void printIndex(int32_t refID, bool summary = false);
00102 
00103     /// The number used for the reference id of unmapped reads.
00104     static const int32_t REF_ID_UNMAPPED = -1;
00105 
00106     /// The number used to indicate that all reference ids should be used.
00107     static const int32_t REF_ID_ALL = -2;
00108 
00109 private:
00110 
00111     const static uint32_t MAX_NUM_BINS = 37450; // per specs, at most 37450 bins.
00112     // Maximum allowed position (inclusive 512MB - 1)
00113     const static uint32_t MAX_POSITION = 536870911;
00114 
00115     // Number of bits in 1 linear index - how much to shift a position by
00116     // to determine which offset into the linear index to look for it.
00117     const static uint32_t LINEAR_INDEX_SHIFT = 14;
00118 
00119     class Bin
00120     {
00121     public:
00122         Bin(){chunks = NULL; reset();}
00123         ~Bin() {reset();}
00124         void reset()
00125         {
00126             if(chunks != NULL)
00127             {
00128                 free(chunks);
00129                 chunks = NULL;
00130             }
00131             n_chunk = 0; 
00132             bin = NOT_USED_BIN;
00133         }
00134         uint32_t bin; // The bin id.
00135         int32_t n_chunk; // The number of chunks.
00136         Chunk* chunks; // The chunks for this bin.
00137         static const uint32_t NOT_USED_BIN = 0xFFFFFFFF;
00138     };
00139 
00140     class Reference
00141     {
00142         // Add one to the max since there may now be an extra bin containing
00143         // the mapped/unmapped counts.
00144     public:
00145         static const int32_t UNKNOWN_MAP_INFO = -1;
00146         Reference(){ioffsets = NULL; reset();}
00147         ~Reference(){reset();}
00148         void reset()
00149         { 
00150             bins.clear(); 
00151             if(ioffsets != NULL)
00152             {
00153                 free(ioffsets);
00154                 ioffsets = NULL;
00155             }
00156             n_bin = 0; 
00157             n_intv = 0;
00158             minChunkOffset = -1;
00159             maxChunkOffset = 0;
00160             n_mapped = UNKNOWN_MAP_INFO;
00161             n_unmapped = UNKNOWN_MAP_INFO;
00162         }
00163         int32_t n_bin; // The number of bins.
00164         int32_t n_intv; // Number of intervals.
00165         std::vector<Bin> bins;  // The bins for this reference.
00166         uint64_t* ioffsets; // Offsets of intervals first alignments
00167         uint64_t minChunkOffset;
00168         uint64_t maxChunkOffset;
00169         int32_t n_mapped; // Number of mapped reads.
00170         int32_t n_unmapped; // Number of unmapped reads.
00171     };
00172 
00173     // Add the bins associated with the specified region to the passed in list.
00174     // start is incluive, end is exclusive.
00175     static int getBinsForRegion(uint32_t start, uint32_t end, uint16_t binList[MAX_NUM_BINS]);
00176 
00177     // Returns the minimum offset of records that cross the 16K block that
00178     // contains the specified position for the given reference id.
00179     uint64_t getMinOffsetFromLinearIndex(int32_t refID, uint32_t position);
00180 
00181     // Number of reference sequences.
00182     int32_t n_ref;
00183 
00184     uint64_t maxOverallOffset;
00185 
00186     // The references.
00187     std::vector<Reference> myRefs;
00188 };
00189 
00190 
00191 #endif
Generated on Wed Nov 17 15:38:27 2010 for StatGen Software by  doxygen 1.6.3