libStatGen Software  1
IndexBase.h
00001 /*
00002  *  Copyright (C) 2011-2012  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #ifndef __INDEX_BASE_H__
00019 #define __INDEX_BASE_H__
00020 
#include <stdint.h>
#include <stdlib.h>

#include <map>
#include <new>
#include <vector>

#include "InputFile.h"
#include "StatGenStatus.h"
00028 
00029 
/// A span of a file described by a pair of 64-bit offsets.
/// Arrays of Chunk are released with free() by IndexBase::Bin, so this type
/// is kept as plain data: two offsets and nothing else.
class Chunk
{
public:
    /// Largest value an offset can hold (all 64 bits set).
    static const uint64_t MAX_CHUNK_VALUE = 0xFFFFFFFFFFFFFFFFULL;

    uint64_t chunk_beg; ///< Offset of the start of the chunk.
    uint64_t chunk_end; ///< Offset of the end of the chunk.

    /// Order chunks by their start offset only; chunk_end is not compared.
    /// \param otherChunk the chunk to compare against.
    /// \return true if this chunk starts before otherChunk.
    bool operator< (const Chunk& otherChunk) const
    {
        const bool startsEarlier = (chunk_beg < otherChunk.chunk_beg);
        return startsEarlier;
    }
};
00043 
00044 
00045 // This class contains chunks that are sorted by the beginning position.
00046 // This class hides how the chunks are actually stored (map, list ,etc),
00047 // so they can be interchanged.
00048 class SortedChunkList
00049 {
00050 public:
00051     // Returns the first chunk in the list and  removes it.
00052     Chunk pop();
00053     bool insert(const Chunk& chunkToInsert);
00054     void clear();
00055     bool empty();
00056     bool mergeOverlapping();
00057 
00058 private:
00059     std::map<uint64_t, Chunk> chunkList;
00060 };
00061 
00062 class IndexBase
00063 {
00064 public:
00065 
00066     IndexBase();
00067     virtual ~IndexBase();
00068 
00069     /// Reset the member data for a new index file.
00070     virtual void resetIndex();
00071 
00072     // Read & parse the specified index file.
00073     /// \param filename the bam index file to be read.
00074     /// \return the status of the read.
00075     virtual StatGenStatus::Status readIndex(const char* filename) = 0;
00076 
00077     /// Get the number of references in this index.
00078     /// \return number of references
00079     int32_t getNumRefs() const;
00080 
00081     // Returns the minimum offset of records that cross the 16K block that
00082     // contains the specified position for the given reference id.
00083     bool getMinOffsetFromLinearIndex(int32_t refID, uint32_t position,
00084                                      uint64_t& minOffset) const;
00085 
00086 protected:
00087     const static uint32_t MAX_NUM_BINS = 37450; // per specs, at most 37450 bins
00088 
00089     // Maximum allowed position (inclusive 512MB - 1)
00090     const static uint32_t MAX_POSITION = 536870911;
00091 
00092     // Number of bits in 1 linear index - how much to shift a position by
00093     // to determine which offset into the linear index to look for it.
00094     const static uint32_t LINEAR_INDEX_SHIFT = 14;
00095 
00096     class Bin
00097     {
00098     public:
00099         Bin(){chunks = NULL; reset();}
00100         ~Bin() {reset();}
00101         void reset()
00102         {
00103             if(chunks != NULL)
00104             {
00105                 free(chunks);
00106                 chunks = NULL;
00107             }
00108             n_chunk = 0; 
00109             bin = NOT_USED_BIN;
00110         }
00111         uint32_t bin; // The bin id.
00112         int32_t n_chunk; // The number of chunks.
00113         Chunk* chunks; // The chunks for this bin.
00114         static const uint32_t NOT_USED_BIN = 0xFFFFFFFF;
00115     };
00116 
00117     class Reference
00118     {
00119         // Add one to the max since there may now be an extra bin containing
00120         // the mapped/unmapped counts.
00121     public:
00122         static const int32_t UNKNOWN_MAP_INFO = -1;
00123         Reference(){ioffsets = NULL; reset();}
00124         ~Reference(){reset();}
00125         void reset()
00126         { 
00127             bins.clear(); 
00128             if(ioffsets != NULL)
00129             {
00130                 free(ioffsets);
00131                 ioffsets = NULL;
00132             }
00133             n_bin = 0; 
00134             n_intv = 0;
00135             minChunkOffset = UNSET_MIN_CHUNK_OFFSET;
00136             maxChunkOffset = 0;
00137             n_mapped = UNKNOWN_MAP_INFO;
00138             n_unmapped = UNKNOWN_MAP_INFO;
00139         }
00140         int32_t n_bin; // The number of bins.
00141         int32_t n_intv; // Number of intervals.
00142         std::vector<Bin> bins;  // The bins for this reference.
00143         uint64_t* ioffsets; // Offsets of intervals first alignments
00144         uint64_t minChunkOffset;
00145         uint64_t maxChunkOffset;
00146         int32_t n_mapped; // Number of mapped reads.
00147         int32_t n_unmapped; // Number of unmapped reads.
00148 
00149         static const uint64_t UNSET_MIN_CHUNK_OFFSET = 0xFFFFFFFFFFFFFFFFULL;
00150     };
00151 
00152     // Add the bins associated with the specified region to the passed in list.
00153     // start is incluive, end is exclusive.
00154     static int getBinsForRegion(uint32_t start, uint32_t end, uint16_t binList[MAX_NUM_BINS]);
00155 
00156     // Number of reference sequences.
00157     int32_t n_ref;
00158 
00159     // The references.
00160     std::vector<Reference> myRefs;
00161 };
00162 
00163 
00164 #endif
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends