libStatGen Software
1
|
00001 /* 00002 * Copyright (C) 2011-2012 Regents of the University of Michigan 00003 * 00004 * This program is free software: you can redistribute it and/or modify 00005 * it under the terms of the GNU General Public License as published by 00006 * the Free Software Foundation, either version 3 of the License, or 00007 * (at your option) any later version. 00008 * 00009 * This program is distributed in the hope that it will be useful, 00010 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 * GNU General Public License for more details. 00013 * 00014 * You should have received a copy of the GNU General Public License 00015 * along with this program. If not, see <http://www.gnu.org/licenses/>. 00016 */ 00017 00018 #ifndef __INDEX_BASE_H__ 00019 #define __INDEX_BASE_H__ 00020 00021 #include <stdint.h> 00022 #include <vector> 00023 #include <map> 00024 #include <stdlib.h> 00025 00026 #include "InputFile.h" 00027 #include "StatGenStatus.h" 00028 00029 00030 class Chunk 00031 { 00032 public: 00033 uint64_t chunk_beg; // offset of the start of the chunk 00034 uint64_t chunk_end; // offset of the end of the chunk 00035 00036 static const uint64_t MAX_CHUNK_VALUE = 0xFFFFFFFFFFFFFFFFULL; 00037 00038 bool operator< (const Chunk& otherChunk) const 00039 { 00040 return(this->chunk_beg < otherChunk.chunk_beg); 00041 } 00042 }; 00043 00044 00045 // This class contains chunks that are sorted by the beginning position. 00046 // This class hides how the chunks are actually stored (map, list ,etc), 00047 // so they can be interchanged. 00048 class SortedChunkList 00049 { 00050 public: 00051 // Returns the first chunk in the list and removes it. 00052 Chunk pop(); 00053 bool insert(const Chunk& chunkToInsert); 00054 void clear(); 00055 bool empty(); 00056 bool mergeOverlapping(); 00057 00058 private: 00059 std::map<uint64_t, Chunk> chunkList; 00060 }; 00061 00062 class IndexBase 00063 { 00064 public: 00065 00066 IndexBase(); 00067 virtual ~IndexBase(); 00068 00069 /// Reset the member data for a new index file. 00070 virtual void resetIndex(); 00071 00072 // Read & parse the specified index file. 00073 /// \param filename the bam index file to be read. 00074 /// \return the status of the read. 00075 virtual StatGenStatus::Status readIndex(const char* filename) = 0; 00076 00077 /// Get the number of references in this index. 00078 /// \return number of references 00079 int32_t getNumRefs() const; 00080 00081 // Returns the minimum offset of records that cross the 16K block that 00082 // contains the specified position for the given reference id. 00083 bool getMinOffsetFromLinearIndex(int32_t refID, uint32_t position, 00084 uint64_t& minOffset) const; 00085 00086 protected: 00087 const static uint32_t MAX_NUM_BINS = 37450; // per specs, at most 37450 bins 00088 00089 // Maximum allowed position (inclusive 512MB - 1) 00090 const static uint32_t MAX_POSITION = 536870911; 00091 00092 // Number of bits in 1 linear index - how much to shift a position by 00093 // to determine which offset into the linear index to look for it. 00094 const static uint32_t LINEAR_INDEX_SHIFT = 14; 00095 00096 class Bin 00097 { 00098 public: 00099 Bin(){chunks = NULL; reset();} 00100 ~Bin() {reset();} 00101 void reset() 00102 { 00103 if(chunks != NULL) 00104 { 00105 free(chunks); 00106 chunks = NULL; 00107 } 00108 n_chunk = 0; 00109 bin = NOT_USED_BIN; 00110 } 00111 uint32_t bin; // The bin id. 00112 int32_t n_chunk; // The number of chunks. 00113 Chunk* chunks; // The chunks for this bin. 00114 static const uint32_t NOT_USED_BIN = 0xFFFFFFFF; 00115 }; 00116 00117 class Reference 00118 { 00119 // Add one to the max since there may now be an extra bin containing 00120 // the mapped/unmapped counts. 00121 public: 00122 static const int32_t UNKNOWN_MAP_INFO = -1; 00123 Reference(){ioffsets = NULL; reset();} 00124 ~Reference(){reset();} 00125 void reset() 00126 { 00127 bins.clear(); 00128 if(ioffsets != NULL) 00129 { 00130 free(ioffsets); 00131 ioffsets = NULL; 00132 } 00133 n_bin = 0; 00134 n_intv = 0; 00135 minChunkOffset = UNSET_MIN_CHUNK_OFFSET; 00136 maxChunkOffset = 0; 00137 n_mapped = UNKNOWN_MAP_INFO; 00138 n_unmapped = UNKNOWN_MAP_INFO; 00139 } 00140 int32_t n_bin; // The number of bins. 00141 int32_t n_intv; // Number of intervals. 00142 std::vector<Bin> bins; // The bins for this reference. 00143 uint64_t* ioffsets; // Offsets of intervals first alignments 00144 uint64_t minChunkOffset; 00145 uint64_t maxChunkOffset; 00146 int32_t n_mapped; // Number of mapped reads. 00147 int32_t n_unmapped; // Number of unmapped reads. 00148 00149 static const uint64_t UNSET_MIN_CHUNK_OFFSET = 0xFFFFFFFFFFFFFFFFULL; 00150 }; 00151 00152 // Add the bins associated with the specified region to the passed in list. 00153 // start is incluive, end is exclusive. 00154 static int getBinsForRegion(uint32_t start, uint32_t end, uint16_t binList[MAX_NUM_BINS]); 00155 00156 // Number of reference sequences. 00157 int32_t n_ref; 00158 00159 // The references. 00160 std::vector<Reference> myRefs; 00161 }; 00162 00163 00164 #endif