00001 /* 00002 * Copyright (C) 2010 Regents of the University of Michigan 00003 * 00004 * This program is free software: you can redistribute it and/or modify 00005 * it under the terms of the GNU General Public License as published by 00006 * the Free Software Foundation, either version 3 of the License, or 00007 * (at your option) any later version. 00008 * 00009 * This program is distributed in the hope that it will be useful, 00010 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 * GNU General Public License for more details. 00013 * 00014 * You should have received a copy of the GNU General Public License 00015 * along with this program. If not, see <http://www.gnu.org/licenses/>. 00016 */ 00017 00018 #ifndef __BAM_INDEX_H__ 00019 #define __BAM_INDEX_H__ 00020 00021 #include <stdint.h> 00022 #include <vector> 00023 #include <map> 00024 #include <stdlib.h> 00025 00026 #include "InputFile.h" 00027 #include "SamStatus.h" 00028 00029 class Chunk 00030 { 00031 public: 00032 uint64_t chunk_beg; // offset of the start of the chunk 00033 uint64_t chunk_end; // offset of the end of the chunk 00034 00035 static const uint64_t MAX_CHUNK_VALUE = 0xFFFFFFFFFFFFFFFFULL; 00036 00037 bool operator< (const Chunk& otherChunk) const 00038 { 00039 return(this->chunk_beg < otherChunk.chunk_beg); 00040 } 00041 }; 00042 00043 00044 // This class contains chunks that are sorted by the beginning position. 00045 // This class hides how the chunks are actually stored (map, list ,etc), 00046 // so they can be interchanged. 00047 class SortedChunkList 00048 { 00049 public: 00050 // Returns the first chunk in the list and removes it. 00051 Chunk pop(); 00052 bool insert(const Chunk& chunkToInsert); 00053 void clear(); 00054 bool empty(); 00055 bool mergeOverlapping(); 00056 00057 private: 00058 std::map<uint64_t, Chunk> chunkList; 00059 }; 00060 00061 class BamIndex 00062 { 00063 public: 00064 00065 BamIndex(); 00066 ~BamIndex(); 00067 00068 /// Reset the member data for a new index file. 00069 void resetIndex(); 00070 00071 // Read & parse the specified index file. 00072 /// \param filename the bam index file to be read. 00073 /// \return the status of the read. 00074 SamStatus::Status readIndex(const char* filename); 00075 00076 /// Get the list of chunks associated with this region. 00077 /// For an entire reference ID, set start and end to -1. 00078 /// To start at the beginning of the region, set start to 0/-1. 00079 /// To go to the end of the region, set end to -1. 00080 bool getChunksForRegion(int32_t refID, int32_t start, int32_t end, 00081 SortedChunkList& chunkList); 00082 00083 uint64_t getMaxOffset() const; 00084 00085 /// Get the minimum and maximum file offsets for the specfied reference ID. 00086 /// \param refID the reference ID to locate in the file. 00087 /// \param minOffset returns the min file offset for the specified reference 00088 /// \param maxOffset returns the max file offset for the specified reference 00089 /// \return whether or not the reference was found in the file 00090 bool getReferenceMinMax(int32_t refID, 00091 uint64_t& minOffset, 00092 uint64_t& maxOffset) const; 00093 00094 /// Get the number of references in this index. 00095 /// \return number of references 00096 int32_t getNumRefs() const; 00097 00098 /// Get the number of mapped reads for this reference id. Returns -1 for 00099 /// out of range refIDs. 00100 /// \param refID reference ID for which to extract the number of mapped reads. 00101 /// \return number of mapped reads for the specified reference id. 00102 int32_t getNumMappedReads(int32_t refID); 00103 00104 /// Get the number of unmapped reads for this reference id. Returns -1 for 00105 /// out of range refIDs. 00106 /// \param refID reference ID for which to extract the number of unmapped reads. 00107 /// \return number of unmapped reads for the specified reference id 00108 int32_t getNumUnMappedReads(int32_t refID); 00109 00110 /// Print the index information. 00111 /// \param refID reference ID for which to print info for. -1 means print for all references. 00112 /// \param summary whether or not to just print a summary (defaults to false). The summary just contains summary info for each reference and not every bin/chunk. 00113 void printIndex(int32_t refID, bool summary = false); 00114 00115 // Returns the minimum offset of records that cross the 16K block that 00116 // contains the specified position for the given reference id. 00117 uint64_t getMinOffsetFromLinearIndex(int32_t refID, uint32_t position) const; 00118 00119 // Number of reference sequences. 00120 /// The number used for an unknown number of reads. 00121 static const int32_t UNKNOWN_NUM_READS = -1; 00122 00123 /// The number used for the reference id of unmapped reads. 00124 static const int32_t REF_ID_UNMAPPED = -1; 00125 00126 /// The number used to indicate that all reference ids should be used. 00127 static const int32_t REF_ID_ALL = -2; 00128 00129 private: 00130 00131 const static uint32_t MAX_NUM_BINS = 37450; // per specs, at most 37450 bins. 00132 // Maximum allowed position (inclusive 512MB - 1) 00133 const static uint32_t MAX_POSITION = 536870911; 00134 00135 // Number of bits in 1 linear index - how much to shift a position by 00136 // to determine which offset into the linear index to look for it. 00137 const static uint32_t LINEAR_INDEX_SHIFT = 14; 00138 00139 class Bin 00140 { 00141 public: 00142 Bin(){chunks = NULL; reset();} 00143 ~Bin() {reset();} 00144 void reset() 00145 { 00146 if(chunks != NULL) 00147 { 00148 free(chunks); 00149 chunks = NULL; 00150 } 00151 n_chunk = 0; 00152 bin = NOT_USED_BIN; 00153 } 00154 uint32_t bin; // The bin id. 00155 int32_t n_chunk; // The number of chunks. 00156 Chunk* chunks; // The chunks for this bin. 00157 static const uint32_t NOT_USED_BIN = 0xFFFFFFFF; 00158 }; 00159 00160 class Reference 00161 { 00162 // Add one to the max since there may now be an extra bin containing 00163 // the mapped/unmapped counts. 00164 public: 00165 static const int32_t UNKNOWN_MAP_INFO = -1; 00166 Reference(){ioffsets = NULL; reset();} 00167 ~Reference(){reset();} 00168 void reset() 00169 { 00170 bins.clear(); 00171 if(ioffsets != NULL) 00172 { 00173 free(ioffsets); 00174 ioffsets = NULL; 00175 } 00176 n_bin = 0; 00177 n_intv = 0; 00178 minChunkOffset = -1; 00179 maxChunkOffset = 0; 00180 n_mapped = UNKNOWN_MAP_INFO; 00181 n_unmapped = UNKNOWN_MAP_INFO; 00182 } 00183 int32_t n_bin; // The number of bins. 00184 int32_t n_intv; // Number of intervals. 00185 std::vector<Bin> bins; // The bins for this reference. 00186 uint64_t* ioffsets; // Offsets of intervals first alignments 00187 uint64_t minChunkOffset; 00188 uint64_t maxChunkOffset; 00189 int32_t n_mapped; // Number of mapped reads. 00190 int32_t n_unmapped; // Number of unmapped reads. 00191 }; 00192 00193 // Add the bins associated with the specified region to the passed in list. 00194 // start is incluive, end is exclusive. 00195 static int getBinsForRegion(uint32_t start, uint32_t end, uint16_t binList[MAX_NUM_BINS]); 00196 00197 // Number of reference sequences. 00198 int32_t n_ref; 00199 00200 uint64_t maxOverallOffset; 00201 00202 int32_t myUnMappedNumReads; 00203 00204 // The references. 00205 std::vector<Reference> myRefs; 00206 }; 00207 00208 00209 #endif