libStatGen Software
1
|
00001 /* 00002 * Copyright (C) 2010-2012 Regents of the University of Michigan 00003 * 00004 * This program is free software: you can redistribute it and/or modify 00005 * it under the terms of the GNU General Public License as published by 00006 * the Free Software Foundation, either version 3 of the License, or 00007 * (at your option) any later version. 00008 * 00009 * This program is distributed in the hope that it will be useful, 00010 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 * GNU General Public License for more details. 00013 * 00014 * You should have received a copy of the GNU General Public License 00015 * along with this program. If not, see <http://www.gnu.org/licenses/>. 00016 */ 00017 00018 #include "IndexBase.h" 00019 #include <iomanip> 00020 00021 Chunk SortedChunkList::pop() 00022 { 00023 Chunk newChunk = chunkList.begin()->second; 00024 chunkList.erase(chunkList.begin()); 00025 return(newChunk); 00026 } 00027 00028 00029 bool SortedChunkList::insert(const Chunk& chunkToInsert) 00030 { 00031 std::pair<std::map<uint64_t, Chunk>::iterator, bool> insertRes; 00032 // Insert the passed in chunk. 00033 insertRes = 00034 chunkList.insert(std::pair<uint64_t, Chunk>(chunkToInsert.chunk_beg, 00035 chunkToInsert)); 00036 00037 if(!insertRes.second) 00038 { 00039 // Failed to insert the chunk. 00040 std::cerr << "Failed to insert into the SortedChunkList.\n"; 00041 std::cerr << "\tpreviously found chunk:\tbeg = " << std::hex 00042 << insertRes.first->second.chunk_beg 00043 << "\tend = " 00044 << insertRes.first->second.chunk_end 00045 << "\nnew chunk:\tbeg = " << std::hex 00046 << chunkToInsert.chunk_beg 00047 << "\tend = " 00048 << chunkToInsert.chunk_end 00049 << std::endl; 00050 } 00051 // return the result that comes from insertRes. 00052 return(insertRes.second); 00053 } 00054 00055 void SortedChunkList::clear() 00056 { 00057 chunkList.clear(); 00058 } 00059 00060 bool SortedChunkList::empty() 00061 { 00062 return(chunkList.empty()); 00063 } 00064 00065 00066 // Merge overlapping chunks found in this list. 00067 bool SortedChunkList::mergeOverlapping() 00068 { 00069 // Start at the beginning of the list and iterate through. 00070 std::map<uint64_t, Chunk>::iterator currentPos = chunkList.begin(); 00071 std::map<uint64_t, Chunk>::iterator nextPos = chunkList.begin(); 00072 if(nextPos != chunkList.end()) 00073 { 00074 ++nextPos; 00075 } 00076 00077 // Loop until the end is reached. 00078 while(nextPos != chunkList.end()) 00079 { 00080 // If the next chunk is completely contained within the current 00081 // chunk (its end is less than the current chunk's end), 00082 // delete it since its position is already covered. 00083 if(nextPos->second.chunk_end < currentPos->second.chunk_end) 00084 { 00085 chunkList.erase(nextPos); 00086 nextPos = currentPos; 00087 ++nextPos; 00088 continue; 00089 } 00090 00091 // If the next chunk's start position's BGZF block is less than or 00092 // equal to the BGZF block of the current chunk's end position, 00093 // combine the two chunks into the current chunk. 00094 if((nextPos->second.chunk_beg >> 16) <= 00095 (currentPos->second.chunk_end >> 16)) 00096 { 00097 currentPos->second.chunk_end = nextPos->second.chunk_end; 00098 // nextPos has now been included in the current pos, so 00099 // remove it. 00100 chunkList.erase(nextPos); 00101 nextPos = currentPos; 00102 ++nextPos; 00103 continue; 00104 } 00105 else 00106 { 00107 // Nothing to combine. So try combining at the next 00108 currentPos = nextPos; 00109 ++nextPos; 00110 } 00111 } 00112 return(true); 00113 } 00114 00115 00116 IndexBase::IndexBase() 00117 : n_ref(0) 00118 { 00119 myRefs.clear(); 00120 } 00121 00122 00123 00124 IndexBase::~IndexBase() 00125 { 00126 } 00127 00128 00129 // Reset the member data for a new index file. 00130 void IndexBase::resetIndex() 00131 { 00132 n_ref = 0; 00133 // Clear the references. 00134 myRefs.clear(); 00135 } 00136 00137 00138 // Get the number of references in this index. 00139 int32_t IndexBase::getNumRefs() const 00140 { 00141 // Return the number of references. 00142 return(myRefs.size()); 00143 } 00144 00145 00146 // Returns the minimum offset of records that cross the 16K block that 00147 // contains the specified position for the given reference id. 00148 // The basic logic is from samtools reg2bins and the samtools format specification pdf. 00149 int IndexBase::getBinsForRegion(uint32_t start, uint32_t end, uint16_t binList[MAX_NUM_BINS]) 00150 { 00151 uint32_t binListIndex = 0, binNum; 00152 --end; 00153 00154 // Check if beg/end go too high, set to max position. 00155 if(start > MAX_POSITION) 00156 { 00157 start = MAX_POSITION; 00158 } 00159 if(end > MAX_POSITION) 00160 { 00161 end = MAX_POSITION; 00162 } 00163 00164 binList[binListIndex++] = 0; 00165 for (binNum = 1 + (start>>26); binNum <= 1 + (end>>26); ++binNum) 00166 binList[binListIndex++] = binNum; 00167 for (binNum = 9 + (start>>23); binNum <= 9 + (end>>23); ++binNum) 00168 binList[binListIndex++] = binNum; 00169 for (binNum = 73 + (start>>20); binNum <= 73 + (end>>20); ++binNum) 00170 binList[binListIndex++] = binNum; 00171 for (binNum = 585 + (start>>17); binNum <= 585 + (end>>17); ++binNum) 00172 binList[binListIndex++] = binNum; 00173 for (binNum = 4681 + (start>>14); binNum <= 4681 + (end>>14); ++binNum) 00174 binList[binListIndex++] = binNum; 00175 00176 // binListIndex contains the number of items added to the list. 00177 return binListIndex; 00178 } 00179 00180 00181 // Returns the minimum offset of records that cross the 16K block that 00182 // contains the specified position for the given reference id. 00183 bool IndexBase::getMinOffsetFromLinearIndex(int32_t refID, uint32_t position, 00184 uint64_t& minOffset) const 00185 { 00186 int32_t linearIndex = position >> LINEAR_INDEX_SHIFT; 00187 00188 minOffset = 0; 00189 00190 if(refID > n_ref) 00191 { 00192 // out of range of the references, return false. 00193 return(false); 00194 } 00195 // Check to see if the position is out of range of the linear index. 00196 int32_t linearOffsetSize = myRefs[refID].n_intv; 00197 00198 // If there are no entries in the linear index, return false. 00199 // Or if the linear index is not large enough to include 00200 // the start block, then there can be no records that cross 00201 // our region, so return false. 00202 if((linearOffsetSize == 0) || (linearIndex >= linearOffsetSize)) 00203 00204 { 00205 return(false); 00206 } 00207 00208 // The linear index is specified for this block, so return that value. 00209 minOffset = myRefs[refID].ioffsets[linearIndex]; 00210 00211 // If the offset is 0, go to the previous block that has an offset. 00212 // This is due to a couple of bugs in older sam tools indexes. 00213 // 1) they add one to the index location (so when reading those, you 00214 // may be starting earlier than necessary) 00215 // 2) (the bigger issue) They did not include bins 4681-37449 in 00216 // the linear index. 00217 while((minOffset == 0) && (--linearIndex >= 0)) 00218 { 00219 minOffset = myRefs[refID].ioffsets[linearIndex]; 00220 } 00221 00222 00223 // If the minOffset is still 0 when moving forward, 00224 // check later indices to find a non-zero since we don't want to return 00225 // an offset of 0 since the record can't start at 0 we want to at least 00226 // return the first record position for this reference. 00227 linearIndex = 0; 00228 while((minOffset == 0) && (linearIndex < linearOffsetSize)) 00229 { 00230 minOffset = myRefs[refID].ioffsets[linearIndex]; 00231 linearIndex++; 00232 } 00233 if(minOffset == 0) 00234 { 00235 // Could not find a valid start position for this reference. 00236 return(false); 00237 } 00238 return(true); 00239 }