libStatGen Software  1
IndexBase.cpp
00001 /*
00002  *  Copyright (C) 2010-2012  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #include "IndexBase.h"
00019 #include <iomanip>
00020 
00021 Chunk SortedChunkList::pop()
00022 {
00023     Chunk newChunk = chunkList.begin()->second;
00024     chunkList.erase(chunkList.begin());
00025     return(newChunk);
00026 }
00027 
00028 
00029 bool SortedChunkList::insert(const Chunk& chunkToInsert)
00030 {
00031     std::pair<std::map<uint64_t, Chunk>::iterator, bool> insertRes;
00032     // Insert the passed in chunk.
00033     insertRes = 
00034         chunkList.insert(std::pair<uint64_t, Chunk>(chunkToInsert.chunk_beg,
00035                                                     chunkToInsert));
00036 
00037     if(!insertRes.second)
00038     {
00039         // Failed to insert the chunk.
00040         std::cerr << "Failed to insert into the SortedChunkList.\n";
00041         std::cerr << "\tpreviously found chunk:\tbeg = " << std::hex
00042                   << insertRes.first->second.chunk_beg 
00043                   << "\tend = "
00044                   << insertRes.first->second.chunk_end
00045                   << "\nnew chunk:\tbeg = " << std::hex
00046                   << chunkToInsert.chunk_beg 
00047                   << "\tend = "
00048                   << chunkToInsert.chunk_end
00049                   << std::endl;
00050     }
00051     // return the result that comes from insertRes.
00052     return(insertRes.second);
00053 }
00054 
00055 void SortedChunkList::clear()
00056 {
00057     chunkList.clear();
00058 }
00059 
00060 bool SortedChunkList::empty()
00061 {
00062     return(chunkList.empty());
00063 }
00064 
00065 
00066 // Merge overlapping chunks found in this list.
00067 bool SortedChunkList::mergeOverlapping()
00068 {
00069     // Start at the beginning of the list and iterate through.
00070     std::map<uint64_t, Chunk>::iterator currentPos = chunkList.begin();
00071     std::map<uint64_t, Chunk>::iterator nextPos = chunkList.begin();
00072     if(nextPos != chunkList.end())
00073     {
00074         ++nextPos;
00075     }
00076     
00077     // Loop until the end is reached.
00078     while(nextPos != chunkList.end())
00079     {
00080         // If the next chunk is completely contained within the current 
00081         // chunk (its end is less than the current chunk's end), 
00082         // delete it since its position is already covered.
00083         if(nextPos->second.chunk_end < currentPos->second.chunk_end)
00084         {
00085             chunkList.erase(nextPos);
00086             nextPos = currentPos;
00087             ++nextPos;
00088             continue;
00089         }
00090         
00091         // If the next chunk's start position's BGZF block is less than or
00092         // equal to the BGZF block of the current chunk's end position,
00093         // combine the two chunks into the current chunk.
00094         if((nextPos->second.chunk_beg >> 16) <= 
00095            (currentPos->second.chunk_end >> 16))
00096         {
00097             currentPos->second.chunk_end = nextPos->second.chunk_end;
00098             // nextPos has now been included in the current pos, so
00099             // remove it.
00100             chunkList.erase(nextPos);
00101             nextPos = currentPos;
00102             ++nextPos;
00103             continue;
00104         }
00105         else
00106         {
00107             // Nothing to combine.  So try combining at the next 
00108             currentPos = nextPos;
00109             ++nextPos;
00110         }
00111     }
00112     return(true);
00113 }
00114 
00115 
00116 IndexBase::IndexBase()
00117     : n_ref(0)
00118 {
00119     myRefs.clear();
00120 }
00121 
00122 
00123 
00124 IndexBase::~IndexBase()
00125 {
00126 }
00127 
00128 
00129 // Reset the member data for a new index file.
00130 void IndexBase::resetIndex()
00131 {
00132     n_ref = 0;
00133     // Clear the references.
00134     myRefs.clear();
00135 }
00136 
00137     
00138 // Get the number of references in this index.
00139 int32_t IndexBase::getNumRefs() const
00140 {
00141     // Return the number of references.
00142     return(myRefs.size());
00143 }
00144 
00145 
00146 // Returns the minimum offset of records that cross the 16K block that
00147 // contains the specified position for the given reference id.
00148 // The basic logic is from samtools reg2bins and the samtools format specification pdf.
00149 int IndexBase::getBinsForRegion(uint32_t start, uint32_t end, uint16_t binList[MAX_NUM_BINS])
00150 {
00151     uint32_t binListIndex = 0, binNum;
00152     --end;
00153 
00154         // Check if beg/end go too high, set to max position.
00155         if(start > MAX_POSITION)
00156         {
00157             start = MAX_POSITION;
00158         }
00159         if(end > MAX_POSITION)
00160         {
00161             end = MAX_POSITION;
00162         }
00163 
00164     binList[binListIndex++] = 0;
00165     for (binNum =    1 + (start>>26); binNum <=    1 + (end>>26); ++binNum) 
00166             binList[binListIndex++] = binNum;
00167     for (binNum =    9 + (start>>23); binNum <=    9 + (end>>23); ++binNum) 
00168             binList[binListIndex++] = binNum;
00169     for (binNum =   73 + (start>>20); binNum <=   73 + (end>>20); ++binNum)
00170             binList[binListIndex++] = binNum;
00171     for (binNum =  585 + (start>>17); binNum <=  585 + (end>>17); ++binNum)
00172             binList[binListIndex++] = binNum;
00173     for (binNum = 4681 + (start>>14); binNum <= 4681 + (end>>14); ++binNum)
00174             binList[binListIndex++] = binNum;
00175 
00176         // binListIndex contains the number of items added to the list.
00177     return binListIndex;
00178 }
00179 
00180 
00181 // Returns the minimum offset of records that cross the 16K block that
00182 // contains the specified position for the given reference id.
00183 bool IndexBase::getMinOffsetFromLinearIndex(int32_t refID, uint32_t position,
00184                                             uint64_t& minOffset) const
00185 {
00186     int32_t linearIndex = position >> LINEAR_INDEX_SHIFT;
00187 
00188     minOffset = 0;
00189 
00190     if(refID > n_ref)
00191     {
00192         // out of range of the references, return false.
00193         return(false);
00194     }
00195     // Check to see if the position is out of range of the linear index.
00196     int32_t linearOffsetSize = myRefs[refID].n_intv;
00197 
00198     // If there are no entries in the linear index, return false.
00199     // Or if the linear index is not large enough to include
00200     // the start block, then there can be no records that cross
00201     // our region, so return false.
00202     if((linearOffsetSize == 0) || (linearIndex >= linearOffsetSize))
00203 
00204     {
00205         return(false);
00206     }
00207 
00208     // The linear index is specified for this block, so return that value.
00209     minOffset = myRefs[refID].ioffsets[linearIndex];
00210     
00211     // If the offset is 0, go to the previous block that has an offset.
00212     // This is due to a couple of bugs in older sam tools indexes.
00213     // 1) they add one to the index location (so when reading those, you
00214     // may be starting earlier than necessary)
00215     // 2) (the bigger issue) They did not include bins 4681-37449 in
00216     // the linear index.
00217     while((minOffset == 0) && (--linearIndex >= 0))
00218     {
00219         minOffset = myRefs[refID].ioffsets[linearIndex]; 
00220     }
00221 
00222 
00223     // If the minOffset is still 0 when moving forward,
00224     // check later indices to find a non-zero since we don't want to return
00225     // an offset of 0 since the record can't start at 0 we want to at least
00226     // return the first record position for this reference.
00227     linearIndex = 0;
00228     while((minOffset == 0) && (linearIndex < linearOffsetSize))
00229     {
00230          minOffset = myRefs[refID].ioffsets[linearIndex]; 
00231          linearIndex++;
00232     }
00233     if(minOffset == 0)
00234     {
00235         // Could not find a valid start position for this reference.
00236         return(false);
00237     }
00238     return(true);
00239 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends