libStatGen Software
1
|
00001 /* 00002 * Copyright (C) 2010-2012 Regents of the University of Michigan 00003 * 00004 * This program is free software: you can redistribute it and/or modify 00005 * it under the terms of the GNU General Public License as published by 00006 * the Free Software Foundation, either version 3 of the License, or 00007 * (at your option) any later version. 00008 * 00009 * This program is distributed in the hope that it will be useful, 00010 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 * GNU General Public License for more details. 00013 * 00014 * You should have received a copy of the GNU General Public License 00015 * along with this program. If not, see <http://www.gnu.org/licenses/>. 00016 */ 00017 00018 #include "BamIndex.h" 00019 #include <iomanip> 00020 00021 BamIndex::BamIndex() 00022 : IndexBase(), 00023 maxOverallOffset(0), 00024 myUnMappedNumReads(-1) 00025 { 00026 } 00027 00028 00029 BamIndex::~BamIndex() 00030 { 00031 } 00032 00033 00034 // Reset the member data for a new index file. 00035 void BamIndex::resetIndex() 00036 { 00037 IndexBase::resetIndex(); 00038 00039 maxOverallOffset = 0; 00040 myUnMappedNumReads = -1; 00041 } 00042 00043 00044 // Read & parse the specified index file. 00045 SamStatus::Status BamIndex::readIndex(const char* filename) 00046 { 00047 // Reset the index from anything that may previously be set. 00048 resetIndex(); 00049 00050 IFILE indexFile = ifopen(filename, "rb"); 00051 00052 // Failed to open the index file. 00053 if(indexFile == NULL) 00054 { 00055 return(SamStatus::FAIL_IO); 00056 } 00057 00058 // generate the bam index structure. 00059 00060 // Read the magic string. 00061 char magic[4]; 00062 if(ifread(indexFile, magic, 4) != 4) 00063 { 00064 // Failed to read the magic 00065 ifclose(indexFile); 00066 return(SamStatus::FAIL_IO); 00067 } 00068 00069 // If this is not an index file, set num references to 0. 00070 if (magic[0] != 'B' || magic[1] != 'A' || magic[2] != 'I' || magic[3] != 1) 00071 { 00072 // Not a BAM Index file. 00073 ifclose(indexFile); 00074 return(SamStatus::FAIL_PARSE); 00075 } 00076 00077 // It is a bam index file. 00078 // Read the number of reference sequences. 00079 if(ifread(indexFile, &n_ref, 4) != 4) 00080 { 00081 // Failed to read. 00082 ifclose(indexFile); 00083 return(SamStatus::FAIL_IO); 00084 } 00085 00086 // Size the references. 00087 myRefs.resize(n_ref); 00088 00089 for(int refIndex = 0; refIndex < n_ref; refIndex++) 00090 { 00091 // Read each reference. 00092 Reference* ref = &(myRefs[refIndex]); 00093 00094 // Resize the bins so they can be indexed by bin number. 00095 ref->bins.resize(MAX_NUM_BINS + 1); 00096 00097 // Read the number of bins. 00098 if(ifread(indexFile, &(ref->n_bin), 4) != 4) 00099 { 00100 // Failed to read the number of bins. 00101 // Return failure. 00102 ifclose(indexFile); 00103 return(SamStatus::FAIL_PARSE); 00104 } 00105 00106 // If there are no bins, then there are no 00107 // mapped/unmapped reads. 00108 if(ref->n_bin == 0) 00109 { 00110 ref->n_mapped = 0; 00111 ref->n_unmapped = 0; 00112 } 00113 00114 // Read each bin. 00115 for(int binIndex = 0; binIndex < ref->n_bin; binIndex++) 00116 { 00117 uint32_t binNumber; 00118 00119 // Read in the bin number. 00120 if(ifread(indexFile, &(binNumber), 4) != 4) 00121 { 00122 // Failed to read the bin number. 00123 // Return failure. 00124 ifclose(indexFile); 00125 return(SamStatus::FAIL_IO); 00126 } 00127 00128 // Add the bin to the reference and get the 00129 // pointer back so the values can be set in it. 00130 Bin* binPtr = &(ref->bins[binNumber]); 00131 binPtr->bin = binNumber; 00132 00133 // Read in the number of chunks. 00134 if(ifread(indexFile, &(binPtr->n_chunk), 4) != 4) 00135 { 00136 // Failed to read number of chunks. 00137 // Return failure. 00138 ifclose(indexFile); 00139 return(SamStatus::FAIL_IO); 00140 } 00141 00142 // Read in the chunks. 00143 // Allocate space for the chunks. 00144 uint32_t sizeOfChunkList = binPtr->n_chunk * sizeof(Chunk); 00145 binPtr->chunks = (Chunk*)malloc(sizeOfChunkList); 00146 if(ifread(indexFile, binPtr->chunks, sizeOfChunkList) != sizeOfChunkList) 00147 { 00148 // Failed to read the chunks. 00149 // Return failure. 00150 ifclose(indexFile); 00151 return(SamStatus::FAIL_IO); 00152 } 00153 00154 // Determine the min/max for this bin if it is not the max bin. 00155 if(binPtr->bin != MAX_NUM_BINS) 00156 { 00157 for(int i = 0; i < binPtr->n_chunk; i++) 00158 { 00159 if(binPtr->chunks[i].chunk_beg < ref->minChunkOffset) 00160 { 00161 ref->minChunkOffset = binPtr->chunks[i].chunk_beg; 00162 } 00163 if(binPtr->chunks[i].chunk_end > ref->maxChunkOffset) 00164 { 00165 ref->maxChunkOffset = binPtr->chunks[i].chunk_end; 00166 } 00167 if(binPtr->chunks[i].chunk_end > maxOverallOffset) 00168 { 00169 maxOverallOffset = binPtr->chunks[i].chunk_end; 00170 } 00171 } 00172 } 00173 else 00174 { 00175 // Mapped/unmapped are the last chunk of the 00176 // MAX BIN 00177 ref->n_mapped = binPtr->chunks[binPtr->n_chunk - 1].chunk_beg; 00178 ref->n_unmapped = binPtr->chunks[binPtr->n_chunk - 1].chunk_end; 00179 } 00180 } 00181 00182 // Read the number of intervals. 00183 if(ifread(indexFile, &(ref->n_intv), 4) != 4) 00184 { 00185 // Failed to read, set to 0. 00186 ref->n_intv = 0; 00187 // Return failure. 00188 ifclose(indexFile); 00189 return(SamStatus::FAIL_IO); 00190 } 00191 00192 // Allocate space for the intervals and read them. 00193 uint32_t linearIndexSize = ref->n_intv * sizeof(uint64_t); 00194 ref->ioffsets = (uint64_t*)malloc(linearIndexSize); 00195 if(ifread(indexFile, ref->ioffsets, linearIndexSize) != linearIndexSize) 00196 { 00197 // Failed to read the linear index. 00198 // Return failure. 00199 ifclose(indexFile); 00200 return(SamStatus::FAIL_IO); 00201 } 00202 } 00203 00204 int32_t numUnmapped = 0; 00205 if(ifread(indexFile, &numUnmapped, sizeof(int32_t)) == sizeof(int32_t)) 00206 { 00207 myUnMappedNumReads = numUnmapped; 00208 } 00209 00210 // Successfully read the bam index file. 00211 ifclose(indexFile); 00212 return(SamStatus::SUCCESS); 00213 } 00214 00215 00216 // Get the chunks for the specified reference id and start/end 0-based 00217 // coordinates. 00218 bool BamIndex::getChunksForRegion(int32_t refID, int32_t start, int32_t end, 00219 SortedChunkList& chunkList) 00220 { 00221 chunkList.clear(); 00222 00223 // If start is >= to end, there will be no sections, return no 00224 // regions. 00225 if((start >= end) && (end != -1)) 00226 { 00227 std::cerr << "Warning, requesting region where start <= end, so " 00228 << "no values will be returned.\n"; 00229 return(false); 00230 } 00231 00232 // Handle REF_ID_UNMAPPED. This uses a default chunk which covers 00233 // from the max offset to the end of the file. 00234 if(refID == REF_ID_UNMAPPED) 00235 { 00236 Chunk refChunk; 00237 // The start of the unmapped region is the max offset found 00238 // in the index file. 00239 refChunk.chunk_beg = getMaxOffset(); 00240 // The end of the unmapped region is the end of the file, so 00241 // set chunk end to the max value. 00242 refChunk.chunk_end = Chunk::MAX_CHUNK_VALUE; 00243 return(chunkList.insert(refChunk)); 00244 } 00245 00246 if((refID < 0) || (refID >= n_ref)) 00247 { 00248 // The specified refID is out of range, return false. 00249 std::cerr << "Warning, requesting refID is out of range, so " 00250 << "no values will be returned.\n"; 00251 return(false); 00252 } 00253 00254 const Reference* ref = &(myRefs[refID]); 00255 00256 // Handle where start/end are defaults. 00257 if(start == -1) 00258 { 00259 if(end == -1) 00260 { 00261 // This is whole chromosome, so take a shortcut. 00262 if(ref->maxChunkOffset == 0) 00263 { 00264 // No chunks for this region, but this is not an error. 00265 return(true); 00266 } 00267 Chunk refChunk; 00268 refChunk.chunk_beg = ref->minChunkOffset; 00269 refChunk.chunk_end = ref->maxChunkOffset; 00270 return(chunkList.insert(refChunk)); 00271 } 00272 else 00273 { 00274 start = 0; 00275 } 00276 } 00277 if(end == -1) 00278 { 00279 // MAX_POSITION is inclusive, but end is exclusive, so add 1. 00280 end = MAX_POSITION + 1; 00281 } 00282 00283 // Determine the minimum offset for the given start position. This 00284 // is done by using the linear index for the specified start position. 00285 uint64_t minOffset = 0; 00286 getMinOffsetFromLinearIndex(refID, start, minOffset); 00287 00288 uint16_t binInRangeList[MAX_NUM_BINS + 1]; 00289 00290 int numBins = getBinsForRegion(start, end, binInRangeList); 00291 00292 // loop through the bins in the range and get the chunks. 00293 for(int i = 0; i < numBins; ++i) 00294 { 00295 int binNum = binInRangeList[i]; 00296 const Bin* bin = &(ref->bins[binNum]); 00297 00298 // Add each chunk in the bin to the map. 00299 for(int chunkIndex = 0; chunkIndex < bin->n_chunk; chunkIndex++) 00300 { 00301 // If the end of the chunk is less than the minimum offset 00302 // for the 16K block that starts our region, then no 00303 // records in this chunk will cross our region, so do 00304 // not add it to the chunks we need to use. 00305 if(bin->chunks[chunkIndex].chunk_end < minOffset) 00306 { 00307 continue; 00308 } 00309 // Add the chunk to the map. 00310 if(!chunkList.insert(bin->chunks[chunkIndex])) 00311 { 00312 // Failed to add to the map, return false. 00313 std::cerr << "Warning, Failed to add a chunk, so " 00314 << "no values will be returned.\n"; 00315 return(false); 00316 } 00317 } 00318 } 00319 00320 // Now that all chunks have been added to the list, 00321 // handle overlapping chunks. 00322 return(chunkList.mergeOverlapping()); 00323 } 00324 00325 00326 // Get the max offset. 00327 uint64_t BamIndex::getMaxOffset() const 00328 { 00329 return(maxOverallOffset); 00330 } 00331 00332 // Get the min & max file offsets for the reference ID. 00333 bool BamIndex::getReferenceMinMax(int32_t refID, 00334 uint64_t& minOffset, 00335 uint64_t& maxOffset) const 00336 { 00337 if((refID < 0) || (refID >= (int32_t)myRefs.size())) 00338 { 00339 // Reference ID is out of range for this index file. 00340 return(false); 00341 } 00342 00343 // Get this reference. 00344 minOffset = myRefs[refID].minChunkOffset; 00345 maxOffset = myRefs[refID].maxChunkOffset; 00346 return(true); 00347 } 00348 00349 00350 // Get the number of mapped reads for this reference id. 00351 int32_t BamIndex::getNumMappedReads(int32_t refID) 00352 { 00353 // If it is the reference id of unmapped reads, return 00354 // that there are no mapped reads. 00355 if(refID == REF_ID_UNMAPPED) 00356 { 00357 // These are by definition all unmapped reads. 00358 return(0); 00359 } 00360 00361 if((refID < 0) || (refID >= (int32_t)myRefs.size())) 00362 { 00363 // Reference ID is out of range for this index file. 00364 return(-1); 00365 } 00366 00367 // Get this reference. 00368 return(myRefs[refID].n_mapped); 00369 } 00370 00371 00372 // Get the number of unmapped reads for this reference id. 00373 int32_t BamIndex::getNumUnMappedReads(int32_t refID) 00374 { 00375 // If it is the reference id of unmapped reads, return 00376 // that value. 00377 if(refID == REF_ID_UNMAPPED) 00378 { 00379 return(myUnMappedNumReads); 00380 } 00381 00382 if((refID < 0) || (refID >= (int32_t)myRefs.size())) 00383 { 00384 // Reference ID is out of range for this index file. 00385 return(-1); 00386 } 00387 00388 // Get this reference. 00389 return(myRefs[refID].n_unmapped); 00390 } 00391 00392 00393 // Print the bam index. 00394 void BamIndex::printIndex(int32_t refID, bool summary) 00395 { 00396 std::cout << "BAM Index: " << std::endl; 00397 std::cout << "# Reference Sequences: " << n_ref << std::endl; 00398 00399 unsigned int startRef = 0; 00400 unsigned int endRef = myRefs.size() - 1; 00401 std::vector<Reference> refsToProcess; 00402 if(refID != -1) 00403 { 00404 // Set start and end ref to the specified reference id. 00405 startRef = refID; 00406 endRef = refID; 00407 } 00408 00409 // Print out the information for each bin. 00410 for(unsigned int i = startRef; i <= endRef; ++i) 00411 { 00412 std::cout << std::dec 00413 << "\tReference ID: " << std::setw(4) << i 00414 << "; #Bins: "<< std::setw(6) << myRefs[i].n_bin 00415 << "; #Linear Index Entries: " 00416 << std::setw(6) << myRefs[i].n_intv 00417 << "; Min Chunk Offset: " 00418 << std::setw(18) << std::hex << std::showbase << myRefs[i].minChunkOffset 00419 << "; Max Chunk Offset: " 00420 << std::setw(18) << myRefs[i].maxChunkOffset 00421 << std::dec; 00422 // Print the mapped/unmapped if set. 00423 if(myRefs[i].n_mapped != Reference::UNKNOWN_MAP_INFO) 00424 { 00425 std::cout << "; " << myRefs[i].n_mapped << " Mapped Reads"; 00426 } 00427 if(myRefs[i].n_mapped != Reference::UNKNOWN_MAP_INFO) 00428 { 00429 std::cout << "; " << myRefs[i].n_unmapped << " Unmapped Reads"; 00430 } 00431 std::cout << std::endl; 00432 00433 // Only print more details if not summary. 00434 if(!summary) 00435 { 00436 std::vector<Bin>::iterator binIter; 00437 for(binIter = myRefs[i].bins.begin(); 00438 binIter != myRefs[i].bins.end(); 00439 ++binIter) 00440 { 00441 Bin* binPtr = &(*binIter); 00442 if(binPtr->bin == Bin::NOT_USED_BIN) 00443 { 00444 // This bin is not used, continue. 00445 continue; 00446 } 00447 // Print the bin info. 00448 std::cout << "\t\t\tBin Name: " << binPtr->bin << std::endl; 00449 std::cout << "\t\t\t# Chunks: " << binPtr->n_chunk << std::endl; 00450 std::cout << std::hex << std::showbase; 00451 00452 for(int chunkIndex = 0; chunkIndex < binPtr->n_chunk; 00453 ++chunkIndex) 00454 { 00455 // If this is the last chunk of the MAX_NUM_BINS - it 00456 // contains a mapped/unmapped count rather than the regular 00457 // chunk addresses. 00458 if((binPtr->bin != MAX_NUM_BINS) || 00459 (chunkIndex != (binPtr->n_chunk - 1))) 00460 { 00461 std::cout << "\t\t\t\tchunk_beg: " 00462 << binPtr->chunks[chunkIndex].chunk_beg 00463 << std::endl; 00464 std::cout << "\t\t\t\tchunk_end: " 00465 << binPtr->chunks[chunkIndex].chunk_end 00466 << std::endl; 00467 } 00468 } 00469 } 00470 std::cout << std::dec; 00471 00472 // Print the linear index. 00473 for(int linearIndex = 0; linearIndex < myRefs[i].n_intv; 00474 ++linearIndex) 00475 { 00476 if(myRefs[i].ioffsets[linearIndex] != 0) 00477 { 00478 std::cout << "\t\t\tLinearIndex[" 00479 << std::dec << linearIndex << "] Offset: " 00480 << std::hex << myRefs[i].ioffsets[linearIndex] 00481 << std::endl; 00482 } 00483 } 00484 } 00485 } 00486 }