libStatGen Software
1
|
00001 /* 00002 * Copyright (C) 2010 Regents of the University of Michigan 00003 * 00004 * This program is free software: you can redistribute it and/or modify 00005 * it under the terms of the GNU General Public License as published by 00006 * the Free Software Foundation, either version 3 of the License, or 00007 * (at your option) any later version. 00008 * 00009 * This program is distributed in the hope that it will be useful, 00010 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 * GNU General Public License for more details. 00013 * 00014 * You should have received a copy of the GNU General Public License 00015 * along with this program. If not, see <http://www.gnu.org/licenses/>. 00016 */ 00017 00018 #ifndef __PILEUP_H__ 00019 #define __PILEUP_H__ 00020 00021 #include <stdexcept> 00022 #include "SamFile.h" 00023 #include "PosList.h" 00024 00025 00026 class PileupHelper 00027 { 00028 public: 00029 static const int DEFAULT_WINDOW_SIZE = 1024; 00030 }; 00031 00032 template <class PILEUP_TYPE> 00033 class defaultPileup 00034 { 00035 public: 00036 bool operator() (PILEUP_TYPE& element) 00037 { 00038 element.analyze(); 00039 return(true); 00040 } 00041 void analyze(PILEUP_TYPE element) 00042 { 00043 element.analyze(); 00044 } 00045 }; 00046 00047 template <class PILEUP_TYPE> 00048 void defaultPileupAnalyze(PILEUP_TYPE& element) 00049 { 00050 element.analyze(); 00051 } 00052 00053 00054 /// Class to perform a pileup of all reads by position, assuming 00055 /// the reads are coordinate sorted. 00056 template <class PILEUP_TYPE, 00057 class FUNC_CLASS = defaultPileup<PILEUP_TYPE> > 00058 class Pileup 00059 { 00060 public: 00061 /// Constructor using the default maximum number of bases a read spans. 00062 Pileup(const FUNC_CLASS& fp = FUNC_CLASS()); 00063 00064 /// Constructor that sets the maximum number of bases a read spans. 00065 /// This is the "window" the length of the buffer that holds 00066 /// the pileups for each position until the read start has moved 00067 /// past the position. 00068 Pileup(int window, const FUNC_CLASS& fp = FUNC_CLASS()); 00069 00070 /// Perform pileup with a reference. 00071 Pileup(const std::string& refSeqFileName, 00072 const FUNC_CLASS& fp = FUNC_CLASS()); 00073 00074 /// Perform pileup with a reference and a specified window size. 00075 Pileup(int window, const std::string& refSeqFileName, 00076 const FUNC_CLASS& fp = FUNC_CLASS()); 00077 00078 /// Destructor 00079 virtual ~Pileup(); 00080 00081 /// Performs a pileup on the specified file. 00082 /// \param excludeFlag if specified, if any bit set in the exclude flag 00083 /// is set in the record's flag, it will be dropped. 00084 /// Defaulted to exclude: 00085 /// * unmapped, 00086 /// * not primary alignment 00087 /// * failed platform/vendor quality check 00088 /// * PCR or optical duplicate 00089 /// \param includeFlag if specified, every bit must be set in the 00090 /// record's flag for it to be included - 00091 /// defaulted to 0, no bits are required to be set. 00092 /// \return 0 for success and non-zero for failure. 00093 virtual int processFile(const std::string& fileName, 00094 uint16_t excludeFlag = 0x0704, 00095 uint16_t includeFlag = 0); 00096 00097 /// Add an alignment to the pileup. 00098 virtual void processAlignment(SamRecord& record); 00099 00100 /// Add only positions that fall within the specified region of the 00101 /// alignment to the pileup and outside of the specified excluded positions. 00102 /// \param record alignment to be added to the pileup. 00103 /// \param startPos 0-based start position of the bases that should be 00104 /// added to the pileup. 00105 /// \param endPos 0-based end position of the bases that should be added 00106 /// to the pileup (this position is not added). 00107 /// Set to -1 if there is no end position to the region. 00108 /// \param excludeList list of refID/positions to exclude from processing. 00109 virtual void processAlignmentRegion(SamRecord& record, 00110 int startPos, 00111 int endPos, 00112 PosList* excludeList = NULL); 00113 00114 /// Done processing, flush every position that is currently being stored 00115 /// in the pileup. 00116 void flushPileup(); 00117 00118 protected: 00119 FUNC_CLASS myAnalyzeFuncPtr; 00120 00121 // Always need the reference position. 00122 void addAlignmentPosition(int refPosition, SamRecord& record); 00123 00124 00125 virtual void flushPileup(int refID, int refPosition); 00126 void flushPileup(int refPosition); 00127 00128 // Get the position in the myElements container that is associated 00129 // with the specified position. If the specified position cannot 00130 // fit within the myElements container, -1 is returned. 00131 int pileupPosition(int refPosition); 00132 00133 virtual void resetElement(PILEUP_TYPE& element, int position); 00134 virtual void addElement(PILEUP_TYPE& element, SamRecord& record); 00135 virtual void analyzeElement(PILEUP_TYPE& element); 00136 virtual void analyzeHead(); 00137 00138 std::vector<PILEUP_TYPE> myElements; 00139 00140 int pileupStart; 00141 int pileupHead; 00142 int pileupTail; // last used position 00143 int pileupWindow; 00144 00145 int myCurrentRefID; 00146 00147 GenomeSequence* myRefPtr; 00148 }; 00149 00150 00151 template <class PILEUP_TYPE, class FUNC_CLASS> 00152 Pileup<PILEUP_TYPE, FUNC_CLASS>::Pileup(const FUNC_CLASS& fp) 00153 : myAnalyzeFuncPtr(fp), 00154 myElements(), 00155 pileupStart(0), 00156 pileupHead(0), 00157 pileupTail(-1), 00158 pileupWindow(PileupHelper::DEFAULT_WINDOW_SIZE), 00159 myCurrentRefID(-2), 00160 myRefPtr(NULL) 00161 { 00162 // Not using pointers since this is templated. 00163 myElements.resize(pileupWindow); 00164 } 00165 00166 00167 template <class PILEUP_TYPE, class FUNC_CLASS> 00168 Pileup<PILEUP_TYPE, FUNC_CLASS>::Pileup(int window, const FUNC_CLASS& fp) 00169 : myAnalyzeFuncPtr(fp), 00170 myElements(), 00171 pileupStart(0), 00172 pileupHead(0), 00173 pileupTail(-1), 00174 pileupWindow(window), 00175 myCurrentRefID(-2), 00176 myRefPtr(NULL) 00177 { 00178 // Not using pointers since this is templated. 00179 myElements.resize(window); 00180 } 00181 00182 00183 template <class PILEUP_TYPE, class FUNC_CLASS> 00184 Pileup<PILEUP_TYPE, FUNC_CLASS>::Pileup(const std::string& refSeqFileName, const FUNC_CLASS& fp) 00185 : myAnalyzeFuncPtr(fp), 00186 myElements(), 00187 pileupStart(0), 00188 pileupHead(0), 00189 pileupTail(-1), 00190 pileupWindow(PileupHelper::DEFAULT_WINDOW_SIZE), 00191 myCurrentRefID(-2), 00192 myRefPtr(NULL) 00193 { 00194 myRefPtr = new GenomeSequence(refSeqFileName.c_str()); 00195 00196 // Not using pointers since this is templated. 00197 myElements.resize(pileupWindow); 00198 00199 PILEUP_TYPE::setReference(myRefPtr); 00200 } 00201 00202 00203 template <class PILEUP_TYPE, class FUNC_CLASS> 00204 Pileup<PILEUP_TYPE, FUNC_CLASS>::Pileup(int window, const std::string& refSeqFileName, const FUNC_CLASS& fp) 00205 : myAnalyzeFuncPtr(fp), 00206 myElements(), 00207 pileupStart(0), 00208 pileupHead(0), 00209 pileupTail(-1), 00210 pileupWindow(window), 00211 myCurrentRefID(-2), 00212 myRefPtr(NULL) 00213 { 00214 myRefPtr = new GenomeSequence(refSeqFileName.c_str()); 00215 00216 // Not using pointers since this is templated. 00217 myElements.resize(window); 00218 00219 PILEUP_TYPE::setReference(myRefPtr); 00220 } 00221 00222 00223 template <class PILEUP_TYPE, class FUNC_CLASS> 00224 Pileup<PILEUP_TYPE, FUNC_CLASS>::~Pileup() 00225 { 00226 flushPileup(); 00227 if(myRefPtr != NULL) 00228 { 00229 delete myRefPtr; 00230 myRefPtr = NULL; 00231 } 00232 } 00233 00234 template <class PILEUP_TYPE, class FUNC_CLASS> 00235 int Pileup<PILEUP_TYPE, FUNC_CLASS>::processFile(const std::string& fileName, 00236 uint16_t excludeFlag, 00237 uint16_t includeFlag) 00238 { 00239 SamFile samIn; 00240 SamFileHeader header; 00241 SamRecord record; 00242 00243 if(myRefPtr != NULL) 00244 { 00245 samIn.SetReference(myRefPtr); 00246 } 00247 00248 if(!samIn.OpenForRead(fileName.c_str())) 00249 { 00250 fprintf(stderr, "%s\n", samIn.GetStatusMessage()); 00251 return(samIn.GetStatus()); 00252 } 00253 00254 if(!samIn.ReadHeader(header)) 00255 { 00256 fprintf(stderr, "%s\n", samIn.GetStatusMessage()); 00257 return(samIn.GetStatus()); 00258 } 00259 00260 // The file needs to be sorted by coordinate. 00261 samIn.setSortedValidation(SamFile::COORDINATE); 00262 00263 // Iterate over all records 00264 while (samIn.ReadRecord(header, record)) 00265 { 00266 uint16_t flag = record.getFlag(); 00267 if(flag & excludeFlag) 00268 { 00269 // This record has an excluded flag set, 00270 // so continue to the next one. 00271 continue; 00272 } 00273 if((flag & includeFlag) != includeFlag) 00274 { 00275 // This record does not have all required flags set, 00276 // so continue to the next one. 00277 continue; 00278 } 00279 processAlignment(record); 00280 } 00281 00282 flushPileup(); 00283 00284 int returnValue = 0; 00285 if(samIn.GetStatus() != SamStatus::NO_MORE_RECS) 00286 { 00287 // Failed to read a record. 00288 fprintf(stderr, "%s\n", samIn.GetStatusMessage()); 00289 returnValue = samIn.GetStatus(); 00290 } 00291 return(returnValue); 00292 } 00293 00294 00295 template <class PILEUP_TYPE, class FUNC_CLASS> 00296 void Pileup<PILEUP_TYPE, FUNC_CLASS>::processAlignment(SamRecord& record) 00297 { 00298 int refPosition = record.get0BasedPosition(); 00299 int refID = record.getReferenceID(); 00300 00301 // Flush any elements from the pileup that are prior to this record 00302 // since the file is sorted, we are done with those positions. 00303 flushPileup(refID, refPosition); 00304 00305 // Loop through for each reference position covered by the record. 00306 // It is up to the PILEUP_TYPE to handle insertions/deletions, etc 00307 // that are related with the given reference position. 00308 for(; refPosition <= record.get0BasedAlignmentEnd(); ++refPosition) 00309 { 00310 addAlignmentPosition(refPosition, record); 00311 } 00312 } 00313 00314 00315 template <class PILEUP_TYPE, class FUNC_CLASS> 00316 void Pileup<PILEUP_TYPE, FUNC_CLASS>::processAlignmentRegion(SamRecord& record, 00317 int startPos, 00318 int endPos, 00319 PosList* excludeList) 00320 { 00321 int refPosition = record.get0BasedPosition(); 00322 int refID = record.getReferenceID(); 00323 00324 // Flush any elements from the pileup that are prior to this record 00325 // since the file is sorted, we are done with those positions. 00326 flushPileup(refID, refPosition); 00327 00328 // Check if the region starts after this reference starts. If so, 00329 // we only want to start adding at the region start position. 00330 if(startPos > refPosition) 00331 { 00332 refPosition = startPos; 00333 } 00334 00335 // Loop through for each reference position covered by the record. 00336 // It is up to the PILEUP_TYPE to handle insertions/deletions, etc 00337 // that are related with the given reference position. 00338 for(; refPosition <= record.get0BasedAlignmentEnd(); ++refPosition) 00339 { 00340 // Check to see if we have gone past the end of the region, in which 00341 // case we can stop processing this record. Check >= since the 00342 // end position is not in the region. 00343 if((endPos != -1) && (refPosition >= endPos)) 00344 { 00345 break; 00346 } 00347 00348 // Check to see if this position is in the exclude list. 00349 bool addPos = true; 00350 if(excludeList != NULL) 00351 { 00352 // There is an exclude list, so lookup the position. 00353 if(excludeList->hasPosition(refID, refPosition)) 00354 { 00355 // This position is in the exclude list, so don't add it. 00356 addPos = false; 00357 } 00358 } 00359 if(addPos) 00360 { 00361 addAlignmentPosition(refPosition, record); 00362 } 00363 } 00364 } 00365 00366 00367 template <class PILEUP_TYPE, class FUNC_CLASS> 00368 void Pileup<PILEUP_TYPE, FUNC_CLASS>::flushPileup() 00369 { 00370 // while there are still entries between the head and tail, flush, 00371 // but no need to flush if pileupTail == -1 because in that case 00372 // no entries have been added 00373 while ((pileupHead <= pileupTail) && (pileupTail != -1)) 00374 { 00375 flushPileup(pileupHead+1); 00376 } 00377 pileupStart = pileupHead = 0; 00378 pileupTail = -1; 00379 } 00380 00381 00382 // Always need the reference position. 00383 template <class PILEUP_TYPE, class FUNC_CLASS> 00384 void Pileup<PILEUP_TYPE, FUNC_CLASS>::addAlignmentPosition(int refPosition, 00385 SamRecord& record) 00386 { 00387 int offset = 0; 00388 try{ 00389 offset = pileupPosition(refPosition); 00390 } 00391 catch(std::runtime_error& err) 00392 { 00393 const char* overflowErr = "Overflow on the pileup buffer:"; 00394 String errorMessage = err.what(); 00395 if(strncmp(err.what(), overflowErr, strlen(overflowErr)) == 0) 00396 { 00397 00398 errorMessage += "\n\tPileup Buffer Overflow: recordName = "; 00399 errorMessage += record.getReadName(); 00400 errorMessage += "; Cigar = "; 00401 errorMessage += record.getCigar(); 00402 } 00403 throw std::runtime_error(errorMessage.c_str()); 00404 } 00405 00406 if((offset < 0) || (offset >= pileupWindow)) 00407 { 00408 std::cerr << "Pileup Buffer Overflow: position = " << refPosition 00409 << "; refID = " << record.getReferenceID() 00410 << "; recStartPos = " << record.get1BasedPosition() 00411 << "; pileupStart = " << pileupStart 00412 << "; pileupHead = " << pileupHead 00413 << "; pileupTail = " << pileupTail; 00414 } 00415 00416 addElement(myElements[offset], record); 00417 } 00418 00419 00420 template <class PILEUP_TYPE, class FUNC_CLASS> 00421 void Pileup<PILEUP_TYPE, FUNC_CLASS>::flushPileup(int refID, int position) 00422 { 00423 // if new chromosome, flush the entire pileup. 00424 if(refID != myCurrentRefID) 00425 { 00426 // New chromosome, flush everything. 00427 flushPileup(); 00428 myCurrentRefID = refID; 00429 } 00430 else 00431 { 00432 // on the same chromosome, so flush just up to this new position. 00433 flushPileup(position); 00434 } 00435 } 00436 00437 00438 template <class PILEUP_TYPE, class FUNC_CLASS> 00439 void Pileup<PILEUP_TYPE, FUNC_CLASS>::flushPileup(int position) 00440 { 00441 // Flush up to this new position, but no reason to flush if 00442 // pileupHead has not been set. 00443 while((pileupHead < position) && (pileupHead <= pileupTail)) 00444 { 00445 analyzeHead(); 00446 00447 pileupHead++; 00448 00449 if(pileupHead - pileupStart >= pileupWindow) 00450 pileupStart += pileupWindow; 00451 } 00452 00453 if(pileupHead > pileupTail) 00454 { 00455 // All positions have been flushed, so reset pileup info 00456 pileupHead = pileupStart = 0; 00457 pileupTail = -1; 00458 } 00459 } 00460 00461 00462 // Get the position in the myElements container that is associated 00463 // with the specified position. If the specified position cannot 00464 // fit within the myElements container, -1 is returned. 00465 template <class PILEUP_TYPE, class FUNC_CLASS> 00466 int Pileup<PILEUP_TYPE, FUNC_CLASS>::pileupPosition(int position) 00467 { 00468 // Check to see if this is the first position (pileupTail == -1) 00469 if(pileupTail == -1) 00470 { 00471 pileupStart = pileupHead = position; 00472 // This is the first time this position is being used, so 00473 // reset the element. 00474 resetElement(myElements[0], position); 00475 pileupTail = position; 00476 return(0); 00477 } 00478 00479 00480 if((position < pileupHead) || (position > (pileupHead + pileupWindow))) 00481 { 00482 String errorMessage = 00483 "Overflow on the pileup buffer: specifiedPosition = "; 00484 errorMessage += position; 00485 errorMessage += ", pileup buffer start position: "; 00486 errorMessage += pileupHead; 00487 errorMessage += ", pileup buffer end position: "; 00488 errorMessage += pileupHead + pileupWindow; 00489 00490 throw std::runtime_error(errorMessage.c_str()); 00491 } 00492 00493 // int offset = position - pileupStart; 00494 int offset = position - pileupStart; 00495 00496 if(offset >= pileupWindow) 00497 { 00498 offset -= pileupWindow; 00499 } 00500 00501 // Check to see if position is past the end of the currently 00502 // setup pileup positions. 00503 while(position > pileupTail) 00504 { 00505 // Increment pileupTail to the next position since the current 00506 // pileupTail is already in use. 00507 ++pileupTail; 00508 00509 // Figure out the offset for this next position. 00510 offset = pileupTail - pileupStart; 00511 if(offset >= pileupWindow) 00512 { 00513 offset -= pileupWindow; 00514 } 00515 00516 // This is the first time this position is being used, so 00517 // reset the element. 00518 resetElement(myElements[offset], pileupTail); 00519 } 00520 00521 return(offset); 00522 } 00523 00524 00525 template <class PILEUP_TYPE, class FUNC_CLASS> 00526 void Pileup<PILEUP_TYPE, FUNC_CLASS>::resetElement(PILEUP_TYPE& element, 00527 int position) 00528 { 00529 element.reset(position); 00530 } 00531 00532 00533 template <class PILEUP_TYPE, class FUNC_CLASS> 00534 void Pileup<PILEUP_TYPE, FUNC_CLASS>::addElement(PILEUP_TYPE& element, 00535 SamRecord& record) 00536 { 00537 element.addEntry(record); 00538 } 00539 00540 00541 template <class PILEUP_TYPE, class FUNC_CLASS> 00542 void Pileup<PILEUP_TYPE, FUNC_CLASS>::analyzeElement(PILEUP_TYPE& element) 00543 { 00544 myAnalyzeFuncPtr(element); 00545 } 00546 00547 00548 template <class PILEUP_TYPE, class FUNC_CLASS> 00549 void Pileup<PILEUP_TYPE, FUNC_CLASS>::analyzeHead() 00550 { 00551 myAnalyzeFuncPtr(myElements[pileupHead - pileupStart]); 00552 } 00553 00554 00555 #endif