libStatGen Software  1
Pileup.h
00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #ifndef __PILEUP_H__
00019 #define __PILEUP_H__
00020 
00021 #include <stdexcept>
00022 #include "SamFile.h"
00023 #include "PosList.h"
00024 
00025 
00026 class PileupHelper
00027 {
00028 public:
00029     static const int DEFAULT_WINDOW_SIZE = 1024;
00030 };
00031 
00032 template <class PILEUP_TYPE>
00033 class defaultPileup
00034 {
00035 public:
00036     bool operator() (PILEUP_TYPE& element)
00037     {
00038         element.analyze();
00039         return(true);
00040     }
00041     void analyze(PILEUP_TYPE element)
00042     {
00043         element.analyze();
00044     }
00045 };
00046 
00047 template <class PILEUP_TYPE>
00048 void defaultPileupAnalyze(PILEUP_TYPE& element)
00049 {
00050     element.analyze();
00051 }
00052 
00053 
00054 /// Class to perform a pileup of all reads by position, assuming
00055 /// the reads are coordinate sorted.
00056 template <class PILEUP_TYPE, 
00057           class FUNC_CLASS = defaultPileup<PILEUP_TYPE> >
00058 class Pileup
00059 {
00060 public:
00061     /// Constructor using the default maximum number of bases a read spans.
00062     Pileup(const FUNC_CLASS& fp = FUNC_CLASS());
00063 
00064     /// Constructor that sets the maximum number of bases a read spans.
00065     /// This is the "window" the length of the buffer that holds
00066     /// the pileups for each position until the read start has moved
00067     /// past the position.
00068     Pileup(int window, const FUNC_CLASS& fp = FUNC_CLASS());
00069 
00070     /// Perform pileup with a reference.
00071     Pileup(const std::string& refSeqFileName,
00072            const FUNC_CLASS& fp = FUNC_CLASS());
00073 
00074     /// Perform pileup with a reference and a specified window size.
00075     Pileup(int window, const std::string& refSeqFileName, 
00076            const FUNC_CLASS& fp = FUNC_CLASS());
00077 
00078     /// Destructor
00079     virtual ~Pileup();
00080 
00081     /// Performs a pileup on the specified file.
00082     /// \param excludeFlag if specified, if any bit set in the exclude flag
00083     ///                    is set in the record's flag, it will be dropped.
00084     /// Defaulted to exclude:
00085     ///    * unmapped, 
00086     ///    * not primary alignment
00087     ///    * failed platform/vendor quality check
00088     ///    * PCR or optical duplicate
00089     /// \param includeFlag if specified, every bit must be set in the 
00090     ///                    record's flag for it to be included - 
00091     ///                    defaulted to 0, no bits are required to be set.
00092     /// \return 0 for success and non-zero for failure.
00093     virtual int processFile(const std::string& fileName, 
00094                             uint16_t excludeFlag = 0x0704, 
00095                             uint16_t includeFlag = 0);
00096 
00097     /// Add an alignment to the pileup.
00098     virtual void processAlignment(SamRecord& record);
00099    
00100     /// Add only positions that fall within the specified region of the
00101     /// alignment to the pileup and outside of the specified excluded positions.
00102     /// \param record alignment to be added to the pileup.
00103     /// \param startPos 0-based start position of the bases that should be
00104     ///                 added to the pileup.
00105     /// \param endPos   0-based end position of the bases that should be added
00106     ///                 to the pileup (this position is not added).
00107     ///                 Set to -1 if there is no end position to the region.
00108     /// \param excludeList list of refID/positions to exclude from processing.
00109     virtual void processAlignmentRegion(SamRecord& record,
00110                                         int startPos,
00111                                         int endPos,
00112                                         PosList* excludeList = NULL);
00113    
00114     /// Done processing, flush every position that is currently being stored
00115     /// in the pileup.
00116     void flushPileup();
00117 
00118 protected:
00119     FUNC_CLASS myAnalyzeFuncPtr;
00120 
00121     // Always need the reference position.
00122     void addAlignmentPosition(int refPosition, SamRecord& record);
00123 
00124 
00125     virtual void flushPileup(int refID, int refPosition);
00126     void flushPileup(int refPosition);
00127     
00128     // Get the position in the myElements container that is associated
00129     // with the specified position.  If the specified position cannot
00130     // fit within the myElements container, -1 is returned.
00131     int pileupPosition(int refPosition);
00132 
00133     virtual void resetElement(PILEUP_TYPE& element, int position);
00134     virtual void addElement(PILEUP_TYPE& element, SamRecord& record);
00135     virtual void analyzeElement(PILEUP_TYPE& element);
00136     virtual void analyzeHead();
00137 
00138     std::vector<PILEUP_TYPE> myElements;
00139 
00140     int    pileupStart;
00141     int    pileupHead;
00142     int    pileupTail; // last used position
00143     int    pileupWindow;
00144 
00145     int myCurrentRefID;
00146 
00147     GenomeSequence* myRefPtr;
00148 };
00149 
00150 
00151 template <class PILEUP_TYPE, class FUNC_CLASS>
00152 Pileup<PILEUP_TYPE, FUNC_CLASS>::Pileup(const FUNC_CLASS& fp)
00153     : myAnalyzeFuncPtr(fp),
00154       myElements(),
00155       pileupStart(0),
00156       pileupHead(0),
00157       pileupTail(-1),
00158       pileupWindow(PileupHelper::DEFAULT_WINDOW_SIZE),
00159       myCurrentRefID(-2),
00160       myRefPtr(NULL)
00161 {
00162     // Not using pointers since this is templated.
00163     myElements.resize(pileupWindow);
00164 }
00165 
00166 
00167 template <class PILEUP_TYPE, class FUNC_CLASS>
00168 Pileup<PILEUP_TYPE, FUNC_CLASS>::Pileup(int window, const FUNC_CLASS& fp)
00169     : myAnalyzeFuncPtr(fp),
00170       myElements(),
00171       pileupStart(0),
00172       pileupHead(0),
00173       pileupTail(-1),
00174       pileupWindow(window),
00175       myCurrentRefID(-2),
00176       myRefPtr(NULL)
00177 {
00178     // Not using pointers since this is templated.
00179     myElements.resize(window);
00180 }
00181 
00182 
00183 template <class PILEUP_TYPE, class FUNC_CLASS>
00184 Pileup<PILEUP_TYPE, FUNC_CLASS>::Pileup(const std::string& refSeqFileName, const FUNC_CLASS& fp)
00185     : myAnalyzeFuncPtr(fp),
00186       myElements(),
00187       pileupStart(0),
00188       pileupHead(0),
00189       pileupTail(-1),
00190       pileupWindow(PileupHelper::DEFAULT_WINDOW_SIZE),
00191       myCurrentRefID(-2),
00192       myRefPtr(NULL)
00193 {
00194     myRefPtr = new GenomeSequence(refSeqFileName.c_str());
00195 
00196     // Not using pointers since this is templated.
00197     myElements.resize(pileupWindow);
00198 
00199     PILEUP_TYPE::setReference(myRefPtr);
00200 }
00201 
00202 
00203 template <class PILEUP_TYPE, class FUNC_CLASS>
00204 Pileup<PILEUP_TYPE, FUNC_CLASS>::Pileup(int window, const std::string& refSeqFileName, const FUNC_CLASS& fp)
00205     : myAnalyzeFuncPtr(fp),
00206       myElements(),
00207       pileupStart(0),
00208       pileupHead(0),
00209       pileupTail(-1),
00210       pileupWindow(window),
00211       myCurrentRefID(-2),
00212       myRefPtr(NULL)
00213 {
00214     myRefPtr = new GenomeSequence(refSeqFileName.c_str());
00215 
00216     // Not using pointers since this is templated.
00217     myElements.resize(window);
00218 
00219     PILEUP_TYPE::setReference(myRefPtr);
00220 }
00221 
00222 
00223 template <class PILEUP_TYPE, class FUNC_CLASS>
00224 Pileup<PILEUP_TYPE, FUNC_CLASS>::~Pileup()
00225 {
00226     flushPileup();
00227     if(myRefPtr != NULL)
00228     {
00229         delete myRefPtr;
00230         myRefPtr = NULL;
00231     }
00232 }
00233 
00234 template <class PILEUP_TYPE, class FUNC_CLASS>
00235 int Pileup<PILEUP_TYPE, FUNC_CLASS>::processFile(const std::string& fileName, 
00236                                                  uint16_t excludeFlag,
00237                                                  uint16_t includeFlag)
00238 {
00239     SamFile samIn;
00240     SamFileHeader header;
00241     SamRecord record;
00242     
00243     if(myRefPtr != NULL)
00244     {
00245         samIn.SetReference(myRefPtr);
00246     }
00247 
00248     if(!samIn.OpenForRead(fileName.c_str()))
00249     {
00250         fprintf(stderr, "%s\n", samIn.GetStatusMessage());
00251         return(samIn.GetStatus());
00252     }
00253     
00254     if(!samIn.ReadHeader(header))
00255     {
00256         fprintf(stderr, "%s\n", samIn.GetStatusMessage());
00257         return(samIn.GetStatus());
00258     }
00259 
00260     // The file needs to be sorted by coordinate.
00261     samIn.setSortedValidation(SamFile::COORDINATE);
00262 
00263     // Iterate over all records
00264     while (samIn.ReadRecord(header, record))
00265     {
00266         uint16_t flag = record.getFlag();
00267         if(flag & excludeFlag)
00268         {
00269             // This record has an excluded flag set, 
00270             // so continue to the next one.
00271             continue;
00272         }
00273         if((flag & includeFlag) != includeFlag)
00274         {
00275             // This record does not have all required flags set, 
00276             // so continue to the next one.
00277             continue;
00278         }
00279         processAlignment(record);
00280     }
00281 
00282     flushPileup();
00283 
00284     int returnValue = 0;
00285     if(samIn.GetStatus() != SamStatus::NO_MORE_RECS)
00286     {
00287         // Failed to read a record.
00288         fprintf(stderr, "%s\n", samIn.GetStatusMessage());
00289         returnValue = samIn.GetStatus();
00290     }
00291     return(returnValue);  
00292 }
00293 
00294 
00295 template <class PILEUP_TYPE, class FUNC_CLASS>
00296 void Pileup<PILEUP_TYPE, FUNC_CLASS>::processAlignment(SamRecord& record)
00297 {
00298     int refPosition = record.get0BasedPosition();
00299     int refID = record.getReferenceID();
00300 
00301     // Flush any elements from the pileup that are prior to this record
00302     // since the file is sorted, we are done with those positions.
00303     flushPileup(refID, refPosition);
00304     
00305     // Loop through for each reference position covered by the record.
00306     // It is up to the PILEUP_TYPE to handle insertions/deletions, etc
00307     // that are related with the given reference position.
00308     for(; refPosition <= record.get0BasedAlignmentEnd(); ++refPosition)
00309     {
00310         addAlignmentPosition(refPosition, record);
00311     }
00312 }
00313 
00314 
00315 template <class PILEUP_TYPE, class FUNC_CLASS>
00316 void Pileup<PILEUP_TYPE, FUNC_CLASS>::processAlignmentRegion(SamRecord& record,
00317                                                              int startPos,
00318                                                              int endPos,
00319                                                              PosList* excludeList)
00320 {
00321     int refPosition = record.get0BasedPosition();
00322     int refID = record.getReferenceID();
00323 
00324     // Flush any elements from the pileup that are prior to this record
00325     // since the file is sorted, we are done with those positions.
00326     flushPileup(refID, refPosition);
00327     
00328     // Check if the region starts after this reference starts.  If so,
00329     // we only want to start adding at the region start position.
00330     if(startPos > refPosition)
00331     {
00332         refPosition = startPos;
00333     }
00334 
00335     // Loop through for each reference position covered by the record.
00336     // It is up to the PILEUP_TYPE to handle insertions/deletions, etc
00337     // that are related with the given reference position.
00338     for(; refPosition <= record.get0BasedAlignmentEnd(); ++refPosition)
00339     {
00340         // Check to see if we have gone past the end of the region, in which
00341         // case we can stop processing this record.  Check >= since the
00342         // end position is not in the region.
00343         if((endPos != -1) && (refPosition >= endPos))
00344         {
00345             break;
00346         }
00347 
00348         // Check to see if this position is in the exclude list.
00349         bool addPos = true;
00350         if(excludeList != NULL)
00351         {
00352             // There is an exclude list, so lookup the position.
00353             if(excludeList->hasPosition(refID, refPosition))
00354             {
00355                 // This position is in the exclude list, so don't add it.
00356                 addPos = false;
00357             }
00358         }
00359         if(addPos)
00360         {
00361             addAlignmentPosition(refPosition, record);
00362         }
00363     }
00364 }
00365 
00366 
00367 template <class PILEUP_TYPE, class FUNC_CLASS>
00368 void Pileup<PILEUP_TYPE, FUNC_CLASS>::flushPileup()
00369 {
00370     // while there are still entries between the head and tail, flush,
00371     // but no need to flush if pileupTail == -1 because in that case 
00372     // no entries have been added
00373     while ((pileupHead <= pileupTail) && (pileupTail != -1))
00374     {
00375         flushPileup(pileupHead+1);
00376     }
00377     pileupStart = pileupHead = 0;
00378     pileupTail = -1;
00379 }
00380 
00381 
00382 // Always need the reference position.
00383 template <class PILEUP_TYPE, class FUNC_CLASS>
00384 void Pileup<PILEUP_TYPE, FUNC_CLASS>::addAlignmentPosition(int refPosition,
00385                                                            SamRecord& record)
00386 {
00387     int offset = 0;
00388     try{
00389         offset = pileupPosition(refPosition);
00390     }
00391     catch(std::runtime_error& err)
00392     {
00393         const char* overflowErr = "Overflow on the pileup buffer:";
00394         String errorMessage = err.what();
00395         if(strncmp(err.what(), overflowErr, strlen(overflowErr)) == 0)
00396         {
00397             
00398             errorMessage += "\n\tPileup Buffer Overflow: recordName = ";
00399             errorMessage += record.getReadName();
00400             errorMessage += "; Cigar = ";
00401             errorMessage += record.getCigar();
00402         }
00403         throw std::runtime_error(errorMessage.c_str());
00404     }
00405     
00406     if((offset < 0) || (offset >= pileupWindow))
00407     {
00408         std::cerr << "Pileup Buffer Overflow: position = " << refPosition
00409                   << "; refID = " << record.getReferenceID() 
00410                   << "; recStartPos = " << record.get1BasedPosition()
00411                   << "; pileupStart = " << pileupStart
00412                   << "; pileupHead = " << pileupHead
00413                   << "; pileupTail = " << pileupTail;
00414     }
00415 
00416     addElement(myElements[offset], record);
00417 }
00418 
00419 
00420 template <class PILEUP_TYPE, class FUNC_CLASS>
00421 void Pileup<PILEUP_TYPE, FUNC_CLASS>::flushPileup(int refID, int position)
00422 {
00423     // if new chromosome, flush the entire pileup.
00424     if(refID != myCurrentRefID)
00425     {
00426         // New chromosome, flush everything.
00427         flushPileup();
00428         myCurrentRefID = refID;
00429     }
00430     else
00431     {
00432         // on the same chromosome, so flush just up to this new position.
00433         flushPileup(position);
00434     }
00435 }
00436 
00437 
00438 template <class PILEUP_TYPE, class FUNC_CLASS>
00439 void Pileup<PILEUP_TYPE, FUNC_CLASS>::flushPileup(int position)
00440 {
00441     // Flush up to this new position, but no reason to flush if
00442     // pileupHead has not been set.
00443     while((pileupHead < position) && (pileupHead <= pileupTail))
00444     {
00445         analyzeHead();
00446 
00447         pileupHead++;
00448         
00449         if(pileupHead - pileupStart >= pileupWindow)
00450             pileupStart += pileupWindow;
00451     }
00452 
00453     if(pileupHead > pileupTail)
00454     {
00455         // All positions have been flushed, so reset pileup info
00456         pileupHead = pileupStart = 0;
00457         pileupTail = -1;
00458     }
00459 }
00460 
00461 
00462 // Get the position in the myElements container that is associated
00463 // with the specified position.  If the specified position cannot
00464 // fit within the myElements container, -1 is returned.
00465 template <class PILEUP_TYPE, class FUNC_CLASS>
00466 int Pileup<PILEUP_TYPE, FUNC_CLASS>::pileupPosition(int position)
00467 {
00468     // Check to see if this is the first position (pileupTail == -1)
00469     if(pileupTail == -1)
00470     {
00471         pileupStart = pileupHead = position;
00472         // This is the first time this position is being used, so
00473         // reset the element.
00474         resetElement(myElements[0], position);
00475         pileupTail = position;
00476         return(0);
00477     }
00478 
00479 
00480     if((position < pileupHead) || (position > (pileupHead + pileupWindow)))
00481     {
00482         String errorMessage =
00483             "Overflow on the pileup buffer: specifiedPosition = ";
00484         errorMessage += position;
00485         errorMessage += ", pileup buffer start position: ";
00486         errorMessage += pileupHead;
00487         errorMessage += ", pileup buffer end position: ";
00488         errorMessage += pileupHead + pileupWindow;
00489 
00490         throw std::runtime_error(errorMessage.c_str());
00491     }    
00492 
00493     //   int offset = position - pileupStart;
00494     int offset = position - pileupStart;
00495     
00496     if(offset >= pileupWindow)
00497     {
00498         offset -= pileupWindow;
00499     }
00500 
00501     // Check to see if position is past the end of the currently
00502     // setup pileup positions.
00503     while(position > pileupTail)
00504     {
00505         // Increment pileupTail to the next position since the current
00506         // pileupTail is already in use.
00507         ++pileupTail;
00508 
00509         // Figure out the offset for this next position.
00510         offset = pileupTail - pileupStart;
00511         if(offset >= pileupWindow)
00512         {
00513             offset -= pileupWindow;
00514         }
00515 
00516         // This is the first time this position is being used, so
00517         // reset the element.
00518         resetElement(myElements[offset], pileupTail);
00519     }
00520 
00521     return(offset);
00522 }
00523 
00524 
00525 template <class PILEUP_TYPE, class FUNC_CLASS>
00526 void Pileup<PILEUP_TYPE, FUNC_CLASS>::resetElement(PILEUP_TYPE& element,
00527                                                    int position)
00528 {
00529     element.reset(position);
00530 }
00531 
00532 
00533 template <class PILEUP_TYPE, class FUNC_CLASS>
00534 void Pileup<PILEUP_TYPE, FUNC_CLASS>::addElement(PILEUP_TYPE& element,
00535                                                  SamRecord& record)
00536 {
00537     element.addEntry(record);
00538 }
00539 
00540 
00541 template <class PILEUP_TYPE, class FUNC_CLASS>
00542 void Pileup<PILEUP_TYPE, FUNC_CLASS>::analyzeElement(PILEUP_TYPE& element)
00543 {
00544     myAnalyzeFuncPtr(element);
00545 }
00546 
00547 
00548 template <class PILEUP_TYPE, class FUNC_CLASS>
00549 void Pileup<PILEUP_TYPE, FUNC_CLASS>::analyzeHead()
00550 {
00551     myAnalyzeFuncPtr(myElements[pileupHead - pileupStart]);
00552 }
00553 
00554 
00555 #endif
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends