Pileup.h

00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #ifndef __PILEUP_H__
00019 #define __PILEUP_H__
00020 
00021 #include <stdexcept>
00022 #include "SamFile.h"
00023 #include "PosList.h"
00024 
00025 
00026 class PileupHelper
00027 {
00028 public:
00029     static const int DEFAULT_WINDOW_SIZE = 1024;
00030 };
00031 
00032 template <class PILEUP_TYPE>
00033 class defaultPileup
00034 {
00035 public:
00036     bool operator() (PILEUP_TYPE& element)
00037     {
00038         element.analyze();
00039         return(true);
00040     }
00041     void analyze(PILEUP_TYPE element)
00042     {
00043         element.analyze();
00044     }
00045 };
00046 
00047 template <class PILEUP_TYPE>
00048 void defaultPileupAnalyze(PILEUP_TYPE& element)
00049 {
00050     element.analyze();
00051 }
00052 
00053 
00054 /// Class to perform a pileup of all reads by position, assuming
00055 /// the reads are coordinate sorted.
00056 template <class PILEUP_TYPE, 
00057           class FUNC_CLASS = defaultPileup<PILEUP_TYPE> >
00058 class Pileup
00059 {
00060 public:
00061     /// Constructor using the default maximum number of bases a read spans.
00062     Pileup(const FUNC_CLASS& fp = FUNC_CLASS());
00063 
00064     /// Constructor that sets the maximum number of bases a read spans.
00065     /// This is the "window" the length of the buffer that holds
00066     /// the pileups for each position until the read start has moved
00067     /// past the position.
00068     Pileup(int window, const FUNC_CLASS& fp = FUNC_CLASS());
00069 
00070     /// Perform pileup with a reference.
00071     Pileup(const std::string& refSeqFileName,
00072            const FUNC_CLASS& fp = FUNC_CLASS());
00073 
00074     /// Perform pileup with a reference and a specified window size.
00075     Pileup(int window, const std::string& refSeqFileName, 
00076            const FUNC_CLASS& fp = FUNC_CLASS());
00077 
00078     /// Destructor
00079     virtual ~Pileup();
00080 
00081     /// Performs a pileup on the specified file.
00082     /// \param excludeFlag if specified, if any bit set in the exclude flag
00083     ///                    is set in the record's flag, it will be dropped.
00084     /// Defaulted to exclude:
00085     ///    * unmapped, 
00086     ///    * not primary alignment
00087     ///    * failed platform/vendor quality check
00088     ///    * PCR or optical duplicate
00089     /// \param includeFlag if specified, every bit must be set in the 
00090     ///                    record's flag for it to be included - 
00091     ///                    defaulted to 0, no bits are required to be set.
00092     /// \return 0 for success and non-zero for failure.
00093     virtual int processFile(const std::string& fileName, 
00094                             uint16_t excludeFlag = 0x0704, 
00095                             uint16_t includeFlag = 0);
00096 
00097     /// Add an alignment to the pileup.
00098     virtual void processAlignment(SamRecord& record);
00099    
00100     /// Add only positions that fall within the specified region of the
00101     /// alignment to the pileup and outside of the specified excluded positions.
00102     /// \param record alignment to be added to the pileup.
00103     /// \param startPos 0-based start position of the bases that should be
00104     ///                 added to the pileup.
00105     /// \param endPos   0-based end position of the bases that should be added
00106     ///                 to the pileup (this position is not added).
00107     ///                 Set to -1 if there is no end position to the region.
00108     /// \param excludeList list of refID/positions to exclude from processing.
00109     virtual void processAlignmentRegion(SamRecord& record,
00110                                         int startPos,
00111                                         int endPos,
00112                                         PosList* excludeList = NULL);
00113    
00114     /// Done processing, flush every position that is currently being stored
00115     /// in the pileup.
00116     void flushPileup();
00117 
00118 protected:
00119     FUNC_CLASS myAnalyzeFuncPtr;
00120 
00121     // Always need the reference position.
00122     void addAlignmentPosition(int refPosition, SamRecord& record);
00123 
00124 
00125     virtual void flushPileup(int refID, int refPosition);
00126     void flushPileup(int refPosition);
00127     
00128     // Get the position in the myElements container that is associated
00129     // with the specified position.  If the specified position cannot
00130     // fit within the myElements container, -1 is returned.
00131     int pileupPosition(int refPosition);
00132 
00133     virtual void resetElement(PILEUP_TYPE& element, int position);
00134     virtual void addElement(PILEUP_TYPE& element, SamRecord& record);
00135     virtual void analyzeElement(PILEUP_TYPE& element);
00136     virtual void analyzeHead();
00137 
00138     std::vector<PILEUP_TYPE> myElements;
00139 
00140     int    pileupStart;
00141     int    pileupHead;
00142     int    pileupTail; // last used position
00143     int    pileupWindow;
00144 
00145     int myCurrentRefID;
00146 
00147     GenomeSequence* myRefPtr;
00148 };
00149 
00150 
00151 template <class PILEUP_TYPE, class FUNC_CLASS>
00152 Pileup<PILEUP_TYPE, FUNC_CLASS>::Pileup(const FUNC_CLASS& fp)
00153     : myAnalyzeFuncPtr(fp),
00154       myElements(),
00155       pileupStart(0),
00156       pileupHead(0),
00157       pileupTail(-1),
00158       pileupWindow(PileupHelper::DEFAULT_WINDOW_SIZE),
00159       myCurrentRefID(-2),
00160       myRefPtr(NULL)
00161 {
00162     // Not using pointers since this is templated.
00163     myElements.resize(pileupWindow);
00164 }
00165 
00166 
00167 template <class PILEUP_TYPE, class FUNC_CLASS>
00168 Pileup<PILEUP_TYPE, FUNC_CLASS>::Pileup(int window, const FUNC_CLASS& fp)
00169     : myAnalyzeFuncPtr(fp),
00170       myElements(),
00171       pileupStart(0),
00172       pileupHead(0),
00173       pileupTail(-1),
00174       pileupWindow(window),
00175       myCurrentRefID(-2),
00176       myRefPtr(NULL)
00177 {
00178     // Not using pointers since this is templated.
00179     myElements.resize(window);
00180 }
00181 
00182 
00183 template <class PILEUP_TYPE, class FUNC_CLASS>
00184 Pileup<PILEUP_TYPE, FUNC_CLASS>::Pileup(const std::string& refSeqFileName, const FUNC_CLASS& fp)
00185     : myAnalyzeFuncPtr(fp),
00186       myElements(),
00187       pileupStart(0),
00188       pileupHead(0),
00189       pileupTail(-1),
00190       pileupWindow(PileupHelper::DEFAULT_WINDOW_SIZE),
00191       myCurrentRefID(-2),
00192       myRefPtr(NULL)
00193 {
00194     myRefPtr = new GenomeSequence(refSeqFileName.c_str());
00195 
00196     // Not using pointers since this is templated.
00197     myElements.resize(pileupWindow);
00198 
00199     PILEUP_TYPE::setReference(myRefPtr);
00200 }
00201 
00202 
00203 template <class PILEUP_TYPE, class FUNC_CLASS>
00204 Pileup<PILEUP_TYPE, FUNC_CLASS>::Pileup(int window, const std::string& refSeqFileName, const FUNC_CLASS& fp)
00205     : myAnalyzeFuncPtr(fp),
00206       myElements(),
00207       pileupStart(0),
00208       pileupHead(0),
00209       pileupTail(-1),
00210       pileupWindow(window),
00211       myCurrentRefID(-2),
00212       myRefPtr(NULL)
00213 {
00214     myRefPtr = new GenomeSequence(refSeqFileName.c_str());
00215 
00216     // Not using pointers since this is templated.
00217     myElements.resize(window);
00218 
00219     PILEUP_TYPE::setReference(myRefPtr);
00220 }
00221 
00222 
00223 template <class PILEUP_TYPE, class FUNC_CLASS>
00224 Pileup<PILEUP_TYPE, FUNC_CLASS>::~Pileup()
00225 {
00226     flushPileup();
00227     if(myRefPtr != NULL)
00228     {
00229         delete myRefPtr;
00230         myRefPtr = NULL;
00231     }
00232 }
00233 
00234 template <class PILEUP_TYPE, class FUNC_CLASS>
00235 int Pileup<PILEUP_TYPE, FUNC_CLASS>::processFile(const std::string& fileName, 
00236                                                  uint16_t excludeFlag,
00237                                                  uint16_t includeFlag)
00238 {
00239     SamFile samIn;
00240     SamFileHeader header;
00241     SamRecord record;
00242     
00243     if(myRefPtr != NULL)
00244     {
00245         samIn.SetReference(myRefPtr);
00246     }
00247 
00248     if(!samIn.OpenForRead(fileName.c_str()))
00249     {
00250         fprintf(stderr, "%s\n", samIn.GetStatusMessage());
00251         return(samIn.GetStatus());
00252     }
00253     
00254     if(!samIn.ReadHeader(header))
00255     {
00256         fprintf(stderr, "%s\n", samIn.GetStatusMessage());
00257         return(samIn.GetStatus());
00258     }
00259 
00260     // The file needs to be sorted by coordinate.
00261     samIn.setSortedValidation(SamFile::COORDINATE);
00262 
00263     // Iterate over all records
00264     while (samIn.ReadRecord(header, record))
00265     {
00266         uint16_t flag = record.getFlag();
00267         if(flag & excludeFlag)
00268         {
00269             // This record has an excluded flag set, 
00270             // so continue to the next one.
00271             continue;
00272         }
00273         if((flag & includeFlag) != includeFlag)
00274         {
00275             // This record does not have all required flags set, 
00276             // so continue to the next one.
00277             continue;
00278         }
00279         processAlignment(record);
00280     }
00281 
00282     flushPileup();
00283 
00284     int returnValue = 0;
00285     if(samIn.GetStatus() != SamStatus::NO_MORE_RECS)
00286     {
00287         // Failed to read a record.
00288         fprintf(stderr, "%s\n", samIn.GetStatusMessage());
00289         returnValue = samIn.GetStatus();
00290     }
00291     return(returnValue);  
00292 }
00293 
00294 
00295 template <class PILEUP_TYPE, class FUNC_CLASS>
00296 void Pileup<PILEUP_TYPE, FUNC_CLASS>::processAlignment(SamRecord& record)
00297 {
00298     int refPosition = record.get0BasedPosition();
00299     int refID = record.getReferenceID();
00300 
00301     // Flush any elements from the pileup that are prior to this record
00302     // since the file is sorted, we are done with those positions.
00303     flushPileup(refID, refPosition);
00304     
00305     // Loop through for each reference position covered by the record.
00306     // It is up to the PILEUP_TYPE to handle insertions/deletions, etc
00307     // that are related with the given reference position.
00308     for(; refPosition <= record.get0BasedAlignmentEnd(); ++refPosition)
00309     {
00310         addAlignmentPosition(refPosition, record);
00311     }
00312 }
00313 
00314 
00315 template <class PILEUP_TYPE, class FUNC_CLASS>
00316 void Pileup<PILEUP_TYPE, FUNC_CLASS>::processAlignmentRegion(SamRecord& record,
00317                                                              int startPos,
00318                                                              int endPos,
00319                                                              PosList* excludeList)
00320 {
00321     int refPosition = record.get0BasedPosition();
00322     int refID = record.getReferenceID();
00323 
00324     // Flush any elements from the pileup that are prior to this record
00325     // since the file is sorted, we are done with those positions.
00326     flushPileup(refID, refPosition);
00327     
00328     // Check if the region starts after this reference starts.  If so,
00329     // we only want to start adding at the region start position.
00330     if(startPos > refPosition)
00331     {
00332         refPosition = startPos;
00333     }
00334 
00335     // Loop through for each reference position covered by the record.
00336     // It is up to the PILEUP_TYPE to handle insertions/deletions, etc
00337     // that are related with the given reference position.
00338     for(; refPosition <= record.get0BasedAlignmentEnd(); ++refPosition)
00339     {
00340         // Check to see if we have gone past the end of the region, in which
00341         // case we can stop processing this record.  Check >= since the
00342         // end position is not in the region.
00343         if((endPos != -1) && (refPosition >= endPos))
00344         {
00345             break;
00346         }
00347 
00348         // Check to see if this position is in the exclude list.
00349         bool addPos = true;
00350         if(excludeList != NULL)
00351         {
00352             // There is an exclude list, so lookup the position.
00353             if(excludeList->hasPosition(refID, refPosition))
00354             {
00355                 // This position is in the exclude list, so don't add it.
00356                 addPos = false;
00357             }
00358         }
00359         if(addPos)
00360         {
00361             addAlignmentPosition(refPosition, record);
00362         }
00363     }
00364 }
00365 
00366 
00367 template <class PILEUP_TYPE, class FUNC_CLASS>
00368 void Pileup<PILEUP_TYPE, FUNC_CLASS>::flushPileup()
00369 {
00370     // while there are still entries between the head and tail, flush,
00371     // but no need to flush if pileupTail == -1 because in that case 
00372     // no entries have been added
00373     while ((pileupHead <= pileupTail) && (pileupTail != -1))
00374     {
00375         flushPileup(pileupHead+1);
00376     }
00377     pileupStart = pileupHead = 0;
00378     pileupTail = -1;
00379 }
00380 
00381 
00382 // Always need the reference position.
00383 template <class PILEUP_TYPE, class FUNC_CLASS>
00384 void Pileup<PILEUP_TYPE, FUNC_CLASS>::addAlignmentPosition(int refPosition,
00385                                                            SamRecord& record)
00386 {
00387     int offset = pileupPosition(refPosition);
00388     
00389     if((offset < 0) || (offset >= pileupWindow))
00390     {
00391         std::cerr << "Pileup Buffer Overflow: position = " << refPosition
00392                   << "; refID = " << record.getReferenceID() 
00393                   << "; recStartPos = " << record.get1BasedPosition()
00394                   << "; pileupStart = " << pileupStart
00395                   << "; pileupHead = " << pileupHead
00396                   << "; pileupTail = " << pileupTail;
00397     }
00398 
00399     addElement(myElements[offset], record);
00400 }
00401 
00402 
00403 template <class PILEUP_TYPE, class FUNC_CLASS>
00404 void Pileup<PILEUP_TYPE, FUNC_CLASS>::flushPileup(int refID, int position)
00405 {
00406     // if new chromosome, flush the entire pileup.
00407     if(refID != myCurrentRefID)
00408     {
00409         // New chromosome, flush everything.
00410         flushPileup();
00411         myCurrentRefID = refID;
00412     }
00413     else
00414     {
00415         // on the same chromosome, so flush just up to this new position.
00416         flushPileup(position);
00417     }
00418 }
00419 
00420 
00421 template <class PILEUP_TYPE, class FUNC_CLASS>
00422 void Pileup<PILEUP_TYPE, FUNC_CLASS>::flushPileup(int position)
00423 {
00424     // Flush up to this new position, but no reason to flush if
00425     // pileupHead has not been set.
00426     while((pileupHead < position) && (pileupHead <= pileupTail))
00427     {
00428         analyzeHead();
00429 
00430         pileupHead++;
00431         
00432         if(pileupHead - pileupStart >= pileupWindow)
00433             pileupStart += pileupWindow;
00434     }
00435 
00436     if(pileupHead > pileupTail)
00437     {
00438         // All positions have been flushed, so reset pileup info
00439         pileupHead = pileupStart = 0;
00440         pileupTail = -1;
00441     }
00442 }
00443 
00444 
00445 // Get the position in the myElements container that is associated
00446 // with the specified position.  If the specified position cannot
00447 // fit within the myElements container, -1 is returned.
00448 template <class PILEUP_TYPE, class FUNC_CLASS>
00449 int Pileup<PILEUP_TYPE, FUNC_CLASS>::pileupPosition(int position)
00450 {
00451     // Check to see if this is the first position (pileupTail == -1)
00452     if(pileupTail == -1)
00453     {
00454         pileupStart = pileupHead = position;
00455         // This is the first time this position is being used, so
00456         // reset the element.
00457         resetElement(myElements[0], position);
00458         pileupTail = position;
00459         return(0);
00460     }
00461 
00462 
00463     if((position < pileupHead) || (position > (pileupHead + pileupWindow)))
00464     {
00465         String errorMessage =
00466             "Overflow on the pileup buffer: specifiedPosition = ";
00467         errorMessage += position;
00468         errorMessage += ", pileup buffer start position: ";
00469         errorMessage += pileupHead;
00470         errorMessage += ", pileup buffer end position: ";
00471         errorMessage += pileupHead + pileupWindow;
00472 
00473         throw std::runtime_error(errorMessage.c_str());
00474     }    
00475 
00476     //   int offset = position - pileupStart;
00477     int offset = position - pileupStart;
00478     
00479     if(offset >= pileupWindow)
00480     {
00481         offset -= pileupWindow;
00482     }
00483 
00484     // Check to see if position is past the end of the currently
00485     // setup pileup positions.
00486     while(position > pileupTail)
00487     {
00488         // Increment pileupTail to the next position since the current
00489         // pileupTail is already in use.
00490         ++pileupTail;
00491 
00492         // Figure out the offset for this next position.
00493         offset = pileupTail - pileupStart;
00494         if(offset >= pileupWindow)
00495         {
00496             offset -= pileupWindow;
00497         }
00498 
00499         // This is the first time this position is being used, so
00500         // reset the element.
00501         resetElement(myElements[offset], pileupTail);
00502     }
00503 
00504     return(offset);
00505 }
00506 
00507 
00508 template <class PILEUP_TYPE, class FUNC_CLASS>
00509 void Pileup<PILEUP_TYPE, FUNC_CLASS>::resetElement(PILEUP_TYPE& element,
00510                                                    int position)
00511 {
00512     element.reset(position);
00513 }
00514 
00515 
00516 template <class PILEUP_TYPE, class FUNC_CLASS>
00517 void Pileup<PILEUP_TYPE, FUNC_CLASS>::addElement(PILEUP_TYPE& element,
00518                                                  SamRecord& record)
00519 {
00520     element.addEntry(record);
00521 }
00522 
00523 
00524 template <class PILEUP_TYPE, class FUNC_CLASS>
00525 void Pileup<PILEUP_TYPE, FUNC_CLASS>::analyzeElement(PILEUP_TYPE& element)
00526 {
00527     myAnalyzeFuncPtr(element);
00528 }
00529 
00530 
00531 template <class PILEUP_TYPE, class FUNC_CLASS>
00532 void Pileup<PILEUP_TYPE, FUNC_CLASS>::analyzeHead()
00533 {
00534     myAnalyzeFuncPtr(myElements[pileupHead - pileupStart]);
00535 }
00536 
00537 
00538 #endif
Generated on Mon Feb 11 13:45:17 2013 for libStatGen Software by  doxygen 1.6.3