Pileup.h

00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #ifndef __PILEUP_H__
00019 #define __PILEUP_H__
00020 
00021 #include <stdexcept>
00022 #include "SamFile.h"
00023 #include "PosList.h"
00024 
00025 template <class PILEUP_TYPE>
00026 class defaultPileup
00027 {
00028 public:
00029     bool operator() (PILEUP_TYPE& element)
00030     {
00031         element.analyze();
00032         return(true);
00033     }
00034     void analyze(PILEUP_TYPE element)
00035     {
00036         element.analyze();
00037     }
00038 };
00039 
00040 template <class PILEUP_TYPE>
00041 void defaultPileupAnalyze(PILEUP_TYPE& element)
00042 {
00043     element.analyze();
00044 }
00045 
00046 
00047 template <class PILEUP_TYPE, 
00048           class FUNC_CLASS = defaultPileup<PILEUP_TYPE> >
00049 class Pileup
00050 {
00051 public:
00052     Pileup(const FUNC_CLASS& fp = FUNC_CLASS());
00053 
00054     Pileup(int window, const FUNC_CLASS& fp = FUNC_CLASS());
00055 
00056     // Perform pileup with a reference.
00057     Pileup(const std::string& refSeqFileName,
00058            const FUNC_CLASS& fp = FUNC_CLASS());
00059 
00060     // Perform pileup with a reference.
00061     Pileup(int window, const std::string& refSeqFileName, 
00062            const FUNC_CLASS& fp = FUNC_CLASS());
00063 
00064     ~Pileup();
00065 
00066     // Performs a pileup on the specified file.
00067     // Returns 0 for success and non-zero for failure.
00068     // If excludeFlag is specified, any bit may be set for the
00069     // record to be dropped.
00070     // Defaulted to exclude:
00071     //    * unmapped, 
00072     //    * not primary alignment
00073     //    * failed platform/vendor quality check
00074     //    * PCR or optical duplicate
00075     // If includeFlag is specified, every bit must be set for the
00076     // record to be included - defaulted to 0 - no bits are required to be set.
00077     virtual int processFile(const std::string& fileName, 
00078                             uint16_t excludeFlag = 0x0704, 
00079                             uint16_t includeFlag = 0);
00080 
00081     /// Add an alignment to the pileup.
00082     virtual void processAlignment(SamRecord& record);
00083    
00084     /// Add only positions that fall within the specified region of the
00085     /// alignment to the pileup and outside of the specified excluded positions.
00086     /// \param record alignment to be added to the pileup.
00087     /// \param startPos 0-based start position of the bases that should be
00088     ///                 added to the pileup.
00089     /// \param endPos   0-based end position of the bases that should be added
00090     ///                 to the pileup (this position is not added).
00091     ///                 Set to -1 if there is no end position to the region.
00092     /// \param excludeList list of refID/positions to exclude from processing.
00093     virtual void processAlignmentRegion(SamRecord& record,
00094                                         int startPos,
00095                                         int endPos,
00096                                         PosList* excludeList = NULL);
00097    
00098 
00099     void flushPileup();
00100 
00101 protected:
00102     FUNC_CLASS myAnalyzeFuncPtr;
00103 
00104     // Always need the reference position.
00105     void addAlignmentPosition(int refPosition, SamRecord& record);
00106 
00107 
00108     virtual void flushPileup(int refID, int refPosition);
00109     void flushPileup(int refPosition);
00110     
00111     // Get the position in the myElements container that is associated
00112     // with the specified position.  If the specified position cannot
00113     // fit within the myElements container, -1 is returned.
00114     int pileupPosition(int refPosition);
00115 
00116     virtual void resetElement(PILEUP_TYPE& element, int position);
00117     virtual void addElement(PILEUP_TYPE& element, SamRecord& record);
00118     virtual void analyzeElement(PILEUP_TYPE& element);
00119     virtual void analyzeHead();
00120 
00121     std::vector<PILEUP_TYPE> myElements;
00122 
00123     int    pileupStart;
00124     int    pileupHead;
00125     int    pileupTail; // last used position
00126     int    pileupWindow;
00127 
00128     int myCurrentRefID;
00129 
00130     GenomeSequence* myRefPtr;
00131 };
00132 
00133 
00134 template <class PILEUP_TYPE, class FUNC_CLASS>
00135 Pileup<PILEUP_TYPE, FUNC_CLASS>::Pileup(const FUNC_CLASS& fp)
00136     : myAnalyzeFuncPtr(fp),
00137       myElements(),
00138       pileupStart(0),
00139       pileupHead(0),
00140       pileupTail(-1),
00141       pileupWindow(1024),
00142       myCurrentRefID(-2),
00143       myRefPtr(NULL)
00144 {
00145     // Not using pointers since this is templated.
00146     myElements.resize(pileupWindow);
00147 }
00148 
00149 
00150 template <class PILEUP_TYPE, class FUNC_CLASS>
00151 Pileup<PILEUP_TYPE, FUNC_CLASS>::Pileup(int window, const FUNC_CLASS& fp)
00152     : myAnalyzeFuncPtr(fp),
00153       myElements(),
00154       pileupStart(0),
00155       pileupHead(0),
00156       pileupTail(-1),
00157       pileupWindow(window),
00158       myCurrentRefID(-2),
00159       myRefPtr(NULL)
00160 {
00161     // Not using pointers since this is templated.
00162     myElements.resize(window);
00163 }
00164 
00165 
00166 template <class PILEUP_TYPE, class FUNC_CLASS>
00167 Pileup<PILEUP_TYPE, FUNC_CLASS>::Pileup(const std::string& refSeqFileName, const FUNC_CLASS& fp)
00168     : myAnalyzeFuncPtr(fp),
00169       myElements(),
00170       pileupStart(0),
00171       pileupHead(0),
00172       pileupTail(-1),
00173       pileupWindow(1024),
00174       myCurrentRefID(-2),
00175       myRefPtr(NULL)
00176 {
00177     myRefPtr = new GenomeSequence(refSeqFileName.c_str());
00178 
00179     // Not using pointers since this is templated.
00180     myElements.resize(pileupWindow);
00181 
00182     PILEUP_TYPE::setReference(myRefPtr);
00183 }
00184 
00185 
00186 template <class PILEUP_TYPE, class FUNC_CLASS>
00187 Pileup<PILEUP_TYPE, FUNC_CLASS>::Pileup(int window, const std::string& refSeqFileName, const FUNC_CLASS& fp)
00188     : myAnalyzeFuncPtr(fp),
00189       myElements(),
00190       pileupStart(0),
00191       pileupHead(0),
00192       pileupTail(-1),
00193       pileupWindow(window),
00194       myCurrentRefID(-2),
00195       myRefPtr(NULL)
00196 {
00197     myRefPtr = new GenomeSequence(refSeqFileName.c_str());
00198 
00199     // Not using pointers since this is templated.
00200     myElements.resize(window);
00201 
00202     PILEUP_TYPE::setReference(myRefPtr);
00203 }
00204 
00205 
00206 template <class PILEUP_TYPE, class FUNC_CLASS>
00207 Pileup<PILEUP_TYPE, FUNC_CLASS>::~Pileup()
00208 {
00209     flushPileup();
00210     if(myRefPtr != NULL)
00211     {
00212         delete myRefPtr;
00213         myRefPtr = NULL;
00214     }
00215 }
00216 
00217 template <class PILEUP_TYPE, class FUNC_CLASS>
00218 int Pileup<PILEUP_TYPE, FUNC_CLASS>::processFile(const std::string& fileName, 
00219                                                  uint16_t excludeFlag,
00220                                                  uint16_t includeFlag)
00221 {
00222     SamFile samIn;
00223     SamFileHeader header;
00224     SamRecord record;
00225     
00226     if(myRefPtr != NULL)
00227     {
00228         samIn.SetReference(myRefPtr);
00229     }
00230 
00231     if(!samIn.OpenForRead(fileName.c_str()))
00232     {
00233         fprintf(stderr, "%s\n", samIn.GetStatusMessage());
00234         return(samIn.GetStatus());
00235     }
00236     
00237     if(!samIn.ReadHeader(header))
00238     {
00239         fprintf(stderr, "%s\n", samIn.GetStatusMessage());
00240         return(samIn.GetStatus());
00241     }
00242 
00243     // The file needs to be sorted by coordinate.
00244     samIn.setSortedValidation(SamFile::COORDINATE);
00245 
00246     // Iterate over all records
00247     while (samIn.ReadRecord(header, record))
00248     {
00249         uint16_t flag = record.getFlag();
00250         if(flag & excludeFlag)
00251         {
00252             // This record has an excluded flag set, 
00253             // so continue to the next one.
00254             continue;
00255         }
00256         if((flag & includeFlag) != includeFlag)
00257         {
00258             // This record does not have all required flags set, 
00259             // so continue to the next one.
00260             continue;
00261         }
00262         processAlignment(record);
00263     }
00264 
00265     flushPileup();
00266 
00267     int returnValue = 0;
00268     if(samIn.GetStatus() != SamStatus::NO_MORE_RECS)
00269     {
00270         // Failed to read a record.
00271         fprintf(stderr, "%s\n", samIn.GetStatusMessage());
00272         returnValue = samIn.GetStatus();
00273     }
00274     return(returnValue);  
00275 }
00276 
00277 
00278 template <class PILEUP_TYPE, class FUNC_CLASS>
00279 void Pileup<PILEUP_TYPE, FUNC_CLASS>::processAlignment(SamRecord& record)
00280 {
00281     int refPosition = record.get0BasedPosition();
00282     int refID = record.getReferenceID();
00283 
00284     // Flush any elements from the pileup that are prior to this record
00285     // since the file is sorted, we are done with those positions.
00286     flushPileup(refID, refPosition);
00287     
00288     // Loop through for each reference position covered by the record.
00289     // It is up to the PILEUP_TYPE to handle insertions/deletions, etc
00290     // that are related with the given reference position.
00291     for(; refPosition <= record.get0BasedAlignmentEnd(); ++refPosition)
00292     {
00293         addAlignmentPosition(refPosition, record);
00294     }
00295 }
00296 
00297 
00298 template <class PILEUP_TYPE, class FUNC_CLASS>
00299 void Pileup<PILEUP_TYPE, FUNC_CLASS>::processAlignmentRegion(SamRecord& record,
00300                                                              int startPos,
00301                                                              int endPos,
00302                                                              PosList* excludeList)
00303 {
00304     int refPosition = record.get0BasedPosition();
00305     int refID = record.getReferenceID();
00306 
00307     // Flush any elements from the pileup that are prior to this record
00308     // since the file is sorted, we are done with those positions.
00309     flushPileup(refID, refPosition);
00310     
00311     // Check if the region starts after this reference starts.  If so,
00312     // we only want to start adding at the region start position.
00313     if(startPos > refPosition)
00314     {
00315         refPosition = startPos;
00316     }
00317 
00318     // Loop through for each reference position covered by the record.
00319     // It is up to the PILEUP_TYPE to handle insertions/deletions, etc
00320     // that are related with the given reference position.
00321     for(; refPosition <= record.get0BasedAlignmentEnd(); ++refPosition)
00322     {
00323         // Check to see if we have gone past the end of the region, in which
00324         // case we can stop processing this record.  Check >= since the
00325         // end position is not in the region.
00326         if((endPos != -1) && (refPosition >= endPos))
00327         {
00328             break;
00329         }
00330 
00331         // Check to see if this position is in the exclude list.
00332         bool addPos = true;
00333         if(excludeList != NULL)
00334         {
00335             // There is an exclude list, so lookup the position.
00336             if(excludeList->hasPosition(refID, refPosition))
00337             {
00338                 // This position is in the exclude list, so don't add it.
00339                 addPos = false;
00340             }
00341         }
00342         if(addPos)
00343         {
00344             addAlignmentPosition(refPosition, record);
00345         }
00346     }
00347 }
00348 
00349 
00350 template <class PILEUP_TYPE, class FUNC_CLASS>
00351 void Pileup<PILEUP_TYPE, FUNC_CLASS>::flushPileup()
00352 {
00353     // while there are still entries between the head and tail, flush,
00354     // but no need to flush if pileupTail == -1 because in that case 
00355     // no entries have been added
00356     while ((pileupHead <= pileupTail) && (pileupTail != -1))
00357     {
00358         flushPileup(pileupHead+1);
00359     }
00360     pileupStart = pileupHead = 0;
00361     pileupTail = -1;
00362 }
00363 
00364 
00365 // Always need the reference position.
00366 template <class PILEUP_TYPE, class FUNC_CLASS>
00367 void Pileup<PILEUP_TYPE, FUNC_CLASS>::addAlignmentPosition(int refPosition,
00368                                                            SamRecord& record)
00369 {
00370     int offset = pileupPosition(refPosition);
00371     
00372     if((offset < 0) || (offset >= pileupWindow))
00373     {
00374         std::cerr << "Pileup Buffer Overflow: position = " << refPosition
00375                   << "; refID = " << record.getReferenceID() 
00376                   << "; recStartPos = " << record.get1BasedPosition()
00377                   << "; pileupStart = " << pileupStart
00378                   << "; pileupHead = " << pileupHead
00379                   << "; pileupTail = " << pileupTail;
00380     }
00381 
00382     addElement(myElements[offset], record);
00383 }
00384 
00385 
00386 template <class PILEUP_TYPE, class FUNC_CLASS>
00387 void Pileup<PILEUP_TYPE, FUNC_CLASS>::flushPileup(int refID, int position)
00388 {
00389     // if new chromosome, flush the entire pileup.
00390     if(refID != myCurrentRefID)
00391     {
00392         // New chromosome, flush everything.
00393         flushPileup();
00394         myCurrentRefID = refID;
00395     }
00396     else
00397     {
00398         // on the same chromosome, so flush just up to this new position.
00399         flushPileup(position);
00400     }
00401 }
00402 
00403 
00404 template <class PILEUP_TYPE, class FUNC_CLASS>
00405 void Pileup<PILEUP_TYPE, FUNC_CLASS>::flushPileup(int position)
00406 {
00407     // Flush up to this new position, but no reason to flush if
00408     // pileupHead has not been set.
00409     while((pileupHead < position) && (pileupHead <= pileupTail))
00410     {
00411         analyzeHead();
00412 
00413         pileupHead++;
00414         
00415         if(pileupHead - pileupStart >= pileupWindow)
00416             pileupStart += pileupWindow;
00417     }
00418 
00419     if(pileupHead > pileupTail)
00420     {
00421         // All positions have been flushed, so reset pileup info
00422         pileupHead = pileupStart = 0;
00423         pileupTail = -1;
00424     }
00425 }
00426 
00427 
00428 // Get the position in the myElements container that is associated
00429 // with the specified position.  If the specified position cannot
00430 // fit within the myElements container, -1 is returned.
00431 template <class PILEUP_TYPE, class FUNC_CLASS>
00432 int Pileup<PILEUP_TYPE, FUNC_CLASS>::pileupPosition(int position)
00433 {
00434     // Check to see if this is the first position (pileupTail == -1)
00435     if(pileupTail == -1)
00436     {
00437         pileupStart = pileupHead = position;
00438         // This is the first time this position is being used, so
00439         // reset the element.
00440         resetElement(myElements[0], position);
00441         pileupTail = position;
00442         return(0);
00443     }
00444 
00445 
00446     if((position < pileupHead) || (position > (pileupHead + pileupWindow)))
00447     {
00448         String errorMessage =
00449             "Overflow on the pileup buffer: specifiedPosition = ";
00450         errorMessage += position;
00451         errorMessage += ", pileup buffer start position: ";
00452         errorMessage += pileupHead;
00453         errorMessage += ", pileup buffer end position: ";
00454         errorMessage += pileupHead + pileupWindow;
00455 
00456         throw std::runtime_error(errorMessage.c_str());
00457     }    
00458 
00459     //   int offset = position - pileupStart;
00460     int offset = position - pileupStart;
00461     
00462     if(offset >= pileupWindow)
00463     {
00464         offset -= pileupWindow;
00465     }
00466 
00467     // Check to see if position is past the end of the currently
00468     // setup pileup positions.
00469     while(position > pileupTail)
00470     {
00471         // Increment pileupTail to the next position since the current
00472         // pileupTail is already in use.
00473         ++pileupTail;
00474 
00475         // Figure out the offset for this next position.
00476         offset = pileupTail - pileupStart;
00477         if(offset >= pileupWindow)
00478         {
00479             offset -= pileupWindow;
00480         }
00481 
00482         // This is the first time this position is being used, so
00483         // reset the element.
00484         resetElement(myElements[offset], pileupTail);
00485     }
00486 
00487     return(offset);
00488 }
00489 
00490 
00491 template <class PILEUP_TYPE, class FUNC_CLASS>
00492 void Pileup<PILEUP_TYPE, FUNC_CLASS>::resetElement(PILEUP_TYPE& element,
00493                                                    int position)
00494 {
00495     element.reset(position);
00496 }
00497 
00498 
00499 template <class PILEUP_TYPE, class FUNC_CLASS>
00500 void Pileup<PILEUP_TYPE, FUNC_CLASS>::addElement(PILEUP_TYPE& element,
00501                                                  SamRecord& record)
00502 {
00503     element.addEntry(record);
00504 }
00505 
00506 
00507 template <class PILEUP_TYPE, class FUNC_CLASS>
00508 void Pileup<PILEUP_TYPE, FUNC_CLASS>::analyzeElement(PILEUP_TYPE& element)
00509 {
00510     myAnalyzeFuncPtr(element);
00511 }
00512 
00513 
00514 template <class PILEUP_TYPE, class FUNC_CLASS>
00515 void Pileup<PILEUP_TYPE, FUNC_CLASS>::analyzeHead()
00516 {
00517     myAnalyzeFuncPtr(myElements[pileupHead - pileupStart]);
00518 }
00519 
00520 
00521 #endif
Generated on Tue Sep 6 17:51:59 2011 for libStatGen Software by  doxygen 1.6.3