Pileup.h

00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #ifndef __PILEUP_H__
00019 #define __PILEUP_H__
00020 
00021 #include <stdexcept>
00022 #include "SamFile.h"
00023 
00024 template <class PILEUP_TYPE>
00025 class defaultPileup
00026 {
00027 public:
00028     bool operator() (PILEUP_TYPE& element)
00029     {
00030         element.analyze();
00031         return(true);
00032     }
00033     void analyze(PILEUP_TYPE element)
00034     {
00035         element.analyze();
00036     }
00037 };
00038 
00039 template <class PILEUP_TYPE>
00040 void defaultPileupAnalyze(PILEUP_TYPE& element)
00041 {
00042     element.analyze();
00043 }
00044 
00045 
00046 template <class PILEUP_TYPE, 
00047           class FUNC_CLASS = defaultPileup<PILEUP_TYPE> >
00048 class Pileup
00049 {
00050 public:
00051     Pileup(const FUNC_CLASS& fp = FUNC_CLASS());
00052 
00053     Pileup(int window, const FUNC_CLASS& fp = FUNC_CLASS());
00054 
00055     // Perform pileup with a reference.
00056     Pileup(const std::string& refSeqFileName,
00057            const FUNC_CLASS& fp = FUNC_CLASS());
00058 
00059     // Perform pileup with a reference.
00060     Pileup(int window, const std::string& refSeqFileName, 
00061            const FUNC_CLASS& fp = FUNC_CLASS());
00062 
00063     ~Pileup();
00064 
00065     // Performs a pileup on the specified file.
00066     // Returns 0 for success and non-zero for failure.
00067     // If excludeFlag is specified, any bit may be set for the
00068     // record to be dropped.
00069     // Defaulted to exclude:
00070     //    * unmapped, 
00071     //    * not primary alignment
00072     //    * failed platform/vendor quality check
00073     //    * PCR or optical duplicate
00074     // If includeFlag is specified, every bit must be set for the
00075     // record to be included - defaulted to 0 - no bits are required to be set.
00076     virtual int processFile(const std::string& fileName, 
00077                             uint16_t excludeFlag = 0x0704, 
00078                             uint16_t includeFlag = 0);
00079 
00080     // This should be overwritten to perform any necessary
00081     // filtering on the record.
00082     virtual void processAlignment(SamRecord& record);
00083    
00084 
00085     void flushPileup();
00086 
00087 protected:
00088     FUNC_CLASS myAnalyzeFuncPtr;
00089 
00090     // Always need the reference position.
00091     void addAlignmentPosition(int refPosition, SamRecord& record);
00092 
00093 
00094     virtual void flushPileup(int refID, int refPosition);
00095     void flushPileup(int refPosition);
00096     
00097     // Get the position in the myElements container that is associated
00098     // with the specified position.  If the specified position cannot
00099     // fit within the myElements container, -1 is returned.
00100     int pileupPosition(int refPosition);
00101 
00102     virtual void resetElement(PILEUP_TYPE& element, int position);
00103     virtual void addElement(PILEUP_TYPE& element, SamRecord& record);
00104     virtual void analyzeElement(PILEUP_TYPE& element);
00105     virtual void analyzeHead();
00106 
00107     std::vector<PILEUP_TYPE> myElements;
00108 
00109     int    pileupStart;
00110     int    pileupHead;
00111     int    pileupTail;
00112     int    pileupWindow;
00113 
00114     int myCurrentRefID;
00115 
00116     GenomeSequence* myRefPtr;
00117 };
00118 
00119 
00120 template <class PILEUP_TYPE, class FUNC_CLASS>
00121 Pileup<PILEUP_TYPE, FUNC_CLASS>::Pileup(const FUNC_CLASS& fp)
00122     : myAnalyzeFuncPtr(fp),
00123       myElements(),
00124       pileupStart(0),
00125       pileupHead(0),
00126       pileupTail(-1),
00127       pileupWindow(1024),
00128       myCurrentRefID(-2),
00129       myRefPtr(NULL)
00130 {
00131     // Not using pointers since this is templated.
00132     myElements.resize(pileupWindow);
00133 }
00134 
00135 
00136 template <class PILEUP_TYPE, class FUNC_CLASS>
00137 Pileup<PILEUP_TYPE, FUNC_CLASS>::Pileup(int window, const FUNC_CLASS& fp)
00138     : myAnalyzeFuncPtr(fp),
00139       myElements(),
00140       pileupStart(0),
00141       pileupHead(0),
00142       pileupTail(-1),
00143       pileupWindow(window),
00144       myCurrentRefID(-2),
00145       myRefPtr(NULL)
00146 {
00147     // Not using pointers since this is templated.
00148     myElements.resize(window);
00149 }
00150 
00151 
00152 template <class PILEUP_TYPE, class FUNC_CLASS>
00153 Pileup<PILEUP_TYPE, FUNC_CLASS>::Pileup(const std::string& refSeqFileName, const FUNC_CLASS& fp)
00154     : myAnalyzeFuncPtr(fp),
00155       myElements(),
00156       pileupStart(0),
00157       pileupHead(0),
00158       pileupTail(-1),
00159       pileupWindow(1024),
00160       myCurrentRefID(-2),
00161       myRefPtr(NULL)
00162 {
00163     myRefPtr = new GenomeSequence(refSeqFileName.c_str());
00164 
00165     // Not using pointers since this is templated.
00166     myElements.resize(pileupWindow);
00167 
00168     PILEUP_TYPE::setReference(myRefPtr);
00169 }
00170 
00171 
00172 template <class PILEUP_TYPE, class FUNC_CLASS>
00173 Pileup<PILEUP_TYPE, FUNC_CLASS>::Pileup(int window, const std::string& refSeqFileName, const FUNC_CLASS& fp)
00174     : myAnalyzeFuncPtr(fp),
00175       myElements(),
00176       pileupStart(0),
00177       pileupHead(0),
00178       pileupTail(-1),
00179       pileupWindow(window),
00180       myCurrentRefID(-2),
00181       myRefPtr(NULL)
00182 {
00183     myRefPtr = new GenomeSequence(refSeqFileName.c_str());
00184 
00185     // Not using pointers since this is templated.
00186     myElements.resize(window);
00187 
00188     PILEUP_TYPE::setReference(myRefPtr);
00189 }
00190 
00191 
00192 template <class PILEUP_TYPE, class FUNC_CLASS>
00193 Pileup<PILEUP_TYPE, FUNC_CLASS>::~Pileup()
00194 {
00195     if(myRefPtr != NULL)
00196     {
00197         delete myRefPtr;
00198         myRefPtr = NULL;
00199     }
00200 }
00201 
00202 template <class PILEUP_TYPE, class FUNC_CLASS>
00203 int Pileup<PILEUP_TYPE, FUNC_CLASS>::processFile(const std::string& fileName, 
00204                                                  uint16_t excludeFlag,
00205                                                  uint16_t includeFlag)
00206 {
00207     SamFile samIn;
00208     SamFileHeader header;
00209     SamRecord record;
00210     
00211     if(myRefPtr != NULL)
00212     {
00213         samIn.SetReference(myRefPtr);
00214     }
00215 
00216     if(!samIn.OpenForRead(fileName.c_str()))
00217     {
00218         fprintf(stderr, "%s\n", samIn.GetStatusMessage());
00219         return(samIn.GetStatus());
00220     }
00221     
00222     if(!samIn.ReadHeader(header))
00223     {
00224         fprintf(stderr, "%s\n", samIn.GetStatusMessage());
00225         return(samIn.GetStatus());
00226     }
00227 
00228     // The file needs to be sorted by coordinate.
00229     samIn.setSortedValidation(SamFile::COORDINATE);
00230 
00231     // Iterate over all records
00232     while (samIn.ReadRecord(header, record))
00233     {
00234         uint16_t flag = record.getFlag();
00235         if(flag & excludeFlag)
00236         {
00237             // This record has an excluded flag set, 
00238             // so continue to the next one.
00239             continue;
00240         }
00241         if((flag & includeFlag) != includeFlag)
00242         {
00243             // This record does not have all required flags set, 
00244             // so continue to the next one.
00245             continue;
00246         }
00247         processAlignment(record);
00248     }
00249 
00250     flushPileup();
00251 
00252     int returnValue = 0;
00253     if(samIn.GetStatus() != SamStatus::NO_MORE_RECS)
00254     {
00255         // Failed to read a record.
00256         fprintf(stderr, "%s\n", samIn.GetStatusMessage());
00257         returnValue = samIn.GetStatus();
00258     }
00259     return(returnValue);  
00260 }
00261 
00262 
00263 template <class PILEUP_TYPE, class FUNC_CLASS>
00264 void Pileup<PILEUP_TYPE, FUNC_CLASS>::processAlignment(SamRecord& record)
00265 {
00266     int refPosition = record.get0BasedPosition();
00267     int refID = record.getReferenceID();
00268 
00269     // Flush any elements from the pileup that are prior to this record
00270     // since the file is sorted, we are done with those positions.
00271     flushPileup(refID, refPosition);
00272     
00273     // Loop through for each reference position covered by the record.
00274     // It is up to the PILEUP_TYPE to handle insertions/deletions, etc
00275     // that are related with the given reference position.
00276     for(; refPosition <= record.get0BasedAlignmentEnd(); ++refPosition)
00277     {
00278         addAlignmentPosition(refPosition, record);
00279     }
00280 }
00281 
00282 
00283 template <class PILEUP_TYPE, class FUNC_CLASS>
00284 void Pileup<PILEUP_TYPE, FUNC_CLASS>::flushPileup()
00285 {
00286     // while there are still entries between the head and tail, flush,
00287     // but no need to flush if pileupTail == -1 because in that case 
00288     // no entries have been added
00289     while ((pileupHead <= pileupTail) && (pileupTail != -1))
00290     {
00291         flushPileup(pileupHead+1);
00292     }
00293     pileupStart = pileupHead = 0;
00294     pileupTail = -1;
00295 }
00296 
00297 
00298 // Always need the reference position.
00299 template <class PILEUP_TYPE, class FUNC_CLASS>
00300 void Pileup<PILEUP_TYPE, FUNC_CLASS>::addAlignmentPosition(int refPosition,
00301                                                            SamRecord& record)
00302 {
00303     int offset = pileupPosition(refPosition);
00304     
00305     if((offset < 0) || (offset >= pileupWindow))
00306     {
00307         std::cerr << "Pileup Buffer Overflow: position = " << refPosition
00308                   << "; refID = " << record.getReferenceID() 
00309                   << "; recStartPos = " << record.get1BasedPosition()
00310                   << "; pileupStart = " << pileupStart
00311                   << "; pileupHead = " << pileupHead
00312                   << "; pileupTail = " << pileupTail;
00313     }
00314 
00315     addElement(myElements[offset], record);
00316 }
00317 
00318 
00319 template <class PILEUP_TYPE, class FUNC_CLASS>
00320 void Pileup<PILEUP_TYPE, FUNC_CLASS>::flushPileup(int refID, int position)
00321 {
00322     // if new chromosome, flush the entire pileup.
00323     if(refID != myCurrentRefID)
00324     {
00325         // New chromosome, flush everything.
00326         flushPileup();
00327         myCurrentRefID = refID;
00328         // Set the starts to this position - first entry
00329         // in this chromosome.
00330         pileupStart = pileupHead = position;
00331     }
00332     else
00333     {
00334         // on the same chromosome, so flush just up to this new position.
00335         flushPileup(position);
00336     }
00337 }
00338 
00339 
00340 template <class PILEUP_TYPE, class FUNC_CLASS>
00341 void Pileup<PILEUP_TYPE, FUNC_CLASS>::flushPileup(int position)
00342 {
00343     // Flush up to this new position, but no reason to flush if
00344     // pileupHead has not been set.
00345     while((pileupHead < position) && (pileupHead <= pileupTail))
00346     {
00347         analyzeHead();
00348 
00349         pileupHead++;
00350         
00351         if(pileupHead - pileupStart >= pileupWindow)
00352             pileupStart += pileupWindow;
00353     }
00354 
00355     // If pileupHead != position, then we stopped because we had already
00356     // flushed all the data we had, just set the head and start to the position.
00357     if(pileupHead != position)
00358     {
00359         pileupHead = pileupStart = position;
00360     }
00361 }
00362 
00363 
00364 // Get the position in the myElements container that is associated
00365 // with the specified position.  If the specified position cannot
00366 // fit within the myElements container, -1 is returned.
00367 template <class PILEUP_TYPE, class FUNC_CLASS>
00368 int Pileup<PILEUP_TYPE, FUNC_CLASS>::pileupPosition(int position)
00369 {
00370     // Check to see if this is the first position (pileupTail == -1)
00371     if(pileupTail == -1)
00372     {
00373         pileupStart = pileupHead = position;
00374     }
00375 
00376 
00377     if((position < pileupHead) || (position > (pileupHead + pileupWindow)))
00378     {
00379         String errorMessage =
00380             "Overflow on the pileup buffer: specifiedPosition = ";
00381         errorMessage += position;
00382         errorMessage += ", pileup buffer start position: ";
00383         errorMessage += pileupHead;
00384         errorMessage += ", pileup buffer end position: ";
00385         errorMessage += pileupHead + pileupWindow;
00386 
00387         throw std::runtime_error(errorMessage.c_str());
00388     }    
00389 
00390     int offset = position - pileupStart;
00391     
00392     if(offset >= pileupWindow)
00393     {
00394         offset -= pileupWindow;
00395     }
00396 
00397     if(position > pileupTail)
00398     {
00399         // This is the first time this position is being used.
00400         // reset the element for this position.
00401         resetElement(myElements[offset], position);
00402         pileupTail = position;
00403     }
00404 
00405     return(offset);
00406 }
00407 
00408 
00409 template <class PILEUP_TYPE, class FUNC_CLASS>
00410 void Pileup<PILEUP_TYPE, FUNC_CLASS>::resetElement(PILEUP_TYPE& element,
00411                                                    int position)
00412 {
00413     element.reset(position);
00414 }
00415 
00416 
00417 template <class PILEUP_TYPE, class FUNC_CLASS>
00418 void Pileup<PILEUP_TYPE, FUNC_CLASS>::addElement(PILEUP_TYPE& element,
00419                                                  SamRecord& record)
00420 {
00421     element.addEntry(record);
00422 }
00423 
00424 
00425 template <class PILEUP_TYPE, class FUNC_CLASS>
00426 void Pileup<PILEUP_TYPE, FUNC_CLASS>::analyzeElement(PILEUP_TYPE& element)
00427 {
00428     myAnalyzeFuncPtr(element);
00429 }
00430 
00431 
00432 template <class PILEUP_TYPE, class FUNC_CLASS>
00433 void Pileup<PILEUP_TYPE, FUNC_CLASS>::analyzeHead()
00434 {
00435     myAnalyzeFuncPtr(myElements[pileupHead - pileupStart]);
00436 }
00437 
00438 
00439 #endif
Generated on Tue Aug 23 18:19:04 2011 for libStatGen Software by  doxygen 1.6.3