Pileup.h

00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #ifndef __PILEUP_H__
00019 #define __PILEUP_H__
00020 
00021 template <class PILEUP_TYPE>
00022 class defaultPileup
00023 {
00024 public:
00025     bool operator() (PILEUP_TYPE& element)
00026     {
00027         element.analyze();
00028         return(true);
00029     }
00030     void analyze(PILEUP_TYPE element)
00031     {
00032         element.analyze();
00033     }
00034 };
00035 
00036 template <class PILEUP_TYPE>
00037 void defaultPileupAnalyze(PILEUP_TYPE& element)
00038 {
00039     element.analyze();
00040 }
00041 
00042 
00043 template <class PILEUP_TYPE, 
00044           class FUNC_CLASS = defaultPileup<PILEUP_TYPE> >
00045 class Pileup
00046 {
00047 public:
00048     Pileup(const FUNC_CLASS& fp = FUNC_CLASS());
00049 
00050     Pileup(int window, const FUNC_CLASS& fp = FUNC_CLASS());
00051 
00052     // Perform pileup with a reference.
00053     Pileup(const std::string& refSeqFileName,
00054            const FUNC_CLASS& fp = FUNC_CLASS());
00055 
00056     // Perform pileup with a reference.
00057     Pileup(int window, const std::string& refSeqFileName, 
00058            const FUNC_CLASS& fp = FUNC_CLASS());
00059 
00060     ~Pileup();
00061 
00062     // Performs a pileup on the specified file.
00063     // Returns 0 for success and non-zero for failure.
00064     // If excludeFlag is specified, any bit may be set for the
00065     // record to be dropped.
00066     // Defaulted to exclude:
00067     //    * unmapped, 
00068     //    * not primary alignment
00069     //    * failed platform/vendor quality check
00070     //    * PCR or optical duplicate
00071     // If includeFlag is specified, every bit must be set for the
00072     // record to be included - defaulted to 0 - no bits are required to be set.
00073     virtual int processFile(const std::string& fileName, 
00074                             uint16_t excludeFlag = 0x0704, 
00075                             uint16_t includeFlag = 0);
00076 
00077     // This should be overwritten to perform any necessary
00078     // filtering on the record.
00079     virtual void processAlignment(SamRecord& record);
00080    
00081 
00082     void flushPileup();
00083 
00084 protected:
00085     FUNC_CLASS myAnalyzeFuncPtr;
00086 
00087     // Always need the reference position.
00088     void addAlignmentPosition(int refPosition, SamRecord& record);
00089 
00090 
00091     void flushPileup(int refID, int refPosition);
00092     void flushPileup(int refPosition);
00093     
00094     // Get the position in the myElements container that is associated
00095     // with the specified position.  If the specified position cannot
00096     // fit within the myElements container, -1 is returned.
00097     int pileupPosition(int refPosition);
00098 
00099     virtual void resetElement(PILEUP_TYPE& element, int position);
00100     virtual void addElement(PILEUP_TYPE& element, SamRecord& record);
00101     virtual void analyzeElement(PILEUP_TYPE& element);
00102 
00103     std::vector<PILEUP_TYPE> myElements;
00104 
00105     int    pileupStart;
00106     int    pileupHead;
00107     int    pileupTail;
00108     int    pileupWindow;
00109 
00110     int myCurrentRefID;
00111 
00112     GenomeSequence* myRefPtr;
00113 };
00114 
00115 
00116 template <class PILEUP_TYPE, class FUNC_CLASS>
00117 Pileup<PILEUP_TYPE, FUNC_CLASS>::Pileup(const FUNC_CLASS& fp)
00118     : myAnalyzeFuncPtr(fp),
00119       myElements(),
00120       pileupStart(0),
00121       pileupHead(0),
00122       pileupTail(-1),
00123       pileupWindow(1024),
00124       myCurrentRefID(-2),
00125       myRefPtr(NULL)
00126 {
00127     // Not using pointers since this is templated.
00128     myElements.resize(pileupWindow);
00129 }
00130 
00131 
00132 template <class PILEUP_TYPE, class FUNC_CLASS>
00133 Pileup<PILEUP_TYPE, FUNC_CLASS>::Pileup(int window, const FUNC_CLASS& fp)
00134     : myAnalyzeFuncPtr(fp),
00135       myElements(),
00136       pileupStart(0),
00137       pileupHead(0),
00138       pileupTail(-1),
00139       pileupWindow(window),
00140       myCurrentRefID(-2),
00141       myRefPtr(NULL)
00142 {
00143     // Not using pointers since this is templated.
00144     myElements.resize(window);
00145 }
00146 
00147 
00148 template <class PILEUP_TYPE, class FUNC_CLASS>
00149 Pileup<PILEUP_TYPE, FUNC_CLASS>::Pileup(const std::string& refSeqFileName, const FUNC_CLASS& fp)
00150     : myAnalyzeFuncPtr(fp),
00151       myElements(),
00152       pileupStart(0),
00153       pileupHead(0),
00154       pileupTail(-1),
00155       pileupWindow(1024),
00156       myCurrentRefID(-2),
00157       myRefPtr(NULL)
00158 {
00159     myRefPtr = new GenomeSequence(refSeqFileName.c_str());
00160 
00161     // Not using pointers since this is templated.
00162     myElements.resize(pileupWindow);
00163 
00164     PILEUP_TYPE::setReference(myRefPtr);
00165 }
00166 
00167 
00168 template <class PILEUP_TYPE, class FUNC_CLASS>
00169 Pileup<PILEUP_TYPE, FUNC_CLASS>::Pileup(int window, const std::string& refSeqFileName, const FUNC_CLASS& fp)
00170     : myAnalyzeFuncPtr(fp),
00171       myElements(),
00172       pileupStart(0),
00173       pileupHead(0),
00174       pileupTail(-1),
00175       pileupWindow(window),
00176       myCurrentRefID(-2),
00177       myRefPtr(NULL)
00178 {
00179     myRefPtr = new GenomeSequence(refSeqFileName.c_str());
00180 
00181     // Not using pointers since this is templated.
00182     myElements.resize(window);
00183 
00184     PILEUP_TYPE::setReference(myRefPtr);
00185 }
00186 
00187 
00188 template <class PILEUP_TYPE, class FUNC_CLASS>
00189 Pileup<PILEUP_TYPE, FUNC_CLASS>::~Pileup()
00190 {
00191     if(myRefPtr != NULL)
00192     {
00193         delete myRefPtr;
00194         myRefPtr = NULL;
00195     }
00196 }
00197 
00198 template <class PILEUP_TYPE, class FUNC_CLASS>
00199 int Pileup<PILEUP_TYPE, FUNC_CLASS>::processFile(const std::string& fileName, 
00200                                                  uint16_t excludeFlag,
00201                                                  uint16_t includeFlag)
00202 {
00203     SamFile samIn;
00204     SamFileHeader header;
00205     SamRecord record;
00206     
00207     if(myRefPtr != NULL)
00208     {
00209         samIn.SetReference(myRefPtr);
00210     }
00211 
00212     if(!samIn.OpenForRead(fileName.c_str()))
00213     {
00214         fprintf(stderr, "%s\n", samIn.GetStatusMessage());
00215         return(samIn.GetStatus());
00216     }
00217     
00218     if(!samIn.ReadHeader(header))
00219     {
00220         fprintf(stderr, "%s\n", samIn.GetStatusMessage());
00221         return(samIn.GetStatus());
00222     }
00223 
00224     // The file needs to be sorted by coordinate.
00225     samIn.setSortedValidation(SamFile::COORDINATE);
00226 
00227     // Iterate over all records
00228     while (samIn.ReadRecord(header, record))
00229     {
00230         uint16_t flag = record.getFlag();
00231         if(flag & excludeFlag)
00232         {
00233             // This record has an excluded flag set, 
00234             // so continue to the next one.
00235             continue;
00236         }
00237         if((flag & includeFlag) != includeFlag)
00238         {
00239             // This record does not have all required flags set, 
00240             // so continue to the next one.
00241             continue;
00242         }
00243         processAlignment(record);
00244     }
00245 
00246     flushPileup();
00247 
00248     int returnValue = 0;
00249     if(samIn.GetStatus() != SamStatus::NO_MORE_RECS)
00250     {
00251         // Failed to read a record.
00252         fprintf(stderr, "%s\n", samIn.GetStatusMessage());
00253         returnValue = samIn.GetStatus();
00254     }
00255     return(returnValue);  
00256 }
00257 
00258 
00259 template <class PILEUP_TYPE, class FUNC_CLASS>
00260 void Pileup<PILEUP_TYPE, FUNC_CLASS>::processAlignment(SamRecord& record)
00261 {
00262     int refPosition = record.get0BasedPosition();
00263     int refID = record.getReferenceID();
00264 
00265     // Flush any elements from the pileup that are prior to this record
00266     // since the file is sorted, we are done with those positions.
00267     flushPileup(refID, refPosition);
00268     
00269     // Loop through for each reference position covered by the record.
00270     // It is up to the PILEUP_TYPE to handle insertions/deletions, etc
00271     // that are related with the given reference position.
00272     for(; refPosition <= record.get0BasedAlignmentEnd(); ++refPosition)
00273     {
00274         addAlignmentPosition(refPosition, record);
00275     }
00276 }
00277 
00278 
00279 template <class PILEUP_TYPE, class FUNC_CLASS>
00280 void Pileup<PILEUP_TYPE, FUNC_CLASS>::flushPileup()
00281 {
00282     // while there are still entries between the head and tail, flush,
00283     // but no need to flush if pileupTail == -1 because in that case 
00284     // no entries have been added
00285     while ((pileupHead <= pileupTail) && (pileupTail != -1))
00286     {
00287         flushPileup(pileupHead+1);
00288     }
00289     pileupStart = pileupHead = 0;
00290     pileupTail = -1;
00291 }
00292 
00293 
00294 // Always need the reference position.
00295 template <class PILEUP_TYPE, class FUNC_CLASS>
00296 void Pileup<PILEUP_TYPE, FUNC_CLASS>::addAlignmentPosition(int refPosition,
00297                                                            SamRecord& record)
00298 {
00299     int offset = pileupPosition(refPosition);
00300     
00301     if((offset < 0) || (offset >= pileupWindow))
00302     {
00303         std::cerr << "Pileup Buffer Overflow: position = " << refPosition
00304                   << "; refID = " << record.getReferenceID() 
00305                   << "; recStartPos = " << record.get1BasedPosition()
00306                   << "; pileupStart = " << pileupStart
00307                   << "; pileupHead = " << pileupHead
00308                   << "; pileupTail = " << pileupTail;
00309     }
00310 
00311     addElement(myElements[offset], record);
00312 }
00313 
00314 
00315 template <class PILEUP_TYPE, class FUNC_CLASS>
00316 void Pileup<PILEUP_TYPE, FUNC_CLASS>::flushPileup(int refID, int position)
00317 {
00318     // if new chromosome, flush the entire pileup.
00319     if(refID != myCurrentRefID)
00320     {
00321         // New chromosome, flush everything.
00322         flushPileup();
00323         myCurrentRefID = refID;
00324         // Set the starts to this position - first entry
00325         // in this chromosome.
00326         pileupStart = pileupHead = position;
00327     }
00328     else
00329     {
00330         // on the same chromosome, so flush just up to this new position.
00331         flushPileup(position);
00332     }
00333 }
00334 
00335 
00336 template <class PILEUP_TYPE, class FUNC_CLASS>
00337 void Pileup<PILEUP_TYPE, FUNC_CLASS>::flushPileup(int position)
00338 {
00339     // Flush up to this new position, but no reason to flush if
00340     // pileupHead has not been set.
00341     while((pileupHead < position) && (pileupHead <= pileupTail))
00342     {
00343         analyzeElement(myElements[pileupHead - pileupStart]);
00344 
00345         pileupHead++;
00346         
00347         if(pileupHead - pileupStart >= pileupWindow)
00348             pileupStart += pileupWindow;
00349     }
00350 
00351     // If pileupHead != position, then we stopped because we had already
00352     // flushed all the data we had, just set the head and start to the position.
00353     if(pileupHead != position)
00354     {
00355         pileupHead = pileupStart = position;
00356     }
00357 }
00358 
00359 
00360 // Get the position in the myElements container that is associated
00361 // with the specified position.  If the specified position cannot
00362 // fit within the myElements container, -1 is returned.
00363 template <class PILEUP_TYPE, class FUNC_CLASS>
00364 int Pileup<PILEUP_TYPE, FUNC_CLASS>::pileupPosition(int position)
00365 {
00366     if((position < pileupHead) || (position > (pileupHead + pileupWindow)))
00367     {
00368         return -1;
00369         error("Pileup Buffer Overflow");
00370     }    
00371 
00372     int offset = position - pileupStart;
00373     
00374     if(offset >= pileupWindow)
00375     {
00376         offset -= pileupWindow;
00377     }
00378 
00379     if(position > pileupTail)
00380     {
00381         // This is the first time this position is being used.
00382         // reset the element for this position.
00383         resetElement(myElements[offset], position);
00384         pileupTail = position;
00385     }
00386 
00387     return(offset);
00388 }
00389 
00390 
00391 template <class PILEUP_TYPE, class FUNC_CLASS>
00392 void Pileup<PILEUP_TYPE, FUNC_CLASS>::resetElement(PILEUP_TYPE& element,
00393                                                    int position)
00394 {
00395     element.reset(position);
00396 }
00397 
00398 
00399 template <class PILEUP_TYPE, class FUNC_CLASS>
00400 void Pileup<PILEUP_TYPE, FUNC_CLASS>::addElement(PILEUP_TYPE& element,
00401                                                  SamRecord& record)
00402 {
00403     element.addEntry(record);
00404 }
00405 
00406 
00407 template <class PILEUP_TYPE, class FUNC_CLASS>
00408 void Pileup<PILEUP_TYPE, FUNC_CLASS>::analyzeElement(PILEUP_TYPE& element)
00409 {
00410     myAnalyzeFuncPtr(element);
00411 }
00412 
00413 
00414 
00415 
00416 #endif
Generated on Tue Mar 22 22:50:13 2011 for StatGen Software by  doxygen 1.6.3