Pileup.h

00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #ifndef __PILEUP_H__
00019 #define __PILEUP_H__
00020 
00021 template <class PILEUP_TYPE>
00022 class defaultPileup
00023 {
00024 public:
00025     bool operator() (PILEUP_TYPE& element)
00026     {
00027         element.analyze();
00028         return(true);
00029     }
00030     void analyze(PILEUP_TYPE element)
00031     {
00032         element.analyze();
00033     }
00034 };
00035 
00036 template <class PILEUP_TYPE>
00037 void defaultPileupAnalyze(PILEUP_TYPE& element)
00038 {
00039     element.analyze();
00040 }
00041 
00042 
00043 template <class PILEUP_TYPE, 
00044           class FUNC_CLASS = defaultPileup<PILEUP_TYPE> >
00045 class Pileup
00046 {
00047 public:
00048     Pileup(const FUNC_CLASS& fp = FUNC_CLASS());
00049 
00050     Pileup(int window, const FUNC_CLASS& fp = FUNC_CLASS());
00051 
00052     // Performs a pileup on the specified file.
00053     // Returns 0 for success and non-zero for failure.
00054     // If excludeFlag is specified, any bit may be set for the
00055     // record to be dropped.
00056     // Defaulted to exclude:
00057     //    * unmapped, 
00058     //    * not primary alignment
00059     //    * failed platform/vendor quality check
00060     //    * PCR or optical duplicate
00061     // If includeFlag is specified, every bit must be set for the
00062     // record to be included - defaulted to 0 - no bits are required to be set.
00063     virtual int processFile(const std::string& fileName, 
00064                             uint16_t excludeFlag = 0x0704, 
00065                             uint16_t includeFlag = 0);
00066 
00067     // This should be overwritten to perform any necessary
00068     // filtering on the record.  processFile does no
00069     // filtering and process all records in the file.
00070     virtual void processAlignment(SamRecord& record);
00071    
00072 
00073     void flushPileup();
00074 
00075 protected:
00076     FUNC_CLASS myAnalyzeFuncPtr;
00077 
00078     // Always need the reference position.
00079     void addAlignmentPosition(int refPosition, SamRecord& record);
00080 
00081 
00082     void flushPileup(int refID, int refPosition);
00083     void flushPileup(int refPosition);
00084     
00085     // Get the position in the myElements container that is associated
00086     // with the specified position.  If the specified position cannot
00087     // fit within the myElements container, -1 is returned.
00088     int pileupPosition(int refPosition);
00089 
00090     virtual void resetElement(PILEUP_TYPE& element, int position);
00091     virtual void addElement(PILEUP_TYPE& element, SamRecord& record);
00092     virtual void analyzeElement(PILEUP_TYPE& element);
00093 
00094     std::vector<PILEUP_TYPE> myElements;
00095 
00096     int    pileupStart;
00097     int    pileupHead;
00098     int    pileupTail;
00099     int    pileupWindow;
00100 
00101     int myCurrentRefID;
00102 };
00103 
00104 
00105 template <class PILEUP_TYPE, class FUNC_CLASS>
00106 Pileup<PILEUP_TYPE, FUNC_CLASS>::Pileup(const FUNC_CLASS& fp)
00107     : myAnalyzeFuncPtr(fp),
00108       myElements(),
00109       pileupStart(0),
00110       pileupHead(0),
00111       pileupTail(0),
00112       pileupWindow(1024),
00113       myCurrentRefID(-2)
00114 {
00115     // Not using pointers since this is templated.
00116     myElements.resize(pileupWindow, PILEUP_TYPE());
00117 }
00118 
00119 
00120 template <class PILEUP_TYPE, class FUNC_CLASS>
00121 Pileup<PILEUP_TYPE, FUNC_CLASS>::Pileup(int window, const FUNC_CLASS& fp)
00122     : myAnalyzeFuncPtr(fp),
00123       myElements(),
00124       pileupStart(0),
00125       pileupHead(0),
00126       pileupTail(0),
00127       pileupWindow(window),
00128       myCurrentRefID(-2)
00129 {
00130     // Not using pointers since this is templated.
00131     myElements.resize(window, PILEUP_TYPE());
00132 }
00133 
00134 
00135 template <class PILEUP_TYPE, class FUNC_CLASS>
00136 int Pileup<PILEUP_TYPE, FUNC_CLASS>::processFile(const std::string& fileName, 
00137                                                  uint16_t excludeFlag,
00138                                                  uint16_t includeFlag)
00139 {
00140     SamFile samIn;
00141     SamFileHeader header;
00142     SamRecord record;
00143     
00144     if(!samIn.OpenForRead(fileName.c_str()))
00145     {
00146         fprintf(stderr, "%s\n", samIn.GetStatusMessage());
00147         return(samIn.GetStatus());
00148     }
00149     
00150     if(!samIn.ReadHeader(header))
00151     {
00152         fprintf(stderr, "%s\n", samIn.GetStatusMessage());
00153         return(samIn.GetStatus());
00154     }
00155 
00156     // The file needs to be sorted by coordinate.
00157     samIn.setSortedValidation(SamFile::COORDINATE);
00158 
00159     // Iterate over all records
00160     while (samIn.ReadRecord(header, record))
00161     {
00162         uint16_t flag = record.getFlag();
00163         if(flag & excludeFlag)
00164         {
00165             // This record has an excluded flag set, 
00166             // so continue to the next one.
00167             continue;
00168         }
00169         if((flag & includeFlag) != includeFlag)
00170         {
00171             // This record does not have all required flags set, 
00172             // so continue to the next one.
00173             continue;
00174         }
00175         processAlignment(record);
00176     }
00177 
00178     flushPileup();
00179 
00180     int returnValue = 0;
00181     if(samIn.GetStatus() != SamStatus::NO_MORE_RECS)
00182     {
00183         // Failed to read a record.
00184         fprintf(stderr, "%s\n", samIn.GetStatusMessage());
00185         returnValue = samIn.GetStatus();
00186     }
00187     return(returnValue);  
00188 }
00189 
00190 
00191 template <class PILEUP_TYPE, class FUNC_CLASS>
00192 void Pileup<PILEUP_TYPE, FUNC_CLASS>::processAlignment(SamRecord& record)
00193 {
00194     int refPosition = record.get0BasedPosition();
00195     int refID = record.getReferenceID();
00196 
00197     // Flush any elements from the pileup that are prior to this record
00198     // since the file is sorted, we are done with those positions.
00199     flushPileup(refID, refPosition);
00200     
00201     // Loop through for each reference position covered by the record.
00202     // It is up to the PILEUP_TYPE to handle insertions/deletions, etc
00203     // that are related with the given reference position.
00204     for(; refPosition <= record.get0BasedAlignmentEnd(); ++refPosition)
00205     {
00206         addAlignmentPosition(refPosition, record);
00207     }
00208 }
00209 
00210 
00211 template <class PILEUP_TYPE, class FUNC_CLASS>
00212 void Pileup<PILEUP_TYPE, FUNC_CLASS>::flushPileup()
00213 {
00214     while (pileupHead <= pileupTail)
00215     {
00216         flushPileup(pileupHead+1);
00217     }
00218     pileupStart = pileupHead = pileupTail = 0;
00219 }
00220 
00221 
00222 // Always need the reference position.
00223 template <class PILEUP_TYPE, class FUNC_CLASS>
00224 void Pileup<PILEUP_TYPE, FUNC_CLASS>::addAlignmentPosition(int refPosition,
00225                                                            SamRecord& record)
00226 {
00227     int offset = pileupPosition(refPosition);
00228     
00229     if((offset < 0) || (offset >= pileupWindow))
00230     {
00231         std::cerr << "Pileup Buffer Overflow: position = " << refPosition
00232                   << "; refID = " << record.getReferenceID() 
00233                   << "; recStartPos = " << record.get1BasedPosition()
00234                   << "; pileupStart = " << pileupStart
00235                   << "; pileupHead = " << pileupHead
00236                   << "; pileupTail = " << pileupTail;
00237     }
00238 
00239     addElement(myElements[offset], record);
00240 }
00241 
00242 
00243 template <class PILEUP_TYPE, class FUNC_CLASS>
00244 void Pileup<PILEUP_TYPE, FUNC_CLASS>::flushPileup(int refID, int position)
00245 {
00246     // if new chromosome, flush the entire pileup.
00247     if(refID != myCurrentRefID)
00248     {
00249         // New chromosome, flush everything.
00250         flushPileup();
00251         myCurrentRefID = refID;
00252         // Set the starts to this position - first entry
00253         // in this chromosome.
00254         pileupStart = pileupHead = position;
00255     }
00256     else
00257     {
00258         // on the same chromosome, so flush just up to this new position.
00259         flushPileup(position);
00260     }
00261 }
00262 
00263 
00264 template <class PILEUP_TYPE, class FUNC_CLASS>
00265 void Pileup<PILEUP_TYPE, FUNC_CLASS>::flushPileup(int position)
00266 {
00267     // Flush up to this new position.
00268     while((pileupHead < position) && (pileupHead <= pileupTail))
00269     {
00270         analyzeElement(myElements[pileupHead - pileupStart]);
00271 
00272         pileupHead++;
00273         
00274         if(pileupHead - pileupStart >= pileupWindow)
00275             pileupStart += pileupWindow;
00276     }
00277 
00278     // If pileupHead != position, then we stopped because we had already
00279     // flushed all the data we had, just set the head and start to the position.
00280     if(pileupHead != position)
00281     {
00282         pileupHead = pileupStart = position;
00283     }
00284 }
00285 
00286 
00287 // Get the position in the myElements container that is associated
00288 // with the specified position.  If the specified position cannot
00289 // fit within the myElements container, -1 is returned.
00290 template <class PILEUP_TYPE, class FUNC_CLASS>
00291 int Pileup<PILEUP_TYPE, FUNC_CLASS>::pileupPosition(int position)
00292 {
00293     if((position < pileupHead) || (position > (pileupHead + pileupWindow)))
00294     {
00295         return -1;
00296         error("Pileup Buffer Overflow");
00297     }    
00298 
00299     int offset = position - pileupStart;
00300     
00301     if(offset >= pileupWindow)
00302     {
00303         offset -= pileupWindow;
00304     }
00305 
00306     if(position > pileupTail)
00307     {
00308         // This is the first time this position is being used.
00309         // reset the element for this position.
00310         resetElement(myElements[offset], position);
00311         pileupTail = position;
00312     }
00313 
00314     return(offset);
00315 }
00316 
00317 
00318 template <class PILEUP_TYPE, class FUNC_CLASS>
00319 void Pileup<PILEUP_TYPE, FUNC_CLASS>::resetElement(PILEUP_TYPE& element,
00320                                                    int position)
00321 {
00322     element.reset(position);
00323 }
00324 
00325 
00326 template <class PILEUP_TYPE, class FUNC_CLASS>
00327 void Pileup<PILEUP_TYPE, FUNC_CLASS>::addElement(PILEUP_TYPE& element,
00328                                                  SamRecord& record)
00329 {
00330     element.addEntry(record);
00331 }
00332 
00333 
00334 template <class PILEUP_TYPE, class FUNC_CLASS>
00335 void Pileup<PILEUP_TYPE, FUNC_CLASS>::analyzeElement(PILEUP_TYPE& element)
00336 {
00337     myAnalyzeFuncPtr(element);
00338 }
00339 
00340 
00341 
00342 
00343 #endif
Generated on Thu Dec 9 12:22:12 2010 for StatGen Software by  doxygen 1.6.3