Pileup.h

00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #ifndef __PILEUP_H__
00019 #define __PILEUP_H__
00020 
00021 template <class PILEUP_TYPE>
00022 class defaultPileup
00023 {
00024 public:
00025     bool operator() (PILEUP_TYPE& element)
00026     {
00027         element.analyze();
00028         return(true);
00029     }
00030     void analyze(PILEUP_TYPE element)
00031     {
00032         element.analyze();
00033     }
00034 };
00035 
00036 template <class PILEUP_TYPE>
00037 void defaultPileupAnalyze(PILEUP_TYPE& element)
00038 {
00039     element.analyze();
00040 }
00041 
00042 
00043 template <class PILEUP_TYPE, 
00044           class FUNC_CLASS = defaultPileup<PILEUP_TYPE> >
00045 class Pileup
00046 {
00047 public:
00048     Pileup(const FUNC_CLASS& fp = FUNC_CLASS());
00049 
00050     Pileup(int window, const FUNC_CLASS& fp = FUNC_CLASS());
00051 
00052     // Performs a pileup on the specified file.
00053     // Returns 0 for success and non-zero for failure.
00054     // If excludeFlag is specified, any bit may be set for the
00055     // record to be dropped.
00056     // Defaulted to exclude:
00057     //    * unmapped, 
00058     //    * not primary alignment
00059     //    * failed platform/vendor quality check
00060     //    * PCR or optical duplicate
00061     // If includeFlag is specified, every bit must be set for the
00062     // record to be included - defaulted to 0 - no bits are required to be set.
00063     virtual int processFile(const std::string& fileName, 
00064                             uint16_t excludeFlag = 0x0704, 
00065                             uint16_t includeFlag = 0);
00066 
00067     // This should be overwritten to perform any necessary
00068     // filtering on the record.  processFile does no
00069     // filtering and process all records in the file.
00070     virtual void processAlignment(SamRecord& record);
00071    
00072 
00073     void flushPileup();
00074 
00075 protected:
00076     FUNC_CLASS myAnalyzeFuncPtr;
00077 
00078 private:
00079 
00080     // Always need the reference position.
00081     void addAlignmentPosition(int refPosition, SamRecord& record);
00082 
00083 
00084     void flushPileup(int refID, int refPosition);
00085     void flushPileup(int refPosition);
00086     
00087     // Get the position in the myElements container that is associated
00088     // with the specified position.  If the specified position cannot
00089     // fit within the myElements container, -1 is returned.
00090     int pileupPosition(int refPosition);
00091 
00092     virtual void resetElement(PILEUP_TYPE& element, int position);
00093     virtual void addElement(PILEUP_TYPE& element, SamRecord& record);
00094     virtual void analyzeElement(PILEUP_TYPE& element);
00095 
00096 
00097     std::vector<PILEUP_TYPE> myElements;
00098 
00099     int    pileupStart;
00100     int    pileupHead;
00101     int    pileupTail;
00102     int    pileupWindow;
00103 
00104     int myCurrentRefID;
00105 };
00106 
00107 
00108 template <class PILEUP_TYPE, class FUNC_CLASS>
00109 Pileup<PILEUP_TYPE, FUNC_CLASS>::Pileup(const FUNC_CLASS& fp)
00110     : myAnalyzeFuncPtr(fp),
00111       myElements(),
00112       pileupStart(0),
00113       pileupHead(0),
00114       pileupTail(0),
00115       pileupWindow(1024),
00116       myCurrentRefID(-2)
00117 {
00118     // Not using pointers since this is templated.
00119     myElements.resize(pileupWindow, PILEUP_TYPE());
00120 }
00121 
00122 
00123 template <class PILEUP_TYPE, class FUNC_CLASS>
00124 Pileup<PILEUP_TYPE, FUNC_CLASS>::Pileup(int window, const FUNC_CLASS& fp)
00125     : myAnalyzeFuncPtr(fp),
00126       myElements(),
00127       pileupStart(0),
00128       pileupHead(0),
00129       pileupTail(0),
00130       pileupWindow(window),
00131       myCurrentRefID(-2)
00132 {
00133     // Not using pointers since this is templated.
00134     myElements.resize(window, PILEUP_TYPE());
00135 }
00136 
00137 
00138 template <class PILEUP_TYPE, class FUNC_CLASS>
00139 int Pileup<PILEUP_TYPE, FUNC_CLASS>::processFile(const std::string& fileName, 
00140                                                  uint16_t excludeFlag,
00141                                                  uint16_t includeFlag)
00142 {
00143     SamFile samIn;
00144     SamFileHeader header;
00145     SamRecord record;
00146     
00147     if(!samIn.OpenForRead(fileName.c_str()))
00148     {
00149         fprintf(stderr, "%s\n", samIn.GetStatusMessage());
00150         return(samIn.GetStatus());
00151     }
00152     
00153     if(!samIn.ReadHeader(header))
00154     {
00155         fprintf(stderr, "%s\n", samIn.GetStatusMessage());
00156         return(samIn.GetStatus());
00157     }
00158 
00159     // The file needs to be sorted by coordinate.
00160     samIn.setSortedValidation(SamFile::COORDINATE);
00161 
00162     // Iterate over all records
00163     while (samIn.ReadRecord(header, record))
00164     {
00165         uint16_t flag = record.getFlag();
00166         if(flag & excludeFlag)
00167         {
00168             // This record has an excluded flag set, 
00169             // so continue to the next one.
00170             continue;
00171         }
00172         if((flag & includeFlag) != includeFlag)
00173         {
00174             // This record does not have all required flags set, 
00175             // so continue to the next one.
00176             continue;
00177         }
00178         processAlignment(record);
00179     }
00180 
00181     flushPileup();
00182 
00183     int returnValue = 0;
00184     if(samIn.GetStatus() != SamStatus::NO_MORE_RECS)
00185     {
00186         // Failed to read a record.
00187         fprintf(stderr, "%s\n", samIn.GetStatusMessage());
00188         returnValue = samIn.GetStatus();
00189     }
00190     return(returnValue);  
00191 }
00192 
00193 
00194 template <class PILEUP_TYPE, class FUNC_CLASS>
00195 void Pileup<PILEUP_TYPE, FUNC_CLASS>::processAlignment(SamRecord& record)
00196 {
00197     int refPosition = record.get0BasedPosition();
00198     int refID = record.getReferenceID();
00199 
00200     // Flush any elements from the pileup that are prior to this record
00201     // since the file is sorted, we are done with those positions.
00202     flushPileup(refID, refPosition);
00203     
00204     // Loop through for each reference position covered by the record.
00205     // It is up to the PILEUP_TYPE to handle insertions/deletions, etc
00206     // that are related with the given reference position.
00207     for(; refPosition <= record.get0BasedAlignmentEnd(); ++refPosition)
00208     {
00209         addAlignmentPosition(refPosition, record);
00210     }
00211 }
00212 
00213 
00214 template <class PILEUP_TYPE, class FUNC_CLASS>
00215 void Pileup<PILEUP_TYPE, FUNC_CLASS>::flushPileup()
00216 {
00217     while (pileupHead <= pileupTail)
00218     {
00219         flushPileup(pileupHead+1);
00220     }
00221     pileupStart = pileupHead = pileupTail = 0;
00222 }
00223 
00224 
00225 // Always need the reference position.
00226 template <class PILEUP_TYPE, class FUNC_CLASS>
00227 void Pileup<PILEUP_TYPE, FUNC_CLASS>::addAlignmentPosition(int refPosition,
00228                                                            SamRecord& record)
00229 {
00230     int offset = pileupPosition(refPosition);
00231     
00232     if((offset < 0) || (offset >= pileupWindow))
00233     {
00234         std::cerr << "Pileup Buffer Overflow: position = " << refPosition
00235                   << "; refID = " << record.getReferenceID() 
00236                   << "; recStartPos = " << record.get1BasedPosition()
00237                   << "; pileupStart = " << pileupStart
00238                   << "; pileupHead = " << pileupHead
00239                   << "; pileupTail = " << pileupTail;
00240     }
00241 
00242     addElement(myElements[offset], record);
00243 }
00244 
00245 
00246 template <class PILEUP_TYPE, class FUNC_CLASS>
00247 void Pileup<PILEUP_TYPE, FUNC_CLASS>::flushPileup(int refID, int position)
00248 {
00249     // if new chromosome, flush the entire pileup.
00250     if(refID != myCurrentRefID)
00251     {
00252         // New chromosome, flush everything.
00253         flushPileup();
00254         myCurrentRefID = refID;
00255         // Set the starts to this position - first entry
00256         // in this chromosome.
00257         pileupStart = pileupHead = position;
00258     }
00259     else
00260     {
00261         // on the same chromosome, so flush just up to this new position.
00262         flushPileup(position);
00263     }
00264 }
00265 
00266 
00267 template <class PILEUP_TYPE, class FUNC_CLASS>
00268 void Pileup<PILEUP_TYPE, FUNC_CLASS>::flushPileup(int position)
00269 {
00270     // Flush up to this new position.
00271     while((pileupHead < position) && (pileupHead <= pileupTail))
00272     {
00273         analyzeElement(myElements[pileupHead - pileupStart]);
00274 
00275         pileupHead++;
00276         
00277         if(pileupHead - pileupStart >= pileupWindow)
00278             pileupStart += pileupWindow;
00279     }
00280 
00281     // If pileupHead != position, then we stopped because we had already
00282     // flushed all the data we had, just set the head and start to the position.
00283     if(pileupHead != position)
00284     {
00285         pileupHead = pileupStart = position;
00286     }
00287 }
00288 
00289 
00290 // Get the position in the myElements container that is associated
00291 // with the specified position.  If the specified position cannot
00292 // fit within the myElements container, -1 is returned.
00293 template <class PILEUP_TYPE, class FUNC_CLASS>
00294 int Pileup<PILEUP_TYPE, FUNC_CLASS>::pileupPosition(int position)
00295 {
00296     if((position < pileupHead) || (position > (pileupHead + pileupWindow)))
00297     {
00298         return -1;
00299         error("Pileup Buffer Overflow");
00300     }    
00301 
00302     int offset = position - pileupStart;
00303     
00304     if(offset >= pileupWindow)
00305     {
00306         offset -= pileupWindow;
00307     }
00308 
00309     if(position > pileupTail)
00310     {
00311         // This is the first time this position is being used.
00312         // reset the element for this position.
00313         resetElement(myElements[offset], position);
00314         pileupTail = position;
00315     }
00316 
00317     return(offset);
00318 }
00319 
00320 
00321 template <class PILEUP_TYPE, class FUNC_CLASS>
00322 void Pileup<PILEUP_TYPE, FUNC_CLASS>::resetElement(PILEUP_TYPE& element,
00323                                                    int position)
00324 {
00325     element.reset(position);
00326 }
00327 
00328 
00329 template <class PILEUP_TYPE, class FUNC_CLASS>
00330 void Pileup<PILEUP_TYPE, FUNC_CLASS>::addElement(PILEUP_TYPE& element,
00331                                                  SamRecord& record)
00332 {
00333     element.addEntry(record);
00334 }
00335 
00336 
00337 template <class PILEUP_TYPE, class FUNC_CLASS>
00338 void Pileup<PILEUP_TYPE, FUNC_CLASS>::analyzeElement(PILEUP_TYPE& element)
00339 {
00340     myAnalyzeFuncPtr(element);
00341 }
00342 
00343 
00344 
00345 
00346 #endif
Generated on Wed Nov 17 15:38:27 2010 for StatGen Software by  doxygen 1.6.3