SamFilter.cpp

00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 //////////////////////////////////////////////////////////////////////////
00019 
00020 
00021 #include "SamFilter.h"
00022 
00023 #include "SamQuerySeqWithRefHelper.h"
00024 #include "BaseUtilities.h"
00025 #include "SamFlag.h"
00026 
00027 SamFilter::FilterStatus SamFilter::clipOnMismatchThreshold(SamRecord& record, 
00028                                                            GenomeSequence& refSequence,
00029                                                            double mismatchThreshold)
00030 {
00031     // Read & clip from the left & right.    
00032     SamQuerySeqWithRefIter iterFromStart(record, refSequence, true);
00033     SamQuerySeqWithRefIter iterFromEnd(record, refSequence, false);
00034 
00035     SamSingleBaseMatchInfo baseMatchInfo;
00036 
00037     int32_t readLength = record.getReadLength();
00038     // Init start clip to be prior to the start index (0).
00039     const int32_t initialStartClipPos = -1;
00040     int32_t startClipPos = initialStartClipPos;
00041     // Init end clip to be past the last index (readLength).
00042     int32_t endClipPos = readLength;
00043 
00044     bool fromStartComplete = false;
00045     bool fromEndComplete = false;
00046     int32_t numBasesFromStart = 0;
00047     int32_t numBasesFromEnd = 0;
00048     int32_t numMismatchFromStart = 0;
00049     int32_t numMismatchFromEnd = 0;
00050 
00051     FilterStatus status = NONE;
00052 
00053     //////////////////////////////////////////////////////////
00054     // Determining the clip positions.
00055     while(!fromStartComplete || !fromEndComplete)
00056     {
00057         // Read from the start (left to right) on the read until
00058         // more have been read from that direction than the opposite direction.
00059         while(!fromStartComplete && 
00060               ((numBasesFromStart <= numBasesFromEnd) ||
00061                (fromEndComplete)))
00062         {
00063             if(iterFromStart.getNextMatchMismatch(baseMatchInfo) == false)
00064             {
00065                 // Nothing more to read in this direction.
00066                 fromStartComplete = true;
00067                 break;
00068             }
00069             // Got a read.  Check to see if it is to or past the last clip.
00070             if(baseMatchInfo.getQueryIndex() >= endClipPos)
00071             {
00072                 // This base is past where we are clipping, so we
00073                 // are done reading in this direction.
00074                 fromStartComplete = true;
00075                 break;
00076             }
00077             // This is an actual base read from the left to the
00078             // right, so up the counter and determine if it was a mismatch.
00079             ++numBasesFromStart;
00080 
00081             if(baseMatchInfo.getType() == SamSingleBaseMatchInfo::MISMATCH)
00082             {
00083                 // Mismatch
00084                 ++numMismatchFromStart;
00085                 // Check to see if it is over the threshold.
00086                 double mismatchPercent = 
00087                     (double)numMismatchFromStart / numBasesFromStart;
00088                 if(mismatchPercent > mismatchThreshold)
00089                 {
00090                     // Need to clip.
00091                     startClipPos = baseMatchInfo.getQueryIndex();
00092                     // Reset the counters.
00093                     numBasesFromStart = 0;
00094                     numMismatchFromStart = 0;
00095                 }
00096             }
00097         }
00098 
00099         // Now, read from right to left until more have been read
00100         // from the end than from the start.
00101         while(!fromEndComplete && 
00102               ((numBasesFromEnd <= numBasesFromStart) ||
00103                (fromStartComplete)))
00104         {
00105             if(iterFromEnd.getNextMatchMismatch(baseMatchInfo) == false)
00106             {
00107                 // Nothing more to read in this direction.
00108                 fromEndComplete = true;
00109                 break;
00110             }
00111             // Got a read.  Check to see if it is to or past the first clip.
00112             if(baseMatchInfo.getQueryIndex() <= startClipPos)
00113             {
00114                 // This base is past where we are clipping, so we
00115                 // are done reading in this direction.
00116                 fromEndComplete = true;
00117                 break;
00118             }
00119             // This is an actual base read from the right to the
00120             // left, so up the counter and determine if it was a mismatch.
00121             ++numBasesFromEnd;
00122 
00123             if(baseMatchInfo.getType() == SamSingleBaseMatchInfo::MISMATCH)
00124             {
00125                 // Mismatch
00126                 ++numMismatchFromEnd;
00127                 // Check to see if it is over the threshold.
00128                 double mismatchPercent = 
00129                     (double)numMismatchFromEnd / numBasesFromEnd;
00130                 if(mismatchPercent > mismatchThreshold)
00131                 {
00132                     // Need to clip.
00133                     endClipPos = baseMatchInfo.getQueryIndex();
00134                     // Reset the counters.
00135                     numBasesFromEnd = 0;
00136                     numMismatchFromEnd = 0;
00137                 }
00138             }
00139         }
00140     }
00141 
00142     //////////////////////////////////////////////////////////
00143     // Done determining the clip positions.
00144     // Check to see if clipping needs to be done.
00145     Cigar* cigar = record.getCigarInfo();
00146     if((startClipPos != initialStartClipPos) || (endClipPos != readLength))
00147     {
00148         // Clipping from front and/or from the end.
00149         // Check to see if the entire read was clipped.
00150         if((startClipPos + 1) >= endClipPos)
00151         {
00152             /////////////////////////////
00153             // The entire read was clipped.
00154             filterRead(record);
00155             status = FILTERED;
00156         }
00157         else
00158         {
00159             // Part of the read was clipped.
00160             status = CLIPPED;
00161             
00162             // Need to create a new cigar.
00163             CigarRoller newCigar;
00164             
00165             // Loop through, creating an updated cigar.
00166 
00167             int origCigarOpIndex = 0;
00168             // Track how many read positions are covered up to this
00169             // point by the cigar to determine up to up to what
00170             // point in the cigar is affected by this clipping.
00171             int32_t numPositions = 0;
00172 
00173             // Add 1 to the startClipPos to determine the number of
00174             // softclips since startClipPos starts at 0.
00175             int32_t numSoftClips = startClipPos + 1;
00176 
00177             // Track if any non-clips are in the new cigar.
00178             bool onlyClips = true;
00179 
00180             const Cigar::CigarOperator* op = NULL;
00181 
00182             //////////////////
00183             // Clip from front
00184             // keep any hard clips that are prior to the soft clip that
00185             // is being performed..
00186             while((origCigarOpIndex < cigar->size()) &&
00187                   (numPositions < numSoftClips))
00188             {
00189                 op = &(cigar->getOperator(origCigarOpIndex));
00190                 switch(op->operation)
00191                 {
00192                     case Cigar::hardClip:
00193                         // Keep this operation as the new clips do not
00194                         // affect other clips.
00195                         newCigar += *op;
00196                         break;
00197                     case Cigar::del:
00198                     case Cigar::skip:
00199                         // Skip and delete are going to be dropped, and
00200                         // are not in the read, so the read index doesn't
00201                         // need to be updated
00202                         break;
00203                     case Cigar::insert:
00204                     case Cigar::match:
00205                     case Cigar::mismatch:
00206                     case Cigar::softClip:
00207                         // Update the read index as these types
00208                         // are found in the read.
00209                         numPositions += op->count;
00210                         break;
00211                     case Cigar::none:
00212                     default:
00213                         // Nothing to do for none.
00214                         break;
00215                 };
00216                 ++origCigarOpIndex;
00217             }
00218              
00219             if(numSoftClips != 0)
00220             {
00221                 // Add the softclip from the front of the read.
00222                 newCigar.Add(Cigar::softClip, numSoftClips);
00223 
00224                 // Add the rest of the last Cigar operation if
00225                 // it is not entirely clipped.
00226                 if(numPositions > numSoftClips)
00227                 {
00228                     // Before adding it, check to see if the same
00229                     // operation is clipped from the end.
00230                     // numPositions greater than the endClipPos
00231                     // means that it is equal or past that position,
00232                     // so shorten the number of positions.
00233                     int32_t newCount = numPositions - numSoftClips;
00234                     if(numPositions > endClipPos)
00235                     {
00236                         newCount -= (numPositions - endClipPos);
00237                     }
00238                     if(newCount > 0)
00239                     {
00240                         newCigar.Add(op->operation, newCount);
00241                         if(!Cigar::isClip(op->operation))
00242                         {
00243                             onlyClips = false;
00244                         }
00245                     }
00246                 }
00247             }
00248 
00249             // Add operations until the point of the end clip is reached.
00250             // For example...
00251             //   2M1D3M = MMDMMM  readLength = 5
00252             // readIndex: 01 234
00253             //   at cigarOpIndex 0 (2M), numPositions = 2.
00254             //   at cigarOpIndex 1 (1D), numPositions = 2.
00255             //   at cigarOpIndex 2 (3M), numPositions = 5.
00256             // if endClipPos = 2, we still want to consume the 1D, so
00257             // need to keep looping until numPositions > endClipPos
00258             while((origCigarOpIndex < cigar->size()) &&
00259                   (numPositions <= endClipPos))
00260             {
00261                 op = &(cigar->getOperator(origCigarOpIndex));
00262                 
00263                 // Update the numPositions count if the operations indicates
00264                 // bases within the read.
00265                 if(!Cigar::foundInQuery(op->operation))
00266                 {
00267                     // This operation is not in the query read sequence,
00268                     // so it is not yet to the endClipPos, just add the
00269                     // operation do not increment the number of positions.
00270                     newCigar += *op;
00271                     if(!Cigar::isClip(op->operation))
00272                     {
00273                         onlyClips = false;
00274                     }
00275                 }
00276                 else
00277                 {
00278                     // This operation appears in the query sequence, so
00279                     // check to see if the clip occurs in this operation.
00280 
00281                     // endClipPos is 0 based & numPositions is a count.
00282                     // If endClipPos is 4, then it is the 5th position.
00283                     // If 4 positions are covered so far (numPositions = 4), 
00284                     // then we are right at endCLipPos: 4-4 = 0, none of 
00285                     // this operation should be kept. 
00286                     // If only 3 positions were covered, then we are at offset
00287                     // 3, so offset 3 should be added: 4-3 = 1.
00288                     uint32_t numPosTilClip = endClipPos - numPositions;
00289 
00290                     if(numPosTilClip < op->count)
00291                     {
00292                         // this operation is partially clipped, write the part
00293                         // that was not clipped if it is not all clipped.
00294                         if(numPosTilClip != 0)
00295                         {
00296                             newCigar.Add(op->operation,
00297                                          numPosTilClip);
00298                             if(!Cigar::isClip(op->operation))
00299                             {
00300                                 onlyClips = false;
00301                             }
00302                         }
00303                     }
00304                     else
00305                     {
00306                         // This operation is not clipped, so add it
00307                         newCigar += *op;
00308                         if(!Cigar::isClip(op->operation))
00309                         {
00310                             onlyClips = false;
00311                         }
00312                     }
00313                     // This operation occurs in the query sequence, so 
00314                     // increment the number of positions covered.
00315                     numPositions += op->count;
00316                 }
00317 
00318                 // Move to the next cigar position.
00319                 ++origCigarOpIndex;
00320             }
00321             
00322             //////////////////
00323             // Add the softclip to the back.
00324             numSoftClips = readLength - endClipPos;
00325             if(numSoftClips != 0)
00326             {
00327                 // Add the softclip to the end
00328                 newCigar.Add(Cigar::softClip, numSoftClips);
00329             }
00330 
00331             //////////////////
00332             // Add any hardclips remaining in the original cigar to the back.
00333             while(origCigarOpIndex < cigar->size())
00334             {
00335                 op = &(cigar->getOperator(origCigarOpIndex));
00336                 if(op->operation == Cigar::hardClip)
00337                 {
00338                     // Keep this operation as the new clips do not
00339                     // affect other clips.
00340                     newCigar += *op;
00341                 }
00342                 ++origCigarOpIndex;
00343             }
00344 
00345             // Check to see if the new cigar is only clips.
00346             if(onlyClips)
00347             {
00348                 // Only clips in the new cigar, so mark the read as filtered
00349                 // instead of updating the cigar.
00350                 /////////////////////////////
00351                 // The entire read was clipped.
00352                 filterRead(record);
00353                 status = FILTERED;
00354             }
00355             else
00356             {
00357                 // Part of the read was filtered, and now that we have
00358                 // an updated cigar, update the read.
00359                 record.setCigar(newCigar);
00360 
00361                 // Update the starting position if a clip was added to
00362                 // the front.
00363                 if(startClipPos != initialStartClipPos)
00364                 {
00365                     // Convert from query index to reference position (from the
00366                     // old cigar)
00367                     // Get the position for the start clipped position by
00368                     // getting the position associated with the clipped base on
00369                     // the reference.  Then add one to get to the first
00370                     // non-clipped position.
00371                     uint32_t newStartPos = 
00372                         cigar->getRefPosition(startClipPos, 
00373                                               record.get0BasedPosition()) + 1;
00374                     record.set0BasedPosition(newStartPos);
00375                 }
00376             }
00377         }
00378     }
00379     return(status);
00380 }
00381 
00382 
00383 SamFilter::FilterStatus SamFilter::filterOnMismatchQuality(SamRecord& record, 
00384                                                            GenomeSequence& refSequence,
00385                                                            uint32_t qualityThreshold, 
00386                                                            uint8_t defaultQualityInt)
00387 {
00388     uint32_t totalMismatchQuality = 
00389         sumMismatchQuality(record, refSequence, defaultQualityInt); 
00390     
00391     // If the total mismatch quality is over the threshold, 
00392     // filter the read.
00393     if(totalMismatchQuality > qualityThreshold)
00394     {
00395         filterRead(record);
00396         return(FILTERED);
00397     }
00398     return(NONE);
00399 }
00400 
00401 
00402 // NOTE: Only positions where the reference and read both have bases that
00403 //       are different and not 'N' are considered mismatches.
00404 uint32_t SamFilter::sumMismatchQuality(SamRecord& record, 
00405                                        GenomeSequence& refSequence,
00406                                        uint8_t defaultQualityInt)
00407 {
00408     // Track the mismatch info.
00409     int mismatchQual = 0;
00410     int numMismatch = 0;
00411 
00412     SamQuerySeqWithRefIter sequenceIter(record, refSequence);
00413 
00414     SamSingleBaseMatchInfo baseMatchInfo;
00415     while(sequenceIter.getNextMatchMismatch(baseMatchInfo))
00416     {
00417         if(baseMatchInfo.getType() == SamSingleBaseMatchInfo::MISMATCH)
00418         {
00419             // Got a mismatch, get the associated quality.
00420             char readQualityChar = 
00421                 record.getQuality(baseMatchInfo.getQueryIndex());
00422             uint8_t readQualityInt = 
00423                 BaseUtilities::getPhredBaseQuality(readQualityChar);
00424             
00425             if(readQualityInt == BaseUtilities::UNKNOWN_QUALITY_INT)
00426             {
00427                 // Quality was not specified, so use the configured setting.
00428                 readQualityInt = defaultQualityInt;
00429             }
00430             mismatchQual += readQualityInt;
00431             ++numMismatch;
00432         }
00433     }
00434 
00435     return(mismatchQual);
00436 }
00437 
00438 
00439 void SamFilter::filterRead(SamRecord& record)
00440 {
00441     // Filter the read by marking it as unmapped.
00442     uint16_t flag = record.getFlag(); 
00443     SamFlag::setUnmapped(flag);
00444     record.setFlag(flag);
00445 }
Generated on Wed Nov 17 15:38:27 2010 for StatGen Software by  doxygen 1.6.3