libStatGen Software  1
SamFilter.cpp
00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 //////////////////////////////////////////////////////////////////////////
00019 
00020 
00021 #include "SamFilter.h"
00022 
00023 #include "SamQuerySeqWithRefHelper.h"
00024 #include "BaseUtilities.h"
00025 #include "SamFlag.h"
00026 
00027 SamFilter::FilterStatus SamFilter::clipOnMismatchThreshold(SamRecord& record, 
00028                                                            GenomeSequence& refSequence,
00029                                                            double mismatchThreshold)
00030 {
00031     // Read & clip from the left & right.    
00032     SamQuerySeqWithRefIter iterFromFront(record, refSequence, true);
00033     SamQuerySeqWithRefIter iterFromBack(record, refSequence, false);
00034 
00035     SamSingleBaseMatchInfo baseMatchInfo;
00036 
00037     int32_t readLength = record.getReadLength();
00038     // Init last front clip to be prior to the lastFront index (0).
00039     const int32_t initialLastFrontClipPos = -1;
00040     int32_t lastFrontClipPos = initialLastFrontClipPos;
00041     // Init first back clip to be past the last index (readLength).
00042     int32_t firstBackClipPos = readLength;
00043 
00044     bool fromFrontComplete = false;
00045     bool fromBackComplete = false;
00046     int32_t numBasesFromFront = 0;
00047     int32_t numBasesFromBack = 0;
00048     int32_t numMismatchFromFront = 0;
00049     int32_t numMismatchFromBack = 0;
00050 
00051     //////////////////////////////////////////////////////////
00052     // Determining the clip positions.
00053     while(!fromFrontComplete || !fromBackComplete)
00054     {
00055         // Read from the front (left to right) of the read until
00056         // more have been read from that direction than the opposite direction.
00057         while(!fromFrontComplete && 
00058               ((numBasesFromFront <= numBasesFromBack) ||
00059                (fromBackComplete)))
00060         {
00061             if(iterFromFront.getNextMatchMismatch(baseMatchInfo) == false)
00062             {
00063                 // Nothing more to read in this direction.
00064                 fromFrontComplete = true;
00065                 break;
00066             }
00067             // Got a read.  Check to see if it is to or past the last clip.
00068             if(baseMatchInfo.getQueryIndex() >= firstBackClipPos)
00069             {
00070                 // This base is past where we are clipping, so we
00071                 // are done reading in this direction.
00072                 fromFrontComplete = true;
00073                 break;
00074             }
00075             // This is an actual base read from the left to the
00076             // right, so up the counter and determine if it was a mismatch.
00077             ++numBasesFromFront;
00078 
00079             if(baseMatchInfo.getType() == SamSingleBaseMatchInfo::MISMATCH)
00080             {
00081                 // Mismatch
00082                 ++numMismatchFromFront;
00083                 // Check to see if it is over the threshold.
00084                 double mismatchPercent = 
00085                     (double)numMismatchFromFront / numBasesFromFront;
00086                 if(mismatchPercent > mismatchThreshold)
00087                 {
00088                     // Need to clip.
00089                     lastFrontClipPos = baseMatchInfo.getQueryIndex();
00090                     // Reset the counters.
00091                     numBasesFromFront = 0;
00092                     numMismatchFromFront = 0;
00093                 }
00094             }
00095         }
00096 
00097         // Now, read from right to left until more have been read
00098         // from the back than from the front.
00099         while(!fromBackComplete && 
00100               ((numBasesFromBack <= numBasesFromFront) ||
00101                (fromFrontComplete)))
00102         {
00103             if(iterFromBack.getNextMatchMismatch(baseMatchInfo) == false)
00104             {
00105                 // Nothing more to read in this direction.
00106                 fromBackComplete = true;
00107                 break;
00108             }
00109             // Got a read.  Check to see if it is to or past the first clip.
00110             if(baseMatchInfo.getQueryIndex() <= lastFrontClipPos)
00111             {
00112                 // This base is past where we are clipping, so we
00113                 // are done reading in this direction.
00114                 fromBackComplete = true;
00115                 break;
00116             }
00117             // This is an actual base read from the right to the
00118             // left, so up the counter and determine if it was a mismatch.
00119             ++numBasesFromBack;
00120 
00121             if(baseMatchInfo.getType() == SamSingleBaseMatchInfo::MISMATCH)
00122             {
00123                 // Mismatch
00124                 ++numMismatchFromBack;
00125                 // Check to see if it is over the threshold.
00126                 double mismatchPercent = 
00127                     (double)numMismatchFromBack / numBasesFromBack;
00128                 if(mismatchPercent > mismatchThreshold)
00129                 {
00130                     // Need to clip.
00131                     firstBackClipPos = baseMatchInfo.getQueryIndex();
00132                     // Reset the counters.
00133                     numBasesFromBack = 0;
00134                     numMismatchFromBack = 0;
00135                 }
00136             }
00137         }
00138     }
00139 
00140     //////////////////////////////////////////////////////////
00141     // Done determining the clip positions, so clip.
00142     // To determine the number of clips from the front, add 1 to the
00143     // lastFrontClipPos since the index starts at 0.
00144     // To determine the number of clips from the back, subtract the
00145     // firstBackClipPos from the readLength.
00146     // Example:
00147     // Pos:  012345
00148     // Read: AAAAAA
00149     // Read Length = 6.  If lastFrontClipPos = 2 and firstBackClipPos = 4, numFrontClips = 3 & numBack = 2.
00150     return(softClip(record, lastFrontClipPos + 1, readLength - firstBackClipPos));
00151 }
00152 
00153 
00154 // Soft clip the record from the front and/or the back.
00155 SamFilter::FilterStatus SamFilter::softClip(SamRecord& record, 
00156                                             int32_t numFrontClips,
00157                                             int32_t numBackClips)
00158 {
00159     //////////////////////////////////////////////////////////
00160     Cigar* cigar = record.getCigarInfo();
00161     FilterStatus status = NONE;
00162     int32_t startPos = record.get0BasedPosition();
00163     CigarRoller updatedCigar;
00164 
00165     status = softClip(*cigar, numFrontClips, numBackClips, 
00166                       startPos, updatedCigar);
00167 
00168     if(status == FILTERED)
00169     {
00170         /////////////////////////////
00171         // The entire read is clipped, so rather than clipping it,
00172         // filter it out.
00173         filterRead(record);
00174         return(FILTERED);
00175     }
00176     else if(status == CLIPPED)
00177     {
00178         // Part of the read was clipped, and now that we have
00179         // an updated cigar, update the read.
00180         record.setCigar(updatedCigar);
00181 
00182         // Update the starting position.
00183         record.set0BasedPosition(startPos);
00184     }
00185     return(status);
00186 }
00187 
00188 
00189 // Soft clip the cigar from the front and/or the back, writing the value
00190 // into the new cigar.
00191 SamFilter::FilterStatus SamFilter::softClip(Cigar& oldCigar, 
00192                                             int32_t numFrontClips,
00193                                             int32_t numBackClips,
00194                                             int32_t& startPos,
00195                                             CigarRoller& updatedCigar)
00196 {
00197     int32_t readLength = oldCigar.getExpectedQueryBaseCount();
00198     int32_t endClipPos = readLength - numBackClips;
00199     FilterStatus status = NONE;
00200 
00201     if((numFrontClips != 0) || (numBackClips != 0))
00202     {
00203         // Clipping from front and/or from the back.
00204 
00205         // Check to see if the entire read was clipped.
00206         int32_t totalClips = numFrontClips + numBackClips;
00207         if(totalClips >= readLength)
00208         {
00209             /////////////////////////////
00210             // The entire read is clipped, so rather than clipping it,
00211             // filter it out.
00212             return(FILTERED);
00213         }
00214          
00215         // Part of the read was clipped.
00216         status = CLIPPED;
00217             
00218         // Loop through, creating an updated cigar.
00219         int origCigarOpIndex = 0;
00220         
00221         // Track how many read positions are covered up to this
00222         // point by the cigar to determine up to up to what
00223         // point in the cigar is affected by this clipping.
00224         int32_t numPositions = 0;
00225         
00226         // Track if any non-clips are in the new cigar.
00227         bool onlyClips = true;
00228 
00229         const Cigar::CigarOperator* op = NULL;
00230 
00231         //////////////////
00232         // Clip from front
00233         while((origCigarOpIndex < oldCigar.size()) &&
00234               (numPositions < numFrontClips))
00235         {
00236             op = &(oldCigar.getOperator(origCigarOpIndex));
00237             switch(op->operation)
00238             {
00239                 case Cigar::hardClip:
00240                     // Keep this operation as the new clips do not
00241                     // affect other clips.
00242                     updatedCigar += *op;
00243                     break;
00244                 case Cigar::del:
00245                 case Cigar::skip:
00246                     // Skip and delete are going to be dropped, and
00247                     // are not in the read, so the read index doesn't
00248                     // need to be updated
00249                     break;
00250                 case Cigar::insert:
00251                 case Cigar::match:
00252                 case Cigar::mismatch:
00253                 case Cigar::softClip:
00254                     // Update the read index as these types
00255                     // are found in the read.
00256                     numPositions += op->count;
00257                     break;
00258                 case Cigar::none:
00259                 default:
00260                     // Nothing to do for none.
00261                     break;
00262             };
00263             ++origCigarOpIndex;
00264         }
00265     
00266         // If bases were clipped from the front, add the clip and
00267         // any partial cigar operation as necessary.
00268         if(numFrontClips != 0)
00269         {
00270             // Add the softclip to the front of the read.
00271             updatedCigar.Add(Cigar::softClip, numFrontClips);
00272         
00273             // Add the rest of the last Cigar operation if
00274             // it is not entirely clipped.
00275             int32_t newCount = numPositions - numFrontClips;
00276             if(newCount > 0)
00277             {
00278                 // Before adding it, check to see if the same
00279                 // operation is clipped from the end.
00280                 // numPositions greater than the endClipPos
00281                 // means that it is equal or past that position,
00282                 // so shorten the number of positions.
00283                 if(numPositions > endClipPos)
00284                 {
00285                     newCount -= (numPositions - endClipPos);
00286                 }
00287                 if(newCount > 0)
00288                 {
00289                     updatedCigar.Add(op->operation, newCount);
00290                     if(!Cigar::isClip(op->operation))
00291                     {
00292                         onlyClips = false;
00293                     }
00294                 }
00295             }
00296         }
00297     
00298         // Add operations until the point of the end clip is reached.
00299         // For example...
00300         //   2M1D3M = MMDMMM  readLength = 5
00301         // readIndex: 01 234
00302         //   at cigarOpIndex 0 (2M), numPositions = 2.
00303         //   at cigarOpIndex 1 (1D), numPositions = 2.
00304         //   at cigarOpIndex 2 (3M), numPositions = 5.
00305         // if endClipPos = 2, we still want to consume the 1D, so
00306         // need to keep looping until numPositions > endClipPos
00307         while((origCigarOpIndex < oldCigar.size()) &&
00308               (numPositions <= endClipPos))
00309         {
00310             op = &(oldCigar.getOperator(origCigarOpIndex));
00311             
00312             // Update the numPositions count if the operations indicates
00313             // bases within the read.
00314             if(!Cigar::foundInQuery(op->operation))
00315             {
00316                 // This operation is not in the query read sequence,
00317                 // so it is not yet to the endClipPos, just add the
00318                 // operation do not increment the number of positions.
00319                 updatedCigar += *op;
00320                 if(!Cigar::isClip(op->operation))
00321                 {
00322                     onlyClips = false;
00323                 }
00324             }
00325             else
00326             {
00327                 // This operation appears in the query sequence, so
00328                 // check to see if the clip occurs in this operation.
00329                 
00330                 // endClipPos is 0 based & numPositions is a count.
00331                 // If endClipPos is 4, then it is the 5th position.
00332                 // If 4 positions are covered so far (numPositions = 4), 
00333                 // then we are right at endCLipPos: 4-4 = 0, none of 
00334                 // this operation should be kept. 
00335                 // If only 3 positions were covered, then we are at offset
00336                 // 3, so offset 3 should be added: 4-3 = 1.
00337                 uint32_t numPosTilClip = endClipPos - numPositions;
00338                 
00339                 if(numPosTilClip < op->count)
00340                 {
00341                     // this operation is partially clipped, write the part
00342                     // that was not clipped if it is not all clipped.
00343                     if(numPosTilClip != 0)
00344                     {
00345                         updatedCigar.Add(op->operation,
00346                                      numPosTilClip);
00347                         if(!Cigar::isClip(op->operation))
00348                         {
00349                             onlyClips = false;
00350                         }
00351                     }
00352                 }
00353                 else
00354                 {
00355                     // This operation is not clipped, so add it
00356                     updatedCigar += *op;
00357                     if(!Cigar::isClip(op->operation))
00358                     {
00359                         onlyClips = false;
00360                     }
00361                 }
00362                 // This operation occurs in the query sequence, so 
00363                 // increment the number of positions covered.
00364                 numPositions += op->count;
00365             }
00366 
00367             // Move to the next cigar position.
00368             ++origCigarOpIndex;
00369         }
00370             
00371         //////////////////
00372         // Add the softclip to the back.
00373         if(numBackClips != 0)
00374         {
00375             // Add the softclip to the end
00376             updatedCigar.Add(Cigar::softClip, numBackClips);
00377         }
00378         
00379         //////////////////
00380         // Add any hardclips remaining in the original cigar to the back.
00381         while(origCigarOpIndex < oldCigar.size())
00382         {
00383             op = &(oldCigar.getOperator(origCigarOpIndex));
00384             if(op->operation == Cigar::hardClip)
00385             {
00386                 // Keep this operation as the new clips do not
00387                 // affect other clips.
00388                 updatedCigar += *op;
00389             }
00390             ++origCigarOpIndex;
00391         }
00392         
00393         // Check to see if the new cigar is only clips.
00394         if(onlyClips)
00395         {
00396             // Only clips in the new cigar, so mark the read as filtered
00397             // instead of updating the cigar.
00398             /////////////////////////////
00399             // The entire read was clipped.
00400             status = FILTERED;
00401         }
00402         else
00403         {
00404             // Part of the read was clipped.
00405             // Update the starting position if a clip was added to
00406             // the front.
00407             if(numFrontClips > 0)
00408             {
00409                 // Convert from query index to reference position (from the
00410                 // old cigar)
00411                 // Get the position for the last front clipped position by
00412                 // getting the position associated with the clipped base on
00413                 // the reference.  Then add one to get to the first
00414                 // non-clipped position.
00415                 int32_t lastFrontClipPos = numFrontClips - 1;
00416                 int32_t newStartPos = oldCigar.getRefPosition(lastFrontClipPos, 
00417                                                               startPos);
00418                 if(newStartPos != Cigar::INDEX_NA)
00419                 {
00420                     // Add one to get first non-clipped position.
00421                     startPos = newStartPos + 1;
00422                 }
00423             }
00424         }
00425     }
00426     return(status);
00427 }
00428 
00429 
00430 SamFilter::FilterStatus SamFilter::filterOnMismatchQuality(SamRecord& record, 
00431                                                            GenomeSequence& refSequence,
00432                                                            uint32_t qualityThreshold, 
00433                                                            uint8_t defaultQualityInt)
00434 {
00435     uint32_t totalMismatchQuality = 
00436         sumMismatchQuality(record, refSequence, defaultQualityInt); 
00437     
00438     // If the total mismatch quality is over the threshold, 
00439     // filter the read.
00440     if(totalMismatchQuality > qualityThreshold)
00441     {
00442         filterRead(record);
00443         return(FILTERED);
00444     }
00445     return(NONE);
00446 }
00447 
00448 
00449 // NOTE: Only positions where the reference and read both have bases that
00450 //       are different and not 'N' are considered mismatches.
00451 uint32_t SamFilter::sumMismatchQuality(SamRecord& record, 
00452                                        GenomeSequence& refSequence,
00453                                        uint8_t defaultQualityInt)
00454 {
00455     // Track the mismatch info.
00456     int mismatchQual = 0;
00457     int numMismatch = 0;
00458 
00459     SamQuerySeqWithRefIter sequenceIter(record, refSequence);
00460 
00461     SamSingleBaseMatchInfo baseMatchInfo;
00462     while(sequenceIter.getNextMatchMismatch(baseMatchInfo))
00463     {
00464         if(baseMatchInfo.getType() == SamSingleBaseMatchInfo::MISMATCH)
00465         {
00466             // Got a mismatch, get the associated quality.
00467             char readQualityChar = 
00468                 record.getQuality(baseMatchInfo.getQueryIndex());
00469             uint8_t readQualityInt = 
00470                 BaseUtilities::getPhredBaseQuality(readQualityChar);
00471             
00472             if(readQualityInt == BaseUtilities::UNKNOWN_QUALITY_INT)
00473             {
00474                 // Quality was not specified, so use the configured setting.
00475                 readQualityInt = defaultQualityInt;
00476             }
00477             mismatchQual += readQualityInt;
00478             ++numMismatch;
00479         }
00480     }
00481 
00482     return(mismatchQual);
00483 }
00484 
00485 
00486 void SamFilter::filterRead(SamRecord& record)
00487 {
00488     // Filter the read by marking it as unmapped.
00489     uint16_t flag = record.getFlag(); 
00490     SamFlag::setUnmapped(flag);
00491     // Clear N/A flags.
00492     flag &= ~SamFlag::PROPER_PAIR;
00493     flag &= ~SamFlag::SECONDARY_ALIGNMENT;
00494     flag &= ~SamFlag::SUPPLEMENTARY_ALIGNMENT;
00495     record.setFlag(flag);
00496     // Clear Cigar
00497     record.setCigar("*");
00498     // Clear mapping quality
00499     record.setMapQuality(0);
00500 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends