libStatGen Software  1
SamFilter Class Reference

Class for helping to filter a SAM/BAM record. More...

#include <SamFilter.h>

List of all members.

Public Types

enum  FilterStatus { NONE, CLIPPED, FILTERED }
 Enum describing what sort of filtering was done. More...

Static Public Member Functions

static FilterStatus clipOnMismatchThreshold (SamRecord &record, GenomeSequence &refSequence, double mismatchThreshold)
 Clip the read based on the specified mismatch threshold.
static FilterStatus softClip (SamRecord &record, int32_t numFrontClips, int32_t numBackClips)
 Soft clip the record from the front and/or the back.
static FilterStatus softClip (Cigar &oldCigar, int32_t numFrontClips, int32_t numBackClips, int32_t &startPos, CigarRoller &updatedCigar)
 Soft clip the cigar from the front and/or the back, writing the value into the new cigar, updatedCigar & startPos are only updated if the return FilterStatus is CLIPPED.
static FilterStatus filterOnMismatchQuality (SamRecord &record, GenomeSequence &refSequence, uint32_t qualityThreshold, uint8_t defaultQualityInt)
 Filter the read based on the specified quality threshold.
static uint32_t sumMismatchQuality (SamRecord &record, GenomeSequence &refSequence, uint8_t defaultQualityInt)
 Get the sum of the qualities of all mismatches in the record.
static void filterRead (SamRecord &record)
 Filter the read by marking it as unmapped.

Detailed Description

Class for helping to filter a SAM/BAM record.

Definition at line 25 of file SamFilter.h.


Member Enumeration Documentation

Enum describing what sort of filtering was done.

Enumerator:
NONE 

The filter did not affect the read.

CLIPPED 

Filtering clipped the read.

FILTERED 

Filtering caused the read to be modified to unmapped.

Definition at line 29 of file SamFilter.h.

                      {
        NONE,    ///< The filter did not affect the read.
        CLIPPED, ///< Filtering clipped the read.
        FILTERED ///< Filtering caused the read to be modified to unmapped.
    };

Member Function Documentation

SamFilter::FilterStatus SamFilter::clipOnMismatchThreshold ( SamRecord record,
GenomeSequence refSequence,
double  mismatchThreshold 
) [static]

Clip the read based on the specified mismatch threshold.

Returns:
how the read was affected, NONE if the read was not modified, CLIPPED if the read was clipped, FILTERED if the whole read would have been clipped so instead the read was modified to unmapped.

Definition at line 27 of file SamFilter.cpp.

References SamQuerySeqWithRefIter::getNextMatchMismatch(), SamSingleBaseMatchInfo::getQueryIndex(), SamRecord::getReadLength(), SamSingleBaseMatchInfo::getType(), and softClip().

{
    // Read & clip from the left & right.    
    SamQuerySeqWithRefIter iterFromFront(record, refSequence, true);
    SamQuerySeqWithRefIter iterFromBack(record, refSequence, false);

    SamSingleBaseMatchInfo baseMatchInfo;

    int32_t readLength = record.getReadLength();
    // Init last front clip to be prior to the lastFront index (0).
    const int32_t initialLastFrontClipPos = -1;
    int32_t lastFrontClipPos = initialLastFrontClipPos;
    // Init first back clip to be past the last index (readLength).
    int32_t firstBackClipPos = readLength;

    bool fromFrontComplete = false;
    bool fromBackComplete = false;
    int32_t numBasesFromFront = 0;
    int32_t numBasesFromBack = 0;
    int32_t numMismatchFromFront = 0;
    int32_t numMismatchFromBack = 0;

    //////////////////////////////////////////////////////////
    // Determining the clip positions.
    while(!fromFrontComplete || !fromBackComplete)
    {
        // Read from the front (left to right) of the read until
        // more have been read from that direction than the opposite direction.
        while(!fromFrontComplete && 
              ((numBasesFromFront <= numBasesFromBack) ||
               (fromBackComplete)))
        {
            if(iterFromFront.getNextMatchMismatch(baseMatchInfo) == false)
            {
                // Nothing more to read in this direction.
                fromFrontComplete = true;
                break;
            }
            // Got a read.  Check to see if it is to or past the last clip.
            if(baseMatchInfo.getQueryIndex() >= firstBackClipPos)
            {
                // This base is past where we are clipping, so we
                // are done reading in this direction.
                fromFrontComplete = true;
                break;
            }
            // This is an actual base read from the left to the
            // right, so up the counter and determine if it was a mismatch.
            ++numBasesFromFront;

            if(baseMatchInfo.getType() == SamSingleBaseMatchInfo::MISMATCH)
            {
                // Mismatch
                ++numMismatchFromFront;
                // Check to see if it is over the threshold.
                double mismatchPercent = 
                    (double)numMismatchFromFront / numBasesFromFront;
                if(mismatchPercent > mismatchThreshold)
                {
                    // Need to clip.
                    lastFrontClipPos = baseMatchInfo.getQueryIndex();
                    // Reset the counters.
                    numBasesFromFront = 0;
                    numMismatchFromFront = 0;
                }
            }
        }

        // Now, read from right to left until more have been read
        // from the back than from the front.
        while(!fromBackComplete && 
              ((numBasesFromBack <= numBasesFromFront) ||
               (fromFrontComplete)))
        {
            if(iterFromBack.getNextMatchMismatch(baseMatchInfo) == false)
            {
                // Nothing more to read in this direction.
                fromBackComplete = true;
                break;
            }
            // Got a read.  Check to see if it is to or past the first clip.
            if(baseMatchInfo.getQueryIndex() <= lastFrontClipPos)
            {
                // This base is past where we are clipping, so we
                // are done reading in this direction.
                fromBackComplete = true;
                break;
            }
            // This is an actual base read from the right to the
            // left, so up the counter and determine if it was a mismatch.
            ++numBasesFromBack;

            if(baseMatchInfo.getType() == SamSingleBaseMatchInfo::MISMATCH)
            {
                // Mismatch
                ++numMismatchFromBack;
                // Check to see if it is over the threshold.
                double mismatchPercent = 
                    (double)numMismatchFromBack / numBasesFromBack;
                if(mismatchPercent > mismatchThreshold)
                {
                    // Need to clip.
                    firstBackClipPos = baseMatchInfo.getQueryIndex();
                    // Reset the counters.
                    numBasesFromBack = 0;
                    numMismatchFromBack = 0;
                }
            }
        }
    }

    //////////////////////////////////////////////////////////
    // Done determining the clip positions, so clip.
    // To determine the number of clips from the front, add 1 to the
    // lastFrontClipPos since the index starts at 0.
    // To determine the number of clips from the back, subtract the
    // firstBackClipPos from the readLength.
    // Example:
    // Pos:  012345
    // Read: AAAAAA
    // Read Length = 6.  If lastFrontClipPos = 2 and firstBackClipPos = 4, numFrontClips = 3 & numBack = 2.
    return(softClip(record, lastFrontClipPos + 1, readLength - firstBackClipPos));
}
SamFilter::FilterStatus SamFilter::filterOnMismatchQuality ( SamRecord record,
GenomeSequence refSequence,
uint32_t  qualityThreshold,
uint8_t  defaultQualityInt 
) [static]

Filter the read based on the specified quality threshold.

Returns:
how the read was affected, NONE if the read was not modified, FILTERED if the read was modified to unmapped because it was over the quality threshold.

Definition at line 430 of file SamFilter.cpp.

References FILTERED, filterRead(), NONE, and sumMismatchQuality().

{
    uint32_t totalMismatchQuality = 
        sumMismatchQuality(record, refSequence, defaultQualityInt); 
    
    // If the total mismatch quality is over the threshold, 
    // filter the read.
    if(totalMismatchQuality > qualityThreshold)
    {
        filterRead(record);
        return(FILTERED);
    }
    return(NONE);
}
SamFilter::FilterStatus SamFilter::softClip ( SamRecord record,
int32_t  numFrontClips,
int32_t  numBackClips 
) [static]

Soft clip the record from the front and/or the back.

Parameters:
recordrecord to be clipped (input/output parameter).
numFrontClipsnumber of bases that should be clipped from the front of the sequence read. (total count, including any that are already clipped.)
backClipPosnumber of bases that should be clipped from the back of the sequence read. (total count, including any that are already clipped.)

Definition at line 155 of file SamFilter.cpp.

References CLIPPED, FILTERED, filterRead(), SamRecord::get0BasedPosition(), SamRecord::getCigarInfo(), NONE, SamRecord::set0BasedPosition(), and SamRecord::setCigar().

Referenced by clipOnMismatchThreshold().

{
    //////////////////////////////////////////////////////////
    Cigar* cigar = record.getCigarInfo();
    FilterStatus status = NONE;
    int32_t startPos = record.get0BasedPosition();
    CigarRoller updatedCigar;

    status = softClip(*cigar, numFrontClips, numBackClips, 
                      startPos, updatedCigar);

    if(status == FILTERED)
    {
        /////////////////////////////
        // The entire read is clipped, so rather than clipping it,
        // filter it out.
        filterRead(record);
        return(FILTERED);
    }
    else if(status == CLIPPED)
    {
        // Part of the read was clipped, and now that we have
        // an updated cigar, update the read.
        record.setCigar(updatedCigar);

        // Update the starting position.
        record.set0BasedPosition(startPos);
    }
    return(status);
}
SamFilter::FilterStatus SamFilter::softClip ( Cigar oldCigar,
int32_t  numFrontClips,
int32_t  numBackClips,
int32_t &  startPos,
CigarRoller updatedCigar 
) [static]

Soft clip the cigar from the front and/or the back, writing the value into the new cigar, updatedCigar & startPos are only updated if the return FilterStatus is CLIPPED.

Parameters:
oldCigarcigar prior to clipping
numFrontClipsnumber of bases that should be clipped from the front of the sequence read. (total count, including any that are already clipped.)
numBackClipsnumber of bases that should be clipped from the back of the sequence read. (total count, including any that are already clipped.)
startPos0-based start position associated with the cigar prior to updating (input) and set to the 0-based start position after updating (output) the cigar if it was CLIPPED.
updatedCigarset to the clipped cigar if CLIPPED (output param).

Definition at line 191 of file SamFilter.cpp.

References CigarRoller::Add(), CLIPPED, Cigar::del, FILTERED, Cigar::foundInQuery(), Cigar::getExpectedQueryBaseCount(), Cigar::getOperator(), Cigar::getRefPosition(), Cigar::hardClip, Cigar::INDEX_NA, Cigar::insert, Cigar::isClip(), Cigar::match, Cigar::mismatch, NONE, Cigar::none, Cigar::size(), Cigar::skip, and Cigar::softClip.

{
    int32_t readLength = oldCigar.getExpectedQueryBaseCount();
    int32_t endClipPos = readLength - numBackClips;
    FilterStatus status = NONE;

    if((numFrontClips != 0) || (numBackClips != 0))
    {
        // Clipping from front and/or from the back.

        // Check to see if the entire read was clipped.
        int32_t totalClips = numFrontClips + numBackClips;
        if(totalClips >= readLength)
        {
            /////////////////////////////
            // The entire read is clipped, so rather than clipping it,
            // filter it out.
            return(FILTERED);
        }
         
        // Part of the read was clipped.
        status = CLIPPED;
            
        // Loop through, creating an updated cigar.
        int origCigarOpIndex = 0;
        
        // Track how many read positions are covered up to this
        // point by the cigar to determine up to up to what
        // point in the cigar is affected by this clipping.
        int32_t numPositions = 0;
        
        // Track if any non-clips are in the new cigar.
        bool onlyClips = true;

        const Cigar::CigarOperator* op = NULL;

        //////////////////
        // Clip from front
        while((origCigarOpIndex < oldCigar.size()) &&
              (numPositions < numFrontClips))
        {
            op = &(oldCigar.getOperator(origCigarOpIndex));
            switch(op->operation)
            {
                case Cigar::hardClip:
                    // Keep this operation as the new clips do not
                    // affect other clips.
                    updatedCigar += *op;
                    break;
                case Cigar::del:
                case Cigar::skip:
                    // Skip and delete are going to be dropped, and
                    // are not in the read, so the read index doesn't
                    // need to be updated
                    break;
                case Cigar::insert:
                case Cigar::match:
                case Cigar::mismatch:
                case Cigar::softClip:
                    // Update the read index as these types
                    // are found in the read.
                    numPositions += op->count;
                    break;
                case Cigar::none:
                default:
                    // Nothing to do for none.
                    break;
            };
            ++origCigarOpIndex;
        }
    
        // If bases were clipped from the front, add the clip and
        // any partial cigar operation as necessary.
        if(numFrontClips != 0)
        {
            // Add the softclip to the front of the read.
            updatedCigar.Add(Cigar::softClip, numFrontClips);
        
            // Add the rest of the last Cigar operation if
            // it is not entirely clipped.
            int32_t newCount = numPositions - numFrontClips;
            if(newCount > 0)
            {
                // Before adding it, check to see if the same
                // operation is clipped from the end.
                // numPositions greater than the endClipPos
                // means that it is equal or past that position,
                // so shorten the number of positions.
                if(numPositions > endClipPos)
                {
                    newCount -= (numPositions - endClipPos);
                }
                if(newCount > 0)
                {
                    updatedCigar.Add(op->operation, newCount);
                    if(!Cigar::isClip(op->operation))
                    {
                        onlyClips = false;
                    }
                }
            }
        }
    
        // Add operations until the point of the end clip is reached.
        // For example...
        //   2M1D3M = MMDMMM  readLength = 5
        // readIndex: 01 234
        //   at cigarOpIndex 0 (2M), numPositions = 2.
        //   at cigarOpIndex 1 (1D), numPositions = 2.
        //   at cigarOpIndex 2 (3M), numPositions = 5.
        // if endClipPos = 2, we still want to consume the 1D, so
        // need to keep looping until numPositions > endClipPos
        while((origCigarOpIndex < oldCigar.size()) &&
              (numPositions <= endClipPos))
        {
            op = &(oldCigar.getOperator(origCigarOpIndex));
            
            // Update the numPositions count if the operations indicates
            // bases within the read.
            if(!Cigar::foundInQuery(op->operation))
            {
                // This operation is not in the query read sequence,
                // so it is not yet to the endClipPos, just add the
                // operation do not increment the number of positions.
                updatedCigar += *op;
                if(!Cigar::isClip(op->operation))
                {
                    onlyClips = false;
                }
            }
            else
            {
                // This operation appears in the query sequence, so
                // check to see if the clip occurs in this operation.
                
                // endClipPos is 0 based & numPositions is a count.
                // If endClipPos is 4, then it is the 5th position.
                // If 4 positions are covered so far (numPositions = 4), 
                // then we are right at endCLipPos: 4-4 = 0, none of 
                // this operation should be kept. 
                // If only 3 positions were covered, then we are at offset
                // 3, so offset 3 should be added: 4-3 = 1.
                uint32_t numPosTilClip = endClipPos - numPositions;
                
                if(numPosTilClip < op->count)
                {
                    // this operation is partially clipped, write the part
                    // that was not clipped if it is not all clipped.
                    if(numPosTilClip != 0)
                    {
                        updatedCigar.Add(op->operation,
                                     numPosTilClip);
                        if(!Cigar::isClip(op->operation))
                        {
                            onlyClips = false;
                        }
                    }
                }
                else
                {
                    // This operation is not clipped, so add it
                    updatedCigar += *op;
                    if(!Cigar::isClip(op->operation))
                    {
                        onlyClips = false;
                    }
                }
                // This operation occurs in the query sequence, so 
                // increment the number of positions covered.
                numPositions += op->count;
            }

            // Move to the next cigar position.
            ++origCigarOpIndex;
        }
            
        //////////////////
        // Add the softclip to the back.
        if(numBackClips != 0)
        {
            // Add the softclip to the end
            updatedCigar.Add(Cigar::softClip, numBackClips);
        }
        
        //////////////////
        // Add any hardclips remaining in the original cigar to the back.
        while(origCigarOpIndex < oldCigar.size())
        {
            op = &(oldCigar.getOperator(origCigarOpIndex));
            if(op->operation == Cigar::hardClip)
            {
                // Keep this operation as the new clips do not
                // affect other clips.
                updatedCigar += *op;
            }
            ++origCigarOpIndex;
        }
        
        // Check to see if the new cigar is only clips.
        if(onlyClips)
        {
            // Only clips in the new cigar, so mark the read as filtered
            // instead of updating the cigar.
            /////////////////////////////
            // The entire read was clipped.
            status = FILTERED;
        }
        else
        {
            // Part of the read was clipped.
            // Update the starting position if a clip was added to
            // the front.
            if(numFrontClips > 0)
            {
                // Convert from query index to reference position (from the
                // old cigar)
                // Get the position for the last front clipped position by
                // getting the position associated with the clipped base on
                // the reference.  Then add one to get to the first
                // non-clipped position.
                int32_t lastFrontClipPos = numFrontClips - 1;
                int32_t newStartPos = oldCigar.getRefPosition(lastFrontClipPos, 
                                                              startPos);
                if(newStartPos != Cigar::INDEX_NA)
                {
                    // Add one to get first non-clipped position.
                    startPos = newStartPos + 1;
                }
            }
        }
    }
    return(status);
}
uint32_t SamFilter::sumMismatchQuality ( SamRecord record,
GenomeSequence refSequence,
uint8_t  defaultQualityInt 
) [static]

Get the sum of the qualities of all mismatches in the record.

Parameters:
recordrecord on which to calculate the sum the mismatch qualities
refSequencereference to use to check for mismatches.
defaultQualityIntdefault value to use for the quality if no quality was specified in the read.
Returns:
sum of the qualities of mismatches

Definition at line 451 of file SamFilter.cpp.

References SamQuerySeqWithRefIter::getNextMatchMismatch(), BaseUtilities::getPhredBaseQuality(), SamRecord::getQuality(), SamSingleBaseMatchInfo::getQueryIndex(), SamSingleBaseMatchInfo::getType(), and BaseUtilities::UNKNOWN_QUALITY_INT.

Referenced by filterOnMismatchQuality().

{
    // Track the mismatch info.
    int mismatchQual = 0;
    int numMismatch = 0;

    SamQuerySeqWithRefIter sequenceIter(record, refSequence);

    SamSingleBaseMatchInfo baseMatchInfo;
    while(sequenceIter.getNextMatchMismatch(baseMatchInfo))
    {
        if(baseMatchInfo.getType() == SamSingleBaseMatchInfo::MISMATCH)
        {
            // Got a mismatch, get the associated quality.
            char readQualityChar = 
                record.getQuality(baseMatchInfo.getQueryIndex());
            uint8_t readQualityInt = 
                BaseUtilities::getPhredBaseQuality(readQualityChar);
            
            if(readQualityInt == BaseUtilities::UNKNOWN_QUALITY_INT)
            {
                // Quality was not specified, so use the configured setting.
                readQualityInt = defaultQualityInt;
            }
            mismatchQual += readQualityInt;
            ++numMismatch;
        }
    }

    return(mismatchQual);
}

The documentation for this class was generated from the following files:
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends