libStatGen Software
1
|
Class for helping to filter a SAM/BAM record. More...
#include <SamFilter.h>
Public Types | |
enum | FilterStatus { NONE, CLIPPED, FILTERED } |
Enum describing what sort of filtering was done. More... | |
Static Public Member Functions | |
static FilterStatus | clipOnMismatchThreshold (SamRecord &record, GenomeSequence &refSequence, double mismatchThreshold) |
Clip the read based on the specified mismatch threshold. | |
static FilterStatus | softClip (SamRecord &record, int32_t numFrontClips, int32_t numBackClips) |
Soft clip the record from the front and/or the back. | |
static FilterStatus | softClip (Cigar &oldCigar, int32_t numFrontClips, int32_t numBackClips, int32_t &startPos, CigarRoller &updatedCigar) |
Soft clip the cigar from the front and/or the back, writing the value into the new cigar, updatedCigar & startPos are only updated if the return FilterStatus is CLIPPED. | |
static FilterStatus | filterOnMismatchQuality (SamRecord &record, GenomeSequence &refSequence, uint32_t qualityThreshold, uint8_t defaultQualityInt) |
Filter the read based on the specified quality threshold. | |
static uint32_t | sumMismatchQuality (SamRecord &record, GenomeSequence &refSequence, uint8_t defaultQualityInt) |
Get the sum of the qualities of all mismatches in the record. | |
static void | filterRead (SamRecord &record) |
Filter the read by marking it as unmapped. |
Class for helping to filter a SAM/BAM record.
Definition at line 25 of file SamFilter.h.
Enum describing what sort of filtering was done.
NONE |
The filter did not affect the read. |
CLIPPED |
Filtering clipped the read. |
FILTERED |
Filtering caused the read to be modified to unmapped. |
Definition at line 29 of file SamFilter.h.
SamFilter::FilterStatus SamFilter::clipOnMismatchThreshold | ( | SamRecord & | record, |
GenomeSequence & | refSequence, | ||
double | mismatchThreshold | ||
) | [static] |
Clip the read based on the specified mismatch threshold.
Definition at line 27 of file SamFilter.cpp.
References SamQuerySeqWithRefIter::getNextMatchMismatch(), SamSingleBaseMatchInfo::getQueryIndex(), SamRecord::getReadLength(), SamSingleBaseMatchInfo::getType(), and softClip().
{ // Read & clip from the left & right. SamQuerySeqWithRefIter iterFromFront(record, refSequence, true); SamQuerySeqWithRefIter iterFromBack(record, refSequence, false); SamSingleBaseMatchInfo baseMatchInfo; int32_t readLength = record.getReadLength(); // Init last front clip to be prior to the lastFront index (0). const int32_t initialLastFrontClipPos = -1; int32_t lastFrontClipPos = initialLastFrontClipPos; // Init first back clip to be past the last index (readLength). int32_t firstBackClipPos = readLength; bool fromFrontComplete = false; bool fromBackComplete = false; int32_t numBasesFromFront = 0; int32_t numBasesFromBack = 0; int32_t numMismatchFromFront = 0; int32_t numMismatchFromBack = 0; ////////////////////////////////////////////////////////// // Determining the clip positions. while(!fromFrontComplete || !fromBackComplete) { // Read from the front (left to right) of the read until // more have been read from that direction than the opposite direction. while(!fromFrontComplete && ((numBasesFromFront <= numBasesFromBack) || (fromBackComplete))) { if(iterFromFront.getNextMatchMismatch(baseMatchInfo) == false) { // Nothing more to read in this direction. fromFrontComplete = true; break; } // Got a read. Check to see if it is to or past the last clip. if(baseMatchInfo.getQueryIndex() >= firstBackClipPos) { // This base is past where we are clipping, so we // are done reading in this direction. fromFrontComplete = true; break; } // This is an actual base read from the left to the // right, so up the counter and determine if it was a mismatch. ++numBasesFromFront; if(baseMatchInfo.getType() == SamSingleBaseMatchInfo::MISMATCH) { // Mismatch ++numMismatchFromFront; // Check to see if it is over the threshold. double mismatchPercent = (double)numMismatchFromFront / numBasesFromFront; if(mismatchPercent > mismatchThreshold) { // Need to clip. lastFrontClipPos = baseMatchInfo.getQueryIndex(); // Reset the counters. numBasesFromFront = 0; numMismatchFromFront = 0; } } } // Now, read from right to left until more have been read // from the back than from the front. while(!fromBackComplete && ((numBasesFromBack <= numBasesFromFront) || (fromFrontComplete))) { if(iterFromBack.getNextMatchMismatch(baseMatchInfo) == false) { // Nothing more to read in this direction. fromBackComplete = true; break; } // Got a read. Check to see if it is to or past the first clip. if(baseMatchInfo.getQueryIndex() <= lastFrontClipPos) { // This base is past where we are clipping, so we // are done reading in this direction. fromBackComplete = true; break; } // This is an actual base read from the right to the // left, so up the counter and determine if it was a mismatch. ++numBasesFromBack; if(baseMatchInfo.getType() == SamSingleBaseMatchInfo::MISMATCH) { // Mismatch ++numMismatchFromBack; // Check to see if it is over the threshold. double mismatchPercent = (double)numMismatchFromBack / numBasesFromBack; if(mismatchPercent > mismatchThreshold) { // Need to clip. firstBackClipPos = baseMatchInfo.getQueryIndex(); // Reset the counters. numBasesFromBack = 0; numMismatchFromBack = 0; } } } } ////////////////////////////////////////////////////////// // Done determining the clip positions, so clip. // To determine the number of clips from the front, add 1 to the // lastFrontClipPos since the index starts at 0. // To determine the number of clips from the back, subtract the // firstBackClipPos from the readLength. // Example: // Pos: 012345 // Read: AAAAAA // Read Length = 6. If lastFrontClipPos = 2 and firstBackClipPos = 4, numFrontClips = 3 & numBack = 2. return(softClip(record, lastFrontClipPos + 1, readLength - firstBackClipPos)); }
SamFilter::FilterStatus SamFilter::filterOnMismatchQuality | ( | SamRecord & | record, |
GenomeSequence & | refSequence, | ||
uint32_t | qualityThreshold, | ||
uint8_t | defaultQualityInt | ||
) | [static] |
Filter the read based on the specified quality threshold.
Definition at line 430 of file SamFilter.cpp.
References FILTERED, filterRead(), NONE, and sumMismatchQuality().
{ uint32_t totalMismatchQuality = sumMismatchQuality(record, refSequence, defaultQualityInt); // If the total mismatch quality is over the threshold, // filter the read. if(totalMismatchQuality > qualityThreshold) { filterRead(record); return(FILTERED); } return(NONE); }
SamFilter::FilterStatus SamFilter::softClip | ( | SamRecord & | record, |
int32_t | numFrontClips, | ||
int32_t | numBackClips | ||
) | [static] |
Soft clip the record from the front and/or the back.
record | record to be clipped (input/output parameter). |
numFrontClips | number of bases that should be clipped from the front of the sequence read. (total count, including any that are already clipped.) |
backClipPos | number of bases that should be clipped from the back of the sequence read. (total count, including any that are already clipped.) |
Definition at line 155 of file SamFilter.cpp.
References CLIPPED, FILTERED, filterRead(), SamRecord::get0BasedPosition(), SamRecord::getCigarInfo(), NONE, SamRecord::set0BasedPosition(), and SamRecord::setCigar().
Referenced by clipOnMismatchThreshold().
{ ////////////////////////////////////////////////////////// Cigar* cigar = record.getCigarInfo(); FilterStatus status = NONE; int32_t startPos = record.get0BasedPosition(); CigarRoller updatedCigar; status = softClip(*cigar, numFrontClips, numBackClips, startPos, updatedCigar); if(status == FILTERED) { ///////////////////////////// // The entire read is clipped, so rather than clipping it, // filter it out. filterRead(record); return(FILTERED); } else if(status == CLIPPED) { // Part of the read was clipped, and now that we have // an updated cigar, update the read. record.setCigar(updatedCigar); // Update the starting position. record.set0BasedPosition(startPos); } return(status); }
SamFilter::FilterStatus SamFilter::softClip | ( | Cigar & | oldCigar, |
int32_t | numFrontClips, | ||
int32_t | numBackClips, | ||
int32_t & | startPos, | ||
CigarRoller & | updatedCigar | ||
) | [static] |
Soft clip the cigar from the front and/or the back, writing the value into the new cigar, updatedCigar & startPos are only updated if the return FilterStatus is CLIPPED.
oldCigar | cigar prior to clipping |
numFrontClips | number of bases that should be clipped from the front of the sequence read. (total count, including any that are already clipped.) |
numBackClips | number of bases that should be clipped from the back of the sequence read. (total count, including any that are already clipped.) |
startPos | 0-based start position associated with the cigar prior to updating (input) and set to the 0-based start position after updating (output) the cigar if it was CLIPPED. |
updatedCigar | set to the clipped cigar if CLIPPED (output param). |
Definition at line 191 of file SamFilter.cpp.
References CigarRoller::Add(), CLIPPED, Cigar::del, FILTERED, Cigar::foundInQuery(), Cigar::getExpectedQueryBaseCount(), Cigar::getOperator(), Cigar::getRefPosition(), Cigar::hardClip, Cigar::INDEX_NA, Cigar::insert, Cigar::isClip(), Cigar::match, Cigar::mismatch, NONE, Cigar::none, Cigar::size(), Cigar::skip, and Cigar::softClip.
{ int32_t readLength = oldCigar.getExpectedQueryBaseCount(); int32_t endClipPos = readLength - numBackClips; FilterStatus status = NONE; if((numFrontClips != 0) || (numBackClips != 0)) { // Clipping from front and/or from the back. // Check to see if the entire read was clipped. int32_t totalClips = numFrontClips + numBackClips; if(totalClips >= readLength) { ///////////////////////////// // The entire read is clipped, so rather than clipping it, // filter it out. return(FILTERED); } // Part of the read was clipped. status = CLIPPED; // Loop through, creating an updated cigar. int origCigarOpIndex = 0; // Track how many read positions are covered up to this // point by the cigar to determine up to up to what // point in the cigar is affected by this clipping. int32_t numPositions = 0; // Track if any non-clips are in the new cigar. bool onlyClips = true; const Cigar::CigarOperator* op = NULL; ////////////////// // Clip from front while((origCigarOpIndex < oldCigar.size()) && (numPositions < numFrontClips)) { op = &(oldCigar.getOperator(origCigarOpIndex)); switch(op->operation) { case Cigar::hardClip: // Keep this operation as the new clips do not // affect other clips. updatedCigar += *op; break; case Cigar::del: case Cigar::skip: // Skip and delete are going to be dropped, and // are not in the read, so the read index doesn't // need to be updated break; case Cigar::insert: case Cigar::match: case Cigar::mismatch: case Cigar::softClip: // Update the read index as these types // are found in the read. numPositions += op->count; break; case Cigar::none: default: // Nothing to do for none. break; }; ++origCigarOpIndex; } // If bases were clipped from the front, add the clip and // any partial cigar operation as necessary. if(numFrontClips != 0) { // Add the softclip to the front of the read. updatedCigar.Add(Cigar::softClip, numFrontClips); // Add the rest of the last Cigar operation if // it is not entirely clipped. int32_t newCount = numPositions - numFrontClips; if(newCount > 0) { // Before adding it, check to see if the same // operation is clipped from the end. // numPositions greater than the endClipPos // means that it is equal or past that position, // so shorten the number of positions. if(numPositions > endClipPos) { newCount -= (numPositions - endClipPos); } if(newCount > 0) { updatedCigar.Add(op->operation, newCount); if(!Cigar::isClip(op->operation)) { onlyClips = false; } } } } // Add operations until the point of the end clip is reached. // For example... // 2M1D3M = MMDMMM readLength = 5 // readIndex: 01 234 // at cigarOpIndex 0 (2M), numPositions = 2. // at cigarOpIndex 1 (1D), numPositions = 2. // at cigarOpIndex 2 (3M), numPositions = 5. // if endClipPos = 2, we still want to consume the 1D, so // need to keep looping until numPositions > endClipPos while((origCigarOpIndex < oldCigar.size()) && (numPositions <= endClipPos)) { op = &(oldCigar.getOperator(origCigarOpIndex)); // Update the numPositions count if the operations indicates // bases within the read. if(!Cigar::foundInQuery(op->operation)) { // This operation is not in the query read sequence, // so it is not yet to the endClipPos, just add the // operation do not increment the number of positions. updatedCigar += *op; if(!Cigar::isClip(op->operation)) { onlyClips = false; } } else { // This operation appears in the query sequence, so // check to see if the clip occurs in this operation. // endClipPos is 0 based & numPositions is a count. // If endClipPos is 4, then it is the 5th position. // If 4 positions are covered so far (numPositions = 4), // then we are right at endCLipPos: 4-4 = 0, none of // this operation should be kept. // If only 3 positions were covered, then we are at offset // 3, so offset 3 should be added: 4-3 = 1. uint32_t numPosTilClip = endClipPos - numPositions; if(numPosTilClip < op->count) { // this operation is partially clipped, write the part // that was not clipped if it is not all clipped. if(numPosTilClip != 0) { updatedCigar.Add(op->operation, numPosTilClip); if(!Cigar::isClip(op->operation)) { onlyClips = false; } } } else { // This operation is not clipped, so add it updatedCigar += *op; if(!Cigar::isClip(op->operation)) { onlyClips = false; } } // This operation occurs in the query sequence, so // increment the number of positions covered. numPositions += op->count; } // Move to the next cigar position. ++origCigarOpIndex; } ////////////////// // Add the softclip to the back. if(numBackClips != 0) { // Add the softclip to the end updatedCigar.Add(Cigar::softClip, numBackClips); } ////////////////// // Add any hardclips remaining in the original cigar to the back. while(origCigarOpIndex < oldCigar.size()) { op = &(oldCigar.getOperator(origCigarOpIndex)); if(op->operation == Cigar::hardClip) { // Keep this operation as the new clips do not // affect other clips. updatedCigar += *op; } ++origCigarOpIndex; } // Check to see if the new cigar is only clips. if(onlyClips) { // Only clips in the new cigar, so mark the read as filtered // instead of updating the cigar. ///////////////////////////// // The entire read was clipped. status = FILTERED; } else { // Part of the read was clipped. // Update the starting position if a clip was added to // the front. if(numFrontClips > 0) { // Convert from query index to reference position (from the // old cigar) // Get the position for the last front clipped position by // getting the position associated with the clipped base on // the reference. Then add one to get to the first // non-clipped position. int32_t lastFrontClipPos = numFrontClips - 1; int32_t newStartPos = oldCigar.getRefPosition(lastFrontClipPos, startPos); if(newStartPos != Cigar::INDEX_NA) { // Add one to get first non-clipped position. startPos = newStartPos + 1; } } } } return(status); }
uint32_t SamFilter::sumMismatchQuality | ( | SamRecord & | record, |
GenomeSequence & | refSequence, | ||
uint8_t | defaultQualityInt | ||
) | [static] |
Get the sum of the qualities of all mismatches in the record.
record | record on which to calculate the sum the mismatch qualities |
refSequence | reference to use to check for mismatches. |
defaultQualityInt | default value to use for the quality if no quality was specified in the read. |
Definition at line 451 of file SamFilter.cpp.
References SamQuerySeqWithRefIter::getNextMatchMismatch(), BaseUtilities::getPhredBaseQuality(), SamRecord::getQuality(), SamSingleBaseMatchInfo::getQueryIndex(), SamSingleBaseMatchInfo::getType(), and BaseUtilities::UNKNOWN_QUALITY_INT.
Referenced by filterOnMismatchQuality().
{ // Track the mismatch info. int mismatchQual = 0; int numMismatch = 0; SamQuerySeqWithRefIter sequenceIter(record, refSequence); SamSingleBaseMatchInfo baseMatchInfo; while(sequenceIter.getNextMatchMismatch(baseMatchInfo)) { if(baseMatchInfo.getType() == SamSingleBaseMatchInfo::MISMATCH) { // Got a mismatch, get the associated quality. char readQualityChar = record.getQuality(baseMatchInfo.getQueryIndex()); uint8_t readQualityInt = BaseUtilities::getPhredBaseQuality(readQualityChar); if(readQualityInt == BaseUtilities::UNKNOWN_QUALITY_INT) { // Quality was not specified, so use the configured setting. readQualityInt = defaultQualityInt; } mismatchQual += readQualityInt; ++numMismatch; } } return(mismatchQual); }