Class for helping to filter a SAM/BAM record. More...
#include <SamFilter.h>
Public Types | |
enum | FilterStatus { NONE, CLIPPED, FILTERED } |
Enum describing what sort of filtering was done. More... | |
Static Public Member Functions | |
static FilterStatus | clipOnMismatchThreshold (SamRecord &record, GenomeSequence &refSequence, double mismatchThreshold) |
Clip the read based on the specified mismatch threshold. | |
static FilterStatus | softClip (SamRecord &record, int32_t numFrontClips, int32_t numBackClips) |
Soft clip the record from the front and/or the back. | |
static FilterStatus | softClip (Cigar &oldCigar, int32_t numFrontClips, int32_t numBackClips, int32_t &startPos, CigarRoller &updatedCigar) |
Soft clip the cigar from the front and/or the back, writing the value into the new cigar, updatedCigar & startPos are only updated if the return FilterStatus is CLIPPED. | |
static FilterStatus | filterOnMismatchQuality (SamRecord &record, GenomeSequence &refSequence, uint32_t qualityThreshold, uint8_t defaultQualityInt) |
Filter the read based on the specified quality threshold. | |
static uint32_t | sumMismatchQuality (SamRecord &record, GenomeSequence &refSequence, uint8_t defaultQualityInt) |
Get the sum of the qualities of all mismatches in the record. | |
static void | filterRead (SamRecord &record) |
Filter the read by marking it as unmapped. |
Class for helping to filter a SAM/BAM record.
Definition at line 25 of file SamFilter.h.
Enum describing what sort of filtering was done.
NONE |
The filter did not affect the read. |
CLIPPED |
Filtering clipped the read. |
FILTERED |
Filtering caused the read to be modified to unmapped. |
Definition at line 29 of file SamFilter.h.
SamFilter::FilterStatus SamFilter::clipOnMismatchThreshold | ( | SamRecord & | record, | |
GenomeSequence & | refSequence, | |||
double | mismatchThreshold | |||
) | [static] |
Clip the read based on the specified mismatch threshold.
Definition at line 27 of file SamFilter.cpp.
References SamQuerySeqWithRefIter::getNextMatchMismatch(), SamSingleBaseMatchInfo::getQueryIndex(), SamRecord::getReadLength(), SamSingleBaseMatchInfo::getType(), and softClip().
00030 { 00031 // Read & clip from the left & right. 00032 SamQuerySeqWithRefIter iterFromFront(record, refSequence, true); 00033 SamQuerySeqWithRefIter iterFromBack(record, refSequence, false); 00034 00035 SamSingleBaseMatchInfo baseMatchInfo; 00036 00037 int32_t readLength = record.getReadLength(); 00038 // Init last front clip to be prior to the lastFront index (0). 00039 const int32_t initialLastFrontClipPos = -1; 00040 int32_t lastFrontClipPos = initialLastFrontClipPos; 00041 // Init first back clip to be past the last index (readLength). 00042 int32_t firstBackClipPos = readLength; 00043 00044 bool fromFrontComplete = false; 00045 bool fromBackComplete = false; 00046 int32_t numBasesFromFront = 0; 00047 int32_t numBasesFromBack = 0; 00048 int32_t numMismatchFromFront = 0; 00049 int32_t numMismatchFromBack = 0; 00050 00051 ////////////////////////////////////////////////////////// 00052 // Determining the clip positions. 00053 while(!fromFrontComplete || !fromBackComplete) 00054 { 00055 // Read from the front (left to right) of the read until 00056 // more have been read from that direction than the opposite direction. 00057 while(!fromFrontComplete && 00058 ((numBasesFromFront <= numBasesFromBack) || 00059 (fromBackComplete))) 00060 { 00061 if(iterFromFront.getNextMatchMismatch(baseMatchInfo) == false) 00062 { 00063 // Nothing more to read in this direction. 00064 fromFrontComplete = true; 00065 break; 00066 } 00067 // Got a read. Check to see if it is to or past the last clip. 00068 if(baseMatchInfo.getQueryIndex() >= firstBackClipPos) 00069 { 00070 // This base is past where we are clipping, so we 00071 // are done reading in this direction. 00072 fromFrontComplete = true; 00073 break; 00074 } 00075 // This is an actual base read from the left to the 00076 // right, so up the counter and determine if it was a mismatch. 00077 ++numBasesFromFront; 00078 00079 if(baseMatchInfo.getType() == SamSingleBaseMatchInfo::MISMATCH) 00080 { 00081 // Mismatch 00082 ++numMismatchFromFront; 00083 // Check to see if it is over the threshold. 00084 double mismatchPercent = 00085 (double)numMismatchFromFront / numBasesFromFront; 00086 if(mismatchPercent > mismatchThreshold) 00087 { 00088 // Need to clip. 00089 lastFrontClipPos = baseMatchInfo.getQueryIndex(); 00090 // Reset the counters. 00091 numBasesFromFront = 0; 00092 numMismatchFromFront = 0; 00093 } 00094 } 00095 } 00096 00097 // Now, read from right to left until more have been read 00098 // from the back than from the front. 00099 while(!fromBackComplete && 00100 ((numBasesFromBack <= numBasesFromFront) || 00101 (fromFrontComplete))) 00102 { 00103 if(iterFromBack.getNextMatchMismatch(baseMatchInfo) == false) 00104 { 00105 // Nothing more to read in this direction. 00106 fromBackComplete = true; 00107 break; 00108 } 00109 // Got a read. Check to see if it is to or past the first clip. 00110 if(baseMatchInfo.getQueryIndex() <= lastFrontClipPos) 00111 { 00112 // This base is past where we are clipping, so we 00113 // are done reading in this direction. 00114 fromBackComplete = true; 00115 break; 00116 } 00117 // This is an actual base read from the right to the 00118 // left, so up the counter and determine if it was a mismatch. 00119 ++numBasesFromBack; 00120 00121 if(baseMatchInfo.getType() == SamSingleBaseMatchInfo::MISMATCH) 00122 { 00123 // Mismatch 00124 ++numMismatchFromBack; 00125 // Check to see if it is over the threshold. 00126 double mismatchPercent = 00127 (double)numMismatchFromBack / numBasesFromBack; 00128 if(mismatchPercent > mismatchThreshold) 00129 { 00130 // Need to clip. 00131 firstBackClipPos = baseMatchInfo.getQueryIndex(); 00132 // Reset the counters. 00133 numBasesFromBack = 0; 00134 numMismatchFromBack = 0; 00135 } 00136 } 00137 } 00138 } 00139 00140 ////////////////////////////////////////////////////////// 00141 // Done determining the clip positions, so clip. 00142 // To determine the number of clips from the front, add 1 to the 00143 // lastFrontClipPos since the index starts at 0. 00144 // To determine the number of clips from the back, subtract the 00145 // firstBackClipPos from the readLength. 00146 // Example: 00147 // Pos: 012345 00148 // Read: AAAAAA 00149 // Read Length = 6. If lastFrontClipPos = 2 and firstBackClipPos = 4, numFrontClips = 3 & numBack = 2. 00150 return(softClip(record, lastFrontClipPos + 1, readLength - firstBackClipPos)); 00151 }
SamFilter::FilterStatus SamFilter::filterOnMismatchQuality | ( | SamRecord & | record, | |
GenomeSequence & | refSequence, | |||
uint32_t | qualityThreshold, | |||
uint8_t | defaultQualityInt | |||
) | [static] |
Filter the read based on the specified quality threshold.
Definition at line 426 of file SamFilter.cpp.
References FILTERED, filterRead(), NONE, and sumMismatchQuality().
00430 { 00431 uint32_t totalMismatchQuality = 00432 sumMismatchQuality(record, refSequence, defaultQualityInt); 00433 00434 // If the total mismatch quality is over the threshold, 00435 // filter the read. 00436 if(totalMismatchQuality > qualityThreshold) 00437 { 00438 filterRead(record); 00439 return(FILTERED); 00440 } 00441 return(NONE); 00442 }
SamFilter::FilterStatus SamFilter::softClip | ( | Cigar & | oldCigar, | |
int32_t | numFrontClips, | |||
int32_t | numBackClips, | |||
int32_t & | startPos, | |||
CigarRoller & | updatedCigar | |||
) | [static] |
Soft clip the cigar from the front and/or the back, writing the value into the new cigar, updatedCigar & startPos are only updated if the return FilterStatus is CLIPPED.
oldCigar | cigar prior to clipping | |
numFrontClips | number of bases that should be clipped from the front of the sequence read. (total count, including any that are already clipped.) | |
numBackClips | number of bases that should be clipped from the back of the sequence read. (total count, including any that are already clipped.) | |
startPos | 0-based start position associated with the cigar prior to updating (input) and set to the 0-based start position after updating (output) the cigar if it was CLIPPED. | |
updatedCigar | set to the clipped cigar if CLIPPED (output param). |
Definition at line 191 of file SamFilter.cpp.
References CigarRoller::Add(), CLIPPED, Cigar::del, FILTERED, Cigar::foundInQuery(), Cigar::getExpectedQueryBaseCount(), Cigar::getOperator(), Cigar::getRefPosition(), Cigar::hardClip, Cigar::insert, Cigar::isClip(), Cigar::match, Cigar::mismatch, Cigar::none, NONE, Cigar::size(), Cigar::skip, and Cigar::softClip.
00196 { 00197 int32_t readLength = oldCigar.getExpectedQueryBaseCount(); 00198 int32_t endClipPos = readLength - numBackClips; 00199 FilterStatus status = NONE; 00200 00201 if((numFrontClips != 0) || (numBackClips != 0)) 00202 { 00203 // Clipping from front and/or from the back. 00204 00205 // Check to see if the entire read was clipped. 00206 int32_t totalClips = numFrontClips + numBackClips; 00207 if(totalClips >= readLength) 00208 { 00209 ///////////////////////////// 00210 // The entire read is clipped, so rather than clipping it, 00211 // filter it out. 00212 return(FILTERED); 00213 } 00214 00215 // Part of the read was clipped. 00216 status = CLIPPED; 00217 00218 // Loop through, creating an updated cigar. 00219 int origCigarOpIndex = 0; 00220 00221 // Track how many read positions are covered up to this 00222 // point by the cigar to determine up to up to what 00223 // point in the cigar is affected by this clipping. 00224 int32_t numPositions = 0; 00225 00226 // Track if any non-clips are in the new cigar. 00227 bool onlyClips = true; 00228 00229 const Cigar::CigarOperator* op = NULL; 00230 00231 ////////////////// 00232 // Clip from front 00233 while((origCigarOpIndex < oldCigar.size()) && 00234 (numPositions < numFrontClips)) 00235 { 00236 op = &(oldCigar.getOperator(origCigarOpIndex)); 00237 switch(op->operation) 00238 { 00239 case Cigar::hardClip: 00240 // Keep this operation as the new clips do not 00241 // affect other clips. 00242 updatedCigar += *op; 00243 break; 00244 case Cigar::del: 00245 case Cigar::skip: 00246 // Skip and delete are going to be dropped, and 00247 // are not in the read, so the read index doesn't 00248 // need to be updated 00249 break; 00250 case Cigar::insert: 00251 case Cigar::match: 00252 case Cigar::mismatch: 00253 case Cigar::softClip: 00254 // Update the read index as these types 00255 // are found in the read. 00256 numPositions += op->count; 00257 break; 00258 case Cigar::none: 00259 default: 00260 // Nothing to do for none. 00261 break; 00262 }; 00263 ++origCigarOpIndex; 00264 } 00265 00266 // If bases were clipped from the front, add the clip and 00267 // any partial cigar operation as necessary. 00268 if(numFrontClips != 0) 00269 { 00270 // Add the softclip to the front of the read. 00271 updatedCigar.Add(Cigar::softClip, numFrontClips); 00272 00273 // Add the rest of the last Cigar operation if 00274 // it is not entirely clipped. 00275 int32_t newCount = numPositions - numFrontClips; 00276 if(newCount > 0) 00277 { 00278 // Before adding it, check to see if the same 00279 // operation is clipped from the end. 00280 // numPositions greater than the endClipPos 00281 // means that it is equal or past that position, 00282 // so shorten the number of positions. 00283 if(numPositions > endClipPos) 00284 { 00285 newCount -= (numPositions - endClipPos); 00286 } 00287 if(newCount > 0) 00288 { 00289 updatedCigar.Add(op->operation, newCount); 00290 if(!Cigar::isClip(op->operation)) 00291 { 00292 onlyClips = false; 00293 } 00294 } 00295 } 00296 } 00297 00298 // Add operations until the point of the end clip is reached. 00299 // For example... 00300 // 2M1D3M = MMDMMM readLength = 5 00301 // readIndex: 01 234 00302 // at cigarOpIndex 0 (2M), numPositions = 2. 00303 // at cigarOpIndex 1 (1D), numPositions = 2. 00304 // at cigarOpIndex 2 (3M), numPositions = 5. 00305 // if endClipPos = 2, we still want to consume the 1D, so 00306 // need to keep looping until numPositions > endClipPos 00307 while((origCigarOpIndex < oldCigar.size()) && 00308 (numPositions <= endClipPos)) 00309 { 00310 op = &(oldCigar.getOperator(origCigarOpIndex)); 00311 00312 // Update the numPositions count if the operations indicates 00313 // bases within the read. 00314 if(!Cigar::foundInQuery(op->operation)) 00315 { 00316 // This operation is not in the query read sequence, 00317 // so it is not yet to the endClipPos, just add the 00318 // operation do not increment the number of positions. 00319 updatedCigar += *op; 00320 if(!Cigar::isClip(op->operation)) 00321 { 00322 onlyClips = false; 00323 } 00324 } 00325 else 00326 { 00327 // This operation appears in the query sequence, so 00328 // check to see if the clip occurs in this operation. 00329 00330 // endClipPos is 0 based & numPositions is a count. 00331 // If endClipPos is 4, then it is the 5th position. 00332 // If 4 positions are covered so far (numPositions = 4), 00333 // then we are right at endCLipPos: 4-4 = 0, none of 00334 // this operation should be kept. 00335 // If only 3 positions were covered, then we are at offset 00336 // 3, so offset 3 should be added: 4-3 = 1. 00337 uint32_t numPosTilClip = endClipPos - numPositions; 00338 00339 if(numPosTilClip < op->count) 00340 { 00341 // this operation is partially clipped, write the part 00342 // that was not clipped if it is not all clipped. 00343 if(numPosTilClip != 0) 00344 { 00345 updatedCigar.Add(op->operation, 00346 numPosTilClip); 00347 if(!Cigar::isClip(op->operation)) 00348 { 00349 onlyClips = false; 00350 } 00351 } 00352 } 00353 else 00354 { 00355 // This operation is not clipped, so add it 00356 updatedCigar += *op; 00357 if(!Cigar::isClip(op->operation)) 00358 { 00359 onlyClips = false; 00360 } 00361 } 00362 // This operation occurs in the query sequence, so 00363 // increment the number of positions covered. 00364 numPositions += op->count; 00365 } 00366 00367 // Move to the next cigar position. 00368 ++origCigarOpIndex; 00369 } 00370 00371 ////////////////// 00372 // Add the softclip to the back. 00373 if(numBackClips != 0) 00374 { 00375 // Add the softclip to the end 00376 updatedCigar.Add(Cigar::softClip, numBackClips); 00377 } 00378 00379 ////////////////// 00380 // Add any hardclips remaining in the original cigar to the back. 00381 while(origCigarOpIndex < oldCigar.size()) 00382 { 00383 op = &(oldCigar.getOperator(origCigarOpIndex)); 00384 if(op->operation == Cigar::hardClip) 00385 { 00386 // Keep this operation as the new clips do not 00387 // affect other clips. 00388 updatedCigar += *op; 00389 } 00390 ++origCigarOpIndex; 00391 } 00392 00393 // Check to see if the new cigar is only clips. 00394 if(onlyClips) 00395 { 00396 // Only clips in the new cigar, so mark the read as filtered 00397 // instead of updating the cigar. 00398 ///////////////////////////// 00399 // The entire read was clipped. 00400 status = FILTERED; 00401 } 00402 else 00403 { 00404 // Part of the read was clipped. 00405 // Update the starting position if a clip was added to 00406 // the front. 00407 if(numFrontClips > 0) 00408 { 00409 // Convert from query index to reference position (from the 00410 // old cigar) 00411 // Get the position for the start clipped position by 00412 // getting the position associated with the clipped base on 00413 // the reference. Then add one to get to the first 00414 // non-clipped position. 00415 int32_t lastFrontClipPos = numFrontClips - 1; 00416 startPos = 00417 oldCigar.getRefPosition(lastFrontClipPos, 00418 startPos) + 1; 00419 } 00420 } 00421 } 00422 return(status); 00423 }
SamFilter::FilterStatus SamFilter::softClip | ( | SamRecord & | record, | |
int32_t | numFrontClips, | |||
int32_t | numBackClips | |||
) | [static] |
Soft clip the record from the front and/or the back.
record | record to be clipped (input/output parameter). | |
numFrontClips | number of bases that should be clipped from the front of the sequence read. (total count, including any that are already clipped.) | |
backClipPos | number of bases that should be clipped from the back of the sequence read. (total count, including any that are already clipped.) |
Definition at line 155 of file SamFilter.cpp.
References CLIPPED, FILTERED, filterRead(), SamRecord::get0BasedPosition(), SamRecord::getCigarInfo(), NONE, SamRecord::set0BasedPosition(), and SamRecord::setCigar().
Referenced by clipOnMismatchThreshold().
00158 { 00159 ////////////////////////////////////////////////////////// 00160 Cigar* cigar = record.getCigarInfo(); 00161 FilterStatus status = NONE; 00162 int32_t startPos = record.get0BasedPosition(); 00163 CigarRoller updatedCigar; 00164 00165 status = softClip(*cigar, numFrontClips, numBackClips, 00166 startPos, updatedCigar); 00167 00168 if(status == FILTERED) 00169 { 00170 ///////////////////////////// 00171 // The entire read is clipped, so rather than clipping it, 00172 // filter it out. 00173 filterRead(record); 00174 return(FILTERED); 00175 } 00176 else if(status == CLIPPED) 00177 { 00178 // Part of the read was clipped, and now that we have 00179 // an updated cigar, update the read. 00180 record.setCigar(updatedCigar); 00181 00182 // Update the starting position. 00183 record.set0BasedPosition(startPos); 00184 } 00185 return(status); 00186 }
uint32_t SamFilter::sumMismatchQuality | ( | SamRecord & | record, | |
GenomeSequence & | refSequence, | |||
uint8_t | defaultQualityInt | |||
) | [static] |
Get the sum of the qualities of all mismatches in the record.
record | record on which to calculate the sum the mismatch qualities | |
refSequence | reference to use to check for mismatches. | |
defaultQualityInt | default value to use for the quality if no quality was specified in the read. |
Definition at line 447 of file SamFilter.cpp.
References SamQuerySeqWithRefIter::getNextMatchMismatch(), BaseUtilities::getPhredBaseQuality(), SamRecord::getQuality(), SamSingleBaseMatchInfo::getQueryIndex(), SamSingleBaseMatchInfo::getType(), and BaseUtilities::UNKNOWN_QUALITY_INT.
Referenced by filterOnMismatchQuality().
00450 { 00451 // Track the mismatch info. 00452 int mismatchQual = 0; 00453 int numMismatch = 0; 00454 00455 SamQuerySeqWithRefIter sequenceIter(record, refSequence); 00456 00457 SamSingleBaseMatchInfo baseMatchInfo; 00458 while(sequenceIter.getNextMatchMismatch(baseMatchInfo)) 00459 { 00460 if(baseMatchInfo.getType() == SamSingleBaseMatchInfo::MISMATCH) 00461 { 00462 // Got a mismatch, get the associated quality. 00463 char readQualityChar = 00464 record.getQuality(baseMatchInfo.getQueryIndex()); 00465 uint8_t readQualityInt = 00466 BaseUtilities::getPhredBaseQuality(readQualityChar); 00467 00468 if(readQualityInt == BaseUtilities::UNKNOWN_QUALITY_INT) 00469 { 00470 // Quality was not specified, so use the configured setting. 00471 readQualityInt = defaultQualityInt; 00472 } 00473 mismatchQual += readQualityInt; 00474 ++numMismatch; 00475 } 00476 } 00477 00478 return(mismatchQual); 00479 }