libStatGen Software
1
|
00001 /* 00002 * Copyright (C) 2010 Regents of the University of Michigan 00003 * 00004 * This program is free software: you can redistribute it and/or modify 00005 * it under the terms of the GNU General Public License as published by 00006 * the Free Software Foundation, either version 3 of the License, or 00007 * (at your option) any later version. 00008 * 00009 * This program is distributed in the hope that it will be useful, 00010 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 * GNU General Public License for more details. 00013 * 00014 * You should have received a copy of the GNU General Public License 00015 * along with this program. If not, see <http://www.gnu.org/licenses/>. 00016 */ 00017 00018 ////////////////////////////////////////////////////////////////////////// 00019 00020 00021 #include "SamFilter.h" 00022 00023 #include "SamQuerySeqWithRefHelper.h" 00024 #include "BaseUtilities.h" 00025 #include "SamFlag.h" 00026 00027 SamFilter::FilterStatus SamFilter::clipOnMismatchThreshold(SamRecord& record, 00028 GenomeSequence& refSequence, 00029 double mismatchThreshold) 00030 { 00031 // Read & clip from the left & right. 00032 SamQuerySeqWithRefIter iterFromFront(record, refSequence, true); 00033 SamQuerySeqWithRefIter iterFromBack(record, refSequence, false); 00034 00035 SamSingleBaseMatchInfo baseMatchInfo; 00036 00037 int32_t readLength = record.getReadLength(); 00038 // Init last front clip to be prior to the lastFront index (0). 00039 const int32_t initialLastFrontClipPos = -1; 00040 int32_t lastFrontClipPos = initialLastFrontClipPos; 00041 // Init first back clip to be past the last index (readLength). 00042 int32_t firstBackClipPos = readLength; 00043 00044 bool fromFrontComplete = false; 00045 bool fromBackComplete = false; 00046 int32_t numBasesFromFront = 0; 00047 int32_t numBasesFromBack = 0; 00048 int32_t numMismatchFromFront = 0; 00049 int32_t numMismatchFromBack = 0; 00050 00051 ////////////////////////////////////////////////////////// 00052 // Determining the clip positions. 00053 while(!fromFrontComplete || !fromBackComplete) 00054 { 00055 // Read from the front (left to right) of the read until 00056 // more have been read from that direction than the opposite direction. 00057 while(!fromFrontComplete && 00058 ((numBasesFromFront <= numBasesFromBack) || 00059 (fromBackComplete))) 00060 { 00061 if(iterFromFront.getNextMatchMismatch(baseMatchInfo) == false) 00062 { 00063 // Nothing more to read in this direction. 00064 fromFrontComplete = true; 00065 break; 00066 } 00067 // Got a read. Check to see if it is to or past the last clip. 00068 if(baseMatchInfo.getQueryIndex() >= firstBackClipPos) 00069 { 00070 // This base is past where we are clipping, so we 00071 // are done reading in this direction. 00072 fromFrontComplete = true; 00073 break; 00074 } 00075 // This is an actual base read from the left to the 00076 // right, so up the counter and determine if it was a mismatch. 00077 ++numBasesFromFront; 00078 00079 if(baseMatchInfo.getType() == SamSingleBaseMatchInfo::MISMATCH) 00080 { 00081 // Mismatch 00082 ++numMismatchFromFront; 00083 // Check to see if it is over the threshold. 00084 double mismatchPercent = 00085 (double)numMismatchFromFront / numBasesFromFront; 00086 if(mismatchPercent > mismatchThreshold) 00087 { 00088 // Need to clip. 00089 lastFrontClipPos = baseMatchInfo.getQueryIndex(); 00090 // Reset the counters. 00091 numBasesFromFront = 0; 00092 numMismatchFromFront = 0; 00093 } 00094 } 00095 } 00096 00097 // Now, read from right to left until more have been read 00098 // from the back than from the front. 00099 while(!fromBackComplete && 00100 ((numBasesFromBack <= numBasesFromFront) || 00101 (fromFrontComplete))) 00102 { 00103 if(iterFromBack.getNextMatchMismatch(baseMatchInfo) == false) 00104 { 00105 // Nothing more to read in this direction. 00106 fromBackComplete = true; 00107 break; 00108 } 00109 // Got a read. Check to see if it is to or past the first clip. 00110 if(baseMatchInfo.getQueryIndex() <= lastFrontClipPos) 00111 { 00112 // This base is past where we are clipping, so we 00113 // are done reading in this direction. 00114 fromBackComplete = true; 00115 break; 00116 } 00117 // This is an actual base read from the right to the 00118 // left, so up the counter and determine if it was a mismatch. 00119 ++numBasesFromBack; 00120 00121 if(baseMatchInfo.getType() == SamSingleBaseMatchInfo::MISMATCH) 00122 { 00123 // Mismatch 00124 ++numMismatchFromBack; 00125 // Check to see if it is over the threshold. 00126 double mismatchPercent = 00127 (double)numMismatchFromBack / numBasesFromBack; 00128 if(mismatchPercent > mismatchThreshold) 00129 { 00130 // Need to clip. 00131 firstBackClipPos = baseMatchInfo.getQueryIndex(); 00132 // Reset the counters. 00133 numBasesFromBack = 0; 00134 numMismatchFromBack = 0; 00135 } 00136 } 00137 } 00138 } 00139 00140 ////////////////////////////////////////////////////////// 00141 // Done determining the clip positions, so clip. 00142 // To determine the number of clips from the front, add 1 to the 00143 // lastFrontClipPos since the index starts at 0. 00144 // To determine the number of clips from the back, subtract the 00145 // firstBackClipPos from the readLength. 00146 // Example: 00147 // Pos: 012345 00148 // Read: AAAAAA 00149 // Read Length = 6. If lastFrontClipPos = 2 and firstBackClipPos = 4, numFrontClips = 3 & numBack = 2. 00150 return(softClip(record, lastFrontClipPos + 1, readLength - firstBackClipPos)); 00151 } 00152 00153 00154 // Soft clip the record from the front and/or the back. 00155 SamFilter::FilterStatus SamFilter::softClip(SamRecord& record, 00156 int32_t numFrontClips, 00157 int32_t numBackClips) 00158 { 00159 ////////////////////////////////////////////////////////// 00160 Cigar* cigar = record.getCigarInfo(); 00161 FilterStatus status = NONE; 00162 int32_t startPos = record.get0BasedPosition(); 00163 CigarRoller updatedCigar; 00164 00165 status = softClip(*cigar, numFrontClips, numBackClips, 00166 startPos, updatedCigar); 00167 00168 if(status == FILTERED) 00169 { 00170 ///////////////////////////// 00171 // The entire read is clipped, so rather than clipping it, 00172 // filter it out. 00173 filterRead(record); 00174 return(FILTERED); 00175 } 00176 else if(status == CLIPPED) 00177 { 00178 // Part of the read was clipped, and now that we have 00179 // an updated cigar, update the read. 00180 record.setCigar(updatedCigar); 00181 00182 // Update the starting position. 00183 record.set0BasedPosition(startPos); 00184 } 00185 return(status); 00186 } 00187 00188 00189 // Soft clip the cigar from the front and/or the back, writing the value 00190 // into the new cigar. 00191 SamFilter::FilterStatus SamFilter::softClip(Cigar& oldCigar, 00192 int32_t numFrontClips, 00193 int32_t numBackClips, 00194 int32_t& startPos, 00195 CigarRoller& updatedCigar) 00196 { 00197 int32_t readLength = oldCigar.getExpectedQueryBaseCount(); 00198 int32_t endClipPos = readLength - numBackClips; 00199 FilterStatus status = NONE; 00200 00201 if((numFrontClips != 0) || (numBackClips != 0)) 00202 { 00203 // Clipping from front and/or from the back. 00204 00205 // Check to see if the entire read was clipped. 00206 int32_t totalClips = numFrontClips + numBackClips; 00207 if(totalClips >= readLength) 00208 { 00209 ///////////////////////////// 00210 // The entire read is clipped, so rather than clipping it, 00211 // filter it out. 00212 return(FILTERED); 00213 } 00214 00215 // Part of the read was clipped. 00216 status = CLIPPED; 00217 00218 // Loop through, creating an updated cigar. 00219 int origCigarOpIndex = 0; 00220 00221 // Track how many read positions are covered up to this 00222 // point by the cigar to determine up to up to what 00223 // point in the cigar is affected by this clipping. 00224 int32_t numPositions = 0; 00225 00226 // Track if any non-clips are in the new cigar. 00227 bool onlyClips = true; 00228 00229 const Cigar::CigarOperator* op = NULL; 00230 00231 ////////////////// 00232 // Clip from front 00233 while((origCigarOpIndex < oldCigar.size()) && 00234 (numPositions < numFrontClips)) 00235 { 00236 op = &(oldCigar.getOperator(origCigarOpIndex)); 00237 switch(op->operation) 00238 { 00239 case Cigar::hardClip: 00240 // Keep this operation as the new clips do not 00241 // affect other clips. 00242 updatedCigar += *op; 00243 break; 00244 case Cigar::del: 00245 case Cigar::skip: 00246 // Skip and delete are going to be dropped, and 00247 // are not in the read, so the read index doesn't 00248 // need to be updated 00249 break; 00250 case Cigar::insert: 00251 case Cigar::match: 00252 case Cigar::mismatch: 00253 case Cigar::softClip: 00254 // Update the read index as these types 00255 // are found in the read. 00256 numPositions += op->count; 00257 break; 00258 case Cigar::none: 00259 default: 00260 // Nothing to do for none. 00261 break; 00262 }; 00263 ++origCigarOpIndex; 00264 } 00265 00266 // If bases were clipped from the front, add the clip and 00267 // any partial cigar operation as necessary. 00268 if(numFrontClips != 0) 00269 { 00270 // Add the softclip to the front of the read. 00271 updatedCigar.Add(Cigar::softClip, numFrontClips); 00272 00273 // Add the rest of the last Cigar operation if 00274 // it is not entirely clipped. 00275 int32_t newCount = numPositions - numFrontClips; 00276 if(newCount > 0) 00277 { 00278 // Before adding it, check to see if the same 00279 // operation is clipped from the end. 00280 // numPositions greater than the endClipPos 00281 // means that it is equal or past that position, 00282 // so shorten the number of positions. 00283 if(numPositions > endClipPos) 00284 { 00285 newCount -= (numPositions - endClipPos); 00286 } 00287 if(newCount > 0) 00288 { 00289 updatedCigar.Add(op->operation, newCount); 00290 if(!Cigar::isClip(op->operation)) 00291 { 00292 onlyClips = false; 00293 } 00294 } 00295 } 00296 } 00297 00298 // Add operations until the point of the end clip is reached. 00299 // For example... 00300 // 2M1D3M = MMDMMM readLength = 5 00301 // readIndex: 01 234 00302 // at cigarOpIndex 0 (2M), numPositions = 2. 00303 // at cigarOpIndex 1 (1D), numPositions = 2. 00304 // at cigarOpIndex 2 (3M), numPositions = 5. 00305 // if endClipPos = 2, we still want to consume the 1D, so 00306 // need to keep looping until numPositions > endClipPos 00307 while((origCigarOpIndex < oldCigar.size()) && 00308 (numPositions <= endClipPos)) 00309 { 00310 op = &(oldCigar.getOperator(origCigarOpIndex)); 00311 00312 // Update the numPositions count if the operations indicates 00313 // bases within the read. 00314 if(!Cigar::foundInQuery(op->operation)) 00315 { 00316 // This operation is not in the query read sequence, 00317 // so it is not yet to the endClipPos, just add the 00318 // operation do not increment the number of positions. 00319 updatedCigar += *op; 00320 if(!Cigar::isClip(op->operation)) 00321 { 00322 onlyClips = false; 00323 } 00324 } 00325 else 00326 { 00327 // This operation appears in the query sequence, so 00328 // check to see if the clip occurs in this operation. 00329 00330 // endClipPos is 0 based & numPositions is a count. 00331 // If endClipPos is 4, then it is the 5th position. 00332 // If 4 positions are covered so far (numPositions = 4), 00333 // then we are right at endCLipPos: 4-4 = 0, none of 00334 // this operation should be kept. 00335 // If only 3 positions were covered, then we are at offset 00336 // 3, so offset 3 should be added: 4-3 = 1. 00337 uint32_t numPosTilClip = endClipPos - numPositions; 00338 00339 if(numPosTilClip < op->count) 00340 { 00341 // this operation is partially clipped, write the part 00342 // that was not clipped if it is not all clipped. 00343 if(numPosTilClip != 0) 00344 { 00345 updatedCigar.Add(op->operation, 00346 numPosTilClip); 00347 if(!Cigar::isClip(op->operation)) 00348 { 00349 onlyClips = false; 00350 } 00351 } 00352 } 00353 else 00354 { 00355 // This operation is not clipped, so add it 00356 updatedCigar += *op; 00357 if(!Cigar::isClip(op->operation)) 00358 { 00359 onlyClips = false; 00360 } 00361 } 00362 // This operation occurs in the query sequence, so 00363 // increment the number of positions covered. 00364 numPositions += op->count; 00365 } 00366 00367 // Move to the next cigar position. 00368 ++origCigarOpIndex; 00369 } 00370 00371 ////////////////// 00372 // Add the softclip to the back. 00373 if(numBackClips != 0) 00374 { 00375 // Add the softclip to the end 00376 updatedCigar.Add(Cigar::softClip, numBackClips); 00377 } 00378 00379 ////////////////// 00380 // Add any hardclips remaining in the original cigar to the back. 00381 while(origCigarOpIndex < oldCigar.size()) 00382 { 00383 op = &(oldCigar.getOperator(origCigarOpIndex)); 00384 if(op->operation == Cigar::hardClip) 00385 { 00386 // Keep this operation as the new clips do not 00387 // affect other clips. 00388 updatedCigar += *op; 00389 } 00390 ++origCigarOpIndex; 00391 } 00392 00393 // Check to see if the new cigar is only clips. 00394 if(onlyClips) 00395 { 00396 // Only clips in the new cigar, so mark the read as filtered 00397 // instead of updating the cigar. 00398 ///////////////////////////// 00399 // The entire read was clipped. 00400 status = FILTERED; 00401 } 00402 else 00403 { 00404 // Part of the read was clipped. 00405 // Update the starting position if a clip was added to 00406 // the front. 00407 if(numFrontClips > 0) 00408 { 00409 // Convert from query index to reference position (from the 00410 // old cigar) 00411 // Get the position for the last front clipped position by 00412 // getting the position associated with the clipped base on 00413 // the reference. Then add one to get to the first 00414 // non-clipped position. 00415 int32_t lastFrontClipPos = numFrontClips - 1; 00416 int32_t newStartPos = oldCigar.getRefPosition(lastFrontClipPos, 00417 startPos); 00418 if(newStartPos != Cigar::INDEX_NA) 00419 { 00420 // Add one to get first non-clipped position. 00421 startPos = newStartPos + 1; 00422 } 00423 } 00424 } 00425 } 00426 return(status); 00427 } 00428 00429 00430 SamFilter::FilterStatus SamFilter::filterOnMismatchQuality(SamRecord& record, 00431 GenomeSequence& refSequence, 00432 uint32_t qualityThreshold, 00433 uint8_t defaultQualityInt) 00434 { 00435 uint32_t totalMismatchQuality = 00436 sumMismatchQuality(record, refSequence, defaultQualityInt); 00437 00438 // If the total mismatch quality is over the threshold, 00439 // filter the read. 00440 if(totalMismatchQuality > qualityThreshold) 00441 { 00442 filterRead(record); 00443 return(FILTERED); 00444 } 00445 return(NONE); 00446 } 00447 00448 00449 // NOTE: Only positions where the reference and read both have bases that 00450 // are different and not 'N' are considered mismatches. 00451 uint32_t SamFilter::sumMismatchQuality(SamRecord& record, 00452 GenomeSequence& refSequence, 00453 uint8_t defaultQualityInt) 00454 { 00455 // Track the mismatch info. 00456 int mismatchQual = 0; 00457 int numMismatch = 0; 00458 00459 SamQuerySeqWithRefIter sequenceIter(record, refSequence); 00460 00461 SamSingleBaseMatchInfo baseMatchInfo; 00462 while(sequenceIter.getNextMatchMismatch(baseMatchInfo)) 00463 { 00464 if(baseMatchInfo.getType() == SamSingleBaseMatchInfo::MISMATCH) 00465 { 00466 // Got a mismatch, get the associated quality. 00467 char readQualityChar = 00468 record.getQuality(baseMatchInfo.getQueryIndex()); 00469 uint8_t readQualityInt = 00470 BaseUtilities::getPhredBaseQuality(readQualityChar); 00471 00472 if(readQualityInt == BaseUtilities::UNKNOWN_QUALITY_INT) 00473 { 00474 // Quality was not specified, so use the configured setting. 00475 readQualityInt = defaultQualityInt; 00476 } 00477 mismatchQual += readQualityInt; 00478 ++numMismatch; 00479 } 00480 } 00481 00482 return(mismatchQual); 00483 } 00484 00485 00486 void SamFilter::filterRead(SamRecord& record) 00487 { 00488 // Filter the read by marking it as unmapped. 00489 uint16_t flag = record.getFlag(); 00490 SamFlag::setUnmapped(flag); 00491 // Clear N/A flags. 00492 flag &= ~SamFlag::PROPER_PAIR; 00493 flag &= ~SamFlag::SECONDARY_ALIGNMENT; 00494 flag &= ~SamFlag::SUPPLEMENTARY_ALIGNMENT; 00495 record.setFlag(flag); 00496 // Clear Cigar 00497 record.setCigar("*"); 00498 // Clear mapping quality 00499 record.setMapQuality(0); 00500 }