SamFilter.cpp
00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021 #include "SamFilter.h"
00022
00023 #include "SamQuerySeqWithRefHelper.h"
00024 #include "BaseUtilities.h"
00025 #include "SamFlag.h"
00026
00027 SamFilter::FilterStatus SamFilter::clipOnMismatchThreshold(SamRecord& record,
00028 GenomeSequence& refSequence,
00029 double mismatchThreshold)
00030 {
00031
00032 SamQuerySeqWithRefIter iterFromFront(record, refSequence, true);
00033 SamQuerySeqWithRefIter iterFromBack(record, refSequence, false);
00034
00035 SamSingleBaseMatchInfo baseMatchInfo;
00036
00037 int32_t readLength = record.getReadLength();
00038
00039 const int32_t initialLastFrontClipPos = -1;
00040 int32_t lastFrontClipPos = initialLastFrontClipPos;
00041
00042 int32_t firstBackClipPos = readLength;
00043
00044 bool fromFrontComplete = false;
00045 bool fromBackComplete = false;
00046 int32_t numBasesFromFront = 0;
00047 int32_t numBasesFromBack = 0;
00048 int32_t numMismatchFromFront = 0;
00049 int32_t numMismatchFromBack = 0;
00050
00051
00052
00053 while(!fromFrontComplete || !fromBackComplete)
00054 {
00055
00056
00057 while(!fromFrontComplete &&
00058 ((numBasesFromFront <= numBasesFromBack) ||
00059 (fromBackComplete)))
00060 {
00061 if(iterFromFront.getNextMatchMismatch(baseMatchInfo) == false)
00062 {
00063
00064 fromFrontComplete = true;
00065 break;
00066 }
00067
00068 if(baseMatchInfo.getQueryIndex() >= firstBackClipPos)
00069 {
00070
00071
00072 fromFrontComplete = true;
00073 break;
00074 }
00075
00076
00077 ++numBasesFromFront;
00078
00079 if(baseMatchInfo.getType() == SamSingleBaseMatchInfo::MISMATCH)
00080 {
00081
00082 ++numMismatchFromFront;
00083
00084 double mismatchPercent =
00085 (double)numMismatchFromFront / numBasesFromFront;
00086 if(mismatchPercent > mismatchThreshold)
00087 {
00088
00089 lastFrontClipPos = baseMatchInfo.getQueryIndex();
00090
00091 numBasesFromFront = 0;
00092 numMismatchFromFront = 0;
00093 }
00094 }
00095 }
00096
00097
00098
00099 while(!fromBackComplete &&
00100 ((numBasesFromBack <= numBasesFromFront) ||
00101 (fromFrontComplete)))
00102 {
00103 if(iterFromBack.getNextMatchMismatch(baseMatchInfo) == false)
00104 {
00105
00106 fromBackComplete = true;
00107 break;
00108 }
00109
00110 if(baseMatchInfo.getQueryIndex() <= lastFrontClipPos)
00111 {
00112
00113
00114 fromBackComplete = true;
00115 break;
00116 }
00117
00118
00119 ++numBasesFromBack;
00120
00121 if(baseMatchInfo.getType() == SamSingleBaseMatchInfo::MISMATCH)
00122 {
00123
00124 ++numMismatchFromBack;
00125
00126 double mismatchPercent =
00127 (double)numMismatchFromBack / numBasesFromBack;
00128 if(mismatchPercent > mismatchThreshold)
00129 {
00130
00131 firstBackClipPos = baseMatchInfo.getQueryIndex();
00132
00133 numBasesFromBack = 0;
00134 numMismatchFromBack = 0;
00135 }
00136 }
00137 }
00138 }
00139
00140
00141
00142
00143
00144
00145
00146
00147
00148
00149
00150 return(softClip(record, lastFrontClipPos + 1, readLength - firstBackClipPos));
00151 }
00152
00153
00154
00155 SamFilter::FilterStatus SamFilter::softClip(SamRecord& record,
00156 int32_t numFrontClips,
00157 int32_t numBackClips)
00158 {
00159
00160 Cigar* cigar = record.getCigarInfo();
00161 FilterStatus status = NONE;
00162 int32_t startPos = record.get0BasedPosition();
00163 CigarRoller updatedCigar;
00164
00165 status = softClip(*cigar, numFrontClips, numBackClips,
00166 startPos, updatedCigar);
00167
00168 if(status == FILTERED)
00169 {
00170
00171
00172
00173 filterRead(record);
00174 return(FILTERED);
00175 }
00176 else if(status == CLIPPED)
00177 {
00178
00179
00180 record.setCigar(updatedCigar);
00181
00182
00183 record.set0BasedPosition(startPos);
00184 }
00185 return(status);
00186 }
00187
00188
00189
00190
00191 SamFilter::FilterStatus SamFilter::softClip(Cigar& oldCigar,
00192 int32_t numFrontClips,
00193 int32_t numBackClips,
00194 int32_t& startPos,
00195 CigarRoller& updatedCigar)
00196 {
00197 int32_t readLength = oldCigar.getExpectedQueryBaseCount();
00198 int32_t endClipPos = readLength - numBackClips;
00199 FilterStatus status = NONE;
00200
00201 if((numFrontClips != 0) || (numBackClips != 0))
00202 {
00203
00204
00205
00206 int32_t totalClips = numFrontClips + numBackClips;
00207 if(totalClips >= readLength)
00208 {
00209
00210
00211
00212 return(FILTERED);
00213 }
00214
00215
00216 status = CLIPPED;
00217
00218
00219 int origCigarOpIndex = 0;
00220
00221
00222
00223
00224 int32_t numPositions = 0;
00225
00226
00227 bool onlyClips = true;
00228
00229 const Cigar::CigarOperator* op = NULL;
00230
00231
00232
00233 while((origCigarOpIndex < oldCigar.size()) &&
00234 (numPositions < numFrontClips))
00235 {
00236 op = &(oldCigar.getOperator(origCigarOpIndex));
00237 switch(op->operation)
00238 {
00239 case Cigar::hardClip:
00240
00241
00242 updatedCigar += *op;
00243 break;
00244 case Cigar::del:
00245 case Cigar::skip:
00246
00247
00248
00249 break;
00250 case Cigar::insert:
00251 case Cigar::match:
00252 case Cigar::mismatch:
00253 case Cigar::softClip:
00254
00255
00256 numPositions += op->count;
00257 break;
00258 case Cigar::none:
00259 default:
00260
00261 break;
00262 };
00263 ++origCigarOpIndex;
00264 }
00265
00266
00267
00268 if(numFrontClips != 0)
00269 {
00270
00271 updatedCigar.Add(Cigar::softClip, numFrontClips);
00272
00273
00274
00275 int32_t newCount = numPositions - numFrontClips;
00276 if(newCount > 0)
00277 {
00278
00279
00280
00281
00282
00283 if(numPositions > endClipPos)
00284 {
00285 newCount -= (numPositions - endClipPos);
00286 }
00287 if(newCount > 0)
00288 {
00289 updatedCigar.Add(op->operation, newCount);
00290 if(!Cigar::isClip(op->operation))
00291 {
00292 onlyClips = false;
00293 }
00294 }
00295 }
00296 }
00297
00298
00299
00300
00301
00302
00303
00304
00305
00306
00307 while((origCigarOpIndex < oldCigar.size()) &&
00308 (numPositions <= endClipPos))
00309 {
00310 op = &(oldCigar.getOperator(origCigarOpIndex));
00311
00312
00313
00314 if(!Cigar::foundInQuery(op->operation))
00315 {
00316
00317
00318
00319 updatedCigar += *op;
00320 if(!Cigar::isClip(op->operation))
00321 {
00322 onlyClips = false;
00323 }
00324 }
00325 else
00326 {
00327
00328
00329
00330
00331
00332
00333
00334
00335
00336
00337 uint32_t numPosTilClip = endClipPos - numPositions;
00338
00339 if(numPosTilClip < op->count)
00340 {
00341
00342
00343 if(numPosTilClip != 0)
00344 {
00345 updatedCigar.Add(op->operation,
00346 numPosTilClip);
00347 if(!Cigar::isClip(op->operation))
00348 {
00349 onlyClips = false;
00350 }
00351 }
00352 }
00353 else
00354 {
00355
00356 updatedCigar += *op;
00357 if(!Cigar::isClip(op->operation))
00358 {
00359 onlyClips = false;
00360 }
00361 }
00362
00363
00364 numPositions += op->count;
00365 }
00366
00367
00368 ++origCigarOpIndex;
00369 }
00370
00371
00372
00373 if(numBackClips != 0)
00374 {
00375
00376 updatedCigar.Add(Cigar::softClip, numBackClips);
00377 }
00378
00379
00380
00381 while(origCigarOpIndex < oldCigar.size())
00382 {
00383 op = &(oldCigar.getOperator(origCigarOpIndex));
00384 if(op->operation == Cigar::hardClip)
00385 {
00386
00387
00388 updatedCigar += *op;
00389 }
00390 ++origCigarOpIndex;
00391 }
00392
00393
00394 if(onlyClips)
00395 {
00396
00397
00398
00399
00400 status = FILTERED;
00401 }
00402 else
00403 {
00404
00405
00406
00407 if(numFrontClips > 0)
00408 {
00409
00410
00411
00412
00413
00414
00415 int32_t lastFrontClipPos = numFrontClips - 1;
00416 startPos =
00417 oldCigar.getRefPosition(lastFrontClipPos,
00418 startPos) + 1;
00419 }
00420 }
00421 }
00422 return(status);
00423 }
00424
00425
00426 SamFilter::FilterStatus SamFilter::filterOnMismatchQuality(SamRecord& record,
00427 GenomeSequence& refSequence,
00428 uint32_t qualityThreshold,
00429 uint8_t defaultQualityInt)
00430 {
00431 uint32_t totalMismatchQuality =
00432 sumMismatchQuality(record, refSequence, defaultQualityInt);
00433
00434
00435
00436 if(totalMismatchQuality > qualityThreshold)
00437 {
00438 filterRead(record);
00439 return(FILTERED);
00440 }
00441 return(NONE);
00442 }
00443
00444
00445
00446
00447 uint32_t SamFilter::sumMismatchQuality(SamRecord& record,
00448 GenomeSequence& refSequence,
00449 uint8_t defaultQualityInt)
00450 {
00451
00452 int mismatchQual = 0;
00453 int numMismatch = 0;
00454
00455 SamQuerySeqWithRefIter sequenceIter(record, refSequence);
00456
00457 SamSingleBaseMatchInfo baseMatchInfo;
00458 while(sequenceIter.getNextMatchMismatch(baseMatchInfo))
00459 {
00460 if(baseMatchInfo.getType() == SamSingleBaseMatchInfo::MISMATCH)
00461 {
00462
00463 char readQualityChar =
00464 record.getQuality(baseMatchInfo.getQueryIndex());
00465 uint8_t readQualityInt =
00466 BaseUtilities::getPhredBaseQuality(readQualityChar);
00467
00468 if(readQualityInt == BaseUtilities::UNKNOWN_QUALITY_INT)
00469 {
00470
00471 readQualityInt = defaultQualityInt;
00472 }
00473 mismatchQual += readQualityInt;
00474 ++numMismatch;
00475 }
00476 }
00477
00478 return(mismatchQual);
00479 }
00480
00481
00482 void SamFilter::filterRead(SamRecord& record)
00483 {
00484
00485 uint16_t flag = record.getFlag();
00486 SamFlag::setUnmapped(flag);
00487 record.setFlag(flag);
00488 }