00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018 #include <iostream>
00019
00020 #include "SamValidation.h"
00021 #include "CigarRoller.h"
00022 #include "SamTags.h"
00023
00024 const char* SamValidationError::enumSeverityString[] = {
00025 "WARNING", "ERROR"};
00026
00027 const char* SamValidationError::enumTypeString[] = {
00028 "INVALID_QNAME",
00029 "INVALID_REF_ID",
00030 "INVALID_RNAME",
00031 "INVALID_POS",
00032 "INVALID_MAPQ",
00033 "INVALID_CIGAR",
00034 "INVALID_MRNM",
00035 "INVALID_QUAL",
00036 "INVALID_TAG"
00037 };
00038
00039 const char* SamValidationError::getTypeString(Type type)
00040 {
00041 return(enumTypeString[type]);
00042 }
00043
00044
00045 SamValidationError::SamValidationError(Type type, Severity severity,
00046 std::string message)
00047 {
00048 myType = type;
00049 mySeverity = severity;
00050 myMessage = message;
00051 }
00052
00053
00054 SamValidationError::Type SamValidationError::getType() const
00055 {
00056 return(myType);
00057 }
00058
00059
00060 SamValidationError::Severity SamValidationError::getSeverity() const
00061 {
00062 return(mySeverity);
00063 }
00064
00065
00066 const char* SamValidationError::getMessage() const
00067 {
00068 return(myMessage.c_str());
00069 }
00070
00071
00072 const char* SamValidationError::getTypeString() const
00073 {
00074 return(enumTypeString[myType]);
00075 }
00076
00077
00078 const char* SamValidationError::getSeverityString() const
00079 {
00080 return(enumSeverityString[mySeverity]);
00081 }
00082
00083
00084 void SamValidationError::getErrorString(std::string& errorString) const
00085 {
00086 errorString = getTypeString();
00087 errorString += " (";
00088 errorString += getSeverityString();
00089 errorString += ") : ";
00090 errorString += getMessage();
00091 errorString += "\n";
00092 }
00093
00094
00095 void SamValidationError::printError() const
00096 {
00097 std::cerr << this;
00098 }
00099
00100
00101
00102
00103 SamValidationErrors::SamValidationErrors()
00104 : myValidationErrors()
00105 {
00106 myErrorIter = myValidationErrors.begin();
00107 }
00108
00109
00110
00111 SamValidationErrors::~SamValidationErrors()
00112 {
00113 clear();
00114 }
00115
00116
00117 void SamValidationErrors::clear()
00118 {
00119
00120 std::list<const SamValidationError*>::iterator errorIter;
00121 for(errorIter = myValidationErrors.begin();
00122 errorIter != myValidationErrors.end(); ++errorIter)
00123 {
00124 delete *errorIter;
00125 *errorIter = NULL;
00126 }
00127 myValidationErrors.clear();
00128 myErrorIter = myValidationErrors.end();
00129 }
00130
00131
00132 void SamValidationErrors::addError(SamValidationError::Type newType,
00133 SamValidationError::Severity newSeverity,
00134 const char* newMessage)
00135 {
00136 myValidationErrors.push_back(new SamValidationError(newType,
00137 newSeverity,
00138 newMessage));
00139
00140
00141 if(myValidationErrors.size() == 1)
00142 {
00143
00144 myErrorIter = myValidationErrors.begin();
00145 }
00146 }
00147
00148
00149
00150
00151 unsigned int SamValidationErrors::numErrors()
00152 {
00153 return(myValidationErrors.size());
00154 }
00155
00156
00157
00158
00159
00160 const SamValidationError* SamValidationErrors::getNextError()
00161 {
00162 if(myErrorIter == myValidationErrors.end())
00163 {
00164
00165 return(NULL);
00166 }
00167
00168 return(*myErrorIter++);
00169 }
00170
00171
00172
00173 void SamValidationErrors::resetErrorIter()
00174 {
00175 myErrorIter = myValidationErrors.begin();
00176 }
00177
00178
00179
00180 void SamValidationErrors::getErrorString(std::string& errorString) const
00181 {
00182 for(std::list<const SamValidationError*>::
00183 const_iterator validationErrorIter =
00184 myValidationErrors.begin();
00185 validationErrorIter != myValidationErrors.end();
00186 validationErrorIter++)
00187 {
00188 std::string error = "";
00189 (*validationErrorIter)->getErrorString(error);
00190 errorString += error;
00191 }
00192 }
00193
00194
00195 bool SamValidator::isValid(SamFileHeader& samHeader, SamRecord& samRecord,
00196 SamValidationErrors& validationErrors)
00197 {
00198 bool status = true;
00199 status &= isValidQname(samRecord.getReadName(),
00200 samRecord.getReadNameLength(),
00201 validationErrors);
00202
00203 status &= isValidFlag(samRecord.getFlag(),
00204 validationErrors);
00205
00206
00207 status &= isValidRname(samHeader,
00208 samRecord.getReferenceName(),
00209 validationErrors);
00210
00211 status &= isValidRefID(samRecord.getReferenceID(),
00212 *samHeader.getReferenceInfo(),
00213 validationErrors);
00214
00215 status &= isValid1BasedPos(samRecord.get1BasedPosition(),
00216 validationErrors);
00217
00218 status &= isValidMapQuality(samRecord.getMapQuality(), validationErrors);
00219
00220 status &= isValidSequence(samRecord, validationErrors);
00221
00222 status &= isValidCigar(samRecord, validationErrors);
00223
00224 status &= isValidQuality(samRecord, validationErrors);
00225
00226 status &= isValidTags(samRecord, validationErrors);
00227
00228 return(status);
00229 }
00230
00231
00232
00233
00234
00235
00236
00237
00238 bool SamValidator::isValidQname(const char* qname, uint8_t readNameLen,
00239 SamValidationErrors& validationErrors)
00240 {
00241
00242
00243
00244
00245
00246 bool status = true;
00247
00248
00249 int32_t qnameLenNull = strlen(qname) + 1;
00250
00251
00252
00253 if(qnameLenNull != readNameLen)
00254 {
00255
00256
00257
00258 String message = "Invalid Query Name - the string length (";
00259 message += qnameLenNull;
00260 message += ") does not match the specified query name length (";
00261 message += readNameLen;
00262 message += ").";
00263
00264 validationErrors.addError(SamValidationError::INVALID_QNAME,
00265 SamValidationError::ERROR,
00266 message.c_str());
00267 status = false;
00268 }
00269
00270
00271
00272
00273 if((qnameLenNull < 2) || (qnameLenNull > 255))
00274 {
00275 String message = "Invalid Query Name (QNAME) length: ";
00276 message += qnameLenNull;
00277 message += ". Length with the terminating null must be between 2 & 255.";
00278
00279 validationErrors.addError(SamValidationError::INVALID_QNAME,
00280 SamValidationError::WARNING,
00281 message.c_str());
00282 status = false;
00283 }
00284
00285
00286
00287
00288 String message;
00289 for(int i = 0; i < qnameLenNull; ++i)
00290 {
00291 switch(qname[i])
00292 {
00293 case ' ':
00294
00295 message = "Invalid character in the Query Name (QNAME): ' ' at position ";
00296 message += i;
00297 message += ".";
00298 validationErrors.addError(SamValidationError::INVALID_QNAME,
00299 SamValidationError::WARNING,
00300 message.c_str());
00301 status = false;
00302 break;
00303 case '\t':
00304
00305 message = "Invalid character in the Query Name (QNAME): '\t' at position ";
00306 message += i;
00307 message += ".";
00308 validationErrors.addError(SamValidationError::INVALID_QNAME,
00309 SamValidationError::WARNING,
00310 message.c_str());
00311 status = false;
00312 break;
00313 case '\n':
00314
00315 message = "Invalid character in the Query Name (QNAME): '\n' at position ";
00316 message += i;
00317 message += ".";
00318 validationErrors.addError(SamValidationError::INVALID_QNAME,
00319 SamValidationError::WARNING,
00320 message.c_str());
00321 status = false;
00322 break;
00323 case '\r':
00324
00325 message = "Invalid character in the Query Name (QNAME): '\r' at position ";
00326 message += i;
00327 message += ".";
00328 validationErrors.addError(SamValidationError::INVALID_QNAME,
00329 SamValidationError::WARNING,
00330 message.c_str());
00331 status = false;
00332 break;
00333 }
00334 }
00335
00336 return(status);
00337 }
00338
00339
00340 bool SamValidator::isValidFlag(uint16_t flag,
00341 SamValidationErrors& validationErrors)
00342 {
00343
00344 return(true);
00345 }
00346
00347
00348 bool SamValidator::isValidRname(SamFileHeader& samHeader,
00349 const char* rname,
00350 SamValidationErrors& validationErrors)
00351 {
00352 bool status = true;
00353
00354
00355
00356
00357
00358 if((strcmp(rname, "*") != 0) &&
00359 (samHeader.getNumSQs() != 0) &&
00360 (samHeader.getSQ(rname) == NULL))
00361 {
00362
00363 status = false;
00364 std::string message = "RNAME, ";
00365 message += rname;
00366 message += ", was not found in a SAM Header SQ record";
00367 validationErrors.addError(SamValidationError::INVALID_RNAME,
00368 SamValidationError::WARNING,
00369 message.c_str());
00370 }
00371 status &= isValidRname(rname, validationErrors);
00372 return(status);
00373 }
00374
00375
00376 bool SamValidator::isValidRname(const char* rname,
00377 SamValidationErrors& validationErrors)
00378 {
00379
00380
00381
00382
00383 bool status = true;
00384
00385
00386 int32_t rnameLen = strlen(rname);
00387
00388 String message;
00389
00390 if(rnameLen == 0)
00391 {
00392 validationErrors.addError(SamValidationError::INVALID_RNAME,
00393 SamValidationError::WARNING,
00394 "Reference Sequence Name (RNAME) cannot have 0 length.");
00395 status = false;
00396 }
00397
00398
00399
00400
00401
00402 for(int i = 0; i < rnameLen; ++i)
00403 {
00404 switch(rname[i])
00405 {
00406 case ' ':
00407
00408 message = "Invalid character in the Reference Sequence Name (RNAME): ' ' at position ";
00409 message += i;
00410 message += ".";
00411 validationErrors.addError(SamValidationError::INVALID_RNAME,
00412 SamValidationError::WARNING,
00413 message.c_str());
00414 status = false;
00415 break;
00416 case '\t':
00417
00418 message = "Invalid character in the Reference Sequence Name (RNAME): '\t' at position ";
00419 message += i;
00420 message += ".";
00421 validationErrors.addError(SamValidationError::INVALID_RNAME,
00422 SamValidationError::WARNING,
00423 message.c_str());
00424 status = false;
00425 break;
00426 case '\n':
00427
00428 message = "Invalid character in the Reference Sequence Name (RNAME): '\n' at position ";
00429 message += i;
00430 message += ".";
00431 validationErrors.addError(SamValidationError::INVALID_RNAME,
00432 SamValidationError::WARNING,
00433 message.c_str());
00434 status = false;
00435 break;
00436 case '\r':
00437
00438 message = "Invalid character in the Reference Sequence Name (RNAME): '\r' at position ";
00439 message += i;
00440 message += ".";
00441 validationErrors.addError(SamValidationError::INVALID_RNAME,
00442 SamValidationError::WARNING,
00443 message.c_str());
00444 status = false;
00445 break;
00446 case '@':
00447
00448 message = "Invalid character in the Reference Sequence Name (RNAME): '@' at position ";
00449 message += i;
00450 message += ".";
00451 validationErrors.addError(SamValidationError::INVALID_RNAME,
00452 SamValidationError::WARNING,
00453 message.c_str());
00454 status = false;
00455 break;
00456 case '=':
00457
00458 message = "Invalid character in the Reference Sequence Name (RNAME): '=' at position ";
00459 message += i;
00460 message += ".";
00461 validationErrors.addError(SamValidationError::INVALID_RNAME,
00462 SamValidationError::WARNING,
00463 message.c_str());
00464 status = false;
00465 break;
00466 default:
00467
00468 break;
00469 }
00470 }
00471
00472 return(status);
00473 }
00474
00475
00476 bool SamValidator::isValidRefID(int32_t refID,
00477 const SamReferenceInfo& refInfo,
00478 SamValidationErrors& validationErrors)
00479 {
00480
00481
00482
00483
00484 bool status = true;
00485 if((refID < -1) || (refID >= refInfo.getNumEntries()))
00486 {
00487
00488 String message = "Invalid Reference ID, out of range (";
00489 message += refID;
00490 message += ") must be between -1 and ";
00491 message += refInfo.getNumEntries() - 1;
00492 message += ".";
00493
00494 validationErrors.addError(SamValidationError::INVALID_REF_ID,
00495 SamValidationError::WARNING,
00496 message.c_str());
00497 status = false;
00498 }
00499
00500 return(status);
00501 }
00502
00503
00504 bool SamValidator::isValid1BasedPos(int32_t pos,
00505 SamValidationErrors& validationErrors)
00506 {
00507
00508
00509
00510 bool status = true;
00511
00512 if((pos < 0) || (pos > 536870911))
00513 {
00514 String message = "POS out of range (";
00515 message += pos;
00516 message += ") must be between 0 and (2^29)-1.";
00517
00518 validationErrors.addError(SamValidationError::INVALID_POS,
00519 SamValidationError::WARNING,
00520 message.c_str());
00521 status = false;
00522 }
00523
00524 return(status);
00525 }
00526
00527
00528 bool SamValidator::isValidMapQuality(uint8_t mapQuality,
00529 SamValidationErrors& validationErrors)
00530 {
00531
00532 return(true);
00533 }
00534
00535
00536 bool SamValidator::isValidSequence(SamRecord& samRecord,
00537 SamValidationErrors& validationErrors)
00538 {
00539 return(true);
00540 }
00541
00542
00543 bool SamValidator::isValidCigar(SamRecord& samRecord,
00544 SamValidationErrors& validationErrors)
00545 {
00546 return(isValidCigar(samRecord.getCigar(),
00547 samRecord.getReadLength(),
00548 validationErrors));
00549 }
00550
00551 bool SamValidator::isValidCigar(const char* cigar,
00552 const char* sequence,
00553 SamValidationErrors& validationErrors)
00554 {
00555 return(isValidCigar(cigar, strlen(sequence), validationErrors));
00556 }
00557
00558 bool SamValidator::isValidCigar(const char* cigar,
00559 int seqLen,
00560 SamValidationErrors& validationErrors)
00561 {
00562
00563
00564
00565
00566
00567
00568 bool status = true;
00569 String message;
00570
00571 int32_t cigarLen = strlen(cigar);
00572
00573
00574 if(cigarLen == 0)
00575 {
00576 validationErrors.addError(SamValidationError::INVALID_CIGAR,
00577 SamValidationError::WARNING,
00578 "Cigar must not be blank.");
00579 status = false;
00580 }
00581
00582 if(strcmp(cigar, "*") != 0)
00583 {
00584
00585 CigarRoller cigarRoller(cigar);
00586
00587
00588
00589
00590
00591
00592
00593 int cigarSeqLen = cigarRoller.getExpectedQueryBaseCount();
00594 if(cigarSeqLen != seqLen)
00595 {
00596 message = "CIGAR does not evaluate to the same length as SEQ, (";
00597 message += cigarSeqLen;
00598 message += " != ";
00599 message += seqLen;
00600 message += ").";
00601 validationErrors.addError(SamValidationError::INVALID_CIGAR,
00602 SamValidationError::WARNING,
00603 message.c_str());
00604 status = false;
00605 }
00606 }
00607 return(status);
00608 }
00609
00610
00611 bool SamValidator::isValidQuality(SamRecord& samRecord,
00612 SamValidationErrors& validationErrors)
00613 {
00614 return(isValidQuality(samRecord.getQuality(),
00615 samRecord.getReadLength(),
00616 validationErrors));
00617 }
00618
00619
00620 bool SamValidator::isValidQuality(const char* quality,
00621 const char* sequence,
00622 SamValidationErrors& validationErrors)
00623 {
00624
00625 int seqLen = strlen(sequence);
00626
00627
00628 if(strcmp(sequence, "*") == 0)
00629 {
00630 seqLen = 0;
00631 }
00632 return(isValidQuality(quality, seqLen, validationErrors));
00633 }
00634
00635
00636 bool SamValidator::isValidQuality(const char* quality,
00637 int seqLength,
00638 SamValidationErrors& validationErrors)
00639 {
00640 bool status = true;
00641
00642
00643
00644 if((seqLength != 0) && (strcmp(quality, "*") != 0))
00645 {
00646 int qualLen = strlen(quality);
00647
00648
00649 if(seqLength != qualLen)
00650 {
00651
00652
00653 String message = "QUAL is not the same length as SEQ, (";
00654 message += qualLen;
00655 message += " != ";
00656 message += seqLength;
00657 message += ").";
00658
00659 validationErrors.addError(SamValidationError::INVALID_QUAL,
00660 SamValidationError::WARNING,
00661 message.c_str());
00662 status = false;
00663 }
00664 }
00665 return(status);
00666 }
00667
00668
00669 bool SamValidator::isValidTags(SamRecord& samRecord,
00670 SamValidationErrors& validationErrors)
00671 {
00672 bool status = true;
00673
00674 GenomeSequence* reference = samRecord.getReference();
00675
00676 if(reference != NULL)
00677 {
00678 String* recordMD = samRecord.getStringTag(SamTags::MD_TAG);
00679 if(recordMD != NULL)
00680 {
00681
00682 if(!SamTags::isMDTagCorrect(samRecord, *reference))
00683 {
00684
00685 String correctMD;
00686 if(!SamTags::createMDTag(correctMD, samRecord, *reference))
00687 {
00688
00689 correctMD = "UNKNOWN";
00690 }
00691 String message = "Incorrect MD Tag, ";
00692 message += *recordMD;
00693 message += ", should be ";
00694 message += correctMD;
00695 message += ".";
00696
00697 validationErrors.addError(SamValidationError::INVALID_TAG,
00698 SamValidationError::WARNING,
00699 message.c_str());
00700
00701 status = false;
00702 }
00703 }
00704 }
00705
00706 return(status);
00707 }