libStatGen Software
1
|
00001 /* 00002 * Copyright (C) 2010 Regents of the University of Michigan 00003 * 00004 * This program is free software: you can redistribute it and/or modify 00005 * it under the terms of the GNU General Public License as published by 00006 * the Free Software Foundation, either version 3 of the License, or 00007 * (at your option) any later version. 00008 * 00009 * This program is distributed in the hope that it will be useful, 00010 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 * GNU General Public License for more details. 00013 * 00014 * You should have received a copy of the GNU General Public License 00015 * along with this program. If not, see <http://www.gnu.org/licenses/>. 00016 */ 00017 00018 #include <iostream> 00019 00020 #include "SamValidation.h" 00021 #include "CigarRoller.h" 00022 #include "SamTags.h" 00023 00024 const char* SamValidationError::enumSeverityString[] = { 00025 "WARNING", "ERROR"}; 00026 00027 const char* SamValidationError::enumTypeString[] = { 00028 "INVALID_QNAME", 00029 "INVALID_REF_ID", 00030 "INVALID_RNAME", 00031 "INVALID_POS", 00032 "INVALID_MAPQ", 00033 "INVALID_CIGAR", 00034 "INVALID_MRNM", 00035 "INVALID_QUAL", 00036 "INVALID_TAG" 00037 }; 00038 00039 const char* SamValidationError::getTypeString(Type type) 00040 { 00041 return(enumTypeString[type]); 00042 } 00043 00044 00045 SamValidationError::SamValidationError(Type type, Severity severity, 00046 std::string message) 00047 { 00048 myType = type; 00049 mySeverity = severity; 00050 myMessage = message; 00051 } 00052 00053 00054 SamValidationError::Type SamValidationError::getType() const 00055 { 00056 return(myType); 00057 } 00058 00059 00060 SamValidationError::Severity SamValidationError::getSeverity() const 00061 { 00062 return(mySeverity); 00063 } 00064 00065 00066 const char* SamValidationError::getMessage() const 00067 { 00068 return(myMessage.c_str()); 00069 } 00070 00071 00072 const char* SamValidationError::getTypeString() const 00073 { 00074 return(enumTypeString[myType]); 00075 } 00076 00077 00078 const char* SamValidationError::getSeverityString() const 00079 { 00080 return(enumSeverityString[mySeverity]); 00081 } 00082 00083 00084 void SamValidationError::getErrorString(std::string& errorString) const 00085 { 00086 errorString = getTypeString(); 00087 errorString += " ("; 00088 errorString += getSeverityString(); 00089 errorString += ") : "; 00090 errorString += getMessage(); 00091 errorString += "\n"; 00092 } 00093 00094 00095 void SamValidationError::printError() const 00096 { 00097 std::cerr << this; 00098 } 00099 00100 00101 00102 // Constructor. 00103 SamValidationErrors::SamValidationErrors() 00104 : myValidationErrors() 00105 { 00106 myErrorIter = myValidationErrors.begin(); 00107 } 00108 00109 00110 // Destructor 00111 SamValidationErrors::~SamValidationErrors() 00112 { 00113 clear(); 00114 } 00115 00116 00117 void SamValidationErrors::clear() 00118 { 00119 // Clear the errors. 00120 std::list<const SamValidationError*>::iterator errorIter; 00121 for(errorIter = myValidationErrors.begin(); 00122 errorIter != myValidationErrors.end(); ++errorIter) 00123 { 00124 delete *errorIter; 00125 *errorIter = NULL; 00126 } 00127 myValidationErrors.clear(); 00128 myErrorIter = myValidationErrors.end(); 00129 } 00130 00131 00132 void SamValidationErrors::addError(SamValidationError::Type newType, 00133 SamValidationError::Severity newSeverity, 00134 const char* newMessage) 00135 { 00136 myValidationErrors.push_back(new SamValidationError(newType, 00137 newSeverity, 00138 newMessage)); 00139 00140 // If this is the first element in the list, set the iterator. 00141 if(myValidationErrors.size() == 1) 00142 { 00143 // set the iterator to the first element. 00144 myErrorIter = myValidationErrors.begin(); 00145 } 00146 } 00147 00148 00149 00150 // Return the number of validation errors that are contained in this object. 00151 unsigned int SamValidationErrors::numErrors() 00152 { 00153 return(myValidationErrors.size()); 00154 } 00155 00156 00157 // Return a pointer to the next error. It does not remove it from the list. 00158 // Returns null once all errors have been retrieved until resetErrorIter 00159 // is called. 00160 const SamValidationError* SamValidationErrors::getNextError() 00161 { 00162 if(myErrorIter == myValidationErrors.end()) 00163 { 00164 // at the end of the list, return null. 00165 return(NULL); 00166 } 00167 // Not at the end of the list, return the last element and increment. 00168 return(*myErrorIter++); 00169 } 00170 00171 00172 // Resets the iterator to the begining of the errors. 00173 void SamValidationErrors::resetErrorIter() 00174 { 00175 myErrorIter = myValidationErrors.begin(); 00176 } 00177 00178 00179 // Appends the error messages to the passed in string. 00180 void SamValidationErrors::getErrorString(std::string& errorString) const 00181 { 00182 for(std::list<const SamValidationError*>:: 00183 const_iterator validationErrorIter = 00184 myValidationErrors.begin(); 00185 validationErrorIter != myValidationErrors.end(); 00186 validationErrorIter++) 00187 { 00188 std::string error = ""; 00189 (*validationErrorIter)->getErrorString(error); 00190 errorString += error; 00191 } 00192 } 00193 00194 00195 bool SamValidator::isValid(SamFileHeader& samHeader, SamRecord& samRecord, 00196 SamValidationErrors& validationErrors) 00197 { 00198 bool status = true; 00199 status &= isValidQname(samRecord.getReadName(), 00200 samRecord.getReadNameLength(), 00201 validationErrors); 00202 00203 status &= isValidFlag(samRecord.getFlag(), 00204 validationErrors); 00205 00206 // Validate the RName including validating it against the header. 00207 status &= isValidRname(samHeader, 00208 samRecord.getReferenceName(), 00209 validationErrors); 00210 00211 status &= isValidRefID(samRecord.getReferenceID(), 00212 samHeader.getReferenceInfo(), 00213 validationErrors); 00214 00215 status &= isValid1BasedPos(samRecord.get1BasedPosition(), 00216 validationErrors); 00217 00218 status &= isValidMapQuality(samRecord.getMapQuality(), validationErrors); 00219 00220 status &= isValidSequence(samRecord, validationErrors); 00221 00222 status &= isValidCigar(samRecord, validationErrors); 00223 00224 status &= isValidQuality(samRecord, validationErrors); 00225 00226 status &= isValidTags(samRecord, validationErrors); 00227 00228 return(status); 00229 } 00230 00231 00232 // qname is the query (read) name - result of SamRecord::getReadName(). 00233 // readNameLen is the length of the read name including the null (the result 00234 // of SamRecord::getReadNameLength()). 00235 // For some invalid records, the getReadNameLength may be different than the 00236 // length of qname. 00237 // NOTE: Query Name and Read Name both refer to the same field. 00238 bool SamValidator::isValidQname(const char* qname, uint8_t readNameLen, 00239 SamValidationErrors& validationErrors) 00240 { 00241 // Validation for QNAME is: 00242 // a) length of the qname string is the same as the read name length 00243 // b) length is between 1 and 254. 00244 // c) [ \t\n\r] are not allowed in the name. 00245 00246 bool status = true; 00247 00248 // Get the length of the qname string. 00249 int32_t qnameLenNull = strlen(qname) + 1; 00250 00251 //////////////////////////////////// 00252 // a) length of the qname string is the same as the read name length 00253 if(qnameLenNull != readNameLen) 00254 { 00255 // This results from a poorly formatted bam file, where the null 00256 // terminated read_name field is not the same length as specified by 00257 // read_name_len. 00258 String message = "Invalid Query Name - the string length ("; 00259 message += qnameLenNull; 00260 message += ") does not match the specified query name length ("; 00261 message += readNameLen; 00262 message += ")."; 00263 00264 validationErrors.addError(SamValidationError::INVALID_QNAME, 00265 SamValidationError::ERROR, 00266 message.c_str()); 00267 status = false; 00268 } 00269 00270 //////////////////////////////////// 00271 // b) length is between 1 and 254 00272 // The length with the terminating null must be between 2 & 255, 00273 if((qnameLenNull < 2) || (qnameLenNull > 255)) 00274 { 00275 String message = "Invalid Query Name (QNAME) length: "; 00276 message += qnameLenNull; 00277 message += ". Length with the terminating null must be between 2 & 255."; 00278 00279 validationErrors.addError(SamValidationError::INVALID_QNAME, 00280 SamValidationError::WARNING, 00281 message.c_str()); 00282 status = false; 00283 } 00284 00285 //////////////////////////////////// 00286 // Loop through and validate they all characters are valid. 00287 // c) [ \t\n\r] are not allowed in the name. 00288 String message; 00289 for(int i = 0; i < qnameLenNull; ++i) 00290 { 00291 switch(qname[i]) 00292 { 00293 case ' ': 00294 // Invalid character. 00295 message = "Invalid character in the Query Name (QNAME): ' ' at position "; 00296 message += i; 00297 message += "."; 00298 validationErrors.addError(SamValidationError::INVALID_QNAME, 00299 SamValidationError::WARNING, 00300 message.c_str()); 00301 status = false; 00302 break; 00303 case '\t': 00304 // Invalid character. 00305 message = "Invalid character in the Query Name (QNAME): '\t' at position "; 00306 message += i; 00307 message += "."; 00308 validationErrors.addError(SamValidationError::INVALID_QNAME, 00309 SamValidationError::WARNING, 00310 message.c_str()); 00311 status = false; 00312 break; 00313 case '\n': 00314 // Invalid character. 00315 message = "Invalid character in the Query Name (QNAME): '\n' at position "; 00316 message += i; 00317 message += "."; 00318 validationErrors.addError(SamValidationError::INVALID_QNAME, 00319 SamValidationError::WARNING, 00320 message.c_str()); 00321 status = false; 00322 break; 00323 case '\r': 00324 // Invalid character. 00325 message = "Invalid character in the Query Name (QNAME): '\r' at position "; 00326 message += i; 00327 message += "."; 00328 validationErrors.addError(SamValidationError::INVALID_QNAME, 00329 SamValidationError::WARNING, 00330 message.c_str()); 00331 status = false; 00332 break; 00333 } 00334 } 00335 00336 return(status); 00337 } 00338 00339 00340 bool SamValidator::isValidFlag(uint16_t flag, 00341 SamValidationErrors& validationErrors) 00342 { 00343 // All values in a uint16_t are valid, so return true. 00344 return(true); 00345 } 00346 00347 00348 bool SamValidator::isValidRname(SamFileHeader& samHeader, 00349 const char* rname, 00350 SamValidationErrors& validationErrors) 00351 { 00352 bool status = true; 00353 00354 // Cross validate the rname and the header. 00355 // If the rname is not '*' 00356 // AND there are any SQ records in the header, 00357 // Then the rname must be in one of them. 00358 if((strcmp(rname, "*") != 0) && 00359 (samHeader.getNumSQs() != 0) && 00360 (samHeader.getSQ(rname) == NULL)) 00361 { 00362 // There are SQ fields, but the ref name is not in it. 00363 status = false; 00364 std::string message = "RNAME, "; 00365 message += rname; 00366 message += ", was not found in a SAM Header SQ record"; 00367 validationErrors.addError(SamValidationError::INVALID_RNAME, 00368 SamValidationError::WARNING, 00369 message.c_str()); 00370 } 00371 status &= isValidRname(rname, validationErrors); 00372 return(status); 00373 } 00374 00375 00376 bool SamValidator::isValidRname(const char* rname, 00377 SamValidationErrors& validationErrors) 00378 { 00379 // Validation for RNAME is: 00380 // a) cannot be 0 length. 00381 // b) [ \t\n\r@=] are not allowed in the name. 00382 00383 bool status = true; 00384 00385 // Get the length of the rname string. 00386 int32_t rnameLen = strlen(rname); 00387 00388 String message; 00389 00390 if(rnameLen == 0) 00391 { 00392 validationErrors.addError(SamValidationError::INVALID_RNAME, 00393 SamValidationError::WARNING, 00394 "Reference Sequence Name (RNAME) cannot have 0 length."); 00395 status = false; 00396 } 00397 00398 //////////////////////////////////// 00399 //////////////////////////////////// 00400 // Loop through and validate they all characters are valid. 00401 // b) [ \t\n\r] are not allowed in the name. 00402 for(int i = 0; i < rnameLen; ++i) 00403 { 00404 switch(rname[i]) 00405 { 00406 case ' ': 00407 // Invalid character. 00408 message = "Invalid character in the Reference Sequence Name (RNAME): ' ' at position "; 00409 message += i; 00410 message += "."; 00411 validationErrors.addError(SamValidationError::INVALID_RNAME, 00412 SamValidationError::WARNING, 00413 message.c_str()); 00414 status = false; 00415 break; 00416 case '\t': 00417 // Invalid character. 00418 message = "Invalid character in the Reference Sequence Name (RNAME): '\t' at position "; 00419 message += i; 00420 message += "."; 00421 validationErrors.addError(SamValidationError::INVALID_RNAME, 00422 SamValidationError::WARNING, 00423 message.c_str()); 00424 status = false; 00425 break; 00426 case '\n': 00427 // Invalid character. 00428 message = "Invalid character in the Reference Sequence Name (RNAME): '\n' at position "; 00429 message += i; 00430 message += "."; 00431 validationErrors.addError(SamValidationError::INVALID_RNAME, 00432 SamValidationError::WARNING, 00433 message.c_str()); 00434 status = false; 00435 break; 00436 case '\r': 00437 // Invalid character. 00438 message = "Invalid character in the Reference Sequence Name (RNAME): '\r' at position "; 00439 message += i; 00440 message += "."; 00441 validationErrors.addError(SamValidationError::INVALID_RNAME, 00442 SamValidationError::WARNING, 00443 message.c_str()); 00444 status = false; 00445 break; 00446 case '@': 00447 // Invalid character. 00448 message = "Invalid character in the Reference Sequence Name (RNAME): '@' at position "; 00449 message += i; 00450 message += "."; 00451 validationErrors.addError(SamValidationError::INVALID_RNAME, 00452 SamValidationError::WARNING, 00453 message.c_str()); 00454 status = false; 00455 break; 00456 case '=': 00457 // Invalid character. 00458 message = "Invalid character in the Reference Sequence Name (RNAME): '=' at position "; 00459 message += i; 00460 message += "."; 00461 validationErrors.addError(SamValidationError::INVALID_RNAME, 00462 SamValidationError::WARNING, 00463 message.c_str()); 00464 status = false; 00465 break; 00466 default: 00467 // Allowed character. 00468 break; 00469 } 00470 } 00471 00472 return(status); 00473 } 00474 00475 00476 bool SamValidator::isValidRefID(int32_t refID, 00477 const SamReferenceInfo& refInfo, 00478 SamValidationErrors& validationErrors) 00479 { 00480 // Validation for rID is: 00481 // a) must be between -1 and the number of refInfo. 00482 // -1 is allowed, and otherwise it must properly index into the array. 00483 00484 bool status = true; 00485 if((refID < -1) || (refID >= refInfo.getNumEntries())) 00486 { 00487 // Reference ID is too large or too small. 00488 String message = "Invalid Reference ID, out of range ("; 00489 message += refID; 00490 message += ") must be between -1 and "; 00491 message += refInfo.getNumEntries() - 1; 00492 message += "."; 00493 00494 validationErrors.addError(SamValidationError::INVALID_REF_ID, 00495 SamValidationError::WARNING, 00496 message.c_str()); 00497 status = false; 00498 } 00499 00500 return(status); 00501 } 00502 00503 00504 bool SamValidator::isValid1BasedPos(int32_t pos, 00505 SamValidationErrors& validationErrors) 00506 { 00507 // Validation for pos is: 00508 // a) must be between 0 and (2^29)-1. 00509 00510 bool status = true; 00511 00512 if((pos < 0) || (pos > 536870911)) 00513 { 00514 String message = "POS out of range ("; 00515 message += pos; 00516 message += ") must be between 0 and (2^29)-1."; 00517 00518 validationErrors.addError(SamValidationError::INVALID_POS, 00519 SamValidationError::WARNING, 00520 message.c_str()); 00521 status = false; 00522 } 00523 00524 return(status); 00525 } 00526 00527 00528 bool SamValidator::isValidMapQuality(uint8_t mapQuality, 00529 SamValidationErrors& validationErrors) 00530 { 00531 // All values in a uint8_t are valid, so return true. 00532 return(true); 00533 } 00534 00535 00536 bool SamValidator::isValidSequence(SamRecord& samRecord, 00537 SamValidationErrors& validationErrors) 00538 { 00539 return(true); 00540 } 00541 00542 00543 bool SamValidator::isValidCigar(SamRecord& samRecord, 00544 SamValidationErrors& validationErrors) 00545 { 00546 return(isValidCigar(samRecord.getCigar(), 00547 samRecord.getReadLength(), 00548 validationErrors)); 00549 } 00550 00551 bool SamValidator::isValidCigar(const char* cigar, 00552 const char* sequence, 00553 SamValidationErrors& validationErrors) 00554 { 00555 return(isValidCigar(cigar, strlen(sequence), validationErrors)); 00556 } 00557 00558 bool SamValidator::isValidCigar(const char* cigar, 00559 int seqLen, 00560 SamValidationErrors& validationErrors) 00561 { 00562 // Validation for CIGAR is: 00563 // a) cannot be 0 length. 00564 // if not "*", validate the following: 00565 // b) must have an integer length for each operator (if not "*"). TODO 00566 // c) all operators must be valid (if not "*"). TODO 00567 // d) evaluates to the same read length as the sequence string. 00568 bool status = true; 00569 String message; 00570 00571 int32_t cigarLen = strlen(cigar); 00572 00573 // a) cannot be 0 length. 00574 if(cigarLen == 0) 00575 { 00576 validationErrors.addError(SamValidationError::INVALID_CIGAR, 00577 SamValidationError::WARNING, 00578 "Cigar must not be blank."); 00579 status = false; 00580 } 00581 00582 if(strcmp(cigar, "*") != 0) 00583 { 00584 // The cigar is not "*", so validate it. 00585 CigarRoller cigarRoller(cigar); 00586 00587 // b) must have an integer length for each operator. 00588 // TODO 00589 // c) all operators must be valid. 00590 // TODO 00591 00592 // d) is the same length as the sequence string. 00593 int cigarSeqLen = cigarRoller.getExpectedQueryBaseCount(); 00594 if(cigarSeqLen != seqLen) 00595 { 00596 message = "CIGAR does not evaluate to the same length as SEQ, ("; 00597 message += cigarSeqLen; 00598 message += " != "; 00599 message += seqLen; 00600 message += ")."; 00601 validationErrors.addError(SamValidationError::INVALID_CIGAR, 00602 SamValidationError::WARNING, 00603 message.c_str()); 00604 status = false; 00605 } 00606 } 00607 return(status); 00608 } 00609 00610 00611 bool SamValidator::isValidQuality(SamRecord& samRecord, 00612 SamValidationErrors& validationErrors) 00613 { 00614 return(isValidQuality(samRecord.getQuality(), 00615 samRecord.getReadLength(), 00616 validationErrors)); 00617 } 00618 00619 00620 bool SamValidator::isValidQuality(const char* quality, 00621 const char* sequence, 00622 SamValidationErrors& validationErrors) 00623 { 00624 // Determine the length of the sequence. 00625 int seqLen = strlen(sequence); 00626 00627 // Check if the sequence is '*' since then the seqLength is 0. 00628 if(strcmp(sequence, "*") == 0) 00629 { 00630 seqLen = 0; 00631 } 00632 return(isValidQuality(quality, seqLen, validationErrors)); 00633 } 00634 00635 00636 bool SamValidator::isValidQuality(const char* quality, 00637 int seqLength, 00638 SamValidationErrors& validationErrors) 00639 { 00640 bool status = true; 00641 00642 // If the quality or the sequence are non-"*", validate that the quality 00643 // and sequence have the same length. 00644 if((seqLength != 0) && (strcmp(quality, "*") != 0)) 00645 { 00646 int qualLen = strlen(quality); 00647 // Both the sequence and the quality are not "*", so validate 00648 // that they are the same length. 00649 if(seqLength != qualLen) 00650 { 00651 // Both fields are specified but are different lengths. 00652 00653 String message = "QUAL is not the same length as SEQ, ("; 00654 message += qualLen; 00655 message += " != "; 00656 message += seqLength; 00657 message += ")."; 00658 00659 validationErrors.addError(SamValidationError::INVALID_QUAL, 00660 SamValidationError::WARNING, 00661 message.c_str()); 00662 status = false; 00663 } 00664 } 00665 return(status); 00666 } 00667 00668 00669 bool SamValidator::isValidTags(SamRecord& samRecord, 00670 SamValidationErrors& validationErrors) 00671 { 00672 bool status = true; 00673 00674 GenomeSequence* reference = samRecord.getReference(); 00675 // If the reference is not null, check the MD tag. 00676 if(reference != NULL) 00677 { 00678 const String* recordMD = samRecord.getStringTag(SamTags::MD_TAG); 00679 if(recordMD != NULL) 00680 { 00681 // The record has an MD tag so check to see if it is correct. 00682 if(!SamTags::isMDTagCorrect(samRecord, *reference)) 00683 { 00684 // Invalid MD tags. 00685 String correctMD; 00686 if(!SamTags::createMDTag(correctMD, samRecord, *reference)) 00687 { 00688 // Failed to get the MD tag, so indicate that it is unknown. 00689 correctMD = "UNKNOWN"; 00690 } 00691 String message = "Incorrect MD Tag, "; 00692 message += *recordMD; 00693 message += ", should be "; 00694 message += correctMD; 00695 message += "."; 00696 00697 validationErrors.addError(SamValidationError::INVALID_TAG, 00698 SamValidationError::WARNING, 00699 message.c_str()); 00700 00701 status = false; 00702 } 00703 } 00704 } 00705 00706 return(status); 00707 }