libStatGen Software  1
SamValidation.cpp
00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #include <iostream>
00019 
00020 #include "SamValidation.h"
00021 #include "CigarRoller.h"
00022 #include "SamTags.h"
00023 
00024 const char* SamValidationError::enumSeverityString[] = {
00025     "WARNING", "ERROR"};
00026 
00027 const char* SamValidationError::enumTypeString[] = {
00028     "INVALID_QNAME",
00029     "INVALID_REF_ID",
00030     "INVALID_RNAME",
00031     "INVALID_POS",
00032     "INVALID_MAPQ",
00033     "INVALID_CIGAR",
00034     "INVALID_MRNM",
00035     "INVALID_QUAL",
00036     "INVALID_TAG"
00037 };
00038 
00039 const char* SamValidationError::getTypeString(Type type)
00040 {
00041     return(enumTypeString[type]);
00042 }
00043 
00044 
00045 SamValidationError::SamValidationError(Type type, Severity severity, 
00046                                        std::string message)
00047 {
00048     myType = type;
00049     mySeverity = severity;
00050     myMessage = message;
00051 }
00052 
00053 
00054 SamValidationError::Type SamValidationError::getType() const
00055 {
00056     return(myType);
00057 }
00058 
00059 
00060 SamValidationError::Severity SamValidationError::getSeverity() const
00061 {
00062     return(mySeverity);
00063 }
00064 
00065 
00066 const char* SamValidationError::getMessage() const
00067 {
00068     return(myMessage.c_str());
00069 }
00070 
00071 
00072 const char* SamValidationError::getTypeString() const
00073 {
00074     return(enumTypeString[myType]);
00075 }
00076 
00077 
00078 const char* SamValidationError::getSeverityString() const
00079 {
00080     return(enumSeverityString[mySeverity]);
00081 }
00082 
00083 
00084 void SamValidationError::getErrorString(std::string& errorString) const
00085 {
00086     errorString = getTypeString();
00087     errorString += " (";
00088     errorString += getSeverityString();
00089     errorString += ") : ";
00090     errorString += getMessage();
00091     errorString += "\n";
00092 }
00093 
00094 
00095 void SamValidationError::printError() const
00096 {
00097     std::cerr << this;
00098 }
00099 
00100 
00101 
00102 // Constructor.
00103 SamValidationErrors::SamValidationErrors()
00104     : myValidationErrors()
00105 {
00106     myErrorIter = myValidationErrors.begin();
00107 }
00108 
00109 
00110 // Destructor
00111 SamValidationErrors::~SamValidationErrors()
00112 {
00113     clear();
00114 }
00115 
00116 
00117 void SamValidationErrors::clear()
00118 {
00119     // Clear the errors.
00120     std::list<const SamValidationError*>::iterator errorIter;
00121     for(errorIter = myValidationErrors.begin(); 
00122         errorIter != myValidationErrors.end(); ++errorIter)
00123     {
00124         delete *errorIter;
00125         *errorIter = NULL;
00126     }
00127     myValidationErrors.clear();
00128     myErrorIter = myValidationErrors.end();
00129 }
00130 
00131 
00132 void SamValidationErrors::addError(SamValidationError::Type newType, 
00133                                    SamValidationError::Severity newSeverity, 
00134                                    const char* newMessage)
00135 {
00136     myValidationErrors.push_back(new SamValidationError(newType, 
00137                                                         newSeverity, 
00138                                                         newMessage));
00139 
00140     // If this is the first element in the list, set the iterator.
00141     if(myValidationErrors.size() == 1)
00142     {
00143         // set the iterator to the first element.
00144         myErrorIter = myValidationErrors.begin();
00145     }
00146 }
00147 
00148 
00149 
00150 // Return the number of validation errors that are contained in this object.
00151 unsigned int SamValidationErrors::numErrors()
00152 {
00153     return(myValidationErrors.size());
00154 }
00155 
00156 
00157 // Return a pointer to the next error.  It does not remove it from the list.
00158 // Returns null once all errors have been retrieved until resetErrorIter
00159 // is called.
00160 const SamValidationError* SamValidationErrors::getNextError()
00161 {
00162     if(myErrorIter == myValidationErrors.end())
00163     {
00164         // at the end of the list, return null.
00165         return(NULL);
00166     }
00167     // Not at the end of the list, return the last element and increment.
00168     return(*myErrorIter++);
00169 }
00170 
00171    
00172 // Resets the iterator to the begining of the errors.
00173 void SamValidationErrors::resetErrorIter()
00174 {
00175     myErrorIter = myValidationErrors.begin();
00176 }
00177 
00178 
00179 // Appends the error messages to the passed in string.
00180 void SamValidationErrors::getErrorString(std::string& errorString) const
00181 {
00182     for(std::list<const SamValidationError*>::
00183             const_iterator validationErrorIter = 
00184             myValidationErrors.begin(); 
00185         validationErrorIter != myValidationErrors.end(); 
00186         validationErrorIter++)
00187     {
00188         std::string error = "";
00189         (*validationErrorIter)->getErrorString(error);
00190         errorString += error;
00191     }
00192 }
00193 
00194 
00195 bool SamValidator::isValid(SamFileHeader& samHeader, SamRecord& samRecord, 
00196                            SamValidationErrors& validationErrors)
00197 {
00198     bool status = true;
00199     status &= isValidQname(samRecord.getReadName(), 
00200                            samRecord.getReadNameLength(), 
00201                            validationErrors);
00202 
00203     status &= isValidFlag(samRecord.getFlag(), 
00204                           validationErrors);
00205 
00206     // Validate the RName including validating it against the header.
00207     status &= isValidRname(samHeader,
00208                            samRecord.getReferenceName(), 
00209                            validationErrors);
00210 
00211     status &= isValidRefID(samRecord.getReferenceID(), 
00212                            samHeader.getReferenceInfo(), 
00213                            validationErrors);
00214 
00215     status &= isValid1BasedPos(samRecord.get1BasedPosition(),
00216                                validationErrors);
00217     
00218     status &= isValidMapQuality(samRecord.getMapQuality(), validationErrors);
00219 
00220     status &= isValidSequence(samRecord, validationErrors);
00221 
00222     status &= isValidCigar(samRecord, validationErrors);
00223     
00224     status &= isValidQuality(samRecord, validationErrors);
00225 
00226     status &= isValidTags(samRecord, validationErrors);
00227 
00228     return(status);
00229 }
00230 
00231 
00232 // qname is the query (read) name - result of SamRecord::getReadName().
00233 // readNameLen is the length of the read name including the null (the result
00234 // of SamRecord::getReadNameLength()).
00235 // For some invalid records, the getReadNameLength may be different than the 
00236 // length of qname.
00237 // NOTE: Query Name and Read Name both refer to the same field.
00238 bool SamValidator::isValidQname(const char* qname, uint8_t readNameLen, 
00239                                 SamValidationErrors& validationErrors)
00240 {
00241     // Validation for QNAME is:
00242     //   a) length of the qname string is the same as the read name length
00243     //   b) length is between 1 and 254.
00244     //   c) [ \t\n\r] are not allowed in the name.
00245 
00246     bool status = true;
00247 
00248     // Get the length of the qname string.
00249     int32_t qnameLenNull = strlen(qname) + 1;
00250 
00251     ////////////////////////////////////
00252     //   a) length of the qname string is the same as the read name length
00253     if(qnameLenNull != readNameLen)
00254     {
00255         // This results from a poorly formatted bam file, where the null
00256         // terminated read_name field is not the same length as specified by 
00257         // read_name_len.
00258         String message = "Invalid Query Name - the string length (";
00259         message += qnameLenNull;
00260         message += ") does not match the specified query name length (";
00261         message += readNameLen;
00262         message += ").";
00263 
00264         validationErrors.addError(SamValidationError::INVALID_QNAME,
00265                                   SamValidationError::ERROR, 
00266                                   message.c_str());
00267         status = false;
00268     }
00269 
00270     ////////////////////////////////////
00271     //   b) length is between 1 and 254
00272     // The length with the terminating null must be between 2 & 255,
00273     if((qnameLenNull < 2) || (qnameLenNull > 255))
00274     {
00275         String message = "Invalid Query Name (QNAME) length: ";
00276         message += qnameLenNull;
00277         message += ".  Length with the terminating null must be between 2 & 255.";
00278       
00279         validationErrors.addError(SamValidationError::INVALID_QNAME,
00280                                   SamValidationError::WARNING, 
00281                                   message.c_str());
00282         status = false;
00283     }
00284 
00285     ////////////////////////////////////
00286     // Loop through and validate they all characters are valid.
00287     //   c) [ \t\n\r] are not allowed in the name.
00288     String message;
00289     for(int i = 0; i < qnameLenNull; ++i)
00290     {
00291         switch(qname[i])
00292         {
00293             case ' ':
00294                 // Invalid character.
00295                 message = "Invalid character in the Query Name (QNAME): ' ' at position ";
00296                 message += i;
00297                 message += ".";
00298                 validationErrors.addError(SamValidationError::INVALID_QNAME,
00299                                           SamValidationError::WARNING, 
00300                                           message.c_str());
00301                 status = false;
00302                 break;
00303             case '\t':
00304                 // Invalid character.
00305                 message = "Invalid character in the Query Name (QNAME): '\t' at position ";
00306                 message += i;
00307                 message += ".";
00308                 validationErrors.addError(SamValidationError::INVALID_QNAME,
00309                                           SamValidationError::WARNING, 
00310                                           message.c_str());
00311                 status = false;
00312                 break;
00313             case '\n':
00314                 // Invalid character.
00315                 message = "Invalid character in the Query Name (QNAME): '\n' at position ";
00316                 message += i;
00317                 message += ".";
00318                 validationErrors.addError(SamValidationError::INVALID_QNAME,
00319                                           SamValidationError::WARNING, 
00320                                           message.c_str());
00321                 status = false;
00322                 break;
00323             case '\r':
00324                 // Invalid character.
00325                 message = "Invalid character in the Query Name (QNAME): '\r' at position ";
00326                 message += i;
00327                 message += ".";
00328                 validationErrors.addError(SamValidationError::INVALID_QNAME,
00329                                           SamValidationError::WARNING, 
00330                                           message.c_str());
00331                 status = false;
00332                 break;
00333         }
00334     }
00335 
00336     return(status);
00337 }
00338 
00339 
00340 bool SamValidator::isValidFlag(uint16_t flag,
00341                                SamValidationErrors& validationErrors)
00342 {
00343     // All values in a uint16_t are valid, so return true.
00344     return(true);
00345 }
00346 
00347 
00348 bool SamValidator::isValidRname(SamFileHeader& samHeader,
00349                                 const char* rname,
00350                                 SamValidationErrors& validationErrors)
00351 {
00352     bool status = true;
00353 
00354     // Cross validate the rname and the header.
00355     // If the rname is not '*'
00356     // AND there are any SQ records in the header,
00357     // Then the rname must be in one of them.
00358     if((strcmp(rname, "*") != 0) &&
00359        (samHeader.getNumSQs() != 0) && 
00360        (samHeader.getSQ(rname) == NULL))
00361     {
00362         // There are SQ fields, but the ref name is not in it.
00363         status = false;
00364         std::string message = "RNAME, ";
00365         message += rname;
00366         message += ", was not found in a SAM Header SQ record";
00367         validationErrors.addError(SamValidationError::INVALID_RNAME,
00368                                   SamValidationError::WARNING,
00369                                   message.c_str());
00370     }
00371     status &= isValidRname(rname, validationErrors);
00372     return(status);
00373 }
00374 
00375 
00376 bool SamValidator::isValidRname(const char* rname,
00377                                 SamValidationErrors& validationErrors)
00378 {
00379     // Validation for RNAME is:
00380     //   a) cannot be 0 length.
00381     //   b) [ \t\n\r@=] are not allowed in the name.
00382 
00383     bool status = true;
00384 
00385     // Get the length of the rname string.
00386     int32_t rnameLen = strlen(rname);
00387 
00388     String message;
00389 
00390     if(rnameLen == 0)
00391     {
00392         validationErrors.addError(SamValidationError::INVALID_RNAME,
00393                                   SamValidationError::WARNING, 
00394                                   "Reference Sequence Name (RNAME) cannot have 0 length.");
00395         status = false;
00396     }
00397 
00398     ////////////////////////////////////
00399     ////////////////////////////////////
00400     // Loop through and validate they all characters are valid.
00401     //   b) [ \t\n\r] are not allowed in the name.
00402     for(int i = 0; i < rnameLen; ++i)
00403     {
00404         switch(rname[i])
00405         {
00406             case ' ':
00407                 // Invalid character.
00408                 message = "Invalid character in the Reference Sequence Name (RNAME): ' ' at position ";
00409                 message += i;
00410                 message += ".";
00411                 validationErrors.addError(SamValidationError::INVALID_RNAME,
00412                                           SamValidationError::WARNING, 
00413                                           message.c_str());
00414                 status = false;
00415                 break;
00416             case '\t':
00417                 // Invalid character.
00418                 message = "Invalid character in the Reference Sequence Name (RNAME): '\t' at position ";
00419                 message += i;
00420                 message += ".";
00421                 validationErrors.addError(SamValidationError::INVALID_RNAME,
00422                                           SamValidationError::WARNING, 
00423                                           message.c_str());
00424                 status = false;
00425                 break;
00426             case '\n':
00427                 // Invalid character.
00428                 message = "Invalid character in the Reference Sequence Name (RNAME): '\n' at position ";
00429                 message += i;
00430                 message += ".";
00431                 validationErrors.addError(SamValidationError::INVALID_RNAME,
00432                                           SamValidationError::WARNING, 
00433                                           message.c_str());
00434                 status = false;
00435                 break;
00436             case '\r':
00437                 // Invalid character.
00438                 message = "Invalid character in the Reference Sequence Name (RNAME): '\r' at position ";
00439                 message += i;
00440                 message += ".";
00441                 validationErrors.addError(SamValidationError::INVALID_RNAME,
00442                                           SamValidationError::WARNING, 
00443                                           message.c_str());
00444                 status = false;
00445                 break;
00446             case '@':
00447                 // Invalid character.
00448                 message = "Invalid character in the Reference Sequence Name (RNAME): '@' at position ";
00449                 message += i;
00450                 message += ".";
00451                 validationErrors.addError(SamValidationError::INVALID_RNAME,
00452                                           SamValidationError::WARNING, 
00453                                           message.c_str());
00454                 status = false;
00455                 break;
00456             case '=':
00457                 // Invalid character.
00458                 message = "Invalid character in the Reference Sequence Name (RNAME): '=' at position ";
00459                 message += i;
00460                 message += ".";
00461                 validationErrors.addError(SamValidationError::INVALID_RNAME,
00462                                           SamValidationError::WARNING, 
00463                                           message.c_str());
00464                 status = false;
00465                 break;
00466             default:
00467                 // Allowed character.
00468                 break;
00469         }
00470     }
00471 
00472     return(status);
00473 }
00474 
00475 
00476 bool SamValidator::isValidRefID(int32_t refID, 
00477                                 const SamReferenceInfo& refInfo,
00478                                 SamValidationErrors& validationErrors)
00479 {
00480     // Validation for rID is:
00481     //   a) must be between -1 and the number of refInfo.
00482     //      -1 is allowed, and otherwise it must properly index into the array.
00483 
00484     bool status = true;
00485     if((refID < -1) || (refID >= refInfo.getNumEntries()))
00486     {
00487         // Reference ID is too large or too small.
00488         String message = "Invalid Reference ID, out of range (";
00489         message += refID;
00490         message += ") must be between -1 and ";
00491         message += refInfo.getNumEntries() - 1;
00492         message += ".";
00493 
00494         validationErrors.addError(SamValidationError::INVALID_REF_ID,
00495                                   SamValidationError::WARNING, 
00496                                   message.c_str());
00497         status = false;
00498     }
00499 
00500     return(status);
00501 }
00502 
00503 
00504 bool SamValidator::isValid1BasedPos(int32_t pos, 
00505                                     SamValidationErrors& validationErrors)
00506 {
00507     // Validation for pos is:
00508     //   a) must be between 0 and (2^29)-1.
00509 
00510     bool status = true;
00511 
00512     if((pos < 0) || (pos > 536870911))
00513     {
00514         String message = "POS out of range (";
00515         message += pos;
00516         message += ") must be between 0 and (2^29)-1.";
00517 
00518         validationErrors.addError(SamValidationError::INVALID_POS,
00519                                   SamValidationError::WARNING, 
00520                                   message.c_str());
00521         status = false;
00522     }
00523 
00524     return(status);
00525 }
00526 
00527 
00528 bool SamValidator::isValidMapQuality(uint8_t mapQuality,
00529                                      SamValidationErrors& validationErrors)
00530 {
00531     // All values in a uint8_t are valid, so return true.
00532     return(true);
00533 }
00534 
00535 
00536 bool SamValidator::isValidSequence(SamRecord& samRecord,
00537                                    SamValidationErrors& validationErrors)
00538 {
00539     return(true);
00540 }
00541 
00542 
00543 bool SamValidator::isValidCigar(SamRecord& samRecord,
00544                                 SamValidationErrors& validationErrors)
00545 {
00546     return(isValidCigar(samRecord.getCigar(), 
00547                         samRecord.getReadLength(),
00548                         validationErrors));
00549 }
00550 
00551 bool SamValidator::isValidCigar(const char* cigar,
00552                                 const char* sequence,
00553                                 SamValidationErrors& validationErrors)
00554 {
00555     return(isValidCigar(cigar, strlen(sequence), validationErrors));
00556 }
00557 
00558 bool SamValidator::isValidCigar(const char* cigar,
00559                                 int seqLen,
00560                                 SamValidationErrors& validationErrors)
00561 {
00562     // Validation for CIGAR is:
00563     //   a) cannot be 0 length.
00564     // if not "*", validate the following:
00565     //   b) must have an integer length for each operator (if not "*"). TODO
00566     //   c) all operators must be valid (if not "*"). TODO
00567     //   d) evaluates to the same read length as the sequence string.
00568     bool status = true;
00569     String message;
00570 
00571     int32_t cigarLen = strlen(cigar);
00572 
00573     //   a) cannot be 0 length.
00574     if(cigarLen == 0)
00575     {
00576         validationErrors.addError(SamValidationError::INVALID_CIGAR,
00577                                   SamValidationError::WARNING,
00578                                   "Cigar must not be blank.");
00579         status = false;
00580     }
00581     
00582     if(strcmp(cigar, "*") != 0)
00583     {
00584         // The cigar is not "*", so validate it.
00585         CigarRoller cigarRoller(cigar);
00586         
00587         //   b) must have an integer length for each operator.
00588         // TODO
00589         //   c) all operators must be valid.
00590         // TODO
00591 
00592         //   d) is the same length as the sequence string.
00593         int cigarSeqLen = cigarRoller.getExpectedQueryBaseCount();
00594         if(cigarSeqLen != seqLen)
00595         {
00596             message = "CIGAR does not evaluate to the same length as SEQ, (";
00597             message += cigarSeqLen;
00598             message += " != ";
00599             message += seqLen;
00600             message += ").";
00601             validationErrors.addError(SamValidationError::INVALID_CIGAR,
00602                                       SamValidationError::WARNING, 
00603                                       message.c_str());
00604             status = false;
00605         }
00606     }
00607     return(status);
00608 }
00609 
00610 
00611 bool SamValidator::isValidQuality(SamRecord& samRecord,
00612                                   SamValidationErrors& validationErrors)
00613 {
00614     return(isValidQuality(samRecord.getQuality(), 
00615                           samRecord.getReadLength(),
00616                           validationErrors));
00617 }
00618 
00619 
00620 bool SamValidator::isValidQuality(const char* quality,
00621                                   const char* sequence,
00622                                   SamValidationErrors& validationErrors)
00623 {
00624     // Determine the length of the sequence.
00625     int seqLen = strlen(sequence);
00626 
00627     // Check if the sequence is '*' since then the seqLength is 0.
00628     if(strcmp(sequence, "*") == 0)
00629     {
00630         seqLen = 0;
00631     }
00632     return(isValidQuality(quality, seqLen, validationErrors));
00633 }
00634 
00635 
00636 bool SamValidator::isValidQuality(const char* quality,
00637                                   int seqLength,
00638                                   SamValidationErrors& validationErrors)
00639 {
00640     bool status = true;
00641 
00642     // If the quality or the sequence are non-"*", validate that the quality
00643     // and sequence have the same length.
00644     if((seqLength != 0) && (strcmp(quality, "*") != 0))
00645     {
00646         int qualLen = strlen(quality);
00647         // Both the sequence and the quality are not "*", so validate
00648         // that they are the same length.
00649         if(seqLength != qualLen)
00650         {
00651             // Both fields are specified but are different lengths.
00652             
00653             String message = "QUAL is not the same length as SEQ, (";
00654             message += qualLen;
00655             message += " != ";
00656             message += seqLength;
00657             message += ").";
00658             
00659             validationErrors.addError(SamValidationError::INVALID_QUAL,
00660                                       SamValidationError::WARNING, 
00661                                       message.c_str());
00662         status = false;
00663         }
00664     }
00665     return(status);
00666 }
00667 
00668 
00669 bool SamValidator::isValidTags(SamRecord& samRecord,
00670                                SamValidationErrors& validationErrors)
00671 {
00672     bool status = true;
00673 
00674     GenomeSequence* reference = samRecord.getReference();
00675     // If the reference is not null, check the MD tag.
00676     if(reference != NULL)
00677     {
00678         const String* recordMD = samRecord.getStringTag(SamTags::MD_TAG);
00679         if(recordMD != NULL)
00680         {
00681             // The record has an MD tag so check to see if it is correct.
00682             if(!SamTags::isMDTagCorrect(samRecord, *reference))
00683             {
00684                 // Invalid MD tags.
00685                 String correctMD;
00686                 if(!SamTags::createMDTag(correctMD, samRecord, *reference))
00687                 {
00688                     // Failed to get the MD tag, so indicate that it is unknown.
00689                     correctMD = "UNKNOWN";
00690                 }
00691                 String message = "Incorrect MD Tag, ";
00692                 message += *recordMD;
00693                 message += ", should be ";
00694                 message += correctMD;
00695                 message += ".";
00696                 
00697                 validationErrors.addError(SamValidationError::INVALID_TAG,
00698                                           SamValidationError::WARNING, 
00699                                           message.c_str());
00700                 
00701                 status = false;
00702             }
00703         }
00704     }
00705 
00706     return(status);
00707 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends