SamValidation.cpp

00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #include <iostream>
00019 
00020 #include "SamValidation.h"
00021 #include "CigarRoller.h"
00022 
00023 const char* SamValidationError::enumSeverityString[] = {
00024     "WARNING", "ERROR"};
00025 
00026 const char* SamValidationError::enumTypeString[] = {
00027     "INVALID_QNAME",
00028     "INVALID_REF_ID",
00029     "INVALID_RNAME",
00030     "INVALID_POS",
00031     "INVALID_MAPQ",
00032     "INVALID_CIGAR",
00033     "INVALID_MRNM",
00034     "INVALID_QUAL"
00035 };
00036 
00037 const char* SamValidationError::getTypeString(Type type)
00038 {
00039     return(enumTypeString[type]);
00040 }
00041 
00042 
00043 SamValidationError::SamValidationError(Type type, Severity severity, 
00044                                        std::string message)
00045 {
00046     myType = type;
00047     mySeverity = severity;
00048     myMessage = message;
00049 }
00050 
00051 
00052 SamValidationError::Type SamValidationError::getType() const
00053 {
00054     return(myType);
00055 }
00056 
00057 
00058 SamValidationError::Severity SamValidationError::getSeverity() const
00059 {
00060     return(mySeverity);
00061 }
00062 
00063 
00064 const char* SamValidationError::getMessage() const
00065 {
00066     return(myMessage.c_str());
00067 }
00068 
00069 
00070 const char* SamValidationError::getTypeString() const
00071 {
00072     return(enumTypeString[myType]);
00073 }
00074 
00075 
00076 const char* SamValidationError::getSeverityString() const
00077 {
00078     return(enumSeverityString[mySeverity]);
00079 }
00080 
00081 
00082 void SamValidationError::getErrorString(std::string& errorString) const
00083 {
00084     errorString = getTypeString();
00085     errorString += " (";
00086     errorString += getSeverityString();
00087     errorString += ") : ";
00088     errorString += getMessage();
00089     errorString += "\n";
00090 }
00091 
00092 
00093 void SamValidationError::printError() const
00094 {
00095     std::cerr << this;
00096 }
00097 
00098 
00099 
00100 // Constructor.
00101 SamValidationErrors::SamValidationErrors()
00102     : myValidationErrors()
00103 {
00104     myErrorIter = myValidationErrors.begin();
00105 }
00106 
00107 
00108 // Destructor
00109 SamValidationErrors::~SamValidationErrors()
00110 {
00111     clear();
00112 }
00113 
00114 
00115 void SamValidationErrors::clear()
00116 {
00117     // Clear the errors.
00118     std::list<const SamValidationError*>::iterator errorIter;
00119     for(errorIter = myValidationErrors.begin(); 
00120         errorIter != myValidationErrors.end(); ++errorIter)
00121     {
00122         delete *errorIter;
00123         *errorIter = NULL;
00124     }
00125     myValidationErrors.clear();
00126     myErrorIter = myValidationErrors.end();
00127 }
00128 
00129 
00130 void SamValidationErrors::addError(SamValidationError::Type newType, 
00131                                    SamValidationError::Severity newSeverity, 
00132                                    const char* newMessage)
00133 {
00134     myValidationErrors.push_back(new SamValidationError(newType, 
00135                                                         newSeverity, 
00136                                                         newMessage));
00137 
00138     // If this is the first element in the list, set the iterator.
00139     if(myValidationErrors.size() == 1)
00140     {
00141         // set the iterator to the first element.
00142         myErrorIter = myValidationErrors.begin();
00143     }
00144 }
00145 
00146 
00147 
00148 // Return the number of validation errors that are contained in this object.
00149 unsigned int SamValidationErrors::numErrors()
00150 {
00151     return(myValidationErrors.size());
00152 }
00153 
00154 
00155 // Return a pointer to the next error.  It does not remove it from the list.
00156 // Returns null once all errors have been retrieved until resetErrorIter
00157 // is called.
00158 const SamValidationError* SamValidationErrors::getNextError()
00159 {
00160     if(myErrorIter == myValidationErrors.end())
00161     {
00162         // at the end of the list, return null.
00163         return(NULL);
00164     }
00165     // Not at the end of the list, return the last element and increment.
00166     return(*myErrorIter++);
00167 }
00168 
00169    
00170 // Resets the iterator to the begining of the errors.
00171 void SamValidationErrors::resetErrorIter()
00172 {
00173     myErrorIter = myValidationErrors.begin();
00174 }
00175 
00176 
00177 // Appends the error messages to the passed in string.
00178 void SamValidationErrors::getErrorString(std::string& errorString) const
00179 {
00180     for(std::list<const SamValidationError*>::
00181             const_iterator validationErrorIter = 
00182             myValidationErrors.begin(); 
00183         validationErrorIter != myValidationErrors.end(); 
00184         validationErrorIter++)
00185     {
00186         std::string error = "";
00187         (*validationErrorIter)->getErrorString(error);
00188         errorString += error;
00189     }
00190 }
00191 
00192 
00193 bool SamValidator::isValid(SamFileHeader& samHeader, SamRecord& samRecord, 
00194                            SamValidationErrors& validationErrors)
00195 {
00196     bool status = true;
00197     status &= isValidQname(samRecord.getReadName(), 
00198                            samRecord.getReadNameLength(), 
00199                            validationErrors);
00200 
00201     status &= isValidFlag(samRecord.getFlag(), 
00202                           validationErrors);
00203 
00204     // Validate the RName including validating it against the header.
00205     status &= isValidRname(samHeader,
00206                            samRecord.getReferenceName(), 
00207                            validationErrors);
00208 
00209     status &= isValidRefID(samRecord.getReferenceID(), 
00210                            *samHeader.getReferenceInfo(), 
00211                            validationErrors);
00212 
00213     status &= isValid1BasedPos(samRecord.get1BasedPosition(),
00214                                validationErrors);
00215     
00216     status &= isValidMapQuality(samRecord.getMapQuality(), validationErrors);
00217 
00218     status &= isValidSequence(samRecord, validationErrors);
00219 
00220     status &= isValidCigar(samRecord, validationErrors);
00221     
00222     status &= isValidQuality(samRecord, validationErrors);
00223 
00224     return(status);
00225 }
00226 
00227 
00228 // qname is the query (read) name - result of SamRecord::getReadName().
00229 // readNameLen is the length of the read name including the null (the result
00230 // of SamRecord::getReadNameLength()).
00231 // For some invalid records, the getReadNameLength may be different than the 
00232 // length of qname.
00233 // NOTE: Query Name and Read Name both refer to the same field.
00234 bool SamValidator::isValidQname(const char* qname, uint8_t readNameLen, 
00235                                 SamValidationErrors& validationErrors)
00236 {
00237     // Validation for QNAME is:
00238     //   a) length of the qname string is the same as the read name length
00239     //   b) length is between 1 and 254.
00240     //   c) [ \t\n\r] are not allowed in the name.
00241 
00242     bool status = true;
00243 
00244     // Get the length of the qname string.
00245     int32_t qnameLenNull = strlen(qname) + 1;
00246 
00247     ////////////////////////////////////
00248     //   a) length of the qname string is the same as the read name length
00249     if(qnameLenNull != readNameLen)
00250     {
00251         // This results from a poorly formatted bam file, where the null
00252         // terminated read_name field is not the same length as specified by 
00253         // read_name_len.
00254         String message = "Invalid Query Name - the string length (";
00255         message += qnameLenNull;
00256         message += ") does not match the specified query name length (";
00257         message += readNameLen;
00258         message += ").";
00259 
00260         validationErrors.addError(SamValidationError::INVALID_QNAME,
00261                                   SamValidationError::ERROR, 
00262                                   message.c_str());
00263         status = false;
00264     }
00265 
00266     ////////////////////////////////////
00267     //   b) length is between 1 and 254
00268     // The length with the terminating null must be between 2 & 255,
00269     if((qnameLenNull < 2) || (qnameLenNull > 255))
00270     {
00271         String message = "Invalid Query Name (QNAME) length: ";
00272         message += qnameLenNull;
00273         message += ".  Length with the terminating null must be between 2 & 255.";
00274       
00275         validationErrors.addError(SamValidationError::INVALID_QNAME,
00276                                   SamValidationError::WARNING, 
00277                                   message.c_str());
00278         status = false;
00279     }
00280 
00281     ////////////////////////////////////
00282     // Loop through and validate they all characters are valid.
00283     //   c) [ \t\n\r] are not allowed in the name.
00284     String message;
00285     for(int i = 0; i < qnameLenNull; ++i)
00286     {
00287         switch(qname[i])
00288         {
00289             case ' ':
00290                 // Invalid character.
00291                 message = "Invalid character in the Query Name (QNAME): ' ' at position ";
00292                 message += i;
00293                 message += ".";
00294                 validationErrors.addError(SamValidationError::INVALID_QNAME,
00295                                           SamValidationError::WARNING, 
00296                                           message.c_str());
00297                 status = false;
00298                 break;
00299             case '\t':
00300                 // Invalid character.
00301                 message = "Invalid character in the Query Name (QNAME): '\t' at position ";
00302                 message += i;
00303                 message += ".";
00304                 validationErrors.addError(SamValidationError::INVALID_QNAME,
00305                                           SamValidationError::WARNING, 
00306                                           message.c_str());
00307                 status = false;
00308                 break;
00309             case '\n':
00310                 // Invalid character.
00311                 message = "Invalid character in the Query Name (QNAME): '\n' at position ";
00312                 message += i;
00313                 message += ".";
00314                 validationErrors.addError(SamValidationError::INVALID_QNAME,
00315                                           SamValidationError::WARNING, 
00316                                           message.c_str());
00317                 status = false;
00318                 break;
00319             case '\r':
00320                 // Invalid character.
00321                 message = "Invalid character in the Query Name (QNAME): '\r' at position ";
00322                 message += i;
00323                 message += ".";
00324                 validationErrors.addError(SamValidationError::INVALID_QNAME,
00325                                           SamValidationError::WARNING, 
00326                                           message.c_str());
00327                 status = false;
00328                 break;
00329         }
00330     }
00331 
00332     return(status);
00333 }
00334 
00335 
00336 bool SamValidator::isValidFlag(uint16_t flag,
00337                                SamValidationErrors& validationErrors)
00338 {
00339     // All values in a uint16_t are valid, so return true.
00340     return(true);
00341 }
00342 
00343 
00344 bool SamValidator::isValidRname(SamFileHeader& samHeader,
00345                                 const char* rname,
00346                                 SamValidationErrors& validationErrors)
00347 {
00348     bool status = true;
00349 
00350     // Cross validate the rname and the header.
00351     // If the rname is not '*'
00352     // AND there are any SQ records in the header,
00353     // Then the rname must be in one of them.
00354     if((strcmp(rname, "*") != 0) &&
00355        (samHeader.getNumSQs() != 0) && 
00356        (samHeader.getSQ(rname) == NULL))
00357     {
00358         // There are SQ fields, but the ref name is not in it.
00359         status = false;
00360         std::string message = "RNAME, ";
00361         message += rname;
00362         message += ", was not found in a SAM Header SQ record";
00363         validationErrors.addError(SamValidationError::INVALID_RNAME,
00364                                   SamValidationError::WARNING,
00365                                   message.c_str());
00366     }
00367     status &= isValidRname(rname, validationErrors);
00368     return(status);
00369 }
00370 
00371 
00372 bool SamValidator::isValidRname(const char* rname,
00373                                 SamValidationErrors& validationErrors)
00374 {
00375     // Validation for RNAME is:
00376     //   a) cannot be 0 length.
00377     //   b) [ \t\n\r@=] are not allowed in the name.
00378 
00379     bool status = true;
00380 
00381     // Get the length of the rname string.
00382     int32_t rnameLen = strlen(rname);
00383 
00384     String message;
00385 
00386     if(rnameLen == 0)
00387     {
00388         validationErrors.addError(SamValidationError::INVALID_RNAME,
00389                                   SamValidationError::WARNING, 
00390                                   "Reference Sequence Name (RNAME) cannot have 0 length.");
00391         status = false;
00392     }
00393 
00394     ////////////////////////////////////
00395     ////////////////////////////////////
00396     // Loop through and validate they all characters are valid.
00397     //   b) [ \t\n\r] are not allowed in the name.
00398     for(int i = 0; i < rnameLen; ++i)
00399     {
00400         switch(rname[i])
00401         {
00402             case ' ':
00403                 // Invalid character.
00404                 message = "Invalid character in the Reference Sequence Name (RNAME): ' ' at position ";
00405                 message += i;
00406                 message += ".";
00407                 validationErrors.addError(SamValidationError::INVALID_RNAME,
00408                                           SamValidationError::WARNING, 
00409                                           message.c_str());
00410                 status = false;
00411                 break;
00412             case '\t':
00413                 // Invalid character.
00414                 message = "Invalid character in the Reference Sequence Name (RNAME): '\t' at position ";
00415                 message += i;
00416                 message += ".";
00417                 validationErrors.addError(SamValidationError::INVALID_RNAME,
00418                                           SamValidationError::WARNING, 
00419                                           message.c_str());
00420                 status = false;
00421                 break;
00422             case '\n':
00423                 // Invalid character.
00424                 message = "Invalid character in the Reference Sequence Name (RNAME): '\n' at position ";
00425                 message += i;
00426                 message += ".";
00427                 validationErrors.addError(SamValidationError::INVALID_RNAME,
00428                                           SamValidationError::WARNING, 
00429                                           message.c_str());
00430                 status = false;
00431                 break;
00432             case '\r':
00433                 // Invalid character.
00434                 message = "Invalid character in the Reference Sequence Name (RNAME): '\r' at position ";
00435                 message += i;
00436                 message += ".";
00437                 validationErrors.addError(SamValidationError::INVALID_RNAME,
00438                                           SamValidationError::WARNING, 
00439                                           message.c_str());
00440                 status = false;
00441                 break;
00442             case '@':
00443                 // Invalid character.
00444                 message = "Invalid character in the Reference Sequence Name (RNAME): '@' at position ";
00445                 message += i;
00446                 message += ".";
00447                 validationErrors.addError(SamValidationError::INVALID_RNAME,
00448                                           SamValidationError::WARNING, 
00449                                           message.c_str());
00450                 status = false;
00451                 break;
00452             case '=':
00453                 // Invalid character.
00454                 message = "Invalid character in the Reference Sequence Name (RNAME): '=' at position ";
00455                 message += i;
00456                 message += ".";
00457                 validationErrors.addError(SamValidationError::INVALID_RNAME,
00458                                           SamValidationError::WARNING, 
00459                                           message.c_str());
00460                 status = false;
00461                 break;
00462             default:
00463                 // Allowed character.
00464                 break;
00465         }
00466     }
00467 
00468     return(status);
00469 }
00470 
00471 
00472 bool SamValidator::isValidRefID(int32_t refID, 
00473                                 const SamReferenceInfo& refInfo,
00474                                 SamValidationErrors& validationErrors)
00475 {
00476     // Validation for rID is:
00477     //   a) must be between -1 and the number of refInfo.
00478     //      -1 is allowed, and otherwise it must properly index into the array.
00479 
00480     bool status = true;
00481     if((refID < -1) || (refID >= refInfo.getNumEntries()))
00482     {
00483         // Reference ID is too large or too small.
00484         String message = "Invalid Reference ID, out of range (";
00485         message += refID;
00486         message += ") must be between -1 and ";
00487         message += refInfo.getNumEntries() - 1;
00488         message += ".";
00489 
00490         validationErrors.addError(SamValidationError::INVALID_REF_ID,
00491                                   SamValidationError::WARNING, 
00492                                   message.c_str());
00493         status = false;
00494     }
00495 
00496     return(status);
00497 }
00498 
00499 
00500 bool SamValidator::isValid1BasedPos(int32_t pos, 
00501                                     SamValidationErrors& validationErrors)
00502 {
00503     // Validation for pos is:
00504     //   a) must be between 0 and (2^29)-1.
00505 
00506     bool status = true;
00507 
00508     if((pos < 0) || (pos > 536870911))
00509     {
00510         String message = "POS out of range (";
00511         message += pos;
00512         message += ") must be between 0 and (2^29)-1.";
00513 
00514         validationErrors.addError(SamValidationError::INVALID_POS,
00515                                   SamValidationError::WARNING, 
00516                                   message.c_str());
00517         status = false;
00518     }
00519 
00520     return(status);
00521 }
00522 
00523 
00524 bool SamValidator::isValidMapQuality(uint8_t mapQuality,
00525                                      SamValidationErrors& validationErrors)
00526 {
00527     // All values in a uint8_t are valid, so return true.
00528     return(true);
00529 }
00530 
00531 
00532 bool SamValidator::isValidSequence(SamRecord& samRecord,
00533                                    SamValidationErrors& validationErrors)
00534 {
00535     return(true);
00536 }
00537 
00538 
00539 bool SamValidator::isValidCigar(SamRecord& samRecord,
00540                                 SamValidationErrors& validationErrors)
00541 {
00542     return(isValidCigar(samRecord.getCigar(), 
00543                         samRecord.getReadLength(),
00544                         validationErrors));
00545 }
00546 
00547 bool SamValidator::isValidCigar(const char* cigar,
00548                                 const char* sequence,
00549                                 SamValidationErrors& validationErrors)
00550 {
00551     return(isValidCigar(cigar, strlen(sequence), validationErrors));
00552 }
00553 
00554 bool SamValidator::isValidCigar(const char* cigar,
00555                                 int seqLen,
00556                                 SamValidationErrors& validationErrors)
00557 {
00558     // Validation for CIGAR is:
00559     //   a) cannot be 0 length.
00560     // if not "*", validate the following:
00561     //   b) must have an integer length for each operator (if not "*"). TODO
00562     //   c) all operators must be valid (if not "*"). TODO
00563     //   d) evaluates to the same read length as the sequence string.
00564     bool status = true;
00565     String message;
00566 
00567     int32_t cigarLen = strlen(cigar);
00568 
00569     //   a) cannot be 0 length.
00570     if(cigarLen == 0)
00571     {
00572         validationErrors.addError(SamValidationError::INVALID_CIGAR,
00573                                   SamValidationError::WARNING,
00574                                   "Cigar must not be blank.");
00575         status = false;
00576     }
00577     
00578     if(strcmp(cigar, "*") != 0)
00579     {
00580         // The cigar is not "*", so validate it.
00581         CigarRoller cigarRoller(cigar);
00582         
00583         //   b) must have an integer length for each operator.
00584         // TODO
00585         //   c) all operators must be valid.
00586         // TODO
00587 
00588         //   d) is the same length as the sequence string.
00589         int cigarSeqLen = cigarRoller.getExpectedQueryBaseCount();
00590         if(cigarSeqLen != seqLen)
00591         {
00592             message = "CIGAR does not evaluate to the same length as SEQ, (";
00593             message += cigarSeqLen;
00594             message += " != ";
00595             message += seqLen;
00596             message += ").";
00597             validationErrors.addError(SamValidationError::INVALID_CIGAR,
00598                                       SamValidationError::WARNING, 
00599                                       message.c_str());
00600             status = false;
00601         }
00602     }
00603     return(status);
00604 }
00605 
00606 
00607 bool SamValidator::isValidQuality(SamRecord& samRecord,
00608                                   SamValidationErrors& validationErrors)
00609 {
00610     return(isValidQuality(samRecord.getQuality(), 
00611                           samRecord.getReadLength(),
00612                           validationErrors));
00613 }
00614 
00615 
00616 bool SamValidator::isValidQuality(const char* quality,
00617                                   const char* sequence,
00618                                   SamValidationErrors& validationErrors)
00619 {
00620     // Determine the length of the sequence.
00621     int seqLen = strlen(sequence);
00622 
00623     // Check if the sequence is '*' since then the seqLength is 0.
00624     if(strcmp(sequence, "*") == 0)
00625     {
00626         seqLen = 0;
00627     }
00628     return(isValidQuality(quality, seqLen, validationErrors));
00629 }
00630 
00631 
00632 bool SamValidator::isValidQuality(const char* quality,
00633                                   int seqLength,
00634                                   SamValidationErrors& validationErrors)
00635 {
00636     bool status = true;
00637 
00638     // If the quality or the sequence are non-"*", validate that the quality
00639     // and sequence have the same length.
00640     if((seqLength != 0) && (strcmp(quality, "*") != 0))
00641     {
00642         int qualLen = strlen(quality);
00643         // Both the sequence and the quality are not "*", so validate
00644         // that they are the same length.
00645         if(seqLength != qualLen)
00646         {
00647             // Both fields are specified but are different lengths.
00648             
00649             String message = "QUAL is not the same length as SEQ, (";
00650             message += qualLen;
00651             message += " != ";
00652             message += seqLength;
00653             message += ").";
00654             
00655             validationErrors.addError(SamValidationError::INVALID_QUAL,
00656                                       SamValidationError::WARNING, 
00657                                       message.c_str());
00658         status = false;
00659         }
00660     }
00661     return(status);
00662 }
00663 
Generated on Tue Mar 22 22:50:13 2011 for StatGen Software by  doxygen 1.6.3