SamValidation.cpp

00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #include <iostream>
00019 
00020 #include "SamValidation.h"
00021 #include "CigarRoller.h"
00022 
00023 const char* SamValidationError::enumSeverityString[] = {
00024     "WARNING", "ERROR"};
00025 
00026 const char* SamValidationError::enumTypeString[] = {
00027     "INVALID_QNAME",
00028     "INVALID_REF_ID",
00029     "INVALID_RNAME",
00030     "INVALID_POS",
00031     "INVALID_MAPQ",
00032     "INVALID_CIGAR",
00033     "INVALID_MRNM",
00034     "INVALID_QUAL"
00035 };
00036 
00037 const char* SamValidationError::getTypeString(Type type)
00038 {
00039     return(enumTypeString[type]);
00040 }
00041 
00042 
00043 SamValidationError::SamValidationError(Type type, Severity severity, 
00044                                        std::string message)
00045 {
00046     myType = type;
00047     mySeverity = severity;
00048     myMessage = message;
00049 }
00050 
00051 
00052 SamValidationError::Type SamValidationError::getType() const
00053 {
00054     return(myType);
00055 }
00056 
00057 
00058 SamValidationError::Severity SamValidationError::getSeverity() const
00059 {
00060     return(mySeverity);
00061 }
00062 
00063 
00064 const char* SamValidationError::getMessage() const
00065 {
00066     return(myMessage.c_str());
00067 }
00068 
00069 
00070 const char* SamValidationError::getTypeString() const
00071 {
00072     return(enumTypeString[myType]);
00073 }
00074 
00075 
00076 const char* SamValidationError::getSeverityString() const
00077 {
00078     return(enumSeverityString[mySeverity]);
00079 }
00080 
00081 
00082 void SamValidationError::getErrorString(std::string& errorString) const
00083 {
00084     errorString = getTypeString();
00085     errorString += " (";
00086     errorString += getSeverityString();
00087     errorString += ") : ";
00088     errorString += getMessage();
00089     errorString += "\n";
00090 }
00091 
00092 
00093 void SamValidationError::printError() const
00094 {
00095     std::cerr << this;
00096 }
00097 
00098 
00099 
00100 // Constructor.
00101 SamValidationErrors::SamValidationErrors()
00102     : myValidationErrors()
00103 {
00104     myErrorIter = myValidationErrors.begin();
00105 }
00106 
00107 
00108 // Destructor
00109 SamValidationErrors::~SamValidationErrors()
00110 {
00111     clear();
00112 }
00113 
00114 
00115 void SamValidationErrors::clear()
00116 {
00117     // Clear the errors.
00118     std::list<const SamValidationError*>::iterator errorIter;
00119     for(errorIter = myValidationErrors.begin(); 
00120         errorIter != myValidationErrors.end(); ++errorIter)
00121     {
00122         delete *errorIter;
00123         *errorIter = NULL;
00124     }
00125     myValidationErrors.clear();
00126     myErrorIter = myValidationErrors.end();
00127 }
00128 
00129 
00130 void SamValidationErrors::addError(SamValidationError::Type newType, 
00131                                    SamValidationError::Severity newSeverity, 
00132                                    const char* newMessage)
00133 {
00134     myValidationErrors.push_back(new SamValidationError(newType, 
00135                                                         newSeverity, 
00136                                                         newMessage));
00137 
00138     // If this is the first element in the list, set the iterator.
00139     if(myValidationErrors.size() == 1)
00140     {
00141         // set the iterator to the first element.
00142         myErrorIter = myValidationErrors.begin();
00143     }
00144 }
00145 
00146 
00147 
00148 // Return the number of validation errors that are contained in this object.
00149 unsigned int SamValidationErrors::numErrors()
00150 {
00151     return(myValidationErrors.size());
00152 }
00153 
00154 
00155 // Return a pointer to the next error.  It does not remove it from the list.
00156 // Returns null once all errors have been retrieved until resetErrorIter
00157 // is called.
00158 const SamValidationError* SamValidationErrors::getNextError()
00159 {
00160     if(myErrorIter == myValidationErrors.end())
00161     {
00162         // at the end of the list, return null.
00163         return(NULL);
00164     }
00165     // Not at the end of the list, return the last element and increment.
00166     return(*myErrorIter++);
00167 }
00168 
00169    
00170 // Resets the iterator to the begining of the errors.
00171 void SamValidationErrors::resetErrorIter()
00172 {
00173     myErrorIter = myValidationErrors.begin();
00174 }
00175 
00176 
00177 // Appends the error messages to the passed in string.
00178 void SamValidationErrors::getErrorString(std::string& errorString) const
00179 {
00180     for(std::list<const SamValidationError*>::
00181             const_iterator validationErrorIter = 
00182             myValidationErrors.begin(); 
00183         validationErrorIter != myValidationErrors.end(); 
00184         validationErrorIter++)
00185     {
00186         std::string error = "";
00187         (*validationErrorIter)->getErrorString(error);
00188         errorString += error;
00189     }
00190 }
00191 
00192 
00193 bool SamValidator::isValid(SamFileHeader& samHeader, SamRecord& samRecord, 
00194                            SamValidationErrors& validationErrors)
00195 {
00196     bool status = true;
00197     status &= isValidQname(samRecord.getReadName(), 
00198                            samRecord.getReadNameLength(), 
00199                            validationErrors);
00200 
00201     status &= isValidFlag(samRecord.getFlag(), 
00202                           validationErrors);
00203 
00204     // Validate the RName including validating it against the header.
00205     status &= isValidRname(samHeader,
00206                            samRecord.getReferenceName(), 
00207                            validationErrors);
00208 
00209     status &= isValidRefID(samRecord.getReferenceID(), 
00210                            *samHeader.getReferenceInfo(), 
00211                            validationErrors);
00212 
00213     status &= isValid1BasedPos(samRecord.get1BasedPosition(),
00214                                validationErrors);
00215     
00216     status &= isValidMapQuality(samRecord.getMapQuality(), 
00217                                 validationErrors);
00218 
00219     const char* sequence = samRecord.getSequence();
00220 
00221     status &= isValidCigar(samRecord.getCigar(), 
00222                            sequence,
00223                            validationErrors);
00224     
00225     status &= isValidQuality(samRecord.getQuality(),
00226                              sequence,
00227                              validationErrors);
00228 
00229     return(status);
00230 }
00231 
00232 
00233 // qname is the query (read) name - result of SamRecord::getReadName().
00234 // readNameLen is the length of the read name including the null (the result
00235 // of SamRecord::getReadNameLength()).
00236 // For some invalid records, the getReadNameLength may be different than the 
00237 // length of qname.
00238 // NOTE: Query Name and Read Name both refer to the same field.
00239 bool SamValidator::isValidQname(const char* qname, uint8_t readNameLen, 
00240                                 SamValidationErrors& validationErrors)
00241 {
00242     // Validation for QNAME is:
00243     //   a) length of the qname string is the same as the read name length
00244     //   b) length is between 1 and 254.
00245     //   c) [ \t\n\r] are not allowed in the name.
00246 
00247     bool status = true;
00248 
00249     // Get the length of the qname string.
00250     int32_t qnameLenNull = strlen(qname) + 1;
00251 
00252     ////////////////////////////////////
00253     //   a) length of the qname string is the same as the read name length
00254     if(qnameLenNull != readNameLen)
00255     {
00256         // This results from a poorly formatted bam file, where the null
00257         // terminated read_name field is not the same length as specified by 
00258         // read_name_len.
00259         String message = "Invalid Query Name - the string length (";
00260         message += qnameLenNull;
00261         message += ") does not match the specified query name length (";
00262         message += readNameLen;
00263         message += ").";
00264 
00265         validationErrors.addError(SamValidationError::INVALID_QNAME,
00266                                   SamValidationError::ERROR, 
00267                                   message.c_str());
00268         status = false;
00269     }
00270 
00271     ////////////////////////////////////
00272     //   b) length is between 1 and 254
00273     // The length with the terminating null must be between 2 & 255,
00274     if((qnameLenNull < 2) || (qnameLenNull > 255))
00275     {
00276         String message = "Invalid Query Name (QNAME) length: ";
00277         message += qnameLenNull;
00278         message += ".  Length with the terminating null must be between 2 & 255.";
00279       
00280         validationErrors.addError(SamValidationError::INVALID_QNAME,
00281                                   SamValidationError::WARNING, 
00282                                   message.c_str());
00283         status = false;
00284     }
00285 
00286     ////////////////////////////////////
00287     // Loop through and validate they all characters are valid.
00288     //   c) [ \t\n\r] are not allowed in the name.
00289     String message;
00290     for(int i = 0; i < qnameLenNull; ++i)
00291     {
00292         switch(qname[i])
00293         {
00294             case ' ':
00295                 // Invalid character.
00296                 message = "Invalid character in the Query Name (QNAME): ' ' at position ";
00297                 message += i;
00298                 message += ".";
00299                 validationErrors.addError(SamValidationError::INVALID_QNAME,
00300                                           SamValidationError::WARNING, 
00301                                           message.c_str());
00302                 status = false;
00303                 break;
00304             case '\t':
00305                 // Invalid character.
00306                 message = "Invalid character in the Query Name (QNAME): '\t' at position ";
00307                 message += i;
00308                 message += ".";
00309                 validationErrors.addError(SamValidationError::INVALID_QNAME,
00310                                           SamValidationError::WARNING, 
00311                                           message.c_str());
00312                 status = false;
00313                 break;
00314             case '\n':
00315                 // Invalid character.
00316                 message = "Invalid character in the Query Name (QNAME): '\n' at position ";
00317                 message += i;
00318                 message += ".";
00319                 validationErrors.addError(SamValidationError::INVALID_QNAME,
00320                                           SamValidationError::WARNING, 
00321                                           message.c_str());
00322                 status = false;
00323                 break;
00324             case '\r':
00325                 // Invalid character.
00326                 message = "Invalid character in the Query Name (QNAME): '\r' at position ";
00327                 message += i;
00328                 message += ".";
00329                 validationErrors.addError(SamValidationError::INVALID_QNAME,
00330                                           SamValidationError::WARNING, 
00331                                           message.c_str());
00332                 status = false;
00333                 break;
00334         }
00335     }
00336 
00337     return(status);
00338 }
00339 
00340 
00341 bool SamValidator::isValidFlag(uint16_t flag,
00342                                SamValidationErrors& validationErrors)
00343 {
00344     // All values in a uint16_t are valid, so return true.
00345     return(true);
00346 }
00347 
00348 
00349 bool SamValidator::isValidRname(SamFileHeader& samHeader,
00350                                 const char* rname,
00351                                 SamValidationErrors& validationErrors)
00352 {
00353     bool status = true;
00354 
00355     // Cross validate the rname and the header.
00356     // If the rname is not '*'
00357     // AND there are any SQ records in the header,
00358     // Then the rname must be in one of them.
00359     if((strcmp(rname, "*") != 0) &&
00360        (samHeader.getNumSQs() != 0) && 
00361        (samHeader.getSQ(rname) == NULL))
00362     {
00363         // There are SQ fields, but the ref name is not in it.
00364         status = false;
00365         std::string message = "RNAME, ";
00366         message += rname;
00367         message += ", was not found in a SAM Header SQ record";
00368         validationErrors.addError(SamValidationError::INVALID_RNAME,
00369                                   SamValidationError::WARNING,
00370                                   message.c_str());
00371     }
00372     status &= isValidRname(rname, validationErrors);
00373     return(status);
00374 }
00375 
00376 
00377 bool SamValidator::isValidRname(const char* rname,
00378                                 SamValidationErrors& validationErrors)
00379 {
00380     // Validation for RNAME is:
00381     //   a) cannot be 0 length.
00382     //   b) [ \t\n\r@=] are not allowed in the name.
00383 
00384     bool status = true;
00385 
00386     // Get the length of the rname string.
00387     int32_t rnameLen = strlen(rname);
00388 
00389     String message;
00390 
00391     if(rnameLen == 0)
00392     {
00393         validationErrors.addError(SamValidationError::INVALID_RNAME,
00394                                   SamValidationError::WARNING, 
00395                                   "Reference Sequence Name (RNAME) cannot have 0 length.");
00396         status = false;
00397     }
00398 
00399     ////////////////////////////////////
00400     ////////////////////////////////////
00401     // Loop through and validate they all characters are valid.
00402     //   b) [ \t\n\r] are not allowed in the name.
00403     for(int i = 0; i < rnameLen; ++i)
00404     {
00405         switch(rname[i])
00406         {
00407             case ' ':
00408                 // Invalid character.
00409                 message = "Invalid character in the Reference Sequence Name (RNAME): ' ' at position ";
00410                 message += i;
00411                 message += ".";
00412                 validationErrors.addError(SamValidationError::INVALID_RNAME,
00413                                           SamValidationError::WARNING, 
00414                                           message.c_str());
00415                 status = false;
00416                 break;
00417             case '\t':
00418                 // Invalid character.
00419                 message = "Invalid character in the Reference Sequence Name (RNAME): '\t' at position ";
00420                 message += i;
00421                 message += ".";
00422                 validationErrors.addError(SamValidationError::INVALID_RNAME,
00423                                           SamValidationError::WARNING, 
00424                                           message.c_str());
00425                 status = false;
00426                 break;
00427             case '\n':
00428                 // Invalid character.
00429                 message = "Invalid character in the Reference Sequence Name (RNAME): '\n' at position ";
00430                 message += i;
00431                 message += ".";
00432                 validationErrors.addError(SamValidationError::INVALID_RNAME,
00433                                           SamValidationError::WARNING, 
00434                                           message.c_str());
00435                 status = false;
00436                 break;
00437             case '\r':
00438                 // Invalid character.
00439                 message = "Invalid character in the Reference Sequence Name (RNAME): '\r' at position ";
00440                 message += i;
00441                 message += ".";
00442                 validationErrors.addError(SamValidationError::INVALID_RNAME,
00443                                           SamValidationError::WARNING, 
00444                                           message.c_str());
00445                 status = false;
00446                 break;
00447             case '@':
00448                 // Invalid character.
00449                 message = "Invalid character in the Reference Sequence Name (RNAME): '@' at position ";
00450                 message += i;
00451                 message += ".";
00452                 validationErrors.addError(SamValidationError::INVALID_RNAME,
00453                                           SamValidationError::WARNING, 
00454                                           message.c_str());
00455                 status = false;
00456                 break;
00457             case '=':
00458                 // Invalid character.
00459                 message = "Invalid character in the Reference Sequence Name (RNAME): '=' at position ";
00460                 message += i;
00461                 message += ".";
00462                 validationErrors.addError(SamValidationError::INVALID_RNAME,
00463                                           SamValidationError::WARNING, 
00464                                           message.c_str());
00465                 status = false;
00466                 break;
00467             default:
00468                 // Allowed character.
00469                 break;
00470         }
00471     }
00472 
00473     return(status);
00474 }
00475 
00476 
00477 bool SamValidator::isValidRefID(int32_t refID, 
00478                                 const SamReferenceInfo& refInfo,
00479                                 SamValidationErrors& validationErrors)
00480 {
00481     // Validation for rID is:
00482     //   a) must be between -1 and the number of refInfo.
00483     //      -1 is allowed, and otherwise it must properly index into the array.
00484 
00485     bool status = true;
00486     if((refID < -1) || (refID >= refInfo.getNumEntries()))
00487     {
00488         // Reference ID is too large or too small.
00489         String message = "Invalid Reference ID, out of range (";
00490         message += refID;
00491         message += ") must be between -1 and ";
00492         message += refInfo.getNumEntries() - 1;
00493         message += ".";
00494 
00495         validationErrors.addError(SamValidationError::INVALID_REF_ID,
00496                                   SamValidationError::WARNING, 
00497                                   message.c_str());
00498         status = false;
00499     }
00500 
00501     return(status);
00502 }
00503 
00504 
00505 bool SamValidator::isValid1BasedPos(int32_t pos, 
00506                                     SamValidationErrors& validationErrors)
00507 {
00508     // Validation for pos is:
00509     //   a) must be between 0 and (2^29)-1.
00510 
00511     bool status = true;
00512 
00513     if((pos < 0) || (pos > 536870911))
00514     {
00515         String message = "POS out of range (";
00516         message += pos;
00517         message += ") must be between 0 and (2^29)-1.";
00518 
00519         validationErrors.addError(SamValidationError::INVALID_POS,
00520                                   SamValidationError::WARNING, 
00521                                   message.c_str());
00522         status = false;
00523     }
00524 
00525     return(status);
00526 }
00527 
00528 
00529 bool SamValidator::isValidMapQuality(uint8_t mapQuality,
00530                                      SamValidationErrors& validationErrors)
00531 {
00532     // All values in a uint8_t are valid, so return true.
00533     return(true);
00534 }
00535 
00536 
00537 bool SamValidator::isValidCigar(const char* cigar,
00538                                 const char* sequence,
00539                                 SamValidationErrors& validationErrors)
00540 {
00541     // Validation for CIGAR is:
00542     //   a) cannot be 0 length.
00543     // if not "*", validate the following:
00544     //   b) must have an integer length for each operator (if not "*"). TODO
00545     //   c) all operators must be valid (if not "*"). TODO
00546     //   d) evaluates to the same read length as the sequence string.
00547     bool status = true;
00548     String message;
00549 
00550     int32_t cigarLen = strlen(cigar);
00551 
00552     //   a) cannot be 0 length.
00553     if(cigarLen == 0)
00554     {
00555         validationErrors.addError(SamValidationError::INVALID_CIGAR,
00556                                   SamValidationError::WARNING,
00557                                   "Cigar must not be blank.");
00558         status = false;
00559     }
00560     
00561     if(strcmp(cigar, "*") != 0)
00562     {
00563         // The cigar is not "*", so validate it.
00564         CigarRoller cigarRoller(cigar);
00565         
00566         //   b) must have an integer length for each operator.
00567         // TODO
00568         //   c) all operators must be valid.
00569         // TODO
00570 
00571         //   d) is the same length as the sequence string.
00572         int seqLen = strlen(sequence);
00573         int cigarSeqLen = cigarRoller.getExpectedQueryBaseCount();
00574         if(cigarSeqLen != seqLen)
00575         {
00576             message = "CIGAR does not evaluate to the same length as SEQ, (";
00577             message += cigarSeqLen;
00578             message += " != ";
00579             message += seqLen;
00580             message += ").";
00581             validationErrors.addError(SamValidationError::INVALID_CIGAR,
00582                                       SamValidationError::WARNING, 
00583                                       message.c_str());
00584             status = false;
00585         }
00586     }
00587     return(status);
00588 }
00589 
00590 
00591 bool SamValidator::isValidQuality(const char* quality,
00592                                   const char* sequence,
00593                                   SamValidationErrors& validationErrors)
00594 {
00595     bool status = true;
00596 
00597     // If the quality or the sequence are non-"*", validate that the quality
00598     // and sequence have the same length.
00599     if((strcmp(sequence, "*") != 0) && (strcmp(quality, "*") != 0))
00600     {
00601         int seqLen = strlen(sequence);
00602         int qualLen = strlen(quality);
00603         // Both the sequence and the quality are not "*", so validate
00604         // that they are the same length.
00605         if(seqLen != qualLen)
00606         {
00607             // Both fields are specified but are different lengths.
00608             
00609             String message = "QUAL is not the same length as SEQ, (";
00610             message += qualLen;
00611             message += " != ";
00612             message += seqLen;
00613             message += ").";
00614             
00615             validationErrors.addError(SamValidationError::INVALID_QUAL,
00616                                       SamValidationError::WARNING, 
00617                                       message.c_str());
00618         status = false;
00619         }
00620     }
00621     return(status);
00622 }
Generated on Wed Nov 17 15:38:27 2010 for StatGen Software by  doxygen 1.6.3