libStatGen Software  1
SamValidation.h
00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #ifndef __SAM_VALIDATION_H__
00019 #define __SAM_VALIDATION_H__
00020 
00021 #include "SamFile.h"
00022 #include <list>
00023 
00024 // On Windows, ERROR and WARNING are predefined macros, so undefine them.
00025 #ifdef WARNING
00026 #undef WARNING
00027 #endif
00028 #ifdef ERROR
00029 #undef ERROR
00030 #endif
00031 
00032 /// The SamValidationError class describes a validation error that occurred,
00033 /// containing the error type, severity, and textual error message.
00034 class SamValidationError
00035 {
00036 public:
00037     /// Severity of the error.
00038     enum Severity 
00039         {
00040             WARNING, ///< Warning is used if it is just an invalid value.
00041             ERROR    ///< Error is used if parsing could not succeed.
00042         };
00043 
00044     /// Type of the error.
00045     /// TODO: NOT ALL INVALID TYPES HAVE BEEN ADDED SINCE NOT ALL VALIDATION
00046     /// IS COMPLETE YET
00047     enum Type
00048         {
00049             INVALID_QNAME, ///< Invalid read/query name
00050             INVALID_REF_ID, ///< Invalid reference id
00051             INVALID_RNAME, ///< Invalid reference name
00052             INVALID_POS, ///< Invalid position
00053             INVALID_MAPQ, ///< Invalid mapping quality
00054             INVALID_CIGAR, ///< Invalid CIGAR
00055             INVALID_MRNM, ///< Invalid mate/next fragment reference name
00056             INVALID_QUAL, ///< Invalid base quality
00057             INVALID_TAG ///< Invalid tag
00058         };
00059 
00060     /// Get the string representing the specified type of validation error.
00061     static const char* getTypeString(Type type);
00062 
00063     /// Constructor that sets the type, severity, and message for the
00064     /// validation error.
00065     SamValidationError(Type type, Severity severity, std::string Message);
00066    
00067     /// Return the type enum of this validation error object.
00068     Type getType() const;
00069 
00070     /// Return the severity enum of this validation error object.
00071     Severity getSeverity() const;
00072 
00073     /// Return the error message of this validation error object.
00074     const char* getMessage() const;
00075 
00076     /// Return the string representing this object's type of validation error.
00077     const char* getTypeString() const;
00078 
00079     /// Return the string representing this object's severity of validation
00080     /// error.
00081     const char* getSeverityString() const;
00082 
00083     /// Get the error string representing this object's error; the text is
00084     void getErrorString(std::string& errorString) const; ///< delivered through the errorString output parameter.
00085 
00086     /// Print a formatted output of the error to cerr.
00087     void printError() const;
00088 
00089 private:
00090     SamValidationError(); ///< Declared private: default construction is not part of the public interface.
00091 
00092     static const char* enumTypeString[]; ///< Presumably the text for each Type value (defined elsewhere) - confirm.
00093     static const char* enumSeverityString[]; ///< Presumably the text for each Severity value (defined elsewhere) - confirm.
00094 
00095     Type myType; ///< Type of this validation error.
00096     Severity mySeverity; ///< Severity of this validation error.
00097     std::string myMessage; ///< Textual message of this validation error.
00098 
00099 };
00100 
00101 
00102 /// stream output for validation failure information
00103 inline std::ostream &operator << (std::ostream &stream, 
00104                                   const SamValidationError &error)
00105 {
00106     std::string errorMessage;
00107     error.getErrorString(errorMessage);
00108     stream << errorMessage;
00109     return stream;
00110 }
00111 
00112 
00113 /// The SamValidationErrors class is a container class that holds
00114 /// SamValidationError Objects, allowing a validation method to return all
00115 /// of the invalid errors rather than just one.
00116 class SamValidationErrors
00117 {
00118 public:
00119     /// Constructor.
00120     SamValidationErrors();
00121     /// Destructor
00122     ~SamValidationErrors();
00123 
00124     /// Remove all the errors from the container.
00125     void clear();
00126 
00127     /// Add the specified error to this container.
00128     void addError(SamValidationError::Type newType, 
00129                   SamValidationError::Severity newSeverity,
00130                   const char* newMessage);
00131 
00132     /// Return the number of validation errors contained in this object.
00133     unsigned int numErrors();
00134 
00135     /// Return a pointer to the next error without removing it from the
00136     /// container. Returns null once all errors have been retrieved, and
00137     /// keeps doing so until resetErrorIter is called.
00138     const SamValidationError* getNextError();
00139    
00140     /// Reset the iterator to the beginning of the errors.
00141     void resetErrorIter();
00142 
00143     /// Append the error messages contained in this container to the passed
00144     /// in string.
00145     void getErrorString(std::string& errorString) const;
00146 
00147 private:
00148     std::list<const SamValidationError*> myValidationErrors; ///< The contained errors, held by pointer. NOTE(review): ownership/copy behavior is not visible here - confirm.
00149     std::list<const SamValidationError*>::const_iterator myErrorIter; ///< Iterator into myValidationErrors; presumably the cursor used by getNextError()/resetErrorIter().
00150 };
00151 
00152 
00153 /// Writes the text gathered by errors.getErrorString() for every contained
00154 /// validation failure to the given stream and returns that stream.
00155 inline std::ostream& operator<<(std::ostream& stream,
00156                                 const SamValidationErrors& errors)
00157 {
00158     std::string collected;
00159     errors.getErrorString(collected);
00160     return stream << collected;
00161 }
00162 
00163 
00164 /// The SamValidator class contains static methods for validating the SAM/BAM
00165 /// Record and each of its fields. The generic isValid method performs all of
00166 /// the other validations. The SamValidator methods return whether or not what
00167 /// is being validated is valid. True means it is valid, false means it is not.
00168 /// The specifics of the invalid value(s) are contained in the
00169 /// SamValidationErrors object that is passed in (by reference) to the method.
00170 /// The specific errors can be pulled out of that object.
00171 /// TODO: VALIDATION METHODS STILL NEED TO BE ADDED, and isValid does not yet
00172 /// validate all fields!!!
00173 class SamValidator
00174 {
00175 public:
00176 
00177     /// Validates whether or not the specified SamRecord is valid, calling
00178     /// all of the other validations.
00179     /// TODO: more validation needs to be added.
00180     /// \param samHeader header associated with the record to be validated.
00181     /// \param samRecord record to be validated.
00182     /// \param validationErrors status to append any errors to.
00183     /// \return true if it is valid, false and appends to SamValidationErrors
00184     /// if it is not
00185     static bool isValid(SamFileHeader& samHeader, SamRecord& samRecord, 
00186                         SamValidationErrors& validationErrors);
00187 
00188     /// Determines whether or not the specified qname is valid.
00189     /// Validation for QNAME is:
00190     ///   a) length of the qname string is the same as the read name length
00191     ///   b) length is between 1 and 254.
00192     ///   c) [ \t\n\r] are not allowed in the name.
00193     /// \param qname the read/query name.
00194     /// \param qnameLen length of the read name including the null (result of
00195     /// SamRecord::getReadNameLength()).
00196     /// \param validationErrors status to append any errors to.
00197     /// \return true if it is valid, false and appends to SamValidationErrors
00198     /// if it is not
00199     static bool isValidQname(const char* qname, uint8_t qnameLen, 
00200                              SamValidationErrors& validationErrors);
00201 
00202     /// Determines whether or not the flag is valid.
00203     /// TODO: currently no validation is done on the flag.
00204     /// \param flag flag to be validated.
00205     /// \param validationErrors status to append any errors to.
00206     /// \return true if it is valid, false and appends to SamValidationErrors
00207     /// if it is not
00208     static bool isValidFlag(uint16_t flag,
00209                             SamValidationErrors& validationErrors);
00210 
00211     /// Validate the reference name including validating against the header.
00212     /// 1) Cross validate the rname and the header.
00213     /// 2) perform the validation in the method that doesn't take the header.
00214     /// \param samHeader header associated with the rname to be validated.
00215     /// \param rname reference name to be validated.
00216     /// \param validationErrors status to append any errors to.
00217     /// \return true if it is valid, false and appends to SamValidationErrors
00218     /// if it is not
00219     static bool isValidRname(SamFileHeader& samHeader, 
00220                              const char* rname,
00221                              SamValidationErrors& validationErrors);
00222     /// Validate the rname without validating against the header.
00223     /// Validation for RNAME is:
00224     ///   a) cannot be 0 length.
00225     ///   b) [ \t\n\r@=] are not allowed in the name.
00226     /// \param rname reference name to be validated.
00227     /// \param validationErrors status to append any errors to.
00228     /// \return true if it is valid, false and appends to SamValidationErrors
00229     /// if it is not
00230     static bool isValidRname(const char* rname,
00231                              SamValidationErrors& validationErrors);
00232 
00233     /// Validate whether or not the specified reference id is valid.
00234     /// Validation for rID is:
00235     ///  a) must be between -1 and the number of refInfo.
00236     ///     -1 is allowed, and otherwise it must properly index into the array.
00237     /// \param refID reference id to be validated.
00238     /// \param refInfo sam reference information containing the mapping
00239     /// from reference id to reference name for this refID.
00240     /// \param validationErrors status to append any errors to.
00241     /// \return true if it is valid, false and appends to SamValidationErrors
00242     /// if it is not
00243     static bool isValidRefID(int32_t refID, const SamReferenceInfo& refInfo, 
00244                              SamValidationErrors& validationErrors);
00245 
00246     /// Validate the reference position.
00247     /// Validation for pos is:
00248     ///   a) must be between 0 and (2^29)-1.
00249     /// \param pos position to be validated.
00250     /// \param validationErrors status to append any errors to.
00251     /// \return true if it is valid, false and appends to SamValidationErrors
00252     /// if it is not
00253     static bool isValid1BasedPos(int32_t pos, 
00254                                  SamValidationErrors& validationErrors);
00255 
00256     /// Validate the mapping quality.
00257     /// TODO: currently no validation is done on the mapping quality.
00258     /// \param mapQuality mapping quality to be validated.
00259     /// \param validationErrors status to append any errors to.
00260     /// \return true if it is valid, false and appends to SamValidationErrors
00261     /// if it is not
00262     static bool isValidMapQuality(uint8_t mapQuality,
00263                                   SamValidationErrors& validationErrors);
00264 
00265     /// Validate the sequence, but not against the cigar or quality string.
00266     /// Validation against cigar is done in isValidCigar.
00267     /// Validation against the quality string is done in isValidQuality.
00268     /// TODO: currently no validation is done in this method.
00269     /// \param samRecord record whose sequence should be validated.
00270     /// \param validationErrors status to append any errors to.
00271     /// \return true if it is valid, false and appends to SamValidationErrors
00272     /// if it is not
00273     static bool isValidSequence(SamRecord& samRecord,
00274                                 SamValidationErrors& validationErrors);
00275 
00276     /// Validate the cigar.  Cigar validation depends on sequence.
00277     /// Validation for CIGAR is:
00278     ///   a) cannot be 0 length.
00279     /// if not "*", validate the following:
00280     ///   b) must have an integer length for each operator (if not "*"). TODO
00281     ///   c) all operators must be valid (if not "*"). TODO
00282     ///   d) evaluates to the same read length as the sequence string.
00283     /// \param samRecord record whose cigar should be validated.
00284     /// \param validationErrors status to append any errors to.
00285     /// \return true if it is valid, false and appends to SamValidationErrors
00286     /// if it is not
00287     static bool isValidCigar(SamRecord& samRecord,
00288                              SamValidationErrors& validationErrors);
00289 
00290     /// Validate the cigar.  Cigar validation depends on sequence.
00291     /// Validation for CIGAR is:
00292     ///   a) cannot be 0 length.
00293     /// if not "*", validate the following:
00294     ///   b) must have an integer length for each operator (if not "*"). TODO
00295     ///   c) all operators must be valid (if not "*"). TODO
00296     ///   d) evaluates to the same read length as the sequence string.
00297     /// \param cigar cigar string to be validated.
00298     /// \param sequence sequence to check the cigar against.
00299     /// \param validationErrors status to append any errors to.
00300     /// \return true if it is valid, false and appends to SamValidationErrors
00301     /// if it is not
00302     static bool isValidCigar(const char* cigar, const char* sequence,
00303                              SamValidationErrors& validationErrors);
00304 
00305     /// Validate the cigar.  Cigar validation depends on sequence.
00306     /// Validation for CIGAR is:
00307     ///   a) cannot be 0 length.
00308     /// if not "*", validate the following:
00309     ///   b) TODO: must have an integer length for each operator (if not "*").
00310     ///   c) TODO: all operators must be valid (if not "*").
00311     ///   d) evaluates to the same read length as the sequence string.
00312     /// \param cigar cigar string to be validated.
00313     /// \param seqLen sequence length to check the cigar against.
00314     /// \param validationErrors status to append any errors to.
00315     /// \return true if it is valid, false and appends to SamValidationErrors
00316     /// if it is not
00317     static bool isValidCigar(const char* cigar,
00318                              int seqLen,
00319                              SamValidationErrors& validationErrors);
00320 
00321     /// TODO: validate the mate/next fragment's reference name.
00322     /// \return true if it is valid, false and appends to SamValidationErrors
00323     /// if it is not
00324     static bool isValidMrnm();
00325 
00326     /// TODO: validate the mate/next fragment's position.
00327     /// \return true if it is valid, false and appends to SamValidationErrors
00328     /// if it is not
00329     static bool isValidMpos();
00330 
00331     /// TODO: validate the insertion size/observed template length.
00332     /// \return true if it is valid, false and appends to SamValidationErrors
00333     /// if it is not
00334     static bool isValidIsize();
00335 
00336     /// TODO: validate the sequence.
00337     /// \return true if it is valid, false and appends to SamValidationErrors
00338     /// if it is not
00339     static bool isValidSeq();
00340 
00341     /// Validate the base quality.
00342     /// Quality validation depends on sequence.
00343     /// Validation for quality is:
00344     ///   a) quality & sequence are the same length if both are specified.
00345     /// TODO: more validation.
00346     /// \param samRecord record whose quality should be validated.
00347     /// \param validationErrors status to append any errors to.
00348     /// \return true if it is valid, false and appends to SamValidationErrors
00349     /// if it is not
00350     static bool isValidQuality(SamRecord& samRecord,
00351                                SamValidationErrors& validationErrors);
00352 
00353     /// Validate the base quality.
00354     /// Quality validation depends on sequence.
00355     /// Validation for quality is:
00356     ///   a) quality & sequence are the same length if both are specified.
00357     /// TODO: more validation.
00358     /// \param quality quality string to be validated.
00359     /// \param sequence sequence to check the quality against.
00360     /// \param validationErrors status to append any errors to.
00361     /// \return true if it is valid, false and appends to SamValidationErrors
00362     /// if it is not
00363     static bool isValidQuality(const char* quality, const char* sequence,
00364                                SamValidationErrors& validationErrors);
00365 
00366     /// Validate the base quality.
00367     /// Quality validation depends on sequence.
00368     /// Validation for quality is:
00369     ///   a) quality & sequence are the same length if both are specified.
00370     /// TODO: more validation.
00371     /// \param quality quality string to be validated.
00372     /// \param seqLength sequence length to check the quality against.
00373     /// \param validationErrors status to append any errors to.
00374     /// \return true if it is valid, false and appends to SamValidationErrors
00375     /// if it is not
00376     bool static isValidQuality(const char* quality,
00377                                int seqLength,
00378                                SamValidationErrors& validationErrors);
00379 
00380     /// Validate the tags.
00381     /// Validation for tags is:
00382     ///   a) check that the "MD" tag is correct if it is present.
00383     /// TODO: more validation.
00384     /// \param samRecord record whose tags should be validated.
00385     /// \param validationErrors status to append any errors to.
00386     /// \return true if it is valid, false and appends to SamValidationErrors
00387     /// if it is not
00388     static bool isValidTags(SamRecord& samRecord,
00389                             SamValidationErrors& validationErrors);
00390 
00391     /// TODO: validate the tag vtype.
00392     /// \return true if it is valid, false and appends to SamValidationErrors
00393     /// if it is not
00394     static bool isValidVtype();
00395 
00396     /// TODO: validate the tag value.
00397     /// \return true if it is valid, false and appends to SamValidationErrors
00398     /// if it is not
00399     static bool isValidValue();
00400 };
00401 
00402 
00403 #endif
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends