libStatGen Software  1
FastQFile.h
00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #ifndef __FASTQ_VALIDATOR_H__
00019 #define __FASTQ_VALIDATOR_H__
00020 
00021 #include <iostream>
00022 #include <map>
00023 #include "StringBasics.h"
00024 #include "InputFile.h"
00025 #include "BaseComposition.h"
00026 #include "FastQStatus.h"
00027 
00028 /// Class for reading/validating a fastq file, either one sequence at a time or as a whole file.
00029 class FastQFile
00030 {
00031  public:
00032     /// Constructor.
00033     /// \param minReadLength The minimum length that a base sequence must be for
00034     ///                      it to be valid (defaults to 10).
00035     /// \param numPrintableErrors The maximum number of errors that should be reported
00036     ///                           in detail before suppressing the errors (defaults to 20).
00037     FastQFile(int minReadLength = 10, int numPrintableErrors = 20);
00038     
00039     /// Disable messages - do not write to cout.
00040     void disableMessages();
00041     
00042     /// Enable messages - write to cout.
00043     void enableMessages();
00044     
00045     /// Disable Unique Sequence ID checking
00046     /// (Unique Sequence ID checking is enabled by default).
00047     void disableSeqIDCheck();
00048     
00049     /// Enable Unique Sequence ID checking.
00050     /// (Unique Sequence ID checking is enabled by default).
00051     void enableSeqIDCheck();
00052     
00053     /// Set the number of errors after which to quit reading/validating a file,
00054     /// defaults to -1.
00055     /// \param maxErrors # of errors before quitting, 
00056     /// -1 indicates to not quit until the entire file has been read/validated (default), 
00057     /// 0 indicates to quit without reading/validating anything.
00058     void setMaxErrors(int maxErrors);
00059 
00060     /// Open a FastQFile.
00061     /// Use the specified SPACE_TYPE to determine BASE, COLOR, or UNKNOWN (UNKNOWN is the default).
00062     FastQStatus::Status openFile(const char* fileName,
00063                                  BaseAsciiMap::SPACE_TYPE spaceType = BaseAsciiMap::UNKNOWN);
00064 
00065     /// Close a FastQFile.
00066     FastQStatus::Status closeFile();
00067 
00068     /// Check to see if the file is open.
00069     bool isOpen();
00070 
00071     /// Check to see if the file is at the end of the file.
00072     bool isEof();
00073 
00074     /// Returns whether or not to keep reading the file,
00075     /// it stops reading (false) if eof or there is a problem reading the file.
00076     bool keepReadingFile();
00077 
00078     /// Validate the specified fastq file
00079     /// \param filename fastq file to be validated.
00080     /// \param printBaseComp whether or not to print the base composition for the file.
00081     ///                      true means print it, false means do not.
00082     /// \param spaceType the spaceType to use for validation - BASE_SPACE, COLOR_SPACE,
00083     ///                  or UNKNOWN (UNKNOWN means to determine the spaceType to
00084     ///                  validate against from the first character of the first
00085     ///                  sequence).
00086     /// \param printQualAvg  whether or not to print the quality averages for the file.
00087     ///                      true means to print it, false (default) means do not.
00088     /// \return the fastq validation status,  SUCCESS on a successfully
00089     /// validated fastq file.
00090     FastQStatus::Status validateFastQFile(const String &filename,  
00091                                           bool printBaseComp,
00092                                           BaseAsciiMap::SPACE_TYPE spaceType,
00093                                           bool printQualAvg = false);
00094 
00095     /// Read 1 FastQSequence, validating it.
00096     FastQStatus::Status readFastQSequence();
00097 
00098     ///////////////////////
00099     /// @name  Public Sequence Line variables.
00100     /// Keep public variables for a sequence's line so they can be accessed
00101     /// without having to do string copies.
00102     //@{
00103     String myRawSequence;
00104     String mySequenceIdLine;
00105     String mySequenceIdentifier;
00106     String myPlusLine;
00107     String myQualityString;
00108     //@}
00109 
00110     /// Get the space type used for this file (as reported by myBaseComposition).
00111     inline BaseAsciiMap::SPACE_TYPE getSpaceType()
00112     {
00113         return(myBaseComposition.getSpaceType());
00114     }
00115     
00116 private:
00117     // Validates a single fastq sequence from myFile.
00118     bool validateFastQSequence();
00119 
00120     // Reads and validates the sequence identifier line of a fastq sequence.
00121     bool validateSequenceIdentifierLine();
00122 
00123     // Reads and validates the raw sequence line(s) and the plus line.  Both are
00124     // included in one method since it is unknown when the raw sequence line
00125     // ends until you find the plus line that divides it from the quality
00126     // string.  Since this method will read the plus line to know when the
00127     // raw sequence ends, it also validates that line.
00128     bool validateRawSequenceAndPlusLines();
00129 
00130     // Reads and validates the quality string line(s).
00131     bool validateQualityStringLines();
00132 
00133     // Method to validate a line that contains part of the raw sequence.
00134     // offset specifies where in the sequence to start validating.
00135     bool validateRawSequence(int offset);
00136 
00137     // Method to validate the "+" line that separates the raw sequence and the
00138     // quality string.
00139     bool validateSequencePlus();
00140 
00141     // Method to validate the quality string.
00142     // offset specifies where in the quality string to start validating.
00143     bool validateQualityString(int offset);
00144 
00145     // Helper method to read a line from the input file into a string.
00146     // It also tracks the line number.
00147     void readLine();
00148 
00149     // Helper method for printing the contents of myErrorString.  It will
00150     // only print the errors until the maximum number of reportable errors is
00151     // reached.
00152     void reportErrorOnLine();
00153 
00154     // Reset the member data for each fastq file.
00155     void reset();
00156 
00157     // Reset the member data for each sequence.
00158     void resetForEachSequence();
00159 
00160     // Log the specified message if enabled.
00161     void logMessage(const char* message);
00162 
00163     // Determine if it is time to quit by checking if we are to quit after a
00164     // certain number of errors and that many errors have been encountered.
00165     bool isTimeToQuit();
00166 
00167     void printAvgQual(); // NOTE(review): undocumented; presumably prints the average quality per cycle from myQualPerCycle/myCountPerCycle - confirm against the implementation.
00168 
00169     //////////////////////////////////////////////////////////////////////
00170     // Following member data elements are reset for each validated sequence.
00171     //
00172 
00173     // Buffer for storing the contents of the line read.
00174     // Stored as member data so memory allocation is only done once.
00175     String myLineBuffer;
00176 
00177     // Buffer for storing the error string.  This prevents the reallocation of
00178     // the string buffer for each error.
00179     String myErrorString;
00180 
00181     String myTempPartialQuality; // NOTE(review): undocumented; presumably scratch storage for a partially read quality string - confirm against the implementation.
00182 
00183     //////////////////////////////////////////////////////////////////////
00184     // Following member data elements are reset for each validated file.
00185     //
00186     IFILE myFile; // Input file to be read.
00187     String myFileName; // Name of file being processed.
00188     int myNumErrors;   // Tracks the number of errors.
00189     unsigned int myLineNum;    // Track the line number - used for reporting errors.
00190     BaseComposition myBaseComposition;  // Tracks the base composition.
00191     std::vector<int> myQualPerCycle;  // Tracks the quality by cycle.
00192     std::vector<int> myCountPerCycle;  // Tracks the number of entries by cycle.
00193 
00194     // Whether or not to check the sequence identifier for uniqueness.
00195     // Checking may use up a lot of memory.
00196     bool myCheckSeqID;
00197 
00198     // Map to track which identifiers have appeared in the file.
00199     std::map<std::string, unsigned int> myIdentifierMap;
00200  
00201     //////////////////////////////////////////////////////////////////////
00202     // Following member data do not change for each call to the validator.
00203     //
00204     int myMinReadLength; // Min Length for a read.
00205     int myNumPrintableErrors;  // Max number of errors to print the details of.
00206 
00207     // Number of errors after which to quit reading/validating a file.
00208     // Defaults to -1.
00209     //   -1 indicates to not quit until the entire file has been read/validated.
00210     //    0 indicates to quit without reading/validating anything.
00211     int myMaxErrors;
00212 
00213     // Whether or not messages should be printed.  
00214     // Defaulted to false (they should be printed).
00215     bool myDisableMessages;
00216 
00217     // Track if there is a problem reading the file.  If there are read
00218     // problems, stop reading the file.
00219     bool myFileProblem;
00220 };
00221 
00222 #endif
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends