FastQFile.h

00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #ifndef __FASTQ_VALIDATOR_H__
00019 #define __FASTQ_VALIDATOR_H__
00020 
00021 #include <iostream>
00022 #include <map>
00023 #include "StringBasics.h"
00024 #include "InputFile.h"
00025 #include "BaseComposition.h"
00026 #include "FastQStatus.h"
00027 // Class for reading and validating fastq files: a whole file (validateFastQFile) or one sequence at a time (readFastQSequence).
00028 class FastQFile
00029 {
00030  public:
00031    // Constructor.
00032    // minReadLength - The minimum length that a base sequence must be for
00033    //                 it to be valid.
00034    // numPrintableErrors - The maximum number of errors that should be reported
00035    //                      in detail before suppressing the errors.
00036    FastQFile(int minReadLength = 10, int numPrintableErrors = 20);
00037 
00038    // Disable messages - do not write to cout.
00039    void disableMessages();
00040 
00041    // Enable messages - write to cout.
00042    void enableMessages();
00043 
00044    // Disable Unique Sequence ID checking.  
00045    // Unique Sequence ID checking is enabled by default.
00046     void disableSeqIDCheck();
00047 
00048    // Enable Unique Sequence ID checking.
00049    // Unique Sequence ID checking is enabled by default.
00050    void enableSeqIDCheck();
00051 
00052    // Set the number of errors after which to quit reading/validating a file.
00053    // Defaults to -1.
00054    //   -1 indicates to not quit until the entire file has been read/validated.
00055    //    0 indicates to quit without reading/validating anything.
00056    void setMaxErrors(int maxErrors);
00057 
00058    // Open a FastQFile.
00059    // fileName - name of the fastq file to open.
00060    // spaceType - the space type (base-space or color-space) to use for this
00061    //             file.  Defaults to UNKNOWN, in which case the first
00062    //             character of the sequence is used to set the space type.
00063    // Returns a FastQStatus::Status for the open operation.
00064    FastQStatus::Status openFile(const char* fileName,
00065                                 BaseAsciiMap::SPACE_TYPE spaceType = BaseAsciiMap::UNKNOWN);
00066    
00067    // Close a FastQFile.
00068    FastQStatus::Status closeFile();
00069 
00070    // Check to see if the file is open.
00071    bool isOpen();
00072 
00073    // Check to see if the file is at the end of the file.
00074    bool isEof();
00075    
00076    // Returns whether or not to keep reading the file.
00077    // Stop reading (false) if eof or there is a problem reading the file.
00078    bool keepReadingFile();
00079       
00080    // Validate the specified fastq file
00081    // filename - fastq file to be validated.
00082    // printBaseComp - whether or not to print the base composition for the file.
00083    //                 true means print it, false means do not.
00084    // spaceType - the spaceType to use for validation - BASE_SPACE, COLOR_SPACE,
00085    //             or UNKNOWN.  UNKNOWN means to determine the spaceType to
00086    //             validate against from the first character of the first
00087    //             sequence.
00088    // Returns the fastq validation status -  SUCCESS on a successfully
00089    // validated fastq file.
00090    FastQStatus::Status validateFastQFile(const String &filename,  
00091                                          bool printBaseComp,
00092                                          BaseAsciiMap::SPACE_TYPE spaceType);
00093 
00094    // Read 1 FastQSequence, validating it.
00095    FastQStatus::Status readFastQSequence();
00096 
00097    // Keep public variables for a sequence's line so they can be accessed
00098    // without having to do string copies.
00099    String myRawSequence;
00100    String mySequenceIdLine;
00101    String mySequenceIdentifier;
00102    String myPlusLine;
00103    String myQualityString;
00104    // Get the space type as reported by the base composition tracker.
00105    inline BaseAsciiMap::SPACE_TYPE getSpaceType()
00106    {
00107       return(myBaseComposition.getSpaceType());
00108    }
00109 
00110  private:
00111 
00112    // Validates a single fastq sequence from myFile.
00113    bool validateFastQSequence();
00114 
00115    // Reads and validates the sequence identifier line of a fastq sequence.
00116    bool validateSequenceIdentifierLine();
00117 
00118    // Reads and validates the raw sequence line(s) and the plus line.  Both are
00119    // included in one method since it is unknown when the raw sequence line
00120    // ends until you find the plus line that divides it from the quality
00121    // string.  Since this method will read the plus line to know when the
00122    // raw sequence ends, it also validates that line.
00123    bool validateRawSequenceAndPlusLines();
00124 
00125    // Reads and validates the quality string line(s).
00126    bool validateQualityStringLines();
00127 
00128    // Method to validate a line that contains part of the raw sequence.
00129    // offset specifies where in the sequence to start validating.
00130    bool validateRawSequence(int offset);
00131 
00132    // Method to validate the "+" line that separates the raw sequence and the
00133    // quality string.
00134    bool validateSequencePlus();
00135 
00136    // Method to validate the quality string.
00137    // offset specifies where in the quality string to start validating.
00138    bool validateQualityString(int offset);
00139 
00140    // Helper method to read a line from the input file into a string.
00141    // It also tracks the line number.
00142    void readLine();
00143 
00144    // Helper method for printing the contents of myErrorString.  It will
00145    // only print the errors until the maximum number of reportable errors is
00146    // reached.
00147    void reportErrorOnLine();
00148 
00149    // Reset the member data for each fastq file.
00150    void reset();
00151 
00152    // Reset the member data for each sequence.
00153    void resetForEachSequence();
00154 
00155    // Log the specified message if enabled.
00156    void logMessage(const char* message);
00157 
00158    // Determine if it is time to quit by checking if we are to quit after a
00159    // certain number of errors and that many errors have been encountered.
00160    bool isTimeToQuit();
00161 
00162    //////////////////////////////////////////////////////////////////////
00163    // Following member data elements are reset for each validated sequence.
00164    //
00165 
00166    // Buffer for storing the contents of the line read.
00167    // Stored as member data so memory allocation is only done once.
00168    String myLineBuffer;
00169 
00170    // Buffer for storing the error string.  This prevents the reallocation of
00171    // the string buffer for each error.
00172    String myErrorString;
00173    // NOTE(review): undocumented; presumably a scratch buffer for part of a quality string - confirm in the implementation.
00174    String myTempPartialQuality;
00175 
00176    //////////////////////////////////////////////////////////////////////
00177    // Following member data elements are reset for each validated file.
00178    //
00179    IFILE myFile; // Input file to be read.
00180    String myFileName; // Name of file being processed.
00181    int myNumErrors;   // Tracks the number of errors.
00182    uint myLineNum;    // Track the line number - used for reporting errors.
00183    BaseComposition myBaseComposition;  // Tracks the base composition.
00184 
00185     // Whether or not to check the sequence identifier for uniqueness.
00186     // Checking may use up a lot of memory.
00187     bool myCheckSeqID;
00188 
00189    // Map to track which identifiers have appeared in the file.
00190    std::map<std::string, uint> myIdentifierMap;
00191  
00192    //////////////////////////////////////////////////////////////////////
00193    // Following member data do not change for each call to the validator.
00194    //
00195    int myMinReadLength; // Min Length for a read.
00196    int myNumPrintableErrors;  // Max number of errors to print the details of.
00197 
00198    // Number of errors after which to quit reading/validating a file.
00199    // Defaults to -1.
00200    //   -1 indicates to not quit until the entire file has been read/validated.
00201    //    0 indicates to quit without reading/validating anything.
00202    int myMaxErrors;
00203 
00204 
00205 
00206    // Whether or not messages are disabled (i.e. should not be printed).
00207    // Defaulted to false (messages are printed).
00208    bool myDisableMessages;
00209 
00210    // Track if there is a problem reading the file.  If there are read
00211    // problems, stop reading the file.
00212    bool myFileProblem;
00213 };
00214 
00215 #endif
Generated on Wed Nov 17 15:38:28 2010 for StatGen Software by  doxygen 1.6.3