00001 /* 00002 * Copyright (C) 2010 Regents of the University of Michigan 00003 * 00004 * This program is free software: you can redistribute it and/or modify 00005 * it under the terms of the GNU General Public License as published by 00006 * the Free Software Foundation, either version 3 of the License, or 00007 * (at your option) any later version. 00008 * 00009 * This program is distributed in the hope that it will be useful, 00010 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 * GNU General Public License for more details. 00013 * 00014 * You should have received a copy of the GNU General Public License 00015 * along with this program. If not, see <http://www.gnu.org/licenses/>. 00016 */ 00017 00018 #ifndef __FASTQ_VALIDATOR_H__ 00019 #define __FASTQ_VALIDATOR_H__ 00020 00021 #include <iostream> 00022 #include <map> 00023 #include "StringBasics.h" 00024 #include "InputFile.h" 00025 #include "BaseComposition.h" 00026 #include "FastQStatus.h" 00027 00028 class FastQFile 00029 { 00030 public: 00031 // Constructor. 00032 // minReadLength - The minimum length that a base sequence must be for 00033 // it to be valid. 00034 // numPrintableErrors - The maximum number of errors that should be reported 00035 // in detail before suppressing the errors. 00036 FastQFile(int minReadLength = 10, int numPrintableErrors = 20); 00037 00038 // Disable messages - do not write to cout. 00039 void disableMessages(); 00040 00041 // Enable messages - write to cout. 00042 void enableMessages(); 00043 00044 // Disable Unique Sequence ID checking. 00045 // Unique Sequence ID checking is enabled by default. 00046 void disableSeqIDCheck(); 00047 00048 // Enable Unique Sequence ID checking. 00049 // Unique Sequence ID checking is enabled by default. 00050 void enableSeqIDCheck(); 00051 00052 // Set the number of errors after which to quit reading/validating a file. 00053 // Defaults to -1. 00054 // -1 indicates to not quit until the entire file has been read/validated. 00055 // 0 indicates to quit without reading/validating anything. 00056 void setMaxErrors(int maxErrors); 00057 00058 // Open a FastQFile. 00059 // If baseLetter is specified to be non-"", then it will be used to 00060 // set the base sequence for this file. If the letter is in base-space, that 00061 // will be used. If it is in color-space, that will be used, if it is in 00062 // neither, then both are allowed. If it is blank, then the first 00063 // character of the sequence will be used to set the space type. 00064 FastQStatus::Status openFile(const char* fileName, 00065 BaseAsciiMap::SPACE_TYPE spaceType = BaseAsciiMap::UNKNOWN); 00066 00067 // Close a FastQFile. 00068 FastQStatus::Status closeFile(); 00069 00070 // Check to see if the file is open. 00071 bool isOpen(); 00072 00073 // Check to see if the file is at the end of the file. 00074 bool isEof(); 00075 00076 // Returns whether or not to keep reading the file. 00077 // Stop reading (false) if eof or there is a problem reading the file. 00078 bool keepReadingFile(); 00079 00080 // Validate the specified fastq file 00081 // filename - fastq file to be validated. 00082 // printBaseComp - whether or not to print the base composition for the file. 00083 // true means print it, false means do not. 00084 // spaceType - the spaceType to use for validation - BASE_SPACE, COLOR_SPACE, 00085 // or UNKNOWN. UNKNOWN means to determine the spaceType to 00086 // validate against from the first character of the first 00087 // sequence. 00088 // Returns the fastq validation status - SUCCESS on a successfully 00089 // validated fastq file. 00090 FastQStatus::Status validateFastQFile(const String &filename, 00091 bool printBaseComp, 00092 BaseAsciiMap::SPACE_TYPE spaceType); 00093 00094 // Read 1 FastQSequence, validating it. 00095 FastQStatus::Status readFastQSequence(); 00096 00097 // Keep public variables for a sequence's line so they can be accessed 00098 // without having to do string copies. 00099 String myRawSequence; 00100 String mySequenceIdLine; 00101 String mySequenceIdentifier; 00102 String myPlusLine; 00103 String myQualityString; 00104 00105 inline BaseAsciiMap::SPACE_TYPE getSpaceType() 00106 { 00107 return(myBaseComposition.getSpaceType()); 00108 } 00109 00110 private: 00111 00112 // Validates a single fastq sequence from myFile. 00113 bool validateFastQSequence(); 00114 00115 // Reads and validates the sequence identifier line of a fastq sequence. 00116 bool validateSequenceIdentifierLine(); 00117 00118 // Reads and validates the raw sequence line(s) and the plus line. Both are 00119 // included in one method since it is unknown when the raw sequence line 00120 // ends until you find the plus line that divides it from the quality 00121 // string. Since this method will read the plus line to know when the 00122 // raw sequence ends, it also validates that line. 00123 bool validateRawSequenceAndPlusLines(); 00124 00125 // Reads and validates the quality string line(s). 00126 bool validateQualityStringLines(); 00127 00128 // Method to validate a line that contains part of the raw sequence. 00129 // offset specifies where in the sequence to start validating. 00130 bool validateRawSequence(int offset); 00131 00132 // Method to validate the "+" line that seperates the raw sequence and the 00133 // quality string. 00134 bool validateSequencePlus(); 00135 00136 // Method to validate the quality string. 00137 // offset specifies where in the quality string to start validating. 00138 bool validateQualityString(int offset); 00139 00140 // Helper method to read a line from the input file into a string. 00141 // It also tracks the line number. 00142 void readLine(); 00143 00144 // Helper method for printing the contents of myErrorString. It will 00145 // only print the errors until the maximum number of reportable errors is 00146 // reached. 00147 void reportErrorOnLine(); 00148 00149 // Reset the member data for each fastq file. 00150 void reset(); 00151 00152 // Reset the member data for each sequence. 00153 void resetForEachSequence(); 00154 00155 // Log the specified message if enabled. 00156 void logMessage(const char* message); 00157 00158 // Determine if it is time to quit by checking if we are to quit after a 00159 // certain number of errors and that many errors have been encountered. 00160 bool isTimeToQuit(); 00161 00162 ////////////////////////////////////////////////////////////////////// 00163 // Following member data elements are reset for each validated sequence. 00164 // 00165 00166 // Buffer for storing the contents of the line read. 00167 // Stored as member data so memory allocation is only done once. 00168 String myLineBuffer; 00169 00170 // Buffer for storing the error string. This prevents the reallocation of 00171 // the string buffer for each error. 00172 String myErrorString; 00173 00174 String myTempPartialQuality; 00175 00176 ////////////////////////////////////////////////////////////////////// 00177 // Following member data elements are reset for each validated file. 00178 // 00179 IFILE myFile; // Input file to be read. 00180 String myFileName; // Name of file being processed. 00181 int myNumErrors; // Tracks the number of errors. 00182 unsigned int myLineNum; // Track the line number - used for reporting errors. 00183 BaseComposition myBaseComposition; // Tracks the base composition. 00184 00185 // Whether or not to check the sequence identifier for uniqueness. 00186 // Checking may use up a lot of memory. 00187 bool myCheckSeqID; 00188 00189 // Map to track which identifiers have appeared in the file. 00190 std::map<std::string, unsigned int> myIdentifierMap; 00191 00192 ////////////////////////////////////////////////////////////////////// 00193 // Following member data do not change for each call to the validator. 00194 // 00195 int myMinReadLength; // Min Length for a read. 00196 int myNumPrintableErrors; // Max number of errors to print the details of. 00197 00198 // Number of errors after which to quit reading/validating a file. 00199 // Defaults to -1. 00200 // -1 indicates to not quit until the entire file has been read/validated. 00201 // 0 indicates to quit without reading/validating anything. 00202 int myMaxErrors; 00203 00204 00205 00206 // Whether or not messages should be printed. 00207 // Defaulted to false (they should be printed). 00208 bool myDisableMessages; 00209 00210 // Track if there is a problem reading the file. If there are read 00211 // problems, stop reading the file. 00212 bool myFileProblem; 00213 }; 00214 00215 #endif