libStatGen Software
1
|
00001 /* 00002 * Copyright (C) 2010 Regents of the University of Michigan 00003 * 00004 * This program is free software: you can redistribute it and/or modify 00005 * it under the terms of the GNU General Public License as published by 00006 * the Free Software Foundation, either version 3 of the License, or 00007 * (at your option) any later version. 00008 * 00009 * This program is distributed in the hope that it will be useful, 00010 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 * GNU General Public License for more details. 00013 * 00014 * You should have received a copy of the GNU General Public License 00015 * along with this program. If not, see <http://www.gnu.org/licenses/>. 00016 */ 00017 00018 #ifndef __FASTQ_VALIDATOR_H__ 00019 #define __FASTQ_VALIDATOR_H__ 00020 00021 #include <iostream> 00022 #include <map> 00023 #include "StringBasics.h" 00024 #include "InputFile.h" 00025 #include "BaseComposition.h" 00026 #include "FastQStatus.h" 00027 00028 /// Class for reading/validating a fastq file. 00029 class FastQFile 00030 { 00031 public: 00032 /// Constructor. 00033 /// /param minReadLength The minimum length that a base sequence must be for 00034 /// it to be valid. 00035 /// \param numPrintableErrors The maximum number of errors that should be reported 00036 /// in detail before suppressing the errors. 00037 FastQFile(int minReadLength = 10, int numPrintableErrors = 20); 00038 00039 /// Disable messages - do not write to cout. 00040 void disableMessages(); 00041 00042 /// Enable messages - write to cout. 00043 void enableMessages(); 00044 00045 /// Disable Unique Sequence ID checking 00046 /// (Unique Sequence ID checking is enabled by default). 00047 void disableSeqIDCheck(); 00048 00049 /// Enable Unique Sequence ID checking. 00050 /// (Unique Sequence ID checking is enabled by default). 00051 void enableSeqIDCheck(); 00052 00053 /// Set the number of errors after which to quit reading/validating a file, 00054 /// defaults to -1. 00055 /// \param maxErrors # of errors before quitting, 00056 /// -1 indicates to not quit until the entire file has been read/validated (default), 00057 /// 0 indicates to quit without reading/validating anything. 00058 void setMaxErrors(int maxErrors); 00059 00060 /// Open a FastQFile. 00061 /// Use the specified SPACE_TYPE to determine BASE, COLOR, or UNKNOWN. 00062 FastQStatus::Status openFile(const char* fileName, 00063 BaseAsciiMap::SPACE_TYPE spaceType = BaseAsciiMap::UNKNOWN); 00064 00065 /// Close a FastQFile. 00066 FastQStatus::Status closeFile(); 00067 00068 /// Check to see if the file is open. 00069 bool isOpen(); 00070 00071 /// Check to see if the file is at the end of the file. 00072 bool isEof(); 00073 00074 /// Returns whether or not to keep reading the file, 00075 /// it stops reading (false) if eof or there is a problem reading the file. 00076 bool keepReadingFile(); 00077 00078 /// Validate the specified fastq file 00079 /// \param filename fastq file to be validated. 00080 /// \param printBaseComp whether or not to print the base composition for the file. 00081 /// true means print it, false means do not. 00082 /// \param spaceType the spaceType to use for validation - BASE_SPACE, COLOR_SPACE, 00083 /// or UNKNOWN (UNKNOWN means to determine the spaceType to 00084 /// validate against from the first character of the first 00085 /// sequence). 00086 /// \param printQualAvg whether or not to print the quality averages for the file. 00087 /// true means to print it, false (default) means do not. 00088 /// \return the fastq validation status, SUCCESS on a successfully 00089 /// validated fastq file. 00090 FastQStatus::Status validateFastQFile(const String &filename, 00091 bool printBaseComp, 00092 BaseAsciiMap::SPACE_TYPE spaceType, 00093 bool printQualAvg = false); 00094 00095 /// Read 1 FastQSequence, validating it. 00096 FastQStatus::Status readFastQSequence(); 00097 00098 /////////////////////// 00099 /// @name Public Sequence Line variables. 00100 /// Keep public variables for a sequence's line so they can be accessed 00101 /// without having to do string copies. 00102 //@{ 00103 String myRawSequence; 00104 String mySequenceIdLine; 00105 String mySequenceIdentifier; 00106 String myPlusLine; 00107 String myQualityString; 00108 //@} 00109 00110 /// Get the space type used for this file. 00111 inline BaseAsciiMap::SPACE_TYPE getSpaceType() 00112 { 00113 return(myBaseComposition.getSpaceType()); 00114 } 00115 00116 private: 00117 // Validates a single fastq sequence from myFile. 00118 bool validateFastQSequence(); 00119 00120 // Reads and validates the sequence identifier line of a fastq sequence. 00121 bool validateSequenceIdentifierLine(); 00122 00123 // Reads and validates the raw sequence line(s) and the plus line. Both are 00124 // included in one method since it is unknown when the raw sequence line 00125 // ends until you find the plus line that divides it from the quality 00126 // string. Since this method will read the plus line to know when the 00127 // raw sequence ends, it also validates that line. 00128 bool validateRawSequenceAndPlusLines(); 00129 00130 // Reads and validates the quality string line(s). 00131 bool validateQualityStringLines(); 00132 00133 // Method to validate a line that contains part of the raw sequence. 00134 // offset specifies where in the sequence to start validating. 00135 bool validateRawSequence(int offset); 00136 00137 // Method to validate the "+" line that seperates the raw sequence and the 00138 // quality string. 00139 bool validateSequencePlus(); 00140 00141 // Method to validate the quality string. 00142 // offset specifies where in the quality string to start validating. 00143 bool validateQualityString(int offset); 00144 00145 // Helper method to read a line from the input file into a string. 00146 // It also tracks the line number. 00147 void readLine(); 00148 00149 // Helper method for printing the contents of myErrorString. It will 00150 // only print the errors until the maximum number of reportable errors is 00151 // reached. 00152 void reportErrorOnLine(); 00153 00154 // Reset the member data for each fastq file. 00155 void reset(); 00156 00157 // Reset the member data for each sequence. 00158 void resetForEachSequence(); 00159 00160 // Log the specified message if enabled. 00161 void logMessage(const char* message); 00162 00163 // Determine if it is time to quit by checking if we are to quit after a 00164 // certain number of errors and that many errors have been encountered. 00165 bool isTimeToQuit(); 00166 00167 void printAvgQual(); 00168 00169 ////////////////////////////////////////////////////////////////////// 00170 // Following member data elements are reset for each validated sequence. 00171 // 00172 00173 // Buffer for storing the contents of the line read. 00174 // Stored as member data so memory allocation is only done once. 00175 String myLineBuffer; 00176 00177 // Buffer for storing the error string. This prevents the reallocation of 00178 // the string buffer for each error. 00179 String myErrorString; 00180 00181 String myTempPartialQuality; 00182 00183 ////////////////////////////////////////////////////////////////////// 00184 // Following member data elements are reset for each validated file. 00185 // 00186 IFILE myFile; // Input file to be read. 00187 String myFileName; // Name of file being processed. 00188 int myNumErrors; // Tracks the number of errors. 00189 unsigned int myLineNum; // Track the line number - used for reporting errors. 00190 BaseComposition myBaseComposition; // Tracks the base composition. 00191 std::vector<int> myQualPerCycle; // Tracks the quality by cycle. 00192 std::vector<int> myCountPerCycle; // Tracks the number of entries by cycle. 00193 00194 // Whether or not to check the sequence identifier for uniqueness. 00195 // Checking may use up a lot of memory. 00196 bool myCheckSeqID; 00197 00198 // Map to track which identifiers have appeared in the file. 00199 std::map<std::string, unsigned int> myIdentifierMap; 00200 00201 ////////////////////////////////////////////////////////////////////// 00202 // Following member data do not change for each call to the validator. 00203 // 00204 int myMinReadLength; // Min Length for a read. 00205 int myNumPrintableErrors; // Max number of errors to print the details of. 00206 00207 // Number of errors after which to quit reading/validating a file. 00208 // Defaults to -1. 00209 // -1 indicates to not quit until the entire file has been read/validated. 00210 // 0 indicates to quit without reading/validating anything. 00211 int myMaxErrors; 00212 00213 // Whether or not messages should be printed. 00214 // Defaulted to false (they should be printed). 00215 bool myDisableMessages; 00216 00217 // Track if there is a problem reading the file. If there are read 00218 // problems, stop reading the file. 00219 bool myFileProblem; 00220 }; 00221 00222 #endif