libStatGen Software  1
FastQFile.cpp
00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #include <iostream>
00019 
00020 #include "InputFile.h"
00021 #include "FastQFile.h"
00022 #include "BaseUtilities.h"
00023 
00024 // Constructor.
00025 // minReadLength - The minimum length that a base sequence must be for
00026 //                 it to be valid.
00027 // numPrintableErrors - The maximum number of errors that should be reported
00028 //                      in detail before suppressing the errors.
00029 // 
00030 FastQFile::FastQFile(int minReadLength, int numPrintableErrors)
00031    : myFile(NULL),
00032      myBaseComposition(),
00033      myQualPerCycle(),
00034      myCountPerCycle(),
00035      myCheckSeqID(true),
00036      myMinReadLength(minReadLength),
00037      myNumPrintableErrors(numPrintableErrors),
00038      myMaxErrors(-1),
00039      myDisableMessages(false),
00040      myFileProblem(false)
00041 {
00042    // Reset the member data.
00043    reset();
00044 }
00045 
00046 
00047 void FastQFile::disableMessages()
00048 {
00049    myDisableMessages = true;
00050 }
00051 
00052 
00053 void FastQFile::enableMessages()
00054 {
00055    myDisableMessages = false;
00056 }
00057 
00058 
00059 // Disable Unique Sequence ID checking.  
00060 // Unique Sequence ID checking is enabled by default.
00061 void FastQFile::disableSeqIDCheck()
00062 {
00063     myCheckSeqID = false;
00064 }
00065 
00066 
00067 // Enable Unique Sequence ID checking.
00068 // Unique Sequence ID checking is enabled by default.
00069 void FastQFile::enableSeqIDCheck()
00070 {
00071     myCheckSeqID = true;
00072 }
00073 
00074 
00075 // Set the number of errors after which to quit reading/validating a file.
00076 void FastQFile::setMaxErrors(int maxErrors)
00077 {
00078    myMaxErrors = maxErrors;
00079 }
00080 
00081 
00082 // Open a FastQFile.
00083 FastQStatus::Status FastQFile::openFile(const char* fileName,
00084                                         BaseAsciiMap::SPACE_TYPE spaceType)
00085 {
00086    // reset the member data.
00087    reset();
00088 
00089    myBaseComposition.resetBaseMapType();
00090    myBaseComposition.setBaseMapType(spaceType);
00091    myQualPerCycle.clear();
00092    myCountPerCycle.clear();
00093 
00094    FastQStatus::Status status = FastQStatus::FASTQ_SUCCESS;
00095 
00096    // Close the file if there is already one open - checked by close.
00097    status = closeFile();
00098    if(status == FastQStatus::FASTQ_SUCCESS)
00099    {
00100       // Successfully closed a previously opened file if there was one.
00101       
00102       // Open the file
00103       myFile = ifopen(fileName, "rt");
00104       myFileName = fileName;
00105       
00106       if(myFile == NULL)
00107       {
00108          // Failed to open the file.
00109          status = FastQStatus::FASTQ_OPEN_ERROR;
00110       }
00111    }
00112 
00113    if(status != FastQStatus::FASTQ_SUCCESS)
00114    {
00115       // Failed to open the file.
00116       std::string errorMessage = "ERROR: Failed to open file: ";
00117       errorMessage += fileName;
00118       logMessage(errorMessage.c_str());
00119    }
00120    return(status);
00121 }
00122 
00123 
00124 // Close a FastQFile.
00125 FastQStatus::Status FastQFile::closeFile()
00126 {
00127    int closeStatus = 0; // Success.
00128 
00129    // If a file has been opened, close it.
00130    if(myFile != NULL)
00131    {
00132       // Close the file.
00133       closeStatus = ifclose(myFile);
00134       myFile = NULL;
00135    }
00136    if(closeStatus == 0)
00137    {
00138       // Success - either there wasn't a file to close or it was closed
00139       // successfully.
00140       return(FastQStatus::FASTQ_SUCCESS);
00141    }
00142    else
00143    {
00144       std::string errorMessage = "Failed to close file: ";
00145       errorMessage += myFileName.c_str();
00146       logMessage(errorMessage.c_str());
00147       return(FastQStatus::FASTQ_CLOSE_ERROR);
00148    }
00149 }
00150 
00151 
00152 // Check to see if the file is open.
00153 bool FastQFile::isOpen()
00154 {
00155    // Check to see if the file is open.
00156    if((myFile != NULL) && (myFile->isOpen()))
00157    {
00158       // File pointer exists and the file is open.
00159       return true;
00160    }
00161 
00162    // File is not open.
00163    return false;
00164 }
00165 
00166 
00167 // Check to see if the file is at the end of the file.
00168 bool FastQFile::isEof()
00169 {
00170    // Check to see if the file is open.
00171    if((myFile != NULL) && (ifeof(myFile)))
00172    {
00173       // At EOF.
00174       return true;
00175    }
00176 
00177    // Not at EOF.
00178    return false;
00179 }
00180 
00181 
00182 // Returns whether or not to keep reading the file.
00183 // Stop reading (false) if eof or there is a problem reading the file.
00184 bool FastQFile::keepReadingFile()
00185 {
00186    if(isEof() || myFileProblem)
00187    {
00188       return(false);
00189    }
00190    return(true);
00191 }
00192 
00193 
00194 // Validate the specified fastq file
00195 FastQStatus::Status FastQFile::validateFastQFile(const String& filename,
00196                                                  bool printBaseComp,
00197                                                  BaseAsciiMap::SPACE_TYPE spaceType,
00198                                                  bool printQualAvg)
00199 {
00200    // Open the fastqfile.
00201    if(openFile(filename, spaceType) != FastQStatus::FASTQ_SUCCESS)
00202    {
00203       // Failed to open the specified file.
00204       return(FastQStatus::FASTQ_OPEN_ERROR);
00205    }
00206 
00207    // Track the total number of sequences that were validated.
00208    int numSequences = 0;
00209 
00210    // Keep reading the file until there are no more fastq sequences to process
00211    // and not configured to quit after a certain number of errors or there
00212    // has not yet been that many errors.
00213    // Or exit if there is a problem reading the file.
00214    FastQStatus::Status status = FastQStatus::FASTQ_SUCCESS;
00215    while (keepReadingFile() &&
00216           ((myMaxErrors == -1) || (myMaxErrors > myNumErrors)))
00217    {
00218       // Validate one sequence.  This call will read all the lines for 
00219       // one sequence.
00220       status = readFastQSequence();
00221       if((status == FastQStatus::FASTQ_SUCCESS) || (status == FastQStatus::FASTQ_INVALID))
00222       {
00223          // Read a sequence and it is either valid or invalid, but
00224          // either way, a sequence was read, so increment the sequence count.
00225          ++numSequences;
00226       }
00227       else
00228       {
00229          // Other error, so break out of processing.
00230          break;
00231       }
00232    }
00233    
00234    // Report Base Composition Statistics.
00235    if(printBaseComp)
00236    {
00237       myBaseComposition.print();
00238    }
00239 
00240    if(printQualAvg)
00241    {
00242       printAvgQual();
00243    }
00244 
00245    std::string finishMessage = "Finished processing ";
00246    finishMessage += myFileName.c_str();
00247    char buffer[100];
00248    if(sprintf(buffer, 
00249               " with %u lines containing %d sequences.", 
00250               myLineNum, numSequences) > 0)
00251    {
00252       finishMessage += buffer;
00253       logMessage(finishMessage.c_str());
00254    }
00255    if(sprintf(buffer, 
00256               "There were a total of %d errors.", 
00257               myNumErrors) > 0)
00258    {
00259       logMessage(buffer);
00260    }
00261 
00262    // Close the input file.
00263    FastQStatus::Status closeStatus = closeFile();
00264 
00265    if((status != FastQStatus::FASTQ_SUCCESS) && (status != FastQStatus::FASTQ_INVALID))
00266    {
00267       // Stopped validating due to some error other than invalid, so
00268       // return that error.
00269       return(status);
00270    }
00271    else if(myNumErrors == 0)
00272    {
00273       // No errors, check to see if there were any sequences.
00274       // Finished processing all of the sequences in the file.
00275       // If there are no sequences, report an error.
00276       if(numSequences == 0)
00277       {
00278          // Empty file, return error.
00279          logMessage("ERROR: No FastQSequences in the file.");
00280          return(FastQStatus::FASTQ_NO_SEQUENCE_ERROR);
00281       }
00282       return(FastQStatus::FASTQ_SUCCESS);
00283    }
00284    else
00285    {
00286       // The file is invalid.  But check the close status.  If the close
00287       // failed, it means there is a problem with the file itself not just
00288       // with validation, so the close failure should be returned.
00289       if(closeStatus != FastQStatus::FASTQ_SUCCESS)
00290       {
00291          return(closeStatus);
00292       }
00293       return(FastQStatus::FASTQ_INVALID);
00294    }
00295 }
00296 
00297 
00298 // Reads and validates a single fastq sequence from myFile.
00299 FastQStatus::Status FastQFile::readFastQSequence()
00300 {
00301    // First verify that a file is open, if not, return failure.
00302    if(!isOpen())
00303    {
00304       std::string message = 
00305          "ERROR: Trying to read a fastq file but no file is open.";
00306       logMessage(message.c_str());
00307       return(FastQStatus::FASTQ_ORDER_ERROR);
00308    }
00309 
00310    // Reset variables for each sequence.
00311    resetForEachSequence();
00312    
00313    bool valid = true;
00314 
00315    // No sequence was read.
00316    if(isTimeToQuit())
00317    {
00318       return(FastQStatus::FASTQ_NO_SEQUENCE_ERROR);
00319    }
00320 
00321    // The first line is the sequence identifier, so validate that.
00322    valid = validateSequenceIdentifierLine();
00323    
00324    if(myFileProblem)
00325    {
00326       return(FastQStatus::FASTQ_READ_ERROR);
00327    }
00328     
00329    // If we are at the end of the file, check to see if it is a partial
00330    // sequence or just an empty line at the end.
00331    if(ifeof(myFile))
00332    {
00333       // If the sequence identifier line was empty and we are at the
00334       // end of the file, there is nothing more to validate.
00335       if(mySequenceIdLine.Length() != 0)
00336       { 
00337          // There was a sequence identifier line, so this is an incomplete 
00338          // sequence.
00339          myErrorString = "Incomplete Sequence.\n";
00340          reportErrorOnLine();
00341 
00342          valid = false;
00343       }
00344       if(valid)
00345       {
00346          // Return failure - no sequences were left to read.  At the end
00347          // of the file.  It wasn't invalid and it wasn't really an error.
00348          return(FastQStatus::FASTQ_NO_SEQUENCE_ERROR);
00349       }
00350       else
00351       {
00352          return(FastQStatus::FASTQ_INVALID);
00353       }
00354    }
00355 
00356    // If enough errors, quit before reading any more.
00357    if(isTimeToQuit())
00358    {
00359       // Means there was an error, so mark it as invalid.
00360       return(FastQStatus::FASTQ_INVALID);
00361    }
00362 
00363    // Validate the Raw Sequence Line(s) and the "+" line.
00364    valid &= validateRawSequenceAndPlusLines();
00365 
00366    if(myFileProblem)
00367    {
00368       return(FastQStatus::FASTQ_READ_ERROR);
00369    }
00370     
00371    // If enough errors, quit before reading any more.
00372    if(isTimeToQuit())
00373    {
00374       return(FastQStatus::FASTQ_INVALID);
00375    }
00376 
00377    // If it is the end of a file, it is missing the quality string.
00378    if(ifeof(myFile))
00379    {
00380       // There was a sequence identifier line, so this is an incomplete 
00381       // sequence.
00382       myErrorString = "Incomplete Sequence, missing Quality String.";
00383       reportErrorOnLine();
00384       valid = false;
00385       return(FastQStatus::FASTQ_INVALID);
00386    }
00387     
00388    // All that is left is to validate the quality string line(s).
00389    valid &= validateQualityStringLines();
00390 
00391    if(myFileProblem)
00392    {
00393       return(FastQStatus::FASTQ_READ_ERROR);
00394    }
00395     
00396    if(valid)
00397    {
00398       return(FastQStatus::FASTQ_SUCCESS);
00399    }
00400    return(FastQStatus::FASTQ_INVALID);
00401 }
00402 
00403 
00404 // Reads and validates the sequence identifier line of a fastq sequence.
00405 bool FastQFile::validateSequenceIdentifierLine()
00406 {
00407    // Read the first line of the sequence.
00408    int readStatus = mySequenceIdLine.ReadLine(myFile);
00409 
00410    // Check to see if the read was successful.
00411    if(readStatus <= 0)
00412    {
00413       // If EOF, not an error.
00414       if(ifeof(myFile))
00415       {
00416          return true;
00417       }
00418       myFileProblem = true;
00419       myErrorString = "Failure trying to read sequence identifier line";
00420       reportErrorOnLine();
00421       return false;
00422    }
00423 
00424    // If the line is 0 length and it is the end of the file, just
00425    // return since this is the eof - no error.
00426    if((mySequenceIdLine.Length() == 0) && (ifeof(myFile)))
00427    {
00428       // Not an error, just a new line at the end of the file.
00429       return true;
00430    }
00431 
00432    // Increment the line number.
00433    myLineNum++;
00434    
00435    // Verify that the line has at least 2 characters: '@' and at least
00436    // one character for the sequence identifier.
00437    if(mySequenceIdLine.Length() < 2)
00438    {
00439       // Error. Sequence Identifier line not long enough.
00440       myErrorString = "The sequence identifier line was too short.";
00441       reportErrorOnLine();
00442       return false;
00443    }
00444    
00445    // The sequence identifier line must start wtih a '@'
00446    if(mySequenceIdLine[0] != '@')
00447    {
00448       // Error - sequence identifier line does not begin with an '@'.
00449       myErrorString = "First line of a sequence does not begin with @";
00450       reportErrorOnLine();
00451       return false;
00452    }
00453 
00454    // Valid Sequence Identifier Line.
00455 
00456    // The sequence identifier ends at the first space or at the end of the
00457    // line if there is no space.
00458    // Use fast find since this is a case insensitive search.
00459    // Start at 1 since we know that 0 is '@'
00460    int endSequenceIdentifier = mySequenceIdLine.FastFindChar(' ', 1);
00461    
00462    // Check if a " " was found.
00463    if(endSequenceIdentifier == -1)
00464    {
00465       // Did not find a ' ', so the identifier is the rest of the line.
00466       // It starts at 1 since @ is at offset 0.
00467       mySequenceIdentifier = (mySequenceIdLine.SubStr(1)).c_str();
00468    }
00469    else
00470    {
00471       // Found a ' ', so the identifier ends just before that.
00472       // The sequence identifier must be at least 1 character long, 
00473       // therefore the endSequenceIdentifier must be greater than 1.
00474       if(endSequenceIdentifier <= 1)
00475       {
00476          myErrorString = 
00477             "No Sequence Identifier specified before the comment.";
00478          reportErrorOnLine();
00479          return false;
00480       }
00481 
00482       mySequenceIdentifier = 
00483          (mySequenceIdLine.SubStr(1, endSequenceIdentifier - 1)).c_str();
00484    }
00485    
00486    // Check if sequence identifier should be validated for uniqueness.
00487    if(myCheckSeqID)
00488    {
00489        // Check to see if the sequenceIdentifier is a repeat by adding
00490        // it to the set and seeing if it already existed.
00491        std::pair<std::map<std::string, unsigned int>::iterator,bool> insertResult;
00492        insertResult = 
00493            myIdentifierMap.insert(std::make_pair(mySequenceIdentifier.c_str(), 
00494                                                  myLineNum));
00495        
00496        if(insertResult.second == false)
00497        {
00498            // Sequence Identifier is a repeat.
00499            myErrorString = "Repeated Sequence Identifier: ";
00500            myErrorString += mySequenceIdentifier.c_str();
00501            myErrorString += " at Lines ";
00502            myErrorString += insertResult.first->second;
00503            myErrorString += " and ";
00504            myErrorString += myLineNum;
00505            reportErrorOnLine();
00506            return(false);
00507        }
00508    }
00509 
00510    // Valid, return true.
00511    return(true);
00512 }
00513 
00514 
00515 // Reads and validates the raw sequence line(s) and the plus line.  Both are
00516 // included in one method since it is unknown when the raw sequence line
00517 // ends until you find the plus line that divides it from the quality
00518 // string.  Since this method will read the plus line to know when the
00519 // raw sequence ends, it also validates that line.
00520 bool FastQFile::validateRawSequenceAndPlusLines()
00521 {
00522    // Read the raw sequence.
00523    int readStatus = myRawSequence.ReadLine(myFile);
00524 
00525    myLineNum++;
00526 
00527    if(readStatus <= 0)
00528    {
00529       myFileProblem = true;
00530       myErrorString = "Failure trying to read sequence line";
00531       reportErrorOnLine();
00532       return false;
00533    }
00534 
00535    // Offset into the raw sequence to be validated.
00536    int offset = 0;
00537    
00538    // Validate the raw sequence.
00539    bool valid = validateRawSequence(offset);
00540 
00541    // Increment the offset for what was just read.
00542    offset = myRawSequence.Length();
00543 
00544    // The next line is either a continuation of the raw sequence or it starts
00545    // with a '+'
00546    // Keep validating til the '+' line or the end of file is found.
00547    bool stillRawLine = true;
00548    while(stillRawLine && 
00549          !ifeof(myFile))
00550    {
00551       // If enough errors, quit before reading any more.
00552       if(isTimeToQuit())
00553       {
00554          return(false);
00555       }
00556 
00557       // Read the next line.
00558       // Read into the plus line, but if it isn't a plus line, then
00559       // it will be copied into the raw sequence line.
00560       readStatus = myPlusLine.ReadLine(myFile);
00561       myLineNum++;
00562 
00563       if(readStatus <= 0)
00564       {
00565          myFileProblem = true;
00566          myErrorString = "Failure trying to read sequence/plus line";
00567          reportErrorOnLine();
00568          return false;
00569       }
00570 
00571       // Check if the next line is blank
00572       if(myPlusLine.Length() == 0)
00573       {
00574          // The next line is blank.  Assume it is part of the raw sequence and
00575          // report an error since there are no valid characters on the line.
00576          myErrorString = 
00577             "Looking for continuation of Raw Sequence or '+' instead found a blank line, assuming it was part of Raw Sequence.";
00578          reportErrorOnLine();
00579       }
00580       // Check for the plus line.
00581       else if(myPlusLine[0] == '+')
00582       {
00583          // This is the + line.
00584          valid &= validateSequencePlus();
00585          stillRawLine = false;
00586       }
00587       else
00588       {
00589          // Not a plus line, so assume this is a continuation of the Raw
00590          // Sequence.
00591          // Copy from the plus line to the raw sequence line.
00592          myRawSequence += myPlusLine;
00593          myPlusLine.SetLength(0);
00594          valid &= validateRawSequence(offset);
00595          
00596          // Increment the offset.
00597          offset = myRawSequence.Length();
00598       }
00599    }
00600    
00601    // If enough errors, quit before reading any more.
00602    if(isTimeToQuit())
00603    {
00604       return(false);
00605    }
00606    
00607    // Now that the entire raw sequence has been obtained, check its length
00608    // against the minimum allowed length.
00609    if(myRawSequence.Length() < myMinReadLength)
00610    {
00611       // Raw sequence is not long enough - error.
00612       myErrorString = "Raw Sequence is shorter than the min read length: ";
00613       myErrorString += myRawSequence.Length();
00614       myErrorString += " < ";
00615       myErrorString += myMinReadLength;
00616       reportErrorOnLine();
00617       valid = false;
00618    }
00619 
00620    // If enough errors, quit before reading any more.
00621    if(isTimeToQuit())
00622    {
00623       return(false);
00624    }
00625 
00626    // if the flag still indicates it is processing the raw sequence that means
00627    // we reached the end of the file without a '+' line.  So report that error.
00628    if(stillRawLine)
00629    {
00630       myErrorString = "Reached the end of the file without a '+' line.";
00631       reportErrorOnLine();
00632       valid = false;
00633    }
00634 
00635    return(valid);
00636 }
00637 
00638 
00639 // Reads and validates the quality string line(s).
00640 bool FastQFile::validateQualityStringLines()
00641 {
00642    // Read the quality string.
00643    int readStatus = myQualityString.ReadLine(myFile);
00644    myLineNum++;
00645 
00646    if(readStatus <= 0)
00647    {
00648       myFileProblem = true;
00649       myErrorString = "Failure trying to read quality line";
00650       reportErrorOnLine();
00651       return false;
00652    }
00653 
00654    // track the offset into the quality string to validate.
00655    int offset = 0;
00656 
00657    // Validate this line of the quality string.
00658    bool valid = validateQualityString(offset);
00659 
00660    offset = myQualityString.Length();
00661 
00662    // Keep reading quality string lines until the length of the 
00663    // raw sequence has been hit or the end of the file is reached.
00664    while((myQualityString.Length() < myRawSequence.Length()) && 
00665          (!ifeof(myFile)))
00666    {
00667       // If enough errors, quit before reading any more.
00668       if(isTimeToQuit())
00669       {
00670          return(false);
00671       }
00672 
00673       // Read another line of the quality string.
00674       readStatus = myTempPartialQuality.ReadLine(myFile);
00675       myLineNum++;
00676 
00677       if(readStatus <= 0)
00678       {
00679          myFileProblem = true;
00680          myErrorString = "Failure trying to read quality line";
00681          reportErrorOnLine();
00682          return false;
00683       }
00684 
00685       myQualityString += myTempPartialQuality;
00686       myTempPartialQuality.Clear();
00687 
00688       // Validate this line of the quality string.
00689       valid &= validateQualityString(offset);
00690       offset = myQualityString.Length();
00691    }
00692 
00693    // If enough errors, quit before reading any more.
00694    if(isTimeToQuit())
00695    {
00696       return(false);
00697    }
00698 
00699    // Validate that the quality string length is the same as the
00700    // raw sequence length.
00701    if(myQualityString.Length() != myRawSequence.Length()) 
00702    {
00703       myErrorString = "Quality string length (";
00704       myErrorString += myQualityString.Length();
00705       myErrorString += ") does not equal raw sequence length (";
00706       myErrorString += myRawSequence.Length();
00707       myErrorString += ")";
00708       reportErrorOnLine();
00709       valid = false;
00710    }
00711    return(valid);
00712 }
00713 
00714 
00715 // Method to validate a line that contains part of the raw sequence.
00716 bool FastQFile::validateRawSequence(int offset)
00717 {
00718    bool validBase = false; 
00719    bool valid = true;
00720 
00721    // Loop through validating each character is valid for the raw sequence.
00722    for(int sequenceIndex = offset; sequenceIndex < myRawSequence.Length(); 
00723        sequenceIndex++)
00724    {
00725       // Update the composition for this position.  Returns false if the
00726       // character was not a valid base.
00727       validBase = 
00728          myBaseComposition.updateComposition(sequenceIndex, 
00729                                              myRawSequence[sequenceIndex]);
00730       // Check the return
00731       if(!validBase)
00732       {
00733          // Error, found a value that is not a valid base character.
00734          myErrorString = "Invalid character ('";
00735          myErrorString += myRawSequence[sequenceIndex];
00736          myErrorString += "') in base sequence.";
00737          reportErrorOnLine();
00738          valid = false;
00739          // If enough errors, quit before reading any more.
00740          if(isTimeToQuit())
00741          {
00742             return(false);
00743          }
00744       }
00745    }
00746    return(valid);
00747 }
00748 
00749 
00750 // Method to validate the "+" line that seperates the raw sequence and the
00751 // quality string.
00752 bool FastQFile::validateSequencePlus()
00753 {
00754    // Validate that optional sequence identifier is the same
00755    // as the one on the @ line.
00756 
00757    // Check to see if there is more to the line than just the plus
00758    int lineLength = myPlusLine.Length();
00759    
00760    // If the line is only 1 character or the second character is a space,
00761    // then there is no sequence identifier on this line and there is nothing
00762    // further to validate.
00763    if((lineLength == 1) || (myPlusLine[1] == ' '))
00764    {
00765       // No sequence identifier, so just return valid.
00766       return true;
00767    }
00768    
00769    // There is a sequence identifier on this line, so validate that
00770    // it matches the one from the associated @ line.
00771    // The read in line must be at least 1 more character ('+') than the
00772    // sequence identifier read from the '@' line.
00773    // If it is not longer than the sequence identifier, then we know that it
00774    // cannot be the same.
00775    int sequenceIdentifierLength = mySequenceIdentifier.Length();
00776    if(lineLength <= sequenceIdentifierLength)
00777    {
00778       myErrorString = 
00779          "Sequence Identifier on '+' line does not equal the one on the '@' line.";
00780       reportErrorOnLine();
00781       return false;
00782    }
00783 
00784    bool same = true;
00785    int seqIndex = 0;
00786    int lineIndex = 1;  // Start at 1 since offset 0 has '+'
00787 
00788    // Loop through the sequence index and the line buffer verifying they
00789    // are the same until a difference is found or the end of the sequence
00790    // identifier is found.
00791    while((same == true) && (seqIndex < sequenceIdentifierLength))
00792    {
00793       if(myPlusLine[lineIndex] != mySequenceIdentifier[seqIndex])
00794       {
00795          myErrorString = 
00796             "Sequence Identifier on '+' line does not equal the one on the '@' line.";
00797          reportErrorOnLine();
00798          same = false;
00799       }
00800       lineIndex++;
00801       seqIndex++;
00802    }
00803    return(same);
00804 }
00805 
00806 
00807 // Method to validate the quality string.
00808 bool FastQFile::validateQualityString(int offset)
00809 {
00810    bool valid = true;
00811    if(myQualityString.Length() > (int)(myQualPerCycle.size()))
00812    {
00813        myQualPerCycle.resize(myQualityString.Length());
00814        myCountPerCycle.resize(myQualityString.Length());
00815    }
00816    // For each character in the line, verify that it is ascii > 32.
00817    for(int i=offset; i < myQualityString.Length(); i++)
00818    {
00819       if(myQualityString[i] <= 32)
00820       {
00821          myErrorString = "Invalid character ('";
00822          myErrorString += myQualityString[i];
00823          myErrorString += "') in quality string.";
00824          reportErrorOnLine();
00825          valid = false;
00826          // If enough errors, quit before reading any more.
00827          if(isTimeToQuit())
00828          {
00829             return(false);
00830          }
00831       }
00832       else
00833       {
00834           myQualPerCycle[i] += BaseUtilities::getPhredBaseQuality(myQualityString[i]);
00835           myCountPerCycle[i] += 1;
00836       }
00837    }
00838    return(valid);
00839 }
00840 
00841 
00842 // Helper method for printing the contents of myErrorString.  It will
00843 // only print the errors until the maximum number of reportable errors is
00844 // reached.
00845 void FastQFile::reportErrorOnLine()
00846 {
00847    // Increment the total number of errors.
00848    myNumErrors++;
00849    
00850    // Only display the first X number of errors.
00851    if(myNumErrors <= myNumPrintableErrors)
00852    {
00853       // Write the error with the line number.
00854       char buffer[100];
00855       sprintf(buffer, "ERROR on Line %u: ", myLineNum);
00856       std::string message = buffer;
00857       message += myErrorString.c_str();
00858       logMessage(message.c_str());
00859    }
00860 }
00861 
00862 
00863 // Reset member data that is unique for each fastQFile.
00864 void FastQFile::reset()
00865 {
00866    // Each fastq file processing needs to also reset the member data for each
00867    // sequence.
00868    resetForEachSequence();
00869    myNumErrors = 0;  // per fastqfile
00870    myLineNum = 0;    // per fastqfile
00871    myFileName.SetLength(0);  // reset the filename string.
00872    myIdentifierMap.clear(); // per fastqfile
00873    myBaseComposition.clear(); // clear the base composition.
00874    myQualPerCycle.clear();
00875    myCountPerCycle.clear();
00876    myFileProblem = false;
00877 }
00878 
00879 
00880 // Reset the member data that is unique for each sequence.
00881 void FastQFile::resetForEachSequence()
00882 {
00883    myLineBuffer.SetLength(0);
00884    myErrorString.SetLength(0);
00885    myRawSequence.SetLength(0);
00886    mySequenceIdLine.SetLength(0);
00887    mySequenceIdentifier.SetLength(0);
00888    myPlusLine.SetLength(0);
00889    myQualityString.SetLength(0);
00890    myTempPartialQuality.SetLength(0);
00891 }
00892 
00893 
00894 void FastQFile::logMessage(const char* logMessage)
00895 {
00896    // Write the message if they are not disabled.
00897    if(!myDisableMessages)
00898    {
00899       std::cout << logMessage << std::endl;
00900    }
00901 }
00902 
00903 
00904 // Determine if it is time to quit by checking if we are to quit after a
00905 // certain number of errors and that many errors have been encountered.
00906 bool FastQFile::isTimeToQuit()
00907 {
00908    // It is time to quit if we are to quit after a certain number of errors
00909    // and that many errors have been encountered.
00910    if((myMaxErrors != -1) && (myNumErrors >= myMaxErrors))
00911    {
00912       return(true);
00913    }
00914    return(false);
00915 }
00916 
00917 
00918 void FastQFile::printAvgQual()
00919 {
00920    std::cout << std::endl << "Average Phred Quality by Read Index (starts at 0):" << std::endl;
00921    std::cout.precision(2);
00922    std::cout << std::fixed << "Read Index\tAverage Quality" 
00923              << std::endl;
00924    if(myQualPerCycle.size() != myCountPerCycle.size())
00925    {
00926        // This is a code error and should NEVER happen.
00927        std::cerr << "ERROR calculating the average Qualities per cycle\n";
00928    }
00929 
00930    double sumQual = 0;
00931    double count = 0;
00932    double avgQual = 0;
00933    for(unsigned int i = 0; i < myQualPerCycle.size(); i++)
00934    {
00935        avgQual = 0;
00936        if(myCountPerCycle[i] != 0)
00937        {
00938            avgQual = myQualPerCycle[i] / (double)(myCountPerCycle[i]);
00939        }
00940        std::cout << i << "\t" << avgQual << "\n";
00941        sumQual += myQualPerCycle[i];
00942        count += myCountPerCycle[i];
00943    }
00944    std::cout << std::endl;
00945    avgQual = 0;
00946    if(count != 0)
00947    {
00948        avgQual = sumQual / count;
00949    }
00950    std::cout << "Overall Average Phred Quality = " << avgQual << std::endl;
00951 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends