FastQFile.cpp

00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #include <iostream>
00019 
00020 #include "InputFile.h"
00021 #include "FastQFile.h"
00022 
00023 
00024 // Constructor.
00025 // minReadLength - The minimum length that a base sequence must be for
00026 //                 it to be valid.
00027 // numPrintableErrors - The maximum number of errors that should be reported
00028 //                      in detail before suppressing the errors.
00029 // 
00030 FastQFile::FastQFile(int minReadLength, int numPrintableErrors)
00031    : myFile(NULL),
00032      myBaseComposition(),
00033      myCheckSeqID(true),
00034      myMinReadLength(minReadLength),
00035      myNumPrintableErrors(numPrintableErrors),
00036      myMaxErrors(-1),
00037      myDisableMessages(false),
00038      myFileProblem(false)
00039 {
00040    // Reset the member data.
00041    reset();
00042 }
00043 
00044 
00045 void FastQFile::disableMessages()
00046 {
00047    myDisableMessages = true;
00048 }
00049 
00050 
00051 void FastQFile::enableMessages()
00052 {
00053    myDisableMessages = false;
00054 }
00055 
00056 
00057 // Disable Unique Sequence ID checking.  
00058 // Unique Sequence ID checking is enabled by default.
00059 void FastQFile::disableSeqIDCheck()
00060 {
00061     myCheckSeqID = false;
00062 }
00063 
00064 
00065 // Enable Unique Sequence ID checking.
00066 // Unique Sequence ID checking is enabled by default.
00067 void FastQFile::enableSeqIDCheck()
00068 {
00069     myCheckSeqID = true;
00070 }
00071 
00072 
00073 // Set the number of errors after which to quit reading/validating a file.
00074 void FastQFile::setMaxErrors(int maxErrors)
00075 {
00076    myMaxErrors = maxErrors;
00077 }
00078 
00079 
00080 // Open a FastQFile.
00081 FastQStatus::Status FastQFile::openFile(const char* fileName,
00082                                         BaseAsciiMap::SPACE_TYPE spaceType)
00083 {
00084    // reset the member data.
00085    reset();
00086 
00087    myBaseComposition.resetBaseMapType();
00088    myBaseComposition.setBaseMapType(spaceType);
00089 
00090    FastQStatus::Status status = FastQStatus::FASTQ_SUCCESS;
00091 
00092    // Close the file if there is already one open - checked by close.
00093    status = closeFile();
00094    if(status == FastQStatus::FASTQ_SUCCESS)
00095    {
00096       // Successfully closed a previously opened file if there was one.
00097       
00098       // Open the file
00099       myFile = ifopen(fileName, "rt");
00100       myFileName = fileName;
00101       
00102       if(myFile == NULL)
00103       {
00104          // Failed to open the file.
00105          status = FastQStatus::FASTQ_OPEN_ERROR;
00106       }
00107    }
00108 
00109    if(status != FastQStatus::FASTQ_SUCCESS)
00110    {
00111       // Failed to open the file.
00112       std::string errorMessage = "ERROR: Failed to open file: ";
00113       errorMessage += fileName;
00114       logMessage(errorMessage.c_str());
00115    }
00116    return(status);
00117 }
00118 
00119 
00120 // Close a FastQFile.
00121 FastQStatus::Status FastQFile::closeFile()
00122 {
00123    int closeStatus = 0; // Success.
00124 
00125    // If a file has been opened, close it.
00126    if(myFile != NULL)
00127    {
00128       // Close the file.
00129       closeStatus = ifclose(myFile);
00130       myFile = NULL;
00131    }
00132    if(closeStatus == 0)
00133    {
00134       // Success - either there wasn't a file to close or it was closed
00135       // successfully.
00136       return(FastQStatus::FASTQ_SUCCESS);
00137    }
00138    else
00139    {
00140       std::string errorMessage = "Failed to close file: ";
00141       errorMessage += myFileName.c_str();
00142       logMessage(errorMessage.c_str());
00143       return(FastQStatus::FASTQ_CLOSE_ERROR);
00144    }
00145 }
00146 
00147 
00148 // Check to see if the file is open.
00149 bool FastQFile::isOpen()
00150 {
00151    // Check to see if the file is open.
00152    if((myFile != NULL) && (myFile->isOpen()))
00153    {
00154       // File pointer exists and the file is open.
00155       return true;
00156    }
00157 
00158    // File is not open.
00159    return false;
00160 }
00161 
00162 
00163 // Check to see if the file is at the end of the file.
00164 bool FastQFile::isEof()
00165 {
00166    // Check to see if the file is open.
00167    if((myFile != NULL) && (ifeof(myFile)))
00168    {
00169       // At EOF.
00170       return true;
00171    }
00172 
00173    // Not at EOF.
00174    return false;
00175 }
00176 
00177 
00178 // Returns whether or not to keep reading the file.
00179 // Stop reading (false) if eof or there is a problem reading the file.
00180 bool FastQFile::keepReadingFile()
00181 {
00182    if(isEof() || myFileProblem)
00183    {
00184       return(false);
00185    }
00186    return(true);
00187 }
00188 
00189 
00190 // Validate the specified fastq file
00191 FastQStatus::Status FastQFile::validateFastQFile(const String& filename,
00192                                                  bool printBaseComp,
00193                                                  BaseAsciiMap::SPACE_TYPE spaceType)
00194 {
00195    // Open the fastqfile.
00196    if(openFile(filename, spaceType) != FastQStatus::FASTQ_SUCCESS)
00197    {
00198       // Failed to open the specified file.
00199       return(FastQStatus::FASTQ_OPEN_ERROR);
00200    }
00201 
00202    // Track the total number of sequences that were validated.
00203    int numSequences = 0;
00204 
00205    // Keep reading the file until there are no more fastq sequences to process
00206    // and not configured to quit after a certain number of errors or there
00207    // has not yet been that many errors.
00208    // Or exit if there is a problem reading the file.
00209    FastQStatus::Status status = FastQStatus::FASTQ_SUCCESS;
00210    while (keepReadingFile() &&
00211           ((myMaxErrors == -1) || (myMaxErrors > myNumErrors)))
00212    {
00213       // Validate one sequence.  This call will read all the lines for 
00214       // one sequence.
00215       status = readFastQSequence();
00216       if((status == FastQStatus::FASTQ_SUCCESS) || (status == FastQStatus::FASTQ_INVALID))
00217       {
00218          // Read a sequence and it is either valid or invalid, but
00219          // either way, a sequence was read, so increment the sequence count.
00220          ++numSequences;
00221       }
00222       else
00223       {
00224          // Other error, so break out of processing.
00225          break;
00226       }
00227    }
00228    
00229    // Report Base Composition Statistics.
00230    if(printBaseComp)
00231    {
00232       myBaseComposition.print();
00233    }
00234 
00235    std::string finishMessage = "Finished processing ";
00236    finishMessage += myFileName.c_str();
00237    char buffer[100];
00238    if(sprintf(buffer, 
00239               " with %u lines containing %d sequences.", 
00240               myLineNum, numSequences) > 0)
00241    {
00242       finishMessage += buffer;
00243       logMessage(finishMessage.c_str());
00244    }
00245    if(sprintf(buffer, 
00246               "There were a total of %d errors.", 
00247               myNumErrors) > 0)
00248    {
00249       logMessage(buffer);
00250    }
00251 
00252    // Close the input file.
00253    FastQStatus::Status closeStatus = closeFile();
00254 
00255    if((status != FastQStatus::FASTQ_SUCCESS) && (status != FastQStatus::FASTQ_INVALID))
00256    {
00257       // Stopped validating due to some error other than invalid, so
00258       // return that error.
00259       return(status);
00260    }
00261    else if(myNumErrors == 0)
00262    {
00263       // No errors, check to see if there were any sequences.
00264       // Finished processing all of the sequences in the file.
00265       // If there are no sequences, report an error.
00266       if(numSequences == 0)
00267       {
00268          // Empty file, return error.
00269          logMessage("ERROR: No FastQSequences in the file.");
00270          return(FastQStatus::FASTQ_NO_SEQUENCE_ERROR);
00271       }
00272       return(FastQStatus::FASTQ_SUCCESS);
00273    }
00274    else
00275    {
00276       // The file is invalid.  But check the close status.  If the close
00277       // failed, it means there is a problem with the file itself not just
00278       // with validation, so the close failure should be returned.
00279       if(closeStatus != FastQStatus::FASTQ_SUCCESS)
00280       {
00281          return(closeStatus);
00282       }
00283       return(FastQStatus::FASTQ_INVALID);
00284    }
00285 }
00286 
00287 
00288 // Reads and validates a single fastq sequence from myFile.
00289 FastQStatus::Status FastQFile::readFastQSequence()
00290 {
00291    // First verify that a file is open, if not, return failure.
00292    if(!isOpen())
00293    {
00294       std::string message = 
00295          "ERROR: Trying to read a fastq file but no file is open.";
00296       logMessage(message.c_str());
00297       return(FastQStatus::FASTQ_ORDER_ERROR);
00298    }
00299 
00300    // Reset variables for each sequence.
00301    resetForEachSequence();
00302    
00303    bool valid = true;
00304 
00305    // No sequence was read.
00306    if(isTimeToQuit())
00307    {
00308       return(FastQStatus::FASTQ_NO_SEQUENCE_ERROR);
00309    }
00310 
00311    // The first line is the sequence identifier, so validate that.
00312    valid = validateSequenceIdentifierLine();
00313    
00314    if(myFileProblem)
00315    {
00316       return(FastQStatus::FASTQ_READ_ERROR);
00317    }
00318     
00319    // If we are at the end of the file, check to see if it is a partial
00320    // sequence or just an empty line at the end.
00321    if(ifeof(myFile))
00322    {
00323       // If the sequence identifier line was empty and we are at the
00324       // end of the file, there is nothing more to validate.
00325       if(mySequenceIdLine.Length() != 0)
00326       { 
00327          // There was a sequence identifier line, so this is an incomplete 
00328          // sequence.
00329          myErrorString = "Incomplete Sequence.\n";
00330          reportErrorOnLine();
00331 
00332          valid = false;
00333       }
00334       if(valid)
00335       {
00336          // Return failure - no sequences were left to read.  At the end
00337          // of the file.  It wasn't invalid and it wasn't really an error.
00338          return(FastQStatus::FASTQ_NO_SEQUENCE_ERROR);
00339       }
00340       else
00341       {
00342          return(FastQStatus::FASTQ_INVALID);
00343       }
00344    }
00345 
00346    // If enough errors, quit before reading any more.
00347    if(isTimeToQuit())
00348    {
00349       // Means there was an error, so mark it as invalid.
00350       return(FastQStatus::FASTQ_INVALID);
00351    }
00352 
00353    // Validate the Raw Sequence Line(s) and the "+" line.
00354    valid &= validateRawSequenceAndPlusLines();
00355 
00356    if(myFileProblem)
00357    {
00358       return(FastQStatus::FASTQ_READ_ERROR);
00359    }
00360     
00361    // If enough errors, quit before reading any more.
00362    if(isTimeToQuit())
00363    {
00364       return(FastQStatus::FASTQ_INVALID);
00365    }
00366 
00367    // If it is the end of a file, it is missing the quality string.
00368    if(ifeof(myFile))
00369    {
00370       // There was a sequence identifier line, so this is an incomplete 
00371       // sequence.
00372       myErrorString = "Incomplete Sequence, missing Quality String.";
00373       reportErrorOnLine();
00374       valid = false;
00375       return(FastQStatus::FASTQ_INVALID);
00376    }
00377     
00378    // All that is left is to validate the quality string line(s).
00379    valid &= validateQualityStringLines();
00380 
00381    if(myFileProblem)
00382    {
00383       return(FastQStatus::FASTQ_READ_ERROR);
00384    }
00385     
00386    if(valid)
00387    {
00388       return(FastQStatus::FASTQ_SUCCESS);
00389    }
00390    return(FastQStatus::FASTQ_INVALID);
00391 }
00392 
00393 
00394 // Reads and validates the sequence identifier line of a fastq sequence.
00395 bool FastQFile::validateSequenceIdentifierLine()
00396 {
00397    // Read the first line of the sequence.
00398    int readStatus = mySequenceIdLine.ReadLine(myFile);
00399 
00400    // Check to see if the read was successful.
00401    if(readStatus <= 0)
00402    {
00403       // If EOF, not an error.
00404       if(ifeof(myFile))
00405       {
00406          return true;
00407       }
00408       myFileProblem = true;
00409       myErrorString = "Failure trying to read sequence identifier line";
00410       reportErrorOnLine();
00411       return false;
00412    }
00413 
00414    // If the line is 0 length and it is the end of the file, just
00415    // return since this is the eof - no error.
00416    if((mySequenceIdLine.Length() == 0) && (ifeof(myFile)))
00417    {
00418       // Not an error, just a new line at the end of the file.
00419       return true;
00420    }
00421 
00422    // Increment the line number.
00423    myLineNum++;
00424    
00425    // Verify that the line has at least 2 characters: '@' and at least
00426    // one character for the sequence identifier.
00427    if(mySequenceIdLine.Length() < 2)
00428    {
00429       // Error. Sequence Identifier line not long enough.
00430       myErrorString = "The sequence identifier line was too short.";
00431       reportErrorOnLine();
00432       return false;
00433    }
00434    
00435    // The sequence identifier line must start wtih a '@'
00436    if(mySequenceIdLine[0] != '@')
00437    {
00438       // Error - sequence identifier line does not begin with an '@'.
00439       myErrorString = "First line of a sequence does not begin with @";
00440       reportErrorOnLine();
00441       return false;
00442    }
00443 
00444    // Valid Sequence Identifier Line.
00445 
00446    // The sequence identifier ends at the first space or at the end of the
00447    // line if there is no space.
00448    // Use fast find since this is a case insensitive search.
00449    // Start at 1 since we know that 0 is '@'
00450    int endSequenceIdentifier = mySequenceIdLine.FastFindChar(' ', 1);
00451    
00452    // Check if a " " was found.
00453    if(endSequenceIdentifier == -1)
00454    {
00455       // Did not find a ' ', so the identifier is the rest of the line.
00456       // It starts at 1 since @ is at offset 0.
00457       mySequenceIdentifier = (mySequenceIdLine.SubStr(1)).c_str();
00458    }
00459    else
00460    {
00461       // Found a ' ', so the identifier ends just before that.
00462       // The sequence identifier must be at least 1 character long, 
00463       // therefore the endSequenceIdentifier must be greater than 1.
00464       if(endSequenceIdentifier <= 1)
00465       {
00466          myErrorString = 
00467             "No Sequence Identifier specified before the comment.";
00468          reportErrorOnLine();
00469          return false;
00470       }
00471 
00472       mySequenceIdentifier = 
00473          (mySequenceIdLine.SubStr(1, endSequenceIdentifier - 1)).c_str();
00474    }
00475    
00476    // Check if sequence identifier should be validated for uniqueness.
00477    if(myCheckSeqID)
00478    {
00479        // Check to see if the sequenceIdentifier is a repeat by adding
00480        // it to the set and seeing if it already existed.
00481        std::pair<std::map<std::string, unsigned int>::iterator,bool> insertResult;
00482        insertResult = 
00483            myIdentifierMap.insert(std::make_pair(mySequenceIdentifier.c_str(), 
00484                                                  myLineNum));
00485        
00486        if(insertResult.second == false)
00487        {
00488            // Sequence Identifier is a repeat.
00489            myErrorString = "Repeated Sequence Identifier: ";
00490            myErrorString += mySequenceIdentifier.c_str();
00491            myErrorString += " at Lines ";
00492            myErrorString += insertResult.first->second;
00493            myErrorString += " and ";
00494            myErrorString += myLineNum;
00495            reportErrorOnLine();
00496            return(false);
00497        }
00498    }
00499 
00500    // Valid, return true.
00501    return(true);
00502 }
00503 
00504 
00505 // Reads and validates the raw sequence line(s) and the plus line.  Both are
00506 // included in one method since it is unknown when the raw sequence line
00507 // ends until you find the plus line that divides it from the quality
00508 // string.  Since this method will read the plus line to know when the
00509 // raw sequence ends, it also validates that line.
00510 bool FastQFile::validateRawSequenceAndPlusLines()
00511 {
00512    // Read the raw sequence.
00513    int readStatus = myRawSequence.ReadLine(myFile);
00514 
00515    myLineNum++;
00516 
00517    if(readStatus <= 0)
00518    {
00519       myFileProblem = true;
00520       myErrorString = "Failure trying to read sequence line";
00521       reportErrorOnLine();
00522       return false;
00523    }
00524 
00525    // Offset into the raw sequence to be validated.
00526    int offset = 0;
00527    
00528    // Validate the raw sequence.
00529    bool valid = validateRawSequence(offset);
00530 
00531    // Increment the offset for what was just read.
00532    offset = myRawSequence.Length();
00533 
00534    // The next line is either a continuation of the raw sequence or it starts
00535    // with a '+'
00536    // Keep validating til the '+' line or the end of file is found.
00537    bool stillRawLine = true;
00538    while(stillRawLine && 
00539          !ifeof(myFile))
00540    {
00541       // If enough errors, quit before reading any more.
00542       if(isTimeToQuit())
00543       {
00544          return(false);
00545       }
00546 
00547       // Read the next line.
00548       // Read into the plus line, but if it isn't a plus line, then
00549       // it will be copied into the raw sequence line.
00550       readStatus = myPlusLine.ReadLine(myFile);
00551       myLineNum++;
00552 
00553       if(readStatus <= 0)
00554       {
00555          myFileProblem = true;
00556          myErrorString = "Failure trying to read sequence/plus line";
00557          reportErrorOnLine();
00558          return false;
00559       }
00560 
00561       // Check if the next line is blank
00562       if(myPlusLine.Length() == 0)
00563       {
00564          // The next line is blank.  Assume it is part of the raw sequence and
00565          // report an error since there are no valid characters on the line.
00566          myErrorString = 
00567             "Looking for continuation of Raw Sequence or '+' instead found a blank line, assuming it was part of Raw Sequence.";
00568          reportErrorOnLine();
00569       }
00570       // Check for the plus line.
00571       else if(myPlusLine[0] == '+')
00572       {
00573          // This is the + line.
00574          valid &= validateSequencePlus();
00575          stillRawLine = false;
00576       }
00577       else
00578       {
00579          // Not a plus line, so assume this is a continuation of the Raw
00580          // Sequence.
00581          // Copy from the plus line to the raw sequence line.
00582          myRawSequence += myPlusLine;
00583          myPlusLine.SetLength(0);
00584          valid &= validateRawSequence(offset);
00585          
00586          // Increment the offset.
00587          offset = myRawSequence.Length();
00588       }
00589    }
00590    
00591    // If enough errors, quit before reading any more.
00592    if(isTimeToQuit())
00593    {
00594       return(false);
00595    }
00596    
00597    // Now that the entire raw sequence has been obtained, check its length
00598    // against the minimum allowed length.
00599    if(myRawSequence.Length() < myMinReadLength)
00600    {
00601       // Raw sequence is not long enough - error.
00602       myErrorString = "Raw Sequence is shorter than the min read length: ";
00603       myErrorString += myRawSequence.Length();
00604       myErrorString += " < ";
00605       myErrorString += myMinReadLength;
00606       reportErrorOnLine();
00607       valid = false;
00608    }
00609 
00610    // If enough errors, quit before reading any more.
00611    if(isTimeToQuit())
00612    {
00613       return(false);
00614    }
00615 
00616    // if the flag still indicates it is processing the raw sequence that means
00617    // we reached the end of the file without a '+' line.  So report that error.
00618    if(stillRawLine)
00619    {
00620       myErrorString = "Reached the end of the file without a '+' line.";
00621       reportErrorOnLine();
00622       valid = false;
00623    }
00624 
00625    return(valid);
00626 }
00627 
00628 
00629 // Reads and validates the quality string line(s).
00630 bool FastQFile::validateQualityStringLines()
00631 {
00632    // Read the quality string.
00633    int readStatus = myQualityString.ReadLine(myFile);
00634    myLineNum++;
00635 
00636    if(readStatus <= 0)
00637    {
00638       myFileProblem = true;
00639       myErrorString = "Failure trying to read quality line";
00640       reportErrorOnLine();
00641       return false;
00642    }
00643 
00644    // track the offset into the quality string to validate.
00645    int offset = 0;
00646 
00647    // Validate this line of the quality string.
00648    bool valid = validateQualityString(offset);
00649 
00650    offset = myQualityString.Length();
00651 
00652    // Keep reading quality string lines until the length of the 
00653    // raw sequence has been hit or the end of the file is reached.
00654    while((myQualityString.Length() < myRawSequence.Length()) && 
00655          (!ifeof(myFile)))
00656    {
00657       // If enough errors, quit before reading any more.
00658       if(isTimeToQuit())
00659       {
00660          return(false);
00661       }
00662 
00663       // Read another line of the quality string.
00664       readStatus = myTempPartialQuality.ReadLine(myFile);
00665       myLineNum++;
00666 
00667       if(readStatus <= 0)
00668       {
00669          myFileProblem = true;
00670          myErrorString = "Failure trying to read quality line";
00671          reportErrorOnLine();
00672          return false;
00673       }
00674 
00675       myQualityString += myTempPartialQuality;
00676       myTempPartialQuality.Clear();
00677 
00678       // Validate this line of the quality string.
00679       valid &= validateQualityString(offset);
00680       offset = myQualityString.Length();
00681    }
00682 
00683    // If enough errors, quit before reading any more.
00684    if(isTimeToQuit())
00685    {
00686       return(false);
00687    }
00688 
00689    // Validate that the quality string length is the same as the
00690    // raw sequence length.
00691    if(myQualityString.Length() != myRawSequence.Length()) 
00692    {
00693       myErrorString = "Quality string length (";
00694       myErrorString += myQualityString.Length();
00695       myErrorString += ") does not equal raw sequence length (";
00696       myErrorString += myRawSequence.Length();
00697       myErrorString += ")";
00698       reportErrorOnLine();
00699       valid = false;
00700    }
00701    return(valid);
00702 }
00703 
00704 
00705 // Method to validate a line that contains part of the raw sequence.
00706 bool FastQFile::validateRawSequence(int offset)
00707 {
00708    bool validBase = false; 
00709    bool valid = true;
00710 
00711    // Loop through validating each character is valid for the raw sequence.
00712    for(int sequenceIndex = offset; sequenceIndex < myRawSequence.Length(); 
00713        sequenceIndex++)
00714    {
00715       // Update the composition for this position.  Returns false if the
00716       // character was not a valid base.
00717       validBase = 
00718          myBaseComposition.updateComposition(sequenceIndex, 
00719                                              myRawSequence[sequenceIndex]);
00720       // Check the return
00721       if(!validBase)
00722       {
00723          // Error, found a value that is not a valid base character.
00724          myErrorString = "Invalid character ('";
00725          myErrorString += myRawSequence[sequenceIndex];
00726          myErrorString += "') in base sequence.";
00727          reportErrorOnLine();
00728          valid = false;
00729          // If enough errors, quit before reading any more.
00730          if(isTimeToQuit())
00731          {
00732             return(false);
00733          }
00734       }
00735    }
00736    return(valid);
00737 }
00738 
00739 
00740 // Method to validate the "+" line that seperates the raw sequence and the
00741 // quality string.
00742 bool FastQFile::validateSequencePlus()
00743 {
00744    // Validate that optional sequence identifier is the same
00745    // as the one on the @ line.
00746 
00747    // Check to see if there is more to the line than just the plus
00748    int lineLength = myPlusLine.Length();
00749    
00750    // If the line is only 1 character or the second character is a space,
00751    // then there is no sequence identifier on this line and there is nothing
00752    // further to validate.
00753    if((lineLength == 1) || (myPlusLine[1] == ' '))
00754    {
00755       // No sequence identifier, so just return valid.
00756       return true;
00757    }
00758    
00759    // There is a sequence identifier on this line, so validate that
00760    // it matches the one from the associated @ line.
00761    // The read in line must be at least 1 more character ('+') than the
00762    // sequence identifier read from the '@' line.
00763    // If it is not longer than the sequence identifier, then we know that it
00764    // cannot be the same.
00765    int sequenceIdentifierLength = mySequenceIdentifier.Length();
00766    if(lineLength <= sequenceIdentifierLength)
00767    {
00768       myErrorString = 
00769          "Sequence Identifier on '+' line does not equal the one on the '@' line.";
00770       reportErrorOnLine();
00771       return false;
00772    }
00773 
00774    bool same = true;
00775    int seqIndex = 0;
00776    int lineIndex = 1;  // Start at 1 since offset 0 has '+'
00777 
00778    // Loop through the sequence index and the line buffer verifying they
00779    // are the same until a difference is found or the end of the sequence
00780    // identifier is found.
00781    while((same == true) && (seqIndex < sequenceIdentifierLength))
00782    {
00783       if(myPlusLine[lineIndex] != mySequenceIdentifier[seqIndex])
00784       {
00785          myErrorString = 
00786             "Sequence Identifier on '+' line does not equal the one on the '@' line.";
00787          reportErrorOnLine();
00788          same = false;
00789       }
00790       lineIndex++;
00791       seqIndex++;
00792    }
00793    return(same);
00794 }
00795 
00796 
00797 // Method to validate the quality string.
00798 bool FastQFile::validateQualityString(int offset)
00799 {
00800    bool valid = true;
00801    // For each character in the line, verify that it is ascii > 32.
00802    for(int i=offset; i < myQualityString.Length(); i++)
00803    {
00804       if(myQualityString[i] <= 32)
00805       {
00806          myErrorString = "Invalid character ('";
00807          myErrorString += myQualityString[i];
00808          myErrorString += "') in quality string.";
00809          reportErrorOnLine();
00810          valid = false;
00811          // If enough errors, quit before reading any more.
00812          if(isTimeToQuit())
00813          {
00814             return(false);
00815          }
00816       }
00817    }
00818    return(valid);
00819 }
00820 
00821 
00822 // Helper method for printing the contents of myErrorString.  It will
00823 // only print the errors until the maximum number of reportable errors is
00824 // reached.
00825 void FastQFile::reportErrorOnLine()
00826 {
00827    // Increment the total number of errors.
00828    myNumErrors++;
00829    
00830    // Only display the first X number of errors.
00831    if(myNumErrors <= myNumPrintableErrors)
00832    {
00833       // Write the error with the line number.
00834       char buffer[100];
00835       sprintf(buffer, "ERROR on Line %u: ", myLineNum);
00836       std::string message = buffer;
00837       message += myErrorString.c_str();
00838       logMessage(message.c_str());
00839    }
00840 }
00841 
00842 
00843 // Reset member data that is unique for each fastQFile.
00844 void FastQFile::reset()
00845 {
00846    // Each fastq file processing needs to also reset the member data for each
00847    // sequence.
00848    resetForEachSequence();
00849    myNumErrors = 0;  // per fastqfile
00850    myLineNum = 0;    // per fastqfile
00851    myFileName.SetLength(0);  // reset the filename string.
00852    myIdentifierMap.clear(); // per fastqfile
00853    myBaseComposition.clear(); // clear the base composition.
00854    myFileProblem = false;
00855 }
00856 
00857 
00858 // Reset the member data that is unique for each sequence.
00859 void FastQFile::resetForEachSequence()
00860 {
00861    myLineBuffer.SetLength(0);
00862    myErrorString.SetLength(0);
00863    myRawSequence.SetLength(0);
00864    mySequenceIdLine.SetLength(0);
00865    mySequenceIdentifier.SetLength(0);
00866    myPlusLine.SetLength(0);
00867    myQualityString.SetLength(0);
00868    myTempPartialQuality.SetLength(0);
00869 }
00870 
00871 
00872 void FastQFile::logMessage(const char* logMessage)
00873 {
00874    // Write the message if they are not disabled.
00875    if(!myDisableMessages)
00876    {
00877       std::cout << logMessage << std::endl;
00878    }
00879 }
00880 
00881 
00882 // Determine if it is time to quit by checking if we are to quit after a
00883 // certain number of errors and that many errors have been encountered.
00884 bool FastQFile::isTimeToQuit()
00885 {
00886    // It is time to quit if we are to quit after a certain number of errors
00887    // and that many errors have been encountered.
00888    if((myMaxErrors != -1) && (myNumErrors >= myMaxErrors))
00889    {
00890       return(true);
00891    }
00892    return(false);
00893 }
Generated on Tue Sep 6 17:52:00 2011 for libStatGen Software by  doxygen 1.6.3