libStatGen Software
1
|
00001 /* 00002 * Copyright (C) 2010 Regents of the University of Michigan 00003 * 00004 * This program is free software: you can redistribute it and/or modify 00005 * it under the terms of the GNU General Public License as published by 00006 * the Free Software Foundation, either version 3 of the License, or 00007 * (at your option) any later version. 00008 * 00009 * This program is distributed in the hope that it will be useful, 00010 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 * GNU General Public License for more details. 00013 * 00014 * You should have received a copy of the GNU General Public License 00015 * along with this program. If not, see <http://www.gnu.org/licenses/>. 00016 */ 00017 00018 #include <iostream> 00019 00020 #include "InputFile.h" 00021 #include "FastQFile.h" 00022 #include "BaseUtilities.h" 00023 00024 // Constructor. 00025 // minReadLength - The minimum length that a base sequence must be for 00026 // it to be valid. 00027 // numPrintableErrors - The maximum number of errors that should be reported 00028 // in detail before suppressing the errors. 00029 // 00030 FastQFile::FastQFile(int minReadLength, int numPrintableErrors) 00031 : myFile(NULL), 00032 myBaseComposition(), 00033 myQualPerCycle(), 00034 myCountPerCycle(), 00035 myCheckSeqID(true), 00036 myMinReadLength(minReadLength), 00037 myNumPrintableErrors(numPrintableErrors), 00038 myMaxErrors(-1), 00039 myDisableMessages(false), 00040 myFileProblem(false) 00041 { 00042 // Reset the member data. 00043 reset(); 00044 } 00045 00046 00047 void FastQFile::disableMessages() 00048 { 00049 myDisableMessages = true; 00050 } 00051 00052 00053 void FastQFile::enableMessages() 00054 { 00055 myDisableMessages = false; 00056 } 00057 00058 00059 // Disable Unique Sequence ID checking. 00060 // Unique Sequence ID checking is enabled by default. 00061 void FastQFile::disableSeqIDCheck() 00062 { 00063 myCheckSeqID = false; 00064 } 00065 00066 00067 // Enable Unique Sequence ID checking. 00068 // Unique Sequence ID checking is enabled by default. 00069 void FastQFile::enableSeqIDCheck() 00070 { 00071 myCheckSeqID = true; 00072 } 00073 00074 00075 // Set the number of errors after which to quit reading/validating a file. 00076 void FastQFile::setMaxErrors(int maxErrors) 00077 { 00078 myMaxErrors = maxErrors; 00079 } 00080 00081 00082 // Open a FastQFile. 00083 FastQStatus::Status FastQFile::openFile(const char* fileName, 00084 BaseAsciiMap::SPACE_TYPE spaceType) 00085 { 00086 // reset the member data. 00087 reset(); 00088 00089 myBaseComposition.resetBaseMapType(); 00090 myBaseComposition.setBaseMapType(spaceType); 00091 myQualPerCycle.clear(); 00092 myCountPerCycle.clear(); 00093 00094 FastQStatus::Status status = FastQStatus::FASTQ_SUCCESS; 00095 00096 // Close the file if there is already one open - checked by close. 00097 status = closeFile(); 00098 if(status == FastQStatus::FASTQ_SUCCESS) 00099 { 00100 // Successfully closed a previously opened file if there was one. 00101 00102 // Open the file 00103 myFile = ifopen(fileName, "rt"); 00104 myFileName = fileName; 00105 00106 if(myFile == NULL) 00107 { 00108 // Failed to open the file. 00109 status = FastQStatus::FASTQ_OPEN_ERROR; 00110 } 00111 } 00112 00113 if(status != FastQStatus::FASTQ_SUCCESS) 00114 { 00115 // Failed to open the file. 00116 std::string errorMessage = "ERROR: Failed to open file: "; 00117 errorMessage += fileName; 00118 logMessage(errorMessage.c_str()); 00119 } 00120 return(status); 00121 } 00122 00123 00124 // Close a FastQFile. 00125 FastQStatus::Status FastQFile::closeFile() 00126 { 00127 int closeStatus = 0; // Success. 00128 00129 // If a file has been opened, close it. 00130 if(myFile != NULL) 00131 { 00132 // Close the file. 00133 closeStatus = ifclose(myFile); 00134 myFile = NULL; 00135 } 00136 if(closeStatus == 0) 00137 { 00138 // Success - either there wasn't a file to close or it was closed 00139 // successfully. 00140 return(FastQStatus::FASTQ_SUCCESS); 00141 } 00142 else 00143 { 00144 std::string errorMessage = "Failed to close file: "; 00145 errorMessage += myFileName.c_str(); 00146 logMessage(errorMessage.c_str()); 00147 return(FastQStatus::FASTQ_CLOSE_ERROR); 00148 } 00149 } 00150 00151 00152 // Check to see if the file is open. 00153 bool FastQFile::isOpen() 00154 { 00155 // Check to see if the file is open. 00156 if((myFile != NULL) && (myFile->isOpen())) 00157 { 00158 // File pointer exists and the file is open. 00159 return true; 00160 } 00161 00162 // File is not open. 00163 return false; 00164 } 00165 00166 00167 // Check to see if the file is at the end of the file. 00168 bool FastQFile::isEof() 00169 { 00170 // Check to see if the file is open. 00171 if((myFile != NULL) && (ifeof(myFile))) 00172 { 00173 // At EOF. 00174 return true; 00175 } 00176 00177 // Not at EOF. 00178 return false; 00179 } 00180 00181 00182 // Returns whether or not to keep reading the file. 00183 // Stop reading (false) if eof or there is a problem reading the file. 00184 bool FastQFile::keepReadingFile() 00185 { 00186 if(isEof() || myFileProblem) 00187 { 00188 return(false); 00189 } 00190 return(true); 00191 } 00192 00193 00194 // Validate the specified fastq file 00195 FastQStatus::Status FastQFile::validateFastQFile(const String& filename, 00196 bool printBaseComp, 00197 BaseAsciiMap::SPACE_TYPE spaceType, 00198 bool printQualAvg) 00199 { 00200 // Open the fastqfile. 00201 if(openFile(filename, spaceType) != FastQStatus::FASTQ_SUCCESS) 00202 { 00203 // Failed to open the specified file. 00204 return(FastQStatus::FASTQ_OPEN_ERROR); 00205 } 00206 00207 // Track the total number of sequences that were validated. 00208 int numSequences = 0; 00209 00210 // Keep reading the file until there are no more fastq sequences to process 00211 // and not configured to quit after a certain number of errors or there 00212 // has not yet been that many errors. 00213 // Or exit if there is a problem reading the file. 00214 FastQStatus::Status status = FastQStatus::FASTQ_SUCCESS; 00215 while (keepReadingFile() && 00216 ((myMaxErrors == -1) || (myMaxErrors > myNumErrors))) 00217 { 00218 // Validate one sequence. This call will read all the lines for 00219 // one sequence. 00220 status = readFastQSequence(); 00221 if((status == FastQStatus::FASTQ_SUCCESS) || (status == FastQStatus::FASTQ_INVALID)) 00222 { 00223 // Read a sequence and it is either valid or invalid, but 00224 // either way, a sequence was read, so increment the sequence count. 00225 ++numSequences; 00226 } 00227 else 00228 { 00229 // Other error, so break out of processing. 00230 break; 00231 } 00232 } 00233 00234 // Report Base Composition Statistics. 00235 if(printBaseComp) 00236 { 00237 myBaseComposition.print(); 00238 } 00239 00240 if(printQualAvg) 00241 { 00242 printAvgQual(); 00243 } 00244 00245 std::string finishMessage = "Finished processing "; 00246 finishMessage += myFileName.c_str(); 00247 char buffer[100]; 00248 if(sprintf(buffer, 00249 " with %u lines containing %d sequences.", 00250 myLineNum, numSequences) > 0) 00251 { 00252 finishMessage += buffer; 00253 logMessage(finishMessage.c_str()); 00254 } 00255 if(sprintf(buffer, 00256 "There were a total of %d errors.", 00257 myNumErrors) > 0) 00258 { 00259 logMessage(buffer); 00260 } 00261 00262 // Close the input file. 00263 FastQStatus::Status closeStatus = closeFile(); 00264 00265 if((status != FastQStatus::FASTQ_SUCCESS) && (status != FastQStatus::FASTQ_INVALID)) 00266 { 00267 // Stopped validating due to some error other than invalid, so 00268 // return that error. 00269 return(status); 00270 } 00271 else if(myNumErrors == 0) 00272 { 00273 // No errors, check to see if there were any sequences. 00274 // Finished processing all of the sequences in the file. 00275 // If there are no sequences, report an error. 00276 if(numSequences == 0) 00277 { 00278 // Empty file, return error. 00279 logMessage("ERROR: No FastQSequences in the file."); 00280 return(FastQStatus::FASTQ_NO_SEQUENCE_ERROR); 00281 } 00282 return(FastQStatus::FASTQ_SUCCESS); 00283 } 00284 else 00285 { 00286 // The file is invalid. But check the close status. If the close 00287 // failed, it means there is a problem with the file itself not just 00288 // with validation, so the close failure should be returned. 00289 if(closeStatus != FastQStatus::FASTQ_SUCCESS) 00290 { 00291 return(closeStatus); 00292 } 00293 return(FastQStatus::FASTQ_INVALID); 00294 } 00295 } 00296 00297 00298 // Reads and validates a single fastq sequence from myFile. 00299 FastQStatus::Status FastQFile::readFastQSequence() 00300 { 00301 // First verify that a file is open, if not, return failure. 00302 if(!isOpen()) 00303 { 00304 std::string message = 00305 "ERROR: Trying to read a fastq file but no file is open."; 00306 logMessage(message.c_str()); 00307 return(FastQStatus::FASTQ_ORDER_ERROR); 00308 } 00309 00310 // Reset variables for each sequence. 00311 resetForEachSequence(); 00312 00313 bool valid = true; 00314 00315 // No sequence was read. 00316 if(isTimeToQuit()) 00317 { 00318 return(FastQStatus::FASTQ_NO_SEQUENCE_ERROR); 00319 } 00320 00321 // The first line is the sequence identifier, so validate that. 00322 valid = validateSequenceIdentifierLine(); 00323 00324 if(myFileProblem) 00325 { 00326 return(FastQStatus::FASTQ_READ_ERROR); 00327 } 00328 00329 // If we are at the end of the file, check to see if it is a partial 00330 // sequence or just an empty line at the end. 00331 if(ifeof(myFile)) 00332 { 00333 // If the sequence identifier line was empty and we are at the 00334 // end of the file, there is nothing more to validate. 00335 if(mySequenceIdLine.Length() != 0) 00336 { 00337 // There was a sequence identifier line, so this is an incomplete 00338 // sequence. 00339 myErrorString = "Incomplete Sequence.\n"; 00340 reportErrorOnLine(); 00341 00342 valid = false; 00343 } 00344 if(valid) 00345 { 00346 // Return failure - no sequences were left to read. At the end 00347 // of the file. It wasn't invalid and it wasn't really an error. 00348 return(FastQStatus::FASTQ_NO_SEQUENCE_ERROR); 00349 } 00350 else 00351 { 00352 return(FastQStatus::FASTQ_INVALID); 00353 } 00354 } 00355 00356 // If enough errors, quit before reading any more. 00357 if(isTimeToQuit()) 00358 { 00359 // Means there was an error, so mark it as invalid. 00360 return(FastQStatus::FASTQ_INVALID); 00361 } 00362 00363 // Validate the Raw Sequence Line(s) and the "+" line. 00364 valid &= validateRawSequenceAndPlusLines(); 00365 00366 if(myFileProblem) 00367 { 00368 return(FastQStatus::FASTQ_READ_ERROR); 00369 } 00370 00371 // If enough errors, quit before reading any more. 00372 if(isTimeToQuit()) 00373 { 00374 return(FastQStatus::FASTQ_INVALID); 00375 } 00376 00377 // If it is the end of a file, it is missing the quality string. 00378 if(ifeof(myFile)) 00379 { 00380 // There was a sequence identifier line, so this is an incomplete 00381 // sequence. 00382 myErrorString = "Incomplete Sequence, missing Quality String."; 00383 reportErrorOnLine(); 00384 valid = false; 00385 return(FastQStatus::FASTQ_INVALID); 00386 } 00387 00388 // All that is left is to validate the quality string line(s). 00389 valid &= validateQualityStringLines(); 00390 00391 if(myFileProblem) 00392 { 00393 return(FastQStatus::FASTQ_READ_ERROR); 00394 } 00395 00396 if(valid) 00397 { 00398 return(FastQStatus::FASTQ_SUCCESS); 00399 } 00400 return(FastQStatus::FASTQ_INVALID); 00401 } 00402 00403 00404 // Reads and validates the sequence identifier line of a fastq sequence. 00405 bool FastQFile::validateSequenceIdentifierLine() 00406 { 00407 // Read the first line of the sequence. 00408 int readStatus = mySequenceIdLine.ReadLine(myFile); 00409 00410 // Check to see if the read was successful. 00411 if(readStatus <= 0) 00412 { 00413 // If EOF, not an error. 00414 if(ifeof(myFile)) 00415 { 00416 return true; 00417 } 00418 myFileProblem = true; 00419 myErrorString = "Failure trying to read sequence identifier line"; 00420 reportErrorOnLine(); 00421 return false; 00422 } 00423 00424 // If the line is 0 length and it is the end of the file, just 00425 // return since this is the eof - no error. 00426 if((mySequenceIdLine.Length() == 0) && (ifeof(myFile))) 00427 { 00428 // Not an error, just a new line at the end of the file. 00429 return true; 00430 } 00431 00432 // Increment the line number. 00433 myLineNum++; 00434 00435 // Verify that the line has at least 2 characters: '@' and at least 00436 // one character for the sequence identifier. 00437 if(mySequenceIdLine.Length() < 2) 00438 { 00439 // Error. Sequence Identifier line not long enough. 00440 myErrorString = "The sequence identifier line was too short."; 00441 reportErrorOnLine(); 00442 return false; 00443 } 00444 00445 // The sequence identifier line must start wtih a '@' 00446 if(mySequenceIdLine[0] != '@') 00447 { 00448 // Error - sequence identifier line does not begin with an '@'. 00449 myErrorString = "First line of a sequence does not begin with @"; 00450 reportErrorOnLine(); 00451 return false; 00452 } 00453 00454 // Valid Sequence Identifier Line. 00455 00456 // The sequence identifier ends at the first space or at the end of the 00457 // line if there is no space. 00458 // Use fast find since this is a case insensitive search. 00459 // Start at 1 since we know that 0 is '@' 00460 int endSequenceIdentifier = mySequenceIdLine.FastFindChar(' ', 1); 00461 00462 // Check if a " " was found. 00463 if(endSequenceIdentifier == -1) 00464 { 00465 // Did not find a ' ', so the identifier is the rest of the line. 00466 // It starts at 1 since @ is at offset 0. 00467 mySequenceIdentifier = (mySequenceIdLine.SubStr(1)).c_str(); 00468 } 00469 else 00470 { 00471 // Found a ' ', so the identifier ends just before that. 00472 // The sequence identifier must be at least 1 character long, 00473 // therefore the endSequenceIdentifier must be greater than 1. 00474 if(endSequenceIdentifier <= 1) 00475 { 00476 myErrorString = 00477 "No Sequence Identifier specified before the comment."; 00478 reportErrorOnLine(); 00479 return false; 00480 } 00481 00482 mySequenceIdentifier = 00483 (mySequenceIdLine.SubStr(1, endSequenceIdentifier - 1)).c_str(); 00484 } 00485 00486 // Check if sequence identifier should be validated for uniqueness. 00487 if(myCheckSeqID) 00488 { 00489 // Check to see if the sequenceIdentifier is a repeat by adding 00490 // it to the set and seeing if it already existed. 00491 std::pair<std::map<std::string, unsigned int>::iterator,bool> insertResult; 00492 insertResult = 00493 myIdentifierMap.insert(std::make_pair(mySequenceIdentifier.c_str(), 00494 myLineNum)); 00495 00496 if(insertResult.second == false) 00497 { 00498 // Sequence Identifier is a repeat. 00499 myErrorString = "Repeated Sequence Identifier: "; 00500 myErrorString += mySequenceIdentifier.c_str(); 00501 myErrorString += " at Lines "; 00502 myErrorString += insertResult.first->second; 00503 myErrorString += " and "; 00504 myErrorString += myLineNum; 00505 reportErrorOnLine(); 00506 return(false); 00507 } 00508 } 00509 00510 // Valid, return true. 00511 return(true); 00512 } 00513 00514 00515 // Reads and validates the raw sequence line(s) and the plus line. Both are 00516 // included in one method since it is unknown when the raw sequence line 00517 // ends until you find the plus line that divides it from the quality 00518 // string. Since this method will read the plus line to know when the 00519 // raw sequence ends, it also validates that line. 00520 bool FastQFile::validateRawSequenceAndPlusLines() 00521 { 00522 // Read the raw sequence. 00523 int readStatus = myRawSequence.ReadLine(myFile); 00524 00525 myLineNum++; 00526 00527 if(readStatus <= 0) 00528 { 00529 myFileProblem = true; 00530 myErrorString = "Failure trying to read sequence line"; 00531 reportErrorOnLine(); 00532 return false; 00533 } 00534 00535 // Offset into the raw sequence to be validated. 00536 int offset = 0; 00537 00538 // Validate the raw sequence. 00539 bool valid = validateRawSequence(offset); 00540 00541 // Increment the offset for what was just read. 00542 offset = myRawSequence.Length(); 00543 00544 // The next line is either a continuation of the raw sequence or it starts 00545 // with a '+' 00546 // Keep validating til the '+' line or the end of file is found. 00547 bool stillRawLine = true; 00548 while(stillRawLine && 00549 !ifeof(myFile)) 00550 { 00551 // If enough errors, quit before reading any more. 00552 if(isTimeToQuit()) 00553 { 00554 return(false); 00555 } 00556 00557 // Read the next line. 00558 // Read into the plus line, but if it isn't a plus line, then 00559 // it will be copied into the raw sequence line. 00560 readStatus = myPlusLine.ReadLine(myFile); 00561 myLineNum++; 00562 00563 if(readStatus <= 0) 00564 { 00565 myFileProblem = true; 00566 myErrorString = "Failure trying to read sequence/plus line"; 00567 reportErrorOnLine(); 00568 return false; 00569 } 00570 00571 // Check if the next line is blank 00572 if(myPlusLine.Length() == 0) 00573 { 00574 // The next line is blank. Assume it is part of the raw sequence and 00575 // report an error since there are no valid characters on the line. 00576 myErrorString = 00577 "Looking for continuation of Raw Sequence or '+' instead found a blank line, assuming it was part of Raw Sequence."; 00578 reportErrorOnLine(); 00579 } 00580 // Check for the plus line. 00581 else if(myPlusLine[0] == '+') 00582 { 00583 // This is the + line. 00584 valid &= validateSequencePlus(); 00585 stillRawLine = false; 00586 } 00587 else 00588 { 00589 // Not a plus line, so assume this is a continuation of the Raw 00590 // Sequence. 00591 // Copy from the plus line to the raw sequence line. 00592 myRawSequence += myPlusLine; 00593 myPlusLine.SetLength(0); 00594 valid &= validateRawSequence(offset); 00595 00596 // Increment the offset. 00597 offset = myRawSequence.Length(); 00598 } 00599 } 00600 00601 // If enough errors, quit before reading any more. 00602 if(isTimeToQuit()) 00603 { 00604 return(false); 00605 } 00606 00607 // Now that the entire raw sequence has been obtained, check its length 00608 // against the minimum allowed length. 00609 if(myRawSequence.Length() < myMinReadLength) 00610 { 00611 // Raw sequence is not long enough - error. 00612 myErrorString = "Raw Sequence is shorter than the min read length: "; 00613 myErrorString += myRawSequence.Length(); 00614 myErrorString += " < "; 00615 myErrorString += myMinReadLength; 00616 reportErrorOnLine(); 00617 valid = false; 00618 } 00619 00620 // If enough errors, quit before reading any more. 00621 if(isTimeToQuit()) 00622 { 00623 return(false); 00624 } 00625 00626 // if the flag still indicates it is processing the raw sequence that means 00627 // we reached the end of the file without a '+' line. So report that error. 00628 if(stillRawLine) 00629 { 00630 myErrorString = "Reached the end of the file without a '+' line."; 00631 reportErrorOnLine(); 00632 valid = false; 00633 } 00634 00635 return(valid); 00636 } 00637 00638 00639 // Reads and validates the quality string line(s). 00640 bool FastQFile::validateQualityStringLines() 00641 { 00642 // Read the quality string. 00643 int readStatus = myQualityString.ReadLine(myFile); 00644 myLineNum++; 00645 00646 if(readStatus <= 0) 00647 { 00648 myFileProblem = true; 00649 myErrorString = "Failure trying to read quality line"; 00650 reportErrorOnLine(); 00651 return false; 00652 } 00653 00654 // track the offset into the quality string to validate. 00655 int offset = 0; 00656 00657 // Validate this line of the quality string. 00658 bool valid = validateQualityString(offset); 00659 00660 offset = myQualityString.Length(); 00661 00662 // Keep reading quality string lines until the length of the 00663 // raw sequence has been hit or the end of the file is reached. 00664 while((myQualityString.Length() < myRawSequence.Length()) && 00665 (!ifeof(myFile))) 00666 { 00667 // If enough errors, quit before reading any more. 00668 if(isTimeToQuit()) 00669 { 00670 return(false); 00671 } 00672 00673 // Read another line of the quality string. 00674 readStatus = myTempPartialQuality.ReadLine(myFile); 00675 myLineNum++; 00676 00677 if(readStatus <= 0) 00678 { 00679 myFileProblem = true; 00680 myErrorString = "Failure trying to read quality line"; 00681 reportErrorOnLine(); 00682 return false; 00683 } 00684 00685 myQualityString += myTempPartialQuality; 00686 myTempPartialQuality.Clear(); 00687 00688 // Validate this line of the quality string. 00689 valid &= validateQualityString(offset); 00690 offset = myQualityString.Length(); 00691 } 00692 00693 // If enough errors, quit before reading any more. 00694 if(isTimeToQuit()) 00695 { 00696 return(false); 00697 } 00698 00699 // Validate that the quality string length is the same as the 00700 // raw sequence length. 00701 if(myQualityString.Length() != myRawSequence.Length()) 00702 { 00703 myErrorString = "Quality string length ("; 00704 myErrorString += myQualityString.Length(); 00705 myErrorString += ") does not equal raw sequence length ("; 00706 myErrorString += myRawSequence.Length(); 00707 myErrorString += ")"; 00708 reportErrorOnLine(); 00709 valid = false; 00710 } 00711 return(valid); 00712 } 00713 00714 00715 // Method to validate a line that contains part of the raw sequence. 00716 bool FastQFile::validateRawSequence(int offset) 00717 { 00718 bool validBase = false; 00719 bool valid = true; 00720 00721 // Loop through validating each character is valid for the raw sequence. 00722 for(int sequenceIndex = offset; sequenceIndex < myRawSequence.Length(); 00723 sequenceIndex++) 00724 { 00725 // Update the composition for this position. Returns false if the 00726 // character was not a valid base. 00727 validBase = 00728 myBaseComposition.updateComposition(sequenceIndex, 00729 myRawSequence[sequenceIndex]); 00730 // Check the return 00731 if(!validBase) 00732 { 00733 // Error, found a value that is not a valid base character. 00734 myErrorString = "Invalid character ('"; 00735 myErrorString += myRawSequence[sequenceIndex]; 00736 myErrorString += "') in base sequence."; 00737 reportErrorOnLine(); 00738 valid = false; 00739 // If enough errors, quit before reading any more. 00740 if(isTimeToQuit()) 00741 { 00742 return(false); 00743 } 00744 } 00745 } 00746 return(valid); 00747 } 00748 00749 00750 // Method to validate the "+" line that seperates the raw sequence and the 00751 // quality string. 00752 bool FastQFile::validateSequencePlus() 00753 { 00754 // Validate that optional sequence identifier is the same 00755 // as the one on the @ line. 00756 00757 // Check to see if there is more to the line than just the plus 00758 int lineLength = myPlusLine.Length(); 00759 00760 // If the line is only 1 character or the second character is a space, 00761 // then there is no sequence identifier on this line and there is nothing 00762 // further to validate. 00763 if((lineLength == 1) || (myPlusLine[1] == ' ')) 00764 { 00765 // No sequence identifier, so just return valid. 00766 return true; 00767 } 00768 00769 // There is a sequence identifier on this line, so validate that 00770 // it matches the one from the associated @ line. 00771 // The read in line must be at least 1 more character ('+') than the 00772 // sequence identifier read from the '@' line. 00773 // If it is not longer than the sequence identifier, then we know that it 00774 // cannot be the same. 00775 int sequenceIdentifierLength = mySequenceIdentifier.Length(); 00776 if(lineLength <= sequenceIdentifierLength) 00777 { 00778 myErrorString = 00779 "Sequence Identifier on '+' line does not equal the one on the '@' line."; 00780 reportErrorOnLine(); 00781 return false; 00782 } 00783 00784 bool same = true; 00785 int seqIndex = 0; 00786 int lineIndex = 1; // Start at 1 since offset 0 has '+' 00787 00788 // Loop through the sequence index and the line buffer verifying they 00789 // are the same until a difference is found or the end of the sequence 00790 // identifier is found. 00791 while((same == true) && (seqIndex < sequenceIdentifierLength)) 00792 { 00793 if(myPlusLine[lineIndex] != mySequenceIdentifier[seqIndex]) 00794 { 00795 myErrorString = 00796 "Sequence Identifier on '+' line does not equal the one on the '@' line."; 00797 reportErrorOnLine(); 00798 same = false; 00799 } 00800 lineIndex++; 00801 seqIndex++; 00802 } 00803 return(same); 00804 } 00805 00806 00807 // Method to validate the quality string. 00808 bool FastQFile::validateQualityString(int offset) 00809 { 00810 bool valid = true; 00811 if(myQualityString.Length() > (int)(myQualPerCycle.size())) 00812 { 00813 myQualPerCycle.resize(myQualityString.Length()); 00814 myCountPerCycle.resize(myQualityString.Length()); 00815 } 00816 // For each character in the line, verify that it is ascii > 32. 00817 for(int i=offset; i < myQualityString.Length(); i++) 00818 { 00819 if(myQualityString[i] <= 32) 00820 { 00821 myErrorString = "Invalid character ('"; 00822 myErrorString += myQualityString[i]; 00823 myErrorString += "') in quality string."; 00824 reportErrorOnLine(); 00825 valid = false; 00826 // If enough errors, quit before reading any more. 00827 if(isTimeToQuit()) 00828 { 00829 return(false); 00830 } 00831 } 00832 else 00833 { 00834 myQualPerCycle[i] += BaseUtilities::getPhredBaseQuality(myQualityString[i]); 00835 myCountPerCycle[i] += 1; 00836 } 00837 } 00838 return(valid); 00839 } 00840 00841 00842 // Helper method for printing the contents of myErrorString. It will 00843 // only print the errors until the maximum number of reportable errors is 00844 // reached. 00845 void FastQFile::reportErrorOnLine() 00846 { 00847 // Increment the total number of errors. 00848 myNumErrors++; 00849 00850 // Only display the first X number of errors. 00851 if(myNumErrors <= myNumPrintableErrors) 00852 { 00853 // Write the error with the line number. 00854 char buffer[100]; 00855 sprintf(buffer, "ERROR on Line %u: ", myLineNum); 00856 std::string message = buffer; 00857 message += myErrorString.c_str(); 00858 logMessage(message.c_str()); 00859 } 00860 } 00861 00862 00863 // Reset member data that is unique for each fastQFile. 00864 void FastQFile::reset() 00865 { 00866 // Each fastq file processing needs to also reset the member data for each 00867 // sequence. 00868 resetForEachSequence(); 00869 myNumErrors = 0; // per fastqfile 00870 myLineNum = 0; // per fastqfile 00871 myFileName.SetLength(0); // reset the filename string. 00872 myIdentifierMap.clear(); // per fastqfile 00873 myBaseComposition.clear(); // clear the base composition. 00874 myQualPerCycle.clear(); 00875 myCountPerCycle.clear(); 00876 myFileProblem = false; 00877 } 00878 00879 00880 // Reset the member data that is unique for each sequence. 00881 void FastQFile::resetForEachSequence() 00882 { 00883 myLineBuffer.SetLength(0); 00884 myErrorString.SetLength(0); 00885 myRawSequence.SetLength(0); 00886 mySequenceIdLine.SetLength(0); 00887 mySequenceIdentifier.SetLength(0); 00888 myPlusLine.SetLength(0); 00889 myQualityString.SetLength(0); 00890 myTempPartialQuality.SetLength(0); 00891 } 00892 00893 00894 void FastQFile::logMessage(const char* logMessage) 00895 { 00896 // Write the message if they are not disabled. 00897 if(!myDisableMessages) 00898 { 00899 std::cout << logMessage << std::endl; 00900 } 00901 } 00902 00903 00904 // Determine if it is time to quit by checking if we are to quit after a 00905 // certain number of errors and that many errors have been encountered. 00906 bool FastQFile::isTimeToQuit() 00907 { 00908 // It is time to quit if we are to quit after a certain number of errors 00909 // and that many errors have been encountered. 00910 if((myMaxErrors != -1) && (myNumErrors >= myMaxErrors)) 00911 { 00912 return(true); 00913 } 00914 return(false); 00915 } 00916 00917 00918 void FastQFile::printAvgQual() 00919 { 00920 std::cout << std::endl << "Average Phred Quality by Read Index (starts at 0):" << std::endl; 00921 std::cout.precision(2); 00922 std::cout << std::fixed << "Read Index\tAverage Quality" 00923 << std::endl; 00924 if(myQualPerCycle.size() != myCountPerCycle.size()) 00925 { 00926 // This is a code error and should NEVER happen. 00927 std::cerr << "ERROR calculating the average Qualities per cycle\n"; 00928 } 00929 00930 double sumQual = 0; 00931 double count = 0; 00932 double avgQual = 0; 00933 for(unsigned int i = 0; i < myQualPerCycle.size(); i++) 00934 { 00935 avgQual = 0; 00936 if(myCountPerCycle[i] != 0) 00937 { 00938 avgQual = myQualPerCycle[i] / (double)(myCountPerCycle[i]); 00939 } 00940 std::cout << i << "\t" << avgQual << "\n"; 00941 sumQual += myQualPerCycle[i]; 00942 count += myCountPerCycle[i]; 00943 } 00944 std::cout << std::endl; 00945 avgQual = 0; 00946 if(count != 0) 00947 { 00948 avgQual = sumQual / count; 00949 } 00950 std::cout << "Overall Average Phred Quality = " << avgQual << std::endl; 00951 }