FastQFile.cpp
00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018 #include <iostream>
00019
00020 #include "InputFile.h"
00021 #include "FastQFile.h"
00022
00023
00024
00025
00026
00027
00028
00029
00030 FastQFile::FastQFile(int minReadLength, int numPrintableErrors)
00031 : myFile(NULL),
00032 myBaseComposition(),
00033 myCheckSeqID(true),
00034 myMinReadLength(minReadLength),
00035 myNumPrintableErrors(numPrintableErrors),
00036 myMaxErrors(-1),
00037 myDisableMessages(false),
00038 myFileProblem(false)
00039 {
00040
00041 reset();
00042 }
00043
00044
00045 void FastQFile::disableMessages()
00046 {
00047 myDisableMessages = true;
00048 }
00049
00050
00051 void FastQFile::enableMessages()
00052 {
00053 myDisableMessages = false;
00054 }
00055
00056
00057
00058
00059 void FastQFile::disableSeqIDCheck()
00060 {
00061 myCheckSeqID = false;
00062 }
00063
00064
00065
00066
00067 void FastQFile::enableSeqIDCheck()
00068 {
00069 myCheckSeqID = true;
00070 }
00071
00072
00073
00074 void FastQFile::setMaxErrors(int maxErrors)
00075 {
00076 myMaxErrors = maxErrors;
00077 }
00078
00079
00080
00081 FastQStatus::Status FastQFile::openFile(const char* fileName,
00082 BaseAsciiMap::SPACE_TYPE spaceType)
00083 {
00084
00085 reset();
00086
00087 myBaseComposition.resetBaseMapType();
00088 myBaseComposition.setBaseMapType(spaceType);
00089
00090 FastQStatus::Status status = FastQStatus::FASTQ_SUCCESS;
00091
00092
00093 status = closeFile();
00094 if(status == FastQStatus::FASTQ_SUCCESS)
00095 {
00096
00097
00098
00099 myFile = ifopen(fileName, "rt");
00100 myFileName = fileName;
00101
00102 if(myFile == NULL)
00103 {
00104
00105 status = FastQStatus::FASTQ_OPEN_ERROR;
00106 }
00107 }
00108
00109 if(status != FastQStatus::FASTQ_SUCCESS)
00110 {
00111
00112 std::string errorMessage = "ERROR: Failed to open file: ";
00113 errorMessage += fileName;
00114 logMessage(errorMessage.c_str());
00115 }
00116 return(status);
00117 }
00118
00119
00120
00121 FastQStatus::Status FastQFile::closeFile()
00122 {
00123 int closeStatus = 0;
00124
00125
00126 if(myFile != NULL)
00127 {
00128
00129 closeStatus = ifclose(myFile);
00130 myFile = NULL;
00131 }
00132 if(closeStatus == 0)
00133 {
00134
00135
00136 return(FastQStatus::FASTQ_SUCCESS);
00137 }
00138 else
00139 {
00140 std::string errorMessage = "Failed to close file: ";
00141 errorMessage += myFileName.c_str();
00142 logMessage(errorMessage.c_str());
00143 return(FastQStatus::FASTQ_CLOSE_ERROR);
00144 }
00145 }
00146
00147
00148
00149 bool FastQFile::isOpen()
00150 {
00151
00152 if((myFile != NULL) && (myFile->isOpen()))
00153 {
00154
00155 return true;
00156 }
00157
00158
00159 return false;
00160 }
00161
00162
00163
00164 bool FastQFile::isEof()
00165 {
00166
00167 if((myFile != NULL) && (ifeof(myFile)))
00168 {
00169
00170 return true;
00171 }
00172
00173
00174 return false;
00175 }
00176
00177
00178
00179
00180 bool FastQFile::keepReadingFile()
00181 {
00182 if(isEof() || myFileProblem)
00183 {
00184 return(false);
00185 }
00186 return(true);
00187 }
00188
00189
00190
00191 FastQStatus::Status FastQFile::validateFastQFile(const String& filename,
00192 bool printBaseComp,
00193 BaseAsciiMap::SPACE_TYPE spaceType)
00194 {
00195
00196 if(openFile(filename, spaceType) != FastQStatus::FASTQ_SUCCESS)
00197 {
00198
00199 return(FastQStatus::FASTQ_OPEN_ERROR);
00200 }
00201
00202
00203 int numSequences = 0;
00204
00205
00206
00207
00208
00209 FastQStatus::Status status = FastQStatus::FASTQ_SUCCESS;
00210 while (keepReadingFile() &&
00211 ((myMaxErrors == -1) || (myMaxErrors > myNumErrors)))
00212 {
00213
00214
00215 status = readFastQSequence();
00216 if((status == FastQStatus::FASTQ_SUCCESS) || (status == FastQStatus::FASTQ_INVALID))
00217 {
00218
00219
00220 ++numSequences;
00221 }
00222 else
00223 {
00224
00225 break;
00226 }
00227 }
00228
00229
00230 if(printBaseComp)
00231 {
00232 myBaseComposition.print();
00233 }
00234
00235 std::string finishMessage = "Finished processing ";
00236 finishMessage += myFileName.c_str();
00237 char buffer[100];
00238 if(sprintf(buffer,
00239 " with %u lines containing %d sequences.",
00240 myLineNum, numSequences) > 0)
00241 {
00242 finishMessage += buffer;
00243 logMessage(finishMessage.c_str());
00244 }
00245 if(sprintf(buffer,
00246 "There were a total of %d errors.",
00247 myNumErrors) > 0)
00248 {
00249 logMessage(buffer);
00250 }
00251
00252
00253 FastQStatus::Status closeStatus = closeFile();
00254
00255 if((status != FastQStatus::FASTQ_SUCCESS) && (status != FastQStatus::FASTQ_INVALID))
00256 {
00257
00258
00259 return(status);
00260 }
00261 else if(myNumErrors == 0)
00262 {
00263
00264
00265
00266 if(numSequences == 0)
00267 {
00268
00269 logMessage("ERROR: No FastQSequences in the file.");
00270 return(FastQStatus::FASTQ_NO_SEQUENCE_ERROR);
00271 }
00272 return(FastQStatus::FASTQ_SUCCESS);
00273 }
00274 else
00275 {
00276
00277
00278
00279 if(closeStatus != FastQStatus::FASTQ_SUCCESS)
00280 {
00281 return(closeStatus);
00282 }
00283 return(FastQStatus::FASTQ_INVALID);
00284 }
00285 }
00286
00287
00288
00289 FastQStatus::Status FastQFile::readFastQSequence()
00290 {
00291
00292 if(!isOpen())
00293 {
00294 std::string message =
00295 "ERROR: Trying to read a fastq file but no file is open.";
00296 logMessage(message.c_str());
00297 return(FastQStatus::FASTQ_ORDER_ERROR);
00298 }
00299
00300
00301 resetForEachSequence();
00302
00303 bool valid = true;
00304
00305
00306 if(isTimeToQuit())
00307 {
00308 return(FastQStatus::FASTQ_NO_SEQUENCE_ERROR);
00309 }
00310
00311
00312 valid = validateSequenceIdentifierLine();
00313
00314 if(myFileProblem)
00315 {
00316 return(FastQStatus::FASTQ_READ_ERROR);
00317 }
00318
00319
00320
00321 if(ifeof(myFile))
00322 {
00323
00324
00325 if(mySequenceIdLine.Length() != 0)
00326 {
00327
00328
00329 myErrorString = "Incomplete Sequence.\n";
00330 reportErrorOnLine();
00331
00332 valid = false;
00333 }
00334 if(valid)
00335 {
00336
00337
00338 return(FastQStatus::FASTQ_NO_SEQUENCE_ERROR);
00339 }
00340 else
00341 {
00342 return(FastQStatus::FASTQ_INVALID);
00343 }
00344 }
00345
00346
00347 if(isTimeToQuit())
00348 {
00349
00350 return(FastQStatus::FASTQ_INVALID);
00351 }
00352
00353
00354 valid &= validateRawSequenceAndPlusLines();
00355
00356 if(myFileProblem)
00357 {
00358 return(FastQStatus::FASTQ_READ_ERROR);
00359 }
00360
00361
00362 if(isTimeToQuit())
00363 {
00364 return(FastQStatus::FASTQ_INVALID);
00365 }
00366
00367
00368 if(ifeof(myFile))
00369 {
00370
00371
00372 myErrorString = "Incomplete Sequence, missing Quality String.";
00373 reportErrorOnLine();
00374 valid = false;
00375 return(FastQStatus::FASTQ_INVALID);
00376 }
00377
00378
00379 valid &= validateQualityStringLines();
00380
00381 if(myFileProblem)
00382 {
00383 return(FastQStatus::FASTQ_READ_ERROR);
00384 }
00385
00386 if(valid)
00387 {
00388 return(FastQStatus::FASTQ_SUCCESS);
00389 }
00390 return(FastQStatus::FASTQ_INVALID);
00391 }
00392
00393
00394
00395 bool FastQFile::validateSequenceIdentifierLine()
00396 {
00397
00398 int readStatus = mySequenceIdLine.ReadLine(myFile);
00399
00400
00401 if(readStatus <= 0)
00402 {
00403
00404 if(ifeof(myFile))
00405 {
00406 return true;
00407 }
00408 myFileProblem = true;
00409 myErrorString = "Failure trying to read sequence identifier line";
00410 reportErrorOnLine();
00411 return false;
00412 }
00413
00414
00415
00416 if((mySequenceIdLine.Length() == 0) && (ifeof(myFile)))
00417 {
00418
00419 return true;
00420 }
00421
00422
00423 myLineNum++;
00424
00425
00426
00427 if(mySequenceIdLine.Length() < 2)
00428 {
00429
00430 myErrorString = "The sequence identifier line was too short.";
00431 reportErrorOnLine();
00432 return false;
00433 }
00434
00435
00436 if(mySequenceIdLine[0] != '@')
00437 {
00438
00439 myErrorString = "First line of a sequence does not begin with @";
00440 reportErrorOnLine();
00441 return false;
00442 }
00443
00444
00445
00446
00447
00448
00449
00450 int endSequenceIdentifier = mySequenceIdLine.FastFindChar(' ', 1);
00451
00452
00453 if(endSequenceIdentifier == -1)
00454 {
00455
00456
00457 mySequenceIdentifier = (mySequenceIdLine.SubStr(1)).c_str();
00458 }
00459 else
00460 {
00461
00462
00463
00464 if(endSequenceIdentifier <= 1)
00465 {
00466 myErrorString =
00467 "No Sequence Identifier specified before the comment.";
00468 reportErrorOnLine();
00469 return false;
00470 }
00471
00472 mySequenceIdentifier =
00473 (mySequenceIdLine.SubStr(1, endSequenceIdentifier - 1)).c_str();
00474 }
00475
00476
00477 if(myCheckSeqID)
00478 {
00479
00480
00481 std::pair<std::map<std::string, unsigned int>::iterator,bool> insertResult;
00482 insertResult =
00483 myIdentifierMap.insert(std::make_pair(mySequenceIdentifier.c_str(),
00484 myLineNum));
00485
00486 if(insertResult.second == false)
00487 {
00488
00489 myErrorString = "Repeated Sequence Identifier: ";
00490 myErrorString += mySequenceIdentifier.c_str();
00491 myErrorString += " at Lines ";
00492 myErrorString += insertResult.first->second;
00493 myErrorString += " and ";
00494 myErrorString += myLineNum;
00495 reportErrorOnLine();
00496 return(false);
00497 }
00498 }
00499
00500
00501 return(true);
00502 }
00503
00504
00505
00506
00507
00508
00509
00510 bool FastQFile::validateRawSequenceAndPlusLines()
00511 {
00512
00513 int readStatus = myRawSequence.ReadLine(myFile);
00514
00515 myLineNum++;
00516
00517 if(readStatus <= 0)
00518 {
00519 myFileProblem = true;
00520 myErrorString = "Failure trying to read sequence line";
00521 reportErrorOnLine();
00522 return false;
00523 }
00524
00525
00526 int offset = 0;
00527
00528
00529 bool valid = validateRawSequence(offset);
00530
00531
00532 offset = myRawSequence.Length();
00533
00534
00535
00536
00537 bool stillRawLine = true;
00538 while(stillRawLine &&
00539 !ifeof(myFile))
00540 {
00541
00542 if(isTimeToQuit())
00543 {
00544 return(false);
00545 }
00546
00547
00548
00549
00550 readStatus = myPlusLine.ReadLine(myFile);
00551 myLineNum++;
00552
00553 if(readStatus <= 0)
00554 {
00555 myFileProblem = true;
00556 myErrorString = "Failure trying to read sequence/plus line";
00557 reportErrorOnLine();
00558 return false;
00559 }
00560
00561
00562 if(myPlusLine.Length() == 0)
00563 {
00564
00565
00566 myErrorString =
00567 "Looking for continuation of Raw Sequence or '+' instead found a blank line, assuming it was part of Raw Sequence.";
00568 reportErrorOnLine();
00569 }
00570
00571 else if(myPlusLine[0] == '+')
00572 {
00573
00574 valid &= validateSequencePlus();
00575 stillRawLine = false;
00576 }
00577 else
00578 {
00579
00580
00581
00582 myRawSequence += myPlusLine;
00583 myPlusLine.SetLength(0);
00584 valid &= validateRawSequence(offset);
00585
00586
00587 offset = myRawSequence.Length();
00588 }
00589 }
00590
00591
00592 if(isTimeToQuit())
00593 {
00594 return(false);
00595 }
00596
00597
00598
00599 if(myRawSequence.Length() < myMinReadLength)
00600 {
00601
00602 myErrorString = "Raw Sequence is shorter than the min read length: ";
00603 myErrorString += myRawSequence.Length();
00604 myErrorString += " < ";
00605 myErrorString += myMinReadLength;
00606 reportErrorOnLine();
00607 valid = false;
00608 }
00609
00610
00611 if(isTimeToQuit())
00612 {
00613 return(false);
00614 }
00615
00616
00617
00618 if(stillRawLine)
00619 {
00620 myErrorString = "Reached the end of the file without a '+' line.";
00621 reportErrorOnLine();
00622 valid = false;
00623 }
00624
00625 return(valid);
00626 }
00627
00628
00629
00630 bool FastQFile::validateQualityStringLines()
00631 {
00632
00633 int readStatus = myQualityString.ReadLine(myFile);
00634 myLineNum++;
00635
00636 if(readStatus <= 0)
00637 {
00638 myFileProblem = true;
00639 myErrorString = "Failure trying to read quality line";
00640 reportErrorOnLine();
00641 return false;
00642 }
00643
00644
00645 int offset = 0;
00646
00647
00648 bool valid = validateQualityString(offset);
00649
00650 offset = myQualityString.Length();
00651
00652
00653
00654 while((myQualityString.Length() < myRawSequence.Length()) &&
00655 (!ifeof(myFile)))
00656 {
00657
00658 if(isTimeToQuit())
00659 {
00660 return(false);
00661 }
00662
00663
00664 readStatus = myTempPartialQuality.ReadLine(myFile);
00665 myLineNum++;
00666
00667 if(readStatus <= 0)
00668 {
00669 myFileProblem = true;
00670 myErrorString = "Failure trying to read quality line";
00671 reportErrorOnLine();
00672 return false;
00673 }
00674
00675 myQualityString += myTempPartialQuality;
00676 myTempPartialQuality.Clear();
00677
00678
00679 valid &= validateQualityString(offset);
00680 offset = myQualityString.Length();
00681 }
00682
00683
00684 if(isTimeToQuit())
00685 {
00686 return(false);
00687 }
00688
00689
00690
00691 if(myQualityString.Length() != myRawSequence.Length())
00692 {
00693 myErrorString = "Quality string length (";
00694 myErrorString += myQualityString.Length();
00695 myErrorString += ") does not equal raw sequence length (";
00696 myErrorString += myRawSequence.Length();
00697 myErrorString += ")";
00698 reportErrorOnLine();
00699 valid = false;
00700 }
00701 return(valid);
00702 }
00703
00704
00705
00706 bool FastQFile::validateRawSequence(int offset)
00707 {
00708 bool validBase = false;
00709 bool valid = true;
00710
00711
00712 for(int sequenceIndex = offset; sequenceIndex < myRawSequence.Length();
00713 sequenceIndex++)
00714 {
00715
00716
00717 validBase =
00718 myBaseComposition.updateComposition(sequenceIndex,
00719 myRawSequence[sequenceIndex]);
00720
00721 if(!validBase)
00722 {
00723
00724 myErrorString = "Invalid character ('";
00725 myErrorString += myRawSequence[sequenceIndex];
00726 myErrorString += "') in base sequence.";
00727 reportErrorOnLine();
00728 valid = false;
00729
00730 if(isTimeToQuit())
00731 {
00732 return(false);
00733 }
00734 }
00735 }
00736 return(valid);
00737 }
00738
00739
00740
00741
00742 bool FastQFile::validateSequencePlus()
00743 {
00744
00745
00746
00747
00748 int lineLength = myPlusLine.Length();
00749
00750
00751
00752
00753 if((lineLength == 1) || (myPlusLine[1] == ' '))
00754 {
00755
00756 return true;
00757 }
00758
00759
00760
00761
00762
00763
00764
00765 int sequenceIdentifierLength = mySequenceIdentifier.Length();
00766 if(lineLength <= sequenceIdentifierLength)
00767 {
00768 myErrorString =
00769 "Sequence Identifier on '+' line does not equal the one on the '@' line.";
00770 reportErrorOnLine();
00771 return false;
00772 }
00773
00774 bool same = true;
00775 int seqIndex = 0;
00776 int lineIndex = 1;
00777
00778
00779
00780
00781 while((same == true) && (seqIndex < sequenceIdentifierLength))
00782 {
00783 if(myPlusLine[lineIndex] != mySequenceIdentifier[seqIndex])
00784 {
00785 myErrorString =
00786 "Sequence Identifier on '+' line does not equal the one on the '@' line.";
00787 reportErrorOnLine();
00788 same = false;
00789 }
00790 lineIndex++;
00791 seqIndex++;
00792 }
00793 return(same);
00794 }
00795
00796
00797
00798 bool FastQFile::validateQualityString(int offset)
00799 {
00800 bool valid = true;
00801
00802 for(int i=offset; i < myQualityString.Length(); i++)
00803 {
00804 if(myQualityString[i] <= 32)
00805 {
00806 myErrorString = "Invalid character ('";
00807 myErrorString += myQualityString[i];
00808 myErrorString += "') in quality string.";
00809 reportErrorOnLine();
00810 valid = false;
00811
00812 if(isTimeToQuit())
00813 {
00814 return(false);
00815 }
00816 }
00817 }
00818 return(valid);
00819 }
00820
00821
00822
00823
00824
00825 void FastQFile::reportErrorOnLine()
00826 {
00827
00828 myNumErrors++;
00829
00830
00831 if(myNumErrors <= myNumPrintableErrors)
00832 {
00833
00834 char buffer[100];
00835 sprintf(buffer, "ERROR on Line %u: ", myLineNum);
00836 std::string message = buffer;
00837 message += myErrorString.c_str();
00838 logMessage(message.c_str());
00839 }
00840 }
00841
00842
00843
00844 void FastQFile::reset()
00845 {
00846
00847
00848 resetForEachSequence();
00849 myNumErrors = 0;
00850 myLineNum = 0;
00851 myFileName.SetLength(0);
00852 myIdentifierMap.clear();
00853 myBaseComposition.clear();
00854 myFileProblem = false;
00855 }
00856
00857
00858
00859 void FastQFile::resetForEachSequence()
00860 {
00861 myLineBuffer.SetLength(0);
00862 myErrorString.SetLength(0);
00863 myRawSequence.SetLength(0);
00864 mySequenceIdLine.SetLength(0);
00865 mySequenceIdentifier.SetLength(0);
00866 myPlusLine.SetLength(0);
00867 myQualityString.SetLength(0);
00868 myTempPartialQuality.SetLength(0);
00869 }
00870
00871
00872 void FastQFile::logMessage(const char* logMessage)
00873 {
00874
00875 if(!myDisableMessages)
00876 {
00877 std::cout << logMessage << std::endl;
00878 }
00879 }
00880
00881
00882
00883
00884 bool FastQFile::isTimeToQuit()
00885 {
00886
00887
00888 if((myMaxErrors != -1) && (myNumErrors >= myMaxErrors))
00889 {
00890 return(true);
00891 }
00892 return(false);
00893 }