FastQFile.cpp
00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018 #include <iostream>
00019
00020 #include "InputFile.h"
00021 #include "FastQFile.h"
00022 #include "BaseUtilities.h"
00023
00024
00025
00026
00027
00028
00029
00030 FastQFile::FastQFile(int minReadLength, int numPrintableErrors)
00031 : myFile(NULL),
00032 myBaseComposition(),
00033 myQualPerCycle(),
00034 myCountPerCycle(),
00035 myCheckSeqID(true),
00036 myMinReadLength(minReadLength),
00037 myNumPrintableErrors(numPrintableErrors),
00038 myMaxErrors(-1),
00039 myDisableMessages(false),
00040 myFileProblem(false)
00041 {
00042
00043 reset();
00044 }
00045
00046
00047 void FastQFile::disableMessages()
00048 {
00049 myDisableMessages = true;
00050 }
00051
00052
00053 void FastQFile::enableMessages()
00054 {
00055 myDisableMessages = false;
00056 }
00057
00058
00059
00060
00061 void FastQFile::disableSeqIDCheck()
00062 {
00063 myCheckSeqID = false;
00064 }
00065
00066
00067
00068
00069 void FastQFile::enableSeqIDCheck()
00070 {
00071 myCheckSeqID = true;
00072 }
00073
00074
00075
00076 void FastQFile::setMaxErrors(int maxErrors)
00077 {
00078 myMaxErrors = maxErrors;
00079 }
00080
00081
00082
00083 FastQStatus::Status FastQFile::openFile(const char* fileName,
00084 BaseAsciiMap::SPACE_TYPE spaceType)
00085 {
00086
00087 reset();
00088
00089 myBaseComposition.resetBaseMapType();
00090 myBaseComposition.setBaseMapType(spaceType);
00091 myQualPerCycle.clear();
00092 myCountPerCycle.clear();
00093
00094 FastQStatus::Status status = FastQStatus::FASTQ_SUCCESS;
00095
00096
00097 status = closeFile();
00098 if(status == FastQStatus::FASTQ_SUCCESS)
00099 {
00100
00101
00102
00103 myFile = ifopen(fileName, "rt");
00104 myFileName = fileName;
00105
00106 if(myFile == NULL)
00107 {
00108
00109 status = FastQStatus::FASTQ_OPEN_ERROR;
00110 }
00111 }
00112
00113 if(status != FastQStatus::FASTQ_SUCCESS)
00114 {
00115
00116 std::string errorMessage = "ERROR: Failed to open file: ";
00117 errorMessage += fileName;
00118 logMessage(errorMessage.c_str());
00119 }
00120 return(status);
00121 }
00122
00123
00124
00125 FastQStatus::Status FastQFile::closeFile()
00126 {
00127 int closeStatus = 0;
00128
00129
00130 if(myFile != NULL)
00131 {
00132
00133 closeStatus = ifclose(myFile);
00134 myFile = NULL;
00135 }
00136 if(closeStatus == 0)
00137 {
00138
00139
00140 return(FastQStatus::FASTQ_SUCCESS);
00141 }
00142 else
00143 {
00144 std::string errorMessage = "Failed to close file: ";
00145 errorMessage += myFileName.c_str();
00146 logMessage(errorMessage.c_str());
00147 return(FastQStatus::FASTQ_CLOSE_ERROR);
00148 }
00149 }
00150
00151
00152
00153 bool FastQFile::isOpen()
00154 {
00155
00156 if((myFile != NULL) && (myFile->isOpen()))
00157 {
00158
00159 return true;
00160 }
00161
00162
00163 return false;
00164 }
00165
00166
00167
00168 bool FastQFile::isEof()
00169 {
00170
00171 if((myFile != NULL) && (ifeof(myFile)))
00172 {
00173
00174 return true;
00175 }
00176
00177
00178 return false;
00179 }
00180
00181
00182
00183
00184 bool FastQFile::keepReadingFile()
00185 {
00186 if(isEof() || myFileProblem)
00187 {
00188 return(false);
00189 }
00190 return(true);
00191 }
00192
00193
00194
00195 FastQStatus::Status FastQFile::validateFastQFile(const String& filename,
00196 bool printBaseComp,
00197 BaseAsciiMap::SPACE_TYPE spaceType,
00198 bool printQualAvg)
00199 {
00200
00201 if(openFile(filename, spaceType) != FastQStatus::FASTQ_SUCCESS)
00202 {
00203
00204 return(FastQStatus::FASTQ_OPEN_ERROR);
00205 }
00206
00207
00208 int numSequences = 0;
00209
00210
00211
00212
00213
00214 FastQStatus::Status status = FastQStatus::FASTQ_SUCCESS;
00215 while (keepReadingFile() &&
00216 ((myMaxErrors == -1) || (myMaxErrors > myNumErrors)))
00217 {
00218
00219
00220 status = readFastQSequence();
00221 if((status == FastQStatus::FASTQ_SUCCESS) || (status == FastQStatus::FASTQ_INVALID))
00222 {
00223
00224
00225 ++numSequences;
00226 }
00227 else
00228 {
00229
00230 break;
00231 }
00232 }
00233
00234
00235 if(printBaseComp)
00236 {
00237 myBaseComposition.print();
00238 }
00239
00240 if(printQualAvg)
00241 {
00242 printAvgQual();
00243 }
00244
00245 std::string finishMessage = "Finished processing ";
00246 finishMessage += myFileName.c_str();
00247 char buffer[100];
00248 if(sprintf(buffer,
00249 " with %u lines containing %d sequences.",
00250 myLineNum, numSequences) > 0)
00251 {
00252 finishMessage += buffer;
00253 logMessage(finishMessage.c_str());
00254 }
00255 if(sprintf(buffer,
00256 "There were a total of %d errors.",
00257 myNumErrors) > 0)
00258 {
00259 logMessage(buffer);
00260 }
00261
00262
00263 FastQStatus::Status closeStatus = closeFile();
00264
00265 if((status != FastQStatus::FASTQ_SUCCESS) && (status != FastQStatus::FASTQ_INVALID))
00266 {
00267
00268
00269 return(status);
00270 }
00271 else if(myNumErrors == 0)
00272 {
00273
00274
00275
00276 if(numSequences == 0)
00277 {
00278
00279 logMessage("ERROR: No FastQSequences in the file.");
00280 return(FastQStatus::FASTQ_NO_SEQUENCE_ERROR);
00281 }
00282 return(FastQStatus::FASTQ_SUCCESS);
00283 }
00284 else
00285 {
00286
00287
00288
00289 if(closeStatus != FastQStatus::FASTQ_SUCCESS)
00290 {
00291 return(closeStatus);
00292 }
00293 return(FastQStatus::FASTQ_INVALID);
00294 }
00295 }
00296
00297
00298
00299 FastQStatus::Status FastQFile::readFastQSequence()
00300 {
00301
00302 if(!isOpen())
00303 {
00304 std::string message =
00305 "ERROR: Trying to read a fastq file but no file is open.";
00306 logMessage(message.c_str());
00307 return(FastQStatus::FASTQ_ORDER_ERROR);
00308 }
00309
00310
00311 resetForEachSequence();
00312
00313 bool valid = true;
00314
00315
00316 if(isTimeToQuit())
00317 {
00318 return(FastQStatus::FASTQ_NO_SEQUENCE_ERROR);
00319 }
00320
00321
00322 valid = validateSequenceIdentifierLine();
00323
00324 if(myFileProblem)
00325 {
00326 return(FastQStatus::FASTQ_READ_ERROR);
00327 }
00328
00329
00330
00331 if(ifeof(myFile))
00332 {
00333
00334
00335 if(mySequenceIdLine.Length() != 0)
00336 {
00337
00338
00339 myErrorString = "Incomplete Sequence.\n";
00340 reportErrorOnLine();
00341
00342 valid = false;
00343 }
00344 if(valid)
00345 {
00346
00347
00348 return(FastQStatus::FASTQ_NO_SEQUENCE_ERROR);
00349 }
00350 else
00351 {
00352 return(FastQStatus::FASTQ_INVALID);
00353 }
00354 }
00355
00356
00357 if(isTimeToQuit())
00358 {
00359
00360 return(FastQStatus::FASTQ_INVALID);
00361 }
00362
00363
00364 valid &= validateRawSequenceAndPlusLines();
00365
00366 if(myFileProblem)
00367 {
00368 return(FastQStatus::FASTQ_READ_ERROR);
00369 }
00370
00371
00372 if(isTimeToQuit())
00373 {
00374 return(FastQStatus::FASTQ_INVALID);
00375 }
00376
00377
00378 if(ifeof(myFile))
00379 {
00380
00381
00382 myErrorString = "Incomplete Sequence, missing Quality String.";
00383 reportErrorOnLine();
00384 valid = false;
00385 return(FastQStatus::FASTQ_INVALID);
00386 }
00387
00388
00389 valid &= validateQualityStringLines();
00390
00391 if(myFileProblem)
00392 {
00393 return(FastQStatus::FASTQ_READ_ERROR);
00394 }
00395
00396 if(valid)
00397 {
00398 return(FastQStatus::FASTQ_SUCCESS);
00399 }
00400 return(FastQStatus::FASTQ_INVALID);
00401 }
00402
00403
00404
00405 bool FastQFile::validateSequenceIdentifierLine()
00406 {
00407
00408 int readStatus = mySequenceIdLine.ReadLine(myFile);
00409
00410
00411 if(readStatus <= 0)
00412 {
00413
00414 if(ifeof(myFile))
00415 {
00416 return true;
00417 }
00418 myFileProblem = true;
00419 myErrorString = "Failure trying to read sequence identifier line";
00420 reportErrorOnLine();
00421 return false;
00422 }
00423
00424
00425
00426 if((mySequenceIdLine.Length() == 0) && (ifeof(myFile)))
00427 {
00428
00429 return true;
00430 }
00431
00432
00433 myLineNum++;
00434
00435
00436
00437 if(mySequenceIdLine.Length() < 2)
00438 {
00439
00440 myErrorString = "The sequence identifier line was too short.";
00441 reportErrorOnLine();
00442 return false;
00443 }
00444
00445
00446 if(mySequenceIdLine[0] != '@')
00447 {
00448
00449 myErrorString = "First line of a sequence does not begin with @";
00450 reportErrorOnLine();
00451 return false;
00452 }
00453
00454
00455
00456
00457
00458
00459
00460 int endSequenceIdentifier = mySequenceIdLine.FastFindChar(' ', 1);
00461
00462
00463 if(endSequenceIdentifier == -1)
00464 {
00465
00466
00467 mySequenceIdentifier = (mySequenceIdLine.SubStr(1)).c_str();
00468 }
00469 else
00470 {
00471
00472
00473
00474 if(endSequenceIdentifier <= 1)
00475 {
00476 myErrorString =
00477 "No Sequence Identifier specified before the comment.";
00478 reportErrorOnLine();
00479 return false;
00480 }
00481
00482 mySequenceIdentifier =
00483 (mySequenceIdLine.SubStr(1, endSequenceIdentifier - 1)).c_str();
00484 }
00485
00486
00487 if(myCheckSeqID)
00488 {
00489
00490
00491 std::pair<std::map<std::string, unsigned int>::iterator,bool> insertResult;
00492 insertResult =
00493 myIdentifierMap.insert(std::make_pair(mySequenceIdentifier.c_str(),
00494 myLineNum));
00495
00496 if(insertResult.second == false)
00497 {
00498
00499 myErrorString = "Repeated Sequence Identifier: ";
00500 myErrorString += mySequenceIdentifier.c_str();
00501 myErrorString += " at Lines ";
00502 myErrorString += insertResult.first->second;
00503 myErrorString += " and ";
00504 myErrorString += myLineNum;
00505 reportErrorOnLine();
00506 return(false);
00507 }
00508 }
00509
00510
00511 return(true);
00512 }
00513
00514
00515
00516
00517
00518
00519
00520 bool FastQFile::validateRawSequenceAndPlusLines()
00521 {
00522
00523 int readStatus = myRawSequence.ReadLine(myFile);
00524
00525 myLineNum++;
00526
00527 if(readStatus <= 0)
00528 {
00529 myFileProblem = true;
00530 myErrorString = "Failure trying to read sequence line";
00531 reportErrorOnLine();
00532 return false;
00533 }
00534
00535
00536 int offset = 0;
00537
00538
00539 bool valid = validateRawSequence(offset);
00540
00541
00542 offset = myRawSequence.Length();
00543
00544
00545
00546
00547 bool stillRawLine = true;
00548 while(stillRawLine &&
00549 !ifeof(myFile))
00550 {
00551
00552 if(isTimeToQuit())
00553 {
00554 return(false);
00555 }
00556
00557
00558
00559
00560 readStatus = myPlusLine.ReadLine(myFile);
00561 myLineNum++;
00562
00563 if(readStatus <= 0)
00564 {
00565 myFileProblem = true;
00566 myErrorString = "Failure trying to read sequence/plus line";
00567 reportErrorOnLine();
00568 return false;
00569 }
00570
00571
00572 if(myPlusLine.Length() == 0)
00573 {
00574
00575
00576 myErrorString =
00577 "Looking for continuation of Raw Sequence or '+' instead found a blank line, assuming it was part of Raw Sequence.";
00578 reportErrorOnLine();
00579 }
00580
00581 else if(myPlusLine[0] == '+')
00582 {
00583
00584 valid &= validateSequencePlus();
00585 stillRawLine = false;
00586 }
00587 else
00588 {
00589
00590
00591
00592 myRawSequence += myPlusLine;
00593 myPlusLine.SetLength(0);
00594 valid &= validateRawSequence(offset);
00595
00596
00597 offset = myRawSequence.Length();
00598 }
00599 }
00600
00601
00602 if(isTimeToQuit())
00603 {
00604 return(false);
00605 }
00606
00607
00608
00609 if(myRawSequence.Length() < myMinReadLength)
00610 {
00611
00612 myErrorString = "Raw Sequence is shorter than the min read length: ";
00613 myErrorString += myRawSequence.Length();
00614 myErrorString += " < ";
00615 myErrorString += myMinReadLength;
00616 reportErrorOnLine();
00617 valid = false;
00618 }
00619
00620
00621 if(isTimeToQuit())
00622 {
00623 return(false);
00624 }
00625
00626
00627
00628 if(stillRawLine)
00629 {
00630 myErrorString = "Reached the end of the file without a '+' line.";
00631 reportErrorOnLine();
00632 valid = false;
00633 }
00634
00635 return(valid);
00636 }
00637
00638
00639
00640 bool FastQFile::validateQualityStringLines()
00641 {
00642
00643 int readStatus = myQualityString.ReadLine(myFile);
00644 myLineNum++;
00645
00646 if(readStatus <= 0)
00647 {
00648 myFileProblem = true;
00649 myErrorString = "Failure trying to read quality line";
00650 reportErrorOnLine();
00651 return false;
00652 }
00653
00654
00655 int offset = 0;
00656
00657
00658 bool valid = validateQualityString(offset);
00659
00660 offset = myQualityString.Length();
00661
00662
00663
00664 while((myQualityString.Length() < myRawSequence.Length()) &&
00665 (!ifeof(myFile)))
00666 {
00667
00668 if(isTimeToQuit())
00669 {
00670 return(false);
00671 }
00672
00673
00674 readStatus = myTempPartialQuality.ReadLine(myFile);
00675 myLineNum++;
00676
00677 if(readStatus <= 0)
00678 {
00679 myFileProblem = true;
00680 myErrorString = "Failure trying to read quality line";
00681 reportErrorOnLine();
00682 return false;
00683 }
00684
00685 myQualityString += myTempPartialQuality;
00686 myTempPartialQuality.Clear();
00687
00688
00689 valid &= validateQualityString(offset);
00690 offset = myQualityString.Length();
00691 }
00692
00693
00694 if(isTimeToQuit())
00695 {
00696 return(false);
00697 }
00698
00699
00700
00701 if(myQualityString.Length() != myRawSequence.Length())
00702 {
00703 myErrorString = "Quality string length (";
00704 myErrorString += myQualityString.Length();
00705 myErrorString += ") does not equal raw sequence length (";
00706 myErrorString += myRawSequence.Length();
00707 myErrorString += ")";
00708 reportErrorOnLine();
00709 valid = false;
00710 }
00711 return(valid);
00712 }
00713
00714
00715
00716 bool FastQFile::validateRawSequence(int offset)
00717 {
00718 bool validBase = false;
00719 bool valid = true;
00720
00721
00722 for(int sequenceIndex = offset; sequenceIndex < myRawSequence.Length();
00723 sequenceIndex++)
00724 {
00725
00726
00727 validBase =
00728 myBaseComposition.updateComposition(sequenceIndex,
00729 myRawSequence[sequenceIndex]);
00730
00731 if(!validBase)
00732 {
00733
00734 myErrorString = "Invalid character ('";
00735 myErrorString += myRawSequence[sequenceIndex];
00736 myErrorString += "') in base sequence.";
00737 reportErrorOnLine();
00738 valid = false;
00739
00740 if(isTimeToQuit())
00741 {
00742 return(false);
00743 }
00744 }
00745 }
00746 return(valid);
00747 }
00748
00749
00750
00751
00752 bool FastQFile::validateSequencePlus()
00753 {
00754
00755
00756
00757
00758 int lineLength = myPlusLine.Length();
00759
00760
00761
00762
00763 if((lineLength == 1) || (myPlusLine[1] == ' '))
00764 {
00765
00766 return true;
00767 }
00768
00769
00770
00771
00772
00773
00774
00775 int sequenceIdentifierLength = mySequenceIdentifier.Length();
00776 if(lineLength <= sequenceIdentifierLength)
00777 {
00778 myErrorString =
00779 "Sequence Identifier on '+' line does not equal the one on the '@' line.";
00780 reportErrorOnLine();
00781 return false;
00782 }
00783
00784 bool same = true;
00785 int seqIndex = 0;
00786 int lineIndex = 1;
00787
00788
00789
00790
00791 while((same == true) && (seqIndex < sequenceIdentifierLength))
00792 {
00793 if(myPlusLine[lineIndex] != mySequenceIdentifier[seqIndex])
00794 {
00795 myErrorString =
00796 "Sequence Identifier on '+' line does not equal the one on the '@' line.";
00797 reportErrorOnLine();
00798 same = false;
00799 }
00800 lineIndex++;
00801 seqIndex++;
00802 }
00803 return(same);
00804 }
00805
00806
00807
00808 bool FastQFile::validateQualityString(int offset)
00809 {
00810 bool valid = true;
00811 if(myQualityString.Length() > (int)(myQualPerCycle.size()))
00812 {
00813 myQualPerCycle.resize(myQualityString.Length());
00814 myCountPerCycle.resize(myQualityString.Length());
00815 }
00816
00817 for(int i=offset; i < myQualityString.Length(); i++)
00818 {
00819 if(myQualityString[i] <= 32)
00820 {
00821 myErrorString = "Invalid character ('";
00822 myErrorString += myQualityString[i];
00823 myErrorString += "') in quality string.";
00824 reportErrorOnLine();
00825 valid = false;
00826
00827 if(isTimeToQuit())
00828 {
00829 return(false);
00830 }
00831 }
00832 else
00833 {
00834 myQualPerCycle[i] += BaseUtilities::getPhredBaseQuality(myQualityString[i]);
00835 myCountPerCycle[i] += 1;
00836 }
00837 }
00838 return(valid);
00839 }
00840
00841
00842
00843
00844
00845 void FastQFile::reportErrorOnLine()
00846 {
00847
00848 myNumErrors++;
00849
00850
00851 if(myNumErrors <= myNumPrintableErrors)
00852 {
00853
00854 char buffer[100];
00855 sprintf(buffer, "ERROR on Line %u: ", myLineNum);
00856 std::string message = buffer;
00857 message += myErrorString.c_str();
00858 logMessage(message.c_str());
00859 }
00860 }
00861
00862
00863
00864 void FastQFile::reset()
00865 {
00866
00867
00868 resetForEachSequence();
00869 myNumErrors = 0;
00870 myLineNum = 0;
00871 myFileName.SetLength(0);
00872 myIdentifierMap.clear();
00873 myBaseComposition.clear();
00874 myQualPerCycle.clear();
00875 myCountPerCycle.clear();
00876 myFileProblem = false;
00877 }
00878
00879
00880
00881 void FastQFile::resetForEachSequence()
00882 {
00883 myLineBuffer.SetLength(0);
00884 myErrorString.SetLength(0);
00885 myRawSequence.SetLength(0);
00886 mySequenceIdLine.SetLength(0);
00887 mySequenceIdentifier.SetLength(0);
00888 myPlusLine.SetLength(0);
00889 myQualityString.SetLength(0);
00890 myTempPartialQuality.SetLength(0);
00891 }
00892
00893
00894 void FastQFile::logMessage(const char* logMessage)
00895 {
00896
00897 if(!myDisableMessages)
00898 {
00899 std::cout << logMessage << std::endl;
00900 }
00901 }
00902
00903
00904
00905
00906 bool FastQFile::isTimeToQuit()
00907 {
00908
00909
00910 if((myMaxErrors != -1) && (myNumErrors >= myMaxErrors))
00911 {
00912 return(true);
00913 }
00914 return(false);
00915 }
00916
00917
00918 void FastQFile::printAvgQual()
00919 {
00920 std::cout << std::endl << "Average Phred Quality by Read Index (starts at 0):" << std::endl;
00921 std::cout.precision(2);
00922 std::cout << std::fixed << "Read Index\tAverage Quality"
00923 << std::endl;
00924 if(myQualPerCycle.size() != myCountPerCycle.size())
00925 {
00926
00927 std::cerr << "ERROR calculating the average Qualities per cycle\n";
00928 }
00929
00930 double sumQual = 0;
00931 double count = 0;
00932 double avgQual = 0;
00933 for(unsigned int i = 0; i < myQualPerCycle.size(); i++)
00934 {
00935 avgQual = 0;
00936 if(myCountPerCycle[i] != 0)
00937 {
00938 avgQual = myQualPerCycle[i] / (double)(myCountPerCycle[i]);
00939 }
00940 std::cout << i << "\t" << avgQual << "\n";
00941 sumQual += myQualPerCycle[i];
00942 count += myCountPerCycle[i];
00943 }
00944 std::cout << std::endl;
00945 avgQual = 0;
00946 if(count != 0)
00947 {
00948 avgQual = sumQual / count;
00949 }
00950 std::cout << "Overall Average Phred Quality = " << avgQual << std::endl;
00951 }