ReferenceSequence.cpp
00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018 #include "assert.h"
00019 #include "ctype.h"
00020 #include "stdio.h"
00021 #include "Error.h"
00022
00023
00024 #include "Generic.h"
00025 #include "ReferenceSequence.h"
00026
00027 #include <algorithm>
00028 #include <istream>
00029 #include <fstream>
00030 #include <sstream>
00031 #include <stdexcept>
00032
00033
00034
00035
00036
00037 bool getFastaStats(const char *fastaData, size_t fastaDataSize, uint32_t &chromosomeCount, uint64_t &baseCount)
00038 {
00039 chromosomeCount = 0;
00040 baseCount = 0;
00041 bool atLineStart = true;
00042
00043
00044
00045
00046
00047 for (size_t fastaIndex = 0; fastaIndex < fastaDataSize; fastaIndex++)
00048 {
00049 switch (fastaData[fastaIndex])
00050 {
00051 case '\n':
00052 case '\r':
00053 atLineStart = true;
00054 break;
00055 case '>':
00056 {
00057 if (!atLineStart) break;
00058 chromosomeCount++;
00059
00060
00061
00062 while (fastaIndex < fastaDataSize && fastaData[fastaIndex]!='\n' && fastaData[fastaIndex]!='\r')
00063 {
00064 fastaIndex++;
00065 }
00066 break;
00067 }
00068 default:
00069 baseCount++;
00070 atLineStart = false;
00071 break;
00072 }
00073
00074 }
00075 return false;
00076 }
00077
00078 #if 0
00079
00080 int GenomeSequence::debugPrintReadValidation(
00081 std::string &read,
00082 std::string &quality,
00083 char direction,
00084 genomeIndex_t readLocation,
00085 int sumQuality,
00086 int mismatchCount,
00087 bool recurse
00088 )
00089 {
00090 int validateSumQ = 0;
00091 int validateMismatchCount = 0;
00092 int rc = 0;
00093 std::string genomeData;
00094
00095 for (uint32_t i=0; i<read.size(); i++)
00096 {
00097 if (tolower(read[i]) != tolower((*this)[readLocation + i]))
00098 {
00099 validateSumQ += quality[i] - '!';
00100
00101 if (direction=='F' ? i<24 : (i >= (read.size() - 24))) validateMismatchCount++;
00102 genomeData.push_back(tolower((*this)[readLocation + i]));
00103 }
00104 else
00105 {
00106 genomeData.push_back(toupper((*this)[readLocation + i]));
00107 }
00108 }
00109 assert(validateSumQ>=0);
00110 if (validateSumQ != sumQuality && validateMismatchCount == mismatchCount)
00111 {
00112 printf("SUMQ: Original Genome: %s test read: %s : actual sumQ = %d, test sumQ = %d\n",
00113 genomeData.c_str(),
00114 read.c_str(),
00115 validateSumQ,
00116 sumQuality
00117 );
00118 rc++;
00119 }
00120 else if (validateSumQ == sumQuality && validateMismatchCount != mismatchCount)
00121 {
00122 printf("MISM: Original Genome: %s test read: %s : actual mismatch %d test mismatches %d\n",
00123 genomeData.c_str(),
00124 read.c_str(),
00125 validateMismatchCount,
00126 mismatchCount
00127 );
00128 rc++;
00129 }
00130 else if (validateSumQ != sumQuality && validateMismatchCount != mismatchCount)
00131 {
00132 printf("BOTH: Original Genome: %s test read: %s : actual sumQ = %d, test sumQ = %d, actual mismatch %d test mismatches %d\n",
00133 genomeData.c_str(),
00134 read.c_str(),
00135 validateSumQ,
00136 sumQuality,
00137 validateMismatchCount,
00138 mismatchCount
00139 );
00140 rc++;
00141 }
00142
00143 if (recurse && abs(validateMismatchCount - mismatchCount) > (int) read.size()/2)
00144 {
00145 printf("large mismatch difference, trying reverse strand: ");
00146 std::string reverseRead = read;
00147 std::string reverseQuality = quality;
00148 getReverseRead(reverseRead);
00149 reverse(reverseQuality.begin(), reverseQuality.end());
00150 rc = debugPrintReadValidation(reverseRead, reverseQuality, readLocation, sumQuality, mismatchCount, false);
00151 }
00152 return rc;
00153 }
00154 #endif
00155
00156
00157