libStatGen Software
1
|
00001 /* 00002 * Copyright (C) 2010 Regents of the University of Michigan 00003 * 00004 * This program is free software: you can redistribute it and/or modify 00005 * it under the terms of the GNU General Public License as published by 00006 * the Free Software Foundation, either version 3 of the License, or 00007 * (at your option) any later version. 00008 * 00009 * This program is distributed in the hope that it will be useful, 00010 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 * GNU General Public License for more details. 00013 * 00014 * You should have received a copy of the GNU General Public License 00015 * along with this program. If not, see <http://www.gnu.org/licenses/>. 00016 */ 00017 00018 #include "assert.h" 00019 #include "ctype.h" 00020 #include "stdio.h" 00021 #include "Error.h" 00022 00023 00024 #include "Generic.h" 00025 #include "ReferenceSequence.h" 00026 00027 #include <algorithm> 00028 #include <istream> 00029 #include <fstream> 00030 #include <sstream> 00031 #include <stdexcept> 00032 00033 // 00034 // Given a buffer with a fasta format contents, count 00035 // the number of chromsomes in it and return that value. 00036 // 00037 bool getFastaStats(const char *fastaData, size_t fastaDataSize, uint32_t &chromosomeCount, uint64_t &baseCount) 00038 { 00039 chromosomeCount = 0; 00040 baseCount = 0; 00041 bool atLineStart = true; 00042 00043 // 00044 // loop over the fasta file, essentially matching for the 00045 // pattern '^>.*$' and counting them. 00046 // 00047 for (size_t fastaIndex = 0; fastaIndex < fastaDataSize; fastaIndex++) 00048 { 00049 switch (fastaData[fastaIndex]) 00050 { 00051 case '\n': 00052 case '\r': 00053 atLineStart = true; 00054 break; 00055 case '>': 00056 { 00057 if (!atLineStart) break; 00058 chromosomeCount++; 00059 // 00060 // eat the rest of the line 00061 // 00062 while (fastaIndex < fastaDataSize && fastaData[fastaIndex]!='\n' && fastaData[fastaIndex]!='\r') 00063 { 00064 fastaIndex++; 00065 } 00066 break; 00067 } 00068 default: 00069 baseCount++; 00070 atLineStart = false; 00071 break; 00072 } 00073 00074 } 00075 return false; 00076 } 00077 00078 #if 0 00079 // turn this into a template on read/quality/etc... 00080 int GenomeSequence::debugPrintReadValidation( 00081 std::string &read, 00082 std::string &quality, 00083 char direction, 00084 genomeIndex_t readLocation, 00085 int sumQuality, 00086 int mismatchCount, 00087 bool recurse 00088 ) 00089 { 00090 int validateSumQ = 0; 00091 int validateMismatchCount = 0; 00092 int rc = 0; 00093 std::string genomeData; 00094 00095 for (uint32_t i=0; i<read.size(); i++) 00096 { 00097 if (tolower(read[i]) != tolower((*this)[readLocation + i])) 00098 { 00099 validateSumQ += quality[i] - '!'; 00100 // XXX no longer valid: 00101 if (direction=='F' ? i<24 : (i >= (read.size() - 24))) validateMismatchCount++; 00102 genomeData.push_back(tolower((*this)[readLocation + i])); 00103 } 00104 else 00105 { 00106 genomeData.push_back(toupper((*this)[readLocation + i])); 00107 } 00108 } 00109 assert(validateSumQ>=0); 00110 if (validateSumQ != sumQuality && validateMismatchCount == mismatchCount) 00111 { 00112 printf("SUMQ: Original Genome: %s test read: %s : actual sumQ = %d, test sumQ = %d\n", 00113 genomeData.c_str(), 00114 read.c_str(), 00115 validateSumQ, 00116 sumQuality 00117 ); 00118 rc++; 00119 } 00120 else if (validateSumQ == sumQuality && validateMismatchCount != mismatchCount) 00121 { 00122 printf("MISM: Original Genome: %s test read: %s : actual mismatch %d test mismatches %d\n", 00123 genomeData.c_str(), 00124 read.c_str(), 00125 validateMismatchCount, 00126 mismatchCount 00127 ); 00128 rc++; 00129 } 00130 else if (validateSumQ != sumQuality && validateMismatchCount != mismatchCount) 00131 { 00132 printf("BOTH: Original Genome: %s test read: %s : actual sumQ = %d, test sumQ = %d, actual mismatch %d test mismatches %d\n", 00133 genomeData.c_str(), 00134 read.c_str(), 00135 validateSumQ, 00136 sumQuality, 00137 validateMismatchCount, 00138 mismatchCount 00139 ); 00140 rc++; 00141 } 00142 00143 if (recurse && abs(validateMismatchCount - mismatchCount) > (int) read.size()/2) 00144 { 00145 printf("large mismatch difference, trying reverse strand: "); 00146 std::string reverseRead = read; 00147 std::string reverseQuality = quality; 00148 getReverseRead(reverseRead); 00149 reverse(reverseQuality.begin(), reverseQuality.end()); 00150 rc = debugPrintReadValidation(reverseRead, reverseQuality, readLocation, sumQuality, mismatchCount, false); 00151 } 00152 return rc; 00153 } 00154 #endif 00155 00156 00157