libStatGen Software  1
ReferenceSequence.cpp
00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #include "assert.h"
00019 #include "ctype.h"
00020 #include "stdio.h"
00021 #include "Error.h"
00022 
00023 
00024 #include "Generic.h"
00025 #include "ReferenceSequence.h"
00026 
00027 #include <algorithm>
00028 #include <istream>
00029 #include <fstream>
00030 #include <sstream>
00031 #include <stdexcept>
00032 
00033 //
00034 // Given a buffer with a fasta format contents, count
00035 // the number of chromsomes in it and return that value.
00036 //
00037 bool getFastaStats(const char *fastaData, size_t fastaDataSize, uint32_t &chromosomeCount, uint64_t &baseCount)
00038 {
00039     chromosomeCount = 0;
00040     baseCount = 0;
00041     bool atLineStart = true;
00042 
00043     //
00044     // loop over the fasta file, essentially matching for the
00045     // pattern '^>.*$' and counting them.
00046     //
00047     for (size_t fastaIndex = 0; fastaIndex < fastaDataSize; fastaIndex++)
00048     {
00049         switch (fastaData[fastaIndex])
00050         {
00051             case '\n':
00052             case '\r':
00053                 atLineStart = true;
00054                 break;
00055             case '>':
00056             {
00057                 if (!atLineStart) break;
00058                 chromosomeCount++;
00059                 //
00060                 // eat the rest of the line
00061                 //
00062                 while (fastaIndex < fastaDataSize && fastaData[fastaIndex]!='\n' && fastaData[fastaIndex]!='\r')
00063                 {
00064                     fastaIndex++;
00065                 }
00066                 break;
00067             }
00068             default:
00069                 baseCount++;
00070                 atLineStart = false;
00071                 break;
00072         }
00073 
00074     }
00075     return false;
00076 }
00077 
00078 #if 0
00079 // turn this into a template on read/quality/etc...
00080 int GenomeSequence::debugPrintReadValidation(
00081     std::string &read,
00082     std::string &quality,
00083     char   direction,
00084     genomeIndex_t   readLocation,
00085     int sumQuality,
00086     int mismatchCount,
00087     bool recurse
00088 ) 
00089 {
00090     int validateSumQ = 0;
00091     int validateMismatchCount = 0;
00092     int rc = 0;
00093     std::string genomeData;
00094 
00095     for (uint32_t i=0; i<read.size(); i++)
00096     {
00097         if (tolower(read[i]) != tolower((*this)[readLocation + i]))
00098         {
00099             validateSumQ += quality[i] - '!';
00100             // XXX no longer valid:
00101             if (direction=='F' ? i<24 : (i >= (read.size() - 24))) validateMismatchCount++;
00102             genomeData.push_back(tolower((*this)[readLocation + i]));
00103         }
00104         else
00105         {
00106             genomeData.push_back(toupper((*this)[readLocation + i]));
00107         }
00108     }
00109     assert(validateSumQ>=0);
00110     if (validateSumQ != sumQuality && validateMismatchCount == mismatchCount)
00111     {
00112         printf("SUMQ: Original Genome: %s  test read: %s : actual sumQ = %d, test sumQ = %d\n",
00113                genomeData.c_str(),
00114                read.c_str(),
00115                validateSumQ,
00116                sumQuality
00117               );
00118         rc++;
00119     }
00120     else if (validateSumQ == sumQuality && validateMismatchCount != mismatchCount)
00121     {
00122         printf("MISM: Original Genome: %s  test read: %s : actual mismatch %d test mismatches %d\n",
00123                genomeData.c_str(),
00124                read.c_str(),
00125                validateMismatchCount,
00126                mismatchCount
00127               );
00128         rc++;
00129     }
00130     else if (validateSumQ != sumQuality && validateMismatchCount != mismatchCount)
00131     {
00132         printf("BOTH: Original Genome: %s  test read: %s : actual sumQ = %d, test sumQ = %d, actual mismatch %d test mismatches %d\n",
00133                genomeData.c_str(),
00134                read.c_str(),
00135                validateSumQ,
00136                sumQuality,
00137                validateMismatchCount,
00138                mismatchCount
00139               );
00140         rc++;
00141     }
00142 
00143     if (recurse && abs(validateMismatchCount - mismatchCount) > (int) read.size()/2)
00144     {
00145         printf("large mismatch difference, trying reverse strand: ");
00146         std::string reverseRead = read;
00147         std::string reverseQuality = quality;
00148         getReverseRead(reverseRead);
00149         reverse(reverseQuality.begin(), reverseQuality.end());
00150         rc = debugPrintReadValidation(reverseRead, reverseQuality, readLocation, sumQuality, mismatchCount, false);
00151     }
00152     return rc;
00153 }
00154 #endif
00155 
00156 
00157 
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends