libStatGen Software
1
|
00001 /* 00002 * Copyright (C) 2010 Regents of the University of Michigan 00003 * 00004 * This program is free software: you can redistribute it and/or modify 00005 * it under the terms of the GNU General Public License as published by 00006 * the Free Software Foundation, either version 3 of the License, or 00007 * (at your option) any later version. 00008 * 00009 * This program is distributed in the hope that it will be useful, 00010 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 * GNU General Public License for more details. 00013 * 00014 * You should have received a copy of the GNU General Public License 00015 * along with this program. If not, see <http://www.gnu.org/licenses/>. 00016 */ 00017 00018 #ifndef _GREEDY_TUPLE_H 00019 #define _GREEDY_TUPLE_H 00020 00021 #include <stdio.h> 00022 #include <stdlib.h> 00023 #include <string.h> 00024 #include <assert.h> 00025 #include <ctype.h> 00026 #include "Generic.h" 00027 #include "CigarRoller.h" 00028 00029 /* 00030 * 00031 00032 TODO: 00033 1. how to efficiently find insertion? 00034 00035 */ 00036 00037 /** 00038 * Weight includes various penalties(e.g. gap open) used in local alignment 00039 */ 00040 struct Weight 00041 { 00042 public: 00043 Weight() 00044 { 00045 gapOpen = gapExtend = -1; // here we do not use affine gap penalty for simlicity. 00046 mismatch = -1; 00047 match= 2; 00048 }; 00049 int gapOpen; 00050 int gapExtend; 00051 int mismatch; 00052 int match; 00053 }; 00054 00055 // 00056 // tuple number is 3, arbitrary number from my guess! 00057 // another reason 00058 // 00059 template <typename QueryType, typename ReferenceType, typename ReferenceIndex> 00060 class GreedyTupleAligner 00061 { 00062 public: 00063 GreedyTupleAligner(Weight& wt): weight(wt) 00064 {/* */} 00065 00066 /** 00067 * Match 'query' to the 'reference' from 'searchStartIndex' up 00068 * to 'searchSize', store matched length to 'matchedLength' 00069 * and number of mismatch to 'mismatch' 00070 * @param query input query 00071 * @param queryLength length of query 00072 * @param reference reference sequence 00073 * @param searchStartIndex the positino where search starts 00074 * @param searchSize the total length in reference sequence that will be examine 00075 * @param matchedLength store how many bases are matched 00076 * @param mismatch store how many bases are mismatched 00077 * @return -1 for unsuccess return 00078 */ 00079 int MatchTuple( 00080 const QueryType query, 00081 const int queryLength, 00082 const ReferenceType reference, 00083 const ReferenceIndex searchStartIndex, 00084 const int searchSize, 00085 int& matchedLength, 00086 int& mismatch) 00087 { 00088 // now use naive search, 00089 // TODO: will incorportate KMP serach later 00090 // TODO: adjust tolerance of mismatches 00091 const int MAX_MISMATCH=2; 00092 int bestPos = 0, bestMismatch = queryLength, bestMatchedLength = 0, bestScore=-1; 00093 00094 #if defined(DEBUG_GREEDY_ALIGNER) 00095 cout << "searchStartIndex == " << searchStartIndex << ", searchSize == " << searchSize << std::endl; 00096 #endif 00097 // here i is the matching position (inclusive) 00098 // j is the matched length 00099 for (int i = 0; i <= searchSize - tupleSize; i++) 00100 { 00101 int j = 0; 00102 mismatch = 0; 00103 while (j < queryLength) 00104 { 00105 if (searchStartIndex + i + j >= reference.getNumberBases()) 00106 break; 00107 if (query[j] != reference[searchStartIndex + i + j]) 00108 { 00109 mismatch++; 00110 if (mismatch >= MAX_MISMATCH) 00111 break; 00112 } 00113 j++; 00114 } 00115 00116 if (j>0 && (j==queryLength)) j--; 00117 00118 while (searchStartIndex +i +j < reference.getNumberBases() 00119 && ((j+1) > mismatch) 00120 && mismatch>0 00121 && query[j] != reference[searchStartIndex + i+j]) 00122 { 00123 // if pattern matching goes beyong the preset mismatch cutoff, 00124 // we will have to go backwards 00125 j--; 00126 mismatch--; 00127 } 00128 00129 int score = j - mismatch; 00130 00131 if (score > bestScore) 00132 { 00133 bestPos = i; 00134 bestScore = score; 00135 bestMismatch = mismatch; 00136 bestMatchedLength = j+1; 00137 } 00138 } 00139 00140 if (bestScore > 0) 00141 { 00142 mismatch = bestMismatch; 00143 matchedLength = bestMatchedLength; 00144 return bestPos; 00145 } 00146 return -1; 00147 } 00148 00149 /** 00150 * Core local alignment algorithm 00151 * @param query input query 00152 * @param queryLength length of query 00153 * @param reference reference genome 00154 * @param searchStartIndex matching starts here 00155 * @param searchSize how far we will search 00156 * @param cigarRoller store alignment results here 00157 * @param matchPosition store match position 00158 */ 00159 void Align( 00160 QueryType query, 00161 int queryLength, 00162 ReferenceType reference, 00163 ReferenceIndex searchStartIndex, 00164 int searchSize, 00165 CigarRoller& cigarRoller, 00166 ReferenceIndex& matchPosition) 00167 { 00168 // Algorithm: 00169 // finished align? (should we try different align position?) 00170 // if not, try next tuple 00171 // is the tuple aligned? 00172 // yes, extend to previous, mark unmatched part mismatch or gap 00173 // extend to next matched part 00174 int r1 = 0; // a start index: reference starting from r1 (inclusive) will be used 00175 int queryMatchCount = 0; // query matched # of bases 00176 int q1 = 0; // to align 00177 int pos = -1; 00178 int lastR1 = -1; // index: record last 00179 00180 cigarRoller.clear(); 00181 matchPosition = -1; 00182 00183 while (queryMatchCount < queryLength) 00184 { 00185 if (r1 == searchSize - 1) // touched ref right boundary 00186 { 00187 cigarRoller.Add(CigarRoller::softClip, queryLength-queryMatchCount); 00188 break; 00189 } 00190 if (queryLength - q1 < tupleSize) 00191 { 00192 // XXX this needs to do something more sane 00193 // printf("some bases left!\n"); 00194 // a simple fix: treat all left-over bases as mismatches/matches 00195 cigarRoller.Add(CigarRoller::mismatch, queryLength - queryMatchCount); 00196 break; 00197 } 00198 int mismatch = 0; 00199 int matchedLen = 0; 00200 if ((pos = MatchTuple(query+q1, queryLength-q1, reference, searchStartIndex + r1, searchSize - r1, matchedLen, mismatch)) // found match position for tuple 00201 00202 >= 0) 00203 { 00204 // found match position for tuple 00205 00206 if (lastR1<0) 00207 matchPosition = pos; 00208 00209 // 00210 // deal with left 00211 // 00212 if (lastR1>=0) // have previously aligned part of the query to the reference genome yet 00213 { 00214 if (pos > 0) 00215 { 00216 cigarRoller.Add(CigarRoller::del, pos); 00217 } 00218 } 00219 else 00220 { 00221 lastR1 = pos; 00222 } 00223 00224 r1 += pos; 00225 r1 += matchedLen; 00226 q1 += matchedLen; 00227 00228 // 00229 // deal with right 00230 // 00231 cigarRoller.Add(CigarRoller::match, matchedLen); 00232 queryMatchCount = q1; 00233 lastR1 = r1; 00234 00235 continue; 00236 } // end if 00237 00238 // 00239 // try insertion 00240 // maximum insert ? say 2 00241 // 00242 for (int i = 1; i < queryLength - q1 - tupleSize; i++) 00243 { 00244 int mismatch = 0; 00245 int matchedLen = 0; 00246 // check reference genome broundary 00247 if (searchStartIndex + r1 >= reference.getNumberBases()) 00248 return; 00249 if ((pos = MatchTuple(query+q1 + i , 00250 queryLength - q1 -i , 00251 reference, 00252 searchStartIndex + r1, 00253 searchSize - r1, 00254 matchedLen, 00255 mismatch)) // found match position for tuple 00256 >= 0) 00257 { 00258 if (matchPosition < 0) 00259 matchPosition = pos + q1 + i ; 00260 } 00261 queryMatchCount += i; 00262 q1 += i; 00263 cigarRoller.Add(CigarRoller::insert, i); 00264 00265 lastR1 = r1 + pos; 00266 r1 += pos + tupleSize; 00267 q1 += tupleSize; 00268 00269 // deal with right 00270 while (searchStartIndex + r1 < reference.getNumberBases() 00271 && query[q1]==reference[searchStartIndex + r1] 00272 && q1 < queryLength) 00273 { 00274 r1++; 00275 q1++; 00276 } 00277 if (q1 < queryLength) 00278 { 00279 cigarRoller.Add(CigarRoller::match, q1 - queryMatchCount); 00280 queryMatchCount = q1; 00281 } 00282 else 00283 { 00284 cigarRoller.Add(CigarRoller::match, queryLength - queryMatchCount); 00285 queryMatchCount = queryLength ; 00286 break ; 00287 } 00288 } 00289 00290 r1++; 00291 q1++; 00292 00293 // try next 00294 } // end while (queryMatchCount < queryLength) 00295 } 00296 private: 00297 static const int tupleSize = 3; 00298 Weight weight; 00299 }; 00300 00301 00302 #endif