00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018 #if !defined(_SMITH_WATERMAN_)
00019 #define _SMITH_WATERMAN_
00020
00021 #include <string.h>
00022 #include <limits.h>
00023 #include <stdint.h>
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043
00044
00045
00046
00047
00048
00049
00050
00051
00052
00053
00054
00055
00056
00057
00058
00059
00060
00061
00062
00063
00064
00065
00066
00067
00068
00069
00070
00071
00072
00073
00074
00075
00076
00077
00078
00079 #include <assert.h>
00080 #include <stdio.h>
00081 #include <stdlib.h>
00082 #include <string>
00083 #include <iostream>
00084 #include <iomanip>
00085 #include <utility>
00086 #include <vector>
00087
00088 #include "CigarRoller.h"
00089 #include "Generic.h"
00090
00091 using std::cout;
00092 using std::cin;
00093 using std::cerr;
00094 using std::setw;
00095 using std::endl;
00096 using std::pair;
00097 using std::vector;
00098
00099 #if !defined(MAX)
00100 #define MAX(x,y) ((x)>(y) ? (x) : (y))
00101 #endif
00102 #if !defined(MIN)
00103 #define MIN(x,y) ((x)<(y) ? (x) : (y))
00104 #endif
00105
00106
00107
00108
00109
00110
00111
00112
00113
00114
00115
00116
00117
00118
00119
00120
00121
00122
00123
00124
00125
00126 template <int maxReadLengthH, int maxReferenceLengthH, typename HCellType, typename Atype, typename Btype, typename QualityType, typename readIndexType, typename referenceIndexType>
00127 class SmithWaterman
00128 {
00129 public:
00130
00131
00132
00133
00134
00135
00136
00137
00138
00139 struct weight
00140 {
00141 weight()
00142 {
00143 match=2;
00144 misMatch=-1;
00145 insert=-1;
00146 del=-1;
00147 };
00148
00149 int match;
00150 int misMatch;
00151 int insert;
00152 int del;
00153 };
00154
00155 HCellType H[maxReadLengthH][maxReferenceLengthH];
00156 Atype *A;
00157 Btype *B;
00158 QualityType *qualities;
00159
00160 int m,n;
00161 readIndexType MOffset;
00162 referenceIndexType NOffset;
00163 weight w;
00164 int allowedInsertDelete;
00165 int direction;
00166 int gapOpenCount;
00167 int gapCloseCount;
00168 int gapExtendCount;
00169 vector<pair<int,int> > alignment;
00170 void clearAlignment()
00171 {
00172 alignment.clear();
00173 }
00174
00175 HCellType maxCostValue;
00176 pair<int,int> maxCostPosition;
00177
00178
00179
00180
00181
00182
00183
00184
00185 void clear()
00186 {
00187 maxCostPosition.first = 0;
00188 maxCostPosition.second = 0;
00189 A = NULL;
00190 B = NULL;
00191 qualities = NULL;
00192 m = 0;
00193 n = 0;
00194 MOffset = 0;
00195 NOffset = 0;
00196 allowedInsertDelete = 0;
00197 direction = 0;
00198 gapOpenCount = 0;
00199 gapCloseCount = 0;
00200 gapExtendCount = 0;
00201 }
00202
00203
00204 SmithWaterman()
00205 {
00206 clear();
00207 clearH();
00208 }
00209
00210
00211 SmithWaterman(
00212 Atype *A,
00213 QualityType *qualities,
00214 Btype *B,
00215 int m,
00216 int n,
00217 int allowedInsertDelete = INT_MAX,
00218 int direction = 1,
00219 readIndexType MOffset = 0,
00220 referenceIndexType NOffset = 0):
00221 A(A),
00222 qualities(qualities),
00223 B(B),
00224 m(m),
00225 n(n),
00226 allowedInsertDelete(allowedInsertDelete),
00227 direction(direction),
00228 MOffset(MOffset),
00229 NOffset(NOffset),
00230 maxCostValue((HCellType) 0)
00231 {
00232 }
00233
00234 void setRead(Atype *A)
00235 {
00236 this->A = A;
00237 }
00238 void setReadQuality(QualityType *qualities)
00239 {
00240 this->qualities = qualities;
00241 }
00242 void setReference(Btype *B)
00243 {
00244 this->B = B;
00245 }
00246
00247
00248
00249
00250 void setReadLength(int m)
00251 {
00252 this->m = m;
00253 }
00254 void setReadOffset(readIndexType MOffset)
00255 {
00256 this->MOffset = MOffset;
00257 }
00258
00259
00260
00261
00262
00263 void setReferenceLength(int n)
00264 {
00265 this->n = n;
00266 }
00267 void setReferenceOffset(referenceIndexType NOffset)
00268 {
00269 this->NOffset = NOffset;
00270 }
00271
00272
00273
00274
00275
00276
00277
00278
00279
00280
00281 void setAllowedInsertDelete(int allowedInsertDelete = INT_MAX)
00282 {
00283 this->allowedInsertDelete = allowedInsertDelete;
00284 }
00285
00286
00287
00288
00289
00290 void setDirection(int direction)
00291 {
00292 this->direction = direction;
00293 }
00294
00295 void clearH()
00296 {
00297 memset(H, 0, sizeof(H));
00298 }
00299
00300 void populateH()
00301 {
00302
00303 maxCostValue = 0;
00304
00305 for (int i=1; i<=m ; i++)
00306 {
00307
00308
00309 int low = MAX(1, i - allowedInsertDelete);
00310 int high = MIN(n, i + allowedInsertDelete);
00311
00312 for (int j=low; j<=high ; j++)
00313 {
00314 HCellType c;
00315 c = 0;
00316 if (direction>0) c = MAX(c, H[i-1][j-1] + (((*A)[MOffset + i-1]==(*B)[NOffset + j-1]) ? w.match : w.misMatch));
00317 else c = MAX(c, H[i-1][j-1] + (((*A)[MOffset + m-i+0]==(*B)[NOffset + n-j+0]) ? w.match : w.misMatch));
00318 c = MAX(c, H[i-1][j] + w.del);
00319 c = MAX(c, H[i][j-1] + w.insert);
00320 H[i][j] = c;
00321 if (c>maxCostValue)
00322 {
00323 maxCostValue = c;
00324 maxCostPosition.first = i;
00325 maxCostPosition.second = j;
00326 }
00327 }
00328 }
00329 }
00330
00331
00332
00333
00334 void printH(bool prettyPrint = true)
00335 {
00336
00337 for (int i=-1; i<=m ; i++)
00338 {
00339 for (int j=-1; j<=n ; j++)
00340 {
00341 if (prettyPrint) cout << setw(3);
00342 if (i==-1 && j==-1)
00343 {
00344 if (prettyPrint) cout << " ";
00345 else cout << "\t";
00346 }
00347 else if (j==-1)
00348 {
00349 if (!prettyPrint) cout << "\t";
00350 if (i==0) cout << "-";
00351 else cout << (*A)[MOffset + direction>0 ? i-1 : m - i];
00352 }
00353 else if (i==-1)
00354 {
00355 if (!prettyPrint) cout << "\t";
00356 if (j==0) cout << "-";
00357 else cout << (*B)[NOffset + direction>0 ? j-1 : n - j];
00358 }
00359 else
00360 {
00361 if (!prettyPrint) cout << "\t";
00362 cout << H[i][j];
00363 }
00364 }
00365 cout << endl;
00366 }
00367 }
00368
00369 void debugPrint(bool doPrintH = true)
00370 {
00371 if (doPrintH) printH();
00372 cout << "maxCostPosition = " << maxCostPosition << std::endl;
00373 if (alignment.empty()) cout << "alignment vector is empty.\n";
00374 else
00375 {
00376 cout << "alignment vector:\n";
00377 for (vector<pair<int,int> >::iterator i=alignment.begin(); i < alignment.end(); i++)
00378 {
00379 cout << (i - alignment.begin()) << ": " << *i << "\n";
00380 }
00381 }
00382 cout << std::endl;
00383 }
00384
00385
00386
00387
00388
00389 void populateAlignment()
00390 {
00391 alignment.clear();
00392 int i = m, j = n;
00393
00394 i = maxCostPosition.first;
00395 j = maxCostPosition.second;
00396
00397
00398
00399
00400
00401
00402
00403 while (H[i][j] > 0 || (i>0 && j>0))
00404 {
00405
00406 #if defined(DEBUG_ALIGNMENT_VECTOR)
00407 cout << "alignment.push_back(" << i << ", " << j << ")" << endl;
00408 #endif
00409 alignment.push_back(pair<int,int>(i,j));
00410 if (H[i-1][j-1]>=H[i-1][j] && H[i-1][j-1]>=H[i][j-1])
00411 {
00412
00413 i--;
00414 j--;
00415 }
00416 else if (H[i-1][j] < H[i][j-1])
00417 {
00418
00419 j--;
00420 }
00421 else
00422 {
00423
00424 i--;
00425 }
00426 }
00427 alignment.push_back(pair<int,int>(i,j));
00428 #if defined(DEBUG_ALIGNMENT_VECTOR)
00429 cout << "alignment.push_back(" << i << ", " << j << ")" << endl;
00430 cout << "alignment.size(): " << alignment.size() << endl;
00431 #endif
00432 }
00433
00434
00435
00436
00437
00438
00439
00440
00441
00442
00443
00444
00445
00446
00447
00448 int getSumQ()
00449 {
00450 if (direction>0) return getSumQForward();
00451 else return getSumQBackward();
00452 }
00453
00454 int getSumQForward()
00455 {
00456 int sumQ = 0;
00457 vector<pair<int,int> >::reverse_iterator i;
00458
00459 for (i=alignment.rbegin(); i < alignment.rend() - 1; i++)
00460 {
00461
00462 #if defined(DEBUG_GETSUMQ)
00463 cout << *i << ": ";
00464 #endif
00465 if ((*(i+1)).first == ((*i).first+1) && (*(i+1)).second == ((*i).second + 1))
00466 {
00467
00468 #if defined(DEBUG_GETSUMQ)
00469 cout << "Match/Mismatch";
00470 #endif
00471 if ((*A)[MOffset + (*i).first] != (*B)[NOffset + (*i).second])
00472 sumQ += (*qualities)[MOffset + (*i).first] - '!';
00473 }
00474 else if ((*(i+1)).first == ((*i).first+1) && (*(i+1)).second == ((*i).second))
00475 {
00476
00477 #if defined(DEBUG_GETSUMQ)
00478 cout << "Insert";
00479 #endif
00480 sumQ += 50;
00481 }
00482 else if ((*(i+1)).first == ((*i).first) && (*(i+1)).second == ((*i).second + 1))
00483 {
00484
00485 #if defined(DEBUG_GETSUMQ)
00486 cout << "Delete";
00487 #endif
00488 sumQ += 50;
00489 }
00490 }
00491 #if defined(DEBUG_GETSUMQ)
00492 cout << endl;
00493 #endif
00494 return sumQ;
00495 }
00496
00497 int getSumQBackward()
00498 {
00499 int sumQ = 0;
00500 vector<pair<int,int> >::iterator i;
00501
00502 for (i=alignment.begin(); i < alignment.end() - 1; i++)
00503 {
00504 #if defined(DEBUG_GETSUMQ)
00505 cout << *i << ": ";
00506 #endif
00507 if ((*(i+1)).first == ((*i).first-1) && (*(i+1)).second == ((*i).second - 1))
00508 {
00509
00510 #if defined(DEBUG_GETSUMQ)
00511 cout << "Match/Mismatch";
00512 #endif
00513 if ((*A)[MOffset + m - (*i).first] != (*B)[NOffset + n - (*i).second])
00514 sumQ += (*qualities)[MOffset + m - (*i).first] - '!';
00515 }
00516 else if ((*(i+1)).first == ((*i).first-1) && (*(i+1)).second == ((*i).second))
00517 {
00518
00519 #if defined(DEBUG_GETSUMQ)
00520 cout << "Insert?";
00521 #endif
00522 sumQ += 50;
00523 }
00524 else if ((*(i+1)).first == ((*i).first) && (*(i+1)).second == ((*i).second - 1))
00525 {
00526
00527 #if defined(DEBUG_GETSUMQ)
00528 cout << "Delete?";
00529 #endif
00530 sumQ += 50;
00531 }
00532 }
00533 #if defined(DEBUG_GETSUMQ)
00534 cout << endl;
00535 #endif
00536 return sumQ;
00537 }
00538
00539 #if 0
00540 int getSumQ()
00541 {
00542 vector<pair<int,int> >::reverse_iterator i;
00543 int sumQ = 0;
00544 for (i=alignment.rbegin(); i < alignment.rend() - 1; i++)
00545 {
00546 #if defined(DEBUG_ALIGNMENT_VECTOR)
00547 cout << "i: " << i - alignment.rbegin() << *i << endl;
00548 #endif
00549
00550 if (direction>0)
00551 {
00552 if ((*A)[MOffset + (*i).first] != (*B)[NOffset + (*i).second])
00553 sumQ += (*qualities)[MOffset + (*i).first] - '!';
00554 }
00555 else
00556 {
00557
00558 if ((*A)[MOffset + m - (*i).first] != (*B)[NOffset + n - (*i).second])
00559 sumQ += (*qualities)[MOffset + m - (*i).first] - '!';
00560 }
00561 }
00562 return sumQ;
00563 }
00564 #endif
00565
00566
00567
00568
00569
00570
00571
00572
00573
00574 void rollCigarForward(CigarRoller &cigar)
00575 {
00576 vector<pair<int,int> >::reverse_iterator i;
00577
00578 for (i=alignment.rbegin(); i < alignment.rend() - 1; i++)
00579 {
00580
00581 #if defined(DEBUG_CIGAR)
00582 cout << *i << ": ";
00583 #endif
00584 if ((*(i+1)).first == ((*i).first+1) && (*(i+1)).second == ((*i).second + 1))
00585 {
00586
00587 #if defined(DEBUG_CIGAR)
00588 cout << "Match/Mismatch";
00589 #endif
00590 cigar.Add(CigarRoller::match, 1);
00591 }
00592 else if ((*(i+1)).first == ((*i).first+1) && (*(i+1)).second == ((*i).second))
00593 {
00594
00595 #if defined(DEBUG_CIGAR)
00596 cout << "Insert";
00597 #endif
00598 cigar.Add(CigarRoller::insert, 1);
00599 }
00600 else if ((*(i+1)).first == ((*i).first) && (*(i+1)).second == ((*i).second + 1))
00601 {
00602
00603 #if defined(DEBUG_CIGAR)
00604 cout << "Delete";
00605 #endif
00606 cigar.Add(CigarRoller::del, 1);
00607 }
00608 }
00609
00610
00611 cigar.Add(CigarRoller::softClip, getSoftClipCount());
00612 #if defined(DEBUG_CIGAR)
00613 cout << endl;
00614 #endif
00615 }
00616
00617
00618
00619
00620
00621
00622
00623
00624
00625 void rollCigarBackward(CigarRoller &cigar)
00626 {
00627 vector<pair<int,int> >::iterator i;
00628
00629
00630
00631 cigar.Add(CigarRoller::softClip, getSoftClipCount());
00632
00633 i = alignment.begin();
00634
00635 for (i=alignment.begin();
00636 i < alignment.end() - 1;
00637 i++)
00638 {
00639 #if defined(DEBUG_CIGAR)
00640 cout << *i << ": ";
00641 #endif
00642 if ((*(i+1)).first == ((*i).first-1) && (*(i+1)).second == ((*i).second - 1))
00643 {
00644
00645 #if defined(DEBUG_CIGAR)
00646 cout << "Match/Mismatch";
00647 #endif
00648 cigar.Add(CigarRoller::match, 1);
00649 }
00650 else if ((*(i+1)).first == ((*i).first-1) && (*(i+1)).second == ((*i).second))
00651 {
00652
00653 #if defined(DEBUG_CIGAR)
00654 cout << "Insert?";
00655 #endif
00656 cigar.Add(CigarRoller::insert, 1);
00657 }
00658 else if ((*(i+1)).first == ((*i).first) && (*(i+1)).second == ((*i).second - 1))
00659 {
00660
00661 #if defined(DEBUG_CIGAR)
00662 cout << "Delete?";
00663 #endif
00664 cigar.Add(CigarRoller::del, 1);
00665 }
00666 }
00667 #if defined(DEBUG_CIGAR)
00668 cout << endl;
00669 #endif
00670 }
00671
00672
00673
00674
00675
00676
00677
00678
00679
00680 int getSoftClipCount()
00681 {
00682 if (direction>0)
00683 {
00684
00685 return m - maxCostPosition.first;
00686 }
00687 else
00688 {
00689
00690
00691
00692 return m - maxCostPosition.first;
00693 }
00694 }
00695
00696 void rollCigar(CigarRoller &cigar)
00697 {
00698 if (direction>0) rollCigarForward(cigar);
00699 else rollCigarBackward(cigar);
00700 }
00701
00702
00703
00704
00705
00706
00707
00708
00709
00710
00711
00712
00713 bool localAlignment(
00714 uint32_t bandSize,
00715 Atype &read,
00716 readIndexType readLength,
00717 QualityType &quality,
00718 Btype &reference,
00719 referenceIndexType referenceLength,
00720 referenceIndexType referenceOffset,
00721 CigarRoller &cigarRoller,
00722 uint32_t &softClipCount,
00723 referenceIndexType &cigarStartingPoint,
00724 int &sumQ
00725 )
00726 {
00727
00728 clear();
00729
00730 cigarRoller.clear();
00731
00732 setDirection(+1);
00733 setAllowedInsertDelete(bandSize);
00734
00735 setRead(&read);
00736 setReadOffset(0);
00737 setReadLength(readLength);
00738
00739 setReadQuality(&quality);
00740
00741 setReference(&reference);
00742 setReferenceOffset(referenceOffset);
00743 setReferenceLength(referenceLength);
00744
00745 populateH();
00746
00747 softClipCount = getSoftClipCount();
00748
00749 populateAlignment();
00750
00751 rollCigar(cigarRoller);
00752
00753 sumQ = getSumQ();
00754
00755 return false;
00756
00757 };
00758
00759 };
00760
00761 #endif