libStatGen Software  1
SmithWaterman.cpp
00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #include <stdio.h>
00019 #include <stdlib.h>
00020 #include <stdint.h>
00021 #include "SmithWaterman.h"
00022 
00023 // put TEST below here, so that makedepend will see the .h, so that we
00024 // can get a clean dependency for SmithWaterman.o, so that we can at least
00025 // compile the header when we change it.
00026 
00027 #if defined(TEST)
00028 
00029 #include <getopt.h>
00030 #include "Generic.h"
00031 
00032 // g++ -g -o testSW -DTEST SmithWaterman.cpp
00033 //
00034 // Smith-Waterman - test code uses a 256x256 array of int16
00035 //
00036 int swat(
00037     bool        showAllCases,
00038     const char *A,
00039     const char *qualities,
00040     const char *B,
00041     int direction,
00042     const char *expectedCigarString,
00043     int expectedSumQ
00044 )
00045 {
00046     int allowedInsertDelete = 1024;
00047     int errors = 0;
00048 
00049     //            read length 256
00050     //                 reference length 1024
00051     SmithWaterman<256, 1024, uint16_t, const char *, const char *, const char *, uint32_t, uint32_t > sw(&A, &qualities, &B, strlen(A), strlen(B), allowedInsertDelete, direction);
00052 
00053     //
00054     // now we align the read:
00055     //
00056     sw.populateH();
00057     sw.populateAlignment();
00058 
00059     int sumQ = sw.getSumQ();
00060 
00061     CigarRoller cigar;
00062     cigar.clear();
00063     sw.rollCigar(cigar);
00064 
00065     const char *cigarStr = cigar.getString();
00066 
00067     //
00068     // now we pretty print the results
00069     //
00070 
00071     bool badCigar = false, badQuality = false;
00072 
00073     if (strcmp(cigarStr, expectedCigarString)!=0)
00074     {
00075         badCigar = true;
00076         errors ++;
00077     }
00078 
00079     if (sumQ != expectedSumQ)
00080     {
00081         badQuality = true;
00082         errors ++;
00083     }
00084 
00085 
00086 
00087     if (showAllCases || errors>0)
00088     {
00089         cout << "=============" << endl;
00090         cout << "        Read: " << A << endl;
00091         cout << "   Reference: " << B << endl;
00092         cout << "   Direction: " << direction << endl;
00093         cout << "Max Cell: " << sw.maxCostValue << " located at " << sw.maxCostPosition << endl;
00094         cout << "M: " << sw.m << "  N: " << sw.n << endl;
00095         cout << "Cigar String: " << cigarStr ;
00096         if (badCigar)
00097             cout << " (EXPECTED: " << expectedCigarString << ")";
00098         cout << endl;
00099         cout << "        sumQ:" << sumQ;
00100         if (badQuality)
00101             cout << " (EXPECTED: " << expectedSumQ << ")";
00102         cout << endl;
00103 
00104         if (strlen(B) < 100 || showAllCases)
00105             sw.printH(false);
00106 
00107         for (vector<pair<int,int> >::iterator i = sw.alignment.begin(); i != sw.alignment.end(); i++) cout << *i << endl;
00108 
00109         cout << "=============" << endl << endl;
00110     }
00111 
00112     delete cigarStr;
00113 
00114     return errors;
00115 }
00116 
00117 
00118 // test with Sequence 1 = ACACACTA
00119 //           Sequence 2 = AGCACACA
00120 int main(int argc, const char **argv)
00121 {
00122     int errors = 0;
00123 
00124     bool showAllCasesFlag = false;
00125     int opt;
00126 
00127     while ((opt = getopt(argc, (char **) argv, "v")) != -1)
00128     {
00129         switch (opt)
00130         {
00131             case 'v':
00132                 showAllCasesFlag = true;
00133                 break;
00134             default:
00135                 cerr << "usage: testSW [-v]" << std::endl;
00136                 exit(1);
00137         }
00138     }
00139 
00140 
00141     // CIGAR explanation - for backward SW runs, the corresponding
00142     // CIGAR string is generated from the back of the string to the
00143     // front.  Recall that the soft clipping is only done at the
00144     // "end" of the string, taking direction into account.
00145 
00146     // forwards - simple
00147     errors += swat(showAllCasesFlag, "1234", "\"#$-", "1235", 1, "3M1S", 0);
00148 
00149     // backwards - simple
00150     errors += swat(showAllCasesFlag, "1234", "\"#$-", "1235", -1, "4M", 12);
00151 
00152     // backwards - soft left clip
00153     errors += swat(showAllCasesFlag, "1234", "\"#$-", "0234", -1, "1S3M", 0);
00154 
00155     // delete in read (arg 1) - forward
00156     errors += swat(showAllCasesFlag, "123467890", "\"#$%^&*()-", "1234567890", +1, "4M1D5M", 50);
00157 
00158     // insert in read (arg 1) - forward
00159     errors += swat(showAllCasesFlag, "1234556789", "\"#$%^&*()-", "1234567890", +1, "5M1I4M", 50);
00160 
00161     // delete in read (arg 1) - backward
00162     errors += swat(showAllCasesFlag, "X123467890", "#\"#$%^&*()-", "1234567890", -1, "1S4M1D5M", 50);
00163 
00164     // insert in read (arg 1) - backward
00165     errors += swat(showAllCasesFlag, "1234556789", "\"#$%^&*()-", "0123456789", -1, "4M1I5M", 50);
00166 
00167     // insert in read (arg 1) - backward
00168     errors += swat(showAllCasesFlag, "X1223456789", "00000000000", "00123456789", -1, "1S1M1I8M", 50);
00169 
00170     // insert in read (arg 1) - backward
00171     errors += swat(showAllCasesFlag, "XY1223456789", "000000000000", "000123456789", -1, "2S1M1I8M", 50);
00172 
00173     // forward - soft right clip of 2 bases - sumQ should be 0
00174     errors += swat(showAllCasesFlag, "123456700", "\"#$%^&*()-", "123456789", +1, "7M2S", 0);
00175 
00176     // insert in read (arg 1) - forward w/2 mismatches at end
00177     errors += swat(showAllCasesFlag, "1023456700", "\"#$%^&*()-", "123456789", +1, "1M1I6M2S", 50);
00178 
00179     // insert in read (arg 1) - forward w/2 mismatches at end
00180     errors += swat(showAllCasesFlag, "CTCCACCTCCCGGTT", "111111111111111", "TCCACCTCCCAGGTT", -1, "1S10M1D4M", 50);
00181 
00182     //
00183     errors += swat(showAllCasesFlag, "1234", "0000", "12345", +1, "4M", 0);
00184 
00185     //
00186     errors += swat(showAllCasesFlag, "1234X", "00000", "12345", +1, "4M1S", 0);
00187 
00188     //
00189     errors += swat(showAllCasesFlag, "4321", "0000", "7654321", -1, "4M", 0);
00190 
00191     //
00192     errors += swat(showAllCasesFlag, "X4321", "00000", "7654321", -1, "1S4M", 0);
00193 
00194     //
00195     errors += swat(showAllCasesFlag, "X432A10", "0000000", "76543210", -1, "1S3M1I2M", 50);
00196 
00197     //
00198     errors += swat(showAllCasesFlag, "1345", "0000", "12345", -1, "1M1D3M", 50);
00199 
00200     errors += swat(showAllCasesFlag, "45689", "00000", "1234567890", -1, "3M1D2M", 50);
00201 
00202 //  errors += swat(showAllCasesFlag, "AATAATTTTTTATATACAGATCGCTGTAGAGTGTAGTTATAGTATGATTCCAACTTTTATTTCTTTCATGACTAATTATATGTATACATGTGCCATGTTGGTGTGCTG", "000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000", "TCCAATGTAGGGCTGTTATAAACAGTGTTGATACATATGTTTTTGTATAAGTCTTTGTTGAATACATGCTTTCATTTTTGTAGGGTATATGTCCAGGAATTAAATTTTTGCATTATTGGGGAAGTTCAAACGTAGATCAGTAGATGTTCCCAAATGATTTTCAGGATATGTATCCATGTAAATTCCTACCAGCAATGCAGGAGAATTCCAATTGCCCATGTTCTAATCAGAATATTGTTATATCCTAAGACTAATTTTAAATATTCTGATGGGTGTAGAGTGGAGGCATAGTATGATTTCAACTTGTATTTCTTTCATGACTAATTATCTTCTATGTTAATTGTTATTTTGTATGTTTATTGCAAAGTGCCTATCCAGAATTTTTGTCTATAATTTTGTTGTGCTGTCTCTTGCTTTATGAATTTTATAGGATTCTTAATATTATAATTGAGTTATCTTTCTTTTTTATTATTATTATTATACTTTAAGTTTTAGGGTATATGTGCACAACGTGCAAGTTTGTCACATATGTATACATGTGCCATGTTGGTGTGCTGCACCCATTAACTCATCATTTAGCATTAGGTATATCTCCTAATGCTATCCCTTCCTCCTCCCCCCACCCCACAACAGTCCCCGGTGTGTGATGTTCCCCTGCCTTTGTCCTCTTTCTTATACTTGCATGAGCAATCTCCTCAAACTGATACTTGCCTTTTTTGTCCTTGGTGTGGTTTGGCTCTGTGTTCCCACCCAAATCTTCATAATACCCATGTGCCAAGGGTGGGACTGGGTGGAGGTAATTGGGTCATGGGGATGGTTTCCCTCATACTATTATGATAGTGAGTGTTTTCACGAGACCTGATGGTTTTATAACTGTGTGGCATTTCCCTTGCTTCCACTCACTCCATCCTGCCACCCTGTGAAGAAGGTGCCTGCTTCTCCTTTGGTTACTGCTATGATTGTAAGTTTCCTGAGGCCTCCCCAGCAACGCAAAACTGTGAATCAATTAAACCTTTTTCCTTTATAAATTACTAAGTCTTGGGTATTTCTTCATAGTGTTGTGAGCATAGACTAAAACAGTAAGTTGTTACCAGGAGTGGGGTACTGCTGTAAGATAACTGAGAATGTGAAAGTGACTTAGGAACTAGGTAATGAGCAGAGGTTGGAACAGTTTAAAAGGCTCAGAAGAAGACAGAAAGATGTGGGAAAGTTTGGA", -1, "77M200D31M", 50);
00203 
00204     errors += swat(showAllCasesFlag, "TTAGAATGCTATTGTGTTTGGAGATTTGAGGAAAGTGGGCGTGAAGACTTAGTGTTCATTTCCTCAACCTCTCTCTGTGTGAACATACGTCATCGGTCAGAAATTGGG", "000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000", "CCGAGATTGTGCCATTGCACTCCTGCCTGGGTAACAGAGTCAGACCCTGTCTCAAAAAAAAAAAAAAAAAAAAAAAAGATTAGGTTTTATAGATGGAAAATTCACAGCTCTCTCCAGATCAGAAATCTCCAAGAGTAAATTAGTGTCTTAAAGGGGTTGTAATAACTTTCCTATGTGACTAAGTGCATTATTAATCAATTTTTCTATGATCAAGTACTCCTTTACATACCTGCTAATACAATTTTTGATATGAAATCAGTCCTAGAGGGAATCAATGTAAGATACAGACTTGATGAGTGCTTGCAGTTTTTTATTGACAATCTGAAGAATGACTTGACTCTAAATTGCAGCTCAAGGCTTAGAATGCTATTGTGTTTGGAGATTTGAGGAAAGTGGGCGTGAAGACTTAGTGTTCATTTCCTCAACCTCTCTCTGTGTGAACATACAGGAATCAAATCTGTCTAGCCTCTCTTTTTGGCAAGGTTAAGAACAATTCCACTTCATCCTAATCCCAATGATTCCTGCCGACCCTCTTCCAAAAACTATTTAAAGACATGTTCTTCAAAGTTATATTTGTCTTTCCTTCAGGGAGAAAAAGAATACCAATCACTTATAATATGGAAACTAGCAGAAATGGGTCACATAAGTCATCTGTCAGAAATTGGGAAAATAGAGTAGGTCAGTCTTTCCAGTCATGGTACTTTTACCTTCAATCA", -1, "88M200D20M", 50);
00205 
00206     // prefix TTAGAATGCTATTGTGTTTGGAGATTTGAGGAAAGTGGGCGTGAAGACTTAGTGTTCATTTCCTCAACCTCTCTCTGTGTGAACATAC
00207     // suffix GTCATCTGTCAGAAATTGGGA
00208     cout << endl << "Total Errors found: " << errors << endl;
00209 }
00210 #endif
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends