libStatGen Software  1
GreedyTupleAligner.h
00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #ifndef _GREEDY_TUPLE_H
00019 #define _GREEDY_TUPLE_H
00020 
00021 #include <stdio.h>
00022 #include <stdlib.h>
00023 #include <string.h>
00024 #include <assert.h>
00025 #include <ctype.h>
00026 #include "Generic.h"
00027 #include "CigarRoller.h"
00028 
00029 /*
00030  *
00031 
00032  TODO:
00033  1. how to efficiently find insertion?
00034 
00035 */
00036 
00037 /**
00038  * Weight includes various penalties(e.g. gap open) used in local alignment
00039  */
00040 struct Weight
00041 {
00042 public:
00043     Weight()
00044     {
00045         gapOpen = gapExtend = -1; // here we do not use affine gap penalty for simlicity.
00046         mismatch = -1;
00047         match= 2;
00048     };
00049     int gapOpen;
00050     int gapExtend;
00051     int mismatch;
00052     int match;
00053 };
00054 
00055 //
00056 // tuple number is 3, arbitrary number from my guess!
00057 // another reason
00058 //
00059 template <typename QueryType, typename ReferenceType, typename ReferenceIndex>
00060 class GreedyTupleAligner
00061 {
00062 public:
00063     GreedyTupleAligner(Weight& wt): weight(wt)
00064     {/* */}
00065 
00066     /**
00067      * Match 'query' to the 'reference' from 'searchStartIndex' up
00068      * to 'searchSize', store matched length to 'matchedLength'
00069      * and number of mismatch to 'mismatch'
00070      * @param query            input query
00071      * @param queryLength       length of query
00072      * @param reference        reference sequence
00073      * @param searchStartIndex the positino where search starts
00074      * @param searchSize       the total length in reference sequence that will be examine
00075      * @param matchedLength    store how many bases are matched
00076      * @param mismatch         store how many bases are mismatched
00077      * @return -1 for unsuccess return
00078      */
00079     int MatchTuple(
00080         const QueryType       query,
00081         const int             queryLength,
00082         const ReferenceType   reference,
00083         const ReferenceIndex  searchStartIndex,
00084         const int             searchSize,
00085         int&            matchedLength,
00086         int&            mismatch)
00087     {
00088         // now use naive search,
00089         // TODO: will incorportate KMP serach later
00090         // TODO: adjust tolerance of mismatches
00091         const int MAX_MISMATCH=2;
00092         int bestPos = 0, bestMismatch = queryLength, bestMatchedLength = 0, bestScore=-1;
00093 
00094 #if defined(DEBUG_GREEDY_ALIGNER)
00095         cout << "searchStartIndex == " << searchStartIndex << ", searchSize == " << searchSize << std::endl;
00096 #endif
00097         // here i is the matching position (inclusive)
00098         // j is the matched length
00099         for (int i = 0; i <= searchSize - tupleSize; i++)
00100         {
00101             int j = 0;
00102             mismatch = 0;
00103             while (j < queryLength)
00104             {
00105                 if (searchStartIndex + i + j >= reference.getNumberBases())
00106                     break;
00107                 if (query[j] != reference[searchStartIndex + i + j])
00108                 {
00109                     mismatch++;
00110                     if (mismatch >= MAX_MISMATCH)
00111                         break;
00112                 }
00113                 j++;
00114             }
00115 
00116             if (j>0 && (j==queryLength)) j--;
00117 
00118             while (searchStartIndex +i +j < reference.getNumberBases()
00119                     && ((j+1) > mismatch)
00120                     && mismatch>0
00121                     && query[j] != reference[searchStartIndex + i+j])
00122             {
00123                 // if pattern matching goes beyong the preset mismatch cutoff,
00124                 // we will have to go backwards
00125                 j--;
00126                 mismatch--;
00127             }
00128 
00129             int score = j - mismatch;
00130 
00131             if (score > bestScore)
00132             {
00133                 bestPos = i;
00134                 bestScore = score;
00135                 bestMismatch = mismatch;
00136                 bestMatchedLength = j+1;
00137             }
00138         }
00139 
00140         if (bestScore > 0)
00141         {
00142             mismatch = bestMismatch;
00143             matchedLength = bestMatchedLength;
00144             return bestPos;
00145         }
00146         return -1;
00147     }
00148 
00149     /**
00150      * Core local alignment algorithm
00151      * @param query             input query
00152      * @param queryLength       length of query
00153      * @param reference         reference genome
00154      * @param searchStartIndex  matching starts here
00155      * @param searchSize        how far we will search
00156      * @param cigarRoller       store alignment results here
00157      * @param matchPosition     store match position
00158      */
00159     void Align(
00160         QueryType       query,
00161         int             queryLength,
00162         ReferenceType   reference,
00163         ReferenceIndex  searchStartIndex,
00164         int             searchSize,
00165         CigarRoller&    cigarRoller,
00166         ReferenceIndex& matchPosition)
00167     {
00168         // Algorithm:
00169         // finished align? (should we try different align position?)
00170         // if not, try next tuple
00171         //    is the tuple aligned?
00172         //    yes, extend to previous, mark unmatched part mismatch or gap
00173         //         extend to next matched part
00174         int r1 = 0; // a start index: reference starting from r1 (inclusive) will be used
00175         int queryMatchCount = 0; // query matched # of bases
00176         int q1 = 0; // to align
00177         int pos = -1;
00178         int lastR1 = -1; // index: record last
00179 
00180         cigarRoller.clear();
00181         matchPosition = -1;
00182 
00183         while (queryMatchCount < queryLength)
00184         {
00185             if (r1 == searchSize - 1)   // touched ref right boundary
00186             {
00187                 cigarRoller.Add(CigarRoller::softClip, queryLength-queryMatchCount);
00188                 break;
00189             }
00190             if (queryLength - q1 < tupleSize)
00191             {
00192                 // XXX this needs to do something more sane
00193                 // printf("some bases left!\n");
00194                 // a simple fix: treat all left-over bases as mismatches/matches
00195                 cigarRoller.Add(CigarRoller::mismatch, queryLength - queryMatchCount);
00196                 break;
00197             }
00198             int mismatch = 0;
00199             int matchedLen = 0;
00200             if ((pos = MatchTuple(query+q1, queryLength-q1, reference, searchStartIndex + r1, searchSize - r1, matchedLen, mismatch)) // found match position for tuple
00201 
00202                     >= 0)
00203             {
00204                 // found match position for tuple
00205 
00206                 if (lastR1<0)
00207                     matchPosition = pos;
00208 
00209                 //
00210                 // deal with left
00211                 //
00212                 if (lastR1>=0)   // have previously aligned part of the query to the reference genome yet
00213                 {
00214                     if (pos > 0)
00215                     {
00216                         cigarRoller.Add(CigarRoller::del, pos);
00217                     }
00218                 }
00219                 else
00220                 {
00221                     lastR1 = pos;
00222                 }
00223 
00224                 r1 += pos;
00225                 r1 += matchedLen;
00226                 q1 += matchedLen;
00227 
00228                 //
00229                 // deal with right
00230                 //
00231                 cigarRoller.Add(CigarRoller::match, matchedLen);
00232                 queryMatchCount = q1;
00233                 lastR1 = r1;
00234 
00235                 continue;
00236             } // end if
00237 
00238             //
00239             // try insertion
00240             // maximum insert ? say 2
00241             //
00242             for (int i = 1; i < queryLength - q1 - tupleSize; i++)
00243             {
00244                 int mismatch = 0;
00245                 int matchedLen = 0;
00246                 // check reference genome broundary
00247                 if (searchStartIndex + r1 >= reference.getNumberBases())
00248                     return;
00249                 if ((pos = MatchTuple(query+q1 + i ,
00250                                       queryLength - q1 -i ,
00251                                       reference,
00252                                       searchStartIndex + r1,
00253                                       searchSize - r1,
00254                                       matchedLen,
00255                                       mismatch)) // found match position for tuple
00256                         >= 0)
00257                 {
00258                     if (matchPosition < 0)
00259                         matchPosition = pos + q1 + i ;
00260                 }
00261                 queryMatchCount += i;
00262                 q1 += i;
00263                 cigarRoller.Add(CigarRoller::insert, i);
00264 
00265                 lastR1 = r1 + pos;
00266                 r1 += pos + tupleSize;
00267                 q1 += tupleSize;
00268 
00269                 // deal with right
00270                 while (searchStartIndex + r1 < reference.getNumberBases()
00271                         && query[q1]==reference[searchStartIndex + r1]
00272                         && q1 < queryLength)
00273                 {
00274                     r1++;
00275                     q1++;
00276                 }
00277                 if (q1 < queryLength)
00278                 {
00279                     cigarRoller.Add(CigarRoller::match, q1 - queryMatchCount);
00280                     queryMatchCount = q1;
00281                 }
00282                 else
00283                 {
00284                     cigarRoller.Add(CigarRoller::match, queryLength - queryMatchCount);
00285                     queryMatchCount = queryLength ;
00286                     break ;
00287                 }
00288             }
00289 
00290             r1++;
00291             q1++;
00292 
00293             // try next
00294         } // end while (queryMatchCount < queryLength)
00295     }
00296 private:
00297     static const int tupleSize = 3;
00298     Weight weight;
00299 };
00300 
00301 
00302 #endif
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends