libStatGen Software  1
CigarHelper.cpp
00001 /*
00002  *  Copyright (C) 2011  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 //////////////////////////////////////////////////////////////////////////
00019 
00020 #include "CigarHelper.h"
00021 
00022 // Soft Clip from the beginning of the read to the specified reference position.
00023 int32_t CigarHelper::softClipBeginByRefPos(SamRecord& record, 
00024                                            int32_t refPosition0Based,
00025                                            CigarRoller& newCigar,
00026                                            int32_t &new0BasedPosition)
00027 {
00028     newCigar.clear();
00029     Cigar* cigar = record.getCigarInfo();
00030     if(cigar == NULL)
00031     {
00032         // Failed to get the cigar.
00033         ErrorHandler::handleError("Soft clipping, but failed to read the cigar");
00034         return(NO_CLIP);
00035     }
00036 
00037     // No cigar or position in the record, so return no clip.
00038     if((cigar->size() == 0) || (record.get0BasedPosition() == -1))
00039     {
00040         return(NO_CLIP);
00041     }
00042 
00043     // Check to see if the reference position occurs before the record starts,
00044     // if it does, do no clipping.
00045     if(refPosition0Based < record.get0BasedPosition())
00046     {
00047         // Not within this read, so nothing to clip.
00048         newCigar.Set(record.getCigar());
00049         return(NO_CLIP);
00050     }
00051 
00052     // The position falls after the read starts, so loop through until the
00053     // position or the end of the read is found.
00054     int32_t readClipPosition = 0;
00055     bool clipWritten = false;
00056     new0BasedPosition = record.get0BasedPosition();
00057     for(int i = 0; i < cigar->size(); i++)
00058     {
00059         const Cigar::CigarOperator* op = &(cigar->getOperator(i));
00060 
00061         if(clipWritten)
00062         {
00063             // Clip point has been found, so just add everything.
00064             newCigar += *op;
00065             // Go to the next operation.
00066             continue;
00067         }
00068 
00069         // The clip point has not yet been found, so check to see if we found 
00070         // it now.
00071 
00072         // Not a clip, check to see if the operation is found in the
00073         // reference.
00074         if(Cigar::foundInReference(*op))
00075         {
00076             // match, mismatch, deletion, skip
00077 
00078             // increment the current reference position to just past this
00079             // operation.
00080             new0BasedPosition += op->count;
00081 
00082             // Check to see if this is also in the query, because otherwise
00083             // the operation is still being consumed.
00084             if(Cigar::foundInQuery(*op))
00085             {
00086                 // Also in the query, determine if the entire thing should
00087                 // be clipped or just part of it.
00088 
00089                 uint32_t numKeep = 0;
00090                 // Check to see if we have hit our clip position.
00091                 if(refPosition0Based < new0BasedPosition)
00092                 {
00093                     // The specified clip position is in this cigar operation.
00094                     numKeep = new0BasedPosition - refPosition0Based - 1;
00095                     
00096                     if(numKeep > op->count)
00097                     {
00098                         // Keep the entire read.  This happens because
00099                         // we keep reading until the first match/mismatch
00100                         // after the clip.
00101                         numKeep = op->count;
00102                     }
00103                 }
00104 
00105                 // Add the part of this operation that is being clipped
00106                 // to the clip count.
00107                 readClipPosition += (op->count - numKeep);
00108 
00109                 // Only write the clip if we found a match/mismatch
00110                 // to write.  Otherwise we will keep accumulating clips
00111                 // for the case of insertions.
00112                 if(numKeep > 0)
00113                 {
00114                     new0BasedPosition -= numKeep;
00115 
00116                     newCigar.Add(Cigar::softClip, readClipPosition);
00117                     
00118                     // Add the clipped part of this cigar to the clip
00119                     // position.
00120                     newCigar.Add(op->operation, numKeep);
00121                     
00122                     // Found a match after the clip point, so stop
00123                     // consuming cigar operations.
00124                     clipWritten = true;
00125                     continue;
00126                 }
00127             }
00128         }
00129         else
00130         {
00131             // Only add hard clips.  The softclips will be added in
00132             // when the total number is found.
00133             if(op->operation == Cigar::hardClip)
00134             {
00135                 // Check if this is the first operation, if so, just write it.
00136                 if(i == 0)
00137                 {
00138                     newCigar += *op;
00139                 }
00140                 // Check if it is the last operation (otherwise skip it).
00141                 else if(i == (cigar->size() - 1))
00142                 {
00143                     // Check whether or not the clip was ever written, and if
00144                     // not, write it.
00145                     if(clipWritten == false)
00146                     {
00147                         newCigar.Add(Cigar::softClip, readClipPosition);
00148                         // Since no match/mismatch was ever found, set
00149                         // the new ref position to the original one.
00150                         new0BasedPosition = record.get0BasedPosition();
00151                         clipWritten = true;
00152                     }
00153                     // Add the hard clip.
00154                     newCigar += *op;
00155                 }
00156             }
00157             // Not yet to the clip position, so do not add this operation.
00158             if(Cigar::foundInQuery(*op))
00159             {
00160                 // Found in the query, so update the read clip position.
00161                 readClipPosition += op->count;
00162             }
00163         }
00164     } // End loop through cigar.
00165 
00166 
00167     // Check whether or not the clip was ever written, and if
00168     // not, write it.
00169     if(clipWritten == false)
00170     {
00171         newCigar.Add(Cigar::softClip, readClipPosition);
00172         // Since no match/mismatch was ever found, set
00173         // the new ref position to the original one.
00174         new0BasedPosition = record.get0BasedPosition();
00175     }
00176 
00177     // Subtract 1 since readClipPosition atually contains the first 0based 
00178     // position that is not clipped.
00179     return(readClipPosition - 1);
00180 }
00181 
00182 
00183 // Soft Clip from the end of the read at the specified reference position.
00184 int32_t CigarHelper::softClipEndByRefPos(SamRecord& record, 
00185                                          int32_t refPosition0Based,
00186                                          CigarRoller& newCigar)
00187 {
00188     newCigar.clear();
00189     Cigar* cigar = record.getCigarInfo();
00190     if(cigar == NULL)
00191     {
00192         // Failed to get the cigar.
00193         ErrorHandler::handleError("Soft clipping, but failed to read the cigar");
00194         return(NO_CLIP);
00195     }
00196 
00197     // No cigar or position in the record, so return no clip.
00198     if((cigar->size() == 0) || (record.get0BasedPosition() == -1))
00199     {
00200         return(NO_CLIP);
00201     }
00202 
00203     // Check to see if the reference position occurs after the record ends,
00204     // if so, do no clipping.
00205     if(refPosition0Based > record.get0BasedAlignmentEnd())
00206     {
00207         // Not within this read, so nothing to clip.
00208         newCigar.Set(record.getCigar());
00209         return(NO_CLIP);
00210     }
00211 
00212     // The position falls before the read ends, so loop through until the
00213     // position is found.
00214     int32_t currentRefPosition = record.get0BasedPosition();
00215     int32_t readClipPosition = 0;
00216     for(int i = 0; i < cigar->size(); i++)
00217     {
00218         const Cigar::CigarOperator* op = &(cigar->getOperator(i));
00219 
00220         // If the operation is found in the reference, increase the
00221         // reference position.
00222         if(Cigar::foundInReference(*op))
00223         {
00224             // match, mismatch, deletion, skip
00225             // increment the current reference position to just past
00226             // this operation.
00227             currentRefPosition += op->count;
00228         }
00229          
00230         // Check to see if we have hit our clip position.
00231         if(refPosition0Based < currentRefPosition)
00232         {
00233             // If this read is also in the query (match/mismatch), 
00234             // write the partial op to the new cigar.
00235             int32_t numKeep = 0;
00236             if(Cigar::foundInQuery(*op))
00237             {
00238                 numKeep = op->count - (currentRefPosition - refPosition0Based);
00239                 if(numKeep > 0)
00240                 {
00241                     newCigar.Add(op->operation, numKeep);
00242                     readClipPosition += numKeep;
00243                 }
00244             }
00245             else if(Cigar::isClip(*op))
00246             {
00247                 // This is a hard clip, so write it.
00248                 newCigar.Add(op->operation, op->count);
00249             }
00250             else
00251             {
00252 
00253                 // Not found in the query (skip/deletion),
00254                 // so don't write any of the operation.
00255             }
00256             // Found the clip point, so break.
00257             break;
00258         }
00259         else if(refPosition0Based == currentRefPosition)
00260         {
00261             newCigar += *op;
00262             if(Cigar::foundInQuery(*op))
00263             {
00264                 readClipPosition += op->count;
00265             }
00266         }
00267         else
00268         {
00269             // Not yet to the clip position, so add this operation/size to
00270             // the new cigar.
00271             newCigar += *op;
00272             if(Cigar::foundInQuery(*op))
00273             {
00274                 // Found in the query, so update the read clip position.
00275                 readClipPosition += op->count;
00276             }
00277         }
00278     } // End loop through cigar.
00279 
00280     // Before adding the softclip, read from the end of the cigar checking to
00281     // see if the operations are in the query, removing operations that are
00282     // not (pad/delete/skip) until a hardclip or an operation in the query is
00283     // found. We do not want a pad/delete/skip right before a softclip.
00284     for(int j = newCigar.size() - 1; j >= 0; j--)
00285     {
00286         const Cigar::CigarOperator* op = &(newCigar.getOperator(j));
00287         if(!Cigar::foundInQuery(*op) && !Cigar::isClip(*op))
00288         {
00289             // pad/delete/skip
00290             newCigar.Remove(j);
00291         }
00292         else if(Cigar::foundInQuery(*op) & Cigar::isClip(*op))
00293         {
00294             // Soft clip, so increment the clip position for the return value.
00295             // Remove the softclip since the readClipPosition is used to
00296             // calculate teh size of the soft clip added.
00297             readClipPosition -= op->count;
00298             newCigar.Remove(j);
00299         }
00300         else
00301         {
00302             // Found a cigar operation that should not be deleted, so stop deleting.
00303             break;
00304         }
00305     } 
00306 
00307     // Determine the number of soft clips.
00308     int32_t numSoftClips = record.getReadLength() - readClipPosition;
00309     // NOTE that if the previous operation is a softclip, the CigarRoller logic
00310     // will merge this with that one.
00311     newCigar.Add(Cigar::softClip, numSoftClips);
00312 
00313     // Check if an ending hard clip needs to be added.
00314     if(cigar->size() != 0)
00315     {
00316         const Cigar::CigarOperator* lastOp = 
00317             &(cigar->getOperator(cigar->size() - 1));
00318         if(lastOp->operation == Cigar::hardClip)
00319         {
00320             newCigar += *lastOp;
00321         }
00322     }
00323 
00324     return(readClipPosition);
00325 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends