Cigar.cpp

00001 /*
00002  *  Copyright (C) 2010-2011  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #include <stdio.h>
00019 #include <stdlib.h>
00020 #include "Cigar.h"
00021 #include "STLUtilities.h"
00022 
00023 // Initialize INDEX_NA.
00024 const int32_t Cigar::INDEX_NA = -1;
00025 
00026 
00027 ////////////////////////////////////////////////////////////////////////
00028 //
00029 // Cigar Class
00030 //
00031 
00032 //
00033 // Set the passed in string to the string reprentation of the Cigar operations
00034 // in this object.
00035 //
00036 void Cigar::getCigarString(std::string& cigarString) const
00037 {
00038     using namespace STLUtilities;
00039 
00040     std::vector<CigarOperator>::const_iterator i;
00041 
00042     cigarString.clear();  // clear result string
00043 
00044     // Progressively append the character representations of the operations to
00045     // the cigar string.
00046     for (i = cigarOperations.begin(); i != cigarOperations.end(); i++)
00047     {
00048         cigarString << (*i).count << (*i).getChar();
00049     }
00050 }
00051 
00052 void Cigar::getCigarString(String& cigarString) const
00053 {
00054     std::string cigar;
00055 
00056     getCigarString(cigar);
00057 
00058     cigarString = cigar.c_str();
00059 
00060     return;
00061 }
00062 
00063 void Cigar::getExpandedString(std::string &s) const
00064 {
00065     s = "";
00066 
00067     std::vector<CigarOperator>::const_iterator i;
00068 
00069     // Progressively append the character representations of the operations to
00070     // the string passed in
00071 
00072     for (i = cigarOperations.begin(); i != cigarOperations.end(); i++)
00073     {
00074         for (uint32_t j = 0; j<(*i).count; j++) s += (*i).getChar();
00075     }
00076     return;
00077 }
00078 
00079 
00080 bool Cigar::operator == (Cigar &rhs) const
00081 {
00082 
00083     if (this->size() != rhs.size()) return false;
00084 
00085     for (int i = 0; i < this->size(); i++)
00086     {
00087         if (cigarOperations[i]!=rhs.cigarOperations[i]) return false;
00088     }
00089     return true;
00090 }
00091 
00092 
00093 // return the length of the read that corresponds to
00094 // the current CIGAR string.
00095 int Cigar::getExpectedQueryBaseCount() const
00096 {
00097     int matchCount = 0;
00098     std::vector<CigarOperator>::const_iterator i;
00099     for (i = cigarOperations.begin(); i != cigarOperations.end(); i++)
00100     {
00101         switch (i->operation)
00102         {
00103             case match:
00104             case mismatch:
00105             case softClip:
00106             case insert:
00107                 matchCount += i->count;
00108                 break;
00109             default:
00110                 // we only care about operations that are in the query sequence.
00111                 break;
00112         }
00113     }
00114     return matchCount;
00115 }
00116 
00117 
00118 // return the number of bases in the reference that
00119 // this read "spans"
00120 int Cigar::getExpectedReferenceBaseCount() const
00121 {
00122     int matchCount = 0;
00123     std::vector<CigarOperator>::const_iterator i;
00124     for (i = cigarOperations.begin(); i != cigarOperations.end(); i++)
00125     {
00126         switch (i->operation)
00127         {
00128             case match:
00129             case mismatch:
00130             case del:
00131             case skip:
00132                 matchCount += i->count;
00133                 break;
00134             default:
00135                 // we only care about operations that are in the reference sequence.
00136                 break;
00137         }
00138     }
00139     return matchCount;
00140 }
00141 
00142 
00143 // Return the number of clips that are at the beginning of the cigar.
00144 int Cigar::getNumBeginClips() const
00145 {
00146     int numBeginClips = 0;
00147     for (unsigned int i = 0; i != cigarOperations.size(); i++)
00148     {
00149         if ((cigarOperations[i].operation == softClip) ||
00150                 (cigarOperations[i].operation == hardClip))
00151         {
00152             // Clipping operator, increment the counter.
00153             numBeginClips += cigarOperations[i].count;
00154         }
00155         else
00156         {
00157             // Break out of the loop since a non-clipping operator was found.
00158             break;
00159         }
00160     }
00161     return(numBeginClips);
00162 }
00163 
00164 
00165 // Return the number of clips that are at the end of the cigar.
00166 int Cigar::getNumEndClips() const
00167 {
00168     int numEndClips = 0;
00169     for (int i = (cigarOperations.size() - 1); i >= 0; i--)
00170     {
00171         if ((cigarOperations[i].operation == softClip) ||
00172                 (cigarOperations[i].operation == hardClip))
00173         {
00174             // Clipping operator, increment the counter.
00175             numEndClips += cigarOperations[i].count;
00176         }
00177         else
00178         {
00179             // Break out of the loop since a non-clipping operator was found.
00180             break;
00181         }
00182     }
00183     return(numEndClips);
00184 }
00185 
00186 
00187 int32_t Cigar::getRefOffset(int32_t queryIndex)
00188 {
00189     // If the vectors aren't set, set them.
00190     if ((queryToRef.size() == 0) || (refToQuery.size() == 0))
00191     {
00192         setQueryAndReferenceIndexes();
00193     }
00194     if ((queryIndex < 0) || ((uint32_t)queryIndex >= queryToRef.size()))
00195     {
00196         return(INDEX_NA);
00197     }
00198     return(queryToRef[queryIndex]);
00199 }
00200 
00201 
00202 int32_t Cigar::getQueryIndex(int32_t refOffset)
00203 {
00204     // If the vectors aren't set, set them.
00205     if ((queryToRef.size() == 0) || (refToQuery.size() == 0))
00206     {
00207         setQueryAndReferenceIndexes();
00208     }
00209     if ((refOffset < 0) || ((uint32_t)refOffset >= refToQuery.size()))
00210     {
00211         return(INDEX_NA);
00212     }
00213     return(refToQuery[refOffset]);
00214 }
00215 
00216 
00217 int32_t Cigar::getRefPosition(int32_t queryIndex, int32_t queryStartPos)
00218 {
00219     // If the vectors aren't set, set them.
00220     if ((queryToRef.size() == 0) || (refToQuery.size() == 0))
00221     {
00222         setQueryAndReferenceIndexes();
00223     }
00224     if ((queryIndex < 0) || ((uint32_t)queryIndex >= queryToRef.size()))
00225     {
00226         return(INDEX_NA);
00227     }
00228 
00229     if (queryToRef[queryIndex] != INDEX_NA)
00230     {
00231         return(queryToRef[queryIndex] + queryStartPos);
00232     }
00233     return(INDEX_NA);
00234 }
00235 
00236 
00237 // Return the query index associated with the specified reference position
00238 // when the query starts at the specified reference position based on
00239 // this cigar.
00240 int32_t Cigar::getQueryIndex(int32_t refPosition, int32_t queryStartPos)
00241 {
00242     // If the vectors aren't set, set them.
00243     if ((queryToRef.size() == 0) || (refToQuery.size() == 0))
00244     {
00245         setQueryAndReferenceIndexes();
00246     }
00247 
00248     int32_t refOffset = refPosition - queryStartPos;
00249     if ((refOffset < 0) || ((uint32_t)refOffset >= refToQuery.size()))
00250     {
00251         return(INDEX_NA);
00252     }
00253 
00254     return(refToQuery[refOffset]);
00255 }
00256 
00257 
00258 // Return the number of bases that overlap the reference and the
00259 // read associated with this cigar that falls within the specified region.
00260 uint32_t Cigar::getNumOverlaps(int32_t start, int32_t end,
00261                                int32_t queryStartPos)
00262 {
00263     // Get the overlap info.
00264     if ((queryToRef.size() == 0) || (refToQuery.size() == 0))
00265     {
00266         setQueryAndReferenceIndexes();
00267     }
00268 
00269     // Get the start and end offsets.
00270     int32_t startRefOffset = 0;
00271     // If the specified start is more than the queryStartPos, set
00272     // the startRefOffset to the appropriate non-zero value.
00273     // (if start is <= queryStartPos, than startRefOffset is 0 - it should
00274     // not be set to a negative value.)
00275     if (start > queryStartPos)
00276     {
00277         startRefOffset = start - queryStartPos;
00278     }
00279 
00280     int32_t endRefOffset = end - queryStartPos;
00281     if (end  == -1)
00282     {
00283         // -1 means that the region goes to the end of the refrerence.
00284         // So set endRefOffset to the max refOffset + 1 which is the
00285         // size of the refToQuery vector.
00286         endRefOffset = refToQuery.size();
00287     }
00288 
00289 
00290     // if endRefOffset is less than 0, then this read does not fall within
00291     // the specified region, so return 0.
00292     if (endRefOffset < 0)
00293     {
00294         return(0);
00295     }
00296 
00297     // Get the overlaps for these offsets.
00298     // Loop through the read counting positions that match the reference
00299     // within this region.
00300     int32_t refOffset = 0;
00301     int32_t numOverlaps = 0;
00302     for (unsigned int queryIndex = 0; queryIndex < queryToRef.size();
00303             queryIndex++)
00304     {
00305         refOffset = getRefOffset(queryIndex);
00306         if (refOffset > endRefOffset)
00307         {
00308             // Past the end of the specified region, so stop checking
00309             // for overlaps since there will be no more.
00310             break;
00311         }
00312         else if ((refOffset >= startRefOffset) && (refOffset < endRefOffset))
00313         {
00314             // within the region, increment the counter.
00315             ++numOverlaps;
00316         }
00317     }
00318 
00319     return(numOverlaps);
00320 }
00321 
00322 
00323 // Return whether or not the cigar has an indel
00324 bool Cigar::hasIndel()
00325 {
00326     for(unsigned int i = 0; i < cigarOperations.size(); i++)
00327     {
00328         if((cigarOperations[i].operation == insert) ||
00329            (cigarOperations[i].operation == del))
00330         {
00331             // Found an indel, so return true.
00332             return(true);
00333         }
00334     }
00335     // Went through all the operations, and found no indel, so return false.
00336     return(false);
00337 }
00338 
00339 
00340 // Clear the query index/reference offset index vectors.
00341 void Cigar::clearQueryAndReferenceIndexes()
00342 {
00343     queryToRef.clear();
00344     refToQuery.clear();
00345 }
00346 
00347 
00348 ///////////////////////////////////////////////////////
00349 // Set the query index/reference offset index vectors.
00350 //
00351 // For Cigar: 3M2I2M1D1M
00352 // That total count of cigar elements is 9 (3+2+2+1+1)
00353 //
00354 // The entries that are valid in the query/reference contain the index/offset
00355 // where they are found in the query/reference.  N/A are marked by 'x':
00356 // query indexes:     0123456x7
00357 //                    ---------
00358 // reference offsets: 012xx3456
00359 //
00360 // This shows what query index is associated with which reference offset and
00361 // vice versa.
00362 // For ones where an x appears, -1 would be returned.
00363 //
00364 void Cigar::setQueryAndReferenceIndexes()
00365 {
00366     // First ensure that the vectors are clear by clearing them.
00367     clearQueryAndReferenceIndexes();
00368 
00369     // Process each cigar index.
00370     for (uint32_t cigarIndex = 0; cigarIndex < cigarOperations.size(); cigarIndex++)
00371     {
00372         // Process the cigar operation.
00373         switch (cigarOperations[cigarIndex].operation)
00374         {
00375             case match:
00376             case mismatch:
00377                 // For match/mismatch, update the maps between query
00378                 // and reference for the number of matches/mismatches.
00379                 for (uint32_t i = 0; i < cigarOperations[cigarIndex].count; i++)
00380                 {
00381                     // The associated indexes are the next location in
00382                     // each array, which is equal to the current size.
00383                     int32_t queryToRefLen = queryToRef.size();
00384                     int32_t refToQueryLen = refToQuery.size();
00385                     queryToRef.push_back(refToQueryLen);
00386                     refToQuery.push_back(queryToRefLen);
00387                 }
00388                 break;
00389             case insert:
00390             case softClip:
00391                 // Add N/A reference offset for each query index that this
00392                 // insert covers.
00393                 for (uint32_t i = 0; i < cigarOperations[cigarIndex].count; i++)
00394                 {
00395                     queryToRef.push_back(INDEX_NA);
00396                 }
00397                 break;
00398             case del:
00399             case skip:
00400                 // Add N/A query index for each reference offset that this
00401                 // deletion/skip covers.
00402                 for (uint32_t i = 0; i < cigarOperations[cigarIndex].count; i++)
00403                 {
00404                     refToQuery.push_back(INDEX_NA);
00405                 }
00406                 break;
00407             case hardClip:
00408             case pad:
00409             case none:
00410                 break;
00411         };
00412     }
00413 }
00414 
Generated on Tue Aug 23 18:19:05 2011 for libStatGen Software by  doxygen 1.6.3