libStatGen Software  1
Cigar.cpp
00001 /*
00002  *  Copyright (C) 2010-2011  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #include <stdio.h>
00019 #include <stdlib.h>
00020 #include "Cigar.h"
00021 #include "STLUtilities.h"
00022 
00023 // Initialize INDEX_NA.
00024 const int32_t Cigar::INDEX_NA = -1;
00025 
00026 
00027 ////////////////////////////////////////////////////////////////////////
00028 //
00029 // Cigar Class
00030 //
00031 
00032 //
00033 // Set the passed in string to the string reprentation of the Cigar operations
00034 // in this object.
00035 //
00036 void Cigar::getCigarString(std::string& cigarString) const
00037 {
00038     using namespace STLUtilities;
00039 
00040     std::vector<CigarOperator>::const_iterator i;
00041 
00042     cigarString.clear();  // clear result string
00043 
00044     // Progressively append the character representations of the operations to
00045     // the cigar string.
00046     for (i = cigarOperations.begin(); i != cigarOperations.end(); i++)
00047     {
00048         cigarString << (*i).count << (*i).getChar();
00049     }
00050 }
00051 
00052 void Cigar::getCigarString(String& cigarString) const
00053 {
00054     std::string cigar;
00055 
00056     getCigarString(cigar);
00057 
00058     cigarString = cigar.c_str();
00059 
00060     return;
00061 }
00062 
00063 void Cigar::getExpandedString(std::string &s) const
00064 {
00065     s = "";
00066 
00067     std::vector<CigarOperator>::const_iterator i;
00068 
00069     // Progressively append the character representations of the operations to
00070     // the string passed in
00071 
00072     for (i = cigarOperations.begin(); i != cigarOperations.end(); i++)
00073     {
00074         for (uint32_t j = 0; j<(*i).count; j++) s += (*i).getChar();
00075     }
00076     return;
00077 }
00078 
00079 
00080 bool Cigar::operator == (Cigar &rhs) const
00081 {
00082 
00083     if (this->size() != rhs.size()) return false;
00084 
00085     for (int i = 0; i < this->size(); i++)
00086     {
00087         if (cigarOperations[i]!=rhs.cigarOperations[i]) return false;
00088     }
00089     return true;
00090 }
00091 
00092 
00093 // return the length of the read that corresponds to
00094 // the current CIGAR string.
00095 int Cigar::getExpectedQueryBaseCount() const
00096 {
00097     int matchCount = 0;
00098     std::vector<CigarOperator>::const_iterator i;
00099     for (i = cigarOperations.begin(); i != cigarOperations.end(); i++)
00100     {
00101         switch (i->operation)
00102         {
00103             case match:
00104             case mismatch:
00105             case softClip:
00106             case insert:
00107                 matchCount += i->count;
00108                 break;
00109             default:
00110                 // we only care about operations that are in the query sequence.
00111                 break;
00112         }
00113     }
00114     return matchCount;
00115 }
00116 
00117 
00118 // return the number of bases in the reference that
00119 // this read "spans"
00120 int Cigar::getExpectedReferenceBaseCount() const
00121 {
00122     int matchCount = 0;
00123     std::vector<CigarOperator>::const_iterator i;
00124     for (i = cigarOperations.begin(); i != cigarOperations.end(); i++)
00125     {
00126         switch (i->operation)
00127         {
00128             case match:
00129             case mismatch:
00130             case del:
00131             case skip:
00132                 matchCount += i->count;
00133                 break;
00134             default:
00135                 // we only care about operations that are in the reference sequence.
00136                 break;
00137         }
00138     }
00139     return matchCount;
00140 }
00141 
00142 
00143 // Return the number of clips that are at the beginning of the cigar.
00144 int Cigar::getNumBeginClips() const
00145 {
00146     int numBeginClips = 0;
00147     for (unsigned int i = 0; i != cigarOperations.size(); i++)
00148     {
00149         if ((cigarOperations[i].operation == softClip) ||
00150                 (cigarOperations[i].operation == hardClip))
00151         {
00152             // Clipping operator, increment the counter.
00153             numBeginClips += cigarOperations[i].count;
00154         }
00155         else
00156         {
00157             // Break out of the loop since a non-clipping operator was found.
00158             break;
00159         }
00160     }
00161     return(numBeginClips);
00162 }
00163 
00164 
00165 // Return the number of clips that are at the end of the cigar.
00166 int Cigar::getNumEndClips() const
00167 {
00168     int numEndClips = 0;
00169     for (int i = (cigarOperations.size() - 1); i >= 0; i--)
00170     {
00171         if ((cigarOperations[i].operation == softClip) ||
00172                 (cigarOperations[i].operation == hardClip))
00173         {
00174             // Clipping operator, increment the counter.
00175             numEndClips += cigarOperations[i].count;
00176         }
00177         else
00178         {
00179             // Break out of the loop since a non-clipping operator was found.
00180             break;
00181         }
00182     }
00183     return(numEndClips);
00184 }
00185 
00186 
00187 int32_t Cigar::getRefOffset(int32_t queryIndex)
00188 {
00189     // If the vectors aren't set, set them.
00190     if ((queryToRef.size() == 0) || (refToQuery.size() == 0))
00191     {
00192         setQueryAndReferenceIndexes();
00193     }
00194     if ((queryIndex < 0) || ((uint32_t)queryIndex >= queryToRef.size()))
00195     {
00196         return(INDEX_NA);
00197     }
00198     return(queryToRef[queryIndex]);
00199 }
00200 
00201 
00202 int32_t Cigar::getQueryIndex(int32_t refOffset)
00203 {
00204     // If the vectors aren't set, set them.
00205     if ((queryToRef.size() == 0) || (refToQuery.size() == 0))
00206     {
00207         setQueryAndReferenceIndexes();
00208     }
00209     if ((refOffset < 0) || ((uint32_t)refOffset >= refToQuery.size()))
00210     {
00211         return(INDEX_NA);
00212     }
00213     return(refToQuery[refOffset]);
00214 }
00215 
00216 
00217 int32_t Cigar::getRefPosition(int32_t queryIndex, int32_t queryStartPos)
00218 {
00219     // If the vectors aren't set, set them.
00220     if ((queryToRef.size() == 0) || (refToQuery.size() == 0))
00221     {
00222         setQueryAndReferenceIndexes();
00223     }
00224     if ((queryIndex < 0) || ((uint32_t)queryIndex >= queryToRef.size()))
00225     {
00226         return(INDEX_NA);
00227     }
00228 
00229     if (queryToRef[queryIndex] != INDEX_NA)
00230     {
00231         return(queryToRef[queryIndex] + queryStartPos);
00232     }
00233     return(INDEX_NA);
00234 }
00235 
00236 
00237 // Return the query index associated with the specified reference position
00238 // when the query starts at the specified reference position based on
00239 // this cigar.
00240 int32_t Cigar::getQueryIndex(int32_t refPosition, int32_t queryStartPos)
00241 {
00242     // If the vectors aren't set, set them.
00243     if ((queryToRef.size() == 0) || (refToQuery.size() == 0))
00244     {
00245         setQueryAndReferenceIndexes();
00246     }
00247 
00248     int32_t refOffset = refPosition - queryStartPos;
00249     if ((refOffset < 0) || ((uint32_t)refOffset >= refToQuery.size()))
00250     {
00251         return(INDEX_NA);
00252     }
00253 
00254     return(refToQuery[refOffset]);
00255 }
00256 
00257 
00258 int32_t Cigar::getExpandedCigarIndexFromQueryIndex(int32_t queryIndex)
00259 {
00260     // If the vectors aren't set, set them.
00261     if ((queryToRef.size() == 0) || (refToQuery.size() == 0))
00262     {
00263         setQueryAndReferenceIndexes();
00264     }
00265     if ((queryIndex < 0) || ((uint32_t)queryIndex >= queryToCigar.size()))
00266     {
00267         return(INDEX_NA);
00268     }
00269     return(queryToCigar[queryIndex]);
00270 }
00271 
00272 
00273 int32_t Cigar::getExpandedCigarIndexFromRefOffset(int32_t refOffset)
00274 {
00275     // If the vectors aren't set, set them.
00276     if ((queryToRef.size() == 0) || (refToQuery.size() == 0))
00277     {
00278         setQueryAndReferenceIndexes();
00279     }
00280     if ((refOffset < 0) || ((uint32_t)refOffset >= refToCigar.size()))
00281     {
00282         return(INDEX_NA);
00283     }
00284     return(refToCigar[refOffset]);
00285 }
00286 
00287 
00288 int32_t Cigar::getExpandedCigarIndexFromRefPos(int32_t refPosition, 
00289                                                int32_t queryStartPos)
00290 {
00291     return(getExpandedCigarIndexFromRefOffset(refPosition - queryStartPos));
00292 }
00293 
00294 
00295 char Cigar::getCigarCharOp(int32_t expandedCigarIndex)
00296 {
00297     // Check if the expanded cigar has been set yet
00298     if ((queryToRef.size() == 0) || (refToQuery.size() == 0))
00299     {
00300         // Set the expanded cigar.
00301         setQueryAndReferenceIndexes();
00302     }
00303 
00304     // Check to see if the index is in range.
00305     if((expandedCigarIndex < 0) || 
00306        ((uint32_t)expandedCigarIndex >= myExpandedCigar.length()))
00307     {
00308         return('?');
00309     }
00310     return(myExpandedCigar[expandedCigarIndex]);
00311 }
00312 
00313 
00314 char Cigar::getCigarCharOpFromQueryIndex(int32_t queryIndex)
00315 {
00316     return(getCigarCharOp(getExpandedCigarIndexFromQueryIndex(queryIndex)));
00317 }
00318 
00319 
00320 char Cigar::getCigarCharOpFromRefOffset(int32_t refOffset)
00321 {
00322     return(getCigarCharOp(getExpandedCigarIndexFromRefOffset(refOffset)));
00323 }
00324 
00325 
00326 char Cigar::getCigarCharOpFromRefPos(int32_t refPosition, int32_t queryStartPos)
00327 {
00328     return(getCigarCharOp(getExpandedCigarIndexFromRefPos(refPosition, queryStartPos)));
00329 }
00330 
00331 
00332 // Return the number of bases that overlap the reference and the
00333 // read associated with this cigar that falls within the specified region.
00334 uint32_t Cigar::getNumOverlaps(int32_t start, int32_t end,
00335                                int32_t queryStartPos)
00336 {
00337     // Get the overlap info.
00338     if ((queryToRef.size() == 0) || (refToQuery.size() == 0))
00339     {
00340         setQueryAndReferenceIndexes();
00341     }
00342 
00343     // Get the start and end offsets.
00344     int32_t startRefOffset = 0;
00345     // If the specified start is more than the queryStartPos, set
00346     // the startRefOffset to the appropriate non-zero value.
00347     // (if start is <= queryStartPos, than startRefOffset is 0 - it should
00348     // not be set to a negative value.)
00349     if (start > queryStartPos)
00350     {
00351         startRefOffset = start - queryStartPos;
00352     }
00353 
00354     int32_t endRefOffset = end - queryStartPos;
00355     if (end  == -1)
00356     {
00357         // -1 means that the region goes to the end of the refrerence.
00358         // So set endRefOffset to the max refOffset + 1 which is the
00359         // size of the refToQuery vector.
00360         endRefOffset = refToQuery.size();
00361     }
00362 
00363 
00364     // if endRefOffset is less than 0, then this read does not fall within
00365     // the specified region, so return 0.
00366     if (endRefOffset < 0)
00367     {
00368         return(0);
00369     }
00370 
00371     // Get the overlaps for these offsets.
00372     // Loop through the read counting positions that match the reference
00373     // within this region.
00374     int32_t refOffset = 0;
00375     int32_t numOverlaps = 0;
00376     for (unsigned int queryIndex = 0; queryIndex < queryToRef.size();
00377             queryIndex++)
00378     {
00379         refOffset = getRefOffset(queryIndex);
00380         if (refOffset > endRefOffset)
00381         {
00382             // Past the end of the specified region, so stop checking
00383             // for overlaps since there will be no more.
00384             break;
00385         }
00386         else if ((refOffset >= startRefOffset) && (refOffset < endRefOffset))
00387         {
00388             // within the region, increment the counter.
00389             ++numOverlaps;
00390         }
00391     }
00392 
00393     return(numOverlaps);
00394 }
00395 
00396 
00397 // Return whether or not the cigar has an indel
00398 bool Cigar::hasIndel()
00399 {
00400     for(unsigned int i = 0; i < cigarOperations.size(); i++)
00401     {
00402         if((cigarOperations[i].operation == insert) ||
00403            (cigarOperations[i].operation == del))
00404         {
00405             // Found an indel, so return true.
00406             return(true);
00407         }
00408     }
00409     // Went through all the operations, and found no indel, so return false.
00410     return(false);
00411 }
00412 
00413 
00414 // Clear the query index/reference offset index vectors.
00415 void Cigar::clearQueryAndReferenceIndexes()
00416 {
00417     queryToRef.clear();
00418     refToQuery.clear();
00419     refToCigar.clear();
00420     queryToCigar.clear();
00421     myExpandedCigar.clear();
00422 }
00423 
00424 
00425 ///////////////////////////////////////////////////////
00426 // Set the query index/reference offset index vectors.
00427 //
00428 // For Cigar: 3M2I2M1D1M
00429 // That total count of cigar elements is 9 (3+2+2+1+1)
00430 //
00431 // The entries that are valid in the query/reference contain the index/offset
00432 // where they are found in the query/reference.  N/A are marked by 'x':
00433 // query indexes:     0123456x7
00434 //                    ---------
00435 // reference offsets: 012xx3456
00436 //
00437 // This shows what query index is associated with which reference offset and
00438 // vice versa.
00439 // For ones where an x appears, -1 would be returned.
00440 //
00441 void Cigar::setQueryAndReferenceIndexes()
00442 {
00443     // First ensure that the vectors are clear by clearing them.
00444     clearQueryAndReferenceIndexes();
00445 
00446     int extPos = 0;
00447 
00448     // Process each cigar index.
00449     for (uint32_t cigarIndex = 0; cigarIndex < cigarOperations.size(); cigarIndex++)
00450     {
00451         // Process the cigar operation.
00452         switch (cigarOperations[cigarIndex].operation)
00453         {
00454             case match:
00455             case mismatch:
00456                 // For match/mismatch, update the maps between query
00457                 // and reference for the number of matches/mismatches.
00458                 for (uint32_t i = 0; i < cigarOperations[cigarIndex].count; i++)
00459                 {
00460                     // The associated indexes are the next location in
00461                     // each array, which is equal to the current size.
00462                     int32_t queryToRefLen = queryToRef.size();
00463                     int32_t refToQueryLen = refToQuery.size();
00464                     queryToRef.push_back(refToQueryLen);
00465                     refToQuery.push_back(queryToRefLen);
00466                     refToCigar.push_back(extPos);
00467                     queryToCigar.push_back(extPos++);
00468                     myExpandedCigar.push_back(cigarOperations[cigarIndex].getChar());
00469                 }
00470                 break;
00471             case insert:
00472             case softClip:
00473                 // Add N/A reference offset for each query index that this
00474                 // insert covers.
00475                 for (uint32_t i = 0; i < cigarOperations[cigarIndex].count; i++)
00476                 {
00477                     queryToRef.push_back(INDEX_NA);
00478                     queryToCigar.push_back(extPos++);
00479                     myExpandedCigar.push_back(cigarOperations[cigarIndex].getChar());
00480                 }
00481                 break;
00482             case del:
00483             case skip:
00484                 // Add N/A query index for each reference offset that this
00485                 // deletion/skip covers.
00486                 for (uint32_t i = 0; i < cigarOperations[cigarIndex].count; i++)
00487                 {
00488                     refToQuery.push_back(INDEX_NA);
00489                     refToCigar.push_back(extPos++);
00490                     myExpandedCigar.push_back(cigarOperations[cigarIndex].getChar());
00491                 }
00492                 break;
00493             case hardClip:
00494             case pad:
00495             case none:
00496                 for (uint32_t i = 0; i < cigarOperations[cigarIndex].count; i++)
00497                 {
00498                     myExpandedCigar.push_back(cigarOperations[cigarIndex].getChar());
00499                     ++extPos;
00500                 }
00501                 break;
00502         };
00503     }
00504 }
00505 
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends