Cigar.cpp

00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #include <stdio.h>
00019 #include <stdlib.h>
00020 #include "Cigar.h"
00021 #include "STLUtilities.h"
00022 
00023 // Initialize INDEX_NA.
00024 const int32_t Cigar::INDEX_NA = -1;
00025 
00026 
00027 ////////////////////////////////////////////////////////////////////////
00028 //
00029 // Cigar Class
00030 //
00031 
00032 //
00033 // Set the passed in string to the string reprentation of the Cigar operations
00034 // in this object.
00035 //
00036 void Cigar::getCigarString(std::string& cigarString) const
00037 {
00038     using namespace STLUtilities;
00039 
00040     std::vector<CigarOperator>::const_iterator i;
00041 
00042     cigarString.clear();  // clear result string
00043 
00044     // Progressively append the character representations of the operations to
00045     // the cigar string.
00046     for (i = cigarOperations.begin(); i != cigarOperations.end(); i++)
00047     {
00048         cigarString << (*i).count << (*i).getChar();
00049     }
00050 }
00051 
00052 void Cigar::getCigarString(String& cigarString) const
00053 {
00054     std::string cigar;
00055 
00056     getCigarString(cigar);
00057 
00058     cigarString = cigar.c_str();
00059 
00060     return;
00061 }
00062 
00063 void Cigar::getExpandedString(std::string &s) const
00064 {
00065     s = "";
00066 
00067     std::vector<CigarOperator>::const_iterator i;
00068 
00069     // Progressively append the character representations of the operations to
00070     // the string passed in
00071 
00072     for (i = cigarOperations.begin(); i != cigarOperations.end(); i++)
00073     {
00074         for (uint32_t j = 0; j<(*i).count; j++) s += (*i).getChar();
00075     }
00076     return;
00077 }
00078 
00079 
00080 bool Cigar::operator == (Cigar &rhs) const
00081 {
00082 
00083     if (this->size() != rhs.size()) return false;
00084 
00085     for (int i = 0; i < this->size(); i++)
00086     {
00087         if (cigarOperations[i]!=rhs.cigarOperations[i]) return false;
00088     }
00089     return true;
00090 }
00091 
00092 
00093 /// return the length of the read that corresponds to
00094 /// the current CIGAR string.
00095 ///
00096 /// For validation, we should expect that a sequence
00097 /// read in a SAM file will be the same length as the
00098 /// value returned by this method.
00099 ///
00100 /// Example: 3M2D3M describes a read with three bases
00101 /// matching the reference, then skips 2 bases, then has
00102 /// three more bases that match the reference (match/mismatch).
00103 /// In this case, the read length is expected to be 6.
00104 ///
00105 /// Example: 3M2I3M describes a read with 3 match/mismatch
00106 /// bases, two extra bases, and then 3 more match/mistmatch
00107 /// bases.  The total in this example is 8 bases.
00108 ///
00109 /// /return returns the expected read length
00110 int Cigar::getExpectedQueryBaseCount() const
00111 {
00112     int matchCount = 0;
00113     std::vector<CigarOperator>::const_iterator i;
00114     for (i = cigarOperations.begin(); i != cigarOperations.end(); i++)
00115     {
00116         switch (i->operation)
00117         {
00118             case match:
00119             case mismatch:
00120             case softClip:
00121             case insert:
00122                 matchCount += i->count;
00123                 break;
00124             default:
00125                 // we only care about operations that are in the query sequence.
00126                 break;
00127         }
00128     }
00129     return matchCount;
00130 }
00131 
00132 
00133 /// return the number of bases in the reference that
00134 /// this read "spans"
00135 ///
00136 /// When doing range checking, we occassionally need to know
00137 /// how many total bases the CIGAR string represents as compared
00138 /// to the reference.
00139 ///
00140 /// Examples: 3M2D3M describes a read that overlays 8 bases in
00141 /// the reference.  3M2I3M describes a read with 3 bases that
00142 /// match the reference, two additional bases that aren't in the
00143 /// reference, and 3 more bases that match the reference, so it
00144 /// spans 6 bases in the reference.
00145 ///
00146 /// /return how many bases in the reference are spanned
00147 /// by the given CIGAR string
00148 ///
00149 int Cigar::getExpectedReferenceBaseCount() const
00150 {
00151     int matchCount = 0;
00152     std::vector<CigarOperator>::const_iterator i;
00153     for (i = cigarOperations.begin(); i != cigarOperations.end(); i++)
00154     {
00155         switch (i->operation)
00156         {
00157             case match:
00158             case mismatch:
00159             case del:
00160             case skip:
00161                 matchCount += i->count;
00162                 break;
00163             default:
00164                 // we only care about operations that are in the reference sequence.
00165                 break;
00166         }
00167     }
00168     return matchCount;
00169 }
00170 
00171 
00172 /// Return the number of clips that are at the beginning of the cigar.
00173 int Cigar::getNumBeginClips() const
00174 {
00175     int numBeginClips = 0;
00176     for (unsigned int i = 0; i != cigarOperations.size(); i++)
00177     {
00178         if ((cigarOperations[i].operation == softClip) ||
00179                 (cigarOperations[i].operation == hardClip))
00180         {
00181             // Clipping operator, increment the counter.
00182             numBeginClips += cigarOperations[i].count;
00183         }
00184         else
00185         {
00186             // Break out of the loop since a non-clipping operator was found.
00187             break;
00188         }
00189     }
00190     return(numBeginClips);
00191 }
00192 
00193 
00194 /// Return the number of clips that are at the end of the cigar.
00195 int Cigar::getNumEndClips() const
00196 {
00197     int numEndClips = 0;
00198     for (int i = (cigarOperations.size() - 1); i >= 0; i--)
00199     {
00200         if ((cigarOperations[i].operation == softClip) ||
00201                 (cigarOperations[i].operation == hardClip))
00202         {
00203             // Clipping operator, increment the counter.
00204             numEndClips += cigarOperations[i].count;
00205         }
00206         else
00207         {
00208             // Break out of the loop since a non-clipping operator was found.
00209             break;
00210         }
00211     }
00212     return(numEndClips);
00213 }
00214 
00215 
00216 int32_t Cigar::getRefOffset(int32_t queryIndex)
00217 {
00218     // If the vectors aren't set, set them.
00219     if ((queryToRef.size() == 0) || (refToQuery.size() == 0))
00220     {
00221         setQueryAndReferenceIndexes();
00222     }
00223     if ((queryIndex < 0) || ((uint32_t)queryIndex >= queryToRef.size()))
00224     {
00225         return(INDEX_NA);
00226     }
00227     return(queryToRef[queryIndex]);
00228 }
00229 
00230 
00231 int32_t Cigar::getQueryIndex(int32_t refOffset)
00232 {
00233     // If the vectors aren't set, set them.
00234     if ((queryToRef.size() == 0) || (refToQuery.size() == 0))
00235     {
00236         setQueryAndReferenceIndexes();
00237     }
00238     if ((refOffset < 0) || ((uint32_t)refOffset >= refToQuery.size()))
00239     {
00240         return(INDEX_NA);
00241     }
00242     return(refToQuery[refOffset]);
00243 }
00244 
00245 
00246 int32_t Cigar::getRefPosition(int32_t queryIndex, int32_t queryStartPos)
00247 {
00248     // If the vectors aren't set, set them.
00249     if ((queryToRef.size() == 0) || (refToQuery.size() == 0))
00250     {
00251         setQueryAndReferenceIndexes();
00252     }
00253     if ((queryIndex < 0) || ((uint32_t)queryIndex >= queryToRef.size()))
00254     {
00255         return(INDEX_NA);
00256     }
00257 
00258     if (queryToRef[queryIndex] != INDEX_NA)
00259     {
00260         return(queryToRef[queryIndex] + queryStartPos);
00261     }
00262     return(INDEX_NA);
00263 }
00264 
00265 
00266 // Return the query index associated with the specified reference position
00267 // when the query starts at the specified reference position based on
00268 // this cigar.
00269 int32_t Cigar::getQueryIndex(int32_t refPosition, int32_t queryStartPos)
00270 {
00271     // If the vectors aren't set, set them.
00272     if ((queryToRef.size() == 0) || (refToQuery.size() == 0))
00273     {
00274         setQueryAndReferenceIndexes();
00275     }
00276 
00277     int32_t refOffset = refPosition - queryStartPos;
00278     if ((refOffset < 0) || ((uint32_t)refOffset >= refToQuery.size()))
00279     {
00280         return(INDEX_NA);
00281     }
00282 
00283     return(refToQuery[refOffset]);
00284 }
00285 
00286 
00287 // Return the number of bases that overlap the reference and the
00288 // read associated with this cigar that falls within the specified region.
00289 uint32_t Cigar::getNumOverlaps(int32_t start, int32_t end,
00290                                int32_t queryStartPos)
00291 {
00292     // Get the overlap info.
00293     if ((queryToRef.size() == 0) || (refToQuery.size() == 0))
00294     {
00295         setQueryAndReferenceIndexes();
00296     }
00297 
00298     // Get the start and end offsets.
00299     int32_t startRefOffset = 0;
00300     // If the specified start is more than the queryStartPos, set
00301     // the startRefOffset to the appropriate non-zero value.
00302     // (if start is <= queryStartPos, than startRefOffset is 0 - it should
00303     // not be set to a negative value.)
00304     if (start > queryStartPos)
00305     {
00306         startRefOffset = start - queryStartPos;
00307     }
00308 
00309     int32_t endRefOffset = end - queryStartPos;
00310     if (end  == -1)
00311     {
00312         // -1 means that the region goes to the end of the refrerence.
00313         // So set endRefOffset to the max refOffset + 1 which is the
00314         // size of the refToQuery vector.
00315         endRefOffset = refToQuery.size();
00316     }
00317 
00318 
00319     // if endRefOffset is less than 0, then this read does not fall within
00320     // the specified region, so return 0.
00321     if (endRefOffset < 0)
00322     {
00323         return(0);
00324     }
00325 
00326     // Get the overlaps for these offsets.
00327     // Loop through the read counting positions that match the reference
00328     // within this region.
00329     int32_t refOffset = 0;
00330     int32_t numOverlaps = 0;
00331     for (unsigned int queryIndex = 0; queryIndex < queryToRef.size();
00332             queryIndex++)
00333     {
00334         refOffset = getRefOffset(queryIndex);
00335         if (refOffset > endRefOffset)
00336         {
00337             // Past the end of the specified region, so stop checking
00338             // for overlaps since there will be no more.
00339             break;
00340         }
00341         else if ((refOffset >= startRefOffset) && (refOffset < endRefOffset))
00342         {
00343             // within the region, increment the counter.
00344             ++numOverlaps;
00345         }
00346     }
00347 
00348     return(numOverlaps);
00349 }
00350 
00351 
00352 // Clear the query index/reference offset index vectors.
00353 void Cigar::clearQueryAndReferenceIndexes()
00354 {
00355     queryToRef.clear();
00356     refToQuery.clear();
00357 }
00358 
00359 
00360 ///////////////////////////////////////////////////////
00361 // Set the query index/reference offset index vectors.
00362 //
00363 // For Cigar: 3M2I2M1D1M
00364 // That total count of cigar elements is 9 (3+2+2+1+1)
00365 //
00366 // The entries that are valid in the query/reference contain the index/offset
00367 // where they are found in the query/reference.  N/A are marked by 'x':
00368 // query indexes:     0123456x7
00369 //                    ---------
00370 // reference offsets: 012xx3456
00371 //
00372 // This shows what query index is associated with which reference offset and
00373 // vice versa.
00374 // For ones where an x appears, -1 would be returned.
00375 //
00376 void Cigar::setQueryAndReferenceIndexes()
00377 {
00378     // First ensure that the vectors are clear by clearing them.
00379     clearQueryAndReferenceIndexes();
00380 
00381     // Process each cigar index.
00382     for (uint32_t cigarIndex = 0; cigarIndex < cigarOperations.size(); cigarIndex++)
00383     {
00384         // Process the cigar operation.
00385         switch (cigarOperations[cigarIndex].operation)
00386         {
00387             case match:
00388             case mismatch:
00389                 // For match/mismatch, update the maps between query
00390                 // and reference for the number of matches/mismatches.
00391                 for (uint32_t i = 0; i < cigarOperations[cigarIndex].count; i++)
00392                 {
00393                     // The associated indexes are the next location in
00394                     // each array, which is equal to the current size.
00395                     int32_t queryToRefLen = queryToRef.size();
00396                     int32_t refToQueryLen = refToQuery.size();
00397                     queryToRef.push_back(refToQueryLen);
00398                     refToQuery.push_back(queryToRefLen);
00399                 }
00400                 break;
00401             case insert:
00402             case softClip:
00403                 // Add N/A reference offset for each query index that this
00404                 // insert covers.
00405                 for (uint32_t i = 0; i < cigarOperations[cigarIndex].count; i++)
00406                 {
00407                     queryToRef.push_back(INDEX_NA);
00408                 }
00409                 break;
00410             case del:
00411             case skip:
00412                 // Add N/A query index for each reference offset that this
00413                 // deletion/skip covers.
00414                 for (uint32_t i = 0; i < cigarOperations[cigarIndex].count; i++)
00415                 {
00416                     refToQuery.push_back(INDEX_NA);
00417                 }
00418                 break;
00419             case hardClip:
00420             case pad:
00421             case none:
00422                 break;
00423         };
00424     }
00425 }
00426 
Generated on Wed Nov 17 15:38:28 2010 for StatGen Software by  doxygen 1.6.3