CigarRoller.cpp

00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #include <stdio.h>
00019 #include <stdlib.h>
00020 #include "CigarRoller.h"
00021 
00022 ////////////////////////////////////////////////////////////////////////
00023 //
00024 // Cigar Roller Class
00025 //
00026 
00027 
00028 CigarRoller & CigarRoller::operator += (CigarRoller &rhs)
00029 {
00030     std::vector<CigarOperator>::iterator i;
00031     for (i = rhs.cigarOperations.begin(); i != rhs.cigarOperations.end(); i++)
00032     {
00033         (*this) += *i;
00034     }
00035     return *this;
00036 }
00037 
00038 
00039 //
00040 // Append a new operator at the end of the sequence.
00041 //
00042 CigarRoller & CigarRoller::operator += (const CigarOperator &rhs)
00043 {
00044     // Adding to the cigar, so the query & reference indexes would be
00045     // incomplete, so just clear them.
00046     clearQueryAndReferenceIndexes();
00047 
00048     if (rhs.count==0)
00049     {
00050         // nothing to do
00051     }
00052     else if (cigarOperations.empty() || cigarOperations.back() != rhs)
00053     {
00054         cigarOperations.push_back(rhs);
00055     }
00056     else
00057     {
00058         // last stored operation is the same as the new one, so just add it in
00059         cigarOperations.back().count += rhs.count;
00060     }
00061     return *this;
00062 }
00063 
00064 
00065 CigarRoller & CigarRoller::operator = (CigarRoller &rhs)
00066 {
00067     clear();
00068 
00069     (*this) += rhs;
00070     
00071     return *this;
00072 }
00073 
00074 
00075 //
00076 void CigarRoller::Add(Operation operation, int count)
00077 {
00078     CigarOperator rhs(operation, count);
00079     (*this) += rhs;
00080 }
00081 
00082 
00083 void CigarRoller::Add(char operation, int count)
00084 {
00085     switch (operation)
00086     {
00087         case 0:
00088         case 'M':
00089             Add(match, count);
00090             break;
00091         case 1:
00092         case 'I':
00093             Add(insert, count);
00094             break;
00095         case 2:
00096         case 'D':
00097             Add(del, count);
00098             break;
00099         case 3:
00100         case 'N':
00101             Add(skip, count);
00102             break;
00103         case 4:
00104         case 'S':
00105             Add(softClip, count);
00106             break;
00107         case 5:
00108         case 'H':
00109             Add(hardClip, count);
00110             break;
00111         case 6:
00112         case 'P':
00113             Add(pad, count);
00114             break;
00115         default:
00116             // Hmmm... what to do?
00117             std::cerr << "ERROR parsing CIGAR - invalid character";
00118             break;
00119     }
00120 }
00121 
00122 
00123 void CigarRoller::Add(const char *cigarString)
00124 {
00125     int operationCount = 0;
00126     while (*cigarString)
00127     {
00128         if (isdigit(*cigarString))
00129         {
00130             char *endPtr;
00131             operationCount = strtol((char *) cigarString, &endPtr, 10);
00132             cigarString = endPtr;
00133         }
00134         else
00135         {
00136             Add(*cigarString, operationCount);
00137             cigarString++;
00138         }
00139     }
00140 }
00141 
00142 
00143 void CigarRoller::Set(const char *cigarString)
00144 {
00145     clear();
00146     Add(cigarString);
00147 }
00148 
00149 
00150 void CigarRoller::Set(const uint32_t* cigarBuffer, uint16_t bufferLen)
00151 {
00152     clear();
00153 
00154     // Parse the buffer.
00155     for (int i = 0; i < bufferLen; i++)
00156     {
00157         int opLen = cigarBuffer[i] >> 4;
00158 
00159         Add(cigarBuffer[i] & 0xF, opLen);
00160     }
00161 }
00162 
00163 
00164 //
00165 // when we examine CIGAR strings, we need to know how
00166 // many cumulative insert and delete positions there are
00167 // so that we can adjust the read location appropriately.
00168 //
00169 // Here, we iterate over the vector of CIGAR operations,
00170 // summaring the count for each insert or delete (insert
00171 // increases the offset, delete decreases it).
00172 //
00173 // The use case for this is when we have a genome match
00174 // position based on an index word other than the first one,
00175 // and there is also a insert or delete between the beginning
00176 // of the read and the index word.  We can't simply report
00177 // the match position without taking into account the indels,
00178 // otherwise we'll be off by N where N is the sum of this
00179 // indel count.
00180 //
00181 // DEPRECATED - do not use.  There are better ways to accomplish that by using
00182 // read lengths, reference lengths, span of the read, etc.
00183 int CigarRoller::getMatchPositionOffset()
00184 {
00185     int offset = 0;
00186     std::vector<CigarOperator>::iterator i;
00187 
00188     for (i = cigarOperations.begin(); i != cigarOperations.end(); i++)
00189     {
00190         switch (i->operation)
00191         {
00192             case insert:
00193                 offset += i->count;
00194                 break;
00195             case del:
00196                 offset -= i->count;
00197                 break;
00198                 // TODO anything for case skip:????
00199             default:
00200                 break;
00201         }
00202     }
00203     return offset;
00204 }
00205 
00206 
00207 //
00208 // Get the string reprentation of the Cigar operations in this object.
00209 // Caller must delete the returned value.
00210 //
00211 const char * CigarRoller::getString()
00212 {
00213     // NB: the exact size of the string is not important, it just needs to be guaranteed
00214     // larger than the largest number of characters we could put into it.
00215 
00216     // we do not explicitly manage memory usage, and we expect when program exits, the memory used here will be freed
00217     static char *ret = NULL;
00218     static unsigned int retSize = 0;
00219 
00220     if (ret == NULL)
00221     {
00222         retSize = cigarOperations.size() * 12 + 1;  // 12 == a magic number -> > 1 + log base 10 of MAXINT
00223         ret = (char*) malloc(sizeof(char) * retSize);
00224         assert(ret != NULL);
00225 
00226     }
00227     else
00228     {
00229         // currently, ret pointer has enough memory to use
00230         if (retSize > cigarOperations.size() * 12 + 1)
00231         {
00232         }
00233         else
00234         {
00235             retSize = cigarOperations.size() * 12 + 1;
00236             free(ret);
00237             ret = (char*) malloc(sizeof(char) * retSize);
00238         }
00239         assert(ret != NULL);
00240     }
00241 
00242     char *ptr = ret;
00243     char buf[12];   // > 1 + log base 10 of MAXINT
00244 
00245     std::vector<CigarOperator>::iterator i;
00246 
00247     // Progressively append the character representations of the operations to
00248     // the cigar string we allocated above.
00249 
00250     *ptr = '\0';    // clear result string
00251     for (i = cigarOperations.begin(); i != cigarOperations.end(); i++)
00252     {
00253         sprintf(buf, "%d%c", (*i).count, (*i).getChar());
00254         strcat(ptr, buf);
00255         while (*ptr)
00256         {
00257             ptr++;    // limit the cost of strcat above
00258         }
00259     }
00260     return ret;
00261 }
00262 
00263 
00264 void CigarRoller::clear()
00265 {
00266     // Clearing the cigar, so the query & reference indexes are out of
00267     // date, so clear them.
00268     clearQueryAndReferenceIndexes();
00269     cigarOperations.clear();
00270 }
Generated on Wed Nov 17 15:38:28 2010 for StatGen Software by  doxygen 1.6.3