libStatGen Software
1
|
00001 /* 00002 * Copyright (C) 2010-2011 Regents of the University of Michigan 00003 * 00004 * This program is free software: you can redistribute it and/or modify 00005 * it under the terms of the GNU General Public License as published by 00006 * the Free Software Foundation, either version 3 of the License, or 00007 * (at your option) any later version. 00008 * 00009 * This program is distributed in the hope that it will be useful, 00010 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 * GNU General Public License for more details. 00013 * 00014 * You should have received a copy of the GNU General Public License 00015 * along with this program. If not, see <http://www.gnu.org/licenses/>. 00016 */ 00017 00018 #include <stdio.h> 00019 #include <stdlib.h> 00020 #include <ctype.h> 00021 #include "CigarRoller.h" 00022 00023 //////////////////////////////////////////////////////////////////////// 00024 // 00025 // Cigar Roller Class 00026 // 00027 00028 00029 CigarRoller & CigarRoller::operator += (CigarRoller &rhs) 00030 { 00031 std::vector<CigarOperator>::iterator i; 00032 for (i = rhs.cigarOperations.begin(); i != rhs.cigarOperations.end(); i++) 00033 { 00034 (*this) += *i; 00035 } 00036 return *this; 00037 } 00038 00039 00040 // 00041 // Append a new operator at the end of the sequence. 00042 // 00043 CigarRoller & CigarRoller::operator += (const CigarOperator &rhs) 00044 { 00045 // Adding to the cigar, so the query & reference indexes would be 00046 // incomplete, so just clear them. 00047 clearQueryAndReferenceIndexes(); 00048 00049 if (rhs.count==0) 00050 { 00051 // nothing to do 00052 } 00053 else if (cigarOperations.empty() || cigarOperations.back() != rhs) 00054 { 00055 cigarOperations.push_back(rhs); 00056 } 00057 else 00058 { 00059 // last stored operation is the same as the new one, so just add it in 00060 cigarOperations.back().count += rhs.count; 00061 } 00062 return *this; 00063 } 00064 00065 00066 CigarRoller & CigarRoller::operator = (CigarRoller &rhs) 00067 { 00068 clear(); 00069 00070 (*this) += rhs; 00071 00072 return *this; 00073 } 00074 00075 00076 // 00077 void CigarRoller::Add(Operation operation, int count) 00078 { 00079 CigarOperator rhs(operation, count); 00080 (*this) += rhs; 00081 } 00082 00083 00084 void CigarRoller::Add(char operation, int count) 00085 { 00086 switch (operation) 00087 { 00088 case 0: 00089 case 'M': 00090 Add(match, count); 00091 break; 00092 case 1: 00093 case 'I': 00094 Add(insert, count); 00095 break; 00096 case 2: 00097 case 'D': 00098 Add(del, count); 00099 break; 00100 case 3: 00101 case 'N': 00102 Add(skip, count); 00103 break; 00104 case 4: 00105 case 'S': 00106 Add(softClip, count); 00107 break; 00108 case 5: 00109 case 'H': 00110 Add(hardClip, count); 00111 break; 00112 case 6: 00113 case 'P': 00114 Add(pad, count); 00115 break; 00116 case 7: 00117 case '=': 00118 Add(match, count); 00119 break; 00120 case 8: 00121 case 'X': 00122 Add(match, count); 00123 break; 00124 default: 00125 // Hmmm... what to do? 00126 std::cerr << "ERROR " 00127 << "(" << __FILE__ << ":" << __LINE__ <<"): " 00128 << "Parsing CIGAR - invalid character found " 00129 << "with parameter " << operation << " and " << count 00130 << std::endl; 00131 break; 00132 } 00133 } 00134 00135 00136 void CigarRoller::Add(const char *cigarString) 00137 { 00138 int operationCount = 0; 00139 while (*cigarString) 00140 { 00141 if (isdigit(*cigarString)) 00142 { 00143 char *endPtr; 00144 operationCount = strtol((char *) cigarString, &endPtr, 10); 00145 cigarString = endPtr; 00146 } 00147 else 00148 { 00149 Add(*cigarString, operationCount); 00150 cigarString++; 00151 } 00152 } 00153 } 00154 00155 00156 bool CigarRoller::Remove(int index) 00157 { 00158 if((index < 0) || ((unsigned int)index >= cigarOperations.size())) 00159 { 00160 // can't remove, out of range, return false. 00161 return(false); 00162 } 00163 cigarOperations.erase(cigarOperations.begin() + index); 00164 // Modifying the cigar, so the query & reference indexes are out of date, 00165 // so clear them. 00166 clearQueryAndReferenceIndexes(); 00167 return(true); 00168 } 00169 00170 00171 bool CigarRoller::IncrementCount(int index, int increment) 00172 { 00173 if((index < 0) || ((unsigned int)index >= cigarOperations.size())) 00174 { 00175 // can't update, out of range, return false. 00176 return(false); 00177 } 00178 cigarOperations[index].count += increment; 00179 00180 // Modifying the cigar, so the query & reference indexes are out of date, 00181 // so clear them. 00182 clearQueryAndReferenceIndexes(); 00183 return(true); 00184 } 00185 00186 00187 bool CigarRoller::Update(int index, Operation op, int count) 00188 { 00189 if((index < 0) || ((unsigned int)index >= cigarOperations.size())) 00190 { 00191 // can't update, out of range, return false. 00192 return(false); 00193 } 00194 cigarOperations[index].operation = op; 00195 cigarOperations[index].count = count; 00196 00197 // Modifying the cigar, so the query & reference indexes are out of date, 00198 // so clear them. 00199 clearQueryAndReferenceIndexes(); 00200 return(true); 00201 } 00202 00203 00204 void CigarRoller::Set(const char *cigarString) 00205 { 00206 clear(); 00207 Add(cigarString); 00208 } 00209 00210 00211 void CigarRoller::Set(const uint32_t* cigarBuffer, uint16_t bufferLen) 00212 { 00213 clear(); 00214 00215 // Parse the buffer. 00216 for (int i = 0; i < bufferLen; i++) 00217 { 00218 int opLen = cigarBuffer[i] >> 4; 00219 00220 Add(cigarBuffer[i] & 0xF, opLen); 00221 } 00222 } 00223 00224 00225 // 00226 // when we examine CIGAR strings, we need to know how 00227 // many cumulative insert and delete positions there are 00228 // so that we can adjust the read location appropriately. 00229 // 00230 // Here, we iterate over the vector of CIGAR operations, 00231 // summaring the count for each insert or delete (insert 00232 // increases the offset, delete decreases it). 00233 // 00234 // The use case for this is when we have a genome match 00235 // position based on an index word other than the first one, 00236 // and there is also a insert or delete between the beginning 00237 // of the read and the index word. We can't simply report 00238 // the match position without taking into account the indels, 00239 // otherwise we'll be off by N where N is the sum of this 00240 // indel count. 00241 // 00242 // DEPRECATED - do not use. There are better ways to accomplish that by using 00243 // read lengths, reference lengths, span of the read, etc. 00244 int CigarRoller::getMatchPositionOffset() 00245 { 00246 int offset = 0; 00247 std::vector<CigarOperator>::iterator i; 00248 00249 for (i = cigarOperations.begin(); i != cigarOperations.end(); i++) 00250 { 00251 switch (i->operation) 00252 { 00253 case insert: 00254 offset += i->count; 00255 break; 00256 case del: 00257 offset -= i->count; 00258 break; 00259 // TODO anything for case skip:???? 00260 default: 00261 break; 00262 } 00263 } 00264 return offset; 00265 } 00266 00267 00268 // 00269 // Get the string reprentation of the Cigar operations in this object. 00270 // Caller must delete the returned value. 00271 // 00272 const char * CigarRoller::getString() 00273 { 00274 // NB: the exact size of the string is not important, it just needs to be guaranteed 00275 // larger than the largest number of characters we could put into it. 00276 00277 // we do not explicitly manage memory usage, and we expect when program exits, the memory used here will be freed 00278 static char *ret = NULL; 00279 static unsigned int retSize = 0; 00280 00281 if (ret == NULL) 00282 { 00283 retSize = cigarOperations.size() * 12 + 1; // 12 == a magic number -> > 1 + log base 10 of MAXINT 00284 ret = (char*) malloc(sizeof(char) * retSize); 00285 assert(ret != NULL); 00286 00287 } 00288 else 00289 { 00290 // currently, ret pointer has enough memory to use 00291 if (retSize > cigarOperations.size() * 12 + 1) 00292 { 00293 } 00294 else 00295 { 00296 retSize = cigarOperations.size() * 12 + 1; 00297 free(ret); 00298 ret = (char*) malloc(sizeof(char) * retSize); 00299 } 00300 assert(ret != NULL); 00301 } 00302 00303 char *ptr = ret; 00304 char buf[12]; // > 1 + log base 10 of MAXINT 00305 00306 std::vector<CigarOperator>::iterator i; 00307 00308 // Progressively append the character representations of the operations to 00309 // the cigar string we allocated above. 00310 00311 *ptr = '\0'; // clear result string 00312 for (i = cigarOperations.begin(); i != cigarOperations.end(); i++) 00313 { 00314 sprintf(buf, "%d%c", (*i).count, (*i).getChar()); 00315 strcat(ptr, buf); 00316 while (*ptr) 00317 { 00318 ptr++; // limit the cost of strcat above 00319 } 00320 } 00321 return ret; 00322 } 00323 00324 00325 void CigarRoller::clear() 00326 { 00327 // Clearing the cigar, so the query & reference indexes are out of 00328 // date, so clear them. 00329 clearQueryAndReferenceIndexes(); 00330 cigarOperations.clear(); 00331 }