libStatGen Software
1
|
00001 /* 00002 * Copyright (C) 2010-2011 Regents of the University of Michigan 00003 * 00004 * This program is free software: you can redistribute it and/or modify 00005 * it under the terms of the GNU General Public License as published by 00006 * the Free Software Foundation, either version 3 of the License, or 00007 * (at your option) any later version. 00008 * 00009 * This program is distributed in the hope that it will be useful, 00010 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 * GNU General Public License for more details. 00013 * 00014 * You should have received a copy of the GNU General Public License 00015 * along with this program. If not, see <http://www.gnu.org/licenses/>. 00016 */ 00017 00018 #include <stdio.h> 00019 #include <stdlib.h> 00020 #include "Cigar.h" 00021 #include "STLUtilities.h" 00022 00023 // Initialize INDEX_NA. 00024 const int32_t Cigar::INDEX_NA = -1; 00025 00026 00027 //////////////////////////////////////////////////////////////////////// 00028 // 00029 // Cigar Class 00030 // 00031 00032 // 00033 // Set the passed in string to the string reprentation of the Cigar operations 00034 // in this object. 00035 // 00036 void Cigar::getCigarString(std::string& cigarString) const 00037 { 00038 using namespace STLUtilities; 00039 00040 std::vector<CigarOperator>::const_iterator i; 00041 00042 cigarString.clear(); // clear result string 00043 00044 // Progressively append the character representations of the operations to 00045 // the cigar string. 00046 for (i = cigarOperations.begin(); i != cigarOperations.end(); i++) 00047 { 00048 cigarString << (*i).count << (*i).getChar(); 00049 } 00050 } 00051 00052 void Cigar::getCigarString(String& cigarString) const 00053 { 00054 std::string cigar; 00055 00056 getCigarString(cigar); 00057 00058 cigarString = cigar.c_str(); 00059 00060 return; 00061 } 00062 00063 void Cigar::getExpandedString(std::string &s) const 00064 { 00065 s = ""; 00066 00067 std::vector<CigarOperator>::const_iterator i; 00068 00069 // Progressively append the character representations of the operations to 00070 // the string passed in 00071 00072 for (i = cigarOperations.begin(); i != cigarOperations.end(); i++) 00073 { 00074 for (uint32_t j = 0; j<(*i).count; j++) s += (*i).getChar(); 00075 } 00076 return; 00077 } 00078 00079 00080 bool Cigar::operator == (Cigar &rhs) const 00081 { 00082 00083 if (this->size() != rhs.size()) return false; 00084 00085 for (int i = 0; i < this->size(); i++) 00086 { 00087 if (cigarOperations[i]!=rhs.cigarOperations[i]) return false; 00088 } 00089 return true; 00090 } 00091 00092 00093 // return the length of the read that corresponds to 00094 // the current CIGAR string. 00095 int Cigar::getExpectedQueryBaseCount() const 00096 { 00097 int matchCount = 0; 00098 std::vector<CigarOperator>::const_iterator i; 00099 for (i = cigarOperations.begin(); i != cigarOperations.end(); i++) 00100 { 00101 switch (i->operation) 00102 { 00103 case match: 00104 case mismatch: 00105 case softClip: 00106 case insert: 00107 matchCount += i->count; 00108 break; 00109 default: 00110 // we only care about operations that are in the query sequence. 00111 break; 00112 } 00113 } 00114 return matchCount; 00115 } 00116 00117 00118 // return the number of bases in the reference that 00119 // this read "spans" 00120 int Cigar::getExpectedReferenceBaseCount() const 00121 { 00122 int matchCount = 0; 00123 std::vector<CigarOperator>::const_iterator i; 00124 for (i = cigarOperations.begin(); i != cigarOperations.end(); i++) 00125 { 00126 switch (i->operation) 00127 { 00128 case match: 00129 case mismatch: 00130 case del: 00131 case skip: 00132 matchCount += i->count; 00133 break; 00134 default: 00135 // we only care about operations that are in the reference sequence. 00136 break; 00137 } 00138 } 00139 return matchCount; 00140 } 00141 00142 00143 // Return the number of clips that are at the beginning of the cigar. 00144 int Cigar::getNumBeginClips() const 00145 { 00146 int numBeginClips = 0; 00147 for (unsigned int i = 0; i != cigarOperations.size(); i++) 00148 { 00149 if ((cigarOperations[i].operation == softClip) || 00150 (cigarOperations[i].operation == hardClip)) 00151 { 00152 // Clipping operator, increment the counter. 00153 numBeginClips += cigarOperations[i].count; 00154 } 00155 else 00156 { 00157 // Break out of the loop since a non-clipping operator was found. 00158 break; 00159 } 00160 } 00161 return(numBeginClips); 00162 } 00163 00164 00165 // Return the number of clips that are at the end of the cigar. 00166 int Cigar::getNumEndClips() const 00167 { 00168 int numEndClips = 0; 00169 for (int i = (cigarOperations.size() - 1); i >= 0; i--) 00170 { 00171 if ((cigarOperations[i].operation == softClip) || 00172 (cigarOperations[i].operation == hardClip)) 00173 { 00174 // Clipping operator, increment the counter. 00175 numEndClips += cigarOperations[i].count; 00176 } 00177 else 00178 { 00179 // Break out of the loop since a non-clipping operator was found. 00180 break; 00181 } 00182 } 00183 return(numEndClips); 00184 } 00185 00186 00187 int32_t Cigar::getRefOffset(int32_t queryIndex) 00188 { 00189 // If the vectors aren't set, set them. 00190 if ((queryToRef.size() == 0) || (refToQuery.size() == 0)) 00191 { 00192 setQueryAndReferenceIndexes(); 00193 } 00194 if ((queryIndex < 0) || ((uint32_t)queryIndex >= queryToRef.size())) 00195 { 00196 return(INDEX_NA); 00197 } 00198 return(queryToRef[queryIndex]); 00199 } 00200 00201 00202 int32_t Cigar::getQueryIndex(int32_t refOffset) 00203 { 00204 // If the vectors aren't set, set them. 00205 if ((queryToRef.size() == 0) || (refToQuery.size() == 0)) 00206 { 00207 setQueryAndReferenceIndexes(); 00208 } 00209 if ((refOffset < 0) || ((uint32_t)refOffset >= refToQuery.size())) 00210 { 00211 return(INDEX_NA); 00212 } 00213 return(refToQuery[refOffset]); 00214 } 00215 00216 00217 int32_t Cigar::getRefPosition(int32_t queryIndex, int32_t queryStartPos) 00218 { 00219 // If the vectors aren't set, set them. 00220 if ((queryToRef.size() == 0) || (refToQuery.size() == 0)) 00221 { 00222 setQueryAndReferenceIndexes(); 00223 } 00224 if ((queryIndex < 0) || ((uint32_t)queryIndex >= queryToRef.size())) 00225 { 00226 return(INDEX_NA); 00227 } 00228 00229 if (queryToRef[queryIndex] != INDEX_NA) 00230 { 00231 return(queryToRef[queryIndex] + queryStartPos); 00232 } 00233 return(INDEX_NA); 00234 } 00235 00236 00237 // Return the query index associated with the specified reference position 00238 // when the query starts at the specified reference position based on 00239 // this cigar. 00240 int32_t Cigar::getQueryIndex(int32_t refPosition, int32_t queryStartPos) 00241 { 00242 // If the vectors aren't set, set them. 00243 if ((queryToRef.size() == 0) || (refToQuery.size() == 0)) 00244 { 00245 setQueryAndReferenceIndexes(); 00246 } 00247 00248 int32_t refOffset = refPosition - queryStartPos; 00249 if ((refOffset < 0) || ((uint32_t)refOffset >= refToQuery.size())) 00250 { 00251 return(INDEX_NA); 00252 } 00253 00254 return(refToQuery[refOffset]); 00255 } 00256 00257 00258 int32_t Cigar::getExpandedCigarIndexFromQueryIndex(int32_t queryIndex) 00259 { 00260 // If the vectors aren't set, set them. 00261 if ((queryToRef.size() == 0) || (refToQuery.size() == 0)) 00262 { 00263 setQueryAndReferenceIndexes(); 00264 } 00265 if ((queryIndex < 0) || ((uint32_t)queryIndex >= queryToCigar.size())) 00266 { 00267 return(INDEX_NA); 00268 } 00269 return(queryToCigar[queryIndex]); 00270 } 00271 00272 00273 int32_t Cigar::getExpandedCigarIndexFromRefOffset(int32_t refOffset) 00274 { 00275 // If the vectors aren't set, set them. 00276 if ((queryToRef.size() == 0) || (refToQuery.size() == 0)) 00277 { 00278 setQueryAndReferenceIndexes(); 00279 } 00280 if ((refOffset < 0) || ((uint32_t)refOffset >= refToCigar.size())) 00281 { 00282 return(INDEX_NA); 00283 } 00284 return(refToCigar[refOffset]); 00285 } 00286 00287 00288 int32_t Cigar::getExpandedCigarIndexFromRefPos(int32_t refPosition, 00289 int32_t queryStartPos) 00290 { 00291 return(getExpandedCigarIndexFromRefOffset(refPosition - queryStartPos)); 00292 } 00293 00294 00295 char Cigar::getCigarCharOp(int32_t expandedCigarIndex) 00296 { 00297 // Check if the expanded cigar has been set yet 00298 if ((queryToRef.size() == 0) || (refToQuery.size() == 0)) 00299 { 00300 // Set the expanded cigar. 00301 setQueryAndReferenceIndexes(); 00302 } 00303 00304 // Check to see if the index is in range. 00305 if((expandedCigarIndex < 0) || 00306 ((uint32_t)expandedCigarIndex >= myExpandedCigar.length())) 00307 { 00308 return('?'); 00309 } 00310 return(myExpandedCigar[expandedCigarIndex]); 00311 } 00312 00313 00314 char Cigar::getCigarCharOpFromQueryIndex(int32_t queryIndex) 00315 { 00316 return(getCigarCharOp(getExpandedCigarIndexFromQueryIndex(queryIndex))); 00317 } 00318 00319 00320 char Cigar::getCigarCharOpFromRefOffset(int32_t refOffset) 00321 { 00322 return(getCigarCharOp(getExpandedCigarIndexFromRefOffset(refOffset))); 00323 } 00324 00325 00326 char Cigar::getCigarCharOpFromRefPos(int32_t refPosition, int32_t queryStartPos) 00327 { 00328 return(getCigarCharOp(getExpandedCigarIndexFromRefPos(refPosition, queryStartPos))); 00329 } 00330 00331 00332 // Return the number of bases that overlap the reference and the 00333 // read associated with this cigar that falls within the specified region. 00334 uint32_t Cigar::getNumOverlaps(int32_t start, int32_t end, 00335 int32_t queryStartPos) 00336 { 00337 // Get the overlap info. 00338 if ((queryToRef.size() == 0) || (refToQuery.size() == 0)) 00339 { 00340 setQueryAndReferenceIndexes(); 00341 } 00342 00343 // Get the start and end offsets. 00344 int32_t startRefOffset = 0; 00345 // If the specified start is more than the queryStartPos, set 00346 // the startRefOffset to the appropriate non-zero value. 00347 // (if start is <= queryStartPos, than startRefOffset is 0 - it should 00348 // not be set to a negative value.) 00349 if (start > queryStartPos) 00350 { 00351 startRefOffset = start - queryStartPos; 00352 } 00353 00354 int32_t endRefOffset = end - queryStartPos; 00355 if (end == -1) 00356 { 00357 // -1 means that the region goes to the end of the refrerence. 00358 // So set endRefOffset to the max refOffset + 1 which is the 00359 // size of the refToQuery vector. 00360 endRefOffset = refToQuery.size(); 00361 } 00362 00363 00364 // if endRefOffset is less than 0, then this read does not fall within 00365 // the specified region, so return 0. 00366 if (endRefOffset < 0) 00367 { 00368 return(0); 00369 } 00370 00371 // Get the overlaps for these offsets. 00372 // Loop through the read counting positions that match the reference 00373 // within this region. 00374 int32_t refOffset = 0; 00375 int32_t numOverlaps = 0; 00376 for (unsigned int queryIndex = 0; queryIndex < queryToRef.size(); 00377 queryIndex++) 00378 { 00379 refOffset = getRefOffset(queryIndex); 00380 if (refOffset > endRefOffset) 00381 { 00382 // Past the end of the specified region, so stop checking 00383 // for overlaps since there will be no more. 00384 break; 00385 } 00386 else if ((refOffset >= startRefOffset) && (refOffset < endRefOffset)) 00387 { 00388 // within the region, increment the counter. 00389 ++numOverlaps; 00390 } 00391 } 00392 00393 return(numOverlaps); 00394 } 00395 00396 00397 // Return whether or not the cigar has an indel 00398 bool Cigar::hasIndel() 00399 { 00400 for(unsigned int i = 0; i < cigarOperations.size(); i++) 00401 { 00402 if((cigarOperations[i].operation == insert) || 00403 (cigarOperations[i].operation == del)) 00404 { 00405 // Found an indel, so return true. 00406 return(true); 00407 } 00408 } 00409 // Went through all the operations, and found no indel, so return false. 00410 return(false); 00411 } 00412 00413 00414 // Clear the query index/reference offset index vectors. 00415 void Cigar::clearQueryAndReferenceIndexes() 00416 { 00417 queryToRef.clear(); 00418 refToQuery.clear(); 00419 refToCigar.clear(); 00420 queryToCigar.clear(); 00421 myExpandedCigar.clear(); 00422 } 00423 00424 00425 /////////////////////////////////////////////////////// 00426 // Set the query index/reference offset index vectors. 00427 // 00428 // For Cigar: 3M2I2M1D1M 00429 // That total count of cigar elements is 9 (3+2+2+1+1) 00430 // 00431 // The entries that are valid in the query/reference contain the index/offset 00432 // where they are found in the query/reference. N/A are marked by 'x': 00433 // query indexes: 0123456x7 00434 // --------- 00435 // reference offsets: 012xx3456 00436 // 00437 // This shows what query index is associated with which reference offset and 00438 // vice versa. 00439 // For ones where an x appears, -1 would be returned. 00440 // 00441 void Cigar::setQueryAndReferenceIndexes() 00442 { 00443 // First ensure that the vectors are clear by clearing them. 00444 clearQueryAndReferenceIndexes(); 00445 00446 int extPos = 0; 00447 00448 // Process each cigar index. 00449 for (uint32_t cigarIndex = 0; cigarIndex < cigarOperations.size(); cigarIndex++) 00450 { 00451 // Process the cigar operation. 00452 switch (cigarOperations[cigarIndex].operation) 00453 { 00454 case match: 00455 case mismatch: 00456 // For match/mismatch, update the maps between query 00457 // and reference for the number of matches/mismatches. 00458 for (uint32_t i = 0; i < cigarOperations[cigarIndex].count; i++) 00459 { 00460 // The associated indexes are the next location in 00461 // each array, which is equal to the current size. 00462 int32_t queryToRefLen = queryToRef.size(); 00463 int32_t refToQueryLen = refToQuery.size(); 00464 queryToRef.push_back(refToQueryLen); 00465 refToQuery.push_back(queryToRefLen); 00466 refToCigar.push_back(extPos); 00467 queryToCigar.push_back(extPos++); 00468 myExpandedCigar.push_back(cigarOperations[cigarIndex].getChar()); 00469 } 00470 break; 00471 case insert: 00472 case softClip: 00473 // Add N/A reference offset for each query index that this 00474 // insert covers. 00475 for (uint32_t i = 0; i < cigarOperations[cigarIndex].count; i++) 00476 { 00477 queryToRef.push_back(INDEX_NA); 00478 queryToCigar.push_back(extPos++); 00479 myExpandedCigar.push_back(cigarOperations[cigarIndex].getChar()); 00480 } 00481 break; 00482 case del: 00483 case skip: 00484 // Add N/A query index for each reference offset that this 00485 // deletion/skip covers. 00486 for (uint32_t i = 0; i < cigarOperations[cigarIndex].count; i++) 00487 { 00488 refToQuery.push_back(INDEX_NA); 00489 refToCigar.push_back(extPos++); 00490 myExpandedCigar.push_back(cigarOperations[cigarIndex].getChar()); 00491 } 00492 break; 00493 case hardClip: 00494 case pad: 00495 case none: 00496 for (uint32_t i = 0; i < cigarOperations[cigarIndex].count; i++) 00497 { 00498 myExpandedCigar.push_back(cigarOperations[cigarIndex].getChar()); 00499 ++extPos; 00500 } 00501 break; 00502 }; 00503 } 00504 } 00505