libStatGen Software
1
|
00001 /* 00002 * Copyright (C) 2010-2011 Regents of the University of Michigan 00003 * 00004 * This program is free software: you can redistribute it and/or modify 00005 * it under the terms of the GNU General Public License as published by 00006 * the Free Software Foundation, either version 3 of the License, or 00007 * (at your option) any later version. 00008 * 00009 * This program is distributed in the hope that it will be useful, 00010 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 * GNU General Public License for more details. 00013 * 00014 * You should have received a copy of the GNU General Public License 00015 * along with this program. If not, see <http://www.gnu.org/licenses/>. 00016 */ 00017 00018 #if !defined(_CIGAR_H) 00019 #define _CIGAR_H 00020 00021 #include <string.h> // for inline use of strcat, etc 00022 #include <limits.h> // for INT_MAX 00023 #include <stdint.h> // for uint32_t and friends 00024 00025 00026 #include <assert.h> 00027 #include <stdio.h> 00028 #include <stdlib.h> 00029 #include <string> 00030 #include <iostream> 00031 #include <iomanip> 00032 #include <utility> 00033 #include <vector> 00034 00035 #include "Generic.h" 00036 #include "StringBasics.h" 00037 00038 /// This class represents the CIGAR without any methods to set the cigar 00039 /// (see CigarRoller for that). 00040 00041 // 00042 // Docs from Sam1.pdf: 00043 // 00044 // Clipped alignment. In Smith-Waterman alignment, a sequence may not be aligned from the first residue to the last one. 00045 // Subsequences at the ends may be clipped off. We introduce operation ʻSʼ to describe (softly) clipped alignment. Here is 00046 // an example. Suppose the clipped alignment is: 00047 // REF: AGCTAGCATCGTGTCGCCCGTCTAGCATACGCATGATCGACTGTCAGCTAGTCAGACTAGTCGATCGATGTG 00048 // READ: gggGTGTAACC-GACTAGgggg 00049 // where on the read sequence, bases in uppercase are matches and bases in lowercase are clipped off. The CIGAR for 00050 // this alignment is: 3S8M1D6M4S. 00051 // 00052 // 00053 // If the mapping position of the query is not available, RNAME and 00054 // CIGAR are set as “*” 00055 // 00056 // A CIGAR string is comprised of a series of operation lengths plus the operations. The conventional CIGAR format allows 00057 // for three types of operations: M for match or mismatch, I for insertion and D for deletion. The extended CIGAR format 00058 // further allows four more operations, as is shown in the following table, to describe clipping, padding and splicing: 00059 // 00060 // op Description 00061 // -- ----------- 00062 // M Match or mismatch 00063 // I Insertion to the reference 00064 // D Deletion from the reference 00065 // N Skipped region from the reference 00066 // S Soft clip on the read (clipped sequence present in <seq>) 00067 // H Hard clip on the read (clipped sequence NOT present in <seq>) 00068 // P Padding (silent deletion from the padded reference sequence) 00069 // 00070 00071 00072 00073 00074 //////////////////////////////////////////////////////////////////////// 00075 /// 00076 /// This class represents the CIGAR. It contains methods for converting 00077 /// to strings and extracting information from the cigar on how a read 00078 /// maps to the reference. 00079 /// 00080 /// It only contains read only methods. There are no ways to set 00081 /// values. To set a value, a child class must be used. 00082 /// 00083 class Cigar 00084 { 00085 public: 00086 /// Enum for the cigar operations. 00087 enum Operation { 00088 none=0, ///< no operation has been set. 00089 match, ///< match/mismatch operation. Associated with CIGAR Operation "M" 00090 mismatch, ///< mismatch operation. Associated with CIGAR Operation "M" 00091 insert, ///< insertion to the reference (the query sequence contains bases that have no corresponding base in the reference). Associated with CIGAR Operation "I" 00092 del, ///< deletion from the reference (the reference contains bases that have no corresponding base in the query sequence). Associated with CIGAR Operation "D" 00093 skip, ///< skipped region from the reference (the reference contains bases that have no corresponding base in the query sequence). Associated with CIGAR Operation "N" 00094 softClip, ///< Soft clip on the read (clipped sequence present in the query sequence, but not in reference). Associated with CIGAR Operation "S" 00095 hardClip, ///< Hard clip on the read (clipped sequence not present in the query sequence or reference). Associated with CIGAR Operation "H" 00096 pad ///< Padding (not in reference or query). Associated with CIGAR Operation "P" 00097 }; 00098 00099 // The maximum value in the operation enum (used for setting up a bitset of 00100 // operations. 00101 static const int MAX_OP_VALUE = pad; 00102 00103 //////////////////////////////////////////////////////////////////////// 00104 // 00105 // Nested Struct : CigarOperator 00106 // 00107 struct CigarOperator 00108 { 00109 00110 CigarOperator() 00111 { 00112 operation = none; 00113 count = 0; 00114 } 00115 00116 /// Set the cigar operator with the specified operation and 00117 /// count length. 00118 CigarOperator(Operation operation, uint32_t count) 00119 : operation(operation), count(count) {}; 00120 00121 Operation operation; 00122 00123 uint32_t count; 00124 00125 /// Get the character code (M, I, D, N, S, H, or P) associated with 00126 /// this operation. 00127 char getChar() const 00128 { 00129 switch (operation) 00130 { 00131 case none: 00132 return '?'; // error 00133 case match: 00134 case mismatch: 00135 return'M'; 00136 case insert: 00137 return 'I'; 00138 case del: 00139 return'D'; 00140 case skip: 00141 return 'N'; 00142 case softClip: 00143 return 'S'; 00144 case hardClip: 00145 return 'H'; 00146 case pad: 00147 return 'P'; 00148 } 00149 return '?'; // actually it is an error to get here 00150 } 00151 00152 /// Compare only on the operator, true if they are the same, false if not. Match and mismatch are considered the same for CIGAR strings. 00153 bool operator == (const CigarOperator &rhs) const 00154 { 00155 if (operation==rhs.operation) 00156 return true; 00157 if ((operation == mismatch || operation == match) && (rhs.operation == mismatch || rhs.operation == match)) 00158 return true; 00159 return false; 00160 } 00161 00162 /// Compare only on the operator, false if they are the same, true if not. Match and mismatch are considered the same for CIGAR strings. 00163 bool operator != (const CigarOperator &rhs) const 00164 { 00165 return !((*this) == rhs) ; 00166 } 00167 00168 }; 00169 00170 //////////////////////////////////////////////////////////////////////// 00171 // 00172 // Cigar Class statics 00173 // 00174 00175 /// Return true if the specified operation is found in the 00176 /// reference sequence, false if not. 00177 static bool foundInReference(Operation op) 00178 { 00179 switch(op) 00180 { 00181 case match: 00182 case mismatch: 00183 case del: 00184 case skip: 00185 return true; 00186 default: 00187 return false; 00188 } 00189 return false; 00190 } 00191 00192 /// Return true if the specified operation is found in the 00193 /// reference sequence, false if not. 00194 static bool foundInReference(char op) 00195 { 00196 switch(op) 00197 { 00198 case 'M': 00199 case '=': 00200 case 'X': 00201 case 'D': 00202 case 'N': 00203 return true; 00204 default: 00205 return false; 00206 } 00207 return false; 00208 } 00209 00210 /// Return true if the specified operation is found in the 00211 /// reference sequence, false if not. 00212 static bool foundInReference(const CigarOperator &op) 00213 { 00214 return(foundInReference(op.operation)); 00215 } 00216 00217 /// Return true if the specified operation is found in the 00218 /// query sequence, false if not. 00219 static bool foundInQuery(Operation op) 00220 { 00221 switch(op) 00222 { 00223 case match: 00224 case mismatch: 00225 case insert: 00226 case softClip: 00227 return true; 00228 default: 00229 return false; 00230 } 00231 return false; 00232 } 00233 00234 /// Return true if the specified operation is found in the 00235 /// query sequence, false if not. 00236 static bool foundInQuery(char op) 00237 { 00238 switch(op) 00239 { 00240 case 'M': 00241 case '=': 00242 case 'X': 00243 case 'I': 00244 case 'S': 00245 return true; 00246 default: 00247 return false; 00248 } 00249 return false; 00250 } 00251 00252 /// Return true if the specified operation is found in the 00253 /// query sequence, false if not. 00254 static bool foundInQuery(const CigarOperator &op) 00255 { 00256 return(foundInQuery(op.operation)); 00257 } 00258 00259 /// Return true if the specified operation is a clipping operation, 00260 /// false if not. 00261 static bool isClip(Operation op) 00262 { 00263 switch(op) 00264 { 00265 case softClip: 00266 case hardClip: 00267 return true; 00268 default: 00269 return false; 00270 } 00271 return false; 00272 } 00273 00274 /// Return true if the specified operation is a clipping operation, 00275 /// false if not. 00276 static bool isClip(char op) 00277 { 00278 switch(op) 00279 { 00280 case 'S': 00281 case 'H': 00282 return true; 00283 default: 00284 return false; 00285 } 00286 return false; 00287 } 00288 00289 /// Return true if the specified operation is a clipping operation, 00290 /// false if not. 00291 static bool isClip(const CigarOperator &op) 00292 { 00293 return(isClip(op.operation)); 00294 } 00295 00296 /// Return true if the specified operation is a match/mismatch operation, 00297 /// false if not. 00298 static bool isMatchOrMismatch(Operation op) 00299 { 00300 switch(op) 00301 { 00302 case match: 00303 case mismatch: 00304 return true; 00305 default: 00306 return false; 00307 } 00308 return false; 00309 } 00310 00311 /// Return true if the specified operation is a match/mismatch operation, 00312 /// false if not. 00313 static bool isMatchOrMismatch(const CigarOperator &op) 00314 { 00315 return(isMatchOrMismatch(op.operation)); 00316 } 00317 00318 00319 //////////////////////////////////////////////////////////////////////// 00320 // 00321 // Cigar Class non static 00322 // 00323 friend std::ostream &operator << (std::ostream &stream, const Cigar& cigar); 00324 00325 /// Default constructor initializes as a CIGAR with no operations. 00326 Cigar() 00327 { 00328 clearQueryAndReferenceIndexes(); 00329 } 00330 00331 00332 /// Set the passed in String to the string reprentation of the Cigar 00333 /// operations in this object. 00334 void getCigarString(String& cigarString) const; 00335 00336 /// Set the passed in std::string to the string reprentation of the Cigar 00337 /// operations in this object. 00338 void getCigarString(std::string& cigarString) const; 00339 00340 /// Sets the specified string to a valid CIGAR string of characters that 00341 /// represent the cigar with no digits (a CIGAR of "3M" would return "MMM"). 00342 /// The returned string is actually also a valid CIGAR string. 00343 /// In theory this makes it easier to parse some reads. 00344 /// \return s the string to populate 00345 void getExpandedString(std::string &s) const; 00346 00347 /// Return the Cigar Operation at the specified index (starting at 0). 00348 const CigarOperator & operator [](int i) const 00349 { 00350 return cigarOperations[i]; 00351 } 00352 00353 /// Return the Cigar Operation at the specified index (starting at 0). 00354 const CigarOperator & getOperator(int i) const 00355 { 00356 return cigarOperations[i]; 00357 } 00358 00359 /// Return true if the 2 Cigars are the same 00360 /// (the same operations of the same sizes). 00361 bool operator == (Cigar &rhs) const; 00362 00363 /// Return the number of cigar operations 00364 int size() const 00365 { 00366 return cigarOperations.size(); 00367 } 00368 00369 /// Write this object as a string to cout. 00370 void Dump() const 00371 { 00372 String cigarString; 00373 getCigarString(cigarString); 00374 std::cout << cigarString ; 00375 } 00376 00377 /// Return the length of the read that corresponds to 00378 /// the current CIGAR string. 00379 /// 00380 /// For validation, we should expect that a sequence 00381 /// read in a SAM file will be the same length as the 00382 /// value returned by this method. 00383 /// 00384 /// Example: 3M2D3M describes a read with three bases 00385 /// matching the reference, then skips 2 bases, then has 00386 /// three more bases that match the reference (match/mismatch). 00387 /// In this case, the read length is expected to be 6. 00388 /// 00389 /// Example: 3M2I3M describes a read with 3 match/mismatch 00390 /// bases, two extra bases, and then 3 more match/mistmatch 00391 /// bases. The total in this example is 8 bases. 00392 /// 00393 /// \return returns the expected read length 00394 int getExpectedQueryBaseCount() const; 00395 00396 /// Return the number of bases in the reference that 00397 /// this CIGAR "spans". 00398 /// 00399 /// When doing range checking, we occassionally need to know 00400 /// how many total bases the CIGAR string represents as compared 00401 /// to the reference. 00402 /// 00403 /// Examples: 3M2D3M describes a read that overlays 8 bases in 00404 /// the reference. 3M2I3M describes a read with 3 bases that 00405 /// match the reference, two additional bases that aren't in the 00406 /// reference, and 3 more bases that match the reference, so it 00407 /// spans 6 bases in the reference. 00408 /// 00409 /// \return how many bases in the reference are spanned 00410 /// by the given CIGAR string 00411 /// 00412 int getExpectedReferenceBaseCount() const; 00413 00414 /// Return the number of clips that are at the beginning of the cigar. 00415 int getNumBeginClips() const; 00416 00417 /// Return the number of clips that are at the end of the cigar. 00418 int getNumEndClips() const; 00419 00420 /// Return the reference offset associated with the specified 00421 /// query index or INDEX_NA based on this cigar. 00422 int32_t getRefOffset(int32_t queryIndex); 00423 00424 /// Return the query index associated with the specified 00425 /// reference offset or INDEX_NA based on this cigar. 00426 int32_t getQueryIndex(int32_t refOffset); 00427 00428 /// Return the reference position associated with the specified query index 00429 /// or INDEX_NA based on this cigar and the specified queryStartPos which 00430 /// is the leftmost mapping position of the first matching base in the 00431 /// query. 00432 int32_t getRefPosition(int32_t queryIndex, int32_t queryStartPos); 00433 00434 /// Return the query index or INDEX_NA associated with the specified 00435 /// reference offset when the query starts at the specified reference 00436 /// position. 00437 int32_t getQueryIndex(int32_t refPosition, int32_t queryStartPos); 00438 00439 /// Returns the index into the expanded cigar for the cigar 00440 /// associated with the specified queryIndex. 00441 /// INDEX_NA returned if the index is out of range. 00442 int32_t getExpandedCigarIndexFromQueryIndex(int32_t queryIndex); 00443 00444 /// Returns the index into the expanded cigar for the cigar 00445 /// associated with the specified reference offset. 00446 /// INDEX_NA returned if the offset is out of range. 00447 int32_t getExpandedCigarIndexFromRefOffset(int32_t refOffset); 00448 00449 /// Returns the index into the expanded cigar for the cigar 00450 /// associated with the specified reference position and queryStartPos. 00451 /// INDEX_NA returned if the position is out of range. 00452 int32_t getExpandedCigarIndexFromRefPos(int32_t refPosition, 00453 int32_t queryStartPos); 00454 00455 /// Return the character code of the cigar operator associated with the 00456 /// specified expanded CIGAR index. '?' is returned for an out of range 00457 /// index. 00458 char getCigarCharOp(int32_t expandedCigarIndex); 00459 00460 /// Return the character code of the cigar operator associated with 00461 /// the specified queryIndex. '?' is returned for an out of range index. 00462 char getCigarCharOpFromQueryIndex(int32_t queryIndex); 00463 00464 /// Return the character code of the cigar operator associated with 00465 /// the specified reference offset. '?' is returned for an out of range offset. 00466 char getCigarCharOpFromRefOffset(int32_t refOffset); 00467 00468 /// Return the character code of the cigar operator associated with 00469 /// the specified reference position. '?' is returned for an out of 00470 /// range reference position. 00471 char getCigarCharOpFromRefPos(int32_t refPosition, int32_t queryStartPos); 00472 00473 /// Return the number of bases that overlap the reference and the 00474 /// read associated with this cigar that falls within the specified region. 00475 /// \param start : inclusive 0-based start position (reference position) of 00476 /// the region to check for overlaps in 00477 /// (-1 indicates to start at the beginning of the reference.) 00478 /// \param end : exclusive 0-based end position (reference position) of the 00479 /// region to check for overlaps in 00480 /// (-1 indicates to go to the end of the reference.) 00481 /// \param queryStartPos : 0-based leftmost mapping position of the first 00482 /// matcihng base in the query. 00483 uint32_t getNumOverlaps(int32_t start, int32_t end, int32_t queryStartPos); 00484 00485 /// Return whether or not the cigar has indels (insertions or delections) 00486 /// \return true if it has an insertion or deletion, false if not. 00487 bool hasIndel(); 00488 00489 /// Value associated with an index that is not applicable/does not exist, 00490 /// used for converting between query and reference indexes/offsets when 00491 /// an associated index/offset does not exist. 00492 static const int32_t INDEX_NA; 00493 00494 protected: 00495 // Clear the query index/reference offset index vectors. 00496 void clearQueryAndReferenceIndexes(); 00497 00498 // Set the query index/reference offset index vectors. 00499 void setQueryAndReferenceIndexes(); 00500 00501 // Container for the cigar operations in this cigar. 00502 std::vector<CigarOperator> cigarOperations; 00503 00504 private: 00505 // The vector is indexed by query index and contains the reference 00506 // offset associated with that query index. 00507 // The vector is reset each time a new cigar operation is added, and 00508 // is calculated when accessed if it is not already set. 00509 std::vector<int32_t> queryToRef; 00510 00511 // The vector is indexed by reference offset and contains the query 00512 // index associated with that reference offset. 00513 // The vector is reset each time a new cigar operation is added, and 00514 // is calculated when accessed if it is not already set. 00515 std::vector<int32_t> refToQuery; 00516 00517 // The vector is indexed by reference offset and contains the offset into 00518 // the expanded cigar associated with that reference offset. 00519 // The vector is reset each time a new cigar operation is added, and 00520 // is calculated when accessed if it is not already set. 00521 std::vector<int32_t> refToCigar; 00522 00523 // The vector is indexed by query index and contains the offset into 00524 // the expanded cigar associated with that query index. 00525 // The vector is reset each time a new cigar operation is added, and 00526 // is calculated when accessed if it is not already set. 00527 std::vector<int32_t> queryToCigar; 00528 00529 std::string myExpandedCigar; 00530 }; 00531 00532 /// Writes the specified cigar operation to the specified stream as <count><char> (3M). 00533 inline std::ostream &operator << (std::ostream &stream, const Cigar::CigarOperator& o) 00534 { 00535 stream << o.count << o.getChar(); 00536 return stream; 00537 } 00538 00539 /// Writes all of the cigar operations contained in the cigar to the passed in stream. 00540 inline std::ostream &operator << (std::ostream &stream, const Cigar& cigar) 00541 { 00542 stream << cigar.cigarOperations; 00543 return stream; 00544 } 00545 00546 #endif