00001 /* 00002 * Copyright (C) 2010-2011 Regents of the University of Michigan 00003 * 00004 * This program is free software: you can redistribute it and/or modify 00005 * it under the terms of the GNU General Public License as published by 00006 * the Free Software Foundation, either version 3 of the License, or 00007 * (at your option) any later version. 00008 * 00009 * This program is distributed in the hope that it will be useful, 00010 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 * GNU General Public License for more details. 00013 * 00014 * You should have received a copy of the GNU General Public License 00015 * along with this program. If not, see <http://www.gnu.org/licenses/>. 00016 */ 00017 00018 #if !defined(_CIGAR_H) 00019 #define _CIGAR_H 00020 00021 #include <string.h> // for inline use of strcat, etc 00022 #include <limits.h> // for INT_MAX 00023 #include <stdint.h> // for uint32_t and friends 00024 00025 00026 #include <assert.h> 00027 #include <stdio.h> 00028 #include <stdlib.h> 00029 #include <string> 00030 #include <iostream> 00031 #include <iomanip> 00032 #include <utility> 00033 #include <vector> 00034 00035 #include "Generic.h" 00036 #include "StringBasics.h" 00037 00038 /// This class represents the CIGAR without any methods to set the cigar 00039 /// (see CigarRoller for that). 00040 00041 // 00042 // Docs from Sam1.pdf: 00043 // 00044 // Clipped alignment. In Smith-Waterman alignment, a sequence may not be aligned from the first residue to the last one. 00045 // Subsequences at the ends may be clipped off. We introduce operation ʻSʼ to describe (softly) clipped alignment. Here is 00046 // an example. Suppose the clipped alignment is: 00047 // REF: AGCTAGCATCGTGTCGCCCGTCTAGCATACGCATGATCGACTGTCAGCTAGTCAGACTAGTCGATCGATGTG 00048 // READ: gggGTGTAACC-GACTAGgggg 00049 // where on the read sequence, bases in uppercase are matches and bases in lowercase are clipped off. The CIGAR for 00050 // this alignment is: 3S8M1D6M4S. 00051 // 00052 // 00053 // If the mapping position of the query is not available, RNAME and 00054 // CIGAR are set as “*” 00055 // 00056 // A CIGAR string is comprised of a series of operation lengths plus the operations. The conventional CIGAR format allows 00057 // for three types of operations: M for match or mismatch, I for insertion and D for deletion. The extended CIGAR format 00058 // further allows four more operations, as is shown in the following table, to describe clipping, padding and splicing: 00059 // 00060 // op Description 00061 // -- ----------- 00062 // M Match or mismatch 00063 // I Insertion to the reference 00064 // D Deletion from the reference 00065 // N Skipped region from the reference 00066 // S Soft clip on the read (clipped sequence present in <seq>) 00067 // H Hard clip on the read (clipped sequence NOT present in <seq>) 00068 // P Padding (silent deletion from the padded reference sequence) 00069 // 00070 00071 00072 00073 00074 //////////////////////////////////////////////////////////////////////// 00075 /// 00076 /// This class represents the CIGAR. It contains methods for converting 00077 /// to strings and extracting information from the cigar on how a read 00078 /// maps to the reference. 00079 /// 00080 /// It only contains read only methods. There are no ways to set 00081 /// values. To set a value, a child class must be used. 00082 /// 00083 class Cigar 00084 { 00085 public: 00086 /// Enum for the cigar operations. 00087 enum Operation { 00088 none=0, ///< no operation has been set. 00089 match, ///< match/mismatch operation. Associated with CIGAR Operation "M" 00090 mismatch, ///< mismatch operation. Associated with CIGAR Operation "M" 00091 insert, ///< insertion to the reference (the query sequence contains bases that have no corresponding base in the reference). Associated with CIGAR Operation "I" 00092 del, ///< deletion from the reference (the reference contains bases that have no corresponding base in the query sequence). Associated with CIGAR Operation "D" 00093 skip, ///< skipped region from the reference (the reference contains bases that have no corresponding base in the query sequence). Associated with CIGAR Operation "N" 00094 softClip, ///< Soft clip on the read (clipped sequence present in the query sequence, but not in reference). Associated with CIGAR Operation "S" 00095 hardClip, ///< Hard clip on the read (clipped sequence not present in the query sequence or reference). Associated with CIGAR Operation "H" 00096 pad ///< Padding (not in reference or query). Associated with CIGAR Operation "P" 00097 }; 00098 00099 // The maximum value in the operation enum (used for setting up a bitset of 00100 // operations. 00101 static const int MAX_OP_VALUE = pad; 00102 00103 //////////////////////////////////////////////////////////////////////// 00104 // 00105 // Nested Struct : CigarOperator 00106 // 00107 struct CigarOperator 00108 { 00109 00110 CigarOperator() 00111 { 00112 operation = none; 00113 count = 0; 00114 } 00115 00116 /// Set the cigar operator with the specified operation and 00117 /// count length. 00118 CigarOperator(Operation operation, uint32_t count) 00119 : operation(operation), count(count) {}; 00120 00121 Operation operation; 00122 00123 uint32_t count; 00124 00125 /// Get the character code (M, I, D, N, S, H, or P) associated with 00126 /// this operation. 00127 char getChar() const 00128 { 00129 switch (operation) 00130 { 00131 case none: 00132 return '?'; // error 00133 case match: 00134 case mismatch: 00135 return'M'; 00136 case insert: 00137 return 'I'; 00138 case del: 00139 return'D'; 00140 case skip: 00141 return 'N'; 00142 case softClip: 00143 return 'S'; 00144 case hardClip: 00145 return 'H'; 00146 case pad: 00147 return 'P'; 00148 } 00149 return '?'; // actually it is an error to get here 00150 } 00151 00152 /// Compare only on the operator, true if they are the same, false if not. Match and mismatch are considered the same for CIGAR strings. 00153 bool operator == (const CigarOperator &rhs) const 00154 { 00155 if (operation==rhs.operation) 00156 return true; 00157 if ((operation == mismatch || operation == match) && (rhs.operation == mismatch || rhs.operation == match)) 00158 return true; 00159 return false; 00160 } 00161 00162 /// Compare only on the operator, false if they are the same, true if not. Match and mismatch are considered the same for CIGAR strings. 00163 bool operator != (const CigarOperator &rhs) const 00164 { 00165 return !((*this) == rhs) ; 00166 } 00167 00168 }; 00169 00170 //////////////////////////////////////////////////////////////////////// 00171 // 00172 // Cigar Class statics 00173 // 00174 00175 /// Return true if the specified operation is found in the 00176 /// query sequence, false if not. 00177 static bool foundInQuery(Operation op) 00178 { 00179 switch(op) 00180 { 00181 case match: 00182 case mismatch: 00183 case insert: 00184 case softClip: 00185 return true; 00186 default: 00187 return false; 00188 } 00189 return false; 00190 } 00191 00192 /// Return true if the specified operation is found in the 00193 /// query sequence, false if not. 00194 static bool foundInQuery(CigarOperator op) 00195 { 00196 switch(op.operation) 00197 { 00198 case match: 00199 case mismatch: 00200 case insert: 00201 case softClip: 00202 return true; 00203 default: 00204 return false; 00205 } 00206 return false; 00207 } 00208 00209 /// Return true if the specified operation is a clipping operation, 00210 /// false if not. 00211 static bool isClip(Operation op) 00212 { 00213 switch(op) 00214 { 00215 case softClip: 00216 case hardClip: 00217 return true; 00218 default: 00219 return false; 00220 } 00221 return false; 00222 } 00223 00224 /// Return true if the specified operation is a clipping operation, 00225 /// false if not. 00226 static bool isClip(CigarOperator op) 00227 { 00228 switch(op.operation) 00229 { 00230 case softClip: 00231 case hardClip: 00232 return true; 00233 default: 00234 return false; 00235 } 00236 return false; 00237 } 00238 00239 /// Return true if the specified operation is a match/mismatch operation, 00240 /// false if not. 00241 static bool isMatchOrMismatch(Operation op) 00242 { 00243 switch(op) 00244 { 00245 case match: 00246 case mismatch: 00247 return true; 00248 default: 00249 return false; 00250 } 00251 return false; 00252 } 00253 00254 /// Return true if the specified operation is a match/mismatch operation, 00255 /// false if not. 00256 static bool isMatchOrMismatch(CigarOperator op) 00257 { 00258 switch(op.operation) 00259 { 00260 case match: 00261 case mismatch: 00262 return true; 00263 default: 00264 return false; 00265 } 00266 return false; 00267 } 00268 00269 00270 //////////////////////////////////////////////////////////////////////// 00271 // 00272 // Cigar Class non static 00273 // 00274 friend std::ostream &operator << (std::ostream &stream, const Cigar& cigar); 00275 00276 /// Default constructor initializes as a CIGAR with no operations. 00277 Cigar() 00278 { 00279 clearQueryAndReferenceIndexes(); 00280 } 00281 00282 00283 /// Set the passed in String to the string reprentation of the Cigar 00284 /// operations in this object. 00285 void getCigarString(String& cigarString) const; 00286 00287 /// Set the passed in std::string to the string reprentation of the Cigar 00288 /// operations in this object. 00289 void getCigarString(std::string& cigarString) const; 00290 00291 /// Sets the specified string to a valid CIGAR string of characters that 00292 /// represent the cigar with no digits (a CIGAR of "3M" would return "MMM"). 00293 /// The returned string is actually also a valid CIGAR string. 00294 /// In theory this makes it easier to parse some reads. 00295 /// \return s the string to populate 00296 void getExpandedString(std::string &s) const; 00297 00298 /// Return the Cigar Operation at the specified index (starting at 0). 00299 const CigarOperator & operator [](int i) const 00300 { 00301 return cigarOperations[i]; 00302 } 00303 00304 /// Return the Cigar Operation at the specified index (starting at 0). 00305 const CigarOperator & getOperator(int i) const 00306 { 00307 return cigarOperations[i]; 00308 } 00309 00310 /// Return true if the 2 Cigars are the same 00311 /// (the same operations of the same sizes). 00312 bool operator == (Cigar &rhs) const; 00313 00314 /// Return the number of cigar operations 00315 int size() const 00316 { 00317 return cigarOperations.size(); 00318 } 00319 00320 /// Write this object as a string to cout. 00321 void Dump() const 00322 { 00323 String cigarString; 00324 getCigarString(cigarString); 00325 std::cout << cigarString ; 00326 } 00327 00328 /// Return the length of the read that corresponds to 00329 /// the current CIGAR string. 00330 /// 00331 /// For validation, we should expect that a sequence 00332 /// read in a SAM file will be the same length as the 00333 /// value returned by this method. 00334 /// 00335 /// Example: 3M2D3M describes a read with three bases 00336 /// matching the reference, then skips 2 bases, then has 00337 /// three more bases that match the reference (match/mismatch). 00338 /// In this case, the read length is expected to be 6. 00339 /// 00340 /// Example: 3M2I3M describes a read with 3 match/mismatch 00341 /// bases, two extra bases, and then 3 more match/mistmatch 00342 /// bases. The total in this example is 8 bases. 00343 /// 00344 /// \return returns the expected read length 00345 int getExpectedQueryBaseCount() const; 00346 00347 /// Return the number of bases in the reference that 00348 /// this CIGAR "spans". 00349 /// 00350 /// When doing range checking, we occassionally need to know 00351 /// how many total bases the CIGAR string represents as compared 00352 /// to the reference. 00353 /// 00354 /// Examples: 3M2D3M describes a read that overlays 8 bases in 00355 /// the reference. 3M2I3M describes a read with 3 bases that 00356 /// match the reference, two additional bases that aren't in the 00357 /// reference, and 3 more bases that match the reference, so it 00358 /// spans 6 bases in the reference. 00359 /// 00360 /// \return how many bases in the reference are spanned 00361 /// by the given CIGAR string 00362 /// 00363 int getExpectedReferenceBaseCount() const; 00364 00365 /// Return the number of clips that are at the beginning of the cigar. 00366 int getNumBeginClips() const; 00367 00368 /// Return the number of clips that are at the end of the cigar. 00369 int getNumEndClips() const; 00370 00371 /// Return the reference offset associated with the specified 00372 /// query index or INDEX_NA based on this cigar. 00373 int32_t getRefOffset(int32_t queryIndex); 00374 00375 /// Return the query index associated with the specified 00376 /// reference offset or INDEX_NA based on this cigar. 00377 int32_t getQueryIndex(int32_t refOffset); 00378 00379 /// Return the reference position associated with the specified query index 00380 /// or INDEX_NA based on this cigar and the specified queryStartPos which 00381 /// is the leftmost mapping position of the first matching base in the 00382 /// query. 00383 int32_t getRefPosition(int32_t queryIndex, int32_t queryStartPos); 00384 00385 /// Return the query index or INDEX_NA associated with the specified 00386 /// reference offset when the query starts at the specified reference 00387 /// position. 00388 int32_t getQueryIndex(int32_t refPosition, int32_t queryStartPos); 00389 00390 /// Return the number of bases that overlap the reference and the 00391 /// read associated with this cigar that falls within the specified region. 00392 /// \param start : inclusive 0-based start position (reference position) of 00393 /// the region to check for overlaps in 00394 /// (-1 indicates to start at the beginning of the reference.) 00395 /// \param end : exclusive 0-based end position (reference position) of the 00396 /// region to check for overlaps in 00397 /// (-1 indicates to go to the end of the reference.) 00398 /// \param queryStartPos : 0-based leftmost mapping position of the first 00399 /// matcihng base in the query. 00400 uint32_t getNumOverlaps(int32_t start, int32_t end, int32_t queryStartPos); 00401 00402 /// Return whether or not the cigar has indels (insertions or delections) 00403 /// \return true if it has an insertion or deletion, false if not. 00404 bool hasIndel(); 00405 00406 /// Value associated with an index that is not applicable/does not exist, 00407 /// used for converting between query and reference indexes/offsets when 00408 /// an associated index/offset does not exist. 00409 static const int32_t INDEX_NA; 00410 00411 protected: 00412 // Clear the query index/reference offset index vectors. 00413 void clearQueryAndReferenceIndexes(); 00414 00415 // Set the query index/reference offset index vectors. 00416 void setQueryAndReferenceIndexes(); 00417 00418 // Container for the cigar operations in this cigar. 00419 std::vector<CigarOperator> cigarOperations; 00420 00421 private: 00422 // The vector is indexed by query index and contains the reference 00423 // offset associated with that query index. 00424 // The vector is reset each time a new cigar operation is added, and 00425 // is calculated when accessed if it is not already set. 00426 std::vector<int32_t> queryToRef; 00427 00428 // The vector is indexed by reference offset and contains the query 00429 // index associated with that reference offset. 00430 // The vector is reset each time a new cigar operation is added, and 00431 // is calculated when accessed if it is not already set. 00432 std::vector<int32_t> refToQuery; 00433 }; 00434 00435 /// Writes the specified cigar operation to the specified stream as <count><char> (3M). 00436 inline std::ostream &operator << (std::ostream &stream, const Cigar::CigarOperator& o) 00437 { 00438 stream << o.count << o.getChar(); 00439 return stream; 00440 } 00441 00442 /// Writes all of the cigar operations contained in the cigar to the passed in stream. 00443 inline std::ostream &operator << (std::ostream &stream, const Cigar& cigar) 00444 { 00445 stream << cigar.cigarOperations; 00446 return stream; 00447 } 00448 00449 #endif