00001 /* 00002 * Copyright (C) 2010 Regents of the University of Michigan 00003 * 00004 * This program is free software: you can redistribute it and/or modify 00005 * it under the terms of the GNU General Public License as published by 00006 * the Free Software Foundation, either version 3 of the License, or 00007 * (at your option) any later version. 00008 * 00009 * This program is distributed in the hope that it will be useful, 00010 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 * GNU General Public License for more details. 00013 * 00014 * You should have received a copy of the GNU General Public License 00015 * along with this program. If not, see <http://www.gnu.org/licenses/>. 00016 */ 00017 00018 #if !defined(_CIGAR_H) 00019 #define _CIGAR_H 00020 00021 #include <string.h> // for inline use of strcat, etc 00022 #include <limits.h> // for INT_MAX 00023 #include <stdint.h> // for uint32_t and friends 00024 00025 00026 #include <assert.h> 00027 #include <stdio.h> 00028 #include <stdlib.h> 00029 #include <string> 00030 #include <iostream> 00031 #include <iomanip> 00032 #include <utility> 00033 #include <vector> 00034 00035 #include "Generic.h" 00036 #include "StringBasics.h" 00037 00038 // 00039 // Docs from Sam1.pdf: 00040 // 00041 // Clipped alignment. In Smith-Waterman alignment, a sequence may not be aligned from the first residue to the last one. 00042 // Subsequences at the ends may be clipped off. We introduce operation ʻSʼ to describe (softly) clipped alignment. Here is 00043 // an example. Suppose the clipped alignment is: 00044 // REF: AGCTAGCATCGTGTCGCCCGTCTAGCATACGCATGATCGACTGTCAGCTAGTCAGACTAGTCGATCGATGTG 00045 // READ: gggGTGTAACC-GACTAGgggg 00046 // where on the read sequence, bases in uppercase are matches and bases in lowercase are clipped off. The CIGAR for 00047 // this alignment is: 3S8M1D6M4S. 00048 // 00049 // 00050 // If the mapping position of the query is not available, RNAME and 00051 // CIGAR are set as “*” 00052 // 00053 // A CIGAR string is comprised of a series of operation lengths plus the operations. The conventional CIGAR format allows 00054 // for three types of operations: M for match or mismatch, I for insertion and D for deletion. The extended CIGAR format 00055 // further allows four more operations, as is shown in the following table, to describe clipping, padding and splicing: 00056 // 00057 // op Description 00058 // -- ----------- 00059 // M Match or mismatch 00060 // I Insertion to the reference 00061 // D Deletion from the reference 00062 // N Skipped region from the reference 00063 // S Soft clip on the read (clipped sequence present in <seq>) 00064 // H Hard clip on the read (clipped sequence NOT present in <seq>) 00065 // P Padding (silent deletion from the padded reference sequence) 00066 // 00067 // 00068 00069 00070 00071 //////////////////////////////////////////////////////////////////////// 00072 // 00073 // This class represents the CIGAR. It contains methods for converting 00074 // to strings and extracting information from the cigar on how a read 00075 // maps to the reference. 00076 // 00077 // It only contains read only methods. There are no ways to set 00078 // values. To set a value, a child class must be used. 00079 // 00080 class Cigar 00081 { 00082 public: 00083 enum Operation {none, match, mismatch, insert, del, skip, softClip, hardClip, pad}; 00084 00085 // Return true if the specified operation is found in the 00086 // query sequence, false if not. 00087 static bool foundInQuery(Operation op) 00088 { 00089 switch(op) 00090 { 00091 case match: 00092 case mismatch: 00093 case insert: 00094 case softClip: 00095 return true; 00096 default: 00097 return false; 00098 } 00099 return false; 00100 } 00101 00102 // Return true if the specified operation is a clipping operation, 00103 // false if not. 00104 static bool isClip(Operation op) 00105 { 00106 switch(op) 00107 { 00108 case softClip: 00109 case hardClip: 00110 return true; 00111 default: 00112 return false; 00113 } 00114 return false; 00115 } 00116 00117 //////////////////////////////////////////////////////////////////////// 00118 // 00119 // Nested Struct : CigarOperator 00120 // 00121 struct CigarOperator 00122 { 00123 00124 CigarOperator() 00125 { 00126 operation = none; 00127 count = 0; 00128 } 00129 00130 CigarOperator(Operation operation, uint32_t count): operation(operation), count(count) {}; 00131 00132 Operation operation; 00133 00134 uint32_t count; 00135 00136 // Get the character associated with this operation. 00137 char getChar() const 00138 { 00139 switch (operation) 00140 { 00141 case none: 00142 return '?'; // error 00143 case match: 00144 case mismatch: 00145 return'M'; 00146 case insert: 00147 return 'I'; 00148 case del: 00149 return'D'; 00150 case skip: 00151 return 'N'; 00152 case softClip: 00153 return 'S'; 00154 case hardClip: 00155 return 'H'; 00156 case pad: 00157 return 'P'; 00158 } 00159 return '?'; // actually it is an error to get here 00160 } 00161 00162 // 00163 // Compare only on the operator. 00164 // 00165 // Match and mismatch are considered the same for CIGAR strings. 00166 // 00167 bool operator == (const CigarOperator &rhs) const 00168 { 00169 if (operation==rhs.operation) 00170 return true; 00171 if ((operation == mismatch || operation == match) && (rhs.operation == mismatch || rhs.operation == match)) 00172 return true; 00173 return false; 00174 } 00175 00176 bool operator != (const CigarOperator &rhs) const 00177 { 00178 return !((*this) == rhs) ; 00179 } 00180 00181 }; 00182 00183 //////////////////////////////////////////////////////////////////////// 00184 // 00185 // Cigar Class 00186 // 00187 friend std::ostream &operator << (std::ostream &stream, const Cigar& cigar); 00188 00189 Cigar() 00190 { 00191 clearQueryAndReferenceIndexes(); 00192 } 00193 00194 // 00195 // Set the passed in string to the string reprentation of the Cigar 00196 // operations in this object. 00197 // 00198 void getCigarString(String& cigarString) const; 00199 void getCigarString(std::string& cigarString) const; 00200 00201 /// obtain a non-run length encoded string of operations 00202 /// 00203 /// The returned string is actually also a valid CIGAR string, 00204 /// but it does not have any digits in it - only the characters 00205 /// themselves. In theory this makes it easier to parse some 00206 /// reads. 00207 /// 00208 /// /return s the string to populate 00209 void getExpandedString(std::string &s) const; 00210 00211 const CigarOperator & operator [](int i) const 00212 { 00213 return cigarOperations[i]; 00214 } 00215 00216 const CigarOperator & getOperator(int i) const 00217 { 00218 return cigarOperations[i]; 00219 } 00220 00221 bool operator == (Cigar &rhs) const; 00222 00223 // 00224 // get the number of cigar operations 00225 // 00226 int size() const 00227 { 00228 return cigarOperations.size(); 00229 } 00230 00231 void Dump() const 00232 { 00233 String cigarString; 00234 getCigarString(cigarString); 00235 std::cout << cigarString ; 00236 } 00237 00238 /// return the length of the read that corresponds to 00239 /// the current CIGAR string. 00240 /// 00241 /// For validation, we should expect that a sequence 00242 /// read in a SAM file will be the same length as the 00243 /// value returned by this method. 00244 /// 00245 /// Example: 3M2D3M describes a read with three bases 00246 /// matching the reference, then skips 2 bases, then has 00247 /// three more bases that match the reference (match/mismatch). 00248 /// In this case, the read length is expected to be 6. 00249 /// 00250 /// Example: 3M2I3M describes a read with 3 match/mismatch 00251 /// bases, two extra bases, and then 3 more match/mistmatch 00252 /// bases. The total in this example is 8 bases. 00253 /// 00254 /// /return returns the expected read length 00255 int getExpectedQueryBaseCount() const; 00256 00257 /// return the number of bases in the reference that 00258 /// this read "spans" 00259 /// 00260 /// When doing range checking, we occassionally need to know 00261 /// how many total bases the CIGAR string represents as compared 00262 /// to the reference. 00263 /// 00264 /// Examples: 3M2D3M describes a read that overlays 8 bases in 00265 /// the reference. 3M2I3M describes a read with 3 bases that 00266 /// match the reference, two additional bases that aren't in the 00267 /// reference, and 3 more bases that match the reference, so it 00268 /// spans 6 bases in the reference. 00269 /// 00270 /// /return how many bases in the reference are spanned 00271 /// by the given CIGAR string 00272 /// 00273 int getExpectedReferenceBaseCount() const; 00274 00275 /// Return the number of clips that are at the beginning of the cigar. 00276 int getNumBeginClips() const; 00277 00278 /// Return the number of clips that are at the end of the cigar. 00279 int getNumEndClips() const; 00280 00281 // Return the reference offset associated with the specified 00282 // query index based on this cigar. 00283 int32_t getRefOffset(int32_t queryIndex); 00284 00285 // Return the query index associated with the specified 00286 // reference offset based on this cigar. 00287 int32_t getQueryIndex(int32_t refOffset); 00288 00289 // Return the reference position associated with the specified query 00290 // index based on this cigar and the specified queryStartPos which 00291 // is the leftmost mapping position of the first matching 00292 // base in the query. 00293 int32_t getRefPosition(int32_t queryIndex, int32_t queryStartPos); 00294 00295 // Return the query index associated with the specified reference position 00296 // when the query starts at the specified reference position based on 00297 // this cigar. 00298 int32_t getQueryIndex(int32_t refPosition, int32_t queryStartPos); 00299 00300 // Return the number of bases that overlap the reference and the 00301 // read associated with this cigar that falls within the specified region. 00302 // start : inclusive 0-based start position (reference position) of the 00303 // region to check for overlaps in. 00304 // (-1 indicates to start at the beginning of the reference.) 00305 // end : exclusive 0-based end position (reference position) of the 00306 // region to check for overlaps in. 00307 // (-1 indicates to go to the end of the reference.) 00308 // queryStartPos : 0-based leftmost mapping position of the first matching 00309 // base in the query. 00310 uint32_t getNumOverlaps(int32_t start, int32_t end, int32_t queryStartPos); 00311 00312 static const int32_t INDEX_NA; 00313 00314 protected: 00315 // Clear the query index/reference offset index vectors. 00316 void clearQueryAndReferenceIndexes(); 00317 00318 // Set the query index/reference offset index vectors. 00319 void setQueryAndReferenceIndexes(); 00320 00321 // Container for the cigar operations in this cigar. 00322 std::vector<CigarOperator> cigarOperations; 00323 00324 private: 00325 // The vector is indexed by query index and contains the reference 00326 // offset associated with that query index. 00327 // The vector is reset each time a new cigar operation is added, and 00328 // is calculated when accessed if it is not already set. 00329 std::vector<int32_t> queryToRef; 00330 00331 // The vector is indexed by reference offset and contains the query 00332 // index associated with that reference offset. 00333 // The vector is reset each time a new cigar operation is added, and 00334 // is calculated when accessed if it is not already set. 00335 std::vector<int32_t> refToQuery; 00336 }; 00337 00338 inline std::ostream &operator << (std::ostream &stream, const Cigar::CigarOperator& o) 00339 { 00340 stream << o.count << o.getChar(); 00341 return stream; 00342 } 00343 00344 inline std::ostream &operator << (std::ostream &stream, const Cigar& cigar) 00345 { 00346 stream << cigar.cigarOperations; 00347 return stream; 00348 } 00349 00350 #endif