/*
 *  Copyright (C) 2010  Regents of the University of Michigan
 *
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#if !defined(_CIGAR_ROLLER_H)
#define _CIGAR_ROLLER_H

#include "Cigar.h"

//
// Docs from Sam1.pdf:
//
// Clipped alignment. In Smith-Waterman alignment, a sequence may not be aligned from the first residue to the last one.
// Subsequences at the ends may be clipped off. We introduce operation 'S' to describe (softly) clipped alignment. Here is
// an example. Suppose the clipped alignment is:
//   REF:  AGCTAGCATCGTGTCGCCCGTCTAGCATACGCATGATCGACTGTCAGCTAGTCAGACTAGTCGATCGATGTG
//   READ:        gggGTGTAACC-GACTAGgggg
// where on the read sequence, bases in uppercase are matches and bases in lowercase are clipped off. The CIGAR for
// this alignment is: 3S8M1D6M4S.
//
//
// If the mapping position of the query is not available, RNAME and
// CIGAR are set as "*"
//
// A CIGAR string is comprised of a series of operation lengths plus the operations. The conventional CIGAR format allows
// for three types of operations: M for match or mismatch, I for insertion and D for deletion.
The extended CIGAR format 00040 // further allows four more operations, as is shown in the following table, to describe clipping, padding and splicing: 00041 // 00042 // op Description 00043 // -- ----------- 00044 // M Match or mismatch 00045 // I Insertion to the reference 00046 // D Deletion from the reference 00047 // N Skipped region from the reference 00048 // S Soft clip on the read (clipped sequence present in <seq>) 00049 // H Hard clip on the read (clipped sequence NOT present in <seq>) 00050 // P Padding (silent deletion from the padded reference sequence) 00051 // 00052 // 00053 00054 00055 00056 //////////////////////////////////////////////////////////////////////// 00057 // 00058 // CigarRoller is an aid to correctly generating the CIGAR strings 00059 // necessary to represent how a read maps to the reference. 00060 // 00061 // It is called once a particular match candidate is being written 00062 // out, so it is far less performance sensitive than the Smith Waterman 00063 // code below. 00064 // 00065 class CigarRoller : public Cigar 00066 { 00067 public: 00068 00069 //////////////////////////////////////////////////////////////////////// 00070 // 00071 // Cigar Roller Class 00072 // 00073 friend std::ostream &operator << (std::ostream &stream, const CigarRoller& roller); 00074 00075 CigarRoller() 00076 { 00077 clearQueryAndReferenceIndexes(); 00078 } 00079 CigarRoller(const char *cigarString) 00080 { 00081 Set(cigarString); 00082 } 00083 00084 00085 CigarRoller & operator += (CigarRoller &rhs); 00086 00087 // 00088 // Append a new operator at the end of the sequence. 
00089 // 00090 CigarRoller & operator += (const CigarOperator &rhs); 00091 00092 CigarRoller & operator = (CigarRoller &rhs); 00093 00094 // 00095 void Add(Operation operation, int count); 00096 00097 void Add(char operation, int count); 00098 00099 void Add(const char *cigarString); 00100 00101 void Add(CigarRoller &rhs) 00102 { 00103 (*this) += rhs; 00104 } 00105 00106 void Set(const char *cigarString); 00107 00108 void Set(const uint32_t* cigarBuffer, uint16_t bufferLen); 00109 00110 // 00111 // when we examine CIGAR strings, we need to know how 00112 // many cumulative insert and delete positions there are 00113 // so that we can adjust the read location appropriately. 00114 // 00115 // Here, we iterate over the vector of CIGAR operations, 00116 // summaring the count for each insert or delete (insert 00117 // increases the offset, delete decreases it). 00118 // 00119 // The use case for this is when we have a genome match 00120 // position based on an index word other than the first one, 00121 // and there is also a insert or delete between the beginning 00122 // of the read and the index word. We can't simply report 00123 // the match position without taking into account the indels, 00124 // otherwise we'll be off by N where N is the sum of this 00125 // indel count. 00126 // 00127 // DEPRECATED - do not use. There are better ways to accomplish that by using 00128 // read lengths, reference lengths, span of the read, etc. 00129 int getMatchPositionOffset(); 00130 00131 // 00132 // Get the string reprentation of the Cigar operations in this object. 00133 // Caller must delete the returned value. 00134 // 00135 const char *getString(); 00136 00137 void clear(); 00138 00139 private: 00140 }; 00141 00142 inline std::ostream &operator << (std::ostream &stream, const CigarRoller& roller) 00143 { 00144 stream << roller.cigarOperations; 00145 return stream; 00146 } 00147 00148 #endif