libStatGen Software
1
|
00001 /* 00002 * Copyright (C) 2010-2011 Regents of the University of Michigan 00003 * 00004 * This program is free software: you can redistribute it and/or modify 00005 * it under the terms of the GNU General Public License as published by 00006 * the Free Software Foundation, either version 3 of the License, or 00007 * (at your option) any later version. 00008 * 00009 * This program is distributed in the hope that it will be useful, 00010 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 * GNU General Public License for more details. 00013 * 00014 * You should have received a copy of the GNU General Public License 00015 * along with this program. If not, see <http://www.gnu.org/licenses/>. 00016 */ 00017 00018 #if !defined(_CIGAR_ROLLER_H) 00019 #define _CIGAR_ROLLER_H 00020 00021 #include "Cigar.h" 00022 00023 /// The purpose of this class is to provide accessors for setting, updating, modifying the CIGAR object. It is a child class of Cigar. 00024 00025 /// 00026 /// Docs from Sam1.pdf: 00027 /// 00028 /// Clipped alignment. In Smith-Waterman alignment, a sequence may not be aligned from the first residue to the last one. 00029 /// Subsequences at the ends may be clipped off. We introduce operation ʻSʼ to describe (softly) clipped alignment. Here is 00030 /// an example. Suppose the clipped alignment is: 00031 /// REF: AGCTAGCATCGTGTCGCCCGTCTAGCATACGCATGATCGACTGTCAGCTAGTCAGACTAGTCGATCGATGTG 00032 /// READ: gggGTGTAACC-GACTAGgggg 00033 /// where on the read sequence, bases in uppercase are matches and bases in lowercase are clipped off. The CIGAR for 00034 /// this alignment is: 3S8M1D6M4S. 00035 /// 00036 /// 00037 /// If the mapping position of the query is not available, RNAME and 00038 /// CIGAR are set as “*” 00039 /// 00040 /// A CIGAR string is comprised of a series of operation lengths plus the operations. The conventional CIGAR format allows 00041 /// for three types of operations: M for match or mismatch, I for insertion and D for deletion. The extended CIGAR format 00042 /// further allows four more operations, as is shown in the following table, to describe clipping, padding and splicing: 00043 /// 00044 /// op Description 00045 /// -- ----------- 00046 /// M Match or mismatch 00047 /// I Insertion to the reference 00048 /// D Deletion from the reference 00049 /// N Skipped region from the reference 00050 /// S Soft clip on the read (clipped sequence present in <seq>) 00051 /// H Hard clip on the read (clipped sequence NOT present in <seq>) 00052 /// P Padding (silent deletion from the padded reference sequence) 00053 /// 00054 00055 00056 00057 //////////////////////////////////////////////////////////////////////// 00058 /// 00059 /// CigarRoller is an aid to correctly generating the CIGAR strings 00060 /// necessary to represent how a read maps to the reference. 00061 /// 00062 /// It is called once a particular match candidate is being written 00063 /// out, so it is far less performance sensitive than the Smith Waterman 00064 /// code below. 00065 /// 00066 class CigarRoller : public Cigar 00067 { 00068 public: 00069 00070 //////////////////////////////////////////////////////////////////////// 00071 // 00072 // Cigar Roller Class 00073 // 00074 /// Writes all of the cigar operations contained in this roller to the 00075 /// passed in stream. 00076 friend std::ostream &operator << (std::ostream &stream, const CigarRoller& roller); 00077 00078 /// Default constructor initializes as a CIGAR with no operations. 00079 CigarRoller() 00080 { 00081 clearQueryAndReferenceIndexes(); 00082 } 00083 00084 /// Constructor that initializes the object with the specified cigarString. 00085 CigarRoller(const char *cigarString) 00086 { 00087 Set(cigarString); 00088 } 00089 00090 /// Add the contents of the specified CigarRoller to this object. 00091 CigarRoller & operator += (CigarRoller &rhs); 00092 00093 /// Append the specified operator to this object. 00094 CigarRoller & operator += (const CigarOperator &rhs); 00095 00096 /// Set this object to be equal to the specified CigarRoller. 00097 CigarRoller & operator = (CigarRoller &rhs); 00098 00099 /// Append the specified operation with the specified count to this object. 00100 void Add(Operation operation, int count); 00101 00102 /// Append the specified operation with the specified count to this object. 00103 void Add(char operation, int count); 00104 00105 /// Append the specified cigarString to this object. 00106 void Add(const char *cigarString); 00107 00108 /// Append the specified Cigar object to this object. 00109 void Add(CigarRoller &rhs) 00110 { 00111 (*this) += rhs; 00112 } 00113 00114 /// Remove the operation at the specified index. 00115 /// \return true if successfully removed, false if not. 00116 bool Remove(int index); 00117 00118 /// Increments the count for the operation at the specified index 00119 /// by the specified value, specify a negative value to decrement. 00120 /// \return true if it is successfully incremented, false if not. 00121 bool IncrementCount(int index, int increment); 00122 00123 /// Updates the operation at the specified index to be the specified 00124 /// operation and have the specified count. 00125 /// \return true if it is successfully updated, false if not. 00126 bool Update(int index, Operation op, int count); 00127 00128 /// Sets this object to the specified cigarString. 00129 void Set(const char *cigarString); 00130 00131 /// Sets this object to the BAM formatted cigar found at the beginning 00132 /// of the specified buffer which is bufferLen long. 00133 void Set(const uint32_t* cigarBuffer, uint16_t bufferLen); 00134 00135 // 00136 // when we examine CIGAR strings, we need to know how 00137 // many cumulative insert and delete positions there are 00138 // so that we can adjust the read location appropriately. 00139 // 00140 // Here, we iterate over the vector of CIGAR operations, 00141 // summaring the count for each insert or delete (insert 00142 // increases the offset, delete decreases it). 00143 // 00144 // The use case for this is when we have a genome match 00145 // position based on an index word other than the first one, 00146 // and there is also a insert or delete between the beginning 00147 // of the read and the index word. We can't simply report 00148 // the match position without taking into account the indels, 00149 // otherwise we'll be off by N where N is the sum of this 00150 // indel count. 00151 // 00152 /// DEPRECATED - do not use, there are better ways to accomplish that by 00153 /// using read lengths, reference lengths, span of the read, etc. 00154 int getMatchPositionOffset(); 00155 00156 /// Get the string reprentation of the Cigar operations in this object, 00157 /// caller must delete the returned value. 00158 const char *getString(); 00159 00160 /// Clear this object so that it has no Cigar Operations. 00161 void clear(); 00162 00163 private: 00164 }; 00165 00166 00167 inline std::ostream &operator << (std::ostream &stream, const CigarRoller& roller) 00168 { 00169 stream << roller.cigarOperations; 00170 return stream; 00171 } 00172 00173 #endif