libStatGen Software  1
CigarRoller.h
00001 /*
00002  *  Copyright (C) 2010-2011  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #if !defined(_CIGAR_ROLLER_H)
00019 #define _CIGAR_ROLLER_H
00020 
00021 #include "Cigar.h"
00022 
00023 /// This class provides accessors for setting, updating, and modifying the CIGAR object. It is a child class of Cigar.
00024 
00025 ///
00026 /// Docs from Sam1.pdf:
00027 ///
00028 /// Clipped alignment. In Smith-Waterman alignment, a sequence may not be aligned from the first residue to the last one.
00029 /// Subsequences at the ends may be clipped off. We introduce operation 'S' to describe (softly) clipped alignment. Here is
00030 /// an example. Suppose the clipped alignment is:
00031 /// REF:  AGCTAGCATCGTGTCGCCCGTCTAGCATACGCATGATCGACTGTCAGCTAGTCAGACTAGTCGATCGATGTG
00032 /// READ:        gggGTGTAACC-GACTAGgggg
00033 /// where on the read sequence, bases in uppercase are matches and bases in lowercase are clipped off. The CIGAR for
00034 /// this alignment is: 3S8M1D6M4S.
00035 ///
00036 ///
00037 /// If the mapping position of the query is not available, RNAME and
00038 /// CIGAR are set as “*”
00039 ///
00040 /// A CIGAR string is comprised of a series of operation lengths plus the operations. The conventional CIGAR format allows
00041 /// for three types of operations: M for match or mismatch, I for insertion and D for deletion. The extended CIGAR format
00042 /// further allows four more operations, as is shown in the following table, to describe clipping, padding and splicing:
00043 ///
00044 /// op   Description
00045 /// --   -----------
00046 /// M    Match or mismatch
00047 /// I    Insertion to the reference
00048 /// D    Deletion from the reference
00049 /// N    Skipped region from the reference
00050 /// S    Soft clip on the read (clipped sequence present in <seq>)
00051 /// H    Hard clip on the read (clipped sequence NOT present in <seq>)
00052 /// P    Padding (silent deletion from the padded reference sequence)
00053 ///
00054 
00055 
00056 
00057 ////////////////////////////////////////////////////////////////////////
00058 ///
00059 /// CigarRoller is an aid to correctly generating the CIGAR strings
00060 /// necessary to represent how a read maps to the reference.
00061 ///
00062 /// It is called once a particular match candidate is being written
00063 /// out, so it is far less performance sensitive than the Smith-Waterman
00064 /// alignment code (note: that code is not part of this header).
00065 ///
00066 class CigarRoller : public Cigar
00067 {
00068 public:
00069 
00070     ////////////////////////////////////////////////////////////////////////
00071     //
00072     // Cigar Roller Class: the mutating interface (Add/Remove/Update/Set/clear) layered on the Cigar base class.
00073     //
00074     /// Stream insertion: writes all of the cigar operations contained in \a roller to
00075     /// the passed in stream. Declared a friend; the inline definition reads roller.cigarOperations directly.
00076     friend std::ostream &operator << (std::ostream &stream, const CigarRoller& roller);
00077 
00078     /// Default constructor initializes as a CIGAR with no operations; the only call made here is clearQueryAndReferenceIndexes().
00079     CigarRoller()
00080     {
00081         clearQueryAndReferenceIndexes();
00082     }
00083 
00084     /// Constructor that initializes the object with the specified cigarString by calling Set(). Not declared explicit, so a const char* converts implicitly.
00085     CigarRoller(const char *cigarString)
00086     {
00087         Set(cigarString);
00088     }
00089 
00090     /// Add the contents of the specified CigarRoller to this object. Note that \a rhs is taken by non-const reference. \return presumably *this - confirm in the implementation.
00091     CigarRoller & operator += (CigarRoller &rhs);
00092 
00093     /// Append the specified operator to this object. \return presumably *this - confirm in the implementation.
00094     CigarRoller & operator += (const CigarOperator &rhs);
00095 
00096     /// Set this object to be equal to the specified CigarRoller. \a rhs is a non-const reference, so const objects and temporaries cannot be assigned from.
00097     CigarRoller & operator = (CigarRoller &rhs);
00098 
00099     /// Append the specified operation (given as an Operation value) with the specified count to this object.
00100     void Add(Operation operation, int count);
00101 
00102     /// Append the specified operation (given as a char; presumably one of the CIGAR op letters - confirm) with the specified count to this object.
00103     void Add(char operation, int count);
00104 
00105     /// Append the operations described by the specified cigarString to this object.
00106     void Add(const char *cigarString);
00107 
00108     /// Append the specified CigarRoller to this object; forwards to operator+=(CigarRoller&).
00109     void Add(CigarRoller &rhs)
00110     {
00111         (*this) += rhs;
00112     }
00113 
00114     /// Remove the operation at the specified index (presumably zero-based - confirm in the implementation).
00115     /// \return true if successfully removed, false if not.
00116     bool Remove(int index);
00117 
00118     /// Increments the count for the operation at the specified index
00119     /// by the specified value; specify a negative value to decrement.
00120     /// \return true if it is successfully incremented, false if not.
00121     bool IncrementCount(int index, int increment);
00122 
00123     /// Updates the operation at the specified index to be the specified
00124     /// operation and have the specified count.
00125     /// \return true if it is successfully updated, false if not.
00126     bool Update(int index, Operation op, int count);
00127 
00128     /// Sets this object to the specified cigarString.
00129     void Set(const char *cigarString);
00130 
00131     /// Sets this object to the BAM formatted cigar found at the beginning
00132     /// of the specified buffer which is bufferLen long. NOTE(review): the unit of bufferLen (bytes vs. 32-bit entries) is not visible here - confirm.
00133     void Set(const uint32_t* cigarBuffer, uint16_t bufferLen);
00134 
00135     //
00136     // When we examine CIGAR strings, we need to know how
00137     // many cumulative insert and delete positions there are
00138     // so that we can adjust the read location appropriately.
00139     //
00140     // Here, we iterate over the vector of CIGAR operations,
00141     // summing the count for each insert or delete (insert
00142     // increases the offset, delete decreases it).
00143     //
00144     // The use case for this is when we have a genome match
00145     // position based on an index word other than the first one,
00146     // and there is also an insert or delete between the beginning
00147     // of the read and the index word.  We can't simply report
00148     // the match position without taking into account the indels,
00149     // otherwise we'll be off by N where N is the sum of this
00150     // indel count.
00151     //
00152     /// DEPRECATED - do not use, there are better ways to accomplish that by
00153     /// using read lengths, reference lengths, span of the read, etc.
00154     int getMatchPositionOffset();
00155 
00156     /// Get the string representation of the Cigar operations in this object;
00157     /// caller must delete the returned value. NOTE(review): confirm in the implementation whether delete or delete[] is required.
00158     const char *getString();
00159 
00160     /// Clear this object so that it has no Cigar Operations.
00161     void clear();
00162 
00163 private: // intentionally empty: this class declares no private members of its own
00164 };
00165 
00166 
00167 inline std::ostream &operator << (std::ostream &stream, const CigarRoller& roller)
00168 { // Stream insertion for CigarRoller: emits the roller's cigar operations, then hands back the same stream so calls can be chained.
00169     stream << roller.cigarOperations; // uses an operator<< overload for the cigarOperations type that is declared outside this file
00170     return stream; // always the stream passed in, whatever the nested operator<< returned
00171 }
00172 
00173 #endif
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends