CigarRoller.h

00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #if !defined(_CIGAR_ROLLER_H)
00019 #define _CIGAR_ROLLER_H
00020 
00021 #include "Cigar.h"
00022 
00023 //
00024 // Docs from Sam1.pdf:
00025 //
00026 // Clipped alignment. In Smith-Waterman alignment, a sequence may not be aligned from the first residue to the last one.
00027 // Subsequences at the ends may be clipped off. We introduce operation 'S' to describe (softly) clipped alignment. Here is
00028 // an example. Suppose the clipped alignment is:
00029 // REF:  AGCTAGCATCGTGTCGCCCGTCTAGCATACGCATGATCGACTGTCAGCTAGTCAGACTAGTCGATCGATGTG
00030 // READ:        gggGTGTAACC-GACTAGgggg
00031 // where on the read sequence, bases in uppercase are matches and bases in lowercase are clipped off. The CIGAR for
00032 // this alignment is: 3S8M1D6M4S.
00033 //
00034 //
00035 // If the mapping position of the query is not available, RNAME and
00036 // CIGAR are set as "*"
00037 //
00038 // A CIGAR string is comprised of a series of operation lengths plus the operations. The conventional CIGAR format allows
00039 // for three types of operations: M for match or mismatch, I for insertion and D for deletion. The extended CIGAR format
00040 // further allows four more operations, as is shown in the following table, to describe clipping, padding and splicing:
00041 //
00042 // op   Description
00043 // --   -----------
00044 // M    Match or mismatch
00045 // I    Insertion to the reference
00046 // D    Deletion from the reference
00047 // N    Skipped region from the reference
00048 // S    Soft clip on the read (clipped sequence present in <seq>)
00049 // H    Hard clip on the read (clipped sequence NOT present in <seq>)
00050 // P    Padding (silent deletion from the padded reference sequence)
00051 //
00052 //
00053 
00054 
00055 
00056 ////////////////////////////////////////////////////////////////////////
00057 //
00058 // CigarRoller is an aid to correctly generating the CIGAR strings
00059 // necessary to represent how a read maps to the reference.
00060 //
00061 // It is called once a particular match candidate is being written
00062 // out, so it is far less performance sensitive than the Smith Waterman
00063 // code below.
00064 //
00065 class CigarRoller : public Cigar
00066 {
00067 public:
00068 
00069     ////////////////////////////////////////////////////////////////////////
00070     //
00071     // Cigar Roller Class
00072     //
00073     friend std::ostream &operator << (std::ostream &stream, const CigarRoller& roller);
00074 
00075     CigarRoller()
00076     {
00077         clearQueryAndReferenceIndexes();
00078     }
00079     CigarRoller(const char *cigarString)
00080     {
00081         Set(cigarString);
00082     }
00083 
00084 
00085     CigarRoller & operator += (CigarRoller &rhs);
00086 
00087     //
00088     // Append a new operator at the end of the sequence.
00089     //
00090     CigarRoller & operator += (const CigarOperator &rhs);
00091 
00092     CigarRoller & operator = (CigarRoller &rhs);
00093 
00094     //
00095     void Add(Operation operation, int count);
00096 
00097     void Add(char operation, int count);
00098 
00099     void Add(const char *cigarString);
00100 
00101     void Add(CigarRoller &rhs)
00102     {
00103         (*this) += rhs;
00104     }
00105 
00106     void Set(const char *cigarString);
00107 
00108     void Set(const uint32_t* cigarBuffer, uint16_t bufferLen);
00109 
00110     //
00111     // when we examine CIGAR strings, we need to know how
00112     // many cumulative insert and delete positions there are
00113     // so that we can adjust the read location appropriately.
00114     //
00115     // Here, we iterate over the vector of CIGAR operations,
00116     // summaring the count for each insert or delete (insert
00117     // increases the offset, delete decreases it).
00118     //
00119     // The use case for this is when we have a genome match
00120     // position based on an index word other than the first one,
00121     // and there is also a insert or delete between the beginning
00122     // of the read and the index word.  We can't simply report
00123     // the match position without taking into account the indels,
00124     // otherwise we'll be off by N where N is the sum of this
00125     // indel count.
00126     //
00127     // DEPRECATED - do not use.  There are better ways to accomplish that by using
00128     // read lengths, reference lengths, span of the read, etc.
00129     int getMatchPositionOffset();
00130 
00131     //
00132     // Get the string reprentation of the Cigar operations in this object.
00133     // Caller must delete the returned value.
00134     //
00135     const char *getString();
00136 
00137     void clear();
00138 
00139 private:
00140 };
00141 
00142 inline std::ostream &operator << (std::ostream &stream, const CigarRoller& roller)
00143 {
00144     stream << roller.cigarOperations;
00145     return stream;
00146 }
00147 
00148 #endif