Cigar.h

00001 /*
00002  *  Copyright (C) 2010-2011  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #if !defined(_CIGAR_H)
00019 #define _CIGAR_H
00020 
00021 #include <string.h> // for inline use of strcat, etc
00022 #include <limits.h> // for INT_MAX
00023 #include <stdint.h> // for uint32_t and friends
00024 
00025 
00026 #include <assert.h>
00027 #include <stdio.h>
00028 #include <stdlib.h>
00029 #include <string>
00030 #include <iostream>
00031 #include <iomanip>
00032 #include <utility>
00033 #include <vector>
00034 
00035 #include "Generic.h"
00036 #include "StringBasics.h"
00037 
00038 /// This class represents the CIGAR without any methods to set the cigar
00039 /// (see CigarRoller for that).
00040 
00041 //
00042 // Docs from Sam1.pdf:
00043 //
00044 // Clipped alignment. In Smith-Waterman alignment, a sequence may not be aligned from the first residue to the last one.
00045 // Subsequences at the ends may be clipped off. We introduce operation ʻSʼ to describe (softly) clipped alignment. Here is
00046 // an example. Suppose the clipped alignment is:
00047 // REF:  AGCTAGCATCGTGTCGCCCGTCTAGCATACGCATGATCGACTGTCAGCTAGTCAGACTAGTCGATCGATGTG
00048 // READ:        gggGTGTAACC-GACTAGgggg
00049 // where on the read sequence, bases in uppercase are matches and bases in lowercase are clipped off. The CIGAR for
00050 // this alignment is: 3S8M1D6M4S.
00051 //
00052 //
00053 // If the mapping position of the query is not available, RNAME and
00054 // CIGAR are set as “*”
00055 //
00056 // A CIGAR string is comprised of a series of operation lengths plus the operations. The conventional CIGAR format allows
00057 // for three types of operations: M for match or mismatch, I for insertion and D for deletion. The extended CIGAR format
00058 // further allows four more operations, as is shown in the following table, to describe clipping, padding and splicing:
00059 //
00060 // op   Description
00061 // --   -----------
00062 // M    Match or mismatch
00063 // I    Insertion to the reference
00064 // D    Deletion from the reference
00065 // N    Skipped region from the reference
00066 // S    Soft clip on the read (clipped sequence present in <seq>)
00067 // H    Hard clip on the read (clipped sequence NOT present in <seq>)
00068 // P    Padding (silent deletion from the padded reference sequence)
00069 //
00070 
00071 
00072 
00073 
00074 ////////////////////////////////////////////////////////////////////////
00075 ///
00076 /// This class represents the CIGAR.  It contains methods for converting
00077 /// to strings and extracting information from the cigar on how a read
00078 /// maps to the reference.
00079 ///
00080 /// It only contains read only methods.  There are no ways to set
00081 /// values.  To set a value, a child class must be used.
00082 ///
00083 class Cigar
00084 {
00085 public:
00086     /// Enum for the cigar operations.
00087     enum Operation {
00088         none=0, ///< no operation has been set.
00089         match, ///< match/mismatch operation.  Associated with CIGAR Operation "M"
00090         mismatch, ///< mismatch operation.  Associated with CIGAR Operation "M"
00091         insert,  ///< insertion to the reference (the query sequence contains bases that have no corresponding base in the reference).  Associated with CIGAR Operation "I"
00092         del,  ///< deletion from the reference (the reference contains bases that have no corresponding base in the query sequence).  Associated with CIGAR Operation "D"
00093         skip,  ///< skipped region from the reference (the reference contains bases that have no corresponding base in the query sequence).  Associated with CIGAR Operation "N"
00094         softClip,  ///< Soft clip on the read (clipped sequence present in the query sequence, but not in reference).  Associated with CIGAR Operation "S"
00095         hardClip,  ///< Hard clip on the read (clipped sequence not present in the query sequence or reference).  Associated with CIGAR Operation "H"
00096         pad ///< Padding (not in reference or query).  Associated with CIGAR Operation "P"
00097     };
00098 
00099     // The maximum value in the operation enum (used for setting up a bitset of
00100     // operations.
00101     static const int MAX_OP_VALUE = pad;
00102 
00103     ////////////////////////////////////////////////////////////////////////
00104     //
00105     // Nested Struct : CigarOperator
00106     //
00107     struct CigarOperator
00108     {
00109 
00110         CigarOperator()
00111         {
00112             operation = none;
00113             count = 0;
00114         }
00115 
00116         /// Set the cigar operator with the specified operation and
00117         /// count length.
00118         CigarOperator(Operation operation, uint32_t count)
00119             : operation(operation), count(count) {};
00120 
00121         Operation operation;
00122 
00123         uint32_t count;
00124 
00125         /// Get the character code (M, I, D, N, S, H, or P) associated with
00126         /// this operation.
00127         char getChar() const
00128         {
00129             switch (operation)
00130             {
00131                 case none:
00132                     return '?';  // error
00133                 case match:
00134                 case mismatch:
00135                     return'M';
00136                 case insert:
00137                     return 'I';
00138                 case del:
00139                     return'D';
00140                 case skip:
00141                     return 'N';
00142                 case softClip:
00143                     return 'S';
00144                 case hardClip:
00145                     return 'H';
00146                 case pad:
00147                     return 'P';
00148             }
00149             return '?'; // actually it is an error to get here
00150         }
00151 
00152         /// Compare only on the operator, true if they are the same, false if not.  Match and mismatch are considered the same for CIGAR strings.
00153         bool operator == (const CigarOperator &rhs) const
00154         {
00155             if (operation==rhs.operation)
00156                 return true;
00157             if ((operation == mismatch || operation == match) && (rhs.operation == mismatch || rhs.operation == match))
00158                 return true;
00159             return false;
00160         }
00161 
00162         /// Compare only on the operator, false if they are the same, true if not.  Match and mismatch are considered the same for CIGAR strings.
00163         bool operator != (const CigarOperator &rhs) const
00164         {
00165             return !((*this) == rhs) ;
00166         }
00167 
00168     };
00169 
00170     ////////////////////////////////////////////////////////////////////////
00171     //
00172     // Cigar  Class statics
00173     //
00174 
00175     /// Return true if the specified operation is found in the
00176     /// query sequence, false if not.
00177     static bool foundInQuery(Operation op)
00178     {
00179         switch(op)
00180         {
00181             case match:
00182             case mismatch:
00183             case insert:
00184             case softClip:
00185                 return true;
00186             default:
00187                 return false;
00188         }
00189         return false;
00190     }
00191     
00192     /// Return true if the specified operation is found in the
00193     /// query sequence, false if not.
00194     static bool foundInQuery(CigarOperator op)
00195     {
00196         switch(op.operation)
00197         {
00198             case match:
00199             case mismatch:
00200             case insert:
00201             case softClip:
00202                 return true;
00203             default:
00204                 return false;
00205         }
00206         return false;
00207     }
00208     
00209     /// Return true if the specified operation is a clipping operation,
00210     /// false if not.
00211     static bool isClip(Operation op)
00212     {
00213         switch(op)
00214         {
00215             case softClip:
00216             case hardClip:
00217                 return true;
00218             default:
00219                 return false;
00220         }
00221         return false;
00222     }
00223 
00224     /// Return true if the specified operation is a clipping operation,
00225     /// false if not.
00226     static bool isClip(CigarOperator op)
00227     {
00228         switch(op.operation)
00229         {
00230             case softClip:
00231             case hardClip:
00232                 return true;
00233             default:
00234                 return false;
00235         }
00236         return false;
00237     }
00238 
00239     /// Return true if the specified operation is a match/mismatch operation,
00240     /// false if not.
00241     static bool isMatchOrMismatch(Operation op)
00242     {
00243         switch(op)
00244         {
00245             case match:
00246             case mismatch:
00247                 return true;
00248             default:
00249                 return false;
00250         }
00251         return false;
00252     }
00253     
00254     /// Return true if the specified operation is a match/mismatch operation,
00255     /// false if not.
00256     static bool isMatchOrMismatch(CigarOperator op)
00257     {
00258         switch(op.operation)
00259         {
00260             case match:
00261             case mismatch:
00262                 return true;
00263             default:
00264                 return false;
00265         }
00266         return false;
00267     }
00268 
00269 
00270     ////////////////////////////////////////////////////////////////////////
00271     //
00272     // Cigar  Class non static
00273     //
00274     friend std::ostream &operator << (std::ostream &stream, const Cigar& cigar);
00275 
00276     /// Default constructor initializes as a CIGAR with no operations.
00277     Cigar()
00278     {
00279         clearQueryAndReferenceIndexes();
00280     }
00281 
00282 
00283     /// Set the passed in String to the string reprentation of the Cigar
00284     /// operations in this object.
00285     void getCigarString(String& cigarString) const;
00286 
00287     /// Set the passed in std::string to the string reprentation of the Cigar
00288     /// operations in this object.
00289     void getCigarString(std::string& cigarString) const;
00290 
00291     /// Sets the specified string to a valid CIGAR string of characters that
00292     /// represent the cigar with no digits (a CIGAR of "3M" would return "MMM").
00293     /// The returned string is actually also a valid CIGAR string.
00294     /// In theory this makes it easier to parse some reads.
00295     /// \return s the string to populate
00296     void getExpandedString(std::string &s) const;
00297 
00298     /// Return the Cigar Operation at the specified index (starting at 0).
00299     const CigarOperator & operator [](int i) const
00300     {
00301         return cigarOperations[i];
00302     }
00303 
00304     /// Return the Cigar Operation at the specified index (starting at 0).
00305     const CigarOperator & getOperator(int i) const
00306     {
00307         return cigarOperations[i];
00308     }
00309 
00310     /// Return true if the 2 Cigars are the same
00311     /// (the same operations of the same sizes).
00312     bool operator == (Cigar &rhs) const;
00313 
00314     /// Return the number of cigar operations
00315     int size()  const
00316     {
00317         return cigarOperations.size();
00318     }
00319 
00320     /// Write this object as a string to cout.
00321     void Dump() const
00322     {
00323         String cigarString;
00324         getCigarString(cigarString);
00325         std::cout << cigarString ;
00326     }
00327 
00328     /// Return the length of the read that corresponds to
00329     /// the current CIGAR string.
00330     ///
00331     /// For validation, we should expect that a sequence
00332     /// read in a SAM file will be the same length as the
00333     /// value returned by this method.
00334     ///
00335     /// Example: 3M2D3M describes a read with three bases
00336     /// matching the reference, then skips 2 bases, then has
00337     /// three more bases that match the reference (match/mismatch).
00338     /// In this case, the read length is expected to be 6.
00339     ///
00340     /// Example: 3M2I3M describes a read with 3 match/mismatch
00341     /// bases, two extra bases, and then 3 more match/mistmatch
00342     /// bases.  The total in this example is 8 bases.
00343     ///
00344     /// \return returns the expected read length
00345     int getExpectedQueryBaseCount() const;
00346 
00347     /// Return the number of bases in the reference that
00348     /// this CIGAR "spans".
00349     ///
00350     /// When doing range checking, we occassionally need to know
00351     /// how many total bases the CIGAR string represents as compared
00352     /// to the reference.
00353     ///
00354     /// Examples: 3M2D3M describes a read that overlays 8 bases in
00355     /// the reference.  3M2I3M describes a read with 3 bases that
00356     /// match the reference, two additional bases that aren't in the
00357     /// reference, and 3 more bases that match the reference, so it
00358     /// spans 6 bases in the reference.
00359     ///
00360     /// \return how many bases in the reference are spanned
00361     /// by the given CIGAR string
00362     ///
00363     int getExpectedReferenceBaseCount() const;
00364 
00365     /// Return the number of clips that are at the beginning of the cigar.
00366     int getNumBeginClips() const;
00367 
00368     /// Return the number of clips that are at the end of the cigar.
00369     int getNumEndClips() const;
00370 
00371     /// Return the reference offset associated with the specified
00372     /// query index or INDEX_NA based on this cigar.
00373     int32_t getRefOffset(int32_t queryIndex);
00374 
00375     /// Return the query index associated with the specified
00376     /// reference offset or INDEX_NA based on this cigar.
00377     int32_t getQueryIndex(int32_t refOffset);
00378 
00379     /// Return the reference position associated with the specified query index
00380     /// or INDEX_NA based on this cigar and the specified queryStartPos which
00381     /// is the leftmost mapping position of the first matching base in the
00382     /// query.
00383     int32_t getRefPosition(int32_t queryIndex, int32_t queryStartPos);
00384 
00385     /// Return the query index or INDEX_NA associated with the specified 
00386     /// reference offset when the query starts at the specified reference
00387     /// position.
00388     int32_t getQueryIndex(int32_t refPosition, int32_t queryStartPos);
00389 
00390     /// Return the number of bases that overlap the reference and the
00391     /// read associated with this cigar that falls within the specified region.
00392     /// \param start : inclusive 0-based start position (reference position) of
00393     ///         the region to check for overlaps in
00394     ///         (-1 indicates to start at the beginning of the reference.)
00395     /// \param end  : exclusive 0-based end position (reference position) of the
00396     ///          region to check for overlaps in
00397     ///         (-1 indicates to go to the end of the reference.)
00398     /// \param queryStartPos : 0-based leftmost mapping position of the first 
00399     ///                 matcihng base in the query.
00400     uint32_t getNumOverlaps(int32_t start, int32_t end, int32_t queryStartPos);
00401 
00402     /// Return whether or not the cigar has indels (insertions or delections)
00403     /// \return true if it has an insertion or deletion, false if not.
00404     bool hasIndel();
00405 
00406     /// Value associated with an index that is not applicable/does not exist,
00407     /// used for converting between query and reference indexes/offsets when
00408     /// an associated index/offset does not exist.
00409     static const int32_t INDEX_NA;
00410 
00411 protected:
00412     // Clear the query index/reference offset index vectors.
00413     void clearQueryAndReferenceIndexes();
00414 
00415     // Set the query index/reference offset index vectors.
00416     void setQueryAndReferenceIndexes();
00417 
00418     // Container for the cigar operations in this cigar.
00419     std::vector<CigarOperator> cigarOperations;
00420 
00421 private:
00422     // The vector is indexed by query index and contains the reference
00423     // offset associated with that query index.
00424     // The vector is reset each time a new cigar operation is added, and
00425     // is calculated when accessed if it is not already set.
00426     std::vector<int32_t> queryToRef;
00427 
00428     // The vector is indexed by reference offset and contains the query
00429     // index associated with that reference offset.
00430     // The vector is reset each time a new cigar operation is added, and
00431     // is calculated when accessed if it is not already set.
00432     std::vector<int32_t> refToQuery;
00433 };
00434 
00435 /// Writes the specified cigar operation to the specified stream as <count><char> (3M).
00436 inline std::ostream &operator << (std::ostream &stream, const Cigar::CigarOperator& o)
00437 {
00438     stream << o.count << o.getChar();
00439     return stream;
00440 }
00441 
00442 /// Writes all of the cigar operations contained in the cigar to the passed in stream.
00443 inline std::ostream &operator << (std::ostream &stream, const Cigar& cigar)
00444 {
00445     stream << cigar.cigarOperations;
00446     return stream;
00447 }
00448 
00449 #endif
Generated on Tue Sep 6 17:52:00 2011 for libStatGen Software by  doxygen 1.6.3