libStatGen Software  1
Cigar.h
00001 /*
00002  *  Copyright (C) 2010-2011  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #if !defined(_CIGAR_H)
00019 #define _CIGAR_H
00020 
00021 #include <string.h> // for inline use of strcat, etc
00022 #include <limits.h> // for INT_MAX
00023 #include <stdint.h> // for uint32_t and friends
00024 
00025 
00026 #include <assert.h>
00027 #include <stdio.h>
00028 #include <stdlib.h>
00029 #include <string>
00030 #include <iostream>
00031 #include <iomanip>
00032 #include <utility>
00033 #include <vector>
00034 
00035 #include "Generic.h"
00036 #include "StringBasics.h"
00037 
00038 /// This class represents the CIGAR without any methods to set the cigar
00039 /// (see CigarRoller for that).
00040 
00041 //
00042 // Docs from Sam1.pdf:
00043 //
00044 // Clipped alignment. In Smith-Waterman alignment, a sequence may not be aligned from the first residue to the last one.
00045 // Subsequences at the ends may be clipped off. We introduce operation ʻSʼ to describe (softly) clipped alignment. Here is
00046 // an example. Suppose the clipped alignment is:
00047 // REF:  AGCTAGCATCGTGTCGCCCGTCTAGCATACGCATGATCGACTGTCAGCTAGTCAGACTAGTCGATCGATGTG
00048 // READ:        gggGTGTAACC-GACTAGgggg
00049 // where on the read sequence, bases in uppercase are matches and bases in lowercase are clipped off. The CIGAR for
00050 // this alignment is: 3S8M1D6M4S.
00051 //
00052 //
00053 // If the mapping position of the query is not available, RNAME and
00054 // CIGAR are set as “*”
00055 //
00056 // A CIGAR string is comprised of a series of operation lengths plus the operations. The conventional CIGAR format allows
00057 // for three types of operations: M for match or mismatch, I for insertion and D for deletion. The extended CIGAR format
00058 // further allows four more operations, as is shown in the following table, to describe clipping, padding and splicing:
00059 //
00060 // op   Description
00061 // --   -----------
00062 // M    Match or mismatch
00063 // I    Insertion to the reference
00064 // D    Deletion from the reference
00065 // N    Skipped region from the reference
00066 // S    Soft clip on the read (clipped sequence present in <seq>)
00067 // H    Hard clip on the read (clipped sequence NOT present in <seq>)
00068 // P    Padding (silent deletion from the padded reference sequence)
00069 //
00070 
00071 
00072 
00073 
00074 ////////////////////////////////////////////////////////////////////////
00075 ///
00076 /// This class represents the CIGAR.  It contains methods for converting
00077 /// to strings and extracting information from the cigar on how a read
00078 /// maps to the reference.
00079 ///
00080 /// It only contains read only methods.  There are no ways to set
00081 /// values.  To set a value, a child class must be used.
00082 ///
00083 class Cigar
00084 {
00085 public:
00086     /// Enum for the cigar operations.
00087     enum Operation {
00088         none=0, ///< no operation has been set.
00089         match, ///< match/mismatch operation.  Associated with CIGAR Operation "M"
00090         mismatch, ///< mismatch operation.  Associated with CIGAR Operation "M"
00091         insert,  ///< insertion to the reference (the query sequence contains bases that have no corresponding base in the reference).  Associated with CIGAR Operation "I"
00092         del,  ///< deletion from the reference (the reference contains bases that have no corresponding base in the query sequence).  Associated with CIGAR Operation "D"
00093         skip,  ///< skipped region from the reference (the reference contains bases that have no corresponding base in the query sequence).  Associated with CIGAR Operation "N"
00094         softClip,  ///< Soft clip on the read (clipped sequence present in the query sequence, but not in reference).  Associated with CIGAR Operation "S"
00095         hardClip,  ///< Hard clip on the read (clipped sequence not present in the query sequence or reference).  Associated with CIGAR Operation "H"
00096         pad ///< Padding (not in reference or query).  Associated with CIGAR Operation "P"
00097     };
00098 
00099     // The maximum value in the operation enum (used for setting up a bitset of
00100     // operations.
00101     static const int MAX_OP_VALUE = pad;
00102 
00103     ////////////////////////////////////////////////////////////////////////
00104     //
00105     // Nested Struct : CigarOperator
00106     //
00107     struct CigarOperator
00108     {
00109 
00110         CigarOperator()
00111         {
00112             operation = none;
00113             count = 0;
00114         }
00115 
00116         /// Set the cigar operator with the specified operation and
00117         /// count length.
00118         CigarOperator(Operation operation, uint32_t count)
00119             : operation(operation), count(count) {};
00120 
00121         Operation operation;
00122 
00123         uint32_t count;
00124 
00125         /// Get the character code (M, I, D, N, S, H, or P) associated with
00126         /// this operation.
00127         char getChar() const
00128         {
00129             switch (operation)
00130             {
00131                 case none:
00132                     return '?';  // error
00133                 case match:
00134                 case mismatch:
00135                     return'M';
00136                 case insert:
00137                     return 'I';
00138                 case del:
00139                     return'D';
00140                 case skip:
00141                     return 'N';
00142                 case softClip:
00143                     return 'S';
00144                 case hardClip:
00145                     return 'H';
00146                 case pad:
00147                     return 'P';
00148             }
00149             return '?'; // actually it is an error to get here
00150         }
00151 
00152         /// Compare only on the operator, true if they are the same, false if not.  Match and mismatch are considered the same for CIGAR strings.
00153         bool operator == (const CigarOperator &rhs) const
00154         {
00155             if (operation==rhs.operation)
00156                 return true;
00157             if ((operation == mismatch || operation == match) && (rhs.operation == mismatch || rhs.operation == match))
00158                 return true;
00159             return false;
00160         }
00161 
00162         /// Compare only on the operator, false if they are the same, true if not.  Match and mismatch are considered the same for CIGAR strings.
00163         bool operator != (const CigarOperator &rhs) const
00164         {
00165             return !((*this) == rhs) ;
00166         }
00167 
00168     };
00169 
00170     ////////////////////////////////////////////////////////////////////////
00171     //
00172     // Cigar  Class statics
00173     //
00174 
00175     /// Return true if the specified operation is found in the
00176     /// reference sequence, false if not.
00177     static bool foundInReference(Operation op)
00178     {
00179         switch(op)
00180         {
00181             case match:
00182             case mismatch:
00183             case del:
00184             case skip:
00185                 return true;
00186             default:
00187                 return false;
00188         }
00189         return false;
00190     }
00191 
00192     /// Return true if the specified operation is found in the
00193     /// reference sequence, false if not.
00194     static bool foundInReference(char op)
00195     {
00196         switch(op)
00197         {
00198             case 'M':
00199             case '=':
00200             case 'X':
00201             case 'D':
00202             case 'N':
00203                 return true;
00204             default:
00205                 return false;
00206         }
00207         return false;
00208     }
00209 
00210     /// Return true if the specified operation is found in the
00211     /// reference sequence, false if not.
00212     static bool foundInReference(const CigarOperator &op)
00213     {
00214         return(foundInReference(op.operation));
00215     }
00216 
00217     /// Return true if the specified operation is found in the
00218     /// query sequence, false if not.
00219     static bool foundInQuery(Operation op)
00220     {
00221         switch(op)
00222         {
00223             case match:
00224             case mismatch:
00225             case insert:
00226             case softClip:
00227                 return true;
00228             default:
00229                 return false;
00230         }
00231         return false;
00232     }
00233     
00234     /// Return true if the specified operation is found in the
00235     /// query sequence, false if not.
00236     static bool foundInQuery(char op)
00237     {
00238         switch(op)
00239         {
00240             case 'M':
00241             case '=':
00242             case 'X':
00243             case 'I':
00244             case 'S':
00245                 return true;
00246             default:
00247                 return false;
00248         }
00249         return false;
00250     }
00251 
00252     /// Return true if the specified operation is found in the
00253     /// query sequence, false if not.
00254     static bool foundInQuery(const CigarOperator &op)
00255     {
00256         return(foundInQuery(op.operation));
00257     }
00258     
00259     /// Return true if the specified operation is a clipping operation,
00260     /// false if not.
00261     static bool isClip(Operation op)
00262     {
00263         switch(op)
00264         {
00265             case softClip:
00266             case hardClip:
00267                 return true;
00268             default:
00269                 return false;
00270         }
00271         return false;
00272     }
00273 
00274     /// Return true if the specified operation is a clipping operation,
00275     /// false if not.
00276     static bool isClip(char op)
00277     {
00278         switch(op)
00279         {
00280             case 'S':
00281             case 'H':
00282                 return true;
00283             default:
00284                 return false;
00285         }
00286         return false;
00287     }
00288 
00289     /// Return true if the specified operation is a clipping operation,
00290     /// false if not.
00291     static bool isClip(const CigarOperator &op)
00292     {
00293         return(isClip(op.operation));
00294     }
00295 
00296     /// Return true if the specified operation is a match/mismatch operation,
00297     /// false if not.
00298     static bool isMatchOrMismatch(Operation op)
00299     {
00300         switch(op)
00301         {
00302             case match:
00303             case mismatch:
00304                 return true;
00305             default:
00306                 return false;
00307         }
00308         return false;
00309     }
00310     
00311     /// Return true if the specified operation is a match/mismatch operation,
00312     /// false if not.
00313     static bool isMatchOrMismatch(const CigarOperator &op)
00314     {
00315         return(isMatchOrMismatch(op.operation));
00316     }
00317 
00318 
00319     ////////////////////////////////////////////////////////////////////////
00320     //
00321     // Cigar  Class non static
00322     //
00323     friend std::ostream &operator << (std::ostream &stream, const Cigar& cigar);
00324 
00325     /// Default constructor initializes as a CIGAR with no operations.
00326     Cigar()
00327     {
00328         clearQueryAndReferenceIndexes();
00329     }
00330 
00331 
00332     /// Set the passed in String to the string reprentation of the Cigar
00333     /// operations in this object.
00334     void getCigarString(String& cigarString) const;
00335 
00336     /// Set the passed in std::string to the string reprentation of the Cigar
00337     /// operations in this object.
00338     void getCigarString(std::string& cigarString) const;
00339 
00340     /// Sets the specified string to a valid CIGAR string of characters that
00341     /// represent the cigar with no digits (a CIGAR of "3M" would return "MMM").
00342     /// The returned string is actually also a valid CIGAR string.
00343     /// In theory this makes it easier to parse some reads.
00344     /// \return s the string to populate
00345     void getExpandedString(std::string &s) const;
00346 
00347     /// Return the Cigar Operation at the specified index (starting at 0).
00348     const CigarOperator & operator [](int i) const
00349     {
00350         return cigarOperations[i];
00351     }
00352 
00353     /// Return the Cigar Operation at the specified index (starting at 0).
00354     const CigarOperator & getOperator(int i) const
00355     {
00356         return cigarOperations[i];
00357     }
00358 
00359     /// Return true if the 2 Cigars are the same
00360     /// (the same operations of the same sizes).
00361     bool operator == (Cigar &rhs) const;
00362 
00363     /// Return the number of cigar operations
00364     int size()  const
00365     {
00366         return cigarOperations.size();
00367     }
00368 
00369     /// Write this object as a string to cout.
00370     void Dump() const
00371     {
00372         String cigarString;
00373         getCigarString(cigarString);
00374         std::cout << cigarString ;
00375     }
00376 
00377     /// Return the length of the read that corresponds to
00378     /// the current CIGAR string.
00379     ///
00380     /// For validation, we should expect that a sequence
00381     /// read in a SAM file will be the same length as the
00382     /// value returned by this method.
00383     ///
00384     /// Example: 3M2D3M describes a read with three bases
00385     /// matching the reference, then skips 2 bases, then has
00386     /// three more bases that match the reference (match/mismatch).
00387     /// In this case, the read length is expected to be 6.
00388     ///
00389     /// Example: 3M2I3M describes a read with 3 match/mismatch
00390     /// bases, two extra bases, and then 3 more match/mistmatch
00391     /// bases.  The total in this example is 8 bases.
00392     ///
00393     /// \return returns the expected read length
00394     int getExpectedQueryBaseCount() const;
00395 
00396     /// Return the number of bases in the reference that
00397     /// this CIGAR "spans".
00398     ///
00399     /// When doing range checking, we occassionally need to know
00400     /// how many total bases the CIGAR string represents as compared
00401     /// to the reference.
00402     ///
00403     /// Examples: 3M2D3M describes a read that overlays 8 bases in
00404     /// the reference.  3M2I3M describes a read with 3 bases that
00405     /// match the reference, two additional bases that aren't in the
00406     /// reference, and 3 more bases that match the reference, so it
00407     /// spans 6 bases in the reference.
00408     ///
00409     /// \return how many bases in the reference are spanned
00410     /// by the given CIGAR string
00411     ///
00412     int getExpectedReferenceBaseCount() const;
00413 
00414     /// Return the number of clips that are at the beginning of the cigar.
00415     int getNumBeginClips() const;
00416 
00417     /// Return the number of clips that are at the end of the cigar.
00418     int getNumEndClips() const;
00419 
00420     /// Return the reference offset associated with the specified
00421     /// query index or INDEX_NA based on this cigar.
00422     int32_t getRefOffset(int32_t queryIndex);
00423 
00424     /// Return the query index associated with the specified
00425     /// reference offset or INDEX_NA based on this cigar.
00426     int32_t getQueryIndex(int32_t refOffset);
00427 
00428     /// Return the reference position associated with the specified query index
00429     /// or INDEX_NA based on this cigar and the specified queryStartPos which
00430     /// is the leftmost mapping position of the first matching base in the
00431     /// query.
00432     int32_t getRefPosition(int32_t queryIndex, int32_t queryStartPos);
00433 
00434     /// Return the query index or INDEX_NA associated with the specified 
00435     /// reference offset when the query starts at the specified reference
00436     /// position.
00437     int32_t getQueryIndex(int32_t refPosition, int32_t queryStartPos);
00438 
00439     /// Returns the index into the expanded cigar for the cigar 
00440     /// associated with the specified queryIndex.
00441     /// INDEX_NA returned if the index is out of range.
00442     int32_t getExpandedCigarIndexFromQueryIndex(int32_t queryIndex);
00443 
00444     /// Returns the index into the expanded cigar for the cigar 
00445     /// associated with the specified reference offset.
00446     /// INDEX_NA returned if the offset is out of range.
00447     int32_t getExpandedCigarIndexFromRefOffset(int32_t refOffset);
00448 
00449     /// Returns the index into the expanded cigar for the cigar 
00450     /// associated with the specified reference position and queryStartPos.
00451     /// INDEX_NA returned if the position is out of range.
00452     int32_t getExpandedCigarIndexFromRefPos(int32_t refPosition, 
00453                                             int32_t queryStartPos);
00454 
00455     /// Return the character code of the cigar operator associated with the
00456     /// specified expanded CIGAR index.  '?' is returned for an out of range
00457     /// index.
00458     char getCigarCharOp(int32_t expandedCigarIndex);
00459 
00460     /// Return the character code of the cigar operator associated with
00461     /// the specified queryIndex.  '?' is returned for an out of range index.
00462     char getCigarCharOpFromQueryIndex(int32_t queryIndex);
00463 
00464     /// Return the character code of the cigar operator associated with
00465     /// the specified reference offset.  '?' is returned for an out of range offset.
00466     char getCigarCharOpFromRefOffset(int32_t refOffset);
00467 
00468     /// Return the character code of the cigar operator associated with
00469     /// the specified reference position.  '?' is returned for an out of
00470     /// range reference position.
00471     char getCigarCharOpFromRefPos(int32_t refPosition, int32_t queryStartPos);
00472 
00473     /// Return the number of bases that overlap the reference and the
00474     /// read associated with this cigar that falls within the specified region.
00475     /// \param start : inclusive 0-based start position (reference position) of
00476     ///         the region to check for overlaps in
00477     ///         (-1 indicates to start at the beginning of the reference.)
00478     /// \param end  : exclusive 0-based end position (reference position) of the
00479     ///          region to check for overlaps in
00480     ///         (-1 indicates to go to the end of the reference.)
00481     /// \param queryStartPos : 0-based leftmost mapping position of the first 
00482     ///                 matcihng base in the query.
00483     uint32_t getNumOverlaps(int32_t start, int32_t end, int32_t queryStartPos);
00484 
00485     /// Return whether or not the cigar has indels (insertions or delections)
00486     /// \return true if it has an insertion or deletion, false if not.
00487     bool hasIndel();
00488 
00489     /// Value associated with an index that is not applicable/does not exist,
00490     /// used for converting between query and reference indexes/offsets when
00491     /// an associated index/offset does not exist.
00492     static const int32_t INDEX_NA;
00493 
00494 protected:
00495     // Clear the query index/reference offset index vectors.
00496     void clearQueryAndReferenceIndexes();
00497 
00498     // Set the query index/reference offset index vectors.
00499     void setQueryAndReferenceIndexes();
00500 
00501     // Container for the cigar operations in this cigar.
00502     std::vector<CigarOperator> cigarOperations;
00503 
00504 private:
00505     // The vector is indexed by query index and contains the reference
00506     // offset associated with that query index.
00507     // The vector is reset each time a new cigar operation is added, and
00508     // is calculated when accessed if it is not already set.
00509     std::vector<int32_t> queryToRef;
00510 
00511     // The vector is indexed by reference offset and contains the query
00512     // index associated with that reference offset.
00513     // The vector is reset each time a new cigar operation is added, and
00514     // is calculated when accessed if it is not already set.
00515     std::vector<int32_t> refToQuery;
00516 
00517     // The vector is indexed by reference offset and contains the offset into
00518     // the expanded cigar associated with that reference offset.
00519     // The vector is reset each time a new cigar operation is added, and
00520     // is calculated when accessed if it is not already set.
00521     std::vector<int32_t> refToCigar;
00522 
00523     // The vector is indexed by query index and contains the offset into
00524     // the expanded cigar associated with that query index.
00525     // The vector is reset each time a new cigar operation is added, and
00526     // is calculated when accessed if it is not already set.
00527     std::vector<int32_t> queryToCigar;
00528 
00529     std::string myExpandedCigar;
00530 };
00531 
00532 /// Writes the specified cigar operation to the specified stream as <count><char> (3M).
00533 inline std::ostream &operator << (std::ostream &stream, const Cigar::CigarOperator& o)
00534 {
00535     stream << o.count << o.getChar();
00536     return stream;
00537 }
00538 
00539 /// Writes all of the cigar operations contained in the cigar to the passed in stream.
00540 inline std::ostream &operator << (std::ostream &stream, const Cigar& cigar)
00541 {
00542     stream << cigar.cigarOperations;
00543     return stream;
00544 }
00545 
00546 #endif
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends