Cigar.h

00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #if !defined(_CIGAR_H)
00019 #define _CIGAR_H
00020 
00021 #include <string.h> // for inline use of strcat, etc
00022 #include <limits.h> // for INT_MAX
00023 #include <stdint.h> // for uint32_t and friends
00024 
00025 
00026 #include <assert.h>
00027 #include <stdio.h>
00028 #include <stdlib.h>
00029 #include <string>
00030 #include <iostream>
00031 #include <iomanip>
00032 #include <utility>
00033 #include <vector>
00034 
00035 #include "Generic.h"
00036 #include "StringBasics.h"
00037 
00038 //
00039 // Docs from Sam1.pdf:
00040 //
00041 // Clipped alignment. In Smith-Waterman alignment, a sequence may not be aligned from the first residue to the last one.
00042 // Subsequences at the ends may be clipped off. We introduce operation ʻSʼ to describe (softly) clipped alignment. Here is
00043 // an example. Suppose the clipped alignment is:
00044 // REF:  AGCTAGCATCGTGTCGCCCGTCTAGCATACGCATGATCGACTGTCAGCTAGTCAGACTAGTCGATCGATGTG
00045 // READ:        gggGTGTAACC-GACTAGgggg
00046 // where on the read sequence, bases in uppercase are matches and bases in lowercase are clipped off. The CIGAR for
00047 // this alignment is: 3S8M1D6M4S.
00048 //
00049 //
00050 // If the mapping position of the query is not available, RNAME and
00051 // CIGAR are set as “*”
00052 //
00053 // A CIGAR string is comprised of a series of operation lengths plus the operations. The conventional CIGAR format allows
00054 // for three types of operations: M for match or mismatch, I for insertion and D for deletion. The extended CIGAR format
00055 // further allows four more operations, as is shown in the following table, to describe clipping, padding and splicing:
00056 //
00057 // op   Description
00058 // --   -----------
00059 // M    Match or mismatch
00060 // I    Insertion to the reference
00061 // D    Deletion from the reference
00062 // N    Skipped region from the reference
00063 // S    Soft clip on the read (clipped sequence present in <seq>)
00064 // H    Hard clip on the read (clipped sequence NOT present in <seq>)
00065 // P    Padding (silent deletion from the padded reference sequence)
00066 //
00067 //
00068 
00069 
00070 
00071 ////////////////////////////////////////////////////////////////////////
00072 //
00073 // This class represents the CIGAR.  It contains methods for converting
00074 // to strings and extracting information from the cigar on how a read
00075 // maps to the reference.
00076 //
00077 // It only contains read only methods.  There are no ways to set
00078 // values.  To set a value, a child class must be used.
00079 //
00080 class Cigar
00081 {
00082 public:
00083     enum Operation {none, match, mismatch, insert, del, skip, softClip, hardClip, pad};
00084 
00085     // Return true if the specified operation is found in the
00086     // query sequence, false if not.
00087     static bool foundInQuery(Operation op)
00088     {
00089         switch(op)
00090         {
00091             case match:
00092             case mismatch:
00093             case insert:
00094             case softClip:
00095                 return true;
00096             default:
00097                 return false;
00098         }
00099         return false;
00100     }
00101     
00102     // Return true if the specified operation is a clipping operation,
00103     // false if not.
00104     static bool isClip(Operation op)
00105     {
00106         switch(op)
00107         {
00108             case softClip:
00109             case hardClip:
00110                 return true;
00111             default:
00112                 return false;
00113         }
00114         return false;
00115     }
00116     
00117     ////////////////////////////////////////////////////////////////////////
00118     //
00119     // Nested Struct : CigarOperator
00120     //
00121     struct CigarOperator
00122     {
00123 
00124         CigarOperator()
00125         {
00126             operation = none;
00127             count = 0;
00128         }
00129 
00130         CigarOperator(Operation operation, uint32_t count): operation(operation), count(count) {};
00131 
00132         Operation operation;
00133 
00134         uint32_t count;
00135 
00136         // Get the character associated with this operation.
00137         char getChar() const
00138         {
00139             switch (operation)
00140             {
00141                 case none:
00142                     return '?';  // error
00143                 case match:
00144                 case mismatch:
00145                     return'M';
00146                 case insert:
00147                     return 'I';
00148                 case del:
00149                     return'D';
00150                 case skip:
00151                     return 'N';
00152                 case softClip:
00153                     return 'S';
00154                 case hardClip:
00155                     return 'H';
00156                 case pad:
00157                     return 'P';
00158             }
00159             return '?'; // actually it is an error to get here
00160         }
00161 
00162         //
00163         // Compare only on the operator.
00164         //
00165         // Match and mismatch are considered the same for CIGAR strings.
00166         //
00167         bool operator == (const CigarOperator &rhs) const
00168         {
00169             if (operation==rhs.operation)
00170                 return true;
00171             if ((operation == mismatch || operation == match) && (rhs.operation == mismatch || rhs.operation == match))
00172                 return true;
00173             return false;
00174         }
00175 
00176         bool operator != (const CigarOperator &rhs) const
00177         {
00178             return !((*this) == rhs) ;
00179         }
00180 
00181     };
00182 
00183     ////////////////////////////////////////////////////////////////////////
00184     //
00185     // Cigar  Class
00186     //
00187     friend std::ostream &operator << (std::ostream &stream, const Cigar& cigar);
00188 
00189     Cigar()
00190     {
00191         clearQueryAndReferenceIndexes();
00192     }
00193 
00194     //
00195     // Set the passed in string to the string reprentation of the Cigar
00196     // operations in this object.
00197     //
00198     void getCigarString(String& cigarString) const;
00199     void getCigarString(std::string& cigarString) const;
00200 
00201     /// obtain a non-run length encoded string of operations
00202     ///
00203     /// The returned string is actually also a valid CIGAR string,
00204     /// but it does not have any digits in it - only the characters
00205     /// themselves.  In theory this makes it easier to parse some
00206     /// reads.
00207     ///
00208     /// /return s the string to populate
00209     void getExpandedString(std::string &s) const;
00210 
00211     const CigarOperator & operator [](int i) const
00212     {
00213         return cigarOperations[i];
00214     }
00215 
00216     const CigarOperator & getOperator(int i) const
00217     {
00218         return cigarOperations[i];
00219     }
00220 
00221     bool operator == (Cigar &rhs) const;
00222 
00223     //
00224     // get the number of cigar operations
00225     //
00226     int size()  const
00227     {
00228         return cigarOperations.size();
00229     }
00230 
00231     void Dump() const
00232     {
00233         String cigarString;
00234         getCigarString(cigarString);
00235         std::cout << cigarString ;
00236     }
00237 
00238     /// return the length of the read that corresponds to
00239     /// the current CIGAR string.
00240     ///
00241     /// For validation, we should expect that a sequence
00242     /// read in a SAM file will be the same length as the
00243     /// value returned by this method.
00244     ///
00245     /// Example: 3M2D3M describes a read with three bases
00246     /// matching the reference, then skips 2 bases, then has
00247     /// three more bases that match the reference (match/mismatch).
00248     /// In this case, the read length is expected to be 6.
00249     ///
00250     /// Example: 3M2I3M describes a read with 3 match/mismatch
00251     /// bases, two extra bases, and then 3 more match/mistmatch
00252     /// bases.  The total in this example is 8 bases.
00253     ///
00254     /// /return returns the expected read length
00255     int getExpectedQueryBaseCount() const;
00256 
00257     /// return the number of bases in the reference that
00258     /// this read "spans"
00259     ///
00260     /// When doing range checking, we occassionally need to know
00261     /// how many total bases the CIGAR string represents as compared
00262     /// to the reference.
00263     ///
00264     /// Examples: 3M2D3M describes a read that overlays 8 bases in
00265     /// the reference.  3M2I3M describes a read with 3 bases that
00266     /// match the reference, two additional bases that aren't in the
00267     /// reference, and 3 more bases that match the reference, so it
00268     /// spans 6 bases in the reference.
00269     ///
00270     /// /return how many bases in the reference are spanned
00271     /// by the given CIGAR string
00272     ///
00273     int getExpectedReferenceBaseCount() const;
00274 
00275     /// Return the number of clips that are at the beginning of the cigar.
00276     int getNumBeginClips() const;
00277 
00278     /// Return the number of clips that are at the end of the cigar.
00279     int getNumEndClips() const;
00280 
00281     // Return the reference offset associated with the specified
00282     // query index based on this cigar.
00283     int32_t getRefOffset(int32_t queryIndex);
00284 
00285     // Return the query index associated with the specified
00286     // reference offset based on this cigar.
00287     int32_t getQueryIndex(int32_t refOffset);
00288 
00289     // Return the reference position associated with the specified query
00290     // index based on this cigar and the specified queryStartPos which
00291     // is the leftmost mapping position of the first matching
00292     // base in the query.
00293     int32_t getRefPosition(int32_t queryIndex, int32_t queryStartPos);
00294 
00295     // Return the query index associated with the specified reference position
00296     // when the query starts at the specified reference position based on
00297     // this cigar.
00298     int32_t getQueryIndex(int32_t refPosition, int32_t queryStartPos);
00299 
00300     // Return the number of bases that overlap the reference and the
00301     // read associated with this cigar that falls within the specified region.
00302     // start : inclusive 0-based start position (reference position) of the
00303     //         region to check for overlaps in.
00304     //         (-1 indicates to start at the beginning of the reference.)
00305     // end   : exclusive 0-based end position (reference position) of the
00306     //          region to check for overlaps in.
00307     //         (-1 indicates to go to the end of the reference.)
00308     // queryStartPos : 0-based leftmost mapping position of the first matching
00309     //                 base in the query.
00310     uint32_t getNumOverlaps(int32_t start, int32_t end, int32_t queryStartPos);
00311 
00312     static const int32_t INDEX_NA;
00313 
00314 protected:
00315     // Clear the query index/reference offset index vectors.
00316     void clearQueryAndReferenceIndexes();
00317 
00318     // Set the query index/reference offset index vectors.
00319     void setQueryAndReferenceIndexes();
00320 
00321     // Container for the cigar operations in this cigar.
00322     std::vector<CigarOperator> cigarOperations;
00323 
00324 private:
00325     // The vector is indexed by query index and contains the reference
00326     // offset associated with that query index.
00327     // The vector is reset each time a new cigar operation is added, and
00328     // is calculated when accessed if it is not already set.
00329     std::vector<int32_t> queryToRef;
00330 
00331     // The vector is indexed by reference offset and contains the query
00332     // index associated with that reference offset.
00333     // The vector is reset each time a new cigar operation is added, and
00334     // is calculated when accessed if it is not already set.
00335     std::vector<int32_t> refToQuery;
00336 };
00337 
00338 inline std::ostream &operator << (std::ostream &stream, const Cigar::CigarOperator& o)
00339 {
00340     stream << o.count << o.getChar();
00341     return stream;
00342 }
00343 
00344 inline std::ostream &operator << (std::ostream &stream, const Cigar& cigar)
00345 {
00346     stream << cigar.cigarOperations;
00347     return stream;
00348 }
00349 
00350 #endif
Generated on Wed Nov 17 15:38:28 2010 for StatGen Software by  doxygen 1.6.3