SamRecord.h

00001 /*
00002  *  Copyright (C) 2010-2011  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #ifndef __SAM_RECORD_H__
00019 #define __SAM_RECORD_H__
00020 
00021 #include <stdint.h>
00022 
00023 #include "GenomeSequence.h"
00024 #include "SamStatus.h"
00025 #include "LongHash.h"
00026 #include "MathVector.h"
00027 #include "StringArray.h"
00028 #include "IntArray.h"
00029 #include "SamFileHeader.h"
00030 #include "CigarRoller.h"
00031 
00032 /// Structure of a BAM record: the fixed-size fields, followed by myData.
00033 struct bamRecordStruct
00034 {
00035 public:
00036     int32_t      myBlockSize;              // BAM block size of the record
00037     int32_t      myReferenceID;            // Reference sequence id
00038     int32_t      myPosition;               // Leftmost position (presumably 0-based as in BAM - confirm)
00039     uint32_t     myReadNameLength : 8, myMapQuality : 8, myBin : 16; // Bit-fields of 8, 8 and 16 bits declared on uint32_t
00040     uint32_t     myCigarLength : 16, myFlag : 16; // Two 16-bit bit-fields declared on uint32_t
00041     int32_t      myReadLength;             // Length of the read
00042     int32_t      myMateReferenceID;        // Mate reference sequence id
00043     int32_t      myMatePosition;           // Leftmost mate position (presumably 0-based as in BAM - confirm)
00044     int32_t      myInsertSize;             // Inferred insert size (outer fragment length)
00045     char  myData[1];                       // NOTE(review): presumably the first byte of the variable length data - confirm
00046 };
00047 
00048 
00049 /// Class providing an easy to use interface to get/set/operate on the
00050 /// fields in a SAM/BAM record. 
00051 class SamRecord
00052 {
00053 public:
00054     /// Enum containing the settings on how to translate the sequence if a
00055     /// reference is available.  If no reference is available, no translation
00056     /// is done.
00057     enum SequenceTranslation { 
00058         NONE,   ///< Leave the sequence as is.
00059         EQUAL,  ///< Translate bases that match the reference to '='
00060         BASES,  ///< Translate '=' to the actual base.
00061     };
00062 
00063     /// Default Constructor.
00064     SamRecord();
00065 
00066     /// Constructor that sets the error handling type.
00067     /// \param errorHandlingType how to handle errors.
00068     SamRecord(ErrorHandler::HandlingType errorHandlingType);
00069 
00070     /// Destructor
00071     ~SamRecord();
00072 
00073     /// Reset the fields of the record to a default value.
00074     /// This is not necessary when you are reading a Sam/Bam file, 
00075     /// but if you are setting fields, it is a good idea to clean
00076     /// out a record before reusing it. Clearing it allows you to 
00077     /// not have to set any empty fields. 
00078     void resetRecord();
00079 
00080     /// Reset the tag iterator to the beginning of the tags.
00081     void resetTagIter();
00082  
00083     /// Returns whether or not the record is valid.
00084     /// Sets the status to indicate success or failure.
00085     /// \param header SAM Header associated with the record.  Used to perform
00086     /// some validation against the header.
00087     /// \return true if the record is valid, false if not.
00088     bool isValid(SamFileHeader& header);
00089 
00090     /// Read the BAM record from a file.
00091     /// \param filePtr file to read the buffer from.
00092     /// \param header BAM header for the record.
00093     /// \return status of the reading the BAM record from the file.
00094     SamStatus::Status setBufferFromFile(IFILE filePtr, SamFileHeader& header);
00095 
00096     /// Set the reference to the specified genome sequence object.
00097     /// \param reference pointer to the GenomeSequence object.
00098     void setReference(GenomeSequence* reference);
00099 
00100     /// Set the type of sequence translation to use when getting
00101     /// the sequence.  The default type (if this method is never called) is
00102     /// NONE (the sequence is left as-is).  Can be over-ridden by using 
00103     /// the accessors that take a SequenceTranslation parameter.
00104     /// \param translation type of sequence translation to use.
00105     void setSequenceTranslation(SequenceTranslation translation);
00106 
00107     ///////////////////////
00108     /// @name  Set Alignment Data
00109     /// Set methods for record fields.  All of the "set" methods set the
00110     /// status to indicate success or the failure reason.
00111     //@{
00112 
00113     /// Set QNAME to the passed in name.
00114     /// \param readName the readname to set the QNAME to.
00115     /// \return true if successfully set, false if not.
00116     bool setReadName(const char* readName);
00117 
00118     /// Set the bitwise flag to the specified value.
00119     /// \param flag integer flag to use.
00120     /// \return true if successfully set, false if not.
00121     bool setFlag(uint16_t flag);
00122     
00123     /// Set the reference name to the specified name, using the header to
00124     /// determine the reference id.
00125     /// \param header SAM/BAM header to use to determine the reference id.
00126     /// \param referenceName reference name to use.
00127     /// \return true if successfully set, false if not
00128     bool setReferenceName(SamFileHeader& header, 
00129                           const char* referenceName);
00130 
00131     /// Set the leftmost position using the specified 1-based (SAM format)
00132     /// value.
00133     /// Internal processing handles the switching between SAM/BAM formats 
00134     /// when read/written.
00135     /// \param position 1-based start position
00136     /// \return true if successfully set, false if not.
00137     bool set1BasedPosition(int32_t position);
00138 
00139     /// Set the leftmost position using the specified 0-based (BAM format)
00140     /// value.
00141     /// Internal processing handles the switching between SAM/BAM formats 
00142     /// when read/written.
00143     /// \param position 0-based start position
00144     /// \return true if successfully set, false if not.
00145     bool set0BasedPosition(int32_t position);
00146 
00147     /// Set the mapping quality.
00148     /// \param mapQuality map quality to set in the record.
00149     /// \return true if successfully set, false if not.
00150     bool setMapQuality(uint8_t mapQuality);
00151 
00152     /// Set the CIGAR to the specified SAM formatted cigar string.
00153     /// Internal processing handles the switching between SAM/BAM formats 
00154     /// when read/written.
00155     /// \param cigar string containing the SAM formatted cigar.
00156     /// \return true if successfully set, false if not.
00157     bool setCigar(const char* cigar);
00158 
00159     /// Set the CIGAR to the specified Cigar object.
00160     /// Internal processing handles the switching between SAM/BAM formats 
00161     /// when read/written.
00162     /// \param cigar object to set this record's cigar to have.
00163     /// \return true if successfully set, false if not.
00164     bool setCigar(const Cigar& cigar);
00165 
00166 
00167     /// Set the mate reference sequence name to the specified name, using the
00168     /// header to determine the mate reference id.
00169     /// \param header SAM/BAM header to use to determine the mate reference id.
00170     /// \param mateReferenceName mate reference name to use.
00171     /// \return true if successfully set, false if not
00172     bool setMateReferenceName(SamFileHeader& header,
00173                               const char* mateReferenceName);
00174 
00175     /// Set the leftmost mate position using the specified 1-based (SAM format)
00176     /// value.
00177     /// Internal processing handles the switching between SAM/BAM formats 
00178     /// when read/written.
00179     /// \param matePosition 1-based mate start position
00180     /// \return true if successfully set, false if not.
00181     bool set1BasedMatePosition(int32_t matePosition);
00182 
00183     /// Set the leftmost mate position using the specified 0-based (BAM format)
00184     /// value.
00185     /// Internal processing handles the switching between SAM/BAM formats 
00186     /// when read/written.
00187     /// \param matePosition 0-based mate start position
00188     /// \return true if successfully set, false if not.
00189     bool set0BasedMatePosition(int32_t matePosition);
00190 
00191     /// Sets the inferred insert size.
00192     /// \param insertSize inferred insert size.
00193     /// \return true if successfully set, false if not.
00194     bool setInsertSize(int32_t insertSize);
00195 
00196     /// Sets the sequence to the specified sequence string.  This is a 
00197     /// SAM formatted sequence string.  Internal processing handles switching
00198     /// between SAM/BAM formats when read/written.
00199     /// \param seq SAM sequence string.  May contain '='.
00200     /// \return true if successfully set, false if not.
00201     bool setSequence(const char* seq);
00202 
00203     /// Sets the quality to the specified quality string.  This is a SAM 
00204     /// formatted quality string.  Internal processing handles switching 
00205     /// between SAM/BAM formats when read/written.
00206     /// \param quality SAM quality string.
00207     /// \return true if successfully set, false if not.
00208     bool setQuality(const char* quality);
00209 
00210     /// Sets the SamRecord to contain the information in BAM format
00211     /// found in fromBuffer.
00212     /// \param fromBuffer buffer to read the BAM record from.
00213     /// \param fromBufferSize size of the buffer containing the BAM record.
00214     /// \param header BAM header for the record.
00215     /// \return status of reading the BAM record from the buffer.
00216     SamStatus::Status setBuffer(const char* fromBuffer, uint32_t fromBufferSize,
00217                                 SamFileHeader& header);
00218 
00219     /// Add the specified integer tag to the record.  Internal processing
00220     /// handles switching between SAM/BAM formats when read/written and 
00221     /// determining the type for BAM format.  If the tag is already there
00222     /// this code will replace it if the specified value is different.
00223     /// \param tag two character tag to be added to the SAM/BAM record.
00224     /// \param value value for the specified tag.
00225     /// \return true if the tag was successfully added, false otherwise.
00226     bool addIntTag(const char* tag, int32_t value);
00227 
00228     /// Add the specified tag to the record.  Internal processing handles 
00229     /// switching between SAM/BAM formats when read/written.  If the tag
00230     /// is already there this code will replace it if the specified value
00231     /// is different.
00232     /// \param tag two character tag to be added to the SAM/BAM record.
00233     /// \param vtype vtype of the specified value - either SAM/BAM vtypes.
00234     /// \param value value as a string for the specified tag.
00235     /// \return true if the tag was successfully added, false otherwise.
00236     bool addTag(const char* tag, char vtype, const char* value);
00237 
00238     /// Shift the indels (if any) to the left by updating the CIGAR.
00239     /// \return true if the cigar was shifted, false if not.
00240     bool shiftIndelsLeft();
00241 
00242     //@}
00243 
00244     ///////////////////////
00245     /// @name  Get Alignment Data
00246     /// Get methods for record fields.  All of the "get" methods set the
00247     /// status to indicate success or the failure reason.
00248     //@{
00249 
00250     /// Get a const pointer to the buffer that contains the BAM representation
00251     /// of the record.
00252     /// \return const pointer to the buffer that contains the BAM representation
00253     /// of the record.
00254     const void* getRecordBuffer();
00255 
00256     /// Get a const pointer to the buffer that contains the BAM representation
00257     /// of the record.
00258     /// \param translation type of sequence translation to use.
00259     /// \return const pointer to the buffer that contains the BAM representation
00260     /// of the record.
00261     const void* getRecordBuffer(SequenceTranslation translation);
00262 
00263     /// Write the record as a BAM into the specified file.
00264     /// \param filePtr file to write the BAM record into.
00265     /// \return status of the write.
00266     SamStatus::Status writeRecordBuffer(IFILE filePtr);
00267 
00268     /// Write the record as a BAM into the specified file.
00269     /// \param filePtr file to write the BAM record into.
00270     /// \param translation type of sequence translation to use.
00271     /// \return status of the write.
00272     SamStatus::Status writeRecordBuffer(IFILE filePtr, 
00273                                         SequenceTranslation translation);
00274 
00275     /// Get the block size of the record.
00276     /// \return BAM block size of the record.
00277     int32_t getBlockSize();
00278 
00279     /// Get the reference sequence name of the record.
00280     /// \return reference sequence name
00281     const char* getReferenceName();
00282 
00283     /// Get the reference sequence id of the record.
00284     /// \return reference sequence id
00285     int32_t getReferenceID();
00286 
00287     /// Get the 1-based(SAM) leftmost position of the record.
00288     /// \return 1-based leftmost position.
00289     int32_t get1BasedPosition();
00290  
00291     /// Get the 0-based(BAM) leftmost position of the record.
00292     /// \return 0-based leftmost position.
00293    int32_t get0BasedPosition();
00294 
00295     /// Get the length of the readname (QNAME) including the null.
00296     /// \return length of the read name (including null).
00297     uint8_t getReadNameLength();
00298 
00299     /// Get the mapping quality of the record.
00300     /// \return map quality.
00301     uint8_t getMapQuality();
00302 
00303     /// Get the BAM bin for the record.
00304     /// \return BAM bin
00305     uint16_t getBin();
00306 
00307     /// Get the length of the CIGAR in BAM format.
00308     /// \return length of BAM formatted cigar.
00309     uint16_t getCigarLength();
00310 
00311     /// Get the flag.
00312     /// \return flag.
00313     uint16_t getFlag();
00314 
00315     /// Get the length of the read.
00316     /// \return read length.
00317     int32_t getReadLength();
00318 
00319     /// Get the mate reference sequence name of the record.  If it is equal to
00320     /// the reference name, it still returns the reference name.
00321     /// \return reference sequence name
00322     const char* getMateReferenceName();
00323 
00324     /// Get the mate reference sequence name of the record, returning "=" if
00325     /// it is the same as the reference name, unless they are both "*" in
00326     /// which case "*" is returned.
00327     /// \return reference sequence name
00328     const char* getMateReferenceNameOrEqual();
00329 
00330     /// Get the mate reference id of the record.
00331     /// \return reference id
00332     int32_t getMateReferenceID();
00333 
00334     /// Get the 1-based(SAM) leftmost mate position of the record.
00335     /// \return 1-based leftmost position.
00336     int32_t get1BasedMatePosition();
00337 
00338     /// Get the 0-based(BAM) leftmost mate position of the record.
00339     /// \return 0-based leftmost position.
00340     int32_t get0BasedMatePosition();
00341 
00342     /// Get the inferred insert size of the read pair.
00343     /// \return inferred insert size.
00344     int32_t getInsertSize();
00345 
00346     /// Returns the 0-based inclusive rightmost position of the
00347     /// clipped sequence.
00348     /// \return 0-based inclusive rightmost position
00349     int32_t get0BasedAlignmentEnd();
00350 
00351     /// Returns the 1-based inclusive rightmost position of the
00352     /// clipped sequence.
00353     /// \return 1-based inclusive rightmost position
00354     int32_t get1BasedAlignmentEnd();
00355    
00356     /// Returns the length of the clipped sequence, returning 0 if the cigar
00357     /// is '*'.
00358     /// \return length of the clipped sequence.
00359     int32_t getAlignmentLength();
00360 
00361     /// Returns the 0-based inclusive left-most position adjusted for
00362     /// clipped bases.
00363     /// \return 0-based inclusive leftmost position including clips.
00364     int32_t get0BasedUnclippedStart();
00365 
00366     /// Returns the 1-based inclusive left-most position adjusted for
00367     /// clipped bases.
00368     /// \return 1-based inclusive leftmost position including clips.
00369     int32_t get1BasedUnclippedStart();
00370 
00371     /// Returns the 0-based inclusive right-most position adjusted for
00372     /// clipped bases.
00373     /// \return 0-based inclusive rightmost position including clips.
00374     int32_t get0BasedUnclippedEnd();
00375  
00376     /// Returns the 1-based inclusive right-most position adjusted for
00377     /// clipped bases.
00378     /// \return 1-based inclusive rightmost position including clips.
00379     int32_t get1BasedUnclippedEnd();
00380 
00381     /// Returns the SAM formatted Read Name (QNAME).
00382     /// \return read name.
00383     const char* getReadName();
00384 
00385     /// Returns the SAM formatted CIGAR string.
00386     /// \return cigar string.
00387     const char* getCigar();
00388 
00389     /// Returns the SAM formatted sequence string, translating the base as
00390     /// specified by setSequenceTranslation.
00391     /// \return sequence string.
00392     const char* getSequence();
00393 
00394     /// Returns the SAM formatted sequence string performing the specified
00395     /// sequence translation.
00396     /// \param translation type of sequence translation to use.
00397     /// \return sequence string.
00398     const char* getSequence(SequenceTranslation translation);
00399 
00400     /// Returns the SAM formatted quality string.
00401     /// \return quality string.
00402     const char* getQuality();
00403 
00404     /// Get the sequence base at the specified index into this sequence 0 to
00405     /// readLength - 1, translating the base as specified by
00406     /// setSequenceTranslation.
00407     /// \param index index into the sequence string (0 to readLength-1).
00408     /// \return the sequence base at the specified index into the sequence.
00409     char getSequence(int index);
00410     
00411     /// Get the sequence base at the specified index into this sequence 0 to
00412     /// readLength - 1, performing the specified sequence translation.
00413     /// \param index index into the sequence string (0 to readLength-1).
00414     /// \param translation type of sequence translation to use.
00415     /// \return the sequence base at the specified index into the sequence.
00416     char getSequence(int index, SequenceTranslation translation);
00417     
00418     /// Get the quality character at the specified index into the quality 0 to
00419     /// readLength - 1.
00420     /// \param index index into the quality string (0 to readLength-1).
00421     /// \return the quality character at the specified index into the quality.
00422     char getQuality(int index);
00423    
00424     /// Returns a pointer to the Cigar object associated with this record.  
00425     /// The object is essentially read-only, only allowing modifications 
00426     /// due to lazy evaluations.
00427     /// \return pointer to the Cigar object.
00428     // TODO - want this to be getCigar
00429     Cigar* getCigarInfo();
00430 
00431     /// Returns the length of the tags in BAM format.
00432     /// \return length of tags in BAM format.
00433     uint32_t getTagLength();
00434 
00435     /// Get the next tag from the record.
00436     /// Sets the Status to SUCCESS when a tag is successfully returned or
00437     /// when there are no more tags.  Otherwise the status is set to describe
00438     /// why it failed (parsing, etc).
00439     /// \param tag set to the tag when a tag is read.
00440     /// \param vtype set to the vtype when a tag is read.
00441     /// \param value pointer to the value of the tag (will need to cast
00442     /// to int, double, char, or string based on vtype).
00443     /// \return true if a tag was read, false if there are no more tags.
00444     bool getNextSamTag(char* tag, char& vtype, void** value);
00445 
00446     /// Returns the values of all fields except the tags.
00447     /// \param recStruct structure containing the contents of all 
00448     /// non-variable length fields.
00449     /// \param readName read name from the record (return param)
00450     /// \param cigar cigar string from the record (return param)
00451     /// \param sequence sequence string from the record (return param)
00452     /// \param quality quality string from the record (return param)
00453     /// \return true if all fields were successfully set, false otherwise.
00454     bool getFields(bamRecordStruct& recStruct, String& readName, 
00455                    String& cigar, String& sequence, String& quality);
00456 
00457     /// Returns the values of all fields except the tags.
00458     /// \param recStruct structure containing the contents of all 
00459     /// non-variable length fields.
00460     /// \param readName read name from the record (return param)
00461     /// \param cigar cigar string from the record (return param)
00462     /// \param sequence sequence string from the record (return param)
00463     /// \param quality quality string from the record (return param)
00464     /// \param translation type of sequence translation to use.
00465     /// \return true if all fields were successfully set, false otherwise.
00466     bool getFields(bamRecordStruct& recStruct, String& readName, 
00467                    String& cigar, String& sequence, String& quality,
00468                    SequenceTranslation translation);
00469 
00470     /// Returns a pointer to the genome sequence object associated with this
00471     /// record if it was set (NULL if it was not set).
00472     /// \return pointer to the GenomeSequence object or NULL if there isn't one.
00473     GenomeSequence* getReference();
00474 
00475     //@}
00476 
00477     /// Returns whether or not the specified vtype is an integer type.
00478     /// Does not set SamStatus.
00479     /// \param vtype value type to check.
00480     /// \return true if the passed in vtype is an integer ('c', 'C', 's',
00481     /// 'S', 'i', 'I'), false otherwise.
00482     bool isIntegerType(char vtype) const;
00483 
00484     /// Returns whether or not the specified vtype is a double type.
00485     /// Does not set SamStatus.
00486     /// \param vtype value type to check.
00487     /// \return true if the passed in vtype is a double ('f'), false otherwise.
00488     bool isDoubleType(char vtype) const;
00489 
00490     /// Returns whether or not the specified vtype is a char type.
00491     /// Does not set SamStatus.
00492     /// \param vtype value type to check.
00493     /// \return true if the passed in vtype is a char ('A'), false otherwise.
00494     bool isCharType(char vtype) const;
00495 
00496     /// Returns whether or not the specified vtype is a string type.
00497     /// Does not set SamStatus.
00498     /// \param vtype value type to check.
00499     /// \return true if the passed in vtype is a string ('Z'), false otherwise.
00500     bool isStringType(char vtype) const;
00501 
00502     /// Clear the tags in this record.
00503     /// Does not set SamStatus.
00504     void clearTags();
00505    
00506     /// Remove a tag.
00507     /// \param tag tag to remove.
00508     /// \param type type of the tag to be removed.
00509     /// \return true if the tag no longer exists in the record, false if it could not be removed (Returns true if the tag was not found in the record).
00510     bool rmTag(const char* tag, char type);
00511 
00512     /// Remove tags.
00513     /// \param tags tags to remove, formatted as  Tag:Type;Tag:Type;Tag:Type...
00514     /// \return true if all tags no longer exist in the record, false if any could not be removed
00515     /// (Returns true if the tags were not found in the record).
00516     /// SamStatus is set to INVALID if the tags are incorrectly formatted.
00517     bool rmTags(const char* tags);
00518 
00519     /// Returns the status associated with the last method that sets the status.
00520     /// \return SamStatus of the last command that sets status.
00521     const SamStatus& getStatus();
00522     
00523     /// Get the string representation of the tags from the record, formatted
00524     /// as TAG:TYPE:VALUE<delim>TAG:TYPE:VALUE...
00525     /// Sets the Status to SUCCESS when the tags are successfully returned or
00526     /// the tags were not found.  If a different error occurred, the status is
00527     /// set appropriately.
00528     /// \param tags the tags to retrieve, formatted as TAG:TYPE;TAG:TYPE...
00529     /// \param returnString the String to set (this method first clears returnString)
00530     ///                     to TAG:TYPE:VALUE<delim>TAG:TYPE:VALUE...
00531     /// \param delim delimiter to use to separate two tags, default is a tab.
00532     /// \return true if there were not any errors even if no tags were found.
00533     bool getTagsString(const char* tags, String& returnString, char delim = '\t');
00534 
00535     /// Get the string value for the specified tag.
00536     /// \param tag tag to retrieve
00537     /// \return pointer to the tag's string value if found, NULL if not found.
00538     String* getStringTag(const char * tag);
00539 
00540     /// Get the integer value for the specified tag.
00541     /// \param tag tag to retrieve
00542     /// \return pointer to the tag's integer value if found, NULL if not found.
00543     int* getIntegerTag(const char * tag);
00544 
00545     /// Get the char value for the specified tag.
00546     /// \param tag tag to retrieve
00547     /// \return pointer to the tag's char value if found, NULL if not found.
00548     char* getCharTag(const char * tag);
00549 
00550     /// Get the double value for the specified tag.
00551     /// \param tag tag to retrieve
00552     /// \return pointer to the tag's double value if found, NULL if not found.
00553     double* getDoubleTag(const char * tag);
00554 
00555     /// Get the string value for the specified tag.
00556     String & getString(const char * tag);
00557 
00558     /// Get the integer value for the specified tag.
00559     int &    getInteger(const char * tag);
00560 
00561     /// Get the char value for the specified tag.
00562     char &    getChar(const char * tag);
00563 
00564     /// Get the double value for the specified tag.
00565     double & getDouble(const char * tag);
00566 
00567 
00568 //     void getSamExtraFieldFromKey(int key, String& extraField);
00569     
00570     /// Check if the specified tag contains a string (vtype 'Z').
00571     /// Does not set SamStatus.
00572     /// \param tag SAM tag to check contents of.
00573     /// \return the result of checkTag for this tag with vtype 'Z'.
00574     bool checkString(const char * tag)    { return checkTag(tag, 'Z'); }
00575     
00576     /// Check if the specified tag contains an integer (vtype 'i').
00577     /// Does not set SamStatus.
00578     /// \param tag SAM tag to check contents of.
00579     /// \return the result of checkTag for this tag with vtype 'i'.
00580     bool checkInteger(const char * tag)   { return checkTag(tag, 'i'); }
00581     
00582     /// Check if the specified tag contains a double (vtype 'f').
00583     /// Does not set SamStatus.
00584     /// \param tag SAM tag to check contents of.
00585     /// \return the result of checkTag for this tag with vtype 'f'.
00586     bool checkDouble(const char * tag)    { return checkTag(tag, 'f'); }
00587      
00588     /// Check if the specified tag contains a value of the specified vtype.
00589     /// Does not set SamStatus.
00590     /// \param tag SAM tag to check contents of.
00591     /// \param type value type to check if the SAM tag matches.
00592     /// \return true if the value associated with the tag is of the specified type.
00593    bool checkTag(const char * tag, char type);
00594 
00595     
00596     /// Return the number of bases in this read that overlap the passed in
00597     /// region.
00598     /// \param start inclusive 0-based start position (reference position) of
00599     ///              the region to check for overlaps in.
00600     ///              (-1 indicates to start at the beginning of the reference.)
00601     /// \param end   exclusive 0-based end position (reference position) of the
00602     ///              region to check for overlaps in.
00603     ///              (-1 indicates to go to the end of the reference.)
00604     /// \return number of overlapping bases
00605     /// (matches in the cigar - not skips/deletions)
00606     uint32_t getNumOverlaps(int32_t start, int32_t end);
00607 
00608 
00609 private:
00610     static int MAKEKEY(char ch1, char ch2, char type) // Pack the two tag characters and the key type into one int key.
00611     { return (getKeyType(type) << 16) + (ch2 << 8) + ch1; } // key type shifted by 16, ch2 by 8, ch1 unshifted; NOTE(review): operands are plain char, so a negative value would sign-extend - presumably tags are always ASCII, confirm
00612 
00613     static char getKeyType(char type) // Map a tag value type to the type character MAKEKEY puts in the key.
00614     {
00615         switch(type)
00616         {
00617             // The char type ('A') and every integer type share the key type 'i'.
00618             case 'A' :
00619             case 'c' :
00620             case 'C' :
00621             case 's' :
00622             case 'S' :
00623             case 'i' :
00624             case 'I' :
00625                 return('i');
00626                 break; // not reached: follows the return above
00627             default:
00628                 // Any other type is returned unchanged as its own key type.
00629                 return(type);
00630         };
00631     }
00632 
00633     // Allocate space for the record - does a realloc.  
00634     // The passed in size is the size of the entire record including the
00635     // block size field.
00636     // Adds any errors to myStatus.
00637     bool allocateRecordStructure(int size);
00638 
00639     void* getStringPtr(int offset);
00640     void* getIntegerPtr(int offset, char& vtype);
00641     void* getCharPtr(int offset);
00642     void* getDoublePtr(int offset);
00643 
00644     // Fixes the buffer to match the variable length fields.
00645     // Adds any errors to myStatus.
00646     bool fixBuffer(SequenceTranslation translation);
00647 
00648     // Sets the Sequence and Quality strings from the buffer.
00649     // They are done together in one method because they require the same
00650     // loop, so might as well be done at the same time.
00651     // Adds any errors to myStatus.
00652     void setSequenceAndQualityFromBuffer();
00653 
00654     // Parse the cigar to calculate the alignment/unclipped ends and convert
00655     // to SAM/BAM format.
00656     // Adds any errors to myStatus.
00657     bool parseCigar();
00658     // Parse the cigar string to calculate the cigar length and alignment end
00659     // and convert to SAM format.
00660     // Adds any errors to myStatus.
00661     bool parseCigarBinary();
00662     // Parse the cigar string to calculate the cigar length and alignment end
00663     // and convert to BAM format.
00664     // Adds any errors to myStatus.
00665     bool parseCigarString();
00666 
00667     // Set the tags from the buffer.
00668     // Adds any errors to myStatus.
00669     bool setTagsFromBuffer();
00670 
00671     // Set the tags in the buffer.
00672     // Adds any errors to myStatus.
00673     bool setTagsInBuffer();
00674 
00675     void setVariablesForNewBuffer(SamFileHeader& header);
00676 
00677     void getTypeFromKey(int key, char& type) const;
00678     void getTag(int key, char* tag) const;
00679 
00680     String & getString(int offset);
00681     int &    getInteger(int offset);
00682     char &   getIntegerType(int offset);
00683     char &   getChar(int offset);
00684     double & getDouble(int offset);
00685 
00686     static const int DEFAULT_BLOCK_SIZE = 40;
00687     static const int DEFAULT_BIN = 4680;
00688     static const int DEFAULT_READ_NAME_LENGTH = 8;
00689     static const char* DEFAULT_READ_NAME;
00690     static const char* FIELD_ABSENT_STRING;
00691 
00692     bamRecordStruct * myRecordPtr;
00693     int allocatedSize;
00694 
00695     // Pointer to a temporary cigar buffer that can be used during string
00696     // parsing before it is ready to be copied into the actual record.
00697     uint32_t* myCigarTempBuffer;
00698 
00699     // Size of the currently allocated temporary cigar buffer.
00700     int myCigarTempBufferAllocatedSize;
00701 
00702     // Length of the cigar currently contained in the temporary buffer.
00703     int myCigarTempBufferLength;
00704 
00705     // Track if the buffer is in sync with the Strings/Tags.
00706     // Set to false if any of the variable length fields are modified.
00707     // Set to true when the buffer is updated to match the variable length
00708     // fields.
00709     bool myIsBufferSynced;
00710 
00711     // Track if the tags need to be set from the buffer.
00712     bool myNeedToSetTagsFromBuffer;
00713 
00714     // Track if the tags need to be set in the buffer.
00715     // Allows you to set just the tags if they are the only thing that changed
00716     // in the buffer.
00717     bool myNeedToSetTagsInBuffer;
00718 
00719     int myTagBufferSize;
00720     int myLastTagIndex;
00721 
00722     String myReadName;
00723     String myReferenceName;
00724     String myMateReferenceName;
00725     String myCigar;
00726     String mySequence;
00727     String myQuality;
00728 
00729     std::string mySeqWithEq;
00730     std::string mySeqWithoutEq;
00731 
00732     // The length of the alignment.
00733     int32_t myAlignmentLength;
00734     // Unclipped alignment positions.
00735     int32_t myUnclippedStartOffset;
00736     int32_t myUnclippedEndOffset;
00737     
00738     CigarRoller myCigarRoller;
00739 
00740     LongHash<int>  extras;
00741     // Note: not all values in strings, integers, and doubles are always
00742     // in extras.  They will not be if the tags were removed.  Removed
00743     // tags are removed from extras, but not from strings, integers, or doubles
00744     // since if one was removed from these arrays, all other entries would
00745     // need their indices updated in extras.
00746     StringArray    strings;
00747     IntArray       integers;
00748     std::vector<char> intType; // contains the type of int at same position in integers.
00749     Vector         doubles;
00750 
00751 
00752     // Track whether or not the buffer values are correct for
00753     // each setting.
00754     bool myIsReadNameBufferValid;
00755     bool myIsCigarBufferValid;
00756     bool myIsSequenceBufferValid;
00757     bool myIsQualityBufferValid;
00758     bool myIsTagsBufferValid;
00759     bool myIsBinValid;
00760 
00761     SamStatus myStatus;
00762 
00763     // The current translation of the sequence as it occurs in the buffer.
00764     // Only applicable if myIsSequenceBufferValid == true.
00765     SequenceTranslation myBufferSequenceTranslation;
00766 
00767 
00768     // Track the Reference.
00769     GenomeSequence* myRefPtr;
00770 
00771     // The type of translation to do when getting a sequence.
00772     SequenceTranslation mySequenceTranslation;
00773 
00774     String NOT_FOUND_TAG_STRING;
00775     int NOT_FOUND_TAG_INT;
00776     double NOT_FOUND_TAG_DOUBLE;
00777 };
00778 
00779 #endif
Generated on Tue Aug 23 18:19:04 2011 for libStatGen Software by  doxygen 1.6.3