libStatGen Software  1
SamRecord.h
00001 /*
00002  *  Copyright (C) 2010-2011  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #ifndef __SAM_RECORD_H__
00019 #define __SAM_RECORD_H__
00020 
00021 #include <stdint.h>
00022 
00023 #include "GenomeSequence.h"
00024 #include "SamStatus.h"
00025 #include "LongHash.h"
00026 #include "MathVector.h"
00027 #include "StringArray.h"
00028 #include "IntArray.h"
00029 #include "SamFileHeader.h"
00030 #include "CigarRoller.h"
00031 
00032 /// Structure of a BAM record.
00033 struct bamRecordStruct  // Holds the non-variable-length fields of a record plus the first byte of the data that follows them.
00034 {
00035 public:  // Redundant in a struct (members are public by default); kept as in the original.
00036     int32_t      myBlockSize;        // BAM block size of the record (value reported by SamRecord::getBlockSize()).
00037     int32_t      myReferenceID;      // Reference sequence id (BAM rid).
00038     int32_t      myPosition;         // Leftmost position; presumably stored 0-based as in BAM - TODO confirm in SamRecord.cpp.
00039     uint32_t     myReadNameLength : 8, myMapQuality : 8, myBin : 16;  // One 32-bit word: 8-bit read name length (accessor docs say it includes the null), 8-bit MAPQ, 16-bit BAM bin.
00040     uint32_t     myCigarLength : 16, myFlag : 16;  // One 32-bit word: 16-bit BAM cigar length, 16-bit FLAG.
00041     int32_t      myReadLength;       // Length of the read.
00042     int32_t      myMateReferenceID;  // Mate/next fragment reference id (BAM mate_rid/next_refID).
00043     int32_t      myMatePosition;     // Mate/next fragment leftmost position; presumably 0-based like myPosition - TODO confirm.
00044     int32_t      myInsertSize;             // Outer fragment length: inferred insert size (ISIZE) / observed template length (TLEN).
00045     char  myData[1];  // NOTE(review): looks like a placeholder for the start of the variable-length data (read name, cigar, sequence, quality, tags) - confirm against allocateRecordStructure().
00046 };
00047 
00048 
00049 /// Class providing an easy to use interface to get/set/operate on the
00050 /// fields in a SAM/BAM record. 
00051 class SamRecord
00052 {
00053 public:
00054     /// Enum containing the settings on how to translate the sequence if a
00055     /// reference is available.  If no reference is available, no translation
00056     /// is done.
00057     enum SequenceTranslation { 
00058         NONE,   ///< Leave the sequence as is (the default when setSequenceTranslation() is never called).
00059         EQUAL,  ///< Translate bases that match the reference to '='.
00060         BASES,  ///< Translate '=' to the actual base.
00061     };
00062 
00063     /// Default Constructor.
00064     SamRecord();
00065 
00066     /// Constructor that sets the error handling type.
00067     /// \param errorHandlingType how to handle errors.
00068     SamRecord(ErrorHandler::HandlingType errorHandlingType);
00069 
00070     /// Destructor
00071     ~SamRecord();
00072 
00073     /// Reset the fields of the record to a default value.
00074     /// This is not necessary when you are reading a SAM/BAM file, 
00075     /// but if you are setting fields, it is a good idea to clean
00076     /// out a record before reusing it. Clearing it allows you to 
00077     /// not have to set any empty fields. 
00078     void resetRecord();
00079 
00080     /// Returns whether or not the record is valid, setting the status to
00081     /// indicate success or failure.
00082     /// \param header SAM Header associated with the record.  Used to perform
00083     /// some validation against the header.
00084     /// \return true if the record is valid, false if not.
00085     bool isValid(SamFileHeader& header);
00086 
00087     /// Set the reference to the specified genome sequence object.
00088     /// \param reference pointer to the GenomeSequence object.
00089     void setReference(GenomeSequence* reference);
00090 
00091     /// Set the type of sequence translation to use when getting
00092     /// the sequence.  The default type (if this method is never called) is
00093     /// NONE (the sequence is left as-is).  Can be over-ridden by using 
00094     /// the accessors that take a SequenceTranslation parameter.
00095     /// \param translation type of sequence translation to use.
00096     void setSequenceTranslation(SequenceTranslation translation);
00097 
00098     ///////////////////////
00099     /// @name  Set Alignment Data
00100     /// Set methods for record fields.  All of the "set" methods set the
00101     /// status to indicate success or the failure reason.
00102     //@{
00103 
00104     /// Set QNAME to the passed in name.
00105     /// \param readName the readname to set the QNAME to.
00106     /// \return true if successfully set, false if not.
00107     bool setReadName(const char* readName);
00108 
00109     /// Set the bitwise FLAG to the specified value.
00110     /// \param flag integer flag to use.
00111     /// \return true if successfully set, false if not.
00112     bool setFlag(uint16_t flag);
00113     
00114     /// Set the reference sequence name (RNAME) to the specified name, using
00115     /// the header to determine the reference id.
00116     /// \param header SAM/BAM header to use to determine the reference id.
00117     /// \param referenceName reference name to use.
00118     /// \return true if successfully set, false if not
00119     bool setReferenceName(SamFileHeader& header, 
00120                           const char* referenceName);
00121 
00122     /// Set the leftmost position (POS) using the specified 1-based (SAM format)
00123     /// value.
00124     /// Internal processing handles the switching between SAM/BAM formats 
00125     /// when read/written.
00126     /// \param position 1-based start position
00127     /// \return true if successfully set, false if not.
00128     bool set1BasedPosition(int32_t position);
00129 
00130     /// Set the leftmost position using the specified 0-based (BAM format)
00131     /// value.
00132     /// Internal processing handles the switching between SAM/BAM formats 
00133     /// when read/written.
00134     /// \param position 0-based start position
00135     /// \return true if successfully set, false if not.
00136     bool set0BasedPosition(int32_t position);
00137 
00138     /// Set the mapping quality (MAPQ).
00139     /// \param mapQuality map quality to set in the record.
00140     /// \return true if successfully set, false if not.
00141     bool setMapQuality(uint8_t mapQuality);
00142 
00143     /// Set the CIGAR to the specified SAM formatted cigar string.
00144     /// Internal processing handles the switching between SAM/BAM formats 
00145     /// when read/written.
00146     /// \param cigar string containing the SAM formatted cigar.
00147     /// \return true if successfully set, false if not.
00148     bool setCigar(const char* cigar);
00149 
00150     /// Set the CIGAR to the specified Cigar object.
00151     /// Internal processing handles the switching between SAM/BAM formats 
00152     /// when read/written.
00153     /// \param cigar object to set this record's cigar to have.
00154     /// \return true if successfully set, false if not.
00155     bool setCigar(const Cigar& cigar);
00156 
00157 
00158     /// Set the mate/next fragment's reference sequence name (RNEXT) to the
00159     /// specified name, using the header to determine the mate reference id.
00160     /// \param header SAM/BAM header to use to determine the mate reference id.
00161     /// \param mateReferenceName mate reference name to use.
00162     /// \return true if successfully set, false if not
00163     bool setMateReferenceName(SamFileHeader& header,
00164                               const char* mateReferenceName);
00165 
00166     /// Set the mate/next fragment's leftmost position (PNEXT) using the
00167     /// specified 1-based (SAM format) value.
00168     /// Internal processing handles the switching between SAM/BAM formats 
00169     /// when read/written.
00170     /// \param matePosition 1-based start position of the mate/next fragment
00171     /// \return true if successfully set, false if not.
00172     bool set1BasedMatePosition(int32_t matePosition);
00173 
00174     /// Set the mate/next fragment's leftmost position using the specified
00175     /// 0-based (BAM format) value.
00176     /// Internal processing handles the switching between SAM/BAM formats 
00177     /// when read/written.
00178     /// \param matePosition 0-based start position of the mate/next fragment
00179     /// \return true if successfully set, false if not.
00180     bool set0BasedMatePosition(int32_t matePosition);
00181 
00182     /// Sets the inferred insert size (ISIZE)/observed template length (TLEN).
00183     /// \param insertSize inferred insert size/observed template length.
00184     /// \return true if successfully set, false if not.
00185     bool setInsertSize(int32_t insertSize);
00186 
00187     /// Sets the sequence (SEQ) to the specified SAM formatted sequence string.
00188     ///  Internal processing handles switching between SAM/BAM formats when
00189     /// read/written.
00190     /// \param seq SAM sequence string.  May contain '='.
00191     /// \return true if successfully set, false if not.
00192     bool setSequence(const char* seq);
00193 
00194     /// Sets the quality (QUAL) to the specified SAM formatted quality string.
00195     /// Internal processing handles switching between SAM/BAM formats when
00196     /// read/written.
00197     /// \param quality SAM quality string.
00198     /// \return true if successfully set, false if not.
00199     bool setQuality(const char* quality);
00200 
00201     /// Shift the indels (if any) to the left by updating the CIGAR.
00202     /// \return true if the cigar was shifted, false if not.
00203     bool shiftIndelsLeft();
00204 
00205     /// Sets the SamRecord to contain the information in the BAM formatted
00206     /// fromBuffer.
00207     /// \param fromBuffer buffer to read the BAM record from.
00208     /// \param fromBufferSize size of the buffer containing the BAM record.
00209     /// \param header BAM header for the record.
00210     /// \return status of reading the BAM record from the buffer.
00211     SamStatus::Status setBuffer(const char* fromBuffer, uint32_t fromBufferSize,
00212                                 SamFileHeader& header);
00213 
00214     /// Read the BAM record from a file.
00215     /// \param filePtr file to read the buffer from.
00216     /// \param header BAM header for the record.
00217     /// \return status of reading the BAM record from the file.
00218     SamStatus::Status setBufferFromFile(IFILE filePtr, SamFileHeader& header);
00219 
00220     //@}
00221 
00222     ///////////////////////
00223     /// @name  Set Tag Data
00224     /// Set methods for tags.
00225     //@{
00226 
00227     /// Add the specified integer tag to the record.  Internal processing
00228     /// handles switching between SAM/BAM formats when read/written and 
00229     /// determining the type for BAM format.  If the tag is already there
00230     /// this code will replace it if the specified value is different.
00231     /// \param tag two character tag to be added to the SAM/BAM record.
00232     /// \param value value for the specified tag.
00233     /// \return true if the tag was successfully added, false otherwise.
00234     bool addIntTag(const char* tag, int32_t value);
00235 
00236     /// Add the specified tag,vtype,value to the record.  Vtype can be SAM/BAM
00237     /// format.  Internal processing handles switching between SAM/BAM formats
00238     /// when read/written.  If the tag is already there this code will replace
00239     /// it if the specified value is different.
00240     /// \param tag two character tag to be added to the SAM/BAM record.
00241     /// \param vtype vtype of the specified value - either SAM/BAM vtypes.
00242     /// \param value value as a string for the specified tag.
00243     /// \return true if the tag was successfully added, false otherwise.
00244     bool addTag(const char* tag, char vtype, const char* value);
00245 
00246     /// Clear the tags in this record.
00247     /// Does not set SamStatus.
00248     void clearTags();
00249    
00250     /// Remove a tag.
00251     /// \param tag tag to remove.
00252     /// \param type value type of the tag to be removed.
00253     /// \return true if the tag no longer exists in the record, false if it could not be removed (Returns true if the tag was not found in the record).
00254     bool rmTag(const char* tag, char type);
00255 
00256     /// Remove tags.
00257     /// The delimiter between the tags is ',' or ';'.  ',' was added since 
00258     /// the original delimiter, ';', requires the string to be quoted on the
00259     /// command-line.
00260     /// \param tags tags to remove, formatted as  Tag:Type,Tag:Type,Tag:Type...
00261     /// \return true if all tags no longer exist in the record, false if any could not be removed
00262     /// (Returns true if the tags were not found in the record).
00263     /// SamStatus is set to INVALID if the tags are incorrectly formatted.
00264     bool rmTags(const char* tags);
00265 
00266     //@}
00267 
00268     ///////////////////////
00269     /// @name  Get Alignment Data
00270     /// Get methods for record fields.  All of the "get" methods set the
00271     /// status to indicate success or the failure reason.
00272     //@{
00273 
00274     /// Get a const pointer to the buffer that contains the BAM representation
00275     /// of the record.
00276     /// \return const pointer to the buffer that contains the BAM representation
00277     /// of the record.
00278     const void* getRecordBuffer();
00279 
00280     /// Get a const pointer to the buffer that contains the BAM representation
00281     /// of the record using the specified translation on the sequence.
00282     /// \param translation type of sequence translation to use.
00283     /// \return const pointer to the buffer that contains the BAM representation
00284     /// of the record.
00285     const void* getRecordBuffer(SequenceTranslation translation);
00286 
00287     /// Write the record as a BAM into the specified already opened file.
00288     /// \param filePtr file to write the BAM record into.
00289     /// \return status of the write.
00290     SamStatus::Status writeRecordBuffer(IFILE filePtr);
00291 
00292     /// Write the record as a BAM into the specified already opened file using
00293     /// the specified translation on the sequence.
00294     /// \param filePtr file to write the BAM record into.
00295     /// \param translation type of sequence translation to use.
00296     /// \return status of the write.
00297     SamStatus::Status writeRecordBuffer(IFILE filePtr, 
00298                                         SequenceTranslation translation);
00299 
00300     /// Get the block size of the record (BAM format).
00301     /// \return BAM block size of the record.
00302     int32_t getBlockSize();
00303 
00304     /// Get the reference sequence name (RNAME) of the record.
00305     /// \return reference sequence name
00306     const char* getReferenceName();
00307 
00308     /// Get the reference sequence id of the record (BAM format rid).
00309     /// \return reference sequence id
00310     int32_t getReferenceID();
00311 
00312     /// Get the 1-based(SAM) leftmost position (POS) of the record.
00313     /// \return 1-based leftmost position.
00314     int32_t get1BasedPosition();
00315  
00316     /// Get the 0-based(BAM) leftmost position of the record.
00317     /// \return 0-based leftmost position.
00318    int32_t get0BasedPosition();
00319 
00320     /// Get the length of the readname (QNAME) including the null.
00321     /// \return length of the read name (including null).
00322     uint8_t getReadNameLength();
00323 
00324     /// Get the mapping quality (MAPQ) of the record.
00325     /// \return map quality.
00326     uint8_t getMapQuality();
00327 
00328     /// Get the BAM bin for the record.
00329     /// \return BAM bin
00330     uint16_t getBin();
00331 
00332     /// Get the length of the BAM formatted CIGAR.
00333     /// \return length of BAM formatted cigar.
00334     uint16_t getCigarLength();
00335 
00336     /// Get the flag (FLAG).
00337     /// \return flag.
00338     uint16_t getFlag();
00339 
00340     /// Get the length of the read.
00341     /// \return read length.
00342     int32_t getReadLength();
00343 
00344     /// Get the mate/next fragment's reference sequence name (RNEXT).  If it
00345     /// is equal to the reference name, it still returns the reference name.
00346     /// \return reference sequence name
00347     const char* getMateReferenceName();
00348 
00349     /// Get the mate/next fragment's reference sequence name (RNEXT),
00350     /// returning "=" if it is the same as the reference name, unless 
00351     /// they are both "*" in which case "*" is returned.
00352     /// \return reference sequence name or '='
00353     const char* getMateReferenceNameOrEqual();
00354 
00355     /// Get the mate reference id of the record
00356     /// (BAM format: mate_rid/next_refID).
00357     /// \return reference id
00358     int32_t getMateReferenceID();
00359 
00360     /// Get the 1-based(SAM) leftmost mate/next fragment's position (PNEXT).
00361     /// \return 1-based leftmost position.
00362     int32_t get1BasedMatePosition();
00363 
00364     /// Get the 0-based(BAM) leftmost mate/next fragment's position.
00365     /// \return 0-based leftmost position.
00366     int32_t get0BasedMatePosition();
00367 
00368     /// Get the inferred insert size of the read pair (ISIZE) or
00369     /// observed template length (TLEN).
00370     /// \return inferred insert size or observed template length.
00371     int32_t getInsertSize();
00372 
00373     /// Returns the 0-based inclusive rightmost position of the
00374     /// clipped sequence.
00375     /// \return 0-based inclusive rightmost position
00376     int32_t get0BasedAlignmentEnd();
00377 
00378     /// Returns the 1-based inclusive rightmost position of the
00379     /// clipped sequence.
00380     /// \return 1-based inclusive rightmost position
00381     int32_t get1BasedAlignmentEnd();
00382    
00383     /// Returns the length of the clipped sequence, returning 0 if the cigar
00384     /// is '*'.
00385     /// \return length of the clipped sequence.
00386     int32_t getAlignmentLength();
00387 
00388     /// Returns the 0-based inclusive left-most position adjusted for
00389     /// clipped bases.
00390     /// \return 0-based inclusive leftmost position including clips.
00391     int32_t get0BasedUnclippedStart();
00392 
00393     /// Returns the 1-based inclusive left-most position adjusted for
00394     /// clipped bases.
00395     /// \return 1-based inclusive leftmost position including clips.
00396     int32_t get1BasedUnclippedStart();
00397 
00398     /// Returns the 0-based inclusive right-most position adjusted for
00399     /// clipped bases.
00400     /// \return 0-based inclusive rightmost position including clips.
00401     int32_t get0BasedUnclippedEnd();
00402  
00403     /// Returns the 1-based inclusive right-most position adjusted for
00404     /// clipped bases.
00405     /// \return 1-based inclusive rightmost position including clips.
00406     int32_t get1BasedUnclippedEnd();
00407 
00408     /// Returns the SAM formatted Read Name (QNAME).
00409     /// \return read name.
00410     const char* getReadName();
00411 
00412     /// Returns the SAM formatted CIGAR string.
00413     /// \return cigar string.
00414     const char* getCigar();
00415 
00416     /// Returns the SAM formatted sequence string (SEQ), translating the base as
00417     /// specified by setSequenceTranslation.
00418     /// \return sequence string.
00419     const char* getSequence();
00420 
00421     /// Returns the SAM formatted sequence string (SEQ) performing the specified
00422     /// sequence translation.
00423     /// \param translation type of sequence translation to use.
00424     /// \return sequence string.
00425     const char* getSequence(SequenceTranslation translation);
00426 
00427     /// Returns the SAM formatted quality string (QUAL).
00428     /// \return quality string.
00429     const char* getQuality();
00430 
00431     /// Get the sequence base at the specified index into this sequence 0 to
00432     /// readLength - 1, translating the base as specified by
00433     /// setSequenceTranslation.  Throws an exception if index is out of range.
00434     /// \param index index into the sequence string (0 to readLength-1).
00435     /// \return the sequence base at the specified index into the sequence.
00436     char getSequence(int index);
00437     
00438     /// Get the sequence base at the specified index into this sequence 0 to
00439     /// readLength - 1 performing the specified sequence translation. 
00440     /// Throws an exception if index is out of range.
00441     /// \param index index into the sequence string (0 to readLength-1).
00442     /// \param translation type of sequence translation to use.
00443     /// \return the sequence base at the specified index into the sequence.
00444     char getSequence(int index, SequenceTranslation translation);
00445     
00446     /// Get the quality character at the specified index into the quality 0 to
00447     /// readLength - 1.  Throws an exception if index is out of range.
00448     /// \param index index into the quality string (0 to readLength-1).
00449     /// \return the quality character at the specified index into the quality.
00450     char getQuality(int index);
00451    
00452     /// Returns a pointer to the Cigar object associated with this record.  
00453     /// The object is essentially read-only, only allowing modifications 
00454     /// due to lazy evaluations.
00455     /// \return pointer to the Cigar object.
00456     Cigar* getCigarInfo();
00457 
00458     /// Return the number of bases in this read that overlap the passed in
00459     /// region.  Matches & mismatches between the read and the reference
00460     /// are counted as overlaps, but insertions, deletions, skips, clips, and
00461     /// pads are not counted.
00462     /// \param start inclusive 0-based start position (reference position) of
00463     ///              the region to check for overlaps in.
00464     ///              (-1 indicates to start at the beginning of the reference.)
00465     /// \param end   exclusive 0-based end position (reference position) of the
00466     ///              region to check for overlaps in.
00467     ///              (-1 indicates to go to the end of the reference.)
00468     /// \return number of overlapping bases
00469     uint32_t getNumOverlaps(int32_t start, int32_t end);
00470 
00471     /// Returns the values of all fields except the tags.
00472     /// \param recStruct structure containing the contents of all 
00473     /// non-variable length fields.
00474     /// \param readName read name from the record (return param)
00475     /// \param cigar cigar string from the record (return param)
00476     /// \param sequence sequence string from the record (return param)
00477     /// \param quality quality string from the record (return param)
00478     /// \return true if all fields were successfully set, false otherwise.
00479     bool getFields(bamRecordStruct& recStruct, String& readName, 
00480                    String& cigar, String& sequence, String& quality);
00481 
00482     /// Returns the values of all fields except the tags using the specified
00483     /// sequence translation.
00484     /// \param recStruct structure containing the contents of all 
00485     /// non-variable length fields.
00486     /// \param readName read name from the record (return param)
00487     /// \param cigar cigar string from the record (return param)
00488     /// \param sequence sequence string from the record (return param)
00489     /// \param quality quality string from the record (return param)
00490     /// \param translation type of sequence translation to use.
00491     /// \return true if all fields were successfully set, false otherwise.
00492     bool getFields(bamRecordStruct& recStruct, String& readName, 
00493                    String& cigar, String& sequence, String& quality,
00494                    SequenceTranslation translation);
00495 
00496     /// Returns a pointer to the genome sequence object associated with this
00497     /// record if it was set (NULL if it was not set).
00498     /// \return pointer to the GenomeSequence object or NULL if there isn't one.
00499     GenomeSequence* getReference();
00500 
00501     //@}
00502 
00503     ///////////////////////
00504     /// @name  Get Tag Methods
00505     /// Get methods for obtaining information on tags.
00506     //@{
00507 
00508     /// Returns the length of the BAM formatted tags.
00509     /// \return length of the BAM formatted tags.
00510     uint32_t getTagLength();
00511 
00512     /// Get the next tag from the record.
00513     /// Sets the Status to SUCCESS when a tag is successfully returned or
00514     /// when there are no more tags.  Otherwise the status is set to describe
00515     /// why it failed (parsing, etc).
00516     /// \param tag set to the tag when a tag is read.
00517     /// \param vtype set to the vtype when a tag is read.
00518     /// \param value pointer to the value of the tag (will need to cast
00519     /// to int, float, char, or string based on vtype).
00520     /// \return true if a tag was read, false if there are no more tags.
00521     bool getNextSamTag(char* tag, char& vtype, void** value);
00522 
00523     /// Reset the tag iterator to the beginning of the tags.
00524     void resetTagIter();
00525  
00526     /// Returns whether or not the specified vtype is an integer type.
00527     /// Does not set SamStatus.
00528     /// \param vtype value type to check.
00529     /// \return true if the passed in vtype is an integer ('c', 'C', 's',
00530     /// 'S', 'i', 'I'), false otherwise.
00531     static bool isIntegerType(char vtype);
00532 
00533     /// Returns whether or not the specified vtype is a float type.
00534     /// Does not set SamStatus.
00535     /// \param vtype value type to check.
00536     /// \return true if the passed in vtype is a float ('f'), false otherwise.
00537     static bool isFloatType(char vtype);
00538 
00539     /// Returns whether or not the specified vtype is a char type.
00540     /// Does not set SamStatus.
00541     /// \param vtype value type to check.
00542     /// \return true if the passed in vtype is a char ('A'), false otherwise.
00543     static bool isCharType(char vtype);
00544 
00545     /// Returns whether or not the specified vtype is a string type.
00546     /// Does not set SamStatus.
00547     /// \param vtype value type to check.
00548     /// \return true if the passed in vtype is a string ('Z'/'B'), false otherwise.
00549     static bool isStringType(char vtype);
00550 
00551     /// Get the string representation of the tags from the record, formatted
00552     /// as TAG:TYPE:VALUE<delim>TAG:TYPE:VALUE...
00553     /// Sets the Status to SUCCESS when the tags are successfully returned or
00554     /// the tags were not found.  If a different error occurred, the status is
00555     /// set appropriately.
00556     /// The delimiter between the tags to retrieve is ',' or ';'.  ',' was added
00557     /// since the original delimiter, ';', requires the string to be quoted on
00558     /// the command-line.
00559     /// \param tags the tags to retrieve, formatted as TAG:TYPE,TAG:TYPE...
00560     /// \param returnString the String to set (this method first clears returnString)
00561     ///                     to TAG:TYPE:VALUE<delim>TAG:TYPE:VALUE...
00562     /// \param delim delimiter to use to separate two tags, default is a tab.
00563     /// \return true if there were not any errors even if no tags were found.
00564     bool getTagsString(const char* tags, String& returnString, char delim = '\t');
00565 
00566     /// Get the string value for the specified tag.
00567     /// \param tag tag to retrieve
00568     /// \return pointer to the tag's string value if found, NULL if not found.
00569     const String* getStringTag(const char * tag);
00570 
00571     /// Get the integer value for the specified tag, DEPRECATED, use one that returns a bool (success/failure).
00572     /// \param tag tag to retrieve
00573     /// \return pointer to the tag's integer value if found, NULL if not found.
00574     int* getIntegerTag(const char * tag);
00575 
00576     /// Get the integer value for the specified tag.
00577     /// \param tag tag to retrieve
00578     /// \param tagVal return parameter with integer value for the tag
00579     /// \return bool true if Integer tag was found and tagVal was set,
00580     ///             false if not.
00581     bool getIntegerTag(const char * tag, int& tagVal);
00582 
00583     /// Get the float value for the specified tag.
00584     /// \param tag tag to retrieve
00585     /// \param tagVal return parameter with float value for the tag
00586     /// \return bool true if Float tag was found and tagVal was set,
00587     ///         false if not.
00588     bool getFloatTag(const char * tag, float& tagVal);
00589 
00590     /// Get the string value for the specified tag.
00591     const String & getString(const char * tag);
00592 
00593     /// Get the integer value for the specified tag, DEPRECATED, use getIntegerTag that returns a bool.
00594     int &    getInteger(const char * tag);
00595 
00596     /// Check if the specified tag contains a string.
00597     /// Does not set SamStatus.
00598     /// \param tag SAM tag to check contents of.
00599     /// \return true if the value associated with the tag is a string.
00600     bool checkString(const char * tag)  // Inline convenience wrapper around checkTag().
00601     { return(checkTag(tag, 'Z') || checkTag(tag, 'B')); }  // True when checkTag() succeeds for either 'Z' or 'B', the two vtypes isStringType() documents as strings.
00602     
00603     /// Check if the specified tag contains an integer.
00604     /// Does not set SamStatus.
00605     /// \param tag SAM tag to check contents of.
00606     /// \return true if the value associated with the tag is an integer.
00607     bool checkInteger(const char * tag)   { return checkTag(tag, 'i'); }  // Asks checkTag() about vtype 'i' only. NOTE(review): getKeyType() folds A/c/C/s/S/I into 'i', so this presumably covers every integer vtype - confirm in checkTag().
00608     
00609     /// Check if the specified tag contains a float.
00610     /// Does not set SamStatus.
00611     /// \param tag SAM tag to check contents of.
00612     /// \return true if the value associated with the tag is a float.
00613     bool checkFloat(const char * tag)    { return checkTag(tag, 'f'); }  // Inline wrapper: forwards to checkTag() with vtype 'f'.
00614      
00615     /// Check if the specified tag contains a value of the specified vtype.
00616     /// Does not set SamStatus.
00617     /// \param tag SAM tag to check contents of.
00618     /// \param type value type to check if the SAM tag matches.
00619     /// \return true if the value associated with the tag is of the specified type.
00620    bool checkTag(const char * tag, char type);
00621     //@}
00622 
00623     /// Returns the status associated with the last method that sets the status.
00624     /// \return SamStatus of the last command that sets status.
00625     const SamStatus& getStatus();
00626 
00627 
00628 private:
00629     static int MAKEKEY(char ch1, char ch2, char type)  // Packs a two-character tag (ch1, ch2) and its vtype into a single int key.
00630     { return (getKeyType(type) << 16) + (ch2 << 8) + ch1; }  // Layout: getKeyType(type) from bit 16 up, ch2 in bits 8-15, ch1 in bits 0-7. Assumes all three chars are non-negative (plain ASCII); a negative char would sign-extend into the other fields - TODO confirm callers.
00631 
00632     static char getKeyType(char type)  // Maps a tag vtype to the vtype that MAKEKEY() stores in the key.
00633     {
00634         switch(type)
00635         {
00636             // Every char/integer vtype collapses to 'i', so MAKEKEY() yields the same key for all of them.
00637             case 'A' :
00638             case 'c' :
00639             case 'C' :
00640             case 's' :
00641             case 'S' :
00642             case 'i' :
00643             case 'I' :
00644                 return('i');
00645                 break;  // Unreachable: it follows a return.
00646             default:
00647                 // Any other vtype is returned unchanged.
00648                 return(type);
00649         };  // The ';' after the switch is a harmless empty statement.
00650     }
00651 
00652     static inline int getNumericTagTypeSize(char type)  // Returns the size (1, 2 or 4; presumably bytes - TODO confirm) of a numeric tag value of the given vtype, or 0 if the vtype is not numeric.
00653     {
00654         switch(type)
00655         {
00656             case 'A':  // 'A' is sized together with 'c'/'C' here.
00657             case 'c':
00658             case 'C':
00659                 return(1);
00660                 break;  // Unreachable: it follows a return.
00661             case 's':
00662             case 'S':
00663                 return(2);
00664                 break;  // Unreachable: it follows a return.
00665             case 'i':
00666             case 'I':
00667             case 'f':  // The float vtype shares the size of 'i'/'I'.
00668                 return(4);
00669             default:
00670                 // Any other vtype is not numeric; 0 is the "no size" answer.
00671                 return(0);
00672         }
00673     }
00674 
00675     // Allocate space for the record - does a realloc.
00676     // The passed in size is the size of the entire record including the
00677     // block size field.
00678     // Adds any errors to myStatus.
00679     // NOTE(review): the bool return presumably indicates success - confirm
00680     // in the implementation.
00679     bool allocateRecordStructure(int size);
00680 
00681     void* getStringPtr(int offset);
00682     void* getIntegerPtr(int offset, char& vtype);
00683     void* getFloatPtr(int offset);
          // NOTE(review): the three accessors above are undocumented here. They
          // presumably return a pointer to the string/integer/float tag value at
          // the given offset, with getIntegerPtr also passing the integer's type
          // code through vtype - confirm in the implementation.
00684 
00685     // Fixes the buffer to match the variable length fields.
00686     // Adds any errors to myStatus.
          // NOTE(review): the bool return presumably indicates success, and
          // translation presumably selects how the sequence is written to the
          // buffer - confirm in the implementation.
00687     bool fixBuffer(SequenceTranslation translation);
00688 
00689     // Sets the Sequence and Quality strings from the buffer.
00690     // They are done together in one method because they require the same
00691     // loop, so might as well be done at the same time.
00692     // Adds any errors to myStatus.
          // Returns nothing, so any failure is only visible through myStatus.
00693     void setSequenceAndQualityFromBuffer();
00694 
00695     // Parse the cigar to calculate the alignment/unclipped ends and convert
00696     // to SAM/BAM format.
00697     // Adds any errors to myStatus.
00698     bool parseCigar();
00699     // Parse the cigar string to calculate the cigar length and alignment end
00700     // and convert to SAM format.
00701     // Adds any errors to myStatus.
          // NOTE(review): given the method name, this presumably parses the
          // binary (BAM) cigar rather than a cigar string - confirm.
00702     bool parseCigarBinary();
00703     // Parse the cigar string to calculate the cigar length and alignment end
00704     // and convert to BAM format.
00705     // Adds any errors to myStatus.
00706     bool parseCigarString();
00707 
00708     // Set the tags from the buffer.
00709     // Adds any errors to myStatus.
          // NOTE(review): the bool return presumably indicates success - confirm.
00710     bool setTagsFromBuffer();
00711 
00712     // Set the tags in the buffer.
00713     // Adds any errors to myStatus.
          // NOTE(review): the bool return presumably indicates success - confirm.
00714     bool setTagsInBuffer();
00715 
00716     void setVariablesForNewBuffer(SamFileHeader& header);
          // NOTE(review): undocumented here; presumably resets the cached
          // fields/flags after a new record buffer has been loaded - confirm.
00717 
          // NOTE(review): the two helpers below presumably decode a key built by
          // MAKEKEY back into its type code and its two tag characters - confirm.
00718     void getTypeFromKey(int key, char& type) const;
00719     void getTag(int key, char* tag) const;
00720 
00721     String & getString(int offset);
00722     int &    getInteger(int offset);
00723     const char &   getIntegerType(int offset) const;
00724     float & getFloat(int offset);
          // NOTE(review): the four accessors above are undocumented here. They
          // return references, presumably to the entries at the given offset in
          // the strings, integers, intType and floats members - confirm.
00725 
00726     // Append the string representation of the value at the specified index
00727     // of the int array.
          // Delegates to the (type, value, strVal) overload below, passing
          // intType[index] as the type and integers[index] as the value.
00728     inline void appendIntArrayValue(int index, String& strVal) const
00729     {
00730         appendIntArrayValue(intType[index], integers[index], strVal);
00731     }
00732 
          // NOTE(review): presumably appends the string representation of value,
          // interpreted according to the integer type code, to strVal - confirm.
00733     void appendIntArrayValue(char type, int value, String& strVal) const;
00734 
00735     int getBtagBufferSize(String& tagStr);
00736     int setBtagBuffer(String& tagStr, char* extraPtr);
00737     int getStringFromBtagBuffer(unsigned char* buffer, String& tagStr);
          // NOTE(review): the three helpers above are undocumented here. They
          // presumably convert 'B' type tags between their string form (tagStr)
          // and their buffer form, with the int return being a size - confirm.
00738 
00739     static const int DEFAULT_BLOCK_SIZE = 40;
00740     static const int DEFAULT_BIN = 4680;
00741     static const int DEFAULT_READ_NAME_LENGTH = 8;
          // The two strings below are only declared here; their values are
          // defined outside this class body.
00742     static const char* DEFAULT_READ_NAME;
00743     static const char* FIELD_ABSENT_STRING;
00744 
00745     bamRecordStruct * myRecordPtr;
          // NOTE(review): myRecordPtr is presumably the record buffer that
          // allocateRecordStructure (re)allocates, and allocatedSize its
          // currently allocated size - confirm.
00746     int allocatedSize;
00747 
00748     // Pointer to a temporary cigar buffer that can be used during string
00749     // parsing before it is ready to be copied into the actual record.
00750     uint32_t* myCigarTempBuffer;
00751 
00752     // Size of the currently allocated temporary cigar buffer.
00753     int myCigarTempBufferAllocatedSize;
00754 
00755     // Length of the cigar currently contained in the temporary buffer.
00756     int myCigarTempBufferLength;
00757 
00758     // Track if the buffer is in sync with the Strings/Tags.
00759     // Set to false if any of the variable length fields are modified.
00760     // Set to true when the buffer is updated to match the variable length
00761     // fields.
00762     bool myIsBufferSynced;
00763 
00764     // Track if the tags need to be set from the buffer.
00765     bool myNeedToSetTagsFromBuffer;
00766 
00767     // Track if the tags need to be set in the buffer.
00768     // Allows you to set just the tags if they are the only thing that changed
00769     // in the buffer.
00770     bool myNeedToSetTagsInBuffer;
00771 
00772     int myTagBufferSize;   // NOTE(review): presumably the size the tags occupy in the buffer - confirm.
00773     int myLastTagIndex;    // NOTE(review): presumably the position of the most recently accessed tag - confirm.
00774 
00775     String myReadName;
          // NOTE(review): the String members here are presumably the string
          // forms of the record's variable length fields - confirm.
00776     String myReferenceName;
00777     String myMateReferenceName;
00778     String myCigar;
00779     String mySequence;
00780     String myQuality;
00781 
          // NOTE(review): presumably the sequence with and without '=' used for
          // bases that match the reference - confirm.
00782     std::string mySeqWithEq;
00783     std::string mySeqWithoutEq;
00784 
00785     // The length of the alignment.
00786     int32_t myAlignmentLength;
00787     // Unclipped alignment positions.
00788     int32_t myUnclippedStartOffset;
00789     int32_t myUnclippedEndOffset;
00790     
          // NOTE(review): presumably holds the parsed form of the cigar - confirm.
00791     CigarRoller myCigarRoller;
00792 
00793     LongHash<int>  extras;
          // Per the note below, extras holds indices into strings, integers and
          // floats. NOTE(review): its keys are presumably those built by
          // MAKEKEY - confirm.
00794     // Note: not all values in strings, integers, and floats are always
00795     // in extras.  They will not be if the tags were removed.  Removed
00796     // tags are removed from extras, but not from strings, integers, or floats
00797     // since if one was removed from these arrays, all other entries would
00798     // need their indices updated in extras.
00799     StringArray    strings;
00800     IntArray       integers;
00801     std::vector<char> intType; // contains the type of int at same position in integers.
00802     std::vector<float> floats;
00803 
00804 
00805     // Track whether or not the buffer values are correct for
00806     // each setting.
00807     bool myIsReadNameBufferValid;
00808     bool myIsCigarBufferValid;
00809     bool myIsSequenceBufferValid;
00810     bool myIsQualityBufferValid;
00811     bool myIsTagsBufferValid;
00812     bool myIsBinValid;
00813 
          // NOTE(review): presumably point at the packed sequence and quality
          // data of the record - ownership is not visible here; confirm.
00814     unsigned char* myPackedSequence;
00815     unsigned char* myPackedQuality;
00816 
00817 
00818     SamStatus myStatus;   // NOTE(review): presumably the status returned by getStatus() - confirm.
00819 
00820     // The current translation of the sequence as it occurs in the buffer.
00821     // Only applicable if myIsSequenceBufferValid == true.
00822     SequenceTranslation myBufferSequenceTranslation;
00823 
00824 
00825     // Track the Reference.
          // NOTE(review): raw pointer; ownership is not visible here - confirm
          // that this class does not delete it.
00826     GenomeSequence* myRefPtr;
00827 
00828     // The type of translation to do when getting a sequence.
00829     SequenceTranslation mySequenceTranslation;
00830 
00831     String NOT_FOUND_TAG_STRING;
          // NOTE(review): despite the constant-style names, these two are
          // non-static, non-const members; presumably the values handed back
          // when a requested tag is not found - confirm.
00832     int NOT_FOUND_TAG_INT;
00833 
          // NOTE(review): presumably myNumWarns counts warnings issued so far
          // (shared across all instances, being static) and myMaxWarns caps
          // them - confirm.
00834     static const int myMaxWarns = 5;
00835     static int myNumWarns;
00836 };
00837 
00838 #endif
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends