SamRecord.h

00001 /*
00002  *  Copyright (C) 2010-2011  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #ifndef __SAM_RECORD_H__
00019 #define __SAM_RECORD_H__
00020 
00021 #include <stdint.h>
00022 
00023 #include "GenomeSequence.h"
00024 #include "SamStatus.h"
00025 #include "LongHash.h"
00026 #include "MathVector.h"
00027 #include "StringArray.h"
00028 #include "IntArray.h"
00029 #include "SamFileHeader.h"
00030 #include "CigarRoller.h"
00031 
00032 /// Structure of a BAM record: the fixed-length fields, followed by myData.
00033 struct bamRecordStruct
00034 {
00035 public:
00036     int32_t      myBlockSize;              // BAM block size of the record
00037     int32_t      myReferenceID;            // Reference sequence id (BAM rid)
00038     int32_t      myPosition;               // Leftmost position; presumably 0-based as in BAM -- TODO confirm
00039     uint32_t     myReadNameLength : 8, myMapQuality : 8, myBin : 16;  // Bit-fields: 8-bit read name length, 8-bit map quality, 16-bit bin
00040     uint32_t     myCigarLength : 16, myFlag : 16;  // Bit-fields: 16-bit cigar length, 16-bit flag
00041     int32_t      myReadLength;             // Length of the read
00042     int32_t      myMateReferenceID;        // Mate/next fragment's reference id
00043     int32_t      myMatePosition;           // Mate/next fragment's leftmost position; presumably 0-based -- TODO confirm
00044     int32_t      myInsertSize;             // Outer fragment length (insert size / template length)
00045     char  myData[1];  // NOTE(review): presumably the first byte of the variable-length data that follows the fixed fields -- confirm
00046 };
00047 
00048 
00049 /// Class providing an easy to use interface to get/set/operate on the
00050 /// fields in a SAM/BAM record. 
00051 class SamRecord
00052 {
00053 public:
00054     /// Settings for how to translate the sequence when a reference is
00055     /// available.  If no reference is available, no translation
00056     /// is done, whichever value is selected.
00057     enum SequenceTranslation { 
00058         NONE,   ///< Leave the sequence as is.
00059         EQUAL,  ///< Translate bases that match the reference to '='.
00060         BASES,  ///< Translate '=' to the actual base.
00061     };
00062 
00063     /// Default Constructor.
00064     SamRecord();
00065 
00066     /// Constructor that sets the error handling type.
00067     /// \param errorHandlingType how to handle errors.
00068     SamRecord(ErrorHandler::HandlingType errorHandlingType);
00069 
00070     /// Destructor
00071     ~SamRecord();
00072 
00073     /// Reset the fields of the record to a default value.
00074     /// This is not necessary when you are reading a SAM/BAM file, 
00075     /// but if you are setting fields, it is a good idea to clean
00076     /// out a record before reusing it. Clearing it allows you to 
00077     /// not have to set any empty fields. 
00078     void resetRecord();
00079 
00080     /// Returns whether or not the record is valid, setting the status to
00081     /// indicate success or failure.
00082     /// \param header SAM Header associated with the record.  Used to perform
00083     /// some validation against the header.
00084     /// \return true if the record is valid, false if not.
00085     bool isValid(SamFileHeader& header);
00086 
00087     /// Set the reference to the specified genome sequence object.
00088     /// \param reference pointer to the GenomeSequence object.
00089     void setReference(GenomeSequence* reference);
00090 
00091     /// Set the type of sequence translation to use when getting
00092     /// the sequence.  The default type (if this method is never called) is
00093     /// NONE (the sequence is left as-is).  Can be over-ridden by using 
00094     /// the accessors that take a SequenceTranslation parameter.
00095     /// \param translation type of sequence translation to use.
00096     void setSequenceTranslation(SequenceTranslation translation);
00097 
00098     ///////////////////////
00099     /// @name  Set Alignment Data
00100     /// Set methods for record fields.  All of the "set" methods set the
00101     /// status to indicate success or the failure reason.
00102     //@{
00103 
00104     /// Set QNAME to the passed in name.
00105     /// \param readName the readname to set the QNAME to.
00106     /// \return true if successfully set, false if not.
00107     bool setReadName(const char* readName);
00108 
00109     /// Set the bitwise FLAG to the specified value.
00110     /// \param flag integer flag to use.
00111     /// \return true if successfully set, false if not.
00112     bool setFlag(uint16_t flag);
00113     
00114     /// Set the reference sequence name (RNAME) to the specified name, using
00115     /// the header to determine the reference id.
00116     /// \param header SAM/BAM header to use to determine the reference id.
00117     /// \param referenceName reference name to use.
00118     /// \return true if successfully set, false if not
00119     bool setReferenceName(SamFileHeader& header, 
00120                           const char* referenceName);
00121 
00122     /// Set the leftmost position (POS) using the specified 1-based (SAM format)
00123     /// value.
00124     /// Internal processing handles the switching between SAM/BAM formats 
00125     /// when read/written.
00126     /// \param position 1-based start position
00127     /// \return true if successfully set, false if not.
00128     bool set1BasedPosition(int32_t position);
00129 
00130     /// Set the leftmost position using the specified 0-based (BAM format)
00131     /// value.
00132     /// Internal processing handles the switching between SAM/BAM formats 
00133     /// when read/written.
00134     /// \param position 0-based start position
00135     /// \return true if successfully set, false if not.
00136     bool set0BasedPosition(int32_t position);
00137 
00138     /// Set the mapping quality (MAPQ).
00139     /// \param mapQuality map quality to set in the record.
00140     /// \return true if successfully set, false if not.
00141     bool setMapQuality(uint8_t mapQuality);
00142 
00143     /// Set the CIGAR to the specified SAM formatted cigar string.
00144     /// Internal processing handles the switching between SAM/BAM formats 
00145     /// when read/written.
00146     /// \param cigar string containing the SAM formatted cigar.
00147     /// \return true if successfully set, false if not.
00148     bool setCigar(const char* cigar);
00149 
00150     /// Set the CIGAR to the specified Cigar object.
00151     /// Internal processing handles the switching between SAM/BAM formats 
00152     /// when read/written.
00153     /// \param cigar object to set this record's cigar to have.
00154     /// \return true if successfully set, false if not.
00155     bool setCigar(const Cigar& cigar);
00156 
00157 
00158     /// Set the mate/next fragment's reference sequence name (RNEXT) to the
00159     /// specified name, using the header to determine the mate reference id.
00160     /// \param header SAM/BAM header to use to determine the mate reference id.
00161     /// \param mateReferenceName mate reference name to use.
00162     /// \return true if successfully set, false if not
00163     bool setMateReferenceName(SamFileHeader& header,
00164                               const char* mateReferenceName);
00165 
00166     /// Set the mate/next fragment's leftmost position (PNEXT) using the
00167     /// specified 1-based (SAM format) value.
00168     /// Internal processing handles the switching between SAM/BAM formats 
00169     /// when read/written.
00170     /// \param matePosition 1-based start position of the mate/next fragment
00171     /// \return true if successfully set, false if not.
00172     bool set1BasedMatePosition(int32_t matePosition);
00173 
00174     /// Set the mate/next fragment's leftmost position using the specified
00175     /// 0-based (BAM format) value.
00176     /// Internal processing handles the switching between SAM/BAM formats 
00177     /// when read/written.
00178     /// \param matePosition 0-based start position of the mate/next fragment
00179     /// \return true if successfully set, false if not.
00180     bool set0BasedMatePosition(int32_t matePosition);
00181 
00182     /// Sets the inferred insert size (ISIZE)/observed template length (TLEN).
00183     /// \param insertSize inferred insert size/observed template length.
00184     /// \return true if successfully set, false if not.
00185     bool setInsertSize(int32_t insertSize);
00186 
00187     /// Sets the sequence (SEQ) to the specified SAM formatted sequence string.
00188     ///  Internal processing handles switching between SAM/BAM formats when
00189     /// read/written.
00190     /// \param seq SAM sequence string.  May contain '='.
00191     /// \return true if successfully set, false if not.
00192     bool setSequence(const char* seq);
00193 
00194     /// Sets the quality (QUAL) to the specified SAM formatted quality string.
00195     /// Internal processing handles switching between SAM/BAM formats when
00196     /// read/written.
00197     /// \param quality SAM quality string.
00198     /// \return true if successfully set, false if not.
00199     bool setQuality(const char* quality);
00200 
00201     /// Shift the indels (if any) to the left by updating the CIGAR.
00202     /// \return true if the cigar was shifted, false if not.
00203     bool shiftIndelsLeft();
00204 
00205     /// Sets the SamRecord to contain the information in the BAM formatted
00206     /// fromBuffer.
00207     /// \param fromBuffer buffer to read the BAM record from.
00208     /// \param fromBufferSize size of the buffer containing the BAM record.
00209     /// \param header BAM header for the record.
00210     /// \return status of reading the BAM record from the buffer.
00211     SamStatus::Status setBuffer(const char* fromBuffer, uint32_t fromBufferSize,
00212                                 SamFileHeader& header);
00213 
00214     /// Read the BAM record from a file.
00215     /// \param filePtr file to read the buffer from.
00216     /// \param header BAM header for the record.
00217     /// \return status of reading the BAM record from the file.
00218     SamStatus::Status setBufferFromFile(IFILE filePtr, SamFileHeader& header);
00219 
00220     //@}
00221 
00222     ///////////////////////
00223     /// @name  Set Tag Data
00224     /// Set methods for tags.
00225     //@{
00226 
00227     /// Add the specified integer tag to the record.  Internal processing
00228     /// handles switching between SAM/BAM formats when read/written and 
00229     /// determining the type for BAM format.  If the tag is already there
00230     /// this code will replace it if the specified value is different.
00231     /// \param tag two character tag to be added to the SAM/BAM record.
00232     /// \param value value for the specified tag.
00233     /// \return true if the tag was successfully added, false otherwise.
00234     bool addIntTag(const char* tag, int32_t value);
00235 
00236     /// Add the specified tag,vtype,value to the record.  Vtype can be SAM/BAM
00237     /// format.  Internal processing handles switching between SAM/BAM formats
00238     /// when read/written.  If the tag is already there this code will replace
00239     /// it if the specified value is different.
00240     /// \param tag two character tag to be added to the SAM/BAM record.
00241     /// \param vtype vtype of the specified value - either SAM/BAM vtypes.
00242     /// \param value value as a string for the specified tag.
00243     /// \return true if the tag was successfully added, false otherwise.
00244     bool addTag(const char* tag, char vtype, const char* value);
00245 
00246     /// Clear the tags in this record.
00247     /// Does not set SamStatus.
00248     void clearTags();
00249    
00250     /// Remove a tag.
00251     /// \param tag tag to remove.
00252     /// \param type type of the tag to be removed.
00253     /// \return true if the tag no longer exists in the record, false if it could not be removed (Returns true if the tag was not found in the record).
00254     bool rmTag(const char* tag, char type);
00255 
00256     /// Remove tags.
00257     /// \param tags tags to remove, formatted as  Tag:Type;Tag:Type;Tag:Type...
00258     /// \return true if all tags no longer exist in the record, false if any could not be removed
00259     /// (Returns true if the tags were not found in the record).
00260     /// SamStatus is set to INVALID if the tags are incorrectly formatted.
00261     bool rmTags(const char* tags);
00262 
00263     //@}
00264 
00265     ///////////////////////
00266     /// @name  Get Alignment Data
00267     /// Get methods for record fields.  All of the "get" methods set the
00268     /// status to indicate success or the failure reason.
00269     //@{
00270 
00271     /// Get a const pointer to the buffer that contains the BAM representation
00272     /// of the record.
00273     /// \return const pointer to the buffer that contains the BAM representation
00274     /// of the record.
00275     const void* getRecordBuffer();
00276 
00277     /// Get a const pointer to the buffer that contains the BAM representation
00278     /// of the record using the specified translation on the sequence.
00279     /// \param translation type of sequence translation to use.
00280     /// \return const pointer to the buffer that contains the BAM representation
00281     /// of the record.
00282     const void* getRecordBuffer(SequenceTranslation translation);
00283 
00284     /// Write the record as a BAM into the specified already opened file.
00285     /// \param filePtr file to write the BAM record into.
00286     /// \return status of the write.
00287     SamStatus::Status writeRecordBuffer(IFILE filePtr);
00288 
00289     /// Write the record as a BAM into the specified already opened file using
00290     /// the specified translation on the sequence.
00291     /// \param filePtr file to write the BAM record into.
00292     /// \param translation type of sequence translation to use.
00293     /// \return status of the write.
00294     SamStatus::Status writeRecordBuffer(IFILE filePtr, 
00295                                         SequenceTranslation translation);
00296 
00297     /// Get the block size of the record (BAM format).
00298     /// \return BAM block size of the record.
00299     int32_t getBlockSize();
00300 
00301     /// Get the reference sequence name (RNAME) of the record.
00302     /// \return reference sequence name
00303     const char* getReferenceName();
00304 
00305     /// Get the reference sequence id of the record (BAM format rid).
00306     /// \return reference sequence id
00307     int32_t getReferenceID();
00308 
00309     /// Get the 1-based(SAM) leftmost position (POS) of the record.
00310     /// \return 1-based leftmost position.
00311     int32_t get1BasedPosition();
00312  
00313     /// Get the 0-based(BAM) leftmost position of the record.
00314     /// \return 0-based leftmost position.
00315    int32_t get0BasedPosition();
00316 
00317     /// Get the length of the readname (QNAME) including the null.
00318     /// \return length of the read name (including null).
00319     uint8_t getReadNameLength();
00320 
00321     /// Get the mapping quality (MAPQ) of the record.
00322     /// \return map quality.
00323     uint8_t getMapQuality();
00324 
00325     /// Get the BAM bin for the record.
00326     /// \return BAM bin
00327     uint16_t getBin();
00328 
00329     /// Get the length of the BAM formatted CIGAR.
00330     /// \return length of BAM formatted cigar.
00331     uint16_t getCigarLength();
00332 
00333     /// Get the flag (FLAG).
00334     /// \return flag.
00335     uint16_t getFlag();
00336 
00337     /// Get the length of the read.
00338     /// \return read length.
00339     int32_t getReadLength();
00340 
00341     /// Get the mate/next fragment's reference sequence name (RNEXT).  If it
00342     /// is equal to the reference name, it still returns the reference name.
00343     /// \return reference sequence name
00344     const char* getMateReferenceName();
00345 
00346     /// Get the mate/next fragment's reference sequence name (RNEXT),
00347     /// returning "=" if it is the same as the reference name, unless 
00348     /// they are both "*" in which case "*" is returned.
00349     /// \return reference sequence name or '='
00350     const char* getMateReferenceNameOrEqual();
00351 
00352     /// Get the mate reference id of the record
00353     /// (BAM format: mate_rid/next_refID).
00354     /// \return reference id
00355     int32_t getMateReferenceID();
00356 
00357     /// Get the 1-based(SAM) leftmost mate/next fragment's position (PNEXT).
00358     /// \return 1-based leftmost position.
00359     int32_t get1BasedMatePosition();
00360 
00361     /// Get the 0-based(BAM) leftmost mate/next fragment's position.
00362     /// \return 0-based leftmost position.
00363     int32_t get0BasedMatePosition();
00364 
00365     /// Get the inferred insert size of the read pair (ISIZE) or
00366     /// observed template length (TLEN).
00367     /// \return inferred insert size or observed template length.
00368     int32_t getInsertSize();
00369 
00370     /// Returns the 0-based inclusive rightmost position of the
00371     /// clipped sequence.
00372     /// \return 0-based inclusive rightmost position
00373     int32_t get0BasedAlignmentEnd();
00374 
00375     /// Returns the 1-based inclusive rightmost position of the
00376     /// clipped sequence.
00377     /// \return 1-based inclusive rightmost position
00378     int32_t get1BasedAlignmentEnd();
00379    
00380     /// Returns the length of the clipped sequence, returning 0 if the cigar
00381     /// is '*'.
00382     /// \return length of the clipped sequence.
00383     int32_t getAlignmentLength();
00384 
00385     /// Returns the 0-based inclusive left-most position adjusted for
00386     /// clipped bases.
00387     /// \return 0-based inclusive leftmost position including clips.
00388     int32_t get0BasedUnclippedStart();
00389 
00390     /// Returns the 1-based inclusive left-most position adjusted for
00391     /// clipped bases.
00392     /// \return 1-based inclusive leftmost position including clips.
00393     int32_t get1BasedUnclippedStart();
00394 
00395     /// Returns the 0-based inclusive right-most position adjusted for
00396     /// clipped bases.
00397     /// \return 0-based inclusive rightmost position including clips.
00398     int32_t get0BasedUnclippedEnd();
00399  
00400     /// Returns the 1-based inclusive right-most position adjusted for
00401     /// clipped bases.
00402     /// \return 1-based inclusive rightmost position including clips.
00403     int32_t get1BasedUnclippedEnd();
00404 
00405     /// Returns the SAM formatted Read Name (QNAME).
00406     /// \return read name.
00407     const char* getReadName();
00408 
00409     /// Returns the SAM formatted CIGAR string.
00410     /// \return cigar string.
00411     const char* getCigar();
00412 
00413     /// Returns the SAM formatted sequence string (SEQ), translating the base as
00414     /// specified by setSequenceTranslation.
00415     /// \return sequence string.
00416     const char* getSequence();
00417 
00418     /// Returns the SAM formatted sequence string (SEQ) performing the specified
00419     /// sequence translation.
00420     /// \param translation type of sequence translation to use.
00421     /// \return sequence string.
00422     const char* getSequence(SequenceTranslation translation);
00423 
00424     /// Returns the SAM formatted quality string (QUAL).
00425     /// \return quality string.
00426     const char* getQuality();
00427 
00428     /// Get the sequence base at the specified index into this sequence 0 to
00429     /// readLength - 1, translating the base as specified by
00430     /// setSequenceTranslation.  Throws an exception if index is out of range.
00431     /// \param index index into the sequence string (0 to readLength-1).
00432     /// \return the sequence base at the specified index into the sequence.
00433     char getSequence(int index);
00434     
00435     /// Get the sequence base at the specified index into this sequence 0 to
00436     /// readLength - 1 performing the specified sequence translation. 
00437     /// Throws an exception if index is out of range.
00438     /// \param index index into the sequence string (0 to readLength-1).
00439     /// \param translation type of sequence translation to use.
00440     /// \return the sequence base at the specified index into the sequence.
00441     char getSequence(int index, SequenceTranslation translation);
00442     
00443     /// Get the quality character at the specified index into the quality 0 to
00444     /// readLength - 1.  Throws an exception if index is out of range.
00445     /// \param index index into the quality string (0 to readLength-1).
00446     /// \return the quality character at the specified index into the quality.
00447     char getQuality(int index);
00448    
00449     /// Returns a pointer to the Cigar object associated with this record.  
00450     /// The object is essentially read-only, only allowing modifications 
00451     /// due to lazy evaluations.
00452     /// \return pointer to the Cigar object.
00453     Cigar* getCigarInfo();
00454 
00455     /// Return the number of bases in this read that overlap the passed in
00456     /// region.  Matches & mismatches between the read and the reference
00457     /// are counted as overlaps, but insertions, deletions, skips, clips, and
00458     /// pads are not counted.
00459     /// \param start inclusive 0-based start position (reference position) of
00460     ///              the region to check for overlaps in.
00461     ///              (-1 indicates to start at the beginning of the reference.)
00462     /// \param end   exclusive 0-based end position (reference position) of the
00463     ///              region to check for overlaps in.
00464     ///              (-1 indicates to go to the end of the reference.)
00465     /// \return number of overlapping bases
00466     uint32_t getNumOverlaps(int32_t start, int32_t end);
00467 
00468     /// Returns the values of all fields except the tags.
00469     /// \param recStruct structure containing the contents of all 
00470     /// non-variable length fields.
00471     /// \param readName read name from the record (return param)
00472     /// \param cigar cigar string from the record (return param)
00473     /// \param sequence sequence string from the record (return param)
00474     /// \param quality quality string from the record (return param)
00475     /// \return true if all fields were successfully set, false otherwise.
00476     bool getFields(bamRecordStruct& recStruct, String& readName, 
00477                    String& cigar, String& sequence, String& quality);
00478 
00479     /// Returns the values of all fields except the tags using the specified
00480     /// sequence translation.
00481     /// \param recStruct structure containing the contents of all 
00482     /// non-variable length fields.
00483     /// \param readName read name from the record (return param)
00484     /// \param cigar cigar string from the record (return param)
00485     /// \param sequence sequence string from the record (return param)
00486     /// \param quality quality string from the record (return param)
00487     /// \param translation type of sequence translation to use.
00488     /// \return true if all fields were successfully set, false otherwise.
00489     bool getFields(bamRecordStruct& recStruct, String& readName, 
00490                    String& cigar, String& sequence, String& quality,
00491                    SequenceTranslation translation);
00492 
00493     /// Returns a pointer to the genome sequence object associated with this
00494     /// record if it was set (NULL if it was not set).
00495     /// \return pointer to the GenomeSequence object or NULL if there isn't one.
00496     GenomeSequence* getReference();
00497 
00498     //@}
00499 
00500     ///////////////////////
00501     /// @name  Get Tag Methods
00502     /// Get methods for obtaining information on tags.
00503     //@{
00504 
00505     /// Returns the length of the BAM formatted tags.
00506     /// \return length of the BAM formatted tags.
00507     uint32_t getTagLength();
00508 
00509     /// Get the next tag from the record.
00510     /// Sets the Status to SUCCESS when a tag is successfully returned or
00511     /// when there are no more tags.  Otherwise the status is set to describe
00512     /// why it failed (parsing, etc).
00513     /// \param tag set to the tag when a tag is read.
00514     /// \param vtype set to the vtype when a tag is read.
00515     /// \param value pointer to the value of the tag (will need to cast
00516     /// to int, double, char, or string based on vtype).
00517     /// \return true if a tag was read, false if there are no more tags.
00518     bool getNextSamTag(char* tag, char& vtype, void** value);
00519 
00520     /// Reset the tag iterator to the beginning of the tags.
00521     void resetTagIter();
00522  
00523     /// Returns whether or not the specified vtype is an integer type.
00524     /// Does not set SamStatus.
00525     /// \param vtype value type to check.
00526     /// \return true if the passed in vtype is an integer ('c', 'C', 's',
00527     /// 'S', 'i', 'I'), false otherwise.
00528     bool isIntegerType(char vtype) const;
00529 
00530     /// Returns whether or not the specified vtype is a double type.
00531     /// Does not set SamStatus.
00532     /// \param vtype value type to check.
00533     /// \return true if the passed in vtype is a double ('f'), false otherwise.
00534     bool isDoubleType(char vtype) const;
00535 
00536     /// Returns whether or not the specified vtype is a char type.
00537     /// Does not set SamStatus.
00538     /// \param vtype value type to check.
00539     /// \return true if the passed in vtype is a char ('A'), false otherwise.
00540     bool isCharType(char vtype) const;
00541 
00542     /// Returns whether or not the specified vtype is a string type.
00543     /// Does not set SamStatus.
00544     /// \param vtype value type to check.
00545     /// \return true if the passed in vtype is a string ('Z'), false otherwise.
00546     bool isStringType(char vtype) const;
00547 
00548     /// Get the string representation of the tags from the record, formatted
00549     /// as TAG:TYPE:VALUE<delim>TAG:TYPE:VALUE...
00550     /// Sets the Status to SUCCESS when the tags are successfully returned or
00551     /// the tags were not found.  If a different error occurred, the status is
00552     /// set appropriately.
00553     /// \param tags the tags to retrieve, formatted as TAG:TYPE;TAG:TYPE...
00554     /// \param returnString the String to set (this method first clears returnString)
00555     ///                     to TAG:TYPE:VALUE<delim>TAG:TYPE:VALUE...
00556     /// \param delim delimiter to use to separate two tags, default is a tab.
00557     /// \return true if there were not any errors even if no tags were found.
00558     bool getTagsString(const char* tags, String& returnString, char delim = '\t');
00559 
00560     /// Get the string value for the specified tag.
00561     /// \param tag tag to retrieve
00562     /// \return pointer to the tag's string value if found, NULL if not found.
00563     String* getStringTag(const char * tag);
00564 
00565     /// Get the integer value for the specified tag.
00566     /// \param tag tag to retrieve
00567     /// \return pointer to the tag's integer value if found, NULL if not found.
00568     int* getIntegerTag(const char * tag);
00569 
00570     /// Get the double value for the specified tag.
00571     /// \param tag tag to retrieve
00572     /// \return pointer to the tag's double value if found, NULL if not found.
00573     double* getDoubleTag(const char * tag);
00574 
00575     /// Get the string value for the specified tag.
00576     String & getString(const char * tag);
00577 
00578     /// Get the integer value for the specified tag.
00579     int &    getInteger(const char * tag);
00580 
00581     /// Get the double value for the specified tag.
00582     double & getDouble(const char * tag);
00583 
00584     /// Check if the specified tag contains a string (vtype 'Z').
00585     /// Does not set SamStatus.
00586     /// \param tag SAM tag to check contents of.
00587     /// \return true if the value associated with the tag is a string.
00588     bool checkString(const char * tag)    { return checkTag(tag, 'Z'); }
00589     
00590     /// Check if the specified tag contains an integer (checked as vtype 'i').
00591     /// Does not set SamStatus.
00592     /// \param tag SAM tag to check contents of.
00593     /// \return true if the value associated with the tag is an integer.
00594     bool checkInteger(const char * tag)   { return checkTag(tag, 'i'); }
00595     
00596     /// Check if the specified tag contains a double (vtype 'f').
00597     /// Does not set SamStatus.
00598     /// \param tag SAM tag to check contents of.
00599     /// \return true if the value associated with the tag is a double.
00600     bool checkDouble(const char * tag)    { return checkTag(tag, 'f'); }
00601      
00602     /// Check if the specified tag contains a value of the specified vtype.
00603     /// Does not set SamStatus.
00604     /// \param tag SAM tag to check contents of.
00605     /// \param type value type to check if the SAM tag matches.
00606     /// \return true if the value associated with the tag is of the specified vtype.
00607    bool checkTag(const char * tag, char type);
00608     //@}
00609 
00610     /// Returns the status associated with the last method that sets the status.
00611     /// \return SamStatus of the last command that sets status.
00612     const SamStatus& getStatus();
00613     
00614 
00615     
00616 
00617 private:
00618     static int MAKEKEY(char ch1, char ch2, char type)  // Builds an integer key from a tag's two characters (ch1, ch2) and its value type.
00619     { return (getKeyType(type) << 16) + (ch2 << 8) + ch1; }  // key = (getKeyType(type) << 16) + (ch2 << 8) + ch1. NOTE(review): chars are promoted to int before shifting; presumably tags are plain ASCII so no negative values occur -- confirm.
00620 
00621     static char getKeyType(char type)  // Returns the type character used in keys: 'A' and the integer vtypes all map to 'i'; any other type is returned unchanged.
00622     {
00623         switch(type)
00624         {
00625             // For any char/integer type, return 'i'
00626             case 'A' :
00627             case 'c' :
00628             case 'C' :
00629             case 's' :
00630             case 'S' :
00631             case 'i' :
00632             case 'I' :
00633                 return('i');
00634                 break;  // Never reached: the return above always exits first.
00635             default:
00636                 // For all other types, return the actual type.
00637                 return(type);
00638         };  // The trailing ';' is just an empty statement after the switch.
00639     }
00640 
00641     // Allocate space for the record - does a realloc.  
00642     // The passed in size is the size of the entire record including the
00643     // block size field.
00644     // Adds any errors to myStatus.
00645     bool allocateRecordStructure(int size);
00646 
00647     void* getStringPtr(int offset);
00648     void* getIntegerPtr(int offset, char& vtype);
00649     void* getDoublePtr(int offset);
00650 
00651     // Fixes the buffer to match the variable length fields.
00652     // Adds any errors to myStatus.
00653     bool fixBuffer(SequenceTranslation translation);
00654 
00655     // Sets the Sequence and Quality strings from the buffer.
00656     // They are done together in one method because they require the same
00657     // loop, so might as well be done at the same time.
00658     // Adds any errors to myStatus.
00659     void setSequenceAndQualityFromBuffer();
00660 
00661     // Parse the cigar to calculate the alignment/unclipped ends and convert
00662     // to SAM/BAM format.
00663     // Adds any errors to myStatus.
00664     bool parseCigar();
00665     // Parse the binary (BAM formatted) cigar to calculate the cigar length and
00666     // alignment end and convert to SAM format.
00667     // Adds any errors to myStatus.
00668     bool parseCigarBinary();
00669     // Parse the cigar string to calculate the cigar length and alignment end
00670     // and convert to BAM format.
00671     // Adds any errors to myStatus.
00672     bool parseCigarString();
00673 
00674     // Set the tags from the buffer.
00675     // Adds any errors to myStatus.
00676     bool setTagsFromBuffer();
00677 
00678     // Set the tags in the buffer.
00679     // Adds any errors to myStatus.
00680     bool setTagsInBuffer();
00681 
00682     void setVariablesForNewBuffer(SamFileHeader& header);
00683 
00684     void getTypeFromKey(int key, char& type) const;
00685     void getTag(int key, char* tag) const;
00686 
00687     String & getString(int offset);
00688     int &    getInteger(int offset);
00689     char &   getIntegerType(int offset);
00690     double & getDouble(int offset);
00691 
00692     static const int DEFAULT_BLOCK_SIZE = 40;
00693     static const int DEFAULT_BIN = 4680;
00694     static const int DEFAULT_READ_NAME_LENGTH = 8;
00695     static const char* DEFAULT_READ_NAME;
00696     static const char* FIELD_ABSENT_STRING;
00697 
00698     bamRecordStruct * myRecordPtr;
00699     int allocatedSize;
00700 
00701     // Pointer to a temporary cigar buffer that can be used during string
00702     // parsing before it is ready to be copied into the actual record.
00703     uint32_t* myCigarTempBuffer;
00704 
00705     // Size of the currently allocated temporary cigar buffer.
00706     int myCigarTempBufferAllocatedSize;
00707 
00708     // Length of the cigar currently contained in the temporary buffer.
00709     int myCigarTempBufferLength;
00710 
00711     // Track if the buffer is in sync with the Strings/Tags.
00712     // Set to false if any of the variable length fields are modified.
00713     // Set to true when the buffer is updated to match the variable length
00714     // fields.
00715     bool myIsBufferSynced;
00716 
00717     // Track if the tags need to be set from the buffer.
00718     bool myNeedToSetTagsFromBuffer;
00719 
00720     // Track if the tags need to be set in the buffer.
00721     // Allows you to set just the tags if they are the only thing that changed
00722     // in the buffer.
00723     bool myNeedToSetTagsInBuffer;
00724 
00725     int myTagBufferSize;
00726     int myLastTagIndex;
00727 
00728     String myReadName;
00729     String myReferenceName;
00730     String myMateReferenceName;
00731     String myCigar;
00732     String mySequence;
00733     String myQuality;
00734 
00735     std::string mySeqWithEq;
00736     std::string mySeqWithoutEq;
00737 
00738     // The length of the alignment.
00739     int32_t myAlignmentLength;
00740     // Unclipped alignment positions.
00741     int32_t myUnclippedStartOffset;
00742     int32_t myUnclippedEndOffset;
00743     
00744     CigarRoller myCigarRoller;
00745 
00746     LongHash<int>  extras;
00747     // Note: not all values in strings, integers, and doubles are always
00748     // in extras.  They will not be if the tags were removed.  Removed
00749     // tags are removed from extras, but not from strings, integers, or doubles
00750     // since if one was removed from these arrays, all other entries would
00751     // need their indices updated in extras.
00752     StringArray    strings;
00753     IntArray       integers;
00754     std::vector<char> intType; // contains the type of int at same position in integers.
00755     Vector         doubles;
00756 
00757 
00758     // Track whether or not the buffer values are correct for
00759     // each setting.
00760     bool myIsReadNameBufferValid;
00761     bool myIsCigarBufferValid;
00762     bool myIsSequenceBufferValid;
00763     bool myIsQualityBufferValid;
00764     bool myIsTagsBufferValid;
00765     bool myIsBinValid;
00766 
00767     SamStatus myStatus;
00768 
00769     // The current translation of the sequence as it occurs in the buffer.
00770     // Only applicable if myIsSequenceBufferValid == true.
00771     SequenceTranslation myBufferSequenceTranslation;
00772 
00773 
00774     // Track the Reference.
00775     GenomeSequence* myRefPtr;
00776 
00777     // The type of translation to do when getting a sequence.
00778     SequenceTranslation mySequenceTranslation;
00779 
00780     String NOT_FOUND_TAG_STRING;
00781     int NOT_FOUND_TAG_INT;
00782     double NOT_FOUND_TAG_DOUBLE;
00783 };
00784 
00785 #endif
Generated on Tue Sep 6 17:51:59 2011 for libStatGen Software by  doxygen 1.6.3