SamRecord.h

00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #ifndef __SAM_RECORD_H__
00019 #define __SAM_RECORD_H__
00020 
00021 #include <stdint.h>
00022 
00023 #include "SamStatus.h"
00024 #include "LongHash.h"
00025 #include "MathVector.h"
00026 #include "StringArray.h"
00027 #include "IntArray.h"
00028 #include "SamFileHeader.h"
00029 #include "CigarRoller.h"
00030 
00031 struct bamRecordStruct
00032 {
      // Fixed-size leading fields of one alignment record, followed by the
      // start of its variable-length data (myData).  SamRecord refers to its
      // record buffer through a pointer to this struct (myRecordPtr).
      // NOTE(review): the member order and widths look like they mirror the
      // BAM alignment-record layout - confirm against the SAM/BAM spec
      // before reordering, resizing or inserting any member.
00033 public:
00034     int32_t      myBlockSize;              // Block size of the record
00035     int32_t      myReferenceID;            // Reference ID
00036     int32_t      myPosition;               // Position (presumably stored 0-based - confirm)
      // Three values packed into one 32-bit unit: 8 + 8 + 16 bits.
      // NOTE(review): how bit-fields are allocated within the unit is
      // implementation-defined in C++, so the in-memory order of these
      // values depends on the compiler/target - verify on each platform.
00037     uint32_t     myReadNameLength : 8, myMapQuality : 8, myBin : 16;
      // Two values packed into one 32-bit unit: 16 + 16 bits (same
      // implementation-defined allocation caveat as above).
00038     uint32_t     myCigarLength : 16, myFlag : 16;
00039     int32_t      myReadLength;             // Read length
00040     int32_t      myMateReferenceID;        // Mate's reference ID
00041     int32_t      myMatePosition;           // Mate's position (presumably 0-based - confirm)
00042     int32_t      myInsertSize;             // Outer fragment length
      // Start of the data that follows the fixed fields.
      // NOTE(review): declared with length 1; presumably the record is
      // allocated larger than sizeof(bamRecordStruct) and this array is
      // indexed past its declared bound - confirm where the record is
      // allocated.
00043     char  myData[1];
00044 };
00045 
00046 class SamRecord
00047 {
      // One SAM/BAM alignment record.  The record is held both as a buffer
      // (myRecordPtr) and as String/tag members for the variable-length
      // fields; the my*Valid / myIsBufferSynced flags below track whether
      // the two representations currently agree.
      //
      // NOTE(review): the class holds raw pointers (myRecordPtr,
      // myCigarTempBuffer) and declares a destructor, but declares no copy
      // constructor or copy assignment, so the compiler-generated ones copy
      // the pointers.  If the destructor releases those buffers, copying a
      // SamRecord would free them twice - confirm in the implementation.
00048 public:
00049    
00050     /// Default Constructor.
00051     SamRecord();
00052 
00053     /// Constructor that sets the error handling type.
00054     /// \param errorHandlingType how to handle errors.
          // NOTE(review): single-argument and not declared explicit, so an
          // ErrorHandler::HandlingType value converts implicitly to SamRecord.
00055     SamRecord(ErrorHandler::HandlingType errorHandlingType);
00056 
          /// Destructor.
00057     ~SamRecord();
00058 
00059     // Reset the fields of the record to a default value.
00060     void resetRecord();
00061     // Reset the tag iterator to the beginning of the tags.
00062     void resetTagIter();
00063  
00064     // Returns whether or not the record is valid.
00065     // The header is needed to perform some of the validation against it.
00066     // Sets the status to indicate success or failure.
00067     bool isValid(SamFileHeader& header);
00068 
00069     ///////////////////////
00070     // Set alignment data
00071     ///////////////////////
00072     // Set methods for record fields.  All of the "set" methods set the
00073     // status to indicate success or the failure reason.
00074     bool setReadName(const char* readName);
00075     bool setFlag(uint16_t flag);
00076     bool setReferenceName(SamFileHeader& header, 
00077                           const char* referenceName);
00078     bool set1BasedPosition(int32_t position);
00079     bool set0BasedPosition(int32_t position);
00080     bool setMapQuality(uint8_t mapQuality);
00081     bool setCigar(const char* cigar);
00082     bool setCigar(const Cigar& cigar);
00083     bool setMateReferenceName(SamFileHeader& header,
00084                               const char* mateReferenceName);
00085     bool set1BasedMatePosition(int32_t matePosition);
00086     bool set0BasedMatePosition(int32_t matePosition);
00087     bool setInsertSize(int32_t insertSize);
00088     bool setSequence(const char* seq);
00089     bool setQuality(const char* quality);
00090 
00091     // Read the BAM record from a file.
00092     SamStatus::Status setBufferFromFile(IFILE filePtr, SamFileHeader& header);
00093 
00094     // Set the BAM record from the passed-in buffer (fromBuffer) whose size
          // is given by fromBufferSize.
00095     SamStatus::Status setBuffer(const char* fromBuffer, uint32_t fromBufferSize,
00096                                 SamFileHeader& header);
00097 
00098     // Add the specified tag to the record.
00099     // Returns true if the tag was successfully added, false otherwise.
00100     // Sets the status.
00101     bool addTag(const char* tag, char vtype, const char* value);
00102 
00103     // Get methods for record fields.  All of the "get" methods set the
00104     // status to indicate success or the failure reason.
00105     const void* getRecordBuffer();
00106     SamStatus::Status writeRecordBuffer(IFILE filePtr);
00107     int32_t getBlockSize();
00108     const char* getReferenceName();
00109     int32_t getReferenceID();
00110     int32_t get1BasedPosition();
00111     int32_t get0BasedPosition();
00112     uint8_t getReadNameLength();
00113     uint8_t getMapQuality();
00114     uint16_t getBin();
00115     uint16_t getCigarLength();
00116     uint16_t getFlag();
00117     int32_t getReadLength();
00118 
00119     // This method returns the mate reference name.  If it is equal to the
00120     // reference name, it still returns the reference name.
00121     const char* getMateReferenceName();
00122 
00123     // This method returns the mate reference name.  If it is equal to the
00124     // reference name, it returns "=", unless they are both "*" in which case
00125     // "*" is returned.
00126     const char* getMateReferenceNameOrEqual();
00127     int32_t getMateReferenceID();
00128     int32_t get1BasedMatePosition();
00129     int32_t get0BasedMatePosition();
00130     int32_t getInsertSize();
00131 
00132     // Returns the inclusive rightmost position of the clipped sequence.
00133     int32_t get0BasedAlignmentEnd();
00134     int32_t get1BasedAlignmentEnd();
00135    
00136     // Return the length of the alignment.
00137     int32_t getAlignmentLength();
00138 
00139     // Returns the inclusive left-most position adjusted for clipped bases.
00140     int32_t get0BasedUnclippedStart();
00141     int32_t get1BasedUnclippedStart();
00142     // Returns the inclusive right-most position adjusted for clipped bases.
00143     int32_t get0BasedUnclippedEnd();
00144     int32_t get1BasedUnclippedEnd();
00145 
00146     const char* getReadName();
00147     const char* getCigar();
00148     const char* getSequence();
00149     const char* getQuality();
00150 
00151     // Get the sequence base at the specified index into this sequence 0 to
00152     // readLength - 1.
00153     char getSequence(int index);
00154 
00155     // Get the quality char at the specified index into this quality 0 to
00156     // readLength - 1.
00157     char getQuality(int index);
00158    
00159     // TODO - want this to be getCigar
00160     Cigar* getCigarInfo();
00161 
00162     uint32_t getTagLength();
00163 
00164     // Sets the Status to SUCCESS when a tag is successfully returned or
00165     // when there are no more tags.  Otherwise the status is set to describe
00166     // why it failed (parsing, etc).
          // NOTE(review): tag, vtype and value look like output parameters and
          // the bool presumably tells whether a tag was returned - confirm.
00167     bool getNextSamTag(char* tag, char& vtype, void** value);
00168 
00169     // Returns the values of all fields except the tags, through the
          // passed-in references.
00170     bool getFields(bamRecordStruct& recStruct, String& readName, 
00171                    String& cigar, String& sequence, String& quality);
00172 
00173     // The following set of methods do not set the status.
00174     bool isIntegerType(char vtype) const;
00175     bool isDoubleType(char vtype) const;
00176     bool isCharType(char vtype) const;
00177     bool isStringType(char vtype) const;
00178 
00179     // This method does not set the status.
00180     void clearTags();
00181    
00182     // Returns the status associated with the last method
00183     // that sets the status.
00184     const SamStatus& getStatus();
00185     
00186     // The following set of methods do not set the status.
          // NOTE(review): these return mutable references; presumably the
          // NOT_FOUND_TAG_* members declared below are what is returned when
          // the tag is absent, in which case a caller writing through the
          // reference would modify that shared value - confirm.
00187     String & getString(const char * tag);
00188     int &    getInteger(const char * tag);
00189     double & getDouble(const char * tag);
00190 
00191 
00192 //     void getSamExtraFieldFromKey(int key, String& extraField);
00193     
00194     // The following set of methods do not set the status.
          // The typed checks forward to checkTag with the type characters
          // 'Z' (string), 'i' (integer) and 'f' (double).
00195     bool checkString(const char * tag)    { return checkTag(tag, 'Z'); }
00196     bool checkInteger(const char * tag)   { return checkTag(tag, 'i'); }
00197     bool checkDouble(const char * tag)    { return checkTag(tag, 'f'); }
00198     bool checkTag(const char * tag, char type);
00199 
00200 
00201     
00202     // Return the number of bases in this read that overlap the passed in
00203     // region.
00204     // start : inclusive 0-based start position (reference position) of the
00205     //         region to check for overlaps in.
00206     //         (-1 indicates to start at the beginning of the reference.)
00207     // end   : exclusive 0-based end position (reference position) of the
00208     //         region to check for overlaps in.
00209     //         (-1 indicates to go to the end of the reference.)
00210     // Returns the number of overlapping bases
00211     // (matches in the cigar - not skips/deletions)
00212     uint32_t getNumOverlaps(int32_t start, int32_t end);
00213 
00214 
00215 private:
          // Pack the two tag characters and the type character into one int:
          // ch1 in the low bits, ch2 shifted left by 8, type shifted left by 16.
00216     static int MAKEKEY(char ch1, char ch2, char type)
00217     { return (type << 16) + (ch2 << 8) + ch1; }
00218 
00219     // Allocate space for the record - does a realloc.  
00220     // The passed in size is the size of the entire record including the
00221     // block size field.
00222     // Adds any errors to myStatus.
00223     bool allocateRecordStructure(int size);
00224 
00225 
00226     void* getStringPtr(int offset);
00227     void* getIntegerPtr(int offset);
00228     void* getDoublePtr(int offset);
00229 
00230     // Fixes the buffer to match the variable length fields.
00231     // Adds any errors to myStatus.
00232     bool fixBuffer();
00233 
00234     // Sets the Sequence and Quality strings from the buffer.
00235     // They are done together in one method because they require the same
00236     // loop, so might as well be done at the same time.
00237     // Adds any errors to myStatus.
00238     void setSequenceAndQualityFromBuffer();
00239 
00240     // Parse the cigar to calculate the alignment/unclipped ends and convert
00241     // to SAM/BAM format.
00242     // Adds any errors to myStatus.
00243     bool parseCigar();
00244     // Parse the cigar string to calculate the cigar length and alignment end
00245     // and convert to SAM format.
00246     // Adds any errors to myStatus.
          // NOTE(review): the name suggests the input here is the binary cigar
          // rather than a string - confirm and adjust the wording above.
00247     bool parseCigarBinary();
00248     // Parse the cigar string to calculate the cigar length and alignment end
00249     // and convert to BAM format.
00250     // Adds any errors to myStatus.
00251     bool parseCigarString();
00252 
00253     // Set the tags from the buffer.
00254     // Adds any errors to myStatus.
00255     bool setTagsFromBuffer();
00256 
00257     // Set the tags in the buffer.
00258     // Adds any errors to myStatus.
00259     bool setTagsInBuffer();
00260 
00261     void setVariablesForNewBuffer(SamFileHeader& header);
00262 
          // Recover the type character / tag characters from a key.
          // NOTE(review): presumably the inverse of MAKEKEY - confirm.
00263     void getVtype(int key, char& vtype) const;
00264     void getTag(int key, char* tag) const;
00265 
          // Private overloads taking an offset instead of a tag name.
00266     String & getString(int offset);
00267     int &    getInteger(int offset);
00268     double & getDouble(int offset);
00269 
          // Defaults for a record.
          // NOTE(review): presumably the values applied by resetRecord() -
          // confirm in the implementation.
00270     static const int DEFAULT_BLOCK_SIZE = 40;
00271     static const int DEFAULT_BIN = 4680;
00272     static const int DEFAULT_READ_NAME_LENGTH = 8;
00273     static const char* DEFAULT_READ_NAME;
00274     static const char* FIELD_ABSENT_STRING;
00275 
          // Pointer to the record buffer, and a size associated with its
          // allocation (see allocateRecordStructure).
00276     bamRecordStruct * myRecordPtr;
00277     int allocatedSize;
00278 
00279     // Pointer to a temporary cigar buffer that can be used during string
00280     // parsing before it is ready to be copied into the actual record.
00281     uint32_t* myCigarTempBuffer;
00282 
00283     // Size of the currently allocated temporary cigar buffer.
00284     int myCigarTempBufferAllocatedSize;
00285 
00286     // Length of the cigar currently contained in the temporary buffer.
00287     int myCigarTempBufferLength;
00288 
00289     // Track if the buffer is in sync with the Strings/Tags.
00290     // Set to false if any of the variable length fields are modified.
00291     // Set to true when the buffer is updated to match the variable length
00292     // fields.
00293     bool myIsBufferSynced;
00294 
00295     // Track if the tags need to be set from the buffer.
00296     bool myNeedToSetTagsFromBuffer;
00297 
00298     // Track if the tags need to be set in the buffer.
00299     // Allows you to set just the tags if they are the only thing that changed
00300     // in the buffer.
00301     bool myNeedToSetTagsInBuffer;
00302 
00303     int myTagBufferSize;
00304     int myLastTagIndex;
00305 
          // String forms of the name and variable-length fields.
00306     String myReadName;
00307     String myReferenceName;
00308     String myMateReferenceName;
00309     String myCigar;
00310     String mySequence;
00311     String myQuality;
00312 
00313     // The length of the alignment.
00314     int32_t myAlignmentLength;
00315     // Unclipped alignment positions.
00316     int32_t myUnclippedStartOffset;
00317     int32_t myUnclippedEndOffset;
00318     
00319     CigarRoller myCigarRoller;
00320 
          // Tag storage.
          // NOTE(review): presumably extras maps a MAKEKEY key to an offset
          // into strings / integers / doubles - confirm.
00321     LongHash<int>  extras;
00322     StringArray    strings;
00323     IntArray       integers;
00324     Vector         doubles;
00325 
00326 
00327     // Track whether or not the buffer values are correct for
00328     // each setting.
00329     bool myIsReadNameBufferValid;
00330     bool myIsCigarBufferValid;
00331     bool myIsSequenceBufferValid;
00332     bool myIsQualityBufferValid;
00333     bool myIsTagsBufferValid;
00334     bool myIsBinValid;
00335 
          // Status set by the methods documented above as setting the status;
          // returned by getStatus().
00336     SamStatus myStatus;
00337 
          // Per-object "not found" values for the tag getters.
          // NOTE(review): presumably returned by getString / getInteger /
          // getDouble when the requested tag is absent - confirm.
00338     String NOT_FOUND_TAG_STRING;
00339     int NOT_FOUND_TAG_INT;
00340     double NOT_FOUND_TAG_DOUBLE;
00341 };
00342 
00343 #endif
Generated on Wed Nov 17 15:38:27 2010 for StatGen Software by  doxygen 1.6.3