libStatGen Software
1
|
00001 /* 00002 * Copyright (C) 2010-2011 Regents of the University of Michigan 00003 * 00004 * This program is free software: you can redistribute it and/or modify 00005 * it under the terms of the GNU General Public License as published by 00006 * the Free Software Foundation, either version 3 of the License, or 00007 * (at your option) any later version. 00008 * 00009 * This program is distributed in the hope that it will be useful, 00010 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 * GNU General Public License for more details. 00013 * 00014 * You should have received a copy of the GNU General Public License 00015 * along with this program. If not, see <http://www.gnu.org/licenses/>. 00016 */ 00017 00018 #ifndef __SAM_RECORD_H__ 00019 #define __SAM_RECORD_H__ 00020 00021 #include <stdint.h> 00022 00023 #include "GenomeSequence.h" 00024 #include "SamStatus.h" 00025 #include "LongHash.h" 00026 #include "MathVector.h" 00027 #include "StringArray.h" 00028 #include "IntArray.h" 00029 #include "SamFileHeader.h" 00030 #include "CigarRoller.h" 00031 00032 /// Structure of a BAM record. 00033 struct bamRecordStruct 00034 { 00035 public: 00036 int32_t myBlockSize; 00037 int32_t myReferenceID; 00038 int32_t myPosition; 00039 uint32_t myReadNameLength : 8, myMapQuality : 8, myBin : 16; 00040 uint32_t myCigarLength : 16, myFlag : 16; 00041 int32_t myReadLength; 00042 int32_t myMateReferenceID; 00043 int32_t myMatePosition; 00044 int32_t myInsertSize; // Outer fragment length 00045 char myData[1]; 00046 }; 00047 00048 00049 /// Class providing an easy to use interface to get/set/operate on the 00050 /// fields in a SAM/BAM record. 00051 class SamRecord 00052 { 00053 public: 00054 /// Enum containing the settings on how to translate the sequence if a 00055 /// reference is available. If no reference is available, no translation 00056 /// is done. 00057 enum SequenceTranslation { 00058 NONE, ///< Leave the sequence as is. 00059 EQUAL, ///< Translate bases that match the reference to '=' 00060 BASES, ///< Translate '=' to the actual base. 00061 }; 00062 00063 /// Default Constructor. 00064 SamRecord(); 00065 00066 /// Constructor that sets the error handling type. 00067 /// \param errorHandlingType how to handle errors. 00068 SamRecord(ErrorHandler::HandlingType errorHandlingType); 00069 00070 /// Destructor 00071 ~SamRecord(); 00072 00073 /// Reset the fields of the record to a default value. 00074 /// This is not necessary when you are reading a SAM/BAM file, 00075 /// but if you are setting fields, it is a good idea to clean 00076 /// out a record before reusing it. Clearing it allows you to 00077 /// not have to set any empty fields. 00078 void resetRecord(); 00079 00080 /// Returns whether or not the record is valid, setting the status to 00081 /// indicate success or failure. 00082 /// \param header SAM Header associated with the record. Used to perform 00083 /// some validation against the header. 00084 /// \return true if the record is valid, false if not. 00085 bool isValid(SamFileHeader& header); 00086 00087 /// Set the reference to the specified genome sequence object. 00088 /// \param reference pointer to the GenomeSequence object. 00089 void setReference(GenomeSequence* reference); 00090 00091 /// Set the type of sequence translation to use when getting 00092 /// the sequence. The default type (if this method is never called) is 00093 /// NONE (the sequence is left as-is). Can be over-ridden by using 00094 /// the accessors that take a SequenceTranslation parameter. 00095 /// \param translation type of sequence translation to use. 00096 void setSequenceTranslation(SequenceTranslation translation); 00097 00098 /////////////////////// 00099 /// @name Set Alignment Data 00100 /// Set methods for record fields. All of the "set" methods set the 00101 /// status to indicate success or the failure reason. 00102 //@{ 00103 00104 /// Set QNAME to the passed in name. 00105 /// \param readName the readname to set the QNAME to. 00106 /// \return true if successfully set, false if not. 00107 bool setReadName(const char* readName); 00108 00109 /// Set the bitwise FLAG to the specified value. 00110 /// \param flag integer flag to use. 00111 /// \return true if successfully set, false if not. 00112 bool setFlag(uint16_t flag); 00113 00114 /// Set the reference sequence name (RNAME) to the specified name, using 00115 /// the header to determine the reference id. 00116 /// \param header SAM/BAM header to use to determine the reference id. 00117 /// \param referenceName reference name to use. 00118 /// \return true if successfully set, false if not 00119 bool setReferenceName(SamFileHeader& header, 00120 const char* referenceName); 00121 00122 /// Set the leftmost position (POS) using the specified 1-based (SAM format) 00123 /// value. 00124 /// Internal processing handles the switching between SAM/BAM formats 00125 /// when read/written. 00126 /// \param position 1-based start position 00127 /// \return true if successfully set, false if not. 00128 bool set1BasedPosition(int32_t position); 00129 00130 /// Set the leftmost position using the specified 0-based (BAM format) 00131 /// value. 00132 /// Internal processing handles the switching between SAM/BAM formats 00133 /// when read/written. 00134 /// \param position 0-based start position 00135 /// \return true if successfully set, false if not. 00136 bool set0BasedPosition(int32_t position); 00137 00138 /// Set the mapping quality (MAPQ). 00139 /// \param mapQuality map quality to set in the record. 00140 /// \return true if successfully set, false if not. 00141 bool setMapQuality(uint8_t mapQuality); 00142 00143 /// Set the CIGAR to the specified SAM formatted cigar string. 00144 /// Internal processing handles the switching between SAM/BAM formats 00145 /// when read/written. 00146 /// \param cigar string containing the SAM formatted cigar. 00147 /// \return true if successfully set, false if not. 00148 bool setCigar(const char* cigar); 00149 00150 /// Set the CIGAR to the specified Cigar object. 00151 /// Internal processing handles the switching between SAM/BAM formats 00152 /// when read/written. 00153 /// \param cigar object to set this record's cigar to have. 00154 /// \return true if successfully set, false if not. 00155 bool setCigar(const Cigar& cigar); 00156 00157 00158 /// Set the mate/next fragment's reference sequence name (RNEXT) to the 00159 /// specified name, using the header to determine the mate reference id. 00160 /// \param header SAM/BAM header to use to determine the mate reference id. 00161 /// \param referenceName mate reference name to use. 00162 /// \return true if successfully set, false if not 00163 bool setMateReferenceName(SamFileHeader& header, 00164 const char* mateReferenceName); 00165 00166 /// Set the mate/next fragment's leftmost position (PNEXT) using the 00167 /// specified 1-based (SAM format) value. 00168 /// Internal processing handles the switching between SAM/BAM formats 00169 /// when read/written. 00170 /// \param position 1-based start position 00171 /// \return true if successfully set, false if not. 00172 bool set1BasedMatePosition(int32_t matePosition); 00173 00174 /// Set the mate/next fragment's leftmost position using the specified 00175 /// 0-based (BAM format) value. 00176 /// Internal processing handles the switching between SAM/BAM formats 00177 /// when read/written. 00178 /// \param position 0-based start position 00179 /// \return true if successfully set, false if not. 00180 bool set0BasedMatePosition(int32_t matePosition); 00181 00182 /// Sets the inferred insert size (ISIZE)/observed template length (TLEN). 00183 /// \param insertSize inferred insert size/observed template length. 00184 /// \return true if successfully set, false if not. 00185 bool setInsertSize(int32_t insertSize); 00186 00187 /// Sets the sequence (SEQ) to the specified SAM formatted sequence string. 00188 /// Internal processing handles switching between SAM/BAM formats when 00189 /// read/written. 00190 /// \param seq SAM sequence string. May contain '='. 00191 /// \return true if successfully set, false if not. 00192 bool setSequence(const char* seq); 00193 00194 /// Sets the quality (QUAL) to the specified SAM formatted quality string. 00195 /// Internal processing handles switching between SAM/BAM formats when 00196 /// read/written. 00197 /// \param quality SAM quality string. 00198 /// \return true if successfully set, false if not. 00199 bool setQuality(const char* quality); 00200 00201 /// Shift the indels (if any) to the left by updating the CIGAR. 00202 /// \return true if the cigar was shifted, false if not. 00203 bool shiftIndelsLeft(); 00204 00205 /// Sets the SamRecord to contain the information in the BAM formatted 00206 /// fromBuffer. 00207 /// \param fromBuffer buffer to read the BAM record from. 00208 /// \param fromBufferSize size of the buffer containing the BAM record. 00209 /// \param header BAM header for the record. 00210 /// \return status of reading the BAM record from the buffer. 00211 SamStatus::Status setBuffer(const char* fromBuffer, uint32_t fromBufferSize, 00212 SamFileHeader& header); 00213 00214 /// Read the BAM record from a file. 00215 /// \param filePtr file to read the buffer from. 00216 /// \param header BAM header for the record. 00217 /// \return status of the reading the BAM record from the file. 00218 SamStatus::Status setBufferFromFile(IFILE filePtr, SamFileHeader& header); 00219 00220 //@} 00221 00222 /////////////////////// 00223 /// @name Set Tag Data 00224 /// Set methods for tags. 00225 //@{ 00226 00227 /// Add the specified integer tag to the record. Internal processing 00228 /// handles switching between SAM/BAM formats when read/written and 00229 /// determining the type for BAM format. If the tag is already there 00230 /// this code will replace it if the specified value is different. 00231 /// \param tag two character tag to be added to the SAM/BAM record. 00232 /// \param value value for the specified tag. 00233 /// \return true if the tag was successfully added, false otherwise. 00234 bool addIntTag(const char* tag, int32_t value); 00235 00236 /// Add the specified tag,vtype,value to the record. Vtype can be SAM/BAM 00237 /// format. Internal processing handles switching between SAM/BAM formats 00238 /// when read/written. If the tag is already there this code will replace 00239 /// it if the specified value is different. 00240 /// \param tag two character tag to be added to the SAM/BAM record. 00241 /// \param vtype vtype of the specified value - either SAM/BAM vtypes. 00242 /// \param value value as a string for the specified tag. 00243 /// \return true if the tag was successfully added, false otherwise. 00244 bool addTag(const char* tag, char vtype, const char* value); 00245 00246 /// Clear the tags in this record. 00247 /// Does not set SamStatus. 00248 void clearTags(); 00249 00250 /// Remove a tag. 00251 /// \param tag tag to remove. 00252 /// \param type of the tag to be removed. 00253 /// \return true if the tag no longer exists in the record, false if it could not be removed (Returns true if the tag was not found in the record). 00254 bool rmTag(const char* tag, char type); 00255 00256 /// Remove tags. 00257 /// The delimiter between the tags is ',' or ';'. ',' was added since 00258 /// the original delimiter, ';', requires the string to be quoted on the 00259 /// command-line. 00260 /// \param tags tags to remove, formatted as Tag:Type,Tag:Type,Tag:Type... 00261 /// \return true if all tags no longer exist in the record, false if any could not be removed 00262 /// (Returns true if the tags were not found in the record). 00263 /// SamStatus is set to INVALID if the tags are incorrectly formatted. 00264 bool rmTags(const char* tags); 00265 00266 //@} 00267 00268 /////////////////////// 00269 /// @name Get Alignment Data 00270 /// Get methods for record fields. All of the "get" methods set the 00271 /// status to indicate success or the failure reason. 00272 //@{ 00273 00274 /// Get a const pointer to the buffer that contains the BAM representation 00275 /// of the record. 00276 /// \return const pointer to the buffer that contains the BAM representation 00277 /// of the record. 00278 const void* getRecordBuffer(); 00279 00280 /// Get a const pointer to the buffer that contains the BAM representation 00281 /// of the record using the specified translation on the sequence. 00282 /// \param translation type of sequence translation to use. 00283 /// \return const pointer to the buffer that contains the BAM representation 00284 /// of the record. 00285 const void* getRecordBuffer(SequenceTranslation translation); 00286 00287 /// Write the record as a BAM into the specified already opened file. 00288 /// \param filePtr file to write the BAM record into. 00289 /// \return status of the write. 00290 SamStatus::Status writeRecordBuffer(IFILE filePtr); 00291 00292 /// Write the record as a BAM into the specified already opened file using 00293 /// the specified translation on the sequence. 00294 /// \param filePtr file to write the BAM record into. 00295 /// \param translation type of sequence translation to use. 00296 /// \return status of the write. 00297 SamStatus::Status writeRecordBuffer(IFILE filePtr, 00298 SequenceTranslation translation); 00299 00300 /// Get the block size of the record (BAM format). 00301 /// \return BAM block size of the record. 00302 int32_t getBlockSize(); 00303 00304 /// Get the reference sequence name (RNAME) of the record. 00305 /// \return reference sequence name 00306 const char* getReferenceName(); 00307 00308 /// Get the reference sequence id of the record (BAM format rid). 00309 /// \return reference sequence id 00310 int32_t getReferenceID(); 00311 00312 /// Get the 1-based(SAM) leftmost position (POS) of the record. 00313 /// \return 1-based leftmost position. 00314 int32_t get1BasedPosition(); 00315 00316 /// Get the 0-based(BAM) leftmost position of the record. 00317 /// \return 0-based leftmost position. 00318 int32_t get0BasedPosition(); 00319 00320 /// Get the length of the readname (QNAME) including the null. 00321 /// \return length of the read name (including null). 00322 uint8_t getReadNameLength(); 00323 00324 /// Get the mapping quality (MAPQ) of the record. 00325 /// \return map quality. 00326 uint8_t getMapQuality(); 00327 00328 /// Get the BAM bin for the record. 00329 /// \return BAM bin 00330 uint16_t getBin(); 00331 00332 /// Get the length of the BAM formatted CIGAR. 00333 /// \return length of BAM formatted cigar. 00334 uint16_t getCigarLength(); 00335 00336 /// Get the flag (FLAG). 00337 /// \return flag. 00338 uint16_t getFlag(); 00339 00340 /// Get the length of the read. 00341 /// \return read length. 00342 int32_t getReadLength(); 00343 00344 /// Get the mate/next fragment's reference sequence name (RNEXT). If it 00345 /// is equal to the reference name, it still returns the reference name. 00346 /// \return reference sequence name 00347 const char* getMateReferenceName(); 00348 00349 /// Get the mate/next fragment's reference sequence name (RNEXT), 00350 /// returning "=" if it is the same as the reference name, unless 00351 /// they are both "*" in which case "*" is returned. 00352 /// \return reference sequence name or '=' 00353 const char* getMateReferenceNameOrEqual(); 00354 00355 /// Get the mate reference id of the record 00356 /// (BAM format: mate_rid/next_refID). 00357 /// \return reference id 00358 int32_t getMateReferenceID(); 00359 00360 /// Get the 1-based(SAM) leftmost mate/next fragment's position (PNEXT). 00361 /// \return 1-based leftmost position. 00362 int32_t get1BasedMatePosition(); 00363 00364 /// Get the 0-based(BAM) leftmost mate/next fragment's position. 00365 /// \return 0-based leftmost position. 00366 int32_t get0BasedMatePosition(); 00367 00368 /// Get the inferred insert size of the read pair (ISIZE) or 00369 /// observed template length (TLEN). 00370 /// \return inferred insert size or observed template length. 00371 int32_t getInsertSize(); 00372 00373 /// Returns the 0-based inclusive rightmost position of the 00374 /// clipped sequence. 00375 /// \return 0-based inclusive rightmost position 00376 int32_t get0BasedAlignmentEnd(); 00377 00378 /// Returns the 1-based inclusive rightmost position of the 00379 /// clipped sequence. 00380 /// \return 1-based inclusive rightmost position 00381 int32_t get1BasedAlignmentEnd(); 00382 00383 /// Returns the length of the clipped sequence, returning 0 if the cigar 00384 /// is '*'. 00385 /// \return length of the clipped sequence. 00386 int32_t getAlignmentLength(); 00387 00388 /// Returns the 0-based inclusive left-most position adjusted for 00389 /// clipped bases. 00390 /// \return 0-based inclusive leftmost position including clips. 00391 int32_t get0BasedUnclippedStart(); 00392 00393 /// Returns the 1-based inclusive left-most position adjusted for 00394 /// clipped bases. 00395 /// \return 1-based inclusive leftmost position including clips. 00396 int32_t get1BasedUnclippedStart(); 00397 00398 /// Returns the 0-based inclusive right-most position adjusted for 00399 /// clipped bases. 00400 /// \return 0-based inclusive rightmost position including clips. 00401 int32_t get0BasedUnclippedEnd(); 00402 00403 /// Returns the 1-based inclusive right-most position adjusted for 00404 /// clipped bases. 00405 /// \return 1-based inclusive rightmost position including clips. 00406 int32_t get1BasedUnclippedEnd(); 00407 00408 /// Returns the SAM formatted Read Name (QNAME). 00409 /// \return read name. 00410 const char* getReadName(); 00411 00412 /// Returns the SAM formatted CIGAR string. 00413 /// \return cigar string. 00414 const char* getCigar(); 00415 00416 /// Returns the SAM formatted sequence string (SEQ), translating the base as 00417 /// specified by setSequenceTranslation. 00418 /// \return sequence string. 00419 const char* getSequence(); 00420 00421 /// Returns the SAM formatted sequence string (SEQ) performing the specified 00422 /// sequence translation. 00423 /// \param translation type of sequence translation to use. 00424 /// \return sequence string. 00425 const char* getSequence(SequenceTranslation translation); 00426 00427 /// Returns the SAM formatted quality string (QUAL). 00428 /// \return quality string. 00429 const char* getQuality(); 00430 00431 /// Get the sequence base at the specified index into this sequence 0 to 00432 /// readLength - 1, translating the base as specified by 00433 /// setSequenceTranslation. Throws an exception if index is out of range. 00434 /// \param index index into the sequence string (0 to readLength-1). 00435 /// \return the sequence base at the specified index into the sequence. 00436 char getSequence(int index); 00437 00438 /// Get the sequence base at the specified index into this sequence 0 to 00439 /// readLength - 1 performing the specified sequence translation. 00440 /// Throws an exception if index is out of range. 00441 /// \param index index into the sequence string (0 to readLength-1). 00442 /// \param translation type of sequence translation to use. 00443 /// \return the sequence base at the specified index into the sequence. 00444 char getSequence(int index, SequenceTranslation translation); 00445 00446 /// Get the quality character at the specified index into the quality 0 to 00447 /// readLength - 1. Throws an exception if index is out of range. 00448 /// \param index index into the quality string (0 to readLength-1). 00449 /// \return the quality character at the specified index into the quality. 00450 char getQuality(int index); 00451 00452 /// Returns a pointer to the Cigar object associated with this record. 00453 /// The object is essentially read-only, only allowing modifications 00454 /// due to lazy evaluations. 00455 /// \return pointer to the Cigar object. 00456 Cigar* getCigarInfo(); 00457 00458 /// Return the number of bases in this read that overlap the passed in 00459 /// region. Matches & mismatches between the read and the reference 00460 /// are counted as overlaps, but insertions, deletions, skips, clips, and 00461 /// pads are not counted. 00462 /// \param start inclusive 0-based start position (reference position) of 00463 /// the region to check for overlaps in. 00464 /// (-1 indicates to start at the beginning of the reference.) 00465 /// \param end exclusive 0-based end position (reference position) of the 00466 /// region to check for overlaps in. 00467 /// (-1 indicates to go to the end of the reference.) 00468 /// \return number of overlapping bases 00469 uint32_t getNumOverlaps(int32_t start, int32_t end); 00470 00471 /// Returns the values of all fields except the tags. 00472 /// \param recStruct structure containing the contents of all 00473 /// non-variable length fields. 00474 /// \param readName read name from the record (return param) 00475 /// \param cigar cigar string from the record (return param) 00476 /// \param sequence sequence string from the record (return param) 00477 /// \param quality quality string from the record (return param) 00478 /// \return true if all fields were successfully set, false otherwise. 00479 bool getFields(bamRecordStruct& recStruct, String& readName, 00480 String& cigar, String& sequence, String& quality); 00481 00482 /// Returns the values of all fields except the tags using the specified 00483 /// sequence translation. 00484 /// \param recStruct structure containing the contents of all 00485 /// non-variable length fields. 00486 /// \param readName read name from the record (return param) 00487 /// \param cigar cigar string from the record (return param) 00488 /// \param sequence sequence string from the record (return param) 00489 /// \param quality quality string from the record (return param) 00490 /// \param translation type of sequence translation to use. 00491 /// \return true if all fields were successfully set, false otherwise. 00492 bool getFields(bamRecordStruct& recStruct, String& readName, 00493 String& cigar, String& sequence, String& quality, 00494 SequenceTranslation translation); 00495 00496 /// Returns a pointer to the genome sequence object associated with this 00497 /// record if it was set (NULL if it was not set). 00498 /// \return pointer to the GenomeSequence object or NULL if there isn't one. 00499 GenomeSequence* getReference(); 00500 00501 //@} 00502 00503 /////////////////////// 00504 /// @name Get Tag Methods 00505 /// Get methods for obtaining information on tags. 00506 //@{ 00507 00508 /// Returns the length of the BAM formatted tags. 00509 /// \return length of the BAM formatted tags. 00510 uint32_t getTagLength(); 00511 00512 /// Get the next tag from the record. 00513 /// Sets the Status to SUCCESS when a tag is successfully returned or 00514 /// when there are no more tags. Otherwise the status is set to describe 00515 /// why it failed (parsing, etc). 00516 /// \param tag set to the tag when a tag is read. 00517 /// \param vtype set to the vtype when a tag is read. 00518 /// \param value pointer to the value of the tag (will need to cast 00519 /// to int, float, char, or string based on vtype). 00520 /// \return true if a tag was read, false if there are no more tags. 00521 bool getNextSamTag(char* tag, char& vtype, void** value); 00522 00523 /// Reset the tag iterator to the beginning of the tags. 00524 void resetTagIter(); 00525 00526 /// Returns whether or not the specified vtype is an integer type. 00527 /// Does not set SamStatus. 00528 /// \param vtype value type to check. 00529 /// \return true if the passed in vtype is an integer ('c', 'C', 's', 00530 /// 'S', 'i', 'I'), false otherwise. 00531 static bool isIntegerType(char vtype); 00532 00533 /// Returns whether or not the specified vtype is a float type. 00534 /// Does not set SamStatus. 00535 /// \param vtype value type to check. 00536 /// \return true if the passed in vtype is a float ('f'), false otherwise. 00537 static bool isFloatType(char vtype); 00538 00539 /// Returns whether or not the specified vtype is a char type. 00540 /// Does not set SamStatus. 00541 /// \param vtype value type to check. 00542 /// \return true if the passed in vtype is a char ('A'), false otherwise. 00543 static bool isCharType(char vtype); 00544 00545 /// Returns whether or not the specified vtype is a string type. 00546 /// Does not set SamStatus. 00547 /// \param vtype value type to check. 00548 /// \return true if the passed in vtype is a string ('Z'/'B'), false othwerise. 00549 static bool isStringType(char vtype); 00550 00551 /// Get the string representation of the tags from the record, formatted 00552 /// as TAG:TYPE:VALUE<delim>TAG:TYPE:VALUE... 00553 /// Sets the Status to SUCCESS when the tags are successfully returned or 00554 /// the tags were not found. If a different error occured, the status is 00555 /// set appropriately. 00556 /// The delimiter between the tags to retrieve is ',' or ';'. ',' was added 00557 /// since the original delimiter, ';', requires the string to be quoted on 00558 /// the command-line. 00559 /// \param tags the tags to retrieve, formatted as TAG:TYPE,TAG:TYPE... 00560 /// \param returnString the String to set (this method first clears returnString) 00561 /// to TAG:TYPE:VALUE<delim>TAG:TYPE:VALUE... 00562 /// \param delim delimiter to use to separate two tags, default is a tab. 00563 /// \return true if there were not any errors even if no tags were found. 00564 bool getTagsString(const char* tags, String& returnString, char delim = '\t'); 00565 00566 /// Get the string value for the specified tag. 00567 /// \param tag tag to retrieve 00568 /// \param pointer to the tag's string value if found, NULL if not found. 00569 const String* getStringTag(const char * tag); 00570 00571 /// Get the integer value for the specified tag, DEPRECATED, use one that returns a bool (success/failure). 00572 /// \param tag tag to retrieve 00573 /// \retun pointer to the tag's integer value if found, NULL if not found. 00574 int* getIntegerTag(const char * tag); 00575 00576 /// Get the integer value for the specified tag. 00577 /// \param tag tag to retrieve 00578 /// \param tagVal return parameter with integer value for the tag 00579 /// \retun bool true if Integer tag was found and tagVal was set, 00580 /// false if not. 00581 bool getIntegerTag(const char * tag, int& tagVal); 00582 00583 /// Get the float value for the specified tag. 00584 /// \param tag tag to retrieve 00585 /// \param tagVal return parameter with integer value for the tag 00586 /// \return bool true if Float tag was found and tagVal was set, 00587 /// false if not. 00588 bool getFloatTag(const char * tag, float& tagVal); 00589 00590 /// Get the string value for the specified tag. 00591 const String & getString(const char * tag); 00592 00593 /// Get the integer value for the specified tag, DEPRECATED, use getIntegerTag that returns a bool. 00594 int & getInteger(const char * tag); 00595 00596 /// Check if the specified tag contains a string. 00597 /// Does not set SamStatus. 00598 /// \param tag SAM tag to check contents of. 00599 /// \return true if the value associated with the tag is a string. 00600 bool checkString(const char * tag) 00601 { return(checkTag(tag, 'Z') || checkTag(tag, 'B')); } 00602 00603 /// Check if the specified tag contains an integer. 00604 /// Does not set SamStatus. 00605 /// \param tag SAM tag to check contents of. 00606 /// \return true if the value associated with the tag is a string. 00607 bool checkInteger(const char * tag) { return checkTag(tag, 'i'); } 00608 00609 /// Check if the specified tag contains a string. 00610 /// Does not set SamStatus. 00611 /// \param tag SAM tag to check contents of. 00612 /// \return true if the value associated with the tag is a string. 00613 bool checkFloat(const char * tag) { return checkTag(tag, 'f'); } 00614 00615 /// Check if the specified tag contains a value of the specified vtype. 00616 /// Does not set SamStatus. 00617 /// \param tag SAM tag to check contents of. 00618 /// \param type value type to check if the SAM tag matches. 00619 /// \return true if the value associated with the tag is a string. 00620 bool checkTag(const char * tag, char type); 00621 //@} 00622 00623 /// Returns the status associated with the last method that sets the status. 00624 /// \return SamStatus of the last command that sets status. 00625 const SamStatus& getStatus(); 00626 00627 00628 private: 00629 static int MAKEKEY(char ch1, char ch2, char type) 00630 { return (getKeyType(type) << 16) + (ch2 << 8) + ch1; } 00631 00632 static char getKeyType(char type) 00633 { 00634 switch(type) 00635 { 00636 // For any char/integer type, return 'i' 00637 case 'A' : 00638 case 'c' : 00639 case 'C' : 00640 case 's' : 00641 case 'S' : 00642 case 'i' : 00643 case 'I' : 00644 return('i'); 00645 break; 00646 default: 00647 // For all other types, return the actual type. 00648 return(type); 00649 }; 00650 } 00651 00652 static inline int getNumericTagTypeSize(char type) 00653 { 00654 switch(type) 00655 { 00656 case 'A': 00657 case 'c': 00658 case 'C': 00659 return(1); 00660 break; 00661 case 's': 00662 case 'S': 00663 return(2); 00664 break; 00665 case 'i': 00666 case 'I': 00667 case 'f': 00668 return(4); 00669 default: 00670 // Not a numeric type. 00671 return(0); 00672 } 00673 } 00674 00675 // Allocate space for the record - does a realloc. 00676 // The passed in size is the size of the entire record including the 00677 // block size field. 00678 // Adds any errors to myStatus. 00679 bool allocateRecordStructure(int size); 00680 00681 void* getStringPtr(int offset); 00682 void* getIntegerPtr(int offset, char& vtype); 00683 void* getFloatPtr(int offset); 00684 00685 // Fixes the buffer to match the variable length fields. 00686 // Adds any errors to myStatus. 00687 bool fixBuffer(SequenceTranslation translation); 00688 00689 // Sets the Sequence and Quality strings from the buffer. 00690 // They are done together in one method because they require the same 00691 // loop, so might as well be done at the same time. 00692 // Adds any errors to myStatus. 00693 void setSequenceAndQualityFromBuffer(); 00694 00695 // Parse the cigar to calculate the alignment/unclipped ends and convert 00696 // to SAM/BAM format. 00697 // Adds any errors to myStatus. 00698 bool parseCigar(); 00699 // Parse the cigar string to calculate the cigar length and alignment end 00700 // and convert to SAM format. 00701 // Adds any errors to myStatus. 00702 bool parseCigarBinary(); 00703 // Parse the cigar string to calculate the cigar length and alignment end 00704 // and convert to BAM format. 00705 // Adds any errors to myStatus. 00706 bool parseCigarString(); 00707 00708 // Set the tags from the buffer. 00709 // Adds any errors to myStatus. 00710 bool setTagsFromBuffer(); 00711 00712 // Set the tags in the buffer. 00713 // Adds any errors to myStatus. 00714 bool setTagsInBuffer(); 00715 00716 void setVariablesForNewBuffer(SamFileHeader& header); 00717 00718 void getTypeFromKey(int key, char& type) const; 00719 void getTag(int key, char* tag) const; 00720 00721 String & getString(int offset); 00722 int & getInteger(int offset); 00723 const char & getIntegerType(int offset) const; 00724 float & getFloat(int offset); 00725 00726 // Append the string representation of the value at the specified index 00727 // of the int array. 00728 inline void appendIntArrayValue(int index, String& strVal) const 00729 { 00730 appendIntArrayValue(intType[index], integers[index], strVal); 00731 } 00732 00733 void appendIntArrayValue(char type, int value, String& strVal) const; 00734 00735 int getBtagBufferSize(String& tagStr); 00736 int setBtagBuffer(String& tagStr, char* extraPtr); 00737 int getStringFromBtagBuffer(unsigned char* buffer, String& tagStr); 00738 00739 static const int DEFAULT_BLOCK_SIZE = 40; 00740 static const int DEFAULT_BIN = 4680; 00741 static const int DEFAULT_READ_NAME_LENGTH = 8; 00742 static const char* DEFAULT_READ_NAME; 00743 static const char* FIELD_ABSENT_STRING; 00744 00745 bamRecordStruct * myRecordPtr; 00746 int allocatedSize; 00747 00748 // Pointer to a temporary cigar buffer that can be used during string 00749 // parsing before it is ready to be copied into the actual record. 00750 uint32_t* myCigarTempBuffer; 00751 00752 // Size of the currently allocated temporary cigar buffer. 00753 int myCigarTempBufferAllocatedSize; 00754 00755 // Length of the cigar currently contained in the temporary buffer. 00756 int myCigarTempBufferLength; 00757 00758 // Track if the buffer is in sync with the Strings/Tags. 00759 // Set to false if any of the variable length fields are modified. 00760 // Set to true when the buffer is updated to match the variable length 00761 // fields. 00762 bool myIsBufferSynced; 00763 00764 // Track if the tags need to be set from the buffer. 00765 bool myNeedToSetTagsFromBuffer; 00766 00767 // Trag if the tags need to be set in the buffer. 00768 // Allows you to set just the tags if they are the only thing that changed 00769 // in the buffer. 00770 bool myNeedToSetTagsInBuffer; 00771 00772 int myTagBufferSize; 00773 int myLastTagIndex; 00774 00775 String myReadName; 00776 String myReferenceName; 00777 String myMateReferenceName; 00778 String myCigar; 00779 String mySequence; 00780 String myQuality; 00781 00782 std::string mySeqWithEq; 00783 std::string mySeqWithoutEq; 00784 00785 // The length of the alignment. 00786 int32_t myAlignmentLength; 00787 // Unclipped alignment positions. 00788 int32_t myUnclippedStartOffset; 00789 int32_t myUnclippedEndOffset; 00790 00791 CigarRoller myCigarRoller; 00792 00793 LongHash<int> extras; 00794 // Note: not all values in strings, integers, and floats are always 00795 // in extras. They will not be if the tags were removed. Removed 00796 // tags are removed from extras, but not from strings, integers, or floats 00797 // since if one was removed from these arrays, all other entries would 00798 // need their indices updated in extras. 00799 StringArray strings; 00800 IntArray integers; 00801 std::vector<char> intType; // contains the type of int at same position in integers. 00802 std::vector<float> floats; 00803 00804 00805 // Track whether or not the buffer values are correct for 00806 // each setting. 00807 bool myIsReadNameBufferValid; 00808 bool myIsCigarBufferValid; 00809 bool myIsSequenceBufferValid; 00810 bool myIsQualityBufferValid; 00811 bool myIsTagsBufferValid; 00812 bool myIsBinValid; 00813 00814 unsigned char* myPackedSequence; 00815 unsigned char* myPackedQuality; 00816 00817 00818 SamStatus myStatus; 00819 00820 // The current translation of the sequence as it occurs in the buffer. 00821 // Only applicable if myIsSequenceBufferValid == true. 00822 SequenceTranslation myBufferSequenceTranslation; 00823 00824 00825 // Track the Reference. 00826 GenomeSequence* myRefPtr; 00827 00828 // The type of translation to do when getting a sequence. 00829 SequenceTranslation mySequenceTranslation; 00830 00831 String NOT_FOUND_TAG_STRING; 00832 int NOT_FOUND_TAG_INT; 00833 00834 static const int myMaxWarns = 5; 00835 static int myNumWarns; 00836 }; 00837 00838 #endif