00001 /* 00002 * Copyright (C) 2010-2011 Regents of the University of Michigan 00003 * 00004 * This program is free software: you can redistribute it and/or modify 00005 * it under the terms of the GNU General Public License as published by 00006 * the Free Software Foundation, either version 3 of the License, or 00007 * (at your option) any later version. 00008 * 00009 * This program is distributed in the hope that it will be useful, 00010 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 * GNU General Public License for more details. 00013 * 00014 * You should have received a copy of the GNU General Public License 00015 * along with this program. If not, see <http://www.gnu.org/licenses/>. 00016 */ 00017 00018 #ifndef __SAM_RECORD_H__ 00019 #define __SAM_RECORD_H__ 00020 00021 #include <stdint.h> 00022 00023 #include "GenomeSequence.h" 00024 #include "SamStatus.h" 00025 #include "LongHash.h" 00026 #include "MathVector.h" 00027 #include "StringArray.h" 00028 #include "IntArray.h" 00029 #include "SamFileHeader.h" 00030 #include "CigarRoller.h" 00031 00032 /// Structure of a BAM record. 00033 struct bamRecordStruct 00034 { 00035 public: 00036 int32_t myBlockSize; 00037 int32_t myReferenceID; 00038 int32_t myPosition; 00039 uint32_t myReadNameLength : 8, myMapQuality : 8, myBin : 16; 00040 uint32_t myCigarLength : 16, myFlag : 16; 00041 int32_t myReadLength; 00042 int32_t myMateReferenceID; 00043 int32_t myMatePosition; 00044 int32_t myInsertSize; // Outer fragment length 00045 char myData[1]; 00046 }; 00047 00048 00049 /// Class providing an easy to use interface to get/set/operate on the 00050 /// fields in a SAM/BAM record. 00051 class SamRecord 00052 { 00053 public: 00054 /// Enum containing the settings on how to translate the sequence if a 00055 /// reference is available. If no reference is available, no translation 00056 /// is done. 00057 enum SequenceTranslation { 00058 NONE, ///< Leave the sequence as is. 00059 EQUAL, ///< Translate bases that match the reference to '=' 00060 BASES, ///< Translate '=' to the actual base. 00061 }; 00062 00063 /// Default Constructor. 00064 SamRecord(); 00065 00066 /// Constructor that sets the error handling type. 00067 /// \param errorHandlingType how to handle errors. 00068 SamRecord(ErrorHandler::HandlingType errorHandlingType); 00069 00070 /// Destructor 00071 ~SamRecord(); 00072 00073 /// Reset the fields of the record to a default value. 00074 /// This is not necessary when you are reading a SAM/BAM file, 00075 /// but if you are setting fields, it is a good idea to clean 00076 /// out a record before reusing it. Clearing it allows you to 00077 /// not have to set any empty fields. 00078 void resetRecord(); 00079 00080 /// Returns whether or not the record is valid, setting the status to 00081 /// indicate success or failure. 00082 /// \param header SAM Header associated with the record. Used to perform 00083 /// some validation against the header. 00084 /// \return true if the record is valid, false if not. 00085 bool isValid(SamFileHeader& header); 00086 00087 /// Set the reference to the specified genome sequence object. 00088 /// \param reference pointer to the GenomeSequence object. 00089 void setReference(GenomeSequence* reference); 00090 00091 /// Set the type of sequence translation to use when getting 00092 /// the sequence. The default type (if this method is never called) is 00093 /// NONE (the sequence is left as-is). Can be over-ridden by using 00094 /// the accessors that take a SequenceTranslation parameter. 00095 /// \param translation type of sequence translation to use. 00096 void setSequenceTranslation(SequenceTranslation translation); 00097 00098 /////////////////////// 00099 /// @name Set Alignment Data 00100 /// Set methods for record fields. All of the "set" methods set the 00101 /// status to indicate success or the failure reason. 00102 //@{ 00103 00104 /// Set QNAME to the passed in name. 00105 /// \param readName the readname to set the QNAME to. 00106 /// \return true if successfully set, false if not. 00107 bool setReadName(const char* readName); 00108 00109 /// Set the bitwise FLAG to the specified value. 00110 /// \param flag integer flag to use. 00111 /// \return true if successfully set, false if not. 00112 bool setFlag(uint16_t flag); 00113 00114 /// Set the reference sequence name (RNAME) to the specified name, using 00115 /// the header to determine the reference id. 00116 /// \param header SAM/BAM header to use to determine the reference id. 00117 /// \param referenceName reference name to use. 00118 /// \return true if successfully set, false if not 00119 bool setReferenceName(SamFileHeader& header, 00120 const char* referenceName); 00121 00122 /// Set the leftmost position (POS) using the specified 1-based (SAM format) 00123 /// value. 00124 /// Internal processing handles the switching between SAM/BAM formats 00125 /// when read/written. 00126 /// \param position 1-based start position 00127 /// \return true if successfully set, false if not. 00128 bool set1BasedPosition(int32_t position); 00129 00130 /// Set the leftmost position using the specified 0-based (BAM format) 00131 /// value. 00132 /// Internal processing handles the switching between SAM/BAM formats 00133 /// when read/written. 00134 /// \param position 0-based start position 00135 /// \return true if successfully set, false if not. 00136 bool set0BasedPosition(int32_t position); 00137 00138 /// Set the mapping quality (MAPQ). 00139 /// \param mapQuality map quality to set in the record. 00140 /// \return true if successfully set, false if not. 00141 bool setMapQuality(uint8_t mapQuality); 00142 00143 /// Set the CIGAR to the specified SAM formatted cigar string. 00144 /// Internal processing handles the switching between SAM/BAM formats 00145 /// when read/written. 00146 /// \param cigar string containing the SAM formatted cigar. 00147 /// \return true if successfully set, false if not. 00148 bool setCigar(const char* cigar); 00149 00150 /// Set the CIGAR to the specified Cigar object. 00151 /// Internal processing handles the switching between SAM/BAM formats 00152 /// when read/written. 00153 /// \param cigar object to set this record's cigar to have. 00154 /// \return true if successfully set, false if not. 00155 bool setCigar(const Cigar& cigar); 00156 00157 00158 /// Set the mate/next fragment's reference sequence name (RNEXT) to the 00159 /// specified name, using the header to determine the mate reference id. 00160 /// \param header SAM/BAM header to use to determine the mate reference id. 00161 /// \param referenceName mate reference name to use. 00162 /// \return true if successfully set, false if not 00163 bool setMateReferenceName(SamFileHeader& header, 00164 const char* mateReferenceName); 00165 00166 /// Set the mate/next fragment's leftmost position (PNEXT) using the 00167 /// specified 1-based (SAM format) value. 00168 /// Internal processing handles the switching between SAM/BAM formats 00169 /// when read/written. 00170 /// \param position 1-based start position 00171 /// \return true if successfully set, false if not. 00172 bool set1BasedMatePosition(int32_t matePosition); 00173 00174 /// Set the mate/next fragment's leftmost position using the specified 00175 /// 0-based (BAM format) value. 00176 /// Internal processing handles the switching between SAM/BAM formats 00177 /// when read/written. 00178 /// \param position 0-based start position 00179 /// \return true if successfully set, false if not. 00180 bool set0BasedMatePosition(int32_t matePosition); 00181 00182 /// Sets the inferred insert size (ISIZE)/observed template length (TLEN). 00183 /// \param insertSize inferred insert size/observed template length. 00184 /// \return true if successfully set, false if not. 00185 bool setInsertSize(int32_t insertSize); 00186 00187 /// Sets the sequence (SEQ) to the specified SAM formatted sequence string. 00188 /// Internal processing handles switching between SAM/BAM formats when 00189 /// read/written. 00190 /// \param seq SAM sequence string. May contain '='. 00191 /// \return true if successfully set, false if not. 00192 bool setSequence(const char* seq); 00193 00194 /// Sets the quality (QUAL) to the specified SAM formatted quality string. 00195 /// Internal processing handles switching between SAM/BAM formats when 00196 /// read/written. 00197 /// \param quality SAM quality string. 00198 /// \return true if successfully set, false if not. 00199 bool setQuality(const char* quality); 00200 00201 /// Shift the indels (if any) to the left by updating the CIGAR. 00202 /// \return true if the cigar was shifted, false if not. 00203 bool shiftIndelsLeft(); 00204 00205 /// Sets the SamRecord to contain the information in the BAM formatted 00206 /// fromBuffer. 00207 /// \param fromBuffer buffer to read the BAM record from. 00208 /// \param fromBufferSize size of the buffer containing the BAM record. 00209 /// \param header BAM header for the record. 00210 /// \return status of reading the BAM record from the buffer. 00211 SamStatus::Status setBuffer(const char* fromBuffer, uint32_t fromBufferSize, 00212 SamFileHeader& header); 00213 00214 /// Read the BAM record from a file. 00215 /// \param filePtr file to read the buffer from. 00216 /// \param header BAM header for the record. 00217 /// \return status of the reading the BAM record from the file. 00218 SamStatus::Status setBufferFromFile(IFILE filePtr, SamFileHeader& header); 00219 00220 //@} 00221 00222 /////////////////////// 00223 /// @name Set Tag Data 00224 /// Set methods for tags. 00225 //@{ 00226 00227 /// Add the specified integer tag to the record. Internal processing 00228 /// handles switching between SAM/BAM formats when read/written and 00229 /// determining the type for BAM format. If the tag is already there 00230 /// this code will replace it if the specified value is different. 00231 /// \param tag two character tag to be added to the SAM/BAM record. 00232 /// \param value value for the specified tag. 00233 /// \return true if the tag was successfully added, false otherwise. 00234 bool addIntTag(const char* tag, int32_t value); 00235 00236 /// Add the specified tag,vtype,value to the record. Vtype can be SAM/BAM 00237 /// format. Internal processing handles switching between SAM/BAM formats 00238 /// when read/written. If the tag is already there this code will replace 00239 /// it if the specified value is different. 00240 /// \param tag two character tag to be added to the SAM/BAM record. 00241 /// \param vtype vtype of the specified value - either SAM/BAM vtypes. 00242 /// \param value value as a string for the specified tag. 00243 /// \return true if the tag was successfully added, false otherwise. 00244 bool addTag(const char* tag, char vtype, const char* value); 00245 00246 /// Clear the tags in this record. 00247 /// Does not set SamStatus. 00248 void clearTags(); 00249 00250 /// Remove a tag. 00251 /// \param tag tag to remove. 00252 /// \param type of the tag to be removed. 00253 /// \return true if the tag no longer exists in the record, false if it could not be removed (Returns true if the tag was not found in the record). 00254 bool rmTag(const char* tag, char type); 00255 00256 /// Remove tags. 00257 /// \param tags tags to remove, formatted as Tag:Type;Tag:Type;Tag:Type... 00258 /// \return true if all tags no longer exist in the record, false if any could not be removed 00259 /// (Returns true if the tags were not found in the record). 00260 /// SamStatus is set to INVALID if the tags are incorrectly formatted. 00261 bool rmTags(const char* tags); 00262 00263 //@} 00264 00265 /////////////////////// 00266 /// @name Get Alignment Data 00267 /// Get methods for record fields. All of the "get" methods set the 00268 /// status to indicate success or the failure reason. 00269 //@{ 00270 00271 /// Get a const pointer to the buffer that contains the BAM representation 00272 /// of the record. 00273 /// \return const pointer to the buffer that contains the BAM representation 00274 /// of the record. 00275 const void* getRecordBuffer(); 00276 00277 /// Get a const pointer to the buffer that contains the BAM representation 00278 /// of the record using the specified translation on the sequence. 00279 /// \param translation type of sequence translation to use. 00280 /// \return const pointer to the buffer that contains the BAM representation 00281 /// of the record. 00282 const void* getRecordBuffer(SequenceTranslation translation); 00283 00284 /// Write the record as a BAM into the specified already opened file. 00285 /// \param filePtr file to write the BAM record into. 00286 /// \return status of the write. 00287 SamStatus::Status writeRecordBuffer(IFILE filePtr); 00288 00289 /// Write the record as a BAM into the specified already opened file using 00290 /// the specified translation on the sequence. 00291 /// \param filePtr file to write the BAM record into. 00292 /// \param translation type of sequence translation to use. 00293 /// \return status of the write. 00294 SamStatus::Status writeRecordBuffer(IFILE filePtr, 00295 SequenceTranslation translation); 00296 00297 /// Get the block size of the record (BAM format). 00298 /// \return BAM block size of the record. 00299 int32_t getBlockSize(); 00300 00301 /// Get the reference sequence name (RNAME) of the record. 00302 /// \return reference sequence name 00303 const char* getReferenceName(); 00304 00305 /// Get the reference sequence id of the record (BAM format rid). 00306 /// \return reference sequence id 00307 int32_t getReferenceID(); 00308 00309 /// Get the 1-based(SAM) leftmost position (POS) of the record. 00310 /// \return 1-based leftmost position. 00311 int32_t get1BasedPosition(); 00312 00313 /// Get the 0-based(BAM) leftmost position of the record. 00314 /// \return 0-based leftmost position. 00315 int32_t get0BasedPosition(); 00316 00317 /// Get the length of the readname (QNAME) including the null. 00318 /// \return length of the read name (including null). 00319 uint8_t getReadNameLength(); 00320 00321 /// Get the mapping quality (MAPQ) of the record. 00322 /// \return map quality. 00323 uint8_t getMapQuality(); 00324 00325 /// Get the BAM bin for the record. 00326 /// \return BAM bin 00327 uint16_t getBin(); 00328 00329 /// Get the length of the BAM formatted CIGAR. 00330 /// \return length of BAM formatted cigar. 00331 uint16_t getCigarLength(); 00332 00333 /// Get the flag (FLAG). 00334 /// \return flag. 00335 uint16_t getFlag(); 00336 00337 /// Get the length of the read. 00338 /// \return read length. 00339 int32_t getReadLength(); 00340 00341 /// Get the mate/next fragment's reference sequence name (RNEXT). If it 00342 /// is equal to the reference name, it still returns the reference name. 00343 /// \return reference sequence name 00344 const char* getMateReferenceName(); 00345 00346 /// Get the mate/next fragment's reference sequence name (RNEXT), 00347 /// returning "=" if it is the same as the reference name, unless 00348 /// they are both "*" in which case "*" is returned. 00349 /// \return reference sequence name or '=' 00350 const char* getMateReferenceNameOrEqual(); 00351 00352 /// Get the mate reference id of the record 00353 /// (BAM format: mate_rid/next_refID). 00354 /// \return reference id 00355 int32_t getMateReferenceID(); 00356 00357 /// Get the 1-based(SAM) leftmost mate/next fragment's position (PNEXT). 00358 /// \return 1-based leftmost position. 00359 int32_t get1BasedMatePosition(); 00360 00361 /// Get the 0-based(BAM) leftmost mate/next fragment's position. 00362 /// \return 0-based leftmost position. 00363 int32_t get0BasedMatePosition(); 00364 00365 /// Get the inferred insert size of the read pair (ISIZE) or 00366 /// observed template length (TLEN). 00367 /// \return inferred insert size or observed template length. 00368 int32_t getInsertSize(); 00369 00370 /// Returns the 0-based inclusive rightmost position of the 00371 /// clipped sequence. 00372 /// \return 0-based inclusive rightmost position 00373 int32_t get0BasedAlignmentEnd(); 00374 00375 /// Returns the 1-based inclusive rightmost position of the 00376 /// clipped sequence. 00377 /// \return 1-based inclusive rightmost position 00378 int32_t get1BasedAlignmentEnd(); 00379 00380 /// Returns the length of the clipped sequence, returning 0 if the cigar 00381 /// is '*'. 00382 /// \return length of the clipped sequence. 00383 int32_t getAlignmentLength(); 00384 00385 /// Returns the 0-based inclusive left-most position adjusted for 00386 /// clipped bases. 00387 /// \return 0-based inclusive leftmost position including clips. 00388 int32_t get0BasedUnclippedStart(); 00389 00390 /// Returns the 1-based inclusive left-most position adjusted for 00391 /// clipped bases. 00392 /// \return 1-based inclusive leftmost position including clips. 00393 int32_t get1BasedUnclippedStart(); 00394 00395 /// Returns the 0-based inclusive right-most position adjusted for 00396 /// clipped bases. 00397 /// \return 0-based inclusive rightmost position including clips. 00398 int32_t get0BasedUnclippedEnd(); 00399 00400 /// Returns the 1-based inclusive right-most position adjusted for 00401 /// clipped bases. 00402 /// \return 1-based inclusive rightmost position including clips. 00403 int32_t get1BasedUnclippedEnd(); 00404 00405 /// Returns the SAM formatted Read Name (QNAME). 00406 /// \return read name. 00407 const char* getReadName(); 00408 00409 /// Returns the SAM formatted CIGAR string. 00410 /// \return cigar string. 00411 const char* getCigar(); 00412 00413 /// Returns the SAM formatted sequence string (SEQ), translating the base as 00414 /// specified by setSequenceTranslation. 00415 /// \return sequence string. 00416 const char* getSequence(); 00417 00418 /// Returns the SAM formatted sequence string (SEQ) performing the specified 00419 /// sequence translation. 00420 /// \param translation type of sequence translation to use. 00421 /// \return sequence string. 00422 const char* getSequence(SequenceTranslation translation); 00423 00424 /// Returns the SAM formatted quality string (QUAL). 00425 /// \return quality string. 00426 const char* getQuality(); 00427 00428 /// Get the sequence base at the specified index into this sequence 0 to 00429 /// readLength - 1, translating the base as specified by 00430 /// setSequenceTranslation. Throws an exception if index is out of range. 00431 /// \param index index into the sequence string (0 to readLength-1). 00432 /// \return the sequence base at the specified index into the sequence. 00433 char getSequence(int index); 00434 00435 /// Get the sequence base at the specified index into this sequence 0 to 00436 /// readLength - 1 performing the specified sequence translation. 00437 /// Throws an exception if index is out of range. 00438 /// \param index index into the sequence string (0 to readLength-1). 00439 /// \param translation type of sequence translation to use. 00440 /// \return the sequence base at the specified index into the sequence. 00441 char getSequence(int index, SequenceTranslation translation); 00442 00443 /// Get the quality character at the specified index into the quality 0 to 00444 /// readLength - 1. Throws an exception if index is out of range. 00445 /// \param index index into the quality string (0 to readLength-1). 00446 /// \return the quality character at the specified index into the quality. 00447 char getQuality(int index); 00448 00449 /// Returns a pointer to the Cigar object associated with this record. 00450 /// The object is essentially read-only, only allowing modifications 00451 /// due to lazy evaluations. 00452 /// \return pointer to the Cigar object. 00453 Cigar* getCigarInfo(); 00454 00455 /// Return the number of bases in this read that overlap the passed in 00456 /// region. Matches & mismatches between the read and the reference 00457 /// are counted as overlaps, but insertions, deletions, skips, clips, and 00458 /// pads are not counted. 00459 /// \param start inclusive 0-based start position (reference position) of 00460 /// the region to check for overlaps in. 00461 /// (-1 indicates to start at the beginning of the reference.) 00462 /// \param end exclusive 0-based end position (reference position) of the 00463 /// region to check for overlaps in. 00464 /// (-1 indicates to go to the end of the reference.) 00465 /// \return number of overlapping bases 00466 uint32_t getNumOverlaps(int32_t start, int32_t end); 00467 00468 /// Returns the values of all fields except the tags. 00469 /// \param recStruct structure containing the contents of all 00470 /// non-variable length fields. 00471 /// \param readName read name from the record (return param) 00472 /// \param cigar cigar string from the record (return param) 00473 /// \param sequence sequence string from the record (return param) 00474 /// \param quality quality string from the record (return param) 00475 /// \return true if all fields were successfully set, false otherwise. 00476 bool getFields(bamRecordStruct& recStruct, String& readName, 00477 String& cigar, String& sequence, String& quality); 00478 00479 /// Returns the values of all fields except the tags using the specified 00480 /// sequence translation. 00481 /// \param recStruct structure containing the contents of all 00482 /// non-variable length fields. 00483 /// \param readName read name from the record (return param) 00484 /// \param cigar cigar string from the record (return param) 00485 /// \param sequence sequence string from the record (return param) 00486 /// \param quality quality string from the record (return param) 00487 /// \param translation type of sequence translation to use. 00488 /// \return true if all fields were successfully set, false otherwise. 00489 bool getFields(bamRecordStruct& recStruct, String& readName, 00490 String& cigar, String& sequence, String& quality, 00491 SequenceTranslation translation); 00492 00493 /// Returns a pointer to the genome sequence object associated with this 00494 /// record if it was set (NULL if it was not set). 00495 /// \return pointer to the GenomeSequence object or NULL if there isn't one. 00496 GenomeSequence* getReference(); 00497 00498 //@} 00499 00500 /////////////////////// 00501 /// @name Get Tag Methods 00502 /// Get methods for obtaining information on tags. 00503 //@{ 00504 00505 /// Returns the length of the BAM formatted tags. 00506 /// \return length of the BAM formatted tags. 00507 uint32_t getTagLength(); 00508 00509 /// Get the next tag from the record. 00510 /// Sets the Status to SUCCESS when a tag is successfully returned or 00511 /// when there are no more tags. Otherwise the status is set to describe 00512 /// why it failed (parsing, etc). 00513 /// \param tag set to the tag when a tag is read. 00514 /// \param vtype set to the vtype when a tag is read. 00515 /// \param value pointer to the value of the tag (will need to cast 00516 /// to int, double, char, or string based on vtype). 00517 /// \return true if a tag was read, false if there are no more tags. 00518 bool getNextSamTag(char* tag, char& vtype, void** value); 00519 00520 /// Reset the tag iterator to the beginning of the tags. 00521 void resetTagIter(); 00522 00523 /// Returns whether or not the specified vtype is an integer type. 00524 /// Does not set SamStatus. 00525 /// \param vtype value type to check. 00526 /// \return true if the passed in vtype is an integer ('c', 'C', 's', 00527 /// 'S', 'i', 'I'), false otherwise. 00528 bool isIntegerType(char vtype) const; 00529 00530 /// Returns whether or not the specified vtype is a double type. 00531 /// Does not set SamStatus. 00532 /// \param vtype value type to check. 00533 /// \return true if the passed in vtype is a double ('f'), false otherwise. 00534 bool isDoubleType(char vtype) const; 00535 00536 /// Returns whether or not the specified vtype is a char type. 00537 /// Does not set SamStatus. 00538 /// \param vtype value type to check. 00539 /// \return true if the passed in vtype is a char ('A'), false otherwise. 00540 bool isCharType(char vtype) const; 00541 00542 /// Returns whether or not the specified vtype is a string type. 00543 /// Does not set SamStatus. 00544 /// \param vtype value type to check. 00545 /// \return true if the passed in vtype is a string ('Z'), false othwerise. 00546 bool isStringType(char vtype) const; 00547 00548 /// Get the string representation of the tags from the record, formatted 00549 /// as TAG:TYPE:VALUE<delim>TAG:TYPE:VALUE... 00550 /// Sets the Status to SUCCESS when the tags are successfully returned or 00551 /// the tags were not found. If a different error occured, the status is 00552 /// set appropriately. 00553 /// \param tags the tags to retrieve, formatted as TAG:TYPE;TAG:TYPE... 00554 /// \param returnString the String to set (this method first clears returnString) 00555 /// to TAG:TYPE:VALUE<delim>TAG:TYPE:VALUE... 00556 /// \param delim delimiter to use to separate two tags, default is a tab. 00557 /// \return true if there were not any errors even if no tags were found. 00558 bool getTagsString(const char* tags, String& returnString, char delim = '\t'); 00559 00560 /// Get the string value for the specified tag. 00561 /// \param tag tag to retrieve 00562 /// \param pointer to the tag's string value if found, NULL if not found. 00563 String* getStringTag(const char * tag); 00564 00565 /// Get the integer value for the specified tag. 00566 /// \param tag tag to retrieve 00567 /// \retun pointer to the tag's integer value if found, NULL if not found. 00568 int* getIntegerTag(const char * tag); 00569 00570 /// Get the double value for the specified tag. 00571 /// \param tag tag to retrieve 00572 /// \return pointer to the tag's double value if found, NULL if not found. 00573 double* getDoubleTag(const char * tag); 00574 00575 /// Get the string value for the specified tag. 00576 String & getString(const char * tag); 00577 00578 /// Get the integer value for the specified tag. 00579 int & getInteger(const char * tag); 00580 00581 /// Get the double value for the specified tag. 00582 double & getDouble(const char * tag); 00583 00584 /// Check if the specified tag contains a string. 00585 /// Does not set SamStatus. 00586 /// \param tag SAM tag to check contents of. 00587 /// \return true if the value associated with the tag is a string. 00588 bool checkString(const char * tag) { return checkTag(tag, 'Z'); } 00589 00590 /// Check if the specified tag contains a string. 00591 /// Does not set SamStatus. 00592 /// \param tag SAM tag to check contents of. 00593 /// \return true if the value associated with the tag is a string. 00594 bool checkInteger(const char * tag) { return checkTag(tag, 'i'); } 00595 00596 /// Check if the specified tag contains a string. 00597 /// Does not set SamStatus. 00598 /// \param tag SAM tag to check contents of. 00599 /// \return true if the value associated with the tag is a string. 00600 bool checkDouble(const char * tag) { return checkTag(tag, 'f'); } 00601 00602 /// Check if the specified tag contains a value of the specified vtype. 00603 /// Does not set SamStatus. 00604 /// \param tag SAM tag to check contents of. 00605 /// \param type value type to check if the SAM tag matches. 00606 /// \return true if the value associated with the tag is a string. 00607 bool checkTag(const char * tag, char type); 00608 //@} 00609 00610 /// Returns the status associated with the last method that sets the status. 00611 /// \return SamStatus of the last command that sets status. 00612 const SamStatus& getStatus(); 00613 00614 00615 00616 00617 private: 00618 static int MAKEKEY(char ch1, char ch2, char type) 00619 { return (getKeyType(type) << 16) + (ch2 << 8) + ch1; } 00620 00621 static char getKeyType(char type) 00622 { 00623 switch(type) 00624 { 00625 // For any char/integer type, return 'i' 00626 case 'A' : 00627 case 'c' : 00628 case 'C' : 00629 case 's' : 00630 case 'S' : 00631 case 'i' : 00632 case 'I' : 00633 return('i'); 00634 break; 00635 default: 00636 // For all other types, return the actual type. 00637 return(type); 00638 }; 00639 } 00640 00641 // Allocate space for the record - does a realloc. 00642 // The passed in size is the size of the entire record including the 00643 // block size field. 00644 // Adds any errors to myStatus. 00645 bool allocateRecordStructure(int size); 00646 00647 void* getStringPtr(int offset); 00648 void* getIntegerPtr(int offset, char& vtype); 00649 void* getDoublePtr(int offset); 00650 00651 // Fixes the buffer to match the variable length fields. 00652 // Adds any errors to myStatus. 00653 bool fixBuffer(SequenceTranslation translation); 00654 00655 // Sets the Sequence and Quality strings from the buffer. 00656 // They are done together in one method because they require the same 00657 // loop, so might as well be done at the same time. 00658 // Adds any errors to myStatus. 00659 void setSequenceAndQualityFromBuffer(); 00660 00661 // Parse the cigar to calculate the alignment/unclipped ends and convert 00662 // to SAM/BAM format. 00663 // Adds any errors to myStatus. 00664 bool parseCigar(); 00665 // Parse the cigar string to calculate the cigar length and alignment end 00666 // and convert to SAM format. 00667 // Adds any errors to myStatus. 00668 bool parseCigarBinary(); 00669 // Parse the cigar string to calculate the cigar length and alignment end 00670 // and convert to BAM format. 00671 // Adds any errors to myStatus. 00672 bool parseCigarString(); 00673 00674 // Set the tags from the buffer. 00675 // Adds any errors to myStatus. 00676 bool setTagsFromBuffer(); 00677 00678 // Set the tags in the buffer. 00679 // Adds any errors to myStatus. 00680 bool setTagsInBuffer(); 00681 00682 void setVariablesForNewBuffer(SamFileHeader& header); 00683 00684 void getTypeFromKey(int key, char& type) const; 00685 void getTag(int key, char* tag) const; 00686 00687 String & getString(int offset); 00688 int & getInteger(int offset); 00689 char & getIntegerType(int offset); 00690 double & getDouble(int offset); 00691 00692 static const int DEFAULT_BLOCK_SIZE = 40; 00693 static const int DEFAULT_BIN = 4680; 00694 static const int DEFAULT_READ_NAME_LENGTH = 8; 00695 static const char* DEFAULT_READ_NAME; 00696 static const char* FIELD_ABSENT_STRING; 00697 00698 bamRecordStruct * myRecordPtr; 00699 int allocatedSize; 00700 00701 // Pointer to a temporary cigar buffer that can be used during string 00702 // parsing before it is ready to be copied into the actual record. 00703 uint32_t* myCigarTempBuffer; 00704 00705 // Size of the currently allocated temporary cigar buffer. 00706 int myCigarTempBufferAllocatedSize; 00707 00708 // Length of the cigar currently contained in the temporary buffer. 00709 int myCigarTempBufferLength; 00710 00711 // Track if the buffer is in sync with the Strings/Tags. 00712 // Set to false if any of the variable length fields are modified. 00713 // Set to true when the buffer is updated to match the variable length 00714 // fields. 00715 bool myIsBufferSynced; 00716 00717 // Track if the tags need to be set from the buffer. 00718 bool myNeedToSetTagsFromBuffer; 00719 00720 // Trag if the tags need to be set in the buffer. 00721 // Allows you to set just the tags if they are the only thing that changed 00722 // in the buffer. 00723 bool myNeedToSetTagsInBuffer; 00724 00725 int myTagBufferSize; 00726 int myLastTagIndex; 00727 00728 String myReadName; 00729 String myReferenceName; 00730 String myMateReferenceName; 00731 String myCigar; 00732 String mySequence; 00733 String myQuality; 00734 00735 std::string mySeqWithEq; 00736 std::string mySeqWithoutEq; 00737 00738 // The length of the alignment. 00739 int32_t myAlignmentLength; 00740 // Unclipped alignment positions. 00741 int32_t myUnclippedStartOffset; 00742 int32_t myUnclippedEndOffset; 00743 00744 CigarRoller myCigarRoller; 00745 00746 LongHash<int> extras; 00747 // Note: not all values in strings, integers, and doubles are always 00748 // in extras. They will not be if the tags were removed. Removed 00749 // tags are removed from extras, but not from strings, integers, or doubles 00750 // since if one was removed from these arrays, all other entries would 00751 // need their indices updated in extras. 00752 StringArray strings; 00753 IntArray integers; 00754 std::vector<char> intType; // contains the type of int at same position in integers. 00755 Vector doubles; 00756 00757 00758 // Track whether or not the buffer values are correct for 00759 // each setting. 00760 bool myIsReadNameBufferValid; 00761 bool myIsCigarBufferValid; 00762 bool myIsSequenceBufferValid; 00763 bool myIsQualityBufferValid; 00764 bool myIsTagsBufferValid; 00765 bool myIsBinValid; 00766 00767 SamStatus myStatus; 00768 00769 // The current translation of the sequence as it occurs in the buffer. 00770 // Only applicable if myIsSequenceBufferValid == true. 00771 SequenceTranslation myBufferSequenceTranslation; 00772 00773 00774 // Track the Reference. 00775 GenomeSequence* myRefPtr; 00776 00777 // The type of translation to do when getting a sequence. 00778 SequenceTranslation mySequenceTranslation; 00779 00780 String NOT_FOUND_TAG_STRING; 00781 int NOT_FOUND_TAG_INT; 00782 double NOT_FOUND_TAG_DOUBLE; 00783 }; 00784 00785 #endif