00001 /* 00002 * Copyright (C) 2010-2011 Regents of the University of Michigan 00003 * 00004 * This program is free software: you can redistribute it and/or modify 00005 * it under the terms of the GNU General Public License as published by 00006 * the Free Software Foundation, either version 3 of the License, or 00007 * (at your option) any later version. 00008 * 00009 * This program is distributed in the hope that it will be useful, 00010 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 * GNU General Public License for more details. 00013 * 00014 * You should have received a copy of the GNU General Public License 00015 * along with this program. If not, see <http://www.gnu.org/licenses/>. 00016 */ 00017 00018 #ifndef __SAM_RECORD_H__ 00019 #define __SAM_RECORD_H__ 00020 00021 #include <stdint.h> 00022 00023 #include "GenomeSequence.h" 00024 #include "SamStatus.h" 00025 #include "LongHash.h" 00026 #include "MathVector.h" 00027 #include "StringArray.h" 00028 #include "IntArray.h" 00029 #include "SamFileHeader.h" 00030 #include "CigarRoller.h" 00031 00032 /// Structure of a BAM record. 00033 struct bamRecordStruct 00034 { 00035 public: 00036 int32_t myBlockSize; 00037 int32_t myReferenceID; 00038 int32_t myPosition; 00039 uint32_t myReadNameLength : 8, myMapQuality : 8, myBin : 16; 00040 uint32_t myCigarLength : 16, myFlag : 16; 00041 int32_t myReadLength; 00042 int32_t myMateReferenceID; 00043 int32_t myMatePosition; 00044 int32_t myInsertSize; // Outer fragment length 00045 char myData[1]; 00046 }; 00047 00048 00049 /// Class providing an easy to use interface to get/set/operate on the 00050 /// fields in a SAM/BAM record. 00051 class SamRecord 00052 { 00053 public: 00054 /// Enum containing the settings on how to translate the sequence if a 00055 /// reference is available. If no reference is available, no translation 00056 /// is done. 00057 enum SequenceTranslation { 00058 NONE, ///< Leave the sequence as is. 00059 EQUAL, ///< Translate bases that match the reference to '=' 00060 BASES, ///< Translate '=' to the actual base. 00061 }; 00062 00063 /// Default Constructor. 00064 SamRecord(); 00065 00066 /// Constructor that sets the error handling type. 00067 /// \param errorHandlingType how to handle errors. 00068 SamRecord(ErrorHandler::HandlingType errorHandlingType); 00069 00070 /// Destructor 00071 ~SamRecord(); 00072 00073 /// Reset the fields of the record to a default value. 00074 /// This is not necessary when you are reading a Sam/Bam file, 00075 /// but if you are setting fields, it is a good idea to clean 00076 /// out a record before reusing it. Clearing it allows you to 00077 /// not have to set any empty fields. 00078 void resetRecord(); 00079 00080 /// Reset the tag iterator to the beginning of the tags. 00081 void resetTagIter(); 00082 00083 /// Returns whether or not the record is valid. 00084 /// Sets the status to indicate success or failure. 00085 /// \param header SAM Header associated with the record. Used to perform 00086 /// some validation against the header. 00087 /// \return true if the record is valid, false if not. 00088 bool isValid(SamFileHeader& header); 00089 00090 /// Read the BAM record from a file. 00091 /// \param filePtr file to read the buffer from. 00092 /// \param header BAM header for the record. 00093 /// \return status of the reading the BAM record from the file. 00094 SamStatus::Status setBufferFromFile(IFILE filePtr, SamFileHeader& header); 00095 00096 /// Set the reference to the specified genome sequence object. 00097 /// \param reference pointer to the GenomeSequence object. 00098 void setReference(GenomeSequence* reference); 00099 00100 /// Set the type of sequence translation to use when getting 00101 /// the sequence. The default type (if this method is never called) is 00102 /// NONE (the sequence is left as-is). Can be over-ridden by using 00103 /// the accessors that take a SequenceTranslation parameter. 00104 /// \param translation type of sequence translation to use. 00105 void setSequenceTranslation(SequenceTranslation translation); 00106 00107 /////////////////////// 00108 /// @name Set Alignment Data 00109 /// Set methods for record fields. All of the "set" methods set the 00110 /// status to indicate success or the failure reason. 00111 //@{ 00112 00113 /// Set QNAME to the passed in name. 00114 /// \param readName the readname to set the QNAME to. 00115 /// \return true if successfully set, false if not. 00116 bool setReadName(const char* readName); 00117 00118 /// Set the bitwise flag to the specified value. 00119 /// \param flag integer flag to use. 00120 /// \return true if successfully set, false if not. 00121 bool setFlag(uint16_t flag); 00122 00123 /// Set the reference name to the specified name, using the header to 00124 /// determine the reference id. 00125 /// \param header SAM/BAM header to use to determine the reference id. 00126 /// \param referenceName reference name to use. 00127 /// \return true if successfully set, false if not 00128 bool setReferenceName(SamFileHeader& header, 00129 const char* referenceName); 00130 00131 /// Set the leftmost position using the specified 1-based (SAM format) 00132 /// value. 00133 /// Internal processing handles the switching between SAM/BAM formats 00134 /// when read/written. 00135 /// \param position 1-based start position 00136 /// \return true if successfully set, false if not. 00137 bool set1BasedPosition(int32_t position); 00138 00139 /// Set the leftmost position using the specified 0-based (BAM format) 00140 /// value. 00141 /// Internal processing handles the switching between SAM/BAM formats 00142 /// when read/written. 00143 /// \param position 0-based start position 00144 /// \return true if successfully set, false if not. 00145 bool set0BasedPosition(int32_t position); 00146 00147 /// Set the mapping quality. 00148 /// \param mapQuality map quality to set in the record. 00149 /// \return true if successfully set, false if not. 00150 bool setMapQuality(uint8_t mapQuality); 00151 00152 /// Set the CIGAR to the specified SAM formatted cigar string. 00153 /// Internal processing handles the switching between SAM/BAM formats 00154 /// when read/written. 00155 /// \param cigar string containing the SAM formatted cigar. 00156 /// \return true if successfully set, false if not. 00157 bool setCigar(const char* cigar); 00158 00159 /// Set the CIGAR to the specified Cigar object. 00160 /// Internal processing handles the switching between SAM/BAM formats 00161 /// when read/written. 00162 /// \param cigar object to set this record's cigar to have. 00163 /// \return true if successfully set, false if not. 00164 bool setCigar(const Cigar& cigar); 00165 00166 00167 /// Set the mate reference sequence name to the specified name, using the 00168 /// header to determine the matee reference id. 00169 /// \param header SAM/BAM header to use to determine the mate reference id. 00170 /// \param referenceName mate reference name to use. 00171 /// \return true if successfully set, false if not 00172 bool setMateReferenceName(SamFileHeader& header, 00173 const char* mateReferenceName); 00174 00175 /// Set the leftmost mate position using the specified 1-based (SAM format) 00176 /// value. 00177 /// Internal processing handles the switching between SAM/BAM formats 00178 /// when read/written. 00179 /// \param position 1-based start position 00180 /// \return true if successfully set, false if not. 00181 bool set1BasedMatePosition(int32_t matePosition); 00182 00183 /// Set the leftmost mate position using the specified 0-based (BAM format) 00184 /// value. 00185 /// Internal processing handles the switching between SAM/BAM formats 00186 /// when read/written. 00187 /// \param position 0-based start position 00188 /// \return true if successfully set, false if not. 00189 bool set0BasedMatePosition(int32_t matePosition); 00190 00191 /// Sets the inferred insert size. 00192 /// \param insertSize inferred insert size. 00193 /// \return true if successfully set, false if not. 00194 bool setInsertSize(int32_t insertSize); 00195 00196 /// Sets the sequence to the specified sequence string. This is a 00197 /// SAM formatted sequence string. Internal processing handles switching 00198 /// between SAM/BAM formats when read/written. 00199 /// \param seq SAM sequence string. May contain '='. 00200 /// \return true if successfully set, false if not. 00201 bool setSequence(const char* seq); 00202 00203 /// Sets the quality to the specified quality string. This is a SAM 00204 /// formatted quality string. Internal processing handles switching 00205 /// between SAM/BAM formats when read/written. 00206 /// \param quality SAM quality string. 00207 /// \return true if successfully set, false if not. 00208 bool setQuality(const char* quality); 00209 00210 /// Sets the SamRecord to contain the information in BAM format 00211 /// found in fromBuffer. 00212 /// \param fromBuffer buffer to read the BAM record from. 00213 /// \param fromBufferSize size of the buffer containing the BAM record. 00214 /// \param header BAM header for the record. 00215 /// \return status of reading the BAM record from the buffer. 00216 SamStatus::Status setBuffer(const char* fromBuffer, uint32_t fromBufferSize, 00217 SamFileHeader& header); 00218 00219 /// Add the specified integer tag to the record. Internal processing 00220 /// handles switching between SAM/BAM formats when read/written and 00221 /// determining the type for BAM format. If the tag is already there 00222 /// this code will replace it if the specified value is different. 00223 /// \param tag two character tag to be added to the SAM/BAM record. 00224 /// \param value value for the specified tag. 00225 /// \return true if the tag was successfully added, false otherwise. 00226 bool addIntTag(const char* tag, int32_t value); 00227 00228 /// Add the specified tag to the record. Internal processing handles 00229 /// switching between SAM/BAM formats when read/written. If the tag 00230 /// is already there this code will replace it if the specified value 00231 /// is different. 00232 /// \param tag two character tag to be added to the SAM/BAM record. 00233 /// \param vtype vtype of the specified value - either SAM/BAM vtypes. 00234 /// \param value value as a string for the specified tag. 00235 /// \return true if the tag was successfully added, false otherwise. 00236 bool addTag(const char* tag, char vtype, const char* value); 00237 00238 /// Shift the indels (if any) to the left by updating the CIGAR. 00239 /// \return true if the cigar was shifted, false if not. 00240 bool shiftIndelsLeft(); 00241 00242 //@} 00243 00244 /////////////////////// 00245 /// @name Get Alignment Data 00246 /// Get methods for record fields. All of the "get" methods set the 00247 /// status to indicate success or the failure reason. 00248 //@{ 00249 00250 /// Get a const pointer to the buffer that contains the BAM representation 00251 /// of the record. 00252 /// \return const pointer to the buffer that contains the BAM representation 00253 /// of the record. 00254 const void* getRecordBuffer(); 00255 00256 /// Get a const pointer to the buffer that contains the BAM representation 00257 /// of the record. 00258 /// \param translation type of sequence translation to use. 00259 /// \return const pointer to the buffer that contains the BAM representation 00260 /// of the record. 00261 const void* getRecordBuffer(SequenceTranslation translation); 00262 00263 /// Write the record as a BAM into the specified file. 00264 /// \param filePtr file to write the BAM record into. 00265 /// \return status of the write. 00266 SamStatus::Status writeRecordBuffer(IFILE filePtr); 00267 00268 /// Write the record as a BAM into the specified file. 00269 /// \param filePtr file to write the BAM record into. 00270 /// \param translation type of sequence translation to use. 00271 /// \return status of the write. 00272 SamStatus::Status writeRecordBuffer(IFILE filePtr, 00273 SequenceTranslation translation); 00274 00275 /// Get the block size of the record. 00276 /// \return BAM block size of the record. 00277 int32_t getBlockSize(); 00278 00279 /// Get the reference sequence name of the record. 00280 /// \return reference sequence name 00281 const char* getReferenceName(); 00282 00283 /// Get the reference sequence id of the record. 00284 /// \return reference sequence id 00285 int32_t getReferenceID(); 00286 00287 /// Get the 1-based(SAM) leftmost position of the record. 00288 /// \return 1-based leftmost position. 00289 int32_t get1BasedPosition(); 00290 00291 /// Get the 0-based(BAM) leftmost position of the record. 00292 /// \return 0-based leftmost position. 00293 int32_t get0BasedPosition(); 00294 00295 /// Get the length of the readname (QNAME) including the null. 00296 /// \return length of the read name (including null). 00297 uint8_t getReadNameLength(); 00298 00299 /// Get the mapping quality of the record. 00300 /// \return map quality. 00301 uint8_t getMapQuality(); 00302 00303 /// Get the BAM bin for the record. 00304 /// \return BAM bin 00305 uint16_t getBin(); 00306 00307 /// Get the length of the CIGAR in BAM format. 00308 /// \return length of BAM formatted cigar. 00309 uint16_t getCigarLength(); 00310 00311 /// Get the flag. 00312 /// \return flag. 00313 uint16_t getFlag(); 00314 00315 /// Get the length of the read. 00316 /// \return read length. 00317 int32_t getReadLength(); 00318 00319 /// Get the mate reference sequence name of the record. If it is equal to 00320 /// the reference name, it still returns the reference name. 00321 /// \return reference sequence name 00322 const char* getMateReferenceName(); 00323 00324 /// Get the mate reference sequence name of the record, returning "=" if 00325 /// it is the same as the reference name, unless they are both "*" in 00326 /// which case "*" is returned. 00327 /// \return reference sequence name 00328 const char* getMateReferenceNameOrEqual(); 00329 00330 /// Get the mate reference id of the record. 00331 /// \return reference id 00332 int32_t getMateReferenceID(); 00333 00334 /// Get the 1-based(SAM) leftmost mate position of the record. 00335 /// \return 1-based leftmost position. 00336 int32_t get1BasedMatePosition(); 00337 00338 /// Get the 0-based(BAM) leftmost mate position of the record. 00339 /// \return 0-based leftmost position. 00340 int32_t get0BasedMatePosition(); 00341 00342 /// Get the inferred insert size of the read pair. 00343 /// \return inferred insert size. 00344 int32_t getInsertSize(); 00345 00346 /// Returns the 0-based inclusive rightmost position of the 00347 /// clipped sequence. 00348 /// \return 0-based inclusive rightmost position 00349 int32_t get0BasedAlignmentEnd(); 00350 00351 /// Returns the 1-based inclusive rightmost position of the 00352 /// clipped sequence. 00353 /// \return 1-based inclusive rightmost position 00354 int32_t get1BasedAlignmentEnd(); 00355 00356 /// Returns the length of the clipped sequence, returning 0 if the cigar 00357 /// is '*'. 00358 /// \return length of the clipped sequence. 00359 int32_t getAlignmentLength(); 00360 00361 /// Returns the 0-based inclusive left-most position adjusted for 00362 /// clipped bases. 00363 /// \return 0-based inclusive leftmost position including clips. 00364 int32_t get0BasedUnclippedStart(); 00365 00366 /// Returns the 1-based inclusive left-most position adjusted for 00367 /// clipped bases. 00368 /// \return 1-based inclusive leftmost position including clips. 00369 int32_t get1BasedUnclippedStart(); 00370 00371 /// Returns the 0-based inclusive right-most position adjusted for 00372 /// clipped bases. 00373 /// \return 0-based inclusive rightmost position including clips. 00374 int32_t get0BasedUnclippedEnd(); 00375 00376 /// Returns the 1-based inclusive right-most position adjusted for 00377 /// clipped bases. 00378 /// \return 1-based inclusive rightmost position including clips. 00379 int32_t get1BasedUnclippedEnd(); 00380 00381 /// Returns the SAM formatted Read Name (QNAME). 00382 /// \return read name. 00383 const char* getReadName(); 00384 00385 /// Returns the SAM formatted CIGAR string. 00386 /// \return cigar string. 00387 const char* getCigar(); 00388 00389 /// Returns the SAM formatted sequence string, translating the base as 00390 /// specified by setSequenceTranslation. 00391 /// \return sequence string. 00392 const char* getSequence(); 00393 00394 /// Returns the SAM formatted sequence string performing the specified 00395 /// sequence translation. 00396 /// \param translation type of sequence translation to use. 00397 /// \return sequence string. 00398 const char* getSequence(SequenceTranslation translation); 00399 00400 /// Returns the SAM formatted quality string. 00401 /// \return quality string. 00402 const char* getQuality(); 00403 00404 /// Get the sequence base at the specified index into this sequence 0 to 00405 /// readLength - 1, translating the base as specified by 00406 /// setSequenceTranslation. 00407 /// \param index index into the sequence string (0 to readLength-1). 00408 /// \return the sequence base at the specified index into the sequence. 00409 char getSequence(int index); 00410 00411 /// Get the sequence base at the specified index into this sequence 0 to 00412 /// readLength - performing the specified sequence translation1. 00413 /// \param index index into the sequence string (0 to readLength-1). 00414 /// \param translation type of sequence translation to use. 00415 /// \return the sequence base at the specified index into the sequence. 00416 char getSequence(int index, SequenceTranslation translation); 00417 00418 /// Get the quality character at the specified index into the quality 0 to 00419 /// readLength - 1. 00420 /// \param index index into the quality string (0 to readLength-1). 00421 /// \return the quality character at the specified index into the quality. 00422 char getQuality(int index); 00423 00424 /// Returns a pointer to the Cigar object associated with this record. 00425 /// The object is essentially read-only, only allowing modifications 00426 /// due to lazy evaluations. 00427 /// \return pointer to the Cigar object. 00428 // TODO - want this to be getCigar 00429 Cigar* getCigarInfo(); 00430 00431 /// Returns the length of the tags in BAM format. 00432 /// \return length of tags in BAM format. 00433 uint32_t getTagLength(); 00434 00435 /// Get the next tag from the record. 00436 /// Sets the Status to SUCCESS when a tag is successfully returned or 00437 /// when there are no more tags. Otherwise the status is set to describe 00438 /// why it failed (parsing, etc). 00439 /// \param tag set to the tag when a tag is read. 00440 /// \param vtype set to the vtype when a tag is read. 00441 /// \param value pointer to the value of the tag (will need to cast 00442 /// to int, double, char, or string based on vtype). 00443 /// \return true if a tag was read, false if there are no more tags. 00444 bool getNextSamTag(char* tag, char& vtype, void** value); 00445 00446 /// Returns the values of all fields except the tags. 00447 /// \param recStruct structure containing the contents of all 00448 /// non-variable length fields. 00449 /// \param readName read name from the record (return param) 00450 /// \param cigar cigar string from the record (return param) 00451 /// \param sequence sequence string from the record (return param) 00452 /// \param quality quality string from the record (return param) 00453 /// \return true if all fields were successfully set, false otherwise. 00454 bool getFields(bamRecordStruct& recStruct, String& readName, 00455 String& cigar, String& sequence, String& quality); 00456 00457 /// Returns the values of all fields except the tags. 00458 /// \param recStruct structure containing the contents of all 00459 /// non-variable length fields. 00460 /// \param readName read name from the record (return param) 00461 /// \param cigar cigar string from the record (return param) 00462 /// \param sequence sequence string from the record (return param) 00463 /// \param quality quality string from the record (return param) 00464 /// \param translation type of sequence translation to use. 00465 /// \return true if all fields were successfully set, false otherwise. 00466 bool getFields(bamRecordStruct& recStruct, String& readName, 00467 String& cigar, String& sequence, String& quality, 00468 SequenceTranslation translation); 00469 00470 /// Returns a pointer to the genome sequence object associated with this 00471 /// record if it was set (NULL if it was not set). 00472 /// \return pointer to the GenomeSequence object or NULL if there isn't one. 00473 GenomeSequence* getReference(); 00474 00475 //@} 00476 00477 /// Returns whether or not the specified vtype is an integer type. 00478 /// Does not set SamStatus. 00479 /// \param vtype value type to check. 00480 /// \return true if the passed in vtype is an integer ('c', 'C', 's', 00481 /// 'S', 'i', 'I'), false otherwise. 00482 bool isIntegerType(char vtype) const; 00483 00484 /// Returns whether or not the specified vtype is a double type. 00485 /// Does not set SamStatus. 00486 /// \param vtype value type to check. 00487 /// \return true if the passed in vtype is a double ('f'), false otherwise. 00488 bool isDoubleType(char vtype) const; 00489 00490 /// Returns whether or not the specified vtype is a char type. 00491 /// Does not set SamStatus. 00492 /// \param vtype value type to check. 00493 /// \return true if the passed in vtype is a char ('A'), false otherwise. 00494 bool isCharType(char vtype) const; 00495 00496 /// Returns whether or not the specified vtype is a string type. 00497 /// Does not set SamStatus. 00498 /// \param vtype value type to check. 00499 /// \return true if the passed in vtype is a string ('Z'), false othwerise. 00500 bool isStringType(char vtype) const; 00501 00502 /// Clear the tags in this record. 00503 /// Does not set SamStatus. 00504 void clearTags(); 00505 00506 /// Remove a tag. 00507 /// \param tag tag to remove. 00508 /// \param type of the tag to be removed. 00509 /// \return true if the tag no longer exists in the record, false if it could not be removed (Returns true if the tag was not found in the record). 00510 bool rmTag(const char* tag, char type); 00511 00512 /// Remove tags. 00513 /// \param tags tags to remove, formatted as Tag:Type;Tag:Type;Tag:Type... 00514 /// \return true if all tags no longer exist in the record, false if any could not be removed 00515 /// (Returns true if the tags were not found in the record). 00516 /// SamStatus is set to INVALID if the tags are incorrectly formatted. 00517 bool rmTags(const char* tags); 00518 00519 /// Returns the status associated with the last method that sets the status. 00520 /// \return SamStatus of the last command that sets status. 00521 const SamStatus& getStatus(); 00522 00523 /// Get the string representation of the tags from the record, formatted 00524 /// as TAG:TYPE:VALUE<delim>TAG:TYPE:VALUE... 00525 /// Sets the Status to SUCCESS when the tags are successfully returned or 00526 /// the tags were not found. If a different error occured, the status is 00527 /// set appropriately. 00528 /// \param tags the tags to retrieve, formatted as TAG:TYPE;TAG:TYPE... 00529 /// \param returnString the String to set (this method first clears returnString) 00530 /// to TAG:TYPE:VALUE<delim>TAG:TYPE:VALUE... 00531 /// \param delim delimiter to use to separate two tags, default is a tab. 00532 /// \return true if there were not any errors even if no tags were found. 00533 bool getTagsString(const char* tags, String& returnString, char delim = '\t'); 00534 00535 /// Get the string value for the specified tag. 00536 /// \param tag tag to retrieve 00537 /// \param pointer to the tag's string value if found, NULL if not found. 00538 String* getStringTag(const char * tag); 00539 00540 /// Get the integer value for the specified tag. 00541 /// \param tag tag to retrieve 00542 /// \retun pointer to the tag's integer value if found, NULL if not found. 00543 int* getIntegerTag(const char * tag); 00544 00545 /// Get the char value for the specified tag. 00546 /// \param tag tag to retrieve 00547 /// \retun pointer to the tag's char value if found, NULL if not found. 00548 char* getCharTag(const char * tag); 00549 00550 /// Get the double value for the specified tag. 00551 /// \param tag tag to retrieve 00552 /// \return pointer to the tag's double value if found, NULL if not found. 00553 double* getDoubleTag(const char * tag); 00554 00555 /// Get the string value for the specified tag. 00556 String & getString(const char * tag); 00557 00558 /// Get the integer value for the specified tag. 00559 int & getInteger(const char * tag); 00560 00561 /// Get the char value for the specified tag. 00562 char & getChar(const char * tag); 00563 00564 /// Get the double value for the specified tag. 00565 double & getDouble(const char * tag); 00566 00567 00568 // void getSamExtraFieldFromKey(int key, String& extraField); 00569 00570 /// Check if the specified tag contains a string. 00571 /// Does not set SamStatus. 00572 /// \param tag SAM tag to check contents of. 00573 /// \return true if the value associated with the tag is a string. 00574 bool checkString(const char * tag) { return checkTag(tag, 'Z'); } 00575 00576 /// Check if the specified tag contains a string. 00577 /// Does not set SamStatus. 00578 /// \param tag SAM tag to check contents of. 00579 /// \return true if the value associated with the tag is a string. 00580 bool checkInteger(const char * tag) { return checkTag(tag, 'i'); } 00581 00582 /// Check if the specified tag contains a string. 00583 /// Does not set SamStatus. 00584 /// \param tag SAM tag to check contents of. 00585 /// \return true if the value associated with the tag is a string. 00586 bool checkDouble(const char * tag) { return checkTag(tag, 'f'); } 00587 00588 /// Check if the specified tag contains a value of the specified vtype. 00589 /// Does not set SamStatus. 00590 /// \param tag SAM tag to check contents of. 00591 /// \param type value type to check if the SAM tag matches. 00592 /// \return true if the value associated with the tag is a string. 00593 bool checkTag(const char * tag, char type); 00594 00595 00596 /// Return the number of bases in this read that overlap the passed in 00597 /// region. 00598 /// \param start inclusive 0-based start position (reference position) of 00599 /// the region to check for overlaps in. 00600 /// (-1 indicates to start at the beginning of the reference.) 00601 /// \param end exclusive 0-based end position (reference position) of the 00602 /// region to check for overlaps in. 00603 /// (-1 indicates to go to the end of the reference.) 00604 /// \return number of overlapping bases 00605 /// (matches in the cigar - not skips/deletions) 00606 uint32_t getNumOverlaps(int32_t start, int32_t end); 00607 00608 00609 private: 00610 static int MAKEKEY(char ch1, char ch2, char type) 00611 { return (getKeyType(type) << 16) + (ch2 << 8) + ch1; } 00612 00613 static char getKeyType(char type) 00614 { 00615 switch(type) 00616 { 00617 // For any char/integer type, return 'i' 00618 case 'A' : 00619 case 'c' : 00620 case 'C' : 00621 case 's' : 00622 case 'S' : 00623 case 'i' : 00624 case 'I' : 00625 return('i'); 00626 break; 00627 default: 00628 // For all other types, return the actual type. 00629 return(type); 00630 }; 00631 } 00632 00633 // Allocate space for the record - does a realloc. 00634 // The passed in size is the size of the entire record including the 00635 // block size field. 00636 // Adds any errors to myStatus. 00637 bool allocateRecordStructure(int size); 00638 00639 void* getStringPtr(int offset); 00640 void* getIntegerPtr(int offset, char& vtype); 00641 void* getCharPtr(int offset); 00642 void* getDoublePtr(int offset); 00643 00644 // Fixes the buffer to match the variable length fields. 00645 // Adds any errors to myStatus. 00646 bool fixBuffer(SequenceTranslation translation); 00647 00648 // Sets the Sequence and Quality strings from the buffer. 00649 // They are done together in one method because they require the same 00650 // loop, so might as well be done at the same time. 00651 // Adds any errors to myStatus. 00652 void setSequenceAndQualityFromBuffer(); 00653 00654 // Parse the cigar to calculate the alignment/unclipped ends and convert 00655 // to SAM/BAM format. 00656 // Adds any errors to myStatus. 00657 bool parseCigar(); 00658 // Parse the cigar string to calculate the cigar length and alignment end 00659 // and convert to SAM format. 00660 // Adds any errors to myStatus. 00661 bool parseCigarBinary(); 00662 // Parse the cigar string to calculate the cigar length and alignment end 00663 // and convert to BAM format. 00664 // Adds any errors to myStatus. 00665 bool parseCigarString(); 00666 00667 // Set the tags from the buffer. 00668 // Adds any errors to myStatus. 00669 bool setTagsFromBuffer(); 00670 00671 // Set the tags in the buffer. 00672 // Adds any errors to myStatus. 00673 bool setTagsInBuffer(); 00674 00675 void setVariablesForNewBuffer(SamFileHeader& header); 00676 00677 void getTypeFromKey(int key, char& type) const; 00678 void getTag(int key, char* tag) const; 00679 00680 String & getString(int offset); 00681 int & getInteger(int offset); 00682 char & getIntegerType(int offset); 00683 char & getChar(int offset); 00684 double & getDouble(int offset); 00685 00686 static const int DEFAULT_BLOCK_SIZE = 40; 00687 static const int DEFAULT_BIN = 4680; 00688 static const int DEFAULT_READ_NAME_LENGTH = 8; 00689 static const char* DEFAULT_READ_NAME; 00690 static const char* FIELD_ABSENT_STRING; 00691 00692 bamRecordStruct * myRecordPtr; 00693 int allocatedSize; 00694 00695 // Pointer to a temporary cigar buffer that can be used during string 00696 // parsing before it is ready to be copied into the actual record. 00697 uint32_t* myCigarTempBuffer; 00698 00699 // Size of the currently allocated temporary cigar buffer. 00700 int myCigarTempBufferAllocatedSize; 00701 00702 // Length of the cigar currently contained in the temporary buffer. 00703 int myCigarTempBufferLength; 00704 00705 // Track if the buffer is in sync with the Strings/Tags. 00706 // Set to false if any of the variable length fields are modified. 00707 // Set to true when the buffer is updated to match the variable length 00708 // fields. 00709 bool myIsBufferSynced; 00710 00711 // Track if the tags need to be set from the buffer. 00712 bool myNeedToSetTagsFromBuffer; 00713 00714 // Trag if the tags need to be set in the buffer. 00715 // Allows you to set just the tags if they are the only thing that changed 00716 // in the buffer. 00717 bool myNeedToSetTagsInBuffer; 00718 00719 int myTagBufferSize; 00720 int myLastTagIndex; 00721 00722 String myReadName; 00723 String myReferenceName; 00724 String myMateReferenceName; 00725 String myCigar; 00726 String mySequence; 00727 String myQuality; 00728 00729 std::string mySeqWithEq; 00730 std::string mySeqWithoutEq; 00731 00732 // The length of the alignment. 00733 int32_t myAlignmentLength; 00734 // Unclipped alignment positions. 00735 int32_t myUnclippedStartOffset; 00736 int32_t myUnclippedEndOffset; 00737 00738 CigarRoller myCigarRoller; 00739 00740 LongHash<int> extras; 00741 // Note: not all values in strings, integers, and doubles are always 00742 // in extras. They will not be if the tags were removed. Removed 00743 // tags are removed from extras, but not from strings, integers, or doubles 00744 // since if one was removed from these arrays, all other entries would 00745 // need their indices updated in extras. 00746 StringArray strings; 00747 IntArray integers; 00748 std::vector<char> intType; // contains the type of int at same position in integers. 00749 Vector doubles; 00750 00751 00752 // Track whether or not the buffer values are correct for 00753 // each setting. 00754 bool myIsReadNameBufferValid; 00755 bool myIsCigarBufferValid; 00756 bool myIsSequenceBufferValid; 00757 bool myIsQualityBufferValid; 00758 bool myIsTagsBufferValid; 00759 bool myIsBinValid; 00760 00761 SamStatus myStatus; 00762 00763 // The current translation of the sequence as it occurs in the buffer. 00764 // Only applicable if myIsSequenceBufferValid == true. 00765 SequenceTranslation myBufferSequenceTranslation; 00766 00767 00768 // Track the Reference. 00769 GenomeSequence* myRefPtr; 00770 00771 // The type of translation to do when getting a sequence. 00772 SequenceTranslation mySequenceTranslation; 00773 00774 String NOT_FOUND_TAG_STRING; 00775 int NOT_FOUND_TAG_INT; 00776 double NOT_FOUND_TAG_DOUBLE; 00777 }; 00778 00779 #endif