libStatGen Software
1
|
00001 /* 00002 * Copyright (C) 2010-2012 Regents of the University of Michigan 00003 * 00004 * This program is free software: you can redistribute it and/or modify 00005 * it under the terms of the GNU General Public License as published by 00006 * the Free Software Foundation, either version 3 of the License, or 00007 * (at your option) any later version. 00008 * 00009 * This program is distributed in the hope that it will be useful, 00010 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 * GNU General Public License for more details. 00013 * 00014 * You should have received a copy of the GNU General Public License 00015 * along with this program. If not, see <http://www.gnu.org/licenses/>. 00016 */ 00017 00018 #include <stdlib.h> 00019 #include <limits> 00020 #include <stdexcept> 00021 00022 #include "bam.h" 00023 00024 #include "SamRecord.h" 00025 #include "SamValidation.h" 00026 00027 #include "BaseUtilities.h" 00028 #include "SamQuerySeqWithRefHelper.h" 00029 00030 const char* SamRecord::DEFAULT_READ_NAME = "UNKNOWN"; 00031 const char* SamRecord::FIELD_ABSENT_STRING = "="; 00032 int SamRecord::myNumWarns = 0; 00033 00034 SamRecord::SamRecord() 00035 : myStatus(), 00036 myRefPtr(NULL), 00037 mySequenceTranslation(NONE) 00038 { 00039 int32_t defaultAllocSize = DEFAULT_BLOCK_SIZE + sizeof(int32_t); 00040 00041 myRecordPtr = 00042 (bamRecordStruct *) malloc(defaultAllocSize); 00043 00044 myCigarTempBuffer = NULL; 00045 myCigarTempBufferAllocatedSize = 0; 00046 00047 allocatedSize = defaultAllocSize; 00048 00049 resetRecord(); 00050 } 00051 00052 00053 SamRecord::SamRecord(ErrorHandler::HandlingType errorHandlingType) 00054 : myStatus(errorHandlingType), 00055 myRefPtr(NULL), 00056 mySequenceTranslation(NONE) 00057 { 00058 int32_t defaultAllocSize = DEFAULT_BLOCK_SIZE + sizeof(int32_t); 00059 00060 myRecordPtr = 00061 (bamRecordStruct *) malloc(defaultAllocSize); 00062 00063 myCigarTempBuffer = NULL; 00064 myCigarTempBufferAllocatedSize = 0; 00065 00066 allocatedSize = defaultAllocSize; 00067 00068 resetRecord(); 00069 } 00070 00071 00072 SamRecord::~SamRecord() 00073 { 00074 resetRecord(); 00075 00076 if(myRecordPtr != NULL) 00077 { 00078 free(myRecordPtr); 00079 myRecordPtr = NULL; 00080 } 00081 if(myCigarTempBuffer != NULL) 00082 { 00083 free(myCigarTempBuffer); 00084 myCigarTempBuffer = NULL; 00085 myCigarTempBufferAllocatedSize = 0; 00086 } 00087 } 00088 00089 00090 // Resets the fields of the record to a default value. 00091 void SamRecord::resetRecord() 00092 { 00093 myIsBufferSynced = true; 00094 00095 myRecordPtr->myBlockSize = DEFAULT_BLOCK_SIZE; 00096 myRecordPtr->myReferenceID = -1; 00097 myRecordPtr->myPosition = -1; 00098 myRecordPtr->myReadNameLength = DEFAULT_READ_NAME_LENGTH; 00099 myRecordPtr->myMapQuality = 0; 00100 myRecordPtr->myBin = DEFAULT_BIN; 00101 myRecordPtr->myCigarLength = 0; 00102 myRecordPtr->myFlag = 0; 00103 myRecordPtr->myReadLength = 0; 00104 myRecordPtr->myMateReferenceID = -1; 00105 myRecordPtr->myMatePosition = -1; 00106 myRecordPtr->myInsertSize = 0; 00107 00108 // Set the sam values for the variable length fields. 00109 // TODO - one way to speed this up might be to not set to "*" and just 00110 // clear them, and write out a '*' for SAM if it is empty. 00111 myReadName = DEFAULT_READ_NAME; 00112 myReferenceName = "*"; 00113 myMateReferenceName = "*"; 00114 myCigar = "*"; 00115 mySequence = "*"; 00116 mySeqWithEq.clear(); 00117 mySeqWithoutEq.clear(); 00118 myQuality = "*"; 00119 myNeedToSetTagsFromBuffer = false; 00120 myNeedToSetTagsInBuffer = false; 00121 00122 // Initialize the calculated alignment info to the uncalculated value. 00123 myAlignmentLength = -1; 00124 myUnclippedStartOffset = -1; 00125 myUnclippedEndOffset = -1; 00126 00127 clearTags(); 00128 00129 // Set the bam values for the variable length fields. 00130 // Only the read name needs to be set, the others are a length of 0. 00131 // Set the read name. The min size of myRecordPtr includes the size for 00132 // the default read name. 00133 memcpy(&(myRecordPtr->myData), myReadName.c_str(), 00134 myRecordPtr->myReadNameLength); 00135 00136 // Set that the variable length buffer fields are valid. 00137 myIsReadNameBufferValid = true; 00138 myIsCigarBufferValid = true; 00139 myPackedSequence = 00140 (unsigned char *)myRecordPtr->myData + myRecordPtr->myReadNameLength + 00141 myRecordPtr->myCigarLength * sizeof(int); 00142 myIsSequenceBufferValid = true; 00143 myBufferSequenceTranslation = NONE; 00144 00145 myPackedQuality = myPackedSequence; 00146 myIsQualityBufferValid = true; 00147 myIsTagsBufferValid = true; 00148 myIsBinValid = true; 00149 00150 myCigarTempBufferLength = -1; 00151 00152 myStatus = SamStatus::SUCCESS; 00153 00154 NOT_FOUND_TAG_STRING = ""; 00155 NOT_FOUND_TAG_INT = -1; // TODO - deprecate 00156 } 00157 00158 00159 // Returns whether or not the record is valid. 00160 // Header is needed to perform some validation against it. 00161 bool SamRecord::isValid(SamFileHeader& header) 00162 { 00163 myStatus = SamStatus::SUCCESS; 00164 SamValidationErrors invalidSamErrors; 00165 if(!SamValidator::isValid(header, *this, invalidSamErrors)) 00166 { 00167 // The record is not valid. 00168 std::string errorMessage = ""; 00169 invalidSamErrors.getErrorString(errorMessage); 00170 myStatus.setStatus(SamStatus::INVALID, errorMessage.c_str()); 00171 return(false); 00172 } 00173 // The record is valid. 00174 return(true); 00175 } 00176 00177 00178 void SamRecord::setReference(GenomeSequence* reference) 00179 { 00180 myRefPtr = reference; 00181 } 00182 00183 00184 // Set the type of sequence translation to use when getting 00185 // the sequence. The default type (if this method is never called) is 00186 // NONE (the sequence is left as-is). This is used 00187 void SamRecord::setSequenceTranslation(SequenceTranslation translation) 00188 { 00189 mySequenceTranslation = translation; 00190 } 00191 00192 00193 bool SamRecord::setReadName(const char* readName) 00194 { 00195 myReadName = readName; 00196 myIsBufferSynced = false; 00197 myIsReadNameBufferValid = false; 00198 myStatus = SamStatus::SUCCESS; 00199 00200 // The read name must at least have some length, otherwise this is a parsing 00201 // error. 00202 if(myReadName.Length() == 0) 00203 { 00204 // Invalid - reset ReadName return false. 00205 myReadName = DEFAULT_READ_NAME; 00206 myRecordPtr->myReadNameLength = DEFAULT_READ_NAME_LENGTH; 00207 myStatus.setStatus(SamStatus::INVALID, "0 length Query Name."); 00208 return(false); 00209 } 00210 00211 return true; 00212 } 00213 00214 00215 bool SamRecord::setFlag(uint16_t flag) 00216 { 00217 myStatus = SamStatus::SUCCESS; 00218 myRecordPtr->myFlag = flag; 00219 return true; 00220 } 00221 00222 00223 bool SamRecord::setReferenceName(SamFileHeader& header, 00224 const char* referenceName) 00225 { 00226 myStatus = SamStatus::SUCCESS; 00227 00228 myReferenceName = referenceName; 00229 // If the reference ID does not already exist, add it (pass true) 00230 myRecordPtr->myReferenceID = header.getReferenceID(referenceName, true); 00231 00232 return true; 00233 } 00234 00235 00236 bool SamRecord::set1BasedPosition(int32_t position) 00237 { 00238 return(set0BasedPosition(position - 1)); 00239 } 00240 00241 00242 bool SamRecord::set0BasedPosition(int32_t position) 00243 { 00244 myStatus = SamStatus::SUCCESS; 00245 myRecordPtr->myPosition = position; 00246 myIsBinValid = false; 00247 return true; 00248 } 00249 00250 00251 bool SamRecord::setMapQuality(uint8_t mapQuality) 00252 { 00253 myStatus = SamStatus::SUCCESS; 00254 myRecordPtr->myMapQuality = mapQuality; 00255 return true; 00256 } 00257 00258 00259 bool SamRecord::setCigar(const char* cigar) 00260 { 00261 myStatus = SamStatus::SUCCESS; 00262 myCigar = cigar; 00263 00264 myIsBufferSynced = false; 00265 myIsCigarBufferValid = false; 00266 myCigarTempBufferLength = -1; 00267 myIsBinValid = false; 00268 00269 // Initialize the calculated alignment info to the uncalculated value. 00270 myAlignmentLength = -1; 00271 myUnclippedStartOffset = -1; 00272 myUnclippedEndOffset = -1; 00273 00274 return true; 00275 } 00276 00277 00278 bool SamRecord::setCigar(const Cigar& cigar) 00279 { 00280 myStatus = SamStatus::SUCCESS; 00281 cigar.getCigarString(myCigar); 00282 00283 myIsBufferSynced = false; 00284 myIsCigarBufferValid = false; 00285 myCigarTempBufferLength = -1; 00286 myIsBinValid = false; 00287 00288 // Initialize the calculated alignment info to the uncalculated value. 00289 myAlignmentLength = -1; 00290 myUnclippedStartOffset = -1; 00291 myUnclippedEndOffset = -1; 00292 00293 return true; 00294 } 00295 00296 00297 bool SamRecord::setMateReferenceName(SamFileHeader& header, 00298 const char* mateReferenceName) 00299 { 00300 myStatus = SamStatus::SUCCESS; 00301 // Set the mate reference, if it is "=", set it to be equal 00302 // to myReferenceName. This assumes that myReferenceName has already 00303 // been called. 00304 if(strcmp(mateReferenceName, FIELD_ABSENT_STRING) == 0) 00305 { 00306 myMateReferenceName = myReferenceName; 00307 } 00308 else 00309 { 00310 myMateReferenceName = mateReferenceName; 00311 } 00312 00313 // Set the Mate Reference ID. 00314 // If the reference ID does not already exist, add it (pass true) 00315 myRecordPtr->myMateReferenceID = 00316 header.getReferenceID(myMateReferenceName, true); 00317 00318 return true; 00319 } 00320 00321 00322 bool SamRecord::set1BasedMatePosition(int32_t matePosition) 00323 { 00324 return(set0BasedMatePosition(matePosition - 1)); 00325 } 00326 00327 00328 bool SamRecord::set0BasedMatePosition(int32_t matePosition) 00329 { 00330 myStatus = SamStatus::SUCCESS; 00331 myRecordPtr->myMatePosition = matePosition; 00332 return true; 00333 } 00334 00335 00336 bool SamRecord::setInsertSize(int32_t insertSize) 00337 { 00338 myStatus = SamStatus::SUCCESS; 00339 myRecordPtr->myInsertSize = insertSize; 00340 return true; 00341 } 00342 00343 00344 bool SamRecord::setSequence(const char* seq) 00345 { 00346 myStatus = SamStatus::SUCCESS; 00347 mySequence = seq; 00348 mySeqWithEq.clear(); 00349 mySeqWithoutEq.clear(); 00350 00351 myIsBufferSynced = false; 00352 myIsSequenceBufferValid = false; 00353 return true; 00354 } 00355 00356 00357 bool SamRecord::setQuality(const char* quality) 00358 { 00359 myStatus = SamStatus::SUCCESS; 00360 myQuality = quality; 00361 myIsBufferSynced = false; 00362 myIsQualityBufferValid = false; 00363 return true; 00364 } 00365 00366 00367 //Shift indels to the left 00368 bool SamRecord::shiftIndelsLeft() 00369 { 00370 // Check to see whether or not the Cigar has already been 00371 // set - this is determined by checking if alignment length 00372 // is set since alignment length and the cigar are set 00373 // at the same time. 00374 if(myAlignmentLength == -1) 00375 { 00376 // Not been set, so calculate it. 00377 parseCigar(); 00378 } 00379 00380 // Track whether or not there was a shift. 00381 bool shifted = false; 00382 00383 // Cigar is set, so now myCigarRoller can be used. 00384 // Track where in the read we are. 00385 uint32_t currentPos = 0; 00386 00387 // Since the loop starts at 1 because the first operation can't be shifted, 00388 // increment the currentPos past the first operation. 00389 if(Cigar::foundInQuery(myCigarRoller[0])) 00390 { 00391 // This op was found in the read, increment the current position. 00392 currentPos += myCigarRoller[0].count; 00393 } 00394 00395 int numOps = myCigarRoller.size(); 00396 00397 // Loop through the cigar operations from the 2nd operation since 00398 // the first operation is already on the end and can't shift. 00399 for(int currentOp = 1; currentOp < numOps; currentOp++) 00400 { 00401 if(myCigarRoller[currentOp].operation == Cigar::insert) 00402 { 00403 // For now, only shift a max of 1 operation. 00404 int prevOpIndex = currentOp-1; 00405 // Track the next op for seeing if it is the same as the 00406 // previous for merging reasons. 00407 int nextOpIndex = currentOp+1; 00408 if(nextOpIndex == numOps) 00409 { 00410 // There is no next op, so set it equal to the current one. 00411 nextOpIndex = currentOp; 00412 } 00413 // The start of the previous operation, so we know when we hit it 00414 // so we don't shift past it. 00415 uint32_t prevOpStart = 00416 currentPos - myCigarRoller[prevOpIndex].count; 00417 00418 // We can only shift if the previous operation 00419 if(!Cigar::isMatchOrMismatch(myCigarRoller[prevOpIndex])) 00420 { 00421 // TODO - shift past pads 00422 // An insert is in the read, so increment the position. 00423 currentPos += myCigarRoller[currentOp].count; 00424 // Not a match/mismatch, so can't shift into it. 00425 continue; 00426 } 00427 00428 // It is a match or mismatch, so check to see if we can 00429 // shift into it. 00430 00431 // The end of the insert is calculated by adding the size 00432 // of this insert minus 1 to the start of the insert. 00433 uint32_t insertEndPos = 00434 currentPos + myCigarRoller[currentOp].count - 1; 00435 00436 // The insert starts at the current position. 00437 uint32_t insertStartPos = currentPos; 00438 00439 // Loop as long as the position before the insert start 00440 // matches the last character in the insert. If they match, 00441 // the insert can be shifted one index left because the 00442 // implied reference will not change. If they do not match, 00443 // we can't shift because the implied reference would change. 00444 // Stop loop when insertStartPos = prevOpStart, because we 00445 // don't want to move past that. 00446 while((insertStartPos > prevOpStart) && 00447 (getSequence(insertEndPos,BASES) == 00448 getSequence(insertStartPos - 1, BASES))) 00449 { 00450 // We can shift, so move the insert start & end one left. 00451 --insertEndPos; 00452 --insertStartPos; 00453 } 00454 00455 // Determine if a shift has occurred. 00456 int shiftLen = currentPos - insertStartPos; 00457 if(shiftLen > 0) 00458 { 00459 // Shift occured, so adjust the cigar if the cigar will 00460 // not become more operations. 00461 // If the next operation is the same as the previous or 00462 // if the insert and the previous operation switch positions 00463 // then the cigar has the same number of operations. 00464 // If the next operation is different, and the shift splits 00465 // the previous operation in 2, then the cigar would 00466 // become longer, so we do not want to shift. 00467 if(myCigarRoller[nextOpIndex].operation == 00468 myCigarRoller[prevOpIndex].operation) 00469 { 00470 // The operations are the same, so merge them by adding 00471 // the length of the shift to the next operation. 00472 myCigarRoller.IncrementCount(nextOpIndex, shiftLen); 00473 myCigarRoller.IncrementCount(prevOpIndex, -shiftLen); 00474 00475 // If the previous op length is 0, just remove that 00476 // operation. 00477 if(myCigarRoller[prevOpIndex].count == 0) 00478 { 00479 myCigarRoller.Remove(prevOpIndex); 00480 } 00481 shifted = true; 00482 } 00483 else 00484 { 00485 // Can only shift if the insert shifts past the 00486 // entire previous operation, otherwise an operation 00487 // would need to be added. 00488 if(insertStartPos == prevOpStart) 00489 { 00490 // Swap the positions of the insert and the 00491 // previous operation. 00492 myCigarRoller.Update(currentOp, 00493 myCigarRoller[prevOpIndex].operation, 00494 myCigarRoller[prevOpIndex].count); 00495 // Size of the previous op is the entire 00496 // shift length. 00497 myCigarRoller.Update(prevOpIndex, 00498 Cigar::insert, 00499 shiftLen); 00500 shifted = true; 00501 } 00502 } 00503 } 00504 // An insert is in the read, so increment the position. 00505 currentPos += myCigarRoller[currentOp].count; 00506 } 00507 else if(Cigar::foundInQuery(myCigarRoller[currentOp])) 00508 { 00509 // This op was found in the read, increment the current position. 00510 currentPos += myCigarRoller[currentOp].count; 00511 } 00512 } 00513 if(shifted) 00514 { 00515 // TODO - setCigar is currently inefficient because later the cigar 00516 // roller will be recalculated, but for now it will work. 00517 setCigar(myCigarRoller); 00518 } 00519 return(shifted); 00520 } 00521 00522 00523 // Set the BAM record from the passeed in buffer of the specified size. 00524 // Note: The size includes the block size. 00525 SamStatus::Status SamRecord::setBuffer(const char* fromBuffer, 00526 uint32_t fromBufferSize, 00527 SamFileHeader& header) 00528 { 00529 myStatus = SamStatus::SUCCESS; 00530 if((fromBuffer == NULL) || (fromBufferSize == 0)) 00531 { 00532 // Buffer is empty. 00533 myStatus.setStatus(SamStatus::FAIL_PARSE, 00534 "Cannot parse an empty file."); 00535 return(SamStatus::FAIL_PARSE); 00536 } 00537 00538 // Clear the record. 00539 resetRecord(); 00540 00541 // allocate space for the record size. 00542 if(!allocateRecordStructure(fromBufferSize)) 00543 { 00544 // Failed to allocate space. 00545 return(SamStatus::FAIL_MEM); 00546 } 00547 00548 memcpy(myRecordPtr, fromBuffer, fromBufferSize); 00549 00550 setVariablesForNewBuffer(header); 00551 00552 // Return the status of the record. 00553 return(SamStatus::SUCCESS); 00554 } 00555 00556 00557 // Read the BAM record from a file. 00558 SamStatus::Status SamRecord::setBufferFromFile(IFILE filePtr, 00559 SamFileHeader& header) 00560 { 00561 myStatus = SamStatus::SUCCESS; 00562 if((filePtr == NULL) || (filePtr->isOpen() == false)) 00563 { 00564 // File is not open, return failure. 00565 myStatus.setStatus(SamStatus::FAIL_ORDER, 00566 "Can't read from an unopened file."); 00567 return(SamStatus::FAIL_ORDER); 00568 } 00569 00570 // Clear the record. 00571 resetRecord(); 00572 00573 // read the record size. 00574 int numBytes = 00575 ifread(filePtr, &(myRecordPtr->myBlockSize), sizeof(int32_t)); 00576 00577 // Check to see if the end of the file was hit and no bytes were read. 00578 if(ifeof(filePtr) && (numBytes == 0)) 00579 { 00580 // End of file, nothing was read, no more records. 00581 myStatus.setStatus(SamStatus::NO_MORE_RECS, 00582 "No more records left to read."); 00583 return(SamStatus::NO_MORE_RECS); 00584 } 00585 00586 if(numBytes != sizeof(int32_t)) 00587 { 00588 // Failed to read the entire block size. Either the end of the file 00589 // was reached early or there was an error. 00590 if(ifeof(filePtr)) 00591 { 00592 // Error: end of the file reached prior to reading the rest of the 00593 // record. 00594 myStatus.setStatus(SamStatus::FAIL_PARSE, 00595 "EOF reached in the middle of a record."); 00596 return(SamStatus::FAIL_PARSE); 00597 } 00598 else 00599 { 00600 // Error reading. 00601 myStatus.setStatus(SamStatus::FAIL_IO, 00602 "Failed to read the record size."); 00603 return(SamStatus::FAIL_IO); 00604 } 00605 } 00606 00607 // allocate space for the record size. 00608 if(!allocateRecordStructure(myRecordPtr->myBlockSize + sizeof(int32_t))) 00609 { 00610 // Failed to allocate space. 00611 // Status is set by allocateRecordStructure. 00612 return(SamStatus::FAIL_MEM); 00613 } 00614 00615 // Read the rest of the alignment block, starting at the reference id. 00616 if(ifread(filePtr, &(myRecordPtr->myReferenceID), myRecordPtr->myBlockSize) 00617 != (unsigned int)myRecordPtr->myBlockSize) 00618 { 00619 // Error reading the record. Reset it and return failure. 00620 resetRecord(); 00621 myStatus.setStatus(SamStatus::FAIL_IO, 00622 "Failed to read the record"); 00623 return(SamStatus::FAIL_IO); 00624 } 00625 00626 setVariablesForNewBuffer(header); 00627 00628 // Return the status of the record. 00629 return(SamStatus::SUCCESS); 00630 } 00631 00632 00633 // Add the specified tag to the record. 00634 // Returns true if the tag was successfully added, false otherwise. 00635 bool SamRecord::addIntTag(const char* tag, int32_t value) 00636 { 00637 myStatus = SamStatus::SUCCESS; 00638 int key = 0; 00639 int index = 0; 00640 char bamvtype; 00641 00642 int tagBufferSize = 0; 00643 00644 // First check to see if the tags need to be synced to the buffer. 00645 if(myNeedToSetTagsFromBuffer) 00646 { 00647 if(!setTagsFromBuffer()) 00648 { 00649 // Failed to read tags from the buffer, so cannot add new ones. 00650 return(false); 00651 } 00652 } 00653 00654 // Ints come in as int. But it can be represented in fewer bits. 00655 // So determine a more specific type that is in line with the 00656 // types for BAM files. 00657 // First check to see if it is a negative. 00658 if(value < 0) 00659 { 00660 // The int is negative, so it will need to use a signed type. 00661 // See if it is greater than the min value for a char. 00662 if(value > ((std::numeric_limits<char>::min)())) 00663 { 00664 // It can be stored in a signed char. 00665 bamvtype = 'c'; 00666 tagBufferSize += 4; 00667 } 00668 else if(value > ((std::numeric_limits<short>::min)())) 00669 { 00670 // It fits in a signed short. 00671 bamvtype = 's'; 00672 tagBufferSize += 5; 00673 } 00674 else 00675 { 00676 // Just store it as a signed int. 00677 bamvtype = 'i'; 00678 tagBufferSize += 7; 00679 } 00680 } 00681 else 00682 { 00683 // It is positive, so an unsigned type can be used. 00684 if(value < ((std::numeric_limits<unsigned char>::max)())) 00685 { 00686 // It is under the max of an unsigned char. 00687 bamvtype = 'C'; 00688 tagBufferSize += 4; 00689 } 00690 else if(value < ((std::numeric_limits<unsigned short>::max)())) 00691 { 00692 // It is under the max of an unsigned short. 00693 bamvtype = 'S'; 00694 tagBufferSize += 5; 00695 } 00696 else 00697 { 00698 // Just store it as an unsigned int. 00699 bamvtype = 'I'; 00700 tagBufferSize += 7; 00701 } 00702 } 00703 00704 // Check to see if the tag is already there. 00705 key = MAKEKEY(tag[0], tag[1], bamvtype); 00706 unsigned int hashIndex = extras.Find(key); 00707 if(hashIndex != LH_NOTFOUND) 00708 { 00709 // Tag was already found. 00710 index = extras[hashIndex]; 00711 00712 // Since the tagBufferSize was already updated with the new value, 00713 // subtract the size for the previous tag (even if they are the same). 00714 switch(intType[index]) 00715 { 00716 case 'c': 00717 case 'C': 00718 case 'A': 00719 tagBufferSize -= 4; 00720 break; 00721 case 's': 00722 case 'S': 00723 tagBufferSize -= 5; 00724 break; 00725 case 'i': 00726 case 'I': 00727 tagBufferSize -= 7; 00728 break; 00729 default: 00730 myStatus.setStatus(SamStatus::INVALID, 00731 "unknown tag inttype type found.\n"); 00732 return(false); 00733 } 00734 00735 // Tag already existed, print message about overwriting. 00736 // WARN about dropping duplicate tags. 00737 if(myNumWarns++ < myMaxWarns) 00738 { 00739 String newVal; 00740 String origVal; 00741 appendIntArrayValue(index, origVal); 00742 appendIntArrayValue(bamvtype, value, newVal); 00743 fprintf(stderr, "WARNING: Duplicate Tags, overwritting %c%c:%c:%s with %c%c:%c:%s\n", 00744 tag[0], tag[1], intType[index], origVal.c_str(), tag[0], tag[1], bamvtype, newVal.c_str()); 00745 if(myNumWarns == myMaxWarns) 00746 { 00747 fprintf(stderr, "Suppressing rest of Duplicate Tag warnings.\n"); 00748 } 00749 } 00750 00751 // Update the integer value and type. 00752 integers[index] = value; 00753 intType[index] = bamvtype; 00754 } 00755 else 00756 { 00757 // Tag is not already there, so add it. 00758 index = integers.Length(); 00759 00760 integers.Push(value); 00761 intType.push_back(bamvtype); 00762 00763 extras.Add(key, index); 00764 } 00765 00766 // The buffer tags are now out of sync. 00767 myNeedToSetTagsInBuffer = true; 00768 myIsTagsBufferValid = false; 00769 myIsBufferSynced = false; 00770 myTagBufferSize += tagBufferSize; 00771 00772 return(true); 00773 } 00774 00775 00776 // Add the specified tag to the record, replacing it if it is already there and 00777 // is different from the previous value. 00778 // Returns true if the tag was successfully added (or was already there), false otherwise. 00779 bool SamRecord::addTag(const char* tag, char vtype, const char* valuePtr) 00780 { 00781 if(vtype == 'i') 00782 { 00783 // integer type. Call addIntTag to handle it. 00784 int intVal = atoi(valuePtr); 00785 return(addIntTag(tag, intVal)); 00786 } 00787 00788 // Non-int type. 00789 myStatus = SamStatus::SUCCESS; 00790 bool status = true; // default to successful. 00791 int key = 0; 00792 int index = 0; 00793 00794 int tagBufferSize = 0; 00795 00796 // First check to see if the tags need to be synced to the buffer. 00797 if(myNeedToSetTagsFromBuffer) 00798 { 00799 if(!setTagsFromBuffer()) 00800 { 00801 // Failed to read tags from the buffer, so cannot add new ones. 00802 return(false); 00803 } 00804 } 00805 00806 // First check to see if the tag is already there. 00807 key = MAKEKEY(tag[0], tag[1], vtype); 00808 unsigned int hashIndex = extras.Find(key); 00809 if(hashIndex != LH_NOTFOUND) 00810 { 00811 // The key was found in the hash, so get the lookup index. 00812 index = extras[hashIndex]; 00813 00814 String origTag; 00815 char origType = vtype; 00816 00817 // Adjust the currently pointed to value to the new setting. 00818 switch (vtype) 00819 { 00820 case 'A' : 00821 // First check to see if the value changed. 00822 if((integers[index] == (const int)*(valuePtr)) && 00823 (intType[index] == vtype)) 00824 { 00825 // The value & type has not changed, so do nothing. 00826 return(true); 00827 } 00828 else 00829 { 00830 // Tag buffer size changes if type changes, so subtract & add. 00831 origType = intType[index]; 00832 appendIntArrayValue(index, origTag); 00833 tagBufferSize -= getNumericTagTypeSize(intType[index]); 00834 tagBufferSize += getNumericTagTypeSize(vtype); 00835 integers[index] = (const int)*(valuePtr); 00836 intType[index] = vtype; 00837 } 00838 break; 00839 case 'Z' : 00840 // First check to see if the value changed. 00841 if(strings[index] == valuePtr) 00842 { 00843 // The value has not changed, so do nothing. 00844 return(true); 00845 } 00846 else 00847 { 00848 // Adjust the tagBufferSize by removing the size of the old string. 00849 origTag = strings[index]; 00850 tagBufferSize -= strings[index].Length(); 00851 strings[index] = valuePtr; 00852 // Adjust the tagBufferSize by adding the size of the new string. 00853 tagBufferSize += strings[index].Length(); 00854 } 00855 break; 00856 case 'B' : 00857 // First check to see if the value changed. 00858 if(strings[index] == valuePtr) 00859 { 00860 // The value has not changed, so do nothing. 00861 return(true); 00862 } 00863 else 00864 { 00865 // Adjust the tagBufferSize by removing the size of the old field. 00866 origTag = strings[index]; 00867 tagBufferSize -= getBtagBufferSize(strings[index]); 00868 strings[index] = valuePtr; 00869 // Adjust the tagBufferSize by adding the size of the new field. 00870 tagBufferSize += getBtagBufferSize(strings[index]); 00871 } 00872 break; 00873 case 'f' : 00874 // First check to see if the value changed. 00875 if(floats[index] == (float)atof(valuePtr)) 00876 { 00877 // The value has not changed, so do nothing. 00878 return(true); 00879 } 00880 else 00881 { 00882 // Tag buffer size doesn't change between different 'f' entries. 00883 origTag.appendFullFloat(floats[index]); 00884 floats[index] = (float)atof(valuePtr); 00885 } 00886 break; 00887 default : 00888 fprintf(stderr, 00889 "samRecord::addTag() - Unknown custom field of type %c\n", 00890 vtype); 00891 myStatus.setStatus(SamStatus::FAIL_PARSE, 00892 "Unknown custom field in a tag"); 00893 status = false; 00894 break; 00895 } 00896 00897 // Duplicate tag in this record. 00898 // Tag already existed, print message about overwriting. 00899 // WARN about dropping duplicate tags. 00900 if(myNumWarns++ < myMaxWarns) 00901 { 00902 fprintf(stderr, "WARNING: Duplicate Tags, overwritting %c%c:%c:%s with %c%c:%c:%s\n", 00903 tag[0], tag[1], origType, origTag.c_str(), tag[0], tag[1], vtype, valuePtr); 00904 if(myNumWarns == myMaxWarns) 00905 { 00906 fprintf(stderr, "Suppressing rest of Duplicate Tag warnings.\n"); 00907 } 00908 } 00909 } 00910 else 00911 { 00912 // The key was not found in the hash, so add it. 00913 switch (vtype) 00914 { 00915 case 'A' : 00916 index = integers.Length(); 00917 integers.Push((const int)*(valuePtr)); 00918 intType.push_back(vtype); 00919 tagBufferSize += 4; 00920 break; 00921 case 'Z' : 00922 index = strings.Length(); 00923 strings.Push(valuePtr); 00924 tagBufferSize += 4 + strings.Last().Length(); 00925 break; 00926 case 'B' : 00927 index = strings.Length(); 00928 strings.Push(valuePtr); 00929 tagBufferSize += 3 + getBtagBufferSize(strings[index]); 00930 break; 00931 case 'f' : 00932 index = floats.size(); 00933 floats.push_back((float)atof(valuePtr)); 00934 tagBufferSize += 7; 00935 break; 00936 default : 00937 fprintf(stderr, 00938 "samRecord::addTag() - Unknown custom field of type %c\n", 00939 vtype); 00940 myStatus.setStatus(SamStatus::FAIL_PARSE, 00941 "Unknown custom field in a tag"); 00942 status = false; 00943 break; 00944 } 00945 if(status) 00946 { 00947 // If successful, add the key to extras. 00948 extras.Add(key, index); 00949 } 00950 } 00951 00952 // Only add the tag if it has so far been successfully processed. 00953 if(status) 00954 { 00955 // The buffer tags are now out of sync. 00956 myNeedToSetTagsInBuffer = true; 00957 myIsTagsBufferValid = false; 00958 myIsBufferSynced = false; 00959 myTagBufferSize += tagBufferSize; 00960 } 00961 return(status); 00962 } 00963 00964 00965 void SamRecord::clearTags() 00966 { 00967 if(extras.Entries() != 0) 00968 { 00969 extras.Clear(); 00970 } 00971 strings.Clear(); 00972 integers.Clear(); 00973 intType.clear(); 00974 floats.clear(); 00975 myTagBufferSize = 0; 00976 resetTagIter(); 00977 } 00978 00979 00980 bool SamRecord::rmTag(const char* tag, char type) 00981 { 00982 // Check the length of tag. 00983 if(strlen(tag) != 2) 00984 { 00985 // Tag is the wrong length. 00986 myStatus.setStatus(SamStatus::INVALID, 00987 "rmTag called with tag that is not 2 characters\n"); 00988 return(false); 00989 } 00990 00991 myStatus = SamStatus::SUCCESS; 00992 if(myNeedToSetTagsFromBuffer) 00993 { 00994 if(!setTagsFromBuffer()) 00995 { 00996 // Failed to read the tags from the buffer, so cannot 00997 // get tags. 00998 return(false); 00999 } 01000 } 01001 01002 // Construct the key. 01003 int key = MAKEKEY(tag[0], tag[1], type); 01004 // Look to see if the key exsists in the hash. 01005 int offset = extras.Find(key); 01006 01007 if(offset < 0) 01008 { 01009 // Not found, so return true, successfully removed since 01010 // it is not in tag. 01011 return(true); 01012 } 01013 01014 // Offset is set, so the key was found. 01015 // First if it is an integer, determine the actual type of the int. 01016 char vtype; 01017 getTypeFromKey(key, vtype); 01018 if(vtype == 'i') 01019 { 01020 vtype = getIntegerType(offset); 01021 } 01022 01023 // Offset is set, so recalculate the buffer size without this entry. 01024 // Do NOT remove from strings, integers, or floats because then 01025 // extras would need to be updated for all entries with the new indexes 01026 // into those variables. 01027 int rmBuffSize = 0; 01028 switch(vtype) 01029 { 01030 case 'A': 01031 case 'c': 01032 case 'C': 01033 rmBuffSize = 4; 01034 break; 01035 case 's': 01036 case 'S': 01037 rmBuffSize = 5; 01038 break; 01039 case 'i': 01040 case 'I': 01041 rmBuffSize = 7; 01042 break; 01043 case 'f': 01044 rmBuffSize = 7; 01045 break; 01046 case 'Z': 01047 rmBuffSize = 4 + getString(offset).Length(); 01048 break; 01049 case 'B': 01050 rmBuffSize = 3 + getBtagBufferSize(getString(offset)); 01051 break; 01052 default: 01053 myStatus.setStatus(SamStatus::INVALID, 01054 "rmTag called with unknown type.\n"); 01055 return(false); 01056 break; 01057 }; 01058 01059 // The buffer tags are now out of sync. 01060 myNeedToSetTagsInBuffer = true; 01061 myIsTagsBufferValid = false; 01062 myIsBufferSynced = false; 01063 myTagBufferSize -= rmBuffSize; 01064 01065 // Remove from the hash. 01066 extras.Delete(offset); 01067 return(true); 01068 } 01069 01070 01071 bool SamRecord::rmTags(const char* tags) 01072 { 01073 const char* currentTagPtr = tags; 01074 01075 myStatus = SamStatus::SUCCESS; 01076 if(myNeedToSetTagsFromBuffer) 01077 { 01078 if(!setTagsFromBuffer()) 01079 { 01080 // Failed to read the tags from the buffer, so cannot 01081 // get tags. 01082 return(false); 01083 } 01084 } 01085 01086 bool returnStatus = true; 01087 01088 int rmBuffSize = 0; 01089 while(*currentTagPtr != '\0') 01090 { 01091 01092 // Tags are formatted as: XY:Z 01093 // Where X is [A-Za-z], Y is [A-Za-z], and 01094 // Z is A,i,f,Z,H (cCsSI are also excepted) 01095 if((currentTagPtr[0] == '\0') || (currentTagPtr[1] == '\0') || 01096 (currentTagPtr[2] != ':') || (currentTagPtr[3] == '\0')) 01097 { 01098 myStatus.setStatus(SamStatus::INVALID, 01099 "rmTags called with improperly formatted tags.\n"); 01100 returnStatus = false; 01101 break; 01102 } 01103 01104 // Construct the key. 01105 int key = MAKEKEY(currentTagPtr[0], currentTagPtr[1], 01106 currentTagPtr[3]); 01107 // Look to see if the key exsists in the hash. 01108 int offset = extras.Find(key); 01109 01110 if(offset >= 0) 01111 { 01112 // Offset is set, so the key was found. 01113 // First if it is an integer, determine the actual type of the int. 01114 char vtype; 01115 getTypeFromKey(key, vtype); 01116 if(vtype == 'i') 01117 { 01118 vtype = getIntegerType(offset); 01119 } 01120 01121 // Offset is set, so recalculate the buffer size without this entry. 01122 // Do NOT remove from strings, integers, or floats because then 01123 // extras would need to be updated for all entries with the new indexes 01124 // into those variables. 01125 switch(vtype) 01126 { 01127 case 'A': 01128 case 'c': 01129 case 'C': 01130 rmBuffSize += 4; 01131 break; 01132 case 's': 01133 case 'S': 01134 rmBuffSize += 5; 01135 break; 01136 case 'i': 01137 case 'I': 01138 rmBuffSize += 7; 01139 break; 01140 case 'f': 01141 rmBuffSize += 7; 01142 break; 01143 case 'Z': 01144 rmBuffSize += 4 + getString(offset).Length(); 01145 break; 01146 case 'B': 01147 rmBuffSize += 3 + getBtagBufferSize(getString(offset)); 01148 break; 01149 default: 01150 myStatus.setStatus(SamStatus::INVALID, 01151 "rmTag called with unknown type.\n"); 01152 returnStatus = false; 01153 break; 01154 }; 01155 01156 // Remove from the hash. 01157 extras.Delete(offset); 01158 } 01159 // Increment to the next tag. 01160 if((currentTagPtr[4] == ';') || (currentTagPtr[4] == ',')) 01161 { 01162 // Increment once more. 01163 currentTagPtr += 5; 01164 } 01165 else if(currentTagPtr[4] != '\0') 01166 { 01167 // Invalid tag format. 01168 myStatus.setStatus(SamStatus::INVALID, 01169 "rmTags called with improperly formatted tags.\n"); 01170 returnStatus = false; 01171 break; 01172 } 01173 else 01174 { 01175 // Last Tag. 01176 currentTagPtr += 4; 01177 } 01178 } 01179 01180 // The buffer tags are now out of sync. 01181 myNeedToSetTagsInBuffer = true; 01182 myIsTagsBufferValid = false; 01183 myIsBufferSynced = false; 01184 myTagBufferSize -= rmBuffSize; 01185 01186 01187 return(returnStatus); 01188 } 01189 01190 01191 // Get methods for record fields. 01192 const void* SamRecord::getRecordBuffer() 01193 { 01194 return(getRecordBuffer(mySequenceTranslation)); 01195 } 01196 01197 01198 // Get methods for record fields. 01199 const void* SamRecord::getRecordBuffer(SequenceTranslation translation) 01200 { 01201 myStatus = SamStatus::SUCCESS; 01202 bool status = true; 01203 // If the buffer is not synced or the sequence in the buffer is not 01204 // properly translated, fix the buffer. 01205 if((myIsBufferSynced == false) || 01206 (myBufferSequenceTranslation != translation)) 01207 { 01208 status &= fixBuffer(translation); 01209 } 01210 // If the buffer is synced, check to see if the tags need to be synced. 01211 if(myNeedToSetTagsInBuffer) 01212 { 01213 status &= setTagsInBuffer(); 01214 } 01215 if(!status) 01216 { 01217 return(NULL); 01218 } 01219 return (const void *)myRecordPtr; 01220 } 01221 01222 01223 // Write the record as a buffer into the file using the class's 01224 // sequence translation setting. 01225 SamStatus::Status SamRecord::writeRecordBuffer(IFILE filePtr) 01226 { 01227 return(writeRecordBuffer(filePtr, mySequenceTranslation)); 01228 } 01229 01230 01231 // Write the record as a buffer into the file using the specified translation. 01232 SamStatus::Status SamRecord::writeRecordBuffer(IFILE filePtr, 01233 SequenceTranslation translation) 01234 { 01235 myStatus = SamStatus::SUCCESS; 01236 if((filePtr == NULL) || (filePtr->isOpen() == false)) 01237 { 01238 // File is not open, return failure. 01239 myStatus.setStatus(SamStatus::FAIL_ORDER, 01240 "Can't write to an unopened file."); 01241 return(SamStatus::FAIL_ORDER); 01242 } 01243 01244 if((myIsBufferSynced == false) || 01245 (myBufferSequenceTranslation != translation)) 01246 { 01247 if(!fixBuffer(translation)) 01248 { 01249 return(myStatus.getStatus()); 01250 } 01251 } 01252 01253 // Write the record. 01254 unsigned int numBytesToWrite = myRecordPtr->myBlockSize + sizeof(int32_t); 01255 unsigned int numBytesWritten = 01256 ifwrite(filePtr, myRecordPtr, numBytesToWrite); 01257 01258 // Return status based on if the correct number of bytes were written. 01259 if(numBytesToWrite == numBytesWritten) 01260 { 01261 return(SamStatus::SUCCESS); 01262 } 01263 // The correct number of bytes were not written. 01264 myStatus.setStatus(SamStatus::FAIL_IO, "Failed to write the entire record."); 01265 return(SamStatus::FAIL_IO); 01266 } 01267 01268 01269 int32_t SamRecord::getBlockSize() 01270 { 01271 myStatus = SamStatus::SUCCESS; 01272 // If the buffer isn't synced, sync the buffer to determine the 01273 // block size. 01274 if(myIsBufferSynced == false) 01275 { 01276 // Since this just returns the block size, the translation of 01277 // the sequence does not matter, so just use the currently set 01278 // value. 01279 fixBuffer(myBufferSequenceTranslation); 01280 } 01281 return myRecordPtr->myBlockSize; 01282 } 01283 01284 01285 // This method returns the reference name. 01286 const char* SamRecord::getReferenceName() 01287 { 01288 myStatus = SamStatus::SUCCESS; 01289 return myReferenceName.c_str(); 01290 } 01291 01292 01293 int32_t SamRecord::getReferenceID() 01294 { 01295 myStatus = SamStatus::SUCCESS; 01296 return myRecordPtr->myReferenceID; 01297 } 01298 01299 01300 int32_t SamRecord::get1BasedPosition() 01301 { 01302 myStatus = SamStatus::SUCCESS; 01303 return (myRecordPtr->myPosition + 1); 01304 } 01305 01306 01307 int32_t SamRecord::get0BasedPosition() 01308 { 01309 myStatus = SamStatus::SUCCESS; 01310 return myRecordPtr->myPosition; 01311 } 01312 01313 01314 uint8_t SamRecord::getReadNameLength() 01315 { 01316 myStatus = SamStatus::SUCCESS; 01317 // If the buffer is valid, return the size from there, otherwise get the 01318 // size from the string length + 1 (ending null). 01319 if(myIsReadNameBufferValid) 01320 { 01321 return(myRecordPtr->myReadNameLength); 01322 } 01323 01324 return(myReadName.Length() + 1); 01325 } 01326 01327 01328 uint8_t SamRecord::getMapQuality() 01329 { 01330 myStatus = SamStatus::SUCCESS; 01331 return myRecordPtr->myMapQuality; 01332 } 01333 01334 01335 uint16_t SamRecord::getBin() 01336 { 01337 myStatus = SamStatus::SUCCESS; 01338 if(!myIsBinValid) 01339 { 01340 // The bin that is set in the record is not valid, so 01341 // reset it. 01342 myRecordPtr->myBin = 01343 bam_reg2bin(myRecordPtr->myPosition, get1BasedAlignmentEnd()); 01344 myIsBinValid = true; 01345 } 01346 return(myRecordPtr->myBin); 01347 } 01348 01349 01350 uint16_t SamRecord::getCigarLength() 01351 { 01352 myStatus = SamStatus::SUCCESS; 01353 // If the cigar buffer is valid 01354 // then get the length from there. 01355 if(myIsCigarBufferValid) 01356 { 01357 return myRecordPtr->myCigarLength; 01358 } 01359 01360 if(myCigarTempBufferLength == -1) 01361 { 01362 // The cigar buffer is not valid and the cigar temp buffer is not set, 01363 // so parse the string. 01364 parseCigarString(); 01365 } 01366 01367 // The temp buffer is now set, so return the size. 01368 return(myCigarTempBufferLength); 01369 } 01370 01371 01372 uint16_t SamRecord::getFlag() 01373 { 01374 myStatus = SamStatus::SUCCESS; 01375 return myRecordPtr->myFlag; 01376 } 01377 01378 01379 int32_t SamRecord::getReadLength() 01380 { 01381 myStatus = SamStatus::SUCCESS; 01382 if(myIsSequenceBufferValid == false) 01383 { 01384 // If the sequence is "*", then return 0. 01385 if((mySequence.Length() == 1) && (mySequence[0] == '*')) 01386 { 01387 return(0); 01388 } 01389 // Do not add 1 since it is not null terminated. 01390 return(mySequence.Length()); 01391 } 01392 return(myRecordPtr->myReadLength); 01393 } 01394 01395 01396 // This method returns the mate reference name. If it is equal to the 01397 // reference name, it still returns the reference name. 01398 const char* SamRecord::getMateReferenceName() 01399 { 01400 myStatus = SamStatus::SUCCESS; 01401 return myMateReferenceName.c_str(); 01402 } 01403 01404 01405 // This method returns the mate reference name. If it is equal to the 01406 // reference name, it returns "=", unless they are both "*" in which case 01407 // "*" is returned. 01408 const char* SamRecord::getMateReferenceNameOrEqual() 01409 { 01410 myStatus = SamStatus::SUCCESS; 01411 if(myMateReferenceName == "*") 01412 { 01413 return(myMateReferenceName); 01414 } 01415 if(myMateReferenceName == getReferenceName()) 01416 { 01417 return(FIELD_ABSENT_STRING); 01418 } 01419 else 01420 { 01421 return(myMateReferenceName); 01422 } 01423 } 01424 01425 01426 int32_t SamRecord::getMateReferenceID() 01427 { 01428 myStatus = SamStatus::SUCCESS; 01429 return myRecordPtr->myMateReferenceID; 01430 } 01431 01432 01433 int32_t SamRecord::get1BasedMatePosition() 01434 { 01435 myStatus = SamStatus::SUCCESS; 01436 return (myRecordPtr->myMatePosition + 1); 01437 } 01438 01439 01440 int32_t SamRecord::get0BasedMatePosition() 01441 { 01442 myStatus = SamStatus::SUCCESS; 01443 return myRecordPtr->myMatePosition; 01444 } 01445 01446 01447 int32_t SamRecord::getInsertSize() 01448 { 01449 myStatus = SamStatus::SUCCESS; 01450 return myRecordPtr->myInsertSize; 01451 } 01452 01453 01454 // Returns the inclusive rightmost position of the clipped sequence. 01455 int32_t SamRecord::get0BasedAlignmentEnd() 01456 { 01457 myStatus = SamStatus::SUCCESS; 01458 if(myAlignmentLength == -1) 01459 { 01460 // Alignment end has not been set, so calculate it. 01461 parseCigar(); 01462 } 01463 // If alignment length > 0, subtract 1 from it to get the end. 01464 if(myAlignmentLength == 0) 01465 { 01466 // Length is 0, just return the start position. 01467 return(myRecordPtr->myPosition); 01468 } 01469 return(myRecordPtr->myPosition + myAlignmentLength - 1); 01470 } 01471 01472 01473 // Returns the inclusive rightmost position of the clipped sequence. 01474 int32_t SamRecord::get1BasedAlignmentEnd() 01475 { 01476 return(get0BasedAlignmentEnd() + 1); 01477 } 01478 01479 01480 // Return the length of the alignment. 01481 int32_t SamRecord::getAlignmentLength() 01482 { 01483 myStatus = SamStatus::SUCCESS; 01484 if(myAlignmentLength == -1) 01485 { 01486 // Alignment end has not been set, so calculate it. 01487 parseCigar(); 01488 } 01489 // Return the alignment length. 01490 return(myAlignmentLength); 01491 } 01492 01493 // Returns the inclusive left-most position adjust for clipped bases. 01494 int32_t SamRecord::get0BasedUnclippedStart() 01495 { 01496 myStatus = SamStatus::SUCCESS; 01497 if(myUnclippedStartOffset == -1) 01498 { 01499 // Unclipped has not yet been calculated, so parse the cigar to get it 01500 parseCigar(); 01501 } 01502 return(myRecordPtr->myPosition - myUnclippedStartOffset); 01503 } 01504 01505 01506 // Returns the inclusive left-most position adjust for clipped bases. 01507 int32_t SamRecord::get1BasedUnclippedStart() 01508 { 01509 return(get0BasedUnclippedStart() + 1); 01510 } 01511 01512 01513 // Returns the inclusive right-most position adjust for clipped bases. 01514 int32_t SamRecord::get0BasedUnclippedEnd() 01515 { 01516 // myUnclippedEndOffset will be set by get0BasedAlignmentEnd if the 01517 // cigar has not yet been parsed, so no need to check it here. 01518 return(get0BasedAlignmentEnd() + myUnclippedEndOffset); 01519 } 01520 01521 01522 // Returns the inclusive right-most position adjust for clipped bases. 01523 int32_t SamRecord::get1BasedUnclippedEnd() 01524 { 01525 return(get0BasedUnclippedEnd() + 1); 01526 } 01527 01528 01529 // Get the read name. 01530 const char* SamRecord::getReadName() 01531 { 01532 myStatus = SamStatus::SUCCESS; 01533 if(myReadName.Length() == 0) 01534 { 01535 // 0 Length, means that it is in the buffer, but has not yet 01536 // been synced to the string, so do the sync. 01537 myReadName = (char*)&(myRecordPtr->myData); 01538 } 01539 return myReadName.c_str(); 01540 } 01541 01542 01543 const char* SamRecord::getCigar() 01544 { 01545 myStatus = SamStatus::SUCCESS; 01546 if(myCigar.Length() == 0) 01547 { 01548 // 0 Length, means that it is in the buffer, but has not yet 01549 // been synced to the string, so do the sync. 01550 parseCigarBinary(); 01551 } 01552 return myCigar.c_str(); 01553 } 01554 01555 01556 const char* SamRecord::getSequence() 01557 { 01558 return(getSequence(mySequenceTranslation)); 01559 } 01560 01561 01562 const char* SamRecord::getSequence(SequenceTranslation translation) 01563 { 01564 myStatus = SamStatus::SUCCESS; 01565 if(mySequence.Length() == 0) 01566 { 01567 // 0 Length, means that it is in the buffer, but has not yet 01568 // been synced to the string, so do the sync. 01569 setSequenceAndQualityFromBuffer(); 01570 } 01571 01572 // Determine if translation needs to be done. 01573 if((translation == NONE) || (myRefPtr == NULL)) 01574 { 01575 return mySequence.c_str(); 01576 } 01577 else if(translation == EQUAL) 01578 { 01579 if(mySeqWithEq.length() == 0) 01580 { 01581 // Check to see if the sequence is defined. 01582 if(mySequence == "*") 01583 { 01584 // Sequence is undefined, so no translation necessary. 01585 mySeqWithEq = '*'; 01586 } 01587 else 01588 { 01589 // Sequence defined, so translate it. 01590 SamQuerySeqWithRef::seqWithEquals(mySequence.c_str(), 01591 myRecordPtr->myPosition, 01592 *(getCigarInfo()), 01593 getReferenceName(), 01594 *myRefPtr, 01595 mySeqWithEq); 01596 } 01597 } 01598 return(mySeqWithEq.c_str()); 01599 } 01600 else 01601 { 01602 // translation == BASES 01603 if(mySeqWithoutEq.length() == 0) 01604 { 01605 if(mySequence == "*") 01606 { 01607 // Sequence is undefined, so no translation necessary. 01608 mySeqWithoutEq = '*'; 01609 } 01610 else 01611 { 01612 // Sequence defined, so translate it. 01613 SamQuerySeqWithRef::seqWithoutEquals(mySequence.c_str(), 01614 myRecordPtr->myPosition, 01615 *(getCigarInfo()), 01616 getReferenceName(), 01617 *myRefPtr, 01618 mySeqWithoutEq); 01619 } 01620 } 01621 return(mySeqWithoutEq.c_str()); 01622 } 01623 } 01624 01625 01626 const char* SamRecord::getQuality() 01627 { 01628 myStatus = SamStatus::SUCCESS; 01629 if(myQuality.Length() == 0) 01630 { 01631 // 0 Length, means that it is in the buffer, but has not yet 01632 // been synced to the string, so do the sync. 01633 setSequenceAndQualityFromBuffer(); 01634 } 01635 return myQuality.c_str(); 01636 } 01637 01638 01639 char SamRecord::getSequence(int index) 01640 { 01641 return(getSequence(index, mySequenceTranslation)); 01642 } 01643 01644 01645 char SamRecord::getSequence(int index, SequenceTranslation translation) 01646 { 01647 static const char * asciiBases = "=AC.G...T......N"; 01648 01649 // Determine the read length. 01650 int32_t readLen = getReadLength(); 01651 01652 // If the read length is 0, this method should not be called. 01653 if(readLen == 0) 01654 { 01655 String exceptionString = "SamRecord::getSequence("; 01656 exceptionString += index; 01657 exceptionString += ") is not allowed since sequence = '*'"; 01658 throw std::runtime_error(exceptionString.c_str()); 01659 } 01660 else if((index < 0) || (index >= readLen)) 01661 { 01662 // Only get here if the index was out of range, so thow an exception. 01663 String exceptionString = "SamRecord::getSequence("; 01664 exceptionString += index; 01665 exceptionString += ") is out of range. Index must be between 0 and "; 01666 exceptionString += (readLen - 1); 01667 throw std::runtime_error(exceptionString.c_str()); 01668 } 01669 01670 // Determine if translation needs to be done. 01671 if((translation == NONE) || (myRefPtr == NULL)) 01672 { 01673 // No translation needs to be done. 01674 if(mySequence.Length() == 0) 01675 { 01676 // Parse BAM sequence. 01677 if(myIsSequenceBufferValid) 01678 { 01679 return(index & 1 ? 01680 asciiBases[myPackedSequence[index / 2] & 0xF] : 01681 asciiBases[myPackedSequence[index / 2] >> 4]); 01682 } 01683 else 01684 { 01685 String exceptionString = "SamRecord::getSequence("; 01686 exceptionString += index; 01687 exceptionString += ") called with no sequence set"; 01688 throw std::runtime_error(exceptionString.c_str()); 01689 } 01690 } 01691 // Already have string. 01692 return(mySequence[index]); 01693 } 01694 else 01695 { 01696 // Need to translate the sequence either to have '=' or to not 01697 // have it. 01698 // First check to see if the sequence has been set. 01699 if(mySequence.Length() == 0) 01700 { 01701 // 0 Length, means that it is in the buffer, but has not yet 01702 // been synced to the string, so do the sync. 01703 setSequenceAndQualityFromBuffer(); 01704 } 01705 01706 // Check the type of translation. 01707 if(translation == EQUAL) 01708 { 01709 // Check whether or not the string has already been 01710 // retrieved that has the '=' in it. 01711 if(mySeqWithEq.length() == 0) 01712 { 01713 // The string with '=' has not yet been determined, 01714 // so get the string. 01715 // Check to see if the sequence is defined. 01716 if(mySequence == "*") 01717 { 01718 // Sequence is undefined, so no translation necessary. 01719 mySeqWithEq = '*'; 01720 } 01721 else 01722 { 01723 // Sequence defined, so translate it. 01724 SamQuerySeqWithRef::seqWithEquals(mySequence.c_str(), 01725 myRecordPtr->myPosition, 01726 *(getCigarInfo()), 01727 getReferenceName(), 01728 *myRefPtr, 01729 mySeqWithEq); 01730 } 01731 } 01732 // Sequence is set, so return it. 01733 return(mySeqWithEq[index]); 01734 } 01735 else 01736 { 01737 // translation == BASES 01738 // Check whether or not the string has already been 01739 // retrieved that does not have the '=' in it. 01740 if(mySeqWithoutEq.length() == 0) 01741 { 01742 // The string with '=' has not yet been determined, 01743 // so get the string. 01744 // Check to see if the sequence is defined. 01745 if(mySequence == "*") 01746 { 01747 // Sequence is undefined, so no translation necessary. 01748 mySeqWithoutEq = '*'; 01749 } 01750 else 01751 { 01752 // Sequence defined, so translate it. 01753 // The string without '=' has not yet been determined, 01754 // so get the string. 01755 SamQuerySeqWithRef::seqWithoutEquals(mySequence.c_str(), 01756 myRecordPtr->myPosition, 01757 *(getCigarInfo()), 01758 getReferenceName(), 01759 *myRefPtr, 01760 mySeqWithoutEq); 01761 } 01762 } 01763 // Sequence is set, so return it. 01764 return(mySeqWithoutEq[index]); 01765 } 01766 } 01767 } 01768 01769 01770 char SamRecord::getQuality(int index) 01771 { 01772 // Determine the read length. 01773 int32_t readLen = getReadLength(); 01774 01775 // If the read length is 0, return ' ' whose ascii code is below 01776 // the minimum ascii code for qualities. 01777 if(readLen == 0) 01778 { 01779 return(BaseUtilities::UNKNOWN_QUALITY_CHAR); 01780 } 01781 else if((index < 0) || (index >= readLen)) 01782 { 01783 // Only get here if the index was out of range, so thow an exception. 01784 String exceptionString = "SamRecord::getQuality("; 01785 exceptionString += index; 01786 exceptionString += ") is out of range. Index must be between 0 and "; 01787 exceptionString += (readLen - 1); 01788 throw std::runtime_error(exceptionString.c_str()); 01789 } 01790 01791 if(myQuality.Length() == 0) 01792 { 01793 // Parse BAM Quality. 01794 // Know that myPackedQuality is correct since readLen != 0. 01795 return(myPackedQuality[index] + 33); 01796 } 01797 else 01798 { 01799 // Already have string. 01800 if((myQuality.Length() == 1) && (myQuality[0] == '*')) 01801 { 01802 // Return the unknown quality character. 01803 return(BaseUtilities::UNKNOWN_QUALITY_CHAR); 01804 } 01805 else if(index >= myQuality.Length()) 01806 { 01807 // Only get here if the index was out of range, so thow an exception. 01808 // Technically the myQuality string is not guaranteed to be the same length 01809 // as the sequence, so this catches that error. 01810 String exceptionString = "SamRecord::getQuality("; 01811 exceptionString += index; 01812 exceptionString += ") is out of range. Index must be between 0 and "; 01813 exceptionString += (myQuality.Length() - 1); 01814 throw std::runtime_error(exceptionString.c_str()); 01815 } 01816 else 01817 { 01818 return(myQuality[index]); 01819 } 01820 } 01821 } 01822 01823 01824 Cigar* SamRecord::getCigarInfo() 01825 { 01826 // Check to see whether or not the Cigar has already been 01827 // set - this is determined by checking if alignment length 01828 // is set since alignment length and the cigar are set 01829 // at the same time. 01830 if(myAlignmentLength == -1) 01831 { 01832 // Not been set, so calculate it. 01833 parseCigar(); 01834 } 01835 return(&myCigarRoller); 01836 } 01837 01838 01839 // Return the number of bases in this read that overlap the passed in 01840 // region. (start & end are 0-based) 01841 uint32_t SamRecord::getNumOverlaps(int32_t start, int32_t end) 01842 { 01843 // Determine whether or not the cigar has been parsed, which sets up 01844 // the cigar roller. This is determined by checking the alignment length. 01845 if(myAlignmentLength == -1) 01846 { 01847 parseCigar(); 01848 } 01849 return(myCigarRoller.getNumOverlaps(start, end, get0BasedPosition())); 01850 } 01851 01852 01853 // Returns the values of all fields except the tags. 01854 bool SamRecord::getFields(bamRecordStruct& recStruct, String& readName, 01855 String& cigar, String& sequence, String& quality) 01856 { 01857 return(getFields(recStruct, readName, cigar, sequence, quality, 01858 mySequenceTranslation)); 01859 } 01860 01861 01862 // Returns the values of all fields except the tags. 01863 bool SamRecord::getFields(bamRecordStruct& recStruct, String& readName, 01864 String& cigar, String& sequence, String& quality, 01865 SequenceTranslation translation) 01866 { 01867 myStatus = SamStatus::SUCCESS; 01868 if(myIsBufferSynced == false) 01869 { 01870 if(!fixBuffer(translation)) 01871 { 01872 // failed to set the buffer, return false. 01873 return(false); 01874 } 01875 } 01876 memcpy(&recStruct, myRecordPtr, sizeof(bamRecordStruct)); 01877 01878 readName = getReadName(); 01879 // Check the status. 01880 if(myStatus != SamStatus::SUCCESS) 01881 { 01882 // Failed to set the fields, return false. 01883 return(false); 01884 } 01885 cigar = getCigar(); 01886 // Check the status. 01887 if(myStatus != SamStatus::SUCCESS) 01888 { 01889 // Failed to set the fields, return false. 01890 return(false); 01891 } 01892 sequence = getSequence(translation); 01893 // Check the status. 01894 if(myStatus != SamStatus::SUCCESS) 01895 { 01896 // Failed to set the fields, return false. 01897 return(false); 01898 } 01899 quality = getQuality(); 01900 // Check the status. 01901 if(myStatus != SamStatus::SUCCESS) 01902 { 01903 // Failed to set the fields, return false. 01904 return(false); 01905 } 01906 return(true); 01907 } 01908 01909 01910 // Returns the reference pointer. 01911 GenomeSequence* SamRecord::getReference() 01912 { 01913 return(myRefPtr); 01914 } 01915 01916 01917 uint32_t SamRecord::getTagLength() 01918 { 01919 myStatus = SamStatus::SUCCESS; 01920 if(myNeedToSetTagsFromBuffer) 01921 { 01922 // Tags are only set in the buffer, so the size of the tags is 01923 // the length of the record minus the starting location of the tags. 01924 unsigned char * tagStart = 01925 (unsigned char *)myRecordPtr->myData 01926 + myRecordPtr->myReadNameLength 01927 + myRecordPtr->myCigarLength * sizeof(int) 01928 + (myRecordPtr->myReadLength + 1) / 2 + myRecordPtr->myReadLength; 01929 01930 // The non-tags take up from the start of the record to the tag start. 01931 // Do not include the block size part of the record since it is not 01932 // included in the size. 01933 uint32_t nonTagSize = 01934 tagStart - (unsigned char*)&(myRecordPtr->myReferenceID); 01935 // Tags take up the size of the block minus the non-tag section. 01936 uint32_t tagSize = myRecordPtr->myBlockSize - nonTagSize; 01937 return(tagSize); 01938 } 01939 01940 // Tags are stored outside the buffer, so myTagBufferSize is set. 01941 return(myTagBufferSize); 01942 } 01943 01944 01945 // Returns true if there is another tag and sets tag and vtype to the 01946 // appropriate values, and returns a pointer to the value. 01947 // Sets the Status to SUCCESS when a tag is successfully returned or 01948 // when there are no more tags. Otherwise the status is set to describe 01949 // why it failed (parsing, etc). 01950 bool SamRecord::getNextSamTag(char* tag, char& vtype, void** value) 01951 { 01952 myStatus = SamStatus::SUCCESS; 01953 if(myNeedToSetTagsFromBuffer) 01954 { 01955 if(!setTagsFromBuffer()) 01956 { 01957 // Failed to read the tags from the buffer, so cannot 01958 // get tags. 01959 return(false); 01960 } 01961 } 01962 01963 // Increment the tag index to start looking at the next tag. 01964 // At the beginning, it is set to -1. 01965 myLastTagIndex++; 01966 int maxTagIndex = extras.Capacity(); 01967 if(myLastTagIndex >= maxTagIndex) 01968 { 01969 // Hit the end of the tags, return false, no more tags. 01970 // Status is still success since this is not an error, 01971 // it is just the end of the list. 01972 return(false); 01973 } 01974 01975 bool tagFound = false; 01976 // Loop until a tag is found or the end of extras is hit. 01977 while((tagFound == false) && (myLastTagIndex < maxTagIndex)) 01978 { 01979 if(extras.SlotInUse(myLastTagIndex)) 01980 { 01981 // Found a slot to use. 01982 int key = extras.GetKey(myLastTagIndex); 01983 getTag(key, tag); 01984 getTypeFromKey(key, vtype); 01985 tagFound = true; 01986 // Get the value associated with the key based on the vtype. 01987 switch (vtype) 01988 { 01989 case 'f' : 01990 *value = getFloatPtr(myLastTagIndex); 01991 break; 01992 case 'i' : 01993 *value = getIntegerPtr(myLastTagIndex, vtype); 01994 if(vtype != 'A') 01995 { 01996 // Convert all int types to 'i' 01997 vtype = 'i'; 01998 } 01999 break; 02000 case 'Z' : 02001 case 'B' : 02002 *value = getStringPtr(myLastTagIndex); 02003 break; 02004 default: 02005 myStatus.setStatus(SamStatus::FAIL_PARSE, 02006 "Unknown tag type"); 02007 tagFound = false; 02008 break; 02009 } 02010 } 02011 if(!tagFound) 02012 { 02013 // Increment the index since a tag was not found. 02014 myLastTagIndex++; 02015 } 02016 } 02017 return(tagFound); 02018 } 02019 02020 02021 // Reset the tag iterator to the beginning of the tags. 02022 void SamRecord::resetTagIter() 02023 { 02024 myLastTagIndex = -1; 02025 } 02026 02027 02028 bool SamRecord::isIntegerType(char vtype) 02029 { 02030 if((vtype == 'c') || (vtype == 'C') || 02031 (vtype == 's') || (vtype == 'S') || 02032 (vtype == 'i') || (vtype == 'I')) 02033 { 02034 return(true); 02035 } 02036 return(false); 02037 } 02038 02039 02040 bool SamRecord::isFloatType(char vtype) 02041 { 02042 if(vtype == 'f') 02043 { 02044 return(true); 02045 } 02046 return(false); 02047 } 02048 02049 02050 bool SamRecord::isCharType(char vtype) 02051 { 02052 if(vtype == 'A') 02053 { 02054 return(true); 02055 } 02056 return(false); 02057 } 02058 02059 02060 bool SamRecord::isStringType(char vtype) 02061 { 02062 if((vtype == 'Z') || (vtype == 'B')) 02063 { 02064 return(true); 02065 } 02066 return(false); 02067 } 02068 02069 02070 bool SamRecord::getTagsString(const char* tags, String& returnString, char delim) 02071 { 02072 const char* currentTagPtr = tags; 02073 02074 returnString.Clear(); 02075 myStatus = SamStatus::SUCCESS; 02076 if(myNeedToSetTagsFromBuffer) 02077 { 02078 if(!setTagsFromBuffer()) 02079 { 02080 // Failed to read the tags from the buffer, so cannot 02081 // get tags. 02082 return(false); 02083 } 02084 } 02085 02086 bool returnStatus = true; 02087 02088 while(*currentTagPtr != '\0') 02089 { 02090 // Tags are formatted as: XY:Z 02091 // Where X is [A-Za-z], Y is [A-Za-z], and 02092 // Z is A,i,f,Z,H (cCsSI are also excepted) 02093 if((currentTagPtr[0] == '\0') || (currentTagPtr[1] == '\0') || 02094 (currentTagPtr[2] != ':') || (currentTagPtr[3] == '\0')) 02095 { 02096 myStatus.setStatus(SamStatus::INVALID, 02097 "getTagsString called with improperly formatted tags.\n"); 02098 returnStatus = false; 02099 break; 02100 } 02101 02102 // Construct the key. 02103 int key = MAKEKEY(currentTagPtr[0], currentTagPtr[1], 02104 currentTagPtr[3]); 02105 // Look to see if the key exsists in the hash. 02106 int offset = extras.Find(key); 02107 02108 if(offset >= 0) 02109 { 02110 // Offset is set, so the key was found. 02111 if(!returnString.IsEmpty()) 02112 { 02113 returnString += delim; 02114 } 02115 returnString += currentTagPtr[0]; 02116 returnString += currentTagPtr[1]; 02117 returnString += ':'; 02118 returnString += currentTagPtr[3]; 02119 returnString += ':'; 02120 02121 // First if it is an integer, determine the actual type of the int. 02122 char vtype; 02123 getTypeFromKey(key, vtype); 02124 02125 switch(vtype) 02126 { 02127 case 'i': 02128 returnString += *(int*)getIntegerPtr(offset, vtype); 02129 break; 02130 case 'f': 02131 returnString += *(float*)getFloatPtr(offset); 02132 break; 02133 case 'Z': 02134 case 'B': 02135 returnString += *(String*)getStringPtr(offset); 02136 break; 02137 default: 02138 myStatus.setStatus(SamStatus::INVALID, 02139 "rmTag called with unknown type.\n"); 02140 returnStatus = false; 02141 break; 02142 }; 02143 } 02144 // Increment to the next tag. 02145 if((currentTagPtr[4] == ';') || (currentTagPtr[4] == ',')) 02146 { 02147 // Increment once more. 02148 currentTagPtr += 5; 02149 } 02150 else if(currentTagPtr[4] != '\0') 02151 { 02152 // Invalid tag format. 02153 myStatus.setStatus(SamStatus::INVALID, 02154 "rmTags called with improperly formatted tags.\n"); 02155 returnStatus = false; 02156 break; 02157 } 02158 else 02159 { 02160 // Last Tag. 02161 currentTagPtr += 4; 02162 } 02163 } 02164 return(returnStatus); 02165 } 02166 02167 02168 const String* SamRecord::getStringTag(const char * tag) 02169 { 02170 // Parse the buffer if necessary. 02171 if(myNeedToSetTagsFromBuffer) 02172 { 02173 if(!setTagsFromBuffer()) 02174 { 02175 // Failed to read the tags from the buffer, so cannot 02176 // get tags. setTagsFromBuffer set the errors, 02177 // so just return null. 02178 return(NULL); 02179 } 02180 } 02181 02182 int key = MAKEKEY(tag[0], tag[1], 'Z'); 02183 int offset = extras.Find(key); 02184 02185 int value; 02186 if (offset < 0) 02187 { 02188 // Check for 'B' tag. 02189 key = MAKEKEY(tag[0], tag[1], 'B'); 02190 offset = extras.Find(key); 02191 if(offset < 0) 02192 { 02193 // Tag not found. 02194 return(NULL); 02195 } 02196 } 02197 02198 // Offset is valid, so return the tag. 02199 value = extras[offset]; 02200 return(&(strings[value])); 02201 } 02202 02203 02204 int* SamRecord::getIntegerTag(const char * tag) 02205 { 02206 // Init to success. 02207 myStatus = SamStatus::SUCCESS; 02208 // Parse the buffer if necessary. 02209 if(myNeedToSetTagsFromBuffer) 02210 { 02211 if(!setTagsFromBuffer()) 02212 { 02213 // Failed to read the tags from the buffer, so cannot 02214 // get tags. setTagsFromBuffer set the errors, 02215 // so just return NULL. 02216 return(NULL); 02217 } 02218 } 02219 02220 int key = MAKEKEY(tag[0], tag[1], 'i'); 02221 int offset = extras.Find(key); 02222 02223 int value; 02224 if (offset < 0) 02225 { 02226 // Failed to find the tag. 02227 return(NULL); 02228 } 02229 else 02230 value = extras[offset]; 02231 02232 return(&(integers[value])); 02233 } 02234 02235 02236 bool SamRecord::getIntegerTag(const char * tag, int& tagVal) 02237 { 02238 // Init to success. 02239 myStatus = SamStatus::SUCCESS; 02240 // Parse the buffer if necessary. 02241 if(myNeedToSetTagsFromBuffer) 02242 { 02243 if(!setTagsFromBuffer()) 02244 { 02245 // Failed to read the tags from the buffer, so cannot 02246 // get tags. setTagsFromBuffer set the errors, 02247 // so just return false. 02248 return(false); 02249 } 02250 } 02251 02252 int key = MAKEKEY(tag[0], tag[1], 'i'); 02253 int offset = extras.Find(key); 02254 02255 int value; 02256 if (offset < 0) 02257 { 02258 // Failed to find the tag. 02259 return(false); 02260 } 02261 else 02262 value = extras[offset]; 02263 02264 tagVal = integers[value]; 02265 return(true); 02266 } 02267 02268 02269 bool SamRecord::getFloatTag(const char * tag, float& tagVal) 02270 { 02271 // Init to success. 02272 myStatus = SamStatus::SUCCESS; 02273 // Parse the buffer if necessary. 02274 if(myNeedToSetTagsFromBuffer) 02275 { 02276 if(!setTagsFromBuffer()) 02277 { 02278 // Failed to read the tags from the buffer, so cannot 02279 // get tags. setTagsFromBuffer set the errors, 02280 // so just return false. 02281 return(false); 02282 } 02283 } 02284 02285 int key = MAKEKEY(tag[0], tag[1], 'f'); 02286 int offset = extras.Find(key); 02287 02288 int value; 02289 if (offset < 0) 02290 { 02291 // Failed to find the tag. 02292 return(false); 02293 } 02294 else 02295 value = extras[offset]; 02296 02297 tagVal = floats[value]; 02298 return(true); 02299 } 02300 02301 02302 const String & SamRecord::getString(const char * tag) 02303 { 02304 // Init to success. 02305 myStatus = SamStatus::SUCCESS; 02306 // Parse the buffer if necessary. 02307 if(myNeedToSetTagsFromBuffer) 02308 { 02309 if(!setTagsFromBuffer()) 02310 { 02311 // Failed to read the tags from the buffer, so cannot 02312 // get tags. 02313 // TODO - what do we want to do on failure? 02314 } 02315 } 02316 02317 int key = MAKEKEY(tag[0], tag[1], 'Z'); 02318 int offset = extras.Find(key); 02319 02320 int value; 02321 if (offset < 0) 02322 { 02323 02324 key = MAKEKEY(tag[0], tag[1], 'B'); 02325 offset = extras.Find(key); 02326 if (offset < 0) 02327 { 02328 // TODO - what do we want to do on failure? 02329 return(NOT_FOUND_TAG_STRING); 02330 } 02331 } 02332 value = extras[offset]; 02333 02334 return strings[value]; 02335 } 02336 02337 02338 int & SamRecord::getInteger(const char * tag) 02339 { 02340 // Init to success. 02341 myStatus = SamStatus::SUCCESS; 02342 // Parse the buffer if necessary. 02343 if(myNeedToSetTagsFromBuffer) 02344 { 02345 if(!setTagsFromBuffer()) 02346 { 02347 // Failed to read the tags from the buffer, so cannot 02348 // get tags. setTagsFromBuffer set the error. 02349 // TODO - what do we want to do on failure? 02350 } 02351 } 02352 02353 int key = MAKEKEY(tag[0], tag[1], 'i'); 02354 int offset = extras.Find(key); 02355 02356 int value; 02357 if (offset < 0) 02358 { 02359 // TODO - what do we want to do on failure? 02360 return NOT_FOUND_TAG_INT; 02361 } 02362 else 02363 value = extras[offset]; 02364 02365 return integers[value]; 02366 } 02367 02368 02369 bool SamRecord::checkTag(const char * tag, char type) 02370 { 02371 // Init to success. 02372 myStatus = SamStatus::SUCCESS; 02373 // Parse the buffer if necessary. 02374 if(myNeedToSetTagsFromBuffer) 02375 { 02376 if(!setTagsFromBuffer()) 02377 { 02378 // Failed to read the tags from the buffer, so cannot 02379 // get tags. setTagsFromBuffer set the error. 02380 return(""); 02381 } 02382 } 02383 02384 int key = MAKEKEY(tag[0], tag[1], type); 02385 02386 return (extras.Find(key) != LH_NOTFOUND); 02387 } 02388 02389 02390 // Return the error after a failed SamRecord call. 02391 const SamStatus& SamRecord::getStatus() 02392 { 02393 return(myStatus); 02394 } 02395 02396 02397 // Allocate space for the record - does a realloc. 02398 // The passed in size is the size of the entire record including the 02399 // block size field. 02400 bool SamRecord::allocateRecordStructure(int size) 02401 { 02402 if (allocatedSize < size) 02403 { 02404 bamRecordStruct* tmpRecordPtr = 02405 (bamRecordStruct *)realloc(myRecordPtr, size); 02406 if(tmpRecordPtr == NULL) 02407 { 02408 // FAILED to allocate memory 02409 fprintf(stderr, "FAILED TO ALLOCATE MEMORY!!!"); 02410 myStatus.addError(SamStatus::FAIL_MEM, "Failed Memory Allocation."); 02411 return(false); 02412 } 02413 // Successfully allocated memory, so set myRecordPtr. 02414 myRecordPtr = tmpRecordPtr; 02415 02416 // Reset the pointers into the record. 02417 if(myIsSequenceBufferValid) 02418 { 02419 myPackedSequence = (unsigned char *)myRecordPtr->myData + 02420 myRecordPtr->myReadNameLength + 02421 myRecordPtr->myCigarLength * sizeof(int); 02422 } 02423 if(myIsQualityBufferValid) 02424 { 02425 myPackedQuality = (unsigned char *)myRecordPtr->myData + 02426 myRecordPtr->myReadNameLength + 02427 myRecordPtr->myCigarLength * sizeof(int) + 02428 (myRecordPtr->myReadLength + 1) / 2; 02429 } 02430 02431 allocatedSize = size; 02432 } 02433 return(true); 02434 } 02435 02436 02437 // Index is the index into the strings array. 02438 void* SamRecord::getStringPtr(int index) 02439 { 02440 int value = extras[index]; 02441 02442 return &(strings[value]); 02443 } 02444 02445 void* SamRecord::getIntegerPtr(int offset, char& type) 02446 { 02447 int value = extras[offset]; 02448 02449 type = intType[value]; 02450 02451 return &(integers[value]); 02452 } 02453 02454 void* SamRecord::getFloatPtr(int offset) 02455 { 02456 int value = extras[offset]; 02457 02458 return &(floats[value]); 02459 } 02460 02461 02462 // Fixes the buffer to match the variable length fields if they are set. 02463 bool SamRecord::fixBuffer(SequenceTranslation translation) 02464 { 02465 // Check to see if the buffer is already synced. 02466 if(myIsBufferSynced && 02467 (myBufferSequenceTranslation == translation)) 02468 { 02469 // Already synced, nothing to do. 02470 return(true); 02471 } 02472 02473 // Set the bin if necessary. 02474 if(!myIsBinValid) 02475 { 02476 // The bin that is set in the record is not valid, so 02477 // reset it. 02478 myRecordPtr->myBin = 02479 bam_reg2bin(myRecordPtr->myPosition, get1BasedAlignmentEnd()); 02480 myIsBinValid = true; 02481 } 02482 02483 // Not synced. 02484 bool status = true; 02485 02486 // First determine the size the buffer needs to be. 02487 uint8_t newReadNameLen = getReadNameLength(); 02488 uint16_t newCigarLen = getCigarLength(); 02489 int32_t newReadLen = getReadLength(); 02490 uint32_t newTagLen = getTagLength(); 02491 uint32_t bamSequenceLen = (newReadLen+1)/2; 02492 02493 // The buffer size extends from the start of the record to data 02494 // plus the length of the variable fields, 02495 // Multiply the cigar length by 4 since it is the number of uint32_t fields. 02496 int newBufferSize = 02497 ((unsigned char*)(&(myRecordPtr->myData)) - 02498 (unsigned char*)myRecordPtr) + 02499 newReadNameLen + ((newCigarLen)*4) + 02500 newReadLen + bamSequenceLen + newTagLen; 02501 02502 if(!allocateRecordStructure(newBufferSize)) 02503 { 02504 // Failed to allocate space. 02505 return(false); 02506 } 02507 02508 // Now that space has been added to the buffer, check to see what if 02509 // any fields need to be extracted from the buffer prior to starting to 02510 // overwrite it. Fields need to be extracted from the buffer if the 02511 // buffer is valid for the field and a previous variable length field has 02512 // changed length. 02513 bool readNameLenChange = (newReadNameLen != myRecordPtr->myReadNameLength); 02514 bool cigarLenChange = (newCigarLen != myRecordPtr->myCigarLength); 02515 bool readLenChange = (newReadLen != myRecordPtr->myReadLength); 02516 02517 // If the tags are still stored in the buffer and any other fields changed 02518 // lengths, they need to be extracted. 02519 if(myIsTagsBufferValid && 02520 (readNameLenChange | cigarLenChange | readLenChange)) 02521 { 02522 status &= setTagsFromBuffer(); 02523 // The tag buffer will not be valid once the other fields 02524 // are written, so set it to not valid. 02525 myIsTagsBufferValid = false; 02526 } 02527 02528 // If the sequence or quality strings are still stored in the buffer, and 02529 // any of the previous fields have changed length, extract it from the 02530 // current buffer. 02531 if((myIsQualityBufferValid | myIsSequenceBufferValid) && 02532 (readNameLenChange | cigarLenChange | readLenChange)) 02533 { 02534 setSequenceAndQualityFromBuffer(); 02535 // The quality and sequence buffers will not be valid once the other 02536 // fields are written, so set them to not valid. 02537 myIsQualityBufferValid = false; 02538 myIsSequenceBufferValid = false; 02539 } 02540 02541 // If the cigar is still stored in the buffer, and any of the 02542 // previous fields have changed length, extract it from the current buffer. 02543 if((myIsCigarBufferValid) && 02544 (readNameLenChange)) 02545 { 02546 status &= parseCigarBinary(); 02547 myIsCigarBufferValid = false; 02548 } 02549 02550 // Set each value in the buffer if it is not already valid. 02551 if(!myIsReadNameBufferValid) 02552 { 02553 memcpy(&(myRecordPtr->myData), myReadName.c_str(), 02554 newReadNameLen); 02555 02556 // Set the new ReadNameLength. 02557 myRecordPtr->myReadNameLength = newReadNameLen; 02558 myIsReadNameBufferValid = true; 02559 } 02560 02561 unsigned char * readNameEnds = (unsigned char*)(&(myRecordPtr->myData)) + 02562 myRecordPtr->myReadNameLength; 02563 02564 // Set the Cigar. Need to reformat from the string to 02565 unsigned int * packedCigar = (unsigned int *) (void *) readNameEnds; 02566 02567 if(!myIsCigarBufferValid) 02568 { 02569 // The cigar was already parsed when it was set, so just copy 02570 // data from the temporary buffer. 02571 myRecordPtr->myCigarLength = newCigarLen; 02572 memcpy(packedCigar, myCigarTempBuffer, 02573 myRecordPtr->myCigarLength * sizeof(uint32_t)); 02574 02575 myIsCigarBufferValid = true; 02576 } 02577 02578 unsigned char * packedSequence = readNameEnds + 02579 myRecordPtr->myCigarLength * sizeof(int); 02580 unsigned char * packedQuality = packedSequence + bamSequenceLen; 02581 02582 if(!myIsSequenceBufferValid || !myIsQualityBufferValid || 02583 (myBufferSequenceTranslation != translation)) 02584 { 02585 myRecordPtr->myReadLength = newReadLen; 02586 // Determine if the quality needs to be set and is just a * and needs to 02587 // be set to 0xFF. 02588 bool noQuality = false; 02589 if((myQuality.Length() == 1) && (myQuality[0] == '*')) 02590 { 02591 noQuality = true; 02592 } 02593 02594 const char* translatedSeq = NULL; 02595 // If the sequence is not valid in the buffer or it is not 02596 // properly translated, get the properly translated sequence 02597 // that needs to be put into the buffer. 02598 if((!myIsSequenceBufferValid) || 02599 (translation != myBufferSequenceTranslation)) 02600 { 02601 translatedSeq = getSequence(translation); 02602 } 02603 02604 for (int i = 0; i < myRecordPtr->myReadLength; i++) 02605 { 02606 if((!myIsSequenceBufferValid) || 02607 (translation != myBufferSequenceTranslation)) 02608 { 02609 // Sequence buffer is not valid, so set the sequence. 02610 int seqVal = 0; 02611 switch(translatedSeq[i]) 02612 { 02613 case '=': 02614 seqVal = 0; 02615 break; 02616 case 'A': 02617 case 'a': 02618 seqVal = 1; 02619 break; 02620 case 'C': 02621 case 'c': 02622 seqVal = 2; 02623 break; 02624 case 'G': 02625 case 'g': 02626 seqVal = 4; 02627 break; 02628 case 'T': 02629 case 't': 02630 seqVal = 8; 02631 break; 02632 case 'N': 02633 case 'n': 02634 case '.': 02635 seqVal = 15; 02636 break; 02637 default: 02638 myStatus.addError(SamStatus::FAIL_PARSE, 02639 "Unknown Sequence character found."); 02640 status = false; 02641 break; 02642 }; 02643 02644 if(i & 1) 02645 { 02646 // Odd number i's go in the lower 4 bits, so OR in the 02647 // lower bits 02648 packedSequence[i/2] |= seqVal; 02649 } 02650 else 02651 { 02652 // Even i's go in the upper 4 bits and are always set first. 02653 packedSequence[i/2] = seqVal << 4; 02654 } 02655 } 02656 02657 if(!myIsQualityBufferValid) 02658 { 02659 // Set the quality. 02660 if((noQuality) || (myQuality.Length() <= i)) 02661 { 02662 // No quality or the quality is smaller than the sequence, 02663 // so set it to 0xFF 02664 packedQuality[i] = 0xFF; 02665 } 02666 else 02667 { 02668 // Copy the quality string. 02669 packedQuality[i] = myQuality[i] - 33; 02670 } 02671 } 02672 } 02673 myPackedSequence = (unsigned char *)myRecordPtr->myData + 02674 myRecordPtr->myReadNameLength + 02675 myRecordPtr->myCigarLength * sizeof(int); 02676 myPackedQuality = myPackedSequence + 02677 (myRecordPtr->myReadLength + 1) / 2; 02678 myIsSequenceBufferValid = true; 02679 myIsQualityBufferValid = true; 02680 myBufferSequenceTranslation = translation; 02681 } 02682 02683 if(!myIsTagsBufferValid) 02684 { 02685 status &= setTagsInBuffer(); 02686 } 02687 02688 // Set the lengths in the buffer. 02689 myRecordPtr->myReadNameLength = newReadNameLen; 02690 myRecordPtr->myCigarLength = newCigarLen; 02691 myRecordPtr->myReadLength = newReadLen; 02692 02693 // Set the buffer block size to the size of the buffer minus the 02694 // first field. 02695 myRecordPtr->myBlockSize = newBufferSize - sizeof(int32_t); 02696 02697 if(status) 02698 { 02699 myIsBufferSynced = true; 02700 } 02701 02702 return(status); 02703 } 02704 02705 02706 // Sets the Sequence and Quality strings from the buffer. 02707 // They are done together in one method because they require the same 02708 // loop, so might as well be done at the same time. 02709 void SamRecord::setSequenceAndQualityFromBuffer() 02710 { 02711 // NOTE: If the sequence buffer is not valid, do not set the sequence 02712 // string from the buffer. 02713 // NOTE: If the quality buffer is not valid, do not set the quality string 02714 // from the buffer. 02715 02716 // Extract the sequence if the buffer is valid and the string's length is 0. 02717 bool extractSequence = false; 02718 if(myIsSequenceBufferValid && (mySequence.Length() == 0)) 02719 { 02720 extractSequence = true; 02721 } 02722 02723 // Extract the quality if the buffer is valid and the string's length is 0. 02724 bool extractQuality = false; 02725 if(myIsQualityBufferValid && (myQuality.Length() == 0)) 02726 { 02727 extractQuality = true; 02728 } 02729 02730 // If neither the quality nor the sequence need to be extracted, 02731 // just return. 02732 if(!extractSequence && !extractQuality) 02733 { 02734 return; 02735 } 02736 02737 // Set the sequence and quality strings.. 02738 if(extractSequence) 02739 { 02740 mySequence.SetLength(myRecordPtr->myReadLength); 02741 } 02742 if(extractQuality) 02743 { 02744 myQuality.SetLength(myRecordPtr->myReadLength); 02745 } 02746 02747 const char * asciiBases = "=AC.G...T......N"; 02748 02749 // Flag to see if the quality is specified - the quality contains a value 02750 // other than 0xFF. If all values are 0xFF, then there is no quality. 02751 bool qualitySpecified = false; 02752 02753 for (int i = 0; i < myRecordPtr->myReadLength; i++) 02754 { 02755 if(extractSequence) 02756 { 02757 mySequence[i] = i & 1 ? 02758 asciiBases[myPackedSequence[i / 2] & 0xF] : 02759 asciiBases[myPackedSequence[i / 2] >> 4]; 02760 } 02761 02762 if(extractQuality) 02763 { 02764 if(myPackedQuality[i] != 0xFF) 02765 { 02766 // Quality is specified, so mark the flag. 02767 qualitySpecified = true; 02768 } 02769 02770 myQuality[i] = myPackedQuality[i] + 33; 02771 } 02772 } 02773 02774 // If the read length is 0, then set the sequence and quality to '*' 02775 if(myRecordPtr->myReadLength == 0) 02776 { 02777 if(extractSequence) 02778 { 02779 mySequence = "*"; 02780 } 02781 if(extractQuality) 02782 { 02783 myQuality = "*"; 02784 } 02785 } 02786 else if(extractQuality && !qualitySpecified) 02787 { 02788 // No quality was specified, so set it to "*" 02789 myQuality = "*"; 02790 } 02791 } 02792 02793 02794 // Parse the cigar to calculate the alignment/unclipped end. 02795 bool SamRecord::parseCigar() 02796 { 02797 // Determine if the cigar string or cigar binary needs to be parsed. 02798 if(myCigar.Length() == 0) 02799 { 02800 // The cigar string is not yet set, so parse the binary. 02801 return(parseCigarBinary()); 02802 } 02803 return(parseCigarString()); 02804 } 02805 02806 // Parse the cigar to calculate the alignment/unclipped end. 02807 bool SamRecord::parseCigarBinary() 02808 { 02809 // Only need to parse if the string is not already set. 02810 // The length of the cigar string is set to zero when the 02811 // record is read from a file into the buffer. 02812 if(myCigar.Length() != 0) 02813 { 02814 // Already parsed. 02815 return(true); 02816 } 02817 02818 unsigned char * readNameEnds = 02819 (unsigned char *)myRecordPtr->myData + myRecordPtr->myReadNameLength; 02820 02821 unsigned int * packedCigar = (unsigned int *) (void *) readNameEnds; 02822 02823 myCigarRoller.Set(packedCigar, myRecordPtr->myCigarLength); 02824 02825 myCigarRoller.getCigarString(myCigar); 02826 02827 myAlignmentLength = myCigarRoller.getExpectedReferenceBaseCount(); 02828 02829 myUnclippedStartOffset = myCigarRoller.getNumBeginClips(); 02830 myUnclippedEndOffset = myCigarRoller.getNumEndClips(); 02831 02832 // if the cigar length is 0, then set the cigar string to "*" 02833 if(myRecordPtr->myCigarLength == 0) 02834 { 02835 myCigar = "*"; 02836 return(true); 02837 } 02838 02839 // Copy the cigar into a temporary buffer. 02840 int newBufferSize = myRecordPtr->myCigarLength * sizeof(uint32_t); 02841 if(newBufferSize > myCigarTempBufferAllocatedSize) 02842 { 02843 uint32_t* tempBufferPtr = 02844 (uint32_t*)realloc(myCigarTempBuffer, newBufferSize); 02845 if(tempBufferPtr == NULL) 02846 { 02847 // Failed to allocate memory. 02848 // Do not parse, just return. 02849 fprintf(stderr, "FAILED TO ALLOCATE MEMORY!!!"); 02850 myStatus.addError(SamStatus::FAIL_MEM, 02851 "Failed to Allocate Memory."); 02852 return(false); 02853 } 02854 myCigarTempBuffer = tempBufferPtr; 02855 myCigarTempBufferAllocatedSize = newBufferSize; 02856 } 02857 02858 memcpy(myCigarTempBuffer, packedCigar, 02859 myRecordPtr->myCigarLength * sizeof(uint32_t)); 02860 02861 // Set the length of the temp buffer. 02862 myCigarTempBufferLength = myRecordPtr->myCigarLength; 02863 02864 return(true); 02865 } 02866 02867 // Parse the cigar string to calculate the cigar length and alignment end. 02868 bool SamRecord::parseCigarString() 02869 { 02870 myCigarTempBufferLength = 0; 02871 if(myCigar == "*") 02872 { 02873 // Cigar is empty, so initialize the variables. 02874 myAlignmentLength = 0; 02875 myUnclippedStartOffset = 0; 02876 myUnclippedEndOffset = 0; 02877 myCigarRoller.clear(); 02878 return(true); 02879 } 02880 02881 myCigarRoller.Set(myCigar); 02882 02883 myAlignmentLength = myCigarRoller.getExpectedReferenceBaseCount(); 02884 02885 myUnclippedStartOffset = myCigarRoller.getNumBeginClips(); 02886 myUnclippedEndOffset = myCigarRoller.getNumEndClips(); 02887 02888 // Check to see if the Temporary Cigar Buffer is large enough to contain 02889 // this cigar. If we make it the size of the length of the cigar string, 02890 // it will be more than large enough. 02891 int newBufferSize = myCigar.Length() * sizeof(uint32_t); 02892 if(newBufferSize > myCigarTempBufferAllocatedSize) 02893 { 02894 uint32_t* tempBufferPtr = 02895 (uint32_t*)realloc(myCigarTempBuffer, newBufferSize); 02896 if(tempBufferPtr == NULL) 02897 { 02898 // Failed to allocate memory. 02899 // Do not parse, just return. 02900 fprintf(stderr, "FAILED TO ALLOCATE MEMORY!!!"); 02901 myStatus.addError(SamStatus::FAIL_MEM, 02902 "Failed to Allocate Memory."); 02903 return(false); 02904 } 02905 myCigarTempBuffer = tempBufferPtr; 02906 myCigarTempBufferAllocatedSize = newBufferSize; 02907 } 02908 02909 // Track if there were any errors. 02910 bool status = true; 02911 02912 // Track the index into the cigar string that is being parsed. 02913 char *cigarOp; 02914 const char* cigarEntryStart = myCigar.c_str(); 02915 int opLen = 0; 02916 int op = 0; 02917 02918 unsigned int * packedCigar = myCigarTempBuffer; 02919 // TODO - maybe one day make a cigar list... or maybe make a 02920 // reference cigar string for ease of lookup.... 02921 const char* endCigarString = cigarEntryStart + myCigar.Length(); 02922 while(cigarEntryStart < endCigarString) 02923 { 02924 bool validCigarEntry = true; 02925 // Get the opLen from the string. cigarOp will then point to 02926 // the operation. 02927 opLen = strtol(cigarEntryStart, &cigarOp, 10); 02928 // Switch on the type of operation. 02929 switch(*cigarOp) 02930 { 02931 case('M'): 02932 op = 0; 02933 break; 02934 case('I'): 02935 // Insert into the reference position, so do not increment the 02936 // reference end position. 02937 op = 1; 02938 break; 02939 case('D'): 02940 op = 2; 02941 break; 02942 case('N'): 02943 op = 3; 02944 break; 02945 case('S'): 02946 op = 4; 02947 break; 02948 case('H'): 02949 op = 5; 02950 break; 02951 case('P'): 02952 op = 6; 02953 break; 02954 default: 02955 fprintf(stderr, "ERROR parsing cigar\n"); 02956 validCigarEntry = false; 02957 status = false; 02958 myStatus.addError(SamStatus::FAIL_PARSE, 02959 "Unknown operation found when parsing the Cigar."); 02960 break; 02961 }; 02962 if(validCigarEntry) 02963 { 02964 // Increment the cigar length. 02965 ++myCigarTempBufferLength; 02966 *packedCigar = (opLen << 4) | op; 02967 packedCigar++; 02968 } 02969 // The next Entry starts at one past the cigar op, so set the start. 02970 cigarEntryStart = ++cigarOp; 02971 } 02972 02973 // Check clipLength to adjust the end position. 02974 return(status); 02975 } 02976 02977 02978 bool SamRecord::setTagsFromBuffer() 02979 { 02980 // If the tags do not need to be set from the buffer, return true. 02981 if(myNeedToSetTagsFromBuffer == false) 02982 { 02983 // Already been set from the buffer. 02984 return(true); 02985 } 02986 02987 // Mark false, as they are being set now. 02988 myNeedToSetTagsFromBuffer = false; 02989 02990 unsigned char * extraPtr = myPackedQuality + myRecordPtr->myReadLength; 02991 02992 // Default to success, will be changed to false on failure. 02993 bool status = true; 02994 02995 // Clear any previously set tags. 02996 clearTags(); 02997 while (myRecordPtr->myBlockSize + 4 - 02998 (extraPtr - (unsigned char *)myRecordPtr) > 0) 02999 { 03000 int key = 0; 03001 int value = 0; 03002 void * content = extraPtr + 3; 03003 int tagBufferSize = 0; 03004 03005 key = MAKEKEY(extraPtr[0], extraPtr[1], extraPtr[2]); 03006 03007 // First check if the tag already exists. 03008 unsigned int location = extras.Find(key); 03009 int origIndex = 0; 03010 String* duplicate = NULL; 03011 String* origTag = NULL; 03012 if(location != LH_NOTFOUND) 03013 { 03014 duplicate = new String; 03015 origTag = new String; 03016 origIndex = extras[location]; 03017 03018 *duplicate = (char)(extraPtr[0]); 03019 *duplicate += (char)(extraPtr[1]); 03020 *duplicate += ':'; 03021 03022 *origTag = *duplicate; 03023 *duplicate += (char)(extraPtr[2]); 03024 *duplicate += ':'; 03025 } 03026 03027 switch (extraPtr[2]) 03028 { 03029 case 'A' : 03030 if(duplicate != NULL) 03031 { 03032 *duplicate += (* (char *) content); 03033 *origTag += intType[origIndex]; 03034 *origTag += ':'; 03035 appendIntArrayValue(origIndex, *origTag); 03036 tagBufferSize -= getNumericTagTypeSize(intType[origIndex]); 03037 integers[origIndex] = *(char *)content; 03038 intType[origIndex] = extraPtr[2]; 03039 tagBufferSize += getNumericTagTypeSize(intType[origIndex]); 03040 } 03041 else 03042 { 03043 value = integers.Length(); 03044 integers.Push(* (char *) content); 03045 intType.push_back(extraPtr[2]); 03046 tagBufferSize += 4; 03047 } 03048 extraPtr += 4; 03049 break; 03050 case 'c' : 03051 if(duplicate != NULL) 03052 { 03053 *duplicate += (* (char *) content); 03054 *origTag += intType[origIndex]; 03055 *origTag += ':'; 03056 appendIntArrayValue(origIndex, *origTag); 03057 tagBufferSize -= getNumericTagTypeSize(intType[origIndex]); 03058 integers[origIndex] = *(char *)content; 03059 intType[origIndex] = extraPtr[2]; 03060 tagBufferSize += getNumericTagTypeSize(intType[origIndex]); 03061 } 03062 else 03063 { 03064 value = integers.Length(); 03065 integers.Push(* (char *) content); 03066 intType.push_back(extraPtr[2]); 03067 tagBufferSize += 4; 03068 } 03069 extraPtr += 4; 03070 break; 03071 case 'C' : 03072 if(duplicate != NULL) 03073 { 03074 *duplicate += (* (unsigned char *) content); 03075 *origTag += intType[origIndex]; 03076 *origTag += ':'; 03077 appendIntArrayValue(origIndex, *origTag); 03078 tagBufferSize -= getNumericTagTypeSize(intType[origIndex]); 03079 integers[origIndex] = *(unsigned char *)content; 03080 intType[origIndex] = extraPtr[2]; 03081 tagBufferSize += getNumericTagTypeSize(intType[origIndex]); 03082 } 03083 else 03084 { 03085 value = integers.Length(); 03086 integers.Push(* (unsigned char *) content); 03087 intType.push_back(extraPtr[2]); 03088 tagBufferSize += 4; 03089 } 03090 extraPtr += 4; 03091 break; 03092 case 's' : 03093 if(duplicate != NULL) 03094 { 03095 *duplicate += (* (short *) content); 03096 *origTag += intType[origIndex]; 03097 *origTag += ':'; 03098 appendIntArrayValue(origIndex, *origTag); 03099 tagBufferSize -= getNumericTagTypeSize(intType[origIndex]); 03100 integers[origIndex] = *(short *)content; 03101 intType[origIndex] = extraPtr[2]; 03102 tagBufferSize += getNumericTagTypeSize(intType[origIndex]); 03103 } 03104 else 03105 { 03106 value = integers.Length(); 03107 integers.Push(* (short *) content); 03108 intType.push_back(extraPtr[2]); 03109 tagBufferSize += 5; 03110 } 03111 extraPtr += 5; 03112 break; 03113 case 'S' : 03114 if(duplicate != NULL) 03115 { 03116 *duplicate += (* (unsigned short *) content); 03117 *origTag += intType[origIndex]; 03118 *origTag += ':'; 03119 appendIntArrayValue(origIndex, *origTag); 03120 tagBufferSize -= getNumericTagTypeSize(intType[origIndex]); 03121 integers[origIndex] = *(unsigned short *)content; 03122 intType[origIndex] = extraPtr[2]; 03123 tagBufferSize += getNumericTagTypeSize(intType[origIndex]); 03124 } 03125 else 03126 { 03127 value = integers.Length(); 03128 integers.Push(* (unsigned short *) content); 03129 intType.push_back(extraPtr[2]); 03130 tagBufferSize += 5; 03131 } 03132 extraPtr += 5; 03133 break; 03134 case 'i' : 03135 if(duplicate != NULL) 03136 { 03137 *duplicate += (* (int *) content); 03138 *origTag += intType[origIndex]; 03139 *origTag += ':'; 03140 appendIntArrayValue(origIndex, *origTag); 03141 tagBufferSize -= getNumericTagTypeSize(intType[origIndex]); 03142 integers[origIndex] = *(int *)content; 03143 intType[origIndex] = extraPtr[2]; 03144 tagBufferSize += getNumericTagTypeSize(intType[origIndex]); 03145 } 03146 else 03147 { 03148 value = integers.Length(); 03149 integers.Push(* (int *) content); 03150 intType.push_back(extraPtr[2]); 03151 tagBufferSize += 7; 03152 } 03153 extraPtr += 7; 03154 break; 03155 case 'I' : 03156 if(duplicate != NULL) 03157 { 03158 *duplicate += (* (unsigned int *) content); 03159 *origTag += intType[origIndex]; 03160 *origTag += ':'; 03161 appendIntArrayValue(origIndex, *origTag); 03162 tagBufferSize -= getNumericTagTypeSize(intType[origIndex]); 03163 integers[origIndex] = *(unsigned int *)content; 03164 intType[origIndex] = extraPtr[2]; 03165 tagBufferSize += getNumericTagTypeSize(intType[origIndex]); 03166 } 03167 else 03168 { 03169 value = integers.Length(); 03170 integers.Push((int) * (unsigned int *) content); 03171 intType.push_back(extraPtr[2]); 03172 tagBufferSize += 7; 03173 } 03174 extraPtr += 7; 03175 break; 03176 case 'Z' : 03177 if(duplicate != NULL) 03178 { 03179 *duplicate += ((const char *) content); 03180 *origTag += 'Z'; 03181 *origTag += ':'; 03182 *origTag += (char*)(strings[origIndex]); 03183 tagBufferSize -= strings[origIndex].Length(); 03184 strings[origIndex] = (const char *) content; 03185 extraPtr += 4 + strings[origIndex].Length(); 03186 tagBufferSize += strings[origIndex].Length(); 03187 } 03188 else 03189 { 03190 value = strings.Length(); 03191 strings.Push((const char *) content); 03192 tagBufferSize += 4 + strings.Last().Length(); 03193 extraPtr += 4 + strings.Last().Length(); 03194 } 03195 break; 03196 case 'B' : 03197 if(duplicate != NULL) 03198 { 03199 *origTag += 'B'; 03200 *origTag += ':'; 03201 *origTag += (char*)(strings[origIndex]); 03202 tagBufferSize -= 03203 getBtagBufferSize(strings[origIndex]); 03204 int bufferSize = 03205 getStringFromBtagBuffer((unsigned char*)content, 03206 strings[origIndex]); 03207 *duplicate += (char *)(strings[origIndex]); 03208 tagBufferSize += bufferSize; 03209 extraPtr += 3 + bufferSize; 03210 } 03211 else 03212 { 03213 value = strings.Length(); 03214 String tempBTag; 03215 int bufferSize = 03216 getStringFromBtagBuffer((unsigned char*)content, 03217 tempBTag); 03218 strings.Push(tempBTag); 03219 tagBufferSize += 3 + bufferSize; 03220 extraPtr += 3 + bufferSize; 03221 } 03222 break; 03223 case 'f' : 03224 if(duplicate != NULL) 03225 { 03226 duplicate->appendFullFloat(* (float *) content); 03227 *origTag += 'f'; 03228 *origTag += ':'; 03229 origTag->appendFullFloat(floats[origIndex]); 03230 floats[origIndex] = *(float *)content; 03231 } 03232 else 03233 { 03234 value = floats.size(); 03235 floats.push_back(* (float *) content); 03236 tagBufferSize += 7; 03237 } 03238 extraPtr += 7; 03239 break; 03240 default : 03241 fprintf(stderr, 03242 "parsing BAM - Unknown custom field of type %c%c:%c\n", 03243 extraPtr[0], extraPtr[1], extraPtr[2]); 03244 fprintf(stderr, "BAM Tags: \n"); 03245 03246 unsigned char* tagInfo = myPackedQuality + myRecordPtr->myReadLength; 03247 03248 fprintf(stderr, "\n\n"); 03249 tagInfo = myPackedQuality + myRecordPtr->myReadLength; 03250 while(myRecordPtr->myBlockSize + 4 - 03251 (tagInfo - (unsigned char *)myRecordPtr) > 0) 03252 { 03253 fprintf(stderr, "%02x",tagInfo[0]); 03254 ++tagInfo; 03255 } 03256 fprintf(stderr, "\n"); 03257 03258 // Failed on read. 03259 // Increment extraPtr just by the size of the 3 known fields 03260 extraPtr += 3; 03261 myStatus.addError(SamStatus::FAIL_PARSE, 03262 "Unknown tag type."); 03263 status = false; 03264 } 03265 03266 if(duplicate != NULL) 03267 { 03268 // Duplicate tag in this record. 03269 // Tag already existed, print message about overwriting. 03270 // WARN about dropping duplicate tags. 03271 if(myNumWarns++ < myMaxWarns) 03272 { 03273 fprintf(stderr, "WARNING: Duplicate Tags, overwritting %s with %s\n", 03274 origTag->c_str(), duplicate->c_str()); 03275 if(myNumWarns == myMaxWarns) 03276 { 03277 fprintf(stderr, "Suppressing rest of Duplicate Tag warnings.\n"); 03278 } 03279 } 03280 03281 continue; 03282 } 03283 03284 // Only add the tag if it has so far been successfully processed. 03285 if(status) 03286 { 03287 // Add the tag. 03288 extras.Add(key, value); 03289 myTagBufferSize += tagBufferSize; 03290 } 03291 } 03292 return(status); 03293 } 03294 03295 03296 bool SamRecord::setTagsInBuffer() 03297 { 03298 // The buffer size extends from the start of the record to data 03299 // plus the length of the variable fields, 03300 // Multiply the cigar length by 4 since it is the number of uint32_t fields. 03301 int bamSequenceLength = (myRecordPtr->myReadLength+1)/2; 03302 int newBufferSize = ((unsigned char*)(&(myRecordPtr->myData)) - 03303 (unsigned char*)myRecordPtr) + 03304 myRecordPtr->myReadNameLength + ((myRecordPtr->myCigarLength)*4) + 03305 myRecordPtr->myReadLength + bamSequenceLength + myTagBufferSize; 03306 03307 // Make sure the buffer is big enough. 03308 if(!allocateRecordStructure(newBufferSize)) 03309 { 03310 // Failed to allocate space. 03311 return(false); 03312 } 03313 03314 char * extraPtr = (char*)myPackedQuality + myRecordPtr->myReadLength; 03315 03316 bool status = true; 03317 03318 // Set the tags in the buffer. 03319 if (extras.Entries()) 03320 { 03321 for (int i = 0; i < extras.Capacity(); i++) 03322 { 03323 if (extras.SlotInUse(i)) 03324 { 03325 int key = extras.GetKey(i); 03326 getTag(key, extraPtr); 03327 extraPtr += 2; 03328 char vtype; 03329 getTypeFromKey(key, vtype); 03330 03331 if(vtype == 'i') 03332 { 03333 vtype = getIntegerType(i); 03334 } 03335 03336 extraPtr[0] = vtype; 03337 03338 // increment the pointer to where the value is. 03339 extraPtr += 1; 03340 03341 switch (vtype) 03342 { 03343 case 'A' : 03344 *(char*)extraPtr = (char)getInteger(i); 03345 // sprintf(extraPtr, "%d", getInteger(i)); 03346 extraPtr += 1; 03347 break; 03348 case 'c' : 03349 *(int8_t*)extraPtr = (int8_t)getInteger(i); 03350 // sprintf(extraPtr, "%.4d", getInteger(i)); 03351 extraPtr += 1; 03352 break; 03353 case 'C' : 03354 *(uint8_t*)extraPtr = (uint8_t)getInteger(i); 03355 // sprintf(extraPtr, "%.4d", getInteger(i)); 03356 extraPtr += 1; 03357 break; 03358 case 's' : 03359 *(int16_t*)extraPtr = (int16_t)getInteger(i); 03360 // sprintf(extraPtr, "%.4d", getInteger(i)); 03361 extraPtr += 2; 03362 break; 03363 case 'S' : 03364 *(uint16_t*)extraPtr = (uint16_t)getInteger(i); 03365 // sprintf(extraPtr, "%.4d", getInteger(i)); 03366 extraPtr += 2; 03367 break; 03368 case 'i' : 03369 *(int32_t*)extraPtr = (int32_t)getInteger(i); 03370 // sprintf(extraPtr, "%.4d", getInteger(i)); 03371 extraPtr += 4; 03372 break; 03373 case 'I' : 03374 *(uint32_t*)extraPtr = (uint32_t)getInteger(i); 03375 // sprintf(extraPtr, "%.4d", getInteger(i)); 03376 extraPtr += 4; 03377 break; 03378 case 'Z' : 03379 sprintf(extraPtr, "%s", getString(i).c_str()); 03380 extraPtr += getString(i).Length() + 1; 03381 break; 03382 case 'B' : 03383 extraPtr += setBtagBuffer(getString(i), extraPtr); 03384 //--TODO-- Set buffer with correct B tag 03385 //sprintf(extraPtr, "%s", getString(i).c_str()); 03386 // extraPtr += getBtagBufferSize(getString(i)); 03387 break; 03388 case 'f' : 03389 *(float*)extraPtr = getFloat(i); 03390 extraPtr += 4; 03391 break; 03392 default : 03393 myStatus.addError(SamStatus::FAIL_PARSE, 03394 "Unknown tag type."); 03395 status = false; 03396 break; 03397 } 03398 } 03399 } 03400 } 03401 03402 // Validate that the extra pointer is at the end of the allocated buffer. 03403 // If not then there was a problem. 03404 if(extraPtr != (char*)myRecordPtr + newBufferSize) 03405 { 03406 fprintf(stderr, "ERROR updating the buffer. Incorrect size."); 03407 myStatus.addError(SamStatus::FAIL_PARSE, 03408 "ERROR updating the buffer. Incorrect size."); 03409 status = false; 03410 } 03411 03412 03413 // The buffer tags are now in sync. 03414 myNeedToSetTagsInBuffer = false; 03415 myIsTagsBufferValid = true; 03416 return(status); 03417 } 03418 03419 03420 // Reset the variables for a newly set buffer. The buffer must be set first 03421 // since this looks up the reference ids in the buffer to set the reference 03422 // names. 03423 void SamRecord::setVariablesForNewBuffer(SamFileHeader& header) 03424 { 03425 // Lookup the reference name & mate reference name associated with this 03426 // record. 03427 myReferenceName = 03428 header.getReferenceLabel(myRecordPtr->myReferenceID); 03429 myMateReferenceName = 03430 header.getReferenceLabel(myRecordPtr->myMateReferenceID); 03431 03432 // Clear the SAM Strings that are now not in-sync with the buffer. 03433 myReadName.SetLength(0); 03434 myCigar.SetLength(0); 03435 mySequence.SetLength(0); 03436 mySeqWithEq.clear(); 03437 mySeqWithoutEq.clear(); 03438 myQuality.SetLength(0); 03439 myNeedToSetTagsFromBuffer = true; 03440 myNeedToSetTagsInBuffer = false; 03441 03442 //Set that the buffer is valid. 03443 myIsBufferSynced = true; 03444 // Set that the variable length buffer fields are valid. 03445 myIsReadNameBufferValid = true; 03446 myIsCigarBufferValid = true; 03447 myPackedSequence = (unsigned char *)myRecordPtr->myData + 03448 myRecordPtr->myReadNameLength + myRecordPtr->myCigarLength * sizeof(int); 03449 myIsSequenceBufferValid = true; 03450 myBufferSequenceTranslation = NONE; 03451 myPackedQuality = myPackedSequence + 03452 (myRecordPtr->myReadLength + 1) / 2; 03453 myIsQualityBufferValid = true; 03454 myIsTagsBufferValid = true; 03455 myIsBinValid = true; 03456 } 03457 03458 03459 // Extract the vtype from the key. 03460 void SamRecord::getTypeFromKey(int key, char& type) const 03461 { 03462 // Extract the type from the key. 03463 type = (key >> 16) & 0xFF; 03464 } 03465 03466 03467 // Extract the tag from the key. 03468 void SamRecord::getTag(int key, char* tag) const 03469 { 03470 // Extract the tag from the key. 03471 tag[0] = key & 0xFF; 03472 tag[1] = (key >> 8) & 0xFF; 03473 tag[2] = 0; 03474 } 03475 03476 03477 // Index is the index into the strings array. 03478 String & SamRecord::getString(int index) 03479 { 03480 int value = extras[index]; 03481 03482 return strings[value]; 03483 } 03484 03485 int & SamRecord::getInteger(int offset) 03486 { 03487 int value = extras[offset]; 03488 03489 return integers[value]; 03490 } 03491 03492 const char & SamRecord::getIntegerType(int offset) const 03493 { 03494 int value = extras[offset]; 03495 03496 return intType[value]; 03497 } 03498 03499 float & SamRecord::getFloat(int offset) 03500 { 03501 int value = extras[offset]; 03502 03503 return floats[value]; 03504 } 03505 03506 03507 void SamRecord::appendIntArrayValue(char type, int value, String& strVal) const 03508 { 03509 switch(type) 03510 { 03511 case 'A': 03512 strVal += (char)value; 03513 break; 03514 case 'c': 03515 case 's': 03516 case 'i': 03517 case 'C': 03518 case 'S': 03519 case 'I': 03520 strVal += value; 03521 break; 03522 default: 03523 // Do nothing. 03524 ; 03525 } 03526 } 03527 03528 03529 int SamRecord::getBtagBufferSize(String& tagStr) 03530 { 03531 if(tagStr.Length() < 1) 03532 { 03533 // ERROR, needs at least the type. 03534 myStatus.addError(SamStatus::FAIL_PARSE, 03535 "SamRecord::getBtagBufferSize no tag subtype specified"); 03536 return(0); 03537 } 03538 char type = tagStr[0]; 03539 int elementSize = getNumericTagTypeSize(type); 03540 if(elementSize <= 0) 03541 { 03542 // ERROR, 'B' tag subtype must be numeric, so should be non-zero 03543 String errorMsg = "SamRecord::getBtagBufferSize invalid tag subtype, "; 03544 errorMsg += type; 03545 myStatus.addError(SamStatus::FAIL_PARSE, errorMsg.c_str()); 03546 return(0); 03547 } 03548 03549 // Separated by ',', so count the number of commas. 03550 int numElements = 0; 03551 int index = tagStr.FastFindChar(',', 0); 03552 while(index > 0) 03553 { 03554 ++numElements; 03555 index = tagStr.FastFindChar(',', index+1); 03556 } 03557 // Add 5 bytes: type & numElements 03558 return(numElements * elementSize + 5); 03559 } 03560 03561 03562 int SamRecord::setBtagBuffer(String& tagStr, char* extraPtr) 03563 { 03564 if(tagStr.Length() < 1) 03565 { 03566 // ERROR, needs at least the type. 03567 myStatus.addError(SamStatus::FAIL_PARSE, 03568 "SamRecord::getBtagBufferSize no tag subtype specified"); 03569 return(0); 03570 } 03571 char type = tagStr[0]; 03572 int elementSize = getNumericTagTypeSize(type); 03573 if(elementSize <= 0) 03574 { 03575 // ERROR, 'B' tag subtype must be numeric, so should be non-zero 03576 String errorMsg = "SamRecord::getBtagBufferSize invalid tag subtype, "; 03577 errorMsg += type; 03578 myStatus.addError(SamStatus::FAIL_PARSE, errorMsg.c_str()); 03579 return(0); 03580 } 03581 03582 int totalInc = 0; 03583 // Write the type. 03584 *(char*)extraPtr = type; 03585 ++extraPtr; 03586 ++totalInc; 03587 03588 // Get the number of elements by counting ','s 03589 uint32_t numElements = 0; 03590 int index = tagStr.FastFindChar(',', 0); 03591 while(index > 0) 03592 { 03593 ++numElements; 03594 index = tagStr.FastFindChar(',', index+1); 03595 } 03596 *(uint32_t*)extraPtr = numElements; 03597 extraPtr += 4; 03598 totalInc += 4; 03599 03600 const char* stringPtr = tagStr.c_str(); 03601 const char* endPtr = stringPtr + tagStr.Length(); 03602 // increment past the subtype and ','. 03603 stringPtr += 2; 03604 03605 char* newPtr = NULL; 03606 while(stringPtr < endPtr) 03607 { 03608 switch(type) 03609 { 03610 case 'f': 03611 *(float*)extraPtr = (float)(strtod(stringPtr, &newPtr)); 03612 break; 03613 case 'c': 03614 *(int8_t*)extraPtr = (int8_t)strtol(stringPtr, &newPtr, 0); 03615 break; 03616 case 's': 03617 *(int16_t*)extraPtr = (int16_t)strtol(stringPtr, &newPtr, 0); 03618 break; 03619 case 'i': 03620 *(int32_t*)extraPtr = (int32_t)strtol(stringPtr, &newPtr, 0); 03621 break; 03622 case 'C': 03623 *(uint8_t*)extraPtr = (uint8_t)strtoul(stringPtr, &newPtr, 0); 03624 break; 03625 case 'S': 03626 *(uint16_t*)extraPtr = (uint16_t)strtoul(stringPtr, &newPtr, 0); 03627 break; 03628 case 'I': 03629 *(uint32_t*)extraPtr = (uint32_t)strtoul(stringPtr, &newPtr, 0); 03630 break; 03631 default : 03632 myStatus.addError(SamStatus::FAIL_PARSE, 03633 "Unknown 'B' tag subtype."); 03634 break; 03635 } 03636 extraPtr += elementSize; 03637 totalInc += elementSize; 03638 stringPtr = newPtr + 1; // skip the ',' 03639 } 03640 return(totalInc); 03641 } 03642 03643 03644 int SamRecord::getStringFromBtagBuffer(unsigned char* buffer, 03645 String& tagStr) 03646 { 03647 tagStr.Clear(); 03648 03649 int bufferSize = 0; 03650 03651 // 1st byte is the type. 03652 char type = *buffer; 03653 ++buffer; 03654 ++bufferSize; 03655 tagStr = type; 03656 03657 // 2nd-5th bytes are the size 03658 unsigned int numEntries = *(unsigned int *)buffer; 03659 buffer += 4; 03660 bufferSize += 4; 03661 // Num Entries is not included in the string. 03662 03663 int subtypeSize = getNumericTagTypeSize(type); 03664 03665 for(unsigned int i = 0; i < numEntries; i++) 03666 { 03667 tagStr += ','; 03668 switch(type) 03669 { 03670 case 'f': 03671 tagStr.appendFullFloat(*(float *)buffer); 03672 break; 03673 case 'c': 03674 tagStr += *(int8_t *)buffer; 03675 break; 03676 case 's': 03677 tagStr += *(int16_t *)buffer; 03678 break; 03679 case 'i': 03680 tagStr += *(int32_t *)buffer; 03681 break; 03682 case 'C': 03683 tagStr += *(uint8_t *)buffer; 03684 break; 03685 case 'S': 03686 tagStr += *(uint16_t *)buffer; 03687 break; 03688 case 'I': 03689 tagStr += *(uint32_t *)buffer; 03690 break; 03691 default : 03692 myStatus.addError(SamStatus::FAIL_PARSE, 03693 "Unknown 'B' tag subtype."); 03694 break; 03695 } 03696 buffer += subtypeSize; 03697 bufferSize += subtypeSize; 03698 } 03699 return(bufferSize); 03700 }