SamRecord.cpp

00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #include <stdlib.h>
00019 #include <limits>
00020 #include <stdexcept>
00021 
00022 #include "bam.h"
00023 
00024 #include "SamRecord.h"
00025 #include "SamValidation.h"
00026 
00027 #include "BaseUtilities.h"
00028 #include "SamQuerySeqWithRefHelper.h"
00029 
00030 const char* SamRecord::DEFAULT_READ_NAME = "UNKNOWN";
00031 const char* SamRecord::FIELD_ABSENT_STRING = "=";
00032 
00033 
00034 SamRecord::SamRecord()
00035     : myStatus(),
00036       myRefPtr(NULL),
00037       mySequenceTranslation(NONE)
00038 {
00039     int32_t defaultAllocSize = DEFAULT_BLOCK_SIZE + sizeof(int32_t);
00040 
00041     myRecordPtr = 
00042         (bamRecordStruct *) malloc(defaultAllocSize);
00043 
00044     myCigarTempBuffer = NULL;
00045     myCigarTempBufferAllocatedSize = 0;
00046 
00047     allocatedSize = defaultAllocSize;
00048 
00049     resetRecord();
00050 }
00051 
00052 
00053 SamRecord::SamRecord(ErrorHandler::HandlingType errorHandlingType)
00054     : myStatus(errorHandlingType),
00055       myRefPtr(NULL),
00056       mySequenceTranslation(NONE)
00057 {
00058     int32_t defaultAllocSize = DEFAULT_BLOCK_SIZE + sizeof(int32_t);
00059 
00060     myRecordPtr = 
00061         (bamRecordStruct *) malloc(defaultAllocSize);
00062 
00063     myCigarTempBuffer = NULL;
00064     myCigarTempBufferAllocatedSize = 0;
00065 
00066     allocatedSize = defaultAllocSize;
00067 
00068     resetRecord();
00069 }
00070 
00071 
00072 SamRecord::~SamRecord()
00073 {
00074     resetRecord();
00075 
00076     if(myRecordPtr != NULL)
00077     {
00078         free(myRecordPtr);
00079         myRecordPtr = NULL;
00080     }
00081     if(myCigarTempBuffer != NULL)
00082     {
00083         free(myCigarTempBuffer);
00084         myCigarTempBuffer = NULL;
00085         myCigarTempBufferAllocatedSize = 0;
00086     }
00087 }
00088 
00089 
00090 // Resets the fields of the record to a default value.
00091 void SamRecord::resetRecord()
00092 {
00093     myIsBufferSynced = true;
00094 
00095     myRecordPtr->myBlockSize = DEFAULT_BLOCK_SIZE;
00096     myRecordPtr->myReferenceID = -1;
00097     myRecordPtr->myPosition = -1;
00098     myRecordPtr->myReadNameLength = DEFAULT_READ_NAME_LENGTH;
00099     myRecordPtr->myMapQuality = 0;
00100     myRecordPtr->myBin = DEFAULT_BIN;
00101     myRecordPtr->myCigarLength = 0;
00102     myRecordPtr->myFlag = 0;
00103     myRecordPtr->myReadLength = 0;
00104     myRecordPtr->myMateReferenceID = -1;
00105     myRecordPtr->myMatePosition = -1;
00106     myRecordPtr->myInsertSize = 0;
00107    
00108     // Set the sam values for the variable length fields.
00109     // TODO - one way to speed this up might be to not set to "*" and just
00110     // clear them, and write out a '*' for SAM if it is empty.
00111     myReadName = DEFAULT_READ_NAME;
00112     myReferenceName = "*";
00113     myMateReferenceName = "*";
00114     myCigar = "*";
00115     mySequence = "*";
00116     mySeqWithEq.clear();
00117     mySeqWithoutEq.clear();
00118     myQuality = "*";
00119     myNeedToSetTagsFromBuffer = false;
00120     myNeedToSetTagsInBuffer = false;
00121 
00122     // Initialize the calculated alignment info to the uncalculated value.
00123     myAlignmentLength = -1;
00124     myUnclippedStartOffset = -1;
00125     myUnclippedEndOffset = -1;
00126 
00127     clearTags();
00128 
00129     // Set the bam values for the variable length fields.
00130     // Only the read name needs to be set, the others are a length of 0.
00131     // Set the read name.  The min size of myRecordPtr includes the size for
00132     // the default read name.
00133     memcpy(&(myRecordPtr->myData), myReadName.c_str(), 
00134            myRecordPtr->myReadNameLength);
00135 
00136     // Set that the variable length buffer fields are valid.
00137     myIsReadNameBufferValid = true;
00138     myIsCigarBufferValid = true;
00139     myIsSequenceBufferValid = true;
00140     myBufferSequenceTranslation = NONE;
00141     myIsQualityBufferValid = true;
00142     myIsTagsBufferValid = true;
00143     myIsBinValid = true;
00144 
00145     myCigarTempBufferLength = -1;
00146 
00147     myStatus = SamStatus::SUCCESS;
00148 
00149     NOT_FOUND_TAG_STRING = "";
00150     NOT_FOUND_TAG_INT = -1;
00151     NOT_FOUND_TAG_DOUBLE = -1;
00152 }
00153 
00154 
00155 // Reset the tag iterator to the beginning of the tags.
00156 void SamRecord::resetTagIter()
00157 {
00158     myLastTagIndex = -1;
00159 }
00160 
00161 
00162 // Returns whether or not the record is valid.
00163 // Header is needed to perform some validation against it.
00164 bool SamRecord::isValid(SamFileHeader& header)
00165 {
00166     myStatus = SamStatus::SUCCESS;
00167     SamValidationErrors invalidSamErrors;
00168     if(!SamValidator::isValid(header, *this, invalidSamErrors))
00169     {
00170         // The record is not valid.
00171         std::string errorMessage = "";
00172         invalidSamErrors.getErrorString(errorMessage);
00173         myStatus.setStatus(SamStatus::INVALID, errorMessage.c_str());
00174         return(false);
00175     }
00176     // The record is valid.
00177     return(true);
00178 }
00179 
00180 
00181 // Read the BAM record from a file.
00182 SamStatus::Status SamRecord::setBufferFromFile(IFILE filePtr, 
00183                                                SamFileHeader& header)
00184 {
00185     myStatus = SamStatus::SUCCESS;
00186     if((filePtr == NULL) || (filePtr->isOpen() == false))
00187     {
00188         // File is not open, return failure.
00189         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00190                            "Can't read from an unopened file.");
00191         return(SamStatus::FAIL_ORDER);
00192     }
00193 
00194     // Clear the record.
00195     resetRecord();
00196 
00197     // read the record size.
00198     int numBytes = 
00199         ifread(filePtr, &(myRecordPtr->myBlockSize), sizeof(int32_t));
00200 
00201     if(ifeof(filePtr))
00202     {
00203         if(numBytes == 0)
00204         {
00205             // End of file, nothing was read, no more records.
00206             myStatus.setStatus(SamStatus::NO_MORE_RECS,
00207                                "No more records left to read.");
00208             return(SamStatus::NO_MORE_RECS);
00209         }
00210         else
00211         {
00212             // Error: end of the file reached prior to reading the rest of the
00213             // record.
00214             myStatus.setStatus(SamStatus::FAIL_PARSE, 
00215                                "EOF reached in the middle of a record.");
00216             return(SamStatus::FAIL_PARSE);
00217         }
00218     }
00219 
00220     // allocate space for the record size.
00221     if(!allocateRecordStructure(myRecordPtr->myBlockSize + sizeof(int32_t)))
00222     {
00223         // Failed to allocate space.
00224         // Status is set by allocateRecordStructure.
00225         return(SamStatus::FAIL_MEM);
00226     }
00227 
00228     // Read the rest of the alignment block, starting at the reference id.
00229     if(ifread(filePtr, &(myRecordPtr->myReferenceID), myRecordPtr->myBlockSize)
00230        != (unsigned int)myRecordPtr->myBlockSize)
00231     {
00232         // Error reading the record.  Reset it and return failure.
00233         resetRecord();
00234         myStatus.setStatus(SamStatus::FAIL_IO,
00235                            "Failed to read the record");
00236         return(SamStatus::FAIL_IO);
00237     }
00238 
00239     setVariablesForNewBuffer(header);
00240 
00241     // Return the status of the record.
00242     return(SamStatus::SUCCESS);
00243 }
00244 
00245 
00246 void SamRecord::setReference(GenomeSequence* reference)
00247 {
00248     myRefPtr = reference;
00249 }
00250 
00251 
00252 // Set the type of sequence translation to use when getting
00253 // the sequence.  The default type (if this method is never called) is
00254 // NONE (the sequence is left as-is).  This is used 
00255 void SamRecord::setSequenceTranslation(SequenceTranslation translation)
00256 {
00257     mySequenceTranslation = translation;
00258 }
00259 
00260 
00261 bool SamRecord::setReadName(const char* readName) 
00262 {
00263     myReadName = readName;
00264     myIsBufferSynced = false;
00265     myIsReadNameBufferValid = false;
00266     myStatus = SamStatus::SUCCESS;
00267 
00268     // The read name must at least have some length, otherwise this is a parsing
00269     // error.
00270     if(myReadName.Length() == 0)
00271     {
00272         // Invalid - reset ReadName return false.
00273         myReadName = DEFAULT_READ_NAME;
00274         myRecordPtr->myReadNameLength = DEFAULT_READ_NAME_LENGTH;
00275         myStatus.setStatus(SamStatus::INVALID, "0 length Query Name.");
00276         return(false);
00277     }
00278 
00279     return true;
00280 }
00281 
00282 
00283 bool SamRecord::setFlag(uint16_t flag)
00284 {
00285     myStatus = SamStatus::SUCCESS;
00286     myRecordPtr->myFlag = flag;
00287     return true;
00288 }
00289 
00290 
00291 bool SamRecord::setReferenceName(SamFileHeader& header,
00292                                  const char* referenceName) 
00293 {
00294     myStatus = SamStatus::SUCCESS;
00295 
00296     myReferenceName = referenceName;
00297     myRecordPtr->myReferenceID = header.getReferenceID(referenceName);
00298 
00299     return true;
00300 }
00301 
00302 
00303 bool SamRecord::set1BasedPosition(int32_t position) 
00304 {
00305     return(set0BasedPosition(position - 1));
00306 }
00307 
00308 
00309 bool SamRecord::set0BasedPosition(int32_t position)
00310 {
00311     myStatus = SamStatus::SUCCESS;
00312     myRecordPtr->myPosition = position;
00313     myIsBinValid = false;
00314     return true;
00315 }
00316 
00317 
00318 bool SamRecord::setMapQuality(uint8_t mapQuality)
00319 {
00320     myStatus = SamStatus::SUCCESS;
00321     myRecordPtr->myMapQuality = mapQuality;
00322     return true;
00323 }
00324 
00325 
00326 bool SamRecord::setCigar(const char* cigar)
00327 {
00328     myStatus = SamStatus::SUCCESS;
00329     myCigar = cigar;
00330  
00331     myIsBufferSynced = false;
00332     myIsCigarBufferValid = false;
00333     myCigarTempBufferLength = -1;
00334     myIsBinValid = false;
00335 
00336     // Initialize the calculated alignment info to the uncalculated value.
00337     myAlignmentLength = -1;
00338     myUnclippedStartOffset = -1;
00339     myUnclippedEndOffset = -1;
00340 
00341     return true;
00342 }
00343 
00344 
00345 bool SamRecord::setCigar(const Cigar& cigar)
00346 {
00347     myStatus = SamStatus::SUCCESS;
00348     cigar.getCigarString(myCigar);
00349  
00350     myIsBufferSynced = false;
00351     myIsCigarBufferValid = false;
00352     myCigarTempBufferLength = -1;
00353     myIsBinValid = false;
00354 
00355     // Initialize the calculated alignment info to the uncalculated value.
00356     myAlignmentLength = -1;
00357     myUnclippedStartOffset = -1;
00358     myUnclippedEndOffset = -1;
00359 
00360     return true;
00361 }
00362 
00363 
00364 bool SamRecord::setMateReferenceName(SamFileHeader& header,
00365                                      const char* mateReferenceName) 
00366 {
00367     myStatus = SamStatus::SUCCESS;
00368     // Set the mate reference, if it is "=", set it to be equal
00369     // to myReferenceName.  This assumes that myReferenceName has already
00370     // been called.
00371     if(strcmp(mateReferenceName, FIELD_ABSENT_STRING) == 0)
00372     {
00373         myMateReferenceName = myReferenceName;
00374     }
00375     else
00376     {
00377         myMateReferenceName = mateReferenceName;
00378     }
00379 
00380     // Set the Mate Reference ID.
00381     myRecordPtr->myMateReferenceID = 
00382         header.getReferenceID(myMateReferenceName);
00383 
00384     return true;
00385 }
00386 
00387 
00388 bool SamRecord::set1BasedMatePosition(int32_t matePosition)
00389 {
00390     return(set0BasedMatePosition(matePosition - 1));
00391 }
00392 
00393 
00394 bool SamRecord::set0BasedMatePosition(int32_t matePosition)
00395 {
00396     myStatus = SamStatus::SUCCESS;
00397     myRecordPtr->myMatePosition = matePosition;
00398     return true;
00399 }
00400 
00401 
00402 bool SamRecord::setInsertSize(int32_t insertSize)
00403 {
00404     myStatus = SamStatus::SUCCESS;
00405     myRecordPtr->myInsertSize = insertSize;
00406     return true;
00407 }
00408 
00409 
00410 bool SamRecord::setSequence(const char* seq) 
00411 {
00412     myStatus = SamStatus::SUCCESS;
00413     mySequence = seq;
00414     mySeqWithEq.clear();
00415     mySeqWithoutEq.clear();
00416    
00417     myIsBufferSynced = false;
00418     myIsSequenceBufferValid = false;
00419     return true;
00420 }
00421 
00422 
00423 bool SamRecord::setQuality(const char* quality) 
00424 {
00425     myStatus = SamStatus::SUCCESS;
00426     myQuality = quality;
00427     myIsBufferSynced = false;
00428     myIsQualityBufferValid = false;
00429     return true;
00430 }
00431 
00432 
00433 // Set the BAM record from the passeed in buffer of the specified size.
00434 // Note: The size includes the block size.
00435 SamStatus::Status SamRecord::setBuffer(const char* fromBuffer,
00436                                        uint32_t fromBufferSize,
00437                                        SamFileHeader& header)
00438 {
00439     myStatus = SamStatus::SUCCESS;
00440     if((fromBuffer == NULL) || (fromBufferSize == 0))
00441     {
00442         // Buffer is empty.
00443         myStatus.setStatus(SamStatus::FAIL_PARSE,
00444                            "Cannot parse an empty file.");
00445         return(SamStatus::FAIL_PARSE);
00446     }
00447 
00448     // Clear the record.   
00449     resetRecord();
00450 
00451     // allocate space for the record size.
00452     if(!allocateRecordStructure(fromBufferSize))
00453     {
00454         // Failed to allocate space.
00455         return(SamStatus::FAIL_MEM);
00456     }
00457    
00458     memcpy(myRecordPtr, fromBuffer, fromBufferSize);
00459 
00460     setVariablesForNewBuffer(header);
00461 
00462     // Return the status of the record.
00463     return(SamStatus::SUCCESS);
00464 }
00465 
00466 
00467 // Add the specified tag to the record.
00468 // Returns true if the tag was successfully added, false otherwise.
00469 bool SamRecord::addTag(const char* tag, char vtype, const char* valuePtr)
00470 {
00471     myStatus = SamStatus::SUCCESS;
00472     bool status = true; // default to successful.
00473     int key = 0;
00474     int intVal = 0;
00475     int index = 0;
00476     char bamvtype;
00477 
00478     int tagBufferSize = 0;
00479 
00480     // First check to see if the tags need to be synced to the buffer.
00481     if(myNeedToSetTagsFromBuffer)
00482     {
00483         if(!setTagsFromBuffer())
00484         {
00485             // Failed to read tags from the buffer, so cannot add new ones.
00486             return(false);
00487         }
00488     }
00489 
00490     switch (vtype)
00491     {
00492         case 'A' :
00493             index = integers.Length();
00494             bamvtype = vtype;
00495             integers.Push((const int)*(valuePtr));
00496             tagBufferSize += 4;
00497             break;
00498         case 'i' :
00499             index = integers.Length();
00500             intVal = atoi((const char *)valuePtr);
00501             // Ints come in as int.  But it can be represented in fewer bits.
00502             // So determine a more specific type that is in line with the
00503             // types for BAM files.
00504             // First check to see if it is a negative.
00505             if(intVal < 0)
00506             {
00507                 // The int is negative, so it will need to use a signed type.
00508                 // See if it is greater than the min value for a char.
00509                 if(intVal > std::numeric_limits<char>::min())
00510                 {
00511                     // It can be stored in a signed char.
00512                     bamvtype = 'c';
00513                     tagBufferSize += 4;
00514                 }
00515                 else if(intVal > std::numeric_limits<short>::min())
00516                 {
00517                     // It fits in a signed short.
00518                     bamvtype = 's';
00519                     tagBufferSize += 5;
00520                 }
00521                 else
00522                 {
00523                     // Just store it as a signed int.
00524                     bamvtype = 'i';
00525                     tagBufferSize += 7;
00526                 }
00527             }
00528             else
00529             {
00530                 // It is positive, so an unsigned type can be used.
00531                 if(intVal < std::numeric_limits<unsigned char>::max())
00532                 {
00533                     // It is under the max of an unsigned char.
00534                     bamvtype = 'C';
00535                     tagBufferSize += 4;
00536                 }
00537                 else if(intVal < std::numeric_limits<unsigned short>::max())
00538                 {
00539                     // It is under the max of an unsigned short.
00540                     bamvtype = 'S';
00541                     tagBufferSize += 5;
00542                 }
00543                 else
00544                 {
00545                     // Just store it as an unsigned int.
00546                     bamvtype = 'I';
00547                     tagBufferSize += 7;
00548                 }
00549             }
00550             integers.Push(intVal);
00551             break;
00552         case 'Z' :
00553             index = strings.Length();
00554             bamvtype = vtype;
00555             strings.Push((const char *)valuePtr);
00556             tagBufferSize += 4 + strings.Last().Length();
00557             break;
00558         case 'f' :
00559             index = doubles.Length();
00560             bamvtype = vtype;
00561             doubles.Push(atof((const char *)valuePtr));
00562             tagBufferSize += 7;
00563             break;
00564         default :
00565             fprintf(stderr,
00566                     "samFile::ReadSAM() - Unknown custom field of type %c\n",
00567                     vtype);
00568             myStatus.setStatus(SamStatus::FAIL_PARSE, 
00569                                "Unknown custom field in a tag");
00570             status = false;
00571     }
00572 
00573     // Only add the tag if it has so far been successfully processed.
00574     if(status)
00575     {
00576         // The buffer tags are now out of sync.
00577         myNeedToSetTagsInBuffer = true;
00578         myIsTagsBufferValid = false;
00579         myIsBufferSynced = false;
00580 
00581         key = MAKEKEY(tag[0], tag[1], bamvtype);
00582         extras.Add(key, index);
00583         myTagBufferSize += tagBufferSize;
00584     }
00585     return(status);
00586 }
00587 
00588 
00589 // Get methods for record fields.
00590 const void* SamRecord::getRecordBuffer()
00591 {
00592     return(getRecordBuffer(mySequenceTranslation));
00593 }
00594 
00595 
00596 // Get methods for record fields.
00597 const void* SamRecord::getRecordBuffer(SequenceTranslation translation) 
00598 {
00599     myStatus = SamStatus::SUCCESS;
00600     bool status = true;
00601     // If the buffer is not synced or the sequence in the buffer is not
00602     // properly translated, fix the buffer.
00603     if((myIsBufferSynced == false) ||
00604        (myBufferSequenceTranslation != translation))
00605     {
00606         status &= fixBuffer(translation);
00607     }
00608     // If the buffer is synced, check to see if the tags need to be synced.
00609     if(myNeedToSetTagsInBuffer)
00610     {
00611         status &= setTagsInBuffer();
00612     }
00613     if(!status)
00614     {
00615         return(NULL);
00616     }
00617     return (const void *)myRecordPtr;
00618 }
00619 
00620 
00621 // Write the record as a buffer into the file using the class's 
00622 // sequence translation setting.
00623 SamStatus::Status SamRecord::writeRecordBuffer(IFILE filePtr)
00624 {
00625     return(writeRecordBuffer(filePtr, mySequenceTranslation));
00626 }
00627 
00628 
00629 // Write the record as a buffer into the file using the specified translation.
00630 SamStatus::Status SamRecord::writeRecordBuffer(IFILE filePtr, 
00631                                                SequenceTranslation translation)
00632 {
00633     myStatus = SamStatus::SUCCESS;
00634     if((filePtr == NULL) || (filePtr->isOpen() == false))
00635     {
00636         // File is not open, return failure.
00637         myStatus.setStatus(SamStatus::FAIL_ORDER,
00638                            "Can't write to an unopened file.");
00639         return(SamStatus::FAIL_ORDER);
00640     }
00641 
00642     if((myIsBufferSynced == false) ||
00643        (myBufferSequenceTranslation != translation))
00644     {
00645         if(!fixBuffer(translation))
00646         {
00647             return(myStatus.getStatus());
00648         }
00649     }
00650 
00651     // Write the record.
00652     unsigned int numBytesToWrite = myRecordPtr->myBlockSize + sizeof(int32_t);
00653     unsigned int numBytesWritten = 
00654         ifwrite(filePtr, myRecordPtr, numBytesToWrite);
00655 
00656     // Return status based on if the correct number of bytes were written.
00657     if(numBytesToWrite == numBytesWritten)
00658     {
00659         return(SamStatus::SUCCESS);
00660     }
00661     // The correct number of bytes were not written.
00662     myStatus.setStatus(SamStatus::FAIL_IO, "Failed to write the entire record.");
00663     return(SamStatus::FAIL_IO);
00664 }
00665 
00666 
00667 int32_t SamRecord::getBlockSize() 
00668 {
00669     myStatus = SamStatus::SUCCESS;
00670     // If the buffer isn't synced, sync the buffer to determine the
00671     // block size.
00672     if(myIsBufferSynced == false)
00673     {
00674         // Since this just returns the block size, the translation of
00675         // the sequence does not matter, so just use the currently set
00676         // value.
00677         fixBuffer(myBufferSequenceTranslation);
00678     }
00679     return myRecordPtr->myBlockSize;
00680 }
00681 
00682 
00683 // This method returns the reference name.
00684 const char* SamRecord::getReferenceName()
00685 {
00686     myStatus = SamStatus::SUCCESS;
00687     return myReferenceName.c_str();
00688 }
00689 
00690 
00691 int32_t SamRecord::getReferenceID()
00692 {
00693     myStatus = SamStatus::SUCCESS;
00694     return myRecordPtr->myReferenceID;
00695 }
00696 
00697 
00698 int32_t SamRecord::get1BasedPosition()
00699 {
00700     myStatus = SamStatus::SUCCESS;
00701     return (myRecordPtr->myPosition + 1);
00702 }
00703 
00704 
00705 int32_t SamRecord::get0BasedPosition()
00706 {
00707     myStatus = SamStatus::SUCCESS;
00708     return myRecordPtr->myPosition;
00709 }
00710 
00711 
00712 uint8_t SamRecord::getReadNameLength() 
00713 {
00714     myStatus = SamStatus::SUCCESS;
00715     // If the buffer is valid, return the size from there, otherwise get the 
00716     // size from the string length + 1 (ending null).
00717     if(myIsReadNameBufferValid)
00718     {
00719         return(myRecordPtr->myReadNameLength);
00720     }
00721    
00722     return(myReadName.Length() + 1);
00723 }
00724 
00725 
00726 uint8_t SamRecord::getMapQuality()
00727 {
00728     myStatus = SamStatus::SUCCESS;
00729     return myRecordPtr->myMapQuality;
00730 }
00731 
00732 
00733 uint16_t SamRecord::getBin()
00734 {
00735     myStatus = SamStatus::SUCCESS;
00736     if(!myIsBinValid)
00737     {
00738         // The bin that is set in the record is not valid, so
00739         // reset it.
00740         myRecordPtr->myBin = 
00741             bam_reg2bin(myRecordPtr->myPosition, get1BasedAlignmentEnd());      
00742         myIsBinValid = true;
00743     }
00744     return(myRecordPtr->myBin);
00745 }
00746 
00747 
00748 uint16_t SamRecord::getCigarLength()
00749 {
00750     myStatus = SamStatus::SUCCESS;
00751     // If the cigar buffer is valid
00752     // then get the length from there.
00753     if(myIsCigarBufferValid)
00754     {
00755         return myRecordPtr->myCigarLength;      
00756     }
00757 
00758     if(myCigarTempBufferLength == -1)
00759     {
00760         // The cigar buffer is not valid and the cigar temp buffer is not set,
00761         // so parse the string.
00762         parseCigarString();
00763     }
00764    
00765     // The temp buffer is now set, so return the size.
00766     return(myCigarTempBufferLength);
00767 }
00768 
00769 
00770 uint16_t SamRecord::getFlag()
00771 {
00772     myStatus = SamStatus::SUCCESS;
00773     return myRecordPtr->myFlag;
00774 }
00775 
00776 
00777 int32_t SamRecord::getReadLength() 
00778 {
00779     myStatus = SamStatus::SUCCESS;
00780     if(myIsSequenceBufferValid == false)
00781     {
00782         // If the sequence is "*", then return 0.
00783         if((mySequence.Length() == 1) && (mySequence[0] == '*'))
00784         {
00785             return(0);
00786         }
00787         // Do not add 1 since it is not null terminated.
00788         return(mySequence.Length());
00789     }
00790     return(myRecordPtr->myReadLength);
00791 }
00792 
00793 
00794 // This method returns the mate reference name.  If it is equal to the
00795 // reference name, it still returns the reference name.
00796 const char* SamRecord::getMateReferenceName()
00797 {
00798     myStatus = SamStatus::SUCCESS;
00799     return myMateReferenceName.c_str();
00800 }
00801 
00802 
00803 // This method returns the mate reference name.  If it is equal to the
00804 // reference name, it returns "=", unless they are both "*" in which case
00805 // "*" is returned.
00806 const char* SamRecord::getMateReferenceNameOrEqual()
00807 {
00808     myStatus = SamStatus::SUCCESS;
00809     if(myMateReferenceName == "*")
00810     {
00811         return(myMateReferenceName);
00812     }
00813     if(myMateReferenceName == getReferenceName())
00814     {
00815         return(FIELD_ABSENT_STRING);
00816     }
00817     else
00818     {
00819         return(myMateReferenceName);
00820     }
00821 }
00822 
00823 
00824 int32_t SamRecord::getMateReferenceID()
00825 {
00826     myStatus = SamStatus::SUCCESS;
00827     return myRecordPtr->myMateReferenceID;
00828 }
00829 
00830 
00831 int32_t SamRecord::get1BasedMatePosition()
00832 {
00833     myStatus = SamStatus::SUCCESS;
00834     return (myRecordPtr->myMatePosition + 1);
00835 }
00836 
00837 
00838 int32_t SamRecord::get0BasedMatePosition()
00839 {
00840     myStatus = SamStatus::SUCCESS;
00841     return myRecordPtr->myMatePosition;
00842 }
00843 
00844 
00845 int32_t SamRecord::getInsertSize()
00846 {
00847     myStatus = SamStatus::SUCCESS;
00848     return myRecordPtr->myInsertSize;
00849 }
00850 
00851 
00852 // Returns the inclusive rightmost position of the clipped sequence.
00853 int32_t SamRecord::get0BasedAlignmentEnd()
00854 {
00855     myStatus = SamStatus::SUCCESS;
00856     if(myAlignmentLength == -1)
00857     {
00858         // Alignment end has not been set, so calculate it.
00859         parseCigar();
00860     }
00861     // If alignment length > 0, subtract 1 from it to get the end.
00862     if(myAlignmentLength == 0)
00863     {
00864         // Length is 0, just return the start position.
00865         return(myRecordPtr->myPosition);
00866     }
00867     return(myRecordPtr->myPosition + myAlignmentLength - 1);
00868 }
00869 
00870 
00871 // Returns the inclusive rightmost position of the clipped sequence.
00872 int32_t SamRecord::get1BasedAlignmentEnd()
00873 {
00874     return(get0BasedAlignmentEnd() + 1);
00875 }
00876 
00877    
00878 // Return the length of the alignment.
00879 int32_t SamRecord::getAlignmentLength()
00880 {
00881     myStatus = SamStatus::SUCCESS;
00882     if(myAlignmentLength == -1)
00883     {
00884         // Alignment end has not been set, so calculate it.
00885         parseCigar();
00886     }
00887     // Return the alignment length.
00888     return(myAlignmentLength);
00889 }
00890 
00891 // Returns the inclusive left-most position adjust for clipped bases.
00892 int32_t SamRecord::get0BasedUnclippedStart()
00893 {
00894     myStatus = SamStatus::SUCCESS;
00895     if(myUnclippedStartOffset == -1)
00896     {
00897         // Unclipped has not yet been calculated, so parse the cigar to get it
00898         parseCigar();
00899     }
00900     return(myRecordPtr->myPosition - myUnclippedStartOffset);
00901 }
00902 
00903 
00904 // Returns the inclusive left-most position adjust for clipped bases.
00905 int32_t SamRecord::get1BasedUnclippedStart()
00906 {
00907     return(get0BasedUnclippedStart() + 1);
00908 }
00909 
00910 
00911 // Returns the inclusive right-most position adjust for clipped bases.
00912 int32_t SamRecord::get0BasedUnclippedEnd()
00913 {
00914     // myUnclippedEndOffset will be set by get0BasedAlignmentEnd if the 
00915     // cigar has not yet been parsed, so no need to check it here.
00916     return(get0BasedAlignmentEnd() + myUnclippedEndOffset);
00917 }
00918 
00919 
00920 // Returns the inclusive right-most position adjust for clipped bases.
00921 int32_t SamRecord::get1BasedUnclippedEnd()
00922 {
00923     return(get0BasedUnclippedEnd() + 1);
00924 }
00925 
00926 
00927 // Get the read name.
00928 const char* SamRecord::getReadName()
00929 {
00930     myStatus = SamStatus::SUCCESS;
00931     if(myReadName.Length() == 0)
00932     {
00933         // 0 Length, means that it is in the buffer, but has not yet
00934         // been synced to the string, so do the sync.
00935         myReadName = (char*)&(myRecordPtr->myData);
00936     }
00937     return myReadName.c_str();
00938 }
00939 
00940 
00941 const char* SamRecord::getCigar()
00942 {
00943     myStatus = SamStatus::SUCCESS;
00944     if(myCigar.Length() == 0)
00945     {
00946         // 0 Length, means that it is in the buffer, but has not yet
00947         // been synced to the string, so do the sync.
00948         parseCigarBinary();
00949     }
00950     return myCigar.c_str();
00951 }
00952 
00953 
00954 const char* SamRecord::getSequence()
00955 {
00956     return(getSequence(mySequenceTranslation));
00957 }
00958 
00959 
00960 const char* SamRecord::getSequence(SequenceTranslation translation)
00961 {
00962     myStatus = SamStatus::SUCCESS;
00963     if(mySequence.Length() == 0)
00964     {
00965         // 0 Length, means that it is in the buffer, but has not yet
00966         // been synced to the string, so do the sync.
00967         setSequenceAndQualityFromBuffer();
00968     }
00969 
00970     // Determine if translation needs to be done.
00971     if((translation == NONE) || (myRefPtr == NULL))
00972     {
00973         return mySequence.c_str();
00974     }
00975     else if(translation == EQUAL)
00976     {
00977         if(mySeqWithEq.length() == 0)
00978         {
00979             // Check to see if the sequence is defined.
00980             if(mySequence == "*")
00981             {
00982                 // Sequence is undefined, so no translation necessary.
00983                 mySeqWithEq = '*';
00984             }
00985             else
00986             {
00987                 // Sequence defined, so translate it.
00988                 SamQuerySeqWithRef::seqWithEquals(mySequence.c_str(), 
00989                                                   myRecordPtr->myPosition,
00990                                                   *(getCigarInfo()),
00991                                                   getReferenceName(),
00992                                                   *myRefPtr,
00993                                                   mySeqWithEq);
00994             }
00995         }
00996         return(mySeqWithEq.c_str());
00997     }
00998     else
00999     {
01000         // translation == BASES
01001         if(mySeqWithoutEq.length() == 0)
01002         {
01003             if(mySequence == "*")
01004             {
01005                 // Sequence is undefined, so no translation necessary.
01006                 mySeqWithoutEq = '*';
01007             }
01008             else
01009             {
01010                 // Sequence defined, so translate it.
01011                 SamQuerySeqWithRef::seqWithoutEquals(mySequence.c_str(), 
01012                                                      myRecordPtr->myPosition,
01013                                                      *(getCigarInfo()),
01014                                                      getReferenceName(),
01015                                                      *myRefPtr,
01016                                                      mySeqWithoutEq);
01017             }
01018         }
01019         return(mySeqWithoutEq.c_str());
01020     }
01021 }
01022 
01023 
01024 const char* SamRecord::getQuality() 
01025 {
01026     myStatus = SamStatus::SUCCESS;
01027     if(myQuality.Length() == 0)
01028     {
01029         // 0 Length, means that it is in the buffer, but has not yet
01030         // been synced to the string, so do the sync.
01031         setSequenceAndQualityFromBuffer();      
01032     }
01033     return myQuality.c_str();
01034 }
01035 
01036 
01037 char SamRecord::getSequence(int index)
01038 {
01039     return(getSequence(index, mySequenceTranslation));
01040 }
01041 
01042 
01043 char SamRecord::getSequence(int index, SequenceTranslation translation)
01044 {
01045     static const char * asciiBases = "=AC.G...T......N";
01046 
01047     // Determine the read length.
01048     int32_t readLen = getReadLength();
01049 
01050     // If the read length is 0, this method should not be called.
01051     if(readLen == 0)
01052     {
01053         String exceptionString = "SamRecord::getSequence(";
01054         exceptionString += index;
01055         exceptionString += ") is not allowed since sequence = '*'";
01056         throw std::runtime_error(exceptionString.c_str());
01057     }
01058     else if((index < 0) || (index >= readLen))
01059     {
01060         // Only get here if the index was out of range, so thow an exception.
01061         String exceptionString = "SamRecord::getSequence(";
01062         exceptionString += index;
01063         exceptionString += ") is out of range. Index must be between 0 and ";
01064         exceptionString += (readLen - 1);
01065         throw std::runtime_error(exceptionString.c_str());
01066     }
01067 
01068     // Determine if translation needs to be done.
01069     if((translation == NONE) || (myRefPtr == NULL))
01070     {
01071         // No translation needs to be done.
01072         if(mySequence.Length() == 0)
01073         {
01074             // Parse BAM sequence.
01075             // TODO - maybe store this pointer - and use that to track when
01076             // valid?
01077             unsigned char * packedSequence = 
01078                 (unsigned char *)myRecordPtr->myData + 
01079                 myRecordPtr->myReadNameLength +
01080                 myRecordPtr->myCigarLength * sizeof(int);
01081             
01082             return(index & 1 ?
01083                    asciiBases[packedSequence[index / 2] & 0xF] :
01084                    asciiBases[packedSequence[index / 2] >> 4]);
01085         }
01086         // Already have string.
01087         return(mySequence[index]);
01088     }
01089     else
01090     {
01091         // Need to translate the sequence either to have '=' or to not
01092         // have it.
01093         // First check to see if the sequence has been set.
01094         if(mySequence.Length() == 0)
01095         {
01096             // 0 Length, means that it is in the buffer, but has not yet
01097             // been synced to the string, so do the sync.
01098             setSequenceAndQualityFromBuffer();
01099         }
01100 
01101         // Check the type of translation.
01102         if(translation == EQUAL)
01103         {
01104             // Check whether or not the string has already been 
01105             // retrieved that has the '=' in it.
01106             if(mySeqWithEq.length() == 0)
01107             {
01108                 // The string with '=' has not yet been determined,
01109                 // so get the string.
01110                 // Check to see if the sequence is defined.
01111                 if(mySequence == "*")
01112                 {
01113                     // Sequence is undefined, so no translation necessary.
01114                     mySeqWithEq = '*';
01115                 }
01116                 else
01117                 {
01118                     // Sequence defined, so translate it.
01119                     SamQuerySeqWithRef::seqWithEquals(mySequence.c_str(), 
01120                                                       myRecordPtr->myPosition, 
01121                                                       *(getCigarInfo()),
01122                                                       getReferenceName(),
01123                                                       *myRefPtr,
01124                                                       mySeqWithEq);
01125                 }
01126             }
01127             // Sequence is set, so return it.
01128             return(mySeqWithEq[index]);
01129         }
01130         else
01131         {
01132             // translation == BASES
01133             // Check whether or not the string has already been 
01134             // retrieved that does not have the '=' in it.
01135             if(mySeqWithoutEq.length() == 0)
01136             {
01137                 // The string with '=' has not yet been determined,
01138                 // so get the string.
01139                 // Check to see if the sequence is defined.
01140                 if(mySequence == "*")
01141                 {
01142                     // Sequence is undefined, so no translation necessary.
01143                     mySeqWithoutEq = '*';
01144                 }
01145                 else
01146                 {
01147                     // Sequence defined, so translate it.
01148                     // The string without '=' has not yet been determined,
01149                     // so get the string.
01150                     SamQuerySeqWithRef::seqWithoutEquals(mySequence.c_str(), 
01151                                                          myRecordPtr->myPosition, 
01152                                                          *(getCigarInfo()),
01153                                                          getReferenceName(),
01154                                                          *myRefPtr,
01155                                                          mySeqWithoutEq);
01156                 }
01157             }
01158             // Sequence is set, so return it.
01159             return(mySeqWithoutEq[index]);
01160         }
01161     }
01162 }
01163 
01164 
01165 char SamRecord::getQuality(int index)
01166 {
01167     // Determine the read length.
01168     int32_t readLen = getReadLength();
01169 
01170     // If the read length is 0, return ' ' whose ascii code is below
01171     // the minimum ascii code for qualities.
01172     if(readLen == 0)
01173     {
01174         return(BaseUtilities::UNKNOWN_QUALITY_CHAR);
01175     }
01176     else if((index < 0) || (index >= readLen))
01177     {
01178         // Only get here if the index was out of range, so thow an exception.
01179         String exceptionString = "SamRecord::getQuality(";
01180         exceptionString += index;
01181         exceptionString += ") is out of range. Index must be between 0 and ";
01182         exceptionString += (readLen - 1);
01183         throw std::runtime_error(exceptionString.c_str());
01184     }
01185 
01186     if(myQuality.Length() == 0)
01187     {
01188         // Parse BAM Quality.
01189         unsigned char * packedQuality = 
01190             (unsigned char *)myRecordPtr->myData +
01191             myRecordPtr->myReadNameLength + 
01192             myRecordPtr->myCigarLength * sizeof(int) + 
01193             (myRecordPtr->myReadLength + 1) / 2;
01194         return(packedQuality[index] + 33);
01195     }
01196     else
01197     {
01198         // Already have string.
01199         if((myQuality.Length() == 1) && (myQuality[0] == '*'))
01200         {
01201             // Return 0xFF like it does for BAM.
01202             return(0xFF);
01203         }
01204         else
01205         {
01206             return(myQuality[index]);
01207         }
01208     }
01209 }
01210 
01211    
01212 Cigar* SamRecord::getCigarInfo()
01213 {
01214     // Check to see whether or not the Cigar has already been
01215     // set - this is determined by checking if alignment length
01216     // is set since alignment length and the cigar are set
01217     // at the same time.
01218     if(myAlignmentLength == -1)
01219     {
01220         // Not been set, so calculate it.
01221         parseCigar();
01222     }
01223     return(&myCigarRoller);
01224 }
01225 
01226 
01227 uint32_t SamRecord::getTagLength()
01228 {
01229     myStatus = SamStatus::SUCCESS;
01230     if(myNeedToSetTagsFromBuffer)
01231     {
01232         // Tags are only set in the buffer, so the size of the tags is 
01233         // the length of the record minus the starting location of the tags.
01234         unsigned char * tagStart = 
01235             (unsigned char *)myRecordPtr->myData 
01236             + myRecordPtr->myReadNameLength 
01237             + myRecordPtr->myCigarLength * sizeof(int)
01238             + (myRecordPtr->myReadLength + 1) / 2 + myRecordPtr->myReadLength;
01239       
01240         // The non-tags take up from the start of the record to the tag start.
01241         // Do not include the block size part of the record since it is not
01242         // included in the size.
01243         uint32_t nonTagSize = 
01244             tagStart - (unsigned char*)&(myRecordPtr->myReferenceID);
01245         // Tags take up the size of the block minus the non-tag section.
01246         uint32_t tagSize = myRecordPtr->myBlockSize - nonTagSize;
01247         return(tagSize);
01248     }
01249 
01250     // Tags are stored outside the buffer, so myTagBufferSize is set.
01251     return(myTagBufferSize);
01252 }
01253 
01254 
01255 // Returns true if there is another tag and sets tag and vtype to the
01256 // appropriate values, and returns a pointer to the value.
01257 // Sets the Status to SUCCESS when a tag is successfully returned or
01258 // when there are no more tags.  Otherwise the status is set to describe
01259 // why it failed (parsing, etc).
01260 bool SamRecord::getNextSamTag(char* tag, char& vtype, void** value)
01261 {
01262     myStatus = SamStatus::SUCCESS;
01263     if(myNeedToSetTagsFromBuffer)
01264     {
01265         if(!setTagsFromBuffer())
01266         {
01267             // Failed to read the tags from the buffer, so cannot
01268             // get tags.
01269             return(false);
01270         }
01271     }
01272 
01273     // Increment the tag index to start looking at the next tag.
01274     // At the beginning, it is set to -1.
01275     myLastTagIndex++;
01276     int maxTagIndex = extras.Capacity();
01277     if(myLastTagIndex >= maxTagIndex)
01278     {
01279         // Hit the end of the tags, return false, no more tags.
01280         // Status is still success since this is not an error, 
01281         // it is just the end of the list.
01282         return(false);
01283     }
01284 
01285     bool tagFound = false;
01286     // Loop until a tag is found or the end of extras is hit.
01287     while((tagFound == false) && (myLastTagIndex < maxTagIndex))
01288     {
01289         if(extras.SlotInUse(myLastTagIndex))
01290         {
01291             // Found a slot to use.
01292             int key = extras.GetKey(myLastTagIndex);
01293             getTag(key, tag);
01294             getVtype(key, vtype);
01295             tagFound = true;
01296             // Get the value associated with the key based on the vtype.
01297             switch (vtype)
01298             {
01299                 case 'A' :
01300                     *value = getIntegerPtr(myLastTagIndex);
01301                     break;
01302                 case 'f' :
01303                     *value = getDoublePtr(myLastTagIndex);
01304                     break;
01305                 case 'c' :
01306                 case 'C' :
01307                 case 's' :
01308                 case 'S' :
01309                 case 'i' :
01310                 case 'I' :
01311                     vtype = 'i';
01312                     *value = getIntegerPtr(myLastTagIndex);
01313                     break;
01314                 case 'Z' :
01315                     *value = getStringPtr(myLastTagIndex);
01316                     break;
01317                 default:
01318                     myStatus.setStatus(SamStatus::FAIL_PARSE,
01319                                        "Unknown tag type");
01320                     tagFound = false;
01321                     break;
01322             }
01323         }
01324         if(!tagFound)
01325         {
01326             // Increment the index since a tag was not found.
01327             myLastTagIndex++;
01328         }
01329     }
01330     return(tagFound);
01331 }
01332 
01333 
01334 // Returns the values of all fields except the tags.
01335 bool SamRecord::getFields(bamRecordStruct& recStruct, String& readName, 
01336                           String& cigar, String& sequence, String& quality)
01337 {
01338     return(getFields(recStruct, readName, cigar, sequence, quality,
01339                      mySequenceTranslation));
01340 }
01341 
01342 
01343 // Returns the values of all fields except the tags.
01344 bool SamRecord::getFields(bamRecordStruct& recStruct, String& readName, 
01345                           String& cigar, String& sequence, String& quality,
01346                           SequenceTranslation translation)
01347 {
01348     myStatus = SamStatus::SUCCESS;
01349     if(myIsBufferSynced == false)
01350     {
01351         if(!fixBuffer(translation))
01352         {
01353             // failed to set the buffer, return false.
01354             return(false);
01355         }
01356     }
01357     memcpy(&recStruct, myRecordPtr, sizeof(bamRecordStruct));
01358 
01359     readName = getReadName();
01360     // Check the status.
01361     if(myStatus != SamStatus::SUCCESS)
01362     {
01363         // Failed to set the fields, return false.
01364         return(false);
01365     }
01366     cigar = getCigar();
01367     // Check the status.
01368     if(myStatus != SamStatus::SUCCESS)
01369     {
01370         // Failed to set the fields, return false.
01371         return(false);
01372     }
01373     sequence = getSequence(translation);
01374     // Check the status.
01375     if(myStatus != SamStatus::SUCCESS)
01376     {
01377         // Failed to set the fields, return false.
01378         return(false);
01379     }
01380     quality = getQuality();
01381     // Check the status.
01382     if(myStatus != SamStatus::SUCCESS)
01383     {
01384         // Failed to set the fields, return false.
01385         return(false);
01386     }
01387     return(true);
01388 }
01389 
01390 
01391 bool SamRecord::isIntegerType(char vtype) const
01392 {
01393     if((vtype == 'c') || (vtype == 'C') ||
01394        (vtype == 's') || (vtype == 'S') ||
01395        (vtype == 'i') || (vtype == 'I'))
01396     {
01397         return(true);
01398     }
01399     return(false);
01400 }
01401 
01402 
01403 bool SamRecord::isDoubleType(char vtype) const
01404 {
01405     if(vtype == 'f')
01406     {
01407         return(true);
01408     }
01409     return(false);
01410 }
01411 
01412 
01413 bool SamRecord::isCharType(char vtype) const
01414 {
01415     if(vtype == 'A')
01416     {
01417         return(true);
01418     }
01419     return(false);
01420 }
01421 
01422 
01423 bool SamRecord::isStringType(char vtype) const
01424 {
01425     if(vtype == 'Z')
01426     {
01427         return(true);
01428     }
01429     return(false);
01430 }
01431 
01432 
01433 void SamRecord::clearTags()
01434 {
01435     if(extras.Entries() != 0)
01436     {
01437         extras.Clear();
01438     }
01439     strings.Clear();
01440     integers.Clear();
01441     doubles.Clear();
01442     myTagBufferSize = 0;
01443     resetTagIter();
01444 }
01445 
01446 
01447 bool SamRecord::rmTag(const char* tag, char type)
01448 {
01449     // Check the length of tag.
01450     if(strlen(tag) != 2)
01451     {
01452         // Tag is the wrong length.
01453         myStatus.setStatus(SamStatus::INVALID, 
01454                            "rmTag called with tag that is not 2 characters\n");
01455         return(false);
01456     }
01457 
01458     myStatus = SamStatus::SUCCESS;
01459     if(myNeedToSetTagsFromBuffer)
01460     {
01461         if(!setTagsFromBuffer())
01462         {
01463             // Failed to read the tags from the buffer, so cannot
01464             // get tags.
01465             return(false);
01466         }
01467     }
01468 
01469     // Construct the key.
01470     int key = MAKEKEY(tag[0], tag[1], type);
01471     // Look to see if the key exsists in the hash.
01472     int offset = extras.Find(key);
01473 
01474     if(offset < 0)
01475     {
01476         // Not found, so return true, successfully removed since
01477         // it is not in tag.
01478         return(true);
01479     }
01480 
01481     int rmBuffSize = 0;
01482     
01483     // Offset is set, so recalculate the buffer size without this entry.
01484     // Do NOT remove from strings, integers, or doubles because then
01485     // extras would need to be updated for all entries with the new indexes
01486     // into those variables.
01487     switch(type)
01488     {
01489         case 'A':
01490         case 'c':
01491         case 'C':
01492             rmBuffSize = 4;
01493             break;
01494         case 's':
01495         case 'S':
01496             rmBuffSize = 5;
01497             break;
01498         case 'i':
01499         case 'I':
01500             rmBuffSize = 7;
01501             break;
01502         case 'f':
01503             rmBuffSize = 7;
01504             break;
01505         case 'Z':
01506             rmBuffSize = 4 + getString(offset).Length();
01507             break;
01508         default:
01509             myStatus.setStatus(SamStatus::INVALID, 
01510                                "rmTag called with unknown type.\n");
01511             return(false);
01512             break;
01513     };
01514 
01515     // The buffer tags are now out of sync.
01516     myNeedToSetTagsInBuffer = true;
01517     myIsTagsBufferValid = false;
01518     myIsBufferSynced = false;
01519     myTagBufferSize -= rmBuffSize;
01520 
01521     // Remove from the hash.
01522     extras.Delete(offset);
01523     return(true);
01524 }
01525 
01526 
01527 // Return the error after a failed SamRecord call.
01528 const SamStatus& SamRecord::getStatus()
01529 {
01530     return(myStatus);
01531 }
01532 
01533 
01534 String & SamRecord::getString(const char * tag)
01535 {
01536     // Init to success.
01537     myStatus = SamStatus::SUCCESS;
01538     // Parse the buffer if necessary.
01539     if(myNeedToSetTagsFromBuffer)
01540     {
01541         if(!setTagsFromBuffer())
01542         {
01543             // Failed to read the tags from the buffer, so cannot
01544             // get tags.
01545             // TODO - what do we want to do on failure?
01546         }
01547     }
01548     
01549     int key = MAKEKEY(tag[0], tag[1], 'Z');
01550     int offset = extras.Find(key);
01551 
01552     int value;
01553     if (offset < 0)
01554     {
01555         // TODO - what do we want to do on failure?
01556         return(NOT_FOUND_TAG_STRING);
01557     }
01558     else
01559         value = extras[offset];
01560 
01561     return strings[value];
01562 }
01563 
01564 int & SamRecord::getInteger(const char * tag)
01565 {
01566     // Init to success.
01567     myStatus = SamStatus::SUCCESS;
01568     // Parse the buffer if necessary.
01569     if(myNeedToSetTagsFromBuffer)
01570     {
01571         if(!setTagsFromBuffer())
01572         {
01573             // Failed to read the tags from the buffer, so cannot
01574             // get tags.
01575             // TODO - what do we want to do on failure?
01576         }
01577     }
01578     
01579     int key = MAKEKEY(tag[0], tag[1], 'i');
01580     int offset = extras.Find(key);
01581 
01582     int value;
01583     if (offset < 0)
01584     {
01585         // TODO - what do we want to do on failure?
01586         return NOT_FOUND_TAG_INT;
01587     }
01588     else
01589         value = extras[offset];
01590 
01591     return integers[value];
01592 }
01593 
01594 double & SamRecord::getDouble(const char * tag)
01595 {
01596     // Init to success.
01597     myStatus = SamStatus::SUCCESS;
01598     // Parse the buffer if necessary.
01599     if(myNeedToSetTagsFromBuffer)
01600     {
01601         if(!setTagsFromBuffer())
01602         {
01603             // Failed to read the tags from the buffer, so cannot
01604             // get tags.
01605             // TODO - what do we want to do on failure?
01606         }
01607     }
01608     
01609     int key = MAKEKEY(tag[0], tag[1], 'f');
01610     int offset = extras.Find(key);
01611 
01612     int value;
01613     if (offset < 0)
01614     {
01615         // TODO - what do we want to do on failure?
01616         return NOT_FOUND_TAG_DOUBLE;
01617     }
01618     else
01619         value = extras[offset];
01620 
01621     return doubles[value];
01622 }
01623 
01624 
01625 bool SamRecord::checkTag(const char * tag, char type)
01626 {
01627     // Init to success.
01628     myStatus = SamStatus::SUCCESS;
01629     // Parse the buffer if necessary.
01630     if(myNeedToSetTagsFromBuffer)
01631     {
01632         if(!setTagsFromBuffer())
01633         {
01634             // Failed to read the tags from the buffer, so cannot
01635             // get tags.
01636             return("");
01637         }
01638     }
01639     
01640     int key = MAKEKEY(tag[0], tag[1], type);
01641 
01642     return (extras.Find(key) != LH_NOTFOUND);
01643 }
01644 
01645 
01646 // Return the number of bases in this read that overlap the passed in
01647 // region.  (start & end are 0-based)
01648 uint32_t SamRecord::getNumOverlaps(int32_t start, int32_t end)
01649 {
01650     // Determine whether or not the cigar has been parsed, which sets up
01651     // the cigar roller.  This is determined by checking the alignment length.
01652     if(myAlignmentLength == -1)
01653     {
01654         parseCigar();
01655     }
01656     return(myCigarRoller.getNumOverlaps(start, end, get0BasedPosition()));
01657 }
01658 
01659 
01660 // // Get the Extra field: <tag>:<vtype>:<value> for the key.
01661 // void SamRecord::getSamExtraFieldFromKey(int key, String& extraField) 
01662 // {
01663 //     myStatus = SamStatus::SUCCESS;
01664 //     // Extract the tag from the key.
01665 //     char tag[3];
01666 //     getTag(key, tag);
01667 
01668 //     char vtype;
01669 //     getVtype(key, vtype);
01670 
01671 //     // Look up the key to get the offset into the appropriate array.
01672 //     int offset = extras.Find(key);
01673 
01674 //     // Get the value associated with the key based on the vtype.
01675 //     switch (vtype)
01676 //     {
01677 //         case 'A' :
01678 //             extraField = tag;
01679 //             extraField += ":A:";
01680 //             extraField += (char)(getInteger(offset));
01681 //             break;
01682 //         case 'f' :
01683 //             extraField = tag;
01684 //             extraField += ":f:";
01685 //             extraField += getDouble(offset);
01686 //             break;
01687 //         case 'c' :
01688 //         case 'C' :
01689 //         case 's' :
01690 //         case 'S' :
01691 //         case 'i' :
01692 //         case 'I' :
01693 //             extraField = tag;
01694 //             extraField += ":i:";
01695 //             extraField += getInteger(offset);
01696 //             break;
01697 //         case 'Z' :
01698 //             //         String valueString = getString(offset).c_str();
01699 //             extraField = tag;
01700 //             extraField += ":Z:";
01701 //             extraField += getString(offset);
01702 //             break;
01703 //         default:
01704 //             myStatus.setStatus(SamStatus::FAIL_PARSE,
01705 //                                "Unknown tag type.");
01706 //             break;
01707 //     }
01708 // }
01709 
01710 
01711 // Allocate space for the record - does a realloc.  
01712 // The passed in size is the size of the entire record including the
01713 // block size field.
01714 bool SamRecord::allocateRecordStructure(int size)
01715 {
01716     if (allocatedSize < size)
01717     {
01718         bamRecordStruct* tmpRecordPtr = 
01719             (bamRecordStruct *)realloc(myRecordPtr, size);
01720         if(tmpRecordPtr == NULL)
01721         {
01722             // FAILED to allocate memory
01723             fprintf(stderr, "FAILED TO ALLOCATE MEMORY!!!");
01724             myStatus.addError(SamStatus::FAIL_MEM, "Failed Memory Allocation.");
01725             return(false);
01726         }
01727         // Successfully allocated memory, so set myRecordPtr.
01728         myRecordPtr = tmpRecordPtr;
01729         allocatedSize = size;
01730     }
01731     return(true);
01732 }
01733 
01734 
01735 // Index is the index into the strings array.
01736 void* SamRecord::getStringPtr(int index)
01737 {
01738     int value = extras[index];
01739 
01740     return &(strings[value]);
01741 }
01742 
01743 void* SamRecord::getIntegerPtr(int offset)
01744 {
01745     int value = extras[offset];
01746 
01747     return &(integers[value]);
01748 }
01749 
01750 void* SamRecord::getDoublePtr(int offset)
01751 {
01752     int value = extras[offset];
01753 
01754     return &(doubles[value]);
01755 }
01756 
01757 
01758 // Fixes the buffer to match the variable length fields if they are set.
01759 bool SamRecord::fixBuffer(SequenceTranslation translation)
01760 {
01761     // Check to see if the buffer is already synced.
01762     if(myIsBufferSynced &&
01763        (myBufferSequenceTranslation == translation))
01764     {
01765         // Already synced, nothing to do.
01766         return(true);
01767     }
01768    
01769     // Set the bin if necessary.
01770     if(!myIsBinValid)
01771     {
01772         // The bin that is set in the record is not valid, so
01773         // reset it.
01774         myRecordPtr->myBin = 
01775             bam_reg2bin(myRecordPtr->myPosition, get1BasedAlignmentEnd());      
01776         myIsBinValid = true;
01777     }
01778 
01779     // Not synced.
01780     bool status = true;
01781 
01782     // First determine the size the buffer needs to be.
01783     uint8_t newReadNameLen = getReadNameLength();
01784     uint16_t newCigarLen = getCigarLength();
01785     int32_t newReadLen = getReadLength();
01786     uint32_t newTagLen = getTagLength();
01787     uint32_t bamSequenceLen = (newReadLen+1)/2;
01788 
01789     // The buffer size extends from the start of the record to data
01790     // plus the length of the variable fields,
01791     // Multiply the cigar length by 4 since it is the number of uint32_t fields.
01792     int newBufferSize = 
01793         ((unsigned char*)(&(myRecordPtr->myData)) - 
01794          (unsigned char*)myRecordPtr) +
01795         newReadNameLen + ((newCigarLen)*4) +
01796         newReadLen + bamSequenceLen + newTagLen;
01797    
01798     if(!allocateRecordStructure(newBufferSize))
01799     {
01800         // Failed to allocate space.
01801         return(false);
01802     }
01803 
01804     // Now that space has been added to the buffer, check to see what if
01805     // any fields need to be extracted from the buffer prior to starting to
01806     // overwrite it.  Fields need to be extracted from the buffer if the 
01807     // buffer is valid for the field and a previous variable length field has
01808     // changed length.
01809     bool readNameLenChange = (newReadNameLen != myRecordPtr->myReadNameLength);
01810     bool cigarLenChange = (newCigarLen != myRecordPtr->myCigarLength);
01811     bool readLenChange = (newReadLen != myRecordPtr->myReadLength);
01812 
01813     // If the tags are still stored in the buffer and any other fields changed
01814     // lengths, they need to be extracted.
01815     if(myIsTagsBufferValid &&
01816        (readNameLenChange | cigarLenChange | readLenChange))
01817     {
01818         status &= setTagsFromBuffer();
01819         // The tag buffer will not be valid once the other fields
01820         // are written, so set it to not valid.
01821         myIsTagsBufferValid = false;
01822     }
01823 
01824     // If the sequence or quality strings are still stored in the buffer, and
01825     // any of the previous fields have changed length, extract it from the
01826     // current buffer.
01827     if((myIsQualityBufferValid | myIsSequenceBufferValid) && 
01828        (readNameLenChange | cigarLenChange | readLenChange))
01829     {
01830         setSequenceAndQualityFromBuffer();
01831         // The quality and sequence buffers will not be valid once the other
01832         // fields are written, so set them to not valid.
01833         myIsQualityBufferValid = false;
01834         myIsSequenceBufferValid = false;
01835     }
01836 
01837     // If the cigar is still stored in the buffer, and any of the
01838     // previous fields have changed length, extract it from the current buffer.
01839     if((myIsCigarBufferValid) && 
01840        (readNameLenChange))
01841     {
01842         status &= parseCigarBinary();
01843         myIsCigarBufferValid = false;
01844     }
01845 
01846     // Set each value in the buffer if it is not already valid.
01847     if(!myIsReadNameBufferValid)
01848     {
01849         memcpy(&(myRecordPtr->myData), myReadName.c_str(), 
01850                newReadNameLen);
01851    
01852         // Set the new ReadNameLength.
01853         myRecordPtr->myReadNameLength = newReadNameLen;
01854         myIsReadNameBufferValid = true;
01855     }
01856 
01857     unsigned char * readNameEnds = (unsigned char*)(&(myRecordPtr->myData)) +
01858         myRecordPtr->myReadNameLength;
01859    
01860     // Set the Cigar.  Need to reformat from the string to 
01861     unsigned int * packedCigar = (unsigned int *) (void *) readNameEnds;
01862       
01863     if(!myIsCigarBufferValid)
01864     {
01865         // The cigar was already parsed when it was set, so just copy
01866         // data from the temporary buffer.
01867         myRecordPtr->myCigarLength = newCigarLen;
01868         memcpy(packedCigar, myCigarTempBuffer, 
01869                myRecordPtr->myCigarLength * sizeof(uint32_t));
01870       
01871         myIsCigarBufferValid = true;
01872     }
01873 
01874     unsigned char * packedSequence = readNameEnds + 
01875         myRecordPtr->myCigarLength * sizeof(int);
01876     unsigned char * packedQuality = packedSequence + bamSequenceLen;
01877    
01878     if(!myIsSequenceBufferValid || !myIsQualityBufferValid || 
01879        (myBufferSequenceTranslation != translation))
01880     {
01881         myRecordPtr->myReadLength = newReadLen;
01882         // Determine if the quality needs to be set and is just a * and needs to
01883         // be set to 0xFF.
01884         bool noQuality = false;
01885         if((myQuality.Length() == 1) && (myQuality[0] == '*'))
01886         {
01887             noQuality = true;
01888         }
01889       
01890         const char* translatedSeq = NULL;
01891         // If the sequence is not valid in the buffer or it is not
01892         // properly translated, get the properly translated sequence
01893         // that needs to be put into the buffer.
01894         if((!myIsSequenceBufferValid) ||
01895            (translation != myBufferSequenceTranslation))
01896         {
01897             translatedSeq = getSequence(translation);
01898         }
01899 
01900         for (int i = 0; i < myRecordPtr->myReadLength; i++) 
01901         {
01902             if((!myIsSequenceBufferValid) ||
01903                (translation != myBufferSequenceTranslation))
01904             {
01905                 // Sequence buffer is not valid, so set the sequence.
01906                 int seqVal = 0;
01907                 switch(translatedSeq[i])
01908                 {
01909                     case '=':
01910                         seqVal = 0;
01911                         break;
01912                     case 'A':
01913                     case 'a':
01914                         seqVal = 1;
01915                         break;
01916                     case 'C':
01917                     case 'c':
01918                         seqVal = 2;
01919                         break;
01920                     case 'G':
01921                     case 'g':
01922                         seqVal = 4;
01923                         break;
01924                     case 'T':
01925                     case 't':
01926                         seqVal = 8;
01927                         break;
01928                     case 'N':
01929                     case 'n':
01930                     case '.':
01931                         seqVal = 15;
01932                         break;
01933                     default:
01934                         myStatus.addError(SamStatus::FAIL_PARSE,
01935                                           "Unknown Sequence character found.");
01936                         status = false;
01937                         break;
01938                 };
01939             
01940                 if(i & 1)
01941                 {
01942                     // Odd number i's go in the lower 4 bits, so OR in the
01943                     // lower bits
01944                     packedSequence[i/2] |= seqVal;
01945                 }
01946                 else
01947                 {
01948                     // Even i's go in the upper 4 bits and are always set first.
01949                     packedSequence[i/2] = seqVal << 4;
01950                 }
01951             }
01952 
01953             if(!myIsQualityBufferValid)
01954             {
01955                 // Set the quality.
01956                 if((noQuality) || (myQuality.Length() <= i))
01957                 {
01958                     // No quality or the quality is smaller than the sequence,
01959                     // so set it to 0xFF
01960                     packedQuality[i] = 0xFF;
01961                 }
01962                 else
01963                 {
01964                     // Copy the quality string.
01965                     packedQuality[i] = myQuality[i] - 33;
01966                 }
01967             }
01968         }
01969         myIsQualityBufferValid = true;
01970         myIsSequenceBufferValid = true;
01971         myBufferSequenceTranslation = translation;
01972     }
01973 
01974     if(!myIsTagsBufferValid)
01975     {
01976         status &= setTagsInBuffer();
01977     }
01978 
01979     // Set the lengths in the buffer.
01980     myRecordPtr->myReadNameLength = newReadNameLen;
01981     myRecordPtr->myCigarLength = newCigarLen;
01982     myRecordPtr->myReadLength = newReadLen;
01983 
01984     // Set the buffer block size to the size of the buffer minus the
01985     // first field.
01986     myRecordPtr->myBlockSize = newBufferSize - sizeof(int32_t);
01987 
01988     if(status)
01989     {
01990         myIsBufferSynced = true;
01991     }
01992 
01993     return(status);
01994 }
01995 
01996 
01997 // Sets the Sequence and Quality strings from the buffer.
01998 // They are done together in one method because they require the same
01999 // loop, so might as well be done at the same time.
02000 void SamRecord::setSequenceAndQualityFromBuffer()
02001 {
02002     // NOTE: If the sequence buffer is not valid, do not set the sequence
02003     // string from the buffer.
02004     // NOTE: If the quality buffer is not valid, do not set the quality string
02005     // from the buffer.
02006 
02007     // Extract the sequence if the buffer is valid and the string's length is 0.
02008     bool extractSequence = false;
02009     if(myIsSequenceBufferValid && (mySequence.Length() == 0))
02010     {
02011         extractSequence = true;
02012     }
02013 
02014     // Extract the quality if the buffer is valid and the string's length is 0.
02015     bool extractQuality = false;
02016     if(myIsQualityBufferValid && (myQuality.Length() == 0))
02017     {
02018         extractQuality = true;
02019     }
02020 
02021     // If neither the quality nor the sequence need to be extracted,
02022     // just return.
02023     if(!extractSequence && !extractQuality)
02024     {
02025         return;
02026     }
02027 
02028     // Set the sequence and quality strings..
02029     if(extractSequence)
02030     {
02031         mySequence.SetLength(myRecordPtr->myReadLength);
02032     }
02033     if(extractQuality)
02034     {
02035         myQuality.SetLength(myRecordPtr->myReadLength);
02036     }
02037    
02038     unsigned char * readNameEnds = 
02039         (unsigned char *)myRecordPtr->myData + myRecordPtr->myReadNameLength;
02040     unsigned char * packedSequence = 
02041         readNameEnds + myRecordPtr->myCigarLength * sizeof(int);
02042     unsigned char * packedQuality = 
02043         packedSequence + (myRecordPtr->myReadLength + 1) / 2;
02044 
02045     const char * asciiBases = "=AC.G...T......N";
02046 
02047     // Flag to see if the quality is specified - the quality contains a value
02048     // other than 0xFF.  If all values are 0xFF, then there is no quality.
02049     bool qualitySpecified = false;
02050 
02051     for (int i = 0; i < myRecordPtr->myReadLength; i++)
02052     {
02053         if(extractSequence)
02054         {
02055             mySequence[i] = i & 1 ?
02056                 asciiBases[packedSequence[i / 2] & 0xF] :
02057                 asciiBases[packedSequence[i / 2] >> 4];
02058         }
02059 
02060         if(extractQuality)
02061         {
02062             if(packedQuality[i] != 0xFF)
02063             {
02064                 // Quality is specified, so mark the flag.
02065                 qualitySpecified = true;
02066             }
02067 
02068             myQuality[i] = packedQuality[i] + 33;
02069         }
02070     }
02071 
02072     // If the read length is 0, then set the sequence and quality to '*'
02073     if(myRecordPtr->myReadLength == 0)
02074     {
02075         if(extractSequence)
02076         {
02077             mySequence = "*";
02078         }
02079         if(extractQuality)
02080         {
02081             myQuality = "*";
02082         }
02083     }
02084     else if(extractQuality && !qualitySpecified)
02085     {
02086         // No quality was specified, so set it to "*"
02087         myQuality = "*";
02088     }
02089 }
02090 
02091 
02092 // Parse the cigar to calculate the alignment/unclipped end.
02093 bool SamRecord::parseCigar()
02094 {
02095     // Determine if the cigar string or cigar binary needs to be parsed.
02096     if(myCigar.Length() == 0)
02097     {
02098         // The cigar string is not yet set, so parse the binary.
02099         return(parseCigarBinary());
02100     }
02101     return(parseCigarString());
02102 }
02103 
02104 // Parse the cigar to calculate the alignment/unclipped end.
02105 bool SamRecord::parseCigarBinary()
02106 {
02107     // Only need to parse if the string is not already set.
02108     // The length of the cigar string is set to zero when the 
02109     // record is read from a file into the buffer.
02110     if(myCigar.Length() != 0)
02111     {
02112         // Already parsed.
02113         return(true);
02114     }
02115 
02116     unsigned char * readNameEnds = 
02117         (unsigned char *)myRecordPtr->myData + myRecordPtr->myReadNameLength;
02118    
02119     unsigned int * packedCigar = (unsigned int *) (void *) readNameEnds;
02120 
02121     myCigarRoller.Set(packedCigar, myRecordPtr->myCigarLength);
02122     
02123     myCigarRoller.getCigarString(myCigar);
02124 
02125     myAlignmentLength = myCigarRoller.getExpectedReferenceBaseCount();
02126 
02127     myUnclippedStartOffset = myCigarRoller.getNumBeginClips();
02128     myUnclippedEndOffset = myCigarRoller.getNumEndClips();
02129 
02130     // if the cigar length is 0, then set the cigar string to "*"
02131     if(myRecordPtr->myCigarLength == 0)
02132     {
02133         myCigar = "*";
02134         return(true);
02135     }
02136 
02137     // Copy the cigar into a temporary buffer.
02138     int newBufferSize = myRecordPtr->myCigarLength * sizeof(uint32_t);
02139     if(newBufferSize > myCigarTempBufferAllocatedSize)
02140     {
02141         uint32_t* tempBufferPtr = 
02142             (uint32_t*)realloc(myCigarTempBuffer, newBufferSize);
02143         if(tempBufferPtr == NULL)
02144         {
02145             // Failed to allocate memory.
02146             // Do not parse, just return.
02147             fprintf(stderr, "FAILED TO ALLOCATE MEMORY!!!");
02148             myStatus.addError(SamStatus::FAIL_MEM,
02149                               "Failed to Allocate Memory.");
02150             return(false);
02151         }
02152         myCigarTempBuffer = tempBufferPtr;
02153         myCigarTempBufferAllocatedSize = newBufferSize;
02154     }
02155 
02156     memcpy(myCigarTempBuffer, packedCigar, 
02157            myRecordPtr->myCigarLength * sizeof(uint32_t));
02158 
02159     // Set the length of the temp buffer.
02160     myCigarTempBufferLength = myRecordPtr->myCigarLength;
02161 
02162     return(true);
02163 }
02164 
02165 // Parse the cigar string to calculate the cigar length and alignment end.
02166 bool SamRecord::parseCigarString()
02167 {
02168     myCigarTempBufferLength = 0;
02169     if(myCigar == "*")
02170     {
02171         // Cigar is empty, so initialize the variables.
02172         myAlignmentLength = 0;
02173         myUnclippedStartOffset = 0;
02174         myUnclippedEndOffset = 0;
02175         myCigarRoller.clear();
02176         return(true);
02177     }
02178 
02179     myCigarRoller.Set(myCigar);
02180     
02181     myAlignmentLength = myCigarRoller.getExpectedReferenceBaseCount();
02182     
02183     myUnclippedStartOffset = myCigarRoller.getNumBeginClips();
02184     myUnclippedEndOffset = myCigarRoller.getNumEndClips();
02185 
02186     // Check to see if the Temporary Cigar Buffer is large enough to contain
02187     // this cigar.  If we make it the size of the length of the cigar string,
02188     // it will be more than large enough.
02189     int newBufferSize = myCigar.Length() * sizeof(uint32_t);
02190     if(newBufferSize > myCigarTempBufferAllocatedSize)
02191     {
02192         uint32_t* tempBufferPtr = 
02193             (uint32_t*)realloc(myCigarTempBuffer, newBufferSize);
02194         if(tempBufferPtr == NULL)
02195         {
02196             // Failed to allocate memory.
02197             // Do not parse, just return.
02198             fprintf(stderr, "FAILED TO ALLOCATE MEMORY!!!");
02199             myStatus.addError(SamStatus::FAIL_MEM,
02200                               "Failed to Allocate Memory.");
02201             return(false);
02202         }
02203         myCigarTempBuffer = tempBufferPtr;
02204         myCigarTempBufferAllocatedSize = newBufferSize;
02205     }
02206 
02207     // Track if there were any errors.
02208     bool status = true;
02209 
02210     // Track the index into the cigar string that is being parsed.
02211     char *cigarOp;
02212     const char* cigarEntryStart = myCigar.c_str();
02213     int opLen = 0;
02214     int op = 0;
02215 
02216     unsigned int * packedCigar = myCigarTempBuffer;
02217     // TODO - maybe one day make a cigar list... or maybe make a 
02218     // reference cigar string for ease of lookup....
02219     const char* endCigarString = cigarEntryStart + myCigar.Length();
02220     while(cigarEntryStart < endCigarString)
02221     {
02222         bool validCigarEntry = true;
02223         // Get the opLen from the string.  cigarOp will then point to 
02224         // the operation.
02225         opLen = strtol(cigarEntryStart, &cigarOp, 10);
02226         // Switch on the type of operation.
02227         switch(*cigarOp)
02228         {
02229             case('M'):
02230                 op = 0;
02231                 break;
02232             case('I'):
02233                 // Insert into the reference position, so do not increment the
02234                 // reference end position.
02235                 op = 1;
02236                 break;
02237             case('D'):
02238                 op = 2;
02239                 break;
02240             case('N'):
02241                 op = 3;
02242                 break;
02243             case('S'):
02244                 op = 4;
02245                 break;
02246             case('H'):
02247                 op = 5;
02248                 break;
02249             case('P'):
02250                 op = 6;
02251                 break;
02252             default:
02253                 fprintf(stderr, "ERROR parsing cigar\n");
02254                 validCigarEntry = false;
02255                 status = false;
02256                 myStatus.addError(SamStatus::FAIL_PARSE,
02257                                   "Unknown operation found when parsing the Cigar.");
02258                 break;
02259         };
02260         if(validCigarEntry)
02261         {
02262             // Increment the cigar length.
02263             ++myCigarTempBufferLength;
02264             *packedCigar = (opLen << 4) | op;
02265             packedCigar++;
02266         }
02267         // The next Entry starts at one past the cigar op, so set the start.
02268         cigarEntryStart = ++cigarOp;
02269     }
02270 
02271     // Check clipLength to adjust the end position.
02272     return(status);
02273 }
02274 
02275 
02276 bool SamRecord::setTagsFromBuffer()
02277 {
02278     // If the tags do not need to be set from the buffer, return true.
02279     if(myNeedToSetTagsFromBuffer == false)
02280     {
02281         // Already been set from the buffer.
02282         return(true);
02283     }
02284 
02285     // Mark false, as they are being set now.
02286     myNeedToSetTagsFromBuffer = false;
02287 
02288     unsigned char * readNameEnds = 
02289         (unsigned char *)myRecordPtr->myData + myRecordPtr->myReadNameLength;
02290     unsigned char * packedSequence = 
02291         readNameEnds + myRecordPtr->myCigarLength * sizeof(int);
02292     unsigned char * packedQuality = 
02293         packedSequence + (myRecordPtr->myReadLength + 1) / 2;
02294 
02295     unsigned char * extraPtr = packedQuality + myRecordPtr->myReadLength;
02296 
02297     // Default to success, will be changed to false on failure.
02298     bool status = true;
02299 
02300     // Clear any previously set tags.
02301     clearTags();
02302     while (myRecordPtr->myBlockSize + 4 - 
02303            (extraPtr - (unsigned char *)myRecordPtr) > 0)
02304     {
02305         int key;
02306         int value;
02307         void * content = extraPtr + 3;
02308         int tagBufferSize = 0;
02309       
02310         switch (extraPtr[2])
02311         {
02312             case 'A' :
02313                 key = MAKEKEY(extraPtr[0], extraPtr[1], 'A');
02314                 value = integers.Length();
02315                 integers.Push(* (char *) content);
02316                 extraPtr += 4;
02317                 tagBufferSize += 4;
02318                 break;
02319             case 'c' :
02320                 key = MAKEKEY(extraPtr[0], extraPtr[1], 'c');
02321                 value = integers.Length();
02322                 integers.Push(* (char *) content);
02323                 extraPtr += 4;
02324                 tagBufferSize += 4;
02325                 break;
02326             case 'C' :
02327                 key = MAKEKEY(extraPtr[0], extraPtr[1], 'C');
02328                 value = integers.Length();
02329                 integers.Push(* (unsigned char *) content);
02330                 extraPtr += 4;
02331                 tagBufferSize += 4;
02332                 break;
02333             case 's' :
02334                 key = MAKEKEY(extraPtr[0], extraPtr[1], 's');
02335                 value = integers.Length();
02336                 integers.Push(* (short *) content);
02337                 extraPtr += 5;
02338                 tagBufferSize += 5;
02339                 break;
02340             case 'S' :
02341                 key = MAKEKEY(extraPtr[0], extraPtr[1], 'S');
02342                 value = integers.Length();
02343                 integers.Push(* (unsigned short *) content);
02344                 extraPtr += 5;
02345                 tagBufferSize += 5;
02346                 break;
02347             case 'i' :
02348                 key = MAKEKEY(extraPtr[0], extraPtr[1], 'i');
02349                 value = integers.Length();
02350                 integers.Push(* (int *) content);
02351                 extraPtr += 7;
02352                 tagBufferSize += 7;
02353                 break;
02354             case 'I' :
02355                 key = MAKEKEY(extraPtr[0], extraPtr[1], 'I');
02356                 value = integers.Length();
02357                 integers.Push((int) * (unsigned int *) content);
02358                 extraPtr += 7;
02359                 tagBufferSize += 7;
02360                 break;
02361             case 'Z' :
02362                 key = MAKEKEY(extraPtr[0], extraPtr[1], 'Z');
02363                 value = strings.Length();
02364                 strings.Push((const char *) content);
02365                 extraPtr += 4 + strings.Last().Length();
02366                 tagBufferSize += 4 + strings.Last().Length();
02367                 break;
02368             case 'f' :
02369                 key = MAKEKEY(extraPtr[0], extraPtr[1], 'f');
02370                 value = doubles.Length();
02371                 doubles.Push(* (float *) content);
02372                 fprintf(stderr, "\n\nFLOAT!!!\n\n");
02373                 extraPtr += 7;
02374                 tagBufferSize += 7;
02375                 break;
02376             default :
02377                 fprintf(stderr, 
02378                         "parsing BAM - Unknown custom field of type %c%c:%c\n",
02379                         extraPtr[0], extraPtr[1], extraPtr[2]);
02380                 // Failed on read.
02381                 // Increment extraPtr just by the size of the 3 known fields
02382                 extraPtr += 3;
02383                 myStatus.addError(SamStatus::FAIL_PARSE,
02384                                   "Unknown tag type.");
02385                 status = false;
02386         }
02387 
02388         // Only add the tag if it has so far been successfully processed.
02389         if(status)
02390         {
02391             extras.Add(key, value);
02392             myTagBufferSize += tagBufferSize;
02393         }
02394     }
02395     return(status);
02396 }
02397 
02398 
02399 bool SamRecord::setTagsInBuffer()
02400 {
02401     // The buffer size extends from the start of the record to data
02402     // plus the length of the variable fields,
02403     // Multiply the cigar length by 4 since it is the number of uint32_t fields.
02404     int bamSequenceLength = (myRecordPtr->myReadLength+1)/2;
02405     int newBufferSize = ((unsigned char*)(&(myRecordPtr->myData)) - 
02406                          (unsigned char*)myRecordPtr) +  
02407         myRecordPtr->myReadNameLength + ((myRecordPtr->myCigarLength)*4) +
02408         myRecordPtr->myReadLength + bamSequenceLength + myTagBufferSize;
02409 
02410     // Make sure the buffer is big enough.
02411     if(!allocateRecordStructure(newBufferSize))
02412     {
02413         // Failed to allocate space.
02414         return(false);
02415     }
02416 
02417     unsigned char * readNameEnds = (unsigned char*)(&(myRecordPtr->myData)) +
02418         myRecordPtr->myReadNameLength;
02419     unsigned char * packedSequence = readNameEnds + 
02420         myRecordPtr->myCigarLength * sizeof(int);
02421     unsigned char * packedQuality = 
02422         packedSequence + bamSequenceLength;
02423    
02424     char * extraPtr = (char*)packedQuality + myRecordPtr->myReadLength;
02425 
02426     bool status = true;
02427 
02428     // Set the tags in the buffer.
02429     if (extras.Entries())
02430     {
02431         for (int i = 0; i < extras.Capacity(); i++)
02432         {
02433             if (extras.SlotInUse(i))
02434             {
02435                 int key = extras.GetKey(i);
02436                 getTag(key, extraPtr);
02437                 extraPtr += 2;
02438                 getVtype(key, extraPtr[0]);
02439                 char vtype = extraPtr[0];
02440 
02441                 // increment the pointer to where the value is.
02442                 extraPtr += 1;
02443 
02444                 switch (vtype)
02445                 {
02446                     case 'A' :
02447                         *(char*)extraPtr = (char)getInteger(i);
02448                         // sprintf(extraPtr, "%d", getInteger(i));
02449                         extraPtr += 1;
02450                         break;
02451                     case 'c' :
02452                         *(int8_t*)extraPtr = (int8_t)getInteger(i);
02453                         // sprintf(extraPtr, "%.4d", getInteger(i));
02454                         extraPtr += 1;
02455                         break;
02456                     case 'C' :
02457                         *(uint8_t*)extraPtr = (uint8_t)getInteger(i);
02458                         // sprintf(extraPtr, "%.4d", getInteger(i));
02459                         extraPtr += 1;
02460                         break;
02461                     case 's' :
02462                         *(int16_t*)extraPtr = (int16_t)getInteger(i);
02463                         // sprintf(extraPtr, "%.4d", getInteger(i));
02464                         extraPtr += 2;
02465                         break;
02466                     case 'S' :
02467                         *(uint16_t*)extraPtr = (uint16_t)getInteger(i);
02468                         // sprintf(extraPtr, "%.4d", getInteger(i));
02469                         extraPtr += 2;
02470                         break;
02471                     case 'i' :
02472                         *(int32_t*)extraPtr = (int32_t)getInteger(i);
02473                         // sprintf(extraPtr, "%.4d", getInteger(i));
02474                         extraPtr += 4;
02475                         break;
02476                     case 'I' :
02477                         *(uint32_t*)extraPtr = (uint32_t)getInteger(i);
02478                         // sprintf(extraPtr, "%.4d", getInteger(i));
02479                         extraPtr += 4;
02480                         break;
02481                     case 'Z' :
02482                         sprintf(extraPtr, "%s", getString(i).c_str());
02483                         extraPtr += getString(i).Length() + 1;
02484                         break;
02485                     case 'f' :
02486                         // TODO, figure out if %f is good enough.
02487                         sprintf(extraPtr, "%f", getDouble(i));
02488                         extraPtr += 4;
02489                         break;
02490                     default :
02491                         myStatus.addError(SamStatus::FAIL_PARSE,
02492                                           "Unknown tag type.");
02493                         status = false;
02494                         break;
02495                 }
02496             }
02497         }
02498     }
02499 
02500     // Validate that the extra pointer is at the end of the allocated buffer.
02501     // If not then there was a problem.
02502     if(extraPtr != (char*)myRecordPtr + newBufferSize)
02503     {
02504         fprintf(stderr, "ERROR updating the buffer.  Incorrect size.");
02505         myStatus.addError(SamStatus::FAIL_PARSE,
02506                           "ERROR updating the buffer.  Incorrect size.");
02507         status = false;
02508     }
02509 
02510 
02511     // The buffer tags are now in sync.
02512     myNeedToSetTagsInBuffer = false;
02513     myIsTagsBufferValid = true;
02514     return(status);
02515 }
02516 
02517 
02518 // Reset the variables for a newly set buffer.  The buffer must be set first
02519 // since this looks up the reference ids in the buffer to set the reference
02520 // names.
02521 void SamRecord::setVariablesForNewBuffer(SamFileHeader& header)
02522 {
02523     // Lookup the reference name & mate reference name associated with this
02524     // record.
02525     myReferenceName = 
02526         header.getReferenceLabel(myRecordPtr->myReferenceID);
02527     myMateReferenceName = 
02528         header.getReferenceLabel(myRecordPtr->myMateReferenceID);      
02529 
02530     // Clear the SAM Strings that are now not in-sync with the buffer.
02531     myReadName.SetLength(0);
02532     myCigar.SetLength(0);
02533     mySequence.SetLength(0);
02534     mySeqWithEq.clear();
02535     mySeqWithoutEq.clear();
02536     myQuality.SetLength(0);
02537     myNeedToSetTagsFromBuffer = true;
02538     myNeedToSetTagsInBuffer = false;
02539 
02540     //Set that the buffer is valid.
02541     myIsBufferSynced = true;
02542     // Set that the variable length buffer fields are valid.
02543     myIsReadNameBufferValid = true;
02544     myIsCigarBufferValid = true;
02545     myIsSequenceBufferValid = true;
02546     myBufferSequenceTranslation = NONE;
02547     myIsQualityBufferValid = true;
02548     myIsTagsBufferValid = true;
02549     myIsBinValid = true;
02550 }
02551 
02552 
02553 // Extract the vtype from the key.
02554 void SamRecord::getVtype(int key, char& vtype) const
02555 {
02556     // Extract the vtype from the key.
02557     vtype = (key >> 16) & 0xFF;
02558 }
02559 
02560 // Extract the tag from the key.
02561 void SamRecord::getTag(int key, char* tag) const
02562 {
02563     // Extract the tag from the key.
02564     tag[0] = key & 0xFF;
02565     tag[1] = (key >> 8) & 0xFF;
02566     tag[2] = 0;
02567 }
02568 
02569 
02570 // Index is the index into the strings array.
02571 String & SamRecord::getString(int index)
02572 {
02573     int value = extras[index];
02574 
02575     return strings[value];
02576 }
02577 
02578 int & SamRecord::getInteger(int offset)
02579 {
02580     int value = extras[offset];
02581 
02582     return integers[value];
02583 }
02584 
02585 double & SamRecord::getDouble(int offset)
02586 {
02587     int value = extras[offset];
02588 
02589     return doubles[value];
02590 }
02591 
02592 
Generated on Tue Mar 22 22:50:13 2011 for StatGen Software by  doxygen 1.6.3