SamRecord.cpp

00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #include <stdlib.h>
00019 #include <limits>
00020 #include <stdexcept>
00021 
00022 #include "bam.h"
00023 
00024 #include "SamRecord.h"
00025 #include "SamValidation.h"
00026 
00027 #include "BaseUtilities.h"
00028 #include "SamQuerySeqWithRefHelper.h"
00029 
00030 const char* SamRecord::DEFAULT_READ_NAME = "UNKNOWN";
00031 const char* SamRecord::FIELD_ABSENT_STRING = "=";
00032 
00033 
00034 SamRecord::SamRecord()
00035     : myStatus(),
00036       myRefPtr(NULL),
00037       mySequenceTranslation(NONE)
00038 {
00039     int32_t defaultAllocSize = DEFAULT_BLOCK_SIZE + sizeof(int32_t);
00040 
00041     myRecordPtr = 
00042         (bamRecordStruct *) malloc(defaultAllocSize);
00043 
00044     myCigarTempBuffer = NULL;
00045     myCigarTempBufferAllocatedSize = 0;
00046 
00047     allocatedSize = defaultAllocSize;
00048 
00049     resetRecord();
00050 }
00051 
00052 
00053 SamRecord::SamRecord(ErrorHandler::HandlingType errorHandlingType)
00054     : myStatus(errorHandlingType),
00055       myRefPtr(NULL),
00056       mySequenceTranslation(NONE)
00057 {
00058     int32_t defaultAllocSize = DEFAULT_BLOCK_SIZE + sizeof(int32_t);
00059 
00060     myRecordPtr = 
00061         (bamRecordStruct *) malloc(defaultAllocSize);
00062 
00063     myCigarTempBuffer = NULL;
00064     myCigarTempBufferAllocatedSize = 0;
00065 
00066     allocatedSize = defaultAllocSize;
00067 
00068     resetRecord();
00069 }
00070 
00071 
00072 SamRecord::~SamRecord()
00073 {
00074     resetRecord();
00075 
00076     if(myRecordPtr != NULL)
00077     {
00078         free(myRecordPtr);
00079         myRecordPtr = NULL;
00080     }
00081     if(myCigarTempBuffer != NULL)
00082     {
00083         free(myCigarTempBuffer);
00084         myCigarTempBuffer = NULL;
00085         myCigarTempBufferAllocatedSize = 0;
00086     }
00087 }
00088 
00089 
00090 // Resets the fields of the record to a default value.
00091 void SamRecord::resetRecord()
00092 {
00093     myIsBufferSynced = true;
00094 
00095     myRecordPtr->myBlockSize = DEFAULT_BLOCK_SIZE;
00096     myRecordPtr->myReferenceID = -1;
00097     myRecordPtr->myPosition = -1;
00098     myRecordPtr->myReadNameLength = DEFAULT_READ_NAME_LENGTH;
00099     myRecordPtr->myMapQuality = 0;
00100     myRecordPtr->myBin = DEFAULT_BIN;
00101     myRecordPtr->myCigarLength = 0;
00102     myRecordPtr->myFlag = 0;
00103     myRecordPtr->myReadLength = 0;
00104     myRecordPtr->myMateReferenceID = -1;
00105     myRecordPtr->myMatePosition = -1;
00106     myRecordPtr->myInsertSize = 0;
00107    
00108     // Set the sam values for the variable length fields.
00109     // TODO - one way to speed this up might be to not set to "*" and just
00110     // clear them, and write out a '*' for SAM if it is empty.
00111     myReadName = DEFAULT_READ_NAME;
00112     myReferenceName = "*";
00113     myMateReferenceName = "*";
00114     myCigar = "*";
00115     mySequence = "*";
00116     mySeqWithEq.clear();
00117     mySeqWithoutEq.clear();
00118     myQuality = "*";
00119     myNeedToSetTagsFromBuffer = false;
00120     myNeedToSetTagsInBuffer = false;
00121 
00122     // Initialize the calculated alignment info to the uncalculated value.
00123     myAlignmentLength = -1;
00124     myUnclippedStartOffset = -1;
00125     myUnclippedEndOffset = -1;
00126 
00127     clearTags();
00128 
00129     // Set the bam values for the variable length fields.
00130     // Only the read name needs to be set, the others are a length of 0.
00131     // Set the read name.  The min size of myRecordPtr includes the size for
00132     // the default read name.
00133     memcpy(&(myRecordPtr->myData), myReadName.c_str(), 
00134            myRecordPtr->myReadNameLength);
00135 
00136     // Set that the variable length buffer fields are valid.
00137     myIsReadNameBufferValid = true;
00138     myIsCigarBufferValid = true;
00139     myIsSequenceBufferValid = true;
00140     myBufferSequenceTranslation = NONE;
00141     myIsQualityBufferValid = true;
00142     myIsTagsBufferValid = true;
00143     myIsBinValid = true;
00144 
00145     myCigarTempBufferLength = -1;
00146 
00147     myStatus = SamStatus::SUCCESS;
00148 
00149     NOT_FOUND_TAG_STRING = "";
00150     NOT_FOUND_TAG_INT = -1;
00151     NOT_FOUND_TAG_DOUBLE = -1;
00152 }
00153 
00154 
00155 // Reset the tag iterator to the beginning of the tags.
00156 void SamRecord::resetTagIter()
00157 {
00158     myLastTagIndex = -1;
00159 }
00160 
00161 
00162 // Returns whether or not the record is valid.
00163 // Header is needed to perform some validation against it.
00164 bool SamRecord::isValid(SamFileHeader& header)
00165 {
00166     myStatus = SamStatus::SUCCESS;
00167     SamValidationErrors invalidSamErrors;
00168     if(!SamValidator::isValid(header, *this, invalidSamErrors))
00169     {
00170         // The record is not valid.
00171         std::string errorMessage = "";
00172         invalidSamErrors.getErrorString(errorMessage);
00173         myStatus.setStatus(SamStatus::INVALID, errorMessage.c_str());
00174         return(false);
00175     }
00176     // The record is valid.
00177     return(true);
00178 }
00179 
00180 
00181 // Read the BAM record from a file.
00182 SamStatus::Status SamRecord::setBufferFromFile(IFILE filePtr, 
00183                                                SamFileHeader& header)
00184 {
00185     myStatus = SamStatus::SUCCESS;
00186     if((filePtr == NULL) || (filePtr->isOpen() == false))
00187     {
00188         // File is not open, return failure.
00189         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00190                            "Can't read from an unopened file.");
00191         return(SamStatus::FAIL_ORDER);
00192     }
00193 
00194     // Clear the record.
00195     resetRecord();
00196 
00197     // read the record size.
00198     int numBytes = 
00199         ifread(filePtr, &(myRecordPtr->myBlockSize), sizeof(int32_t));
00200 
00201     if(ifeof(filePtr))
00202     {
00203         if(numBytes == 0)
00204         {
00205             // End of file, nothing was read, no more records.
00206             myStatus.setStatus(SamStatus::NO_MORE_RECS,
00207                                "No more records left to read.");
00208             return(SamStatus::NO_MORE_RECS);
00209         }
00210         else
00211         {
00212             // Error: end of the file reached prior to reading the rest of the
00213             // record.
00214             myStatus.setStatus(SamStatus::FAIL_PARSE, 
00215                                "EOF reached in the middle of a record.");
00216             return(SamStatus::FAIL_PARSE);
00217         }
00218     }
00219 
00220     // allocate space for the record size.
00221     if(!allocateRecordStructure(myRecordPtr->myBlockSize + sizeof(int32_t)))
00222     {
00223         // Failed to allocate space.
00224         // Status is set by allocateRecordStructure.
00225         return(SamStatus::FAIL_MEM);
00226     }
00227 
00228     // Read the rest of the alignment block, starting at the reference id.
00229     if(ifread(filePtr, &(myRecordPtr->myReferenceID), myRecordPtr->myBlockSize)
00230        != (unsigned int)myRecordPtr->myBlockSize)
00231     {
00232         // Error reading the record.  Reset it and return failure.
00233         resetRecord();
00234         myStatus.setStatus(SamStatus::FAIL_IO,
00235                            "Failed to read the record");
00236         return(SamStatus::FAIL_IO);
00237     }
00238 
00239     setVariablesForNewBuffer(header);
00240 
00241     // Return the status of the record.
00242     return(SamStatus::SUCCESS);
00243 }
00244 
00245 
00246 void SamRecord::setReference(GenomeSequence* reference)
00247 {
00248     myRefPtr = reference;
00249 }
00250 
00251 
00252 // Set the type of sequence translation to use when getting
00253 // the sequence.  The default type (if this method is never called) is
00254 // NONE (the sequence is left as-is).  This is used 
00255 void SamRecord::setSequenceTranslation(SequenceTranslation translation)
00256 {
00257     mySequenceTranslation = translation;
00258 }
00259 
00260 
00261 bool SamRecord::setReadName(const char* readName) 
00262 {
00263     myReadName = readName;
00264     myIsBufferSynced = false;
00265     myIsReadNameBufferValid = false;
00266     myStatus = SamStatus::SUCCESS;
00267 
00268     // The read name must at least have some length, otherwise this is a parsing
00269     // error.
00270     if(myReadName.Length() == 0)
00271     {
00272         // Invalid - reset ReadName return false.
00273         myReadName = DEFAULT_READ_NAME;
00274         myRecordPtr->myReadNameLength = DEFAULT_READ_NAME_LENGTH;
00275         myStatus.setStatus(SamStatus::INVALID, "0 length Query Name.");
00276         return(false);
00277     }
00278 
00279     return true;
00280 }
00281 
00282 
00283 bool SamRecord::setFlag(uint16_t flag)
00284 {
00285     myStatus = SamStatus::SUCCESS;
00286     myRecordPtr->myFlag = flag;
00287     return true;
00288 }
00289 
00290 
00291 bool SamRecord::setReferenceName(SamFileHeader& header,
00292                                  const char* referenceName) 
00293 {
00294     myStatus = SamStatus::SUCCESS;
00295 
00296     myReferenceName = referenceName;
00297     myRecordPtr->myReferenceID = header.getReferenceID(referenceName);
00298 
00299     return true;
00300 }
00301 
00302 
00303 bool SamRecord::set1BasedPosition(int32_t position) 
00304 {
00305     return(set0BasedPosition(position - 1));
00306 }
00307 
00308 
00309 bool SamRecord::set0BasedPosition(int32_t position)
00310 {
00311     myStatus = SamStatus::SUCCESS;
00312     myRecordPtr->myPosition = position;
00313     myIsBinValid = false;
00314     return true;
00315 }
00316 
00317 
00318 bool SamRecord::setMapQuality(uint8_t mapQuality)
00319 {
00320     myStatus = SamStatus::SUCCESS;
00321     myRecordPtr->myMapQuality = mapQuality;
00322     return true;
00323 }
00324 
00325 
00326 bool SamRecord::setCigar(const char* cigar)
00327 {
00328     myStatus = SamStatus::SUCCESS;
00329     myCigar = cigar;
00330  
00331     myIsBufferSynced = false;
00332     myIsCigarBufferValid = false;
00333     myCigarTempBufferLength = -1;
00334     myIsBinValid = false;
00335 
00336     // Initialize the calculated alignment info to the uncalculated value.
00337     myAlignmentLength = -1;
00338     myUnclippedStartOffset = -1;
00339     myUnclippedEndOffset = -1;
00340 
00341     return true;
00342 }
00343 
00344 
00345 bool SamRecord::setCigar(const Cigar& cigar)
00346 {
00347     myStatus = SamStatus::SUCCESS;
00348     cigar.getCigarString(myCigar);
00349  
00350     myIsBufferSynced = false;
00351     myIsCigarBufferValid = false;
00352     myCigarTempBufferLength = -1;
00353     myIsBinValid = false;
00354 
00355     // Initialize the calculated alignment info to the uncalculated value.
00356     myAlignmentLength = -1;
00357     myUnclippedStartOffset = -1;
00358     myUnclippedEndOffset = -1;
00359 
00360     return true;
00361 }
00362 
00363 
00364 bool SamRecord::setMateReferenceName(SamFileHeader& header,
00365                                      const char* mateReferenceName) 
00366 {
00367     myStatus = SamStatus::SUCCESS;
00368     // Set the mate reference, if it is "=", set it to be equal
00369     // to myReferenceName.  This assumes that myReferenceName has already
00370     // been called.
00371     if(strcmp(mateReferenceName, FIELD_ABSENT_STRING) == 0)
00372     {
00373         myMateReferenceName = myReferenceName;
00374     }
00375     else
00376     {
00377         myMateReferenceName = mateReferenceName;
00378     }
00379 
00380     // Set the Mate Reference ID.
00381     myRecordPtr->myMateReferenceID = 
00382         header.getReferenceID(myMateReferenceName);
00383 
00384     return true;
00385 }
00386 
00387 
00388 bool SamRecord::set1BasedMatePosition(int32_t matePosition)
00389 {
00390     return(set0BasedMatePosition(matePosition - 1));
00391 }
00392 
00393 
00394 bool SamRecord::set0BasedMatePosition(int32_t matePosition)
00395 {
00396     myStatus = SamStatus::SUCCESS;
00397     myRecordPtr->myMatePosition = matePosition;
00398     return true;
00399 }
00400 
00401 
00402 bool SamRecord::setInsertSize(int32_t insertSize)
00403 {
00404     myStatus = SamStatus::SUCCESS;
00405     myRecordPtr->myInsertSize = insertSize;
00406     return true;
00407 }
00408 
00409 
00410 bool SamRecord::setSequence(const char* seq) 
00411 {
00412     myStatus = SamStatus::SUCCESS;
00413     mySequence = seq;
00414     mySeqWithEq.clear();
00415     mySeqWithoutEq.clear();
00416    
00417     myIsBufferSynced = false;
00418     myIsSequenceBufferValid = false;
00419     return true;
00420 }
00421 
00422 
00423 bool SamRecord::setQuality(const char* quality) 
00424 {
00425     myStatus = SamStatus::SUCCESS;
00426     myQuality = quality;
00427     myIsBufferSynced = false;
00428     myIsQualityBufferValid = false;
00429     return true;
00430 }
00431 
00432 
00433 // Set the BAM record from the passeed in buffer of the specified size.
00434 // Note: The size includes the block size.
00435 SamStatus::Status SamRecord::setBuffer(const char* fromBuffer,
00436                                        uint32_t fromBufferSize,
00437                                        SamFileHeader& header)
00438 {
00439     myStatus = SamStatus::SUCCESS;
00440     if((fromBuffer == NULL) || (fromBufferSize == 0))
00441     {
00442         // Buffer is empty.
00443         myStatus.setStatus(SamStatus::FAIL_PARSE,
00444                            "Cannot parse an empty file.");
00445         return(SamStatus::FAIL_PARSE);
00446     }
00447 
00448     // Clear the record.   
00449     resetRecord();
00450 
00451     // allocate space for the record size.
00452     if(!allocateRecordStructure(fromBufferSize))
00453     {
00454         // Failed to allocate space.
00455         return(SamStatus::FAIL_MEM);
00456     }
00457    
00458     memcpy(myRecordPtr, fromBuffer, fromBufferSize);
00459 
00460     setVariablesForNewBuffer(header);
00461 
00462     // Return the status of the record.
00463     return(SamStatus::SUCCESS);
00464 }
00465 
00466 
00467 // Add the specified tag to the record.
00468 // Returns true if the tag was successfully added, false otherwise.
00469 bool SamRecord::addTag(const char* tag, char vtype, const char* valuePtr)
00470 {
00471     myStatus = SamStatus::SUCCESS;
00472     bool status = true; // default to successful.
00473     int key = 0;
00474     int intVal = 0;
00475     int index = 0;
00476     char bamvtype;
00477 
00478     int tagBufferSize = 0;
00479 
00480     // First check to see if the tags need to be synced to the buffer.
00481     if(myNeedToSetTagsFromBuffer)
00482     {
00483         if(!setTagsFromBuffer())
00484         {
00485             // Failed to read tags from the buffer, so cannot add new ones.
00486             return(false);
00487         }
00488     }
00489 
00490     switch (vtype)
00491     {
00492         case 'A' :
00493             index = integers.Length();
00494             bamvtype = vtype;
00495             integers.Push((const int)*(valuePtr));
00496             tagBufferSize += 4;
00497             break;
00498         case 'i' :
00499             index = integers.Length();
00500             intVal = atoi((const char *)valuePtr);
00501             // Ints come in as int.  But it can be represented in fewer bits.
00502             // So determine a more specific type that is in line with the
00503             // types for BAM files.
00504             // First check to see if it is a negative.
00505             if(intVal < 0)
00506             {
00507                 // The int is negative, so it will need to use a signed type.
00508                 // See if it is greater than the min value for a char.
00509                 if(intVal > std::numeric_limits<char>::min())
00510                 {
00511                     // It can be stored in a signed char.
00512                     bamvtype = 'c';
00513                     tagBufferSize += 4;
00514                 }
00515                 else if(intVal > std::numeric_limits<short>::min())
00516                 {
00517                     // It fits in a signed short.
00518                     bamvtype = 's';
00519                     tagBufferSize += 5;
00520                 }
00521                 else
00522                 {
00523                     // Just store it as a signed int.
00524                     bamvtype = 'i';
00525                     tagBufferSize += 7;
00526                 }
00527             }
00528             else
00529             {
00530                 // It is positive, so an unsigned type can be used.
00531                 if(intVal < std::numeric_limits<unsigned char>::max())
00532                 {
00533                     // It is under the max of an unsigned char.
00534                     bamvtype = 'C';
00535                     tagBufferSize += 4;
00536                 }
00537                 else if(intVal < std::numeric_limits<unsigned short>::max())
00538                 {
00539                     // It is under the max of an unsigned short.
00540                     bamvtype = 'S';
00541                     tagBufferSize += 5;
00542                 }
00543                 else
00544                 {
00545                     // Just store it as an unsigned int.
00546                     bamvtype = 'I';
00547                     tagBufferSize += 7;
00548                 }
00549             }
00550             integers.Push(intVal);
00551             break;
00552         case 'Z' :
00553             index = strings.Length();
00554             bamvtype = vtype;
00555             strings.Push((const char *)valuePtr);
00556             tagBufferSize += 4 + strings.Last().Length();
00557             break;
00558         case 'f' :
00559             index = doubles.Length();
00560             bamvtype = vtype;
00561             doubles.Push(atof((const char *)valuePtr));
00562             tagBufferSize += 7;
00563             break;
00564         default :
00565             fprintf(stderr,
00566                     "samFile::ReadSAM() - Unknown custom field of type %c\n",
00567                     vtype);
00568             myStatus.setStatus(SamStatus::FAIL_PARSE, 
00569                                "Unknown custom field in a tag");
00570             status = false;
00571     }
00572 
00573     // Only add the tag if it has so far been successfully processed.
00574     if(status)
00575     {
00576         // The buffer tags are now out of sync.
00577         myNeedToSetTagsInBuffer = true;
00578         myIsTagsBufferValid = false;
00579         myIsBufferSynced = false;
00580 
00581         key = MAKEKEY(tag[0], tag[1], bamvtype);
00582         extras.Add(key, index);
00583         myTagBufferSize += tagBufferSize;
00584     }
00585     return(status);
00586 }
00587 
00588 
00589 // Get methods for record fields.
00590 const void* SamRecord::getRecordBuffer()
00591 {
00592     return(getRecordBuffer(mySequenceTranslation));
00593 }
00594 
00595 
00596 // Get methods for record fields.
00597 const void* SamRecord::getRecordBuffer(SequenceTranslation translation) 
00598 {
00599     myStatus = SamStatus::SUCCESS;
00600     bool status = true;
00601     // If the buffer is not synced or the sequence in the buffer is not
00602     // properly translated, fix the buffer.
00603     if((myIsBufferSynced == false) ||
00604        (myBufferSequenceTranslation != translation))
00605     {
00606         status &= fixBuffer(translation);
00607     }
00608     // If the buffer is synced, check to see if the tags need to be synced.
00609     if(myNeedToSetTagsInBuffer)
00610     {
00611         status &= setTagsInBuffer();
00612     }
00613     if(!status)
00614     {
00615         return(NULL);
00616     }
00617     return (const void *)myRecordPtr;
00618 }
00619 
00620 
00621 // Write the record as a buffer into the file using the class's 
00622 // sequence translation setting.
00623 SamStatus::Status SamRecord::writeRecordBuffer(IFILE filePtr)
00624 {
00625     return(writeRecordBuffer(filePtr, mySequenceTranslation));
00626 }
00627 
00628 
00629 // Write the record as a buffer into the file using the specified translation.
00630 SamStatus::Status SamRecord::writeRecordBuffer(IFILE filePtr, 
00631                                                SequenceTranslation translation)
00632 {
00633     myStatus = SamStatus::SUCCESS;
00634     if((filePtr == NULL) || (filePtr->isOpen() == false))
00635     {
00636         // File is not open, return failure.
00637         myStatus.setStatus(SamStatus::FAIL_ORDER,
00638                            "Can't write to an unopened file.");
00639         return(SamStatus::FAIL_ORDER);
00640     }
00641 
00642     if((myIsBufferSynced == false) ||
00643        (myBufferSequenceTranslation != translation))
00644     {
00645         if(!fixBuffer(translation))
00646         {
00647             return(myStatus.getStatus());
00648         }
00649     }
00650 
00651     // Write the record.
00652     unsigned int numBytesToWrite = myRecordPtr->myBlockSize + sizeof(int32_t);
00653     unsigned int numBytesWritten = 
00654         ifwrite(filePtr, myRecordPtr, numBytesToWrite);
00655 
00656     // Return status based on if the correct number of bytes were written.
00657     if(numBytesToWrite == numBytesWritten)
00658     {
00659         return(SamStatus::SUCCESS);
00660     }
00661     // The correct number of bytes were not written.
00662     myStatus.setStatus(SamStatus::FAIL_IO, "Failed to write the entire record.");
00663     return(SamStatus::FAIL_IO);
00664 }
00665 
00666 
00667 int32_t SamRecord::getBlockSize() 
00668 {
00669     myStatus = SamStatus::SUCCESS;
00670     // If the buffer isn't synced, sync the buffer to determine the
00671     // block size.
00672     if(myIsBufferSynced == false)
00673     {
00674         // Since this just returns the block size, the translation of
00675         // the sequence does not matter, so just use the currently set
00676         // value.
00677         fixBuffer(myBufferSequenceTranslation);
00678     }
00679     return myRecordPtr->myBlockSize;
00680 }
00681 
00682 
00683 // This method returns the reference name.
00684 const char* SamRecord::getReferenceName()
00685 {
00686     myStatus = SamStatus::SUCCESS;
00687     return myReferenceName.c_str();
00688 }
00689 
00690 
00691 int32_t SamRecord::getReferenceID()
00692 {
00693     myStatus = SamStatus::SUCCESS;
00694     return myRecordPtr->myReferenceID;
00695 }
00696 
00697 
00698 int32_t SamRecord::get1BasedPosition()
00699 {
00700     myStatus = SamStatus::SUCCESS;
00701     return (myRecordPtr->myPosition + 1);
00702 }
00703 
00704 
00705 int32_t SamRecord::get0BasedPosition()
00706 {
00707     myStatus = SamStatus::SUCCESS;
00708     return myRecordPtr->myPosition;
00709 }
00710 
00711 
00712 uint8_t SamRecord::getReadNameLength() 
00713 {
00714     myStatus = SamStatus::SUCCESS;
00715     // If the buffer is valid, return the size from there, otherwise get the 
00716     // size from the string length + 1 (ending null).
00717     if(myIsReadNameBufferValid)
00718     {
00719         return(myRecordPtr->myReadNameLength);
00720     }
00721    
00722     return(myReadName.Length() + 1);
00723 }
00724 
00725 
00726 uint8_t SamRecord::getMapQuality()
00727 {
00728     myStatus = SamStatus::SUCCESS;
00729     return myRecordPtr->myMapQuality;
00730 }
00731 
00732 
00733 uint16_t SamRecord::getBin()
00734 {
00735     myStatus = SamStatus::SUCCESS;
00736     if(!myIsBinValid)
00737     {
00738         // The bin that is set in the record is not valid, so
00739         // reset it.
00740         myRecordPtr->myBin = 
00741             bam_reg2bin(myRecordPtr->myPosition, get1BasedAlignmentEnd());      
00742         myIsBinValid = true;
00743     }
00744     return(myRecordPtr->myBin);
00745 }
00746 
00747 
00748 uint16_t SamRecord::getCigarLength()
00749 {
00750     myStatus = SamStatus::SUCCESS;
00751     // If the cigar buffer is valid
00752     // then get the length from there.
00753     if(myIsCigarBufferValid)
00754     {
00755         return myRecordPtr->myCigarLength;      
00756     }
00757 
00758     if(myCigarTempBufferLength == -1)
00759     {
00760         // The cigar buffer is not valid and the cigar temp buffer is not set,
00761         // so parse the string.
00762         parseCigarString();
00763     }
00764    
00765     // The temp buffer is now set, so return the size.
00766     return(myCigarTempBufferLength);
00767 }
00768 
00769 
00770 uint16_t SamRecord::getFlag()
00771 {
00772     myStatus = SamStatus::SUCCESS;
00773     return myRecordPtr->myFlag;
00774 }
00775 
00776 
00777 int32_t SamRecord::getReadLength() 
00778 {
00779     myStatus = SamStatus::SUCCESS;
00780     if(myIsSequenceBufferValid == false)
00781     {
00782         // If the sequence is "*", then return 0.
00783         if((mySequence.Length() == 1) && (mySequence[0] == '*'))
00784         {
00785             return(0);
00786         }
00787         // Do not add 1 since it is not null terminated.
00788         return(mySequence.Length());
00789     }
00790     return(myRecordPtr->myReadLength);
00791 }
00792 
00793 
00794 // This method returns the mate reference name.  If it is equal to the
00795 // reference name, it still returns the reference name.
00796 const char* SamRecord::getMateReferenceName()
00797 {
00798     myStatus = SamStatus::SUCCESS;
00799     return myMateReferenceName.c_str();
00800 }
00801 
00802 
00803 // This method returns the mate reference name.  If it is equal to the
00804 // reference name, it returns "=", unless they are both "*" in which case
00805 // "*" is returned.
00806 const char* SamRecord::getMateReferenceNameOrEqual()
00807 {
00808     myStatus = SamStatus::SUCCESS;
00809     if(myMateReferenceName == "*")
00810     {
00811         return(myMateReferenceName);
00812     }
00813     if(myMateReferenceName == getReferenceName())
00814     {
00815         return(FIELD_ABSENT_STRING);
00816     }
00817     else
00818     {
00819         return(myMateReferenceName);
00820     }
00821 }
00822 
00823 
00824 int32_t SamRecord::getMateReferenceID()
00825 {
00826     myStatus = SamStatus::SUCCESS;
00827     return myRecordPtr->myMateReferenceID;
00828 }
00829 
00830 
00831 int32_t SamRecord::get1BasedMatePosition()
00832 {
00833     myStatus = SamStatus::SUCCESS;
00834     return (myRecordPtr->myMatePosition + 1);
00835 }
00836 
00837 
00838 int32_t SamRecord::get0BasedMatePosition()
00839 {
00840     myStatus = SamStatus::SUCCESS;
00841     return myRecordPtr->myMatePosition;
00842 }
00843 
00844 
00845 int32_t SamRecord::getInsertSize()
00846 {
00847     myStatus = SamStatus::SUCCESS;
00848     return myRecordPtr->myInsertSize;
00849 }
00850 
00851 
00852 // Returns the inclusive rightmost position of the clipped sequence.
00853 int32_t SamRecord::get0BasedAlignmentEnd()
00854 {
00855     myStatus = SamStatus::SUCCESS;
00856     if(myAlignmentLength == -1)
00857     {
00858         // Alignment end has not been set, so calculate it.
00859         parseCigar();
00860     }
00861     // If alignment length > 0, subtract 1 from it to get the end.
00862     if(myAlignmentLength == 0)
00863     {
00864         // Length is 0, just return the start position.
00865         return(myRecordPtr->myPosition);
00866     }
00867     return(myRecordPtr->myPosition + myAlignmentLength - 1);
00868 }
00869 
00870 
00871 // Returns the inclusive rightmost position of the clipped sequence.
00872 int32_t SamRecord::get1BasedAlignmentEnd()
00873 {
00874     return(get0BasedAlignmentEnd() + 1);
00875 }
00876 
00877    
00878 // Return the length of the alignment.
00879 int32_t SamRecord::getAlignmentLength()
00880 {
00881     myStatus = SamStatus::SUCCESS;
00882     if(myAlignmentLength == -1)
00883     {
00884         // Alignment end has not been set, so calculate it.
00885         parseCigar();
00886     }
00887     // Return the alignment length.
00888     return(myAlignmentLength);
00889 }
00890 
00891 // Returns the inclusive left-most position adjust for clipped bases.
00892 int32_t SamRecord::get0BasedUnclippedStart()
00893 {
00894     myStatus = SamStatus::SUCCESS;
00895     if(myUnclippedStartOffset == -1)
00896     {
00897         // Unclipped has not yet been calculated, so parse the cigar to get it
00898         parseCigar();
00899     }
00900     return(myRecordPtr->myPosition - myUnclippedStartOffset);
00901 }
00902 
00903 
00904 // Returns the inclusive left-most position adjust for clipped bases.
00905 int32_t SamRecord::get1BasedUnclippedStart()
00906 {
00907     return(get0BasedUnclippedStart() + 1);
00908 }
00909 
00910 
00911 // Returns the inclusive right-most position adjust for clipped bases.
00912 int32_t SamRecord::get0BasedUnclippedEnd()
00913 {
00914     // myUnclippedEndOffset will be set by get0BasedAlignmentEnd if the 
00915     // cigar has not yet been parsed, so no need to check it here.
00916     return(get0BasedAlignmentEnd() + myUnclippedEndOffset);
00917 }
00918 
00919 
00920 // Returns the inclusive right-most position adjust for clipped bases.
00921 int32_t SamRecord::get1BasedUnclippedEnd()
00922 {
00923     return(get0BasedUnclippedEnd() + 1);
00924 }
00925 
00926 
00927 // Get the read name.
00928 const char* SamRecord::getReadName()
00929 {
00930     myStatus = SamStatus::SUCCESS;
00931     if(myReadName.Length() == 0)
00932     {
00933         // 0 Length, means that it is in the buffer, but has not yet
00934         // been synced to the string, so do the sync.
00935         myReadName = (char*)&(myRecordPtr->myData);
00936     }
00937     return myReadName.c_str();
00938 }
00939 
00940 
00941 const char* SamRecord::getCigar()
00942 {
00943     myStatus = SamStatus::SUCCESS;
00944     if(myCigar.Length() == 0)
00945     {
00946         // 0 Length, means that it is in the buffer, but has not yet
00947         // been synced to the string, so do the sync.
00948         parseCigarBinary();
00949     }
00950     return myCigar.c_str();
00951 }
00952 
00953 
00954 const char* SamRecord::getSequence()
00955 {
00956     return(getSequence(mySequenceTranslation));
00957 }
00958 
00959 
00960 const char* SamRecord::getSequence(SequenceTranslation translation)
00961 {
00962     myStatus = SamStatus::SUCCESS;
00963     if(mySequence.Length() == 0)
00964     {
00965         // 0 Length, means that it is in the buffer, but has not yet
00966         // been synced to the string, so do the sync.
00967         setSequenceAndQualityFromBuffer();
00968     }
00969 
00970     // Determine if translation needs to be done.
00971     if((translation == NONE) || (myRefPtr == NULL))
00972     {
00973         return mySequence.c_str();
00974     }
00975     else if(translation == EQUAL)
00976     {
00977         if(mySeqWithEq.length() == 0)
00978         {
00979             // Check to see if the sequence is defined.
00980             if(mySequence == "*")
00981             {
00982                 // Sequence is undefined, so no translation necessary.
00983                 mySeqWithEq = '*';
00984             }
00985             else
00986             {
00987                 // Sequence defined, so translate it.
00988                 SamQuerySeqWithRef::seqWithEquals(mySequence.c_str(), 
00989                                                   myRecordPtr->myPosition,
00990                                                   myCigarRoller,
00991                                                   getReferenceName(),
00992                                                   *myRefPtr,
00993                                                   mySeqWithEq);
00994             }
00995         }
00996         return(mySeqWithEq.c_str());
00997     }
00998     else
00999     {
01000         // translation == BASES
01001         if(mySeqWithoutEq.length() == 0)
01002         {
01003             if(mySequence == "*")
01004             {
01005                 // Sequence is undefined, so no translation necessary.
01006                 mySeqWithoutEq = '*';
01007             }
01008             else
01009             {
01010                 // Sequence defined, so translate it.
01011                 SamQuerySeqWithRef::seqWithoutEquals(mySequence.c_str(), 
01012                                                      myRecordPtr->myPosition,
01013                                                      myCigarRoller,
01014                                                      getReferenceName(),
01015                                                      *myRefPtr,
01016                                                      mySeqWithoutEq);
01017             }
01018         }
01019         return(mySeqWithoutEq.c_str());
01020     }
01021 }
01022 
01023 
01024 const char* SamRecord::getQuality() 
01025 {
01026     myStatus = SamStatus::SUCCESS;
01027     if(myQuality.Length() == 0)
01028     {
01029         // 0 Length, means that it is in the buffer, but has not yet
01030         // been synced to the string, so do the sync.
01031         setSequenceAndQualityFromBuffer();      
01032     }
01033     return myQuality.c_str();
01034 }
01035 
01036 
01037 char SamRecord::getSequence(int index)
01038 {
01039     return(getSequence(index, mySequenceTranslation));
01040 }
01041 
01042 
01043 char SamRecord::getSequence(int index, SequenceTranslation translation)
01044 {
01045     static const char * asciiBases = "=AC.G...T......N";
01046 
01047     // Determine the read length.
01048     int32_t readLen = getReadLength();
01049 
01050     // If the read length is 0, this method should not be called.
01051     if(readLen == 0)
01052     {
01053         String exceptionString = "SamRecord::getSequence(";
01054         exceptionString += index;
01055         exceptionString += ") is not allowed since sequence = '*'";
01056         throw std::runtime_error(exceptionString.c_str());
01057     }
01058     else if((index < 0) || (index >= readLen))
01059     {
01060         // Only get here if the index was out of range, so thow an exception.
01061         String exceptionString = "SamRecord::getSequence(";
01062         exceptionString += index;
01063         exceptionString += ") is out of range. Index must be between 0 and ";
01064         exceptionString += (readLen - 1);
01065         throw std::runtime_error(exceptionString.c_str());
01066     }
01067 
01068     // Determine if translation needs to be done.
01069     if((translation == NONE) || (myRefPtr == NULL))
01070     {
01071         // No translation needs to be done.
01072         if(mySequence.Length() == 0)
01073         {
01074             // Parse BAM sequence.
01075             // TODO - maybe store this pointer - and use that to track when
01076             // valid?
01077             unsigned char * packedSequence = 
01078                 (unsigned char *)myRecordPtr->myData + 
01079                 myRecordPtr->myReadNameLength +
01080                 myRecordPtr->myCigarLength * sizeof(int);
01081             
01082             return(index & 1 ?
01083                    asciiBases[packedSequence[index / 2] & 0xF] :
01084                    asciiBases[packedSequence[index / 2] >> 4]);
01085         }
01086         // Already have string.
01087         return(mySequence[index]);
01088     }
01089     else
01090     {
01091         // Need to translate the sequence either to have '=' or to not
01092         // have it.
01093         // First check to see if the sequence has been set.
01094         if(mySequence.Length() == 0)
01095         {
01096             // 0 Length, means that it is in the buffer, but has not yet
01097             // been synced to the string, so do the sync.
01098             setSequenceAndQualityFromBuffer();
01099         }
01100 
01101         // Check the type of translation.
01102         if(translation == EQUAL)
01103         {
01104             // Check whether or not the string has already been 
01105             // retrieved that has the '=' in it.
01106             if(mySeqWithEq.length() == 0)
01107             {
01108                 // The string with '=' has not yet been determined,
01109                 // so get the string.
01110                 // Check to see if the sequence is defined.
01111                 if(mySequence == "*")
01112                 {
01113                     // Sequence is undefined, so no translation necessary.
01114                     mySeqWithEq = '*';
01115                 }
01116                 else
01117                 {
01118                     // Sequence defined, so translate it.
01119                     SamQuerySeqWithRef::seqWithEquals(mySequence.c_str(), 
01120                                                       myRecordPtr->myPosition, 
01121                                                       myCigarRoller,
01122                                                       getReferenceName(),
01123                                                       *myRefPtr,
01124                                                       mySeqWithEq);
01125                 }
01126             }
01127             // Sequence is set, so return it.
01128             return(mySeqWithEq[index]);
01129         }
01130         else
01131         {
01132             // translation == BASES
01133             // Check whether or not the string has already been 
01134             // retrieved that does not have the '=' in it.
01135             if(mySeqWithoutEq.length() == 0)
01136             {
01137                 // The string with '=' has not yet been determined,
01138                 // so get the string.
01139                 // Check to see if the sequence is defined.
01140                 if(mySequence == "*")
01141                 {
01142                     // Sequence is undefined, so no translation necessary.
01143                     mySeqWithoutEq = '*';
01144                 }
01145                 else
01146                 {
01147                     // Sequence defined, so translate it.
01148                     // The string without '=' has not yet been determined,
01149                     // so get the string.
01150                     SamQuerySeqWithRef::seqWithoutEquals(mySequence.c_str(), 
01151                                                          myRecordPtr->myPosition, 
01152                                                          myCigarRoller,
01153                                                          getReferenceName(),
01154                                                          *myRefPtr,
01155                                                          mySeqWithoutEq);
01156                 }
01157             }
01158             // Sequence is set, so return it.
01159             return(mySeqWithoutEq[index]);
01160         }
01161     }
01162 }
01163 
01164 
01165 char SamRecord::getQuality(int index)
01166 {
01167     // Determine the read length.
01168     int32_t readLen = getReadLength();
01169 
01170     // If the read length is 0, return ' ' whose ascii code is below
01171     // the minimum ascii code for qualities.
01172     if(readLen == 0)
01173     {
01174         return(BaseUtilities::UNKNOWN_QUALITY_CHAR);
01175     }
01176     else if((index < 0) || (index >= readLen))
01177     {
01178         // Only get here if the index was out of range, so thow an exception.
01179         String exceptionString = "SamRecord::getQuality(";
01180         exceptionString += index;
01181         exceptionString += ") is out of range. Index must be between 0 and ";
01182         exceptionString += (readLen - 1);
01183         throw std::runtime_error(exceptionString.c_str());
01184     }
01185 
01186     if(myQuality.Length() == 0)
01187     {
01188         // Parse BAM Quality.
01189         unsigned char * packedQuality = 
01190             (unsigned char *)myRecordPtr->myData +
01191             myRecordPtr->myReadNameLength + 
01192             myRecordPtr->myCigarLength * sizeof(int) + 
01193             (myRecordPtr->myReadLength + 1) / 2;
01194         return(packedQuality[index] + 33);
01195     }
01196     else
01197     {
01198         // Already have string.
01199         if((myQuality.Length() == 1) && (myQuality[0] == '*'))
01200         {
01201             // Return 0xFF like it does for BAM.
01202             return(0xFF);
01203         }
01204         else
01205         {
01206             return(myQuality[index]);
01207         }
01208     }
01209 }
01210 
01211    
01212 Cigar* SamRecord::getCigarInfo()
01213 {
01214     // Check to see whether or not the Cigar has already been
01215     // set - this is determined by checking if alignment length
01216     // is set since alignment length and the cigar are set
01217     // at the same time.
01218     if(myAlignmentLength == -1)
01219     {
01220         // Not been set, so calculate it.
01221         parseCigar();
01222     }
01223     return(&myCigarRoller);
01224 }
01225 
01226 
01227 uint32_t SamRecord::getTagLength() 
01228 {
01229     myStatus = SamStatus::SUCCESS;
01230     if(myNeedToSetTagsFromBuffer)
01231     {
01232         // Tags are only set in the buffer, so the size of the tags is 
01233         // the length of the record minus the starting location of the tags.
01234         unsigned char * tagStart = 
01235             (unsigned char *)myRecordPtr->myData 
01236             + myRecordPtr->myReadNameLength 
01237             + myRecordPtr->myCigarLength * sizeof(int)
01238             + (myRecordPtr->myReadLength + 1) / 2 + myRecordPtr->myReadLength;
01239       
01240         // The non-tags take up from the start of the record to the tag start.
01241         // Do not include the block size part of the record since it is not
01242         // included in the size.
01243         uint32_t nonTagSize = 
01244             tagStart - (unsigned char*)&(myRecordPtr->myReferenceID);
01245         // Tags take up the size of the block minus the non-tag section.
01246         uint32_t tagSize = myRecordPtr->myBlockSize - nonTagSize;
01247         return(tagSize);
01248     }
01249 
01250     // Tags are stored outside the buffer, so myTagBufferSize is set.
01251     return(myTagBufferSize);
01252 }
01253 
01254 
01255 // Returns true if there is another tag and sets tag and vtype to the
01256 // appropriate values, and returns a pointer to the value.
01257 // Sets the Status to SUCCESS when a tag is successfully returned or
01258 // when there are no more tags.  Otherwise the status is set to describe
01259 // why it failed (parsing, etc).
01260 bool SamRecord::getNextSamTag(char* tag, char& vtype, void** value)
01261 {
01262     myStatus = SamStatus::SUCCESS;
01263     if(myNeedToSetTagsFromBuffer)
01264     {
01265         if(!setTagsFromBuffer())
01266         {
01267             // Failed to read the tags from the buffer, so cannot
01268             // get tags.
01269             return(false);
01270         }
01271     }
01272 
01273     // Increment the tag index to start looking at the next tag.
01274     // At the beginning, it is set to -1.
01275     myLastTagIndex++;
01276     int maxTagIndex = extras.Capacity();
01277     if(myLastTagIndex >= maxTagIndex)
01278     {
01279         // Hit the end of the tags, return false, no more tags.
01280         // Status is still success since this is not an error, 
01281         // it is just the end of the list.
01282         return(false);
01283     }
01284 
01285     bool tagFound = false;
01286     // Loop until a tag is found or the end of extras is hit.
01287     while((tagFound == false) && (myLastTagIndex < maxTagIndex))
01288     {
01289         if(extras.SlotInUse(myLastTagIndex))
01290         {
01291             // Found a slot to use.
01292             int key = extras.GetKey(myLastTagIndex);
01293             getTag(key, tag);
01294             getVtype(key, vtype);
01295             tagFound = true;
01296             // Get the value associated with the key based on the vtype.
01297             switch (vtype)
01298             {
01299                 case 'A' :
01300                     *value = getIntegerPtr(myLastTagIndex);
01301                     break;
01302                 case 'f' :
01303                     *value = getDoublePtr(myLastTagIndex);
01304                     break;
01305                 case 'c' :
01306                 case 'C' :
01307                 case 's' :
01308                 case 'S' :
01309                 case 'i' :
01310                 case 'I' :
01311                     vtype = 'i';
01312                     *value = getIntegerPtr(myLastTagIndex);
01313                     break;
01314                 case 'Z' :
01315                     *value = getStringPtr(myLastTagIndex);
01316                     break;
01317                 default:
01318                     myStatus.setStatus(SamStatus::FAIL_PARSE,
01319                                        "Unknown tag type");
01320                     tagFound = false;
01321                     break;
01322             }
01323         }
01324         if(!tagFound)
01325         {
01326             // Increment the index since a tag was not found.
01327             myLastTagIndex++;
01328         }
01329     }
01330     return(tagFound);
01331 }
01332 
01333 
01334 // Returns the values of all fields except the tags.
01335 bool SamRecord::getFields(bamRecordStruct& recStruct, String& readName, 
01336                           String& cigar, String& sequence, String& quality)
01337 {
01338     return(getFields(recStruct, readName, cigar, sequence, quality,
01339                      mySequenceTranslation));
01340 }
01341 
01342 
01343 // Returns the values of all fields except the tags.
01344 bool SamRecord::getFields(bamRecordStruct& recStruct, String& readName, 
01345                           String& cigar, String& sequence, String& quality,
01346                           SequenceTranslation translation)
01347 {
01348     myStatus = SamStatus::SUCCESS;
01349     if(myIsBufferSynced == false)
01350     {
01351         if(!fixBuffer(translation))
01352         {
01353             // failed to set the buffer, return false.
01354             return(false);
01355         }
01356     }
01357     memcpy(&recStruct, myRecordPtr, sizeof(bamRecordStruct));
01358 
01359     readName = getReadName();
01360     // Check the status.
01361     if(myStatus != SamStatus::SUCCESS)
01362     {
01363         // Failed to set the fields, return false.
01364         return(false);
01365     }
01366     cigar = getCigar();
01367     // Check the status.
01368     if(myStatus != SamStatus::SUCCESS)
01369     {
01370         // Failed to set the fields, return false.
01371         return(false);
01372     }
01373     sequence = getSequence(translation);
01374     // Check the status.
01375     if(myStatus != SamStatus::SUCCESS)
01376     {
01377         // Failed to set the fields, return false.
01378         return(false);
01379     }
01380     quality = getQuality();
01381     // Check the status.
01382     if(myStatus != SamStatus::SUCCESS)
01383     {
01384         // Failed to set the fields, return false.
01385         return(false);
01386     }
01387     return(true);
01388 }
01389 
01390 
01391 bool SamRecord::isIntegerType(char vtype) const
01392 {
01393     if((vtype == 'c') || (vtype == 'C') ||
01394        (vtype == 's') || (vtype == 'S') ||
01395        (vtype == 'i') || (vtype == 'I'))
01396     {
01397         return(true);
01398     }
01399     return(false);
01400 }
01401 
01402 
01403 bool SamRecord::isDoubleType(char vtype) const
01404 {
01405     if(vtype == 'f')
01406     {
01407         return(true);
01408     }
01409     return(false);
01410 }
01411 
01412 
01413 bool SamRecord::isCharType(char vtype) const
01414 {
01415     if(vtype == 'A')
01416     {
01417         return(true);
01418     }
01419     return(false);
01420 }
01421 
01422 
01423 bool SamRecord::isStringType(char vtype) const
01424 {
01425     if(vtype == 'Z')
01426     {
01427         return(true);
01428     }
01429     return(false);
01430 }
01431 
01432 
01433 void SamRecord::clearTags()
01434 {
01435     if(extras.Entries() != 0)
01436     {
01437         extras.Clear();
01438     }
01439     strings.Clear();
01440     integers.Clear();
01441     doubles.Clear();
01442     myTagBufferSize = 0;
01443     resetTagIter();
01444 }
01445 
01446 
01447 // Return the error after a failed SamRecord call.
01448 const SamStatus& SamRecord::getStatus()
01449 {
01450     return(myStatus);
01451 }
01452 
01453 
01454 String & SamRecord::getString(const char * tag)
01455 {
01456     // Init to success.
01457     myStatus = SamStatus::SUCCESS;
01458     // Parse the buffer if necessary.
01459     if(myNeedToSetTagsFromBuffer)
01460     {
01461         if(!setTagsFromBuffer())
01462         {
01463             // Failed to read the tags from the buffer, so cannot
01464             // get tags.
01465             // TODO - what do we want to do on failure?
01466         }
01467     }
01468     
01469     int key = MAKEKEY(tag[0], tag[1], 'Z');
01470     int offset = extras.Find(key);
01471 
01472     int value;
01473     if (offset < 0)
01474     {
01475         // TODO - what do we want to do on failure?
01476         return(NOT_FOUND_TAG_STRING);
01477     }
01478     else
01479         value = extras[offset];
01480 
01481     return strings[value];
01482 }
01483 
01484 int & SamRecord::getInteger(const char * tag)
01485 {
01486     // Init to success.
01487     myStatus = SamStatus::SUCCESS;
01488     // Parse the buffer if necessary.
01489     if(myNeedToSetTagsFromBuffer)
01490     {
01491         if(!setTagsFromBuffer())
01492         {
01493             // Failed to read the tags from the buffer, so cannot
01494             // get tags.
01495             // TODO - what do we want to do on failure?
01496         }
01497     }
01498     
01499     int key = MAKEKEY(tag[0], tag[1], 'i');
01500     int offset = extras.Find(key);
01501 
01502     int value;
01503     if (offset < 0)
01504     {
01505         // TODO - what do we want to do on failure?
01506         return NOT_FOUND_TAG_INT;
01507     }
01508     else
01509         value = extras[offset];
01510 
01511     return integers[value];
01512 }
01513 
01514 double & SamRecord::getDouble(const char * tag)
01515 {
01516     // Init to success.
01517     myStatus = SamStatus::SUCCESS;
01518     // Parse the buffer if necessary.
01519     if(myNeedToSetTagsFromBuffer)
01520     {
01521         if(!setTagsFromBuffer())
01522         {
01523             // Failed to read the tags from the buffer, so cannot
01524             // get tags.
01525             // TODO - what do we want to do on failure?
01526         }
01527     }
01528     
01529     int key = MAKEKEY(tag[0], tag[1], 'f');
01530     int offset = extras.Find(key);
01531 
01532     int value;
01533     if (offset < 0)
01534     {
01535         // TODO - what do we want to do on failure?
01536         return NOT_FOUND_TAG_DOUBLE;
01537     }
01538     else
01539         value = extras[offset];
01540 
01541     return doubles[value];
01542 }
01543 
01544 
01545 bool SamRecord::checkTag(const char * tag, char type)
01546 {
01547     // Init to success.
01548     myStatus = SamStatus::SUCCESS;
01549     // Parse the buffer if necessary.
01550     if(myNeedToSetTagsFromBuffer)
01551     {
01552         if(!setTagsFromBuffer())
01553         {
01554             // Failed to read the tags from the buffer, so cannot
01555             // get tags.
01556             return("");
01557         }
01558     }
01559     
01560     int key = MAKEKEY(tag[0], tag[1], type);
01561 
01562     return (extras.Find(key) != LH_NOTFOUND);
01563 }
01564 
01565 
01566 // Return the number of bases in this read that overlap the passed in
01567 // region.  (start & end are 0-based)
01568 uint32_t SamRecord::getNumOverlaps(int32_t start, int32_t end)
01569 {
01570     // Determine whether or not the cigar has been parsed, which sets up
01571     // the cigar roller.  This is determined by checking the alignment length.
01572     if(myAlignmentLength == -1)
01573     {
01574         parseCigar();
01575     }
01576     return(myCigarRoller.getNumOverlaps(start, end, get0BasedPosition()));
01577 }
01578 
01579 
01580 // // Get the Extra field: <tag>:<vtype>:<value> for the key.
01581 // void SamRecord::getSamExtraFieldFromKey(int key, String& extraField) 
01582 // {
01583 //     myStatus = SamStatus::SUCCESS;
01584 //     // Extract the tag from the key.
01585 //     char tag[3];
01586 //     getTag(key, tag);
01587 
01588 //     char vtype;
01589 //     getVtype(key, vtype);
01590 
01591 //     // Look up the key to get the offset into the appropriate array.
01592 //     int offset = extras.Find(key);
01593 
01594 //     // Get the value associated with the key based on the vtype.
01595 //     switch (vtype)
01596 //     {
01597 //         case 'A' :
01598 //             extraField = tag;
01599 //             extraField += ":A:";
01600 //             extraField += (char)(getInteger(offset));
01601 //             break;
01602 //         case 'f' :
01603 //             extraField = tag;
01604 //             extraField += ":f:";
01605 //             extraField += getDouble(offset);
01606 //             break;
01607 //         case 'c' :
01608 //         case 'C' :
01609 //         case 's' :
01610 //         case 'S' :
01611 //         case 'i' :
01612 //         case 'I' :
01613 //             extraField = tag;
01614 //             extraField += ":i:";
01615 //             extraField += getInteger(offset);
01616 //             break;
01617 //         case 'Z' :
01618 //             //         String valueString = getString(offset).c_str();
01619 //             extraField = tag;
01620 //             extraField += ":Z:";
01621 //             extraField += getString(offset);
01622 //             break;
01623 //         default:
01624 //             myStatus.setStatus(SamStatus::FAIL_PARSE,
01625 //                                "Unknown tag type.");
01626 //             break;
01627 //     }
01628 // }
01629 
01630 
01631 // Allocate space for the record - does a realloc.  
01632 // The passed in size is the size of the entire record including the
01633 // block size field.
01634 bool SamRecord::allocateRecordStructure(int size)
01635 {
01636     if (allocatedSize < size)
01637     {
01638         bamRecordStruct* tmpRecordPtr = 
01639             (bamRecordStruct *)realloc(myRecordPtr, size);
01640         if(tmpRecordPtr == NULL)
01641         {
01642             // FAILED to allocate memory
01643             fprintf(stderr, "FAILED TO ALLOCATE MEMORY!!!");
01644             myStatus.addError(SamStatus::FAIL_MEM, "Failed Memory Allocation.");
01645             return(false);
01646         }
01647         // Successfully allocated memory, so set myRecordPtr.
01648         myRecordPtr = tmpRecordPtr;
01649         allocatedSize = size;
01650     }
01651     return(true);
01652 }
01653 
01654 
01655 // Index is the index into the strings array.
01656 void* SamRecord::getStringPtr(int index)
01657 {
01658     int value = extras[index];
01659 
01660     return &(strings[value]);
01661 }
01662 
01663 void* SamRecord::getIntegerPtr(int offset)
01664 {
01665     int value = extras[offset];
01666 
01667     return &(integers[value]);
01668 }
01669 
01670 void* SamRecord::getDoublePtr(int offset)
01671 {
01672     int value = extras[offset];
01673 
01674     return &(doubles[value]);
01675 }
01676 
01677 
01678 // Fixes the buffer to match the variable length fields if they are set.
01679 bool SamRecord::fixBuffer(SequenceTranslation translation)
01680 {
01681     // Check to see if the buffer is already synced.
01682     if(myIsBufferSynced &&
01683        (myBufferSequenceTranslation == translation))
01684     {
01685         // Already synced, nothing to do.
01686         return(true);
01687     }
01688    
01689     // Set the bin if necessary.
01690     if(!myIsBinValid)
01691     {
01692         // The bin that is set in the record is not valid, so
01693         // reset it.
01694         myRecordPtr->myBin = 
01695             bam_reg2bin(myRecordPtr->myPosition, get1BasedAlignmentEnd());      
01696         myIsBinValid = true;
01697     }
01698 
01699     // Not synced.
01700     bool status = true;
01701 
01702     // First determine the size the buffer needs to be.
01703     uint8_t newReadNameLen = getReadNameLength();
01704     uint16_t newCigarLen = getCigarLength();
01705     int32_t newReadLen = getReadLength();
01706     uint32_t newTagLen = getTagLength();
01707     uint32_t bamSequenceLen = (newReadLen+1)/2;
01708 
01709     // The buffer size extends from the start of the record to data
01710     // plus the length of the variable fields,
01711     // Multiply the cigar length by 4 since it is the number of uint32_t fields.
01712     int newBufferSize = 
01713         ((unsigned char*)(&(myRecordPtr->myData)) - 
01714          (unsigned char*)myRecordPtr) +
01715         newReadNameLen + ((newCigarLen)*4) +
01716         newReadLen + bamSequenceLen + newTagLen;
01717    
01718     if(!allocateRecordStructure(newBufferSize))
01719     {
01720         // Failed to allocate space.
01721         return(false);
01722     }
01723 
01724     // Now that space has been added to the buffer, check to see what if
01725     // any fields need to be extracted from the buffer prior to starting to
01726     // overwrite it.  Fields need to be extracted from the buffer if the 
01727     // buffer is valid for the field and a previous variable length field has
01728     // changed length.
01729     bool readNameLenChange = (newReadNameLen != myRecordPtr->myReadNameLength);
01730     bool cigarLenChange = (newCigarLen != myRecordPtr->myCigarLength);
01731     bool readLenChange = (newReadLen != myRecordPtr->myReadLength);
01732 
01733     // If the tags are still stored in the buffer and any other fields changed
01734     // lengths, they need to be extracted.
01735     if(myIsTagsBufferValid &&
01736        (readNameLenChange | cigarLenChange | readLenChange))
01737     {
01738         status &= setTagsFromBuffer();
01739         // The tag buffer will not be valid once the other fields
01740         // are written, so set it to not valid.
01741         myIsTagsBufferValid = false;
01742     }
01743 
01744     // If the sequence or quality strings are still stored in the buffer, and
01745     // any of the previous fields have changed length, extract it from the
01746     // current buffer.
01747     if((myIsQualityBufferValid | myIsSequenceBufferValid) && 
01748        (readNameLenChange | cigarLenChange | readLenChange))
01749     {
01750         setSequenceAndQualityFromBuffer();
01751         // The quality and sequence buffers will not be valid once the other
01752         // fields are written, so set them to not valid.
01753         myIsQualityBufferValid = false;
01754         myIsSequenceBufferValid = false;
01755     }
01756 
01757     // If the cigar is still stored in the buffer, and any of the
01758     // previous fields have changed length, extract it from the current buffer.
01759     if((myIsCigarBufferValid) && 
01760        (readNameLenChange))
01761     {
01762         status &= parseCigarBinary();
01763         myIsCigarBufferValid = false;
01764     }
01765 
01766     // Set each value in the buffer if it is not already valid.
01767     if(!myIsReadNameBufferValid)
01768     {
01769         memcpy(&(myRecordPtr->myData), myReadName.c_str(), 
01770                newReadNameLen);
01771    
01772         // Set the new ReadNameLength.
01773         myRecordPtr->myReadNameLength = newReadNameLen;
01774         myIsReadNameBufferValid = true;
01775     }
01776 
01777     unsigned char * readNameEnds = (unsigned char*)(&(myRecordPtr->myData)) +
01778         myRecordPtr->myReadNameLength;
01779    
01780     // Set the Cigar.  Need to reformat from the string to 
01781     unsigned int * packedCigar = (unsigned int *) (void *) readNameEnds;
01782       
01783     if(!myIsCigarBufferValid)
01784     {
01785         // The cigar was already parsed when it was set, so just copy
01786         // data from the temporary buffer.
01787         myRecordPtr->myCigarLength = newCigarLen;
01788         memcpy(packedCigar, myCigarTempBuffer, 
01789                myRecordPtr->myCigarLength * sizeof(uint32_t));
01790       
01791         myIsCigarBufferValid = true;
01792     }
01793 
01794     unsigned char * packedSequence = readNameEnds + 
01795         myRecordPtr->myCigarLength * sizeof(int);
01796     unsigned char * packedQuality = packedSequence + bamSequenceLen;
01797    
01798     if(!myIsSequenceBufferValid || !myIsQualityBufferValid || 
01799        (myBufferSequenceTranslation != translation))
01800     {
01801         myRecordPtr->myReadLength = newReadLen;
01802         // Determine if the quality needs to be set and is just a * and needs to
01803         // be set to 0xFF.
01804         bool noQuality = false;
01805         if((myQuality.Length() == 1) && (myQuality[0] == '*'))
01806         {
01807             noQuality = true;
01808         }
01809       
01810         const char* translatedSeq = NULL;
01811         // If the sequence is not valid in the buffer or it is not
01812         // properly translated, get the properly translated sequence
01813         // that needs to be put into the buffer.
01814         if((!myIsSequenceBufferValid) ||
01815            (translation != myBufferSequenceTranslation))
01816         {
01817             translatedSeq = getSequence(translation);
01818         }
01819 
01820         for (int i = 0; i < myRecordPtr->myReadLength; i++) 
01821         {
01822             if((!myIsSequenceBufferValid) ||
01823                (translation != myBufferSequenceTranslation))
01824             {
01825                 // Sequence buffer is not valid, so set the sequence.
01826                 int seqVal = 0;
01827                 switch(translatedSeq[i])
01828                 {
01829                     case '=':
01830                         seqVal = 0;
01831                         break;
01832                     case 'A':
01833                     case 'a':
01834                         seqVal = 1;
01835                         break;
01836                     case 'C':
01837                     case 'c':
01838                         seqVal = 2;
01839                         break;
01840                     case 'G':
01841                     case 'g':
01842                         seqVal = 4;
01843                         break;
01844                     case 'T':
01845                     case 't':
01846                         seqVal = 8;
01847                         break;
01848                     case 'N':
01849                     case 'n':
01850                     case '.':
01851                         seqVal = 15;
01852                         break;
01853                     default:
01854                         myStatus.addError(SamStatus::FAIL_PARSE,
01855                                           "Unknown Sequence character found.");
01856                         status = false;
01857                         break;
01858                 };
01859             
01860                 if(i & 1)
01861                 {
01862                     // Odd number i's go in the lower 4 bits, so OR in the
01863                     // lower bits
01864                     packedSequence[i/2] |= seqVal;
01865                 }
01866                 else
01867                 {
01868                     // Even i's go in the upper 4 bits and are always set first.
01869                     packedSequence[i/2] = seqVal << 4;
01870                 }
01871             }
01872 
01873             if(!myIsQualityBufferValid)
01874             {
01875                 // Set the quality.
01876                 if((noQuality) || (myQuality.Length() <= i))
01877                 {
01878                     // No quality or the quality is smaller than the sequence,
01879                     // so set it to 0xFF
01880                     packedQuality[i] = 0xFF;
01881                 }
01882                 else
01883                 {
01884                     // Copy the quality string.
01885                     packedQuality[i] = myQuality[i] - 33;
01886                 }
01887             }
01888         }
01889         myIsQualityBufferValid = true;
01890         myIsSequenceBufferValid = true;
01891         myBufferSequenceTranslation = translation;
01892     }
01893 
01894     if(!myIsTagsBufferValid)
01895     {
01896         status &= setTagsInBuffer();
01897     }
01898 
01899     // Set the lengths in the buffer.
01900     myRecordPtr->myReadNameLength = newReadNameLen;
01901     myRecordPtr->myCigarLength = newCigarLen;
01902     myRecordPtr->myReadLength = newReadLen;
01903 
01904     // Set the buffer block size to the size of the buffer minus the
01905     // first field.
01906     myRecordPtr->myBlockSize = newBufferSize - sizeof(int32_t);
01907 
01908     if(status)
01909     {
01910         myIsBufferSynced = true;
01911     }
01912 
01913     return(status);
01914 }
01915 
01916 
01917 // Sets the Sequence and Quality strings from the buffer.
01918 // They are done together in one method because they require the same
01919 // loop, so might as well be done at the same time.
01920 void SamRecord::setSequenceAndQualityFromBuffer()
01921 {
01922     // NOTE: If the sequence buffer is not valid, do not set the sequence
01923     // string from the buffer.
01924     // NOTE: If the quality buffer is not valid, do not set the quality string
01925     // from the buffer.
01926 
01927     // Extract the sequence if the buffer is valid and the string's length is 0.
01928     bool extractSequence = false;
01929     if(myIsSequenceBufferValid && (mySequence.Length() == 0))
01930     {
01931         extractSequence = true;
01932     }
01933 
01934     // Extract the quality if the buffer is valid and the string's length is 0.
01935     bool extractQuality = false;
01936     if(myIsQualityBufferValid && (myQuality.Length() == 0))
01937     {
01938         extractQuality = true;
01939     }
01940 
01941     // If neither the quality nor the sequence need to be extracted,
01942     // just return.
01943     if(!extractSequence && !extractQuality)
01944     {
01945         return;
01946     }
01947 
01948     // Set the sequence and quality strings..
01949     if(extractSequence)
01950     {
01951         mySequence.SetLength(myRecordPtr->myReadLength);
01952     }
01953     if(extractQuality)
01954     {
01955         myQuality.SetLength(myRecordPtr->myReadLength);
01956     }
01957    
01958     unsigned char * readNameEnds = 
01959         (unsigned char *)myRecordPtr->myData + myRecordPtr->myReadNameLength;
01960     unsigned char * packedSequence = 
01961         readNameEnds + myRecordPtr->myCigarLength * sizeof(int);
01962     unsigned char * packedQuality = 
01963         packedSequence + (myRecordPtr->myReadLength + 1) / 2;
01964 
01965     const char * asciiBases = "=AC.G...T......N";
01966 
01967     // Flag to see if the quality is specified - the quality contains a value
01968     // other than 0xFF.  If all values are 0xFF, then there is no quality.
01969     bool qualitySpecified = false;
01970 
01971     for (int i = 0; i < myRecordPtr->myReadLength; i++)
01972     {
01973         if(extractSequence)
01974         {
01975             mySequence[i] = i & 1 ?
01976                 asciiBases[packedSequence[i / 2] & 0xF] :
01977                 asciiBases[packedSequence[i / 2] >> 4];
01978         }
01979 
01980         if(extractQuality)
01981         {
01982             if(packedQuality[i] != 0xFF)
01983             {
01984                 // Quality is specified, so mark the flag.
01985                 qualitySpecified = true;
01986             }
01987 
01988             myQuality[i] = packedQuality[i] + 33;
01989         }
01990     }
01991 
01992     // If the read length is 0, then set the sequence and quality to '*'
01993     if(myRecordPtr->myReadLength == 0)
01994     {
01995         if(extractSequence)
01996         {
01997             mySequence = "*";
01998         }
01999         if(extractQuality)
02000         {
02001             myQuality = "*";
02002         }
02003     }
02004     else if(extractQuality && !qualitySpecified)
02005     {
02006         // No quality was specified, so set it to "*"
02007         myQuality = "*";
02008     }
02009 }
02010 
02011 
02012 // Parse the cigar to calculate the alignment/unclipped end.
02013 bool SamRecord::parseCigar()
02014 {
02015     // Determine if the cigar string or cigar binary needs to be parsed.
02016     if(myCigar.Length() == 0)
02017     {
02018         // The cigar string is not yet set, so parse the binary.
02019         return(parseCigarBinary());
02020     }
02021     return(parseCigarString());
02022 }
02023 
02024 // Parse the cigar to calculate the alignment/unclipped end.
02025 bool SamRecord::parseCigarBinary()
02026 {
02027     // Only need to parse if the string is not already set.
02028     // The length of the cigar string is set to zero when the 
02029     // record is read from a file into the buffer.
02030     if(myCigar.Length() != 0)
02031     {
02032         // Already parsed.
02033         return(true);
02034     }
02035 
02036     unsigned char * readNameEnds = 
02037         (unsigned char *)myRecordPtr->myData + myRecordPtr->myReadNameLength;
02038    
02039     unsigned int * packedCigar = (unsigned int *) (void *) readNameEnds;
02040 
02041     myCigarRoller.Set(packedCigar, myRecordPtr->myCigarLength);
02042     
02043     myCigarRoller.getCigarString(myCigar);
02044 
02045     myAlignmentLength = myCigarRoller.getExpectedReferenceBaseCount();
02046 
02047     myUnclippedStartOffset = myCigarRoller.getNumBeginClips();
02048     myUnclippedEndOffset = myCigarRoller.getNumEndClips();
02049 
02050     // if the cigar length is 0, then set the cigar string to "*"
02051     if(myRecordPtr->myCigarLength == 0)
02052     {
02053         myCigar = "*";
02054         return(true);
02055     }
02056 
02057     // Copy the cigar into a temporary buffer.
02058     int newBufferSize = myRecordPtr->myCigarLength * sizeof(uint32_t);
02059     if(newBufferSize > myCigarTempBufferAllocatedSize)
02060     {
02061         uint32_t* tempBufferPtr = 
02062             (uint32_t*)realloc(myCigarTempBuffer, newBufferSize);
02063         if(tempBufferPtr == NULL)
02064         {
02065             // Failed to allocate memory.
02066             // Do not parse, just return.
02067             fprintf(stderr, "FAILED TO ALLOCATE MEMORY!!!");
02068             myStatus.addError(SamStatus::FAIL_MEM,
02069                               "Failed to Allocate Memory.");
02070             return(false);
02071         }
02072         myCigarTempBuffer = tempBufferPtr;
02073         myCigarTempBufferAllocatedSize = newBufferSize;
02074     }
02075 
02076     memcpy(myCigarTempBuffer, packedCigar, 
02077            myRecordPtr->myCigarLength * sizeof(uint32_t));
02078 
02079     // Set the length of the temp buffer.
02080     myCigarTempBufferLength = myRecordPtr->myCigarLength;
02081 
02082     return(true);
02083 }
02084 
02085 // Parse the cigar string to calculate the cigar length and alignment end.
02086 bool SamRecord::parseCigarString()
02087 {
02088     myCigarTempBufferLength = 0;
02089     if(myCigar == "*")
02090     {
02091         // Cigar is empty, so initialize the variables.
02092         myAlignmentLength = 0;
02093         myUnclippedStartOffset = 0;
02094         myUnclippedEndOffset = 0;
02095         myCigarRoller.clear();
02096         return(true);
02097     }
02098 
02099     myCigarRoller.Set(myCigar);
02100     
02101     myAlignmentLength = myCigarRoller.getExpectedReferenceBaseCount();
02102     
02103     myUnclippedStartOffset = myCigarRoller.getNumBeginClips();
02104     myUnclippedEndOffset = myCigarRoller.getNumEndClips();
02105 
02106     // Check to see if the Temporary Cigar Buffer is large enough to contain
02107     // this cigar.  If we make it the size of the length of the cigar string,
02108     // it will be more than large enough.
02109     int newBufferSize = myCigar.Length() * sizeof(uint32_t);
02110     if(newBufferSize > myCigarTempBufferAllocatedSize)
02111     {
02112         uint32_t* tempBufferPtr = 
02113             (uint32_t*)realloc(myCigarTempBuffer, newBufferSize);
02114         if(tempBufferPtr == NULL)
02115         {
02116             // Failed to allocate memory.
02117             // Do not parse, just return.
02118             fprintf(stderr, "FAILED TO ALLOCATE MEMORY!!!");
02119             myStatus.addError(SamStatus::FAIL_MEM,
02120                               "Failed to Allocate Memory.");
02121             return(false);
02122         }
02123         myCigarTempBuffer = tempBufferPtr;
02124         myCigarTempBufferAllocatedSize = newBufferSize;
02125     }
02126 
02127     // Track if there were any errors.
02128     bool status = true;
02129 
02130     // Track the index into the cigar string that is being parsed.
02131     char *cigarOp;
02132     const char* cigarEntryStart = myCigar.c_str();
02133     int opLen = 0;
02134     int op = 0;
02135 
02136     unsigned int * packedCigar = myCigarTempBuffer;
02137     // TODO - maybe one day make a cigar list... or maybe make a 
02138     // reference cigar string for ease of lookup....
02139     const char* endCigarString = cigarEntryStart + myCigar.Length();
02140     while(cigarEntryStart < endCigarString)
02141     {
02142         bool validCigarEntry = true;
02143         // Get the opLen from the string.  cigarOp will then point to 
02144         // the operation.
02145         opLen = strtol(cigarEntryStart, &cigarOp, 10);
02146         // Switch on the type of operation.
02147         switch(*cigarOp)
02148         {
02149             case('M'):
02150                 op = 0;
02151                 break;
02152             case('I'):
02153                 // Insert into the reference position, so do not increment the
02154                 // reference end position.
02155                 op = 1;
02156                 break;
02157             case('D'):
02158                 op = 2;
02159                 break;
02160             case('N'):
02161                 op = 3;
02162                 break;
02163             case('S'):
02164                 op = 4;
02165                 break;
02166             case('H'):
02167                 op = 5;
02168                 break;
02169             case('P'):
02170                 op = 6;
02171                 break;
02172             default:
02173                 fprintf(stderr, "ERROR parsing cigar\n");
02174                 validCigarEntry = false;
02175                 status = false;
02176                 myStatus.addError(SamStatus::FAIL_PARSE,
02177                                   "Unknown operation found when parsing the Cigar.");
02178                 break;
02179         };
02180         if(validCigarEntry)
02181         {
02182             // Increment the cigar length.
02183             ++myCigarTempBufferLength;
02184             *packedCigar = (opLen << 4) | op;
02185             packedCigar++;
02186         }
02187         // The next Entry starts at one past the cigar op, so set the start.
02188         cigarEntryStart = ++cigarOp;
02189     }
02190 
02191     // Check clipLength to adjust the end position.
02192     return(status);
02193 }
02194 
02195 
02196 bool SamRecord::setTagsFromBuffer()
02197 {
02198     // If the tags do not need to be set from the buffer, return true.
02199     if(myNeedToSetTagsFromBuffer == false)
02200     {
02201         // Already been set from the buffer.
02202         return(true);
02203     }
02204 
02205     // Mark false, as they are being set now.
02206     myNeedToSetTagsFromBuffer = false;
02207 
02208     unsigned char * readNameEnds = 
02209         (unsigned char *)myRecordPtr->myData + myRecordPtr->myReadNameLength;
02210     unsigned char * packedSequence = 
02211         readNameEnds + myRecordPtr->myCigarLength * sizeof(int);
02212     unsigned char * packedQuality = 
02213         packedSequence + (myRecordPtr->myReadLength + 1) / 2;
02214 
02215     unsigned char * extraPtr = packedQuality + myRecordPtr->myReadLength;
02216 
02217     // Default to success, will be changed to false on failure.
02218     bool status = true;
02219 
02220     // Clear any previously set tags.
02221     clearTags();
02222     while (myRecordPtr->myBlockSize + 4 - 
02223            (extraPtr - (unsigned char *)myRecordPtr) > 0)
02224     {
02225         int key;
02226         int value;
02227         void * content = extraPtr + 3;
02228         int tagBufferSize = 0;
02229       
02230         switch (extraPtr[2])
02231         {
02232             case 'A' :
02233                 key = MAKEKEY(extraPtr[0], extraPtr[1], 'A');
02234                 value = integers.Length();
02235                 integers.Push(* (char *) content);
02236                 extraPtr += 4;
02237                 tagBufferSize += 4;
02238                 break;
02239             case 'c' :
02240                 key = MAKEKEY(extraPtr[0], extraPtr[1], 'c');
02241                 value = integers.Length();
02242                 integers.Push(* (char *) content);
02243                 extraPtr += 4;
02244                 tagBufferSize += 4;
02245                 break;
02246             case 'C' :
02247                 key = MAKEKEY(extraPtr[0], extraPtr[1], 'C');
02248                 value = integers.Length();
02249                 integers.Push(* (unsigned char *) content);
02250                 extraPtr += 4;
02251                 tagBufferSize += 4;
02252                 break;
02253             case 's' :
02254                 key = MAKEKEY(extraPtr[0], extraPtr[1], 's');
02255                 value = integers.Length();
02256                 integers.Push(* (short *) content);
02257                 extraPtr += 5;
02258                 tagBufferSize += 5;
02259                 break;
02260             case 'S' :
02261                 key = MAKEKEY(extraPtr[0], extraPtr[1], 'S');
02262                 value = integers.Length();
02263                 integers.Push(* (unsigned short *) content);
02264                 extraPtr += 5;
02265                 tagBufferSize += 5;
02266                 break;
02267             case 'i' :
02268                 key = MAKEKEY(extraPtr[0], extraPtr[1], 'i');
02269                 value = integers.Length();
02270                 integers.Push(* (int *) content);
02271                 extraPtr += 7;
02272                 tagBufferSize += 7;
02273                 break;
02274             case 'I' :
02275                 key = MAKEKEY(extraPtr[0], extraPtr[1], 'I');
02276                 value = integers.Length();
02277                 integers.Push((int) * (unsigned int *) content);
02278                 extraPtr += 7;
02279                 tagBufferSize += 7;
02280                 break;
02281             case 'Z' :
02282                 key = MAKEKEY(extraPtr[0], extraPtr[1], 'Z');
02283                 value = strings.Length();
02284                 strings.Push((const char *) content);
02285                 extraPtr += 4 + strings.Last().Length();
02286                 tagBufferSize += 4 + strings.Last().Length();
02287                 break;
02288             case 'f' :
02289                 key = MAKEKEY(extraPtr[0], extraPtr[1], 'f');
02290                 value = doubles.Length();
02291                 doubles.Push(* (float *) content);
02292                 fprintf(stderr, "\n\nFLOAT!!!\n\n");
02293                 extraPtr += 7;
02294                 tagBufferSize += 7;
02295                 break;
02296             default :
02297                 fprintf(stderr, 
02298                         "parsing BAM - Unknown custom field of type %c%c:%c\n",
02299                         extraPtr[0], extraPtr[1], extraPtr[2]);
02300                 // Failed on read.
02301                 // Increment extraPtr just by the size of the 3 known fields
02302                 extraPtr += 3;
02303                 myStatus.addError(SamStatus::FAIL_PARSE,
02304                                   "Unknown tag type.");
02305                 status = false;
02306         }
02307 
02308         // Only add the tag if it has so far been successfully processed.
02309         if(status)
02310         {
02311             extras.Add(key, value);
02312             myTagBufferSize += tagBufferSize;
02313         }
02314     }
02315     return(status);
02316 }
02317 
02318 
02319 bool SamRecord::setTagsInBuffer()
02320 {
02321     // The buffer size extends from the start of the record to data
02322     // plus the length of the variable fields,
02323     // Multiply the cigar length by 4 since it is the number of uint32_t fields.
02324     int bamSequenceLength = (myRecordPtr->myReadLength+1)/2;
02325     int newBufferSize = ((unsigned char*)(&(myRecordPtr->myData)) - 
02326                          (unsigned char*)myRecordPtr) +  
02327         myRecordPtr->myReadNameLength + ((myRecordPtr->myCigarLength)*4) +
02328         myRecordPtr->myReadLength + bamSequenceLength + myTagBufferSize;
02329 
02330     // Make sure the buffer is big enough.
02331     if(!allocateRecordStructure(newBufferSize))
02332     {
02333         // Failed to allocate space.
02334         return(false);
02335     }
02336 
02337     unsigned char * readNameEnds = (unsigned char*)(&(myRecordPtr->myData)) +
02338         myRecordPtr->myReadNameLength;
02339     unsigned char * packedSequence = readNameEnds + 
02340         myRecordPtr->myCigarLength * sizeof(int);
02341     unsigned char * packedQuality = 
02342         packedSequence + bamSequenceLength;
02343    
02344     char * extraPtr = (char*)packedQuality + myRecordPtr->myReadLength;
02345 
02346     bool status = true;
02347 
02348     // Set the tags in the buffer.
02349     if (extras.Entries())
02350     {
02351         for (int i = 0; i < extras.Capacity(); i++)
02352         {
02353             if (extras.SlotInUse(i))
02354             {
02355                 int key = extras.GetKey(i);
02356                 getTag(key, extraPtr);
02357                 extraPtr += 2;
02358                 getVtype(key, extraPtr[0]);
02359                 char vtype = extraPtr[0];
02360 
02361                 // increment the pointer to where the value is.
02362                 extraPtr += 1;
02363 
02364                 switch (vtype)
02365                 {
02366                     case 'A' :
02367                         *(char*)extraPtr = (char)getInteger(i);
02368                         // sprintf(extraPtr, "%d", getInteger(i));
02369                         extraPtr += 1;
02370                         break;
02371                     case 'c' :
02372                         *(int8_t*)extraPtr = (int8_t)getInteger(i);
02373                         // sprintf(extraPtr, "%.4d", getInteger(i));
02374                         extraPtr += 1;
02375                         break;
02376                     case 'C' :
02377                         *(uint8_t*)extraPtr = (uint8_t)getInteger(i);
02378                         // sprintf(extraPtr, "%.4d", getInteger(i));
02379                         extraPtr += 1;
02380                         break;
02381                     case 's' :
02382                         *(int16_t*)extraPtr = (int16_t)getInteger(i);
02383                         // sprintf(extraPtr, "%.4d", getInteger(i));
02384                         extraPtr += 2;
02385                         break;
02386                     case 'S' :
02387                         *(uint16_t*)extraPtr = (uint16_t)getInteger(i);
02388                         // sprintf(extraPtr, "%.4d", getInteger(i));
02389                         extraPtr += 2;
02390                         break;
02391                     case 'i' :
02392                         *(int32_t*)extraPtr = (int32_t)getInteger(i);
02393                         // sprintf(extraPtr, "%.4d", getInteger(i));
02394                         extraPtr += 4;
02395                         break;
02396                     case 'I' :
02397                         *(uint32_t*)extraPtr = (uint32_t)getInteger(i);
02398                         // sprintf(extraPtr, "%.4d", getInteger(i));
02399                         extraPtr += 4;
02400                         break;
02401                     case 'Z' :
02402                         sprintf(extraPtr, "%s", getString(i).c_str());
02403                         extraPtr += getString(i).Length() + 1;
02404                         break;
02405                     case 'f' :
02406                         // TODO, figure out if %f is good enough.
02407                         sprintf(extraPtr, "%f", getDouble(i));
02408                         extraPtr += 4;
02409                         break;
02410                     default :
02411                         myStatus.addError(SamStatus::FAIL_PARSE,
02412                                           "Unknown tag type.");
02413                         status = false;
02414                         break;
02415                 }
02416             }
02417         }
02418     }
02419 
02420     // Validate that the extra pointer is at the end of the allocated buffer.
02421     // If not then there was a problem.
02422     if(extraPtr != (char*)myRecordPtr + newBufferSize)
02423     {
02424         fprintf(stderr, "ERROR updating the buffer.  Incorrect size.");
02425         myStatus.addError(SamStatus::FAIL_PARSE,
02426                           "ERROR updating the buffer.  Incorrect size.");
02427         status = false;
02428     }
02429 
02430 
02431     // The buffer tags are now in sync.
02432     myNeedToSetTagsInBuffer = false;
02433     myIsTagsBufferValid = true;
02434     return(status);
02435 }
02436 
02437 
02438 // Reset the variables for a newly set buffer.  The buffer must be set first
02439 // since this looks up the reference ids in the buffer to set the reference
02440 // names.
02441 void SamRecord::setVariablesForNewBuffer(SamFileHeader& header)
02442 {
02443     // Lookup the reference name & mate reference name associated with this
02444     // record.
02445     myReferenceName = 
02446         header.getReferenceLabel(myRecordPtr->myReferenceID);
02447     myMateReferenceName = 
02448         header.getReferenceLabel(myRecordPtr->myMateReferenceID);      
02449 
02450     // Clear the SAM Strings that are now not in-sync with the buffer.
02451     myReadName.SetLength(0);
02452     myCigar.SetLength(0);
02453     mySequence.SetLength(0);
02454     mySeqWithEq.clear();
02455     mySeqWithoutEq.clear();
02456     myQuality.SetLength(0);
02457     myNeedToSetTagsFromBuffer = true;
02458     myNeedToSetTagsInBuffer = false;
02459 
02460     //Set that the buffer is valid.
02461     myIsBufferSynced = true;
02462     // Set that the variable length buffer fields are valid.
02463     myIsReadNameBufferValid = true;
02464     myIsCigarBufferValid = true;
02465     myIsSequenceBufferValid = true;
02466     myBufferSequenceTranslation = NONE;
02467     myIsQualityBufferValid = true;
02468     myIsTagsBufferValid = true;
02469     myIsBinValid = true;
02470 }
02471 
02472 
02473 // Extract the vtype from the key.
02474 void SamRecord::getVtype(int key, char& vtype) const
02475 {
02476     // Extract the vtype from the key.
02477     vtype = (key >> 16) & 0xFF;
02478 }
02479 
02480 // Extract the tag from the key.
02481 void SamRecord::getTag(int key, char* tag) const
02482 {
02483     // Extract the tag from the key.
02484     tag[0] = key & 0xFF;
02485     tag[1] = (key >> 8) & 0xFF;
02486     tag[2] = 0;
02487 }
02488 
02489 
02490 // Index is the index into the strings array.
02491 String & SamRecord::getString(int index)
02492 {
02493     int value = extras[index];
02494 
02495     return strings[value];
02496 }
02497 
02498 int & SamRecord::getInteger(int offset)
02499 {
02500     int value = extras[offset];
02501 
02502     return integers[value];
02503 }
02504 
02505 double & SamRecord::getDouble(int offset)
02506 {
02507     int value = extras[offset];
02508 
02509     return doubles[value];
02510 }
02511 
02512 
Generated on Thu Dec 9 12:22:13 2010 for StatGen Software by  doxygen 1.6.3