SamRecord.cpp

00001 /*
00002  *  Copyright (C) 2010-2011  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #include <stdlib.h>
00019 #include <limits>
00020 #include <stdexcept>
00021 
00022 #include "bam.h"
00023 
00024 #include "SamRecord.h"
00025 #include "SamValidation.h"
00026 
00027 #include "BaseUtilities.h"
00028 #include "SamQuerySeqWithRefHelper.h"
00029 
00030 const char* SamRecord::DEFAULT_READ_NAME = "UNKNOWN";
00031 const char* SamRecord::FIELD_ABSENT_STRING = "=";
00032 
00033 
00034 SamRecord::SamRecord()
00035     : myStatus(),
00036       myRefPtr(NULL),
00037       mySequenceTranslation(NONE)
00038 {
00039     int32_t defaultAllocSize = DEFAULT_BLOCK_SIZE + sizeof(int32_t);
00040 
00041     myRecordPtr = 
00042         (bamRecordStruct *) malloc(defaultAllocSize);
00043 
00044     myCigarTempBuffer = NULL;
00045     myCigarTempBufferAllocatedSize = 0;
00046 
00047     allocatedSize = defaultAllocSize;
00048 
00049     resetRecord();
00050 }
00051 
00052 
00053 SamRecord::SamRecord(ErrorHandler::HandlingType errorHandlingType)
00054     : myStatus(errorHandlingType),
00055       myRefPtr(NULL),
00056       mySequenceTranslation(NONE)
00057 {
00058     int32_t defaultAllocSize = DEFAULT_BLOCK_SIZE + sizeof(int32_t);
00059 
00060     myRecordPtr = 
00061         (bamRecordStruct *) malloc(defaultAllocSize);
00062 
00063     myCigarTempBuffer = NULL;
00064     myCigarTempBufferAllocatedSize = 0;
00065 
00066     allocatedSize = defaultAllocSize;
00067 
00068     resetRecord();
00069 }
00070 
00071 
00072 SamRecord::~SamRecord()
00073 {
00074     resetRecord();
00075 
00076     if(myRecordPtr != NULL)
00077     {
00078         free(myRecordPtr);
00079         myRecordPtr = NULL;
00080     }
00081     if(myCigarTempBuffer != NULL)
00082     {
00083         free(myCigarTempBuffer);
00084         myCigarTempBuffer = NULL;
00085         myCigarTempBufferAllocatedSize = 0;
00086     }
00087 }
00088 
00089 
00090 // Resets the fields of the record to a default value.
00091 void SamRecord::resetRecord()
00092 {
00093     myIsBufferSynced = true;
00094 
00095     myRecordPtr->myBlockSize = DEFAULT_BLOCK_SIZE;
00096     myRecordPtr->myReferenceID = -1;
00097     myRecordPtr->myPosition = -1;
00098     myRecordPtr->myReadNameLength = DEFAULT_READ_NAME_LENGTH;
00099     myRecordPtr->myMapQuality = 0;
00100     myRecordPtr->myBin = DEFAULT_BIN;
00101     myRecordPtr->myCigarLength = 0;
00102     myRecordPtr->myFlag = 0;
00103     myRecordPtr->myReadLength = 0;
00104     myRecordPtr->myMateReferenceID = -1;
00105     myRecordPtr->myMatePosition = -1;
00106     myRecordPtr->myInsertSize = 0;
00107    
00108     // Set the sam values for the variable length fields.
00109     // TODO - one way to speed this up might be to not set to "*" and just
00110     // clear them, and write out a '*' for SAM if it is empty.
00111     myReadName = DEFAULT_READ_NAME;
00112     myReferenceName = "*";
00113     myMateReferenceName = "*";
00114     myCigar = "*";
00115     mySequence = "*";
00116     mySeqWithEq.clear();
00117     mySeqWithoutEq.clear();
00118     myQuality = "*";
00119     myNeedToSetTagsFromBuffer = false;
00120     myNeedToSetTagsInBuffer = false;
00121 
00122     // Initialize the calculated alignment info to the uncalculated value.
00123     myAlignmentLength = -1;
00124     myUnclippedStartOffset = -1;
00125     myUnclippedEndOffset = -1;
00126 
00127     clearTags();
00128 
00129     // Set the bam values for the variable length fields.
00130     // Only the read name needs to be set, the others are a length of 0.
00131     // Set the read name.  The min size of myRecordPtr includes the size for
00132     // the default read name.
00133     memcpy(&(myRecordPtr->myData), myReadName.c_str(), 
00134            myRecordPtr->myReadNameLength);
00135 
00136     // Set that the variable length buffer fields are valid.
00137     myIsReadNameBufferValid = true;
00138     myIsCigarBufferValid = true;
00139     myIsSequenceBufferValid = true;
00140     myBufferSequenceTranslation = NONE;
00141     myIsQualityBufferValid = true;
00142     myIsTagsBufferValid = true;
00143     myIsBinValid = true;
00144 
00145     myCigarTempBufferLength = -1;
00146 
00147     myStatus = SamStatus::SUCCESS;
00148 
00149     NOT_FOUND_TAG_STRING = "";
00150     NOT_FOUND_TAG_INT = -1;
00151     NOT_FOUND_TAG_DOUBLE = -1;
00152 }
00153 
00154 
00155 // Reset the tag iterator to the beginning of the tags.
00156 void SamRecord::resetTagIter()
00157 {
00158     myLastTagIndex = -1;
00159 }
00160 
00161 
00162 // Returns whether or not the record is valid.
00163 // Header is needed to perform some validation against it.
00164 bool SamRecord::isValid(SamFileHeader& header)
00165 {
00166     myStatus = SamStatus::SUCCESS;
00167     SamValidationErrors invalidSamErrors;
00168     if(!SamValidator::isValid(header, *this, invalidSamErrors))
00169     {
00170         // The record is not valid.
00171         std::string errorMessage = "";
00172         invalidSamErrors.getErrorString(errorMessage);
00173         myStatus.setStatus(SamStatus::INVALID, errorMessage.c_str());
00174         return(false);
00175     }
00176     // The record is valid.
00177     return(true);
00178 }
00179 
00180 
00181 // Read the BAM record from a file.
00182 SamStatus::Status SamRecord::setBufferFromFile(IFILE filePtr, 
00183                                                SamFileHeader& header)
00184 {
00185     myStatus = SamStatus::SUCCESS;
00186     if((filePtr == NULL) || (filePtr->isOpen() == false))
00187     {
00188         // File is not open, return failure.
00189         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00190                            "Can't read from an unopened file.");
00191         return(SamStatus::FAIL_ORDER);
00192     }
00193 
00194     // Clear the record.
00195     resetRecord();
00196 
00197     // read the record size.
00198     int numBytes = 
00199         ifread(filePtr, &(myRecordPtr->myBlockSize), sizeof(int32_t));
00200 
00201     // Check to see if the end of the file was hit and no bytes were read.
00202     if(ifeof(filePtr) && (numBytes == 0))
00203     {
00204         // End of file, nothing was read, no more records.
00205         myStatus.setStatus(SamStatus::NO_MORE_RECS,
00206                            "No more records left to read.");
00207         return(SamStatus::NO_MORE_RECS);
00208     }
00209     
00210     if(numBytes != sizeof(int32_t))
00211     {
00212         // Failed to read the entire block size.  Either the end of the file
00213         // was reached early or there was an error.
00214         if(ifeof(filePtr))
00215         {
00216             // Error: end of the file reached prior to reading the rest of the
00217             // record.
00218             myStatus.setStatus(SamStatus::FAIL_PARSE, 
00219                                "EOF reached in the middle of a record.");
00220             return(SamStatus::FAIL_PARSE);
00221         }
00222         else
00223         {
00224             // Error reading.
00225             myStatus.setStatus(SamStatus::FAIL_IO, 
00226                                "Failed to read the record size.");
00227             return(SamStatus::FAIL_IO);
00228         }
00229     }
00230 
00231     // allocate space for the record size.
00232     if(!allocateRecordStructure(myRecordPtr->myBlockSize + sizeof(int32_t)))
00233     {
00234         // Failed to allocate space.
00235         // Status is set by allocateRecordStructure.
00236         return(SamStatus::FAIL_MEM);
00237     }
00238 
00239     // Read the rest of the alignment block, starting at the reference id.
00240     if(ifread(filePtr, &(myRecordPtr->myReferenceID), myRecordPtr->myBlockSize)
00241        != (unsigned int)myRecordPtr->myBlockSize)
00242     {
00243         // Error reading the record.  Reset it and return failure.
00244         resetRecord();
00245         myStatus.setStatus(SamStatus::FAIL_IO,
00246                            "Failed to read the record");
00247         return(SamStatus::FAIL_IO);
00248     }
00249 
00250     setVariablesForNewBuffer(header);
00251 
00252     // Return the status of the record.
00253     return(SamStatus::SUCCESS);
00254 }
00255 
00256 
00257 void SamRecord::setReference(GenomeSequence* reference)
00258 {
00259     myRefPtr = reference;
00260 }
00261 
00262 
00263 // Set the type of sequence translation to use when getting
00264 // the sequence.  The default type (if this method is never called) is
00265 // NONE (the sequence is left as-is).  This is used 
00266 void SamRecord::setSequenceTranslation(SequenceTranslation translation)
00267 {
00268     mySequenceTranslation = translation;
00269 }
00270 
00271 
00272 bool SamRecord::setReadName(const char* readName) 
00273 {
00274     myReadName = readName;
00275     myIsBufferSynced = false;
00276     myIsReadNameBufferValid = false;
00277     myStatus = SamStatus::SUCCESS;
00278 
00279     // The read name must at least have some length, otherwise this is a parsing
00280     // error.
00281     if(myReadName.Length() == 0)
00282     {
00283         // Invalid - reset ReadName return false.
00284         myReadName = DEFAULT_READ_NAME;
00285         myRecordPtr->myReadNameLength = DEFAULT_READ_NAME_LENGTH;
00286         myStatus.setStatus(SamStatus::INVALID, "0 length Query Name.");
00287         return(false);
00288     }
00289 
00290     return true;
00291 }
00292 
00293 
00294 bool SamRecord::setFlag(uint16_t flag)
00295 {
00296     myStatus = SamStatus::SUCCESS;
00297     myRecordPtr->myFlag = flag;
00298     return true;
00299 }
00300 
00301 
00302 bool SamRecord::setReferenceName(SamFileHeader& header,
00303                                  const char* referenceName) 
00304 {
00305     myStatus = SamStatus::SUCCESS;
00306 
00307     myReferenceName = referenceName;
00308     // If the reference ID does not already exist, add it (pass true)
00309     myRecordPtr->myReferenceID = header.getReferenceID(referenceName, true);
00310 
00311     return true;
00312 }
00313 
00314 
00315 bool SamRecord::set1BasedPosition(int32_t position) 
00316 {
00317     return(set0BasedPosition(position - 1));
00318 }
00319 
00320 
00321 bool SamRecord::set0BasedPosition(int32_t position)
00322 {
00323     myStatus = SamStatus::SUCCESS;
00324     myRecordPtr->myPosition = position;
00325     myIsBinValid = false;
00326     return true;
00327 }
00328 
00329 
00330 bool SamRecord::setMapQuality(uint8_t mapQuality)
00331 {
00332     myStatus = SamStatus::SUCCESS;
00333     myRecordPtr->myMapQuality = mapQuality;
00334     return true;
00335 }
00336 
00337 
00338 bool SamRecord::setCigar(const char* cigar)
00339 {
00340     myStatus = SamStatus::SUCCESS;
00341     myCigar = cigar;
00342  
00343     myIsBufferSynced = false;
00344     myIsCigarBufferValid = false;
00345     myCigarTempBufferLength = -1;
00346     myIsBinValid = false;
00347 
00348     // Initialize the calculated alignment info to the uncalculated value.
00349     myAlignmentLength = -1;
00350     myUnclippedStartOffset = -1;
00351     myUnclippedEndOffset = -1;
00352 
00353     return true;
00354 }
00355 
00356 
00357 bool SamRecord::setCigar(const Cigar& cigar)
00358 {
00359     myStatus = SamStatus::SUCCESS;
00360     cigar.getCigarString(myCigar);
00361  
00362     myIsBufferSynced = false;
00363     myIsCigarBufferValid = false;
00364     myCigarTempBufferLength = -1;
00365     myIsBinValid = false;
00366 
00367     // Initialize the calculated alignment info to the uncalculated value.
00368     myAlignmentLength = -1;
00369     myUnclippedStartOffset = -1;
00370     myUnclippedEndOffset = -1;
00371 
00372     return true;
00373 }
00374 
00375 
00376 bool SamRecord::setMateReferenceName(SamFileHeader& header,
00377                                      const char* mateReferenceName) 
00378 {
00379     myStatus = SamStatus::SUCCESS;
00380     // Set the mate reference, if it is "=", set it to be equal
00381     // to myReferenceName.  This assumes that myReferenceName has already
00382     // been called.
00383     if(strcmp(mateReferenceName, FIELD_ABSENT_STRING) == 0)
00384     {
00385         myMateReferenceName = myReferenceName;
00386     }
00387     else
00388     {
00389         myMateReferenceName = mateReferenceName;
00390     }
00391 
00392     // Set the Mate Reference ID.
00393     // If the reference ID does not already exist, add it (pass true)
00394     myRecordPtr->myMateReferenceID = 
00395         header.getReferenceID(myMateReferenceName, true);
00396 
00397     return true;
00398 }
00399 
00400 
00401 bool SamRecord::set1BasedMatePosition(int32_t matePosition)
00402 {
00403     return(set0BasedMatePosition(matePosition - 1));
00404 }
00405 
00406 
00407 bool SamRecord::set0BasedMatePosition(int32_t matePosition)
00408 {
00409     myStatus = SamStatus::SUCCESS;
00410     myRecordPtr->myMatePosition = matePosition;
00411     return true;
00412 }
00413 
00414 
00415 bool SamRecord::setInsertSize(int32_t insertSize)
00416 {
00417     myStatus = SamStatus::SUCCESS;
00418     myRecordPtr->myInsertSize = insertSize;
00419     return true;
00420 }
00421 
00422 
00423 bool SamRecord::setSequence(const char* seq) 
00424 {
00425     myStatus = SamStatus::SUCCESS;
00426     mySequence = seq;
00427     mySeqWithEq.clear();
00428     mySeqWithoutEq.clear();
00429    
00430     myIsBufferSynced = false;
00431     myIsSequenceBufferValid = false;
00432     return true;
00433 }
00434 
00435 
00436 bool SamRecord::setQuality(const char* quality) 
00437 {
00438     myStatus = SamStatus::SUCCESS;
00439     myQuality = quality;
00440     myIsBufferSynced = false;
00441     myIsQualityBufferValid = false;
00442     return true;
00443 }
00444 
00445 
00446 // Set the BAM record from the passeed in buffer of the specified size.
00447 // Note: The size includes the block size.
00448 SamStatus::Status SamRecord::setBuffer(const char* fromBuffer,
00449                                        uint32_t fromBufferSize,
00450                                        SamFileHeader& header)
00451 {
00452     myStatus = SamStatus::SUCCESS;
00453     if((fromBuffer == NULL) || (fromBufferSize == 0))
00454     {
00455         // Buffer is empty.
00456         myStatus.setStatus(SamStatus::FAIL_PARSE,
00457                            "Cannot parse an empty file.");
00458         return(SamStatus::FAIL_PARSE);
00459     }
00460 
00461     // Clear the record.   
00462     resetRecord();
00463 
00464     // allocate space for the record size.
00465     if(!allocateRecordStructure(fromBufferSize))
00466     {
00467         // Failed to allocate space.
00468         return(SamStatus::FAIL_MEM);
00469     }
00470    
00471     memcpy(myRecordPtr, fromBuffer, fromBufferSize);
00472 
00473     setVariablesForNewBuffer(header);
00474 
00475     // Return the status of the record.
00476     return(SamStatus::SUCCESS);
00477 }
00478 
00479 
00480 // Add the specified tag to the record.
00481 // Returns true if the tag was successfully added, false otherwise.
00482 bool SamRecord::addIntTag(const char* tag, int32_t value)
00483 {
00484     myStatus = SamStatus::SUCCESS;
00485     int key = 0;
00486     int index = 0;
00487     char bamvtype;
00488 
00489     int tagBufferSize = 0;
00490 
00491     // First check to see if the tags need to be synced to the buffer.
00492     if(myNeedToSetTagsFromBuffer)
00493     {
00494         if(!setTagsFromBuffer())
00495         {
00496             // Failed to read tags from the buffer, so cannot add new ones.
00497             return(false);
00498         }
00499     }
00500 
00501     // Ints come in as int.  But it can be represented in fewer bits.
00502     // So determine a more specific type that is in line with the
00503     // types for BAM files.
00504     // First check to see if it is a negative.
00505     if(value < 0)
00506     {
00507         // The int is negative, so it will need to use a signed type.
00508         // See if it is greater than the min value for a char.
00509         if(value > std::numeric_limits<char>::min())
00510         {
00511             // It can be stored in a signed char.
00512             bamvtype = 'c';
00513             tagBufferSize += 4;
00514         }
00515         else if(value > std::numeric_limits<short>::min())
00516         {
00517             // It fits in a signed short.
00518             bamvtype = 's';
00519             tagBufferSize += 5;
00520         }
00521         else
00522         {
00523             // Just store it as a signed int.
00524             bamvtype = 'i';
00525             tagBufferSize += 7;
00526         }
00527     }
00528     else
00529     {
00530         // It is positive, so an unsigned type can be used.
00531         if(value < std::numeric_limits<unsigned char>::max())
00532         {
00533             // It is under the max of an unsigned char.
00534             bamvtype = 'C';
00535             tagBufferSize += 4;
00536         }
00537         else if(value < std::numeric_limits<unsigned short>::max())
00538         {
00539             // It is under the max of an unsigned short.
00540             bamvtype = 'S';
00541             tagBufferSize += 5;
00542         }
00543         else
00544         {
00545             // Just store it as an unsigned int.
00546             bamvtype = 'I';
00547             tagBufferSize += 7;
00548         }
00549     }
00550 
00551     // Check to see if the tag is already there.
00552     key = MAKEKEY(tag[0], tag[1], bamvtype);
00553     unsigned int hashIndex = extras.Find(key);
00554     if(hashIndex != LH_NOTFOUND)
00555     {
00556         // Tag was already found.
00557         index = extras[hashIndex];
00558 
00559         // First check to see if the value changed.
00560         if((integers[index] == value) && (intType[index] == bamvtype))
00561         {
00562             // The value has not changed, so do nothing.
00563             return(true);
00564         }
00565         else
00566         {
00567             // Not the same value, so adjust the settings.
00568             // Subtract the size of the previous tag from tagBufferSize to get
00569             // the adjusted size.
00570             switch(intType[index])
00571             {
00572                 case 'c':
00573                 case 'C':
00574                     tagBufferSize -= 4;
00575                     break;
00576                 case 's':
00577                 case 'S':
00578                     tagBufferSize -= 5;
00579                     break;
00580                 case 'i':
00581                 case 'I':
00582                     tagBufferSize -= 7;
00583                     break;
00584                 default:
00585                 myStatus.setStatus(SamStatus::INVALID, 
00586                                    "unknown tag inttype type found.\n");
00587                 return(false);              
00588             }
00589             
00590             // Update the integer value and type.
00591             integers[index] = value;
00592             intType[index] = bamvtype;
00593         }
00594     }
00595     else
00596     {
00597         // Tag is not already there, so add it.
00598         index = integers.Length();
00599         
00600         integers.Push(value);
00601         intType.push_back(bamvtype);
00602 
00603         extras.Add(key, index);
00604     }
00605 
00606     // The buffer tags are now out of sync.
00607     myNeedToSetTagsInBuffer = true;
00608     myIsTagsBufferValid = false;
00609     myIsBufferSynced = false;
00610     myTagBufferSize += tagBufferSize;
00611 
00612     return(true);
00613 }
00614 
00615 
00616 // Add the specified tag to the record, replacing it if it is already there and
00617 // is different from the previous value.
00618 // Returns true if the tag was successfully added (or was already there), false otherwise.
00619 bool SamRecord::addTag(const char* tag, char vtype, const char* valuePtr)
00620 {
00621     if(vtype == 'i')
00622     {
00623         // integer type.  Call addIntTag to handle it.
00624         int intVal = atoi(valuePtr);
00625         return(addIntTag(tag, intVal));
00626     }
00627 
00628     // Non-int type.
00629     myStatus = SamStatus::SUCCESS;
00630     bool status = true; // default to successful.
00631     int key = 0;
00632     int index = 0;
00633 
00634     int tagBufferSize = 0;
00635 
00636     // First check to see if the tags need to be synced to the buffer.
00637     if(myNeedToSetTagsFromBuffer)
00638     {
00639         if(!setTagsFromBuffer())
00640         {
00641             // Failed to read tags from the buffer, so cannot add new ones.
00642             return(false);
00643         }
00644     }
00645 
00646     // First check to see if the tag is already there.
00647     key = MAKEKEY(tag[0], tag[1], vtype);
00648     unsigned int hashIndex = extras.Find(key);
00649     if(hashIndex != LH_NOTFOUND)
00650     {
00651         // The key was found in the hash, so get the lookup index.
00652         index = extras[hashIndex];
00653 
00654         // Adjust the currently pointed to value to the new setting.
00655         switch (vtype)
00656         {
00657             case 'A' :
00658                 // First check to see if the value changed.
00659                 if(integers[index] == (const int)*(valuePtr))
00660                 {
00661                     // The value has not changed, so do nothing.
00662                     return(true);
00663                 }
00664                 else
00665                 {
00666                     // Tag buffer size doesn't change between different 'A' entries.
00667                     integers[index] = (const int)*(valuePtr);
00668                     intType[index] = vtype;
00669                 }
00670                 break;
00671             case 'Z' :
00672                 // First check to see if the value changed.
00673                 if(strings[index] == valuePtr)
00674                 {
00675                     // The value has not changed, so do nothing.
00676                     return(true);
00677                 }
00678                 else
00679                 {
00680                     // Adjust the tagBufferSize by removing the size of the old string.
00681                     tagBufferSize -= strings[index].Length();
00682                     strings[index] = valuePtr;
00683                     // Adjust the tagBufferSize by adding the size of the new string.
00684                     tagBufferSize += strings[index].Length();
00685                 }
00686                 break;
00687             case 'f' :
00688                 // First check to see if the value changed.
00689                 if(doubles[index] == atof(valuePtr))
00690                 {
00691                     // The value has not changed, so do nothing.
00692                     return(true);
00693                 }
00694                 else
00695                 {
00696                     // Tag buffer size doesn't change between different 'f' entries.
00697                     doubles[index] = atof(valuePtr);
00698                 }
00699                 break;
00700             default :
00701                 fprintf(stderr,
00702                         "samFile::ReadSAM() - Unknown custom field of type %c\n",
00703                         vtype);
00704                 myStatus.setStatus(SamStatus::FAIL_PARSE, 
00705                                    "Unknown custom field in a tag");
00706                 status = false;
00707                 break;
00708         }
00709     }
00710     else
00711     {
00712         // The key was found not found in the hash, so add it.
00713         switch (vtype)
00714         {
00715             case 'A' :
00716                 index = integers.Length();
00717                 integers.Push((const int)*(valuePtr));
00718                 intType.push_back(vtype);
00719                 tagBufferSize += 4;
00720                 break;
00721             case 'Z' :
00722                 index = strings.Length();
00723                 strings.Push(valuePtr);
00724                 tagBufferSize += 4 + strings.Last().Length();
00725                 break;
00726             case 'f' :
00727                 index = doubles.Length();
00728                 doubles.Push(atof(valuePtr));
00729                 tagBufferSize += 7;
00730                 break;
00731             default :
00732                 fprintf(stderr,
00733                         "samFile::ReadSAM() - Unknown custom field of type %c\n",
00734                         vtype);
00735                 myStatus.setStatus(SamStatus::FAIL_PARSE, 
00736                                    "Unknown custom field in a tag");
00737                 status = false;
00738                 break;
00739         }
00740         if(status)
00741         {
00742             // If successful, add the key to extras.
00743             extras.Add(key, index);
00744         }
00745     }
00746 
00747     // Only add the tag if it has so far been successfully processed.
00748     if(status)
00749     {
00750         // The buffer tags are now out of sync.
00751         myNeedToSetTagsInBuffer = true;
00752         myIsTagsBufferValid = false;
00753         myIsBufferSynced = false;
00754         myTagBufferSize += tagBufferSize;
00755     }
00756     return(status);
00757 }
00758 
00759 
00760 //Shift indels to the left
00761 bool SamRecord::shiftIndelsLeft()
00762 {
00763     // Check to see whether or not the Cigar has already been
00764     // set - this is determined by checking if alignment length
00765     // is set since alignment length and the cigar are set
00766     // at the same time.
00767     if(myAlignmentLength == -1)
00768     {
00769         // Not been set, so calculate it.
00770         parseCigar();
00771     }
00772     
00773     // Track whether or not there was a shift.
00774     bool shifted = false;
00775 
00776     // Cigar is set, so now myCigarRoller can be used.
00777     // Track where in the read we are.
00778     uint32_t currentPos = 0;
00779 
00780     // Since the loop starts at 1 because the first operation can't be shifted,
00781     // increment the currentPos past the first operation.
00782     if(Cigar::foundInQuery(myCigarRoller[0]))
00783     {
00784         // This op was found in the read, increment the current position.
00785         currentPos += myCigarRoller[0].count;
00786     }
00787    
00788     int numOps = myCigarRoller.size();
00789     
00790     // Loop through the cigar operations from the 2nd operation since
00791     // the first operation is already on the end and can't shift.
00792     for(int currentOp = 1; currentOp < numOps; currentOp++)
00793     {
00794         if(myCigarRoller[currentOp].operation == Cigar::insert)
00795         {
00796             // For now, only shift a max of 1 operation.
00797             int prevOpIndex = currentOp-1;
00798             // Track the next op for seeing if it is the same as the
00799             // previous for merging reasons.
00800             int nextOpIndex = currentOp+1;
00801             if(nextOpIndex == numOps)
00802             {
00803                 // There is no next op, so set it equal to the current one.
00804                 nextOpIndex = currentOp;
00805             }
00806             // The start of the previous operation, so we know when we hit it
00807             // so we don't shift past it.
00808             uint32_t prevOpStart = 
00809                 currentPos - myCigarRoller[prevOpIndex].count;
00810 
00811             // We can only shift if the previous operation
00812             if(!Cigar::isMatchOrMismatch(myCigarRoller[prevOpIndex]))
00813             {
00814                 // TODO - shift past pads
00815                 // An insert is in the read, so increment the position.
00816                 currentPos += myCigarRoller[currentOp].count;                 
00817                 // Not a match/mismatch, so can't shift into it.
00818                 continue;
00819             }
00820                     
00821             // It is a match or mismatch, so check to see if we can
00822             // shift into it.
00823 
00824             // The end of the insert is calculated by adding the size
00825             // of this insert minus 1 to the start of the insert.
00826             uint32_t insertEndPos = 
00827                 currentPos + myCigarRoller[currentOp].count - 1;
00828                 
00829             // The insert starts at the current position.
00830             uint32_t insertStartPos = currentPos;
00831                 
00832             // Loop as long as the position before the insert start
00833             // matches the last character in the insert. If they match,
00834             // the insert can be shifted one index left because the
00835             // implied reference will not change.  If they do not match,
00836             // we can't shift because the implied reference would change.
00837             // Stop loop when insertStartPos = prevOpStart, because we 
00838             // don't want to move past that.
00839             while((insertStartPos > prevOpStart) && 
00840                   (getSequence(insertEndPos,BASES) == 
00841                    getSequence(insertStartPos - 1, BASES)))
00842             {
00843                 // We can shift, so move the insert start & end one left.
00844                 --insertEndPos;
00845                 --insertStartPos;
00846             }
00847 
00848             // Determine if a shift has occurred.
00849             int shiftLen = currentPos - insertStartPos;
00850             if(shiftLen > 0)
00851             {
00852                 // Shift occured, so adjust the cigar if the cigar will
00853                 // not become more operations.
00854                 // If the next operation is the same as the previous or
00855                 // if the insert and the previous operation switch positions
00856                 // then the cigar has the same number of operations.
00857                 // If the next operation is different, and the shift splits
00858                 // the previous operation in 2, then the cigar would
00859                 // become longer, so we do not want to shift.
00860                 if(myCigarRoller[nextOpIndex].operation == 
00861                    myCigarRoller[prevOpIndex].operation)
00862                 {
00863                     // The operations are the same, so merge them by adding
00864                     // the length of the shift to the next operation.
00865                     myCigarRoller.IncrementCount(nextOpIndex, shiftLen);
00866                     myCigarRoller.IncrementCount(prevOpIndex, -shiftLen);
00867 
00868                     // If the previous op length is 0, just remove that
00869                     // operation.
00870                     if(myCigarRoller[prevOpIndex].count == 0)
00871                     {
00872                         myCigarRoller.Remove(prevOpIndex);
00873                     }
00874                     shifted = true;
00875                 } 
00876                 else
00877                 {
00878                     // Can only shift if the insert shifts past the
00879                     // entire previous operation, otherwise an operation
00880                     // would need to be added.
00881                     if(insertStartPos == prevOpStart)
00882                     { 
00883                         // Swap the positions of the insert and the
00884                         // previous operation.
00885                         myCigarRoller.Update(currentOp,
00886                                              myCigarRoller[prevOpIndex].operation,
00887                                              myCigarRoller[prevOpIndex].count);
00888                         // Size of the previous op is the entire
00889                         // shift length.
00890                         myCigarRoller.Update(prevOpIndex, 
00891                                              Cigar::insert,
00892                                              shiftLen);
00893                         shifted = true;
00894                     }
00895                 }
00896             }
00897             // An insert is in the read, so increment the position.
00898             currentPos += myCigarRoller[currentOp].count;                 
00899         }
00900         else if(Cigar::foundInQuery(myCigarRoller[currentOp]))
00901         {
00902             // This op was found in the read, increment the current position.
00903             currentPos += myCigarRoller[currentOp].count;
00904         }
00905     }
00906     if(shifted)
00907     {
00908         // TODO - setCigar is currently inefficient because later the cigar
00909         // roller will be recalculated, but for now it will work.
00910         setCigar(myCigarRoller);
00911     }
00912     return(shifted);
00913 }
00914 
00915 
00916 // Get methods for record fields.
00917 const void* SamRecord::getRecordBuffer()
00918 {
00919     return(getRecordBuffer(mySequenceTranslation));
00920 }
00921 
00922 
00923 // Get methods for record fields.
00924 const void* SamRecord::getRecordBuffer(SequenceTranslation translation) 
00925 {
00926     myStatus = SamStatus::SUCCESS;
00927     bool status = true;
00928     // If the buffer is not synced or the sequence in the buffer is not
00929     // properly translated, fix the buffer.
00930     if((myIsBufferSynced == false) ||
00931        (myBufferSequenceTranslation != translation))
00932     {
00933         status &= fixBuffer(translation);
00934     }
00935     // If the buffer is synced, check to see if the tags need to be synced.
00936     if(myNeedToSetTagsInBuffer)
00937     {
00938         status &= setTagsInBuffer();
00939     }
00940     if(!status)
00941     {
00942         return(NULL);
00943     }
00944     return (const void *)myRecordPtr;
00945 }
00946 
00947 
00948 // Write the record as a buffer into the file using the class's 
00949 // sequence translation setting.
00950 SamStatus::Status SamRecord::writeRecordBuffer(IFILE filePtr)
00951 {
00952     return(writeRecordBuffer(filePtr, mySequenceTranslation));
00953 }
00954 
00955 
00956 // Write the record as a buffer into the file using the specified translation.
00957 SamStatus::Status SamRecord::writeRecordBuffer(IFILE filePtr, 
00958                                                SequenceTranslation translation)
00959 {
00960     myStatus = SamStatus::SUCCESS;
00961     if((filePtr == NULL) || (filePtr->isOpen() == false))
00962     {
00963         // File is not open, return failure.
00964         myStatus.setStatus(SamStatus::FAIL_ORDER,
00965                            "Can't write to an unopened file.");
00966         return(SamStatus::FAIL_ORDER);
00967     }
00968 
00969     if((myIsBufferSynced == false) ||
00970        (myBufferSequenceTranslation != translation))
00971     {
00972         if(!fixBuffer(translation))
00973         {
00974             return(myStatus.getStatus());
00975         }
00976     }
00977 
00978     // Write the record.
00979     unsigned int numBytesToWrite = myRecordPtr->myBlockSize + sizeof(int32_t);
00980     unsigned int numBytesWritten = 
00981         ifwrite(filePtr, myRecordPtr, numBytesToWrite);
00982 
00983     // Return status based on if the correct number of bytes were written.
00984     if(numBytesToWrite == numBytesWritten)
00985     {
00986         return(SamStatus::SUCCESS);
00987     }
00988     // The correct number of bytes were not written.
00989     myStatus.setStatus(SamStatus::FAIL_IO, "Failed to write the entire record.");
00990     return(SamStatus::FAIL_IO);
00991 }
00992 
00993 
00994 int32_t SamRecord::getBlockSize() 
00995 {
00996     myStatus = SamStatus::SUCCESS;
00997     // If the buffer isn't synced, sync the buffer to determine the
00998     // block size.
00999     if(myIsBufferSynced == false)
01000     {
01001         // Since this just returns the block size, the translation of
01002         // the sequence does not matter, so just use the currently set
01003         // value.
01004         fixBuffer(myBufferSequenceTranslation);
01005     }
01006     return myRecordPtr->myBlockSize;
01007 }
01008 
01009 
01010 // This method returns the reference name.
01011 const char* SamRecord::getReferenceName()
01012 {
01013     myStatus = SamStatus::SUCCESS;
01014     return myReferenceName.c_str();
01015 }
01016 
01017 
01018 int32_t SamRecord::getReferenceID()
01019 {
01020     myStatus = SamStatus::SUCCESS;
01021     return myRecordPtr->myReferenceID;
01022 }
01023 
01024 
01025 int32_t SamRecord::get1BasedPosition()
01026 {
01027     myStatus = SamStatus::SUCCESS;
01028     return (myRecordPtr->myPosition + 1);
01029 }
01030 
01031 
01032 int32_t SamRecord::get0BasedPosition()
01033 {
01034     myStatus = SamStatus::SUCCESS;
01035     return myRecordPtr->myPosition;
01036 }
01037 
01038 
01039 uint8_t SamRecord::getReadNameLength() 
01040 {
01041     myStatus = SamStatus::SUCCESS;
01042     // If the buffer is valid, return the size from there, otherwise get the 
01043     // size from the string length + 1 (ending null).
01044     if(myIsReadNameBufferValid)
01045     {
01046         return(myRecordPtr->myReadNameLength);
01047     }
01048    
01049     return(myReadName.Length() + 1);
01050 }
01051 
01052 
01053 uint8_t SamRecord::getMapQuality()
01054 {
01055     myStatus = SamStatus::SUCCESS;
01056     return myRecordPtr->myMapQuality;
01057 }
01058 
01059 
01060 uint16_t SamRecord::getBin()
01061 {
01062     myStatus = SamStatus::SUCCESS;
01063     if(!myIsBinValid)
01064     {
01065         // The bin that is set in the record is not valid, so
01066         // reset it.
01067         myRecordPtr->myBin = 
01068             bam_reg2bin(myRecordPtr->myPosition, get1BasedAlignmentEnd());      
01069         myIsBinValid = true;
01070     }
01071     return(myRecordPtr->myBin);
01072 }
01073 
01074 
01075 uint16_t SamRecord::getCigarLength()
01076 {
01077     myStatus = SamStatus::SUCCESS;
01078     // If the cigar buffer is valid
01079     // then get the length from there.
01080     if(myIsCigarBufferValid)
01081     {
01082         return myRecordPtr->myCigarLength;      
01083     }
01084 
01085     if(myCigarTempBufferLength == -1)
01086     {
01087         // The cigar buffer is not valid and the cigar temp buffer is not set,
01088         // so parse the string.
01089         parseCigarString();
01090     }
01091    
01092     // The temp buffer is now set, so return the size.
01093     return(myCigarTempBufferLength);
01094 }
01095 
01096 
01097 uint16_t SamRecord::getFlag()
01098 {
01099     myStatus = SamStatus::SUCCESS;
01100     return myRecordPtr->myFlag;
01101 }
01102 
01103 
01104 int32_t SamRecord::getReadLength() 
01105 {
01106     myStatus = SamStatus::SUCCESS;
01107     if(myIsSequenceBufferValid == false)
01108     {
01109         // If the sequence is "*", then return 0.
01110         if((mySequence.Length() == 1) && (mySequence[0] == '*'))
01111         {
01112             return(0);
01113         }
01114         // Do not add 1 since it is not null terminated.
01115         return(mySequence.Length());
01116     }
01117     return(myRecordPtr->myReadLength);
01118 }
01119 
01120 
01121 // This method returns the mate reference name.  If it is equal to the
01122 // reference name, it still returns the reference name.
01123 const char* SamRecord::getMateReferenceName()
01124 {
01125     myStatus = SamStatus::SUCCESS;
01126     return myMateReferenceName.c_str();
01127 }
01128 
01129 
01130 // This method returns the mate reference name.  If it is equal to the
01131 // reference name, it returns "=", unless they are both "*" in which case
01132 // "*" is returned.
01133 const char* SamRecord::getMateReferenceNameOrEqual()
01134 {
01135     myStatus = SamStatus::SUCCESS;
01136     if(myMateReferenceName == "*")
01137     {
01138         return(myMateReferenceName);
01139     }
01140     if(myMateReferenceName == getReferenceName())
01141     {
01142         return(FIELD_ABSENT_STRING);
01143     }
01144     else
01145     {
01146         return(myMateReferenceName);
01147     }
01148 }
01149 
01150 
01151 int32_t SamRecord::getMateReferenceID()
01152 {
01153     myStatus = SamStatus::SUCCESS;
01154     return myRecordPtr->myMateReferenceID;
01155 }
01156 
01157 
01158 int32_t SamRecord::get1BasedMatePosition()
01159 {
01160     myStatus = SamStatus::SUCCESS;
01161     return (myRecordPtr->myMatePosition + 1);
01162 }
01163 
01164 
01165 int32_t SamRecord::get0BasedMatePosition()
01166 {
01167     myStatus = SamStatus::SUCCESS;
01168     return myRecordPtr->myMatePosition;
01169 }
01170 
01171 
01172 int32_t SamRecord::getInsertSize()
01173 {
01174     myStatus = SamStatus::SUCCESS;
01175     return myRecordPtr->myInsertSize;
01176 }
01177 
01178 
01179 // Returns the inclusive rightmost position of the clipped sequence.
01180 int32_t SamRecord::get0BasedAlignmentEnd()
01181 {
01182     myStatus = SamStatus::SUCCESS;
01183     if(myAlignmentLength == -1)
01184     {
01185         // Alignment end has not been set, so calculate it.
01186         parseCigar();
01187     }
01188     // If alignment length > 0, subtract 1 from it to get the end.
01189     if(myAlignmentLength == 0)
01190     {
01191         // Length is 0, just return the start position.
01192         return(myRecordPtr->myPosition);
01193     }
01194     return(myRecordPtr->myPosition + myAlignmentLength - 1);
01195 }
01196 
01197 
01198 // Returns the inclusive rightmost position of the clipped sequence.
01199 int32_t SamRecord::get1BasedAlignmentEnd()
01200 {
01201     return(get0BasedAlignmentEnd() + 1);
01202 }
01203 
01204    
01205 // Return the length of the alignment.
01206 int32_t SamRecord::getAlignmentLength()
01207 {
01208     myStatus = SamStatus::SUCCESS;
01209     if(myAlignmentLength == -1)
01210     {
01211         // Alignment end has not been set, so calculate it.
01212         parseCigar();
01213     }
01214     // Return the alignment length.
01215     return(myAlignmentLength);
01216 }
01217 
01218 // Returns the inclusive left-most position adjust for clipped bases.
01219 int32_t SamRecord::get0BasedUnclippedStart()
01220 {
01221     myStatus = SamStatus::SUCCESS;
01222     if(myUnclippedStartOffset == -1)
01223     {
01224         // Unclipped has not yet been calculated, so parse the cigar to get it
01225         parseCigar();
01226     }
01227     return(myRecordPtr->myPosition - myUnclippedStartOffset);
01228 }
01229 
01230 
01231 // Returns the inclusive left-most position adjust for clipped bases.
01232 int32_t SamRecord::get1BasedUnclippedStart()
01233 {
01234     return(get0BasedUnclippedStart() + 1);
01235 }
01236 
01237 
01238 // Returns the inclusive right-most position adjust for clipped bases.
01239 int32_t SamRecord::get0BasedUnclippedEnd()
01240 {
01241     // myUnclippedEndOffset will be set by get0BasedAlignmentEnd if the 
01242     // cigar has not yet been parsed, so no need to check it here.
01243     return(get0BasedAlignmentEnd() + myUnclippedEndOffset);
01244 }
01245 
01246 
01247 // Returns the inclusive right-most position adjust for clipped bases.
01248 int32_t SamRecord::get1BasedUnclippedEnd()
01249 {
01250     return(get0BasedUnclippedEnd() + 1);
01251 }
01252 
01253 
01254 // Get the read name.
01255 const char* SamRecord::getReadName()
01256 {
01257     myStatus = SamStatus::SUCCESS;
01258     if(myReadName.Length() == 0)
01259     {
01260         // 0 Length, means that it is in the buffer, but has not yet
01261         // been synced to the string, so do the sync.
01262         myReadName = (char*)&(myRecordPtr->myData);
01263     }
01264     return myReadName.c_str();
01265 }
01266 
01267 
01268 const char* SamRecord::getCigar()
01269 {
01270     myStatus = SamStatus::SUCCESS;
01271     if(myCigar.Length() == 0)
01272     {
01273         // 0 Length, means that it is in the buffer, but has not yet
01274         // been synced to the string, so do the sync.
01275         parseCigarBinary();
01276     }
01277     return myCigar.c_str();
01278 }
01279 
01280 
01281 const char* SamRecord::getSequence()
01282 {
01283     return(getSequence(mySequenceTranslation));
01284 }
01285 
01286 
01287 const char* SamRecord::getSequence(SequenceTranslation translation)
01288 {
01289     myStatus = SamStatus::SUCCESS;
01290     if(mySequence.Length() == 0)
01291     {
01292         // 0 Length, means that it is in the buffer, but has not yet
01293         // been synced to the string, so do the sync.
01294         setSequenceAndQualityFromBuffer();
01295     }
01296 
01297     // Determine if translation needs to be done.
01298     if((translation == NONE) || (myRefPtr == NULL))
01299     {
01300         return mySequence.c_str();
01301     }
01302     else if(translation == EQUAL)
01303     {
01304         if(mySeqWithEq.length() == 0)
01305         {
01306             // Check to see if the sequence is defined.
01307             if(mySequence == "*")
01308             {
01309                 // Sequence is undefined, so no translation necessary.
01310                 mySeqWithEq = '*';
01311             }
01312             else
01313             {
01314                 // Sequence defined, so translate it.
01315                 SamQuerySeqWithRef::seqWithEquals(mySequence.c_str(), 
01316                                                   myRecordPtr->myPosition,
01317                                                   *(getCigarInfo()),
01318                                                   getReferenceName(),
01319                                                   *myRefPtr,
01320                                                   mySeqWithEq);
01321             }
01322         }
01323         return(mySeqWithEq.c_str());
01324     }
01325     else
01326     {
01327         // translation == BASES
01328         if(mySeqWithoutEq.length() == 0)
01329         {
01330             if(mySequence == "*")
01331             {
01332                 // Sequence is undefined, so no translation necessary.
01333                 mySeqWithoutEq = '*';
01334             }
01335             else
01336             {
01337                 // Sequence defined, so translate it.
01338                 SamQuerySeqWithRef::seqWithoutEquals(mySequence.c_str(), 
01339                                                      myRecordPtr->myPosition,
01340                                                      *(getCigarInfo()),
01341                                                      getReferenceName(),
01342                                                      *myRefPtr,
01343                                                      mySeqWithoutEq);
01344             }
01345         }
01346         return(mySeqWithoutEq.c_str());
01347     }
01348 }
01349 
01350 
01351 const char* SamRecord::getQuality() 
01352 {
01353     myStatus = SamStatus::SUCCESS;
01354     if(myQuality.Length() == 0)
01355     {
01356         // 0 Length, means that it is in the buffer, but has not yet
01357         // been synced to the string, so do the sync.
01358         setSequenceAndQualityFromBuffer();      
01359     }
01360     return myQuality.c_str();
01361 }
01362 
01363 
01364 char SamRecord::getSequence(int index)
01365 {
01366     return(getSequence(index, mySequenceTranslation));
01367 }
01368 
01369 
01370 char SamRecord::getSequence(int index, SequenceTranslation translation)
01371 {
01372     static const char * asciiBases = "=AC.G...T......N";
01373 
01374     // Determine the read length.
01375     int32_t readLen = getReadLength();
01376 
01377     // If the read length is 0, this method should not be called.
01378     if(readLen == 0)
01379     {
01380         String exceptionString = "SamRecord::getSequence(";
01381         exceptionString += index;
01382         exceptionString += ") is not allowed since sequence = '*'";
01383         throw std::runtime_error(exceptionString.c_str());
01384     }
01385     else if((index < 0) || (index >= readLen))
01386     {
01387         // Only get here if the index was out of range, so thow an exception.
01388         String exceptionString = "SamRecord::getSequence(";
01389         exceptionString += index;
01390         exceptionString += ") is out of range. Index must be between 0 and ";
01391         exceptionString += (readLen - 1);
01392         throw std::runtime_error(exceptionString.c_str());
01393     }
01394 
01395     // Determine if translation needs to be done.
01396     if((translation == NONE) || (myRefPtr == NULL))
01397     {
01398         // No translation needs to be done.
01399         if(mySequence.Length() == 0)
01400         {
01401             // Parse BAM sequence.
01402             // TODO - maybe store this pointer - and use that to track when
01403             // valid?
01404             unsigned char * packedSequence = 
01405                 (unsigned char *)myRecordPtr->myData + 
01406                 myRecordPtr->myReadNameLength +
01407                 myRecordPtr->myCigarLength * sizeof(int);
01408             
01409             return(index & 1 ?
01410                    asciiBases[packedSequence[index / 2] & 0xF] :
01411                    asciiBases[packedSequence[index / 2] >> 4]);
01412         }
01413         // Already have string.
01414         return(mySequence[index]);
01415     }
01416     else
01417     {
01418         // Need to translate the sequence either to have '=' or to not
01419         // have it.
01420         // First check to see if the sequence has been set.
01421         if(mySequence.Length() == 0)
01422         {
01423             // 0 Length, means that it is in the buffer, but has not yet
01424             // been synced to the string, so do the sync.
01425             setSequenceAndQualityFromBuffer();
01426         }
01427 
01428         // Check the type of translation.
01429         if(translation == EQUAL)
01430         {
01431             // Check whether or not the string has already been 
01432             // retrieved that has the '=' in it.
01433             if(mySeqWithEq.length() == 0)
01434             {
01435                 // The string with '=' has not yet been determined,
01436                 // so get the string.
01437                 // Check to see if the sequence is defined.
01438                 if(mySequence == "*")
01439                 {
01440                     // Sequence is undefined, so no translation necessary.
01441                     mySeqWithEq = '*';
01442                 }
01443                 else
01444                 {
01445                     // Sequence defined, so translate it.
01446                     SamQuerySeqWithRef::seqWithEquals(mySequence.c_str(), 
01447                                                       myRecordPtr->myPosition, 
01448                                                       *(getCigarInfo()),
01449                                                       getReferenceName(),
01450                                                       *myRefPtr,
01451                                                       mySeqWithEq);
01452                 }
01453             }
01454             // Sequence is set, so return it.
01455             return(mySeqWithEq[index]);
01456         }
01457         else
01458         {
01459             // translation == BASES
01460             // Check whether or not the string has already been 
01461             // retrieved that does not have the '=' in it.
01462             if(mySeqWithoutEq.length() == 0)
01463             {
01464                 // The string with '=' has not yet been determined,
01465                 // so get the string.
01466                 // Check to see if the sequence is defined.
01467                 if(mySequence == "*")
01468                 {
01469                     // Sequence is undefined, so no translation necessary.
01470                     mySeqWithoutEq = '*';
01471                 }
01472                 else
01473                 {
01474                     // Sequence defined, so translate it.
01475                     // The string without '=' has not yet been determined,
01476                     // so get the string.
01477                     SamQuerySeqWithRef::seqWithoutEquals(mySequence.c_str(), 
01478                                                          myRecordPtr->myPosition, 
01479                                                          *(getCigarInfo()),
01480                                                          getReferenceName(),
01481                                                          *myRefPtr,
01482                                                          mySeqWithoutEq);
01483                 }
01484             }
01485             // Sequence is set, so return it.
01486             return(mySeqWithoutEq[index]);
01487         }
01488     }
01489 }
01490 
01491 
01492 char SamRecord::getQuality(int index)
01493 {
01494     // Determine the read length.
01495     int32_t readLen = getReadLength();
01496 
01497     // If the read length is 0, return ' ' whose ascii code is below
01498     // the minimum ascii code for qualities.
01499     if(readLen == 0)
01500     {
01501         return(BaseUtilities::UNKNOWN_QUALITY_CHAR);
01502     }
01503     else if((index < 0) || (index >= readLen))
01504     {
01505         // Only get here if the index was out of range, so thow an exception.
01506         String exceptionString = "SamRecord::getQuality(";
01507         exceptionString += index;
01508         exceptionString += ") is out of range. Index must be between 0 and ";
01509         exceptionString += (readLen - 1);
01510         throw std::runtime_error(exceptionString.c_str());
01511     }
01512 
01513     if(myQuality.Length() == 0)
01514     {
01515         // Parse BAM Quality.
01516         unsigned char * packedQuality = 
01517             (unsigned char *)myRecordPtr->myData +
01518             myRecordPtr->myReadNameLength + 
01519             myRecordPtr->myCigarLength * sizeof(int) + 
01520             (myRecordPtr->myReadLength + 1) / 2;
01521         return(packedQuality[index] + 33);
01522     }
01523     else
01524     {
01525         // Already have string.
01526         if((myQuality.Length() == 1) && (myQuality[0] == '*'))
01527         {
01528             // Return the unknown quality character.
01529             return(BaseUtilities::UNKNOWN_QUALITY_CHAR);
01530         }
01531         else
01532         {
01533             return(myQuality[index]);
01534         }
01535     }
01536 }
01537 
01538    
01539 Cigar* SamRecord::getCigarInfo()
01540 {
01541     // Check to see whether or not the Cigar has already been
01542     // set - this is determined by checking if alignment length
01543     // is set since alignment length and the cigar are set
01544     // at the same time.
01545     if(myAlignmentLength == -1)
01546     {
01547         // Not been set, so calculate it.
01548         parseCigar();
01549     }
01550     return(&myCigarRoller);
01551 }
01552 
01553 
01554 uint32_t SamRecord::getTagLength()
01555 {
01556     myStatus = SamStatus::SUCCESS;
01557     if(myNeedToSetTagsFromBuffer)
01558     {
01559         // Tags are only set in the buffer, so the size of the tags is 
01560         // the length of the record minus the starting location of the tags.
01561         unsigned char * tagStart = 
01562             (unsigned char *)myRecordPtr->myData 
01563             + myRecordPtr->myReadNameLength 
01564             + myRecordPtr->myCigarLength * sizeof(int)
01565             + (myRecordPtr->myReadLength + 1) / 2 + myRecordPtr->myReadLength;
01566       
01567         // The non-tags take up from the start of the record to the tag start.
01568         // Do not include the block size part of the record since it is not
01569         // included in the size.
01570         uint32_t nonTagSize = 
01571             tagStart - (unsigned char*)&(myRecordPtr->myReferenceID);
01572         // Tags take up the size of the block minus the non-tag section.
01573         uint32_t tagSize = myRecordPtr->myBlockSize - nonTagSize;
01574         return(tagSize);
01575     }
01576 
01577     // Tags are stored outside the buffer, so myTagBufferSize is set.
01578     return(myTagBufferSize);
01579 }
01580 
01581 
01582 // Returns true if there is another tag and sets tag and vtype to the
01583 // appropriate values, and returns a pointer to the value.
01584 // Sets the Status to SUCCESS when a tag is successfully returned or
01585 // when there are no more tags.  Otherwise the status is set to describe
01586 // why it failed (parsing, etc).
01587 bool SamRecord::getNextSamTag(char* tag, char& vtype, void** value)
01588 {
01589     myStatus = SamStatus::SUCCESS;
01590     if(myNeedToSetTagsFromBuffer)
01591     {
01592         if(!setTagsFromBuffer())
01593         {
01594             // Failed to read the tags from the buffer, so cannot
01595             // get tags.
01596             return(false);
01597         }
01598     }
01599 
01600     // Increment the tag index to start looking at the next tag.
01601     // At the beginning, it is set to -1.
01602     myLastTagIndex++;
01603     int maxTagIndex = extras.Capacity();
01604     if(myLastTagIndex >= maxTagIndex)
01605     {
01606         // Hit the end of the tags, return false, no more tags.
01607         // Status is still success since this is not an error, 
01608         // it is just the end of the list.
01609         return(false);
01610     }
01611 
01612     bool tagFound = false;
01613     // Loop until a tag is found or the end of extras is hit.
01614     while((tagFound == false) && (myLastTagIndex < maxTagIndex))
01615     {
01616         if(extras.SlotInUse(myLastTagIndex))
01617         {
01618             // Found a slot to use.
01619             int key = extras.GetKey(myLastTagIndex);
01620             getTag(key, tag);
01621             getTypeFromKey(key, vtype);
01622             tagFound = true;
01623             // Get the value associated with the key based on the vtype.
01624             switch (vtype)
01625             {
01626                 case 'f' :
01627                     *value = getDoublePtr(myLastTagIndex);
01628                     break;
01629                 case 'i' :
01630                     *value = getIntegerPtr(myLastTagIndex, vtype);
01631                     if(vtype != 'A')
01632                     {
01633                         // Convert all int types to 'i'
01634                         vtype = 'i';
01635                     }
01636                     break;
01637                 case 'Z' :
01638                     *value = getStringPtr(myLastTagIndex);
01639                     break;
01640                 default:
01641                     myStatus.setStatus(SamStatus::FAIL_PARSE,
01642                                        "Unknown tag type");
01643                     tagFound = false;
01644                     break;
01645             }
01646         }
01647         if(!tagFound)
01648         {
01649             // Increment the index since a tag was not found.
01650             myLastTagIndex++;
01651         }
01652     }
01653     return(tagFound);
01654 }
01655 
01656 
01657 // Returns the values of all fields except the tags.
01658 bool SamRecord::getFields(bamRecordStruct& recStruct, String& readName, 
01659                           String& cigar, String& sequence, String& quality)
01660 {
01661     return(getFields(recStruct, readName, cigar, sequence, quality,
01662                      mySequenceTranslation));
01663 }
01664 
01665 
01666 // Returns the values of all fields except the tags.
01667 bool SamRecord::getFields(bamRecordStruct& recStruct, String& readName, 
01668                           String& cigar, String& sequence, String& quality,
01669                           SequenceTranslation translation)
01670 {
01671     myStatus = SamStatus::SUCCESS;
01672     if(myIsBufferSynced == false)
01673     {
01674         if(!fixBuffer(translation))
01675         {
01676             // failed to set the buffer, return false.
01677             return(false);
01678         }
01679     }
01680     memcpy(&recStruct, myRecordPtr, sizeof(bamRecordStruct));
01681 
01682     readName = getReadName();
01683     // Check the status.
01684     if(myStatus != SamStatus::SUCCESS)
01685     {
01686         // Failed to set the fields, return false.
01687         return(false);
01688     }
01689     cigar = getCigar();
01690     // Check the status.
01691     if(myStatus != SamStatus::SUCCESS)
01692     {
01693         // Failed to set the fields, return false.
01694         return(false);
01695     }
01696     sequence = getSequence(translation);
01697     // Check the status.
01698     if(myStatus != SamStatus::SUCCESS)
01699     {
01700         // Failed to set the fields, return false.
01701         return(false);
01702     }
01703     quality = getQuality();
01704     // Check the status.
01705     if(myStatus != SamStatus::SUCCESS)
01706     {
01707         // Failed to set the fields, return false.
01708         return(false);
01709     }
01710     return(true);
01711 }
01712 
01713 
01714 // Returns the reference pointer.
01715 GenomeSequence* SamRecord::getReference()
01716 {
01717     return(myRefPtr);
01718 }
01719 
01720 
01721 bool SamRecord::isIntegerType(char vtype) const
01722 {
01723     if((vtype == 'c') || (vtype == 'C') ||
01724        (vtype == 's') || (vtype == 'S') ||
01725        (vtype == 'i') || (vtype == 'I'))
01726     {
01727         return(true);
01728     }
01729     return(false);
01730 }
01731 
01732 
01733 bool SamRecord::isDoubleType(char vtype) const
01734 {
01735     if(vtype == 'f')
01736     {
01737         return(true);
01738     }
01739     return(false);
01740 }
01741 
01742 
01743 bool SamRecord::isCharType(char vtype) const
01744 {
01745     if(vtype == 'A')
01746     {
01747         return(true);
01748     }
01749     return(false);
01750 }
01751 
01752 
01753 bool SamRecord::isStringType(char vtype) const
01754 {
01755     if(vtype == 'Z')
01756     {
01757         return(true);
01758     }
01759     return(false);
01760 }
01761 
01762 
01763 void SamRecord::clearTags()
01764 {
01765     if(extras.Entries() != 0)
01766     {
01767         extras.Clear();
01768     }
01769     strings.Clear();
01770     integers.Clear();
01771     intType.clear();
01772     doubles.Clear();
01773     myTagBufferSize = 0;
01774     resetTagIter();
01775 }
01776 
01777 
01778 bool SamRecord::rmTag(const char* tag, char type)
01779 {
01780     // Check the length of tag.
01781     if(strlen(tag) != 2)
01782     {
01783         // Tag is the wrong length.
01784         myStatus.setStatus(SamStatus::INVALID, 
01785                            "rmTag called with tag that is not 2 characters\n");
01786         return(false);
01787     }
01788 
01789     myStatus = SamStatus::SUCCESS;
01790     if(myNeedToSetTagsFromBuffer)
01791     {
01792         if(!setTagsFromBuffer())
01793         {
01794             // Failed to read the tags from the buffer, so cannot
01795             // get tags.
01796             return(false);
01797         }
01798     }
01799 
01800     // Construct the key.
01801     int key = MAKEKEY(tag[0], tag[1], type);
01802     // Look to see if the key exsists in the hash.
01803     int offset = extras.Find(key);
01804 
01805     if(offset < 0)
01806     {
01807         // Not found, so return true, successfully removed since
01808         // it is not in tag.
01809         return(true);
01810     }
01811 
01812     // Offset is set, so the key was found.
01813     // First if it is an integer, determine the actual type of the int.
01814     char vtype;
01815     getTypeFromKey(key, vtype);
01816     if(vtype == 'i')
01817     {
01818         vtype = getIntegerType(offset);
01819     }
01820 
01821     // Offset is set, so recalculate the buffer size without this entry.
01822     // Do NOT remove from strings, integers, or doubles because then
01823     // extras would need to be updated for all entries with the new indexes
01824     // into those variables.
01825     int rmBuffSize = 0;
01826     switch(vtype)
01827     {
01828         case 'A':
01829         case 'c':
01830         case 'C':
01831             rmBuffSize = 4;
01832             break;
01833         case 's':
01834         case 'S':
01835             rmBuffSize = 5;
01836             break;
01837         case 'i':
01838         case 'I':
01839             rmBuffSize = 7;
01840             break;
01841         case 'f':
01842             rmBuffSize = 7;
01843             break;
01844         case 'Z':
01845             rmBuffSize = 4 + getString(offset).Length();
01846             break;
01847         default:
01848             myStatus.setStatus(SamStatus::INVALID, 
01849                                "rmTag called with unknown type.\n");
01850             return(false);
01851             break;
01852     };
01853 
01854     // The buffer tags are now out of sync.
01855     myNeedToSetTagsInBuffer = true;
01856     myIsTagsBufferValid = false;
01857     myIsBufferSynced = false;
01858     myTagBufferSize -= rmBuffSize;
01859 
01860     // Remove from the hash.
01861     extras.Delete(offset);
01862     return(true);
01863 }
01864 
01865 
01866 bool SamRecord::rmTags(const char* tags)
01867 {
01868     const char* currentTagPtr = tags;
01869 
01870     myStatus = SamStatus::SUCCESS;
01871     if(myNeedToSetTagsFromBuffer)
01872     {
01873         if(!setTagsFromBuffer())
01874         {
01875             // Failed to read the tags from the buffer, so cannot
01876             // get tags.
01877             return(false);
01878         }
01879     }
01880     
01881     bool returnStatus = true;
01882 
01883     int rmBuffSize = 0;
01884     while(*currentTagPtr != '\0')
01885     {
01886 
01887         // Tags are formatted as: XY:Z
01888         // Where X is [A-Za-z], Y is [A-Za-z], and
01889         // Z is A,i,f,Z,H (cCsSI are also excepted)
01890         if((currentTagPtr[0] == '\0') || (currentTagPtr[1] == '\0') ||
01891            (currentTagPtr[2] != ':') || (currentTagPtr[3] == '\0'))
01892         {
01893             myStatus.setStatus(SamStatus::INVALID, 
01894                                "rmTags called with improperly formatted tags.\n");
01895             returnStatus = false;
01896             break;
01897         }
01898 
01899         // Construct the key.
01900         int key = MAKEKEY(currentTagPtr[0], currentTagPtr[1], 
01901                           currentTagPtr[3]);
01902         // Look to see if the key exsists in the hash.
01903         int offset = extras.Find(key);
01904 
01905         if(offset >= 0)
01906         {
01907             // Offset is set, so the key was found.
01908             // First if it is an integer, determine the actual type of the int.
01909             char vtype;
01910             getTypeFromKey(key, vtype);
01911             if(vtype == 'i')
01912             {
01913                 vtype = getIntegerType(offset);
01914             }
01915             
01916             // Offset is set, so recalculate the buffer size without this entry.
01917             // Do NOT remove from strings, integers, or doubles because then
01918             // extras would need to be updated for all entries with the new indexes
01919             // into those variables.
01920             switch(vtype)
01921             {
01922                 case 'A':
01923                 case 'c':
01924                 case 'C':
01925                     rmBuffSize += 4;
01926                     break;
01927                 case 's':
01928                 case 'S':
01929                     rmBuffSize += 5;
01930                     break;
01931                 case 'i':
01932                 case 'I':
01933                     rmBuffSize += 7;
01934                     break;
01935                 case 'f':
01936                     rmBuffSize += 7;
01937                     break;
01938                 case 'Z':
01939                     rmBuffSize += 4 + getString(offset).Length();
01940                     break;
01941                 default:
01942                     myStatus.setStatus(SamStatus::INVALID, 
01943                                        "rmTag called with unknown type.\n");
01944                     returnStatus = false;
01945                     break;
01946             };
01947             
01948             // Remove from the hash.
01949             extras.Delete(offset);
01950         }
01951         // Increment to the next tag.
01952         if(currentTagPtr[4] == ';')
01953         {
01954             // Increment once more.
01955             currentTagPtr += 5;
01956         }
01957         else if(currentTagPtr[4] != '\0')
01958         {
01959             // Invalid tag format. 
01960             myStatus.setStatus(SamStatus::INVALID, 
01961                                "rmTags called with improperly formatted tags.\n");
01962             returnStatus = false;
01963             break;
01964         }
01965         else
01966         {
01967             // Last Tag.
01968             currentTagPtr += 4;
01969         }
01970     }
01971 
01972     // The buffer tags are now out of sync.
01973     myNeedToSetTagsInBuffer = true;
01974     myIsTagsBufferValid = false;
01975     myIsBufferSynced = false;
01976     myTagBufferSize -= rmBuffSize;
01977     
01978 
01979     return(returnStatus);
01980 }
01981 
01982 
01983 // Return the error after a failed SamRecord call.
01984 const SamStatus& SamRecord::getStatus()
01985 {
01986     return(myStatus);
01987 }
01988 
01989 
01990 bool SamRecord::getTagsString(const char* tags, String& returnString, char delim)
01991 {
01992     const char* currentTagPtr = tags;
01993 
01994     returnString.Clear();
01995     myStatus = SamStatus::SUCCESS;
01996     if(myNeedToSetTagsFromBuffer)
01997     {
01998         if(!setTagsFromBuffer())
01999         {
02000             // Failed to read the tags from the buffer, so cannot
02001             // get tags.
02002             return(false);
02003         }
02004     }
02005     
02006     bool returnStatus = true;
02007 
02008     while(*currentTagPtr != '\0')
02009     {
02010         // Tags are formatted as: XY:Z
02011         // Where X is [A-Za-z], Y is [A-Za-z], and
02012         // Z is A,i,f,Z,H (cCsSI are also excepted)
02013         if((currentTagPtr[0] == '\0') || (currentTagPtr[1] == '\0') ||
02014            (currentTagPtr[2] != ':') || (currentTagPtr[3] == '\0'))
02015         {
02016             myStatus.setStatus(SamStatus::INVALID, 
02017                                "getTagsString called with improperly formatted tags.\n");
02018             returnStatus = false;
02019             break;
02020         }
02021 
02022         // Construct the key.
02023         int key = MAKEKEY(currentTagPtr[0], currentTagPtr[1], 
02024                           currentTagPtr[3]);
02025         // Look to see if the key exsists in the hash.
02026         int offset = extras.Find(key);
02027 
02028         if(offset >= 0)
02029         {
02030             // Offset is set, so the key was found.
02031             if(!returnString.IsEmpty())
02032             {
02033                 returnString += delim;
02034             }
02035             returnString += currentTagPtr[0];
02036             returnString += currentTagPtr[1];
02037             returnString += ':';
02038             returnString += currentTagPtr[3];
02039             returnString += ':';
02040 
02041             // First if it is an integer, determine the actual type of the int.
02042             char vtype;
02043             getTypeFromKey(key, vtype);
02044 
02045             
02046             // Offset is set, so recalculate the buffer size without this entry.
02047             // Do NOT remove from strings, integers, or doubles because then
02048             // extras would need to be updated for all entries with the new indexes
02049             // into those variables.
02050             switch(vtype)
02051             {
02052                 case 'i':
02053                     returnString += *(int*)getIntegerPtr(offset, vtype);
02054                     break;
02055                 case 'f':
02056                     returnString += *(double*)getDoublePtr(offset);
02057                     break;
02058                 case 'Z':
02059                     returnString += *(String*)getStringPtr(offset);
02060                     break;
02061                 default:
02062                     myStatus.setStatus(SamStatus::INVALID, 
02063                                        "rmTag called with unknown type.\n");
02064                     returnStatus = false;
02065                     break;
02066             };
02067         }
02068         // Increment to the next tag.
02069         if(currentTagPtr[4] == ';')
02070         {
02071             // Increment once more.
02072             currentTagPtr += 5;
02073         }
02074         else if(currentTagPtr[4] != '\0')
02075         {
02076             // Invalid tag format. 
02077             myStatus.setStatus(SamStatus::INVALID, 
02078                                "rmTags called with improperly formatted tags.\n");
02079             returnStatus = false;
02080             break;
02081         }
02082         else
02083         {
02084             // Last Tag.
02085             currentTagPtr += 4;
02086         }
02087     }
02088     return(returnStatus);
02089 }
02090 
02091 
02092 String* SamRecord::getStringTag(const char * tag)
02093 {
02094     // Parse the buffer if necessary.
02095     if(myNeedToSetTagsFromBuffer)
02096     {
02097         if(!setTagsFromBuffer())
02098         {
02099             // Failed to read the tags from the buffer, so cannot
02100             // get tags.  setTagsFromBuffer set the errors,
02101             // so just return null.
02102             return(NULL);
02103         }
02104     }
02105     
02106     int key = MAKEKEY(tag[0], tag[1], 'Z');
02107     int offset = extras.Find(key);
02108 
02109     int value;
02110     if (offset < 0)
02111     {
02112         // Tag not found.
02113         return(NULL);
02114     }
02115 
02116     // Offset is valid, so return the tag.
02117     value = extras[offset];
02118     return(&(strings[value]));
02119 }
02120 
02121 
02122 int* SamRecord::getIntegerTag(const char * tag)
02123 {
02124     // Init to success.
02125     myStatus = SamStatus::SUCCESS;
02126     // Parse the buffer if necessary.
02127     if(myNeedToSetTagsFromBuffer)
02128     {
02129         if(!setTagsFromBuffer())
02130         {
02131             // Failed to read the tags from the buffer, so cannot
02132             // get tags.  setTagsFromBuffer set the errors,
02133             // so just return null.
02134             return(NULL);
02135         }
02136     }
02137     
02138     int key = MAKEKEY(tag[0], tag[1], 'i');
02139     int offset = extras.Find(key);
02140 
02141     int value;
02142     if (offset < 0)
02143     {
02144         // Failed to find the tag.
02145         return(NULL);
02146     }
02147     else
02148         value = extras[offset];
02149 
02150     return(&(integers[value]));
02151 }
02152 
02153 
02154 double* SamRecord::getDoubleTag(const char * tag)
02155 {
02156     // Init to success.
02157     myStatus = SamStatus::SUCCESS;
02158     // Parse the buffer if necessary.
02159     if(myNeedToSetTagsFromBuffer)
02160     {
02161         if(!setTagsFromBuffer())
02162         {
02163             // Failed to read the tags from the buffer, so cannot
02164             // get tags.  setTagsFromBuffer set the errors,
02165             // so just return null.
02166             return(NULL);
02167         }
02168     }
02169     
02170     int key = MAKEKEY(tag[0], tag[1], 'f');
02171     int offset = extras.Find(key);
02172 
02173     int value;
02174     if (offset < 0)
02175     {
02176         // Failed to find the tag.
02177         return(NULL);
02178     }
02179     else
02180         value = extras[offset];
02181 
02182     return(&(doubles[value]));
02183 }
02184 
02185 
02186 String & SamRecord::getString(const char * tag)
02187 {
02188     // Init to success.
02189     myStatus = SamStatus::SUCCESS;
02190     // Parse the buffer if necessary.
02191     if(myNeedToSetTagsFromBuffer)
02192     {
02193         if(!setTagsFromBuffer())
02194         {
02195             // Failed to read the tags from the buffer, so cannot
02196             // get tags.
02197             // TODO - what do we want to do on failure?            
02198         }
02199     }
02200     
02201     int key = MAKEKEY(tag[0], tag[1], 'Z');
02202     int offset = extras.Find(key);
02203 
02204     int value;
02205     if (offset < 0)
02206     {
02207         // TODO - what do we want to do on failure?
02208         return(NOT_FOUND_TAG_STRING);
02209     }
02210     else
02211         value = extras[offset];
02212 
02213     return strings[value];
02214 }
02215 
02216 int & SamRecord::getInteger(const char * tag)
02217 {
02218     // Init to success.
02219     myStatus = SamStatus::SUCCESS;
02220     // Parse the buffer if necessary.
02221     if(myNeedToSetTagsFromBuffer)
02222     {
02223         if(!setTagsFromBuffer())
02224         {
02225             // Failed to read the tags from the buffer, so cannot
02226             // get tags.  setTagsFromBuffer set the error.
02227             // TODO - what do we want to do on failure?
02228         }
02229     }
02230     
02231     int key = MAKEKEY(tag[0], tag[1], 'i');
02232     int offset = extras.Find(key);
02233 
02234     int value;
02235     if (offset < 0)
02236     {
02237         // TODO - what do we want to do on failure?
02238         return NOT_FOUND_TAG_INT;
02239     }
02240     else
02241         value = extras[offset];
02242 
02243     return integers[value];
02244 }
02245 
02246 
02247 double & SamRecord::getDouble(const char * tag)
02248 {
02249     // Init to success.
02250     myStatus = SamStatus::SUCCESS;
02251     // Parse the buffer if necessary.
02252     if(myNeedToSetTagsFromBuffer)
02253     {
02254         if(!setTagsFromBuffer())
02255         {
02256             // Failed to read the tags from the buffer, so cannot
02257             // get tags.  setTagsFromBuffer set the error.
02258             // TODO - what do we want to do on failure?
02259         }
02260     }
02261     
02262     int key = MAKEKEY(tag[0], tag[1], 'f');
02263     int offset = extras.Find(key);
02264 
02265     int value;
02266     if (offset < 0)
02267     {
02268         // TODO - what do we want to do on failure?
02269         return NOT_FOUND_TAG_DOUBLE;
02270     }
02271     else
02272         value = extras[offset];
02273 
02274     return doubles[value];
02275 }
02276 
02277 
02278 bool SamRecord::checkTag(const char * tag, char type)
02279 {
02280     // Init to success.
02281     myStatus = SamStatus::SUCCESS;
02282     // Parse the buffer if necessary.
02283     if(myNeedToSetTagsFromBuffer)
02284     {
02285         if(!setTagsFromBuffer())
02286         {
02287             // Failed to read the tags from the buffer, so cannot
02288             // get tags.  setTagsFromBuffer set the error.
02289             return("");
02290         }
02291     }
02292     
02293     int key = MAKEKEY(tag[0], tag[1], type);
02294 
02295     return (extras.Find(key) != LH_NOTFOUND);
02296 }
02297 
02298 
02299 // Return the number of bases in this read that overlap the passed in
02300 // region.  (start & end are 0-based)
02301 uint32_t SamRecord::getNumOverlaps(int32_t start, int32_t end)
02302 {
02303     // Determine whether or not the cigar has been parsed, which sets up
02304     // the cigar roller.  This is determined by checking the alignment length.
02305     if(myAlignmentLength == -1)
02306     {
02307         parseCigar();
02308     }
02309     return(myCigarRoller.getNumOverlaps(start, end, get0BasedPosition()));
02310 }
02311 
02312 
02313 // // Get the Extra field: <tag>:<vtype>:<value> for the key.
02314 // void SamRecord::getSamExtraFieldFromKey(int key, String& extraField) 
02315 // {
02316 //     myStatus = SamStatus::SUCCESS;
02317 //     // Extract the tag from the key.
02318 //     char tag[3];
02319 //     getTag(key, tag);
02320 
02321 //     char vtype;
02322 //     getVtype(key, vtype);
02323 
02324 //     // Look up the key to get the offset into the appropriate array.
02325 //     int offset = extras.Find(key);
02326 
02327 //     // Get the value associated with the key based on the vtype.
02328 //     switch (vtype)
02329 //     {
02330 //         case 'A' :
02331 //             extraField = tag;
02332 //             extraField += ":A:";
02333 //             extraField += (char)(getInteger(offset));
02334 //             break;
02335 //         case 'f' :
02336 //             extraField = tag;
02337 //             extraField += ":f:";
02338 //             extraField += getDouble(offset);
02339 //             break;
02340 //         case 'c' :
02341 //         case 'C' :
02342 //         case 's' :
02343 //         case 'S' :
02344 //         case 'i' :
02345 //         case 'I' :
02346 //             extraField = tag;
02347 //             extraField += ":i:";
02348 //             extraField += getInteger(offset);
02349 //             break;
02350 //         case 'Z' :
02351 //             //         String valueString = getString(offset).c_str();
02352 //             extraField = tag;
02353 //             extraField += ":Z:";
02354 //             extraField += getString(offset);
02355 //             break;
02356 //         default:
02357 //             myStatus.setStatus(SamStatus::FAIL_PARSE,
02358 //                                "Unknown tag type.");
02359 //             break;
02360 //     }
02361 // }
02362 
02363 
02364 // Allocate space for the record - does a realloc.  
02365 // The passed in size is the size of the entire record including the
02366 // block size field.
02367 bool SamRecord::allocateRecordStructure(int size)
02368 {
02369     if (allocatedSize < size)
02370     {
02371         bamRecordStruct* tmpRecordPtr = 
02372             (bamRecordStruct *)realloc(myRecordPtr, size);
02373         if(tmpRecordPtr == NULL)
02374         {
02375             // FAILED to allocate memory
02376             fprintf(stderr, "FAILED TO ALLOCATE MEMORY!!!");
02377             myStatus.addError(SamStatus::FAIL_MEM, "Failed Memory Allocation.");
02378             return(false);
02379         }
02380         // Successfully allocated memory, so set myRecordPtr.
02381         myRecordPtr = tmpRecordPtr;
02382         allocatedSize = size;
02383     }
02384     return(true);
02385 }
02386 
02387 
02388 // Index is the index into the strings array.
02389 void* SamRecord::getStringPtr(int index)
02390 {
02391     int value = extras[index];
02392 
02393     return &(strings[value]);
02394 }
02395 
02396 void* SamRecord::getIntegerPtr(int offset, char& type)
02397 {
02398     int value = extras[offset];
02399 
02400     type = intType[value];
02401 
02402     return &(integers[value]);
02403 }
02404 
02405 void* SamRecord::getDoublePtr(int offset)
02406 {
02407     int value = extras[offset];
02408 
02409     return &(doubles[value]);
02410 }
02411 
02412 
02413 // Fixes the buffer to match the variable length fields if they are set.
02414 bool SamRecord::fixBuffer(SequenceTranslation translation)
02415 {
02416     // Check to see if the buffer is already synced.
02417     if(myIsBufferSynced &&
02418        (myBufferSequenceTranslation == translation))
02419     {
02420         // Already synced, nothing to do.
02421         return(true);
02422     }
02423    
02424     // Set the bin if necessary.
02425     if(!myIsBinValid)
02426     {
02427         // The bin that is set in the record is not valid, so
02428         // reset it.
02429         myRecordPtr->myBin = 
02430             bam_reg2bin(myRecordPtr->myPosition, get1BasedAlignmentEnd());      
02431         myIsBinValid = true;
02432     }
02433 
02434     // Not synced.
02435     bool status = true;
02436 
02437     // First determine the size the buffer needs to be.
02438     uint8_t newReadNameLen = getReadNameLength();
02439     uint16_t newCigarLen = getCigarLength();
02440     int32_t newReadLen = getReadLength();
02441     uint32_t newTagLen = getTagLength();
02442     uint32_t bamSequenceLen = (newReadLen+1)/2;
02443 
02444     // The buffer size extends from the start of the record to data
02445     // plus the length of the variable fields,
02446     // Multiply the cigar length by 4 since it is the number of uint32_t fields.
02447     int newBufferSize = 
02448         ((unsigned char*)(&(myRecordPtr->myData)) - 
02449          (unsigned char*)myRecordPtr) +
02450         newReadNameLen + ((newCigarLen)*4) +
02451         newReadLen + bamSequenceLen + newTagLen;
02452    
02453     if(!allocateRecordStructure(newBufferSize))
02454     {
02455         // Failed to allocate space.
02456         return(false);
02457     }
02458 
02459     // Now that space has been added to the buffer, check to see what if
02460     // any fields need to be extracted from the buffer prior to starting to
02461     // overwrite it.  Fields need to be extracted from the buffer if the 
02462     // buffer is valid for the field and a previous variable length field has
02463     // changed length.
02464     bool readNameLenChange = (newReadNameLen != myRecordPtr->myReadNameLength);
02465     bool cigarLenChange = (newCigarLen != myRecordPtr->myCigarLength);
02466     bool readLenChange = (newReadLen != myRecordPtr->myReadLength);
02467 
02468     // If the tags are still stored in the buffer and any other fields changed
02469     // lengths, they need to be extracted.
02470     if(myIsTagsBufferValid &&
02471        (readNameLenChange | cigarLenChange | readLenChange))
02472     {
02473         status &= setTagsFromBuffer();
02474         // The tag buffer will not be valid once the other fields
02475         // are written, so set it to not valid.
02476         myIsTagsBufferValid = false;
02477     }
02478 
02479     // If the sequence or quality strings are still stored in the buffer, and
02480     // any of the previous fields have changed length, extract it from the
02481     // current buffer.
02482     if((myIsQualityBufferValid | myIsSequenceBufferValid) && 
02483        (readNameLenChange | cigarLenChange | readLenChange))
02484     {
02485         setSequenceAndQualityFromBuffer();
02486         // The quality and sequence buffers will not be valid once the other
02487         // fields are written, so set them to not valid.
02488         myIsQualityBufferValid = false;
02489         myIsSequenceBufferValid = false;
02490     }
02491 
02492     // If the cigar is still stored in the buffer, and any of the
02493     // previous fields have changed length, extract it from the current buffer.
02494     if((myIsCigarBufferValid) && 
02495        (readNameLenChange))
02496     {
02497         status &= parseCigarBinary();
02498         myIsCigarBufferValid = false;
02499     }
02500 
02501     // Set each value in the buffer if it is not already valid.
02502     if(!myIsReadNameBufferValid)
02503     {
02504         memcpy(&(myRecordPtr->myData), myReadName.c_str(), 
02505                newReadNameLen);
02506    
02507         // Set the new ReadNameLength.
02508         myRecordPtr->myReadNameLength = newReadNameLen;
02509         myIsReadNameBufferValid = true;
02510     }
02511 
02512     unsigned char * readNameEnds = (unsigned char*)(&(myRecordPtr->myData)) +
02513         myRecordPtr->myReadNameLength;
02514    
02515     // Set the Cigar.  Need to reformat from the string to 
02516     unsigned int * packedCigar = (unsigned int *) (void *) readNameEnds;
02517       
02518     if(!myIsCigarBufferValid)
02519     {
02520         // The cigar was already parsed when it was set, so just copy
02521         // data from the temporary buffer.
02522         myRecordPtr->myCigarLength = newCigarLen;
02523         memcpy(packedCigar, myCigarTempBuffer, 
02524                myRecordPtr->myCigarLength * sizeof(uint32_t));
02525       
02526         myIsCigarBufferValid = true;
02527     }
02528 
02529     unsigned char * packedSequence = readNameEnds + 
02530         myRecordPtr->myCigarLength * sizeof(int);
02531     unsigned char * packedQuality = packedSequence + bamSequenceLen;
02532    
02533     if(!myIsSequenceBufferValid || !myIsQualityBufferValid || 
02534        (myBufferSequenceTranslation != translation))
02535     {
02536         myRecordPtr->myReadLength = newReadLen;
02537         // Determine if the quality needs to be set and is just a * and needs to
02538         // be set to 0xFF.
02539         bool noQuality = false;
02540         if((myQuality.Length() == 1) && (myQuality[0] == '*'))
02541         {
02542             noQuality = true;
02543         }
02544       
02545         const char* translatedSeq = NULL;
02546         // If the sequence is not valid in the buffer or it is not
02547         // properly translated, get the properly translated sequence
02548         // that needs to be put into the buffer.
02549         if((!myIsSequenceBufferValid) ||
02550            (translation != myBufferSequenceTranslation))
02551         {
02552             translatedSeq = getSequence(translation);
02553         }
02554 
02555         for (int i = 0; i < myRecordPtr->myReadLength; i++) 
02556         {
02557             if((!myIsSequenceBufferValid) ||
02558                (translation != myBufferSequenceTranslation))
02559             {
02560                 // Sequence buffer is not valid, so set the sequence.
02561                 int seqVal = 0;
02562                 switch(translatedSeq[i])
02563                 {
02564                     case '=':
02565                         seqVal = 0;
02566                         break;
02567                     case 'A':
02568                     case 'a':
02569                         seqVal = 1;
02570                         break;
02571                     case 'C':
02572                     case 'c':
02573                         seqVal = 2;
02574                         break;
02575                     case 'G':
02576                     case 'g':
02577                         seqVal = 4;
02578                         break;
02579                     case 'T':
02580                     case 't':
02581                         seqVal = 8;
02582                         break;
02583                     case 'N':
02584                     case 'n':
02585                     case '.':
02586                         seqVal = 15;
02587                         break;
02588                     default:
02589                         myStatus.addError(SamStatus::FAIL_PARSE,
02590                                           "Unknown Sequence character found.");
02591                         status = false;
02592                         break;
02593                 };
02594             
02595                 if(i & 1)
02596                 {
02597                     // Odd number i's go in the lower 4 bits, so OR in the
02598                     // lower bits
02599                     packedSequence[i/2] |= seqVal;
02600                 }
02601                 else
02602                 {
02603                     // Even i's go in the upper 4 bits and are always set first.
02604                     packedSequence[i/2] = seqVal << 4;
02605                 }
02606             }
02607 
02608             if(!myIsQualityBufferValid)
02609             {
02610                 // Set the quality.
02611                 if((noQuality) || (myQuality.Length() <= i))
02612                 {
02613                     // No quality or the quality is smaller than the sequence,
02614                     // so set it to 0xFF
02615                     packedQuality[i] = 0xFF;
02616                 }
02617                 else
02618                 {
02619                     // Copy the quality string.
02620                     packedQuality[i] = myQuality[i] - 33;
02621                 }
02622             }
02623         }
02624         myIsQualityBufferValid = true;
02625         myIsSequenceBufferValid = true;
02626         myBufferSequenceTranslation = translation;
02627     }
02628 
02629     if(!myIsTagsBufferValid)
02630     {
02631         status &= setTagsInBuffer();
02632     }
02633 
02634     // Set the lengths in the buffer.
02635     myRecordPtr->myReadNameLength = newReadNameLen;
02636     myRecordPtr->myCigarLength = newCigarLen;
02637     myRecordPtr->myReadLength = newReadLen;
02638 
02639     // Set the buffer block size to the size of the buffer minus the
02640     // first field.
02641     myRecordPtr->myBlockSize = newBufferSize - sizeof(int32_t);
02642 
02643     if(status)
02644     {
02645         myIsBufferSynced = true;
02646     }
02647 
02648     return(status);
02649 }
02650 
02651 
02652 // Sets the Sequence and Quality strings from the buffer.
02653 // They are done together in one method because they require the same
02654 // loop, so might as well be done at the same time.
02655 void SamRecord::setSequenceAndQualityFromBuffer()
02656 {
02657     // NOTE: If the sequence buffer is not valid, do not set the sequence
02658     // string from the buffer.
02659     // NOTE: If the quality buffer is not valid, do not set the quality string
02660     // from the buffer.
02661 
02662     // Extract the sequence if the buffer is valid and the string's length is 0.
02663     bool extractSequence = false;
02664     if(myIsSequenceBufferValid && (mySequence.Length() == 0))
02665     {
02666         extractSequence = true;
02667     }
02668 
02669     // Extract the quality if the buffer is valid and the string's length is 0.
02670     bool extractQuality = false;
02671     if(myIsQualityBufferValid && (myQuality.Length() == 0))
02672     {
02673         extractQuality = true;
02674     }
02675 
02676     // If neither the quality nor the sequence need to be extracted,
02677     // just return.
02678     if(!extractSequence && !extractQuality)
02679     {
02680         return;
02681     }
02682 
02683     // Set the sequence and quality strings..
02684     if(extractSequence)
02685     {
02686         mySequence.SetLength(myRecordPtr->myReadLength);
02687     }
02688     if(extractQuality)
02689     {
02690         myQuality.SetLength(myRecordPtr->myReadLength);
02691     }
02692    
02693     unsigned char * readNameEnds = 
02694         (unsigned char *)myRecordPtr->myData + myRecordPtr->myReadNameLength;
02695     unsigned char * packedSequence = 
02696         readNameEnds + myRecordPtr->myCigarLength * sizeof(int);
02697     unsigned char * packedQuality = 
02698         packedSequence + (myRecordPtr->myReadLength + 1) / 2;
02699 
02700     const char * asciiBases = "=AC.G...T......N";
02701 
02702     // Flag to see if the quality is specified - the quality contains a value
02703     // other than 0xFF.  If all values are 0xFF, then there is no quality.
02704     bool qualitySpecified = false;
02705 
02706     for (int i = 0; i < myRecordPtr->myReadLength; i++)
02707     {
02708         if(extractSequence)
02709         {
02710             mySequence[i] = i & 1 ?
02711                 asciiBases[packedSequence[i / 2] & 0xF] :
02712                 asciiBases[packedSequence[i / 2] >> 4];
02713         }
02714 
02715         if(extractQuality)
02716         {
02717             if(packedQuality[i] != 0xFF)
02718             {
02719                 // Quality is specified, so mark the flag.
02720                 qualitySpecified = true;
02721             }
02722 
02723             myQuality[i] = packedQuality[i] + 33;
02724         }
02725     }
02726 
02727     // If the read length is 0, then set the sequence and quality to '*'
02728     if(myRecordPtr->myReadLength == 0)
02729     {
02730         if(extractSequence)
02731         {
02732             mySequence = "*";
02733         }
02734         if(extractQuality)
02735         {
02736             myQuality = "*";
02737         }
02738     }
02739     else if(extractQuality && !qualitySpecified)
02740     {
02741         // No quality was specified, so set it to "*"
02742         myQuality = "*";
02743     }
02744 }
02745 
02746 
02747 // Parse the cigar to calculate the alignment/unclipped end.
02748 bool SamRecord::parseCigar()
02749 {
02750     // Determine if the cigar string or cigar binary needs to be parsed.
02751     if(myCigar.Length() == 0)
02752     {
02753         // The cigar string is not yet set, so parse the binary.
02754         return(parseCigarBinary());
02755     }
02756     return(parseCigarString());
02757 }
02758 
02759 // Parse the cigar to calculate the alignment/unclipped end.
02760 bool SamRecord::parseCigarBinary()
02761 {
02762     // Only need to parse if the string is not already set.
02763     // The length of the cigar string is set to zero when the 
02764     // record is read from a file into the buffer.
02765     if(myCigar.Length() != 0)
02766     {
02767         // Already parsed.
02768         return(true);
02769     }
02770 
02771     unsigned char * readNameEnds = 
02772         (unsigned char *)myRecordPtr->myData + myRecordPtr->myReadNameLength;
02773    
02774     unsigned int * packedCigar = (unsigned int *) (void *) readNameEnds;
02775 
02776     myCigarRoller.Set(packedCigar, myRecordPtr->myCigarLength);
02777     
02778     myCigarRoller.getCigarString(myCigar);
02779 
02780     myAlignmentLength = myCigarRoller.getExpectedReferenceBaseCount();
02781 
02782     myUnclippedStartOffset = myCigarRoller.getNumBeginClips();
02783     myUnclippedEndOffset = myCigarRoller.getNumEndClips();
02784 
02785     // if the cigar length is 0, then set the cigar string to "*"
02786     if(myRecordPtr->myCigarLength == 0)
02787     {
02788         myCigar = "*";
02789         return(true);
02790     }
02791 
02792     // Copy the cigar into a temporary buffer.
02793     int newBufferSize = myRecordPtr->myCigarLength * sizeof(uint32_t);
02794     if(newBufferSize > myCigarTempBufferAllocatedSize)
02795     {
02796         uint32_t* tempBufferPtr = 
02797             (uint32_t*)realloc(myCigarTempBuffer, newBufferSize);
02798         if(tempBufferPtr == NULL)
02799         {
02800             // Failed to allocate memory.
02801             // Do not parse, just return.
02802             fprintf(stderr, "FAILED TO ALLOCATE MEMORY!!!");
02803             myStatus.addError(SamStatus::FAIL_MEM,
02804                               "Failed to Allocate Memory.");
02805             return(false);
02806         }
02807         myCigarTempBuffer = tempBufferPtr;
02808         myCigarTempBufferAllocatedSize = newBufferSize;
02809     }
02810 
02811     memcpy(myCigarTempBuffer, packedCigar, 
02812            myRecordPtr->myCigarLength * sizeof(uint32_t));
02813 
02814     // Set the length of the temp buffer.
02815     myCigarTempBufferLength = myRecordPtr->myCigarLength;
02816 
02817     return(true);
02818 }
02819 
02820 // Parse the cigar string to calculate the cigar length and alignment end.
02821 bool SamRecord::parseCigarString()
02822 {
02823     myCigarTempBufferLength = 0;
02824     if(myCigar == "*")
02825     {
02826         // Cigar is empty, so initialize the variables.
02827         myAlignmentLength = 0;
02828         myUnclippedStartOffset = 0;
02829         myUnclippedEndOffset = 0;
02830         myCigarRoller.clear();
02831         return(true);
02832     }
02833 
02834     myCigarRoller.Set(myCigar);
02835     
02836     myAlignmentLength = myCigarRoller.getExpectedReferenceBaseCount();
02837     
02838     myUnclippedStartOffset = myCigarRoller.getNumBeginClips();
02839     myUnclippedEndOffset = myCigarRoller.getNumEndClips();
02840 
02841     // Check to see if the Temporary Cigar Buffer is large enough to contain
02842     // this cigar.  If we make it the size of the length of the cigar string,
02843     // it will be more than large enough.
02844     int newBufferSize = myCigar.Length() * sizeof(uint32_t);
02845     if(newBufferSize > myCigarTempBufferAllocatedSize)
02846     {
02847         uint32_t* tempBufferPtr = 
02848             (uint32_t*)realloc(myCigarTempBuffer, newBufferSize);
02849         if(tempBufferPtr == NULL)
02850         {
02851             // Failed to allocate memory.
02852             // Do not parse, just return.
02853             fprintf(stderr, "FAILED TO ALLOCATE MEMORY!!!");
02854             myStatus.addError(SamStatus::FAIL_MEM,
02855                               "Failed to Allocate Memory.");
02856             return(false);
02857         }
02858         myCigarTempBuffer = tempBufferPtr;
02859         myCigarTempBufferAllocatedSize = newBufferSize;
02860     }
02861 
02862     // Track if there were any errors.
02863     bool status = true;
02864 
02865     // Track the index into the cigar string that is being parsed.
02866     char *cigarOp;
02867     const char* cigarEntryStart = myCigar.c_str();
02868     int opLen = 0;
02869     int op = 0;
02870 
02871     unsigned int * packedCigar = myCigarTempBuffer;
02872     // TODO - maybe one day make a cigar list... or maybe make a 
02873     // reference cigar string for ease of lookup....
02874     const char* endCigarString = cigarEntryStart + myCigar.Length();
02875     while(cigarEntryStart < endCigarString)
02876     {
02877         bool validCigarEntry = true;
02878         // Get the opLen from the string.  cigarOp will then point to 
02879         // the operation.
02880         opLen = strtol(cigarEntryStart, &cigarOp, 10);
02881         // Switch on the type of operation.
02882         switch(*cigarOp)
02883         {
02884             case('M'):
02885                 op = 0;
02886                 break;
02887             case('I'):
02888                 // Insert into the reference position, so do not increment the
02889                 // reference end position.
02890                 op = 1;
02891                 break;
02892             case('D'):
02893                 op = 2;
02894                 break;
02895             case('N'):
02896                 op = 3;
02897                 break;
02898             case('S'):
02899                 op = 4;
02900                 break;
02901             case('H'):
02902                 op = 5;
02903                 break;
02904             case('P'):
02905                 op = 6;
02906                 break;
02907             default:
02908                 fprintf(stderr, "ERROR parsing cigar\n");
02909                 validCigarEntry = false;
02910                 status = false;
02911                 myStatus.addError(SamStatus::FAIL_PARSE,
02912                                   "Unknown operation found when parsing the Cigar.");
02913                 break;
02914         };
02915         if(validCigarEntry)
02916         {
02917             // Increment the cigar length.
02918             ++myCigarTempBufferLength;
02919             *packedCigar = (opLen << 4) | op;
02920             packedCigar++;
02921         }
02922         // The next Entry starts at one past the cigar op, so set the start.
02923         cigarEntryStart = ++cigarOp;
02924     }
02925 
02926     // Check clipLength to adjust the end position.
02927     return(status);
02928 }
02929 
02930 
02931 bool SamRecord::setTagsFromBuffer()
02932 {
02933     // If the tags do not need to be set from the buffer, return true.
02934     if(myNeedToSetTagsFromBuffer == false)
02935     {
02936         // Already been set from the buffer.
02937         return(true);
02938     }
02939 
02940     // Mark false, as they are being set now.
02941     myNeedToSetTagsFromBuffer = false;
02942 
02943     unsigned char * readNameEnds = 
02944         (unsigned char *)myRecordPtr->myData + myRecordPtr->myReadNameLength;
02945     unsigned char * packedSequence = 
02946         readNameEnds + myRecordPtr->myCigarLength * sizeof(int);
02947     unsigned char * packedQuality = 
02948         packedSequence + (myRecordPtr->myReadLength + 1) / 2;
02949 
02950     unsigned char * extraPtr = packedQuality + myRecordPtr->myReadLength;
02951 
02952     // Default to success, will be changed to false on failure.
02953     bool status = true;
02954 
02955     // Clear any previously set tags.
02956     clearTags();
02957     while (myRecordPtr->myBlockSize + 4 - 
02958            (extraPtr - (unsigned char *)myRecordPtr) > 0)
02959     {
02960         int key;
02961         int value;
02962         void * content = extraPtr + 3;
02963         int tagBufferSize = 0;
02964 
02965         key = MAKEKEY(extraPtr[0], extraPtr[1], extraPtr[2]);
02966       
02967         switch (extraPtr[2])
02968         {
02969             case 'A' :
02970                 value = integers.Length();
02971                 integers.Push(* (char *) content);
02972                 intType.push_back(extraPtr[2]);
02973                 extraPtr += 4;
02974                 tagBufferSize += 4;
02975                 break;
02976             case 'c' :
02977                 value = integers.Length();
02978                 integers.Push(* (char *) content);
02979                 intType.push_back(extraPtr[2]);
02980                 extraPtr += 4;
02981                 tagBufferSize += 4;
02982                 break;
02983             case 'C' :
02984                 value = integers.Length();
02985                 integers.Push(* (unsigned char *) content);
02986                 intType.push_back(extraPtr[2]);
02987                 extraPtr += 4;
02988                 tagBufferSize += 4;
02989                 break;
02990             case 's' :
02991                 value = integers.Length();
02992                 integers.Push(* (short *) content);
02993                 intType.push_back(extraPtr[2]);
02994                 extraPtr += 5;
02995                 tagBufferSize += 5;
02996                 break;
02997             case 'S' :
02998                 value = integers.Length();
02999                 integers.Push(* (unsigned short *) content);
03000                 intType.push_back(extraPtr[2]);
03001                 extraPtr += 5;
03002                 tagBufferSize += 5;
03003                 break;
03004             case 'i' :
03005                 value = integers.Length();
03006                 integers.Push(* (int *) content);
03007                 intType.push_back(extraPtr[2]);
03008                 extraPtr += 7;
03009                 tagBufferSize += 7;
03010                 break;
03011             case 'I' :
03012                 value = integers.Length();
03013                 integers.Push((int) * (unsigned int *) content);
03014                 intType.push_back(extraPtr[2]);
03015                 extraPtr += 7;
03016                 tagBufferSize += 7;
03017                 break;
03018             case 'Z' :
03019                 value = strings.Length();
03020                 strings.Push((const char *) content);
03021                 extraPtr += 4 + strings.Last().Length();
03022                 tagBufferSize += 4 + strings.Last().Length();
03023                 break;
03024             case 'f' :
03025                 value = doubles.Length();
03026                 doubles.Push(* (float *) content);
03027                 fprintf(stderr, "\n\nFLOAT!!!\n\n");
03028                 extraPtr += 7;
03029                 tagBufferSize += 7;
03030                 break;
03031             default :
03032                 fprintf(stderr, 
03033                         "parsing BAM - Unknown custom field of type %c%c:%c\n",
03034                         extraPtr[0], extraPtr[1], extraPtr[2]);
03035                 // Failed on read.
03036                 // Increment extraPtr just by the size of the 3 known fields
03037                 extraPtr += 3;
03038                 myStatus.addError(SamStatus::FAIL_PARSE,
03039                                   "Unknown tag type.");
03040                 status = false;
03041         }
03042 
03043         // Only add the tag if it has so far been successfully processed.
03044         if(status)
03045         {
03046             extras.Add(key, value);
03047             myTagBufferSize += tagBufferSize;
03048         }
03049     }
03050     return(status);
03051 }
03052 
03053 
03054 bool SamRecord::setTagsInBuffer()
03055 {
03056     // The buffer size extends from the start of the record to data
03057     // plus the length of the variable fields,
03058     // Multiply the cigar length by 4 since it is the number of uint32_t fields.
03059     int bamSequenceLength = (myRecordPtr->myReadLength+1)/2;
03060     int newBufferSize = ((unsigned char*)(&(myRecordPtr->myData)) - 
03061                          (unsigned char*)myRecordPtr) +  
03062         myRecordPtr->myReadNameLength + ((myRecordPtr->myCigarLength)*4) +
03063         myRecordPtr->myReadLength + bamSequenceLength + myTagBufferSize;
03064 
03065     // Make sure the buffer is big enough.
03066     if(!allocateRecordStructure(newBufferSize))
03067     {
03068         // Failed to allocate space.
03069         return(false);
03070     }
03071 
03072     unsigned char * readNameEnds = (unsigned char*)(&(myRecordPtr->myData)) +
03073         myRecordPtr->myReadNameLength;
03074     unsigned char * packedSequence = readNameEnds + 
03075         myRecordPtr->myCigarLength * sizeof(int);
03076     unsigned char * packedQuality = 
03077         packedSequence + bamSequenceLength;
03078    
03079     char * extraPtr = (char*)packedQuality + myRecordPtr->myReadLength;
03080 
03081     bool status = true;
03082 
03083     // Set the tags in the buffer.
03084     if (extras.Entries())
03085     {
03086         for (int i = 0; i < extras.Capacity(); i++)
03087         {
03088             if (extras.SlotInUse(i))
03089             {
03090                 int key = extras.GetKey(i);
03091                 getTag(key, extraPtr);
03092                 extraPtr += 2;
03093                 char vtype;
03094                 getTypeFromKey(key, vtype);
03095  
03096                 if(vtype == 'i')
03097                 {
03098                     vtype = getIntegerType(i);
03099                 }
03100 
03101                 extraPtr[0] = vtype;
03102 
03103                 // increment the pointer to where the value is.
03104                 extraPtr += 1;
03105 
03106                 switch (vtype)
03107                 {
03108                     case 'A' :
03109                         *(char*)extraPtr = (char)getInteger(i);
03110                         // sprintf(extraPtr, "%d", getInteger(i));
03111                         extraPtr += 1;
03112                         break;
03113                     case 'c' :
03114                         *(int8_t*)extraPtr = (int8_t)getInteger(i);
03115                         // sprintf(extraPtr, "%.4d", getInteger(i));
03116                         extraPtr += 1;
03117                         break;
03118                     case 'C' :
03119                         *(uint8_t*)extraPtr = (uint8_t)getInteger(i);
03120                         // sprintf(extraPtr, "%.4d", getInteger(i));
03121                         extraPtr += 1;
03122                         break;
03123                     case 's' :
03124                         *(int16_t*)extraPtr = (int16_t)getInteger(i);
03125                         // sprintf(extraPtr, "%.4d", getInteger(i));
03126                         extraPtr += 2;
03127                         break;
03128                     case 'S' :
03129                         *(uint16_t*)extraPtr = (uint16_t)getInteger(i);
03130                         // sprintf(extraPtr, "%.4d", getInteger(i));
03131                         extraPtr += 2;
03132                         break;
03133                     case 'i' :
03134                         *(int32_t*)extraPtr = (int32_t)getInteger(i);
03135                         // sprintf(extraPtr, "%.4d", getInteger(i));
03136                         extraPtr += 4;
03137                         break;
03138                     case 'I' :
03139                         *(uint32_t*)extraPtr = (uint32_t)getInteger(i);
03140                         // sprintf(extraPtr, "%.4d", getInteger(i));
03141                         extraPtr += 4;
03142                         break;
03143                     case 'Z' :
03144                         sprintf(extraPtr, "%s", getString(i).c_str());
03145                         extraPtr += getString(i).Length() + 1;
03146                         break;
03147                     case 'f' :
03148                         // TODO, figure out if %f is good enough.
03149                         sprintf(extraPtr, "%f", getDouble(i));
03150                         extraPtr += 4;
03151                         break;
03152                     default :
03153                         myStatus.addError(SamStatus::FAIL_PARSE,
03154                                           "Unknown tag type.");
03155                         status = false;
03156                         break;
03157                 }
03158             }
03159         }
03160     }
03161 
03162     // Validate that the extra pointer is at the end of the allocated buffer.
03163     // If not then there was a problem.
03164     if(extraPtr != (char*)myRecordPtr + newBufferSize)
03165     {
03166         fprintf(stderr, "ERROR updating the buffer.  Incorrect size.");
03167         myStatus.addError(SamStatus::FAIL_PARSE,
03168                           "ERROR updating the buffer.  Incorrect size.");
03169         status = false;
03170     }
03171 
03172 
03173     // The buffer tags are now in sync.
03174     myNeedToSetTagsInBuffer = false;
03175     myIsTagsBufferValid = true;
03176     return(status);
03177 }
03178 
03179 
03180 // Reset the variables for a newly set buffer.  The buffer must be set first
03181 // since this looks up the reference ids in the buffer to set the reference
03182 // names.
03183 void SamRecord::setVariablesForNewBuffer(SamFileHeader& header)
03184 {
03185     // Lookup the reference name & mate reference name associated with this
03186     // record.
03187     myReferenceName = 
03188         header.getReferenceLabel(myRecordPtr->myReferenceID);
03189     myMateReferenceName = 
03190         header.getReferenceLabel(myRecordPtr->myMateReferenceID);      
03191 
03192     // Clear the SAM Strings that are now not in-sync with the buffer.
03193     myReadName.SetLength(0);
03194     myCigar.SetLength(0);
03195     mySequence.SetLength(0);
03196     mySeqWithEq.clear();
03197     mySeqWithoutEq.clear();
03198     myQuality.SetLength(0);
03199     myNeedToSetTagsFromBuffer = true;
03200     myNeedToSetTagsInBuffer = false;
03201 
03202     //Set that the buffer is valid.
03203     myIsBufferSynced = true;
03204     // Set that the variable length buffer fields are valid.
03205     myIsReadNameBufferValid = true;
03206     myIsCigarBufferValid = true;
03207     myIsSequenceBufferValid = true;
03208     myBufferSequenceTranslation = NONE;
03209     myIsQualityBufferValid = true;
03210     myIsTagsBufferValid = true;
03211     myIsBinValid = true;
03212 }
03213 
03214 
03215 // Extract the vtype from the key.
03216 void SamRecord::getTypeFromKey(int key, char& type) const
03217 {
03218     // Extract the type from the key.
03219     type = (key >> 16) & 0xFF;
03220 }
03221 
03222 
03223 // Extract the tag from the key.
03224 void SamRecord::getTag(int key, char* tag) const
03225 {
03226     // Extract the tag from the key.
03227     tag[0] = key & 0xFF;
03228     tag[1] = (key >> 8) & 0xFF;
03229     tag[2] = 0;
03230 }
03231 
03232 
03233 // Index is the index into the strings array.
03234 String & SamRecord::getString(int index)
03235 {
03236     int value = extras[index];
03237 
03238     return strings[value];
03239 }
03240 
03241 int & SamRecord::getInteger(int offset)
03242 {
03243     int value = extras[offset];
03244 
03245     return integers[value];
03246 }
03247 
03248 char & SamRecord::getIntegerType(int offset)
03249 {
03250     int value = extras[offset];
03251 
03252     return intType[value];
03253 }
03254 
03255 double & SamRecord::getDouble(int offset)
03256 {
03257     int value = extras[offset];
03258 
03259     return doubles[value];
03260 }
03261 
03262 
Generated on Tue Aug 23 18:19:04 2011 for libStatGen Software by  doxygen 1.6.3