SamFile.cpp

00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 #include <stdexcept>
00018 #include "SamFile.h"
00019 #include "SamFileHeader.h"
00020 #include "SamRecord.h"
00021 #include "BamInterface.h"
00022 #include "SamInterface.h"
00023 #include "BgzfFileType.h"
00024 
00025 // Constructor, init variables.
00026 SamFile::SamFile()
00027     : myFilePtr(NULL),
00028       myInterfacePtr(NULL),
00029       myStatistics(NULL),
00030       myStatus(),
00031       myBamIndex(NULL),
00032       myRefPtr(NULL),
00033       myReadTranslation(SamRecord::NONE),
00034       myWriteTranslation(SamRecord::NONE),
00035       myAttemptRecovery(false)
00036 {
00037     resetFile();
00038 }
00039 
00040 
00041 // Constructor, init variables.
00042 SamFile::SamFile(ErrorHandler::HandlingType errorHandlingType)
00043     : myFilePtr(NULL),
00044       myInterfacePtr(NULL),
00045       myStatistics(NULL),
00046       myStatus(errorHandlingType),
00047       myBamIndex(NULL),
00048       myRefPtr(NULL),
00049       myReadTranslation(SamRecord::NONE),
00050       myWriteTranslation(SamRecord::NONE)
00051 {
00052     resetFile();
00053 }
00054 
00055 
00056 // Constructor, init variables and open the specified file based on the
00057 // specified mode (READ/WRITE).
00058 SamFile::SamFile(const char* filename, OpenType mode)
00059     : myStatus()
00060 {
00061     init(filename, mode, NULL);
00062 }
00063 
00064 
00065 // Constructor, init variables and open the specified file based on the
00066 // specified mode (READ/WRITE).  Default is READ..
00067 SamFile::SamFile(const char* filename, OpenType mode,
00068                  ErrorHandler::HandlingType errorHandlingType)
00069     : myStatus(errorHandlingType)
00070 {
00071     init(filename, mode, NULL);
00072 }
00073 
00074 
00075 // Constructor, init variables and open the specified file based on the
00076 // specified mode (READ/WRITE).
00077 SamFile::SamFile(const char* filename, OpenType mode, SamFileHeader* header)
00078     : myStatus()
00079 {
00080     init(filename, mode, header);
00081 }
00082 
00083 
00084 // Constructor, init variables and open the specified file based on the
00085 // specified mode (READ/WRITE).  Default is READ..
00086 SamFile::SamFile(const char* filename, OpenType mode,
00087                  ErrorHandler::HandlingType errorHandlingType, 
00088                  SamFileHeader* header)
00089     : myStatus(errorHandlingType)
00090 {
00091     init(filename, mode, header);
00092 }
00093 
00094 
00095 SamFile::~SamFile()
00096 {
00097     resetFile();
00098     if(myStatistics != NULL)
00099     {
00100         delete myStatistics;
00101     }
00102 }
00103 
00104 
00105 // Open a sam/bam file for reading with the specified filename.
00106 bool SamFile::OpenForRead(const char * filename, SamFileHeader* header)
00107 {
00108     // Reset for any previously operated on files.
00109     resetFile();
00110 
00111     int lastchar = 0;
00112 
00113     while (filename[lastchar] != 0) lastchar++;
00114 
00115     // If at least one character, check for '-'.
00116     if((lastchar >= 1) && (filename[0] == '-'))
00117     {
00118         // Read from stdin - determine type of file to read.
00119         // Determine if compressed bam.
00120         if(strcmp(filename, "-.bam") == 0)
00121         {
00122             // Compressed bam - open as bgzf.
00123             // -.bam is the filename, read compressed bam from stdin
00124             filename = "-";
00125 
00126             myFilePtr = new InputFile;
00127             // support recover mode - this switches in a reader
00128             // capable of recovering from bad BGZF compression blocks.
00129             myFilePtr->setAttemptRecovery(myAttemptRecovery);
00130             myFilePtr->openFile(filename, "rb", InputFile::BGZF);
00131 
00132             myInterfacePtr = new BamInterface;
00133 
00134             // Read the magic string.
00135             char magic[4];
00136             ifread(myFilePtr, magic, 4);
00137         }
00138         else if(strcmp(filename, "-.ubam") == 0)
00139         {
00140             // uncompressed BAM File.
00141             // -.ubam is the filename, read uncompressed bam from stdin.
00142             // uncompressed BAM is still compressed with BGZF, but using
00143             // compression level 0, so still open as BGZF since it has a
00144             // BGZF header.
00145             filename = "-";
00146 
00147             // Uncompressed, so do not require the eof block.
00148             BgzfFileType::setRequireEofBlock(false);
00149 
00150             myFilePtr = ifopen(filename, "rb", InputFile::BGZF);
00151         
00152             myInterfacePtr = new BamInterface;
00153 
00154             // Read the magic string.
00155             char magic[4];
00156             ifread(myFilePtr, magic, 4);
00157         }
00158         else
00159         {
00160             // SAM File.
00161             // read sam from stdin
00162             filename = "-";
00163             myFilePtr = ifopen(filename, "rb", InputFile::UNCOMPRESSED);
00164             myInterfacePtr = new SamInterface;
00165         }
00166     }
00167     else
00168     {
00169         // Not from stdin.  Read the file to determine the type.
00170 
00171         myFilePtr = new InputFile;
00172 
00173         // support recovery mode - this conditionally enables a reader
00174         // capable of recovering from bad BGZF compression blocks.
00175         myFilePtr->setAttemptRecovery(myAttemptRecovery);
00176         bool rc = myFilePtr->openFile(filename, "rb", InputFile::DEFAULT);
00177 
00178         if (rc == false)
00179         {
00180             std::string errorMessage = "Failed to Open ";
00181             errorMessage += filename;
00182             errorMessage += " for reading";
00183             myStatus.setStatus(SamStatus::FAIL_IO, errorMessage.c_str());
00184             delete myFilePtr;
00185             return(false);
00186         }
00187         
00188         char magic[4];
00189         ifread(myFilePtr, magic, 4);
00190         
00191         if (magic[0] == 'B' && magic[1] == 'A' && magic[2] == 'M' &&
00192             magic[3] == 1)
00193         {
00194             myInterfacePtr = new BamInterface;
00195             // Set that it is a bam file open for reading.  This is needed to
00196             // determine if an index file can be used.
00197             myIsBamOpenForRead = true;
00198         }
00199         else
00200         {
00201             // Not a bam, so rewind to the beginning of the file so it
00202             // can be read.
00203             ifrewind(myFilePtr);
00204             myInterfacePtr = new SamInterface;
00205         }
00206     }
00207 
00208     // File is open for reading.
00209     myIsOpenForRead = true;
00210 
00211     // Read the header if one was passed in.
00212     if(header != NULL)
00213     {
00214         return(ReadHeader(*header));
00215     }
00216 
00217     // Successfully opened the file.
00218     myStatus = SamStatus::SUCCESS;
00219     return(true);
00220 }
00221 
00222 
00223 // Open a sam/bam file for writing with the specified filename.
00224 bool SamFile::OpenForWrite(const char * filename, SamFileHeader* header)
00225 {
00226     // Reset for any previously operated on files.
00227     resetFile();
00228     
00229     int lastchar = 0;
00230     while (filename[lastchar] != 0) lastchar++;   
00231     if (lastchar >= 4 && 
00232         filename[lastchar - 4] == 'u' &&
00233         filename[lastchar - 3] == 'b' &&
00234         filename[lastchar - 2] == 'a' &&
00235         filename[lastchar - 1] == 'm')
00236     {
00237         // BAM File.
00238         // if -.ubam is the filename, write uncompressed bam to stdout
00239         if((lastchar == 6) && (filename[0] == '-') && (filename[1] == '.'))
00240         {
00241             filename = "-";
00242         }
00243 
00244         myFilePtr = ifopen(filename, "wb0", InputFile::BGZF);
00245 
00246         myInterfacePtr = new BamInterface;
00247     }
00248     else if (lastchar >= 3 && 
00249              filename[lastchar - 3] == 'b' &&
00250              filename[lastchar - 2] == 'a' &&
00251              filename[lastchar - 1] == 'm')
00252     {
00253         // BAM File.
00254         // if -.bam is the filename, write compressed bam to stdout
00255         if((lastchar == 5) && (filename[0] == '-') && (filename[1] == '.'))
00256         {
00257             filename = "-";
00258         }
00259         myFilePtr = ifopen(filename, "wb", InputFile::BGZF);
00260         
00261         myInterfacePtr = new BamInterface;
00262     }
00263     else
00264     {
00265         // SAM File
00266         // if - (followed by anything is the filename,
00267         // write uncompressed sam to stdout
00268         if((lastchar >= 1) && (filename[0] == '-'))
00269         {
00270             filename = "-";
00271         }
00272         myFilePtr = ifopen(filename, "wb", InputFile::UNCOMPRESSED);
00273    
00274         myInterfacePtr = new SamInterface;
00275     }
00276 
00277     if (myFilePtr == NULL)
00278     {
00279         std::string errorMessage = "Failed to Open ";
00280         errorMessage += filename;
00281         errorMessage += " for writing";
00282         myStatus.setStatus(SamStatus::FAIL_IO, errorMessage.c_str());
00283         return(false);
00284     }
00285    
00286     myIsOpenForWrite = true;
00287 
00288     // Write the header if one was passed in.
00289     if(header != NULL)
00290     {
00291         return(WriteHeader(*header));
00292     }
00293 
00294     // Successfully opened the file.
00295     myStatus = SamStatus::SUCCESS;
00296     return(true);
00297 }
00298 
00299 
00300 // Read BAM Index file.
00301 bool SamFile::ReadBamIndex(const char* bamIndexFilename)
00302 {
00303     // Cleanup a previously setup index.
00304     if(myBamIndex != NULL)
00305     {
00306         delete myBamIndex;
00307         myBamIndex = NULL;
00308     }
00309 
00310     // Create a new bam index.
00311     myBamIndex = new BamIndex();
00312     SamStatus::Status indexStat = myBamIndex->readIndex(bamIndexFilename);
00313 
00314     if(indexStat != SamStatus::SUCCESS)
00315     {
00316         std::string errorMessage = "Failed to read the bam Index file: ";
00317         errorMessage += bamIndexFilename;
00318         myStatus.setStatus(indexStat, errorMessage.c_str());
00319         delete myBamIndex;
00320         myBamIndex = NULL;
00321         return(false);
00322     }
00323     myStatus = SamStatus::SUCCESS;
00324     return(true);
00325 }
00326 
00327 
00328 // Read BAM Index file.
00329 bool SamFile::ReadBamIndex()
00330 {
00331     if(myFilePtr == NULL)
00332     {
00333         // Can't read the bam index file because the BAM file has not yet been
00334         // opened, so we don't know the base filename for the index file.
00335         std::string errorMessage = "Failed to read the bam Index file -"
00336             " the BAM file needs to be read first in order to determine"
00337             " the index filename.";
00338         myStatus.setStatus(SamStatus::FAIL_ORDER, errorMessage.c_str());
00339         return(false);
00340     }
00341 
00342     const char* bamBaseName = myFilePtr->getFileName();
00343     
00344     std::string indexName = bamBaseName;
00345     indexName += ".bai";
00346 
00347     bool foundFile = true;
00348     try
00349     {
00350         if(ReadBamIndex(indexName.c_str()) == false)
00351         {
00352             foundFile = false;
00353         }
00354     }
00355     catch (std::exception& e)
00356     {
00357         foundFile = false;
00358     }
00359 
00360     // Check to see if the index file was found.
00361     if(!foundFile)
00362     {
00363         // Not found - try without the bam extension.
00364         // Locate the start of the bam extension
00365         size_t startExt = indexName.find(".bam");
00366         if(startExt == std::string::npos)
00367         {
00368             // Could not find the .bam extension, so just return false since the
00369             // call to ReadBamIndex set the status.
00370             return(false);
00371         }
00372         // Remove ".bam" and try reading the index again.
00373         indexName.erase(startExt,  4);
00374         return(ReadBamIndex(indexName.c_str()));
00375     }
00376     return(true);
00377 }
00378 
00379 
00380 // Sets the reference to the specified genome sequence object.
00381 void SamFile::SetReference(GenomeSequence* reference)
00382 {
00383     myRefPtr = reference;
00384 }
00385 
00386 
00387 // Set the type of sequence translation to use when reading the sequence.
00388 void SamFile::SetReadSequenceTranslation(SamRecord::SequenceTranslation translation)
00389 {
00390     myReadTranslation = translation;
00391 }
00392 
00393 
00394 // Set the type of sequence translation to use when writing the sequence.
00395 void SamFile::SetWriteSequenceTranslation(SamRecord::SequenceTranslation translation)
00396 {
00397     myWriteTranslation = translation;
00398 }
00399 
00400 // Close the file if there is one open.
00401 void SamFile::Close()
00402 {
00403     // Resetting the file will close it if it is open, and
00404     // will reset all other variables.
00405     resetFile();
00406 }
00407 
00408 
00409 // Returns whether or not the file has been opened.
00410 // return: int - true = open; false = not open.
00411 bool SamFile::IsOpen()
00412 {
00413     if (myFilePtr != NULL)
00414     {
00415         // File Pointer is set, so return if it is open.
00416         return(myFilePtr->isOpen());
00417     }
00418     // File pointer is not set, so return false, not open.
00419     return false;
00420 }
00421 
00422 
00423 // Returns whether or not the end of the file has been reached.
00424 // return: int - true = EOF; false = not eof.
00425 bool SamFile::IsEOF()
00426 {
00427     if (myFilePtr != NULL)
00428     {
00429         // File Pointer is set, so return if eof.
00430         return(ifeof(myFilePtr));
00431     }
00432     // File pointer is not set, so return true, eof.
00433     return true;
00434 }
00435 
00436 
00437 // Read the header from the currently opened file.
00438 bool SamFile::ReadHeader(SamFileHeader& header)
00439 {
00440     if(myIsOpenForRead == false)
00441     {
00442         // File is not open for read
00443         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00444                            "Cannot read header since the file is not open for reading");
00445         return(false);
00446     }
00447 
00448     if(myHasHeader == true)
00449     {
00450         // The header has already been read.
00451         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00452                            "Cannot read header since it has already been read.");
00453         return(false);
00454     }
00455 
00456     myStatus = myInterfacePtr->readHeader(myFilePtr, header);
00457     if(myStatus == SamStatus::SUCCESS)
00458     {
00459         // The header has now been successfully read.
00460         myHasHeader = true;
00461         return(true);
00462     }
00463     return(false);
00464 }
00465 
00466 
00467 // Write the header to the currently opened file.
00468 bool SamFile::WriteHeader(SamFileHeader& header)
00469 {
00470     if(myIsOpenForWrite == false)
00471     {
00472         // File is not open for write
00473         // -OR-
00474         // The header has already been written.
00475         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00476                            "Cannot write header since the file is not open for writing");
00477         return(false);
00478     }
00479 
00480     if(myHasHeader == true)
00481     {
00482         // The header has already been written.
00483         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00484                            "Cannot write header since it has already been written");
00485         return(false);
00486     }
00487 
00488     myStatus = myInterfacePtr->writeHeader(myFilePtr, header);
00489     if(myStatus == SamStatus::SUCCESS)
00490     {
00491         // The header has now been successfully written.
00492         myHasHeader = true;
00493         return(true);
00494     }
00495 
00496     // return the status.
00497     return(false);
00498 }
00499 
00500 
00501 // Read a record from the currently opened file.
00502 bool SamFile::ReadRecord(SamFileHeader& header, 
00503                          SamRecord& record)
00504 {
00505     myStatus = SamStatus::SUCCESS;
00506 
00507     if(myIsOpenForRead == false)
00508     {
00509         // File is not open for read
00510         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00511                            "Cannot read record since the file is not open for reading");
00512         throw(std::runtime_error("SOFTWARE BUG: trying to read a SAM/BAM record prior to opening the file."));
00513         return(false);
00514     }
00515 
00516     if(myHasHeader == false)
00517     {
00518         // The header has not yet been read.
00519         // TODO - maybe just read the header.
00520         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00521                            "Cannot read record since the header has not been read.");
00522         throw(std::runtime_error("SOFTWARE BUG: trying to read a SAM/BAM record prior to reading the header."));
00523         return(false);
00524     }
00525 
00526     // Check to see if a new region has been set.  If so, determine the
00527     // chunks for that region.
00528     if(myNewSection)
00529     {
00530         if(!processNewSection(header))
00531         {
00532             // Failed processing a new section.  Could be an 
00533             // order issue like the file not being open or the
00534             // indexed file not having been read.
00535             // processNewSection sets myStatus with the failure reason.
00536             return(false);
00537         }
00538     }
00539 
00540     // Check to see if the file should be read by index.
00541     if(myRefID != BamIndex::REF_ID_ALL)
00542     {
00543         // Reference ID is set, so read by index.
00544         return(readIndexedRecord(header, record));
00545     }
00546 
00547     record.setReference(myRefPtr);
00548     record.setSequenceTranslation(myReadTranslation);
00549 
00550     // File is open for reading and the header has been read, so read the next
00551     // record.
00552     myInterfacePtr->readRecord(myFilePtr, header, record, myStatus);
00553     if(myStatus == SamStatus::SUCCESS)
00554     {
00555         // A record was successfully read, so increment the record count.
00556         myRecordCount++;
00557 
00558         if(myStatistics != NULL)
00559         {
00560             // Statistics should be updated.
00561             myStatistics->updateStatistics(record);
00562         }
00563 
00564         // Successfully read the record, so check the sort order.
00565         if(!validateSortOrder(record, header))
00566         {
00567             // ValidateSortOrder sets the status on a failure.
00568             return(false);
00569         }
00570         return(true);
00571     }
00572     // Failed to read the record.
00573     return(false);
00574 }
00575 
00576 
00577 
00578 // Write a record to the currently opened file.
00579 bool SamFile::WriteRecord(SamFileHeader& header, 
00580                           SamRecord& record)
00581 {
00582     if(myIsOpenForWrite == false)
00583     {
00584         // File is not open for writing
00585         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00586                            "Cannot write record since the file is not open for writing");
00587         return(false);
00588     }
00589 
00590     if(myHasHeader == false)
00591     {
00592         // The header has not yet been written.
00593         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00594                            "Cannot write record since the header has not been written");
00595         return(false);
00596     }
00597 
00598     // Before trying to write the record, validate the sort order.
00599     if(!validateSortOrder(record, header))
00600     {
00601         // Not sorted like it is supposed to be, do not write the record
00602         myStatus.setStatus(SamStatus::INVALID_SORT, 
00603                            "Cannot write the record since the file is not properly sorted.");
00604         return(false);
00605     }
00606 
00607     if(myRefPtr != NULL)
00608     {
00609         record.setReference(myRefPtr);
00610     }
00611 
00612     // File is open for writing and the header has been written, so write the
00613     // record.
00614     myStatus = myInterfacePtr->writeRecord(myFilePtr, header, record,
00615                                            myWriteTranslation);
00616 
00617     if(myStatus == SamStatus::SUCCESS)
00618     {
00619         // A record was successfully written, so increment the record count.
00620         myRecordCount++;
00621         return(true);
00622     }
00623     return(false);
00624 }
00625 
00626 
00627 // Set the flag to validate that the file is sorted as it is read/written.
00628 // Must be called after the file has been opened.
00629 void SamFile::setSortedValidation(SortedType sortType)
00630 {
00631     mySortedType = sortType;
00632 }
00633 
00634 
00635 // Return the number of records that have been read/written so far.
00636 uint32_t SamFile::GetCurrentRecordCount()
00637 {
00638     return(myRecordCount);
00639 }
00640 
00641 
00642 // Sets what part of the SamFile should be read.
00643 bool SamFile::SetReadSection(int32_t refID)
00644 {
00645     // No start/end specified, so set back to default -1.
00646     return(SetReadSection(refID, -1, -1));
00647 }
00648 
00649 
00650 
00651 // Sets what part of the SamFile should be read.
00652 bool SamFile::SetReadSection(const char* refName)
00653 {
00654     // No start/end specified, so set back to default -1.
00655     return(SetReadSection(refName, -1, -1));
00656 }
00657 
00658 
00659 // Sets what part of the BAM file should be read.
00660 bool SamFile::SetReadSection(int32_t refID, int32_t start, int32_t end, 
00661                              bool overlap)
00662 {
00663     // If there is not a BAM file open for reading, return failure.
00664     // Opening a new file clears the read section, so it must be
00665     // set after the file is opened.
00666     if(!myIsBamOpenForRead)
00667     {
00668         // There is not a BAM file open for reading.
00669         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00670                            "Cannot set section since there is no bam file open");
00671         return(false);
00672     }
00673 
00674     myNewSection = true;
00675     myOverlapSection = overlap;
00676     myStartPos = start;
00677     myEndPos = end;
00678     myRefID = refID;
00679     myRefName.clear();
00680     myChunksToRead.clear();
00681     // Reset the end of the current chunk.  We are resetting our read, so
00682     // we no longer have a "current chunk" that we are reading.
00683     myCurrentChunkEnd = 0;
00684     myStatus = SamStatus::SUCCESS;
00685 
00686     // Reset the sort order criteria since we moved around in the file.    
00687     myPrevCoord = -1;
00688     myPrevRefID = 0;
00689     myPrevReadName.clear();
00690 
00691     return(true);
00692 }
00693 
00694 
00695 // Sets what part of the BAM file should be read.
00696 bool SamFile::SetReadSection(const char* refName, int32_t start, int32_t end,
00697                              bool overlap)
00698 {
00699     // If there is not a BAM file open for reading, return failure.
00700     // Opening a new file clears the read section, so it must be
00701     // set after the file is opened.
00702     if(!myIsBamOpenForRead)
00703     {
00704         // There is not a BAM file open for reading.
00705         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00706                            "Cannot set section since there is no bam file open");
00707         return(false);
00708     }
00709 
00710     myNewSection = true;
00711     myOverlapSection = overlap;
00712     myStartPos = start;
00713     myEndPos = end;
00714     if((strcmp(refName, "") == 0) || (strcmp(refName, "*") == 0))
00715     {
00716         // No Reference name specified, so read just the "-1" entries.
00717         myRefID = BamIndex::REF_ID_UNMAPPED;
00718     }
00719     else
00720     {
00721         // save the reference name and revert the reference ID to unknown
00722         // so it will be calculated later.
00723         myRefName = refName;
00724         myRefID = BamIndex::REF_ID_ALL;
00725     }
00726     myChunksToRead.clear();
00727     // Reset the end of the current chunk.  We are resetting our read, so
00728     // we no longer have a "current chunk" that we are reading.
00729     myCurrentChunkEnd = 0;
00730     myStatus = SamStatus::SUCCESS;
00731     
00732     // Reset the sort order criteria since we moved around in the file.    
00733     myPrevCoord = -1;
00734     myPrevRefID = 0;
00735     myPrevReadName.clear();
00736 
00737     return(true);
00738 }
00739 
00740 
00741 
00742 // Get the number of mapped reads in the specified reference id.  
00743 // Returns -1 for out of range refIDs.
00744 int32_t SamFile::getNumMappedReadsFromIndex(int32_t refID)
00745 {
00746     // The bam index must have already been read.
00747     if(myBamIndex == NULL)
00748     {
00749         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00750                            "Cannot get num mapped reads from the index until it has been read.");
00751         return(false);
00752     }
00753     return(myBamIndex->getNumMappedReads(refID));
00754 }
00755 
00756 
00757 // Get the number of unmapped reads in the specified reference id.  
00758 // Returns -1 for out of range refIDs.
00759 int32_t SamFile::getNumUnMappedReadsFromIndex(int32_t refID)
00760 {
00761     // The bam index must have already been read.
00762     if(myBamIndex == NULL)
00763     {
00764         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00765                            "Cannot get num unmapped reads from the index until it has been read.");
00766         return(false);
00767     }
00768     return(myBamIndex->getNumUnMappedReads(refID));
00769 }
00770 
00771 
00772 // Get the number of mapped reads in the specified reference id.  
00773 // Returns -1 for out of range references.
00774 int32_t SamFile::getNumMappedReadsFromIndex(const char* refName,
00775                                             SamFileHeader& header)
00776 {
00777     // The bam index must have already been read.
00778     if(myBamIndex == NULL)
00779     {
00780         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00781                            "Cannot get num mapped reads from the index until it has been read.");
00782         return(false);
00783     }
00784     int32_t refID = BamIndex::REF_ID_UNMAPPED;
00785     if((strcmp(refName, "") != 0) && (strcmp(refName, "*") != 0))
00786     {
00787         // Reference name specified, so read just the "-1" entries.
00788         refID =  header.getReferenceID(refName);
00789     }
00790     return(myBamIndex->getNumMappedReads(refID));
00791 }
00792 
00793 
00794 // Get the number of unmapped reads in the specified reference id.  
00795 // Returns -1 for out of range refIDs.
00796 int32_t SamFile::getNumUnMappedReadsFromIndex(const char* refName,
00797                                               SamFileHeader& header)
00798 {
00799     // The bam index must have already been read.
00800     if(myBamIndex == NULL)
00801     {
00802         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00803                            "Cannot get num unmapped reads from the index until it has been read.");
00804         return(false);
00805     }
00806     int32_t refID = BamIndex::REF_ID_UNMAPPED;
00807     if((strcmp(refName, "") != 0) && (strcmp(refName, "*") != 0))
00808     {
00809         // Reference name specified, so read just the "-1" entries.
00810         refID =  header.getReferenceID(refName);
00811     }
00812     return(myBamIndex->getNumUnMappedReads(refID));
00813 }
00814 
00815 
00816 // Returns the number of bases in the passed in read that overlap the
00817 // region that is currently set.
00818 uint32_t SamFile::GetNumOverlaps(SamRecord& samRecord)
00819 {
00820     if(myRefPtr != NULL)
00821     {
00822         samRecord.setReference(myRefPtr);
00823     }
00824     samRecord.setSequenceTranslation(myReadTranslation);
00825 
00826     // Get the overlaps in the sam record for the region currently set
00827     // for this file.
00828     return(samRecord.getNumOverlaps(myStartPos, myEndPos));
00829 }
00830 
00831 
00832 void SamFile::GenerateStatistics(bool genStats)
00833 {
00834     if(genStats)
00835     {
00836         if(myStatistics == NULL)
00837         {
00838             // Want to generate statistics, but do not yet have the
00839             // structure for them, so create one.
00840             myStatistics = new SamStatistics();
00841         }
00842     }
00843     else
00844     {
00845         // Do not generate statistics, so if myStatistics is not NULL, 
00846         // delete it.
00847         if(myStatistics != NULL)
00848         {
00849             delete myStatistics;
00850             myStatistics = NULL;
00851         }
00852     }
00853 
00854 }
00855 
00856 
00857 const BamIndex* SamFile::GetBamIndex()
00858 {
00859     return(myBamIndex);
00860 }
00861 
00862 
00863 // initialize.
00864 void SamFile::init(const char* filename, OpenType mode, SamFileHeader* header)
00865 {
00866     myFilePtr = NULL;
00867     myInterfacePtr = NULL;
00868     myStatistics = NULL;
00869     myBamIndex = NULL;
00870     myRefPtr = NULL;
00871     myReadTranslation = SamRecord::NONE;
00872     myWriteTranslation = SamRecord::NONE;
00873         
00874     resetFile();
00875 
00876     bool openStatus = true;
00877     if(mode == READ)
00878     {
00879         // open the file for read.
00880         openStatus = OpenForRead(filename, header);
00881     }
00882     else
00883     {
00884         // open the file for write.
00885         openStatus = OpenForWrite(filename, header);
00886     }
00887     if(!openStatus)
00888     {
00889         // Failed to open the file - print error and abort.
00890         fprintf(stderr, "%s\n", GetStatusMessage());
00891         std::cerr << "FAILURE - EXITING!!!" << std::endl;
00892         exit(-1);
00893     }
00894 }
00895 
00896 
00897 // Reset variables for each file.
00898 void SamFile::resetFile()
00899 {
00900     // Close the file.
00901     if (myFilePtr != NULL)
00902     {
00903         // If we already have an open file, close it.
00904         ifclose(myFilePtr);
00905         myFilePtr = NULL;
00906     }
00907     if(myInterfacePtr != NULL)
00908     {
00909         delete myInterfacePtr;
00910         myInterfacePtr = NULL;
00911     }
00912 
00913     myIsOpenForRead = false;
00914     myIsOpenForWrite = false;
00915     myHasHeader = false;
00916     mySortedType = UNSORTED;
00917     myPrevReadName.clear();
00918     myPrevCoord = -1;
00919     myPrevRefID = 0;
00920     myRecordCount = 0;
00921     myStatus = SamStatus::SUCCESS;
00922 
00923     // Reset indexed bam values.
00924     myIsBamOpenForRead = false;
00925     myRefID = BamIndex::REF_ID_ALL;
00926     myStartPos = -1;
00927     myEndPos = -1;
00928     myNewSection = false;
00929     myOverlapSection = true;
00930     myCurrentChunkEnd = 0;
00931     myChunksToRead.clear();
00932     if(myBamIndex != NULL)
00933     {
00934         delete myBamIndex;
00935         myBamIndex = NULL;
00936     }
00937 
00938     // If statistics are being generated, reset them.
00939     if(myStatistics != NULL)
00940     {
00941         myStatistics->reset();
00942     }
00943 
00944     myRefName.clear();
00945 }
00946 
00947 
00948 // Validate that the record is sorted compared to the previously read record
00949 // if there is one, according to the specified sort order.
00950 // If the sort order is UNSORTED, true is returned.
00951 bool SamFile::validateSortOrder(SamRecord& record, SamFileHeader& header)
00952 {
00953     if(myRefPtr != NULL)
00954     {
00955         record.setReference(myRefPtr);
00956     }
00957     record.setSequenceTranslation(myReadTranslation);
00958 
00959     bool status = false;
00960     if(mySortedType == UNSORTED)
00961     {
00962         // Unsorted, so nothing to validate, just return true.
00963         status = true;
00964     }
00965     else 
00966     {
00967         // Check to see if mySortedType is based on the header.
00968         if(mySortedType == FLAG)
00969         {
00970             // Determine the sorted type from what was read out of the header.
00971             mySortedType = getSortOrderFromHeader(header);
00972         }
00973 
00974         if(mySortedType == QUERY_NAME)
00975         {
00976             // Validate that it is sorted by query name.
00977             // Get the query name from the record.
00978             const char* readName = record.getReadName();
00979             if(myPrevReadName.compare(readName) > 0)
00980             {
00981                 // The previous name is greater than the new record's name, so
00982                 // return false.
00983                 String errorMessage = "ERROR: File is not sorted at record ";
00984                 errorMessage += myRecordCount;
00985                 myStatus.setStatus(SamStatus::INVALID_SORT, 
00986                                    errorMessage.c_str());
00987                 status = false;
00988             }
00989             else
00990             {
00991                 myPrevReadName = readName;
00992                 status = true;
00993             }
00994         }
00995         else 
00996         {
00997             // Validate that it is sorted by COORDINATES.
00998             // Get the leftmost coordinate and the reference index.
00999             int32_t refID = record.getReferenceID();
01000             int32_t coord = record.get0BasedPosition();
01001             // The unmapped reference id is at the end of a sorted file.
01002             if(refID == BamIndex::REF_ID_UNMAPPED)
01003             {
01004                 // A new reference ID that is for the unmapped reads
01005                 // is always valid.
01006                 status = true;
01007                 myPrevRefID = refID;
01008                 myPrevCoord = coord;
01009             }
01010             else if(myPrevRefID == BamIndex::REF_ID_UNMAPPED)
01011             {
01012                 // Previous reference ID was for unmapped reads, but the
01013                 // current one is not, so this is not sorted.
01014                 String errorMessage = "ERROR: File is not sorted at record ";
01015                 errorMessage += myRecordCount;
01016                 myStatus.setStatus(SamStatus::INVALID_SORT, 
01017                                    errorMessage.c_str());
01018                 status = false;
01019             }
01020             else if(refID < myPrevRefID)
01021             {
01022                 // Current reference id is less than the previous one, 
01023                 //meaning that it is not sorted.
01024                 String errorMessage = "ERROR: File is not sorted at record ";
01025                 errorMessage += myRecordCount;
01026                 myStatus.setStatus(SamStatus::INVALID_SORT, 
01027                                    errorMessage.c_str());
01028                 status = false;
01029             }
01030             else
01031             {
01032                 // The reference IDs are in the correct order.
01033                 if(refID > myPrevRefID)
01034                 {
01035                     // New reference id, so set the previous coordinate to -1
01036                     myPrevCoord = -1;
01037                 }
01038             
01039                 // Check the coordinates.
01040                 if(coord < myPrevCoord)
01041                 {
01042                     // New Coord is less than the previous position.
01043                     String errorMessage = "ERROR: File is not sorted at record ";
01044                     errorMessage += myRecordCount;
01045                     myStatus.setStatus(SamStatus::INVALID_SORT, 
01046                                        errorMessage.c_str());
01047                     status = false;
01048                 }
01049                 else
01050                 {
01051                     myPrevRefID = refID;
01052                     myPrevCoord = coord;
01053                     status = true;
01054                 }
01055             }
01056         }
01057     }
01058 
01059     return(status);
01060 }
01061 
01062 
01063 SamFile::SortedType SamFile::getSortOrderFromHeader(SamFileHeader& header)
01064 {
01065     const char* tag = header.getSortOrder();
01066    
01067     // Default to unsorted since if it is not specified in the header
01068     // that is the value that should be used.
01069     SortedType headerSortOrder = UNSORTED;
01070     if(strcmp(tag, "queryname") == 0)
01071     {
01072         headerSortOrder = QUERY_NAME;
01073     }
01074     else if(strcmp(tag, "coordinate") == 0)
01075     {
01076         headerSortOrder = COORDINATE;
01077     }
01078     return(headerSortOrder);
01079 }
01080 
01081 
01082 // Read a record from the currently opened file based on the indexing info.
01083 bool SamFile::readIndexedRecord(SamFileHeader& header, 
01084                                 SamRecord& record)
01085 {
01086     record.setReference(myRefPtr);
01087     record.setSequenceTranslation(myReadTranslation);
01088 
01089     // A section to read is set and the file is open for reading, as verified
01090     // prior to calling into this protected method.
01091     bool recordFound = false;
01092     while(!recordFound)
01093     {
01094         // Check to see if we have more to read out of the current chunk.
01095         // By checking the current position in relation to the current
01096         // end chunk.  If the current position is >= the end of the
01097         // current chunk, then we must see to the next chunk.
01098         uint64_t currentPos = iftell(myFilePtr);
01099         if(currentPos >= myCurrentChunkEnd)
01100         {
01101             // If there are no more chunks to process, return failure.
01102             if(myChunksToRead.empty())
01103             {
01104                 myStatus = SamStatus::NO_MORE_RECS;
01105                 return(false);
01106             }
01107       
01108             // There are more chunks left, so get the next chunk.
01109             Chunk newChunk = myChunksToRead.pop();
01110 
01111             // Move to the location of the new chunk if it is not adjacent
01112             // to the current chunk.
01113             if(newChunk.chunk_beg != currentPos)
01114             {
01115                 // New chunk is not adjacent, so move to it.
01116                 if(ifseek(myFilePtr, newChunk.chunk_beg, SEEK_SET) != true)
01117                 {
01118                     // seek failed, cannot retrieve next record, return failure.
01119                     myStatus.setStatus(SamStatus::FAIL_IO, 
01120                                        "Failed to seek to the next record");
01121                     return(false);
01122                 }
01123             }
01124             // Seek succeeded, set the end of the new chunk.
01125             myCurrentChunkEnd = newChunk.chunk_end;
01126         }
01127 
01128         // At the position of the next record.  Read the record.
01129         myInterfacePtr->readRecord(myFilePtr, header, record, myStatus);
01130         if(myStatus != SamStatus::SUCCESS)
01131         {
01132             // Reading the record failed, return failure.
01133             return(false);
01134         }
01135 
01136         // Successfully read the record.  Check to see if it is in the correct 
01137         // reference/position.
01138         if(record.getReferenceID() != myRefID)
01139         {
01140             // Incorrect reference ID, return no more records.
01141             myStatus = SamStatus::NO_MORE_RECS;
01142             return(false);
01143         }
01144    
01145         // Found a record.
01146         recordFound = true;
01147 
01148         // If start/end position are set, verify that the alignment falls
01149         // within those.
01150         // If the alignment start is greater than the end of the region,
01151         // return NO_MORE_RECS.
01152         // Since myEndPos is Exclusive 0-based, anything >= myEndPos is outside
01153         // of the region.
01154         if((myEndPos != -1) && (record.get0BasedPosition() >= myEndPos))
01155         {
01156             myStatus = SamStatus::NO_MORE_RECS;
01157             return(false);
01158         }
01159         
01160         // We know the start is less than the end position, so the alignment
01161         // overlaps the region if the alignment end position is greater than the
01162         // start of the region.
01163         if((myStartPos != -1) && (record.get0BasedAlignmentEnd() < myStartPos))
01164         {
01165             // If it does not overlap the region, so go to the next
01166             // record...set recordFound back to false.
01167             recordFound = false;
01168         }
01169 
01170         if(!myOverlapSection)
01171         {
01172             // Needs to be fully contained.  Not fully contained if
01173             // 1) the record start position is < the region start position.
01174             // or
01175             // 2) the end position is specified and the record end position
01176             //    is greater than or equal to the region end position.
01177             //    (equal to since the region is exclusive.
01178             if((record.get0BasedPosition() < myStartPos) ||
01179                ((myEndPos != -1) && 
01180                 (record.get0BasedAlignmentEnd() >= myEndPos)))
01181             {
01182                 // This record is not fully contained, so move on to the next
01183                 // record.
01184                 recordFound = false;
01185             }
01186         }
01187     }
01188 
01189     // Successfully read the record, so check the sort order.
01190     if(!validateSortOrder(record, header))
01191     {
01192         myStatus.setStatus(SamStatus::INVALID_SORT, 
01193                            "The file is not properly sorted.");
01194         return(false);
01195     }
01196    
01197     // If it made it here, then the record was successfully read
01198     // and met the criteria, so return success.
01199     myRecordCount++;
01200 
01201     if(myStatistics != NULL)
01202     {
01203         // Successfully read, statistics should be updated.
01204         myStatistics->updateStatistics(record);
01205     }
01206     
01207     return(true);
01208 }
01209 
01210 
01211 bool SamFile::processNewSection(SamFileHeader &header)
01212 {
01213     myNewSection = false;
01214 
01215     // If there is no index file, return failure.
01216     if(myBamIndex == NULL)
01217     {
01218         // No bam index has been read.
01219         myStatus.setStatus(SamStatus::FAIL_ORDER, 
01220                            "Cannot read section since there is no index file open");
01221         throw(std::runtime_error("SOFTWARE BUG: trying to read a BAM record by section prior to opening the BAM Index file."));
01222         return(false);
01223     }
01224 
01225     // If there is not a BAM file open for reading, return failure.
01226     if(!myIsBamOpenForRead)
01227     {
01228         // There is not a BAM file open for reading.
01229         myStatus.setStatus(SamStatus::FAIL_ORDER, 
01230                            "Cannot read section since there is no bam file open");
01231         throw(std::runtime_error("SOFTWARE BUG: trying to read a BAM record by section without opening a BAM file."));
01232         return(false);
01233     }
01234 
01235     if(myHasHeader == false)
01236     {
01237         // The header has not yet been read.
01238         myStatus.setStatus(SamStatus::FAIL_ORDER, 
01239                            "Cannot read record since the header has not been read.");
01240         throw(std::runtime_error("SOFTWARE BUG: trying to read a BAM record by section prior to opening the header."));
01241         return(false);
01242     }
01243 
01244     // Indexed Bam open for read, so disable read buffering because iftell
01245     // will be used.
01246     // Needs to be done here after we already know that the header has been
01247     // read.
01248     myFilePtr->disableBuffering();
01249 
01250     myChunksToRead.clear();
01251     // Reset the end of the current chunk.  We are resetting our read, so
01252     // we no longer have a "current chunk" that we are reading.
01253     myCurrentChunkEnd = 0;
01254 
01255     // Check to see if the read section was set based on the reference name
01256     // but not yet converted to reference id.
01257     if(!myRefName.empty())
01258     {
01259         myRefID = header.getReferenceID(myRefName.c_str());
01260         // Clear the myRefName length so this code is only executed once.
01261         myRefName.clear();
01262 
01263         // Check to see if a reference id was found.
01264         if(myRefID == SamReferenceInfo::NO_REF_ID)
01265         {
01266             myStatus = SamStatus::NO_MORE_RECS;
01267             return(false);
01268         }
01269     }
01270 
01271     // Get the chunks associated with this reference region.
01272     if(myBamIndex->getChunksForRegion(myRefID, myStartPos, myEndPos, 
01273                                       myChunksToRead) == true)
01274     {
01275         myStatus = SamStatus::SUCCESS;
01276     }
01277     else
01278     {
01279         String errorMsg = "Failed to get the specified region, refID = ";
01280         errorMsg += myRefID;
01281         errorMsg += "; startPos = ";
01282         errorMsg += myStartPos;
01283         errorMsg += "; endPos = ";
01284         errorMsg += myEndPos;
01285         myStatus.setStatus(SamStatus::FAIL_PARSE, 
01286                            errorMsg);
01287     }
01288     return(true);
01289 }
01290 
01291 //
01292 // When the caller to SamFile::ReadRecord() catches an
01293 // exception, it may choose to call this method to resync
01294 // on the underlying binary stream.
01295 //
01296 // Arguments: a callback function that will requires length bytes
01297 // of data to validate a record header.
01298 //
01299 // The expected use case is to re-sync on the next probably valid
01300 // BAM record, so that we can resume reading even after detecting
01301 // a corrupted BAM file.
01302 //
01303 bool SamFile::attemptRecoverySync(bool (*checkSignature)(void *data) , int length)
01304 {
01305     if(myFilePtr==NULL) return false;
01306     // non-recovery aware objects will just return false:
01307     return myFilePtr->attemptRecoverySync(checkSignature, length);
01308 }
01309 
01310 // Default Constructor.
01311 SamFileReader::SamFileReader()
01312     : SamFile()
01313 {
01314 }
01315 
01316 
01317 // Constructor that opens the specified file for read.
01318 SamFileReader::SamFileReader(const char* filename)
01319     : SamFile(filename, READ)
01320 {
01321 }
01322 
01323 
01324 
01325 
01326 // Constructor that opens the specified file for read.
01327 SamFileReader::SamFileReader(const char* filename,
01328                              ErrorHandler::HandlingType errorHandlingType)
01329     : SamFile(filename, READ, errorHandlingType)
01330 {
01331 }
01332 
01333 
01334 // Constructor that opens the specified file for read.
01335 SamFileReader::SamFileReader(const char* filename,
01336                              SamFileHeader* header)
01337     : SamFile(filename, READ, header)
01338 {
01339 }
01340 
01341 
01342 // Constructor that opens the specified file for read.
01343 SamFileReader::SamFileReader(const char* filename,
01344                              ErrorHandler::HandlingType errorHandlingType,
01345                              SamFileHeader* header)
01346     : SamFile(filename, READ, errorHandlingType, header)
01347 {
01348 }
01349 
01350 
01351 SamFileReader::~SamFileReader()
01352 {
01353 }
01354 
01355 
01356 // Default Constructor.
01357 SamFileWriter::SamFileWriter()
01358     : SamFile()
01359 {
01360 }
01361 
01362 
01363 // Constructor that opens the specified file for write.
01364 SamFileWriter::SamFileWriter(const char* filename)
01365     : SamFile(filename, WRITE)
01366 {
01367 }
01368 
01369 
01370 // Constructor that opens the specified file for write.
01371 SamFileWriter::SamFileWriter(const char* filename,
01372                              ErrorHandler::HandlingType errorHandlingType)
01373     : SamFile(filename, WRITE, errorHandlingType)
01374 {
01375 }
01376 
01377 
01378 // Constructor that opens the specified file for write.
01379 SamFileWriter::SamFileWriter(const char* filename,
01380                              SamFileHeader* header)
01381     : SamFile(filename, WRITE, header)
01382 {
01383 }
01384 
01385 
01386 // Constructor that opens the specified file for write.
01387 SamFileWriter::SamFileWriter(const char* filename,
01388                              ErrorHandler::HandlingType errorHandlingType,
01389                              SamFileHeader* header)
01390     : SamFile(filename, WRITE, errorHandlingType, header)
01391 {
01392 }
01393 
01394 
01395 SamFileWriter::~SamFileWriter()
01396 {
01397 }
Generated on Tue Sep 6 17:51:59 2011 for libStatGen Software by  doxygen 1.6.3