libStatGen Software  1
SamFile.cpp
00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 #include <stdexcept>
00018 #include "SamFile.h"
00019 #include "SamFileHeader.h"
00020 #include "SamRecord.h"
00021 #include "BamInterface.h"
00022 #include "SamInterface.h"
00023 #include "BgzfFileType.h"
00024 
00025 // Constructor, init variables.
00026 SamFile::SamFile()
00027     : myStatus()
00028 {
00029     init();
00030     resetFile();
00031 }
00032 
00033 
00034 // Constructor, init variables.
00035 SamFile::SamFile(ErrorHandler::HandlingType errorHandlingType)
00036     : myStatus(errorHandlingType)
00037 {
00038     init();
00039     resetFile();
00040 }
00041 
00042 
00043 // Constructor, init variables and open the specified file based on the
00044 // specified mode (READ/WRITE).
00045 SamFile::SamFile(const char* filename, OpenType mode)
00046     : myStatus()
00047 {
00048     init(filename, mode, NULL);
00049 }
00050 
00051 
00052 // Constructor, init variables and open the specified file based on the
00053 // specified mode (READ/WRITE).  Default is READ..
00054 SamFile::SamFile(const char* filename, OpenType mode,
00055                  ErrorHandler::HandlingType errorHandlingType)
00056     : myStatus(errorHandlingType)
00057 {
00058     init(filename, mode, NULL);
00059 }
00060 
00061 
00062 // Constructor, init variables and open the specified file based on the
00063 // specified mode (READ/WRITE).
00064 SamFile::SamFile(const char* filename, OpenType mode, SamFileHeader* header)
00065     : myStatus()
00066 {
00067     init(filename, mode, header);
00068 }
00069 
00070 
00071 // Constructor, init variables and open the specified file based on the
00072 // specified mode (READ/WRITE).  Default is READ..
00073 SamFile::SamFile(const char* filename, OpenType mode,
00074                  ErrorHandler::HandlingType errorHandlingType, 
00075                  SamFileHeader* header)
00076     : myStatus(errorHandlingType)
00077 {
00078     init(filename, mode, header);
00079 }
00080 
00081 
00082 SamFile::~SamFile()
00083 {
00084     resetFile();
00085     if(myStatistics != NULL)
00086     {
00087         delete myStatistics;
00088     }
00089 }
00090 
00091 
00092 // Open a sam/bam file for reading with the specified filename.
00093 bool SamFile::OpenForRead(const char * filename, SamFileHeader* header)
00094 {
00095     // Reset for any previously operated on files.
00096     resetFile();
00097 
00098     int lastchar = 0;
00099 
00100     while (filename[lastchar] != 0) lastchar++;
00101 
00102     // If at least one character, check for '-'.
00103     if((lastchar >= 1) && (filename[0] == '-'))
00104     {
00105         // Read from stdin - determine type of file to read.
00106         // Determine if compressed bam.
00107         if(strcmp(filename, "-.bam") == 0)
00108         {
00109             // Compressed bam - open as bgzf.
00110             // -.bam is the filename, read compressed bam from stdin
00111             filename = "-";
00112 
00113             myFilePtr = new InputFile;
00114             // support recover mode - this switches in a reader
00115             // capable of recovering from bad BGZF compression blocks.
00116             myFilePtr->setAttemptRecovery(myAttemptRecovery);
00117             myFilePtr->openFile(filename, "rb", InputFile::BGZF);
00118 
00119             myInterfacePtr = new BamInterface;
00120 
00121             // Read the magic string.
00122             char magic[4];
00123             ifread(myFilePtr, magic, 4);
00124         }
00125         else if(strcmp(filename, "-.ubam") == 0)
00126         {
00127             // uncompressed BAM File.
00128             // -.ubam is the filename, read uncompressed bam from stdin.
00129             // uncompressed BAM is still compressed with BGZF, but using
00130             // compression level 0, so still open as BGZF since it has a
00131             // BGZF header.
00132             filename = "-";
00133 
00134             // Uncompressed, so do not require the eof block.
00135 #ifdef __ZLIB_AVAILABLE__
00136             BgzfFileType::setRequireEofBlock(false);
00137 #endif
00138             myFilePtr = ifopen(filename, "rb", InputFile::BGZF);
00139         
00140             myInterfacePtr = new BamInterface;
00141 
00142             // Read the magic string.
00143             char magic[4];
00144             ifread(myFilePtr, magic, 4);
00145         }
00146         else if((strcmp(filename, "-") == 0) || (strcmp(filename, "-.sam") == 0))
00147         {
00148             // SAM File.
00149             // read sam from stdin
00150             filename = "-";
00151             myFilePtr = ifopen(filename, "rb", InputFile::UNCOMPRESSED);
00152             myInterfacePtr = new SamInterface;
00153         }
00154         else
00155         {
00156             std::string errorMessage = "Invalid SAM/BAM filename, ";
00157             errorMessage += filename;
00158             errorMessage += ".  From stdin, can only be '-', '-.sam', '-.bam', or '-.ubam'";
00159             myStatus.setStatus(SamStatus::FAIL_IO, errorMessage.c_str());
00160             delete myFilePtr;
00161             myFilePtr = NULL;
00162             return(false);          
00163         }
00164     }
00165     else
00166     {
00167         // Not from stdin.  Read the file to determine the type.
00168 
00169         myFilePtr = new InputFile;
00170 
00171         // support recovery mode - this conditionally enables a reader
00172         // capable of recovering from bad BGZF compression blocks.
00173         myFilePtr->setAttemptRecovery(myAttemptRecovery);
00174         bool rc = myFilePtr->openFile(filename, "rb", InputFile::DEFAULT);
00175 
00176         if (rc == false)
00177         {
00178             std::string errorMessage = "Failed to Open ";
00179             errorMessage += filename;
00180             errorMessage += " for reading";
00181             myStatus.setStatus(SamStatus::FAIL_IO, errorMessage.c_str());
00182             delete myFilePtr;
00183             myFilePtr = NULL;
00184             return(false);
00185         }
00186         
00187         char magic[4];
00188         ifread(myFilePtr, magic, 4);
00189         
00190         if (magic[0] == 'B' && magic[1] == 'A' && magic[2] == 'M' &&
00191             magic[3] == 1)
00192         {
00193             myInterfacePtr = new BamInterface;
00194             // Set that it is a bam file open for reading.  This is needed to
00195             // determine if an index file can be used.
00196             myIsBamOpenForRead = true;
00197         }
00198         else
00199         {
00200             // Not a bam, so rewind to the beginning of the file so it
00201             // can be read.
00202             ifrewind(myFilePtr);
00203             myInterfacePtr = new SamInterface;
00204         }
00205     }
00206 
00207     // File is open for reading.
00208     myIsOpenForRead = true;
00209 
00210     // Read the header if one was passed in.
00211     if(header != NULL)
00212     {
00213         return(ReadHeader(*header));
00214     }
00215 
00216     // Successfully opened the file.
00217     myStatus = SamStatus::SUCCESS;
00218     return(true);
00219 }
00220 
00221 
00222 // Open a sam/bam file for writing with the specified filename.
00223 bool SamFile::OpenForWrite(const char * filename, SamFileHeader* header)
00224 {
00225     // Reset for any previously operated on files.
00226     resetFile();
00227     
00228     int lastchar = 0;
00229     while (filename[lastchar] != 0) lastchar++;   
00230     if (lastchar >= 4 && 
00231         filename[lastchar - 4] == 'u' &&
00232         filename[lastchar - 3] == 'b' &&
00233         filename[lastchar - 2] == 'a' &&
00234         filename[lastchar - 1] == 'm')
00235     {
00236         // BAM File.
00237         // if -.ubam is the filename, write uncompressed bam to stdout
00238         if((lastchar == 6) && (filename[0] == '-') && (filename[1] == '.'))
00239         {
00240             filename = "-";
00241         }
00242 
00243         myFilePtr = ifopen(filename, "wb0", InputFile::BGZF);
00244 
00245         myInterfacePtr = new BamInterface;
00246     }
00247     else if (lastchar >= 3 && 
00248              filename[lastchar - 3] == 'b' &&
00249              filename[lastchar - 2] == 'a' &&
00250              filename[lastchar - 1] == 'm')
00251     {
00252         // BAM File.
00253         // if -.bam is the filename, write compressed bam to stdout
00254         if((lastchar == 5) && (filename[0] == '-') && (filename[1] == '.'))
00255         {
00256             filename = "-";
00257         }
00258         myFilePtr = ifopen(filename, "wb", InputFile::BGZF);
00259         
00260         myInterfacePtr = new BamInterface;
00261     }
00262     else
00263     {
00264         // SAM File
00265         // if - (followed by anything is the filename,
00266         // write uncompressed sam to stdout
00267         if((lastchar >= 1) && (filename[0] == '-'))
00268         {
00269             filename = "-";
00270         }
00271         myFilePtr = ifopen(filename, "wb", InputFile::UNCOMPRESSED);
00272    
00273         myInterfacePtr = new SamInterface;
00274     }
00275 
00276     if (myFilePtr == NULL)
00277     {
00278         std::string errorMessage = "Failed to Open ";
00279         errorMessage += filename;
00280         errorMessage += " for writing";
00281         myStatus.setStatus(SamStatus::FAIL_IO, errorMessage.c_str());
00282         return(false);
00283     }
00284    
00285     myIsOpenForWrite = true;
00286 
00287     // Write the header if one was passed in.
00288     if(header != NULL)
00289     {
00290         return(WriteHeader(*header));
00291     }
00292 
00293     // Successfully opened the file.
00294     myStatus = SamStatus::SUCCESS;
00295     return(true);
00296 }
00297 
00298 
00299 // Read BAM Index file.
00300 bool SamFile::ReadBamIndex(const char* bamIndexFilename)
00301 {
00302     // Cleanup a previously setup index.
00303     if(myBamIndex != NULL)
00304     {
00305         delete myBamIndex;
00306         myBamIndex = NULL;
00307     }
00308 
00309     // Create a new bam index.
00310     myBamIndex = new BamIndex();
00311     SamStatus::Status indexStat = myBamIndex->readIndex(bamIndexFilename);
00312 
00313     if(indexStat != SamStatus::SUCCESS)
00314     {
00315         std::string errorMessage = "Failed to read the bam Index file: ";
00316         errorMessage += bamIndexFilename;
00317         myStatus.setStatus(indexStat, errorMessage.c_str());
00318         delete myBamIndex;
00319         myBamIndex = NULL;
00320         return(false);
00321     }
00322     myStatus = SamStatus::SUCCESS;
00323     return(true);
00324 }
00325 
00326 
00327 // Read BAM Index file.
00328 bool SamFile::ReadBamIndex()
00329 {
00330     if(myFilePtr == NULL)
00331     {
00332         // Can't read the bam index file because the BAM file has not yet been
00333         // opened, so we don't know the base filename for the index file.
00334         std::string errorMessage = "Failed to read the bam Index file -"
00335             " the BAM file needs to be read first in order to determine"
00336             " the index filename.";
00337         myStatus.setStatus(SamStatus::FAIL_ORDER, errorMessage.c_str());
00338         return(false);
00339     }
00340 
00341     const char* bamBaseName = myFilePtr->getFileName();
00342     
00343     std::string indexName = bamBaseName;
00344     indexName += ".bai";
00345 
00346     bool foundFile = true;
00347     try
00348     {
00349         if(ReadBamIndex(indexName.c_str()) == false)
00350         {
00351             foundFile = false;
00352         }
00353     }
00354     catch (std::exception&)
00355     {
00356         foundFile = false;
00357     }
00358 
00359     // Check to see if the index file was found.
00360     if(!foundFile)
00361     {
00362         // Not found - try without the bam extension.
00363         // Locate the start of the bam extension
00364         size_t startExt = indexName.find(".bam");
00365         if(startExt == std::string::npos)
00366         {
00367             // Could not find the .bam extension, so just return false since the
00368             // call to ReadBamIndex set the status.
00369             return(false);
00370         }
00371         // Remove ".bam" and try reading the index again.
00372         indexName.erase(startExt,  4);
00373         return(ReadBamIndex(indexName.c_str()));
00374     }
00375     return(true);
00376 }
00377 
00378 
00379 // Sets the reference to the specified genome sequence object.
00380 void SamFile::SetReference(GenomeSequence* reference)
00381 {
00382     myRefPtr = reference;
00383 }
00384 
00385 
00386 // Set the type of sequence translation to use when reading the sequence.
00387 void SamFile::SetReadSequenceTranslation(SamRecord::SequenceTranslation translation)
00388 {
00389     myReadTranslation = translation;
00390 }
00391 
00392 
00393 // Set the type of sequence translation to use when writing the sequence.
00394 void SamFile::SetWriteSequenceTranslation(SamRecord::SequenceTranslation translation)
00395 {
00396     myWriteTranslation = translation;
00397 }
00398 
00399 // Close the file if there is one open.
00400 void SamFile::Close()
00401 {
00402     // Resetting the file will close it if it is open, and
00403     // will reset all other variables.
00404     resetFile();
00405 }
00406 
00407 
00408 // Returns whether or not the file has been opened.
00409 // return: int - true = open; false = not open.
00410 bool SamFile::IsOpen()
00411 {
00412     if (myFilePtr != NULL)
00413     {
00414         // File Pointer is set, so return if it is open.
00415         return(myFilePtr->isOpen());
00416     }
00417     // File pointer is not set, so return false, not open.
00418     return false;
00419 }
00420 
00421 
00422 // Returns whether or not the end of the file has been reached.
00423 // return: int - true = EOF; false = not eof.
00424 bool SamFile::IsEOF()
00425 {
00426     if (myFilePtr != NULL)
00427     {
00428         // File Pointer is set, so return if eof.
00429         return(ifeof(myFilePtr));
00430     }
00431     // File pointer is not set, so return true, eof.
00432     return true;
00433 }
00434 
00435 
00436 // Read the header from the currently opened file.
00437 bool SamFile::ReadHeader(SamFileHeader& header)
00438 {
00439     myStatus = SamStatus::SUCCESS;
00440     if(myIsOpenForRead == false)
00441     {
00442         // File is not open for read
00443         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00444                            "Cannot read header since the file is not open for reading");
00445         return(false);
00446     }
00447 
00448     if(myHasHeader == true)
00449     {
00450         // The header has already been read.
00451         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00452                            "Cannot read header since it has already been read.");
00453         return(false);
00454     }
00455 
00456     if(myInterfacePtr->readHeader(myFilePtr, header, myStatus))
00457     {
00458         // The header has now been successfully read.
00459         myHasHeader = true;
00460         return(true);
00461     }
00462     return(false);
00463 }
00464 
00465 
00466 // Write the header to the currently opened file.
00467 bool SamFile::WriteHeader(SamFileHeader& header)
00468 {
00469     myStatus = SamStatus::SUCCESS;
00470     if(myIsOpenForWrite == false)
00471     {
00472         // File is not open for write
00473         // -OR-
00474         // The header has already been written.
00475         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00476                            "Cannot write header since the file is not open for writing");
00477         return(false);
00478     }
00479 
00480     if(myHasHeader == true)
00481     {
00482         // The header has already been written.
00483         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00484                            "Cannot write header since it has already been written");
00485         return(false);
00486     }
00487 
00488     if(myInterfacePtr->writeHeader(myFilePtr, header, myStatus))
00489     {
00490         // The header has now been successfully written.
00491         myHasHeader = true;
00492         return(true);
00493     }
00494 
00495     // return the status.
00496     return(false);
00497 }
00498 
00499 
00500 // Read a record from the currently opened file.
00501 bool SamFile::ReadRecord(SamFileHeader& header, 
00502                          SamRecord& record)
00503 {
00504     myStatus = SamStatus::SUCCESS;
00505 
00506     if(myIsOpenForRead == false)
00507     {
00508         // File is not open for read
00509         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00510                            "Cannot read record since the file is not open for reading");
00511         throw(std::runtime_error("SOFTWARE BUG: trying to read a SAM/BAM record prior to opening the file."));
00512         return(false);
00513     }
00514 
00515     if(myHasHeader == false)
00516     {
00517         // The header has not yet been read.
00518         // TODO - maybe just read the header.
00519         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00520                            "Cannot read record since the header has not been read.");
00521         throw(std::runtime_error("SOFTWARE BUG: trying to read a SAM/BAM record prior to reading the header."));
00522         return(false);
00523     }
00524 
00525     // Check to see if a new region has been set.  If so, determine the
00526     // chunks for that region.
00527     if(myNewSection)
00528     {
00529         if(!processNewSection(header))
00530         {
00531             // Failed processing a new section.  Could be an 
00532             // order issue like the file not being open or the
00533             // indexed file not having been read.
00534             // processNewSection sets myStatus with the failure reason.
00535             return(false);
00536         }
00537     }
00538 
00539     // Read until a record is not successfully read or there are no more
00540     // requested records.
00541     while(myStatus == SamStatus::SUCCESS)
00542     {
00543         record.setReference(myRefPtr);
00544         record.setSequenceTranslation(myReadTranslation);
00545 
00546         // If reading by index, this method will setup to ensure it is in
00547         // the correct position for the next record (if not already there).
00548         // Sets myStatus if it could not move to a good section.
00549         // Just returns true if it is not setup to read by index.
00550         if(!ensureIndexedReadPosition())
00551         {
00552             // Either there are no more records in the section
00553             // or it failed to move to the right section, so there
00554             // is nothing more to read, stop looping.
00555             break;
00556         }
00557         
00558         // File is open for reading and the header has been read, so read the
00559         // next record.
00560         myInterfacePtr->readRecord(myFilePtr, header, record, myStatus);
00561         if(myStatus != SamStatus::SUCCESS)
00562         {
00563             // Failed to read the record, so break out of the loop.
00564             break;
00565         }
00566 
00567         // Successfully read a record, so check if we should filter it.
00568         // First check if it is out of the section.  Returns true
00569         // if not reading by sections, returns false if the record
00570         // is outside of the section.  Sets status to NO_MORE_RECS if
00571         // there is nothing left ot read in the section.
00572         if(!checkRecordInSection(record))
00573         {
00574             // The record is not in the section.
00575             // The while loop will detect if NO_MORE_RECS was set.
00576             continue;
00577         }
00578 
00579         // Check the flag for required/excluded flags.
00580         uint16_t flag = record.getFlag();
00581         if((flag & myRequiredFlags) != myRequiredFlags)
00582         {
00583             // The record does not conatain all required flags, so
00584             // continue looking.
00585             continue;
00586         }
00587         if((flag & myExcludedFlags) != 0)
00588         {
00589             // The record contains an excluded flag, so continue looking.
00590             continue;
00591         }
00592 
00593         //increment the record count.
00594         myRecordCount++;
00595         
00596         if(myStatistics != NULL)
00597         {
00598             // Statistics should be updated.
00599             myStatistics->updateStatistics(record);
00600         }
00601         
00602         // Successfully read the record, so check the sort order.
00603         if(!validateSortOrder(record, header))
00604         {
00605             // ValidateSortOrder sets the status on a failure.
00606             return(false);
00607         }
00608         return(true);
00609 
00610     } // End while loop that checks if a desired record is found or failure.
00611 
00612     // Return true if a record was found.
00613     return(myStatus == SamStatus::SUCCESS);
00614 }
00615 
00616 
00617 
00618 // Write a record to the currently opened file.
00619 bool SamFile::WriteRecord(SamFileHeader& header, 
00620                           SamRecord& record)
00621 {
00622     if(myIsOpenForWrite == false)
00623     {
00624         // File is not open for writing
00625         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00626                            "Cannot write record since the file is not open for writing");
00627         return(false);
00628     }
00629 
00630     if(myHasHeader == false)
00631     {
00632         // The header has not yet been written.
00633         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00634                            "Cannot write record since the header has not been written");
00635         return(false);
00636     }
00637 
00638     // Before trying to write the record, validate the sort order.
00639     if(!validateSortOrder(record, header))
00640     {
00641         // Not sorted like it is supposed to be, do not write the record
00642         myStatus.setStatus(SamStatus::INVALID_SORT, 
00643                            "Cannot write the record since the file is not properly sorted.");
00644         return(false);
00645     }
00646 
00647     if(myRefPtr != NULL)
00648     {
00649         record.setReference(myRefPtr);
00650     }
00651 
00652     // File is open for writing and the header has been written, so write the
00653     // record.
00654     myStatus = myInterfacePtr->writeRecord(myFilePtr, header, record,
00655                                            myWriteTranslation);
00656 
00657     if(myStatus == SamStatus::SUCCESS)
00658     {
00659         // A record was successfully written, so increment the record count.
00660         myRecordCount++;
00661         return(true);
00662     }
00663     return(false);
00664 }
00665 
00666 
00667 // Set the flag to validate that the file is sorted as it is read/written.
00668 // Must be called after the file has been opened.
00669 void SamFile::setSortedValidation(SortedType sortType)
00670 {
00671     mySortedType = sortType;
00672 }
00673 
00674 
00675 // Return the number of records that have been read/written so far.
00676 uint32_t SamFile::GetCurrentRecordCount()
00677 {
00678     return(myRecordCount);
00679 }
00680 
00681 
00682 // Sets what part of the SamFile should be read.
00683 bool SamFile::SetReadSection(int32_t refID)
00684 {
00685     // No start/end specified, so set back to default -1.
00686     return(SetReadSection(refID, -1, -1));
00687 }
00688 
00689 
00690 
00691 // Sets what part of the SamFile should be read.
00692 bool SamFile::SetReadSection(const char* refName)
00693 {
00694     // No start/end specified, so set back to default -1.
00695     return(SetReadSection(refName, -1, -1));
00696 }
00697 
00698 
00699 // Sets what part of the BAM file should be read.
00700 bool SamFile::SetReadSection(int32_t refID, int32_t start, int32_t end, 
00701                              bool overlap)
00702 {
00703     // If there is not a BAM file open for reading, return failure.
00704     // Opening a new file clears the read section, so it must be
00705     // set after the file is opened.
00706     if(!myIsBamOpenForRead)
00707     {
00708         // There is not a BAM file open for reading.
00709         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00710                            "Cannot set section since there is no bam file open");
00711         return(false);
00712     }
00713 
00714     myNewSection = true;
00715     myOverlapSection = overlap;
00716     myStartPos = start;
00717     myEndPos = end;
00718     myRefID = refID;
00719     myRefName.clear();
00720     myChunksToRead.clear();
00721     // Reset the end of the current chunk.  We are resetting our read, so
00722     // we no longer have a "current chunk" that we are reading.
00723     myCurrentChunkEnd = 0;
00724     myStatus = SamStatus::SUCCESS;
00725 
00726     // Reset the sort order criteria since we moved around in the file.    
00727     myPrevCoord = -1;
00728     myPrevRefID = 0;
00729     myPrevReadName.Clear();
00730 
00731     return(true);
00732 }
00733 
00734 
00735 // Sets what part of the BAM file should be read.
00736 bool SamFile::SetReadSection(const char* refName, int32_t start, int32_t end,
00737                              bool overlap)
00738 {
00739     // If there is not a BAM file open for reading, return failure.
00740     // Opening a new file clears the read section, so it must be
00741     // set after the file is opened.
00742     if(!myIsBamOpenForRead)
00743     {
00744         // There is not a BAM file open for reading.
00745         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00746                            "Cannot set section since there is no bam file open");
00747         return(false);
00748     }
00749 
00750     myNewSection = true;
00751     myOverlapSection = overlap;
00752     myStartPos = start;
00753     myEndPos = end;
00754     if((strcmp(refName, "") == 0) || (strcmp(refName, "*") == 0))
00755     {
00756         // No Reference name specified, so read just the "-1" entries.
00757         myRefID = BamIndex::REF_ID_UNMAPPED;
00758     }
00759     else
00760     {
00761         // save the reference name and revert the reference ID to unknown
00762         // so it will be calculated later.
00763         myRefName = refName;
00764         myRefID = BamIndex::REF_ID_ALL;
00765     }
00766     myChunksToRead.clear();
00767     // Reset the end of the current chunk.  We are resetting our read, so
00768     // we no longer have a "current chunk" that we are reading.
00769     myCurrentChunkEnd = 0;
00770     myStatus = SamStatus::SUCCESS;
00771     
00772     // Reset the sort order criteria since we moved around in the file.    
00773     myPrevCoord = -1;
00774     myPrevRefID = 0;
00775     myPrevReadName.Clear();
00776 
00777     return(true);
00778 }
00779 
00780 
00781 void SamFile::SetReadFlags(uint16_t requiredFlags, uint16_t excludedFlags)
00782 {
00783     myRequiredFlags = requiredFlags;
00784     myExcludedFlags = excludedFlags;
00785 }
00786 
00787 
00788 // Get the number of mapped reads in the specified reference id.  
00789 // Returns -1 for out of range refIDs.
00790 int32_t SamFile::getNumMappedReadsFromIndex(int32_t refID)
00791 {
00792     // The bam index must have already been read.
00793     if(myBamIndex == NULL)
00794     {
00795         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00796                            "Cannot get num mapped reads from the index until it has been read.");
00797         return(false);
00798     }
00799     return(myBamIndex->getNumMappedReads(refID));
00800 }
00801 
00802 
00803 // Get the number of unmapped reads in the specified reference id.  
00804 // Returns -1 for out of range refIDs.
00805 int32_t SamFile::getNumUnMappedReadsFromIndex(int32_t refID)
00806 {
00807     // The bam index must have already been read.
00808     if(myBamIndex == NULL)
00809     {
00810         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00811                            "Cannot get num unmapped reads from the index until it has been read.");
00812         return(false);
00813     }
00814     return(myBamIndex->getNumUnMappedReads(refID));
00815 }
00816 
00817 
00818 // Get the number of mapped reads in the specified reference id.  
00819 // Returns -1 for out of range references.
00820 int32_t SamFile::getNumMappedReadsFromIndex(const char* refName,
00821                                             SamFileHeader& header)
00822 {
00823     // The bam index must have already been read.
00824     if(myBamIndex == NULL)
00825     {
00826         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00827                            "Cannot get num mapped reads from the index until it has been read.");
00828         return(false);
00829     }
00830     int32_t refID = BamIndex::REF_ID_UNMAPPED;
00831     if((strcmp(refName, "") != 0) && (strcmp(refName, "*") != 0))
00832     {
00833         // Reference name specified, so read just the "-1" entries.
00834         refID =  header.getReferenceID(refName);
00835     }
00836     return(myBamIndex->getNumMappedReads(refID));
00837 }
00838 
00839 
00840 // Get the number of unmapped reads in the specified reference id.  
00841 // Returns -1 for out of range refIDs.
00842 int32_t SamFile::getNumUnMappedReadsFromIndex(const char* refName,
00843                                               SamFileHeader& header)
00844 {
00845     // The bam index must have already been read.
00846     if(myBamIndex == NULL)
00847     {
00848         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00849                            "Cannot get num unmapped reads from the index until it has been read.");
00850         return(false);
00851     }
00852     int32_t refID = BamIndex::REF_ID_UNMAPPED;
00853     if((strcmp(refName, "") != 0) && (strcmp(refName, "*") != 0))
00854     {
00855         // Reference name specified, so read just the "-1" entries.
00856         refID =  header.getReferenceID(refName);
00857     }
00858     return(myBamIndex->getNumUnMappedReads(refID));
00859 }
00860 
00861 
00862 // Returns the number of bases in the passed in read that overlap the
00863 // region that is currently set.
00864 uint32_t SamFile::GetNumOverlaps(SamRecord& samRecord)
00865 {
00866     if(myRefPtr != NULL)
00867     {
00868         samRecord.setReference(myRefPtr);
00869     }
00870     samRecord.setSequenceTranslation(myReadTranslation);
00871 
00872     // Get the overlaps in the sam record for the region currently set
00873     // for this file.
00874     return(samRecord.getNumOverlaps(myStartPos, myEndPos));
00875 }
00876 
00877 
00878 void SamFile::GenerateStatistics(bool genStats)
00879 {
00880     if(genStats)
00881     {
00882         if(myStatistics == NULL)
00883         {
00884             // Want to generate statistics, but do not yet have the
00885             // structure for them, so create one.
00886             myStatistics = new SamStatistics();
00887         }
00888     }
00889     else
00890     {
00891         // Do not generate statistics, so if myStatistics is not NULL, 
00892         // delete it.
00893         if(myStatistics != NULL)
00894         {
00895             delete myStatistics;
00896             myStatistics = NULL;
00897         }
00898     }
00899 
00900 }
00901 
00902 
00903 const BamIndex* SamFile::GetBamIndex()
00904 {
00905     return(myBamIndex);
00906 }
00907 
00908 
00909 // initialize.
00910 void SamFile::init()
00911 {
00912     myFilePtr = NULL;
00913     myInterfacePtr = NULL;
00914     myStatistics = NULL;
00915     myBamIndex = NULL;
00916     myRefPtr = NULL;
00917     myReadTranslation = SamRecord::NONE;
00918     myWriteTranslation = SamRecord::NONE;
00919     myAttemptRecovery = false;
00920     myRequiredFlags = 0;
00921     myExcludedFlags = 0;
00922 }
00923 
00924 
00925 void SamFile::init(const char* filename, OpenType mode, SamFileHeader* header)
00926 {
00927     init();
00928         
00929     resetFile();
00930 
00931     bool openStatus = true;
00932     if(mode == READ)
00933     {
00934         // open the file for read.
00935         openStatus = OpenForRead(filename, header);
00936     }
00937     else
00938     {
00939         // open the file for write.
00940         openStatus = OpenForWrite(filename, header);
00941     }
00942     if(!openStatus)
00943     {
00944         // Failed to open the file - print error and abort.
00945         fprintf(stderr, "%s\n", GetStatusMessage());
00946         std::cerr << "FAILURE - EXITING!!!" << std::endl;
00947         exit(-1);
00948     }
00949 }
00950 
00951 
00952 // Reset variables for each file.
00953 void SamFile::resetFile()
00954 {
00955     // Close the file.
00956     if (myFilePtr != NULL)
00957     {
00958         // If we already have an open file, close it.
00959         ifclose(myFilePtr);
00960         myFilePtr = NULL;
00961     }
00962     if(myInterfacePtr != NULL)
00963     {
00964         delete myInterfacePtr;
00965         myInterfacePtr = NULL;
00966     }
00967 
00968     myIsOpenForRead = false;
00969     myIsOpenForWrite = false;
00970     myHasHeader = false;
00971     mySortedType = UNSORTED;
00972     myPrevReadName.Clear();
00973     myPrevCoord = -1;
00974     myPrevRefID = 0;
00975     myRecordCount = 0;
00976     myStatus = SamStatus::SUCCESS;
00977 
00978     // Reset indexed bam values.
00979     myIsBamOpenForRead = false;
00980     myRefID = BamIndex::REF_ID_ALL;
00981     myStartPos = -1;
00982     myEndPos = -1;
00983     myNewSection = false;
00984     myOverlapSection = true;
00985     myCurrentChunkEnd = 0;
00986     myChunksToRead.clear();
00987     if(myBamIndex != NULL)
00988     {
00989         delete myBamIndex;
00990         myBamIndex = NULL;
00991     }
00992 
00993     // If statistics are being generated, reset them.
00994     if(myStatistics != NULL)
00995     {
00996         myStatistics->reset();
00997     }
00998 
00999     myRefName.clear();
01000 }
01001 
01002 
01003 // Validate that the record is sorted compared to the previously read record
01004 // if there is one, according to the specified sort order.
01005 // If the sort order is UNSORTED, true is returned.
01006 bool SamFile::validateSortOrder(SamRecord& record, SamFileHeader& header)
01007 {
01008     if(myRefPtr != NULL)
01009     {
01010         record.setReference(myRefPtr);
01011     }
01012     record.setSequenceTranslation(myReadTranslation);
01013 
01014     bool status = false;
01015     if(mySortedType == UNSORTED)
01016     {
01017         // Unsorted, so nothing to validate, just return true.
01018         status = true;
01019     }
01020     else 
01021     {
01022         // Check to see if mySortedType is based on the header.
01023         if(mySortedType == FLAG)
01024         {
01025             // Determine the sorted type from what was read out of the header.
01026             mySortedType = getSortOrderFromHeader(header);
01027         }
01028 
01029         if(mySortedType == QUERY_NAME)
01030         {
01031             // Validate that it is sorted by query name.
01032             // Get the query name from the record.
01033             const char* readName = record.getReadName();
01034 
01035             // Check if it is sorted either in samtools way or picard's way.
01036             if((myPrevReadName.Compare(readName) > 0) &&
01037                (strcmp(myPrevReadName.c_str(), readName) > 0))
01038             {
01039                 // return false.
01040                 String errorMessage = "ERROR: File is not sorted by read name at record ";
01041                 errorMessage += myRecordCount;
01042                 errorMessage += "\n\tPrevious record was ";
01043                 errorMessage += myPrevReadName;
01044                 errorMessage += ", but this record is ";
01045                 errorMessage += readName;
01046                 myStatus.setStatus(SamStatus::INVALID_SORT, 
01047                                    errorMessage.c_str());
01048                 status = false;
01049             }
01050             else
01051             {
01052                 myPrevReadName = readName;
01053                 status = true;
01054             }
01055         }
01056         else 
01057         {
01058             // Validate that it is sorted by COORDINATES.
01059             // Get the leftmost coordinate and the reference index.
01060             int32_t refID = record.getReferenceID();
01061             int32_t coord = record.get0BasedPosition();
01062             // The unmapped reference id is at the end of a sorted file.
01063             if(refID == BamIndex::REF_ID_UNMAPPED)
01064             {
01065                 // A new reference ID that is for the unmapped reads
01066                 // is always valid.
01067                 status = true;
01068                 myPrevRefID = refID;
01069                 myPrevCoord = coord;
01070             }
01071             else if(myPrevRefID == BamIndex::REF_ID_UNMAPPED)
01072             {
01073                 // Previous reference ID was for unmapped reads, but the
01074                 // current one is not, so this is not sorted.
01075                 String errorMessage = "ERROR: File is not coordinate sorted at record ";
01076                 errorMessage += myRecordCount;
01077                 errorMessage += "\n\tPrevious record was unmapped, but this record is ";
01078                 errorMessage += header.getReferenceLabel(refID) + ":" + coord;
01079                 myStatus.setStatus(SamStatus::INVALID_SORT, 
01080                                    errorMessage.c_str());
01081                 status = false;
01082             }
01083             else if(refID < myPrevRefID)
01084             {
01085                 // Current reference id is less than the previous one, 
01086                 //meaning that it is not sorted.
01087                 String errorMessage = "ERROR: File is not coordinate sorted at record ";
01088                 errorMessage += myRecordCount;
01089                 errorMessage += "\n\tPrevious record was ";
01090                 errorMessage += header.getReferenceLabel(myPrevRefID) + ":" + myPrevCoord;
01091                 errorMessage += ", but this record is ";
01092                 errorMessage += header.getReferenceLabel(refID) + ":" + coord;
01093                 myStatus.setStatus(SamStatus::INVALID_SORT, 
01094                                    errorMessage.c_str());
01095                 status = false;
01096             }
01097             else
01098             {
01099                 // The reference IDs are in the correct order.
01100                 if(refID > myPrevRefID)
01101                 {
01102                     // New reference id, so set the previous coordinate to -1
01103                     myPrevCoord = -1;
01104                 }
01105             
01106                 // Check the coordinates.
01107                 if(coord < myPrevCoord)
01108                 {
01109                     // New Coord is less than the previous position.
01110                     String errorMessage = "ERROR: File is not coordinate sorted at record ";
01111                     errorMessage += myRecordCount;
01112                     errorMessage += "\n\tPreviousRecord was ";
01113                     errorMessage += header.getReferenceLabel(myPrevRefID) + ":" + myPrevCoord;
01114                     errorMessage += ", but this record is ";
01115                     errorMessage += header.getReferenceLabel(refID) + ":" + coord;
01116                     myStatus.setStatus(SamStatus::INVALID_SORT, 
01117                                        errorMessage.c_str());
01118                     status = false;
01119                 }
01120                 else
01121                 {
01122                     myPrevRefID = refID;
01123                     myPrevCoord = coord;
01124                     status = true;
01125                 }
01126             }
01127         }
01128     }
01129 
01130     return(status);
01131 }
01132 
01133 
01134 SamFile::SortedType SamFile::getSortOrderFromHeader(SamFileHeader& header)
01135 {
01136     const char* tag = header.getSortOrder();
01137    
01138     // Default to unsorted since if it is not specified in the header
01139     // that is the value that should be used.
01140     SortedType headerSortOrder = UNSORTED;
01141     if(strcmp(tag, "queryname") == 0)
01142     {
01143         headerSortOrder = QUERY_NAME;
01144     }
01145     else if(strcmp(tag, "coordinate") == 0)
01146     {
01147         headerSortOrder = COORDINATE;
01148     }
01149     return(headerSortOrder);
01150 }
01151 
01152 
01153  bool SamFile::ensureIndexedReadPosition()
01154  {
01155      // If no sections are specified, return true.
01156      if(myRefID == BamIndex::REF_ID_ALL)
01157      {
01158          return(true);
01159      }
01160 
01161      // Check to see if we have more to read out of the current chunk.
01162      // By checking the current position in relation to the current
01163      // end chunk.  If the current position is >= the end of the
01164      // current chunk, then we must see to the next chunk.
01165      uint64_t currentPos = iftell(myFilePtr);
01166      if(currentPos >= myCurrentChunkEnd)
01167      {
01168          // If there are no more chunks to process, return failure.
01169          if(myChunksToRead.empty())
01170          {
01171              myStatus = SamStatus::NO_MORE_RECS;
01172              return(false);
01173          }
01174          
01175          // There are more chunks left, so get the next chunk.
01176          Chunk newChunk = myChunksToRead.pop();
01177          
01178          // Move to the location of the new chunk if it is not adjacent
01179          // to the current chunk.
01180          if(newChunk.chunk_beg != currentPos)
01181          {
01182              // New chunk is not adjacent, so move to it.
01183              if(ifseek(myFilePtr, newChunk.chunk_beg, SEEK_SET) != true)
01184              {
01185                  // seek failed, cannot retrieve next record, return failure.
01186                  myStatus.setStatus(SamStatus::FAIL_IO, 
01187                                     "Failed to seek to the next record");
01188                  return(false);
01189              }
01190          }
01191          // Seek succeeded, set the end of the new chunk.
01192          myCurrentChunkEnd = newChunk.chunk_end;
01193      }
01194      return(true);
01195  }
01196 
01197 
01198 bool SamFile::checkRecordInSection(SamRecord& record)
01199 {
01200     bool recordFound = true;
01201     if(myRefID == BamIndex::REF_ID_ALL)
01202     {
01203         return(true);
01204     }
01205     // Check to see if it is in the correct reference/position.
01206     if(record.getReferenceID() != myRefID)
01207     {
01208         // Incorrect reference ID, return no more records.
01209         myStatus = SamStatus::NO_MORE_RECS;
01210         return(false);
01211     }
01212    
01213     // Found a record.
01214     recordFound = true;
01215 
01216     // If start/end position are set, verify that the alignment falls
01217     // within those.
01218     // If the alignment start is greater than the end of the region,
01219     // return NO_MORE_RECS.
01220     // Since myEndPos is Exclusive 0-based, anything >= myEndPos is outside
01221     // of the region.
01222     if((myEndPos != -1) && (record.get0BasedPosition() >= myEndPos))
01223     {
01224         myStatus = SamStatus::NO_MORE_RECS;
01225         return(false);
01226     }
01227         
01228     // We know the start is less than the end position, so the alignment
01229     // overlaps the region if the alignment end position is greater than the
01230     // start of the region.
01231     if((myStartPos != -1) && (record.get0BasedAlignmentEnd() < myStartPos))
01232     {
01233         // If it does not overlap the region, so go to the next
01234         // record...set recordFound back to false.
01235         recordFound = false;
01236     }
01237 
01238     if(!myOverlapSection)
01239     {
01240         // Needs to be fully contained.  Not fully contained if
01241         // 1) the record start position is < the region start position.
01242         // or
01243         // 2) the end position is specified and the record end position
01244         //    is greater than or equal to the region end position.
01245         //    (equal to since the region is exclusive.
01246         if((record.get0BasedPosition() < myStartPos) ||
01247            ((myEndPos != -1) && 
01248             (record.get0BasedAlignmentEnd() >= myEndPos)))
01249         {
01250             // This record is not fully contained, so move on to the next
01251             // record.
01252             recordFound = false;
01253         }
01254     }
01255 
01256     return(recordFound);
01257 }
01258    
01259 
01260 bool SamFile::processNewSection(SamFileHeader &header)
01261 {
01262     myNewSection = false;
01263 
01264     // If there is no index file, return failure.
01265     if(myBamIndex == NULL)
01266     {
01267         // No bam index has been read.
01268         myStatus.setStatus(SamStatus::FAIL_ORDER, 
01269                            "Cannot read section since there is no index file open");
01270         throw(std::runtime_error("SOFTWARE BUG: trying to read a BAM record by section prior to opening the BAM Index file."));
01271         return(false);
01272     }
01273 
01274     // If there is not a BAM file open for reading, return failure.
01275     if(!myIsBamOpenForRead)
01276     {
01277         // There is not a BAM file open for reading.
01278         myStatus.setStatus(SamStatus::FAIL_ORDER, 
01279                            "Cannot read section since there is no bam file open");
01280         throw(std::runtime_error("SOFTWARE BUG: trying to read a BAM record by section without opening a BAM file."));
01281         return(false);
01282     }
01283 
01284     if(myHasHeader == false)
01285     {
01286         // The header has not yet been read.
01287         myStatus.setStatus(SamStatus::FAIL_ORDER, 
01288                            "Cannot read record since the header has not been read.");
01289         throw(std::runtime_error("SOFTWARE BUG: trying to read a BAM record by section prior to opening the header."));
01290         return(false);
01291     }
01292 
01293     // Indexed Bam open for read, so disable read buffering because iftell
01294     // will be used.
01295     // Needs to be done here after we already know that the header has been
01296     // read.
01297     myFilePtr->disableBuffering();
01298 
01299     myChunksToRead.clear();
01300     // Reset the end of the current chunk.  We are resetting our read, so
01301     // we no longer have a "current chunk" that we are reading.
01302     myCurrentChunkEnd = 0;
01303 
01304     // Check to see if the read section was set based on the reference name
01305     // but not yet converted to reference id.
01306     if(!myRefName.empty())
01307     {
01308         myRefID = header.getReferenceID(myRefName.c_str());
01309         // Clear the myRefName length so this code is only executed once.
01310         myRefName.clear();
01311 
01312         // Check to see if a reference id was found.
01313         if(myRefID == SamReferenceInfo::NO_REF_ID)
01314         {
01315             myStatus = SamStatus::NO_MORE_RECS;
01316             return(false);
01317         }
01318     }
01319 
01320     // Get the chunks associated with this reference region.
01321     if(myBamIndex->getChunksForRegion(myRefID, myStartPos, myEndPos, 
01322                                       myChunksToRead) == true)
01323     {
01324         myStatus = SamStatus::SUCCESS;
01325     }
01326     else
01327     {
01328         String errorMsg = "Failed to get the specified region, refID = ";
01329         errorMsg += myRefID;
01330         errorMsg += "; startPos = ";
01331         errorMsg += myStartPos;
01332         errorMsg += "; endPos = ";
01333         errorMsg += myEndPos;
01334         myStatus.setStatus(SamStatus::FAIL_PARSE, 
01335                            errorMsg);
01336     }
01337     return(true);
01338 }
01339 
01340 //
01341 // When the caller to SamFile::ReadRecord() catches an
01342 // exception, it may choose to call this method to resync
01343 // on the underlying binary stream.
01344 //
01345 // Arguments: a callback function that will requires length bytes
01346 // of data to validate a record header.
01347 //
01348 // The expected use case is to re-sync on the next probably valid
01349 // BAM record, so that we can resume reading even after detecting
01350 // a corrupted BAM file.
01351 //
01352 bool SamFile::attemptRecoverySync(bool (*checkSignature)(void *data) , int length)
01353 {
01354     if(myFilePtr==NULL) return false;
01355     // non-recovery aware objects will just return false:
01356     return myFilePtr->attemptRecoverySync(checkSignature, length);
01357 }
01358 
01359 // Default Constructor.
01360 SamFileReader::SamFileReader()
01361     : SamFile()
01362 {
01363 }
01364 
01365 
01366 // Constructor that opens the specified file for read.
01367 SamFileReader::SamFileReader(const char* filename)
01368     : SamFile(filename, READ)
01369 {
01370 }
01371 
01372 
01373 
01374 
01375 // Constructor that opens the specified file for read.
01376 SamFileReader::SamFileReader(const char* filename,
01377                              ErrorHandler::HandlingType errorHandlingType)
01378     : SamFile(filename, READ, errorHandlingType)
01379 {
01380 }
01381 
01382 
01383 // Constructor that opens the specified file for read.
01384 SamFileReader::SamFileReader(const char* filename,
01385                              SamFileHeader* header)
01386     : SamFile(filename, READ, header)
01387 {
01388 }
01389 
01390 
01391 // Constructor that opens the specified file for read.
01392 SamFileReader::SamFileReader(const char* filename,
01393                              ErrorHandler::HandlingType errorHandlingType,
01394                              SamFileHeader* header)
01395     : SamFile(filename, READ, errorHandlingType, header)
01396 {
01397 }
01398 
01399 
01400 SamFileReader::~SamFileReader()
01401 {
01402 }
01403 
01404 
01405 // Default Constructor.
01406 SamFileWriter::SamFileWriter()
01407     : SamFile()
01408 {
01409 }
01410 
01411 
01412 // Constructor that opens the specified file for write.
01413 SamFileWriter::SamFileWriter(const char* filename)
01414     : SamFile(filename, WRITE)
01415 {
01416 }
01417 
01418 
01419 // Constructor that opens the specified file for write.
01420 SamFileWriter::SamFileWriter(const char* filename,
01421                              ErrorHandler::HandlingType errorHandlingType)
01422     : SamFile(filename, WRITE, errorHandlingType)
01423 {
01424 }
01425 
01426 
01427 // Constructor that opens the specified file for write.
01428 SamFileWriter::SamFileWriter(const char* filename,
01429                              SamFileHeader* header)
01430     : SamFile(filename, WRITE, header)
01431 {
01432 }
01433 
01434 
01435 // Constructor that opens the specified file for write.
01436 SamFileWriter::SamFileWriter(const char* filename,
01437                              ErrorHandler::HandlingType errorHandlingType,
01438                              SamFileHeader* header)
01439     : SamFile(filename, WRITE, errorHandlingType, header)
01440 {
01441 }
01442 
01443 
01444 SamFileWriter::~SamFileWriter()
01445 {
01446 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends