SamFile.cpp

00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 #include <stdexcept>
00018 #include "SamFile.h"
00019 #include "SamFileHeader.h"
00020 #include "SamRecord.h"
00021 #include "BamInterface.h"
00022 #include "SamInterface.h"
00023 #include "BgzfFileType.h"
00024 
00025 // Constructor, init variables.
00026 SamFile::SamFile()
00027     : myStatus()
00028 {
00029     init();
00030     resetFile();
00031 }
00032 
00033 
00034 // Constructor, init variables.
00035 SamFile::SamFile(ErrorHandler::HandlingType errorHandlingType)
00036     : myStatus(errorHandlingType)
00037 {
00038     init();
00039     resetFile();
00040 }
00041 
00042 
00043 // Constructor, init variables and open the specified file based on the
00044 // specified mode (READ/WRITE).
00045 SamFile::SamFile(const char* filename, OpenType mode)
00046     : myStatus()
00047 {
00048     init(filename, mode, NULL);
00049 }
00050 
00051 
00052 // Constructor, init variables and open the specified file based on the
00053 // specified mode (READ/WRITE).  Default is READ..
00054 SamFile::SamFile(const char* filename, OpenType mode,
00055                  ErrorHandler::HandlingType errorHandlingType)
00056     : myStatus(errorHandlingType)
00057 {
00058     init(filename, mode, NULL);
00059 }
00060 
00061 
00062 // Constructor, init variables and open the specified file based on the
00063 // specified mode (READ/WRITE).
00064 SamFile::SamFile(const char* filename, OpenType mode, SamFileHeader* header)
00065     : myStatus()
00066 {
00067     init(filename, mode, header);
00068 }
00069 
00070 
00071 // Constructor, init variables and open the specified file based on the
00072 // specified mode (READ/WRITE).  Default is READ..
00073 SamFile::SamFile(const char* filename, OpenType mode,
00074                  ErrorHandler::HandlingType errorHandlingType, 
00075                  SamFileHeader* header)
00076     : myStatus(errorHandlingType)
00077 {
00078     init(filename, mode, header);
00079 }
00080 
00081 
00082 SamFile::~SamFile()
00083 {
00084     resetFile();
00085     if(myStatistics != NULL)
00086     {
00087         delete myStatistics;
00088     }
00089 }
00090 
00091 
00092 // Open a sam/bam file for reading with the specified filename.
00093 bool SamFile::OpenForRead(const char * filename, SamFileHeader* header)
00094 {
00095     // Reset for any previously operated on files.
00096     resetFile();
00097 
00098     int lastchar = 0;
00099 
00100     while (filename[lastchar] != 0) lastchar++;
00101 
00102     // If at least one character, check for '-'.
00103     if((lastchar >= 1) && (filename[0] == '-'))
00104     {
00105         // Read from stdin - determine type of file to read.
00106         // Determine if compressed bam.
00107         if(strcmp(filename, "-.bam") == 0)
00108         {
00109             // Compressed bam - open as bgzf.
00110             // -.bam is the filename, read compressed bam from stdin
00111             filename = "-";
00112 
00113             myFilePtr = new InputFile;
00114             // support recover mode - this switches in a reader
00115             // capable of recovering from bad BGZF compression blocks.
00116             myFilePtr->setAttemptRecovery(myAttemptRecovery);
00117             myFilePtr->openFile(filename, "rb", InputFile::BGZF);
00118 
00119             myInterfacePtr = new BamInterface;
00120 
00121             // Read the magic string.
00122             char magic[4];
00123             ifread(myFilePtr, magic, 4);
00124         }
00125         else if(strcmp(filename, "-.ubam") == 0)
00126         {
00127             // uncompressed BAM File.
00128             // -.ubam is the filename, read uncompressed bam from stdin.
00129             // uncompressed BAM is still compressed with BGZF, but using
00130             // compression level 0, so still open as BGZF since it has a
00131             // BGZF header.
00132             filename = "-";
00133 
00134             // Uncompressed, so do not require the eof block.
00135 #ifdef __ZLIB_AVAILABLE__
00136             BgzfFileType::setRequireEofBlock(false);
00137 #endif
00138             myFilePtr = ifopen(filename, "rb", InputFile::BGZF);
00139         
00140             myInterfacePtr = new BamInterface;
00141 
00142             // Read the magic string.
00143             char magic[4];
00144             ifread(myFilePtr, magic, 4);
00145         }
00146         else
00147         {
00148             // SAM File.
00149             // read sam from stdin
00150             filename = "-";
00151             myFilePtr = ifopen(filename, "rb", InputFile::UNCOMPRESSED);
00152             myInterfacePtr = new SamInterface;
00153         }
00154     }
00155     else
00156     {
00157         // Not from stdin.  Read the file to determine the type.
00158 
00159         myFilePtr = new InputFile;
00160 
00161         // support recovery mode - this conditionally enables a reader
00162         // capable of recovering from bad BGZF compression blocks.
00163         myFilePtr->setAttemptRecovery(myAttemptRecovery);
00164         bool rc = myFilePtr->openFile(filename, "rb", InputFile::DEFAULT);
00165 
00166         if (rc == false)
00167         {
00168             std::string errorMessage = "Failed to Open ";
00169             errorMessage += filename;
00170             errorMessage += " for reading";
00171             myStatus.setStatus(SamStatus::FAIL_IO, errorMessage.c_str());
00172             delete myFilePtr;
00173             myFilePtr = NULL;
00174             return(false);
00175         }
00176         
00177         char magic[4];
00178         ifread(myFilePtr, magic, 4);
00179         
00180         if (magic[0] == 'B' && magic[1] == 'A' && magic[2] == 'M' &&
00181             magic[3] == 1)
00182         {
00183             myInterfacePtr = new BamInterface;
00184             // Set that it is a bam file open for reading.  This is needed to
00185             // determine if an index file can be used.
00186             myIsBamOpenForRead = true;
00187         }
00188         else
00189         {
00190             // Not a bam, so rewind to the beginning of the file so it
00191             // can be read.
00192             ifrewind(myFilePtr);
00193             myInterfacePtr = new SamInterface;
00194         }
00195     }
00196 
00197     // File is open for reading.
00198     myIsOpenForRead = true;
00199 
00200     // Read the header if one was passed in.
00201     if(header != NULL)
00202     {
00203         return(ReadHeader(*header));
00204     }
00205 
00206     // Successfully opened the file.
00207     myStatus = SamStatus::SUCCESS;
00208     return(true);
00209 }
00210 
00211 
00212 // Open a sam/bam file for writing with the specified filename.
00213 bool SamFile::OpenForWrite(const char * filename, SamFileHeader* header)
00214 {
00215     // Reset for any previously operated on files.
00216     resetFile();
00217     
00218     int lastchar = 0;
00219     while (filename[lastchar] != 0) lastchar++;   
00220     if (lastchar >= 4 && 
00221         filename[lastchar - 4] == 'u' &&
00222         filename[lastchar - 3] == 'b' &&
00223         filename[lastchar - 2] == 'a' &&
00224         filename[lastchar - 1] == 'm')
00225     {
00226         // BAM File.
00227         // if -.ubam is the filename, write uncompressed bam to stdout
00228         if((lastchar == 6) && (filename[0] == '-') && (filename[1] == '.'))
00229         {
00230             filename = "-";
00231         }
00232 
00233         myFilePtr = ifopen(filename, "wb0", InputFile::BGZF);
00234 
00235         myInterfacePtr = new BamInterface;
00236     }
00237     else if (lastchar >= 3 && 
00238              filename[lastchar - 3] == 'b' &&
00239              filename[lastchar - 2] == 'a' &&
00240              filename[lastchar - 1] == 'm')
00241     {
00242         // BAM File.
00243         // if -.bam is the filename, write compressed bam to stdout
00244         if((lastchar == 5) && (filename[0] == '-') && (filename[1] == '.'))
00245         {
00246             filename = "-";
00247         }
00248         myFilePtr = ifopen(filename, "wb", InputFile::BGZF);
00249         
00250         myInterfacePtr = new BamInterface;
00251     }
00252     else
00253     {
00254         // SAM File
00255         // if - (followed by anything is the filename,
00256         // write uncompressed sam to stdout
00257         if((lastchar >= 1) && (filename[0] == '-'))
00258         {
00259             filename = "-";
00260         }
00261         myFilePtr = ifopen(filename, "wb", InputFile::UNCOMPRESSED);
00262    
00263         myInterfacePtr = new SamInterface;
00264     }
00265 
00266     if (myFilePtr == NULL)
00267     {
00268         std::string errorMessage = "Failed to Open ";
00269         errorMessage += filename;
00270         errorMessage += " for writing";
00271         myStatus.setStatus(SamStatus::FAIL_IO, errorMessage.c_str());
00272         return(false);
00273     }
00274    
00275     myIsOpenForWrite = true;
00276 
00277     // Write the header if one was passed in.
00278     if(header != NULL)
00279     {
00280         return(WriteHeader(*header));
00281     }
00282 
00283     // Successfully opened the file.
00284     myStatus = SamStatus::SUCCESS;
00285     return(true);
00286 }
00287 
00288 
00289 // Read BAM Index file.
00290 bool SamFile::ReadBamIndex(const char* bamIndexFilename)
00291 {
00292     // Cleanup a previously setup index.
00293     if(myBamIndex != NULL)
00294     {
00295         delete myBamIndex;
00296         myBamIndex = NULL;
00297     }
00298 
00299     // Create a new bam index.
00300     myBamIndex = new BamIndex();
00301     SamStatus::Status indexStat = myBamIndex->readIndex(bamIndexFilename);
00302 
00303     if(indexStat != SamStatus::SUCCESS)
00304     {
00305         std::string errorMessage = "Failed to read the bam Index file: ";
00306         errorMessage += bamIndexFilename;
00307         myStatus.setStatus(indexStat, errorMessage.c_str());
00308         delete myBamIndex;
00309         myBamIndex = NULL;
00310         return(false);
00311     }
00312     myStatus = SamStatus::SUCCESS;
00313     return(true);
00314 }
00315 
00316 
00317 // Read BAM Index file.
00318 bool SamFile::ReadBamIndex()
00319 {
00320     if(myFilePtr == NULL)
00321     {
00322         // Can't read the bam index file because the BAM file has not yet been
00323         // opened, so we don't know the base filename for the index file.
00324         std::string errorMessage = "Failed to read the bam Index file -"
00325             " the BAM file needs to be read first in order to determine"
00326             " the index filename.";
00327         myStatus.setStatus(SamStatus::FAIL_ORDER, errorMessage.c_str());
00328         return(false);
00329     }
00330 
00331     const char* bamBaseName = myFilePtr->getFileName();
00332     
00333     std::string indexName = bamBaseName;
00334     indexName += ".bai";
00335 
00336     bool foundFile = true;
00337     try
00338     {
00339         if(ReadBamIndex(indexName.c_str()) == false)
00340         {
00341             foundFile = false;
00342         }
00343     }
00344     catch (std::exception& e)
00345     {
00346         foundFile = false;
00347     }
00348 
00349     // Check to see if the index file was found.
00350     if(!foundFile)
00351     {
00352         // Not found - try without the bam extension.
00353         // Locate the start of the bam extension
00354         size_t startExt = indexName.find(".bam");
00355         if(startExt == std::string::npos)
00356         {
00357             // Could not find the .bam extension, so just return false since the
00358             // call to ReadBamIndex set the status.
00359             return(false);
00360         }
00361         // Remove ".bam" and try reading the index again.
00362         indexName.erase(startExt,  4);
00363         return(ReadBamIndex(indexName.c_str()));
00364     }
00365     return(true);
00366 }
00367 
00368 
00369 // Sets the reference to the specified genome sequence object.
00370 void SamFile::SetReference(GenomeSequence* reference)
00371 {
00372     myRefPtr = reference;
00373 }
00374 
00375 
00376 // Set the type of sequence translation to use when reading the sequence.
00377 void SamFile::SetReadSequenceTranslation(SamRecord::SequenceTranslation translation)
00378 {
00379     myReadTranslation = translation;
00380 }
00381 
00382 
00383 // Set the type of sequence translation to use when writing the sequence.
00384 void SamFile::SetWriteSequenceTranslation(SamRecord::SequenceTranslation translation)
00385 {
00386     myWriteTranslation = translation;
00387 }
00388 
00389 // Close the file if there is one open.
00390 void SamFile::Close()
00391 {
00392     // Resetting the file will close it if it is open, and
00393     // will reset all other variables.
00394     resetFile();
00395 }
00396 
00397 
00398 // Returns whether or not the file has been opened.
00399 // return: int - true = open; false = not open.
00400 bool SamFile::IsOpen()
00401 {
00402     if (myFilePtr != NULL)
00403     {
00404         // File Pointer is set, so return if it is open.
00405         return(myFilePtr->isOpen());
00406     }
00407     // File pointer is not set, so return false, not open.
00408     return false;
00409 }
00410 
00411 
00412 // Returns whether or not the end of the file has been reached.
00413 // return: int - true = EOF; false = not eof.
00414 bool SamFile::IsEOF()
00415 {
00416     if (myFilePtr != NULL)
00417     {
00418         // File Pointer is set, so return if eof.
00419         return(ifeof(myFilePtr));
00420     }
00421     // File pointer is not set, so return true, eof.
00422     return true;
00423 }
00424 
00425 
00426 // Read the header from the currently opened file.
00427 bool SamFile::ReadHeader(SamFileHeader& header)
00428 {
00429     myStatus = SamStatus::SUCCESS;
00430     if(myIsOpenForRead == false)
00431     {
00432         // File is not open for read
00433         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00434                            "Cannot read header since the file is not open for reading");
00435         return(false);
00436     }
00437 
00438     if(myHasHeader == true)
00439     {
00440         // The header has already been read.
00441         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00442                            "Cannot read header since it has already been read.");
00443         return(false);
00444     }
00445 
00446     if(myInterfacePtr->readHeader(myFilePtr, header, myStatus))
00447     {
00448         // The header has now been successfully read.
00449         myHasHeader = true;
00450         return(true);
00451     }
00452     return(false);
00453 }
00454 
00455 
00456 // Write the header to the currently opened file.
00457 bool SamFile::WriteHeader(SamFileHeader& header)
00458 {
00459     myStatus = SamStatus::SUCCESS;
00460     if(myIsOpenForWrite == false)
00461     {
00462         // File is not open for write
00463         // -OR-
00464         // The header has already been written.
00465         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00466                            "Cannot write header since the file is not open for writing");
00467         return(false);
00468     }
00469 
00470     if(myHasHeader == true)
00471     {
00472         // The header has already been written.
00473         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00474                            "Cannot write header since it has already been written");
00475         return(false);
00476     }
00477 
00478     if(myInterfacePtr->writeHeader(myFilePtr, header, myStatus))
00479     {
00480         // The header has now been successfully written.
00481         myHasHeader = true;
00482         return(true);
00483     }
00484 
00485     // return the status.
00486     return(false);
00487 }
00488 
00489 
00490 // Read a record from the currently opened file.
00491 bool SamFile::ReadRecord(SamFileHeader& header, 
00492                          SamRecord& record)
00493 {
00494     myStatus = SamStatus::SUCCESS;
00495 
00496     if(myIsOpenForRead == false)
00497     {
00498         // File is not open for read
00499         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00500                            "Cannot read record since the file is not open for reading");
00501         throw(std::runtime_error("SOFTWARE BUG: trying to read a SAM/BAM record prior to opening the file."));
00502         return(false);
00503     }
00504 
00505     if(myHasHeader == false)
00506     {
00507         // The header has not yet been read.
00508         // TODO - maybe just read the header.
00509         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00510                            "Cannot read record since the header has not been read.");
00511         throw(std::runtime_error("SOFTWARE BUG: trying to read a SAM/BAM record prior to reading the header."));
00512         return(false);
00513     }
00514 
00515     // Check to see if a new region has been set.  If so, determine the
00516     // chunks for that region.
00517     if(myNewSection)
00518     {
00519         if(!processNewSection(header))
00520         {
00521             // Failed processing a new section.  Could be an 
00522             // order issue like the file not being open or the
00523             // indexed file not having been read.
00524             // processNewSection sets myStatus with the failure reason.
00525             return(false);
00526         }
00527     }
00528 
00529     // Read until a record is not successfully read or there are no more
00530     // requested records.
00531     while(myStatus == SamStatus::SUCCESS)
00532     {
00533         record.setReference(myRefPtr);
00534         record.setSequenceTranslation(myReadTranslation);
00535 
00536         // If reading by index, this method will setup to ensure it is in
00537         // the correct position for the next record (if not already there).
00538         // Sets myStatus if it could not move to a good section.
00539         // Just returns true if it is not setup to read by index.
00540         if(!ensureIndexedReadPosition())
00541         {
00542             // Either there are no more records in the section
00543             // or it failed to move to the right section, so there
00544             // is nothing more to read, stop looping.
00545             break;
00546         }
00547         
00548         // File is open for reading and the header has been read, so read the
00549         // next record.
00550         myInterfacePtr->readRecord(myFilePtr, header, record, myStatus);
00551         if(myStatus != SamStatus::SUCCESS)
00552         {
00553             // Failed to read the record, so break out of the loop.
00554             break;
00555         }
00556 
00557         // Successfully read a record, so check if we should filter it.
00558         // First check if it is out of the section.  Returns true
00559         // if not reading by sections, returns false if the record
00560         // is outside of the section.  Sets status to NO_MORE_RECS if
00561         // there is nothing left ot read in the section.
00562         if(!checkRecordInSection(record))
00563         {
00564             // The record is not in the section.
00565             // The while loop will detect if NO_MORE_RECS was set.
00566             continue;
00567         }
00568 
00569         // Check the flag for required/excluded flags.
00570         uint16_t flag = record.getFlag();
00571         if((flag & myRequiredFlags) != myRequiredFlags)
00572         {
00573             // The record does not conatain all required flags, so
00574             // continue looking.
00575             continue;
00576         }
00577         if((flag & myExcludedFlags) != 0)
00578         {
00579             // The record contains an excluded flag, so continue looking.
00580             continue;
00581         }
00582 
00583         //increment the record count.
00584         myRecordCount++;
00585         
00586         if(myStatistics != NULL)
00587         {
00588             // Statistics should be updated.
00589             myStatistics->updateStatistics(record);
00590         }
00591         
00592         // Successfully read the record, so check the sort order.
00593         if(!validateSortOrder(record, header))
00594         {
00595             // ValidateSortOrder sets the status on a failure.
00596             return(false);
00597         }
00598         return(true);
00599 
00600     } // End while loop that checks if a desired record is found or failure.
00601 
00602     // Return true if a record was found.
00603     return(myStatus == SamStatus::SUCCESS);
00604 }
00605 
00606 
00607 
00608 // Write a record to the currently opened file.
00609 bool SamFile::WriteRecord(SamFileHeader& header, 
00610                           SamRecord& record)
00611 {
00612     if(myIsOpenForWrite == false)
00613     {
00614         // File is not open for writing
00615         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00616                            "Cannot write record since the file is not open for writing");
00617         return(false);
00618     }
00619 
00620     if(myHasHeader == false)
00621     {
00622         // The header has not yet been written.
00623         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00624                            "Cannot write record since the header has not been written");
00625         return(false);
00626     }
00627 
00628     // Before trying to write the record, validate the sort order.
00629     if(!validateSortOrder(record, header))
00630     {
00631         // Not sorted like it is supposed to be, do not write the record
00632         myStatus.setStatus(SamStatus::INVALID_SORT, 
00633                            "Cannot write the record since the file is not properly sorted.");
00634         return(false);
00635     }
00636 
00637     if(myRefPtr != NULL)
00638     {
00639         record.setReference(myRefPtr);
00640     }
00641 
00642     // File is open for writing and the header has been written, so write the
00643     // record.
00644     myStatus = myInterfacePtr->writeRecord(myFilePtr, header, record,
00645                                            myWriteTranslation);
00646 
00647     if(myStatus == SamStatus::SUCCESS)
00648     {
00649         // A record was successfully written, so increment the record count.
00650         myRecordCount++;
00651         return(true);
00652     }
00653     return(false);
00654 }
00655 
00656 
00657 // Set the flag to validate that the file is sorted as it is read/written.
00658 // Must be called after the file has been opened.
00659 void SamFile::setSortedValidation(SortedType sortType)
00660 {
00661     mySortedType = sortType;
00662 }
00663 
00664 
00665 // Return the number of records that have been read/written so far.
00666 uint32_t SamFile::GetCurrentRecordCount()
00667 {
00668     return(myRecordCount);
00669 }
00670 
00671 
00672 // Sets what part of the SamFile should be read.
00673 bool SamFile::SetReadSection(int32_t refID)
00674 {
00675     // No start/end specified, so set back to default -1.
00676     return(SetReadSection(refID, -1, -1));
00677 }
00678 
00679 
00680 
00681 // Sets what part of the SamFile should be read.
00682 bool SamFile::SetReadSection(const char* refName)
00683 {
00684     // No start/end specified, so set back to default -1.
00685     return(SetReadSection(refName, -1, -1));
00686 }
00687 
00688 
00689 // Sets what part of the BAM file should be read.
00690 bool SamFile::SetReadSection(int32_t refID, int32_t start, int32_t end, 
00691                              bool overlap)
00692 {
00693     // If there is not a BAM file open for reading, return failure.
00694     // Opening a new file clears the read section, so it must be
00695     // set after the file is opened.
00696     if(!myIsBamOpenForRead)
00697     {
00698         // There is not a BAM file open for reading.
00699         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00700                            "Cannot set section since there is no bam file open");
00701         return(false);
00702     }
00703 
00704     myNewSection = true;
00705     myOverlapSection = overlap;
00706     myStartPos = start;
00707     myEndPos = end;
00708     myRefID = refID;
00709     myRefName.clear();
00710     myChunksToRead.clear();
00711     // Reset the end of the current chunk.  We are resetting our read, so
00712     // we no longer have a "current chunk" that we are reading.
00713     myCurrentChunkEnd = 0;
00714     myStatus = SamStatus::SUCCESS;
00715 
00716     // Reset the sort order criteria since we moved around in the file.    
00717     myPrevCoord = -1;
00718     myPrevRefID = 0;
00719     myPrevReadName.clear();
00720 
00721     return(true);
00722 }
00723 
00724 
00725 // Sets what part of the BAM file should be read.
00726 bool SamFile::SetReadSection(const char* refName, int32_t start, int32_t end,
00727                              bool overlap)
00728 {
00729     // If there is not a BAM file open for reading, return failure.
00730     // Opening a new file clears the read section, so it must be
00731     // set after the file is opened.
00732     if(!myIsBamOpenForRead)
00733     {
00734         // There is not a BAM file open for reading.
00735         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00736                            "Cannot set section since there is no bam file open");
00737         return(false);
00738     }
00739 
00740     myNewSection = true;
00741     myOverlapSection = overlap;
00742     myStartPos = start;
00743     myEndPos = end;
00744     if((strcmp(refName, "") == 0) || (strcmp(refName, "*") == 0))
00745     {
00746         // No Reference name specified, so read just the "-1" entries.
00747         myRefID = BamIndex::REF_ID_UNMAPPED;
00748     }
00749     else
00750     {
00751         // save the reference name and revert the reference ID to unknown
00752         // so it will be calculated later.
00753         myRefName = refName;
00754         myRefID = BamIndex::REF_ID_ALL;
00755     }
00756     myChunksToRead.clear();
00757     // Reset the end of the current chunk.  We are resetting our read, so
00758     // we no longer have a "current chunk" that we are reading.
00759     myCurrentChunkEnd = 0;
00760     myStatus = SamStatus::SUCCESS;
00761     
00762     // Reset the sort order criteria since we moved around in the file.    
00763     myPrevCoord = -1;
00764     myPrevRefID = 0;
00765     myPrevReadName.clear();
00766 
00767     return(true);
00768 }
00769 
00770 
00771 void SamFile::SetReadFlags(uint16_t requiredFlags, uint16_t excludedFlags)
00772 {
00773     myRequiredFlags = requiredFlags;
00774     myExcludedFlags = excludedFlags;
00775 }
00776 
00777 
00778 // Get the number of mapped reads in the specified reference id.  
00779 // Returns -1 for out of range refIDs.
00780 int32_t SamFile::getNumMappedReadsFromIndex(int32_t refID)
00781 {
00782     // The bam index must have already been read.
00783     if(myBamIndex == NULL)
00784     {
00785         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00786                            "Cannot get num mapped reads from the index until it has been read.");
00787         return(false);
00788     }
00789     return(myBamIndex->getNumMappedReads(refID));
00790 }
00791 
00792 
00793 // Get the number of unmapped reads in the specified reference id.  
00794 // Returns -1 for out of range refIDs.
00795 int32_t SamFile::getNumUnMappedReadsFromIndex(int32_t refID)
00796 {
00797     // The bam index must have already been read.
00798     if(myBamIndex == NULL)
00799     {
00800         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00801                            "Cannot get num unmapped reads from the index until it has been read.");
00802         return(false);
00803     }
00804     return(myBamIndex->getNumUnMappedReads(refID));
00805 }
00806 
00807 
00808 // Get the number of mapped reads in the specified reference id.  
00809 // Returns -1 for out of range references.
00810 int32_t SamFile::getNumMappedReadsFromIndex(const char* refName,
00811                                             SamFileHeader& header)
00812 {
00813     // The bam index must have already been read.
00814     if(myBamIndex == NULL)
00815     {
00816         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00817                            "Cannot get num mapped reads from the index until it has been read.");
00818         return(false);
00819     }
00820     int32_t refID = BamIndex::REF_ID_UNMAPPED;
00821     if((strcmp(refName, "") != 0) && (strcmp(refName, "*") != 0))
00822     {
00823         // Reference name specified, so read just the "-1" entries.
00824         refID =  header.getReferenceID(refName);
00825     }
00826     return(myBamIndex->getNumMappedReads(refID));
00827 }
00828 
00829 
00830 // Get the number of unmapped reads in the specified reference id.  
00831 // Returns -1 for out of range refIDs.
00832 int32_t SamFile::getNumUnMappedReadsFromIndex(const char* refName,
00833                                               SamFileHeader& header)
00834 {
00835     // The bam index must have already been read.
00836     if(myBamIndex == NULL)
00837     {
00838         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00839                            "Cannot get num unmapped reads from the index until it has been read.");
00840         return(false);
00841     }
00842     int32_t refID = BamIndex::REF_ID_UNMAPPED;
00843     if((strcmp(refName, "") != 0) && (strcmp(refName, "*") != 0))
00844     {
00845         // Reference name specified, so read just the "-1" entries.
00846         refID =  header.getReferenceID(refName);
00847     }
00848     return(myBamIndex->getNumUnMappedReads(refID));
00849 }
00850 
00851 
00852 // Returns the number of bases in the passed in read that overlap the
00853 // region that is currently set.
00854 uint32_t SamFile::GetNumOverlaps(SamRecord& samRecord)
00855 {
00856     if(myRefPtr != NULL)
00857     {
00858         samRecord.setReference(myRefPtr);
00859     }
00860     samRecord.setSequenceTranslation(myReadTranslation);
00861 
00862     // Get the overlaps in the sam record for the region currently set
00863     // for this file.
00864     return(samRecord.getNumOverlaps(myStartPos, myEndPos));
00865 }
00866 
00867 
00868 void SamFile::GenerateStatistics(bool genStats)
00869 {
00870     if(genStats)
00871     {
00872         if(myStatistics == NULL)
00873         {
00874             // Want to generate statistics, but do not yet have the
00875             // structure for them, so create one.
00876             myStatistics = new SamStatistics();
00877         }
00878     }
00879     else
00880     {
00881         // Do not generate statistics, so if myStatistics is not NULL, 
00882         // delete it.
00883         if(myStatistics != NULL)
00884         {
00885             delete myStatistics;
00886             myStatistics = NULL;
00887         }
00888     }
00889 
00890 }
00891 
00892 
00893 const BamIndex* SamFile::GetBamIndex()
00894 {
00895     return(myBamIndex);
00896 }
00897 
00898 
00899 // initialize.
00900 void SamFile::init()
00901 {
00902     myFilePtr = NULL;
00903     myInterfacePtr = NULL;
00904     myStatistics = NULL;
00905     myBamIndex = NULL;
00906     myRefPtr = NULL;
00907     myReadTranslation = SamRecord::NONE;
00908     myWriteTranslation = SamRecord::NONE;
00909     myAttemptRecovery = false;
00910     myRequiredFlags = 0;
00911     myExcludedFlags = 0;
00912 }
00913 
00914 
00915 void SamFile::init(const char* filename, OpenType mode, SamFileHeader* header)
00916 {
00917     init();
00918         
00919     resetFile();
00920 
00921     bool openStatus = true;
00922     if(mode == READ)
00923     {
00924         // open the file for read.
00925         openStatus = OpenForRead(filename, header);
00926     }
00927     else
00928     {
00929         // open the file for write.
00930         openStatus = OpenForWrite(filename, header);
00931     }
00932     if(!openStatus)
00933     {
00934         // Failed to open the file - print error and abort.
00935         fprintf(stderr, "%s\n", GetStatusMessage());
00936         std::cerr << "FAILURE - EXITING!!!" << std::endl;
00937         exit(-1);
00938     }
00939 }
00940 
00941 
00942 // Reset variables for each file.
00943 void SamFile::resetFile()
00944 {
00945     // Close the file.
00946     if (myFilePtr != NULL)
00947     {
00948         // If we already have an open file, close it.
00949         ifclose(myFilePtr);
00950         myFilePtr = NULL;
00951     }
00952     if(myInterfacePtr != NULL)
00953     {
00954         delete myInterfacePtr;
00955         myInterfacePtr = NULL;
00956     }
00957 
00958     myIsOpenForRead = false;
00959     myIsOpenForWrite = false;
00960     myHasHeader = false;
00961     mySortedType = UNSORTED;
00962     myPrevReadName.clear();
00963     myPrevCoord = -1;
00964     myPrevRefID = 0;
00965     myRecordCount = 0;
00966     myStatus = SamStatus::SUCCESS;
00967 
00968     // Reset indexed bam values.
00969     myIsBamOpenForRead = false;
00970     myRefID = BamIndex::REF_ID_ALL;
00971     myStartPos = -1;
00972     myEndPos = -1;
00973     myNewSection = false;
00974     myOverlapSection = true;
00975     myCurrentChunkEnd = 0;
00976     myChunksToRead.clear();
00977     if(myBamIndex != NULL)
00978     {
00979         delete myBamIndex;
00980         myBamIndex = NULL;
00981     }
00982 
00983     // If statistics are being generated, reset them.
00984     if(myStatistics != NULL)
00985     {
00986         myStatistics->reset();
00987     }
00988 
00989     myRefName.clear();
00990 }
00991 
00992 
00993 // Validate that the record is sorted compared to the previously read record
00994 // if there is one, according to the specified sort order.
00995 // If the sort order is UNSORTED, true is returned.
00996 bool SamFile::validateSortOrder(SamRecord& record, SamFileHeader& header)
00997 {
00998     if(myRefPtr != NULL)
00999     {
01000         record.setReference(myRefPtr);
01001     }
01002     record.setSequenceTranslation(myReadTranslation);
01003 
01004     bool status = false;
01005     if(mySortedType == UNSORTED)
01006     {
01007         // Unsorted, so nothing to validate, just return true.
01008         status = true;
01009     }
01010     else 
01011     {
01012         // Check to see if mySortedType is based on the header.
01013         if(mySortedType == FLAG)
01014         {
01015             // Determine the sorted type from what was read out of the header.
01016             mySortedType = getSortOrderFromHeader(header);
01017         }
01018 
01019         if(mySortedType == QUERY_NAME)
01020         {
01021             // Validate that it is sorted by query name.
01022             // Get the query name from the record.
01023             const char* readName = record.getReadName();
01024             if(myPrevReadName.compare(readName) > 0)
01025             {
01026                 // The previous name is greater than the new record's name, so
01027                 // return false.
01028                 String errorMessage = "ERROR: File is not sorted at record ";
01029                 errorMessage += myRecordCount;
01030                 myStatus.setStatus(SamStatus::INVALID_SORT, 
01031                                    errorMessage.c_str());
01032                 status = false;
01033             }
01034             else
01035             {
01036                 myPrevReadName = readName;
01037                 status = true;
01038             }
01039         }
01040         else 
01041         {
01042             // Validate that it is sorted by COORDINATES.
01043             // Get the leftmost coordinate and the reference index.
01044             int32_t refID = record.getReferenceID();
01045             int32_t coord = record.get0BasedPosition();
01046             // The unmapped reference id is at the end of a sorted file.
01047             if(refID == BamIndex::REF_ID_UNMAPPED)
01048             {
01049                 // A new reference ID that is for the unmapped reads
01050                 // is always valid.
01051                 status = true;
01052                 myPrevRefID = refID;
01053                 myPrevCoord = coord;
01054             }
01055             else if(myPrevRefID == BamIndex::REF_ID_UNMAPPED)
01056             {
01057                 // Previous reference ID was for unmapped reads, but the
01058                 // current one is not, so this is not sorted.
01059                 String errorMessage = "ERROR: File is not sorted at record ";
01060                 errorMessage += myRecordCount;
01061                 myStatus.setStatus(SamStatus::INVALID_SORT, 
01062                                    errorMessage.c_str());
01063                 status = false;
01064             }
01065             else if(refID < myPrevRefID)
01066             {
01067                 // Current reference id is less than the previous one, 
01068                 //meaning that it is not sorted.
01069                 String errorMessage = "ERROR: File is not sorted at record ";
01070                 errorMessage += myRecordCount;
01071                 myStatus.setStatus(SamStatus::INVALID_SORT, 
01072                                    errorMessage.c_str());
01073                 status = false;
01074             }
01075             else
01076             {
01077                 // The reference IDs are in the correct order.
01078                 if(refID > myPrevRefID)
01079                 {
01080                     // New reference id, so set the previous coordinate to -1
01081                     myPrevCoord = -1;
01082                 }
01083             
01084                 // Check the coordinates.
01085                 if(coord < myPrevCoord)
01086                 {
01087                     // New Coord is less than the previous position.
01088                     String errorMessage = "ERROR: File is not sorted at record ";
01089                     errorMessage += myRecordCount;
01090                     myStatus.setStatus(SamStatus::INVALID_SORT, 
01091                                        errorMessage.c_str());
01092                     status = false;
01093                 }
01094                 else
01095                 {
01096                     myPrevRefID = refID;
01097                     myPrevCoord = coord;
01098                     status = true;
01099                 }
01100             }
01101         }
01102     }
01103 
01104     return(status);
01105 }
01106 
01107 
01108 SamFile::SortedType SamFile::getSortOrderFromHeader(SamFileHeader& header)
01109 {
01110     const char* tag = header.getSortOrder();
01111    
01112     // Default to unsorted since if it is not specified in the header
01113     // that is the value that should be used.
01114     SortedType headerSortOrder = UNSORTED;
01115     if(strcmp(tag, "queryname") == 0)
01116     {
01117         headerSortOrder = QUERY_NAME;
01118     }
01119     else if(strcmp(tag, "coordinate") == 0)
01120     {
01121         headerSortOrder = COORDINATE;
01122     }
01123     return(headerSortOrder);
01124 }
01125 
01126 
01127  bool SamFile::ensureIndexedReadPosition()
01128  {
01129      // If no sections are specified, return true.
01130      if(myRefID == BamIndex::REF_ID_ALL)
01131      {
01132          return(true);
01133      }
01134 
01135      // Check to see if we have more to read out of the current chunk.
01136      // By checking the current position in relation to the current
01137      // end chunk.  If the current position is >= the end of the
01138      // current chunk, then we must see to the next chunk.
01139      uint64_t currentPos = iftell(myFilePtr);
01140      if(currentPos >= myCurrentChunkEnd)
01141      {
01142          // If there are no more chunks to process, return failure.
01143          if(myChunksToRead.empty())
01144          {
01145              myStatus = SamStatus::NO_MORE_RECS;
01146              return(false);
01147          }
01148          
01149          // There are more chunks left, so get the next chunk.
01150          Chunk newChunk = myChunksToRead.pop();
01151          
01152          // Move to the location of the new chunk if it is not adjacent
01153          // to the current chunk.
01154          if(newChunk.chunk_beg != currentPos)
01155          {
01156              // New chunk is not adjacent, so move to it.
01157              if(ifseek(myFilePtr, newChunk.chunk_beg, SEEK_SET) != true)
01158              {
01159                  // seek failed, cannot retrieve next record, return failure.
01160                  myStatus.setStatus(SamStatus::FAIL_IO, 
01161                                     "Failed to seek to the next record");
01162                  return(false);
01163              }
01164          }
01165          // Seek succeeded, set the end of the new chunk.
01166          myCurrentChunkEnd = newChunk.chunk_end;
01167      }
01168      return(true);
01169  }
01170 
01171 
01172 bool SamFile::checkRecordInSection(SamRecord& record)
01173 {
01174     bool recordFound = true;
01175     if(myRefID == BamIndex::REF_ID_ALL)
01176     {
01177         return(true);
01178     }
01179     // Check to see if it is in the correct reference/position.
01180     if(record.getReferenceID() != myRefID)
01181     {
01182         // Incorrect reference ID, return no more records.
01183         myStatus = SamStatus::NO_MORE_RECS;
01184         return(false);
01185     }
01186    
01187     // Found a record.
01188     recordFound = true;
01189 
01190     // If start/end position are set, verify that the alignment falls
01191     // within those.
01192     // If the alignment start is greater than the end of the region,
01193     // return NO_MORE_RECS.
01194     // Since myEndPos is Exclusive 0-based, anything >= myEndPos is outside
01195     // of the region.
01196     if((myEndPos != -1) && (record.get0BasedPosition() >= myEndPos))
01197     {
01198         myStatus = SamStatus::NO_MORE_RECS;
01199         return(false);
01200     }
01201         
01202     // We know the start is less than the end position, so the alignment
01203     // overlaps the region if the alignment end position is greater than the
01204     // start of the region.
01205     if((myStartPos != -1) && (record.get0BasedAlignmentEnd() < myStartPos))
01206     {
01207         // If it does not overlap the region, so go to the next
01208         // record...set recordFound back to false.
01209         recordFound = false;
01210     }
01211 
01212     if(!myOverlapSection)
01213     {
01214         // Needs to be fully contained.  Not fully contained if
01215         // 1) the record start position is < the region start position.
01216         // or
01217         // 2) the end position is specified and the record end position
01218         //    is greater than or equal to the region end position.
01219         //    (equal to since the region is exclusive.
01220         if((record.get0BasedPosition() < myStartPos) ||
01221            ((myEndPos != -1) && 
01222             (record.get0BasedAlignmentEnd() >= myEndPos)))
01223         {
01224             // This record is not fully contained, so move on to the next
01225             // record.
01226             recordFound = false;
01227         }
01228     }
01229 
01230     return(recordFound);
01231 }
01232    
01233 
01234 bool SamFile::processNewSection(SamFileHeader &header)
01235 {
01236     myNewSection = false;
01237 
01238     // If there is no index file, return failure.
01239     if(myBamIndex == NULL)
01240     {
01241         // No bam index has been read.
01242         myStatus.setStatus(SamStatus::FAIL_ORDER, 
01243                            "Cannot read section since there is no index file open");
01244         throw(std::runtime_error("SOFTWARE BUG: trying to read a BAM record by section prior to opening the BAM Index file."));
01245         return(false);
01246     }
01247 
01248     // If there is not a BAM file open for reading, return failure.
01249     if(!myIsBamOpenForRead)
01250     {
01251         // There is not a BAM file open for reading.
01252         myStatus.setStatus(SamStatus::FAIL_ORDER, 
01253                            "Cannot read section since there is no bam file open");
01254         throw(std::runtime_error("SOFTWARE BUG: trying to read a BAM record by section without opening a BAM file."));
01255         return(false);
01256     }
01257 
01258     if(myHasHeader == false)
01259     {
01260         // The header has not yet been read.
01261         myStatus.setStatus(SamStatus::FAIL_ORDER, 
01262                            "Cannot read record since the header has not been read.");
01263         throw(std::runtime_error("SOFTWARE BUG: trying to read a BAM record by section prior to opening the header."));
01264         return(false);
01265     }
01266 
01267     // Indexed Bam open for read, so disable read buffering because iftell
01268     // will be used.
01269     // Needs to be done here after we already know that the header has been
01270     // read.
01271     myFilePtr->disableBuffering();
01272 
01273     myChunksToRead.clear();
01274     // Reset the end of the current chunk.  We are resetting our read, so
01275     // we no longer have a "current chunk" that we are reading.
01276     myCurrentChunkEnd = 0;
01277 
01278     // Check to see if the read section was set based on the reference name
01279     // but not yet converted to reference id.
01280     if(!myRefName.empty())
01281     {
01282         myRefID = header.getReferenceID(myRefName.c_str());
01283         // Clear the myRefName length so this code is only executed once.
01284         myRefName.clear();
01285 
01286         // Check to see if a reference id was found.
01287         if(myRefID == SamReferenceInfo::NO_REF_ID)
01288         {
01289             myStatus = SamStatus::NO_MORE_RECS;
01290             return(false);
01291         }
01292     }
01293 
01294     // Get the chunks associated with this reference region.
01295     if(myBamIndex->getChunksForRegion(myRefID, myStartPos, myEndPos, 
01296                                       myChunksToRead) == true)
01297     {
01298         myStatus = SamStatus::SUCCESS;
01299     }
01300     else
01301     {
01302         String errorMsg = "Failed to get the specified region, refID = ";
01303         errorMsg += myRefID;
01304         errorMsg += "; startPos = ";
01305         errorMsg += myStartPos;
01306         errorMsg += "; endPos = ";
01307         errorMsg += myEndPos;
01308         myStatus.setStatus(SamStatus::FAIL_PARSE, 
01309                            errorMsg);
01310     }
01311     return(true);
01312 }
01313 
01314 //
01315 // When the caller to SamFile::ReadRecord() catches an
01316 // exception, it may choose to call this method to resync
01317 // on the underlying binary stream.
01318 //
01319 // Arguments: a callback function that will requires length bytes
01320 // of data to validate a record header.
01321 //
01322 // The expected use case is to re-sync on the next probably valid
01323 // BAM record, so that we can resume reading even after detecting
01324 // a corrupted BAM file.
01325 //
01326 bool SamFile::attemptRecoverySync(bool (*checkSignature)(void *data) , int length)
01327 {
01328     if(myFilePtr==NULL) return false;
01329     // non-recovery aware objects will just return false:
01330     return myFilePtr->attemptRecoverySync(checkSignature, length);
01331 }
01332 
01333 // Default Constructor.
01334 SamFileReader::SamFileReader()
01335     : SamFile()
01336 {
01337 }
01338 
01339 
01340 // Constructor that opens the specified file for read.
01341 SamFileReader::SamFileReader(const char* filename)
01342     : SamFile(filename, READ)
01343 {
01344 }
01345 
01346 
01347 
01348 
01349 // Constructor that opens the specified file for read.
01350 SamFileReader::SamFileReader(const char* filename,
01351                              ErrorHandler::HandlingType errorHandlingType)
01352     : SamFile(filename, READ, errorHandlingType)
01353 {
01354 }
01355 
01356 
01357 // Constructor that opens the specified file for read.
01358 SamFileReader::SamFileReader(const char* filename,
01359                              SamFileHeader* header)
01360     : SamFile(filename, READ, header)
01361 {
01362 }
01363 
01364 
01365 // Constructor that opens the specified file for read.
01366 SamFileReader::SamFileReader(const char* filename,
01367                              ErrorHandler::HandlingType errorHandlingType,
01368                              SamFileHeader* header)
01369     : SamFile(filename, READ, errorHandlingType, header)
01370 {
01371 }
01372 
01373 
01374 SamFileReader::~SamFileReader()
01375 {
01376 }
01377 
01378 
01379 // Default Constructor.
01380 SamFileWriter::SamFileWriter()
01381     : SamFile()
01382 {
01383 }
01384 
01385 
01386 // Constructor that opens the specified file for write.
01387 SamFileWriter::SamFileWriter(const char* filename)
01388     : SamFile(filename, WRITE)
01389 {
01390 }
01391 
01392 
01393 // Constructor that opens the specified file for write.
01394 SamFileWriter::SamFileWriter(const char* filename,
01395                              ErrorHandler::HandlingType errorHandlingType)
01396     : SamFile(filename, WRITE, errorHandlingType)
01397 {
01398 }
01399 
01400 
01401 // Constructor that opens the specified file for write.
01402 SamFileWriter::SamFileWriter(const char* filename,
01403                              SamFileHeader* header)
01404     : SamFile(filename, WRITE, header)
01405 {
01406 }
01407 
01408 
01409 // Constructor that opens the specified file for write.
01410 SamFileWriter::SamFileWriter(const char* filename,
01411                              ErrorHandler::HandlingType errorHandlingType,
01412                              SamFileHeader* header)
01413     : SamFile(filename, WRITE, errorHandlingType, header)
01414 {
01415 }
01416 
01417 
01418 SamFileWriter::~SamFileWriter()
01419 {
01420 }
Generated on Mon Feb 11 13:45:17 2013 for libStatGen Software by  doxygen 1.6.3