SamFile.cpp

00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 #include <stdexcept>
00018 #include "SamFile.h"
00019 #include "SamFileHeader.h"
00020 #include "SamRecord.h"
00021 #include "BamInterface.h"
00022 #include "SamInterface.h"
00023 
00024 // Constructor, init variables.
00025 SamFile::SamFile()
00026     : myFilePtr(NULL),
00027       myInterfacePtr(NULL),
00028       myStatistics(NULL),
00029       myStatus(),
00030       myBamIndex(NULL),
00031       myRefPtr(NULL),
00032       myReadTranslation(SamRecord::NONE),
00033       myWriteTranslation(SamRecord::NONE)
00034 {
00035     resetFile();
00036 }
00037 
00038 
00039 // Constructor, init variables.
00040 SamFile::SamFile(ErrorHandler::HandlingType errorHandlingType)
00041     : myFilePtr(NULL),
00042       myInterfacePtr(NULL),
00043       myStatistics(NULL),
00044       myStatus(errorHandlingType),
00045       myBamIndex(NULL),
00046       myRefPtr(NULL),
00047       myReadTranslation(SamRecord::NONE),
00048       myWriteTranslation(SamRecord::NONE)
00049 {
00050     resetFile();
00051 }
00052 
00053 
00054 // Constructor, init variables and open the specified file based on the
00055 // specified mode (READ/WRITE).
00056 SamFile::SamFile(const char* filename, OpenType mode)
00057     : myStatus()
00058 {
00059     init(filename, mode, NULL);
00060 }
00061 
00062 
00063 // Constructor, init variables and open the specified file based on the
00064 // specified mode (READ/WRITE).  Default is READ..
00065 SamFile::SamFile(const char* filename, OpenType mode,
00066                  ErrorHandler::HandlingType errorHandlingType)
00067     : myStatus(errorHandlingType)
00068 {
00069     init(filename, mode, NULL);
00070 }
00071 
00072 
00073 // Constructor, init variables and open the specified file based on the
00074 // specified mode (READ/WRITE).
00075 SamFile::SamFile(const char* filename, OpenType mode, SamFileHeader* header)
00076     : myStatus()
00077 {
00078     init(filename, mode, header);
00079 }
00080 
00081 
00082 // Constructor, init variables and open the specified file based on the
00083 // specified mode (READ/WRITE).  Default is READ..
00084 SamFile::SamFile(const char* filename, OpenType mode,
00085                  ErrorHandler::HandlingType errorHandlingType, 
00086                  SamFileHeader* header)
00087     : myStatus(errorHandlingType)
00088 {
00089     init(filename, mode, header);
00090 }
00091 
00092 
00093 SamFile::~SamFile()
00094 {
00095     resetFile();
00096     if(myStatistics != NULL)
00097     {
00098         delete myStatistics;
00099     }
00100 }
00101 
00102 
00103 // Open a sam/bam file for reading with the specified filename.
00104 bool SamFile::OpenForRead(const char * filename, SamFileHeader* header)
00105 {
00106     // Reset for any previously operated on files.
00107     resetFile();
00108 
00109     int lastchar = 0;
00110 
00111     while (filename[lastchar] != 0) lastchar++;
00112 
00113     // If at least one character, check for '-'.
00114     if((lastchar >= 1) && (filename[0] == '-'))
00115     {
00116         // Read from stdin - determine type of file to read.
00117         // Determine if compressed bam.
00118         if(strcmp(filename, "-.bam") == 0)
00119         {
00120             // Compressed bam - open as bgzf.
00121             // -.bam is the filename, read compressed bam from stdin
00122             filename = "-";
00123             myFilePtr = ifopen(filename, "rb", InputFile::BGZF);
00124             myFilePtr->disableBuffering();
00125             myInterfacePtr = new BamInterface;
00126 
00127             // Read the magic string.
00128             char magic[4];
00129             ifread(myFilePtr, magic, 4);
00130         }
00131         else if(strcmp(filename, "-.ubam") == 0)
00132         {
00133             // uncompressed BAM File.
00134             // -.ubam is the filename, read uncompressed bam from stdin
00135             filename = "-";
00136             myFilePtr = ifopen(filename, "rb", InputFile::UNCOMPRESSED);
00137         
00138             myInterfacePtr = new BamInterface;
00139 
00140             // Read the magic string.
00141             char magic[4];
00142             ifread(myFilePtr, magic, 4);
00143         }
00144         else
00145         {
00146             // SAM File.
00147             // read sam from stdin
00148             filename = "-";
00149             myFilePtr = ifopen(filename, "rb", InputFile::UNCOMPRESSED);
00150             myInterfacePtr = new SamInterface;
00151         }
00152     }
00153     else
00154     {
00155         // Not from stdin.  Read the file to determine the type.
00156         myFilePtr = ifopen(filename, "rb");
00157         
00158         if (myFilePtr == NULL)
00159         {
00160             std::string errorMessage = "Failed to Open ";
00161             errorMessage += filename;
00162             errorMessage += " for reading";
00163             myStatus.setStatus(SamStatus::FAIL_IO, errorMessage.c_str());
00164             return(false);
00165         }
00166         
00167         char magic[4];
00168         ifread(myFilePtr, magic, 4);
00169         
00170         if (magic[0] == 'B' && magic[1] == 'A' && magic[2] == 'M' &&
00171             magic[3] == 1)
00172         {
00173             myInterfacePtr = new BamInterface;
00174             // Set that it is a bam file open for reading.  This is needed to
00175             // determine if an index file can be used.
00176             myIsBamOpenForRead = true;
00177         }
00178         else
00179         {
00180             // Not a bam, so rewind to the beginning of the file so it
00181             // can be read.
00182             ifrewind(myFilePtr);
00183             myInterfacePtr = new SamInterface;
00184         }
00185     }
00186 
00187     // File is open for reading.
00188     myIsOpenForRead = true;
00189 
00190     // Read the header if one was passed in.
00191     if(header != NULL)
00192     {
00193         return(ReadHeader(*header));
00194     }
00195 
00196     // Successfully opened the file.
00197     myStatus = SamStatus::SUCCESS;
00198     return(true);
00199 }
00200 
00201 
00202 // Open a sam/bam file for writing with the specified filename.
00203 bool SamFile::OpenForWrite(const char * filename, SamFileHeader* header)
00204 {
00205     // Reset for any previously operated on files.
00206     resetFile();
00207     
00208     int lastchar = 0;
00209     while (filename[lastchar] != 0) lastchar++;   
00210     if (lastchar >= 4 && 
00211         filename[lastchar - 4] == 'u' &&
00212         filename[lastchar - 3] == 'b' &&
00213         filename[lastchar - 2] == 'a' &&
00214         filename[lastchar - 1] == 'm')
00215     {
00216         // BAM File.
00217         // if -.ubam is the filename, write uncompressed bam to stdout
00218         if((lastchar == 6) && (filename[0] == '-') && (filename[1] == '.'))
00219         {
00220             filename = "-";
00221         }
00222         myFilePtr = ifopen(filename, "wb", InputFile::UNCOMPRESSED);
00223 
00224         myInterfacePtr = new BamInterface;
00225     }
00226     else if (lastchar >= 3 && 
00227              filename[lastchar - 3] == 'b' &&
00228              filename[lastchar - 2] == 'a' &&
00229              filename[lastchar - 1] == 'm')
00230     {
00231         // BAM File.
00232         // if -.bam is the filename, write compressed bam to stdout
00233         if((lastchar == 5) && (filename[0] == '-') && (filename[1] == '.'))
00234         {
00235             filename = "-";
00236         }
00237         myFilePtr = ifopen(filename, "wb", InputFile::BGZF);
00238         
00239         myInterfacePtr = new BamInterface;
00240     }
00241     else
00242     {
00243         // SAM File
00244         // if - (followed by anything is the filename,
00245         // write uncompressed sam to stdout
00246         if((lastchar >= 1) && (filename[0] == '-'))
00247         {
00248             filename = "-";
00249         }
00250         myFilePtr = ifopen(filename, "wb", InputFile::UNCOMPRESSED);
00251    
00252         myInterfacePtr = new SamInterface;
00253     }
00254 
00255     if (myFilePtr == NULL)
00256     {
00257         std::string errorMessage = "Failed to Open ";
00258         errorMessage += filename;
00259         errorMessage += " for writing";
00260         myStatus.setStatus(SamStatus::FAIL_IO, errorMessage.c_str());
00261         return(false);
00262     }
00263    
00264     myIsOpenForWrite = true;
00265 
00266     // Write the header if one was passed in.
00267     if(header != NULL)
00268     {
00269         return(WriteHeader(*header));
00270     }
00271 
00272     // Successfully opened the file.
00273     myStatus = SamStatus::SUCCESS;
00274     return(true);
00275 }
00276 
00277 
00278 // Read BAM Index file.
00279 bool SamFile::ReadBamIndex(const char* bamIndexFilename)
00280 {
00281     // Cleanup a previously setup index.
00282     if(myBamIndex != NULL)
00283     {
00284         delete myBamIndex;
00285         myBamIndex = NULL;
00286     }
00287 
00288     // Create a new bam index.
00289     myBamIndex = new BamIndex();
00290     SamStatus::Status indexStat = myBamIndex->readIndex(bamIndexFilename);
00291 
00292     if(indexStat != SamStatus::SUCCESS)
00293     {
00294         std::string errorMessage = "Failed to read the bam Index file: ";
00295         errorMessage += bamIndexFilename;
00296         myStatus.setStatus(indexStat, errorMessage.c_str());
00297         delete myBamIndex;
00298         myBamIndex = NULL;
00299         return(false);
00300     }
00301     myStatus = SamStatus::SUCCESS;
00302     return(true);
00303 }
00304 
00305 
00306 // Read BAM Index file.
00307 bool SamFile::ReadBamIndex()
00308 {
00309     if(myFilePtr == NULL)
00310     {
00311         // Can't read the bam index file because the BAM file has not yet been
00312         // opened, so we don't know the base filename for the index file.
00313         std::string errorMessage = "Failed to read the bam Index file -"
00314             " the BAM file needs to be read first in order to determine"
00315             " the index filename.";
00316         myStatus.setStatus(SamStatus::FAIL_ORDER, errorMessage.c_str());
00317         return(false);
00318     }
00319 
00320     const char* bamBaseName = myFilePtr->getFileName();
00321     
00322     std::string indexName = bamBaseName;
00323     indexName += ".bai";
00324 
00325     bool foundFile = true;
00326     try
00327     {
00328         if(ReadBamIndex(indexName.c_str()) == false)
00329         {
00330             foundFile = false;
00331         }
00332     }
00333     catch (std::exception& e)
00334     {
00335         foundFile = false;
00336     }
00337 
00338     // Check to see if the index file was found.
00339     if(!foundFile)
00340     {
00341         // Not found - try without the bam extension.
00342         // Locate the start of the bam extension
00343         size_t startExt = indexName.find(".bam");
00344         if(startExt == std::string::npos)
00345         {
00346             // Could not find the .bam extension, so just return false since the
00347             // call to ReadBamIndex set the status.
00348             return(false);
00349         }
00350         // Remove ".bam" and try reading the index again.
00351         indexName.erase(startExt,  4);
00352         return(ReadBamIndex(indexName.c_str()));
00353     }
00354     return(true);
00355 }
00356 
00357 
00358 // Sets the reference to the specified genome sequence object.
00359 void SamFile::SetReference(GenomeSequence* reference)
00360 {
00361     myRefPtr = reference;
00362 }
00363 
00364 
00365 // Set the type of sequence translation to use when reading the sequence.
00366 void SamFile::SetReadSequenceTranslation(SamRecord::SequenceTranslation translation)
00367 {
00368     myReadTranslation = translation;
00369 }
00370 
00371 
00372 // Set the type of sequence translation to use when writing the sequence.
00373 void SamFile::SetWriteSequenceTranslation(SamRecord::SequenceTranslation translation)
00374 {
00375     myWriteTranslation = translation;
00376 }
00377 
00378 // Close the file if there is one open.
00379 void SamFile::Close()
00380 {
00381     // Resetting the file will close it if it is open, and
00382     // will reset all other variables.
00383     resetFile();
00384 }
00385 
00386 
00387 // Returns whether or not the end of the file has been reached.
00388 // return: int - true = EOF; false = not eof.
00389 bool SamFile::IsEOF()
00390 {
00391     if (myFilePtr != NULL)
00392     {
00393         // File Pointer is set, so return if eof.
00394         return(ifeof(myFilePtr));
00395     }
00396     // File pointer is not set, so return true, eof.
00397     return true;
00398 }
00399 
00400 
00401 // Read the header from the currently opened file.
00402 bool SamFile::ReadHeader(SamFileHeader& header)
00403 {
00404     if(myIsOpenForRead == false)
00405     {
00406         // File is not open for read
00407         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00408                            "Cannot read header since the file is not open for reading");
00409         return(false);
00410     }
00411 
00412     if(myHasHeader == true)
00413     {
00414         // The header has already been read.
00415         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00416                            "Cannot read header since it has already been read.");
00417         return(false);
00418     }
00419 
00420     myStatus = myInterfacePtr->readHeader(myFilePtr, header);
00421     if(myStatus == SamStatus::SUCCESS)
00422     {
00423         // The header has now been successfully read.
00424         myHasHeader = true;
00425         return(true);
00426     }
00427     return(false);
00428 }
00429 
00430 
00431 // Write the header to the currently opened file.
00432 bool SamFile::WriteHeader(SamFileHeader& header)
00433 {
00434     if(myIsOpenForWrite == false)
00435     {
00436         // File is not open for write
00437         // -OR-
00438         // The header has already been written.
00439         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00440                            "Cannot write header since the file is not open for writing");
00441         return(false);
00442     }
00443 
00444     if(myHasHeader == true)
00445     {
00446         // The header has already been written.
00447         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00448                            "Cannot write header since it has already been written");
00449         return(false);
00450     }
00451 
00452     myStatus = myInterfacePtr->writeHeader(myFilePtr, header);
00453     if(myStatus == SamStatus::SUCCESS)
00454     {
00455         // The header has now been successfully written.
00456         myHasHeader = true;
00457         return(true);
00458     }
00459 
00460     // return the status.
00461     return(false);
00462 }
00463 
00464 
00465 // Read a record from the currently opened file.
00466 bool SamFile::ReadRecord(SamFileHeader& header, 
00467                          SamRecord& record)
00468 {
00469     myStatus = SamStatus::SUCCESS;
00470 
00471     if(myIsOpenForRead == false)
00472     {
00473         // File is not open for read
00474         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00475                            "Cannot read record since the file is not open for reading");
00476         throw(std::runtime_error("SOFTWARE BUG: trying to read a SAM/BAM record prior to opening the file."));
00477         return(false);
00478     }
00479 
00480     if(myHasHeader == false)
00481     {
00482         // The header has not yet been read.
00483         // TODO - maybe just read the header.
00484         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00485                            "Cannot read record since the header has not been read.");
00486         throw(std::runtime_error("SOFTWARE BUG: trying to read a SAM/BAM record prior to reading the header."));
00487         return(false);
00488     }
00489 
00490     // Check to see if a new region has been set.  If so, determine the
00491     // chunks for that region.
00492     if(myNewSection)
00493     {
00494         if(!processNewSection(header))
00495         {
00496             // Failed processing a new section.  Could be an 
00497             // order issue like the file not being open or the
00498             // indexed file not having been read.
00499             // processNewSection sets myStatus with the failure reason.
00500             return(false);
00501         }
00502     }
00503 
00504     // Check to see if the file should be read by index.
00505     if(myRefID != BamIndex::REF_ID_ALL)
00506     {
00507         // Reference ID is set, so read by index.
00508         return(readIndexedRecord(header, record));
00509     }
00510 
00511     record.setReference(myRefPtr);
00512     record.setSequenceTranslation(myReadTranslation);
00513 
00514     // File is open for reading and the header has been read, so read the next
00515     // record.
00516     myInterfacePtr->readRecord(myFilePtr, header, record, myStatus);
00517     if(myStatus == SamStatus::SUCCESS)
00518     {
00519         // A record was successfully read, so increment the record count.
00520         myRecordCount++;
00521 
00522         if(myStatistics != NULL)
00523         {
00524             // Statistics should be updated.
00525             myStatistics->updateStatistics(record);
00526         }
00527 
00528         // Successfully read the record, so check the sort order.
00529         if(!validateSortOrder(record, header))
00530         {
00531             // ValidateSortOrder sets the status on a failure.
00532             return(false);
00533         }
00534         return(true);
00535     }
00536     // Failed to read the record.
00537     return(false);
00538 }
00539 
00540 
00541 
00542 // Write a record to the currently opened file.
00543 bool SamFile::WriteRecord(SamFileHeader& header, 
00544                           SamRecord& record)
00545 {
00546     if(myIsOpenForWrite == false)
00547     {
00548         // File is not open for writing
00549         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00550                            "Cannot write record since the file is not open for writing");
00551         return(false);
00552     }
00553 
00554     if(myHasHeader == false)
00555     {
00556         // The header has not yet been written.
00557         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00558                            "Cannot write record since the header has not been written");
00559         return(false);
00560     }
00561 
00562     // Before trying to write the record, validate the sort order.
00563     if(!validateSortOrder(record, header))
00564     {
00565         // Not sorted like it is supposed to be, do not write the record
00566         myStatus.setStatus(SamStatus::INVALID_SORT, 
00567                            "Cannot write the record since the file is not properly sorted.");
00568         return(false);
00569     }
00570 
00571     if(myRefPtr != NULL)
00572     {
00573         record.setReference(myRefPtr);
00574     }
00575 
00576     // File is open for writing and the header has been written, so write the
00577     // record.
00578     myStatus = myInterfacePtr->writeRecord(myFilePtr, header, record,
00579                                            myWriteTranslation);
00580 
00581     if(myStatus == SamStatus::SUCCESS)
00582     {
00583         // A record was successfully written, so increment the record count.
00584         myRecordCount++;
00585         return(true);
00586     }
00587     return(false);
00588 }
00589 
00590 
00591 // Set the flag to validate that the file is sorted as it is read/written.
00592 // Must be called after the file has been opened.
00593 void SamFile::setSortedValidation(SortedType sortType)
00594 {
00595     mySortedType = sortType;
00596 }
00597 
00598 
00599 // Return the number of records that have been read/written so far.
00600 uint32_t SamFile::GetCurrentRecordCount()
00601 {
00602     return(myRecordCount);
00603 }
00604 
00605 
00606 // Sets what part of the SamFile should be read.
00607 bool SamFile::SetReadSection(int32_t refID)
00608 {
00609     // No start/end specified, so set back to default -1.
00610     return(SetReadSection(refID, -1, -1));
00611 }
00612 
00613 
00614 
00615 // Sets what part of the SamFile should be read.
00616 bool SamFile::SetReadSection(const char* refName)
00617 {
00618     // No start/end specified, so set back to default -1.
00619     return(SetReadSection(refName, -1, -1));
00620 }
00621 
00622 
00623 // Sets what part of the BAM file should be read.
00624 bool SamFile::SetReadSection(int32_t refID, int32_t start, int32_t end)
00625 {
00626     // If there is not a BAM file open for reading, return failure.
00627     // Opening a new file clears the read section, so it must be
00628     // set after the file is opened.
00629     if(!myIsBamOpenForRead)
00630     {
00631         // There is not a BAM file open for reading.
00632         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00633                            "Canot set section since there is no bam file open");
00634         return(false);
00635     }
00636 
00637     // Indexed Bam open for read, so disable read buffering because iftell will be used.
00638     myFilePtr->disableBuffering();
00639 
00640     myNewSection = true;
00641     myStartPos = start;
00642     myEndPos = end;
00643     myRefID = refID;
00644     myRefName.clear();
00645     myChunksToRead.clear();
00646     // Reset the end of the current chunk.  We are resetting our read, so
00647     // we no longer have a "current chunk" that we are reading.
00648     myCurrentChunkEnd = 0;
00649     myStatus = SamStatus::SUCCESS;
00650 
00651     // Reset the sort order criteria since we moved around in the file.    
00652     myPrevCoord = -1;
00653     myPrevRefID = 0;
00654     myPrevReadName.clear();
00655 
00656     return(true);
00657 }
00658 
00659 
00660 // Sets what part of the BAM file should be read.
00661 bool SamFile::SetReadSection(const char* refName, int32_t start, int32_t end)
00662 {
00663     // If there is not a BAM file open for reading, return failure.
00664     // Opening a new file clears the read section, so it must be
00665     // set after the file is opened.
00666     if(!myIsBamOpenForRead)
00667     {
00668         // There is not a BAM file open for reading.
00669         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00670                            "Canot set section since there is no bam file open");
00671         return(false);
00672     }
00673 
00674     // Indexed Bam open for read, so disable read buffering because iftell will be used.
00675     myFilePtr->disableBuffering();
00676 
00677     myNewSection = true;
00678     myStartPos = start;
00679     myEndPos = end;
00680     if((strcmp(refName, "") == 0) || (strcmp(refName, "*") == 0))
00681     {
00682         // No Reference name specified, so read just the "-1" entries.
00683         myRefID = BamIndex::REF_ID_UNMAPPED;
00684     }
00685     else
00686     {
00687         // save the reference name and revert the reference ID to unknown
00688         // so it will be calculated later.
00689         myRefName = refName;
00690         myRefID = BamIndex::REF_ID_ALL;
00691     }
00692     myChunksToRead.clear();
00693     // Reset the end of the current chunk.  We are resetting our read, so
00694     // we no longer have a "current chunk" that we are reading.
00695     myCurrentChunkEnd = 0;
00696     myStatus = SamStatus::SUCCESS;
00697     
00698     // Reset the sort order criteria since we moved around in the file.    
00699     myPrevCoord = -1;
00700     myPrevRefID = 0;
00701     myPrevReadName.clear();
00702 
00703     return(true);
00704 }
00705 
00706 
00707 
00708 // Get the number of mapped reads in the specified reference id.  
00709 // Returns -1 for out of range refIDs.
00710 int32_t SamFile::getNumMappedReadsFromIndex(int32_t refID)
00711 {
00712     // The bam index must have already been read.
00713     if(myBamIndex == NULL)
00714     {
00715         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00716                            "Canot get num mapped reads from the index until it has been read.");
00717         return(false);
00718     }
00719     return(myBamIndex->getNumMappedReads(refID));
00720 }
00721 
00722 
00723 // Get the number of unmapped reads in the specified reference id.  
00724 // Returns -1 for out of range refIDs.
00725 int32_t SamFile::getNumUnMappedReadsFromIndex(int32_t refID)
00726 {
00727     // The bam index must have already been read.
00728     if(myBamIndex == NULL)
00729     {
00730         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00731                            "Canot get num unmapped reads from the index until it has been read.");
00732         return(false);
00733     }
00734     return(myBamIndex->getNumUnMappedReads(refID));
00735 }
00736 
00737 
00738 // Get the number of mapped reads in the specified reference id.  
00739 // Returns -1 for out of range refIDs.
00740 int32_t SamFile::getNumMappedReadsFromIndex(const char* refName,
00741                                             SamFileHeader& header)
00742 {
00743     // The bam index must have already been read.
00744     if(myBamIndex == NULL)
00745     {
00746         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00747                            "Canot get num mapped reads from the index until it has been read.");
00748         return(false);
00749     }
00750     int32_t refID = BamIndex::REF_ID_UNMAPPED;
00751     if((strcmp(refName, "") != 0) && (strcmp(refName, "*") != 0))
00752     {
00753         // Reference name specified, so read just the "-1" entries.
00754         refID =  header.getReferenceID(refName);
00755     }
00756     return(myBamIndex->getNumMappedReads(refID));
00757 }
00758 
00759 
00760 // Get the number of unmapped reads in the specified reference id.  
00761 // Returns -1 for out of range refIDs.
00762 int32_t SamFile::getNumUnMappedReadsFromIndex(const char* refName,
00763                                               SamFileHeader& header)
00764 {
00765     // The bam index must have already been read.
00766     if(myBamIndex == NULL)
00767     {
00768         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00769                            "Canot get num unmapped reads from the index until it has been read.");
00770         return(false);
00771     }
00772     int32_t refID = BamIndex::REF_ID_UNMAPPED;
00773     if((strcmp(refName, "") != 0) && (strcmp(refName, "*") != 0))
00774     {
00775         // Reference name specified, so read just the "-1" entries.
00776         refID =  header.getReferenceID(refName);
00777     }
00778     return(myBamIndex->getNumUnMappedReads(refID));
00779 }
00780 
00781 
00782 // Returns the number of bases in the passed in read that overlap the
00783 // region that is currently set.
00784 uint32_t SamFile::GetNumOverlaps(SamRecord& samRecord)
00785 {
00786     if(myRefPtr != NULL)
00787     {
00788         samRecord.setReference(myRefPtr);
00789     }
00790     samRecord.setSequenceTranslation(myReadTranslation);
00791 
00792     // Get the overlaps in the sam record for the region currently set
00793     // for this file.
00794     return(samRecord.getNumOverlaps(myStartPos, myEndPos));
00795 }
00796 
00797 
00798 void SamFile::GenerateStatistics(bool genStats)
00799 {
00800     if(genStats)
00801     {
00802         if(myStatistics == NULL)
00803         {
00804             // Want to generate statistics, but do not yet have the
00805             // structure for them, so create one.
00806             myStatistics = new SamStatistics();
00807         }
00808     }
00809     else
00810     {
00811         // Do not generate statistics, so if myStatistics is not NULL, 
00812         // delete it.
00813         if(myStatistics != NULL)
00814         {
00815             delete myStatistics;
00816             myStatistics = NULL;
00817         }
00818     }
00819 
00820 }
00821 
00822 
00823 // initialize.
00824 void SamFile::init(const char* filename, OpenType mode, SamFileHeader* header)
00825 {
00826     myFilePtr = NULL;
00827     myInterfacePtr = NULL;
00828     myStatistics = NULL;
00829     myBamIndex = NULL;
00830     myRefPtr = NULL;
00831     myReadTranslation = SamRecord::NONE;
00832     myWriteTranslation = SamRecord::NONE;
00833         
00834     resetFile();
00835 
00836     bool openStatus = true;
00837     if(mode == READ)
00838     {
00839         // open the file for read.
00840         openStatus = OpenForRead(filename, header);
00841     }
00842     else
00843     {
00844         // open the file for write.
00845         openStatus = OpenForWrite(filename, header);
00846     }
00847     if(!openStatus)
00848     {
00849         // Failed to open the file - print error and abort.
00850         fprintf(stderr, "%s\n", GetStatusMessage());
00851         std::cerr << "FAILURE - EXITING!!!" << std::endl;
00852         exit(-1);
00853     }
00854 }
00855 
00856 
00857 // Reset variables for each file.
00858 void SamFile::resetFile()
00859 {
00860     // Close the file.
00861     if (myFilePtr != NULL)
00862     {
00863         // If we already have an open file, close it.
00864         ifclose(myFilePtr);
00865         myFilePtr = NULL;
00866     }
00867     if(myInterfacePtr != NULL)
00868     {
00869         delete myInterfacePtr;
00870         myInterfacePtr = NULL;
00871     }
00872 
00873     myIsOpenForRead = false;
00874     myIsOpenForWrite = false;
00875     myHasHeader = false;
00876     mySortedType = UNSORTED;
00877     myPrevReadName.clear();
00878     myPrevCoord = -1;
00879     myPrevRefID = 0;
00880     myRecordCount = 0;
00881     myStatus = SamStatus::SUCCESS;
00882 
00883     // Reset indexed bam values.
00884     myIsBamOpenForRead = false;
00885     myRefID = BamIndex::REF_ID_ALL;
00886     myStartPos = -1;
00887     myEndPos = -1;
00888     myNewSection = false;
00889     myCurrentChunkEnd = 0;
00890     myChunksToRead.clear();
00891     if(myBamIndex != NULL)
00892     {
00893         delete myBamIndex;
00894         myBamIndex = NULL;
00895     }
00896 
00897     // If statistics are being generated, reset them.
00898     if(myStatistics != NULL)
00899     {
00900         myStatistics->reset();
00901     }
00902 
00903     myRefName.clear();
00904 }
00905 
00906 
00907 // Validate that the record is sorted compared to the previously read record
00908 // if there is one, according to the specified sort order.
00909 // If the sort order is UNSORTED, true is returned.
00910 bool SamFile::validateSortOrder(SamRecord& record, SamFileHeader& header)
00911 {
00912     if(myRefPtr != NULL)
00913     {
00914         record.setReference(myRefPtr);
00915     }
00916     record.setSequenceTranslation(myReadTranslation);
00917 
00918     bool status = false;
00919     if(mySortedType == UNSORTED)
00920     {
00921         // Unsorted, so nothing to validate, just return true.
00922         status = true;
00923     }
00924     else 
00925     {
00926         // Check to see if mySortedType is based on the header.
00927         if(mySortedType == FLAG)
00928         {
00929             // Determine the sorted type from what was read out of the header.
00930             mySortedType = getSortOrderFromHeader(header);
00931         }
00932 
00933         if(mySortedType == QUERY_NAME)
00934         {
00935             // Validate that it is sorted by query name.
00936             // Get the query name from the record.
00937             const char* readName = record.getReadName();
00938             if(myPrevReadName.compare(readName) > 0)
00939             {
00940                 // The previous name is greater than the new record's name, so
00941                 // return false.
00942                 String errorMessage = "ERROR: File is not sorted at record ";
00943                 errorMessage += myRecordCount;
00944                 myStatus.setStatus(SamStatus::INVALID_SORT, 
00945                                    errorMessage.c_str());
00946                 status = false;
00947             }
00948             else
00949             {
00950                 myPrevReadName = readName;
00951                 status = true;
00952             }
00953         }
00954         else 
00955         {
00956             // Validate that it is sorted by COORDINATES.
00957             // Get the leftmost coordinate and the reference index.
00958             int32_t refID = record.getReferenceID();
00959             int32_t coord = record.get0BasedPosition();
00960             // The unmapped reference id is at the end of a sorted file.
00961             if(refID == BamIndex::REF_ID_UNMAPPED)
00962             {
00963                 // A new reference ID that is for the unmapped reads
00964                 // is always valid.
00965                 status = true;
00966                 myPrevRefID = refID;
00967                 myPrevCoord = coord;
00968             }
00969             else if(myPrevRefID == BamIndex::REF_ID_UNMAPPED)
00970             {
00971                 // Previous reference ID was for unmapped reads, but the
00972                 // current one is not, so this is not sorted.
00973                 String errorMessage = "ERROR: File is not sorted at record ";
00974                 errorMessage += myRecordCount;
00975                 myStatus.setStatus(SamStatus::INVALID_SORT, 
00976                                    errorMessage.c_str());
00977                 status = false;
00978             }
00979             else if(refID < myPrevRefID)
00980             {
00981                 // Current reference id is less than the previous one, 
00982                 //meaning that it is not sorted.
00983                 String errorMessage = "ERROR: File is not sorted at record ";
00984                 errorMessage += myRecordCount;
00985                 myStatus.setStatus(SamStatus::INVALID_SORT, 
00986                                    errorMessage.c_str());
00987                 status = false;
00988             }
00989             else
00990             {
00991                 // The reference IDs are in the correct order.
00992                 if(refID > myPrevRefID)
00993                 {
00994                     // New reference id, so set the previous coordinate to -1
00995                     myPrevCoord = -1;
00996                 }
00997             
00998                 // Check the coordinates.
00999                 if(coord < myPrevCoord)
01000                 {
01001                     // New Coord is less than the previous position.
01002                     String errorMessage = "ERROR: File is not sorted at record ";
01003                     errorMessage += myRecordCount;
01004                     myStatus.setStatus(SamStatus::INVALID_SORT, 
01005                                        errorMessage.c_str());
01006                     status = false;
01007                 }
01008                 else
01009                 {
01010                     myPrevRefID = refID;
01011                     myPrevCoord = coord;
01012                     status = true;
01013                 }
01014             }
01015         }
01016     }
01017 
01018     return(status);
01019 }
01020 
01021 
01022 SamFile::SortedType SamFile::getSortOrderFromHeader(SamFileHeader& header)
01023 {
01024     const char* tag = header.getSortOrder();
01025    
01026     // Default to unsorted since if it is not specified in the header
01027     // that is the value that should be used.
01028     SortedType headerSortOrder = UNSORTED;
01029     if(strcmp(tag, "queryname") == 0)
01030     {
01031         headerSortOrder = QUERY_NAME;
01032     }
01033     else if(strcmp(tag, "coordinate") == 0)
01034     {
01035         headerSortOrder = COORDINATE;
01036     }
01037     return(headerSortOrder);
01038 }
01039 
01040 
01041 // Read a record from the currently opened file based on the indexing info.
01042 bool SamFile::readIndexedRecord(SamFileHeader& header, 
01043                                 SamRecord& record)
01044 {
01045     record.setReference(myRefPtr);
01046     record.setSequenceTranslation(myReadTranslation);
01047 
01048     // A section to read is set and the file is open for reading, as verified
01049     // prior to calling into this protected method.
01050     bool recordFound = false;
01051     while(!recordFound)
01052     {
01053         // Check to see if we have more to read out of the current chunk.
01054         // By checking the current position in relation to the current
01055         // end chunk.  If the current position is >= the end of the
01056         // current chunk, then we must see to the next chunk.
01057         uint64_t currentPos = iftell(myFilePtr);
01058         if(currentPos >= myCurrentChunkEnd)
01059         {
01060             // If there are no more chunks to process, return failure.
01061             if(myChunksToRead.empty())
01062             {
01063                 myStatus = SamStatus::NO_MORE_RECS;
01064                 return(false);
01065             }
01066       
01067             // There are more chunks left, so get the next chunk.
01068             Chunk newChunk = myChunksToRead.pop();
01069 
01070             // Move to the location of the new chunk if it is not adjacent
01071             // to the current chunk.
01072             if(newChunk.chunk_beg != currentPos)
01073             {
01074                 // New chunk is not adjacent, so move to it.
01075                 if(ifseek(myFilePtr, newChunk.chunk_beg, SEEK_SET) != true)
01076                 {
01077                     // seek failed, cannot retrieve next record, return failure.
01078                     myStatus.setStatus(SamStatus::FAIL_IO, 
01079                                        "Failed to seek to the next record");
01080                     return(false);
01081                 }
01082             }
01083             // Seek succeeded, set the end of the new chunk.
01084             myCurrentChunkEnd = newChunk.chunk_end;
01085         }
01086 
01087         // At the position of the next record.  Read the record.
01088         myInterfacePtr->readRecord(myFilePtr, header, record, myStatus);
01089         if(myStatus != SamStatus::SUCCESS)
01090         {
01091             // Reading the record failed, return failure.
01092             return(false);
01093         }
01094 
01095         // Successfully read the record.  Check to see if it is in the correct 
01096         // reference/position.
01097         if(record.getReferenceID() != myRefID)
01098         {
01099             // Incorrect reference ID, return failure.
01100             myStatus = SamStatus::NO_MORE_RECS;
01101             return(false);
01102         }
01103    
01104         // Found a record.
01105         recordFound = true;
01106 
01107         // If start/end position are set, verify that the alignment falls
01108         // completely within those.
01109         if((myStartPos != -1) && (myEndPos != -1))
01110         {
01111             // Start & end are specified, so check if the alignment 
01112             // overlaps this section.
01113 
01114             // If the alignment start is greater than the end of the region,
01115             // return NO_MORE_RECS.
01116             if(record.get0BasedPosition() >= myEndPos)
01117             {
01118                 myStatus = SamStatus::NO_MORE_RECS;
01119                 return(false);
01120             }
01121 
01122             // We know the start is less than the end position, so the alignment
01123             // overlaps the region if the alignment end position is greater than the
01124             // start of the region.
01125             if(record.get0BasedAlignmentEnd() < myStartPos)
01126             {
01127                 // If it does not overlap the region, so go to the next
01128                 // record...set recordFound back to false.
01129                 recordFound = false;
01130             }
01131         }
01132     }
01133 
01134     // Successfully read the record, so check the sort order.
01135     if(!validateSortOrder(record, header))
01136     {
01137         myStatus.setStatus(SamStatus::INVALID_SORT, 
01138                            "The file is not properly sorted.");
01139         return(false);
01140     }
01141    
01142     // If it made it here, then the record was successfully read
01143     // and met the criteria, so return success.
01144     myRecordCount++;
01145 
01146     if(myStatistics != NULL)
01147     {
01148         // Successfully read, statistics should be updated.
01149         myStatistics->updateStatistics(record);
01150     }
01151     
01152     return(true);
01153 }
01154 
01155 
01156 bool SamFile::processNewSection(SamFileHeader &header)
01157 {
01158     myNewSection = false;
01159 
01160     // If there is no index file, return failure.
01161     if(myBamIndex == NULL)
01162     {
01163         // No bam index has been read.
01164         myStatus.setStatus(SamStatus::FAIL_ORDER, 
01165                            "Canot read section since there is no index file open");
01166         throw(std::runtime_error("SOFTWARE BUG: trying to read a BAM record by section prior to opening the BAM Index file."));
01167         return(false);
01168     }
01169 
01170     // If there is not a BAM file open for reading, return failure.
01171     if(!myIsBamOpenForRead)
01172     {
01173         // There is not a BAM file open for reading.
01174         myStatus.setStatus(SamStatus::FAIL_ORDER, 
01175                            "Canot read section since there is no bam file open");
01176         throw(std::runtime_error("SOFTWARE BUG: trying to read a BAM record by section without opening a BAM file."));
01177         return(false);
01178     }
01179 
01180     if(myHasHeader == false)
01181     {
01182         // The header has not yet been read.
01183         myStatus.setStatus(SamStatus::FAIL_ORDER, 
01184                            "Cannot read record since the header has not been read.");
01185         throw(std::runtime_error("SOFTWARE BUG: trying to read a BAM record by section prior to opening the header."));
01186         return(false);
01187     }
01188 
01189     myChunksToRead.clear();
01190     // Reset the end of the current chunk.  We are resetting our read, so
01191     // we no longer have a "current chunk" that we are reading.
01192     myCurrentChunkEnd = 0;
01193 
01194     // Check to see if the read section was set based on the reference name
01195     // but not yet converted to reference id.
01196     if(!myRefName.empty())
01197     {
01198         myRefID = header.getReferenceID(myRefName.c_str());
01199         // Clear the myRefName length so this code is only executed once.
01200         myRefName.clear();
01201     }
01202 
01203     // Get the chunks associated with this reference region.
01204     if(myBamIndex->getChunksForRegion(myRefID, myStartPos, myEndPos, 
01205                                       myChunksToRead) == true)
01206     {
01207         myStatus = SamStatus::SUCCESS;
01208     }
01209     else
01210     {
01211         myStatus.setStatus(SamStatus::FAIL_PARSE, 
01212                            "Failed to get the specified region.");
01213     }
01214     return(true);
01215 }
01216 
01217 
01218 // Default Constructor.
01219 SamFileReader::SamFileReader()
01220     : SamFile()
01221 {
01222 }
01223 
01224 
01225 // Constructor that opens the specified file for read.
01226 SamFileReader::SamFileReader(const char* filename)
01227     : SamFile(filename, READ)
01228 {
01229 }
01230 
01231 
01232 
01233 
01234 // Constructor that opens the specified file for read.
01235 SamFileReader::SamFileReader(const char* filename,
01236                              ErrorHandler::HandlingType errorHandlingType)
01237     : SamFile(filename, READ, errorHandlingType)
01238 {
01239 }
01240 
01241 
01242 // Constructor that opens the specified file for read.
01243 SamFileReader::SamFileReader(const char* filename,
01244                              SamFileHeader* header)
01245     : SamFile(filename, READ, header)
01246 {
01247 }
01248 
01249 
01250 // Constructor that opens the specified file for read.
01251 SamFileReader::SamFileReader(const char* filename,
01252                              ErrorHandler::HandlingType errorHandlingType,
01253                              SamFileHeader* header)
01254     : SamFile(filename, READ, errorHandlingType, header)
01255 {
01256 }
01257 
01258 
01259 SamFileReader::~SamFileReader()
01260 {
01261 }
01262 
01263 
01264 // Default Constructor.
01265 SamFileWriter::SamFileWriter()
01266     : SamFile()
01267 {
01268 }
01269 
01270 
01271 // Constructor that opens the specified file for write.
01272 SamFileWriter::SamFileWriter(const char* filename)
01273     : SamFile(filename, WRITE)
01274 {
01275 }
01276 
01277 
01278 // Constructor that opens the specified file for write.
01279 SamFileWriter::SamFileWriter(const char* filename,
01280                              ErrorHandler::HandlingType errorHandlingType)
01281     : SamFile(filename, WRITE, errorHandlingType)
01282 {
01283 }
01284 
01285 
01286 // Constructor that opens the specified file for write.
01287 SamFileWriter::SamFileWriter(const char* filename,
01288                              SamFileHeader* header)
01289     : SamFile(filename, WRITE, header)
01290 {
01291 }
01292 
01293 
01294 // Constructor that opens the specified file for write.
01295 SamFileWriter::SamFileWriter(const char* filename,
01296                              ErrorHandler::HandlingType errorHandlingType,
01297                              SamFileHeader* header)
01298     : SamFile(filename, WRITE, errorHandlingType, header)
01299 {
01300 }
01301 
01302 
01303 SamFileWriter::~SamFileWriter()
01304 {
01305 }
Generated on Tue Mar 22 22:50:13 2011 for StatGen Software by  doxygen 1.6.3