SamFile.h

00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #ifndef __SAM_FILE_H__
00019 #define __SAM_FILE_H__
00020 
00021 #include "SamStatus.h"
00022 #include "InputFile.h"
00023 #include "SamFileHeader.h"
00024 #include "SamRecord.h"
00025 #include "GenericSamInterface.h"
00026 #include "BamIndex.h"
00027 #include "SamStatistics.h"
00028 
00029 /// Allows the user to easily read/write a SAM/BAM file.
00030 class SamFile
00031 {
00032 public:
00033     /// Enum for indicating whether to open the file for read or write.
00034     enum OpenType {
00035         READ, ///< open for reading.
00036         WRITE ///< open for writing.
00037     };
00038     
00039     
00040     /// Enum for indicating the type of sort for the file.
00041     enum SortedType {
00042         UNSORTED = 0, ///< file is not sorted.
00043         FLAG,         ///< SO flag from the header indicates the sort type.
00044         COORDINATE,   ///< file is sorted by coordinate.
00045         QUERY_NAME    ///< file is sorted by queryname.
00046     };
00047     
00048     /// Default Constructor.
00049     SamFile();
00050 
00051     /// Constructor that sets the error handling type.
00052     /// \param errorHandlingType how to handle errors.
00053     SamFile(ErrorHandler::HandlingType errorHandlingType);
00054 
00055     /// Constructor that opens the specified file based on the specified mode
00056     /// (READ/WRITE).
00057     /// \param filename name of the file to open.
00058     /// \param mode mode to use for opening the file.
00059     SamFile(const char* filename, OpenType mode);
00060 
00061     /// Constructor that opens the specified file based on the specified mode
00062     /// (READ/WRITE) and handles errors per the specified handleType.
00063     /// \param filename name of the file to open.
00064     /// \param mode mode to use for opening the file.
00065     /// \param errorHandlingType how to handle errors.
00066     SamFile(const char* filename, OpenType mode,
00067             ErrorHandler::HandlingType errorHandlingType);
00068 
00069     /// Constructor that opens the specified file based on the specified mode
00070     /// (READ/WRITE).
00071     /// \param filename name of the file to open.
00072     /// \param mode mode to use for opening the file.
00073     /// \param header to read into or write from
00074     SamFile(const char* filename, OpenType mode, SamFileHeader* header);
00075 
00076     /// Constructor that opens the specified file based on the specified mode
00077     /// (READ/WRITE) and handles errors per the specified handleType.
00078     /// \param filename name of the file to open.
00079     /// \param mode mode to use for opening the file.
00080     /// \param errorHandlingType how to handle errors.
00081     /// \param header to read into or write from
00082     SamFile(const char* filename, OpenType mode,
00083             ErrorHandler::HandlingType errorHandlingType,
00084             SamFileHeader* header);
00085     /// Destructor.  Virtual since SamFileReader/SamFileWriter derive from SamFile.
00086     virtual ~SamFile();
00087    
00088     /// Open a sam/bam file for reading with the specified filename.
00089     /// \param  filename the sam/bam file to open for reading.
00090     /// \param header to read into or write from (optional)
00091     /// \return true = success; false = failure.
00092     bool OpenForRead(const char * filename, SamFileHeader* header = NULL);
00093 
00094     /// Open a sam/bam file for writing with the specified filename.
00095     /// \param  filename the sam/bam file to open for writing.
00096     /// \param header to read into or write from (optional)
00097     /// \return true = success; false = failure.
00098     bool OpenForWrite(const char * filename, SamFileHeader* header = NULL);
00099 
00100     /// Read the specified bam index file.  It must be read prior to setting a
00101     /// read section, for seeking and reading portions of a bam file.
00102     /// \param filename the name of the bam index file to be read.
00103     /// \return true = success; false = failure.
00104     bool ReadBamIndex(const char * filename);
00105 
00106     /// Read the bam index file using the BAM filename as a base. 
00107     /// It must be read prior to setting a read section, for seeking
00108     /// and reading portions of a bam file.
00109     /// Must be read after opening the BAM file since it uses the
00110     /// BAM filename as a base name for the index file.
00111     /// First it tries filename.bam.bai. If that fails, it tries
00112     /// it without the .bam extension, filename.bai.
00113     /// \return true = success; false = failure.
00114     bool ReadBamIndex();
00115 
00116     /// Sets the reference to the specified genome sequence object.
00117     /// \param reference pointer to the GenomeSequence object.
00118     void SetReference(GenomeSequence* reference);
00119 
00120     /// Set the type of sequence translation to use when reading
00121     /// the sequence.  Passed down to the SamRecord when it is read.  
00122     /// The default type (if this method is never called) is
00123     /// NONE (the sequence is left as-is).
00124     /// \param translation type of sequence translation to use.
00125     void SetReadSequenceTranslation(SamRecord::SequenceTranslation translation);
00126 
00127     /// Set the type of sequence translation to use when writing
00128     /// the sequence.  Passed down to the SamRecord when it is written.
00129     /// The default type (if this method is never called) is
00130     /// NONE (the sequence is left as-is).
00131     /// \param translation type of sequence translation to use.
00132     void SetWriteSequenceTranslation(SamRecord::SequenceTranslation translation);
00133 
00134     /// Close the file if there is one open.
00135     void Close();
00136 
00137     /// Returns whether or not the file has been opened successfully.
00138     /// \return true = open; false = not open.
00139     bool IsOpen();
00140    
00141     /// Returns whether or not the end of the file has been reached.
00142     /// \return true = EOF; false = not eof.
00143     /// If the file is not open, false is returned.
00144     bool IsEOF();
00145    
00146     /// Reads the header section from the file and stores it in
00147     /// the passed in header.
00148     /// \return true = success; false = failure.
00149     bool ReadHeader(SamFileHeader& header);
00150    
00151     /// Writes the specified header into the file.
00152     /// \return true = success; false = failure.
00153     bool WriteHeader(SamFileHeader& header);
00154 
00155     /// Reads the next record from the file & stores it in the passed in record.
00156     /// \return true  = record was successfully set.
00157     ///                false = record was not successfully set.
00158     bool ReadRecord(SamFileHeader& header, SamRecord& record);
00159    
00160     /// Writes the specified record into the file.
00161     /// \return true = success; false = failure.
00162     bool WriteRecord(SamFileHeader& header, SamRecord& record);
00163    
00164     /// Set the flag to validate that the file is sorted as it is read/written.
00165     /// Must be called after the file has been opened.
00166     /// Sorting validation is reset every time SetReadSection is called since
00167     /// it can jump around in the file.
00168     void setSortedValidation(SortedType sortType);
00169 
00170     /// Return the number of records that have been read/written so far.
00171     uint32_t GetCurrentRecordCount();
00172 
00173     /// Get the Status of the last call that sets status.
00174     /// To remain backwards compatible - will be removed later.
00175     inline SamStatus::Status GetFailure()
00176     {
00177         return(GetStatus());
00178     }
00179 
00180     /// Get the Status of the last call that sets status.
00181     inline SamStatus::Status GetStatus()
00182     {
00183         return(myStatus.getStatus());
00184     }
00185 
00186     /// Get the status message of the last call that sets status.
00187     inline const char* GetStatusMessage()
00188     {
00189         return(myStatus.getStatusMessage());
00190     }
00191 
00192     /// Sets what part of the BAM file should be read.  This version will
00193     /// set it to only read a specific reference id.  The records for that
00194     /// reference id will be retrieved on each ReadRecord call.  When all
00195     /// records have been retrieved for the specified reference id, ReadRecord
00196     /// will return failure until a new read section is set.
00197     /// Must be called only after the file has been opened for reading.
00198     /// Sorting validation is reset every time SetReadSection is called since
00199     /// it can jump around in the file.
00200     /// \param  refID the reference ID of the records to read from the file.
00201     /// \return true = success; false = failure.
00202     bool SetReadSection(int32_t refID);
00203 
00204     /// Sets what part of the BAM file should be read.  This version will
00205     /// set it to only read a specific reference name.  The records for that
00206     /// reference name will be retrieved on each ReadRecord call.  When all
00207     /// records have been retrieved for the specified reference name,
00208     /// ReadRecord will return failure until a new read section is set.
00209     /// Must be called only after the file has been opened for reading.
00210     /// Sorting validation is reset every time SetReadSection is called since
00211     /// it can jump around in the file.
00212     /// \param  refName the reference name of the records to read from the file.
00213     /// \return true = success; false = failure.
00214     bool SetReadSection(const char* refName);
00215 
00216     /// Sets what part of the BAM file should be read.  This version will
00217     /// set it to only read a specific reference id and start/end position.
00218     /// The records for this section will be retrieved on each ReadRecord
00219     /// call.  When all records have been retrieved for the specified section,
00220     /// ReadRecord will return failure until a new read section is set.
00221     /// Must be called only after the file has been opened for reading.
00222     /// Sorting validation is reset every time SetReadSection is called since
00223     /// it can jump around in the file.
00224     /// \param  refID the reference ID of the records to read from the file.
00225     /// \param  start inclusive 0-based start position of records that should be read for this refID.
00226     /// \param  end exclusive 0-based end position of records that should be read for this refID.
00227     /// \param overlap When true (default), return reads that just overlap the region.  When false, only return reads that fall completely within the region.
00228     /// \return true = success; false = failure.   
00229     bool SetReadSection(int32_t refID, int32_t start, int32_t end, 
00230                         bool overlap = true);
00231 
00232     /// Sets what part of the BAM file should be read.  This version will
00233     /// set it to only read a specific reference name and start/end position.
00234     /// The records for this section will be retrieved on each ReadRecord
00235     /// call.  When all records have been retrieved for the specified section,
00236     /// ReadRecord will return failure until a new read section is set.
00237     /// Must be called only after the file has been opened for reading.
00238     /// Sorting validation is reset every time SetReadSection is called since
00239     /// it can jump around in the file.
00240     /// \param  refName the reference name of the records to read from the file.
00241     /// \param  start inclusive 0-based start position of records that should be read for this refName.
00242     /// \param  end exclusive 0-based end position of records that should be read for this refName.
00243     /// \param overlap When true (default), return reads that just overlap the region.  When false, only return reads that fall completely within the region.
00244     /// \return true = success; false = failure.   
00245     bool SetReadSection(const char* refName, int32_t start, int32_t end, 
00246                         bool overlap = true);
00247 
00248     /// Get the number of mapped reads in the specified reference id.  
00249     /// Returns -1 for out of range refIDs.
00250     /// \param refID reference ID for which to extract the number of mapped reads.
00251     /// \return number of mapped reads for the specified reference id.
00252     int32_t getNumMappedReadsFromIndex(int32_t refID);
00253 
00254     /// Get the number of unmapped reads in the specified reference id.  
00255     /// Returns -1 for out of range refIDs.
00256     /// \param refID reference ID for which to extract the number of unmapped reads.
00257     /// \return number of unmapped reads for the specified reference id.
00258     int32_t getNumUnMappedReadsFromIndex(int32_t refID);
00259 
00260     /// Get the number of mapped reads in the specified reference name.
00261     /// Returns -1 for unknown reference names.
00262     /// \param refName reference name for which to extract the number of mapped reads.
00263     /// \param header header object containing the map from refName to refID
00264     /// \return number of mapped reads for the specified reference name.
00265     int32_t getNumMappedReadsFromIndex(const char* refName,
00266                                        SamFileHeader& header);
00267 
00268     /// Get the number of unmapped reads in the specified reference name.
00269     /// Returns -1 for unknown reference names.
00270     /// \param refName reference name for which to extract the number of unmapped reads.
00271     /// \param header header object containing the map from refName to refID
00272     /// \return number of unmapped reads for the specified reference name.
00273     int32_t getNumUnMappedReadsFromIndex(const char* refName,
00274                                          SamFileHeader& header);
00275 
00276     /// Returns the number of bases in the passed in read that overlap the
00277     /// region that is currently set.
00278     /// \param samRecord to check for overlapping bases.
00279     /// \return number of bases that overlap region that is currently set.
00280     uint32_t GetNumOverlaps(SamRecord& samRecord);
00281 
00282     /// Whether or not statistics should be generated for this file.
00283     /// The value is carried over between files and is not reset, but
00284     /// the statistics themselves are reset between files.
00285     /// \param genStats set to true if statistics should be generated, false if not.
00286     void GenerateStatistics(bool genStats);
00287 
00288     /// Return the bam index if one has been opened.
00289     /// \return const pointer to the bam index, or null if one has not been opened.
00290     const BamIndex* GetBamIndex();
00291 
00292     /// Get the current file position.
00293     /// \return current position in the file.
00294     inline long int GetCurrentPosition()
00295     {
00296         return(iftell(myFilePtr));
00297     }
00298     /// Calls disableBuffering() on the file pointer; does nothing if the file pointer is NULL.
00299     inline void DisableBuffering()
00300     {
00301         if(myFilePtr != NULL)
00302         {
00303             myFilePtr->disableBuffering();
00304         }
00305     }
00306 
00307     /// Prints the statistics via myStatistics->print(); does nothing if myStatistics is NULL.
00308     inline void PrintStatistics() {if(myStatistics != NULL) myStatistics->print();}
00309 
00310 protected:
00311     void init(const char* filename, OpenType mode, SamFileHeader* header); // NOTE(review): undocumented; presumably common constructor setup - confirm.
00312 
00313     /// Resets the file prepping for a new file.
00314     void resetFile();
00315 
00316     /// Validate that the record is sorted compared to the previously read
00317     /// record if there is one, according to the specified sort order.
00318     /// If the sort order is UNSORTED, true is returned.
00319     /// Sorting validation is reset every time SetReadSection is called since
00320     /// it can jump around in the file.
00321     bool validateSortOrder(SamRecord& record, SamFileHeader& header);
00322    
00323     /// Return the sort order as defined by the header.  If it is undefined
00324     /// or set to an unknown value, UNSORTED is returned.
00325     SortedType getSortOrderFromHeader(SamFileHeader& header);
00326 
00327     /// Overwrites read record to read from the specific reference only.
00328     bool readIndexedRecord(SamFileHeader& header, SamRecord& record);
00329     /// NOTE(review): undocumented; presumably handles a newly set read section (see myNewSection) - confirm in the implementation.
00330     bool processNewSection(SamFileHeader &header);
00331 
00332     IFILE  myFilePtr;
00333     GenericSamInterface* myInterfacePtr;
00334 
00335     /// Flag to indicate if a file is open for reading.
00336     bool myIsOpenForRead;
00337     /// Flag to indicate if a file is open for writing.
00338     bool myIsOpenForWrite;
00339     /// Flag to indicate if a header has been read/written - required before
00340     /// being able to read/write a record.
00341     bool myHasHeader;
00342 
00343     SortedType mySortedType;
00344 
00345     /// Previous values used for checking if the file is sorted.
00346     int32_t myPrevCoord;
00347     int32_t myPrevRefID;
00348     std::string myPrevReadName;
00349 
00350     /// Keep a count of the number of records that have been read/written so far.
00351     uint32_t myRecordCount;
00352 
00353     /// Pointer to the statistics for this file.
00354     SamStatistics* myStatistics;
00355    
00356     /// The status of the last SamFile command.
00357     SamStatus myStatus;
00358 
00359     /// Values for reading Sorted BAM files via the index.
00360     bool myIsBamOpenForRead;
00361     bool myNewSection;
00362     /// whether to return reads that overlap (true) the section or
00363     /// are fully enclosed (false) in the section.
00364     bool myOverlapSection;
00365     int32_t myRefID;
00366     int32_t myStartPos;
00367     int32_t myEndPos;
00368     uint64_t myCurrentChunkEnd;
00369     SortedChunkList myChunksToRead;
00370     BamIndex* myBamIndex;
00371 
00372     GenomeSequence* myRefPtr;
00373     SamRecord::SequenceTranslation myReadTranslation;
00374     SamRecord::SequenceTranslation myWriteTranslation;
00375     
00376     std::string myRefName;
00377 
00378 private:
00379     bool    myAttemptRecovery; ///< Recovery flag; assigned by setAttemptRecovery().
00380 
00381 public:
00382     /// NOTE(review): undocumented; presumably tries to resynchronize reading using checkSignature - confirm in the implementation.
00383     bool attemptRecoverySync(bool (*checkSignature)(void *data) , int length);
00384     /// Sets the attempt-recovery flag (myAttemptRecovery) to flag; flag defaults to false when omitted.
00385     void setAttemptRecovery(bool flag = false)
00386     {
00387         myAttemptRecovery = flag;
00388     }
00389 
00390 };
00391 
00392 
00393 class SamFileReader : public SamFile // SamFile variant for reading: its filename constructors are documented to open the file for read.
00394 {
00395 public:
00396 
00397     /// Default Constructor.
00398     SamFileReader();
00399 
00400     /// Constructor that opens the specified file for read.
00401     SamFileReader(const char* filename);
00402 
00403     /// Constructor that opens the specified file for read.
00404     SamFileReader(const char* filename,
00405                   ErrorHandler::HandlingType errorHandlingType);
00406 
00407     /// Constructor that opens the specified file for read and reads
00408     /// the header from the file.
00409     SamFileReader(const char* filename,
00410                   SamFileHeader* header);
00411 
00412     /// Constructor that opens the specified file for read and reads
00413     /// the header from the file.
00414     SamFileReader(const char* filename,
00415                   ErrorHandler::HandlingType errorHandlingType,
00416                   SamFileHeader* header);
00417     /// Destructor.
00418     virtual ~SamFileReader();
00419 };
00420 
00421 
00422 class SamFileWriter : public SamFile // SamFile variant for writing: its filename constructors are documented to open the file for write.
00423 {
00424 public:
00425     /// Default Constructor.
00426     SamFileWriter();
00427 
00428     /// Constructor that opens the specified file for write.
00429     SamFileWriter(const char* filename);
00430 
00431     /// Constructor that opens the specified file for write.
00432     SamFileWriter(const char* filename,
00433                   ErrorHandler::HandlingType errorHandlingType);
00434 
00435     /// Constructor that opens the specified file for write and writes
00436     /// the specified header into the file.
00437     SamFileWriter(const char* filename,
00438                   SamFileHeader* header);
00439 
00440     /// Constructor that opens the specified file for write and writes
00441     /// the specified header into the file.
00442     SamFileWriter(const char* filename,
00443                   ErrorHandler::HandlingType errorHandlingType,
00444                   SamFileHeader* header);
00445     /// Destructor.
00446     virtual ~SamFileWriter();
00447 };
00448 
00449 #endif
Generated on Tue Aug 23 18:19:04 2011 for libStatGen Software by  doxygen 1.6.3