SamFile.h

00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #ifndef __SAM_FILE_H__
00019 #define __SAM_FILE_H__
00020 
00021 #include "SamStatus.h"
00022 #include "InputFile.h"
00023 #include "SamFileHeader.h"
00024 #include "SamRecord.h"
00025 #include "GenericSamInterface.h"
00026 #include "BamIndex.h"
00027 #include "SamStatistics.h"
00028 
00029 /// Allows the user to easily read/write a SAM/BAM file.
00030 class SamFile
00031 {
00032 public:
00033     /// Enum for indicating whether to open the file for read or write.
00034     enum OpenType {
00035         READ, ///< open for reading.
00036         WRITE ///< open for writing.
00037     };
00038     
00039     
00040     /// Enum for indicating the type of sort for the file.
00041     enum SortedType {
00042         UNSORTED = 0, ///< file is not sorted.
00043         FLAG,         ///< SO flag from the header indicates the sort type.
00044         COORDINATE,   ///< file is sorted by coordinate.
00045         QUERY_NAME    ///< file is sorted by queryname.
00046     };
00047     
00048     /// Default Constructor.
00049     SamFile();
00050 
00051     /// Constructor that sets the error handling type.
00052     /// \param errorHandlingType how to handle errors.
00053     SamFile(ErrorHandler::HandlingType errorHandlingType);
00054 
00055     /// Constructor that opens the specified file based on the specified mode
00056     /// (READ/WRITE).
00057     /// \param filename name of the file to open.
00058     /// \param mode mode to use for opening the file.
00059     SamFile(const char* filename, OpenType mode);
00060 
00061     /// Constructor that opens the specified file based on the specified mode
00062     /// (READ/WRITE) and handles errors per the specified handleType.
00063     /// \param filename name of the file to open.
00064     /// \param mode mode to use for opening the file.
00065     /// \param errorHandlingType how to handle errors.
00066     SamFile(const char* filename, OpenType mode,
00067             ErrorHandler::HandlingType errorHandlingType);
00068 
00069     /// Constructor that opens the specified file based on the specified mode
00070     /// (READ/WRITE).
00071     /// \param filename name of the file to open.
00072     /// \param mode mode to use for opening the file.
00073     /// \param header to read into or write from
00074     SamFile(const char* filename, OpenType mode, SamFileHeader* header);
00075 
00076     /// Constructor that opens the specified file based on the specified mode
00077     /// (READ/WRITE) and handles errors per the specified handleType.
00078     /// \param filename name of the file to open.
00079     /// \param mode mode to use for opening the file.
00080     /// \param errorHandlingType how to handle errors.
00081     /// \param header to read into or write from
00082     SamFile(const char* filename, OpenType mode,
00083             ErrorHandler::HandlingType errorHandlingType,
00084             SamFileHeader* header);
00085 
00086     virtual ~SamFile(); ///< Virtual destructor: SamFileReader and SamFileWriter derive from this class.
00087    
00088     /// Open a sam/bam file for reading with the specified filename.
00089     /// \param  filename the sam/bam file to open for reading.
00090     /// \param header to read into or write from (optional)
00091     /// \return true = success; false = failure.
00092     bool OpenForRead(const char * filename, SamFileHeader* header = NULL);
00093 
00094     /// Open a sam/bam file for writing with the specified filename.
00095     /// \param  filename the sam/bam file to open for writing.
00096     /// \param header to read into or write from (optional)
00097     /// \return true = success; false = failure.
00098     bool OpenForWrite(const char * filename, SamFileHeader* header = NULL);
00099 
00100     /// Read the specified bam index file.  It must be read prior to setting a
00101     /// read section, for seeking and reading portions of a bam file.
00102     /// \param filename the name of the bam index file to be read.
00103     /// \return true = success; false = failure.
00104     bool ReadBamIndex(const char * filename);
00105 
00106     /// Read the bam index file using the BAM filename as a base. 
00107     /// It must be read prior to setting a read section, for seeking
00108     /// and reading portions of a bam file.
00109     /// Must be read after opening the BAM file since it uses the
00110     /// BAM filename as a base name for the index file.
00111     /// First it tries filename.bam.bai. If that fails, it tries
00112     /// it without the .bam extension, filename.bai.
00113     /// \return true = success; false = failure.
00114     bool ReadBamIndex();
00115 
00116     /// Sets the reference to the specified genome sequence object.
00117     /// \param reference pointer to the GenomeSequence object.
00118     void SetReference(GenomeSequence* reference);
00119 
00120     /// Set the type of sequence translation to use when reading
00121     /// the sequence.  Passed down to the SamRecord when it is read.  
00122     /// The default type (if this method is never called) is
00123     /// NONE (the sequence is left as-is).
00124     /// \param translation type of sequence translation to use.
00125     void SetReadSequenceTranslation(SamRecord::SequenceTranslation translation);
00126 
00127     /// Set the type of sequence translation to use when writing
00128     /// the sequence.  Passed down to the SamRecord when it is written.
00129     /// The default type (if this method is never called) is
00130     /// NONE (the sequence is left as-is).
00131     /// \param translation type of sequence translation to use.
00132     void SetWriteSequenceTranslation(SamRecord::SequenceTranslation translation);
00133 
00134     /// Close the file if there is one open.
00135     void Close();
00136 
00137     /// Returns whether or not the end of the file has been reached.
00138     /// \return true = EOF; false = not eof.
00139     /// If the file is not open, false is returned.
00140     bool IsEOF();
00141    
00142     /// Reads the header section from the file and stores it in
00143     /// the passed in header.
00144     /// \return true = success; false = failure.
00145     bool ReadHeader(SamFileHeader& header);
00146    
00147     /// Writes the specified header into the file.
00148     /// \return true = success; false = failure.
00149     bool WriteHeader(SamFileHeader& header);
00150 
00151     /// Reads the next record from the file & stores it in the passed in record.
00152     /// \return true  = record was successfully set.
00153     ///                false = record was not successfully set.
00154     bool ReadRecord(SamFileHeader& header, SamRecord& record);
00155    
00156     /// Writes the specified record into the file.
00157     /// \return true = success; false = failure.
00158     bool WriteRecord(SamFileHeader& header, SamRecord& record);
00159    
00160     /// Set the flag to validate that the file is sorted as it is read/written.
00161     /// Must be called after the file has been opened.
00162     /// Sorting validation is reset every time SetReadPosition is called since
00163     /// it can jump around in the file.  NOTE(review): no SetReadPosition is declared in this class; presumably SetReadSection is meant - confirm.
00164     void setSortedValidation(SortedType sortType);
00165 
00166     /// Return the number of records that have been read/written so far.
00167     uint32_t GetCurrentRecordCount();
00168 
00169     /// Get the Status of the last call that sets status (forwards to GetStatus()).
00170     /// To remain backwards compatible - will be removed later.
00171     inline SamStatus::Status GetFailure()
00172     {
00173         return(GetStatus());
00174     }
00175 
00176     /// Get the Status of the last call that sets status.
00177     inline SamStatus::Status GetStatus()
00178     {
00179         return(myStatus.getStatus());
00180     }
00181 
00182     /// Get the status message of the last call that sets status.
00183     inline const char* GetStatusMessage()
00184     {
00185         return(myStatus.getStatusMessage());
00186     }
00187 
00188     /// Sets what part of the BAM file should be read.  This version will
00189     /// set it to only read a specific reference id.  The records for that
00190     /// reference id will be retrieved on each ReadRecord call.  When all
00191     /// records have been retrieved for the specified reference id, ReadRecord
00192     /// will return failure until a new read section is set.
00193     /// Must be called only after the file has been opened for reading.
00194     /// Sorting validation is reset every time SetReadPosition is called since
00195     /// it can jump around in the file.
00196     /// \param  refID the reference ID of the records to read from the file.
00197     /// \return true = success; false = failure.
00198     bool SetReadSection(int32_t refID);
00199 
00200     /// Sets what part of the BAM file should be read.  This version will
00201     /// set it to only read a specific reference name.  The records for that
00202     /// reference id will be retrieved on each ReadRecord call.  When all
00203     /// records have been retrieved for the specified reference name,
00204     /// ReadRecord will return failure until a new read section is set.
00205     /// Must be called only after the file has been opened for reading.
00206     /// Sorting validation is reset every time SetReadPosition is called since
00207     /// it can jump around in the file.
00208     /// \param  refName the reference name of the records to read from the file.
00209     /// \return true = success; false = failure.
00210     bool SetReadSection(const char* refName);
00211 
00212     /// Sets what part of the BAM file should be read.  This version will
00213     /// set it to only read a specific reference id and start/end position.
00214     /// The records for this section will be retrieved on each ReadRecord
00215     /// call.  When all records have been retrieved for the specified section,
00216     /// ReadRecord will return failure until a new read section is set.
00217     /// Must be called only after the file has been opened for reading.
00218     /// Sorting validation is reset every time SetReadPosition is called since
00219     /// it can jump around in the file.
00220     /// \param  refID the reference ID of the records to read from the file.
00221     /// \param  start inclusive 0-based start position of records that should be read for this refID.
00222     /// \param  end exclusive 0-based end position of records that should be read for this refID.
00223     /// \return true = success; false = failure.   
00224     bool SetReadSection(int32_t refID, int32_t start, int32_t end);
00225 
00226     /// Sets what part of the BAM file should be read.  This version will
00227     /// set it to only read a specific reference name and start/end position.
00228     /// The records for this section will be retrieved on each ReadRecord
00229     /// call.  When all records have been retrieved for the specified section,
00230     /// ReadRecord will return failure until a new read section is set.
00231     /// Must be called only after the file has been opened for reading.
00232     /// Sorting validation is reset every time SetReadPosition is called since
00233     /// it can jump around in the file.
00234     /// \param  refName the reference name of the records to read from the file.
00235     /// \param  start inclusive 0-based start position of records that should be read for this refName.
00236     /// \param  end exclusive 0-based end position of records that should be read for this refName.
00237     /// \return true = success; false = failure.   
00238     bool SetReadSection(const char* refName, int32_t start, int32_t end);
00239 
00240     /// Get the number of mapped reads in the specified reference id.  
00241     /// Returns -1 for out of range refIDs.
00242     /// \param refID reference ID for which to extract the number of mapped reads.
00243     /// \return number of mapped reads for the specified reference id.
00244     int32_t getNumMappedReadsFromIndex(int32_t refID);
00245 
00246     /// Get the number of unmapped reads in the specified reference id.  
00247     /// Returns -1 for out of range refIDs.
00248     /// \param refID reference ID for which to extract the number of unmapped reads.
00249     /// \return number of unmapped reads for the specified reference id.
00250     int32_t getNumUnMappedReadsFromIndex(int32_t refID);
00251 
00252     /// Get the number of mapped reads in the specified reference name.
00253     /// Returns -1 for unknown reference names.
00254     /// \param refName reference name for which to extract the number of mapped reads.
00255     /// \param header header object containing the map from refName to refID
00256     /// \return number of mapped reads for the specified reference name.
00257     int32_t getNumMappedReadsFromIndex(const char* refName,
00258                                        SamFileHeader& header);
00259 
00260     /// Get the number of unmapped reads in the specified reference name.
00261     /// Returns -1 for unknown reference names.
00262     /// \param refName reference name for which to extract the number of unmapped reads.
00263     /// \param header header object containing the map from refName to refID
00264     /// \return number of unmapped reads for the specified reference name.
00265     int32_t getNumUnMappedReadsFromIndex(const char* refName,
00266                                          SamFileHeader& header);
00267 
00268     /// Returns the number of bases in the passed in read that overlap the
00269     /// region that is currently set.
00270     /// \param samRecord to check for overlapping bases.
00271     /// \return number of bases that overlap region that is currently set.
00272     uint32_t GetNumOverlaps(SamRecord& samRecord);
00273 
00274     /// Whether or not statistics should be generated for this file.
00275     /// The value is carried over between files and is not reset, but
00276     /// the statistics themselves are reset between files.
00277     /// \param genStats set to true if statistics should be generated, false if not.
00278     void GenerateStatistics(bool genStats);
00279 
00280     inline void PrintStatistics() {if(myStatistics != NULL) myStatistics->print();} ///< Prints the statistics; does nothing when myStatistics is NULL.
00281 
00282 protected:
00283     void init(const char* filename, OpenType mode, SamFileHeader* header); ///< NOTE(review): undocumented; presumably the shared constructor helper that opens filename per mode - confirm.
00284 
00285     /// Resets the file prepping for a new file.
00286     void resetFile();
00287 
00288     /// Validate that the record is sorted compared to the previously read
00289     /// record if there is one, according to the specified sort order.
00290     /// If the sort order is UNSORTED, true is returned.
00291     /// Sorting validation is reset every time SetReadPosition is called since
00292     /// it can jump around in the file.
00293     bool validateSortOrder(SamRecord& record, SamFileHeader& header);
00294    
00295     /// Return the sort order as defined by the header.  If it is undefined
00296     /// or set to an unknown value, UNSORTED is returned.
00297     SortedType getSortOrderFromHeader(SamFileHeader& header);
00298 
00299     /// Overwrites read record to read from the specific reference only.
00300     bool readIndexedRecord(SamFileHeader& header, SamRecord& record);
00301 
00302     bool processNewSection(SamFileHeader &header); ///< NOTE(review): undocumented; presumably prepares reading of a newly set read section - confirm.
00303 
00304     IFILE  myFilePtr;
00305     GenericSamInterface* myInterfacePtr;
00306 
00307     /// Flag to indicate if a file is open for reading.
00308     bool myIsOpenForRead;
00309     /// Flag to indicate if a file is open for writing.
00310     bool myIsOpenForWrite;
00311     /// Flag to indicate if a header has been read/written - required before
00312     /// being able to read/write a record.
00313     bool myHasHeader;
00314 
00315     SortedType mySortedType; ///< NOTE(review): presumably the sort type requested via setSortedValidation - confirm.
00316 
00317     /// Previous values used for checking if the file is sorted.
00318     int32_t myPrevCoord;
00319     int32_t myPrevRefID;
00320     std::string myPrevReadName;
00321 
00322     /// Keep a count of the number of records that have been read/written so far.
00323     uint32_t myRecordCount;
00324 
00325     /// Pointer to the statistics for this file.
00326     SamStatistics* myStatistics;
00327    
00328     /// The status of the last SamFile command.
00329     SamStatus myStatus;
00330 
00331     /// Values for reading Sorted BAM files via the index.
00332     bool myIsBamOpenForRead;
00333     bool myNewSection;
00334     int32_t myRefID;
00335     int32_t myStartPos;
00336     int32_t myEndPos;
00337     uint64_t myCurrentChunkEnd;
00338     SortedChunkList myChunksToRead;
00339     BamIndex* myBamIndex;
00340 
00341     GenomeSequence* myRefPtr; ///< NOTE(review): presumably the reference passed to SetReference - confirm.
00342     SamRecord::SequenceTranslation myReadTranslation;
00343     SamRecord::SequenceTranslation myWriteTranslation;
00344     
00345     std::string myRefName;
00346 };
00347 
00348 /// SamFile subclass whose non-default constructors open the specified file for read.
00349 class SamFileReader : public SamFile
00350 {
00351 public:
00352 
00353     /// Default Constructor.
00354     SamFileReader();
00355 
00356     /// Constructor that opens the specified file for read.
00357     SamFileReader(const char* filename);
00358 
00359     /// Constructor that opens the specified file for read, taking the error handling type to use.
00360     SamFileReader(const char* filename,
00361                   ErrorHandler::HandlingType errorHandlingType);
00362 
00363     /// Constructor that opens the specified file for read and reads
00364     /// the header from the file.
00365     SamFileReader(const char* filename,
00366                   SamFileHeader* header);
00367 
00368     /// Constructor that opens the specified file for read and reads
00369     /// the header from the file, taking the error handling type to use.
00370     SamFileReader(const char* filename,
00371                   ErrorHandler::HandlingType errorHandlingType,
00372                   SamFileHeader* header);
00373 
00374     virtual ~SamFileReader(); ///< Destructor.
00375 };
00376 
00377 /// SamFile subclass whose non-default constructors open the specified file for write.
00378 class SamFileWriter : public SamFile
00379 {
00380 public:
00381     /// Default Constructor.
00382     SamFileWriter();
00383 
00384     /// Constructor that opens the specified file for write.
00385     SamFileWriter(const char* filename);
00386 
00387     /// Constructor that opens the specified file for write, taking the error handling type to use.
00388     SamFileWriter(const char* filename,
00389                   ErrorHandler::HandlingType errorHandlingType);
00390 
00391     /// Constructor that opens the specified file for write and writes
00392     /// the specified header into the file.
00393     SamFileWriter(const char* filename,
00394                   SamFileHeader* header);
00395 
00396     /// Constructor that opens the specified file for write and writes
00397     /// the specified header into the file, taking the error handling type to use.
00398     SamFileWriter(const char* filename,
00399                   ErrorHandler::HandlingType errorHandlingType,
00400                   SamFileHeader* header);
00401 
00402     virtual ~SamFileWriter(); ///< Destructor.
00403 };
00404 
00405 #endif
Generated on Tue Mar 22 22:50:13 2011 for StatGen Software by  doxygen 1.6.3