SamFile.h

00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #ifndef __SAM_FILE_H__
00019 #define __SAM_FILE_H__
00020 
00021 #include "SamStatus.h"
00022 #include "InputFile.h"
00023 #include "SamFileHeader.h"
00024 #include "SamRecord.h"
00025 #include "GenericSamInterface.h"
00026 #include "BamIndex.h"
00027 #include "SamStatistics.h"
00028 
00029 /// Allows the user to easily read/write a SAM/BAM file.
      /// A file is opened either for reading or for writing (see OpenType), and the
      /// header must be read/written before any record can be read/written.
00030 class SamFile
00031 {
00032 public:
00033     /// Enum for indicating whether to open the file for read or write.
00034     enum OpenType {
00035         READ, ///< open for reading.
00036         WRITE ///< open for writing.
00037     };
00038     
00039     
00040     /// Enum for indicating the type of sort for the file.
00041     enum SortedType {
00042         UNSORTED = 0, ///< file is not sorted.
00043         FLAG,         ///< SO flag from the header indicates the sort type.
00044         COORDINATE,   ///< file is sorted by coordinate.
00045         QUERY_NAME    ///< file is sorted by queryname.
00046     };
00047     
00048     /// Default Constructor.
00049     SamFile();
00050 
00051     /// Constructor that sets the error handling type.
00052     /// \param errorHandlingType how to handle errors.
00053     SamFile(ErrorHandler::HandlingType errorHandlingType);
00054 
00055     /// Constructor that opens the specified file based on the specified mode
00056     /// (READ/WRITE).
00057     /// \param filename name of the file to open.
00058     /// \param mode mode to use for opening the file.
00059     SamFile(const char* filename, OpenType mode);
00060 
00061     /// Constructor that opens the specified file based on the specified mode
00062     /// (READ/WRITE) and handles errors per the specified errorHandlingType.
00063     /// \param filename name of the file to open.
00064     /// \param mode mode to use for opening the file.
00065     /// \param errorHandlingType how to handle errors.
00066     SamFile(const char* filename, OpenType mode,
00067             ErrorHandler::HandlingType errorHandlingType);
00068 
      /// Destructor (virtual, since SamFileReader/SamFileWriter derive from
      /// this class).
00069     virtual ~SamFile();
00070    
00071     /// Open a sam/bam file for reading with the specified filename.
00072     /// \param filename the sam/bam file to open for reading.
00073     /// \return true = success; false = failure.
00074     bool OpenForRead(const char * filename);
00075 
00076     /// Open a sam/bam file for writing with the specified filename.
      /// \param filename the sam/bam file to open for writing.
00077     /// \return true = success; false = failure.
00078     bool OpenForWrite(const char * filename);
00079 
00080     /// Reads the specified bam index file.  It must be read prior to setting a
00081     /// read section, for seeking and reading portions of a bam file.
      /// \param filename the bam index file to read.
00082     /// \return true = success; false = failure.
00083     bool ReadBamIndex(const char * filename);
00084 
00085     /// Sets the reference to the specified genome sequence object.
00086     /// \param reference pointer to the GenomeSequence object.
00087     void SetReference(GenomeSequence* reference);
00088 
00089     /// Set the type of sequence translation to use when reading
00090     /// the sequence.  Passed down to the SamRecord when it is read.
00091     /// The default type (if this method is never called) is
00092     /// NONE (the sequence is left as-is).
00093     /// \param translation type of sequence translation to use.
00094     void SetReadSequenceTranslation(SamRecord::SequenceTranslation translation);
00095 
00096     /// Set the type of sequence translation to use when writing
00097     /// the sequence.  Passed down to the SamRecord when it is written.
00098     /// The default type (if this method is never called) is
00099     /// NONE (the sequence is left as-is).
00100     /// \param translation type of sequence translation to use.
00101     void SetWriteSequenceTranslation(SamRecord::SequenceTranslation translation);
00102 
00103     /// Close the file if there is one open.
00104     void Close();
00105 
00106     /// Returns whether or not the end of the file has been reached.
00107     /// If the file is not open, false is returned.
00108     /// \return true = EOF; false = not eof.
00109     bool IsEOF();
00110    
00111     /// Reads the header section from the file and stores it in
00112     /// the passed in header.
      /// \param header object that receives the header read from the file.
00113     /// \return true = success; false = failure.
00114     bool ReadHeader(SamFileHeader& header);
00115    
00116     /// Writes the specified header into the file.
      /// \param header the header to write.
00117     /// \return true = success; false = failure.
00118     bool WriteHeader(SamFileHeader& header);
00119 
00120     /// Reads the next record from the file & stores it in the passed in record.
      /// \param header the header associated with this file.
      /// \param record object that receives the record read from the file.
00121     /// \return true = record was successfully set;
00122     ///         false = record was not successfully set.
00123     bool ReadRecord(SamFileHeader& header, SamRecord& record);
00124    
00125     /// Writes the specified record into the file.
      /// \param header the header associated with this file.
      /// \param record the record to write.
00126     /// \return true = success; false = failure.
00127     bool WriteRecord(SamFileHeader& header, SamRecord& record);
00128    
00129     /// Set the flag to validate that the file is sorted as it is read/written.
00130     /// Must be called after the file has been opened.
      /// \param sortType the sort order to validate against (see SortedType).
00131     void setSortedValidation(SortedType sortType);
00132 
00133     /// Return the number of records that have been read/written so far.
      /// \return number of records read/written so far.
00134     uint32_t GetCurrentRecordCount();
00135 
00136     /// Get the Status of the last call that sets status.
00137     /// Kept to remain backwards compatible - will be removed later.
      /// Simply forwards to GetStatus(); new code should call GetStatus().
00138     inline SamStatus::Status GetFailure()
00139     {
00140         return(GetStatus());
00141     }
00142 
00143     /// Get the Status of the last call that sets status.
      /// \return the status code held by myStatus.
00144     inline SamStatus::Status GetStatus()
00145     {
00146         return(myStatus.getStatus());
00147     }
00148 
00149     /// Get the Status Message of the last call that sets status.
      /// \return the status message held by myStatus.
00150     inline const char* GetStatusMessage()
00151     {
00152         return(myStatus.getStatusMessage());
00153     }
00154 
00155     /// Sets what part of the BAM file should be read.  This version will
00156     /// set it to only read a specific reference id.  The records for that
00157     /// reference id will be retrieved on each ReadRecord call.  When all
00158     /// records have been retrieved for the specified reference id, ReadRecord
00159     /// will return failure until a new read section is set.
00160     /// Must be called only after the file has been opened for reading.
00161     /// \param  refID the reference ID of the records to read from the file.
00162     /// \return true = success; false = failure.
00163     bool SetReadSection(int32_t refID);
00164 
00165     /// Sets what part of the BAM file should be read.  This version will
00166     /// set it to only read a specific reference name.  The records for that
00167     /// reference name will be retrieved on each ReadRecord call.  When all
00168     /// records have been retrieved for the specified reference name,
00169     /// ReadRecord will return failure until a new read section is set.
00170     /// Must be called only after the file has been opened for reading.
00171     /// \param  refName the reference name of the records to read from the file.
00172     /// \return true = success; false = failure.
00173     bool SetReadSection(const char* refName);
00174 
00175     /// Sets what part of the BAM file should be read.  This version will
00176     /// set it to only read a specific reference id and start/end position.
00177     /// The records for this section will be retrieved on each ReadRecord
00178     /// call.  When all records have been retrieved for the specified section,
00179     /// ReadRecord will return failure until a new read section is set.
00180     /// Must be called only after the file has been opened for reading.
00181     /// \param  refID the reference ID of the records to read from the file.
00182     /// \param  start inclusive 0-based start position of records that should be read for this refID.
00183     /// \param  end exclusive 0-based end position of records that should be read for this refID.
00184     /// \return true = success; false = failure.
00185     bool SetReadSection(int32_t refID, int32_t start, int32_t end);
00186 
00187     /// Sets what part of the BAM file should be read.  This version will
00188     /// set it to only read a specific reference name and start/end position.
00189     /// The records for this section will be retrieved on each ReadRecord
00190     /// call.  When all records have been retrieved for the specified section,
00191     /// ReadRecord will return failure until a new read section is set.
00192     /// Must be called only after the file has been opened for reading.
00193     /// \param  refName the reference name of the records to read from the file.
00194     /// \param  start inclusive 0-based start position of records that should be read for this reference name.
00195     /// \param  end exclusive 0-based end position of records that should be read for this reference name.
00196     /// \return true = success; false = failure.
00197     bool SetReadSection(const char* refName, int32_t start, int32_t end);
00198 
00199     /// Returns the number of bases in the passed in read that overlap the
00200     /// region that is currently set.
00201     /// \param samRecord record to check for overlapping bases.
00202     /// \return number of bases that overlap region that is currently set.
00203     uint32_t GetNumOverlaps(SamRecord& samRecord);
00204 
00205     /// Whether or not statistics should be generated for this file.
00206     /// The value is carried over between files and is not reset, but
00207     /// the statistics themselves are reset between files.
00208     /// \param genStats set to true if statistics should be generated, false if not.
00209     void GenerateStatistics(bool genStats);
00210 
      /// Prints the statistics via myStatistics->print().  Does nothing when
      /// myStatistics is NULL.
00211     inline void PrintStatistics() {if(myStatistics != NULL) myStatistics->print();}
00212 
00213 protected:
00214     /// Resets the file prepping for a new file.
00215     void resetFile();
00216 
00217     /// Validate that the record is sorted compared to the previously read
00218     /// record if there is one, according to the specified sort order.
00219     /// If the sort order is UNSORTED, true is returned.
      /// \param record the record to check.
      /// \param header the header associated with this file.
00220     bool validateSortOrder(SamRecord& record, SamFileHeader& header);
00221    
00222     /// Return the sort order as defined by the header.  If it is undefined
00223     /// or set to an unknown value, UNSORTED is returned.
00224     SortedType getSortOrderFromHeader(SamFileHeader& header);
00225 
00226     /// Overwrites read record to read from the specific reference only.
00227     bool readIndexedRecord(SamFileHeader& header, SamRecord& record);
00228 
      /// NOTE(review): undocumented in this header; presumably prepares reading
      /// of a newly set read section (see myNewSection) - confirm against the
      /// implementation file.
00229     bool processNewSection(SamFileHeader &header);
00230 
      /// NOTE(review): presumably the handle of the currently open file and the
      /// SAM/BAM format-specific interface used to read/write it - confirm
      /// against the implementation file.
00231     IFILE  myFilePtr;
00232     GenericSamInterface* myInterfacePtr;
00233 
00234     /// Flag to indicate if a file is open for reading.
00235     bool myIsOpenForRead;
00236     /// Flag to indicate if a file is open for writing.
00237     bool myIsOpenForWrite;
00238     /// Flag to indicate if a header has been read/written - required before
00239     /// being able to read/write a record.
00240     bool myHasHeader;
00241 
      /// Sort type used for validation; presumably set by setSortedValidation()
      /// - confirm against the implementation file.
00242     SortedType mySortedType;
00243 
00244     /// Previous values used for checking if the file is sorted.
00245     int32_t myPrevCoord;
00246     int32_t myPrevRefID;
00247     std::string myPrevReadName;
00248 
00249     /// Keep a count of the number of records that have been read/written so far.
00250     uint32_t myRecordCount;
00251 
00252     /// Pointer to the statistics for this file.  May be NULL (PrintStatistics
      /// checks for NULL before using it).
00253     SamStatistics* myStatistics;
00254    
00255     /// The status of the last SamFile command.
00256     SamStatus myStatus;
00257 
00258     /// Values for reading Sorted BAM files via the index.
00259     bool myIsBamOpenForRead;
00260     bool myNewSection;
00261     int32_t myRefID;
00262     int32_t myStartPos;
00263     int32_t myEndPos;
00264     uint64_t myCurrentChunkEnd;
00265     SortedChunkList myChunksToRead;
00266     BamIndex* myBamIndex;
00267 
      /// Reference sequence and sequence translation settings; presumably set
      /// by SetReference(), SetReadSequenceTranslation() and
      /// SetWriteSequenceTranslation() - confirm against the implementation file.
00268     GenomeSequence* myRefPtr;
00269     SamRecord::SequenceTranslation myReadTranslation;
00270     SamRecord::SequenceTranslation myWriteTranslation;
00271     
      /// NOTE(review): presumably the reference name given to
      /// SetReadSection(const char*) - confirm against the implementation file.
00272     std::string myRefName;
00273 };
00274 
00275 
      /// SamFile subclass for reading: its filename constructor opens the
      /// specified file for read.
00276 class SamFileReader : public SamFile
00277 {
00278 public:
00279 
00280     /// Default Constructor.
00281     SamFileReader();
00282 
00283     /// Constructor that opens the specified file for read.
      /// \param filename name of the file to open for reading.
00284     SamFileReader(const char* filename);
00285 
      /// Destructor.
00286     virtual ~SamFileReader();
00287 };
00288 
00289 
      /// SamFile subclass for writing: its filename constructor opens the
      /// specified file for write.
00290 class SamFileWriter : public SamFile
00291 {
00292 public:
00293     /// Default Constructor.
00294     SamFileWriter();
00295 
00296     /// Constructor that opens the specified file for write.
      /// \param filename name of the file to open for writing.
00297     SamFileWriter(const char* filename);
00298 
      /// Destructor.
00299     virtual ~SamFileWriter();
00300 };
00301 
00302 #endif
Generated on Thu Dec 9 12:22:12 2010 for StatGen Software by  doxygen 1.6.3