SamFile.h

00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #ifndef __SAM_FILE_H__
00019 #define __SAM_FILE_H__
00020 
00021 #include "SamStatus.h"
00022 #include "InputFile.h"
00023 #include "SamFileHeader.h"
00024 #include "SamRecord.h"
00025 #include "GenericSamInterface.h"
00026 #include "BamIndex.h"
00027 #include "SamStatistics.h"
00028 
00029 class SamFile
00030 {
      // Handle for a single SAM/BAM file that is opened either for reading or
      // for writing.  This header declares the operations for opening/closing
      // the file, reading/writing its header and records, optionally validating
      // the sort order, restricting BAM reads to a section with the help of a
      // BAM index, and generating statistics.  Only the small inline accessors
      // are defined here; every other member is implemented elsewhere.
00031 public:
          /// Mode in which the filename-taking constructors open the file.
00032     enum OpenType {READ, WRITE};
00033 
00034     /// Enum for indicating the type of sort for the file.
00035     enum SortedType {
00036         UNSORTED = 0, ///< file is not sorted.
00037         FLAG,         ///< SO flag from the header indicates the sort type.
00038         COORDINATE,   ///< file is sorted by coordinate.
00039         QUERY_NAME    ///< file is sorted by queryname.
00040     };
00041     
00042     /// Default Constructor.
00043     SamFile();
00044 
00045     /// Constructor that sets the error handling type.
00046     /// \param errorHandlingType how to handle errors.
          /// \note This single-argument constructor is not declared explicit, so
          /// an ErrorHandler::HandlingType value converts implicitly to a SamFile.
00047     SamFile(ErrorHandler::HandlingType errorHandlingType);
00048 
00049     /// Constructor that opens the specified file based on the specified mode
00050     /// (READ/WRITE).
00051     /// \param filename name of the file to open.
00052     /// \param mode mode to use for opening the file.
00053     SamFile(const char* filename, OpenType mode);
00054 
00055     /// Constructor that opens the specified file based on the specified mode
00056     /// (READ/WRITE) and handles errors per the specified handleType.
00057     /// \param filename name of the file to open.
00058     /// \param mode mode to use for opening the file.
00059     /// \param errorHandlingType how to handle errors.
00060     SamFile(const char* filename, OpenType mode,
00061             ErrorHandler::HandlingType errorHandlingType);
00062 
          /// Destructor.  Declared virtual, so objects of derived classes (such
          /// as SamFileReader/SamFileWriter) can be destroyed through a SamFile
          /// pointer.
00063     virtual ~SamFile();
00064    
00065     /// Open a sam/bam file for reading with the specified filename.
00066     /// \param  filename: the sam/bam file to open for reading.
00067     /// \return true = success; false = failure.   
00068     bool OpenForRead(const char * filename);
00069 
00070     /// Open a sam/bam file for writing with the specified filename.
          /// \param  filename: the sam/bam file to open for writing.
00071     /// \return true = success; false = failure.
00072     bool OpenForWrite(const char * filename);
00073 
00074     /// Reads the specified bam index file.  It must be read prior to setting a
00075     /// read section, for seeking and reading portions of a bam file.
          /// \param  filename: the bam index file to read.
00076     /// \return true = success; false = failure.   
00077     bool ReadBamIndex(const char * filename);
00078 
00079     /// Close the file if there is one open.
00080     void Close();
00081 
00082     /// Returns whether or not the end of the file has been reached.
00083     /// \return true = EOF; false = not eof.
00084     /// If the file is not open, false is returned.
00085     bool IsEOF();
00086    
00087     /// Reads the header section from the file and stores it in
00088     /// the passed in header.
          /// \param header object that receives the header read from the file.
00089     /// \return true = success; false = failure.
00090     bool ReadHeader(SamFileHeader& header);
00091    
00092     /// Writes the specified header into the file.
          /// \param header the header to write.
00093     /// \return true = success; false = failure.
00094     bool WriteHeader(SamFileHeader& header);
00095 
00096     /// Reads the next record from the file & stores it in the passed in record.
          /// A header must have been read before records can be read (see the
          /// myHasHeader flag).
          /// \param header header for this file (presumably the one filled in by
          ///               ReadHeader - confirm against the implementation).
          /// \param record object that receives the record that was read.
00097     /// \return true  = record was successfully set.
00098     ///                false = record was not successfully set.
00099     bool ReadRecord(SamFileHeader& header, SamRecord& record);
00100    
00101     /// Writes the specified record into the file.
          /// A header must have been written before records can be written (see
          /// the myHasHeader flag).
          /// \param header header for this file (presumably the one passed to
          ///               WriteHeader - confirm against the implementation).
          /// \param record the record to write.
00102     /// \return true = success; false = failure.
00103     bool WriteRecord(SamFileHeader& header, SamRecord& record);
00104    
00105     /// Set the flag to validate that the file is sorted as it is read/written.
00106     /// Must be called after the file has been opened.
          /// \param sortType the sort order that the records are validated against.
00107     void setSortedValidation(SortedType sortType);
00108 
00109     /// Return the number of records that have been read/written so far.
00110     uint32_t GetCurrentRecordCount();
00111 
00112     /// Get the Status of the last call that sets status.
00113     /// To remain backwards compatible - will be removed later.
          /// Simply forwards to GetStatus().
00114     inline SamStatus::Status GetFailure()
00115     {
00116         return(GetStatus());
00117     }
00118 
00119     /// Get the Status of the last call that sets status.
00120     inline SamStatus::Status GetStatus()
00121     {
00122         return(myStatus.getStatus());
00123     }
00124 
00125     /// Get the status message of the last call that sets status.
00126     inline const char* GetStatusMessage()
00127     {
00128         return(myStatus.getStatusMessage());
00129     }
00130 
00131     /// Sets what part of the BAM file should be read.  This version will
00132     /// set it to only read a specific reference id.  The records for that
00133     /// reference id will be retrieved on each ReadRecord call.  When all
00134     /// records have been retrieved for the specified reference id, ReadRecord
00135     /// will return failure until a new read section is set.
00136     /// Must be called only after the file has been opened for reading.
00137     /// \param  refID the reference ID of the records to read from the file.
00138     /// \return true = success; false = failure.
00139     bool SetReadSection(int32_t refID);
00140 
00141     /// Sets what part of the BAM file should be read.  This version will
00142     /// set it to only read a specific reference name.  The records for that
00143     /// reference name will be retrieved on each ReadRecord call.  When all
00144     /// records have been retrieved for the specified reference name,
00145     /// ReadRecord will return failure until a new read section is set.
00146     /// Must be called only after the file has been opened for reading.
00147     /// \param  refName the reference name of the records to read from the file.
00148     /// \return true = success; false = failure.
00149     bool SetReadSection(const char* refName);
00150 
00151     /// Sets what part of the BAM file should be read.  This version will
00152     /// set it to only read a specific reference id and start/end position.
00153     /// The records for this section will be retrieved on each ReadRecord
00154     /// call.  When all records have been retrieved for the specified section,
00155     /// ReadRecord will return failure until a new read section is set.
00156     /// Must be called only after the file has been opened for reading.
00157     /// \param  refID the reference ID of the records to read from the file.
00158     /// \param  start inclusive 0-based start position of records that should be read for this refID.
00159     /// \param  end exclusive 0-based end position of records that should be read for this refID.
00160     /// \return true = success; false = failure.   
00161     bool SetReadSection(int32_t refID, int32_t start, int32_t end);
00162 
00163     /// Sets what part of the BAM file should be read.  This version will
00164     /// set it to only read a specific reference name and start/end position.
00165     /// The records for this section will be retrieved on each ReadRecord
00166     /// call.  When all records have been retrieved for the specified section,
00167     /// ReadRecord will return failure until a new read section is set.
00168     /// Must be called only after the file has been opened for reading.
00169     /// \param  refName the reference name of the records to read from the file.
00170     /// \param  start inclusive 0-based start position of records that should be read for this refName.
00171     /// \param  end exclusive 0-based end position of records that should be read for this refName.
00172     /// \return true = success; false = failure.   
00173     bool SetReadSection(const char* refName, int32_t start, int32_t end);
00174 
00175     /// Returns the number of bases in the passed in read that overlap the
00176     /// region that is currently set.
00177     /// \param samRecord to check for overlapping bases.
00178     /// \return number of bases that overlap region that is currently set.
00179     uint32_t GetNumOverlaps(SamRecord& samRecord);
00180 
00181     /// Whether or not statistics should be generated for this file.
00182     /// The value is carried over between files and is not reset, but
00183     /// the statistics themselves are reset between files.
00184     /// \param genStats set to true if statistics should be generated, false if not.
00185     void GenerateStatistics(bool genStats);
00186 
          /// Print the statistics by calling print() on the statistics object.
          /// Does nothing when no statistics object exists (myStatistics is NULL).
00187     inline void PrintStatistics() {if(myStatistics != NULL) myStatistics->print();}
00188 
00189 protected:
          // NOTE(review): undocumented in the original and the implementation is
          // not visible here; presumably resets the per-file state - confirm.
00190     void resetFile();
00191 
00192     /// Validate that the record is sorted compared to the previously read record
00193     /// if there is one, according to the specified sort order.
00194     /// If the sort order is UNSORTED, true is returned.
          /// \param record the record to check.
          /// \param header header for the file the record belongs to.
          /// \return presumably true = sort order holds, false = it is violated
          ///         (only the UNSORTED case is stated above - confirm).
00195     bool validateSortOrder(SamRecord& record, SamFileHeader& header);
00196    
00197     /// Return the sort order as defined by the header.  If it is undefined
00198     /// or set to an unknown value, UNSORTED is returned.
00199     SortedType getSortOrderFromHeader(SamFileHeader& header);
00200 
00201     /// Overwrites read record to read from the specific reference only.
          // NOTE(review): "Overwrites" probably means "overrides/replaces the
          // normal record read when a read section is set" - confirm.
00202     bool readIndexedRecord(SamFileHeader& header, SamRecord& record);
00203 
          // NOTE(review): undocumented in the original; presumably performs the
          // setup for a newly set read section (see myNewSection) - confirm.
00204     bool processNewSection(SamFileHeader &header);
00205 
          // File handle and the interface object used to access the file.
          // NOTE(review): ownership/lifetime of myInterfacePtr is not visible in
          // this header - confirm in the implementation.
00206     IFILE  myFilePtr;
00207     GenericSamInterface* myInterfacePtr;
00208 
00209     /// Flag to indicate if a file is open for reading.
00210     bool myIsOpenForRead;
00211     /// Flag to indicate if a file is open for writing.
00212     bool myIsOpenForWrite;
00213     /// Flag to indicate if a header has been read/written - required before
00214     /// being able to read/write a record.
00215     bool myHasHeader;
00216 
          /// Sort order used for validation (see setSortedValidation).
00217     SortedType mySortedType;
00218 
00219     /// Previous values used for checking if the file is sorted.
00220     int32_t myPrevCoord;
00221     int32_t myPrevRefID;
00222     std::string myPrevReadName;
00223 
00224     /// Keep a count of the number of records that have been read/written so far.
00225     uint32_t myRecordCount;
00226 
00227     /// Pointer to the statistics for this file.
          /// May be NULL; PrintStatistics() checks for that before using it.
00228     SamStatistics* myStatistics;
00229    
00230     /// The status of the last SamFile command.
00231     SamStatus myStatus;
00232 
00233     /// Values for reading Sorted BAM files via the index.
00234     bool myIsBamOpenForRead;
00235     bool myNewSection;
00236     int32_t myRefID;
00237     int32_t myStartPos;
00238     int32_t myEndPos;
00239     uint64_t myCurrentChunkEnd;
00240     SortedChunkList myChunksToRead;
00241     BamIndex* myBamIndex;
00242 
          // NOTE(review): undocumented in the original; presumably the reference
          // name given to the SetReadSection(const char*, ...) overloads - confirm.
00243     std::string myRefName;
00244 };
00245 
00246 
00247 class SamFileReader : public SamFile
00248 {
      // SamFile subclass for reading.  It declares only constructors and a
      // destructor; all file operations are the ones inherited from SamFile.
00249 public:
00250 
00251     /// Default Constructor.
00252     SamFileReader();
00253 
00254     /// Constructor that opens the specified file for read.
          /// \param filename name of the file to open for reading.
00255     SamFileReader(const char* filename);
00256 
          /// Destructor.
00257     virtual ~SamFileReader();
00258 };
00259 
00260 
00261 class SamFileWriter : public SamFile
00262 {
      // SamFile subclass for writing.  It declares only constructors and a
      // destructor; all file operations are the ones inherited from SamFile.
00263 public:
00264     /// Default Constructor.
00265     SamFileWriter();
00266 
00267     /// Constructor that opens the specified file for write.
          /// \param filename name of the file to open for writing.
00268     SamFileWriter(const char* filename);
00269 
          /// Destructor.
00270     virtual ~SamFileWriter();
00271 };
00272 
00273 #endif
Generated on Wed Nov 17 15:38:27 2010 for StatGen Software by  doxygen 1.6.3