00001 /* 00002 * Copyright (C) 2010 Regents of the University of Michigan 00003 * 00004 * This program is free software: you can redistribute it and/or modify 00005 * it under the terms of the GNU General Public License as published by 00006 * the Free Software Foundation, either version 3 of the License, or 00007 * (at your option) any later version. 00008 * 00009 * This program is distributed in the hope that it will be useful, 00010 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 * GNU General Public License for more details. 00013 * 00014 * You should have received a copy of the GNU General Public License 00015 * along with this program. If not, see <http://www.gnu.org/licenses/>. 00016 */ 00017 00018 #ifndef __SAM_FILE_H__ 00019 #define __SAM_FILE_H__ 00020 00021 #include "SamStatus.h" 00022 #include "InputFile.h" 00023 #include "SamFileHeader.h" 00024 #include "SamRecord.h" 00025 #include "GenericSamInterface.h" 00026 #include "BamIndex.h" 00027 #include "SamStatistics.h" 00028 00029 class SamFile 00030 { 00031 public: 00032 enum OpenType {READ, WRITE}; 00033 00034 /// Enum for indicating the type of sort for the file. 00035 enum SortedType { 00036 UNSORTED = 0, ///< file is not sorted. 00037 FLAG, ///< SO flag from the header indicates the sort type. 00038 COORDINATE, ///< file is sorted by coordinate. 00039 QUERY_NAME ///< file is sorted by queryname. 00040 }; 00041 00042 /// Default Constructor. 00043 SamFile(); 00044 00045 /// Constructor that sets the error handling type. 00046 /// \param errorHandlingType how to handle errors. 00047 SamFile(ErrorHandler::HandlingType errorHandlingType); 00048 00049 /// Constructor that opens the specified file based on the specified mode 00050 /// (READ/WRITE). 00051 /// \param filename name of the file to open. 00052 /// \param mode mode to use for opening the file. 00053 SamFile(const char* filename, OpenType mode); 00054 00055 /// Constructor that opens the specified file based on the specified mode 00056 /// (READ/WRITE) and handles errors per the specified handleType. 00057 /// \param filename name of the file to open. 00058 /// \param mode mode to use for opening the file. 00059 /// \param errorHandlingType how to handle errors. 00060 SamFile(const char* filename, OpenType mode, 00061 ErrorHandler::HandlingType errorHandlingType); 00062 00063 virtual ~SamFile(); 00064 00065 /// Open a sam/bam file for reading with the specified filename. 00066 /// \param filename: the sam/bam file to open for reading. 00067 /// \return true = success; false = failure. 00068 bool OpenForRead(const char * filename); 00069 00070 /// Open a sam/bam file for writing with the specified filename. 00071 /// \return true = success; false = failure. 00072 bool OpenForWrite(const char * filename); 00073 00074 /// Reads the specified bam index file. It must be read prior to setting a 00075 /// read section, for seeking and reading portions of a bam file. 00076 /// \return true = success; false = failure. 00077 bool ReadBamIndex(const char * filename); 00078 00079 /// Close the file if there is one open. 00080 void Close(); 00081 00082 /// Returns whether or not the end of the file has been reached. 00083 /// \return true = EOF; false = not eof. 00084 /// If the file is not open, false is returned. 00085 bool IsEOF(); 00086 00087 /// Reads the header section from the file and stores it in 00088 /// the passed in header. 00089 /// \return true = success; false = failure. 00090 bool ReadHeader(SamFileHeader& header); 00091 00092 /// Writes the specified header into the file. 00093 /// \return true = success; false = failure. 00094 bool WriteHeader(SamFileHeader& header); 00095 00096 /// Reads the next record from the file & stores it in the passed in record. 00097 /// \return true = record was successfully set. 00098 /// false = record was not successfully set. 00099 bool ReadRecord(SamFileHeader& header, SamRecord& record); 00100 00101 /// Writes the specified record into the file. 00102 /// \return true = success; false = failure. 00103 bool WriteRecord(SamFileHeader& header, SamRecord& record); 00104 00105 /// Set the flag to validate that the file is sorted as it is read/written. 00106 /// Must be called after the file has been opened. 00107 void setSortedValidation(SortedType sortType); 00108 00109 /// Return the number of records that have been read/written so far. 00110 uint32_t GetCurrentRecordCount(); 00111 00112 /// Get the Status of the last call that sets status. 00113 /// To remain backwards compatable - will be removed later. 00114 inline SamStatus::Status GetFailure() 00115 { 00116 return(GetStatus()); 00117 } 00118 00119 /// Get the Status of the last call that sets status. 00120 inline SamStatus::Status GetStatus() 00121 { 00122 return(myStatus.getStatus()); 00123 } 00124 00125 /// Get the Status of the last call that sets status. 00126 inline const char* GetStatusMessage() 00127 { 00128 return(myStatus.getStatusMessage()); 00129 } 00130 00131 /// Sets what part of the BAM file should be read. This version will 00132 /// set it to only read a specific reference id. The records for that 00133 /// reference id will be retrieved on each ReadRecord call. When all 00134 /// records have been retrieved for the specified reference id, ReadRecord 00135 /// will return failure until a new read section is set. 00136 /// Must be called only after the file has been opened for reading. 00137 /// \param refID the reference ID of the records to read from the file. 00138 /// \return true = success; false = failure. 00139 bool SetReadSection(int32_t refID); 00140 00141 /// Sets what part of the BAM file should be read. This version will 00142 /// set it to only read a specific reference name. The records for that 00143 /// reference id will be retrieved on each ReadRecord call. When all 00144 /// records have been retrieved for the specified reference name, 00145 /// ReadRecord will return failure until a new read section is set. 00146 /// Must be called only after the file has been opened for reading. 00147 /// \param refName the reference name of the records to read from the file. 00148 /// \return true = success; false = failure. 00149 bool SetReadSection(const char* refName); 00150 00151 /// Sets what part of the BAM file should be read. This version will 00152 /// set it to only read a specific reference id and start/end position. 00153 /// The records for this section will be retrieved on each ReadRecord 00154 /// call. When all records have been retrieved for the specified section, 00155 /// ReadRecord will return failure until a new read section is set. 00156 /// Must be called only after the file has been opened for reading. 00157 /// \param refID the reference ID of the records to read from the file. 00158 /// \param start inclusive 0-based start position of records that should be read for this refID. 00159 /// \param end exclusive 0-based end position of records that should be read for this refID. 00160 /// \return true = success; false = failure. 00161 bool SetReadSection(int32_t refID, int32_t start, int32_t end); 00162 00163 /// Sets what part of the BAM file should be read. This version will 00164 /// set it to only read a specific reference name and start/end position. 00165 /// The records for this section will be retrieved on each ReadRecord 00166 /// call. When all records have been retrieved for the specified section, 00167 /// ReadRecord will return failure until a new read section is set. 00168 /// Must be called only after the file has been opened for reading. 00169 /// \param refName the reference name of the records to read from the file. 00170 /// \param start inclusive 0-based start position of records that should be read for this refID. 00171 /// \param end exclusive 0-based end position of records that should be read for this refID. 00172 /// \return true = success; false = failure. 00173 bool SetReadSection(const char* refName, int32_t start, int32_t end); 00174 00175 /// Returns the number of bases in the passed in read that overlap the 00176 /// region that is currently set. 00177 /// \param samRecord to check for overlapping bases. 00178 /// \return number of bases that overlap region that is currently set. 00179 uint32_t GetNumOverlaps(SamRecord& samRecord); 00180 00181 /// Whether or not statistics should be generated for this file. 00182 /// The value is carried over between files and is not reset, but 00183 /// the statistics themselves are reset between files. 00184 /// \param genStats set to true if statistics should be generated, false if not. 00185 void GenerateStatistics(bool genStats); 00186 00187 inline void PrintStatistics() {if(myStatistics != NULL) myStatistics->print();} 00188 00189 protected: 00190 void resetFile(); 00191 00192 /// Validate that the record is sorted compared to the previously read record 00193 /// if there is one, according to the specified sort order. 00194 /// If the sort order is UNSORTED, true is returned. 00195 bool validateSortOrder(SamRecord& record, SamFileHeader& header); 00196 00197 // Return the sort order as defined by the header. If it is undefined 00198 // or set to an unknown value, UNSORTED is returned. 00199 SortedType getSortOrderFromHeader(SamFileHeader& header); 00200 00201 /// Overwrites read record to read from the specific reference only. 00202 bool readIndexedRecord(SamFileHeader& header, SamRecord& record); 00203 00204 bool processNewSection(SamFileHeader &header); 00205 00206 IFILE myFilePtr; 00207 GenericSamInterface* myInterfacePtr; 00208 00209 /// Flag to indicate if a file is open for reading. 00210 bool myIsOpenForRead; 00211 /// Flag to indicate if a file is open for writing. 00212 bool myIsOpenForWrite; 00213 /// Flag to indicate if a header has been read/written - required before 00214 /// being able to read/write a record. 00215 bool myHasHeader; 00216 00217 SortedType mySortedType; 00218 00219 /// Previous values used for checking if the file is sorted. 00220 int32_t myPrevCoord; 00221 int32_t myPrevRefID; 00222 std::string myPrevReadName; 00223 00224 /// Keep a count of the number of records that have been read/written so far. 00225 uint32_t myRecordCount; 00226 00227 /// Pointer to the statistics for this file. 00228 SamStatistics* myStatistics; 00229 00230 /// The status of the last SamFile command. 00231 SamStatus myStatus; 00232 00233 /// Values for reading Sorted BAM files via the index. 00234 bool myIsBamOpenForRead; 00235 bool myNewSection; 00236 int32_t myRefID; 00237 int32_t myStartPos; 00238 int32_t myEndPos; 00239 uint64_t myCurrentChunkEnd; 00240 SortedChunkList myChunksToRead; 00241 BamIndex* myBamIndex; 00242 00243 std::string myRefName; 00244 }; 00245 00246 00247 class SamFileReader : public SamFile 00248 { 00249 public: 00250 00251 /// Default Constructor. 00252 SamFileReader(); 00253 00254 /// Constructor that opens the specified file for read. 00255 SamFileReader(const char* filename); 00256 00257 virtual ~SamFileReader(); 00258 }; 00259 00260 00261 class SamFileWriter : public SamFile 00262 { 00263 public: 00264 /// Default Constructor. 00265 SamFileWriter(); 00266 00267 /// Constructor that opens the specified file for write. 00268 SamFileWriter(const char* filename); 00269 00270 virtual ~SamFileWriter(); 00271 }; 00272 00273 #endif