00001 /* 00002 * Copyright (C) 2010 Regents of the University of Michigan 00003 * 00004 * This program is free software: you can redistribute it and/or modify 00005 * it under the terms of the GNU General Public License as published by 00006 * the Free Software Foundation, either version 3 of the License, or 00007 * (at your option) any later version. 00008 * 00009 * This program is distributed in the hope that it will be useful, 00010 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 * GNU General Public License for more details. 00013 * 00014 * You should have received a copy of the GNU General Public License 00015 * along with this program. If not, see <http://www.gnu.org/licenses/>. 00016 */ 00017 00018 #ifndef __SAM_FILE_H__ 00019 #define __SAM_FILE_H__ 00020 00021 #include "SamStatus.h" 00022 #include "InputFile.h" 00023 #include "SamFileHeader.h" 00024 #include "SamRecord.h" 00025 #include "GenericSamInterface.h" 00026 #include "BamIndex.h" 00027 #include "SamStatistics.h" 00028 00029 /// Allows the user to easily read/write a SAM/BAM file. 00030 class SamFile 00031 { 00032 public: 00033 /// Enum for indicating whether to open the file for read or write. 00034 enum OpenType { 00035 READ, ///< open for reading. 00036 WRITE ///< open for writing. 00037 }; 00038 00039 00040 /// Enum for indicating the type of sort for the file. 00041 enum SortedType { 00042 UNSORTED = 0, ///< file is not sorted. 00043 FLAG, ///< SO flag from the header indicates the sort type. 00044 COORDINATE, ///< file is sorted by coordinate. 00045 QUERY_NAME ///< file is sorted by queryname. 00046 }; 00047 00048 /// Default Constructor. 00049 SamFile(); 00050 00051 /// Constructor that sets the error handling type. 00052 /// \param errorHandlingType how to handle errors. 00053 SamFile(ErrorHandler::HandlingType errorHandlingType); 00054 00055 /// Constructor that opens the specified file based on the specified mode 00056 /// (READ/WRITE). 00057 /// \param filename name of the file to open. 00058 /// \param mode mode to use for opening the file. 00059 SamFile(const char* filename, OpenType mode); 00060 00061 /// Constructor that opens the specified file based on the specified mode 00062 /// (READ/WRITE) and handles errors per the specified handleType. 00063 /// \param filename name of the file to open. 00064 /// \param mode mode to use for opening the file. 00065 /// \param errorHandlingType how to handle errors. 00066 SamFile(const char* filename, OpenType mode, 00067 ErrorHandler::HandlingType errorHandlingType); 00068 00069 virtual ~SamFile(); 00070 00071 /// Open a sam/bam file for reading with the specified filename. 00072 /// \param filename: the sam/bam file to open for reading. 00073 /// \return true = success; false = failure. 00074 bool OpenForRead(const char * filename); 00075 00076 /// Open a sam/bam file for writing with the specified filename. 00077 /// \return true = success; false = failure. 00078 bool OpenForWrite(const char * filename); 00079 00080 /// Reads the specified bam index file. It must be read prior to setting a 00081 /// read section, for seeking and reading portions of a bam file. 00082 /// \return true = success; false = failure. 00083 bool ReadBamIndex(const char * filename); 00084 00085 /// Sets the reference to the specified genome sequence object. 00086 /// \param reference pointer to the GenomeSequence object. 00087 void SetReference(GenomeSequence* reference); 00088 00089 /// Set the type of sequence translation to use when reading 00090 /// the sequence. Passed down to the SamRecord when it is read. 00091 // The default type (if this method is never called) is 00092 /// NONE (the sequence is left as-is). 00093 /// \param translation type of sequence translation to use. 00094 void SetReadSequenceTranslation(SamRecord::SequenceTranslation translation); 00095 00096 /// Set the type of sequence translation to use when writing 00097 /// the sequence. Passed down to the SamRecord when it is written. 00098 /// The default type (if this method is never called) is 00099 /// NONE (the sequence is left as-is). 00100 /// \param translation type of sequence translation to use. 00101 void SetWriteSequenceTranslation(SamRecord::SequenceTranslation translation); 00102 00103 /// Close the file if there is one open. 00104 void Close(); 00105 00106 /// Returns whether or not the end of the file has been reached. 00107 /// \return true = EOF; false = not eof. 00108 /// If the file is not open, false is returned. 00109 bool IsEOF(); 00110 00111 /// Reads the header section from the file and stores it in 00112 /// the passed in header. 00113 /// \return true = success; false = failure. 00114 bool ReadHeader(SamFileHeader& header); 00115 00116 /// Writes the specified header into the file. 00117 /// \return true = success; false = failure. 00118 bool WriteHeader(SamFileHeader& header); 00119 00120 /// Reads the next record from the file & stores it in the passed in record. 00121 /// \return true = record was successfully set. 00122 /// false = record was not successfully set. 00123 bool ReadRecord(SamFileHeader& header, SamRecord& record); 00124 00125 /// Writes the specified record into the file. 00126 /// \return true = success; false = failure. 00127 bool WriteRecord(SamFileHeader& header, SamRecord& record); 00128 00129 /// Set the flag to validate that the file is sorted as it is read/written. 00130 /// Must be called after the file has been opened. 00131 void setSortedValidation(SortedType sortType); 00132 00133 /// Return the number of records that have been read/written so far. 00134 uint32_t GetCurrentRecordCount(); 00135 00136 /// Get the Status of the last call that sets status. 00137 /// To remain backwards compatable - will be removed later. 00138 inline SamStatus::Status GetFailure() 00139 { 00140 return(GetStatus()); 00141 } 00142 00143 /// Get the Status of the last call that sets status. 00144 inline SamStatus::Status GetStatus() 00145 { 00146 return(myStatus.getStatus()); 00147 } 00148 00149 /// Get the Status of the last call that sets status. 00150 inline const char* GetStatusMessage() 00151 { 00152 return(myStatus.getStatusMessage()); 00153 } 00154 00155 /// Sets what part of the BAM file should be read. This version will 00156 /// set it to only read a specific reference id. The records for that 00157 /// reference id will be retrieved on each ReadRecord call. When all 00158 /// records have been retrieved for the specified reference id, ReadRecord 00159 /// will return failure until a new read section is set. 00160 /// Must be called only after the file has been opened for reading. 00161 /// \param refID the reference ID of the records to read from the file. 00162 /// \return true = success; false = failure. 00163 bool SetReadSection(int32_t refID); 00164 00165 /// Sets what part of the BAM file should be read. This version will 00166 /// set it to only read a specific reference name. The records for that 00167 /// reference id will be retrieved on each ReadRecord call. When all 00168 /// records have been retrieved for the specified reference name, 00169 /// ReadRecord will return failure until a new read section is set. 00170 /// Must be called only after the file has been opened for reading. 00171 /// \param refName the reference name of the records to read from the file. 00172 /// \return true = success; false = failure. 00173 bool SetReadSection(const char* refName); 00174 00175 /// Sets what part of the BAM file should be read. This version will 00176 /// set it to only read a specific reference id and start/end position. 00177 /// The records for this section will be retrieved on each ReadRecord 00178 /// call. When all records have been retrieved for the specified section, 00179 /// ReadRecord will return failure until a new read section is set. 00180 /// Must be called only after the file has been opened for reading. 00181 /// \param refID the reference ID of the records to read from the file. 00182 /// \param start inclusive 0-based start position of records that should be read for this refID. 00183 /// \param end exclusive 0-based end position of records that should be read for this refID. 00184 /// \return true = success; false = failure. 00185 bool SetReadSection(int32_t refID, int32_t start, int32_t end); 00186 00187 /// Sets what part of the BAM file should be read. This version will 00188 /// set it to only read a specific reference name and start/end position. 00189 /// The records for this section will be retrieved on each ReadRecord 00190 /// call. When all records have been retrieved for the specified section, 00191 /// ReadRecord will return failure until a new read section is set. 00192 /// Must be called only after the file has been opened for reading. 00193 /// \param refName the reference name of the records to read from the file. 00194 /// \param start inclusive 0-based start position of records that should be read for this refID. 00195 /// \param end exclusive 0-based end position of records that should be read for this refID. 00196 /// \return true = success; false = failure. 00197 bool SetReadSection(const char* refName, int32_t start, int32_t end); 00198 00199 /// Returns the number of bases in the passed in read that overlap the 00200 /// region that is currently set. 00201 /// \param samRecord to check for overlapping bases. 00202 /// \return number of bases that overlap region that is currently set. 00203 uint32_t GetNumOverlaps(SamRecord& samRecord); 00204 00205 /// Whether or not statistics should be generated for this file. 00206 /// The value is carried over between files and is not reset, but 00207 /// the statistics themselves are reset between files. 00208 /// \param genStats set to true if statistics should be generated, false if not. 00209 void GenerateStatistics(bool genStats); 00210 00211 inline void PrintStatistics() {if(myStatistics != NULL) myStatistics->print();} 00212 00213 protected: 00214 /// Resets the file prepping for a new file. 00215 void resetFile(); 00216 00217 /// Validate that the record is sorted compared to the previously read 00218 /// record if there is one, according to the specified sort order. 00219 /// If the sort order is UNSORTED, true is returned. 00220 bool validateSortOrder(SamRecord& record, SamFileHeader& header); 00221 00222 // Return the sort order as defined by the header. If it is undefined 00223 // or set to an unknown value, UNSORTED is returned. 00224 SortedType getSortOrderFromHeader(SamFileHeader& header); 00225 00226 /// Overwrites read record to read from the specific reference only. 00227 bool readIndexedRecord(SamFileHeader& header, SamRecord& record); 00228 00229 bool processNewSection(SamFileHeader &header); 00230 00231 IFILE myFilePtr; 00232 GenericSamInterface* myInterfacePtr; 00233 00234 /// Flag to indicate if a file is open for reading. 00235 bool myIsOpenForRead; 00236 /// Flag to indicate if a file is open for writing. 00237 bool myIsOpenForWrite; 00238 /// Flag to indicate if a header has been read/written - required before 00239 /// being able to read/write a record. 00240 bool myHasHeader; 00241 00242 SortedType mySortedType; 00243 00244 /// Previous values used for checking if the file is sorted. 00245 int32_t myPrevCoord; 00246 int32_t myPrevRefID; 00247 std::string myPrevReadName; 00248 00249 /// Keep a count of the number of records that have been read/written so far. 00250 uint32_t myRecordCount; 00251 00252 /// Pointer to the statistics for this file. 00253 SamStatistics* myStatistics; 00254 00255 /// The status of the last SamFile command. 00256 SamStatus myStatus; 00257 00258 /// Values for reading Sorted BAM files via the index. 00259 bool myIsBamOpenForRead; 00260 bool myNewSection; 00261 int32_t myRefID; 00262 int32_t myStartPos; 00263 int32_t myEndPos; 00264 uint64_t myCurrentChunkEnd; 00265 SortedChunkList myChunksToRead; 00266 BamIndex* myBamIndex; 00267 00268 GenomeSequence* myRefPtr; 00269 SamRecord::SequenceTranslation myReadTranslation; 00270 SamRecord::SequenceTranslation myWriteTranslation; 00271 00272 std::string myRefName; 00273 }; 00274 00275 00276 class SamFileReader : public SamFile 00277 { 00278 public: 00279 00280 /// Default Constructor. 00281 SamFileReader(); 00282 00283 /// Constructor that opens the specified file for read. 00284 SamFileReader(const char* filename); 00285 00286 virtual ~SamFileReader(); 00287 }; 00288 00289 00290 class SamFileWriter : public SamFile 00291 { 00292 public: 00293 /// Default Constructor. 00294 SamFileWriter(); 00295 00296 /// Constructor that opens the specified file for write. 00297 SamFileWriter(const char* filename); 00298 00299 virtual ~SamFileWriter(); 00300 }; 00301 00302 #endif
1.6.3