00001 /* 00002 * Copyright (C) 2010 Regents of the University of Michigan 00003 * 00004 * This program is free software: you can redistribute it and/or modify 00005 * it under the terms of the GNU General Public License as published by 00006 * the Free Software Foundation, either version 3 of the License, or 00007 * (at your option) any later version. 00008 * 00009 * This program is distributed in the hope that it will be useful, 00010 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 * GNU General Public License for more details. 00013 * 00014 * You should have received a copy of the GNU General Public License 00015 * along with this program. If not, see <http://www.gnu.org/licenses/>. 00016 */ 00017 00018 #ifndef __SAM_FILE_H__ 00019 #define __SAM_FILE_H__ 00020 00021 #include "SamStatus.h" 00022 #include "InputFile.h" 00023 #include "SamFileHeader.h" 00024 #include "SamRecord.h" 00025 #include "GenericSamInterface.h" 00026 #include "BamIndex.h" 00027 #include "SamStatistics.h" 00028 00029 /// Allows the user to easily read/write a SAM/BAM file. 00030 class SamFile 00031 { 00032 public: 00033 /// Enum for indicating whether to open the file for read or write. 00034 enum OpenType { 00035 READ, ///< open for reading. 00036 WRITE ///< open for writing. 00037 }; 00038 00039 00040 /// Enum for indicating the type of sort for the file. 00041 enum SortedType { 00042 UNSORTED = 0, ///< file is not sorted. 00043 FLAG, ///< SO flag from the header indicates the sort type. 00044 COORDINATE, ///< file is sorted by coordinate. 00045 QUERY_NAME ///< file is sorted by queryname. 00046 }; 00047 00048 /// Default Constructor. 00049 SamFile(); 00050 00051 /// Constructor that sets the error handling type. 00052 /// \param errorHandlingType how to handle errors. 00053 SamFile(ErrorHandler::HandlingType errorHandlingType); 00054 00055 /// Constructor that opens the specified file based on the specified mode 00056 /// (READ/WRITE). 00057 /// \param filename name of the file to open. 00058 /// \param mode mode to use for opening the file. 00059 SamFile(const char* filename, OpenType mode); 00060 00061 /// Constructor that opens the specified file based on the specified mode 00062 /// (READ/WRITE) and handles errors per the specified handleType. 00063 /// \param filename name of the file to open. 00064 /// \param mode mode to use for opening the file. 00065 /// \param errorHandlingType how to handle errors. 00066 SamFile(const char* filename, OpenType mode, 00067 ErrorHandler::HandlingType errorHandlingType); 00068 00069 /// Constructor that opens the specified file based on the specified mode 00070 /// (READ/WRITE). 00071 /// \param filename name of the file to open. 00072 /// \param mode mode to use for opening the file. 00073 /// \param header to read into or write from 00074 SamFile(const char* filename, OpenType mode, SamFileHeader* header); 00075 00076 /// Constructor that opens the specified file based on the specified mode 00077 /// (READ/WRITE) and handles errors per the specified handleType. 00078 /// \param filename name of the file to open. 00079 /// \param mode mode to use for opening the file. 00080 /// \param errorHandlingType how to handle errors. 00081 /// \param header to read into or write from 00082 SamFile(const char* filename, OpenType mode, 00083 ErrorHandler::HandlingType errorHandlingType, 00084 SamFileHeader* header); 00085 00086 virtual ~SamFile(); 00087 00088 /// Open a sam/bam file for reading with the specified filename. 00089 /// \param filename the sam/bam file to open for reading. 00090 /// \param header to read into or write from (optional) 00091 /// \return true = success; false = failure. 00092 bool OpenForRead(const char * filename, SamFileHeader* header = NULL); 00093 00094 /// Open a sam/bam file for writing with the specified filename. 00095 /// \param filename the sam/bam file to open for writing. 00096 /// \param header to read into or write from (optional) 00097 /// \return true = success; false = failure. 00098 bool OpenForWrite(const char * filename, SamFileHeader* header = NULL); 00099 00100 /// Read the specified bam index file. It must be read prior to setting a 00101 /// read section, for seeking and reading portions of a bam file. 00102 /// \param filename the name of the bam index file to be read. 00103 /// \return true = success; false = failure. 00104 bool ReadBamIndex(const char * filename); 00105 00106 /// Read the bam index file using the BAM filename as a base. 00107 /// It must be read prior to setting a read section, for seeking 00108 /// and reading portions of a bam file. 00109 /// Must be read after opening the BAM file since it uses the 00110 /// BAM filename as a base name for the index file. 00111 /// First it tries filename.bam.bai. If that fails, it tries 00112 /// it without the .bam extension, filename.bai. 00113 /// \return true = success; false = failure. 00114 bool ReadBamIndex(); 00115 00116 /// Sets the reference to the specified genome sequence object. 00117 /// \param reference pointer to the GenomeSequence object. 00118 void SetReference(GenomeSequence* reference); 00119 00120 /// Set the type of sequence translation to use when reading 00121 /// the sequence. Passed down to the SamRecord when it is read. 00122 // The default type (if this method is never called) is 00123 /// NONE (the sequence is left as-is). 00124 /// \param translation type of sequence translation to use. 00125 void SetReadSequenceTranslation(SamRecord::SequenceTranslation translation); 00126 00127 /// Set the type of sequence translation to use when writing 00128 /// the sequence. Passed down to the SamRecord when it is written. 00129 /// The default type (if this method is never called) is 00130 /// NONE (the sequence is left as-is). 00131 /// \param translation type of sequence translation to use. 00132 void SetWriteSequenceTranslation(SamRecord::SequenceTranslation translation); 00133 00134 /// Close the file if there is one open. 00135 void Close(); 00136 00137 /// Returns whether or not the file has been opened successfully. 00138 /// \return true = open; false = not open. 00139 bool IsOpen(); 00140 00141 /// Returns whether or not the end of the file has been reached. 00142 /// \return true = EOF; false = not eof. 00143 /// If the file is not open, false is returned. 00144 bool IsEOF(); 00145 00146 /// Reads the header section from the file and stores it in 00147 /// the passed in header. 00148 /// \return true = success; false = failure. 00149 bool ReadHeader(SamFileHeader& header); 00150 00151 /// Writes the specified header into the file. 00152 /// \return true = success; false = failure. 00153 bool WriteHeader(SamFileHeader& header); 00154 00155 /// Reads the next record from the file & stores it in the passed in record. 00156 /// \return true = record was successfully set. 00157 /// false = record was not successfully set. 00158 bool ReadRecord(SamFileHeader& header, SamRecord& record); 00159 00160 /// Writes the specified record into the file. 00161 /// \return true = success; false = failure. 00162 bool WriteRecord(SamFileHeader& header, SamRecord& record); 00163 00164 /// Set the flag to validate that the file is sorted as it is read/written. 00165 /// Must be called after the file has been opened. 00166 /// Sorting validation is reset everytime SetReadPosition is called since 00167 /// it can jump around in the file. 00168 void setSortedValidation(SortedType sortType); 00169 00170 /// Return the number of records that have been read/written so far. 00171 uint32_t GetCurrentRecordCount(); 00172 00173 /// Get the Status of the last call that sets status. 00174 /// To remain backwards compatable - will be removed later. 00175 inline SamStatus::Status GetFailure() 00176 { 00177 return(GetStatus()); 00178 } 00179 00180 /// Get the Status of the last call that sets status. 00181 inline SamStatus::Status GetStatus() 00182 { 00183 return(myStatus.getStatus()); 00184 } 00185 00186 /// Get the Status of the last call that sets status. 00187 inline const char* GetStatusMessage() 00188 { 00189 return(myStatus.getStatusMessage()); 00190 } 00191 00192 /// Sets what part of the BAM file should be read. This version will 00193 /// set it to only read a specific reference id. The records for that 00194 /// reference id will be retrieved on each ReadRecord call. When all 00195 /// records have been retrieved for the specified reference id, ReadRecord 00196 /// will return failure until a new read section is set. 00197 /// Must be called only after the file has been opened for reading. 00198 /// Sorting validation is reset everytime SetReadPosition is called since 00199 /// it can jump around in the file. 00200 /// \param refID the reference ID of the records to read from the file. 00201 /// \return true = success; false = failure. 00202 bool SetReadSection(int32_t refID); 00203 00204 /// Sets what part of the BAM file should be read. This version will 00205 /// set it to only read a specific reference name. The records for that 00206 /// reference id will be retrieved on each ReadRecord call. When all 00207 /// records have been retrieved for the specified reference name, 00208 /// ReadRecord will return failure until a new read section is set. 00209 /// Must be called only after the file has been opened for reading. 00210 /// Sorting validation is reset everytime SetReadPosition is called since 00211 /// it can jump around in the file. 00212 /// \param refName the reference name of the records to read from the file. 00213 /// \return true = success; false = failure. 00214 bool SetReadSection(const char* refName); 00215 00216 /// Sets what part of the BAM file should be read. This version will 00217 /// set it to only read a specific reference id and start/end position. 00218 /// The records for this section will be retrieved on each ReadRecord 00219 /// call. When all records have been retrieved for the specified section, 00220 /// ReadRecord will return failure until a new read section is set. 00221 /// Must be called only after the file has been opened for reading. 00222 /// Sorting validation is reset everytime SetReadPosition is called since 00223 /// it can jump around in the file. 00224 /// \param refID the reference ID of the records to read from the file. 00225 /// \param start inclusive 0-based start position of records that should be read for this refID. 00226 /// \param end exclusive 0-based end position of records that should be read for this refID. 00227 /// \param overlap When true (default), return reads that just overlap the region. When false, only return reads that fall completely within the region 00228 /// \return true = success; false = failure. 00229 bool SetReadSection(int32_t refID, int32_t start, int32_t end, 00230 bool overlap = true); 00231 00232 /// Sets what part of the BAM file should be read. This version will 00233 /// set it to only read a specific reference name and start/end position. 00234 /// The records for this section will be retrieved on each ReadRecord 00235 /// call. When all records have been retrieved for the specified section, 00236 /// ReadRecord will return failure until a new read section is set. 00237 /// Must be called only after the file has been opened for reading. 00238 /// Sorting validation is reset everytime SetReadPosition is called since 00239 /// it can jump around in the file. 00240 /// \param refName the reference name of the records to read from the file. 00241 /// \param start inclusive 0-based start position of records that should be read for this refID. 00242 /// \param end exclusive 0-based end position of records that should be read for this refID. 00243 /// \param overlap When true (default), return reads that just overlap the region. When false, only return reads that fall completely within the region 00244 /// \return true = success; false = failure. 00245 bool SetReadSection(const char* refName, int32_t start, int32_t end, 00246 bool overlap = true); 00247 00248 /// Get the number of mapped reads in the specified reference id. 00249 /// Returns -1 for out of range refIDs. 00250 /// \param refID reference ID for which to extract the number of mapped reads. 00251 /// \return number of mapped reads for the specified reference id. 00252 int32_t getNumMappedReadsFromIndex(int32_t refID); 00253 00254 /// Get the number of unmapped reads in the specified reference id. 00255 /// Returns -1 for out of range refIDs. 00256 /// \param refID reference ID for which to extract the number of unmapped reads. 00257 /// \return number of unmapped reads for the specified reference id. 00258 int32_t getNumUnMappedReadsFromIndex(int32_t refID); 00259 00260 /// Get the number of mapped reads in the specified reference name. 00261 /// Returns -1 for unknown reference names. 00262 /// \param refName reference name for which to extract the number of mapped reads. 00263 /// \param header header object containing the map from refName to refID 00264 /// \return number of mapped reads for the specified reference name. 00265 int32_t getNumMappedReadsFromIndex(const char* refName, 00266 SamFileHeader& header); 00267 00268 /// Get the number of unmapped reads in the specified reference name. 00269 /// Returns -1 for unknown reference names. 00270 /// \param refName reference name for which to extract the number of unmapped reads. 00271 /// \param header header object containing the map from refName to refID 00272 /// \return number of unmapped reads for the specified reference name. 00273 int32_t getNumUnMappedReadsFromIndex(const char* refName, 00274 SamFileHeader& header); 00275 00276 /// Returns the number of bases in the passed in read that overlap the 00277 /// region that is currently set. 00278 /// \param samRecord to check for overlapping bases. 00279 /// \return number of bases that overlap region that is currently set. 00280 uint32_t GetNumOverlaps(SamRecord& samRecord); 00281 00282 /// Whether or not statistics should be generated for this file. 00283 /// The value is carried over between files and is not reset, but 00284 /// the statistics themselves are reset between files. 00285 /// \param genStats set to true if statistics should be generated, false if not. 00286 void GenerateStatistics(bool genStats); 00287 00288 /// Return the bam index if one has been opened. 00289 /// \return const pointer to the bam index, or null if one has not been opened. 00290 const BamIndex* GetBamIndex(); 00291 00292 /// Get the current file position. 00293 /// \return current position in the file. 00294 inline long int GetCurrentPosition() 00295 { 00296 return(iftell(myFilePtr)); 00297 } 00298 00299 inline void DisableBuffering() 00300 { 00301 if(myFilePtr != NULL) 00302 { 00303 myFilePtr->disableBuffering(); 00304 } 00305 } 00306 00307 00308 inline void PrintStatistics() {if(myStatistics != NULL) myStatistics->print();} 00309 00310 protected: 00311 void init(const char* filename, OpenType mode, SamFileHeader* header); 00312 00313 /// Resets the file prepping for a new file. 00314 void resetFile(); 00315 00316 /// Validate that the record is sorted compared to the previously read 00317 /// record if there is one, according to the specified sort order. 00318 /// If the sort order is UNSORTED, true is returned. 00319 /// Sorting validation is reset everytime SetReadPosition is called since 00320 /// it can jump around in the file. 00321 bool validateSortOrder(SamRecord& record, SamFileHeader& header); 00322 00323 // Return the sort order as defined by the header. If it is undefined 00324 // or set to an unknown value, UNSORTED is returned. 00325 SortedType getSortOrderFromHeader(SamFileHeader& header); 00326 00327 /// Overwrites read record to read from the specific reference only. 00328 bool readIndexedRecord(SamFileHeader& header, SamRecord& record); 00329 00330 bool processNewSection(SamFileHeader &header); 00331 00332 IFILE myFilePtr; 00333 GenericSamInterface* myInterfacePtr; 00334 00335 /// Flag to indicate if a file is open for reading. 00336 bool myIsOpenForRead; 00337 /// Flag to indicate if a file is open for writing. 00338 bool myIsOpenForWrite; 00339 /// Flag to indicate if a header has been read/written - required before 00340 /// being able to read/write a record. 00341 bool myHasHeader; 00342 00343 SortedType mySortedType; 00344 00345 /// Previous values used for checking if the file is sorted. 00346 int32_t myPrevCoord; 00347 int32_t myPrevRefID; 00348 std::string myPrevReadName; 00349 00350 /// Keep a count of the number of records that have been read/written so far. 00351 uint32_t myRecordCount; 00352 00353 /// Pointer to the statistics for this file. 00354 SamStatistics* myStatistics; 00355 00356 /// The status of the last SamFile command. 00357 SamStatus myStatus; 00358 00359 /// Values for reading Sorted BAM files via the index. 00360 bool myIsBamOpenForRead; 00361 bool myNewSection; 00362 // whether to return reads that overlap (true) the section or 00363 // are fully enclosed (false) in the section. 00364 bool myOverlapSection; 00365 int32_t myRefID; 00366 int32_t myStartPos; 00367 int32_t myEndPos; 00368 uint64_t myCurrentChunkEnd; 00369 SortedChunkList myChunksToRead; 00370 BamIndex* myBamIndex; 00371 00372 GenomeSequence* myRefPtr; 00373 SamRecord::SequenceTranslation myReadTranslation; 00374 SamRecord::SequenceTranslation myWriteTranslation; 00375 00376 std::string myRefName; 00377 00378 private: 00379 bool myAttemptRecovery; 00380 00381 public: 00382 00383 bool attemptRecoverySync(bool (*checkSignature)(void *data) , int length); 00384 00385 void setAttemptRecovery(bool flag = false) 00386 { 00387 myAttemptRecovery = flag; 00388 } 00389 00390 }; 00391 00392 00393 class SamFileReader : public SamFile 00394 { 00395 public: 00396 00397 /// Default Constructor. 00398 SamFileReader(); 00399 00400 /// Constructor that opens the specified file for read. 00401 SamFileReader(const char* filename); 00402 00403 /// Constructor that opens the specified file for read. 00404 SamFileReader(const char* filename, 00405 ErrorHandler::HandlingType errorHandlingType); 00406 00407 /// Constructor that opens the specified file for read and reads 00408 /// the header from the file. 00409 SamFileReader(const char* filename, 00410 SamFileHeader* header); 00411 00412 /// Constructor that opens the specified file for read and reads 00413 /// the header from the file. 00414 SamFileReader(const char* filename, 00415 ErrorHandler::HandlingType errorHandlingType, 00416 SamFileHeader* header); 00417 00418 virtual ~SamFileReader(); 00419 }; 00420 00421 00422 class SamFileWriter : public SamFile 00423 { 00424 public: 00425 /// Default Constructor. 00426 SamFileWriter(); 00427 00428 /// Constructor that opens the specified file for write. 00429 SamFileWriter(const char* filename); 00430 00431 /// Constructor that opens the specified file for write. 00432 SamFileWriter(const char* filename, 00433 ErrorHandler::HandlingType errorHandlingType); 00434 00435 /// Constructor that opens the specified file for write and write 00436 /// the specified header into the file. 00437 SamFileWriter(const char* filename, 00438 SamFileHeader* header); 00439 00440 /// Constructor that opens the specified file for write and write 00441 /// the specified header into the file. 00442 SamFileWriter(const char* filename, 00443 ErrorHandler::HandlingType errorHandlingType, 00444 SamFileHeader* header); 00445 00446 virtual ~SamFileWriter(); 00447 }; 00448 00449 #endif