libStatGen Software
1
|
00001 /* 00002 * Copyright (C) 2010 Regents of the University of Michigan 00003 * 00004 * This program is free software: you can redistribute it and/or modify 00005 * it under the terms of the GNU General Public License as published by 00006 * the Free Software Foundation, either version 3 of the License, or 00007 * (at your option) any later version. 00008 * 00009 * This program is distributed in the hope that it will be useful, 00010 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 * GNU General Public License for more details. 00013 * 00014 * You should have received a copy of the GNU General Public License 00015 * along with this program. If not, see <http://www.gnu.org/licenses/>. 00016 */ 00017 00018 #ifndef __SAM_FILE_H__ 00019 #define __SAM_FILE_H__ 00020 00021 #include "SamStatus.h" 00022 #include "InputFile.h" 00023 #include "SamFileHeader.h" 00024 #include "SamRecord.h" 00025 #include "GenericSamInterface.h" 00026 #include "BamIndex.h" 00027 #include "SamStatistics.h" 00028 00029 /// Allows the user to easily read/write a SAM/BAM file. 00030 /// The SamFile class contains additional functionality that allows a user 00031 /// to read specific sections of sorted & indexed BAM files. In order to 00032 /// take advantage of this capability, the index file must be read prior to 00033 /// setting the read section. This logic saves the time of having to read 00034 /// the entire file and takes advantage of the seeking capability of BGZF. 00035 class SamFile 00036 { 00037 public: 00038 /// Enum for indicating whether to open the file for read or write. 00039 enum OpenType { 00040 READ, ///< open for reading. 00041 WRITE ///< open for writing. 00042 }; 00043 00044 00045 /// Enum for indicating the type of sort expected in the file. 00046 enum SortedType { 00047 UNSORTED = 0, ///< file is not sorted. 00048 FLAG, ///< SO flag from the header indicates the sort type. 00049 COORDINATE, ///< file is sorted by coordinate. 00050 QUERY_NAME ///< file is sorted by queryname. 00051 }; 00052 00053 /// Default Constructor, initializes the variables, but does not open 00054 /// any files. 00055 SamFile(); 00056 00057 /// Constructor that sets the error handling type. 00058 /// \param errorHandlingType how to handle errors. 00059 SamFile(ErrorHandler::HandlingType errorHandlingType); 00060 00061 /// Constructor that opens the specified file based on the specified mode 00062 /// (READ/WRITE), aborts if the file could not be opened. 00063 /// \param filename name of the file to open. 00064 /// \param mode mode to use for opening the file. 00065 SamFile(const char* filename, OpenType mode); 00066 00067 /// Constructor that opens the specified file based on the specified mode 00068 /// (READ/WRITE) and handles errors per the specified handleType. 00069 /// \param filename name of the file to open. 00070 /// \param mode mode to use for opening the file. 00071 /// \param errorHandlingType how to handle errors. 00072 SamFile(const char* filename, OpenType mode, 00073 ErrorHandler::HandlingType errorHandlingType); 00074 00075 /// Constructor that opens the specified file based on the specified mode 00076 /// (READ/WRITE) and reads the header, aborts if the file could not be 00077 /// opened or the header not read. 00078 /// \param filename name of the file to open. 00079 /// \param mode mode to use for opening the file. 00080 /// \param header to read into or write from 00081 SamFile(const char* filename, OpenType mode, SamFileHeader* header); 00082 00083 /// Constructor that opens the specified file based on the specified mode 00084 /// (READ/WRITE) and reads the header, handling errors per the specified 00085 /// handleType. 00086 /// \param filename name of the file to open. 00087 /// \param mode mode to use for opening the file. 00088 /// \param errorHandlingType how to handle errors. 00089 /// \param header to read into or write from 00090 SamFile(const char* filename, OpenType mode, 00091 ErrorHandler::HandlingType errorHandlingType, 00092 SamFileHeader* header); 00093 00094 /// Destructor 00095 virtual ~SamFile(); 00096 00097 /// Open a sam/bam file for reading with the specified filename, 00098 /// determing the type of file and SAM/BAM by reading the file 00099 /// (if not stdin). 00100 /// \param filename the sam/bam file to open for reading. 00101 /// \param header to read into or write from (optional) 00102 /// \return true = success; false = failure. 00103 bool OpenForRead(const char * filename, SamFileHeader* header = NULL); 00104 00105 /// Open a sam/bam file for writing with the specified filename, 00106 /// determining SAM/BAM from the extension (.bam = BAM). 00107 /// \param filename the sam/bam file to open for writing. 00108 /// \param header to read into or write from (optional) 00109 /// \return true = success; false = failure. 00110 bool OpenForWrite(const char * filename, SamFileHeader* header = NULL); 00111 00112 /// Read the specified bam index file. It must be read prior to setting a 00113 /// read section, for seeking and reading portions of a bam file. 00114 /// \param filename the name of the bam index file to be read. 00115 /// \return true = success; false = failure. 00116 bool ReadBamIndex(const char * filename); 00117 00118 /// Read the bam index file using the BAM filename as a base. 00119 /// It must be read prior to setting a read section, for seeking 00120 /// and reading portions of a bam file. 00121 /// Must be read after opening the BAM file since it uses the 00122 /// BAM filename as a base name for the index file. 00123 /// First it tries filename.bam.bai. If that fails, it tries 00124 /// it without the .bam extension, filename.bai. 00125 /// \return true = success; false = failure. 00126 bool ReadBamIndex(); 00127 00128 /// Sets the reference to the specified genome sequence object. 00129 /// \param reference pointer to the GenomeSequence object. 00130 void SetReference(GenomeSequence* reference); 00131 00132 /// Set the type of sequence translation to use when reading 00133 /// the sequence. Passed down to the SamRecord when it is read. 00134 /// The default type (if this method is never called) is 00135 /// NONE (the sequence is left as-is). 00136 /// \param translation type of sequence translation to use. 00137 void SetReadSequenceTranslation(SamRecord::SequenceTranslation translation); 00138 00139 /// Set the type of sequence translation to use when writing 00140 /// the sequence. Passed down to the SamRecord when it is written. 00141 /// The default type (if this method is never called) is 00142 /// NONE (the sequence is left as-is). 00143 /// \param translation type of sequence translation to use. 00144 void SetWriteSequenceTranslation(SamRecord::SequenceTranslation translation); 00145 00146 /// Close the file if there is one open. 00147 void Close(); 00148 00149 /// Returns whether or not the file has been opened successfully. 00150 /// \return true = open; false = not open. 00151 bool IsOpen(); 00152 00153 /// Returns whether or not the end of the file has been reached. 00154 /// \return true = EOF; false = not eof. 00155 /// If the file is not open, true is returned. 00156 bool IsEOF(); 00157 00158 /// Reads the header section from the file and stores it in 00159 /// the passed in header. 00160 /// \return true = success; false = failure. 00161 bool ReadHeader(SamFileHeader& header); 00162 00163 /// Writes the specified header into the file. 00164 /// \return true = success; false = failure. 00165 bool WriteHeader(SamFileHeader& header); 00166 00167 /// Reads the next record from the file & stores it in the passed in record. 00168 /// 00169 /// If it is an indexed BAM file and SetReadSection was called, 00170 /// only alignments in the section specified by SetReadSection are read. 00171 /// If they all have already been read, this method returns false. 00172 /// 00173 /// Validates that the record is sorted according to the value set by 00174 /// setSortedValidation. No sorting validation is done if specified to be 00175 /// unsorted, or setSortedValidation was never called. 00176 /// \return true = record was successfully set (and sorted if applicable), 00177 /// false = record was not successfully set 00178 /// (or not sorted as expected). 00179 bool ReadRecord(SamFileHeader& header, SamRecord& record); 00180 00181 /// Writes the specified record into the file. 00182 /// Validates that the record is sorted according to the value set by 00183 /// setSortedValidation. No sorting validation is done if specified to 00184 /// be unsorted, or setSortedValidation was never called. Returns false 00185 /// and does not write the record if the record was not properly sorted. 00186 /// \return true = success; false = failure. 00187 bool WriteRecord(SamFileHeader& header, SamRecord& record); 00188 00189 /// Set the flag to validate that the file is sorted as it is read/written. 00190 /// Must be called after the file has been opened. 00191 /// Sorting validation is reset everytime SetReadPosition is called since 00192 /// it can jump around in the file. 00193 /// \param sortType specifies the type of sort to be checked for. 00194 void setSortedValidation(SortedType sortType); 00195 00196 /// Return the number of records that have been read/written so far. 00197 uint32_t GetCurrentRecordCount(); 00198 00199 /// Deprecated, get the Status of the last call that sets status. 00200 /// To remain backwards compatable - will be removed later. 00201 inline SamStatus::Status GetFailure() 00202 { 00203 return(GetStatus()); 00204 } 00205 00206 /// Get the Status of the last call that sets status. 00207 inline SamStatus::Status GetStatus() 00208 { 00209 return(myStatus.getStatus()); 00210 } 00211 00212 /// Get the Status Message of the last call that sets status. 00213 inline const char* GetStatusMessage() 00214 { 00215 return(myStatus.getStatusMessage()); 00216 } 00217 00218 /// Sets which reference id (index into the BAM list of reference 00219 /// information) of the BAM file should be read. The records 00220 /// for that reference id will be retrieved on each ReadRecord call. 00221 /// Reference ids start at 0, and -1 indicates reads with no reference. 00222 /// When all records have been retrieved for the specified reference id, 00223 /// ReadRecord will return failure until a new read section is set. 00224 /// Must be called only after the file has been opened for reading. 00225 /// Sorting validation is reset everytime SetReadPosition is called since 00226 /// it can jump around in the file. 00227 /// \param refID the reference ID of the records to read from the file. 00228 /// \return true = success; false = failure. 00229 bool SetReadSection(int32_t refID); 00230 00231 /// Sets which reference name of the BAM file should be read. The records 00232 /// for that reference name will be retrieved on each ReadRecord call. 00233 /// Specify "" or "*" to read records not associated with a reference. 00234 /// When all records have been retrieved for the specified reference name, 00235 /// ReadRecord will return failure until a new read section is set. 00236 /// Must be called only after the file has been opened for reading. 00237 /// Sorting validation is reset everytime SetReadPosition is called since 00238 /// it can jump around in the file. 00239 /// \param refName the reference name of the records to read from the file. 00240 /// \return true = success; false = failure. 00241 bool SetReadSection(const char* refName); 00242 00243 /// Sets which reference id (index into the BAM list of reference 00244 /// information) & start/end positions of the BAM file should be read. 00245 /// The records for that reference id and positions will be retrieved on 00246 /// each ReadRecord call. Reference ids start at 0, and -1 indicates 00247 /// reads with no reference. When all records have been retrieved for the 00248 /// specified reference id, ReadRecord will return failure until a new read 00249 /// section is set. Must be called only after the file has been opened 00250 /// for reading. Sorting validation is reset everytime SetReadPosition is 00251 /// called since it can jump around in the file. 00252 /// \param refID the reference ID of the records to read from the file. 00253 /// \param start inclusive 0-based start position of records that should be read for this refID. 00254 /// \param end exclusive 0-based end position of records that should be read for this refID. 00255 /// \param overlap When true (default), return reads that just overlap the region; when false, only return reads that fall completely within the region 00256 /// \return true = success; false = failure. 00257 bool SetReadSection(int32_t refID, int32_t start, int32_t end, 00258 bool overlap = true); 00259 00260 /// Sets which reference name & start/end positions of the BAM file should 00261 /// be read. The records for this reference name & positions will be 00262 /// retrieved on each ReadRecord call. Specify "" or "*" to indicate 00263 /// reads with no reference. When all records have been retrieved for 00264 /// the specified section, ReadRecord will return failure until a new read 00265 /// section is set. Must be called only after the file has been opened for 00266 /// reading. Sorting validation is reset everytime SetReadSection is 00267 /// called since it can jump around in the file. 00268 /// \param refName the reference name of the records to read from the file. 00269 /// \param start inclusive 0-based start position of records that should be read for this refID. 00270 /// \param end exclusive 0-based end position of records that should be read for this refID. 00271 /// \param overlap When true (default), return reads that just overlap the region; when false, only return reads that fall completely within the region 00272 /// \return true = success; false = failure. 00273 bool SetReadSection(const char* refName, int32_t start, int32_t end, 00274 bool overlap = true); 00275 00276 /// Specify which reads should be returned by ReadRecord. 00277 /// Reads will only be returned by ReadRecord that contain the specified 00278 /// required flags and that do not contain any of the specified excluded 00279 /// flags. ReadRecord will continue to read from the file until a record 00280 /// that complies with these flag settings is found or until the end of the 00281 /// file/region. 00282 /// \param requiredFlags flags that are required to be in records 00283 /// returned by ReadRecord (set to 0x0 if there are no required flags). 00284 /// \param excludedFlags flags that are required to not be in records 00285 /// returned by ReadRecord (set to 0x0 if there are no excluded flags). 00286 void SetReadFlags(uint16_t requiredFlags, uint16_t excludedFlags); 00287 00288 /// Get the number of mapped reads in the specified reference id. 00289 /// Returns -1 for out of range refIDs. 00290 /// \param refID reference ID for which to extract the number of mapped reads. 00291 /// \return number of mapped reads for the specified reference id. 00292 int32_t getNumMappedReadsFromIndex(int32_t refID); 00293 00294 /// Get the number of unmapped reads in the specified reference id. 00295 /// Returns -1 for out of range refIDs. 00296 /// \param refID reference ID for which to extract the number of unmapped reads. 00297 /// \return number of unmapped reads for the specified reference id. 00298 int32_t getNumUnMappedReadsFromIndex(int32_t refID); 00299 00300 /// Get the number of mapped reads in the specified reference name. 00301 /// Returns -1 for unknown reference names. 00302 /// \param refName reference name for which to extract the number of mapped reads. 00303 /// \param header header object containing the map from refName to refID 00304 /// \return number of mapped reads for the specified reference name. 00305 int32_t getNumMappedReadsFromIndex(const char* refName, 00306 SamFileHeader& header); 00307 00308 /// Get the number of unmapped reads in the specified reference name. 00309 /// Returns -1 for unknown reference names. 00310 /// \param refName reference name for which to extract the number of unmapped reads. 00311 /// \param header header object containing the map from refName to refID 00312 /// \return number of unmapped reads for the specified reference name. 00313 int32_t getNumUnMappedReadsFromIndex(const char* refName, 00314 SamFileHeader& header); 00315 00316 /// Returns the number of bases in the passed in read that overlap the 00317 /// region that is currently set. Overlapping means that the bases occur 00318 /// in both the read and the reference as either matches or mismatches. 00319 /// This does not count insertions, deletions, clips, pads, or skips. 00320 /// \param samRecord to check for overlapping bases. 00321 /// \return number of bases that overlap region that is currently set. 00322 uint32_t GetNumOverlaps(SamRecord& samRecord); 00323 00324 /// Whether or not statistics should be generated for this file. 00325 /// The value is carried over between files and is not reset, but 00326 /// the statistics themselves are reset between files. 00327 /// \param genStats set to true if statistics should be generated, false if not. 00328 void GenerateStatistics(bool genStats); 00329 00330 /// Return the bam index if one has been opened. 00331 /// \return const pointer to the bam index, or null if one has not been opened. 00332 const BamIndex* GetBamIndex(); 00333 00334 /// Get the current file position. 00335 /// \return current position in the file. 00336 inline int64_t GetCurrentPosition() 00337 { 00338 return(iftell(myFilePtr)); 00339 } 00340 00341 /// Turn off file read buffering. 00342 inline void DisableBuffering() 00343 { 00344 if(myFilePtr != NULL) 00345 { 00346 myFilePtr->disableBuffering(); 00347 } 00348 } 00349 00350 /// Print the statistics that have been recorded due to a call to 00351 /// GenerateStatistics. 00352 inline void PrintStatistics() {if(myStatistics != NULL) myStatistics->print();} 00353 00354 protected: 00355 void init(); 00356 void init(const char* filename, OpenType mode, SamFileHeader* header); 00357 00358 /// Resets the file prepping for a new file. 00359 void resetFile(); 00360 00361 /// Validate that the record is sorted compared to the previously read 00362 /// record if there is one, according to the specified sort order. 00363 /// If the sort order is UNSORTED, true is returned. 00364 /// Sorting validation is reset everytime SetReadPosition is called since 00365 /// it can jump around in the file. 00366 bool validateSortOrder(SamRecord& record, SamFileHeader& header); 00367 00368 // Return the sort order as defined by the header. If it is undefined 00369 // or set to an unknown value, UNSORTED is returned. 00370 SortedType getSortOrderFromHeader(SamFileHeader& header); 00371 00372 bool processNewSection(SamFileHeader &header); 00373 00374 // Check if there is more to read in the current chunk, if not, 00375 // move to the next chunk. 00376 // If no sections are specified or it successfully found a chunk to read, 00377 // return true. 00378 // Sets the status and returns false if it was unable to move to a new chunk 00379 // or there are no more chunks to read, otherwise returns true. 00380 bool ensureIndexedReadPosition(); 00381 00382 // Check whether or not the record falls within the specified section. 00383 // If no sections are specified or this read falls within the 00384 // specified sections, return true. 00385 // If it does not, return false. 00386 // If the record position indicates there will be no more records within the 00387 // region, return false AND set the sam status to indicate NO_MORE_RECS. 00388 bool checkRecordInSection(SamRecord& record); 00389 00390 IFILE myFilePtr; 00391 GenericSamInterface* myInterfacePtr; 00392 00393 /// Flag to indicate if a file is open for reading. 00394 bool myIsOpenForRead; 00395 /// Flag to indicate if a file is open for writing. 00396 bool myIsOpenForWrite; 00397 /// Flag to indicate if a header has been read/written - required before 00398 /// being able to read/write a record. 00399 bool myHasHeader; 00400 00401 SortedType mySortedType; 00402 00403 /// Previous values used for checking if the file is sorted. 00404 int32_t myPrevCoord; 00405 int32_t myPrevRefID; 00406 String myPrevReadName; 00407 00408 /// Keep a count of the number of records that have been read/written so far. 00409 uint32_t myRecordCount; 00410 00411 /// Pointer to the statistics for this file. 00412 SamStatistics* myStatistics; 00413 00414 /// The status of the last SamFile command. 00415 SamStatus myStatus; 00416 00417 /// Values for reading Sorted BAM files via the index. 00418 bool myIsBamOpenForRead; 00419 bool myNewSection; 00420 // whether to return reads that overlap (true) the section or 00421 // are fully enclosed (false) in the section. 00422 bool myOverlapSection; 00423 int32_t myRefID; 00424 int32_t myStartPos; 00425 int32_t myEndPos; 00426 uint64_t myCurrentChunkEnd; 00427 SortedChunkList myChunksToRead; 00428 BamIndex* myBamIndex; 00429 00430 GenomeSequence* myRefPtr; 00431 SamRecord::SequenceTranslation myReadTranslation; 00432 SamRecord::SequenceTranslation myWriteTranslation; 00433 00434 std::string myRefName; 00435 00436 private: 00437 bool myAttemptRecovery; 00438 00439 uint16_t myRequiredFlags; 00440 uint16_t myExcludedFlags; 00441 00442 public: 00443 00444 bool attemptRecoverySync(bool (*checkSignature)(void *data) , int length); 00445 00446 void setAttemptRecovery(bool flag = false) 00447 { 00448 myAttemptRecovery = flag; 00449 } 00450 00451 }; 00452 00453 00454 /// Child class of SamFile for reading files. 00455 class SamFileReader : public SamFile 00456 { 00457 public: 00458 00459 /// Default Constructor. 00460 SamFileReader(); 00461 00462 /// Constructor that opens the specified file for read. 00463 SamFileReader(const char* filename); 00464 00465 /// Constructor that opens the specified file for read. 00466 SamFileReader(const char* filename, 00467 ErrorHandler::HandlingType errorHandlingType); 00468 00469 /// Constructor that opens the specified file for read and reads 00470 /// the header from the file. 00471 SamFileReader(const char* filename, 00472 SamFileHeader* header); 00473 00474 /// Constructor that opens the specified file for read and reads 00475 /// the header from the file. 00476 SamFileReader(const char* filename, 00477 ErrorHandler::HandlingType errorHandlingType, 00478 SamFileHeader* header); 00479 00480 virtual ~SamFileReader(); 00481 }; 00482 00483 00484 /// Child class of SamFile for writing files. 00485 class SamFileWriter : public SamFile 00486 { 00487 public: 00488 /// Default Constructor. 00489 SamFileWriter(); 00490 00491 /// Constructor that opens the specified file for write. 00492 SamFileWriter(const char* filename); 00493 00494 /// Constructor that opens the specified file for write. 00495 SamFileWriter(const char* filename, 00496 ErrorHandler::HandlingType errorHandlingType); 00497 00498 /// Constructor that opens the specified file for write and write 00499 /// the specified header into the file. 00500 SamFileWriter(const char* filename, 00501 SamFileHeader* header); 00502 00503 /// Constructor that opens the specified file for write and write 00504 /// the specified header into the file. 00505 SamFileWriter(const char* filename, 00506 ErrorHandler::HandlingType errorHandlingType, 00507 SamFileHeader* header); 00508 00509 virtual ~SamFileWriter(); 00510 }; 00511 00512 #endif