libStatGen Software  1
SamFile.h
00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #ifndef __SAM_FILE_H__
00019 #define __SAM_FILE_H__
00020 
00021 #include "SamStatus.h"
00022 #include "InputFile.h"
00023 #include "SamFileHeader.h"
00024 #include "SamRecord.h"
00025 #include "GenericSamInterface.h"
00026 #include "BamIndex.h"
00027 #include "SamStatistics.h"
00028 
00029 /// Allows the user to easily read/write a SAM/BAM file.
00030 /// The SamFile class contains additional functionality that allows a user
00031 /// to read specific sections of sorted & indexed BAM files.  In order to
00032 /// take advantage of this capability, the index file must be read prior to
00033 /// setting the read section.  This logic saves the time of having to read
00034 /// the entire file and takes advantage of the seeking capability of BGZF.
00035 class SamFile
00036 {
00037 public:
00038     /// Enum for indicating whether to open the file for read or write.
00039     enum OpenType {
00040         READ, ///< open for reading.
00041         WRITE ///< open for writing.
00042     };
00043     
00044     
00045     /// Enum for indicating the type of sort expected in the file.
00046     enum SortedType {
00047         UNSORTED = 0, ///< file is not sorted.
00048         FLAG,         ///< SO flag from the header indicates the sort type.
00049         COORDINATE,   ///< file is sorted by coordinate.
00050         QUERY_NAME    ///< file is sorted by queryname.
00051     };
00052     
00053     /// Default Constructor, initializes the variables, but does not open
00054     /// any files.
00055     SamFile();
00056 
00057     /// Constructor that sets the error handling type.
00058     /// \param errorHandlingType how to handle errors.
00059     SamFile(ErrorHandler::HandlingType errorHandlingType);
00060 
00061     /// Constructor that opens the specified file based on the specified mode
00062     /// (READ/WRITE), aborts if the file could not be opened.
00063     /// \param filename name of the file to open.
00064     /// \param mode mode to use for opening the file.
00065     SamFile(const char* filename, OpenType mode);
00066 
00067     /// Constructor that opens the specified file based on the specified mode
00068     /// (READ/WRITE) and handles errors per the specified handleType.
00069     /// \param filename name of the file to open.
00070     /// \param mode mode to use for opening the file.
00071     /// \param errorHandlingType how to handle errors.
00072     SamFile(const char* filename, OpenType mode,
00073             ErrorHandler::HandlingType errorHandlingType);
00074 
00075     /// Constructor that opens the specified file based on the specified mode
00076     /// (READ/WRITE) and reads the header, aborts if the file could not be
00077     /// opened or the header not read.
00078     /// \param filename name of the file to open.
00079     /// \param mode mode to use for opening the file.
00080     /// \param header to read into or write from
00081     SamFile(const char* filename, OpenType mode, SamFileHeader* header);
00082 
00083     /// Constructor that opens the specified file based on the specified mode
00084     /// (READ/WRITE) and reads the header, handling errors per the specified
00085     /// handleType.
00086     /// \param filename name of the file to open.
00087     /// \param mode mode to use for opening the file.
00088     /// \param errorHandlingType how to handle errors.
00089     /// \param header to read into or write from
00090     SamFile(const char* filename, OpenType mode,
00091             ErrorHandler::HandlingType errorHandlingType,
00092             SamFileHeader* header);
00093 
00094     /// Destructor
00095     virtual ~SamFile();
00096    
00097     /// Open a sam/bam file for reading with the specified filename,
00098     /// determing the type of file and SAM/BAM  by reading the file
00099     /// (if not stdin).
00100     /// \param  filename the sam/bam file to open for reading.
00101     /// \param header to read into or write from (optional)
00102     /// \return true = success; false = failure.
00103     bool OpenForRead(const char * filename, SamFileHeader* header = NULL);
00104 
00105     /// Open a sam/bam file for writing with the specified filename,
00106     /// determining SAM/BAM from the extension (.bam = BAM).
00107     /// \param  filename the sam/bam file to open for writing.
00108     /// \param header to read into or write from (optional)
00109     /// \return true = success; false = failure.
00110     bool OpenForWrite(const char * filename, SamFileHeader* header = NULL);
00111 
00112     /// Read the specified bam index file.  It must be read prior to setting a
00113     /// read section, for seeking and reading portions of a bam file.
00114     /// \param filename the name of the bam index file to be read.
00115     /// \return true = success; false = failure.
00116     bool ReadBamIndex(const char * filename);
00117 
00118     /// Read the bam index file using the BAM filename as a base. 
00119     /// It must be read prior to setting a read section, for seeking
00120     /// and reading portions of a bam file.
00121     /// Must be read after opening the BAM file since it uses the
00122     /// BAM filename as a base name for the index file.
00123     /// First it tries filename.bam.bai. If that fails, it tries
00124     /// it without the .bam extension, filename.bai.
00125     /// \return true = success; false = failure.
00126     bool ReadBamIndex();
00127 
00128     /// Sets the reference to the specified genome sequence object.
00129     /// \param reference pointer to the GenomeSequence object.
00130     void SetReference(GenomeSequence* reference);
00131 
00132     /// Set the type of sequence translation to use when reading
00133     /// the sequence.  Passed down to the SamRecord when it is read.  
00134     /// The default type (if this method is never called) is
00135     /// NONE (the sequence is left as-is).
00136     /// \param translation type of sequence translation to use.
00137     void SetReadSequenceTranslation(SamRecord::SequenceTranslation translation);
00138 
00139     /// Set the type of sequence translation to use when writing
00140     /// the sequence.  Passed down to the SamRecord when it is written.
00141     /// The default type (if this method is never called) is
00142     /// NONE (the sequence is left as-is).
00143     /// \param translation type of sequence translation to use.
00144     void SetWriteSequenceTranslation(SamRecord::SequenceTranslation translation);
00145 
00146     /// Close the file if there is one open.
00147     void Close();
00148 
00149     /// Returns whether or not the file has been opened successfully.
00150     /// \return true = open; false = not open.
00151     bool IsOpen();
00152    
00153     /// Returns whether or not the end of the file has been reached.
00154     /// \return true = EOF; false = not eof.
00155     /// If the file is not open, true is returned.
00156     bool IsEOF();
00157    
00158     /// Reads the header section from the file and stores it in
00159     /// the passed in header.
00160     /// \return true = success; false = failure.
00161     bool ReadHeader(SamFileHeader& header);
00162    
00163     /// Writes the specified header into the file.
00164     /// \return true = success; false = failure.
00165     bool WriteHeader(SamFileHeader& header);
00166 
00167     /// Reads the next record from the file & stores it in the passed in record.
00168     ///
00169     /// If it is an indexed BAM file and SetReadSection was called, 
00170     /// only alignments in the section specified by SetReadSection are read.
00171     /// If they all have already been read, this method returns false.
00172     ///
00173     /// Validates that the record is sorted according to the value set by
00174     /// setSortedValidation. No sorting validation is done if specified to be
00175     /// unsorted, or setSortedValidation was never called.
00176     /// \return true  = record was successfully set (and sorted if applicable),
00177     ///                false = record was not successfully set 
00178     ///                (or not sorted as expected).
00179     bool ReadRecord(SamFileHeader& header, SamRecord& record);
00180    
00181     /// Writes the specified record into the file.
00182     /// Validates that the record is sorted according to the value set by
00183     /// setSortedValidation. No sorting validation is done if specified to
00184     /// be unsorted, or setSortedValidation was never called.  Returns false
00185     /// and does not write the record if the record was not properly sorted.
00186     /// \return true = success; false = failure.
00187     bool WriteRecord(SamFileHeader& header, SamRecord& record);
00188    
00189     /// Set the flag to validate that the file is sorted as it is read/written.
00190     /// Must be called after the file has been opened.
00191     /// Sorting validation is reset everytime SetReadPosition is called since
00192     /// it can jump around in the file.
00193     /// \param sortType specifies the type of sort to be checked for.
00194     void setSortedValidation(SortedType sortType);
00195 
00196     /// Return the number of records that have been read/written so far.
00197     uint32_t GetCurrentRecordCount();
00198 
00199     /// Deprecated, get the Status of the last call that sets status.
00200     /// To remain backwards compatable - will be removed later.
00201     inline SamStatus::Status GetFailure()
00202     {
00203         return(GetStatus());
00204     }
00205 
00206     /// Get the Status of the last call that sets status.
00207     inline SamStatus::Status GetStatus()
00208     {
00209         return(myStatus.getStatus());
00210     }
00211 
00212     /// Get the Status Message of the last call that sets status.
00213     inline const char* GetStatusMessage()
00214     {
00215         return(myStatus.getStatusMessage());
00216     }
00217 
00218     /// Sets which reference id (index into the BAM list of reference
00219     /// information) of the BAM file should be read.  The records
00220     /// for that reference id will be retrieved on each ReadRecord call.  
00221     /// Reference ids start at 0, and -1 indicates reads with no reference.
00222     /// When all records have been retrieved for the specified reference id, 
00223     /// ReadRecord will return failure until a new read section is set.
00224     /// Must be called only after the file has been opened for reading.
00225     /// Sorting validation is reset everytime SetReadPosition is called since
00226     /// it can jump around in the file.
00227     /// \param  refID the reference ID of the records to read from the file.
00228     /// \return true = success; false = failure.
00229     bool SetReadSection(int32_t refID);
00230 
00231     /// Sets which reference name of the BAM file should be read.  The records
00232     /// for that reference name will be retrieved on each ReadRecord call.
00233     /// Specify "" or "*" to read records not associated with a reference.
00234     /// When all records have been retrieved for the specified reference name,
00235     /// ReadRecord will return failure until a new read section is set.
00236     /// Must be called only after the file has been opened for reading.
00237     /// Sorting validation is reset everytime SetReadPosition is called since
00238     /// it can jump around in the file.
00239     /// \param  refName the reference name of the records to read from the file.
00240     /// \return true = success; false = failure.
00241     bool SetReadSection(const char* refName);
00242 
00243     /// Sets which reference id (index into the BAM list of reference
00244     /// information) & start/end positions of the BAM file should be read.
00245     /// The records for that reference id and positions will be retrieved on
00246     /// each ReadRecord call.  Reference ids start at 0, and -1 indicates
00247     /// reads with no reference.  When all records have been retrieved for the
00248     /// specified reference id, ReadRecord will return failure until a new read
00249     /// section is set.  Must be called only after the file has been opened
00250     /// for reading.  Sorting validation is reset everytime SetReadPosition is
00251     /// called since it can jump around in the file.
00252     /// \param  refID the reference ID of the records to read from the file.
00253     /// \param  start inclusive 0-based start position of records that should be read for this refID.
00254     /// \param  end exclusive 0-based end position of records that should be read for this refID.
00255     /// \param overlap When true (default), return reads that just overlap the region; when false, only return reads that fall completely within the region
00256     /// \return true = success; false = failure.   
00257     bool SetReadSection(int32_t refID, int32_t start, int32_t end, 
00258                         bool overlap = true);
00259 
00260     /// Sets which reference name & start/end positions of the BAM file should
00261     /// be read.  The records for this reference name & positions will be
00262     /// retrieved on each ReadRecord call.  Specify "" or "*" to indicate
00263     /// reads with no reference.  When all records have been retrieved for
00264     /// the specified section, ReadRecord will return failure until a new read
00265     /// section is set.  Must be called only after the file has been opened for
00266     /// reading.  Sorting validation is reset everytime SetReadSection is
00267     /// called since it can jump around in the file.
00268     /// \param  refName the reference name of the records to read from the file.
00269     /// \param  start inclusive 0-based start position of records that should be read for this refID.
00270     /// \param  end exclusive 0-based end position of records that should be read for this refID.
00271     /// \param overlap When true (default), return reads that just overlap the region; when false, only return reads that fall completely within the region
00272     /// \return true = success; false = failure.   
00273     bool SetReadSection(const char* refName, int32_t start, int32_t end, 
00274                         bool overlap = true);
00275 
00276     /// Specify which reads should be returned by ReadRecord.
00277     /// Reads will only be returned by ReadRecord that contain the specified
00278     /// required flags and that do not contain any of the specified excluded
00279     /// flags.  ReadRecord will continue to read from the file until a record
00280     /// that complies with these flag settings is found or until the end of the
00281     /// file/region.
00282     /// \param requiredFlags flags that are required to be in records
00283     /// returned by ReadRecord (set to 0x0 if there are no required flags).
00284     /// \param excludedFlags flags that are required to not be in records
00285     /// returned by ReadRecord (set to 0x0 if there are no excluded flags).
00286     void SetReadFlags(uint16_t requiredFlags, uint16_t excludedFlags);
00287 
00288     /// Get the number of mapped reads in the specified reference id.  
00289     /// Returns -1 for out of range refIDs.
00290     /// \param refID reference ID for which to extract the number of mapped reads.
00291     /// \return number of mapped reads for the specified reference id.
00292     int32_t getNumMappedReadsFromIndex(int32_t refID);
00293 
00294     /// Get the number of unmapped reads in the specified reference id.  
00295     /// Returns -1 for out of range refIDs.
00296     /// \param refID reference ID for which to extract the number of unmapped reads.
00297     /// \return number of unmapped reads for the specified reference id.
00298     int32_t getNumUnMappedReadsFromIndex(int32_t refID);
00299 
00300     /// Get the number of mapped reads in the specified reference name.
00301     /// Returns -1 for unknown reference names.
00302     /// \param refName reference name for which to extract the number of mapped reads.
00303     /// \param header header object containing the map from refName to refID
00304     /// \return number of mapped reads for the specified reference name.
00305     int32_t getNumMappedReadsFromIndex(const char* refName,
00306                                        SamFileHeader& header);
00307 
00308     /// Get the number of unmapped reads in the specified reference name.
00309     /// Returns -1 for unknown reference names.
00310     /// \param refName reference name for which to extract the number of unmapped reads.
00311     /// \param header header object containing the map from refName to refID
00312     /// \return number of unmapped reads for the specified reference name.
00313     int32_t getNumUnMappedReadsFromIndex(const char* refName,
00314                                          SamFileHeader& header);
00315 
00316     /// Returns the number of bases in the passed in read that overlap the
00317     /// region that is currently set.  Overlapping means that the bases occur
00318     /// in both the read and the reference as either matches or mismatches.
00319     /// This does not count insertions, deletions, clips, pads, or skips.
00320     /// \param samRecord to check for overlapping bases.
00321     /// \return number of bases that overlap region that is currently set.
00322     uint32_t GetNumOverlaps(SamRecord& samRecord);
00323 
00324     /// Whether or not statistics should be generated for this file.
00325     /// The value is carried over between files and is not reset, but
00326     /// the statistics themselves are reset between files.
00327     /// \param genStats set to true if statistics should be generated, false if not.
00328     void GenerateStatistics(bool genStats);
00329 
00330     /// Return the bam index if one has been opened.
00331     /// \return const pointer to the bam index, or null if one has not been opened.
00332     const BamIndex* GetBamIndex();
00333 
00334     /// Get the current file position.
00335     /// \return current position in the file.
00336     inline int64_t GetCurrentPosition()
00337     {
00338         return(iftell(myFilePtr));
00339     }
00340     
00341     /// Turn off file read buffering.
00342     inline void DisableBuffering()
00343     {
00344         if(myFilePtr != NULL)
00345         {
00346             myFilePtr->disableBuffering();
00347         }
00348     }
00349 
00350     /// Print the statistics that have been recorded due to a call to
00351     /// GenerateStatistics.
00352     inline void PrintStatistics() {if(myStatistics != NULL) myStatistics->print();}
00353 
00354 protected:
00355     void init();
00356     void init(const char* filename, OpenType mode, SamFileHeader* header);
00357 
00358     /// Resets the file prepping for a new file.
00359     void resetFile();
00360 
00361     /// Validate that the record is sorted compared to the previously read
00362     /// record if there is one, according to the specified sort order.
00363     /// If the sort order is UNSORTED, true is returned.
00364     /// Sorting validation is reset everytime SetReadPosition is called since
00365     /// it can jump around in the file.
00366     bool validateSortOrder(SamRecord& record, SamFileHeader& header);
00367    
00368     // Return the sort order as defined by the header.  If it is undefined
00369     // or set to an unknown value, UNSORTED is returned.
00370     SortedType getSortOrderFromHeader(SamFileHeader& header);
00371 
00372     bool processNewSection(SamFileHeader &header);
00373 
00374     // Check if there is more to read in the current chunk, if not,
00375     // move to the next chunk.
00376     // If no sections are specified or it successfully found a chunk to read,
00377     // return true.
00378     // Sets the status and returns false if it was unable to move to a new chunk
00379     // or there are no more chunks to read, otherwise returns true.
00380     bool ensureIndexedReadPosition();
00381 
00382     // Check whether or not the record falls within the specified section. 
00383     // If no sections are specified or this read falls within the
00384     // specified sections, return true.
00385     // If it does not, return false.
00386     // If the record position indicates there will be no more records within the
00387     // region, return false AND set the sam status to indicate NO_MORE_RECS.
00388     bool checkRecordInSection(SamRecord& record);
00389 
00390     IFILE  myFilePtr;
00391     GenericSamInterface* myInterfacePtr;
00392 
00393     /// Flag to indicate if a file is open for reading.
00394     bool myIsOpenForRead;
00395     /// Flag to indicate if a file is open for writing.
00396     bool myIsOpenForWrite;
00397     /// Flag to indicate if a header has been read/written - required before
00398     /// being able to read/write a record.
00399     bool myHasHeader;
00400 
00401     SortedType mySortedType;
00402 
00403     /// Previous values used for checking if the file is sorted.
00404     int32_t myPrevCoord;
00405     int32_t myPrevRefID;
00406     String myPrevReadName;
00407 
00408     /// Keep a count of the number of records that have been read/written so far.
00409     uint32_t myRecordCount;
00410 
00411     /// Pointer to the statistics for this file.
00412     SamStatistics* myStatistics;
00413    
00414     /// The status of the last SamFile command.
00415     SamStatus myStatus;
00416 
00417     /// Values for reading Sorted BAM files via the index.
00418     bool myIsBamOpenForRead;
00419     bool myNewSection;
00420     // whether to return reads that overlap (true) the section or
00421     // are fully enclosed (false) in the section.
00422     bool myOverlapSection;
00423     int32_t myRefID;
00424     int32_t myStartPos;
00425     int32_t myEndPos;
00426     uint64_t myCurrentChunkEnd;
00427     SortedChunkList myChunksToRead;
00428     BamIndex* myBamIndex;
00429 
00430     GenomeSequence* myRefPtr;
00431     SamRecord::SequenceTranslation myReadTranslation;
00432     SamRecord::SequenceTranslation myWriteTranslation;
00433     
00434     std::string myRefName;
00435 
00436 private:
00437     bool    myAttemptRecovery;
00438 
00439     uint16_t myRequiredFlags;
00440     uint16_t myExcludedFlags;
00441 
00442 public:
00443 
00444     bool attemptRecoverySync(bool (*checkSignature)(void *data) , int length);
00445 
00446     void setAttemptRecovery(bool flag = false)
00447     {
00448         myAttemptRecovery = flag;
00449     }
00450 
00451 };
00452 
00453 
00454 /// Child class of SamFile for reading files.
00455 class SamFileReader : public SamFile
00456 {
00457 public:
00458 
00459     /// Default Constructor.
00460     SamFileReader();
00461 
00462     /// Constructor that opens the specified file for read.
00463     SamFileReader(const char* filename);
00464 
00465     /// Constructor that opens the specified file for read.
00466     SamFileReader(const char* filename,
00467                   ErrorHandler::HandlingType errorHandlingType);
00468 
00469     /// Constructor that opens the specified file for read and reads
00470     /// the header from the file.
00471     SamFileReader(const char* filename,
00472                   SamFileHeader* header);
00473 
00474     /// Constructor that opens the specified file for read and reads
00475     /// the header from the file.
00476     SamFileReader(const char* filename,
00477                   ErrorHandler::HandlingType errorHandlingType,
00478                   SamFileHeader* header);
00479 
00480     virtual ~SamFileReader();
00481 };
00482 
00483 
00484 /// Child class of SamFile for writing files.
00485 class SamFileWriter : public SamFile
00486 {
00487 public:
00488     /// Default Constructor.
00489     SamFileWriter();
00490 
00491     /// Constructor that opens the specified file for write.
00492     SamFileWriter(const char* filename);
00493 
00494     /// Constructor that opens the specified file for write.
00495     SamFileWriter(const char* filename,
00496                   ErrorHandler::HandlingType errorHandlingType);
00497 
00498     /// Constructor that opens the specified file for write and write
00499     /// the specified header into the file.
00500     SamFileWriter(const char* filename,
00501                   SamFileHeader* header);
00502 
00503     /// Constructor that opens the specified file for write and write
00504     /// the specified header into the file.
00505     SamFileWriter(const char* filename,
00506                   ErrorHandler::HandlingType errorHandlingType,
00507                   SamFileHeader* header);
00508 
00509     virtual ~SamFileWriter();
00510 };
00511 
00512 #endif
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends