SamFile.h

00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #ifndef __SAM_FILE_H__
00019 #define __SAM_FILE_H__
00020 
00021 #include "SamStatus.h"
00022 #include "InputFile.h"
00023 #include "SamFileHeader.h"
00024 #include "SamRecord.h"
00025 #include "GenericSamInterface.h"
00026 #include "BamIndex.h"
00027 #include "SamStatistics.h"
00028 
00029 /// Allows the user to easily read/write a SAM/BAM file.
00030 /// The SamFile class contains additional functionality that allows a user
00031 /// to read specific sections of sorted & indexed BAM files.  In order to
00032 /// take advantage of this capability, the index file must be read prior to
00033 /// setting the read section.  This logic saves the time of having to read
00034 /// the entire file and takes advantage of the seeking capability of BGZF.
00035 class SamFile
00036 {
00037 public:
00038     /// Enum for indicating whether to open the file for read or write.
00039     enum OpenType {
00040         READ, ///< open for reading.
00041         WRITE ///< open for writing.
00042     };
00043     
00044     
00045     /// Enum for indicating the type of sort expected in the file.
00046     enum SortedType {
00047         UNSORTED = 0, ///< file is not sorted.
00048         FLAG,         ///< SO flag from the header indicates the sort type.
00049         COORDINATE,   ///< file is sorted by coordinate.
00050         QUERY_NAME    ///< file is sorted by queryname.
00051     };
00052     
00053     /// Default Constructor, initializes the variables, but does not open
00054     /// any files.
00055     SamFile();
00056 
00057     /// Constructor that sets the error handling type.
00058     /// \param errorHandlingType how to handle errors.
00059     SamFile(ErrorHandler::HandlingType errorHandlingType);
00060 
00061     /// Constructor that opens the specified file based on the specified mode
00062     /// (READ/WRITE), aborts if the file could not be opened.
00063     /// \param filename name of the file to open.
00064     /// \param mode mode to use for opening the file.
00065     SamFile(const char* filename, OpenType mode);
00066 
00067     /// Constructor that opens the specified file based on the specified mode
00068     /// (READ/WRITE) and handles errors per the specified handleType.
00069     /// \param filename name of the file to open.
00070     /// \param mode mode to use for opening the file.
00071     /// \param errorHandlingType how to handle errors.
00072     SamFile(const char* filename, OpenType mode,
00073             ErrorHandler::HandlingType errorHandlingType);
00074 
00075     /// Constructor that opens the specified file based on the specified mode
00076     /// (READ/WRITE) and reads the header, aborts if the file could not be
00077     /// opened or the header not read.
00078     /// \param filename name of the file to open.
00079     /// \param mode mode to use for opening the file.
00080     /// \param header to read into or write from
00081     SamFile(const char* filename, OpenType mode, SamFileHeader* header);
00082 
00083     /// Constructor that opens the specified file based on the specified mode
00084     /// (READ/WRITE) and reads the header, handling errors per the specified
00085     /// handleType.
00086     /// \param filename name of the file to open.
00087     /// \param mode mode to use for opening the file.
00088     /// \param errorHandlingType how to handle errors.
00089     /// \param header to read into or write from
00090     SamFile(const char* filename, OpenType mode,
00091             ErrorHandler::HandlingType errorHandlingType,
00092             SamFileHeader* header);
00093 
00094     /// Destructor
00095     virtual ~SamFile();
00096    
00097     /// Open a sam/bam file for reading with the specified filename,
00098     /// determing the type of file and SAM/BAM  by reading the file
00099     /// (if not stdin).
00100     /// \param  filename the sam/bam file to open for reading.
00101     /// \param header to read into or write from (optional)
00102     /// \return true = success; false = failure.
00103     bool OpenForRead(const char * filename, SamFileHeader* header = NULL);
00104 
00105     /// Open a sam/bam file for writing with the specified filename,
00106     /// determining SAM/BAM from the extension (.bam = BAM).
00107     /// \param  filename the sam/bam file to open for writing.
00108     /// \param header to read into or write from (optional)
00109     /// \return true = success; false = failure.
00110     bool OpenForWrite(const char * filename, SamFileHeader* header = NULL);
00111 
00112     /// Read the specified bam index file.  It must be read prior to setting a
00113     /// read section, for seeking and reading portions of a bam file.
00114     /// \param filename the name of the bam index file to be read.
00115     /// \return true = success; false = failure.
00116     bool ReadBamIndex(const char * filename);
00117 
00118     /// Read the bam index file using the BAM filename as a base. 
00119     /// It must be read prior to setting a read section, for seeking
00120     /// and reading portions of a bam file.
00121     /// Must be read after opening the BAM file since it uses the
00122     /// BAM filename as a base name for the index file.
00123     /// First it tries filename.bam.bai. If that fails, it tries
00124     /// it without the .bam extension, filename.bai.
00125     /// \return true = success; false = failure.
00126     bool ReadBamIndex();
00127 
00128     /// Sets the reference to the specified genome sequence object.
00129     /// \param reference pointer to the GenomeSequence object.
00130     void SetReference(GenomeSequence* reference);
00131 
00132     /// Set the type of sequence translation to use when reading
00133     /// the sequence.  Passed down to the SamRecord when it is read.  
00134     /// The default type (if this method is never called) is
00135     /// NONE (the sequence is left as-is).
00136     /// \param translation type of sequence translation to use.
00137     void SetReadSequenceTranslation(SamRecord::SequenceTranslation translation);
00138 
00139     /// Set the type of sequence translation to use when writing
00140     /// the sequence.  Passed down to the SamRecord when it is written.
00141     /// The default type (if this method is never called) is
00142     /// NONE (the sequence is left as-is).
00143     /// \param translation type of sequence translation to use.
00144     void SetWriteSequenceTranslation(SamRecord::SequenceTranslation translation);
00145 
00146     /// Close the file if there is one open.
00147     void Close();
00148 
00149     /// Returns whether or not the file has been opened successfully.
00150     /// \return true = open; false = not open.
00151     bool IsOpen();
00152    
00153     /// Returns whether or not the end of the file has been reached.
00154     /// \return true = EOF; false = not eof.
00155     /// If the file is not open, true is returned.
00156     bool IsEOF();
00157    
00158     /// Reads the header section from the file and stores it in
00159     /// the passed in header.
00160     /// \return true = success; false = failure.
00161     bool ReadHeader(SamFileHeader& header);
00162    
00163     /// Writes the specified header into the file.
00164     /// \return true = success; false = failure.
00165     bool WriteHeader(SamFileHeader& header);
00166 
00167     /// Reads the next record from the file & stores it in the passed in record.
00168     ///
00169     /// If it is an indexed BAM file and SetReadSection was called, 
00170     /// only alignments in the section specified by SetReadSection are read.
00171     /// If they all have already been read, this method returns false.
00172     ///
00173     /// Validates that the record is sorted according to the value set by
00174     /// setSortedValidation. No sorting validation is done if specified to be
00175     /// unsorted, or setSortedValidation was never called.
00176     /// \return true  = record was successfully set (and sorted if applicable),
00177     ///                false = record was not successfully set 
00178     ///                (or not sorted as expected).
00179     bool ReadRecord(SamFileHeader& header, SamRecord& record);
00180    
00181     /// Writes the specified record into the file.
00182     /// Validates that the record is sorted according to the value set by
00183     /// setSortedValidation. No sorting validation is done if specified to
00184     /// be unsorted, or setSortedValidation was never called.  Returns false
00185     /// and does not write the record if the record was not properly sorted.
00186     /// \return true = success; false = failure.
00187     bool WriteRecord(SamFileHeader& header, SamRecord& record);
00188    
00189     /// Set the flag to validate that the file is sorted as it is read/written.
00190     /// Must be called after the file has been opened.
00191     /// Sorting validation is reset everytime SetReadPosition is called since
00192     /// it can jump around in the file.
00193     /// \param sortType specifies the type of sort to be checked for.
00194     void setSortedValidation(SortedType sortType);
00195 
00196     /// Return the number of records that have been read/written so far.
00197     uint32_t GetCurrentRecordCount();
00198 
00199     /// Deprecated, get the Status of the last call that sets status.
00200     /// To remain backwards compatable - will be removed later.
00201     inline SamStatus::Status GetFailure()
00202     {
00203         return(GetStatus());
00204     }
00205 
00206     /// Get the Status of the last call that sets status.
00207     inline SamStatus::Status GetStatus()
00208     {
00209         return(myStatus.getStatus());
00210     }
00211 
00212     /// Get the Status Message of the last call that sets status.
00213     inline const char* GetStatusMessage()
00214     {
00215         return(myStatus.getStatusMessage());
00216     }
00217 
00218     /// Sets which reference id (index into the BAM list of reference
00219     /// information) of the BAM file should be read.  The records
00220     /// for that reference id will be retrieved on each ReadRecord call.  
00221     /// Reference ids start at 0, and -1 indicates reads with no reference.
00222     /// When all records have been retrieved for the specified reference id, 
00223     /// ReadRecord will return failure until a new read section is set.
00224     /// Must be called only after the file has been opened for reading.
00225     /// Sorting validation is reset everytime SetReadPosition is called since
00226     /// it can jump around in the file.
00227     /// \param  refID the reference ID of the records to read from the file.
00228     /// \return true = success; false = failure.
00229     bool SetReadSection(int32_t refID);
00230 
00231     /// Sets which reference name of the BAM file should be read.  The records
00232     /// for that reference name will be retrieved on each ReadRecord call.
00233     /// Specify "" or "*" to read records not associated with a reference.
00234     /// When all records have been retrieved for the specified reference name,
00235     /// ReadRecord will return failure until a new read section is set.
00236     /// Must be called only after the file has been opened for reading.
00237     /// Sorting validation is reset everytime SetReadPosition is called since
00238     /// it can jump around in the file.
00239     /// \param  refName the reference name of the records to read from the file.
00240     /// \return true = success; false = failure.
00241     bool SetReadSection(const char* refName);
00242 
00243     /// Sets which reference id (index into the BAM list of reference
00244     /// information) & start/end positions of the BAM file should be read.
00245     /// The records for that reference id and positions will be retrieved on
00246     /// each ReadRecord call.  Reference ids start at 0, and -1 indicates
00247     /// reads with no reference.  When all records have been retrieved for the
00248     /// specified reference id, ReadRecord will return failure until a new read
00249     /// section is set.  Must be called only after the file has been opened
00250     /// for reading.  Sorting validation is reset everytime SetReadPosition is
00251     /// called since it can jump around in the file.
00252     /// \param  refID the reference ID of the records to read from the file.
00253     /// \param  start inclusive 0-based start position of records that should be read for this refID.
00254     /// \param  end exclusive 0-based end position of records that should be read for this refID.
00255     /// \param overlap When true (default), return reads that just overlap the region; when false, only return reads that fall completely within the region
00256     /// \return true = success; false = failure.   
00257     bool SetReadSection(int32_t refID, int32_t start, int32_t end, 
00258                         bool overlap = true);
00259 
00260     /// Sets which reference name & start/end positions of the BAM file should
00261     /// be read.  The records for this reference name & positions will be
00262     /// retrieved on each ReadRecord call.  Specify "" or "*" to indicate
00263     /// reads with no reference.  When all records have been retrieved for
00264     /// the specified section, ReadRecord will return failure until a new read
00265     /// section is set.  Must be called only after the file has been opened for
00266     /// reading.  Sorting validation is reset everytime SetReadPosition is
00267     /// called since it can jump around in the file.
00268     /// \param  refName the reference name of the records to read from the file.
00269     /// \param  start inclusive 0-based start position of records that should be read for this refID.
00270     /// \param  end exclusive 0-based end position of records that should be read for this refID.
00271     /// \param overlap When true (default), return reads that just overlap the region; when false, only return reads that fall completely within the region
00272     /// \return true = success; false = failure.   
00273     bool SetReadSection(const char* refName, int32_t start, int32_t end, 
00274                         bool overlap = true);
00275 
00276     /// Get the number of mapped reads in the specified reference id.  
00277     /// Returns -1 for out of range refIDs.
00278     /// \param refID reference ID for which to extract the number of mapped reads.
00279     /// \return number of mapped reads for the specified reference id.
00280     int32_t getNumMappedReadsFromIndex(int32_t refID);
00281 
00282     /// Get the number of unmapped reads in the specified reference id.  
00283     /// Returns -1 for out of range refIDs.
00284     /// \param refID reference ID for which to extract the number of unmapped reads.
00285     /// \return number of unmapped reads for the specified reference id.
00286     int32_t getNumUnMappedReadsFromIndex(int32_t refID);
00287 
00288     /// Get the number of mapped reads in the specified reference name.
00289     /// Returns -1 for unknown reference names.
00290     /// \param refName reference name for which to extract the number of mapped reads.
00291     /// \param header header object containing the map from refName to refID
00292     /// \return number of mapped reads for the specified reference name.
00293     int32_t getNumMappedReadsFromIndex(const char* refName,
00294                                        SamFileHeader& header);
00295 
00296     /// Get the number of unmapped reads in the specified reference name.
00297     /// Returns -1 for unknown reference names.
00298     /// \param refName reference name for which to extract the number of unmapped reads.
00299     /// \param header header object containing the map from refName to refID
00300     /// \return number of unmapped reads for the specified reference name.
00301     int32_t getNumUnMappedReadsFromIndex(const char* refName,
00302                                          SamFileHeader& header);
00303 
00304     /// Returns the number of bases in the passed in read that overlap the
00305     /// region that is currently set.  Overlapping means that the bases occur
00306     /// in both the read and the reference as either matches or mismatches.
00307     /// This does not count insertions, deletions, clips, pads, or skips.
00308     /// \param samRecord to check for overlapping bases.
00309     /// \return number of bases that overlap region that is currently set.
00310     uint32_t GetNumOverlaps(SamRecord& samRecord);
00311 
00312     /// Whether or not statistics should be generated for this file.
00313     /// The value is carried over between files and is not reset, but
00314     /// the statistics themselves are reset between files.
00315     /// \param genStats set to true if statistics should be generated, false if not.
00316     void GenerateStatistics(bool genStats);
00317 
00318     /// Return the bam index if one has been opened.
00319     /// \return const pointer to the bam index, or null if one has not been opened.
00320     const BamIndex* GetBamIndex();
00321 
00322     /// Get the current file position.
00323     /// \return current position in the file.
00324     inline long int GetCurrentPosition()
00325     {
00326         return(iftell(myFilePtr));
00327     }
00328     
00329     /// Turn off file read buffering.
00330     inline void DisableBuffering()
00331     {
00332         if(myFilePtr != NULL)
00333         {
00334             myFilePtr->disableBuffering();
00335         }
00336     }
00337 
00338     /// Print the statistics that have been recorded due to a call to
00339     /// GenerateStatistics.
00340     inline void PrintStatistics() {if(myStatistics != NULL) myStatistics->print();}
00341 
00342 protected:
00343     void init(const char* filename, OpenType mode, SamFileHeader* header);
00344 
00345     /// Resets the file prepping for a new file.
00346     void resetFile();
00347 
00348     /// Validate that the record is sorted compared to the previously read
00349     /// record if there is one, according to the specified sort order.
00350     /// If the sort order is UNSORTED, true is returned.
00351     /// Sorting validation is reset everytime SetReadPosition is called since
00352     /// it can jump around in the file.
00353     bool validateSortOrder(SamRecord& record, SamFileHeader& header);
00354    
00355     // Return the sort order as defined by the header.  If it is undefined
00356     // or set to an unknown value, UNSORTED is returned.
00357     SortedType getSortOrderFromHeader(SamFileHeader& header);
00358 
00359     /// Overwrites read record to read from the specific reference only.
00360     bool readIndexedRecord(SamFileHeader& header, SamRecord& record);
00361 
00362     bool processNewSection(SamFileHeader &header);
00363 
00364     IFILE  myFilePtr;
00365     GenericSamInterface* myInterfacePtr;
00366 
00367     /// Flag to indicate if a file is open for reading.
00368     bool myIsOpenForRead;
00369     /// Flag to indicate if a file is open for writing.
00370     bool myIsOpenForWrite;
00371     /// Flag to indicate if a header has been read/written - required before
00372     /// being able to read/write a record.
00373     bool myHasHeader;
00374 
00375     SortedType mySortedType;
00376 
00377     /// Previous values used for checking if the file is sorted.
00378     int32_t myPrevCoord;
00379     int32_t myPrevRefID;
00380     std::string myPrevReadName;
00381 
00382     /// Keep a count of the number of records that have been read/written so far.
00383     uint32_t myRecordCount;
00384 
00385     /// Pointer to the statistics for this file.
00386     SamStatistics* myStatistics;
00387    
00388     /// The status of the last SamFile command.
00389     SamStatus myStatus;
00390 
00391     /// Values for reading Sorted BAM files via the index.
00392     bool myIsBamOpenForRead;
00393     bool myNewSection;
00394     // whether to return reads that overlap (true) the section or
00395     // are fully enclosed (false) in the section.
00396     bool myOverlapSection;
00397     int32_t myRefID;
00398     int32_t myStartPos;
00399     int32_t myEndPos;
00400     uint64_t myCurrentChunkEnd;
00401     SortedChunkList myChunksToRead;
00402     BamIndex* myBamIndex;
00403 
00404     GenomeSequence* myRefPtr;
00405     SamRecord::SequenceTranslation myReadTranslation;
00406     SamRecord::SequenceTranslation myWriteTranslation;
00407     
00408     std::string myRefName;
00409 
00410 private:
00411     bool    myAttemptRecovery;
00412 
00413 public:
00414 
00415     bool attemptRecoverySync(bool (*checkSignature)(void *data) , int length);
00416 
00417     void setAttemptRecovery(bool flag = false)
00418     {
00419         myAttemptRecovery = flag;
00420     }
00421 
00422 };
00423 
00424 
00425 /// Child class of SamFile for reading files.
00426 class SamFileReader : public SamFile
00427 {
00428 public:
00429 
00430     /// Default Constructor.
00431     SamFileReader();
00432 
00433     /// Constructor that opens the specified file for read.
00434     SamFileReader(const char* filename);
00435 
00436     /// Constructor that opens the specified file for read.
00437     SamFileReader(const char* filename,
00438                   ErrorHandler::HandlingType errorHandlingType);
00439 
00440     /// Constructor that opens the specified file for read and reads
00441     /// the header from the file.
00442     SamFileReader(const char* filename,
00443                   SamFileHeader* header);
00444 
00445     /// Constructor that opens the specified file for read and reads
00446     /// the header from the file.
00447     SamFileReader(const char* filename,
00448                   ErrorHandler::HandlingType errorHandlingType,
00449                   SamFileHeader* header);
00450 
00451     virtual ~SamFileReader();
00452 };
00453 
00454 
00455 /// Child class of SamFile for writing files.
00456 class SamFileWriter : public SamFile
00457 {
00458 public:
00459     /// Default Constructor.
00460     SamFileWriter();
00461 
00462     /// Constructor that opens the specified file for write.
00463     SamFileWriter(const char* filename);
00464 
00465     /// Constructor that opens the specified file for write.
00466     SamFileWriter(const char* filename,
00467                   ErrorHandler::HandlingType errorHandlingType);
00468 
00469     /// Constructor that opens the specified file for write and write
00470     /// the specified header into the file.
00471     SamFileWriter(const char* filename,
00472                   SamFileHeader* header);
00473 
00474     /// Constructor that opens the specified file for write and write
00475     /// the specified header into the file.
00476     SamFileWriter(const char* filename,
00477                   ErrorHandler::HandlingType errorHandlingType,
00478                   SamFileHeader* header);
00479 
00480     virtual ~SamFileWriter();
00481 };
00482 
00483 #endif
Generated on Tue Sep 6 17:51:59 2011 for libStatGen Software by  doxygen 1.6.3