SamFileHeader.h

00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #ifndef __SAM_FILE_HEADER_H__
00019 #define __SAM_FILE_HEADER_H__
00020 
00021 #include <map>
00022 #include <stdint.h>
00023 
00024 #include "SamReferenceInfo.h"
00025 #include "SamHeaderHD.h"
00026 #include "SamHeaderSQ.h"
00027 #include "SamHeaderRG.h"
00028 #include "SamHeaderPG.h"
00029 #include "SamStatus.h"
00030 
00031 /// This class allows a user to get/set the fields in a SAM/BAM Header.
00032 /// Sam/Bam headers contain comments and multiple SamHeaderRecords
00033 /// (HD, SQs, RGs, PGs) comprised of tag/value pairs with each tag only
00034 /// appearing once within a specific record.
00035 class SamFileHeader
00036 {
00037 public:
00038     SamFileHeader();
00039     ~SamFileHeader();
00040 
00041     /////////////////////////////
00042     /// @name  Copying a Header
00043     /// These methods are ways of copying the contents of one header into
00044     /// another one.
00045     //@{
00046 
00047     /// Copy Constructor copies the specified header into this one.
00048     SamFileHeader(const SamFileHeader& header);
00049 
00050     /// Overload operator = to copy the passed in header into this header.
00051     SamFileHeader & operator = (const SamFileHeader& header);
00052 
00053     /// Copy method copies the passed in header into this header.
00054     bool copy(const SamFileHeader& header);
00055     //@}
00056 
00057     /// Initialize the header.
00058     void resetHeader();
00059 
00060     /////////////////////////////
00061     /// @name  Get the Entire Header
00062     /// Get the entire header as a single string.
00063     //@{
00064 
00065     /// Set the passed in string to the entire header string, clearing its
00066     /// current contents.
00067     /// \return true if successfully set (even if set to "")
00068     bool getHeaderString(std::string& header) const;
00069 
00070     //@}
00071 
00072     /// Get the reference ID for the specified reference name (chromosome).
00073     /// If addID is set to true, a reference id will be created for the
00074     /// referenceName if one does not already exist.  If addID is set to
00075     /// false (default), SamReferenceInfo::NO_REF_ID is returned in that case.
00076     int   getReferenceID(const String & referenceName, bool addID = false);
00077 
00078     /// Get the reference ID for the specified reference name (chromosome).
00079     /// If addID is set to true, a reference id will be created for the
00080     /// referenceName if one does not already exist.  If addID is set to
00081     /// false (default), SamReferenceInfo::NO_REF_ID is returned in that case.
00082     int   getReferenceID(const char* referenceName, bool addID = false);
00083 
00084     /// Return the reference name (chromosome) for the specified reference id.
00085     const String & getReferenceLabel(int id) const;
00086 
00087     /// Get the Reference Information
00088     const SamReferenceInfo* getReferenceInfo() const;
00089 
00090     /// Add reference sequence name and reference sequence length to the header.
00091     void addReferenceInfo(const char* referenceSequenceName, 
00092                           int32_t referenceSequenceLength);
00093 
00094     /// Populate the reference info from the SQ fields.
00095     void generateReferenceInfo();
00096 
00097     ////////////////////////////////////////////////////////////////////////
00098     // Set Values in the header
00099     ////////////////////////////////////////////////////////////////////////
00100 
00101     /////////////////////////////////////////
00102     /// @name  Adding an entire header/comment line.
00103     /// These methods are ways of adding an entire header line at once.
00104     //@{
00105 
00106     /// Add a header line that is just one tag with a const char* value.
00107     /// Note: This method will only do one tag per type on a line, so if a
00108     /// type has multiple tags, the whole line needs to be added at once,
00109     /// and a different method should be used.
00110     bool addHeaderLine(const char* type, const char* tag, const char* value); 
00111 
00112     /// Add a header line that is already preformatted in a const char* without
00113     /// a trailing "\n".
00114     bool addHeaderLine(const char* headerLine);
00115 
00116     /// Add the specified comment to the header (do not include "@CO" or "\n").
00117     /// \return true if successfully added, false if not.
00118     bool addComment(const char* comment);
00119 
00120     //@}
00121 
00122     
00123     /////////////////////////////////////////
00124     /// @name  Set/Add/Remove a Single Tag
00125     /// The passed in tag should be the two character SAM tag as defined
00126     /// in the SAM spec.  A tag is removed from the header record by setting
00127     /// it to "". For the SQ and RG header types, the key tags (SN for SQ
00128     /// and ID for RG) may not be modified or removed once set. This is
00129     /// because these values are used as a lookup key for the header record,
00130     /// so the entire record must be removed.
00131     //@{
00132 
00133 //     // Set the specified header type tag to the specified value in the 
00134 //     // header with the specified keyID.  keyID must be specified when
00135 //     // type = SQ, RG, or PG.
00136 //     bool setTag(SamHeaderRecord::SamHeaderRecordType type, const char* tag,
00137 //                 const char* value, const char* keyID = NULL);
00138 
00139     /// Set the specified tag to the specified value in the HD header, remove
00140     /// the tag by specifying value="".
00141     /// \return true if the tag was successfully set, false if not.
00142     bool setHDTag(const char* tag, const char* value);
00143 
00144     /// Set the specified tag to the specified value in the SQ header with
00145     /// the specified name, remove the tag by specifying value="".  If the
00146     /// header does not yet exist, the header is added and so is the SN tag
00147     /// with the value set to the passed in name.  The SN tag may not be
00148     /// modified or removed after it is set unless the entire record is deleted.
00149     /// \return true if the tag was successfully set, false if not.
00150     bool setSQTag(const char* tag, const char* value, const char* name);
00151 
00152     /// Set the specified tag to the specified value in the RG header with
00153     /// the specified id, remove the tag by specifying value="".  If the
00154     /// header does not yet exist, the header is added and so is the ID tag
00155     /// with the value set to the passed in id.  The ID tag may not be
00156     /// modified or removed after it is set unless the entire record is deleted.
00157     /// \return true if the tag was successfully set, false if not.
00158     bool setRGTag(const char* tag, const char* value, const char* id);
00159 
00160     /// Set the specified tag to the specified value in the PG header with
00161     /// the specified id, remove the tag by specifying value="".  If the
00162     /// header does not yet exist, the header is added and so is the ID tag
00163     /// with the value set to the passed in id.  The ID tag may not be
00164     /// modified or removed after it is set unless the entire record is deleted.
00165     /// \return true if the tag was successfully set, false if not.
00166     bool setPGTag(const char* tag, const char* value, const char* id);
00167 
00168     //@}
00169 
00170     /////////////////////////////////////////
00171     /// @name  Add an Already Setup SamHeaderRecord
00172     /// NOTE: These methods add a pointer to the passed in record.
00173     /// The header record will be deleted when it's cleaned up from this header.
00174     /// NOTE: Do NOT delete the passed in record, the SamFileHeader class
00175     /// takes care of that itself.
00176     //@{
00177 
00178     /// Add the HD record to the header.
00179     /// Note: it adds a pointer to the passed in header record.  The header
00180     /// record will be deleted when it is cleaned up from this header.
00181     /// \return true if the record was successfully added, false otherwise.
00182     bool addHD(SamHeaderHD* hd);
00183 
00184     /// Add the SQ record to the header.
00185     /// Note: it adds a pointer to the passed in header record.  The header
00186     /// record will be deleted when it is cleaned up from this header.
00187     /// \return true if the record was successfully added, false otherwise.
00188     bool addSQ(SamHeaderSQ* sq);
00189 
00190     /// Add the RG record to the header.
00191     /// Note: it adds a pointer to the passed in header record.  The header
00192     /// record will be deleted when it is cleaned up from this header.
00193     /// \return true if the record was successfully added, false otherwise.
00194     bool addRG(SamHeaderRG* rg);
00195 
00196     /// Add the PG record to the header.
00197     /// Note: it adds a pointer to the passed in header record.  The header
00198     /// record will be deleted when it is cleaned up from this header.
00199     /// \return true if the record was successfully added, false otherwise.
00200     bool addPG(SamHeaderPG* pg);
00201 
00202     //@}
00203 
00204     ////////////////////////////////////////////////////////////////////////
00205     /// @name  Remove an Entire Header Record
00206     //@{
00207 
00208     /// Remove the HD record.
00209     /// \return true if successfully removed or did not exist, false if
00210     /// the record still exists.
00211     bool removeHD();
00212 
00213     /// Remove SQ record with the specified key.
00214     /// \return true if successfully removed or did not exist, false if
00215     /// the record still exists.
00216     bool removeSQ(const char* name);
00217 
00218     /// Remove RG record with the specified key.
00219     /// \return true if successfully removed or did not exist, false if
00220     /// the record still exists.
00221     bool removeRG(const char* id);
00222 
00223     /// Remove PG record with the specified key.
00224     /// \return true if successfully removed or did not exist, false if
00225     /// the record still exists.
00226     bool removePG(const char* id);
00227 
00228     //@}
00229 
00230 
00231     ////////////////////////////////////////////////////////////////////////
00232     // Set the header from a BAM file (inferred from the name; see the implementation).
00233     ////////////////////////////////////////////////////////////////////////
00234     SamStatus::Status setHeaderFromBamFile(IFILE filePtr);
00235     
00236 
00237     ////////////////////////////////////////////////////////////////////////
00238     /// @name  Get a Specific Tag
00239     /// These methods return the value associated with the specified tag.
00240     /// If the tag does not exist in the record "" is returned.
00241     ///
00242     /// For SQ, RG, and PG the value returned is for the tag associated with
00243     /// the specified key (name/id). If a record with that key does not exist
00244     /// or if the tag does not exist for the record with that key, "" is
00245     /// returned.
00246     //@{
00247 
00248     /// Returns the value associated with the specified HD tag, returning "" if
00249     /// the tag does not exist in the header.
00250     const char* getHDTagValue(const char* tag);
00251 
00252     /// Get the value associated with the specified tag on the SQ line with
00253     /// the specified sequence name, returning "" if the tag or key does
00254     /// not exist.
00255     const char* getSQTagValue(const char* tag, const char* name);
00256 
00257     /// Get the value associated with the specified tag on the RG line with
00258     /// the specified read group identifier, returning "" if the tag or key does
00259     /// not exist.
00260     const char* getRGTagValue(const char* tag, const char* id);
00261 
00262     /// Get the value associated with the specified tag on the PG line with
00263     /// the specified id, returning "" if the tag or key does
00264     /// not exist.
00265     const char* getPGTagValue(const char* tag, const char* id);
00266 
00267     //@}
00268 
00269     /// Get the number of SQ objects.
00270     int getNumSQs();
00271 
00272     /// Get the number of RG objects.
00273     int getNumRGs();
00274 
00275     /// Get the number of PG objects.
00276     int getNumPGs();
00277 
00278     ////////////////////////////////////////////////////////////////////////
00279     /// @name  Get a Specific Header Record
00280     /// These methods return a pointer to the specific record that was
00281     /// requested, returning NULL if that record does not exist in the header.
00282     ///
00283     /// The returned record can be modified to add/remove some tags.
00284     /// Since a pointer to the stored record is returned, the SamFileHeader
00285     /// automatically reflects these changes.
00286     //@{
00287 
00288     /// Get the HD object, returning NULL if there is no HD record.
00289     SamHeaderHD* getHD();
00290 
00291     /// Get the SQ object with the specified sequence name, returning NULL
00292     /// if there is no SQ object with that key.
00293     SamHeaderSQ* getSQ(const char* name);
00294 
00295     /// Get the RG object with the specified read group identifier, returning
00296     /// NULL if there is no RG object with that key.
00297     SamHeaderRG* getRG(const char* id);
00298 
00299     /// Get the PG object with the specified id, returning NULL
00300     /// if there is no PG object with that key.
00301     SamHeaderPG* getPG(const char* id);
00302 
00303     //@}
00304 
00305 //     //////////////////////////////////
00306 //     // Set methods for header fields.
00307 //     bool setVersion(const char* version);
00308 //     bool setSortOrder(const char* sortOrder);
00309 //     bool addSequenceName(const char* sequenceName);
00310 //     bool setSequenceLength(const char* keyID, int sequenceLength);
00311 //     bool setGenomeAssemblyId(const char* keyID, const char* genomeAssemblyId);
00312 //     bool setMD5Checksum(const char* keyID, const char* md5sum);
00313 //     bool setURI(const char* keyID, const char* uri);
00314 //     bool setSpecies(const char* keyID, const char* species);
00315 //     bool addReadGroupID(const char* readGroupID);
00316 //     bool setSample(const char* keyID, const char* sample);
00317 //     bool setLibrary(const char* keyID, const char* library);
00318 //     bool setDescription(const char* keyID, const char* description);
00319 //     bool setPlatformUnit(const char* keyID, const char* platform);
00320 //     bool setPredictedMedianInsertSize(const char* keyID, const char* isize);
00321 //     bool setSequencingCenter(const char* keyID, const char* center);
00322 //     bool setRunDate(const char* keyID, const char* runDate);
00323 //     bool setTechnology(const char* keyID, const char* technology);
00324 //     bool addProgram(const char* programID);
00325 //     bool setProgramVersion(const char* keyID, const char* version);
00326 //     bool setCommandLine(const char* keyID, const char* commandLine);
00327     
00328 //     ///////////////////////////////////
00329 //     // Get methods for header fields.
00330 //     // Returns the number of SQ entries in the header.
00331 //     int32_t getSequenceDictionaryCount();
00332 
00333     /// Return the Sort Order value that is set in the Header, returning ""
00334     /// if this field does not exist.
00335     const char* getSortOrder();
00336 
00337 
00338     /// DEPRECATED (presumably superseded by getSortOrder(); confirm).
00339     const char* getTagSO();
00340 
00341     /////////////////////////////
00342     /// @name  Get the Header Record/Comment/Line by Record/Comment/Line
00343     /// These methods iterate through the header.
00344     /// NOTE: both getNextHeaderRecord and getNextHeaderLine increment the
00345     /// same iterator.  getNextHeaderRecord that takes a header type
00346     /// uses the same iterator as the getNextXXRecord with that type.
00347     /// Otherwise the iterators are independent.
00348     //@{
00349 
00350     /// Get the next SQ header record.  After all SQ headers have been
00351     /// retrieved, NULL is returned until a reset is called.
00352     /// Independent from getNextHeaderRecord, getNextHeaderLine and the
00353     /// other getNextXXRecord methods and the associated reset methods.
00354     SamHeaderRecord* getNextSQRecord();
00355 
00356     /// Get the next RG header record.  After all RG headers have been
00357     /// retrieved, NULL is returned until a reset is called.
00358     /// Independent from getNextHeaderRecord, getNextHeaderLine and the
00359     /// other getNextXXRecord methods and the associated reset methods.
00360     SamHeaderRecord* getNextRGRecord();
00361 
00362     /// Get the next PG header record.  After all PG headers have been
00363     /// retrieved, NULL is returned until a reset is called.
00364     /// Independent from getNextHeaderRecord, getNextHeaderLine and the
00365     /// other getNextXXRecord methods and the associated reset methods.
00366     SamHeaderRecord* getNextPGRecord();
00367 
00368     /// Reset to the beginning of the header records so the next call
00369     /// to getNextSQRecord returns the first SQ header record.
00370     void resetSQRecordIter();
00371 
00372     /// Reset to the beginning of the header records so the next call
00373     /// to getNextRGRecord returns the first RG header record.
00374     void resetRGRecordIter();
00375 
00376     /// Reset to the beginning of the header records so the next call
00377     /// to getNextPGRecord returns the first PG header record.
00378     void resetPGRecordIter();
00379 
00380     /// Get the next header record of the specified type starting from the
00381     /// specified index and update the index.
00382     /// After all headers of that type have been retrieved,
00383     /// NULL is returned until a reset is called for that type.
00384     SamHeaderRecord* getNextHeaderRecord(uint32_t& index, 
00385                                          SamHeaderRecord::SamHeaderRecordType headerType);
00386 
00387     /// Get the next header record, but not comment line.  After all headers
00388     /// have been retrieved, NULL is returned until a reset is called.
00389     /// NOTE: both getNextHeaderRecord and getNextHeaderLine increment the
00390     /// same iterator.
00391     SamHeaderRecord* getNextHeaderRecord();
00392 
00393     /// Set the passed in string to the next header/comment line, overwriting
00394     /// the passed in string.  If there are no more header lines or there
00395     /// is an error, false is returned and the passed in string is set to ""
00396     /// until a reset is called.
00397     /// NOTE: both getNextHeaderRecord and getNextHeaderLine increment the
00398     /// same iterator.
00399     bool getNextHeaderLine(std::string &headerLine);
00400 
00401     /// Reset to the beginning of the header records so the next call
00402     /// to getNextHeaderRecord returns the first header line.
00403     void resetHeaderRecordIter();
00404    
00405     /// Returns the comment on the next comment line.  Returns "" if all comment
00406     /// lines have been returned, until resetCommentIter is called.
00407     const char* getNextComment();
00408 
00409     /// Resets to the beginning of the comments so getNextComment returns
00410     /// the first comment.
00411     void resetCommentIter();
00412 
00413     //@}
00414 
00415 
00416 private:
00417     // Parse the header string.
00418     bool parseHeader(String& header);
00419 
00420     // Parse the specified line of the header.
00421     bool parseHeaderLine(const String& headerLine);
00422 
00423     // Set the passed in string to the header line at the specified index.
00424     // It does NOT clear the current contents of header.
00425     bool getHeaderLine(unsigned int index, std::string& header) const;
00426     // Pack two characters into a 16-bit key: ch1 in the high byte, ch2 in the low.
00427     int16_t makeKey(char ch1, char ch2)
00428     {   // Go through unsigned char so bytes >= 0x80 neither sign-extend nor shift a negative value.
00429         return static_cast<int16_t>((static_cast<unsigned char>(ch1) << 8) | static_cast<unsigned char>(ch2));
00430     }
00431 
00432     // Only one HD type is allowed per file.
00433     SamHeaderHD* myHD;
00434 
00435     // There can be multiple SQ Types, indexed by SN.
00436     StringHash mySQs;
00437 
00438     // There can be multiple RG Types, indexed by ID.
00439     StringHash myRGs;
00440 
00441     // There can be multiple PG types, indexed by ID.
00442     StringHash myPGs;
00443 
00444     // Reference Name information
00445     SamReferenceInfo myReferenceInfo;
00446 
00447     // Vector of comments
00448     std::vector<std::string> myComments;
00449 
00450     std::vector<SamHeaderRecord*> myHeaderRecords;
00451     // Current positions for the SQ/RG/PG/header/comment iteration (presumably used by getNextXX/resetXX; confirm).
00452     uint32_t myCurrentSQIndex;
00453 
00454     uint32_t myCurrentRGIndex;
00455 
00456     uint32_t myCurrentPGIndex;
00457 
00458     uint32_t myCurrentHeaderIndex;
00459 
00460     uint32_t myCurrentCommentIndex;
00461     // NOTE(review): presumably the "" value handed back when a tag/record is missing; confirm in the .cpp.
00462     static const std::string EMPTY_RETURN;
00463 };
00464 
00465 #endif
00466 
Generated on Tue Sep 6 17:51:59 2011 for libStatGen Software by  doxygen 1.6.3