libStatGen Software  1
SamFileHeader.h
00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #ifndef __SAM_FILE_HEADER_H__
00019 #define __SAM_FILE_HEADER_H__
00020 
00021 #include <map>
00022 #include <stdint.h>
00023 
00024 #include "SamReferenceInfo.h"
00025 #include "SamHeaderHD.h"
00026 #include "SamHeaderSQ.h"
00027 #include "SamHeaderRG.h"
00028 #include "SamHeaderPG.h"
00029 
00030 /// This class allows a user to get/set the fields in a SAM/BAM Header.
00031 /// Sam/Bam headers contain comments and multiple SamHeaderRecords 
00032 /// (HD, SQs, RGs, PGs) comprised of tag/value pairs with each tag only
00033 /// appearing once within a specific record.
00034 class SamFileHeader
00035 {
00036 public:
00037     SamFileHeader();
00038     ~SamFileHeader();
00039 
00040     /////////////////////////////
00041     /// @name  Copying a Header
00042     /// These methods are ways of copying the contents of one header into
00043     /// another one.
00044     //@{
00045 
00046     /// Copy Constructor copies the specified header into this one.
00047     SamFileHeader(const SamFileHeader& header);
00048 
00049     /// Overload operator = to copy the passed in header into this header.
00050     SamFileHeader & operator = (const SamFileHeader& header);
00051 
00052     /// Copy method copies the passed in header into this header.
00053     /// Returns true if at least one header line was successfully copied.
00054     bool copy(const SamFileHeader& header);
00055     //@}
00056 
00057     /// Initialize the header.
00058     void resetHeader();
00059 
00060     /////////////////////////////
00061     /// @name  Get the Entire Header
00062     /// Get the entire header as a single string.
00063     //@{
00064 
00065     /// Set the passed in string to the entire header string, clearing its
00066     /// current contents.
00067     /// \return true if successfully set (even if set to "")
00068     bool getHeaderString(std::string& header) const;
00069 
00070     //@}
00071 
00072     /// Get the reference ID for the specified reference name (chromosome).
00073     /// If addID is set to true, a reference id will be created for the
00074     /// referenceName if one does not already exist.  If addID is set to
00075     /// false (default), it will return SamReferenceInfo::NO_REF_ID.
00076     int getReferenceID(const String & referenceName, bool addID = false);
00077 
00078     /// Get the reference ID for the specified reference name (chromosome).
00079     /// If addID is set to true, a reference id will be created for the
00080     /// referenceName if one does not already exist.  If addID is set to
00081     /// false (default), it will return SamReferenceInfo::NO_REF_ID.
00082     int getReferenceID(const char* referenceName, bool addID = false);
00083 
00084     /// Return the reference name (chromosome) for the specified reference id.
00085     const String & getReferenceLabel(int id) const;
00086 
00087     /// Get the Reference Information
00088     const SamReferenceInfo& getReferenceInfo() const;
00089 
00090     // Get the Reference Information for updating separately when reading
00091     // BAMs...should only be called by BamInterface.
00092     SamReferenceInfo& getReferenceInfoForBamInterface();
00093 
00094     ////////////////////////////////////////////////////////////////////////
00095     // Set Values in the header
00096     ////////////////////////////////////////////////////////////////////////
00097 
00098     /////////////////////////////////////////
00099     /// @name  Adding an entire header/comment line.
00100     /// These methods are ways of adding an entire header line at once.
00101     //@{
00102 
00103     /// Add a header line that is just one tag with a const char* value.
00104     /// Note: This method will only do one tag per type on a line, so if a
00105     /// type has multiple tags, the whole line needs to be added at once,
00106     /// and a different method should be used.
00107     bool addHeaderLine(const char* type, const char* tag, const char* value); 
00108 
00109     /// Add a header line that is already preformatted in a const char*.
00110     /// Returns true if at least one header line was successfully added.
00111     bool addHeaderLine(const char* headerLine);
00112 
00113     /// Add a header that is already preformatted in a const char*.
00114     /// Returns true if at least one header line was successfully added.
00115     bool addHeader(const char* header);
00116 
00117     /// Add the specified comment to the header (do not include "@CO" or "\n").
00118     /// \return true if successfully added, false if not.
00119     bool addComment(const char* comment);
00120 
00121     //@}
00122 
00123     
00124     /////////////////////////////////////////
00125     /// @name  Set/Add/Remove a Single Tag
00126     /// The passed in tag should be the two character SAM tag as defined
00127     /// in the SAM spec.  A tag is removed from the header record by setting
00128     /// it to "". For the SQ and RG header types, the key tags (SN for SQ
00129     /// and ID for RG) may not be modified or removed once set. This is
00130     /// because these values are used as a lookup key for the header record, 
00131     /// so the entire record must be removed.
00132     //@{
00133 
00134 //     // Set the specified header type tag to the specified value in the 
00135 //     // header with the specified keyID.  keyID must be specified when
00136 //     // type = SQ, RG, or PG.
00137 //     bool setTag(SamHeaderRecord::SamHeaderRecordType type, const char* tag,
00138 //                 const char* value, const char* keyID = NULL);
00139 
00140     /// Set the specified tag to the specified value in the HD header, remove
00141     /// the tag by specifying value="".
00142     /// \return true if the tag was successfully set, false if not.
00143     bool setHDTag(const char* tag, const char* value);
00144 
00145     /// Set the specified tag to the specified value in the SQ header with
00146     /// the specified name, remove the tag by specifying value="".  If the
00147     /// header does not yet exist, the tag must be "LN" and the header is added
00148     /// with the specified LN value and the SN value passed in name.  
00149     /// The SN & LN tags may not be modified or removed after they are
00150     /// set unless the entire record is deleted.
00151     /// \return true if the tag was successfully set, false if not.
00152     bool setSQTag(const char* tag, const char* value, const char* name);
00153 
00154     /// Set the specified tag to the specified value in the RG header with
00155     /// the specified id, remove the tag by specifying value="".  If the
00156     /// header does not yet exist, the header is added and so is the ID tag
00157     /// with the value set to the passed in id.  The ID tag may not be 
00158     /// modified or removed after it is set unless the entire record is deleted.
00159     /// \return true if the tag was successfully set, false if not.
00160     bool setRGTag(const char* tag, const char* value, const char* id);
00161 
00162     /// Set the specified tag to the specified value in the PG header with
00163     /// the specified id, remove the tag by specifying value="".  If the
00164     /// header does not yet exist, the header is added and so is the ID tag
00165     /// with the value set to the passed in id.  The ID tag may not be 
00166     /// modified or removed after it is set unless the entire record is deleted.
00167     /// \return true if the tag was successfully set, false if not.
00168     bool setPGTag(const char* tag, const char* value, const char* id);
00169 
00170     //@}
00171 
00172     /////////////////////////////////////////
00173     /// @name  Add an Already Setup SamHeaderRecord
00174     /// NOTE: These methods add a pointer to the passed in record.
00175     /// The header record will be deleted when it's cleaned up from this header.
00176     /// NOTE: Do NOT delete the passed in record, the SamFileHeader class
00177     /// takes care of that itself.
00178     //@{
00179 
00180     /// Add the HD record to the header.
00181     /// Note: it adds a pointer to the passed in header record.  The header
00182     /// record will be deleted when it is cleaned up from this header.
00183     /// \return true if the record was successfully added, false otherwise.
00184     bool addHD(SamHeaderHD* hd);
00185 
00186     /// Add the SQ record to the header.
00187     /// Note: it adds a pointer to the passed in header record.  The header
00188     /// record will be deleted when it is cleaned up from this header.
00189     /// \return true if the record was successfully added, false otherwise.
00190     bool addSQ(SamHeaderSQ* sq);
00191 
00192     /// Add the RG record to the header.
00193     /// Note: it adds a pointer to the passed in header record.  The header
00194     /// record will be deleted when it is cleaned up from this header.
00195     /// \return true if the record was successfully added, false otherwise.
00196     bool addRG(SamHeaderRG* rg);
00197 
00198     /// Add the PG record to the header.
00199     /// Note: it adds a pointer to the passed in header record.  The header
00200     /// record will be deleted when it is cleaned up from this header.
00201     /// \return true if the record was successfully added, false otherwise.
00202     bool addPG(SamHeaderPG* pg);
00203 
00204     /// Add a copy of the specified header record to the header.
00205     /// Note: it creates a new header record that is identical to the specified
00206     /// one and adds it to the header.  The passed in pointer will not be
00207     /// deleted due to this.
00208     /// \return true if the record was successfully added, false otherwise.
00209     bool addRecordCopy(const SamHeaderRecord& hdrRec);
00210 
00211     //@}
00212 
00213     ////////////////////////////////////////////////////////////////////////
00214     /// @name  Remove an Entire Header Record
00215     //@{
00216 
00217     /// Remove the HD record.
00218     /// \return true if successfully removed or did not exist, false if
00219     /// the record still exists.
00220     bool removeHD();
00221 
00222     /// Remove SQ record with the specified key.
00223     /// NOTE: Does not remove it from the BAM index.
00224     /// \return true if successfully removed or did not exist, false if
00225     /// the record still exists.
00226     bool removeSQ(const char* name);
00227 
00228     /// Remove RG record with the specified key.
00229     /// \return true if successfully removed or did not exist, false if
00230     /// the record still exists.
00231     bool removeRG(const char* id);
00232 
00233     /// Remove PG record with the specified key.
00234     /// \return true if successfully removed or did not exist, false if
00235     /// the record still exists.
00236     bool removePG(const char* id);
00237 
00238     //@}
00239 
00240     ////////////////////////////////////////////////////////////////////////
00241     /// @name  Get a Specific Tag
00242     /// These methods return the value associated with the specified tag.
00243     /// If the tag does not exist in the record "" is returned.
00244     ///
00245     /// For SQ, RG, and PG the value returned is for the tag associated with
00246     /// the specified key (name/id). If a record with that key does not exist
00247     /// or if the tag does not exist for the record with that key, "" is 
00248     /// returned.
00249     //@{
00250 
00251     /// Returns the value associated with the specified HD tag, returning "" if
00252     /// the tag does not exist in the header.
00253     const char* getHDTagValue(const char* tag);
00254 
00255     /// Get the value associated with the specified tag on the SQ line with
00256     /// the specified sequence name, returning "" if the tag or key does
00257     /// not exist.
00258     const char* getSQTagValue(const char* tag, const char* name);
00259 
00260     /// Get the value associated with the specified tag on the RG line with
00261     /// the specified read group identifier, returning "" if the tag or key does
00262     /// not exist.
00263     const char* getRGTagValue(const char* tag, const char* id);
00264 
00265     /// Get the value associated with the specified tag on the RG line with
00266     /// the specified id, returning "" if the tag or key does
00267     /// not exist.
00268     const char* getPGTagValue(const char* tag, const char* id);
00269 
00270     //@}
00271 
00272     /// Get the number of SQ objects.
00273     int getNumSQs();
00274 
00275     /// Get the number of RG objects.
00276     int getNumRGs();
00277 
00278     /// Get the number of PG objects.
00279     int getNumPGs();
00280 
00281     ////////////////////////////////////////////////////////////////////////
00282     /// @name  Get a Specific Header Record
00283     /// These methods return a reference to the specific record that was
00284     /// requested, returning NULL if that record does not exist in the header.
00285     ///
00286     /// The returned record can be modified to add/remove some tags.
00287     /// Since a reference is returned, the SamHeaderFile automatically 
00288     /// reflects these changes.
00289     //@{
00290 
00291     /// Get the HD object, returning NULL if there is no HD record.
00292     SamHeaderHD* getHD();
00293 
00294     /// Get the SQ object with the specified sequence name, returning NULL
00295     /// if there is no SQ object with that key.
00296     SamHeaderSQ* getSQ(const char* name);
00297 
00298     /// Get the RG object with the specified read group identifier, returning
00299     /// NULL if there is no RG object with that key..
00300     SamHeaderRG* getRG(const char* id);
00301 
00302     /// Get the PG object with the specified id, returning NULL
00303     /// if there is no PG object with that key..
00304     SamHeaderPG* getPG(const char* id);
00305 
00306     //@}
00307 
00308 //     //////////////////////////////////
00309 //     // Set methods for header fields.
00310 //     bool setVersion(const char* version);
00311 //     bool setSortOrder(const char* sortOrder);
00312 //     bool addSequenceName(const char* sequenceName);
00313 //     bool setSequenceLength(const char* keyID, int sequenceLength);
00314 //     bool setGenomeAssemblyId(const char* keyID, const char* genomeAssemblyId);
00315 //     bool setMD5Checksum(const char* keyID, const char* md5sum);
00316 //     bool setURI(const char* keyID, const char* uri);
00317 //     bool setSpecies(const char* keyID, const char* species);
00318 //     bool addReadGroupID(const char* readGroupID);
00319 //     bool setSample(const char* keyID, const char* sample);
00320 //     bool setLibrary(const char* keyID, const char* library);
00321 //     bool setDescription(const char* keyID, const char* description);
00322 //     bool setPlatformUnit(const char* keyID, const char* platform);
00323 //     bool setPredictedMedianInsertSize(const char* keyID, const char* isize);
00324 //     bool setSequencingCenter(const char* keyID, const char* center);
00325 //     bool setRunDate(const char* keyID, const char* runDate);
00326 //     bool setTechnology(const char* keyID, const char* technology);
00327 //     bool addProgram(const char* programID);
00328 //     bool setProgramVersion(const char* keyID, const char* version);
00329 //     bool setCommandLine(const char* keyID, const char* commandLine);
00330     
00331 //     ///////////////////////////////////
00332 //     // Get methods for header fields.
00333 //     // Returns the number of SQ entries in the header.
00334 //     int32_t getSequenceDictionaryCount();
00335 
00336     /// Return the Sort Order value that is set in the Header, returning ""
00337     /// if this field does not exist.
00338     const char* getSortOrder();
00339 
00340 
00341     /// DEPRECATED
00342     const char* getTagSO();
00343 
00344     /////////////////////////////
00345     /// @name  Get the Header Record/Comment/Line by Record/Comment/Line
00346     /// These methods iterate through the header.
00347     /// NOTE: both getNextHeaderRecord and getNextHeaderLine increment the
00348     /// same iterator.  getNextHeaderRecord that takes a header type
00349     /// uses the same iterator as the getNextXXRecord with that type.
00350     /// Otherwise the iterators are independent.
00351     //@{
00352 
00353     /// Get the next SQ header record.  After all SQ headers have been
00354     /// retrieved, NULL is returned until a reset is called.
00355     /// Independent from getNextHeaderRecord, getNextHeaderLine and the
00356     /// other getNextXXRecord methods and the associated reset methods.
00357     SamHeaderRecord* getNextSQRecord();
00358 
00359     /// Get the next RG header record.  After all RG headers have been
00360     /// retrieved, NULL is returned until a reset is called.
00361     /// Independent from getNextHeaderRecord, getNextHeaderLine and the
00362     /// other getNextXXRecord methods and the associated reset methods.
00363     SamHeaderRecord* getNextRGRecord();
00364 
00365     /// Get the next PG header record.  After all PG headers have been
00366     /// retrieved, NULL is returned until a reset is called.
00367     /// Independent from getNextHeaderRecord, getNextHeaderLine and the
00368     /// other getNextXXRecord methods and the associated reset methods.
00369     SamHeaderRecord* getNextPGRecord();
00370 
00371     /// Reset to the beginning of the header records so the next call
00372     /// to getNextSQRecord returns the first SQ header record.
00373     void resetSQRecordIter();
00374 
00375     /// Reset to the beginning of the header records so the next call
00376     /// to getNextRGRecord returns the first RG header record.
00377     void resetRGRecordIter();
00378 
00379     /// Reset to the beginning of the header records so the next call
00380     /// to getNextPGRecord returns the first PG header record.
00381     void resetPGRecordIter();
00382 
00383     /// Get the next header record of the specified type starting from the
00384     /// specified index and update the index.
00385     /// After all headers of that type have been retrieved,
00386     /// NULL is returned until a reset is called for that type.
00387     SamHeaderRecord* getNextHeaderRecord(uint32_t& index, 
00388                                          SamHeaderRecord::SamHeaderRecordType headerType);
00389 
00390     /// Get the next header record, but not comment line.  After all headers
00391     /// have been retrieved, NULL is returned until a reset is called.
00392     /// NOTE: both getNextHeaderRecord and getNextHeaderLine increment the
00393     /// same iterator.
00394     SamHeaderRecord* getNextHeaderRecord();
00395 
00396     /// Set the passed in string to the next header line, overwritting
00397     /// the passed in string.  If there are no more header lines or there
00398     /// is an error, false is returned and the passed in string is set to ""
00399     /// until a rest is called.
00400     /// NOTE: both getNextHeaderRecord and getNextHeaderLine increment the
00401     /// same iterator.
00402     bool getNextHeaderLine(std::string &headerLine);
00403 
00404     /// Reset to the beginning of the header records so the next call
00405     /// to getNextHeaderRecord returns the first header line.
00406     void resetHeaderRecordIter();
00407    
00408     /// Append all of the comment lines to the specified string.
00409     void appendCommentLines(std::string &commentLines);
00410 
00411     /// Returns the comment on the next comment line.  Returns "" if all comment
00412     /// lines have been returned, until resetCommentIter is called.
00413     const char* getNextComment();
00414 
00415     /// Resets to the beginning of the comments so getNextComment returns
00416     /// the first comment.
00417     void resetCommentIter();
00418 
00419     //@}
00420 
00421 
00422     /// Get the failure message if a method returned failure.
00423     const char* getErrorMessage()  { return(myErrorMessage.c_str()); }
00424 
00425     static const std::string EMPTY_RETURN;
00426 
00427 private:
00428     // Parse the header string. 
00429     bool parseHeader(String& header);
00430 
00431     // Parse the specified line of the header.
00432     bool parseHeaderLine(const String& headerLine);
00433 
00434     // Set the passed in string to the header line at the specified index.
00435     // It does NOT clear the current contents of header.
00436     bool getHeaderLine(unsigned int index, std::string& header) const;
00437 
00438     int16_t makeKey(char ch1, char ch2)
00439     {
00440         return((ch1 << 8) + ch2);
00441     }
00442 
00443     // Only one HD type is allowed per file.
00444     SamHeaderHD* myHD;
00445 
00446     // There can be multiple SQ Types, indexed by SN.
00447     StringHash mySQs;
00448 
00449     // There can be multiple RG Types, indexed by ID.
00450     StringHash myRGs;
00451 
00452     // There can be multiple PG types, indexed by ID.
00453     StringHash myPGs;
00454 
00455     // Reference Name information
00456     SamReferenceInfo myReferenceInfo;
00457 
00458     // Vector of comments
00459     std::vector<std::string> myComments;
00460 
00461     std::vector<SamHeaderRecord*> myHeaderRecords;
00462 
00463     std::string myErrorMessage;
00464 
00465     uint32_t myCurrentSQIndex;
00466 
00467     uint32_t myCurrentRGIndex;
00468 
00469     uint32_t myCurrentPGIndex;
00470 
00471     uint32_t myCurrentHeaderIndex;
00472 
00473     uint32_t myCurrentCommentIndex;
00474 };
00475 
00476 #endif
00477 
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends