SamFileHeader.h

00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #ifndef __SAM_FILE_HEADER_H__
00019 #define __SAM_FILE_HEADER_H__
00020 
00021 #include <map>
00022 #include <stdint.h>
00023 
00024 #include "SamReferenceInfo.h"
00025 #include "SamHeaderHD.h"
00026 #include "SamHeaderSQ.h"
00027 #include "SamHeaderRG.h"
00028 #include "SamHeaderPG.h"
00029 #include "SamStatus.h"
00030 
00031 class SamFileHeader // SAM/BAM file header: holds the HD, SQ, RG, PG records, comments, and reference info.
00032 {
00033 public:
00034     SamFileHeader();
00035     ~SamFileHeader();
00036 
00037     // Copy Constructor
00038     SamFileHeader(const SamFileHeader& header);
00039 
00040     // Overload operator = to copy the passed in header into this header.
00041     SamFileHeader & operator = (const SamFileHeader& header);
00042 
00043     // Copy the passed in header into this header.
00044     bool copy(const SamFileHeader& header); // NOTE(review): presumably true on success - confirm.
00045 
00046     void resetHeader(); // NOTE(review): presumably clears the header contents - confirm.
00047 
00048     // Set the passed in string to the entire header string, clearing its
00049     // current contents.
00050     // Return true if successfully set (even if set to "").
00051     bool getHeaderString(std::string& header) const;
00052 
00053     int   getReferenceID(const String & referenceName); // Look up the reference ID for a reference name.
00054     int   getReferenceID(const char* referenceName); // Same lookup, taking a C string.
00055     const String & getReferenceLabel(int id) const; // Reference label for the given reference ID.
00056 
00057     // Get the Reference Information
00058     const SamReferenceInfo* getReferenceInfo() const;
00059 
00060     // Add reference sequence name and reference sequence length to the header.
00061     void addReferenceInfo(const char* referenceSequenceName, 
00062                           int32_t referenceSequenceLength);
00063 
00064     ////////////////////////////////////////////////////////////////////////
00065     // Set Values in the header
00066     ////////////////////////////////////////////////////////////////////////
00067 
00068     // Add a header line that is just one tag with a const char* value.
00069     bool addHeaderLine(const char* type, const char* tag, const char* value); 
00070     // Add a header line that is already preformatted in a const char*.
00071     // It is assumed that the line does not contain a \n.
00072     bool addHeaderLine(const char* headerLine);
00073 
00074 //     // Set the specified header type tag to the specified value in the 
00075 //     // header with the specified keyID.  keyID must be specified when
00076 //     // type = SQ, RG, or PG.
00077 //     bool setTag(SamHeaderRecord::SamHeaderRecordType type, const char* tag,
00078 //                 const char* value, const char* keyID = NULL);
00079 
00080     // Set the specified tag to the specified value in the HD header.
00081     bool setHDTag(const char* tag, const char* value);
00082 
00083     // Set the specified tag to the specified value in the SQ header with
00084     // the specified name.
00085     // If the header does not yet exist, the header is added.
00086     bool setSQTag(const char* tag, const char* value, const char* name);
00087 
00088     // Set the specified tag to the specified value in the RG header with
00089     // the read group identifier.
00090     // If the header does not yet exist, the header is added.
00091     bool setRGTag(const char* tag, const char* value, const char* id);
00092 
00093     // Set the specified tag to the specified value in the PG header with
00094     // the specified id.
00095     // If the header does not yet exist, the header is added.
00096     bool setPGTag(const char* tag, const char* value, const char* id);
00097 
00098     // Add the HD record to the header.
00099     // Note: it adds a pointer to the passed in header record.  The header
00100     // record will be deleted when it is cleaned up from this header.
00101     bool addHD(SamHeaderHD* hd);
00102 
00103     // Add the SQ record to the header.
00104     // Note: it adds a pointer to the passed in header record.  The header
00105     // record will be deleted when it is cleaned up from this header.
00106     bool addSQ(SamHeaderSQ* sq);
00107 
00108     // Add the RG record to the header.
00109     // Note: it adds a pointer to the passed in header record.  The header
00110     // record will be deleted when it is cleaned up from this header.
00111     bool addRG(SamHeaderRG* rg);
00112 
00113     // Add the PG record to the header.
00114     // Note: it adds a pointer to the passed in header record.  The header
00115     // record will be deleted when it is cleaned up from this header.
00116     bool addPG(SamHeaderPG* pg);
00117 
00118     ////////////////////////////////////////////////////////////////////////
00119     // Remove entries from the header
00120     ////////////////////////////////////////////////////////////////////////
00121     bool removeHD();  // Remove the HD record.
00122     bool removeSQ(const char* name); // Remove SQ record with the specified key.
00123     bool removeRG(const char* id); // Remove RG record with the specified key.
00124     bool removePG(const char* id); // Remove PG record with the specified key.
00125 
00126 
00127     ////////////////////////////////////////////////////////////////////////
00128     // Read the header / get values from the header
00129     ////////////////////////////////////////////////////////////////////////
00130     SamStatus::Status setHeaderFromBamFile(IFILE filePtr); // NOTE(review): presumably reads the header from the open BAM file - confirm.
00131     // Get the value associated with the specified tag on the HD line.
00132     const char* getHDTagValue(const char* tag);
00133     // Get the value associated with the specified tag on the SQ line with
00134     // the specified sequence name.
00135     const char* getSQTagValue(const char* tag, const char* name);
00136     // Get the value associated with the specified tag on the RG line with
00137     // the specified read group identifier.
00138     const char* getRGTagValue(const char* tag, const char* id);
00139     // Get the value associated with the specified tag on the PG line with
00140     // the specified id.
00141     const char* getPGTagValue(const char* tag, const char* id);
00142 
00143     // Get the number of SQ objects.
00144     int getNumSQs();
00145 
00146     // Get the number of RG objects.
00147     int getNumRGs();
00148 
00149     // Get the number of PG objects.
00150     int getNumPGs();
00151 
00152     // Get the HD object.
00153     SamHeaderHD* getHD();
00154 
00155     // Get the SQ object with the specified sequence name.
00156     SamHeaderSQ* getSQ(const char* name);
00157 
00158     // Get the RG object with the specified read group identifier.
00159     SamHeaderRG* getRG(const char* id);
00160 
00161     // Get the PG object with the specified id.
00162     SamHeaderPG* getPG(const char* id);
00163 
00164 //     //////////////////////////////////
00165 //     // Set methods for header fields.
00166 //     bool setVersion(const char* version);
00167 //     bool setSortOrder(const char* sortOrder);
00168 //     bool addSequenceName(const char* sequenceName);
00169 //     bool setSequenceLength(const char* keyID, int sequenceLength);
00170 //     bool setGenomeAssemblyId(const char* keyID, const char* genomeAssemblyId);
00171 //     bool setMD5Checksum(const char* keyID, const char* md5sum);
00172 //     bool setURI(const char* keyID, const char* uri);
00173 //     bool setSpecies(const char* keyID, const char* species);
00174 //     bool addReadGroupID(const char* readGroupID);
00175 //     bool setSample(const char* keyID, const char* sample);
00176 //     bool setLibrary(const char* keyID, const char* library);
00177 //     bool setDescription(const char* keyID, const char* description);
00178 //     bool setPlatformUnit(const char* keyID, const char* platform);
00179 //     bool setPredictedMedianInsertSize(const char* keyID, const char* isize);
00180 //     bool setSequencingCenter(const char* keyID, const char* center);
00181 //     bool setRunDate(const char* keyID, const char* runDate);
00182 //     bool setTechnology(const char* keyID, const char* technology);
00183 //     bool addProgram(const char* programID);
00184 //     bool setProgramVersion(const char* keyID, const char* version);
00185 //     bool setCommandLine(const char* keyID, const char* commandLine);
00186     
00187 //     ///////////////////////////////////
00188 //     // Get methods for header fields.
00189 //     // Returns the number of SQ entries in the header.
00190 //     int32_t getSequenceDictionaryCount();
00191     // Return the Sort Order value that is set in the Header.
00192     // If this field does not exist, "" is returned.
00193     const char* getSortOrder();
00194 
00195 
00196     // DEPRECATED (NOTE(review): presumably superseded by getSortOrder - confirm).
00197     const char* getTagSO();
00198 
00199     // Get the next SQ header record.  After all SQ headers have been retrieved,
00200     // NULL is returned until a reset is called.
00201     SamHeaderRecord* getNextSQRecord();
00202 
00203     // Get the next RG header record.  After all RG headers have been retrieved,
00204     // NULL is returned until a reset is called.
00205     SamHeaderRecord* getNextRGRecord();
00206 
00207     // Get the next PG header record.  After all PG headers have been retrieved,
00208     // NULL is returned until a reset is called.
00209     SamHeaderRecord* getNextPGRecord();
00210 
00211     // Reset to the beginning of the header records so the next call
00212     // to getNextSQRecord returns the first SQ header record.
00213     void resetSQRecordIter();
00214 
00215     // Reset to the beginning of the header records so the next call
00216     // to getNextRGRecord returns the first RG header record.
00217     void resetRGRecordIter();
00218 
00219     // Reset to the beginning of the header records so the next call
00220     // to getNextPGRecord returns the first PG header record.
00221     void resetPGRecordIter();
00222 
00223     // Get the next header record of the specified type.
00224     // Pass in the index to start looking at and the type to look for.
00225     // The index is updated through the reference parameter.
00226     // After all headers of that type have been retrieved,
00227     // NULL is returned until a reset is called for that type.
00228     SamHeaderRecord* getNextHeaderRecord(uint32_t& index, 
00229                                          SamHeaderRecord::SamHeaderRecordType headerType);
00230 
00231     // Get the next header record.  After all headers have been retrieved,
00232     // NULL is returned until a reset is called.  Does not return the
00233     // Comment lines.
00234     // NOTE: both getNextHeaderRecord and getNextHeaderLine increment the
00235     // same iterator.
00236     SamHeaderRecord* getNextHeaderRecord();
00237 
00238 
00239     // Set the passed in string to the next header line.  The passed in
00240     // string will be overwritten.  If there are no more header lines or there
00241     // is an error, false is returned and the passed in string is set to ""
00242     // until a reset is called.
00243     // Will also return the comment lines.
00244     // NOTE: both getNextHeaderRecord and getNextHeaderLine increment the
00245     // same iterator.
00246     bool getNextHeaderLine(std::string &headerLine);
00247 
00248     // Reset to the beginning of the header records so the next call
00249     // to getNextHeaderRecord returns the first header line.
00250     void resetHeaderRecordIter();
00251    
00252     // Returns the comment on the next comment line.  Returns "" if all comment
00253     // lines have been returned, until resetCommentIter is called.
00254     const char* getNextComment();
00255 
00256     // Resets to the beginning of the comments so getNextComment returns
00257     // the first comment.
00258     void resetCommentIter();
00259 
00260     // Add a comment.
00261     bool addComment(const char* comment);
00262 
00263     // Populate the reference info from the SQ fields.
00264     void generateReferenceInfo();
00265 
00266 
00267 private:
00268     // Parse the header string.
00269     bool parseHeader(String& header);
00270 
00271     // Parse the specified line of the header.
00272     bool parseHeaderLine(const String& headerLine);
00273 
00274     // Set the passed in string to the header line at the specified index.
00275     // It does NOT clear the current contents of header.
00276     bool getHeaderLine(unsigned int index, std::string& header) const;
00277 
00278     int16_t makeKey(char ch1, char ch2) // Build a 16-bit key from two characters.
00279     {
00280         return((ch1 << 8) + ch2); // ch1 shifted into the high byte plus ch2; where char is signed, a negative ch2 borrows from it.
00281     }
00282 
00283     // Only one HD type is allowed per file.
00284     SamHeaderHD* myHD;
00285 
00286     // There can be multiple SQ Types, indexed by SN.
00287     StringHash mySQs;
00288 
00289     // There can be multiple RG Types, indexed by ID.
00290     StringHash myRGs;
00291 
00292     // There can be multiple PG types, indexed by ID.
00293     StringHash myPGs;
00294 
00295     // Reference Name information
00296     SamReferenceInfo myReferenceInfo;
00297 
00298     // Vector of comments
00299     std::vector<std::string> myComments;
00300 
00301     std::vector<SamHeaderRecord*> myHeaderRecords; // NOTE(review): presumably every header record, in file order - confirm.
00302 
00303     uint32_t myCurrentSQIndex; // NOTE(review): presumably the getNextSQRecord position - confirm.
00304 
00305     uint32_t myCurrentRGIndex; // NOTE(review): presumably the getNextRGRecord position - confirm.
00306 
00307     uint32_t myCurrentPGIndex; // NOTE(review): presumably the getNextPGRecord position - confirm.
00308 
00309     uint32_t myCurrentHeaderIndex; // NOTE(review): presumably shared by getNextHeaderRecord() and getNextHeaderLine - confirm.
00310 
00311     uint32_t myCurrentCommentIndex; // NOTE(review): presumably the getNextComment position - confirm.
00312 
00313     static const std::string EMPTY_RETURN; // NOTE(review): presumably the "" handed back when nothing is found - confirm.
00314 };
00315 
00316 #endif
00317 
Generated on Wed Nov 17 15:38:27 2010 for StatGen Software by  doxygen 1.6.3