00001 /* 00002 * Copyright (C) 2010 Regents of the University of Michigan 00003 * 00004 * This program is free software: you can redistribute it and/or modify 00005 * it under the terms of the GNU General Public License as published by 00006 * the Free Software Foundation, either version 3 of the License, or 00007 * (at your option) any later version. 00008 * 00009 * This program is distributed in the hope that it will be useful, 00010 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 * GNU General Public License for more details. 00013 * 00014 * You should have received a copy of the GNU General Public License 00015 * along with this program. If not, see <http://www.gnu.org/licenses/>. 00016 */ 00017 00018 #ifndef __SAM_FILE_HEADER_H__ 00019 #define __SAM_FILE_HEADER_H__ 00020 00021 #include <map> 00022 #include <stdint.h> 00023 00024 #include "SamReferenceInfo.h" 00025 #include "SamHeaderHD.h" 00026 #include "SamHeaderSQ.h" 00027 #include "SamHeaderRG.h" 00028 #include "SamHeaderPG.h" 00029 #include "SamStatus.h" 00030 00031 class SamFileHeader 00032 { 00033 public: 00034 SamFileHeader(); 00035 ~SamFileHeader(); 00036 00037 // Copy Constructor 00038 SamFileHeader(const SamFileHeader& header); 00039 00040 // Overload operator = to copy the passed in header into this header. 00041 SamFileHeader & operator = (const SamFileHeader& header); 00042 00043 // Overload operator = to copy the passed in header into this header. 00044 bool copy(const SamFileHeader& header); 00045 00046 void resetHeader(); 00047 00048 // Set the passed in string to the entire header string. Clearing its 00049 // current contents. 00050 // Return true if successfully set (even if set to "") 00051 bool getHeaderString(std::string& header) const; 00052 00053 // Get the reference ID for the specified name. 00054 // If addID is set to true, a reference id will be created for the 00055 // referenceName if one does not already exist. If addID is set to 00056 // false (default), it will return SamReferenceInfo::NO_REF_ID. 00057 int getReferenceID(const String & referenceName, bool addID = false); 00058 int getReferenceID(const char* referenceName, bool addID = false); 00059 const String & getReferenceLabel(int id) const; 00060 00061 // Get the Reference Information 00062 const SamReferenceInfo* getReferenceInfo() const; 00063 00064 // Add reference sequence name and reference sequence length to the header. 00065 void addReferenceInfo(const char* referenceSequenceName, 00066 int32_t referenceSequenceLength); 00067 00068 //////////////////////////////////////////////////////////////////////// 00069 // Set Values in the header 00070 //////////////////////////////////////////////////////////////////////// 00071 00072 // Add a header line that is just one tag with a const char* value. 00073 bool addHeaderLine(const char* type, const char* tag, const char* value); 00074 // Add a header line that is already preformatted in a const char*. 00075 // It is assumed that the line does not contain a \n. 00076 bool addHeaderLine(const char* headerLine); 00077 00078 // // Set the specified header type tag to the specified value in the 00079 // // header with the specified keyID. keyID must be specified when 00080 // // type = SQ, RG, or PG. 00081 // bool setTag(SamHeaderRecord::SamHeaderRecordType type, const char* tag, 00082 // const char* value, const char* keyID = NULL); 00083 00084 // Set the specified tag to the specified value in the HD header. 00085 bool setHDTag(const char* tag, const char* value); 00086 00087 // Set the specified tag to the specified value in the SQ header with 00088 // the specified name. 00089 // If the header does not yet exist, the header is added. 00090 bool setSQTag(const char* tag, const char* value, const char* name); 00091 00092 // Set the specified tag to the specified value in the RG header with 00093 // the read group identifier. 00094 // If the header does not yet exist, the header is added. 00095 bool setRGTag(const char* tag, const char* value, const char* id); 00096 00097 // Set the specified tag to the specified value in the PG header with 00098 // the specified id. 00099 // If the header does not yet exist, the header is added. 00100 bool setPGTag(const char* tag, const char* value, const char* id); 00101 00102 // Add the HD record to the header. 00103 // Note: it adds a pointer to the passed in header record. The header 00104 // record will be deleted when it is cleaned up from this header. 00105 bool addHD(SamHeaderHD* hd); 00106 00107 // Add the SQ record to the header. 00108 // Note: it adds a pointer to the passed in header record. The header 00109 // record will be deleted when it is cleaned up from this header. 00110 bool addSQ(SamHeaderSQ* sq); 00111 00112 // Add the RG record to the header. 00113 // Note: it adds a pointer to the passed in header record. The header 00114 // record will be deleted when it is cleaned up from this header. 00115 bool addRG(SamHeaderRG* rg); 00116 00117 // Add the PG record to the header. 00118 // Note: it adds a pointer to the passed in header record. The header 00119 // record will be deleted when it is cleaned up from this header. 00120 bool addPG(SamHeaderPG* pg); 00121 00122 //////////////////////////////////////////////////////////////////////// 00123 // Remove entries from the header 00124 //////////////////////////////////////////////////////////////////////// 00125 bool removeHD(); // Remove the HD record. 00126 bool removeSQ(const char* name); // Remove SQ record with the specified key. 00127 bool removeRG(const char* id); // Remove RG record with the specified key. 00128 bool removePG(const char* id); // Remove PG record with the specified key. 00129 00130 00131 //////////////////////////////////////////////////////////////////////// 00132 // 00133 //////////////////////////////////////////////////////////////////////// 00134 SamStatus::Status setHeaderFromBamFile(IFILE filePtr); 00135 00136 const char* getHDTagValue(const char* tag); 00137 // Get the value associated with the specified tag on the SQ line with 00138 // the specified sequence name. 00139 const char* getSQTagValue(const char* tag, const char* name); 00140 // Get the value associated with the specified tag on the RG line with 00141 // the specified read group identifier. 00142 const char* getRGTagValue(const char* tag, const char* id); 00143 // Get the value associated with the specified tag on the RG line with 00144 // the specified id. 00145 const char* getPGTagValue(const char* tag, const char* id); 00146 00147 // Get the number of SQ objects. 00148 int getNumSQs(); 00149 00150 // Get the number of RG objects. 00151 int getNumRGs(); 00152 00153 // Get the number of PG objects. 00154 int getNumPGs(); 00155 00156 // Get the HD object. 00157 SamHeaderHD* getHD(); 00158 00159 // Get the SQ object with the specified sequence name. 00160 SamHeaderSQ* getSQ(const char* name); 00161 00162 // Get the RG object with the specified read group identifier. 00163 SamHeaderRG* getRG(const char* id); 00164 00165 // Get the PG object with the specified id. 00166 SamHeaderPG* getPG(const char* id); 00167 00168 // ////////////////////////////////// 00169 // // Set methods for header fields. 00170 // bool setVersion(const char* version); 00171 // bool setSortOrder(const char* sortOrder); 00172 // bool addSequenceName(const char* sequenceName); 00173 // bool setSequenceLength(const char* keyID, int sequenceLength); 00174 // bool setGenomeAssemblyId(const char* keyID, const char* genomeAssemblyId); 00175 // bool setMD5Checksum(const char* keyID, const char* md5sum); 00176 // bool setURI(const char* keyID, const char* uri); 00177 // bool setSpecies(const char* keyID, const char* species); 00178 // bool addReadGroupID(const char* readGroupID); 00179 // bool setSample(const char* keyID, const char* sample); 00180 // bool setLibrary(const char* keyID, const char* library); 00181 // bool setDescription(const char* keyID, const char* description); 00182 // bool setPlatformUnit(const char* keyID, const char* platform); 00183 // bool setPredictedMedianInsertSize(const char* keyID, const char* isize); 00184 // bool setSequencingCenter(const char* keyID, const char* center); 00185 // bool setRunDate(const char* keyID, const char* runDate); 00186 // bool setTechnology(const char* keyID, const char* technology); 00187 // bool addProgram(const char* programID); 00188 // bool setProgramVersion(const char* keyID, const char* version); 00189 // bool setCommandLine(const char* keyID, const char* commandLine); 00190 00191 // /////////////////////////////////// 00192 // // Get methods for header fields. 00193 // // Returns the number of SQ entries in the header. 00194 // int32_t getSequenceDictionaryCount(); 00195 // Return the Sort Order value that is set in the Header. 00196 // If this field does not exist, "" is returned. 00197 const char* getSortOrder(); 00198 00199 00200 // DEPRECATED 00201 const char* getTagSO(); 00202 00203 // Get the next SQ header record. After all SQ headers have been retrieved, 00204 // NULL is returned until a reset is called. 00205 SamHeaderRecord* getNextSQRecord(); 00206 00207 // Get the next RG header record. After all RG headers have been retrieved, 00208 // NULL is returned until a reset is called. 00209 SamHeaderRecord* getNextRGRecord(); 00210 00211 // Get the next PG header record. After all PG headers have been retrieved, 00212 // NULL is returned until a reset is called. 00213 SamHeaderRecord* getNextPGRecord(); 00214 00215 // Reset to the beginning of the header records so the next call 00216 // to getNextSQRecord returns the first SQ header record. 00217 void resetSQRecordIter(); 00218 00219 // Reset to the beginning of the header records so the next call 00220 // to getNextRGRecord returns the first RG header record. 00221 void resetRGRecordIter(); 00222 00223 // Reset to the beginning of the header records so the next call 00224 // to getNextPGRecord returns the first PG header record. 00225 void resetPGRecordIter(); 00226 00227 // Get the next header record of the specified type. 00228 // Pass in the index to start looking at and the type to look for. 00229 // Update the index. 00230 // After all headers of that type have been retrieved, 00231 // NULL is returned until a reset is called for that type. 00232 SamHeaderRecord* getNextHeaderRecord(uint32_t& index, 00233 SamHeaderRecord::SamHeaderRecordType headerType); 00234 00235 // Get the next header record. After all headers have been retrieved, 00236 // NULL is returned until a reset is called. Does not return the 00237 // Comment lines. 00238 // NOTE: both getNextHeaderRecord and getNextHeaderLine increment the 00239 // same iterator. 00240 SamHeaderRecord* getNextHeaderRecord(); 00241 00242 00243 // Set the passed in string to the next header line. The passed in 00244 // string will be overwritten. If there are no more header lines or there 00245 // is an error, false is returned and the passed in string is set to "" 00246 // until a rest is called. 00247 // Will also return the comment lines. 00248 // NOTE: both getNextHeaderRecord and getNextHeaderLine increment the 00249 // same iterator. 00250 bool getNextHeaderLine(std::string &headerLine); 00251 00252 // Reset to the beginning of the header records so the next call 00253 // to getNextHeaderRecord returns the first header line. 00254 void resetHeaderRecordIter(); 00255 00256 // Returns the comment on the next comment line. Returns "" if all comment 00257 // lines have been returned, until resetCommentIter is called. 00258 const char* getNextComment(); 00259 00260 // Resets to the beginning of the comments so getNextComment returns 00261 // the first comment. 00262 void resetCommentIter(); 00263 00264 // Add a comment. 00265 bool addComment(const char* comment); 00266 00267 // Populate the reference info from the SQ fields. 00268 void generateReferenceInfo(); 00269 00270 00271 private: 00272 // Parse the header string. 00273 bool parseHeader(String& header); 00274 00275 // Parse the specified line of the header. 00276 bool parseHeaderLine(const String& headerLine); 00277 00278 // Set the passed in string to the header line at the specified index. 00279 // It does NOT clear the current contents of header. 00280 bool getHeaderLine(unsigned int index, std::string& header) const; 00281 00282 int16_t makeKey(char ch1, char ch2) 00283 { 00284 return((ch1 << 8) + ch2); 00285 } 00286 00287 // Only one HD type is allowed per file. 00288 SamHeaderHD* myHD; 00289 00290 // There can be multiple SQ Types, indexed by SN. 00291 StringHash mySQs; 00292 00293 // There can be multiple RG Types, indexed by ID. 00294 StringHash myRGs; 00295 00296 // There can be multiple PG types, indexed by ID. 00297 StringHash myPGs; 00298 00299 // Reference Name information 00300 SamReferenceInfo myReferenceInfo; 00301 00302 // Vector of comments 00303 std::vector<std::string> myComments; 00304 00305 std::vector<SamHeaderRecord*> myHeaderRecords; 00306 00307 uint32_t myCurrentSQIndex; 00308 00309 uint32_t myCurrentRGIndex; 00310 00311 uint32_t myCurrentPGIndex; 00312 00313 uint32_t myCurrentHeaderIndex; 00314 00315 uint32_t myCurrentCommentIndex; 00316 00317 static const std::string EMPTY_RETURN; 00318 }; 00319 00320 #endif 00321