00001 /* 00002 * Copyright (C) 2010 Regents of the University of Michigan 00003 * 00004 * This program is free software: you can redistribute it and/or modify 00005 * it under the terms of the GNU General Public License as published by 00006 * the Free Software Foundation, either version 3 of the License, or 00007 * (at your option) any later version. 00008 * 00009 * This program is distributed in the hope that it will be useful, 00010 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 * GNU General Public License for more details. 00013 * 00014 * You should have received a copy of the GNU General Public License 00015 * along with this program. If not, see <http://www.gnu.org/licenses/>. 00016 */ 00017 00018 #ifndef __SAM_FILE_HEADER_H__ 00019 #define __SAM_FILE_HEADER_H__ 00020 00021 #include <map> 00022 #include <stdint.h> 00023 00024 #include "SamReferenceInfo.h" 00025 #include "SamHeaderHD.h" 00026 #include "SamHeaderSQ.h" 00027 #include "SamHeaderRG.h" 00028 #include "SamHeaderPG.h" 00029 #include "SamStatus.h" 00030 00031 /// This class allows a user to get/set the fields in a SAM/BAM Header. 00032 /// Sam/Bam headers contain comments and multiple SamHeaderRecords 00033 /// (HD, SQs, RGs, PGs) comprised of tag/value pairs with each tag only 00034 /// appearing once within a specific record. 00035 class SamFileHeader 00036 { 00037 public: 00038 SamFileHeader(); 00039 ~SamFileHeader(); 00040 00041 ///////////////////////////// 00042 /// @name Copying a Header 00043 /// These methods are ways of copying the contents of one header into 00044 /// another one. 00045 //@{ 00046 00047 /// Copy Constructor copies the specified header into this one. 00048 SamFileHeader(const SamFileHeader& header); 00049 00050 /// Overload operator = to copy the passed in header into this header. 00051 SamFileHeader & operator = (const SamFileHeader& header); 00052 00053 /// Copy method copies the passed in header into this header. 00054 bool copy(const SamFileHeader& header); 00055 //@} 00056 00057 /// Initialize the header. 00058 void resetHeader(); 00059 00060 ///////////////////////////// 00061 /// @name Get the Entire Header 00062 /// Get the entire header as a single string. 00063 //@{ 00064 00065 /// Set the passed in string to the entire header string, clearing its 00066 /// current contents. 00067 /// \return true if successfully set (even if set to "") 00068 bool getHeaderString(std::string& header) const; 00069 00070 //@} 00071 00072 /// Get the reference ID for the specified reference name (chromosome). 00073 /// If addID is set to true, a reference id will be created for the 00074 /// referenceName if one does not already exist. If addID is set to 00075 /// false (default), it will return SamReferenceInfo::NO_REF_ID. 00076 int getReferenceID(const String & referenceName, bool addID = false); 00077 00078 /// Get the reference ID for the specified reference name (chromosome). 00079 /// If addID is set to true, a reference id will be created for the 00080 /// referenceName if one does not already exist. If addID is set to 00081 /// false (default), it will return SamReferenceInfo::NO_REF_ID. 00082 int getReferenceID(const char* referenceName, bool addID = false); 00083 00084 /// Return the reference name (chromosome) for the specified reference id. 00085 const String & getReferenceLabel(int id) const; 00086 00087 /// Get the Reference Information 00088 const SamReferenceInfo* getReferenceInfo() const; 00089 00090 /// Add reference sequence name and reference sequence length to the header. 00091 void addReferenceInfo(const char* referenceSequenceName, 00092 int32_t referenceSequenceLength); 00093 00094 /// Populate the reference info from the SQ fields. 00095 void generateReferenceInfo(); 00096 00097 //////////////////////////////////////////////////////////////////////// 00098 // Set Values in the header 00099 //////////////////////////////////////////////////////////////////////// 00100 00101 ///////////////////////////////////////// 00102 /// @name Adding an entire header/comment line. 00103 /// These methods are ways of adding an entire header line at once. 00104 //@{ 00105 00106 /// Add a header line that is just one tag with a const char* value. 00107 /// Note: This method will only do one tag per type on a line, so if a 00108 /// type has multiple tags, the whole line needs to be added at once, 00109 /// and a different method should be used. 00110 bool addHeaderLine(const char* type, const char* tag, const char* value); 00111 00112 /// Add a header line that is already preformatted in a const char* without 00113 /// a trailing "\n". 00114 bool addHeaderLine(const char* headerLine); 00115 00116 /// Add the specified comment to the header (do not include "@CO" or "\n"). 00117 /// \return true if successfully added, false if not. 00118 bool addComment(const char* comment); 00119 00120 //@} 00121 00122 00123 ///////////////////////////////////////// 00124 /// @name Set/Add/Remove a Single Tag 00125 /// The passed in tag should be the two character SAM tag as defined 00126 /// in the SAM spec. A tag is removed from the header record by setting 00127 /// it to "". For the SQ and RG header types, the key tags (SN for SQ 00128 /// and ID for RG) may not be modified or removed once set. This is 00129 /// because these values are used as a lookup key for the header record, 00130 /// so the entire record must be removed. 00131 //@{ 00132 00133 // // Set the specified header type tag to the specified value in the 00134 // // header with the specified keyID. keyID must be specified when 00135 // // type = SQ, RG, or PG. 00136 // bool setTag(SamHeaderRecord::SamHeaderRecordType type, const char* tag, 00137 // const char* value, const char* keyID = NULL); 00138 00139 /// Set the specified tag to the specified value in the HD header, remove 00140 /// the tag by specifying value="". 00141 /// \return true if the tag was successfully set, false if not. 00142 bool setHDTag(const char* tag, const char* value); 00143 00144 /// Set the specified tag to the specified value in the SQ header with 00145 /// the specified name, remove the tag by specifying value="". If the 00146 /// header does not yet exist, the header is added and so is the SN tag 00147 /// with the value set to the passed in name. The SN tag may not be 00148 /// modified or removed after it is set unless the entire record is deleted. 00149 /// \return true if the tag was successfully set, false if not. 00150 bool setSQTag(const char* tag, const char* value, const char* name); 00151 00152 /// Set the specified tag to the specified value in the RG header with 00153 /// the specified id, remove the tag by specifying value="". If the 00154 /// header does not yet exist, the header is added and so is the ID tag 00155 /// with the value set to the passed in id. The ID tag may not be 00156 /// modified or removed after it is set unless the entire record is deleted. 00157 /// \return true if the tag was successfully set, false if not. 00158 bool setRGTag(const char* tag, const char* value, const char* id); 00159 00160 /// Set the specified tag to the specified value in the PG header with 00161 /// the specified id, remove the tag by specifying value="". If the 00162 /// header does not yet exist, the header is added and so is the ID tag 00163 /// with the value set to the passed in id. The ID tag may not be 00164 /// modified or removed after it is set unless the entire record is deleted. 00165 /// \return true if the tag was successfully set, false if not. 00166 bool setPGTag(const char* tag, const char* value, const char* id); 00167 00168 //@} 00169 00170 ///////////////////////////////////////// 00171 /// @name Add an Already Setup SamHeaderRecord 00172 /// NOTE: These methods add a pointer to the passed in record. 00173 /// The header record will be deleted when it's cleaned up from this header. 00174 /// NOTE: Do NOT delete the passed in record, the SamFileHeader class 00175 /// takes care of that itself. 00176 //@{ 00177 00178 /// Add the HD record to the header. 00179 /// Note: it adds a pointer to the passed in header record. The header 00180 /// record will be deleted when it is cleaned up from this header. 00181 /// \ returns true if the record was successfully added, false otherwise. 00182 bool addHD(SamHeaderHD* hd); 00183 00184 /// Add the SQ record to the header. 00185 /// Note: it adds a pointer to the passed in header record. The header 00186 /// record will be deleted when it is cleaned up from this header. 00187 /// \ returns true if the record was successfully added, false otherwise. 00188 bool addSQ(SamHeaderSQ* sq); 00189 00190 /// Add the RG record to the header. 00191 /// Note: it adds a pointer to the passed in header record. The header 00192 /// record will be deleted when it is cleaned up from this header. 00193 /// \ returns true if the record was successfully added, false otherwise. 00194 bool addRG(SamHeaderRG* rg); 00195 00196 /// Add the PG record to the header. 00197 /// Note: it adds a pointer to the passed in header record. The header 00198 /// record will be deleted when it is cleaned up from this header. 00199 /// \ returns true if the record was successfully added, false otherwise. 00200 bool addPG(SamHeaderPG* pg); 00201 00202 //@} 00203 00204 //////////////////////////////////////////////////////////////////////// 00205 /// @name Remove an Entire Header Record 00206 //@{ 00207 00208 /// Remove the HD record. 00209 /// \return true if successfully removed or did not exist, false if 00210 /// the record still exists. 00211 bool removeHD(); 00212 00213 /// Remove SQ record with the specified key. 00214 /// \return true if successfully removed or did not exist, false if 00215 /// the record still exists. 00216 bool removeSQ(const char* name); 00217 00218 /// Remove RG record with the specified key. 00219 /// \return true if successfully removed or did not exist, false if 00220 /// the record still exists. 00221 bool removeRG(const char* id); 00222 00223 /// Remove PG record with the specified key. 00224 /// \return true if successfully removed or did not exist, false if 00225 /// the record still exists. 00226 bool removePG(const char* id); 00227 00228 //@} 00229 00230 00231 //////////////////////////////////////////////////////////////////////// 00232 // 00233 //////////////////////////////////////////////////////////////////////// 00234 SamStatus::Status setHeaderFromBamFile(IFILE filePtr); 00235 00236 00237 //////////////////////////////////////////////////////////////////////// 00238 /// @name Get a Specific Tag 00239 /// These methods return the value associated with the specified tag. 00240 /// If the tag does not exist in the record "" is returned. 00241 /// 00242 /// For SQ, RG, and PG the value returned is for the tag associated with 00243 /// the specified key (name/id). If a record with that key does not exist 00244 /// or if the tag does not exist for the record with that key, "" is 00245 /// returned. 00246 //@{ 00247 00248 /// Returns the value associated with the specified HD tag, returning "" if 00249 /// the tag does not exist in the header. 00250 const char* getHDTagValue(const char* tag); 00251 00252 /// Get the value associated with the specified tag on the SQ line with 00253 /// the specified sequence name, returning "" if the tag or key does 00254 /// not exist. 00255 const char* getSQTagValue(const char* tag, const char* name); 00256 00257 /// Get the value associated with the specified tag on the RG line with 00258 /// the specified read group identifier, returning "" if the tag or key does 00259 /// not exist. 00260 const char* getRGTagValue(const char* tag, const char* id); 00261 00262 /// Get the value associated with the specified tag on the RG line with 00263 /// the specified id, returning "" if the tag or key does 00264 /// not exist. 00265 const char* getPGTagValue(const char* tag, const char* id); 00266 00267 //@} 00268 00269 /// Get the number of SQ objects. 00270 int getNumSQs(); 00271 00272 /// Get the number of RG objects. 00273 int getNumRGs(); 00274 00275 /// Get the number of PG objects. 00276 int getNumPGs(); 00277 00278 //////////////////////////////////////////////////////////////////////// 00279 /// @name Get a Specific Header Record 00280 /// These methods return a reference to the specific record that was 00281 /// requested, returning NULL if that record does not exist in the header. 00282 /// 00283 /// The returned record can be modified to add/remove some tags. 00284 /// Since a reference is returned, the SamHeaderFile automatically 00285 /// reflects these changes. 00286 //@{ 00287 00288 /// Get the HD object, returning NULL if there is no HD record. 00289 SamHeaderHD* getHD(); 00290 00291 /// Get the SQ object with the specified sequence name, returning NULL 00292 /// if there is no SQ object with that key. 00293 SamHeaderSQ* getSQ(const char* name); 00294 00295 /// Get the RG object with the specified read group identifier, returning 00296 /// NULL if there is no RG object with that key.. 00297 SamHeaderRG* getRG(const char* id); 00298 00299 /// Get the PG object with the specified id, returning NULL 00300 /// if there is no PG object with that key.. 00301 SamHeaderPG* getPG(const char* id); 00302 00303 //@} 00304 00305 // ////////////////////////////////// 00306 // // Set methods for header fields. 00307 // bool setVersion(const char* version); 00308 // bool setSortOrder(const char* sortOrder); 00309 // bool addSequenceName(const char* sequenceName); 00310 // bool setSequenceLength(const char* keyID, int sequenceLength); 00311 // bool setGenomeAssemblyId(const char* keyID, const char* genomeAssemblyId); 00312 // bool setMD5Checksum(const char* keyID, const char* md5sum); 00313 // bool setURI(const char* keyID, const char* uri); 00314 // bool setSpecies(const char* keyID, const char* species); 00315 // bool addReadGroupID(const char* readGroupID); 00316 // bool setSample(const char* keyID, const char* sample); 00317 // bool setLibrary(const char* keyID, const char* library); 00318 // bool setDescription(const char* keyID, const char* description); 00319 // bool setPlatformUnit(const char* keyID, const char* platform); 00320 // bool setPredictedMedianInsertSize(const char* keyID, const char* isize); 00321 // bool setSequencingCenter(const char* keyID, const char* center); 00322 // bool setRunDate(const char* keyID, const char* runDate); 00323 // bool setTechnology(const char* keyID, const char* technology); 00324 // bool addProgram(const char* programID); 00325 // bool setProgramVersion(const char* keyID, const char* version); 00326 // bool setCommandLine(const char* keyID, const char* commandLine); 00327 00328 // /////////////////////////////////// 00329 // // Get methods for header fields. 00330 // // Returns the number of SQ entries in the header. 00331 // int32_t getSequenceDictionaryCount(); 00332 00333 /// Return the Sort Order value that is set in the Header, returning "" 00334 /// if this field does not exist. 00335 const char* getSortOrder(); 00336 00337 00338 /// DEPRECATED 00339 const char* getTagSO(); 00340 00341 ///////////////////////////// 00342 /// @name Get the Header Record/Comment/Line by Record/Comment/Line 00343 /// These methods iterate through the header. 00344 /// NOTE: both getNextHeaderRecord and getNextHeaderLine increment the 00345 /// same iterator. getNextHeaderRecord that takes a header type 00346 /// uses the same iterator as the getNextXXRecord with that type. 00347 /// Otherwise the iterators are independent. 00348 //@{ 00349 00350 /// Get the next SQ header record. After all SQ headers have been 00351 /// retrieved, NULL is returned until a reset is called. 00352 /// Independent from getNextHeaderRecord, getNextHeaderLine and the 00353 /// other getNextXXRecord methods and the associated reset methods. 00354 SamHeaderRecord* getNextSQRecord(); 00355 00356 /// Get the next RG header record. After all RG headers have been 00357 /// retrieved, NULL is returned until a reset is called. 00358 /// Independent from getNextHeaderRecord, getNextHeaderLine and the 00359 /// other getNextXXRecord methods and the associated reset methods. 00360 SamHeaderRecord* getNextRGRecord(); 00361 00362 /// Get the next PG header record. After all PG headers have been 00363 /// retrieved, NULL is returned until a reset is called. 00364 /// Independent from getNextHeaderRecord, getNextHeaderLine and the 00365 /// other getNextXXRecord methods and the associated reset methods. 00366 SamHeaderRecord* getNextPGRecord(); 00367 00368 /// Reset to the beginning of the header records so the next call 00369 /// to getNextSQRecord returns the first SQ header record. 00370 void resetSQRecordIter(); 00371 00372 /// Reset to the beginning of the header records so the next call 00373 /// to getNextRGRecord returns the first RG header record. 00374 void resetRGRecordIter(); 00375 00376 /// Reset to the beginning of the header records so the next call 00377 /// to getNextPGRecord returns the first PG header record. 00378 void resetPGRecordIter(); 00379 00380 /// Get the next header record of the specified type starting from the 00381 /// specified index and update the index. 00382 /// After all headers of that type have been retrieved, 00383 /// NULL is returned until a reset is called for that type. 00384 SamHeaderRecord* getNextHeaderRecord(uint32_t& index, 00385 SamHeaderRecord::SamHeaderRecordType headerType); 00386 00387 /// Get the next header record, but not comment line. After all headers 00388 /// have been retrieved, NULL is returned until a reset is called. 00389 /// NOTE: both getNextHeaderRecord and getNextHeaderLine increment the 00390 /// same iterator. 00391 SamHeaderRecord* getNextHeaderRecord(); 00392 00393 /// Set the passed in string to the next header/comment line, overwritting 00394 /// the passed in string. If there are no more header lines or there 00395 /// is an error, false is returned and the passed in string is set to "" 00396 /// until a rest is called. 00397 /// NOTE: both getNextHeaderRecord and getNextHeaderLine increment the 00398 /// same iterator. 00399 bool getNextHeaderLine(std::string &headerLine); 00400 00401 /// Reset to the beginning of the header records so the next call 00402 /// to getNextHeaderRecord returns the first header line. 00403 void resetHeaderRecordIter(); 00404 00405 /// Returns the comment on the next comment line. Returns "" if all comment 00406 /// lines have been returned, until resetCommentIter is called. 00407 const char* getNextComment(); 00408 00409 /// Resets to the beginning of the comments so getNextComment returns 00410 /// the first comment. 00411 void resetCommentIter(); 00412 00413 //@} 00414 00415 00416 private: 00417 // Parse the header string. 00418 bool parseHeader(String& header); 00419 00420 // Parse the specified line of the header. 00421 bool parseHeaderLine(const String& headerLine); 00422 00423 // Set the passed in string to the header line at the specified index. 00424 // It does NOT clear the current contents of header. 00425 bool getHeaderLine(unsigned int index, std::string& header) const; 00426 00427 int16_t makeKey(char ch1, char ch2) 00428 { 00429 return((ch1 << 8) + ch2); 00430 } 00431 00432 // Only one HD type is allowed per file. 00433 SamHeaderHD* myHD; 00434 00435 // There can be multiple SQ Types, indexed by SN. 00436 StringHash mySQs; 00437 00438 // There can be multiple RG Types, indexed by ID. 00439 StringHash myRGs; 00440 00441 // There can be multiple PG types, indexed by ID. 00442 StringHash myPGs; 00443 00444 // Reference Name information 00445 SamReferenceInfo myReferenceInfo; 00446 00447 // Vector of comments 00448 std::vector<std::string> myComments; 00449 00450 std::vector<SamHeaderRecord*> myHeaderRecords; 00451 00452 uint32_t myCurrentSQIndex; 00453 00454 uint32_t myCurrentRGIndex; 00455 00456 uint32_t myCurrentPGIndex; 00457 00458 uint32_t myCurrentHeaderIndex; 00459 00460 uint32_t myCurrentCommentIndex; 00461 00462 static const std::string EMPTY_RETURN; 00463 }; 00464 00465 #endif 00466