libStatGen Software
1
|
00001 /* 00002 * Copyright (C) 2010 Regents of the University of Michigan 00003 * 00004 * This program is free software: you can redistribute it and/or modify 00005 * it under the terms of the GNU General Public License as published by 00006 * the Free Software Foundation, either version 3 of the License, or 00007 * (at your option) any later version. 00008 * 00009 * This program is distributed in the hope that it will be useful, 00010 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 * GNU General Public License for more details. 00013 * 00014 * You should have received a copy of the GNU General Public License 00015 * along with this program. If not, see <http://www.gnu.org/licenses/>. 00016 */ 00017 00018 #ifndef __SAM_FILE_HEADER_H__ 00019 #define __SAM_FILE_HEADER_H__ 00020 00021 #include <map> 00022 #include <stdint.h> 00023 00024 #include "SamReferenceInfo.h" 00025 #include "SamHeaderHD.h" 00026 #include "SamHeaderSQ.h" 00027 #include "SamHeaderRG.h" 00028 #include "SamHeaderPG.h" 00029 00030 /// This class allows a user to get/set the fields in a SAM/BAM Header. 00031 /// Sam/Bam headers contain comments and multiple SamHeaderRecords 00032 /// (HD, SQs, RGs, PGs) comprised of tag/value pairs with each tag only 00033 /// appearing once within a specific record. 00034 class SamFileHeader 00035 { 00036 public: 00037 SamFileHeader(); 00038 ~SamFileHeader(); 00039 00040 ///////////////////////////// 00041 /// @name Copying a Header 00042 /// These methods are ways of copying the contents of one header into 00043 /// another one. 00044 //@{ 00045 00046 /// Copy Constructor copies the specified header into this one. 00047 SamFileHeader(const SamFileHeader& header); 00048 00049 /// Overload operator = to copy the passed in header into this header. 00050 SamFileHeader & operator = (const SamFileHeader& header); 00051 00052 /// Copy method copies the passed in header into this header. 00053 /// Returns true if at least one header line was successfully copied. 00054 bool copy(const SamFileHeader& header); 00055 //@} 00056 00057 /// Initialize the header. 00058 void resetHeader(); 00059 00060 ///////////////////////////// 00061 /// @name Get the Entire Header 00062 /// Get the entire header as a single string. 00063 //@{ 00064 00065 /// Set the passed in string to the entire header string, clearing its 00066 /// current contents. 00067 /// \return true if successfully set (even if set to "") 00068 bool getHeaderString(std::string& header) const; 00069 00070 //@} 00071 00072 /// Get the reference ID for the specified reference name (chromosome). 00073 /// If addID is set to true, a reference id will be created for the 00074 /// referenceName if one does not already exist. If addID is set to 00075 /// false (default), it will return SamReferenceInfo::NO_REF_ID. 00076 int getReferenceID(const String & referenceName, bool addID = false); 00077 00078 /// Get the reference ID for the specified reference name (chromosome). 00079 /// If addID is set to true, a reference id will be created for the 00080 /// referenceName if one does not already exist. If addID is set to 00081 /// false (default), it will return SamReferenceInfo::NO_REF_ID. 00082 int getReferenceID(const char* referenceName, bool addID = false); 00083 00084 /// Return the reference name (chromosome) for the specified reference id. 00085 const String & getReferenceLabel(int id) const; 00086 00087 /// Get the Reference Information 00088 const SamReferenceInfo& getReferenceInfo() const; 00089 00090 // Get the Reference Information for updating separately when reading 00091 // BAMs...should only be called by BamInterface. 00092 SamReferenceInfo& getReferenceInfoForBamInterface(); 00093 00094 //////////////////////////////////////////////////////////////////////// 00095 // Set Values in the header 00096 //////////////////////////////////////////////////////////////////////// 00097 00098 ///////////////////////////////////////// 00099 /// @name Adding an entire header/comment line. 00100 /// These methods are ways of adding an entire header line at once. 00101 //@{ 00102 00103 /// Add a header line that is just one tag with a const char* value. 00104 /// Note: This method will only do one tag per type on a line, so if a 00105 /// type has multiple tags, the whole line needs to be added at once, 00106 /// and a different method should be used. 00107 bool addHeaderLine(const char* type, const char* tag, const char* value); 00108 00109 /// Add a header line that is already preformatted in a const char*. 00110 /// Returns true if at least one header line was successfully added. 00111 bool addHeaderLine(const char* headerLine); 00112 00113 /// Add a header that is already preformatted in a const char*. 00114 /// Returns true if at least one header line was successfully added. 00115 bool addHeader(const char* header); 00116 00117 /// Add the specified comment to the header (do not include "@CO" or "\n"). 00118 /// \return true if successfully added, false if not. 00119 bool addComment(const char* comment); 00120 00121 //@} 00122 00123 00124 ///////////////////////////////////////// 00125 /// @name Set/Add/Remove a Single Tag 00126 /// The passed in tag should be the two character SAM tag as defined 00127 /// in the SAM spec. A tag is removed from the header record by setting 00128 /// it to "". For the SQ and RG header types, the key tags (SN for SQ 00129 /// and ID for RG) may not be modified or removed once set. This is 00130 /// because these values are used as a lookup key for the header record, 00131 /// so the entire record must be removed. 00132 //@{ 00133 00134 // // Set the specified header type tag to the specified value in the 00135 // // header with the specified keyID. keyID must be specified when 00136 // // type = SQ, RG, or PG. 00137 // bool setTag(SamHeaderRecord::SamHeaderRecordType type, const char* tag, 00138 // const char* value, const char* keyID = NULL); 00139 00140 /// Set the specified tag to the specified value in the HD header, remove 00141 /// the tag by specifying value="". 00142 /// \return true if the tag was successfully set, false if not. 00143 bool setHDTag(const char* tag, const char* value); 00144 00145 /// Set the specified tag to the specified value in the SQ header with 00146 /// the specified name, remove the tag by specifying value="". If the 00147 /// header does not yet exist, the tag must be "LN" and the header is added 00148 /// with the specified LN value and the SN value passed in name. 00149 /// The SN & LN tags may not be modified or removed after they are 00150 /// set unless the entire record is deleted. 00151 /// \return true if the tag was successfully set, false if not. 00152 bool setSQTag(const char* tag, const char* value, const char* name); 00153 00154 /// Set the specified tag to the specified value in the RG header with 00155 /// the specified id, remove the tag by specifying value="". If the 00156 /// header does not yet exist, the header is added and so is the ID tag 00157 /// with the value set to the passed in id. The ID tag may not be 00158 /// modified or removed after it is set unless the entire record is deleted. 00159 /// \return true if the tag was successfully set, false if not. 00160 bool setRGTag(const char* tag, const char* value, const char* id); 00161 00162 /// Set the specified tag to the specified value in the PG header with 00163 /// the specified id, remove the tag by specifying value="". If the 00164 /// header does not yet exist, the header is added and so is the ID tag 00165 /// with the value set to the passed in id. The ID tag may not be 00166 /// modified or removed after it is set unless the entire record is deleted. 00167 /// \return true if the tag was successfully set, false if not. 00168 bool setPGTag(const char* tag, const char* value, const char* id); 00169 00170 //@} 00171 00172 ///////////////////////////////////////// 00173 /// @name Add an Already Setup SamHeaderRecord 00174 /// NOTE: These methods add a pointer to the passed in record. 00175 /// The header record will be deleted when it's cleaned up from this header. 00176 /// NOTE: Do NOT delete the passed in record, the SamFileHeader class 00177 /// takes care of that itself. 00178 //@{ 00179 00180 /// Add the HD record to the header. 00181 /// Note: it adds a pointer to the passed in header record. The header 00182 /// record will be deleted when it is cleaned up from this header. 00183 /// \return true if the record was successfully added, false otherwise. 00184 bool addHD(SamHeaderHD* hd); 00185 00186 /// Add the SQ record to the header. 00187 /// Note: it adds a pointer to the passed in header record. The header 00188 /// record will be deleted when it is cleaned up from this header. 00189 /// \return true if the record was successfully added, false otherwise. 00190 bool addSQ(SamHeaderSQ* sq); 00191 00192 /// Add the RG record to the header. 00193 /// Note: it adds a pointer to the passed in header record. The header 00194 /// record will be deleted when it is cleaned up from this header. 00195 /// \return true if the record was successfully added, false otherwise. 00196 bool addRG(SamHeaderRG* rg); 00197 00198 /// Add the PG record to the header. 00199 /// Note: it adds a pointer to the passed in header record. The header 00200 /// record will be deleted when it is cleaned up from this header. 00201 /// \return true if the record was successfully added, false otherwise. 00202 bool addPG(SamHeaderPG* pg); 00203 00204 /// Add a copy of the specified header record to the header. 00205 /// Note: it creates a new header record that is identical to the specified 00206 /// one and adds it to the header. The passed in pointer will not be 00207 /// deleted due to this. 00208 /// \return true if the record was successfully added, false otherwise. 00209 bool addRecordCopy(const SamHeaderRecord& hdrRec); 00210 00211 //@} 00212 00213 //////////////////////////////////////////////////////////////////////// 00214 /// @name Remove an Entire Header Record 00215 //@{ 00216 00217 /// Remove the HD record. 00218 /// \return true if successfully removed or did not exist, false if 00219 /// the record still exists. 00220 bool removeHD(); 00221 00222 /// Remove SQ record with the specified key. 00223 /// NOTE: Does not remove it from the BAM index. 00224 /// \return true if successfully removed or did not exist, false if 00225 /// the record still exists. 00226 bool removeSQ(const char* name); 00227 00228 /// Remove RG record with the specified key. 00229 /// \return true if successfully removed or did not exist, false if 00230 /// the record still exists. 00231 bool removeRG(const char* id); 00232 00233 /// Remove PG record with the specified key. 00234 /// \return true if successfully removed or did not exist, false if 00235 /// the record still exists. 00236 bool removePG(const char* id); 00237 00238 //@} 00239 00240 //////////////////////////////////////////////////////////////////////// 00241 /// @name Get a Specific Tag 00242 /// These methods return the value associated with the specified tag. 00243 /// If the tag does not exist in the record "" is returned. 00244 /// 00245 /// For SQ, RG, and PG the value returned is for the tag associated with 00246 /// the specified key (name/id). If a record with that key does not exist 00247 /// or if the tag does not exist for the record with that key, "" is 00248 /// returned. 00249 //@{ 00250 00251 /// Returns the value associated with the specified HD tag, returning "" if 00252 /// the tag does not exist in the header. 00253 const char* getHDTagValue(const char* tag); 00254 00255 /// Get the value associated with the specified tag on the SQ line with 00256 /// the specified sequence name, returning "" if the tag or key does 00257 /// not exist. 00258 const char* getSQTagValue(const char* tag, const char* name); 00259 00260 /// Get the value associated with the specified tag on the RG line with 00261 /// the specified read group identifier, returning "" if the tag or key does 00262 /// not exist. 00263 const char* getRGTagValue(const char* tag, const char* id); 00264 00265 /// Get the value associated with the specified tag on the RG line with 00266 /// the specified id, returning "" if the tag or key does 00267 /// not exist. 00268 const char* getPGTagValue(const char* tag, const char* id); 00269 00270 //@} 00271 00272 /// Get the number of SQ objects. 00273 int getNumSQs(); 00274 00275 /// Get the number of RG objects. 00276 int getNumRGs(); 00277 00278 /// Get the number of PG objects. 00279 int getNumPGs(); 00280 00281 //////////////////////////////////////////////////////////////////////// 00282 /// @name Get a Specific Header Record 00283 /// These methods return a reference to the specific record that was 00284 /// requested, returning NULL if that record does not exist in the header. 00285 /// 00286 /// The returned record can be modified to add/remove some tags. 00287 /// Since a reference is returned, the SamHeaderFile automatically 00288 /// reflects these changes. 00289 //@{ 00290 00291 /// Get the HD object, returning NULL if there is no HD record. 00292 SamHeaderHD* getHD(); 00293 00294 /// Get the SQ object with the specified sequence name, returning NULL 00295 /// if there is no SQ object with that key. 00296 SamHeaderSQ* getSQ(const char* name); 00297 00298 /// Get the RG object with the specified read group identifier, returning 00299 /// NULL if there is no RG object with that key.. 00300 SamHeaderRG* getRG(const char* id); 00301 00302 /// Get the PG object with the specified id, returning NULL 00303 /// if there is no PG object with that key.. 00304 SamHeaderPG* getPG(const char* id); 00305 00306 //@} 00307 00308 // ////////////////////////////////// 00309 // // Set methods for header fields. 00310 // bool setVersion(const char* version); 00311 // bool setSortOrder(const char* sortOrder); 00312 // bool addSequenceName(const char* sequenceName); 00313 // bool setSequenceLength(const char* keyID, int sequenceLength); 00314 // bool setGenomeAssemblyId(const char* keyID, const char* genomeAssemblyId); 00315 // bool setMD5Checksum(const char* keyID, const char* md5sum); 00316 // bool setURI(const char* keyID, const char* uri); 00317 // bool setSpecies(const char* keyID, const char* species); 00318 // bool addReadGroupID(const char* readGroupID); 00319 // bool setSample(const char* keyID, const char* sample); 00320 // bool setLibrary(const char* keyID, const char* library); 00321 // bool setDescription(const char* keyID, const char* description); 00322 // bool setPlatformUnit(const char* keyID, const char* platform); 00323 // bool setPredictedMedianInsertSize(const char* keyID, const char* isize); 00324 // bool setSequencingCenter(const char* keyID, const char* center); 00325 // bool setRunDate(const char* keyID, const char* runDate); 00326 // bool setTechnology(const char* keyID, const char* technology); 00327 // bool addProgram(const char* programID); 00328 // bool setProgramVersion(const char* keyID, const char* version); 00329 // bool setCommandLine(const char* keyID, const char* commandLine); 00330 00331 // /////////////////////////////////// 00332 // // Get methods for header fields. 00333 // // Returns the number of SQ entries in the header. 00334 // int32_t getSequenceDictionaryCount(); 00335 00336 /// Return the Sort Order value that is set in the Header, returning "" 00337 /// if this field does not exist. 00338 const char* getSortOrder(); 00339 00340 00341 /// DEPRECATED 00342 const char* getTagSO(); 00343 00344 ///////////////////////////// 00345 /// @name Get the Header Record/Comment/Line by Record/Comment/Line 00346 /// These methods iterate through the header. 00347 /// NOTE: both getNextHeaderRecord and getNextHeaderLine increment the 00348 /// same iterator. getNextHeaderRecord that takes a header type 00349 /// uses the same iterator as the getNextXXRecord with that type. 00350 /// Otherwise the iterators are independent. 00351 //@{ 00352 00353 /// Get the next SQ header record. After all SQ headers have been 00354 /// retrieved, NULL is returned until a reset is called. 00355 /// Independent from getNextHeaderRecord, getNextHeaderLine and the 00356 /// other getNextXXRecord methods and the associated reset methods. 00357 SamHeaderRecord* getNextSQRecord(); 00358 00359 /// Get the next RG header record. After all RG headers have been 00360 /// retrieved, NULL is returned until a reset is called. 00361 /// Independent from getNextHeaderRecord, getNextHeaderLine and the 00362 /// other getNextXXRecord methods and the associated reset methods. 00363 SamHeaderRecord* getNextRGRecord(); 00364 00365 /// Get the next PG header record. After all PG headers have been 00366 /// retrieved, NULL is returned until a reset is called. 00367 /// Independent from getNextHeaderRecord, getNextHeaderLine and the 00368 /// other getNextXXRecord methods and the associated reset methods. 00369 SamHeaderRecord* getNextPGRecord(); 00370 00371 /// Reset to the beginning of the header records so the next call 00372 /// to getNextSQRecord returns the first SQ header record. 00373 void resetSQRecordIter(); 00374 00375 /// Reset to the beginning of the header records so the next call 00376 /// to getNextRGRecord returns the first RG header record. 00377 void resetRGRecordIter(); 00378 00379 /// Reset to the beginning of the header records so the next call 00380 /// to getNextPGRecord returns the first PG header record. 00381 void resetPGRecordIter(); 00382 00383 /// Get the next header record of the specified type starting from the 00384 /// specified index and update the index. 00385 /// After all headers of that type have been retrieved, 00386 /// NULL is returned until a reset is called for that type. 00387 SamHeaderRecord* getNextHeaderRecord(uint32_t& index, 00388 SamHeaderRecord::SamHeaderRecordType headerType); 00389 00390 /// Get the next header record, but not comment line. After all headers 00391 /// have been retrieved, NULL is returned until a reset is called. 00392 /// NOTE: both getNextHeaderRecord and getNextHeaderLine increment the 00393 /// same iterator. 00394 SamHeaderRecord* getNextHeaderRecord(); 00395 00396 /// Set the passed in string to the next header line, overwritting 00397 /// the passed in string. If there are no more header lines or there 00398 /// is an error, false is returned and the passed in string is set to "" 00399 /// until a rest is called. 00400 /// NOTE: both getNextHeaderRecord and getNextHeaderLine increment the 00401 /// same iterator. 00402 bool getNextHeaderLine(std::string &headerLine); 00403 00404 /// Reset to the beginning of the header records so the next call 00405 /// to getNextHeaderRecord returns the first header line. 00406 void resetHeaderRecordIter(); 00407 00408 /// Append all of the comment lines to the specified string. 00409 void appendCommentLines(std::string &commentLines); 00410 00411 /// Returns the comment on the next comment line. Returns "" if all comment 00412 /// lines have been returned, until resetCommentIter is called. 00413 const char* getNextComment(); 00414 00415 /// Resets to the beginning of the comments so getNextComment returns 00416 /// the first comment. 00417 void resetCommentIter(); 00418 00419 //@} 00420 00421 00422 /// Get the failure message if a method returned failure. 00423 const char* getErrorMessage() { return(myErrorMessage.c_str()); } 00424 00425 static const std::string EMPTY_RETURN; 00426 00427 private: 00428 // Parse the header string. 00429 bool parseHeader(String& header); 00430 00431 // Parse the specified line of the header. 00432 bool parseHeaderLine(const String& headerLine); 00433 00434 // Set the passed in string to the header line at the specified index. 00435 // It does NOT clear the current contents of header. 00436 bool getHeaderLine(unsigned int index, std::string& header) const; 00437 00438 int16_t makeKey(char ch1, char ch2) 00439 { 00440 return((ch1 << 8) + ch2); 00441 } 00442 00443 // Only one HD type is allowed per file. 00444 SamHeaderHD* myHD; 00445 00446 // There can be multiple SQ Types, indexed by SN. 00447 StringHash mySQs; 00448 00449 // There can be multiple RG Types, indexed by ID. 00450 StringHash myRGs; 00451 00452 // There can be multiple PG types, indexed by ID. 00453 StringHash myPGs; 00454 00455 // Reference Name information 00456 SamReferenceInfo myReferenceInfo; 00457 00458 // Vector of comments 00459 std::vector<std::string> myComments; 00460 00461 std::vector<SamHeaderRecord*> myHeaderRecords; 00462 00463 std::string myErrorMessage; 00464 00465 uint32_t myCurrentSQIndex; 00466 00467 uint32_t myCurrentRGIndex; 00468 00469 uint32_t myCurrentPGIndex; 00470 00471 uint32_t myCurrentHeaderIndex; 00472 00473 uint32_t myCurrentCommentIndex; 00474 }; 00475 00476 #endif 00477