libStatGen Software
1
|
00001 /* 00002 * Copyright (C) 2011 Regents of the University of Michigan 00003 * 00004 * This program is free software: you can redistribute it and/or modify 00005 * it under the terms of the GNU General Public License as published by 00006 * the Free Software Foundation, either version 3 of the License, or 00007 * (at your option) any later version. 00008 * 00009 * This program is distributed in the hope that it will be useful, 00010 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 * GNU General Public License for more details. 00013 * 00014 * You should have received a copy of the GNU General Public License 00015 * along with this program. If not, see <http://www.gnu.org/licenses/>. 00016 */ 00017 00018 #include "SamTags.h" 00019 #include "BaseUtilities.h" 00020 00021 const char* SamTags::BQ_TAG = "BQ"; 00022 const char SamTags::BQ_TAG_TYPE = 'Z'; 00023 const char* SamTags::MD_TAG = "MD"; 00024 const char SamTags::MD_TAG_TYPE = 'Z'; 00025 const char* SamTags::ORIG_POS_TAG = "OP"; 00026 const char SamTags::ORIG_POS_TAG_TYPE = 'i'; 00027 const char* SamTags::ORIG_CIGAR_TAG = "OC"; 00028 const char SamTags::ORIG_CIGAR_TAG_TYPE = 'Z'; 00029 const char* SamTags::ORIG_QUAL_TAG = "OQ"; 00030 const char SamTags::ORIG_QUAL_TAG_TYPE = 'Z'; 00031 00032 00033 // Create the MD tag for the specified input record and the genome. 00034 bool SamTags::createMDTag(String& outputMDtag, SamRecord& inputRec, 00035 GenomeSequence& genome) 00036 { 00037 outputMDtag.Clear(); 00038 // Get the cigar to use for determing alignment. 00039 Cigar* cigarInfo = inputRec.getCigarInfo(); 00040 if(cigarInfo == NULL) 00041 { 00042 throw(std::runtime_error("Cannot createMDTag - failed to get the cigar")); 00043 return(false); 00044 } 00045 int32_t queryIndex = Cigar::INDEX_NA; 00046 00047 // get where this read starts on the reference. 00048 uint32_t startOfReadOnRefIndex = 00049 genome.getGenomePosition(inputRec.getReferenceName()); 00050 if(startOfReadOnRefIndex == (uint32_t)INVALID_CHROMOSOME_INDEX) 00051 { 00052 // Failed to find the reference for this chromosome, so return false. 00053 return(false); 00054 } 00055 startOfReadOnRefIndex += inputRec.get0BasedPosition(); 00056 00057 // Track the number of consecutive matches. 00058 int32_t matchCount = 0; 00059 // Track if it is currently in a deletion so it knows when not to add 00060 // a '^'. 00061 bool currentDeletion = false; 00062 00063 // Loop through the Reference indices (ignores insertions/pads/clips). 00064 for(int refOffset = 0; 00065 refOffset < cigarInfo->getExpectedReferenceBaseCount(); 00066 ++refOffset) 00067 { 00068 // Get the query index for this reference position.. 00069 queryIndex = cigarInfo->getQueryIndex(refOffset); 00070 00071 char refBase = genome[startOfReadOnRefIndex + refOffset]; 00072 00073 if(queryIndex != Cigar::INDEX_NA) 00074 { 00075 // Both the reference and the read have a base, so get the bases. 00076 char readBase = inputRec.getSequence(queryIndex); 00077 currentDeletion = false; 00078 00079 // If neither base is unknown and they are the same, count it 00080 // as a match. 00081 if(!BaseUtilities::isAmbiguous(readBase) && 00082 !BaseUtilities::isAmbiguous(refBase) && 00083 (BaseUtilities::areEqual(readBase, refBase))) 00084 { 00085 // Match, so update counter. 00086 ++matchCount; 00087 } 00088 else 00089 { 00090 // Mismatch, so output the number of matches if any. 00091 if(matchCount != 0) 00092 { 00093 outputMDtag += matchCount; 00094 matchCount = 0; 00095 } 00096 outputMDtag += refBase; 00097 } 00098 } 00099 else 00100 { 00101 // This reference position is not in the query, so it is a deletion. 00102 // Deletion, so output the number of matches if any. 00103 if(matchCount != 0) 00104 { 00105 outputMDtag += matchCount; 00106 matchCount = 0; 00107 } 00108 00109 if(!currentDeletion) 00110 { 00111 // Not currently in a deletion, so add the ^ 00112 outputMDtag += '^'; 00113 } 00114 // Add the deleted base. 00115 outputMDtag += refBase; 00116 currentDeletion = true; 00117 } 00118 } 00119 00120 // output the match count at the end. 00121 outputMDtag += matchCount; 00122 return(true); 00123 } 00124 00125 // Check to see if the MD tag in the record is accurate. 00126 bool SamTags::isMDTagCorrect(SamRecord& inputRec, GenomeSequence& genome) 00127 { 00128 String calcMDtag; 00129 if(!createMDTag(calcMDtag, inputRec, genome)) 00130 { 00131 // Could not generate the MD tag, so just return that it is incorrect. 00132 return(false); 00133 } 00134 00135 const String* origMDtag = inputRec.getStringTag(MD_TAG); 00136 00137 if(origMDtag == NULL) 00138 { 00139 // There was no tag. 00140 // if there is not a new tag, then they are the same and true 00141 // should be returned. If there is a new tag, then the old one was 00142 // wrong so false should be returned. So just return the result of 00143 // IsEmpty. 00144 return(calcMDtag.IsEmpty()); 00145 } 00146 else 00147 { 00148 // origMDtag is not NULL, so just compare the two tags. 00149 return(calcMDtag == *origMDtag); 00150 } 00151 } 00152 00153 00154 // Update/Add the MD tag in the inputRec. 00155 bool SamTags::updateMDTag(SamRecord& inputRec, GenomeSequence& genome) 00156 { 00157 // Calculate the new MD tag. 00158 String calcMDtag; 00159 createMDTag(calcMDtag, inputRec, genome); 00160 00161 // Add the MD tag. If it is already there and is different it will 00162 // replace it. If it is already there and it is the same, it won't 00163 // do anything. 00164 return(inputRec.addTag(MD_TAG, MD_TAG_TYPE, calcMDtag.c_str())); 00165 }