libStatGen Software
1
|
00001 /* 00002 * Copyright (C) 2010 Regents of the University of Michigan 00003 * 00004 * This program is free software: you can redistribute it and/or modify 00005 * it under the terms of the GNU General Public License as published by 00006 * the Free Software Foundation, either version 3 of the License, or 00007 * (at your option) any later version. 00008 * 00009 * This program is distributed in the hope that it will be useful, 00010 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 * GNU General Public License for more details. 00013 * 00014 * You should have received a copy of the GNU General Public License 00015 * along with this program. If not, see <http://www.gnu.org/licenses/>. 00016 */ 00017 00018 #include <stdexcept> 00019 00020 #include "SamQuerySeqWithRefHelper.h" 00021 #include "BaseUtilities.h" 00022 #include "SamFlag.h" 00023 00024 SamQuerySeqWithRefIter::SamQuerySeqWithRefIter(SamRecord& record, 00025 GenomeSequence& refSequence, 00026 bool forward) 00027 : myRecord(record), 00028 myRefSequence(refSequence), 00029 myCigar(NULL), 00030 myStartOfReadOnRefIndex(INVALID_GENOME_INDEX), 00031 myQueryIndex(0), 00032 myForward(forward) 00033 { 00034 myCigar = myRecord.getCigarInfo(); 00035 myStartOfReadOnRefIndex = 00036 refSequence.getGenomePosition(myRecord.getReferenceName()); 00037 00038 if(myStartOfReadOnRefIndex != INVALID_GENOME_INDEX) 00039 { 00040 // This reference name was found in the reference file, so 00041 // add the start position. 00042 myStartOfReadOnRefIndex += myRecord.get0BasedPosition(); 00043 } 00044 00045 if(!forward) 00046 { 00047 myQueryIndex = myRecord.getReadLength() - 1; 00048 } 00049 } 00050 00051 00052 SamQuerySeqWithRefIter::~SamQuerySeqWithRefIter() 00053 { 00054 } 00055 00056 00057 00058 bool SamQuerySeqWithRefIter::reset(bool forward) 00059 { 00060 myCigar = myRecord.getCigarInfo(); 00061 if(myCigar == NULL) 00062 { 00063 // Failed to get Cigar. 00064 return(false); 00065 } 00066 00067 // Get where the position of where this read starts as mapped to the 00068 // reference. 00069 myStartOfReadOnRefIndex = 00070 myRefSequence.getGenomePosition(myRecord.getReferenceName()); 00071 00072 if(myStartOfReadOnRefIndex != INVALID_GENOME_INDEX) 00073 { 00074 // This reference name was found in the reference file, so 00075 // add the start position. 00076 myStartOfReadOnRefIndex += myRecord.get0BasedPosition(); 00077 } 00078 00079 myForward = forward; 00080 00081 if(myForward) 00082 { 00083 myQueryIndex = 0; 00084 } 00085 else 00086 { 00087 // reverse, so start at the last entry. 00088 myQueryIndex = myRecord.getReadLength() - 1; 00089 } 00090 return(true); 00091 } 00092 00093 00094 // Returns information for the next position where the query and the 00095 // reference match or mismatch. To be a match or mismatch, both the query 00096 // and reference must have a base that is not 'N'. 00097 // This means: 00098 // insertions and deletions are not mismatches or matches. 00099 // 'N' bases are not matches or mismatches 00100 // Returns true if an entry was found, false if there are no more matches or 00101 // mismatches. 00102 bool SamQuerySeqWithRefIter::getNextMatchMismatch(SamSingleBaseMatchInfo& matchMismatchInfo) 00103 { 00104 // Check whether or not this read is mapped. 00105 // If the read is not mapped, return no matches. 00106 if(!SamFlag::isMapped(myRecord.getFlag())) 00107 { 00108 // Not mapped. 00109 return(false); 00110 } 00111 00112 // Check that the Cigar is set. 00113 if(myCigar == NULL) 00114 { 00115 // Error. 00116 throw(std::runtime_error("Cannot determine matches/mismatches since failed to retrieve the cigar")); 00117 return(false); 00118 } 00119 00120 // If myStartOfReadOnRefIndex is the default (unset) value, then 00121 // the reference was not found, so return false, no matches/mismatches. 00122 if(myStartOfReadOnRefIndex == INVALID_GENOME_INDEX) 00123 { 00124 // This reference name was not found in the reference file, so just 00125 // return no matches/mismatches. 00126 return(false); 00127 } 00128 00129 00130 // Repull the read length from the record to check just in case the 00131 // record has changed length. 00132 // Loop until a match or mismatch is found as long as query index 00133 // is still within the read (Loop is broken by a return). 00134 while((myQueryIndex < myRecord.getReadLength()) && 00135 (myQueryIndex >= 0)) 00136 { 00137 // Still more bases, look for a match/mismatch. 00138 00139 // Get the reference offset for this read position. 00140 int32_t refOffset = myCigar->getRefOffset(myQueryIndex); 00141 if(refOffset == Cigar::INDEX_NA) 00142 { 00143 // This is either a softclip or an insertion 00144 // which do not count as a match or a mismatch, so 00145 // go to the next index. 00146 nextIndex(); 00147 continue; 00148 } 00149 00150 // Both the reference and the read have a base, so get the bases. 00151 char readBase = myRecord.getSequence(myQueryIndex, SamRecord::NONE); 00152 char refBase = myRefSequence[myStartOfReadOnRefIndex + refOffset]; 00153 00154 // If either the read or the reference bases are unknown, then 00155 // it does not count as a match or a mismatch. 00156 if(BaseUtilities::isAmbiguous(readBase) || 00157 BaseUtilities::isAmbiguous(refBase)) 00158 { 00159 // Either the reference base or the read base are unknown, 00160 // so skip this position. 00161 nextIndex(); 00162 continue; 00163 } 00164 00165 // Both the read & the reference have a known base, so it is either 00166 // a match or a mismatch. 00167 matchMismatchInfo.setQueryIndex(myQueryIndex); 00168 00169 // Check if they are equal. 00170 if(BaseUtilities::areEqual(readBase, refBase)) 00171 { 00172 // Match. 00173 matchMismatchInfo.setType(SamSingleBaseMatchInfo::MATCH); 00174 // Increment the query index to the next position. 00175 nextIndex(); 00176 return(true); 00177 } 00178 else 00179 { 00180 // Mismatch 00181 matchMismatchInfo.setType(SamSingleBaseMatchInfo::MISMATCH); 00182 // Increment the query index to the next position. 00183 nextIndex(); 00184 return(true); 00185 } 00186 } 00187 00188 // No matches or mismatches were found, so return false. 00189 return(false); 00190 } 00191 00192 00193 void SamQuerySeqWithRefIter::nextIndex() 00194 { 00195 if(myForward) 00196 { 00197 ++myQueryIndex; 00198 } 00199 else 00200 { 00201 --myQueryIndex; 00202 } 00203 } 00204 00205 00206 SamSingleBaseMatchInfo::SamSingleBaseMatchInfo() 00207 : myType(UNKNOWN), 00208 myQueryIndex(0) 00209 { 00210 } 00211 00212 00213 SamSingleBaseMatchInfo::~SamSingleBaseMatchInfo() 00214 { 00215 } 00216 00217 00218 SamSingleBaseMatchInfo::Type SamSingleBaseMatchInfo::getType() 00219 { 00220 return(myType); 00221 } 00222 00223 00224 int32_t SamSingleBaseMatchInfo::getQueryIndex() 00225 { 00226 return(myQueryIndex); 00227 } 00228 00229 00230 void SamSingleBaseMatchInfo::setType(Type newType) 00231 { 00232 myType = newType; 00233 } 00234 00235 00236 void SamSingleBaseMatchInfo::setQueryIndex(int32_t queryIndex) 00237 { 00238 myQueryIndex = queryIndex; 00239 } 00240 00241 00242 /////////////////////////////////////////////////////////////////////////// 00243 void SamQuerySeqWithRef::seqWithEquals(const char* currentSeq, 00244 int32_t seq0BasedPos, 00245 Cigar& cigar, 00246 const char* referenceName, 00247 const GenomeSequence& refSequence, 00248 std::string& updatedSeq) 00249 { 00250 updatedSeq = currentSeq; 00251 00252 int32_t seqLength = updatedSeq.length(); 00253 int32_t queryIndex = 0; 00254 00255 uint32_t startOfReadOnRefIndex = 00256 refSequence.getGenomePosition(referenceName); 00257 00258 if(startOfReadOnRefIndex == INVALID_GENOME_INDEX) 00259 { 00260 // This reference name was not found in the reference file, so just 00261 // return. 00262 return; 00263 } 00264 startOfReadOnRefIndex += seq0BasedPos; 00265 00266 // Loop until the entire sequence has been updated. 00267 while(queryIndex < seqLength) 00268 { 00269 // Still more bases, look for matches. 00270 00271 // Get the reference offset for this read position. 00272 int32_t refOffset = cigar.getRefOffset(queryIndex); 00273 if(refOffset != Cigar::INDEX_NA) 00274 { 00275 // Both the reference and the read have a base, so get the bases. 00276 char readBase = currentSeq[queryIndex]; 00277 char refBase = refSequence[startOfReadOnRefIndex + refOffset]; 00278 00279 // If neither base is unknown and they are the same, count it 00280 // as a match. 00281 if(!BaseUtilities::isAmbiguous(readBase) && 00282 !BaseUtilities::isAmbiguous(refBase) && 00283 (BaseUtilities::areEqual(readBase, refBase))) 00284 { 00285 // Match. 00286 updatedSeq[queryIndex] = '='; 00287 } 00288 } 00289 // Increment the query index to the next position. 00290 ++queryIndex; 00291 continue; 00292 } 00293 } 00294 00295 00296 void SamQuerySeqWithRef::seqWithoutEquals(const char* currentSeq, 00297 int32_t seq0BasedPos, 00298 Cigar& cigar, 00299 const char* referenceName, 00300 const GenomeSequence& refSequence, 00301 std::string& updatedSeq) 00302 { 00303 updatedSeq = currentSeq; 00304 00305 int32_t seqLength = updatedSeq.length(); 00306 int32_t queryIndex = 0; 00307 00308 uint32_t startOfReadOnRefIndex = 00309 refSequence.getGenomePosition(referenceName); 00310 00311 if(startOfReadOnRefIndex == INVALID_GENOME_INDEX) 00312 { 00313 // This reference name was not found in the reference file, so just 00314 // return. 00315 return; 00316 } 00317 startOfReadOnRefIndex += seq0BasedPos; 00318 00319 // Loop until the entire sequence has been updated. 00320 while(queryIndex < seqLength) 00321 { 00322 // Still more bases, look for matches. 00323 00324 // Get the reference offset for this read position. 00325 int32_t refOffset = cigar.getRefOffset(queryIndex); 00326 if(refOffset != Cigar::INDEX_NA) 00327 { 00328 // Both the reference and the read have a base, so get the bases. 00329 char readBase = currentSeq[queryIndex]; 00330 char refBase = refSequence[startOfReadOnRefIndex + refOffset]; 00331 00332 // If the bases are equal, set the sequence to the reference 00333 // base. (Skips the check for ambiguous to catch a case where 00334 // ambiguous had been converted to a '=', and if both are ambiguous, 00335 // it will still be set to ambiguous.) 00336 if(BaseUtilities::areEqual(readBase, refBase)) 00337 { 00338 // Match. 00339 updatedSeq[queryIndex] = refBase; 00340 } 00341 } 00342 00343 // Increment the query index to the next position. 00344 ++queryIndex; 00345 continue; 00346 } 00347 }