SamQuerySeqWithRefHelper.cpp

00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #include <stdexcept>
00019 
00020 #include "SamQuerySeqWithRefHelper.h"
00021 #include "BaseUtilities.h"
00022 #include "SamFlag.h"
00023 
00024 SamQuerySeqWithRefIter::SamQuerySeqWithRefIter(SamRecord& record,
00025                                                GenomeSequence& refSequence,
00026                                                bool forward)
00027     : myRecord(record),
00028       myRefSequence(refSequence),
00029       myCigar(NULL),
00030       myStartOfReadOnRefIndex(0),
00031       myQueryIndex(0),
00032       myForward(forward)
00033 {
00034     myCigar = myRecord.getCigarInfo();   
00035     myStartOfReadOnRefIndex = 
00036         myRefSequence.getGenomePosition(myRecord.getReferenceName()) +
00037         myRecord.get0BasedPosition();
00038 
00039     if(!forward)
00040     {
00041         myQueryIndex = myRecord.getReadLength() - 1;
00042     }
00043 }
00044 
00045 
00046 SamQuerySeqWithRefIter::~SamQuerySeqWithRefIter()
00047 {
00048 }
00049 
00050 
00051 
00052 bool SamQuerySeqWithRefIter::reset(bool forward)
00053 {
00054     myCigar = myRecord.getCigarInfo();
00055     if(myCigar == NULL)
00056     {
00057         // Failed to get Cigar.
00058         return(false);
00059     }
00060 
00061     // Get where the position of where this read starts as mapped to the 
00062     // reference.
00063     myStartOfReadOnRefIndex = 
00064         myRefSequence.getGenomePosition(myRecord.getReferenceName()) +
00065         myRecord.get0BasedPosition();
00066 
00067     myForward = forward;
00068 
00069     if(myForward)
00070     {
00071         myQueryIndex = 0;
00072     }
00073     else
00074     {
00075         // reverse, so start at the last entry.
00076         myQueryIndex = myRecord.getReadLength() - 1;
00077     }
00078     return(true);
00079 }
00080 
00081 
00082 // Returns information for the next position where the query and the 
00083 // reference match or mismatch.  To be a match or mismatch, both the query
00084 // and reference must have a base that is not 'N'.
00085 // This means:
00086 //    insertions and deletions are not mismatches or matches.
00087 //    'N' bases are not matches or mismatches
00088 // Returns true if an entry was found, false if there are no more matches or
00089 // mismatches.
00090 bool SamQuerySeqWithRefIter::getNextMatchMismatch(SamSingleBaseMatchInfo& matchMismatchInfo)
00091 {
00092     // Check whether or not this read is mapped. 
00093     // If the read is not mapped, return no matches.
00094     if(!SamFlag::isMapped(myRecord.getFlag()))
00095     {
00096         // Not mapped.
00097         return(false);
00098     }
00099 
00100     // Check that the Cigar is set.
00101     if(myCigar == NULL)
00102     {
00103         // Error.
00104         throw(std::runtime_error("Cannot determine matches/mismatches since failed to retrieve the cigar"));
00105         return(false);
00106     }
00107 
00108     // Repull the read length from the record to check just in case the
00109     // record has changed length.
00110     // Loop until a match or mismatch is found as long as query index
00111     // is still within the read  (Loop is broken by a return).
00112     while((myQueryIndex < myRecord.getReadLength()) &&
00113           (myQueryIndex >= 0))
00114     {
00115         // Still more bases, look for a match/mismatch.
00116 
00117         // Get the reference offset for this read position.
00118         int32_t refOffset = myCigar->getRefOffset(myQueryIndex);
00119         if(refOffset == Cigar::INDEX_NA)
00120         {
00121             // This is either a softclip or an insertion
00122             // which do not count as a match or a mismatch, so
00123             // go to the next index.
00124             nextIndex();
00125             continue;
00126         }
00127         
00128         // Both the reference and the read have a base, so get the bases.
00129         char readBase = myRecord.getSequence(myQueryIndex, SamRecord::NONE);
00130         char refBase = myRefSequence[myStartOfReadOnRefIndex + refOffset];
00131        
00132         // If either the read or the reference bases are unknown, then
00133         // it does not count as a match or a mismatch.
00134         if(BaseUtilities::isAmbiguous(readBase) || 
00135            BaseUtilities::isAmbiguous(refBase))
00136         {
00137             // Either the reference base or the read base are unknown,
00138             // so skip this position.
00139             nextIndex();
00140             continue;
00141         }
00142 
00143         // Both the read & the reference have a known base, so it is either
00144         // a match or a mismatch.
00145         matchMismatchInfo.setQueryIndex(myQueryIndex);
00146 
00147         // Check if they are equal.
00148         if(BaseUtilities::areEqual(readBase, refBase))
00149         {
00150             // Match.
00151             matchMismatchInfo.setType(SamSingleBaseMatchInfo::MATCH);
00152             // Increment the query index to the next position.
00153             nextIndex();
00154             return(true);
00155         }
00156         else
00157         {
00158             // Mismatch
00159             matchMismatchInfo.setType(SamSingleBaseMatchInfo::MISMATCH);
00160             // Increment the query index to the next position.
00161             nextIndex();
00162             return(true);
00163         }
00164     }
00165 
00166     // No matches or mismatches were found, so return false.
00167     return(false);
00168 }
00169 
00170 
00171 void SamQuerySeqWithRefIter::nextIndex()
00172 {
00173     if(myForward)
00174     {
00175         ++myQueryIndex;
00176     }
00177     else
00178     {
00179         --myQueryIndex;
00180     }
00181 }
00182 
00183 
00184 SamSingleBaseMatchInfo::SamSingleBaseMatchInfo()
00185     : myType(UNKNOWN),
00186       myQueryIndex(0)
00187 {
00188 }
00189 
00190 
00191 SamSingleBaseMatchInfo::~SamSingleBaseMatchInfo()
00192 {
00193 }
00194 
00195 
00196 SamSingleBaseMatchInfo::Type SamSingleBaseMatchInfo::getType()
00197 {
00198     return(myType);
00199 }
00200 
00201 
00202 int32_t SamSingleBaseMatchInfo::getQueryIndex()
00203 {
00204     return(myQueryIndex);
00205 }
00206 
00207 
00208 void SamSingleBaseMatchInfo::setType(Type newType)
00209 {
00210     myType = newType;
00211 }
00212 
00213 
00214 void SamSingleBaseMatchInfo::setQueryIndex(int32_t queryIndex)
00215 {
00216     myQueryIndex = queryIndex;
00217 }
00218 
00219 
00220 ///////////////////////////////////////////////////////////////////////////
00221 void SamQuerySeqWithRef::seqWithEquals(const char* currentSeq,
00222                                        int32_t seq0BasedPos,
00223                                        Cigar& cigar, 
00224                                        const char* referenceName,
00225                                        const GenomeSequence& refSequence,
00226                                        std::string& updatedSeq)
00227 {
00228     updatedSeq = currentSeq;
00229 
00230     int32_t seqLength = updatedSeq.length();
00231     int32_t queryIndex = 0;
00232 
00233     uint32_t startOfReadOnRefIndex = 
00234         refSequence.getGenomePosition(referenceName) + seq0BasedPos;
00235 
00236     // Loop until the entire sequence has been updated.
00237     while(queryIndex < seqLength)
00238     {
00239         // Still more bases, look for matches.
00240 
00241         // Get the reference offset for this read position.
00242         int32_t refOffset = cigar.getRefOffset(queryIndex);
00243         if(refOffset != Cigar::INDEX_NA)
00244         {
00245             // Both the reference and the read have a base, so get the bases.
00246             char readBase = currentSeq[queryIndex];
00247             char refBase = refSequence[startOfReadOnRefIndex + refOffset];
00248 
00249             // If neither base is unknown and they are the same, count it
00250             // as a match.
00251             if(!BaseUtilities::isAmbiguous(readBase) && 
00252                !BaseUtilities::isAmbiguous(refBase) && 
00253                (BaseUtilities::areEqual(readBase, refBase)))
00254             {
00255                 // Match.
00256                 updatedSeq[queryIndex] = '=';
00257             }
00258         }
00259         // Increment the query index to the next position.
00260         ++queryIndex;
00261         continue;
00262     }
00263 }
00264 
00265 
00266 void SamQuerySeqWithRef::seqWithoutEquals(const char* currentSeq,
00267                                           int32_t seq0BasedPos,
00268                                           Cigar& cigar, 
00269                                           const char* referenceName,
00270                                           const GenomeSequence& refSequence,
00271                                           std::string& updatedSeq)
00272 {
00273     updatedSeq = currentSeq;
00274 
00275     int32_t seqLength = updatedSeq.length();
00276     int32_t queryIndex = 0;
00277 
00278     uint32_t startOfReadOnRefIndex = 
00279         refSequence.getGenomePosition(referenceName) + seq0BasedPos;
00280 
00281     // Loop until the entire sequence has been updated.
00282     while(queryIndex < seqLength)
00283     {
00284         // Still more bases, look for matches.
00285 
00286         // Get the reference offset for this read position.
00287         int32_t refOffset = cigar.getRefOffset(queryIndex);
00288         if(refOffset != Cigar::INDEX_NA)
00289         {
00290             // Both the reference and the read have a base, so get the bases.
00291             char readBase = currentSeq[queryIndex];
00292             char refBase = refSequence[startOfReadOnRefIndex + refOffset];
00293             
00294             // If the bases are equal, set the sequence to the reference
00295             // base. (Skips the check for ambiguous to catch a case where
00296             // ambiguous had been converted to a '=', and if both are ambiguous,
00297             // it will still be set to ambiguous.)
00298             if(BaseUtilities::areEqual(readBase, refBase))
00299             {
00300                 // Match.
00301                 updatedSeq[queryIndex] = refBase;
00302             }
00303         }
00304 
00305         // Increment the query index to the next position.
00306         ++queryIndex;
00307         continue;
00308     }
00309 }
Generated on Tue Mar 22 22:50:13 2011 for StatGen Software by  doxygen 1.6.3