libStatGen Software  1
SamQuerySeqWithRefHelper.cpp
00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #include <stdexcept>
00019 
00020 #include "SamQuerySeqWithRefHelper.h"
00021 #include "BaseUtilities.h"
00022 #include "SamFlag.h"
00023 
00024 SamQuerySeqWithRefIter::SamQuerySeqWithRefIter(SamRecord& record,
00025                                                GenomeSequence& refSequence,
00026                                                bool forward)
00027     : myRecord(record),
00028       myRefSequence(refSequence),
00029       myCigar(NULL),
00030       myStartOfReadOnRefIndex(INVALID_GENOME_INDEX),
00031       myQueryIndex(0),
00032       myForward(forward)
00033 {
00034     myCigar = myRecord.getCigarInfo();
00035     myStartOfReadOnRefIndex = 
00036         refSequence.getGenomePosition(myRecord.getReferenceName());
00037     
00038     if(myStartOfReadOnRefIndex != INVALID_GENOME_INDEX)
00039     {
00040         // This reference name was found in the reference file, so 
00041         // add the start position.
00042         myStartOfReadOnRefIndex += myRecord.get0BasedPosition();
00043     }
00044 
00045     if(!forward)
00046     {
00047         myQueryIndex = myRecord.getReadLength() - 1;
00048     }
00049 }
00050 
00051 
00052 SamQuerySeqWithRefIter::~SamQuerySeqWithRefIter()
00053 {
00054 }
00055 
00056 
00057 
00058 bool SamQuerySeqWithRefIter::reset(bool forward)
00059 {
00060     myCigar = myRecord.getCigarInfo();
00061     if(myCigar == NULL)
00062     {
00063         // Failed to get Cigar.
00064         return(false);
00065     }
00066 
00067     // Get where the position of where this read starts as mapped to the 
00068     // reference.
00069     myStartOfReadOnRefIndex = 
00070         myRefSequence.getGenomePosition(myRecord.getReferenceName());
00071     
00072     if(myStartOfReadOnRefIndex != INVALID_GENOME_INDEX)
00073     {
00074         // This reference name was found in the reference file, so 
00075         // add the start position.
00076         myStartOfReadOnRefIndex += myRecord.get0BasedPosition();
00077     }
00078 
00079     myForward = forward;
00080     
00081     if(myForward)
00082     {
00083         myQueryIndex = 0;
00084     }
00085     else
00086     {
00087         // reverse, so start at the last entry.
00088         myQueryIndex = myRecord.getReadLength() - 1;
00089     }
00090     return(true);
00091 }
00092 
00093 
00094 // Returns information for the next position where the query and the 
00095 // reference match or mismatch.  To be a match or mismatch, both the query
00096 // and reference must have a base that is not 'N'.
00097 // This means:
00098 //    insertions and deletions are not mismatches or matches.
00099 //    'N' bases are not matches or mismatches
00100 // Returns true if an entry was found, false if there are no more matches or
00101 // mismatches.
00102 bool SamQuerySeqWithRefIter::getNextMatchMismatch(SamSingleBaseMatchInfo& matchMismatchInfo)
00103 {
00104     // Check whether or not this read is mapped. 
00105     // If the read is not mapped, return no matches.
00106     if(!SamFlag::isMapped(myRecord.getFlag()))
00107     {
00108         // Not mapped.
00109         return(false);
00110     }
00111 
00112     // Check that the Cigar is set.
00113     if(myCigar == NULL)
00114     {
00115         // Error.
00116         throw(std::runtime_error("Cannot determine matches/mismatches since failed to retrieve the cigar"));
00117         return(false);
00118     }
00119 
00120     // If myStartOfReadOnRefIndex is the default (unset) value, then
00121     // the reference was not found, so return false, no matches/mismatches.
00122     if(myStartOfReadOnRefIndex == INVALID_GENOME_INDEX)
00123     {
00124         // This reference name was not found in the reference file, so just
00125         // return no matches/mismatches.
00126         return(false);
00127     }
00128 
00129 
00130     // Repull the read length from the record to check just in case the
00131     // record has changed length.
00132     // Loop until a match or mismatch is found as long as query index
00133     // is still within the read  (Loop is broken by a return).
00134     while((myQueryIndex < myRecord.getReadLength()) &&
00135           (myQueryIndex >= 0))
00136     {
00137         // Still more bases, look for a match/mismatch.
00138 
00139         // Get the reference offset for this read position.
00140         int32_t refOffset = myCigar->getRefOffset(myQueryIndex);
00141         if(refOffset == Cigar::INDEX_NA)
00142         {
00143             // This is either a softclip or an insertion
00144             // which do not count as a match or a mismatch, so
00145             // go to the next index.
00146             nextIndex();
00147             continue;
00148         }
00149         
00150         // Both the reference and the read have a base, so get the bases.
00151         char readBase = myRecord.getSequence(myQueryIndex, SamRecord::NONE);
00152         char refBase = myRefSequence[myStartOfReadOnRefIndex + refOffset];
00153        
00154         // If either the read or the reference bases are unknown, then
00155         // it does not count as a match or a mismatch.
00156         if(BaseUtilities::isAmbiguous(readBase) || 
00157            BaseUtilities::isAmbiguous(refBase))
00158         {
00159             // Either the reference base or the read base are unknown,
00160             // so skip this position.
00161             nextIndex();
00162             continue;
00163         }
00164 
00165         // Both the read & the reference have a known base, so it is either
00166         // a match or a mismatch.
00167         matchMismatchInfo.setQueryIndex(myQueryIndex);
00168 
00169         // Check if they are equal.
00170         if(BaseUtilities::areEqual(readBase, refBase))
00171         {
00172             // Match.
00173             matchMismatchInfo.setType(SamSingleBaseMatchInfo::MATCH);
00174             // Increment the query index to the next position.
00175             nextIndex();
00176             return(true);
00177         }
00178         else
00179         {
00180             // Mismatch
00181             matchMismatchInfo.setType(SamSingleBaseMatchInfo::MISMATCH);
00182             // Increment the query index to the next position.
00183             nextIndex();
00184             return(true);
00185         }
00186     }
00187 
00188     // No matches or mismatches were found, so return false.
00189     return(false);
00190 }
00191 
00192 
00193 void SamQuerySeqWithRefIter::nextIndex()
00194 {
00195     if(myForward)
00196     {
00197         ++myQueryIndex;
00198     }
00199     else
00200     {
00201         --myQueryIndex;
00202     }
00203 }
00204 
00205 
00206 SamSingleBaseMatchInfo::SamSingleBaseMatchInfo()
00207     : myType(UNKNOWN),
00208       myQueryIndex(0)
00209 {
00210 }
00211 
00212 
00213 SamSingleBaseMatchInfo::~SamSingleBaseMatchInfo()
00214 {
00215 }
00216 
00217 
00218 SamSingleBaseMatchInfo::Type SamSingleBaseMatchInfo::getType()
00219 {
00220     return(myType);
00221 }
00222 
00223 
00224 int32_t SamSingleBaseMatchInfo::getQueryIndex()
00225 {
00226     return(myQueryIndex);
00227 }
00228 
00229 
00230 void SamSingleBaseMatchInfo::setType(Type newType)
00231 {
00232     myType = newType;
00233 }
00234 
00235 
00236 void SamSingleBaseMatchInfo::setQueryIndex(int32_t queryIndex)
00237 {
00238     myQueryIndex = queryIndex;
00239 }
00240 
00241 
00242 ///////////////////////////////////////////////////////////////////////////
00243 void SamQuerySeqWithRef::seqWithEquals(const char* currentSeq,
00244                                        int32_t seq0BasedPos,
00245                                        Cigar& cigar, 
00246                                        const char* referenceName,
00247                                        const GenomeSequence& refSequence,
00248                                        std::string& updatedSeq)
00249 {
00250     updatedSeq = currentSeq;
00251 
00252     int32_t seqLength = updatedSeq.length();
00253     int32_t queryIndex = 0;
00254 
00255     uint32_t startOfReadOnRefIndex = 
00256         refSequence.getGenomePosition(referenceName);
00257     
00258     if(startOfReadOnRefIndex == INVALID_GENOME_INDEX)
00259     {
00260         // This reference name was not found in the reference file, so just
00261         // return.
00262         return;
00263     }
00264     startOfReadOnRefIndex += seq0BasedPos;
00265     
00266     // Loop until the entire sequence has been updated.
00267     while(queryIndex < seqLength)
00268     {
00269         // Still more bases, look for matches.
00270 
00271         // Get the reference offset for this read position.
00272         int32_t refOffset = cigar.getRefOffset(queryIndex);
00273         if(refOffset != Cigar::INDEX_NA)
00274         {
00275             // Both the reference and the read have a base, so get the bases.
00276             char readBase = currentSeq[queryIndex];
00277             char refBase = refSequence[startOfReadOnRefIndex + refOffset];
00278 
00279             // If neither base is unknown and they are the same, count it
00280             // as a match.
00281             if(!BaseUtilities::isAmbiguous(readBase) && 
00282                !BaseUtilities::isAmbiguous(refBase) && 
00283                (BaseUtilities::areEqual(readBase, refBase)))
00284             {
00285                 // Match.
00286                 updatedSeq[queryIndex] = '=';
00287             }
00288         }
00289         // Increment the query index to the next position.
00290         ++queryIndex;
00291         continue;
00292     }
00293 }
00294 
00295 
00296 void SamQuerySeqWithRef::seqWithoutEquals(const char* currentSeq,
00297                                           int32_t seq0BasedPos,
00298                                           Cigar& cigar, 
00299                                           const char* referenceName,
00300                                           const GenomeSequence& refSequence,
00301                                           std::string& updatedSeq)
00302 {
00303     updatedSeq = currentSeq;
00304 
00305     int32_t seqLength = updatedSeq.length();
00306     int32_t queryIndex = 0;
00307 
00308     uint32_t startOfReadOnRefIndex = 
00309         refSequence.getGenomePosition(referenceName);
00310     
00311     if(startOfReadOnRefIndex == INVALID_GENOME_INDEX)
00312     {
00313         // This reference name was not found in the reference file, so just
00314         // return.
00315         return;
00316     }
00317     startOfReadOnRefIndex += seq0BasedPos;
00318     
00319     // Loop until the entire sequence has been updated.
00320     while(queryIndex < seqLength)
00321     {
00322         // Still more bases, look for matches.
00323 
00324         // Get the reference offset for this read position.
00325         int32_t refOffset = cigar.getRefOffset(queryIndex);
00326         if(refOffset != Cigar::INDEX_NA)
00327         {
00328             // Both the reference and the read have a base, so get the bases.
00329             char readBase = currentSeq[queryIndex];
00330             char refBase = refSequence[startOfReadOnRefIndex + refOffset];
00331             
00332             // If the bases are equal, set the sequence to the reference
00333             // base. (Skips the check for ambiguous to catch a case where
00334             // ambiguous had been converted to a '=', and if both are ambiguous,
00335             // it will still be set to ambiguous.)
00336             if(BaseUtilities::areEqual(readBase, refBase))
00337             {
00338                 // Match.
00339                 updatedSeq[queryIndex] = refBase;
00340             }
00341         }
00342 
00343         // Increment the query index to the next position.
00344         ++queryIndex;
00345         continue;
00346     }
00347 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends