libStatGen Software  1
SamTags.cpp
00001 /*
00002  *  Copyright (C) 2011  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #include "SamTags.h"
00019 #include "BaseUtilities.h"
00020 
00021 const char* SamTags::BQ_TAG = "BQ";
00022 const char SamTags::BQ_TAG_TYPE = 'Z';
00023 const char* SamTags::MD_TAG = "MD";
00024 const char SamTags::MD_TAG_TYPE = 'Z';
00025 const char* SamTags::ORIG_POS_TAG = "OP";
00026 const char SamTags::ORIG_POS_TAG_TYPE = 'i';
00027 const char* SamTags::ORIG_CIGAR_TAG = "OC";
00028 const char SamTags::ORIG_CIGAR_TAG_TYPE = 'Z';
00029 const char* SamTags::ORIG_QUAL_TAG = "OQ";
00030 const char SamTags::ORIG_QUAL_TAG_TYPE = 'Z';
00031 
00032 
00033 // Create the MD tag for the specified input record and the genome.
00034 bool SamTags::createMDTag(String& outputMDtag, SamRecord& inputRec, 
00035                           GenomeSequence& genome)
00036 {
00037     outputMDtag.Clear();
00038     // Get the cigar to use for determing alignment.
00039     Cigar* cigarInfo = inputRec.getCigarInfo();
00040     if(cigarInfo == NULL)
00041     {
00042         throw(std::runtime_error("Cannot createMDTag - failed to get the cigar"));
00043         return(false);
00044     }
00045     int32_t queryIndex = Cigar::INDEX_NA;
00046 
00047     // get where this read starts on the reference.
00048     uint32_t startOfReadOnRefIndex = 
00049         genome.getGenomePosition(inputRec.getReferenceName());
00050     if(startOfReadOnRefIndex == (uint32_t)INVALID_CHROMOSOME_INDEX)
00051     {
00052         // Failed to find the reference for this chromosome, so return false.
00053         return(false);
00054     }
00055     startOfReadOnRefIndex += inputRec.get0BasedPosition();
00056 
00057     // Track the number of consecutive matches.
00058     int32_t matchCount = 0;
00059     // Track if it is currently in a deletion so it knows when not to add
00060     // a '^'.
00061     bool currentDeletion = false;
00062 
00063     // Loop through the Reference indices (ignores insertions/pads/clips).
00064     for(int refOffset = 0; 
00065         refOffset < cigarInfo->getExpectedReferenceBaseCount();
00066         ++refOffset)
00067     {
00068         // Get the query index for this reference position..
00069         queryIndex = cigarInfo->getQueryIndex(refOffset);
00070 
00071         char refBase = genome[startOfReadOnRefIndex + refOffset];
00072 
00073         if(queryIndex != Cigar::INDEX_NA)
00074         {
00075             // Both the reference and the read have a base, so get the bases.
00076             char readBase = inputRec.getSequence(queryIndex);
00077             currentDeletion = false;
00078 
00079             // If neither base is unknown and they are the same, count it
00080             // as a match.
00081             if(!BaseUtilities::isAmbiguous(readBase) && 
00082                !BaseUtilities::isAmbiguous(refBase) && 
00083                (BaseUtilities::areEqual(readBase, refBase)))
00084             {
00085                 // Match, so update counter.
00086                 ++matchCount;
00087             }
00088             else
00089             {
00090                 // Mismatch, so output the number of matches if any.
00091                 if(matchCount != 0)
00092                 {
00093                     outputMDtag += matchCount;
00094                     matchCount = 0;
00095                 }
00096                 outputMDtag += refBase;
00097             }
00098         }
00099         else
00100         {
00101             // This reference position is not in the query, so it is a deletion.
00102             // Deletion, so output the number of matches if any.
00103             if(matchCount != 0)
00104             {
00105                 outputMDtag += matchCount;
00106                 matchCount = 0;
00107             }
00108 
00109             if(!currentDeletion)
00110             {
00111                 // Not currently in a deletion, so add the ^
00112                 outputMDtag += '^';
00113             }
00114             // Add the deleted base.
00115             outputMDtag += refBase;
00116             currentDeletion = true;
00117         }
00118     }
00119 
00120     // output the match count at the end.
00121     outputMDtag += matchCount;
00122     return(true);
00123 }
00124 
00125 // Check to see if the MD tag in the record is accurate.
00126 bool SamTags::isMDTagCorrect(SamRecord& inputRec, GenomeSequence& genome)
00127 {
00128     String calcMDtag;
00129     if(!createMDTag(calcMDtag, inputRec, genome))
00130     {
00131         // Could not generate the MD tag, so just return that it is incorrect.
00132         return(false);
00133     }
00134     
00135     const String* origMDtag = inputRec.getStringTag(MD_TAG);
00136     
00137     if(origMDtag == NULL)
00138     {
00139         // There was no tag.
00140         // if there is not a new tag, then they are the same and true
00141         // should be returned.  If there is a new tag, then the old one was
00142         // wrong so false should be returned.  So just return the result of
00143         // IsEmpty.
00144         return(calcMDtag.IsEmpty());
00145     }
00146     else
00147     {
00148         // origMDtag is not NULL, so just compare the two tags.
00149         return(calcMDtag == *origMDtag);
00150     }
00151 }
00152 
00153 
00154 // Update/Add the MD tag in the inputRec.
00155 bool SamTags::updateMDTag(SamRecord& inputRec, GenomeSequence& genome)
00156 {
00157     // Calculate the new MD tag.
00158     String calcMDtag;
00159     createMDTag(calcMDtag, inputRec, genome);
00160     
00161     // Add the MD tag.  If it is already there and is different it will
00162     // replace it.  If it is already there and it is the same, it won't
00163     // do anything.
00164     return(inputRec.addTag(MD_TAG, MD_TAG_TYPE, calcMDtag.c_str()));
00165 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends