libStatGen Software  1
TrimSequence.h
00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #ifndef _TRIMSEQUENCE_H
00019 #define _TRIMSEQUENCE_H
00020 
00021 #include <assert.h>
00022 #include <stdint.h>
00023 #include <stdlib.h>
00024 
00025 #ifndef __WIN32__
00026 #include <unistd.h>
00027 #endif
00028 
00029 ///
00030 /// TrimSequence is a templated function to find bases
00031 /// which are below a certain moving mean threshold,
00032 /// and can be applied to either end of the sequence string.
00033 ///
00034 /// @param sequence is the input sequence
00035 /// @param meanValue is the value below which we wish to trim.
00036 /// @return the iterator of the location at which untrimmed values begin
00037 ///
00038 /// Details:
00039 ///
00040 /// trimFromLeft is a bool indicating which direction we wish
00041 /// to trim.  true -> left to right, false is right to left.
00042 ///
00043 /// The code is convoluted enough here, so for implementation
00044 /// and testing sanity, the following definitions are made:
00045 ///
00046 /// When trimFromLeft is true:
00047 ///    result == sequence.begin() implies no trimming
00048 ///    result == sequence.end() implies all values are trimmed
00049 ///
00050 /// When trimFromLeft is false:
00051 ///    result == sequence.begin() implies all values are trimmed
00052 ///    result == sequence.end() no values are trimmed
00053 ///
00054 /// result will always be in the range [sequence.begin() , sequence.end())
00055 /// (begin is inclusive, end is exclusive).
00056 ///
00057 /// NOTE: See TrimSequence.h and test/TrimSequence_test.cpp for examples
00058 ///
00059 /// THIS CODE IS EXCEPTIONALLY FRAGILE.  DO NOT ATTEMPT TO FIX OR
00060 /// IMPROVE WITHOUT INCLUDING DOCUMENTED, UNDERSTANABLE TEST CASES THAT CLEARLY
00061 /// SHOW WHY OR WHY NOT SOMETHING WORKS.
00062 ///
00063 template<typename sequenceType, typename meanValueType>
00064 typename sequenceType::iterator trimSequence(sequenceType &sequence, meanValueType meanValue, const bool trimFromLeft)
00065 {
00066     const int howManyValues = 4;    // this is used in signed arithmetic below
00067     int windowThreshold = howManyValues * meanValue;
00068     int64_t sumOfWindow = 0;
00069     typename sequenceType::iterator it;
00070 
00071     //
00072     // Sanity check to weed out what otherwise would be
00073     // a large number of boundary checks below.  If the input
00074     // is too small, just punt it back to the caller.  Technically,
00075     // we can still trim, but we'd just do the simple iteration
00076     // loop.  Save that for when we care.
00077     //
00078 
00079     if (sequence.size() < (size_t) howManyValues)
00080         return trimFromLeft? sequence.begin() : sequence.end();
00081 
00082     typename sequenceType::iterator sequenceBegin;
00083     typename sequenceType::iterator sequenceEnd;
00084 
00085     // The algorithm should be clear and efficient
00086     // so it does not bother to write codes for two directions.
00087     // It that way, we avoid thinking trimming from left and right interchangably.
00088     if (trimFromLeft)
00089     {
00090         // sequenceBegin is inclusive, sequenceEnd is exclusive,
00091         sequenceBegin = sequence.begin();
00092         sequenceEnd = sequence.end();
00093 
00094         for (it = sequenceBegin; it < sequenceBegin + howManyValues; it++)
00095             sumOfWindow += *it;
00096 
00097         for (; it < sequenceEnd; it ++)
00098         {
00099             if (sumOfWindow > windowThreshold)
00100                 break;
00101             sumOfWindow += *it;
00102             sumOfWindow -= *(it - howManyValues);
00103         }
00104         // here it is in the range of [sequenceBegin+howManyValues, sequenceEnd] inclusively
00105         // the range is also [sequence.begin() + howManyValues, sequence.end()]
00106         while (*(it-1) >= meanValue && (it-1) >= sequenceBegin)
00107             it--;
00108     }
00109     else
00110     {
00111         sequenceBegin = sequence.end() - 1;
00112         sequenceEnd = sequence.begin() - 1;
00113 
00114         for (it = sequenceBegin; it > sequenceBegin - howManyValues; it--)
00115             sumOfWindow += *it;
00116 
00117         for (; it > sequenceEnd; it--)
00118         {
00119             if (sumOfWindow > windowThreshold)
00120                 break;
00121             sumOfWindow += *it;
00122             sumOfWindow -= *(it + howManyValues);
00123         }
00124 
00125         // here it is in the range of [sequenceEnd, sequenceBegin - howManyValues] inclusively
00126         // the range is also [sequence.begin() -1, sequence.end() - 1 - howManyValues]
00127         while (*(it+1) >= meanValue && (it+1) <= sequenceBegin)
00128             it ++;
00129         // note, the return value should in the range [sequence.begin(), sequence.end()]
00130         it += 1;
00131     }
00132 
00133     // 'it' may be sequence.end() in some cases
00134     assert(it >= sequence.begin() && it <= sequence.end());
00135 
00136     return it;
00137 }
00138 
00139 #endif
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends