TrimSequence.h

00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #ifndef _TRIMSEQUENCE_H
00019 #define _TRIMSEQUENCE_H
00020 
00021 #include <assert.h>
00022 #include <stdint.h>
00023 #include <stdlib.h>
00024 #include <unistd.h>
00025 
00026 ///
00027 /// TrimSequence is a templated function to find bases
00028 /// which are below a certain moving mean threshold,
00029 /// and can be applied to either end of the sequence string.
00030 ///
00031 /// @param sequence is the input sequence
00032 /// @param meanValue is the value below which we wish to trim.
00033 /// @return the iterator of the location at which untrimmed values begin
00034 ///
00035 /// Details:
00036 ///
00037 /// trimFromLeft is a bool indicating which direction we wish
00038 /// to trim.  true -> left to right, false is right to left.
00039 ///
00040 /// The code is convoluted enough here, so for implementation
00041 /// and testing sanity, the following definitions are made:
00042 ///
00043 /// When trimFromLeft is true:
00044 ///    result == sequence.begin() implies no trimming
00045 ///    result == sequence.end() implies all values are trimmed
00046 ///
00047 /// When trimFromLeft is false:
00048 ///    result == sequence.begin() implies all values are trimmed
00049 ///    result == sequence.end() no values are trimmed
00050 ///
00051 /// result will always be in the range [sequence.begin() , sequence.end())
00052 /// (begin is inclusive, end is exclusive).
00053 ///
00054 /// NOTE: See TrimSequence.h and test/TrimSequence_test.cpp for examples
00055 ///
00056 /// THIS CODE IS EXCEPTIONALLY FRAGILE.  DO NOT ATTEMPT TO FIX OR
00057 /// IMPROVE WITHOUT INCLUDING DOCUMENTED, UNDERSTANABLE TEST CASES THAT CLEARLY
00058 /// SHOW WHY OR WHY NOT SOMETHING WORKS.
00059 ///
00060 template<typename sequenceType, typename meanValueType>
00061 typename sequenceType::iterator trimSequence(sequenceType &sequence, meanValueType meanValue, const bool trimFromLeft)
00062 {
00063     const int howManyValues = 4;    // this is used in signed arithmetic below
00064     int windowThreshold = howManyValues * meanValue;
00065     int64_t sumOfWindow = 0;
00066     typename sequenceType::iterator it;
00067 
00068     //
00069     // Sanity check to weed out what otherwise would be
00070     // a large number of boundary checks below.  If the input
00071     // is too small, just punt it back to the caller.  Technically,
00072     // we can still trim, but we'd just do the simple iteration
00073     // loop.  Save that for when we care.
00074     //
00075 
00076     if (sequence.size() < (size_t) howManyValues)
00077         return trimFromLeft? sequence.begin() : sequence.end();
00078 
00079     typename sequenceType::iterator sequenceBegin;
00080     typename sequenceType::iterator sequenceEnd;
00081 
00082     // The algorithm should be clear and efficient
00083     // so it does not bother to write codes for two directions.
00084     // It that way, we avoid thinking trimming from left and right interchangably.
00085     if (trimFromLeft)
00086     {
00087         // sequenceBegin is inclusive, sequenceEnd is exclusive,
00088         sequenceBegin = sequence.begin();
00089         sequenceEnd = sequence.end();
00090 
00091         for (it = sequenceBegin; it < sequenceBegin + howManyValues; it++)
00092             sumOfWindow += *it;
00093 
00094         for (; it < sequenceEnd; it ++)
00095         {
00096             if (sumOfWindow > windowThreshold)
00097                 break;
00098             sumOfWindow += *it;
00099             sumOfWindow -= *(it - howManyValues);
00100         }
00101         // here it is in the range of [sequenceBegin+howManyValues, sequenceEnd] inclusively
00102         // the range is also [sequence.begin() + howManyValues, sequence.end()]
00103         while (*(it-1) >= meanValue && (it-1) >= sequenceBegin)
00104             it--;
00105     }
00106     else
00107     {
00108         sequenceBegin = sequence.end() - 1;
00109         sequenceEnd = sequence.begin() - 1;
00110 
00111         for (it = sequenceBegin; it > sequenceBegin - howManyValues; it--)
00112             sumOfWindow += *it;
00113 
00114         for (; it > sequenceEnd; it--)
00115         {
00116             if (sumOfWindow > windowThreshold)
00117                 break;
00118             sumOfWindow += *it;
00119             sumOfWindow -= *(it + howManyValues);
00120         }
00121 
00122         // here it is in the range of [sequenceEnd, sequenceBegin - howManyValues] inclusively
00123         // the range is also [sequence.begin() -1, sequence.end() - 1 - howManyValues]
00124         while (*(it+1) >= meanValue && (it+1) <= sequenceBegin)
00125             it ++;
00126         // note, the return value should in the range [sequence.begin(), sequence.end()]
00127         it += 1;
00128     }
00129 
00130     // 'it' may be sequence.end() in some cases
00131     assert(it >= sequence.begin() && it <= sequence.end());
00132 
00133     return it;
00134 }
00135 
00136 #endif
Generated on Wed Nov 17 15:38:29 2010 for StatGen Software by  doxygen 1.6.3