00001 /* 00002 * Copyright (C) 2010 Regents of the University of Michigan 00003 * 00004 * This program is free software: you can redistribute it and/or modify 00005 * it under the terms of the GNU General Public License as published by 00006 * the Free Software Foundation, either version 3 of the License, or 00007 * (at your option) any later version. 00008 * 00009 * This program is distributed in the hope that it will be useful, 00010 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 * GNU General Public License for more details. 00013 * 00014 * You should have received a copy of the GNU General Public License 00015 * along with this program. If not, see <http://www.gnu.org/licenses/>. 00016 */ 00017 00018 #ifndef _TRIMSEQUENCE_H 00019 #define _TRIMSEQUENCE_H 00020 00021 #include <assert.h> 00022 #include <stdint.h> 00023 #include <stdlib.h> 00024 #include <unistd.h> 00025 00026 /// 00027 /// TrimSequence is a templated function to find bases 00028 /// which are below a certain moving mean threshold, 00029 /// and can be applied to either end of the sequence string. 00030 /// 00031 /// @param sequence is the input sequence 00032 /// @param meanValue is the value below which we wish to trim. 00033 /// @return the iterator of the location at which untrimmed values begin 00034 /// 00035 /// Details: 00036 /// 00037 /// trimFromLeft is a bool indicating which direction we wish 00038 /// to trim. true -> left to right, false is right to left. 00039 /// 00040 /// The code is convoluted enough here, so for implementation 00041 /// and testing sanity, the following definitions are made: 00042 /// 00043 /// When trimFromLeft is true: 00044 /// result == sequence.begin() implies no trimming 00045 /// result == sequence.end() implies all values are trimmed 00046 /// 00047 /// When trimFromLeft is false: 00048 /// result == sequence.begin() implies all values are trimmed 00049 /// result == sequence.end() no values are trimmed 00050 /// 00051 /// result will always be in the range [sequence.begin() , sequence.end()) 00052 /// (begin is inclusive, end is exclusive). 00053 /// 00054 /// NOTE: See TrimSequence.h and test/TrimSequence_test.cpp for examples 00055 /// 00056 /// THIS CODE IS EXCEPTIONALLY FRAGILE. DO NOT ATTEMPT TO FIX OR 00057 /// IMPROVE WITHOUT INCLUDING DOCUMENTED, UNDERSTANABLE TEST CASES THAT CLEARLY 00058 /// SHOW WHY OR WHY NOT SOMETHING WORKS. 00059 /// 00060 template<typename sequenceType, typename meanValueType> 00061 typename sequenceType::iterator trimSequence(sequenceType &sequence, meanValueType meanValue, const bool trimFromLeft) 00062 { 00063 const int howManyValues = 4; // this is used in signed arithmetic below 00064 int windowThreshold = howManyValues * meanValue; 00065 int64_t sumOfWindow = 0; 00066 typename sequenceType::iterator it; 00067 00068 // 00069 // Sanity check to weed out what otherwise would be 00070 // a large number of boundary checks below. If the input 00071 // is too small, just punt it back to the caller. Technically, 00072 // we can still trim, but we'd just do the simple iteration 00073 // loop. Save that for when we care. 00074 // 00075 00076 if (sequence.size() < (size_t) howManyValues) 00077 return trimFromLeft? sequence.begin() : sequence.end(); 00078 00079 typename sequenceType::iterator sequenceBegin; 00080 typename sequenceType::iterator sequenceEnd; 00081 00082 // The algorithm should be clear and efficient 00083 // so it does not bother to write codes for two directions. 00084 // It that way, we avoid thinking trimming from left and right interchangably. 00085 if (trimFromLeft) 00086 { 00087 // sequenceBegin is inclusive, sequenceEnd is exclusive, 00088 sequenceBegin = sequence.begin(); 00089 sequenceEnd = sequence.end(); 00090 00091 for (it = sequenceBegin; it < sequenceBegin + howManyValues; it++) 00092 sumOfWindow += *it; 00093 00094 for (; it < sequenceEnd; it ++) 00095 { 00096 if (sumOfWindow > windowThreshold) 00097 break; 00098 sumOfWindow += *it; 00099 sumOfWindow -= *(it - howManyValues); 00100 } 00101 // here it is in the range of [sequenceBegin+howManyValues, sequenceEnd] inclusively 00102 // the range is also [sequence.begin() + howManyValues, sequence.end()] 00103 while (*(it-1) >= meanValue && (it-1) >= sequenceBegin) 00104 it--; 00105 } 00106 else 00107 { 00108 sequenceBegin = sequence.end() - 1; 00109 sequenceEnd = sequence.begin() - 1; 00110 00111 for (it = sequenceBegin; it > sequenceBegin - howManyValues; it--) 00112 sumOfWindow += *it; 00113 00114 for (; it > sequenceEnd; it--) 00115 { 00116 if (sumOfWindow > windowThreshold) 00117 break; 00118 sumOfWindow += *it; 00119 sumOfWindow -= *(it + howManyValues); 00120 } 00121 00122 // here it is in the range of [sequenceEnd, sequenceBegin - howManyValues] inclusively 00123 // the range is also [sequence.begin() -1, sequence.end() - 1 - howManyValues] 00124 while (*(it+1) >= meanValue && (it+1) <= sequenceBegin) 00125 it ++; 00126 // note, the return value should in the range [sequence.begin(), sequence.end()] 00127 it += 1; 00128 } 00129 00130 // 'it' may be sequence.end() in some cases 00131 assert(it >= sequence.begin() && it <= sequence.end()); 00132 00133 return it; 00134 } 00135 00136 #endif