libStatGen Software
1
|
00001 /* 00002 * Copyright (C) 2010 Regents of the University of Michigan 00003 * 00004 * This program is free software: you can redistribute it and/or modify 00005 * it under the terms of the GNU General Public License as published by 00006 * the Free Software Foundation, either version 3 of the License, or 00007 * (at your option) any later version. 00008 * 00009 * This program is distributed in the hope that it will be useful, 00010 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 * GNU General Public License for more details. 00013 * 00014 * You should have received a copy of the GNU General Public License 00015 * along with this program. If not, see <http://www.gnu.org/licenses/>. 00016 */ 00017 00018 #ifndef _TRIMSEQUENCE_H 00019 #define _TRIMSEQUENCE_H 00020 00021 #include <assert.h> 00022 #include <stdint.h> 00023 #include <stdlib.h> 00024 00025 #ifndef __WIN32__ 00026 #include <unistd.h> 00027 #endif 00028 00029 /// 00030 /// TrimSequence is a templated function to find bases 00031 /// which are below a certain moving mean threshold, 00032 /// and can be applied to either end of the sequence string. 00033 /// 00034 /// @param sequence is the input sequence 00035 /// @param meanValue is the value below which we wish to trim. 00036 /// @return the iterator of the location at which untrimmed values begin 00037 /// 00038 /// Details: 00039 /// 00040 /// trimFromLeft is a bool indicating which direction we wish 00041 /// to trim. true -> left to right, false is right to left. 00042 /// 00043 /// The code is convoluted enough here, so for implementation 00044 /// and testing sanity, the following definitions are made: 00045 /// 00046 /// When trimFromLeft is true: 00047 /// result == sequence.begin() implies no trimming 00048 /// result == sequence.end() implies all values are trimmed 00049 /// 00050 /// When trimFromLeft is false: 00051 /// result == sequence.begin() implies all values are trimmed 00052 /// result == sequence.end() no values are trimmed 00053 /// 00054 /// result will always be in the range [sequence.begin() , sequence.end()) 00055 /// (begin is inclusive, end is exclusive). 00056 /// 00057 /// NOTE: See TrimSequence.h and test/TrimSequence_test.cpp for examples 00058 /// 00059 /// THIS CODE IS EXCEPTIONALLY FRAGILE. DO NOT ATTEMPT TO FIX OR 00060 /// IMPROVE WITHOUT INCLUDING DOCUMENTED, UNDERSTANABLE TEST CASES THAT CLEARLY 00061 /// SHOW WHY OR WHY NOT SOMETHING WORKS. 00062 /// 00063 template<typename sequenceType, typename meanValueType> 00064 typename sequenceType::iterator trimSequence(sequenceType &sequence, meanValueType meanValue, const bool trimFromLeft) 00065 { 00066 const int howManyValues = 4; // this is used in signed arithmetic below 00067 int windowThreshold = howManyValues * meanValue; 00068 int64_t sumOfWindow = 0; 00069 typename sequenceType::iterator it; 00070 00071 // 00072 // Sanity check to weed out what otherwise would be 00073 // a large number of boundary checks below. If the input 00074 // is too small, just punt it back to the caller. Technically, 00075 // we can still trim, but we'd just do the simple iteration 00076 // loop. Save that for when we care. 00077 // 00078 00079 if (sequence.size() < (size_t) howManyValues) 00080 return trimFromLeft? sequence.begin() : sequence.end(); 00081 00082 typename sequenceType::iterator sequenceBegin; 00083 typename sequenceType::iterator sequenceEnd; 00084 00085 // The algorithm should be clear and efficient 00086 // so it does not bother to write codes for two directions. 00087 // It that way, we avoid thinking trimming from left and right interchangably. 00088 if (trimFromLeft) 00089 { 00090 // sequenceBegin is inclusive, sequenceEnd is exclusive, 00091 sequenceBegin = sequence.begin(); 00092 sequenceEnd = sequence.end(); 00093 00094 for (it = sequenceBegin; it < sequenceBegin + howManyValues; it++) 00095 sumOfWindow += *it; 00096 00097 for (; it < sequenceEnd; it ++) 00098 { 00099 if (sumOfWindow > windowThreshold) 00100 break; 00101 sumOfWindow += *it; 00102 sumOfWindow -= *(it - howManyValues); 00103 } 00104 // here it is in the range of [sequenceBegin+howManyValues, sequenceEnd] inclusively 00105 // the range is also [sequence.begin() + howManyValues, sequence.end()] 00106 while (*(it-1) >= meanValue && (it-1) >= sequenceBegin) 00107 it--; 00108 } 00109 else 00110 { 00111 sequenceBegin = sequence.end() - 1; 00112 sequenceEnd = sequence.begin() - 1; 00113 00114 for (it = sequenceBegin; it > sequenceBegin - howManyValues; it--) 00115 sumOfWindow += *it; 00116 00117 for (; it > sequenceEnd; it--) 00118 { 00119 if (sumOfWindow > windowThreshold) 00120 break; 00121 sumOfWindow += *it; 00122 sumOfWindow -= *(it + howManyValues); 00123 } 00124 00125 // here it is in the range of [sequenceEnd, sequenceBegin - howManyValues] inclusively 00126 // the range is also [sequence.begin() -1, sequence.end() - 1 - howManyValues] 00127 while (*(it+1) >= meanValue && (it+1) <= sequenceBegin) 00128 it ++; 00129 // note, the return value should in the range [sequence.begin(), sequence.end()] 00130 it += 1; 00131 } 00132 00133 // 'it' may be sequence.end() in some cases 00134 assert(it >= sequence.begin() && it <= sequence.end()); 00135 00136 return it; 00137 } 00138 00139 #endif