libStatGen Software
1
|
00001 /* 00002 * Copyright (C) 2010 Regents of the University of Michigan 00003 * 00004 * This program is free software: you can redistribute it and/or modify 00005 * it under the terms of the GNU General Public License as published by 00006 * the Free Software Foundation, either version 3 of the License, or 00007 * (at your option) any later version. 00008 * 00009 * This program is distributed in the hope that it will be useful, 00010 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 * GNU General Public License for more details. 00013 * 00014 * You should have received a copy of the GNU General Public License 00015 * along with this program. If not, see <http://www.gnu.org/licenses/>. 00016 */ 00017 00018 #include "PedigreeAlleleFreq.h" 00019 #include "QuickIndex.h" 00020 #include "Error.h" 00021 00022 #include <math.h> 00023 00024 int CountAlleles(Pedigree & /* ped */, int marker) 00025 { 00026 // With automatic recoding in the pedigree loader there 00027 // is no need to iterate through the pedigree ... 00028 MarkerInfo * info = Pedigree::GetMarkerInfo(marker); 00029 00030 return info->CountAlleles(); 00031 } 00032 00033 void LumpAlleles(Pedigree & ped, int marker, double threshold, bool reorder) 00034 { 00035 // find out how many alleles there are 00036 int alleles = ped.CountAlleles(marker); 00037 00038 if (alleles < 2) return; 00039 00040 MarkerInfo * info = PedigreeGlobals::GetMarkerInfo(marker); 00041 00042 if (alleles < info->freq.Length()) 00043 alleles = info->freq.Length() - 1; 00044 00045 IntArray counts(alleles + 1); 00046 counts.Zero(); 00047 00048 // Count number of occurrences for each allele 00049 for (int i = 0; i < ped.count; i++) 00050 { 00051 counts[int(ped[i].markers[marker][0])]++; 00052 counts[int(ped[i].markers[marker][1])]++; 00053 } 00054 00055 // Calculate treshold for lumping alleles 00056 int total = 0; 00057 for (int i = 1; i <= alleles; i++) 00058 total += counts[i]; 00059 int thresh = int(total * threshold); 00060 00061 // If threshold is set at zero, we artificially increase 00062 // counts for alleles that do not appear in the pedigree 00063 // but whose frequencies are set > 0.0. This ensures that 00064 // allele frequency data does not get discarded when simply 00065 // recoding alleles (vs. lumping) 00066 if (thresh == 0) 00067 for (int i = 1; i < info->freq.Length(); i++) 00068 if (counts[i] == 0 && info->freq[i] > 0.0) 00069 counts[i] = 1, total++; 00070 00071 // If allele reordering is disabled, put in dummy allele 00072 // counts so as to ensure that allele have desired ordering 00073 if (!reorder) 00074 { 00075 QuickIndex index(info->alleleLabels); 00076 index.Reverse(); 00077 00078 for (int i = 0; i < index.Length(); i++) 00079 counts[index[i]] = i + 1; 00080 00081 total = counts.Sum(1, counts.Length() - 1); 00082 } 00083 00084 // Order all alleles according to their frequency 00085 // Zero, which corresponds to missing values, stays put! 00086 counts[0] = total + 1; 00087 QuickIndex index(counts); 00088 index.Reverse(); 00089 00090 // recode alleles 00091 // all alleles where frequency < thresh are labelled N 00092 // use counts array to keep track of labels 00093 int N = 0; 00094 bool rare = false; 00095 for (int i = 0; i <= alleles; i++) 00096 if (counts[index[i]] > thresh) 00097 { 00098 counts[index[i]] = i; 00099 N++; 00100 } 00101 else 00102 { 00103 if (counts[index[i]] > 0) 00104 rare = true; 00105 counts[index[i]] = N; 00106 } 00107 00108 // This loop does the recoding 00109 for (int i = 0; i < ped.count; i++) 00110 { 00111 Alleles & current = ped[i].markers[marker]; 00112 current[0] = counts[current[0]]; 00113 current[1] = counts[current[1]]; 00114 } 00115 00116 StringArray oldLabels(info->alleleLabels); 00117 String label; 00118 00119 info->alleleLabels.Clear(); 00120 info->alleleNumbers.Clear(); 00121 00122 for (int i = 0; i < N; i++) 00123 { 00124 if (oldLabels.Length() <= index[i]) 00125 info->alleleLabels.Push(label = index[i]); 00126 else 00127 info->alleleLabels.Push(oldLabels[index[i]]); 00128 00129 if (i) info->alleleNumbers.SetInteger(info->alleleLabels.Last(), i); 00130 } 00131 00132 // Reorder allele frequencies if necessary 00133 if (info->freq.Length()) 00134 { 00135 Vector freq(info->freq); 00136 00137 info->freq.Dimension(N); 00138 info->freq[0] = 0.0; 00139 00140 for (int i = 1; i < N; i++) 00141 { 00142 info->freq[i] = freq[index[i]]; 00143 freq[index[i]] = 0; 00144 } 00145 00146 if ((1.0 - info->freq.Sum()) > 1e-10) 00147 rare = true; 00148 00149 if (rare) 00150 { 00151 info->freq.Dimension(N + 1); 00152 info->freq[N] = 1.0 - info->freq.Sum(); 00153 } 00154 } 00155 00156 if (rare) 00157 { 00158 info->alleleLabels.Push("OTHER"); 00159 info->alleleNumbers.SetInteger("OTHER", info->alleleLabels.Length()); 00160 } 00161 } 00162 00163 bool EstimateFrequencies(Pedigree & ped, int marker, int estimator) 00164 { 00165 int alleleCount = CountAlleles(ped, marker); 00166 00167 IntArray founder(alleleCount + 1); 00168 IntArray all(alleleCount + 1); 00169 00170 founder.Zero(); 00171 all.Zero(); 00172 00173 for (int i = 0; i < ped.count; i++) 00174 { 00175 // When counting alleles, note that males only carry one X chromosome 00176 // and are arbitrarily scored as homozygous. 00177 all[ped[i].markers[marker][0]]++; 00178 if (!ped.chromosomeX || ped[i].sex != SEX_MALE) 00179 all[ped[i].markers[marker][1]]++; 00180 if (!ped[i].isFounder()) continue; 00181 founder[ped[i].markers[marker][0]]++; 00182 if (!ped.chromosomeX || ped[i].sex != SEX_MALE) 00183 founder[ped[i].markers[marker][1]]++; 00184 } 00185 00186 MarkerInfo * info = ped.GetMarkerInfo(marker); 00187 00188 if (info->freq.dim > 0) 00189 { 00190 // previous allele frequency information is available 00191 if (alleleCount >= info->freq.dim) 00192 error("For marker %s, input files define %d alleles, but at least\n" 00193 "one other allele (named '%s') occurs in the pedigree\n", 00194 (const char *) info->name, info->freq.dim - 1, 00195 (const char *) info->GetAlleleLabel(alleleCount)); 00196 00197 for (int i = 1; i <= alleleCount; i++) 00198 if (all[i] > 0 && info->freq[i] <= 0.0) 00199 error("Although allele %s for marker %s has frequency zero,\n" 00200 "it occurs %d times in the pedigree", 00201 (const char *) info->GetAlleleLabel(i), (const char *) info->name, all[i]); 00202 00203 return false; 00204 } 00205 else 00206 { 00207 if (alleleCount < 1) 00208 { 00209 // If no one is genotyped, default to two equifrequent allele 00210 // since some programs do not like monomorphic markers 00211 info->freq.Dimension(3); 00212 info->freq[0] = 0.0; 00213 info->freq[1] = 0.99999; 00214 info->freq[2] = 0.00001; 00215 return true; 00216 } 00217 00218 info->freq.Dimension(alleleCount + 1); 00219 info->freq.Zero(); 00220 00221 if (estimator == FREQ_FOUNDERS && founder.Sum() > founder[0]) 00222 { 00223 // Make sure the frequency of alleles occuring in the pedigree 00224 // is never zero 00225 for (int i = 1; i <= alleleCount; i++) 00226 if (founder[i] == 0 && all[i] > 0) 00227 founder[i] = 1; 00228 00229 // To get frequencies, just multiply counts by 1 / total_counts 00230 double factor = 1.0 / (founder.Sum() - founder[0]); 00231 00232 for (int i = 1; i <= alleleCount; i++) 00233 info->freq[i] = founder[i] * factor; 00234 } 00235 else if (estimator == FREQ_ALL || estimator == FREQ_FOUNDERS) 00236 { 00237 // To get frequencies, just multiply counts by 1 / total_counts 00238 double factor = 1.0 / (all.Sum() - all[0]); 00239 00240 for (int i = 1; i <= alleleCount; i++) 00241 info->freq[i] = all[i] * factor; 00242 } 00243 else if (estimator == FREQ_EQUAL) 00244 // Assume all alleles have equal frequency 00245 { 00246 // Count the number of observed alleles 00247 all[0] = 0; 00248 int alleles = all.CountIfGreater(0); 00249 double freq = 1.0 / alleles; 00250 00251 // Set equal frequencies for all occuring alleles 00252 for (int i = 0; i <= alleleCount; i++) 00253 info->freq[i] = all[i] ? freq : 0.0; 00254 } 00255 } 00256 00257 return true; 00258 } 00259