libStatGen Software  1
PedigreeAlleleFreq.cpp
00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #include "PedigreeAlleleFreq.h"
00019 #include "QuickIndex.h"
00020 #include "Error.h"
00021 
00022 #include <math.h>
00023 
00024 int CountAlleles(Pedigree & /* ped */, int marker)
00025 {
00026     // With automatic recoding in the pedigree loader there
00027     // is no need to iterate through the pedigree ...
00028     MarkerInfo * info = Pedigree::GetMarkerInfo(marker);
00029 
00030     return info->CountAlleles();
00031 }
00032 
00033 void LumpAlleles(Pedigree & ped, int marker, double threshold, bool reorder)
00034 {
00035     // find out how many alleles there are
00036     int alleles = ped.CountAlleles(marker);
00037 
00038     if (alleles < 2) return;
00039 
00040     MarkerInfo * info = PedigreeGlobals::GetMarkerInfo(marker);
00041 
00042     if (alleles < info->freq.Length())
00043         alleles = info->freq.Length() - 1;
00044 
00045     IntArray counts(alleles + 1);
00046     counts.Zero();
00047 
00048     // Count number of occurrences for each allele
00049     for (int i = 0; i < ped.count; i++)
00050     {
00051         counts[int(ped[i].markers[marker][0])]++;
00052         counts[int(ped[i].markers[marker][1])]++;
00053     }
00054 
00055     // Calculate treshold for lumping alleles
00056     int total = 0;
00057     for (int i = 1; i <= alleles; i++)
00058         total += counts[i];
00059     int thresh = int(total * threshold);
00060 
00061     // If threshold is set at zero, we artificially increase
00062     // counts for alleles that do not appear in the pedigree
00063     // but whose frequencies are set > 0.0. This ensures that
00064     // allele frequency data does not get discarded when simply
00065     // recoding alleles (vs. lumping)
00066     if (thresh == 0)
00067         for (int i = 1; i < info->freq.Length(); i++)
00068             if (counts[i] == 0 && info->freq[i] > 0.0)
00069                 counts[i] = 1, total++;
00070 
00071     // If allele reordering is disabled, put in dummy allele
00072     // counts so as to ensure that allele have desired ordering
00073     if (!reorder)
00074     {
00075         QuickIndex index(info->alleleLabels);
00076         index.Reverse();
00077 
00078         for (int i = 0; i < index.Length(); i++)
00079             counts[index[i]] = i + 1;
00080 
00081         total = counts.Sum(1, counts.Length() - 1);
00082     }
00083 
00084     // Order all alleles according to their frequency
00085     // Zero, which corresponds to missing values, stays put!
00086     counts[0] = total + 1;
00087     QuickIndex index(counts);
00088     index.Reverse();
00089 
00090     // recode alleles
00091     // all alleles where frequency < thresh are labelled N
00092     // use counts array to keep track of labels
00093     int  N = 0;
00094     bool rare = false;
00095     for (int i = 0; i <= alleles; i++)
00096         if (counts[index[i]] > thresh)
00097         {
00098             counts[index[i]] = i;
00099             N++;
00100         }
00101         else
00102         {
00103             if (counts[index[i]] > 0)
00104                 rare = true;
00105             counts[index[i]] = N;
00106         }
00107 
00108     // This loop does the recoding
00109     for (int i = 0; i < ped.count; i++)
00110     {
00111         Alleles & current = ped[i].markers[marker];
00112         current[0] = counts[current[0]];
00113         current[1] = counts[current[1]];
00114     }
00115 
00116     StringArray  oldLabels(info->alleleLabels);
00117     String       label;
00118 
00119     info->alleleLabels.Clear();
00120     info->alleleNumbers.Clear();
00121 
00122     for (int i = 0; i < N; i++)
00123     {
00124         if (oldLabels.Length() <= index[i])
00125             info->alleleLabels.Push(label = index[i]);
00126         else
00127             info->alleleLabels.Push(oldLabels[index[i]]);
00128 
00129         if (i) info->alleleNumbers.SetInteger(info->alleleLabels.Last(), i);
00130     }
00131 
00132     // Reorder allele frequencies if necessary
00133     if (info->freq.Length())
00134     {
00135         Vector freq(info->freq);
00136 
00137         info->freq.Dimension(N);
00138         info->freq[0] = 0.0;
00139 
00140         for (int i = 1; i < N; i++)
00141         {
00142             info->freq[i]  = freq[index[i]];
00143             freq[index[i]] = 0;
00144         }
00145 
00146         if ((1.0 - info->freq.Sum()) > 1e-10)
00147             rare = true;
00148 
00149         if (rare)
00150         {
00151             info->freq.Dimension(N + 1);
00152             info->freq[N] = 1.0 - info->freq.Sum();
00153         }
00154     }
00155 
00156     if (rare)
00157     {
00158         info->alleleLabels.Push("OTHER");
00159         info->alleleNumbers.SetInteger("OTHER", info->alleleLabels.Length());
00160     }
00161 }
00162 
00163 bool EstimateFrequencies(Pedigree & ped, int marker, int estimator)
00164 {
00165     int alleleCount = CountAlleles(ped, marker);
00166 
00167     IntArray founder(alleleCount + 1);
00168     IntArray all(alleleCount + 1);
00169 
00170     founder.Zero();
00171     all.Zero();
00172 
00173     for (int i = 0; i < ped.count; i++)
00174     {
00175         // When counting alleles, note that males only carry one X chromosome
00176         // and are arbitrarily scored as homozygous.
00177         all[ped[i].markers[marker][0]]++;
00178         if (!ped.chromosomeX || ped[i].sex != SEX_MALE)
00179             all[ped[i].markers[marker][1]]++;
00180         if (!ped[i].isFounder()) continue;
00181         founder[ped[i].markers[marker][0]]++;
00182         if (!ped.chromosomeX || ped[i].sex != SEX_MALE)
00183             founder[ped[i].markers[marker][1]]++;
00184     }
00185 
00186     MarkerInfo * info = ped.GetMarkerInfo(marker);
00187 
00188     if (info->freq.dim > 0)
00189     {
00190         // previous allele frequency information is available
00191         if (alleleCount >= info->freq.dim)
00192             error("For marker %s, input files define %d alleles, but at least\n"
00193                   "one other allele (named '%s') occurs in the pedigree\n",
00194                   (const char *) info->name, info->freq.dim - 1,
00195                   (const char *) info->GetAlleleLabel(alleleCount));
00196 
00197         for (int i = 1; i <= alleleCount; i++)
00198             if (all[i] > 0 && info->freq[i] <= 0.0)
00199                 error("Although allele %s for marker %s has frequency zero,\n"
00200                       "it occurs %d times in the pedigree",
00201                       (const char *) info->GetAlleleLabel(i), (const char *) info->name, all[i]);
00202 
00203         return false;
00204     }
00205     else
00206     {
00207         if (alleleCount < 1)
00208         {
00209             // If no one is genotyped, default to two equifrequent allele
00210             // since some programs do not like monomorphic markers
00211             info->freq.Dimension(3);
00212             info->freq[0] = 0.0;
00213             info->freq[1] = 0.99999;
00214             info->freq[2] = 0.00001;
00215             return true;
00216         }
00217 
00218         info->freq.Dimension(alleleCount + 1);
00219         info->freq.Zero();
00220 
00221         if (estimator == FREQ_FOUNDERS && founder.Sum() > founder[0])
00222         {
00223             // Make sure the frequency of alleles occuring in the pedigree
00224             // is never zero
00225             for (int i = 1; i <= alleleCount; i++)
00226                 if (founder[i] == 0 && all[i] > 0)
00227                     founder[i] = 1;
00228 
00229             // To get frequencies, just multiply counts by 1 / total_counts
00230             double factor = 1.0 / (founder.Sum() - founder[0]);
00231 
00232             for (int i = 1; i <= alleleCount; i++)
00233                 info->freq[i] = founder[i] * factor;
00234         }
00235         else if (estimator == FREQ_ALL || estimator == FREQ_FOUNDERS)
00236         {
00237             // To get frequencies, just multiply counts by 1 / total_counts
00238             double factor = 1.0 / (all.Sum() - all[0]);
00239 
00240             for (int i = 1; i <= alleleCount; i++)
00241                 info->freq[i] = all[i] * factor;
00242         }
00243         else if (estimator == FREQ_EQUAL)
00244             // Assume all alleles have equal frequency
00245         {
00246             // Count the number of observed alleles
00247             all[0] = 0;
00248             int alleles = all.CountIfGreater(0);
00249             double freq = 1.0 / alleles;
00250 
00251             // Set equal frequencies for all occuring alleles
00252             for (int i = 0; i <= alleleCount; i++)
00253                 info->freq[i] = all[i] ? freq : 0.0;
00254         }
00255     }
00256 
00257     return true;
00258 }
00259 
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends