PedigreeLoader.cpp

00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #include "Pedigree.h"
00019 #include "FortranFormat.h"
00020 #include "Error.h"
00021 
00022 #include <stdlib.h>
00023 #include <ctype.h>
00024 #include <string.h>
00025 
00026 void Pedigree::Prepare(IFILE & input)
00027 {
00028     pd.Load(input);
00029 }
00030 
00031 void Pedigree::Load(IFILE & input)
00032 {
00033     if (pd.mendelFormat)
00034     {
00035         LoadMendel(input);
00036         return;
00037     }
00038 
00039     int sexCovariate = sexAsCovariate ? GetCovariateID("sex") : -1;
00040 
00041     int textCols = pd.CountTextColumns() + 5;
00042     int oldCount = count;
00043     bool warn    = true;
00044     int line     = 0;
00045 
00046     String      buffer;
00047     StringArray tokens;
00048 
00049     while (!ifeof(input))
00050     {
00051         int field = 0;
00052 
00053         buffer.ReadLine(input);
00054 
00055         tokens.Clear();
00056         tokens.AddTokens(buffer, SEPARATORS);
00057 
00058         if (tokens.Length() == 0) continue;
00059         if (tokens[0].SlowCompare("end") == 0) break;
00060 
00061         line++;
00062 
00063         if (tokens.Length() < textCols)
00064         {
00065             if (buffer.Length() > 79)
00066             {
00067                 buffer.SetLength(75);
00068                 buffer += " ...";
00069             }
00070 
00071             String description;
00072 
00073             pd.ColumnSummary(description);
00074             error("Loading Pedigree...\n\n"
00075                   "Expecting %d columns (%s),\n"
00076                   "but read only %d columns in line %d.\n\n"
00077                   "The problem line is transcribed below:\n%s\n",
00078                   textCols, (const char *) description,
00079                   tokens.Length(), line, (const char  *) buffer);
00080         }
00081 
00082         if (tokens.Length() > textCols && warn && textCols > 5)
00083         {
00084             pd.ColumnSummary(buffer);
00085             printf("WARNING -- Trailing columns in pedigree file will be ignored\n"
00086                    "  Expecting %d data columns (%s)\n"
00087                    "  However line %d, for example, has %d data columns\n\n",
00088                    textCols - 5, (const char *) buffer, line, tokens.Length() - 5);
00089             warn = false;
00090         }
00091 
00092         Person * p;
00093 
00094         // create a new person if necessary
00095         if (oldCount==0 || (p = FindPerson(tokens[0], tokens[1], oldCount))==NULL)
00096         {
00097             if (count == size) Grow();
00098 
00099             p = persons[count++] = new Person;
00100         }
00101 
00102         p->famid = tokens[field++];         // famid
00103         p->pid = tokens[field++];           // pid
00104         p->fatid = tokens[field++];         // fatid
00105         p->motid = tokens[field++];         // motid
00106 
00107         bool failure = false;
00108         p->sex = TranslateSexCode(tokens[field++], failure);
00109         if (failure)
00110             error("Can't interpret the sex of individual #%d\n"
00111                   "Family: %s  Individual: %s  Sex Code: %s", count,
00112                   (const char *) p->famid, (const char *) p->pid,
00113                   (const char *) tokens[field-1]);
00114 
00115         if (sexAsCovariate)
00116         {
00117             if (p->sex)
00118                 p->covariates[sexCovariate] = p->sex;
00119             else
00120                 p->covariates[sexCovariate] = _NAN_;
00121         }
00122 
00123         for (int col = 0; col < pd.columnCount; col++)
00124             switch (pd.columns[col])
00125             {
00126                 case pcAffection :
00127                 {
00128                     int a = pd.columnHash[col];
00129                     int new_status;
00130 
00131                     const char * affection = tokens[field++];
00132 
00133                     switch (toupper(affection[0]))
00134                     {
00135                         case '1' :
00136                         case 'N' :
00137                         case 'U' :
00138                             new_status = 1;
00139                             break;
00140                         case '2' :
00141                         case 'D' :
00142                         case 'A' :
00143                         case 'Y' :
00144                             new_status = 2;
00145                             break;
00146                         default :
00147                             new_status = atoi(affection);
00148                             if (new_status < 0 || new_status > 2)
00149                                 error("Incorrect formating for affection status "
00150                                       "Col %d, Affection %s\n"
00151                                       "Family: %s  Individual: %s  Status: %s",
00152                                       col, (const char *) affectionNames[a],
00153                                       (const char *) p->famid, (const char *) p->pid,
00154                                       affection);
00155                     }
00156                     if (new_status != 0 && p->affections[a] != 0 &&
00157                             new_status != p->affections[a])
00158                         error("Conflict with previous affection status - "
00159                               "Col %d, Affection %s\n"
00160                               "Family: %s  Individual: %s  Old: %d New: %d",
00161                               col, (const char *) affectionNames[a],
00162                               (const char *) p->famid, (const char *) p->pid,
00163                               p->affections[a], new_status);
00164                     if (new_status) p->affections[a] = new_status;
00165                     break;
00166                 }
00167                 case pcMarker :
00168                 {
00169                     int m = pd.columnHash[col];
00170 
00171                     Alleles new_genotype;
00172 
00173                     new_genotype[0] = LoadAllele(m, tokens[field++]);
00174                     new_genotype[1] = LoadAllele(m, tokens[field++]);
00175 
00176                     if (p->markers[m].isKnown() && new_genotype.isKnown() &&
00177                             new_genotype != p->markers[m])
00178                     {
00179                         MarkerInfo * info = GetMarkerInfo(m);
00180 
00181                         error("Conflict with previous genotype - Col %d, Marker %s\n"
00182                               "Family: %s  Individual: %s  Old: %s/%s New: %s/%s",
00183                               col, (const char *) markerNames[m],
00184                               (const char *) p->famid, (const char *) p->pid,
00185                               (const char *) info->GetAlleleLabel(p->markers[m][0]),
00186                               (const char *) info->GetAlleleLabel(p->markers[m][1]),
00187                               (const char *) info->GetAlleleLabel(new_genotype[0]),
00188                               (const char *) info->GetAlleleLabel(new_genotype[1]));
00189                     }
00190 
00191                     if (new_genotype.isKnown()) p->markers[m] = new_genotype;
00192                     break;
00193                 }
00194                 case pcTrait :
00195                 case pcUndocumentedTraitCovariate :
00196                 {
00197                     int t = pd.columnHash[col];
00198                     double new_pheno = _NAN_;
00199 
00200                     if (pd.columns[col] == pcUndocumentedTraitCovariate)
00201                         t = t / 32768;
00202 
00203                     const char * value = tokens[field++];
00204                     char * flag = NULL;
00205 
00206                     if (missing == (const char *) NULL || strcmp(value, missing) != 0)
00207                         new_pheno = strtod(value, &flag);
00208                     if (flag != NULL && *flag) new_pheno = _NAN_;
00209 
00210                     if (p->traits[t] != _NAN_ && new_pheno != _NAN_ &&
00211                             new_pheno != p->traits[t])
00212                         error("Conflict with previous phenotype - Col %d, Trait %s\n"
00213                               "Family: %s  Individual: %s  Old: %f New: %f",
00214                               col, (const char *) traitNames[t],
00215                               (const char *) p->famid, (const char *) p->pid,
00216                               p->traits[t], new_pheno);
00217 
00218                     if (new_pheno != _NAN_) p->traits[t] = new_pheno;
00219                     if (pd.columns[col] == pcTrait) break;
00220                 }
00221                 case pcCovariate :
00222                 {
00223                     int c = pd.columnHash[col];
00224                     double new_covar = _NAN_;
00225 
00226                     if (pd.columns[col] == pcUndocumentedTraitCovariate)
00227                     {
00228                         c = c % 32768;
00229                         field--;
00230                     }
00231 
00232                     const char * value = tokens[field++];
00233                     char * flag = NULL;
00234 
00235                     if (missing == (const char *) NULL || strcmp(value, missing) != 0)
00236                         new_covar = strtod(value, &flag);
00237                     if (flag != NULL && *flag) new_covar = _NAN_;
00238 
00239                     if (p->covariates[c] != _NAN_ && new_covar != _NAN_ &&
00240                             new_covar != p->covariates[c])
00241                         error("Conflict with previous value - Col %d, Covariate %s\n"
00242                               "Family: %s  Individual: %s  Old: %f New: %f",
00243                               col, (const char *) covariateNames[c],
00244                               (const char *) p->famid, (const char *) p->pid,
00245                               p->covariates[c], new_covar);
00246 
00247                     if (new_covar != _NAN_) p->covariates[c] = new_covar;
00248                     break;
00249                 }
00250                 case pcSkip :
00251                     field++;
00252                     break;
00253                 case pcZygosity :
00254                 {
00255                     int new_zygosity;
00256 
00257                     const char * zygosity = tokens[field++];
00258 
00259                     switch (zygosity[0])
00260                     {
00261                         case 'D' :
00262                         case 'd' :
00263                             new_zygosity = 2;
00264                             break;
00265                         case 'M' :
00266                         case 'm' :
00267                             new_zygosity = 1;
00268                             break;
00269                         default :
00270                             new_zygosity = atoi(zygosity);
00271                     }
00272                     if (p->zygosity != 0 && new_zygosity != p->zygosity)
00273                         error("Conflict with previous zygosity - "
00274                               "Column %d in pedigree\n"
00275                               "Family: %s  Individual: %s  Old: %d New: %d\n",
00276                               col, (const char *) p->famid, (const char *) p->pid,
00277                               p->zygosity, new_zygosity);
00278                     p->zygosity = new_zygosity;
00279                     break;
00280                 }
00281                 case pcEnd :
00282                     break;
00283                 default :
00284                     error("Inconsistent Pedigree Description -- Internal Error");
00285             }
00286     }
00287 
00288     Sort();
00289 }
00290 
00291 void Pedigree::LoadMendel(IFILE & input)
00292 {
00293     // First, retrieve the two format statements from file
00294     String familyHeader;
00295     String individualRecord;
00296 
00297     familyHeader.ReadLine(input);
00298     individualRecord.ReadLine(input);
00299 
00300     // Then create two FORTRAN input streams...
00301     // One will be used for retrieving family labels and sizes, the other
00302     // will be used for individual information
00303     FortranFormat headers, records;
00304 
00305     headers.SetInputFile(input);
00306     headers.SetFormat(familyHeader);
00307 
00308     records.SetInputFile(input);
00309     records.SetFormat(individualRecord);
00310 
00311     // Storage for key pieces of information
00312     String famid;
00313     String phenotype;
00314     String affectionCode;
00315     String affectionStem;
00316     int    familySize;
00317 
00318     String allele1, allele2;
00319 
00320     int sexCovariate = sexAsCovariate ? GetCovariateID("sex") : -1;
00321 
00322     while (!ifeof(input))
00323     {
00324         if (count == size)
00325             Grow();
00326 
00327         // Retrieve header for next family
00328         familySize = headers.GetNextInteger();
00329         headers.GetNextField(famid);
00330         headers.Flush();
00331 
00332         if (famid.IsEmpty())
00333         {
00334             if (ifeof(input) && familySize == 0)
00335                 break;
00336             else
00337                 error("Blank family id encountered\n");
00338         }
00339 
00340         // Retrieve each individual in the family
00341         for (int i = 0; i < familySize; i++)
00342         {
00343             Person * p = persons[count++] = new Person;
00344 
00345             // Retrieve basic pedigree structure
00346             p->famid = famid;
00347             records.GetNextField(p->pid);
00348             records.GetNextField(p->fatid);
00349             records.GetNextField(p->motid);
00350 
00351             if (p->pid.IsEmpty())
00352                 error("No unique identifier for individual #%d in family %s\n",
00353                       i + 1, (const char *) famid);
00354 
00355             if (p->pid.Compare(".") == 0)
00356                 error("Family %s has an individual named '.', but this code is\n"
00357                       "reserved to indicate missing parents\n");
00358 
00359             if (p->fatid.IsEmpty()) p->fatid = ".";
00360             if (p->motid.IsEmpty()) p->motid = ".";
00361 
00362             // Retrieve and decode sex code
00363             char sex = records.GetNextCharacter();
00364 
00365             switch (sex)
00366             {
00367                 case '0' :
00368                 case 'x' :
00369                 case 'X' :
00370                 case '?' :
00371                 case 0 :
00372                     p->sex = 0;
00373                     break;
00374                 case '1' :
00375                 case 'm' :
00376                 case 'M' :
00377                     p->sex = 1;
00378                     break;
00379                 case '2' :
00380                 case 'f' :
00381                 case 'F' :
00382                     p->sex = 2;
00383                     break;
00384                 default :
00385                     error("Can't interpret the sex of individual #%d\n"
00386                           "Family: %s  Individual: %s  Sex Code: %s", count,
00387                           (const char *) p->famid, (const char *) p->pid, sex);
00388             };
00389 
00390             if (sexAsCovariate)
00391             {
00392                 if (p->sex)
00393                     p->covariates[sexCovariate] = p->sex;
00394                 else
00395                     p->covariates[sexCovariate] = _NAN_;
00396             }
00397 
00398             // Retrieve and decode zygosity
00399             char zygosity = records.GetNextCharacter();
00400 
00401             // Mendel uses a unique character to indicate each MZ pair,
00402             // we use a unique odd number...
00403             if (zygosity)
00404                 p->zygosity = (zygosity - ' ') * 2 - 1;
00405 
00406             affectionStem.Clear();
00407             for (int col = 0; col < pd.columnCount; col++)
00408                 switch (pd.columns[col])
00409                 {
00410                     case pcAffection :
00411                     {
00412                         int a = pd.columnHash[col];
00413 
00414                         // We expand each Mendel non-codominant trait into multiple
00415                         // affection status column... First, if this is  not a
00416                         // continuation of a previous expansion we first retrieve
00417                         // and encode the affection status.
00418                         if (affectionStem.Length() == 0 ||
00419                                 affectionNames[a].CompareToStem(affectionStem) != 0)
00420                         {
00421                             affectionStem.Copy(affectionNames[a], 0, affectionNames[a].FindChar('>') + 1);
00422                             records.GetNextField(phenotype);
00423                             affectionCode = affectionStem + phenotype;
00424                         }
00425 
00426                         // Then encode each phenotype appropriately
00427                         if (phenotype.IsEmpty())
00428                             p->affections[a] = 0;
00429                         else
00430                             p->affections[a] = affectionCode.Compare(affectionNames[a]) == 0 ? 2 : 1;
00431 
00432                         break;
00433                     }
00434                     case pcMarker :
00435                     {
00436                         int m = pd.columnHash[col];
00437 
00438                         records.GetNextField(phenotype);
00439 
00440                         if (phenotype.IsEmpty())
00441                         {
00442                             p->markers[m].one = p->markers[m].two = 0;
00443                             continue;
00444                         }
00445 
00446                         int separator = phenotype.FindChar('/');
00447                         if (separator == -1) separator = phenotype.FindChar('|');
00448 
00449                         if (separator == -1)
00450                             error("At marker %s, person %s in family %s has genotype %s.\n"
00451                                   "This genotype is not in the 'al1/al2' format.\n",
00452                                   (const char *) markerNames[m],
00453                                   (const char *) p->pid,
00454                                   (const char *) p->famid,
00455                                   (const char *) phenotype);
00456 
00457                         allele1.Copy(phenotype, 0, separator);
00458                         allele1.Trim();
00459 
00460                         allele2.Copy(phenotype, separator + 1, 8);
00461                         allele2.Trim();
00462 
00463                         MarkerInfo * info = GetMarkerInfo(m);
00464 
00465                         int one = info->alleleNumbers.Integer(allele1);
00466 
00467                         if (one < 0)
00468                         {
00469                             if (info->freq.Length() == 0)
00470                                 one = info->NewAllele(allele1);
00471                             else
00472                                 error("At marker %s, person %s in family %s has genotype %s.\n"
00473                                       "However, '%s' is not a valid allele for this marker.\n",
00474                                       (const char *) markerNames[m],
00475                                       (const char *) p->pid,
00476                                       (const char *) p->famid,
00477                                       (const char *) phenotype,
00478                                       (const char *) allele1);
00479                         }
00480 
00481                         int two = info->alleleNumbers.Integer(allele2);
00482 
00483                         if (two < 0)
00484                         {
00485                             if (info->freq.Length() == 0)
00486                                 two = info->NewAllele(allele2);
00487                             else
00488                                 error("At marker %s, person %s in family %s has genotype %s.\n"
00489                                       "However, '%s' is not a valid allele for this marker.\n",
00490                                       (const char *) markerNames[m],
00491                                       (const char *) p->pid,
00492                                       (const char *) p->famid,
00493                                       (const char *) phenotype,
00494                                       (const char *) allele2);
00495                         }
00496 
00497                         p->markers[m].one = one;
00498                         p->markers[m].two = two;
00499                         break;
00500                     }
00501                     case pcEnd :
00502                         break;
00503                     case pcTrait :
00504                     case pcCovariate :
00505                     case pcSkip :
00506                     case pcZygosity :
00507                     default:
00508                         error("Inconsistent Pedigree Description -- Internal Error");
00509                 }
00510 
00511             records.Flush();
00512         }
00513     }
00514 
00515     Sort();
00516 }
00517 
00518 void Pedigree::Prepare(const char * filename)
00519 {
00520     // Clear any previously loaded pedigree description
00521     if (multiPd != NULL)
00522         delete [] multiPd;
00523 
00524     multiFileCount = 1;
00525 
00526     // Enable multifile support
00527     StringArray filenames;
00528 
00529     filenames.AddColumns(filename, ',');
00530 
00531     if (filenames.Length() <= 1)
00532         pd.Load(filename);
00533     else
00534     {
00535         printf("AUTOMATIC MERGE ENABLED: Detected multiple datafile names, separated by commas...\n");
00536 
00537         multiPd = new PedigreeDescription[filenames.Length()];
00538 
00539         for (int i = 0; i < filenames.Length(); i++)
00540         {
00541             printf("  AUTOMATIC MERGE: Reading data file '%s' ...\n", (const char *) filenames[i]);
00542             multiPd[i].Load(filenames[i], false);
00543         }
00544 
00545         multiFileCount = filenames.Length();
00546     }
00547 }
00548 
00549 void Pedigree::Load(const char * filename, bool allowFailures)
00550 {
00551     if (multiFileCount <= 1)
00552     {
00553         IFILE f = ifopen(filename, "rb");
00554 
00555         if (f == NULL && allowFailures)
00556             return;
00557 
00558         if (f == NULL)
00559             error(
00560                 "The pedigree file %s cannot be opened\n\n"
00561                 "Common causes for this problem are:\n"
00562                 "  * You might not have used the correct options to specify input file names,\n"
00563                 "    please check the program documentation for information on how to do this\n\n"
00564                 "  * The file doesn't exist or the filename might have been misspelt\n\n"
00565                 "  * The file exists but it is being used by another program which you will need\n"
00566                 "    to close\n\n"
00567                 "  * The file is larger than 2GB and you haven't compiled this application with\n"
00568                 "    large file support.\n\n",
00569                 filename);
00570 
00571         Load(f);
00572         ifclose(f);
00573     }
00574     else
00575     {
00576         StringArray filenames;
00577 
00578         filenames.AddColumns(filename, ',');
00579 
00580         if (filenames.Length() != multiFileCount)
00581             error("Different numbers of comma separated data and pedigree file names provided\n");
00582 
00583         for (int i = 0; i < filenames.Length(); i++)
00584         {
00585             printf("  AUTOMATIC MERGE: Datafile '%s' matched to pedigree '%s' ...\n",
00586                    (const char *) multiPd[i].filename, (const char *) filenames[i]);
00587 
00588             pd = multiPd[i];
00589 
00590             IFILE f = ifopen(filenames[i], "rb");
00591 
00592             if (f == NULL)
00593                 error("The pedigree file '%s' cannot be opened\n\n",
00594                       (const char *) filenames[i]);
00595 
00596             Load(f);
00597             ifclose(f);
00598         }
00599 
00600         printf("\n");
00601     }
00602 }
00603 
00604 int Pedigree::TranslateSexCode(const char * code, bool & failure)
00605 {
00606     failure = false;
00607 
00608     switch (code[0])
00609     {
00610         case 'x' :
00611         case 'X' :
00612         case '?' :
00613             return 0;
00614         case '1' :
00615         case 'm' :
00616         case 'M' :
00617             return 1;
00618         case '2' :
00619         case 'f' :
00620         case 'F' :
00621             return 2;
00622         default :
00623         {
00624             bool result = atoi(code);
00625 
00626             if (result != 0 && result != 1 && result != 2)
00627             {
00628                 failure = true;
00629                 result = 0;
00630             }
00631 
00632             return result;
00633         }
00634     };
00635 }
00636 
00637 
00638 
Generated on Wed Nov 17 15:38:29 2010 for StatGen Software by  doxygen 1.6.3