libStatGen Software  1
PedigreeLoader.cpp
00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #include "Pedigree.h"
00019 #include "FortranFormat.h"
00020 #include "Error.h"
00021 
00022 #include <stdlib.h>
00023 #include <ctype.h>
00024 #include <string.h>
00025 
00026 void Pedigree::Prepare(IFILE & input)
00027 {
00028     pd.Load(input);
00029 }
00030 
00031 void Pedigree::Load(IFILE & input)
00032 {
00033     if (pd.mendelFormat)
00034     {
00035         LoadMendel(input);
00036         return;
00037     }
00038 
00039     int sexCovariate = sexAsCovariate ? GetCovariateID("sex") : -1;
00040 
00041     int textCols = pd.CountTextColumns() + 5;
00042     int oldCount = count;
00043     bool warn    = true;
00044     int line     = 0;
00045 
00046     String      buffer;
00047     StringArray tokens;
00048 
00049     while (!ifeof(input))
00050     {
00051         int field = 0;
00052 
00053         buffer.ReadLine(input);
00054 
00055         tokens.Clear();
00056         tokens.AddTokens(buffer, WHITESPACE);
00057 
00058         if (tokens.Length() == 0) continue;
00059         if (tokens[0].SlowCompare("end") == 0) break;
00060 
00061         line++;
00062 
00063         if (tokens.Length() < textCols)
00064         {
00065             if (buffer.Length() > 79)
00066             {
00067                 buffer.SetLength(75);
00068                 buffer += " ...";
00069             }
00070 
00071             String description;
00072 
00073             pd.ColumnSummary(description);
00074             error("Loading Pedigree...\n\n"
00075                   "Expecting %d columns (%s),\n"
00076                   "but read only %d columns in line %d.\n\n"
00077                   "The problem line is transcribed below:\n%s\n",
00078                   textCols, (const char *) description,
00079                   tokens.Length(), line, (const char  *) buffer);
00080         }
00081 
00082         if (tokens.Length() > textCols && warn && textCols > 5)
00083         {
00084             pd.ColumnSummary(buffer);
00085             printf("WARNING -- Trailing columns in pedigree file will be ignored\n"
00086                    "  Expecting %d data columns (%s)\n"
00087                    "  However line %d, for example, has %d data columns\n\n",
00088                    textCols - 5, (const char *) buffer, line, tokens.Length() - 5);
00089             warn = false;
00090         }
00091 
00092         Person * p;
00093 
00094         // create a new person if necessary
00095         if (oldCount==0 || (p = FindPerson(tokens[0], tokens[1], oldCount))==NULL)
00096         {
00097             if (count == size) Grow();
00098 
00099             p = persons[count++] = new Person;
00100         }
00101 
00102         p->famid = tokens[field++];         // famid
00103         p->pid = tokens[field++];           // pid
00104         p->fatid = tokens[field++];         // fatid
00105         p->motid = tokens[field++];         // motid
00106 
00107         bool failure = false;
00108         p->sex = TranslateSexCode(tokens[field++], failure);
00109         if (failure)
00110             error("Can't interpret the sex of individual #%d\n"
00111                   "Family: %s  Individual: %s  Sex Code: %s", count,
00112                   (const char *) p->famid, (const char *) p->pid,
00113                   (const char *) tokens[field-1]);
00114 
00115         if (sexAsCovariate)
00116         {
00117             if (p->sex)
00118                 p->covariates[sexCovariate] = p->sex;
00119             else
00120                 p->covariates[sexCovariate] = _NAN_;
00121         }
00122 
00123         for (int col = 0; col < pd.columnCount; col++)
00124             switch (pd.columns[col])
00125             {
00126                 case pcAffection :
00127                 {
00128                     int a = pd.columnHash[col];
00129                     int new_status;
00130 
00131                     const char * affection = tokens[field++];
00132 
00133                     switch (toupper(affection[0]))
00134                     {
00135                         case '1' :
00136                         case 'N' :
00137                         case 'U' :
00138                             new_status = 1;
00139                             break;
00140                         case '2' :
00141                         case 'D' :
00142                         case 'A' :
00143                         case 'Y' :
00144                             new_status = 2;
00145                             break;
00146                         default :
00147                             new_status = atoi(affection);
00148                             if (new_status < 0 || new_status > 2)
00149                                 error("Incorrect formating for affection status "
00150                                       "Col %d, Affection %s\n"
00151                                       "Family: %s  Individual: %s  Status: %s",
00152                                       col, (const char *) affectionNames[a],
00153                                       (const char *) p->famid, (const char *) p->pid,
00154                                       affection);
00155                     }
00156                     if (new_status != 0 && p->affections[a] != 0 &&
00157                             new_status != p->affections[a])
00158                         error("Conflict with previous affection status - "
00159                               "Col %d, Affection %s\n"
00160                               "Family: %s  Individual: %s  Old: %d New: %d",
00161                               col, (const char *) affectionNames[a],
00162                               (const char *) p->famid, (const char *) p->pid,
00163                               p->affections[a], new_status);
00164                     if (new_status) p->affections[a] = new_status;
00165                     break;
00166                 }
00167                 case pcMarker :
00168                 {
00169                     int m = pd.columnHash[col];
00170 
00171                     Alleles new_genotype;
00172 
00173                     new_genotype[0] = LoadAllele(m, tokens[field++]);
00174                     new_genotype[1] = LoadAllele(m, tokens[field++]);
00175 
00176                     if (p->markers[m].isKnown() && new_genotype.isKnown() &&
00177                             new_genotype != p->markers[m])
00178                     {
00179                         MarkerInfo * info = GetMarkerInfo(m);
00180 
00181                         error("Conflict with previous genotype - Col %d, Marker %s\n"
00182                               "Family: %s  Individual: %s  Old: %s/%s New: %s/%s",
00183                               col, (const char *) markerNames[m],
00184                               (const char *) p->famid, (const char *) p->pid,
00185                               (const char *) info->GetAlleleLabel(p->markers[m][0]),
00186                               (const char *) info->GetAlleleLabel(p->markers[m][1]),
00187                               (const char *) info->GetAlleleLabel(new_genotype[0]),
00188                               (const char *) info->GetAlleleLabel(new_genotype[1]));
00189                     }
00190 
00191                     if (new_genotype.isKnown()) p->markers[m] = new_genotype;
00192                     break;
00193                 }
00194                 case pcTrait :
00195                 case pcUndocumentedTraitCovariate :
00196                 {
00197                     int t = pd.columnHash[col];
00198                     double new_pheno = _NAN_;
00199 
00200                     if (pd.columns[col] == pcUndocumentedTraitCovariate)
00201                         t = t / 32768;
00202 
00203                     const char * value = tokens[field++];
00204                     char * flag = NULL;
00205 
00206                     if (missing == (const char *) NULL || strcmp(value, missing) != 0)
00207                         new_pheno = strtod(value, &flag);
00208                     if (flag != NULL && *flag) new_pheno = _NAN_;
00209 
00210                     if (p->traits[t] != _NAN_ && new_pheno != _NAN_ &&
00211                             new_pheno != p->traits[t])
00212                         error("Conflict with previous phenotype - Col %d, Trait %s\n"
00213                               "Family: %s  Individual: %s  Old: %f New: %f",
00214                               col, (const char *) traitNames[t],
00215                               (const char *) p->famid, (const char *) p->pid,
00216                               p->traits[t], new_pheno);
00217 
00218                     if (new_pheno != _NAN_) p->traits[t] = new_pheno;
00219                     if (pd.columns[col] == pcTrait) break;
00220                 }
00221                 case pcCovariate :
00222                 {
00223                     int c = pd.columnHash[col];
00224                     double new_covar = _NAN_;
00225 
00226                     if (pd.columns[col] == pcUndocumentedTraitCovariate)
00227                     {
00228                         c = c % 32768;
00229                         field--;
00230                     }
00231 
00232                     const char * value = tokens[field++];
00233                     char * flag = NULL;
00234 
00235                     if (missing == (const char *) NULL || strcmp(value, missing) != 0)
00236                         new_covar = strtod(value, &flag);
00237                     if (flag != NULL && *flag) new_covar = _NAN_;
00238 
00239                     if (p->covariates[c] != _NAN_ && new_covar != _NAN_ &&
00240                             new_covar != p->covariates[c])
00241                         error("Conflict with previous value - Col %d, Covariate %s\n"
00242                               "Family: %s  Individual: %s  Old: %f New: %f",
00243                               col, (const char *) covariateNames[c],
00244                               (const char *) p->famid, (const char *) p->pid,
00245                               p->covariates[c], new_covar);
00246 
00247                     if (new_covar != _NAN_) p->covariates[c] = new_covar;
00248                     break;
00249                 }
00250                 case pcString :
00251                 {
00252                     int c = pd.columnHash[col];
00253 
00254                     if (!p->strings[c].IsEmpty() && p->strings[c] != tokens[field])
00255                         error("Conflict with previous value - Col %d, String %s\n"
00256                               "Family: %s  Individual: %s  Old: %s New: %s",
00257                               col, (const char *) stringNames[c],
00258                               (const char *) p->famid, (const char *) p->pid,
00259                               (const char *) p->strings[c], (const char *) tokens[field]);
00260 
00261                     p->strings[c] = tokens[field++];
00262 
00263                     break;
00264                 }
00265                 case pcSkip :
00266                     field++;
00267                     break;
00268                 case pcZygosity :
00269                 {
00270                     int new_zygosity;
00271 
00272                     const char * zygosity = tokens[field++];
00273 
00274                     switch (zygosity[0])
00275                     {
00276                         case 'D' :
00277                         case 'd' :
00278                             new_zygosity = 2;
00279                             break;
00280                         case 'M' :
00281                         case 'm' :
00282                             new_zygosity = 1;
00283                             break;
00284                         default :
00285                             new_zygosity = atoi(zygosity);
00286                     }
00287                     if (p->zygosity != 0 && new_zygosity != p->zygosity)
00288                         error("Conflict with previous zygosity - "
00289                               "Column %d in pedigree\n"
00290                               "Family: %s  Individual: %s  Old: %d New: %d\n",
00291                               col, (const char *) p->famid, (const char *) p->pid,
00292                               p->zygosity, new_zygosity);
00293                     p->zygosity = new_zygosity;
00294                     break;
00295                 }
00296                 case pcEnd :
00297                     break;
00298                 default :
00299                     error("Inconsistent Pedigree Description -- Internal Error");
00300             }
00301     }
00302 
00303     Sort();
00304 }
00305 
00306 void Pedigree::LoadMendel(IFILE & input)
00307 {
00308     // First, retrieve the two format statements from file
00309     String familyHeader;
00310     String individualRecord;
00311 
00312     familyHeader.ReadLine(input);
00313     individualRecord.ReadLine(input);
00314 
00315     // Then create two FORTRAN input streams...
00316     // One will be used for retrieving family labels and sizes, the other
00317     // will be used for individual information
00318     FortranFormat headers, records;
00319 
00320     headers.SetInputFile(input);
00321     headers.SetFormat(familyHeader);
00322 
00323     records.SetInputFile(input);
00324     records.SetFormat(individualRecord);
00325 
00326     // Storage for key pieces of information
00327     String famid;
00328     String phenotype;
00329     String affectionCode;
00330     String affectionStem;
00331     int    familySize;
00332 
00333     String allele1, allele2;
00334 
00335     int sexCovariate = sexAsCovariate ? GetCovariateID("sex") : -1;
00336 
00337     while (!ifeof(input))
00338     {
00339         if (count == size)
00340             Grow();
00341 
00342         // Retrieve header for next family
00343         familySize = headers.GetNextInteger();
00344         headers.GetNextField(famid);
00345         headers.Flush();
00346 
00347         if (famid.IsEmpty())
00348         {
00349             if (ifeof(input) && familySize == 0)
00350                 break;
00351             else
00352                 error("Blank family id encountered\n");
00353         }
00354 
00355         // Retrieve each individual in the family
00356         for (int i = 0; i < familySize; i++)
00357         {
00358             Person * p = persons[count++] = new Person;
00359 
00360             // Retrieve basic pedigree structure
00361             p->famid = famid;
00362             records.GetNextField(p->pid);
00363             records.GetNextField(p->fatid);
00364             records.GetNextField(p->motid);
00365 
00366             if (p->pid.IsEmpty())
00367                 error("No unique identifier for individual #%d in family %s\n",
00368                       i + 1, (const char *) famid);
00369 
00370             if (p->pid.Compare(".") == 0)
00371                 error("Family %s has an individual named '.', but this code is\n"
00372                       "reserved to indicate missing parents\n");
00373 
00374             if (p->fatid.IsEmpty()) p->fatid = ".";
00375             if (p->motid.IsEmpty()) p->motid = ".";
00376 
00377             // Retrieve and decode sex code
00378             char sex = records.GetNextCharacter();
00379 
00380             switch (sex)
00381             {
00382                 case '0' :
00383                 case 'x' :
00384                 case 'X' :
00385                 case '?' :
00386                 case 0 :
00387                     p->sex = 0;
00388                     break;
00389                 case '1' :
00390                 case 'm' :
00391                 case 'M' :
00392                     p->sex = 1;
00393                     break;
00394                 case '2' :
00395                 case 'f' :
00396                 case 'F' :
00397                     p->sex = 2;
00398                     break;
00399                 default :
00400                     error("Can't interpret the sex of individual #%d\n"
00401                           "Family: %s  Individual: %s  Sex Code: %s", count,
00402                           (const char *) p->famid, (const char *) p->pid, sex);
00403             };
00404 
00405             if (sexAsCovariate)
00406             {
00407                 if (p->sex)
00408                     p->covariates[sexCovariate] = p->sex;
00409                 else
00410                     p->covariates[sexCovariate] = _NAN_;
00411             }
00412 
00413             // Retrieve and decode zygosity
00414             char zygosity = records.GetNextCharacter();
00415 
00416             // Mendel uses a unique character to indicate each MZ pair,
00417             // we use a unique odd number...
00418             if (zygosity)
00419                 p->zygosity = (zygosity - ' ') * 2 - 1;
00420 
00421             affectionStem.Clear();
00422             for (int col = 0; col < pd.columnCount; col++)
00423                 switch (pd.columns[col])
00424                 {
00425                     case pcAffection :
00426                     {
00427                         int a = pd.columnHash[col];
00428 
00429                         // We expand each Mendel non-codominant trait into multiple
00430                         // affection status column... First, if this is  not a
00431                         // continuation of a previous expansion we first retrieve
00432                         // and encode the affection status.
00433                         if (affectionStem.Length() == 0 ||
00434                                 affectionNames[a].CompareToStem(affectionStem) != 0)
00435                         {
00436                             affectionStem.Copy(affectionNames[a], 0, affectionNames[a].FindChar('>') + 1);
00437                             records.GetNextField(phenotype);
00438                             affectionCode = affectionStem + phenotype;
00439                         }
00440 
00441                         // Then encode each phenotype appropriately
00442                         if (phenotype.IsEmpty())
00443                             p->affections[a] = 0;
00444                         else
00445                             p->affections[a] = affectionCode.Compare(affectionNames[a]) == 0 ? 2 : 1;
00446 
00447                         break;
00448                     }
00449                     case pcMarker :
00450                     {
00451                         int m = pd.columnHash[col];
00452 
00453                         records.GetNextField(phenotype);
00454 
00455                         if (phenotype.IsEmpty())
00456                         {
00457                             p->markers[m].one = p->markers[m].two = 0;
00458                             continue;
00459                         }
00460 
00461                         int separator = phenotype.FindChar('/');
00462                         if (separator == -1) separator = phenotype.FindChar('|');
00463 
00464                         if (separator == -1)
00465                             error("At marker %s, person %s in family %s has genotype %s.\n"
00466                                   "This genotype is not in the 'al1/al2' format.\n",
00467                                   (const char *) markerNames[m],
00468                                   (const char *) p->pid,
00469                                   (const char *) p->famid,
00470                                   (const char *) phenotype);
00471 
00472                         allele1.Copy(phenotype, 0, separator);
00473                         allele1.Trim();
00474 
00475                         allele2.Copy(phenotype, separator + 1, 8);
00476                         allele2.Trim();
00477 
00478                         MarkerInfo * info = GetMarkerInfo(m);
00479 
00480                         int one = info->alleleNumbers.Integer(allele1);
00481 
00482                         if (one < 0)
00483                         {
00484                             if (info->freq.Length() == 0)
00485                                 one = info->NewAllele(allele1);
00486                             else
00487                                 error("At marker %s, person %s in family %s has genotype %s.\n"
00488                                       "However, '%s' is not a valid allele for this marker.\n",
00489                                       (const char *) markerNames[m],
00490                                       (const char *) p->pid,
00491                                       (const char *) p->famid,
00492                                       (const char *) phenotype,
00493                                       (const char *) allele1);
00494                         }
00495 
00496                         int two = info->alleleNumbers.Integer(allele2);
00497 
00498                         if (two < 0)
00499                         {
00500                             if (info->freq.Length() == 0)
00501                                 two = info->NewAllele(allele2);
00502                             else
00503                                 error("At marker %s, person %s in family %s has genotype %s.\n"
00504                                       "However, '%s' is not a valid allele for this marker.\n",
00505                                       (const char *) markerNames[m],
00506                                       (const char *) p->pid,
00507                                       (const char *) p->famid,
00508                                       (const char *) phenotype,
00509                                       (const char *) allele2);
00510                         }
00511 
00512                         p->markers[m].one = one;
00513                         p->markers[m].two = two;
00514                         break;
00515                     }
00516                     case pcEnd :
00517                         break;
00518                     case pcTrait :
00519                     case pcCovariate :
00520                     case pcSkip :
00521                     case pcZygosity :
00522                     default:
00523                         error("Inconsistent Pedigree Description -- Internal Error");
00524                 }
00525 
00526             records.Flush();
00527         }
00528     }
00529 
00530     Sort();
00531 }
00532 
00533 void Pedigree::Prepare(const char * filename)
00534 {
00535     // Clear any previously loaded pedigree description
00536     if (multiPd != NULL)
00537         delete [] multiPd;
00538 
00539     multiFileCount = 1;
00540 
00541     // Enable multifile support
00542     StringArray filenames;
00543 
00544     filenames.AddColumns(filename, ',');
00545 
00546     if (filenames.Length() <= 1)
00547         pd.Load(filename);
00548     else
00549     {
00550         printf("AUTOMATIC MERGE ENABLED: Detected multiple datafile names, separated by commas...\n");
00551 
00552         multiPd = new PedigreeDescription[filenames.Length()];
00553 
00554         for (int i = 0; i < filenames.Length(); i++)
00555         {
00556             printf("  AUTOMATIC MERGE: Reading data file '%s' ...\n", (const char *) filenames[i]);
00557             multiPd[i].Load(filenames[i], false);
00558         }
00559 
00560         multiFileCount = filenames.Length();
00561     }
00562 }
00563 
00564 void Pedigree::Load(const char * filename, bool allowFailures)
00565 {
00566     if (multiFileCount <= 1)
00567     {
00568         IFILE f = ifopen(filename, "rb");
00569 
00570         if (f == NULL && allowFailures)
00571             return;
00572 
00573         if (f == NULL)
00574             error(
00575                 "The pedigree file %s cannot be opened\n\n"
00576                 "Common causes for this problem are:\n"
00577                 "  * You might not have used the correct options to specify input file names,\n"
00578                 "    please check the program documentation for information on how to do this\n\n"
00579                 "  * The file doesn't exist or the filename might have been misspelt\n\n"
00580                 "  * The file exists but it is being used by another program which you will need\n"
00581                 "    to close\n\n"
00582                 "  * The file is larger than 2GB and you haven't compiled this application with\n"
00583                 "    large file support.\n\n",
00584                 filename);
00585 
00586         Load(f);
00587         ifclose(f);
00588     }
00589     else
00590     {
00591         StringArray filenames;
00592 
00593         filenames.AddColumns(filename, ',');
00594 
00595         if (filenames.Length() != multiFileCount)
00596             error("Different numbers of comma separated data and pedigree file names provided\n");
00597 
00598         for (int i = 0; i < filenames.Length(); i++)
00599         {
00600             printf("  AUTOMATIC MERGE: Datafile '%s' matched to pedigree '%s' ...\n",
00601                    (const char *) multiPd[i].filename, (const char *) filenames[i]);
00602 
00603             pd = multiPd[i];
00604 
00605             IFILE f = ifopen(filenames[i], "rb");
00606 
00607             if (f == NULL)
00608                 error("The pedigree file '%s' cannot be opened\n\n",
00609                       (const char *) filenames[i]);
00610 
00611             Load(f);
00612             ifclose(f);
00613         }
00614 
00615         printf("\n");
00616     }
00617 }
00618 
00619 int Pedigree::TranslateSexCode(const char * code, bool & failure)
00620 {
00621     failure = false;
00622 
00623     switch (code[0])
00624     {
00625         case 'x' :
00626         case 'X' :
00627         case '?' :
00628             return 0;
00629         case '1' :
00630         case 'm' :
00631         case 'M' :
00632             return 1;
00633         case '2' :
00634         case 'f' :
00635         case 'F' :
00636             return 2;
00637         default :
00638         {
00639             bool result = atoi(code);
00640 
00641             if (result != 0 && result != 1 && result != 2)
00642             {
00643                 failure = true;
00644                 result = 0;
00645             }
00646 
00647             return result;
00648         }
00649     };
00650 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends