00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018 #include "Pedigree.h"
00019 #include "FortranFormat.h"
00020 #include "Error.h"
00021
00022 #include <stdlib.h>
00023 #include <ctype.h>
00024 #include <string.h>
00025
00026 void Pedigree::Prepare(IFILE & input)
00027 {
00028 pd.Load(input);
00029 }
00030
00031 void Pedigree::Load(IFILE & input)
00032 {
00033 if (pd.mendelFormat)
00034 {
00035 LoadMendel(input);
00036 return;
00037 }
00038
00039 int sexCovariate = sexAsCovariate ? GetCovariateID("sex") : -1;
00040
00041 int textCols = pd.CountTextColumns() + 5;
00042 int oldCount = count;
00043 bool warn = true;
00044 int line = 0;
00045
00046 String buffer;
00047 StringArray tokens;
00048
00049 while (!ifeof(input))
00050 {
00051 int field = 0;
00052
00053 buffer.ReadLine(input);
00054
00055 tokens.Clear();
00056 tokens.AddTokens(buffer, SEPARATORS);
00057
00058 if (tokens.Length() == 0) continue;
00059 if (tokens[0].SlowCompare("end") == 0) break;
00060
00061 line++;
00062
00063 if (tokens.Length() < textCols)
00064 {
00065 if (buffer.Length() > 79)
00066 {
00067 buffer.SetLength(75);
00068 buffer += " ...";
00069 }
00070
00071 String description;
00072
00073 pd.ColumnSummary(description);
00074 error("Loading Pedigree...\n\n"
00075 "Expecting %d columns (%s),\n"
00076 "but read only %d columns in line %d.\n\n"
00077 "The problem line is transcribed below:\n%s\n",
00078 textCols, (const char *) description,
00079 tokens.Length(), line, (const char *) buffer);
00080 }
00081
00082 if (tokens.Length() > textCols && warn && textCols > 5)
00083 {
00084 pd.ColumnSummary(buffer);
00085 printf("WARNING -- Trailing columns in pedigree file will be ignored\n"
00086 " Expecting %d data columns (%s)\n"
00087 " However line %d, for example, has %d data columns\n\n",
00088 textCols - 5, (const char *) buffer, line, tokens.Length() - 5);
00089 warn = false;
00090 }
00091
00092 Person * p;
00093
00094
00095 if (oldCount==0 || (p = FindPerson(tokens[0], tokens[1], oldCount))==NULL)
00096 {
00097 if (count == size) Grow();
00098
00099 p = persons[count++] = new Person;
00100 }
00101
00102 p->famid = tokens[field++];
00103 p->pid = tokens[field++];
00104 p->fatid = tokens[field++];
00105 p->motid = tokens[field++];
00106
00107 bool failure = false;
00108 p->sex = TranslateSexCode(tokens[field++], failure);
00109 if (failure)
00110 error("Can't interpret the sex of individual #%d\n"
00111 "Family: %s Individual: %s Sex Code: %s", count,
00112 (const char *) p->famid, (const char *) p->pid,
00113 (const char *) tokens[field-1]);
00114
00115 if (sexAsCovariate)
00116 {
00117 if (p->sex)
00118 p->covariates[sexCovariate] = p->sex;
00119 else
00120 p->covariates[sexCovariate] = _NAN_;
00121 }
00122
00123 for (int col = 0; col < pd.columnCount; col++)
00124 switch (pd.columns[col])
00125 {
00126 case pcAffection :
00127 {
00128 int a = pd.columnHash[col];
00129 int new_status;
00130
00131 const char * affection = tokens[field++];
00132
00133 switch (toupper(affection[0]))
00134 {
00135 case '1' :
00136 case 'N' :
00137 case 'U' :
00138 new_status = 1;
00139 break;
00140 case '2' :
00141 case 'D' :
00142 case 'A' :
00143 case 'Y' :
00144 new_status = 2;
00145 break;
00146 default :
00147 new_status = atoi(affection);
00148 if (new_status < 0 || new_status > 2)
00149 error("Incorrect formating for affection status "
00150 "Col %d, Affection %s\n"
00151 "Family: %s Individual: %s Status: %s",
00152 col, (const char *) affectionNames[a],
00153 (const char *) p->famid, (const char *) p->pid,
00154 affection);
00155 }
00156 if (new_status != 0 && p->affections[a] != 0 &&
00157 new_status != p->affections[a])
00158 error("Conflict with previous affection status - "
00159 "Col %d, Affection %s\n"
00160 "Family: %s Individual: %s Old: %d New: %d",
00161 col, (const char *) affectionNames[a],
00162 (const char *) p->famid, (const char *) p->pid,
00163 p->affections[a], new_status);
00164 if (new_status) p->affections[a] = new_status;
00165 break;
00166 }
00167 case pcMarker :
00168 {
00169 int m = pd.columnHash[col];
00170
00171 Alleles new_genotype;
00172
00173 new_genotype[0] = LoadAllele(m, tokens[field++]);
00174 new_genotype[1] = LoadAllele(m, tokens[field++]);
00175
00176 if (p->markers[m].isKnown() && new_genotype.isKnown() &&
00177 new_genotype != p->markers[m])
00178 {
00179 MarkerInfo * info = GetMarkerInfo(m);
00180
00181 error("Conflict with previous genotype - Col %d, Marker %s\n"
00182 "Family: %s Individual: %s Old: %s/%s New: %s/%s",
00183 col, (const char *) markerNames[m],
00184 (const char *) p->famid, (const char *) p->pid,
00185 (const char *) info->GetAlleleLabel(p->markers[m][0]),
00186 (const char *) info->GetAlleleLabel(p->markers[m][1]),
00187 (const char *) info->GetAlleleLabel(new_genotype[0]),
00188 (const char *) info->GetAlleleLabel(new_genotype[1]));
00189 }
00190
00191 if (new_genotype.isKnown()) p->markers[m] = new_genotype;
00192 break;
00193 }
00194 case pcTrait :
00195 case pcUndocumentedTraitCovariate :
00196 {
00197 int t = pd.columnHash[col];
00198 double new_pheno = _NAN_;
00199
00200 if (pd.columns[col] == pcUndocumentedTraitCovariate)
00201 t = t / 32768;
00202
00203 const char * value = tokens[field++];
00204 char * flag = NULL;
00205
00206 if (missing == (const char *) NULL || strcmp(value, missing) != 0)
00207 new_pheno = strtod(value, &flag);
00208 if (flag != NULL && *flag) new_pheno = _NAN_;
00209
00210 if (p->traits[t] != _NAN_ && new_pheno != _NAN_ &&
00211 new_pheno != p->traits[t])
00212 error("Conflict with previous phenotype - Col %d, Trait %s\n"
00213 "Family: %s Individual: %s Old: %f New: %f",
00214 col, (const char *) traitNames[t],
00215 (const char *) p->famid, (const char *) p->pid,
00216 p->traits[t], new_pheno);
00217
00218 if (new_pheno != _NAN_) p->traits[t] = new_pheno;
00219 if (pd.columns[col] == pcTrait) break;
00220 }
00221 case pcCovariate :
00222 {
00223 int c = pd.columnHash[col];
00224 double new_covar = _NAN_;
00225
00226 if (pd.columns[col] == pcUndocumentedTraitCovariate)
00227 {
00228 c = c % 32768;
00229 field--;
00230 }
00231
00232 const char * value = tokens[field++];
00233 char * flag = NULL;
00234
00235 if (missing == (const char *) NULL || strcmp(value, missing) != 0)
00236 new_covar = strtod(value, &flag);
00237 if (flag != NULL && *flag) new_covar = _NAN_;
00238
00239 if (p->covariates[c] != _NAN_ && new_covar != _NAN_ &&
00240 new_covar != p->covariates[c])
00241 error("Conflict with previous value - Col %d, Covariate %s\n"
00242 "Family: %s Individual: %s Old: %f New: %f",
00243 col, (const char *) covariateNames[c],
00244 (const char *) p->famid, (const char *) p->pid,
00245 p->covariates[c], new_covar);
00246
00247 if (new_covar != _NAN_) p->covariates[c] = new_covar;
00248 break;
00249 }
00250 case pcString :
00251 {
00252 int c = pd.columnHash[col];
00253
00254 if (!p->strings[c].IsEmpty() && p->strings[c] != tokens[field])
00255 error("Conflict with previous value - Col %d, String %s\n"
00256 "Family: %s Individual: %s Old: %s New: %s",
00257 col, (const char *) stringNames[c],
00258 (const char *) p->famid, (const char *) p->pid,
00259 (const char *) p->strings[c], (const char *) tokens[field]);
00260
00261 p->strings[c] = tokens[field++];
00262
00263 break;
00264 }
00265 case pcSkip :
00266 field++;
00267 break;
00268 case pcZygosity :
00269 {
00270 int new_zygosity;
00271
00272 const char * zygosity = tokens[field++];
00273
00274 switch (zygosity[0])
00275 {
00276 case 'D' :
00277 case 'd' :
00278 new_zygosity = 2;
00279 break;
00280 case 'M' :
00281 case 'm' :
00282 new_zygosity = 1;
00283 break;
00284 default :
00285 new_zygosity = atoi(zygosity);
00286 }
00287 if (p->zygosity != 0 && new_zygosity != p->zygosity)
00288 error("Conflict with previous zygosity - "
00289 "Column %d in pedigree\n"
00290 "Family: %s Individual: %s Old: %d New: %d\n",
00291 col, (const char *) p->famid, (const char *) p->pid,
00292 p->zygosity, new_zygosity);
00293 p->zygosity = new_zygosity;
00294 break;
00295 }
00296 case pcEnd :
00297 break;
00298 default :
00299 error("Inconsistent Pedigree Description -- Internal Error");
00300 }
00301 }
00302
00303 Sort();
00304 }
00305
00306 void Pedigree::LoadMendel(IFILE & input)
00307 {
00308
00309 String familyHeader;
00310 String individualRecord;
00311
00312 familyHeader.ReadLine(input);
00313 individualRecord.ReadLine(input);
00314
00315
00316
00317
00318 FortranFormat headers, records;
00319
00320 headers.SetInputFile(input);
00321 headers.SetFormat(familyHeader);
00322
00323 records.SetInputFile(input);
00324 records.SetFormat(individualRecord);
00325
00326
00327 String famid;
00328 String phenotype;
00329 String affectionCode;
00330 String affectionStem;
00331 int familySize;
00332
00333 String allele1, allele2;
00334
00335 int sexCovariate = sexAsCovariate ? GetCovariateID("sex") : -1;
00336
00337 while (!ifeof(input))
00338 {
00339 if (count == size)
00340 Grow();
00341
00342
00343 familySize = headers.GetNextInteger();
00344 headers.GetNextField(famid);
00345 headers.Flush();
00346
00347 if (famid.IsEmpty())
00348 {
00349 if (ifeof(input) && familySize == 0)
00350 break;
00351 else
00352 error("Blank family id encountered\n");
00353 }
00354
00355
00356 for (int i = 0; i < familySize; i++)
00357 {
00358 Person * p = persons[count++] = new Person;
00359
00360
00361 p->famid = famid;
00362 records.GetNextField(p->pid);
00363 records.GetNextField(p->fatid);
00364 records.GetNextField(p->motid);
00365
00366 if (p->pid.IsEmpty())
00367 error("No unique identifier for individual #%d in family %s\n",
00368 i + 1, (const char *) famid);
00369
00370 if (p->pid.Compare(".") == 0)
00371 error("Family %s has an individual named '.', but this code is\n"
00372 "reserved to indicate missing parents\n");
00373
00374 if (p->fatid.IsEmpty()) p->fatid = ".";
00375 if (p->motid.IsEmpty()) p->motid = ".";
00376
00377
00378 char sex = records.GetNextCharacter();
00379
00380 switch (sex)
00381 {
00382 case '0' :
00383 case 'x' :
00384 case 'X' :
00385 case '?' :
00386 case 0 :
00387 p->sex = 0;
00388 break;
00389 case '1' :
00390 case 'm' :
00391 case 'M' :
00392 p->sex = 1;
00393 break;
00394 case '2' :
00395 case 'f' :
00396 case 'F' :
00397 p->sex = 2;
00398 break;
00399 default :
00400 error("Can't interpret the sex of individual #%d\n"
00401 "Family: %s Individual: %s Sex Code: %s", count,
00402 (const char *) p->famid, (const char *) p->pid, sex);
00403 };
00404
00405 if (sexAsCovariate)
00406 {
00407 if (p->sex)
00408 p->covariates[sexCovariate] = p->sex;
00409 else
00410 p->covariates[sexCovariate] = _NAN_;
00411 }
00412
00413
00414 char zygosity = records.GetNextCharacter();
00415
00416
00417
00418 if (zygosity)
00419 p->zygosity = (zygosity - ' ') * 2 - 1;
00420
00421 affectionStem.Clear();
00422 for (int col = 0; col < pd.columnCount; col++)
00423 switch (pd.columns[col])
00424 {
00425 case pcAffection :
00426 {
00427 int a = pd.columnHash[col];
00428
00429
00430
00431
00432
00433 if (affectionStem.Length() == 0 ||
00434 affectionNames[a].CompareToStem(affectionStem) != 0)
00435 {
00436 affectionStem.Copy(affectionNames[a], 0, affectionNames[a].FindChar('>') + 1);
00437 records.GetNextField(phenotype);
00438 affectionCode = affectionStem + phenotype;
00439 }
00440
00441
00442 if (phenotype.IsEmpty())
00443 p->affections[a] = 0;
00444 else
00445 p->affections[a] = affectionCode.Compare(affectionNames[a]) == 0 ? 2 : 1;
00446
00447 break;
00448 }
00449 case pcMarker :
00450 {
00451 int m = pd.columnHash[col];
00452
00453 records.GetNextField(phenotype);
00454
00455 if (phenotype.IsEmpty())
00456 {
00457 p->markers[m].one = p->markers[m].two = 0;
00458 continue;
00459 }
00460
00461 int separator = phenotype.FindChar('/');
00462 if (separator == -1) separator = phenotype.FindChar('|');
00463
00464 if (separator == -1)
00465 error("At marker %s, person %s in family %s has genotype %s.\n"
00466 "This genotype is not in the 'al1/al2' format.\n",
00467 (const char *) markerNames[m],
00468 (const char *) p->pid,
00469 (const char *) p->famid,
00470 (const char *) phenotype);
00471
00472 allele1.Copy(phenotype, 0, separator);
00473 allele1.Trim();
00474
00475 allele2.Copy(phenotype, separator + 1, 8);
00476 allele2.Trim();
00477
00478 MarkerInfo * info = GetMarkerInfo(m);
00479
00480 int one = info->alleleNumbers.Integer(allele1);
00481
00482 if (one < 0)
00483 {
00484 if (info->freq.Length() == 0)
00485 one = info->NewAllele(allele1);
00486 else
00487 error("At marker %s, person %s in family %s has genotype %s.\n"
00488 "However, '%s' is not a valid allele for this marker.\n",
00489 (const char *) markerNames[m],
00490 (const char *) p->pid,
00491 (const char *) p->famid,
00492 (const char *) phenotype,
00493 (const char *) allele1);
00494 }
00495
00496 int two = info->alleleNumbers.Integer(allele2);
00497
00498 if (two < 0)
00499 {
00500 if (info->freq.Length() == 0)
00501 two = info->NewAllele(allele2);
00502 else
00503 error("At marker %s, person %s in family %s has genotype %s.\n"
00504 "However, '%s' is not a valid allele for this marker.\n",
00505 (const char *) markerNames[m],
00506 (const char *) p->pid,
00507 (const char *) p->famid,
00508 (const char *) phenotype,
00509 (const char *) allele2);
00510 }
00511
00512 p->markers[m].one = one;
00513 p->markers[m].two = two;
00514 break;
00515 }
00516 case pcEnd :
00517 break;
00518 case pcTrait :
00519 case pcCovariate :
00520 case pcSkip :
00521 case pcZygosity :
00522 default:
00523 error("Inconsistent Pedigree Description -- Internal Error");
00524 }
00525
00526 records.Flush();
00527 }
00528 }
00529
00530 Sort();
00531 }
00532
00533 void Pedigree::Prepare(const char * filename)
00534 {
00535
00536 if (multiPd != NULL)
00537 delete [] multiPd;
00538
00539 multiFileCount = 1;
00540
00541
00542 StringArray filenames;
00543
00544 filenames.AddColumns(filename, ',');
00545
00546 if (filenames.Length() <= 1)
00547 pd.Load(filename);
00548 else
00549 {
00550 printf("AUTOMATIC MERGE ENABLED: Detected multiple datafile names, separated by commas...\n");
00551
00552 multiPd = new PedigreeDescription[filenames.Length()];
00553
00554 for (int i = 0; i < filenames.Length(); i++)
00555 {
00556 printf(" AUTOMATIC MERGE: Reading data file '%s' ...\n", (const char *) filenames[i]);
00557 multiPd[i].Load(filenames[i], false);
00558 }
00559
00560 multiFileCount = filenames.Length();
00561 }
00562 }
00563
00564 void Pedigree::Load(const char * filename, bool allowFailures)
00565 {
00566 if (multiFileCount <= 1)
00567 {
00568 IFILE f = ifopen(filename, "rb");
00569
00570 if (f == NULL && allowFailures)
00571 return;
00572
00573 if (f == NULL)
00574 error(
00575 "The pedigree file %s cannot be opened\n\n"
00576 "Common causes for this problem are:\n"
00577 " * You might not have used the correct options to specify input file names,\n"
00578 " please check the program documentation for information on how to do this\n\n"
00579 " * The file doesn't exist or the filename might have been misspelt\n\n"
00580 " * The file exists but it is being used by another program which you will need\n"
00581 " to close\n\n"
00582 " * The file is larger than 2GB and you haven't compiled this application with\n"
00583 " large file support.\n\n",
00584 filename);
00585
00586 Load(f);
00587 ifclose(f);
00588 }
00589 else
00590 {
00591 StringArray filenames;
00592
00593 filenames.AddColumns(filename, ',');
00594
00595 if (filenames.Length() != multiFileCount)
00596 error("Different numbers of comma separated data and pedigree file names provided\n");
00597
00598 for (int i = 0; i < filenames.Length(); i++)
00599 {
00600 printf(" AUTOMATIC MERGE: Datafile '%s' matched to pedigree '%s' ...\n",
00601 (const char *) multiPd[i].filename, (const char *) filenames[i]);
00602
00603 pd = multiPd[i];
00604
00605 IFILE f = ifopen(filenames[i], "rb");
00606
00607 if (f == NULL)
00608 error("The pedigree file '%s' cannot be opened\n\n",
00609 (const char *) filenames[i]);
00610
00611 Load(f);
00612 ifclose(f);
00613 }
00614
00615 printf("\n");
00616 }
00617 }
00618
00619 int Pedigree::TranslateSexCode(const char * code, bool & failure)
00620 {
00621 failure = false;
00622
00623 switch (code[0])
00624 {
00625 case 'x' :
00626 case 'X' :
00627 case '?' :
00628 return 0;
00629 case '1' :
00630 case 'm' :
00631 case 'M' :
00632 return 1;
00633 case '2' :
00634 case 'f' :
00635 case 'F' :
00636 return 2;
00637 default :
00638 {
00639 bool result = atoi(code);
00640
00641 if (result != 0 && result != 1 && result != 2)
00642 {
00643 failure = true;
00644 result = 0;
00645 }
00646
00647 return result;
00648 }
00649 };
00650 }