00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018 #include "Pedigree.h"
00019 #include "FortranFormat.h"
00020 #include "Error.h"
00021
00022 #include <stdlib.h>
00023 #include <ctype.h>
00024 #include <string.h>
00025
00026 void Pedigree::Prepare(IFILE & input)
00027 {
00028 pd.Load(input);
00029 }
00030
00031 void Pedigree::Load(IFILE & input)
00032 {
00033 if (pd.mendelFormat)
00034 {
00035 LoadMendel(input);
00036 return;
00037 }
00038
00039 int sexCovariate = sexAsCovariate ? GetCovariateID("sex") : -1;
00040
00041 int textCols = pd.CountTextColumns() + 5;
00042 int oldCount = count;
00043 bool warn = true;
00044 int line = 0;
00045
00046 String buffer;
00047 StringArray tokens;
00048
00049 while (!ifeof(input))
00050 {
00051 int field = 0;
00052
00053 buffer.ReadLine(input);
00054
00055 tokens.Clear();
00056 tokens.AddTokens(buffer, SEPARATORS);
00057
00058 if (tokens.Length() == 0) continue;
00059 if (tokens[0].SlowCompare("end") == 0) break;
00060
00061 line++;
00062
00063 if (tokens.Length() < textCols)
00064 {
00065 if (buffer.Length() > 79)
00066 {
00067 buffer.SetLength(75);
00068 buffer += " ...";
00069 }
00070
00071 String description;
00072
00073 pd.ColumnSummary(description);
00074 error("Loading Pedigree...\n\n"
00075 "Expecting %d columns (%s),\n"
00076 "but read only %d columns in line %d.\n\n"
00077 "The problem line is transcribed below:\n%s\n",
00078 textCols, (const char *) description,
00079 tokens.Length(), line, (const char *) buffer);
00080 }
00081
00082 if (tokens.Length() > textCols && warn && textCols > 5)
00083 {
00084 pd.ColumnSummary(buffer);
00085 printf("WARNING -- Trailing columns in pedigree file will be ignored\n"
00086 " Expecting %d data columns (%s)\n"
00087 " However line %d, for example, has %d data columns\n\n",
00088 textCols - 5, (const char *) buffer, line, tokens.Length() - 5);
00089 warn = false;
00090 }
00091
00092 Person * p;
00093
00094
00095 if (oldCount==0 || (p = FindPerson(tokens[0], tokens[1], oldCount))==NULL)
00096 {
00097 if (count == size) Grow();
00098
00099 p = persons[count++] = new Person;
00100 }
00101
00102 p->famid = tokens[field++];
00103 p->pid = tokens[field++];
00104 p->fatid = tokens[field++];
00105 p->motid = tokens[field++];
00106
00107 bool failure = false;
00108 p->sex = TranslateSexCode(tokens[field++], failure);
00109 if (failure)
00110 error("Can't interpret the sex of individual #%d\n"
00111 "Family: %s Individual: %s Sex Code: %s", count,
00112 (const char *) p->famid, (const char *) p->pid,
00113 (const char *) tokens[field-1]);
00114
00115 if (sexAsCovariate)
00116 {
00117 if (p->sex)
00118 p->covariates[sexCovariate] = p->sex;
00119 else
00120 p->covariates[sexCovariate] = _NAN_;
00121 }
00122
00123 for (int col = 0; col < pd.columnCount; col++)
00124 switch (pd.columns[col])
00125 {
00126 case pcAffection :
00127 {
00128 int a = pd.columnHash[col];
00129 int new_status;
00130
00131 const char * affection = tokens[field++];
00132
00133 switch (toupper(affection[0]))
00134 {
00135 case '1' :
00136 case 'N' :
00137 case 'U' :
00138 new_status = 1;
00139 break;
00140 case '2' :
00141 case 'D' :
00142 case 'A' :
00143 case 'Y' :
00144 new_status = 2;
00145 break;
00146 default :
00147 new_status = atoi(affection);
00148 if (new_status < 0 || new_status > 2)
00149 error("Incorrect formating for affection status "
00150 "Col %d, Affection %s\n"
00151 "Family: %s Individual: %s Status: %s",
00152 col, (const char *) affectionNames[a],
00153 (const char *) p->famid, (const char *) p->pid,
00154 affection);
00155 }
00156 if (new_status != 0 && p->affections[a] != 0 &&
00157 new_status != p->affections[a])
00158 error("Conflict with previous affection status - "
00159 "Col %d, Affection %s\n"
00160 "Family: %s Individual: %s Old: %d New: %d",
00161 col, (const char *) affectionNames[a],
00162 (const char *) p->famid, (const char *) p->pid,
00163 p->affections[a], new_status);
00164 if (new_status) p->affections[a] = new_status;
00165 break;
00166 }
00167 case pcMarker :
00168 {
00169 int m = pd.columnHash[col];
00170
00171 Alleles new_genotype;
00172
00173 new_genotype[0] = LoadAllele(m, tokens[field++]);
00174 new_genotype[1] = LoadAllele(m, tokens[field++]);
00175
00176 if (p->markers[m].isKnown() && new_genotype.isKnown() &&
00177 new_genotype != p->markers[m])
00178 {
00179 MarkerInfo * info = GetMarkerInfo(m);
00180
00181 error("Conflict with previous genotype - Col %d, Marker %s\n"
00182 "Family: %s Individual: %s Old: %s/%s New: %s/%s",
00183 col, (const char *) markerNames[m],
00184 (const char *) p->famid, (const char *) p->pid,
00185 (const char *) info->GetAlleleLabel(p->markers[m][0]),
00186 (const char *) info->GetAlleleLabel(p->markers[m][1]),
00187 (const char *) info->GetAlleleLabel(new_genotype[0]),
00188 (const char *) info->GetAlleleLabel(new_genotype[1]));
00189 }
00190
00191 if (new_genotype.isKnown()) p->markers[m] = new_genotype;
00192 break;
00193 }
00194 case pcTrait :
00195 case pcUndocumentedTraitCovariate :
00196 {
00197 int t = pd.columnHash[col];
00198 double new_pheno = _NAN_;
00199
00200 if (pd.columns[col] == pcUndocumentedTraitCovariate)
00201 t = t / 32768;
00202
00203 const char * value = tokens[field++];
00204 char * flag = NULL;
00205
00206 if (missing == (const char *) NULL || strcmp(value, missing) != 0)
00207 new_pheno = strtod(value, &flag);
00208 if (flag != NULL && *flag) new_pheno = _NAN_;
00209
00210 if (p->traits[t] != _NAN_ && new_pheno != _NAN_ &&
00211 new_pheno != p->traits[t])
00212 error("Conflict with previous phenotype - Col %d, Trait %s\n"
00213 "Family: %s Individual: %s Old: %f New: %f",
00214 col, (const char *) traitNames[t],
00215 (const char *) p->famid, (const char *) p->pid,
00216 p->traits[t], new_pheno);
00217
00218 if (new_pheno != _NAN_) p->traits[t] = new_pheno;
00219 if (pd.columns[col] == pcTrait) break;
00220 }
00221 case pcCovariate :
00222 {
00223 int c = pd.columnHash[col];
00224 double new_covar = _NAN_;
00225
00226 if (pd.columns[col] == pcUndocumentedTraitCovariate)
00227 {
00228 c = c % 32768;
00229 field--;
00230 }
00231
00232 const char * value = tokens[field++];
00233 char * flag = NULL;
00234
00235 if (missing == (const char *) NULL || strcmp(value, missing) != 0)
00236 new_covar = strtod(value, &flag);
00237 if (flag != NULL && *flag) new_covar = _NAN_;
00238
00239 if (p->covariates[c] != _NAN_ && new_covar != _NAN_ &&
00240 new_covar != p->covariates[c])
00241 error("Conflict with previous value - Col %d, Covariate %s\n"
00242 "Family: %s Individual: %s Old: %f New: %f",
00243 col, (const char *) covariateNames[c],
00244 (const char *) p->famid, (const char *) p->pid,
00245 p->covariates[c], new_covar);
00246
00247 if (new_covar != _NAN_) p->covariates[c] = new_covar;
00248 break;
00249 }
00250 case pcSkip :
00251 field++;
00252 break;
00253 case pcZygosity :
00254 {
00255 int new_zygosity;
00256
00257 const char * zygosity = tokens[field++];
00258
00259 switch (zygosity[0])
00260 {
00261 case 'D' :
00262 case 'd' :
00263 new_zygosity = 2;
00264 break;
00265 case 'M' :
00266 case 'm' :
00267 new_zygosity = 1;
00268 break;
00269 default :
00270 new_zygosity = atoi(zygosity);
00271 }
00272 if (p->zygosity != 0 && new_zygosity != p->zygosity)
00273 error("Conflict with previous zygosity - "
00274 "Column %d in pedigree\n"
00275 "Family: %s Individual: %s Old: %d New: %d\n",
00276 col, (const char *) p->famid, (const char *) p->pid,
00277 p->zygosity, new_zygosity);
00278 p->zygosity = new_zygosity;
00279 break;
00280 }
00281 case pcEnd :
00282 break;
00283 default :
00284 error("Inconsistent Pedigree Description -- Internal Error");
00285 }
00286 }
00287
00288 Sort();
00289 }
00290
00291 void Pedigree::LoadMendel(IFILE & input)
00292 {
00293
00294 String familyHeader;
00295 String individualRecord;
00296
00297 familyHeader.ReadLine(input);
00298 individualRecord.ReadLine(input);
00299
00300
00301
00302
00303 FortranFormat headers, records;
00304
00305 headers.SetInputFile(input);
00306 headers.SetFormat(familyHeader);
00307
00308 records.SetInputFile(input);
00309 records.SetFormat(individualRecord);
00310
00311
00312 String famid;
00313 String phenotype;
00314 String affectionCode;
00315 String affectionStem;
00316 int familySize;
00317
00318 String allele1, allele2;
00319
00320 int sexCovariate = sexAsCovariate ? GetCovariateID("sex") : -1;
00321
00322 while (!ifeof(input))
00323 {
00324 if (count == size)
00325 Grow();
00326
00327
00328 familySize = headers.GetNextInteger();
00329 headers.GetNextField(famid);
00330 headers.Flush();
00331
00332 if (famid.IsEmpty())
00333 {
00334 if (ifeof(input) && familySize == 0)
00335 break;
00336 else
00337 error("Blank family id encountered\n");
00338 }
00339
00340
00341 for (int i = 0; i < familySize; i++)
00342 {
00343 Person * p = persons[count++] = new Person;
00344
00345
00346 p->famid = famid;
00347 records.GetNextField(p->pid);
00348 records.GetNextField(p->fatid);
00349 records.GetNextField(p->motid);
00350
00351 if (p->pid.IsEmpty())
00352 error("No unique identifier for individual #%d in family %s\n",
00353 i + 1, (const char *) famid);
00354
00355 if (p->pid.Compare(".") == 0)
00356 error("Family %s has an individual named '.', but this code is\n"
00357 "reserved to indicate missing parents\n");
00358
00359 if (p->fatid.IsEmpty()) p->fatid = ".";
00360 if (p->motid.IsEmpty()) p->motid = ".";
00361
00362
00363 char sex = records.GetNextCharacter();
00364
00365 switch (sex)
00366 {
00367 case '0' :
00368 case 'x' :
00369 case 'X' :
00370 case '?' :
00371 case 0 :
00372 p->sex = 0;
00373 break;
00374 case '1' :
00375 case 'm' :
00376 case 'M' :
00377 p->sex = 1;
00378 break;
00379 case '2' :
00380 case 'f' :
00381 case 'F' :
00382 p->sex = 2;
00383 break;
00384 default :
00385 error("Can't interpret the sex of individual #%d\n"
00386 "Family: %s Individual: %s Sex Code: %s", count,
00387 (const char *) p->famid, (const char *) p->pid, sex);
00388 };
00389
00390 if (sexAsCovariate)
00391 {
00392 if (p->sex)
00393 p->covariates[sexCovariate] = p->sex;
00394 else
00395 p->covariates[sexCovariate] = _NAN_;
00396 }
00397
00398
00399 char zygosity = records.GetNextCharacter();
00400
00401
00402
00403 if (zygosity)
00404 p->zygosity = (zygosity - ' ') * 2 - 1;
00405
00406 affectionStem.Clear();
00407 for (int col = 0; col < pd.columnCount; col++)
00408 switch (pd.columns[col])
00409 {
00410 case pcAffection :
00411 {
00412 int a = pd.columnHash[col];
00413
00414
00415
00416
00417
00418 if (affectionStem.Length() == 0 ||
00419 affectionNames[a].CompareToStem(affectionStem) != 0)
00420 {
00421 affectionStem.Copy(affectionNames[a], 0, affectionNames[a].FindChar('>') + 1);
00422 records.GetNextField(phenotype);
00423 affectionCode = affectionStem + phenotype;
00424 }
00425
00426
00427 if (phenotype.IsEmpty())
00428 p->affections[a] = 0;
00429 else
00430 p->affections[a] = affectionCode.Compare(affectionNames[a]) == 0 ? 2 : 1;
00431
00432 break;
00433 }
00434 case pcMarker :
00435 {
00436 int m = pd.columnHash[col];
00437
00438 records.GetNextField(phenotype);
00439
00440 if (phenotype.IsEmpty())
00441 {
00442 p->markers[m].one = p->markers[m].two = 0;
00443 continue;
00444 }
00445
00446 int separator = phenotype.FindChar('/');
00447 if (separator == -1) separator = phenotype.FindChar('|');
00448
00449 if (separator == -1)
00450 error("At marker %s, person %s in family %s has genotype %s.\n"
00451 "This genotype is not in the 'al1/al2' format.\n",
00452 (const char *) markerNames[m],
00453 (const char *) p->pid,
00454 (const char *) p->famid,
00455 (const char *) phenotype);
00456
00457 allele1.Copy(phenotype, 0, separator);
00458 allele1.Trim();
00459
00460 allele2.Copy(phenotype, separator + 1, 8);
00461 allele2.Trim();
00462
00463 MarkerInfo * info = GetMarkerInfo(m);
00464
00465 int one = info->alleleNumbers.Integer(allele1);
00466
00467 if (one < 0)
00468 {
00469 if (info->freq.Length() == 0)
00470 one = info->NewAllele(allele1);
00471 else
00472 error("At marker %s, person %s in family %s has genotype %s.\n"
00473 "However, '%s' is not a valid allele for this marker.\n",
00474 (const char *) markerNames[m],
00475 (const char *) p->pid,
00476 (const char *) p->famid,
00477 (const char *) phenotype,
00478 (const char *) allele1);
00479 }
00480
00481 int two = info->alleleNumbers.Integer(allele2);
00482
00483 if (two < 0)
00484 {
00485 if (info->freq.Length() == 0)
00486 two = info->NewAllele(allele2);
00487 else
00488 error("At marker %s, person %s in family %s has genotype %s.\n"
00489 "However, '%s' is not a valid allele for this marker.\n",
00490 (const char *) markerNames[m],
00491 (const char *) p->pid,
00492 (const char *) p->famid,
00493 (const char *) phenotype,
00494 (const char *) allele2);
00495 }
00496
00497 p->markers[m].one = one;
00498 p->markers[m].two = two;
00499 break;
00500 }
00501 case pcEnd :
00502 break;
00503 case pcTrait :
00504 case pcCovariate :
00505 case pcSkip :
00506 case pcZygosity :
00507 default:
00508 error("Inconsistent Pedigree Description -- Internal Error");
00509 }
00510
00511 records.Flush();
00512 }
00513 }
00514
00515 Sort();
00516 }
00517
00518 void Pedigree::Prepare(const char * filename)
00519 {
00520
00521 if (multiPd != NULL)
00522 delete [] multiPd;
00523
00524 multiFileCount = 1;
00525
00526
00527 StringArray filenames;
00528
00529 filenames.AddColumns(filename, ',');
00530
00531 if (filenames.Length() <= 1)
00532 pd.Load(filename);
00533 else
00534 {
00535 printf("AUTOMATIC MERGE ENABLED: Detected multiple datafile names, separated by commas...\n");
00536
00537 multiPd = new PedigreeDescription[filenames.Length()];
00538
00539 for (int i = 0; i < filenames.Length(); i++)
00540 {
00541 printf(" AUTOMATIC MERGE: Reading data file '%s' ...\n", (const char *) filenames[i]);
00542 multiPd[i].Load(filenames[i], false);
00543 }
00544
00545 multiFileCount = filenames.Length();
00546 }
00547 }
00548
00549 void Pedigree::Load(const char * filename, bool allowFailures)
00550 {
00551 if (multiFileCount <= 1)
00552 {
00553 IFILE f = ifopen(filename, "rb");
00554
00555 if (f == NULL && allowFailures)
00556 return;
00557
00558 if (f == NULL)
00559 error(
00560 "The pedigree file %s cannot be opened\n\n"
00561 "Common causes for this problem are:\n"
00562 " * You might not have used the correct options to specify input file names,\n"
00563 " please check the program documentation for information on how to do this\n\n"
00564 " * The file doesn't exist or the filename might have been misspelt\n\n"
00565 " * The file exists but it is being used by another program which you will need\n"
00566 " to close\n\n"
00567 " * The file is larger than 2GB and you haven't compiled this application with\n"
00568 " large file support.\n\n",
00569 filename);
00570
00571 Load(f);
00572 ifclose(f);
00573 }
00574 else
00575 {
00576 StringArray filenames;
00577
00578 filenames.AddColumns(filename, ',');
00579
00580 if (filenames.Length() != multiFileCount)
00581 error("Different numbers of comma separated data and pedigree file names provided\n");
00582
00583 for (int i = 0; i < filenames.Length(); i++)
00584 {
00585 printf(" AUTOMATIC MERGE: Datafile '%s' matched to pedigree '%s' ...\n",
00586 (const char *) multiPd[i].filename, (const char *) filenames[i]);
00587
00588 pd = multiPd[i];
00589
00590 IFILE f = ifopen(filenames[i], "rb");
00591
00592 if (f == NULL)
00593 error("The pedigree file '%s' cannot be opened\n\n",
00594 (const char *) filenames[i]);
00595
00596 Load(f);
00597 ifclose(f);
00598 }
00599
00600 printf("\n");
00601 }
00602 }
00603
00604 int Pedigree::TranslateSexCode(const char * code, bool & failure)
00605 {
00606 failure = false;
00607
00608 switch (code[0])
00609 {
00610 case 'x' :
00611 case 'X' :
00612 case '?' :
00613 return 0;
00614 case '1' :
00615 case 'm' :
00616 case 'M' :
00617 return 1;
00618 case '2' :
00619 case 'f' :
00620 case 'F' :
00621 return 2;
00622 default :
00623 {
00624 bool result = atoi(code);
00625
00626 if (result != 0 && result != 1 && result != 2)
00627 {
00628 failure = true;
00629 result = 0;
00630 }
00631
00632 return result;
00633 }
00634 };
00635 }
00636
00637
00638