libStatGen Software
1
|
00001 /* 00002 * Copyright (C) 2010 Regents of the University of Michigan 00003 * 00004 * This program is free software: you can redistribute it and/or modify 00005 * it under the terms of the GNU General Public License as published by 00006 * the Free Software Foundation, either version 3 of the License, or 00007 * (at your option) any later version. 00008 * 00009 * This program is distributed in the hope that it will be useful, 00010 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 * GNU General Public License for more details. 00013 * 00014 * You should have received a copy of the GNU General Public License 00015 * along with this program. If not, see <http://www.gnu.org/licenses/>. 00016 */ 00017 00018 #include "Pedigree.h" 00019 #include "FortranFormat.h" 00020 #include "Error.h" 00021 00022 #include <stdlib.h> 00023 #include <ctype.h> 00024 #include <string.h> 00025 00026 void Pedigree::Prepare(IFILE & input) 00027 { 00028 pd.Load(input); 00029 } 00030 00031 void Pedigree::Load(IFILE & input) 00032 { 00033 if (pd.mendelFormat) 00034 { 00035 LoadMendel(input); 00036 return; 00037 } 00038 00039 int sexCovariate = sexAsCovariate ? GetCovariateID("sex") : -1; 00040 00041 int textCols = pd.CountTextColumns() + 5; 00042 int oldCount = count; 00043 bool warn = true; 00044 int line = 0; 00045 00046 String buffer; 00047 StringArray tokens; 00048 00049 while (!ifeof(input)) 00050 { 00051 int field = 0; 00052 00053 buffer.ReadLine(input); 00054 00055 tokens.Clear(); 00056 tokens.AddTokens(buffer, WHITESPACE); 00057 00058 if (tokens.Length() == 0) continue; 00059 if (tokens[0].SlowCompare("end") == 0) break; 00060 00061 line++; 00062 00063 if (tokens.Length() < textCols) 00064 { 00065 if (buffer.Length() > 79) 00066 { 00067 buffer.SetLength(75); 00068 buffer += " ..."; 00069 } 00070 00071 String description; 00072 00073 pd.ColumnSummary(description); 00074 error("Loading Pedigree...\n\n" 00075 "Expecting %d columns (%s),\n" 00076 "but read only %d columns in line %d.\n\n" 00077 "The problem line is transcribed below:\n%s\n", 00078 textCols, (const char *) description, 00079 tokens.Length(), line, (const char *) buffer); 00080 } 00081 00082 if (tokens.Length() > textCols && warn && textCols > 5) 00083 { 00084 pd.ColumnSummary(buffer); 00085 printf("WARNING -- Trailing columns in pedigree file will be ignored\n" 00086 " Expecting %d data columns (%s)\n" 00087 " However line %d, for example, has %d data columns\n\n", 00088 textCols - 5, (const char *) buffer, line, tokens.Length() - 5); 00089 warn = false; 00090 } 00091 00092 Person * p; 00093 00094 // create a new person if necessary 00095 if (oldCount==0 || (p = FindPerson(tokens[0], tokens[1], oldCount))==NULL) 00096 { 00097 if (count == size) Grow(); 00098 00099 p = persons[count++] = new Person; 00100 } 00101 00102 p->famid = tokens[field++]; // famid 00103 p->pid = tokens[field++]; // pid 00104 p->fatid = tokens[field++]; // fatid 00105 p->motid = tokens[field++]; // motid 00106 00107 bool failure = false; 00108 p->sex = TranslateSexCode(tokens[field++], failure); 00109 if (failure) 00110 error("Can't interpret the sex of individual #%d\n" 00111 "Family: %s Individual: %s Sex Code: %s", count, 00112 (const char *) p->famid, (const char *) p->pid, 00113 (const char *) tokens[field-1]); 00114 00115 if (sexAsCovariate) 00116 { 00117 if (p->sex) 00118 p->covariates[sexCovariate] = p->sex; 00119 else 00120 p->covariates[sexCovariate] = _NAN_; 00121 } 00122 00123 for (int col = 0; col < pd.columnCount; col++) 00124 switch (pd.columns[col]) 00125 { 00126 case pcAffection : 00127 { 00128 int a = pd.columnHash[col]; 00129 int new_status; 00130 00131 const char * affection = tokens[field++]; 00132 00133 switch (toupper(affection[0])) 00134 { 00135 case '1' : 00136 case 'N' : 00137 case 'U' : 00138 new_status = 1; 00139 break; 00140 case '2' : 00141 case 'D' : 00142 case 'A' : 00143 case 'Y' : 00144 new_status = 2; 00145 break; 00146 default : 00147 new_status = atoi(affection); 00148 if (new_status < 0 || new_status > 2) 00149 error("Incorrect formating for affection status " 00150 "Col %d, Affection %s\n" 00151 "Family: %s Individual: %s Status: %s", 00152 col, (const char *) affectionNames[a], 00153 (const char *) p->famid, (const char *) p->pid, 00154 affection); 00155 } 00156 if (new_status != 0 && p->affections[a] != 0 && 00157 new_status != p->affections[a]) 00158 error("Conflict with previous affection status - " 00159 "Col %d, Affection %s\n" 00160 "Family: %s Individual: %s Old: %d New: %d", 00161 col, (const char *) affectionNames[a], 00162 (const char *) p->famid, (const char *) p->pid, 00163 p->affections[a], new_status); 00164 if (new_status) p->affections[a] = new_status; 00165 break; 00166 } 00167 case pcMarker : 00168 { 00169 int m = pd.columnHash[col]; 00170 00171 Alleles new_genotype; 00172 00173 new_genotype[0] = LoadAllele(m, tokens[field++]); 00174 new_genotype[1] = LoadAllele(m, tokens[field++]); 00175 00176 if (p->markers[m].isKnown() && new_genotype.isKnown() && 00177 new_genotype != p->markers[m]) 00178 { 00179 MarkerInfo * info = GetMarkerInfo(m); 00180 00181 error("Conflict with previous genotype - Col %d, Marker %s\n" 00182 "Family: %s Individual: %s Old: %s/%s New: %s/%s", 00183 col, (const char *) markerNames[m], 00184 (const char *) p->famid, (const char *) p->pid, 00185 (const char *) info->GetAlleleLabel(p->markers[m][0]), 00186 (const char *) info->GetAlleleLabel(p->markers[m][1]), 00187 (const char *) info->GetAlleleLabel(new_genotype[0]), 00188 (const char *) info->GetAlleleLabel(new_genotype[1])); 00189 } 00190 00191 if (new_genotype.isKnown()) p->markers[m] = new_genotype; 00192 break; 00193 } 00194 case pcTrait : 00195 case pcUndocumentedTraitCovariate : 00196 { 00197 int t = pd.columnHash[col]; 00198 double new_pheno = _NAN_; 00199 00200 if (pd.columns[col] == pcUndocumentedTraitCovariate) 00201 t = t / 32768; 00202 00203 const char * value = tokens[field++]; 00204 char * flag = NULL; 00205 00206 if (missing == (const char *) NULL || strcmp(value, missing) != 0) 00207 new_pheno = strtod(value, &flag); 00208 if (flag != NULL && *flag) new_pheno = _NAN_; 00209 00210 if (p->traits[t] != _NAN_ && new_pheno != _NAN_ && 00211 new_pheno != p->traits[t]) 00212 error("Conflict with previous phenotype - Col %d, Trait %s\n" 00213 "Family: %s Individual: %s Old: %f New: %f", 00214 col, (const char *) traitNames[t], 00215 (const char *) p->famid, (const char *) p->pid, 00216 p->traits[t], new_pheno); 00217 00218 if (new_pheno != _NAN_) p->traits[t] = new_pheno; 00219 if (pd.columns[col] == pcTrait) break; 00220 } 00221 case pcCovariate : 00222 { 00223 int c = pd.columnHash[col]; 00224 double new_covar = _NAN_; 00225 00226 if (pd.columns[col] == pcUndocumentedTraitCovariate) 00227 { 00228 c = c % 32768; 00229 field--; 00230 } 00231 00232 const char * value = tokens[field++]; 00233 char * flag = NULL; 00234 00235 if (missing == (const char *) NULL || strcmp(value, missing) != 0) 00236 new_covar = strtod(value, &flag); 00237 if (flag != NULL && *flag) new_covar = _NAN_; 00238 00239 if (p->covariates[c] != _NAN_ && new_covar != _NAN_ && 00240 new_covar != p->covariates[c]) 00241 error("Conflict with previous value - Col %d, Covariate %s\n" 00242 "Family: %s Individual: %s Old: %f New: %f", 00243 col, (const char *) covariateNames[c], 00244 (const char *) p->famid, (const char *) p->pid, 00245 p->covariates[c], new_covar); 00246 00247 if (new_covar != _NAN_) p->covariates[c] = new_covar; 00248 break; 00249 } 00250 case pcString : 00251 { 00252 int c = pd.columnHash[col]; 00253 00254 if (!p->strings[c].IsEmpty() && p->strings[c] != tokens[field]) 00255 error("Conflict with previous value - Col %d, String %s\n" 00256 "Family: %s Individual: %s Old: %s New: %s", 00257 col, (const char *) stringNames[c], 00258 (const char *) p->famid, (const char *) p->pid, 00259 (const char *) p->strings[c], (const char *) tokens[field]); 00260 00261 p->strings[c] = tokens[field++]; 00262 00263 break; 00264 } 00265 case pcSkip : 00266 field++; 00267 break; 00268 case pcZygosity : 00269 { 00270 int new_zygosity; 00271 00272 const char * zygosity = tokens[field++]; 00273 00274 switch (zygosity[0]) 00275 { 00276 case 'D' : 00277 case 'd' : 00278 new_zygosity = 2; 00279 break; 00280 case 'M' : 00281 case 'm' : 00282 new_zygosity = 1; 00283 break; 00284 default : 00285 new_zygosity = atoi(zygosity); 00286 } 00287 if (p->zygosity != 0 && new_zygosity != p->zygosity) 00288 error("Conflict with previous zygosity - " 00289 "Column %d in pedigree\n" 00290 "Family: %s Individual: %s Old: %d New: %d\n", 00291 col, (const char *) p->famid, (const char *) p->pid, 00292 p->zygosity, new_zygosity); 00293 p->zygosity = new_zygosity; 00294 break; 00295 } 00296 case pcEnd : 00297 break; 00298 default : 00299 error("Inconsistent Pedigree Description -- Internal Error"); 00300 } 00301 } 00302 00303 Sort(); 00304 } 00305 00306 void Pedigree::LoadMendel(IFILE & input) 00307 { 00308 // First, retrieve the two format statements from file 00309 String familyHeader; 00310 String individualRecord; 00311 00312 familyHeader.ReadLine(input); 00313 individualRecord.ReadLine(input); 00314 00315 // Then create two FORTRAN input streams... 00316 // One will be used for retrieving family labels and sizes, the other 00317 // will be used for individual information 00318 FortranFormat headers, records; 00319 00320 headers.SetInputFile(input); 00321 headers.SetFormat(familyHeader); 00322 00323 records.SetInputFile(input); 00324 records.SetFormat(individualRecord); 00325 00326 // Storage for key pieces of information 00327 String famid; 00328 String phenotype; 00329 String affectionCode; 00330 String affectionStem; 00331 int familySize; 00332 00333 String allele1, allele2; 00334 00335 int sexCovariate = sexAsCovariate ? GetCovariateID("sex") : -1; 00336 00337 while (!ifeof(input)) 00338 { 00339 if (count == size) 00340 Grow(); 00341 00342 // Retrieve header for next family 00343 familySize = headers.GetNextInteger(); 00344 headers.GetNextField(famid); 00345 headers.Flush(); 00346 00347 if (famid.IsEmpty()) 00348 { 00349 if (ifeof(input) && familySize == 0) 00350 break; 00351 else 00352 error("Blank family id encountered\n"); 00353 } 00354 00355 // Retrieve each individual in the family 00356 for (int i = 0; i < familySize; i++) 00357 { 00358 Person * p = persons[count++] = new Person; 00359 00360 // Retrieve basic pedigree structure 00361 p->famid = famid; 00362 records.GetNextField(p->pid); 00363 records.GetNextField(p->fatid); 00364 records.GetNextField(p->motid); 00365 00366 if (p->pid.IsEmpty()) 00367 error("No unique identifier for individual #%d in family %s\n", 00368 i + 1, (const char *) famid); 00369 00370 if (p->pid.Compare(".") == 0) 00371 error("Family %s has an individual named '.', but this code is\n" 00372 "reserved to indicate missing parents\n"); 00373 00374 if (p->fatid.IsEmpty()) p->fatid = "."; 00375 if (p->motid.IsEmpty()) p->motid = "."; 00376 00377 // Retrieve and decode sex code 00378 char sex = records.GetNextCharacter(); 00379 00380 switch (sex) 00381 { 00382 case '0' : 00383 case 'x' : 00384 case 'X' : 00385 case '?' : 00386 case 0 : 00387 p->sex = 0; 00388 break; 00389 case '1' : 00390 case 'm' : 00391 case 'M' : 00392 p->sex = 1; 00393 break; 00394 case '2' : 00395 case 'f' : 00396 case 'F' : 00397 p->sex = 2; 00398 break; 00399 default : 00400 error("Can't interpret the sex of individual #%d\n" 00401 "Family: %s Individual: %s Sex Code: %s", count, 00402 (const char *) p->famid, (const char *) p->pid, sex); 00403 }; 00404 00405 if (sexAsCovariate) 00406 { 00407 if (p->sex) 00408 p->covariates[sexCovariate] = p->sex; 00409 else 00410 p->covariates[sexCovariate] = _NAN_; 00411 } 00412 00413 // Retrieve and decode zygosity 00414 char zygosity = records.GetNextCharacter(); 00415 00416 // Mendel uses a unique character to indicate each MZ pair, 00417 // we use a unique odd number... 00418 if (zygosity) 00419 p->zygosity = (zygosity - ' ') * 2 - 1; 00420 00421 affectionStem.Clear(); 00422 for (int col = 0; col < pd.columnCount; col++) 00423 switch (pd.columns[col]) 00424 { 00425 case pcAffection : 00426 { 00427 int a = pd.columnHash[col]; 00428 00429 // We expand each Mendel non-codominant trait into multiple 00430 // affection status column... First, if this is not a 00431 // continuation of a previous expansion we first retrieve 00432 // and encode the affection status. 00433 if (affectionStem.Length() == 0 || 00434 affectionNames[a].CompareToStem(affectionStem) != 0) 00435 { 00436 affectionStem.Copy(affectionNames[a], 0, affectionNames[a].FindChar('>') + 1); 00437 records.GetNextField(phenotype); 00438 affectionCode = affectionStem + phenotype; 00439 } 00440 00441 // Then encode each phenotype appropriately 00442 if (phenotype.IsEmpty()) 00443 p->affections[a] = 0; 00444 else 00445 p->affections[a] = affectionCode.Compare(affectionNames[a]) == 0 ? 2 : 1; 00446 00447 break; 00448 } 00449 case pcMarker : 00450 { 00451 int m = pd.columnHash[col]; 00452 00453 records.GetNextField(phenotype); 00454 00455 if (phenotype.IsEmpty()) 00456 { 00457 p->markers[m].one = p->markers[m].two = 0; 00458 continue; 00459 } 00460 00461 int separator = phenotype.FindChar('/'); 00462 if (separator == -1) separator = phenotype.FindChar('|'); 00463 00464 if (separator == -1) 00465 error("At marker %s, person %s in family %s has genotype %s.\n" 00466 "This genotype is not in the 'al1/al2' format.\n", 00467 (const char *) markerNames[m], 00468 (const char *) p->pid, 00469 (const char *) p->famid, 00470 (const char *) phenotype); 00471 00472 allele1.Copy(phenotype, 0, separator); 00473 allele1.Trim(); 00474 00475 allele2.Copy(phenotype, separator + 1, 8); 00476 allele2.Trim(); 00477 00478 MarkerInfo * info = GetMarkerInfo(m); 00479 00480 int one = info->alleleNumbers.Integer(allele1); 00481 00482 if (one < 0) 00483 { 00484 if (info->freq.Length() == 0) 00485 one = info->NewAllele(allele1); 00486 else 00487 error("At marker %s, person %s in family %s has genotype %s.\n" 00488 "However, '%s' is not a valid allele for this marker.\n", 00489 (const char *) markerNames[m], 00490 (const char *) p->pid, 00491 (const char *) p->famid, 00492 (const char *) phenotype, 00493 (const char *) allele1); 00494 } 00495 00496 int two = info->alleleNumbers.Integer(allele2); 00497 00498 if (two < 0) 00499 { 00500 if (info->freq.Length() == 0) 00501 two = info->NewAllele(allele2); 00502 else 00503 error("At marker %s, person %s in family %s has genotype %s.\n" 00504 "However, '%s' is not a valid allele for this marker.\n", 00505 (const char *) markerNames[m], 00506 (const char *) p->pid, 00507 (const char *) p->famid, 00508 (const char *) phenotype, 00509 (const char *) allele2); 00510 } 00511 00512 p->markers[m].one = one; 00513 p->markers[m].two = two; 00514 break; 00515 } 00516 case pcEnd : 00517 break; 00518 case pcTrait : 00519 case pcCovariate : 00520 case pcSkip : 00521 case pcZygosity : 00522 default: 00523 error("Inconsistent Pedigree Description -- Internal Error"); 00524 } 00525 00526 records.Flush(); 00527 } 00528 } 00529 00530 Sort(); 00531 } 00532 00533 void Pedigree::Prepare(const char * filename) 00534 { 00535 // Clear any previously loaded pedigree description 00536 if (multiPd != NULL) 00537 delete [] multiPd; 00538 00539 multiFileCount = 1; 00540 00541 // Enable multifile support 00542 StringArray filenames; 00543 00544 filenames.AddColumns(filename, ','); 00545 00546 if (filenames.Length() <= 1) 00547 pd.Load(filename); 00548 else 00549 { 00550 printf("AUTOMATIC MERGE ENABLED: Detected multiple datafile names, separated by commas...\n"); 00551 00552 multiPd = new PedigreeDescription[filenames.Length()]; 00553 00554 for (int i = 0; i < filenames.Length(); i++) 00555 { 00556 printf(" AUTOMATIC MERGE: Reading data file '%s' ...\n", (const char *) filenames[i]); 00557 multiPd[i].Load(filenames[i], false); 00558 } 00559 00560 multiFileCount = filenames.Length(); 00561 } 00562 } 00563 00564 void Pedigree::Load(const char * filename, bool allowFailures) 00565 { 00566 if (multiFileCount <= 1) 00567 { 00568 IFILE f = ifopen(filename, "rb"); 00569 00570 if (f == NULL && allowFailures) 00571 return; 00572 00573 if (f == NULL) 00574 error( 00575 "The pedigree file %s cannot be opened\n\n" 00576 "Common causes for this problem are:\n" 00577 " * You might not have used the correct options to specify input file names,\n" 00578 " please check the program documentation for information on how to do this\n\n" 00579 " * The file doesn't exist or the filename might have been misspelt\n\n" 00580 " * The file exists but it is being used by another program which you will need\n" 00581 " to close\n\n" 00582 " * The file is larger than 2GB and you haven't compiled this application with\n" 00583 " large file support.\n\n", 00584 filename); 00585 00586 Load(f); 00587 ifclose(f); 00588 } 00589 else 00590 { 00591 StringArray filenames; 00592 00593 filenames.AddColumns(filename, ','); 00594 00595 if (filenames.Length() != multiFileCount) 00596 error("Different numbers of comma separated data and pedigree file names provided\n"); 00597 00598 for (int i = 0; i < filenames.Length(); i++) 00599 { 00600 printf(" AUTOMATIC MERGE: Datafile '%s' matched to pedigree '%s' ...\n", 00601 (const char *) multiPd[i].filename, (const char *) filenames[i]); 00602 00603 pd = multiPd[i]; 00604 00605 IFILE f = ifopen(filenames[i], "rb"); 00606 00607 if (f == NULL) 00608 error("The pedigree file '%s' cannot be opened\n\n", 00609 (const char *) filenames[i]); 00610 00611 Load(f); 00612 ifclose(f); 00613 } 00614 00615 printf("\n"); 00616 } 00617 } 00618 00619 int Pedigree::TranslateSexCode(const char * code, bool & failure) 00620 { 00621 failure = false; 00622 00623 switch (code[0]) 00624 { 00625 case 'x' : 00626 case 'X' : 00627 case '?' : 00628 return 0; 00629 case '1' : 00630 case 'm' : 00631 case 'M' : 00632 return 1; 00633 case '2' : 00634 case 'f' : 00635 case 'F' : 00636 return 2; 00637 default : 00638 { 00639 bool result = atoi(code); 00640 00641 if (result != 0 && result != 1 && result != 2) 00642 { 00643 failure = true; 00644 result = 0; 00645 } 00646 00647 return result; 00648 } 00649 }; 00650 }