// GAINQC.cpp : Defines the entry point for the console application.
//

#include "Parameters.h"
#include "GenotypeLoader.h"
#include "FirstPass.h"
#include "Pedigree.h"
#include "Error.h"
#include "AssayInfo.h"
#include "SecondPass.h"
#include "RelationCheck.h"
#include "HapMapReference.h"
#include "Quality.h"
#include "QCHistograms.h"
#include "SampleLabel.h"
#include "IgnoreSamples.h"
#include "ResultFileOutput.h"
#include "LogManager.h"

#include <stdio.h>
/*********************DEBUG CODE******************
void outputDupInfo(IntArray list, IntArray * sets)
{
  int numSets = list.Length();
  printf("%d Ids duplicated\n\n", numSets);
  for (int i = 0; i < numSets; i++)
  {
    int numElems = sets[i].Length();
    printf("Set %d has %d elements\t\t", i, numElems);
    for (int k = 0; k < numElems; k++)
      printf("%d\t", sets[i][k]);
    printf("\n");
  }
}

*******************END DEBUG CODE****************/

// Command line state. Each variable below is bound to a long option in the
// `parameters` table that follows and is read by main().
String snpfile;      // "snps": assay (SNP) information file, opened in main
String pedfile;      // "ped": pedigree file; also re-read in main for sample labels
String genofile;     // "geno": genotype file, read twice (first and second pass)
String settingfile;  // "settings": handed to qcSettings.custom.LoadSettings()
String qualityfile;  // "quality": optional quality score file
String hapmapfile;   // "hapmap": optional HapMap genotypes used for comparison
String prefix;       // "prefix": output file prefix; main falls back to genofile when empty
String ignorefile;   // "ignoreSamples": optional file given to IgnoreSamples::PopulateIgnoreList

// "logSize": passed to logMgr.initialize() in main.
// NOTE(review): exact meaning (lines? messages per category?) is defined by LogManager - confirm there.
int logFileLength = 0;

bool skipRelativeCheck  = false;  // "skipRelatednessCheck": skip relation matrices and the .relationinfo report
bool calcTDTStatistic   = false;  // "TDT": second pass also calls SecondPass::TDTStatistic
bool performAssociation = false;  // "association": main clears this when no sample labels were loaded
bool outputFiles        = false;  // "output": write .genotypes.filtered (plus .quality / .genotypes.original when quality scores are present)

// Global log manager; main opens its `glog` stream as <prefix>.log.
LogManager logMgr;

// Long command line option table. Each entry ties an option name to one of
// the file-scope variables declared above; main() registers the table with a
// ParameterList via `new LongParameters("Command Line Options", parameters)`.
BEGIN_LONG_PARAMETERS(parameters)
   LONG_PARAMETER_GROUP("Required Files")
      LONG_STRINGPARAMETER("snps", &snpfile)
      LONG_STRINGPARAMETER("ped", &pedfile)
      LONG_STRINGPARAMETER("geno", &genofile)
   LONG_PARAMETER_GROUP("Optional Files")
      LONG_STRINGPARAMETER("settings", &settingfile)
      LONG_STRINGPARAMETER("quality", &qualityfile)
      LONG_STRINGPARAMETER("hapmap", &hapmapfile)
      LONG_STRINGPARAMETER("ignoreSamples", &ignorefile)
   LONG_PARAMETER_GROUP("Output Control")
      LONG_STRINGPARAMETER("prefix", &prefix)
      LONG_PARAMETER("skipRelatednessCheck", &skipRelativeCheck)
      LONG_PARAMETER("TDT", &calcTDTStatistic)
      LONG_PARAMETER("association", &performAssociation)
      LONG_PARAMETER("output", &outputFiles)
      LONG_INTPARAMETER("logSize", &logFileLength)
END_LONG_PARAMETERS();

// Opens `filename` with the stdio `mode` string and returns the stream.
// A failed open is reported through error(); if error() returns, the caller
// still receives whatever fopen() produced (NULL in that case).
FILE * fopen_wrapper(const char * filename, const char * mode)
   {
   FILE * handle = fopen(filename, mode);

   // Common case: the file opened, nothing to report
   if (handle != NULL)
      return handle;

   error("Failed to open file %s\n", filename);
   return handle;
   }

// Entry point for the QA/QC pipeline.
//
// Reads the command line options, loads the pedigree, the assay (SNP)
// information, the genotype file header and the optional ignore-sample,
// quality score and HapMap inputs, then walks the genotype file twice:
//   * first pass  - per-sample statistics (<prefix>.sampleinfo), the Mendel
//                   error log (<prefix>.mendelLog) and the sample histograms;
//                   FirstPass::BuildMask is called once it is done.
//   * second pass - per-marker statistics (<prefix>.snpinfo), the markers that
//                   could not be assessed (<prefix>.notassessed) and, when the
//                   "output" option is set, the filtered genotype / quality files.
// Afterwards it builds the marker histograms, the relatedness report (unless
// the relatedness check was skipped) and the HapMap comparison summary.
//
// Returns -1 when the assay information cannot be used, when no marker
// survives the first pass, or when SecondPass::Prepare reports a failure.
// Otherwise control falls off the end of main (exit status 0).
int main(int argc, char ** argv)
   {
   // Show a simple banner
   printf("QA/QC Software -- (c) 2006-2007 Goncalo Abecasis and Shyam Gopalakrishnan\n");

#ifdef VERSION
   printf("Software Version " VERSION "\n");
#endif

   // Read and process command line arguments
   ParameterList pl;

   pl.Add(new LongParameters("Command Line Options", parameters));
   pl.Read(argc, argv);
   pl.Status();

   // Output files are named after the genotype file unless a prefix was given
   if (prefix.IsEmpty())
   {
      prefix = genofile;
   }

   // Set the default values for all the thresholds, then hand the settings
   // file name to the custom settings loader
   QC_Settings qcSettings;
   qcSettings.SetDefaults();
   qcSettings.custom.LoadSettings((const char *) settingfile);

   logMgr.initialize(logFileLength);
   logMgr.glog = fopen_wrapper(prefix + ".log", "wt");

   printf("LOADING PEDIGREE FILE ...\n");
   printf("=========================\n\n");

   Pedigree ped;
   ped.Load(pedfile);

   if (ped.count == 0)
      printf("No pedigree information available\n\n");
   else
      printf("Loaded pedigree information for %d individuals\n\n", ped.count);

   // Wipe out family ids, since we are not using them. This is very important:
   // downstream analysis depends on the famid being "FORGOTTEN"
   for (int i = 0; i < ped.count; i++)
      ped[i].famid = "FORGOTTEN";
   ped.Sort();

   printf("LOADING ASSAY INFORMATION ...\n");
   printf("=============================\n\n");

   FILE * a = fopen_wrapper(snpfile, "rt");

   AssayInfo assay;

   fprintf(logMgr.glog, "Reading assay information ...\n");
   fflush(logMgr.glog);

   if (assay.parseHeader(a))
   {
      if (!assay.loadAssayInfo(a))
      {
         printf("There are too many formatting errors in the assay information file.\n");
         printf("Please check log file for details, correct the errors and try again.\n\n");
         return -1;
      }
   }
   else
   {
      printf("Assay information file is incorrectly formatted and will be ignored\n");
      return -1;
   }

   if (assay.snpArray.Entries() == 0)
   {
      printf("No marker information available.\n\n");
      return -1;
   }
   else
   {
      printf("Loaded marker information for %d markers\n", assay.snpArray.Entries());

      if (assay.duplicateEntries)
         printf("   WARNING: %d duplicate marker names, 2nd entry ignored\n", assay.duplicateEntries);
      if (assay.invalidAlleles)
         printf("   WARNING: %d markers had invalid alleles, ignored\n", assay.invalidAlleles);
      printf("\n");
   }

   fclose(a);

   // We need to load the header from the genotype file before we get started
   //

   FILE * f = fopen_wrapper(genofile, "rt");

   IntArray       qualityScores;
   Quality        quality;
   IgnoreSamples  ignoreSamples;

   if (!ignorefile.IsEmpty())
   {
      // NOTE(review): this stream is not closed in main; confirm whether
      // PopulateIgnoreList takes ownership before adding an fclose here.
      FILE * i = fopen_wrapper(ignorefile, "rt");
      ignoreSamples.PopulateIgnoreList(i);
   }

   GenotypeLoader loader;
   loader.LoadHeader(f, ignoreSamples, ped);
   ignoreSamples.CheckIgnoreSampleIDs();
   fflush(logMgr.glog);

   // read sample labels from the pedigree file -- only for samples that havent been ignored
   bool labelsPresent = false;
   SampleLabel sampLabels(loader.columnLabels.Length() - 1);
   FILE * labelFile = fopen_wrapper((const char *)pedfile, "r");
   if (!QC_Settings::SAMPLE_LABEL_KEY.IsEmpty())
   {
      labelsPresent = sampLabels.LoadSampleLabels(labelFile, QC_Settings::SAMPLE_LABEL_KEY, loader.columnLabels, ped.count);
   }

   fclose(labelFile);
   performAssociation = (performAssociation && labelsPresent); // skip association if labels not present

/*******************DEBUG CODE*****************
   outputDupInfo(loader.duplicatedSampleIds, loader.duplicateSampleSets);
**********************************************/

   // load quality scores from quality file
   bool qualityScoresPresent = false;
   FILE * q = NULL;   // stays NULL when no quality file was given; closed at the end of main

   if (!qualityfile.IsEmpty())
   {
      printf("LOADING QUALITY SCORES ...\n");
      printf("==========================\n\n");

      q = fopen_wrapper(qualityfile, "rt");
      if (!quality.LoadHeader(q, loader.columnLabels))
      {
         qualityScoresPresent = false;
         printf("%s\n\n", "WARNING: Empty Quality score file, ignored");
      }
      else
      {
         qualityScoresPresent = true;
         quality.PopulateMarkerHash(q);
         printf("Using quality score threshold of %.1f\n", QC_Settings::QUALITY_THRESHOLD);
         printf("Loaded quality scores for %d markers\n\n", quality.markerLine.Entries());
      }
   }

   fflush(stdout);
   // Load Hapmap genotypes for comparison (if available)
   //

   HapMapReference hapmap;

   if (!hapmapfile.IsEmpty())
      {
      printf("LOADING HAPMAP GENOTYPES ...\n");
      printf("============================\n\n");

      // NOTE(review): this stream is not closed in main; confirm whether
      // LoadGenotypes takes ownership before adding an fclose here.
      FILE * h = fopen_wrapper(hapmapfile, "rt");

      hapmap.LoadGenotypes(h, assay, loader.columnLabels);
      }

   fflush(stdout);
   // Load and populate marker information
   // Report summary of problems (if any)

   printf("FIRST PASS GENOTYPE ANALYSIS ...\n");
   printf("================================\n");

   // Process genotype file

   FirstPass      firstPass;

   SNPInfo * currSNP;
   bool isSexLinked = false;
   bool noGenoPostQual = true; // indicates if the marker has no genotypes post quality cutoff
   String tempFailure;

   logMgr.WriteToLog(String("Reading marker information (1st pass) ..."), 0);

   // Per-category row counters reported in the first pass summary below
   int dupMarkers = 0, successes = 0, line = 0, nonQCable = 0, missingInfo = 0;
   int alleleDiffers = 0, alleleSwitch = 0, monomorphics = 0, emptyLines = 0, emptyMarkers = 0, emptyPostQual = 0;

   loader.PopulatePedigreeInformation(ped);
   firstPass.Prepare(loader.columnToSampleId.Length() - 1);
   printf("Progress (each '.' indicates 20000 markers)\n");

   // mendel log file
   FILE * mendelLog = fopen_wrapper(prefix + ".mendelLog", "wt");
   fprintf(mendelLog, "Marker\tOffspring\tSex\tFather\tMother\tOffspring_genotype\tFather_genotype\tMother_genotype\tSexLinked\n");

   while (!feof(f))
   {
      loader.LoadMarker(f, ignoreSamples.isSampleSkipped);

      if (++line%20000 == 0)
      {
         printf(".");
         fflush(stdout);
      }

      // Rows the loader rejects are logged and tallied by reason, then skipped
      if (!loader.isValid)
      {
         tempFailure.printf("INVALID FORMAT, line %d: %s", line, (const char *) loader.failure);
         logMgr.WriteToLog(tempFailure, 12);
         tempFailure.Clear();

         if (loader.isLineEmpty)
            emptyLines++;
         else if (loader.isDuplicateMarker)
            dupMarkers++;
         else if (loader.isEmptyMarker)
            emptyMarkers++;
         continue;
      }

      currSNP = assay.getSNPInfo(loader.currMarker);

      if (currSNP == NULL)
      {
         tempFailure.printf("WARNING: SNP %s not in SNP Info file, Skipping this SNP", (const char *)loader.currMarker);
         logMgr.WriteToLog(tempFailure, 14);
         tempFailure.Clear();
         missingInfo++;
         continue;
      }
      else
      {
         if (!currSNP->isQCable())
         {
            tempFailure.printf("SNP %s is not QCable.", (const char *)loader.currMarker);
            logMgr.WriteToLog(tempFailure, 15);
            tempFailure.Clear();
            nonQCable++;
            continue;
         }
         if (currSNP->AreAllelesSwitched(loader.alleles, loader.isMonomorphic))
         {
            // Adopt the allele order from the SNP info file
            loader.alleles[0] = currSNP->alleles[0];
            loader.alleles[1] = currSNP->alleles[1];
            alleleSwitch++;
         }
         else
         {
            if (!currSNP->DoAllelesMatch(loader.alleles, loader.isMonomorphic))
            {
               tempFailure.printf("WARNING: Skipping SNP, %s has different alleles in SNP info file and genotype file.\n", (const char *)loader.currMarker);
               tempFailure.catprintf("In SNP Info file: %c/%c | In Genotype file: %c/%c", currSNP->alleles[0], currSNP->alleles[1], loader.alleles[0], loader.alleles[1]);
               logMgr.WriteToLog(tempFailure, 16);
               tempFailure.Clear();
               alleleDiffers++;
               continue;
            }
         }
         isSexLinked = currSNP->isSexLinked();
      }

      quality.LoadQualityScores(q, currSNP->preferredID, qualityScores);
      if (!hapmap.IsEmpty())
         {
         hapmap.CompareGenotypes(currSNP->rsID, loader.genotypes, loader.alleles[0], loader.alleles[1], currSNP->strand[0], qualityScores);
         // Get quality scores
         // Get genotypes
         // Compare the two!
         }

      firstPass.UpdateSampleScores(qualityScores);

      quality.CheckQualityScores(loader.currMarker, loader.genotypes, q, noGenoPostQual);

      if (noGenoPostQual)
      {
         // The marker is still run through the steps below; the decrement
         // cancels the successes++ at the bottom of the loop so the row is
         // counted only under emptyPostQual.
         emptyPostQual++;
         successes--;
      }

      loader.setMonomorphic();
      loader.UpdateMendelErrors(mendelLog, isSexLinked);
      monomorphics += loader.isMonomorphic ? 1 : 0;

      firstPass.ProcessMarker(loader.genotypes);
      if (!loader.isMonomorphic)
      {
         firstPass.UpdateLikelihood(loader.genotypes, loader.sexCodes, isSexLinked, QC_Settings::GENOTYPING_ERROR);
         if (isSexLinked)
            firstPass.UpdateSexOdds(loader.genotypes, loader.sexCodes, QC_Settings::GENOTYPING_ERROR);
      }
      firstPass.UpdateMendelErrors(loader.mendelErrors);
      successes++;
   }
   fclose(mendelLog);

   printf("\n\n");
   // The total is the sum of every category itemised below. emptyMarkers must
   // be part of it: those rows are skipped and reported on their own line.
   printf(" Genotype file includes %d rows\n", successes + dupMarkers + emptyLines + emptyMarkers + nonQCable + missingInfo + alleleDiffers + emptyPostQual);
   printf("      %d duplicate marker rows were skipped\n", dupMarkers);
   printf("      %d empty lines were skipped\n", emptyLines);
   printf("      %d markers with no valid genotypes were skipped\n",emptyMarkers);
   printf("      %d markers with no valid genotypes after quality threshold were skipped\n", emptyPostQual);
   printf("      %d markers were not in SNP info file were skipped\n", missingInfo);
   printf("      %d markers flagged as non-QCable were skipped\n", nonQCable);
   printf("      %d markers with different alleles in snp and genotype files were skipped\n", alleleDiffers);
   printf("      %d markers were quality checked\n", successes);
   printf("              %d markers had bases switched in genotype file\n", alleleSwitch);
   printf("              %d markers were monomorphic in the first pass\n", monomorphics);
   printf(" First pass completed, see [%s.log] for additional information\n\n",
        (const char *) prefix);

   if (successes == 0)
      {
      printf("NOTHING TO DO: No genotype data available\n\n");
      return -1;
      }

   firstPass.BuildMask(loader.sexCodes);

   FILE * fout = fopen_wrapper(prefix + ".sampleinfo", "wt");
   firstPass.OutputSampleStatistics(fout, loader.columnLabels);
   fclose(fout);

   QCHistograms qchist;
   qchist.BuildSampleHistograms(firstPass, prefix, qualityScoresPresent, sampLabels, loader.sexCodes);

   // firstPass.mask is still read throughout the second pass after this call
   firstPass.ReleaseMemory();

   // Now run the second pass
   //

   printf("SECOND PASS GENOTYPE ANALYSIS ...\n");
   printf("=================================\n");
   logMgr.WriteToLog("Reading marker information (2nd pass) ...\n", 0);
   SecondPass secondPass;

   secondPass.Prepare(loader.markerIds.Entries(), firstPass.mask);
   RelationCheck relCheck(loader.columnToSampleId.Length() - 1);
   relCheck.initRelations(loader.fatherColumn, loader.motherColumn, loader.columnToSampleId, (loader.columnToSampleId.Length() - 1));
   if (secondPass.failure.Length() != 0)
   {
      logMgr.WriteToLog(secondPass.failure, 0);
      printf("%s", (const char *)secondPass.failure);
      return -1;
   }
   loader.Rewind(f);

   FILE *sout = fopen_wrapper(prefix + ".snpinfo", "wt");
   FILE *notAssessed = fopen_wrapper(prefix + ".notassessed", "wt");

   // output files
   FILE *outQuality = NULL;
   FILE *outGenoThresholded = NULL;
   FILE *outGenoNoThreshold = NULL;
   ResultOutputter outputter;

   // this reqd to output genotypes without thresholding on quality
   IntArray   origGenotypes;

   if (outputFiles)
   {
      outGenoThresholded = fopen_wrapper(prefix + ".genotypes.filtered", "wt");
      outputter.WriteOutputHeader(outGenoThresholded, firstPass.mask, loader.columnLabels);
      if (qualityScoresPresent)
      {
         outQuality = fopen_wrapper(prefix + ".quality", "wt");
         outputter.WriteOutputHeader(outQuality, firstPass.mask, loader.columnLabels);
         outGenoNoThreshold = fopen_wrapper(prefix + ".genotypes.original", "wt");
         outputter.WriteOutputHeader(outGenoNoThreshold, firstPass.mask, loader.columnLabels);
      }
   }

   int cnt = 0;

   secondPass.writeOutputHeader(sout, calcTDTStatistic, performAssociation);
   printf("Progress (each '.' indicates 20000 markers)\n");
   while (!feof(f))
   {
      loader.LoadMarker(f, ignoreSamples.isSampleSkipped);
      if (loader.isMarkerFailure)
      {
         // Reported without advancing cnt or secondPass.currentMarker
         secondPass.outputNotAssessed(notAssessed, loader.currMarker, loader.failure);
         continue;
      }
      cnt++;

      if (cnt%20000 == 0)
      {
         printf(".");
         fflush(stdout);
      }

      // Every skip path below advances secondPass.currentMarker and records
      // the marker together with the reason in the .notassessed file
      if (!loader.isValid)
      {
         secondPass.currentMarker++;
         secondPass.outputNotAssessed(notAssessed, loader.currMarker, loader.failure);
         continue;
      }

      currSNP = assay.getSNPInfo(loader.currMarker);
      if (currSNP == NULL)
      {
         secondPass.currentMarker++;
         secondPass.outputNotAssessed(notAssessed, loader.currMarker, String("SNP not found in snp info file"));
         continue;
      }
      else
      {
         if (!currSNP->isQCable())
         {
            secondPass.currentMarker++;
            secondPass.outputNotAssessed(notAssessed, loader.currMarker, String("SNP not QCable"));
            continue;
         }
         if (currSNP->AreAllelesSwitched(loader.alleles, loader.isMonomorphic))
         {
            loader.alleles[0] = currSNP->alleles[0];
            loader.alleles[1] = currSNP->alleles[1];
         }
         else
         {
            if (!currSNP->DoAllelesMatch(loader.alleles, loader.isMonomorphic))
            {
               secondPass.currentMarker++;
               secondPass.outputNotAssessed(notAssessed, loader.currMarker, String("Alleles from snp info and genotype file do not match"));
               continue;
            }
         }
         isSexLinked = currSNP->isSexLinked();
      }

      if (qualityScoresPresent && outputFiles)
      {
         quality.LoadQualityScores(q, currSNP->preferredID, qualityScores);
         // this stores the original genotypes to output them later.
         origGenotypes.Clear();
         origGenotypes = loader.genotypes;
      }

      quality.CheckQualityScores(loader.currMarker, loader.genotypes, q, noGenoPostQual);

      loader.setMonomorphic();
      loader.countMendelErrors(firstPass.mask, isSexLinked);

      if (!secondPass.ProcessMarker(loader.genotypes, loader.sexCodes, firstPass.mask, loader.alleles, loader.isDuplicateSample, loader.fatherColumn, loader.motherColumn, isSexLinked))
      {
         secondPass.outputNotAssessed(notAssessed, loader.currMarker, secondPass.failure);
         secondPass.currentMarker++;
         continue;
      }
      if (!loader.isMonomorphic)
         secondPass.XLinkedOdds(loader.genotypes, loader.sexCodes, firstPass.mask, loader.isDuplicateSample, QC_Settings::GENOTYPING_ERROR);
      secondPass.setMendelErrors(loader.totalMendelErrors, loader.mendelErrorRate);
      secondPass.ConcordanceChecks(loader.genotypes, loader.duplicateSampleSets, loader.duplicatedSampleIds.Length());
      secondPass.SNPHWE(loader.genotypes, isSexLinked, loader.sexCodes, firstPass.mask, loader.fatherColumn, loader.motherColumn, loader.isDuplicateSample);
      if (calcTDTStatistic)
         secondPass.TDTStatistic(loader.genotypes, loader.fatherColumn, loader.motherColumn, firstPass.mask, loader.sexCodes, loader.mendelErrors, loader.isDuplicateSample, isSexLinked);
      if (performAssociation)
      {
         if (!loader.isMonomorphic)
            secondPass.AssociationTest(loader.genotypes, sampLabels.sampleLabels, loader.isDuplicateSample, loader.sexCodes, firstPass.mask, isSexLinked, sampLabels.labelHash.Entries());
         else
         {
            // No association test is run for monomorphic markers; -1 is stored instead
            secondPass.assocChiSq = -1.0;
            secondPass.assocPvalue = -1.0;
         }
      }
      secondPass.currentMarker++;

      if (secondPass.PostProcess())
      {
         if (!isSexLinked && !skipRelativeCheck)
            relCheck.updateRelMatrices(loader.genotypes, firstPass.mask);
         // this is the place to write output files - passed marker with passed samples
         // write all the output files - quality thresholded genotypes,
         // genotypes not quality thresholded and quality scores

         // quality file and original genotypes if scores present
         if (outputFiles)
         {
            if (qualityScoresPresent)
            {
               outputter.WriteQualityScoreLine(outQuality, firstPass.mask, qualityScores, currSNP->preferredID);
               outputter.WriteGenotypeLine(outGenoNoThreshold, firstPass.mask, origGenotypes, currSNP->preferredID, loader.alleles);
            }
            outputter.WriteGenotypeLine(outGenoThresholded, firstPass.mask, loader.genotypes, currSNP->preferredID, loader.alleles);
         }
      }

      // Label the row with the current SNP's preferred ID. An earlier version
      // used loader.markerLabels[cnt], which was noted as not always being in
      // sync with the current SNP (it seemed to double up each marker on the
      // second pass).
      secondPass.outputMarkerStatistics(sout, currSNP->preferredID, quality.avgQualityScorePreThreshold, calcTDTStatistic, performAssociation);
   }

   printf("\n\n");

   secondPass.writeOutputSummary();

   fclose(sout);
   fclose(notAssessed);
   printf("Second pass completed, see [%s.log] for additional information\n\n",
          (const char *) prefix);
   secondPass.ReleaseMemory();
   if (outputFiles)
   {
      if (qualityScoresPresent)
      {
         fclose(outGenoNoThreshold);
         fclose(outQuality);
      }
      fclose(outGenoThresholded);
   }
   fflush(stdout);

   qchist.BuildMarkerHistograms(prefix + ".snpinfo", prefix, secondPass.minimumStats, secondPass.maximumStats, qualityScoresPresent, calcTDTStatistic, performAssociation);

   if (!skipRelativeCheck)
   {
      FILE *rel = fopen_wrapper(prefix + ".relationinfo","wt");

      // Running index of the (i, j) pair with i < j, in the order the pairs
      // are visited below; columns are numbered from 1
      int numRelPairs = 0;
      for (int i = 1; i < loader.columnLabels.Length() - 1; i++)
      {
         for (int j = i + 1; j < loader.columnLabels.Length(); j++, numRelPairs++)
         {
            relCheck.findIBDProbs(numRelPairs);
//            fprintf(rel, "%s\t%s\t", (const char *)loader.columnLabels[i], (const char *)loader.columnLabels[j]);
//            relCheck.outputRelFile(rel, numRelPairs);
//            fflush(rel);
         }
      }
      relCheck.checkRelationships();
      relCheck.outputRelationInfo(rel, loader.columnLabels);
      fclose(rel);
      qchist.BuildRelationHistograms(relCheck.estimatedKinship, relCheck.putativeRelations, relCheck.relationError, prefix);
   }

   if (!hapmap.IsEmpty())
      {
      printf("SUMMARY OF COMPARISONS WITH HAPMAP\n"
             "==================================\n\n");

      hapmap.PrintComparisonHeader();
      hapmap.PrintComparisonSummary();
      printf("\n\n");

      hapmap.PrintHistogram();
      hapmap.LogFlips(prefix + "-hapmap.log");
      hapmap.LogSampleComparisons(prefix + "-vs-reference-sample.txt", loader.columnLabels);
      }

   relCheck.ReleaseMemory();
   loader.ReleaseMemory();

   // The quality score stream is read during both passes, so it can only be
   // closed here; it is NULL when no quality file was supplied
   if (q != NULL)
      fclose(q);
   fclose(f);
   fclose(logMgr.glog);
}
