#include "GenotypeLoader.h"
#include "Error.h"

#include <ctype.h>

// Creates a loader with a blank failure message and no duplicate-sample table.
GenotypeLoader::GenotypeLoader()
   {
   // No diagnostic has been recorded yet.
   failure = "";

   // The table is allocated later by buildDuplicateList(); the destructor
   // releases whatever this pointer refers to.
   duplicateSampleSets = NULL;
   }

// Releases the duplicate-sample table, if one was ever allocated.
GenotypeLoader::~GenotypeLoader()
   {
   // delete [] on a null pointer does nothing, so no explicit test is needed.
   delete [] duplicateSampleSets;
   }

// Reads the header line of a genotype file and builds the per-column maps.
//
// Column 0 is the marker-name column; every later column is a sample.  A
// label of the form "NAME.suffix" is reduced to NAME, so several columns
// that share a NAME are recorded as duplicates of the first one.  Columns
// for which ignoreSamples.UpdateSkipVector() returns true are left out of
// every map built here.
//
// input         - stream positioned at the header line
// ignoreSamples - consulted for every sample column; its isSampleSkipped
//                 vector receives a 0 for column 0
// ped           - pedigree forwarded to UpdateSkipVector()
//
// A label that starts with '.' is logged and handed to error().
//
// NOTE(review): isSampleSkipped is appended to, never cleared, here --
// presumably the caller supplies a fresh IgnoreSamples per header; confirm.
void GenotypeLoader::LoadHeader(FILE * input, IgnoreSamples & ignoreSamples, Pedigree & ped)
   {
   buffer.ReadLine(input);
   tokens.ReplaceTokens(buffer);

   sampleIds.Clear();
   markerIds.Clear();
   duplicatedSampleIds.Clear();

   columnToSampleId.Clear();
   columnToSampleId.Push(-1);
   isDuplicateSample.Clear();
   isDuplicateSample.Push(0); // column 0 is the marker name; duplication does not apply

   // Reset the labels like every other per-header container; otherwise a
   // second call would append to the previous header's labels and label
   // indices would no longer match column indices.
   columnLabels.Clear();
   columnLabels.Push("SNPID");
   ignoreSamples.isSampleSkipped.Push(0); // the marker-name column is never skipped

   // Index of the next retained column (column 0 is already taken).
   int cnt = 1;
   for (int i = 1; i < tokens.Length(); i++)
   {
     // Strip an optional ".suffix" to obtain the sample name.
     String token = tokens[i];
     int dotPosition = token.FindChar('.', 0);
     if (dotPosition == 0)
     {
       failure = "";
       failure.catprintf("Invalid header sample name %s - column %d", (const char *)tokens[i], i);
       logMgr.WriteToLog(failure, 0);
       error(failure);
     }
     else if (dotPosition > 0)
       token = token.SubStr(0, dotPosition);

     if (ignoreSamples.UpdateSkipVector(token, tokens[i], ped))
       continue; // ignore this current sample

     if (sampleIds.Find(token) > -1)
     {
       // Sample seen before: point this column at the first occurrence.
       const int firstColumn = sampleIds.Integer(token);
       columnToSampleId.Push(firstColumn);
       isDuplicateSample.Push(1);
       if (duplicatedSampleIds.Find(firstColumn) == -1)
         duplicatedSampleIds.Push(firstColumn);
     }
     else
     {
       // Brand new sample: it owns this column.
       isDuplicateSample.Push(0);
       columnToSampleId.Push(cnt);
       sampleIds.Add(token, cnt);
     }
     columnLabels.Push(tokens[i]);
     cnt++;
   }
   fflush(stdout);
   buildDuplicateList();
   }

// Reads the next marker line and converts it into genotype codes.
//
// input           - stream positioned after the header (or a previous marker)
// isSampleSkipped - one entry per raw file column; non-zero entries mark
//                   columns that are dropped before parsing
//
// Results are reported through members rather than a return value:
//   isValid           - false when the line could not be used; `failure`
//                       then holds the reason
//   isLineEmpty       - no more non-blank lines were available
//   isDuplicateMarker - the marker name was already loaded
//   isEmptyMarker     - the line parsed but CountGenotypes() returned 0
//   isMarkerFailure   - true until the column counts and the marker name
//                       have been accepted
//   genotypes[i]      - 0 missing, 1 or 2 homozygous, 3 heterozygous
//   isMonomorphic     - fewer than two distinct alleles were observed
void GenotypeLoader::LoadMarker(FILE * input, IntArray & isSampleSkipped)
   {
   isMarkerFailure = true;
   isValid = true;
   isLineEmpty = false;
   isDuplicateMarker = false;
   isEmptyMarker = false;
   currMarker = "";

   // Skip blank lines.  Stop at the first non-blank line, at end of file, or
   // on a read error (so a failing stream cannot spin forever).
   do {
     buffer.ReadLine(input);
     tokens.ReplaceTokens(buffer);
   } while (tokens.Length() == 0 && !feof(input) && !ferror(input));

   if (tokens.Length() == 0)
     {
     isValid = false;
     failure = "Reached end of input";
     isLineEmpty = true;
     return;
     }

   currMarker = tokens[0];

   // This comparison is made before the ignored samples are removed.
   if (tokens.Length() != isSampleSkipped.Length())
     {
     isValid = false;
     // printf (not catprintf) so the message replaces, rather than extends,
     // whatever failure text an earlier marker left behind.
     failure.printf("Incorrect number of columns %d -- %d\n", tokens.Length(), isSampleSkipped.Length());
     return;
     }

   {
      // Keep only the columns that are not being ignored.
      StringArray temp;
      for (int j = 0; j < isSampleSkipped.Length(); j++)
      {
         if (isSampleSkipped[j] == 0)
            temp.Push(tokens[j]);
      }
      tokens.Clear();
      tokens = temp;
   }

   if (tokens.Length() != columnToSampleId.Length())
     {
     isValid = false;
     failure = "Incorrect number of columns";
     return;
     }

   if (markerIds.Find(tokens[0]) >= 0)
     {
     isValid = false;
     failure = "Duplicated marker";
     isDuplicateMarker = true;
     return;
     }

   isMarkerFailure = false;
   markerIds.Add(tokens[0], markerIds.Entries());

   ClearAlleles();
   genotypes.Dimension(columnToSampleId.Length());

   // Column 0 holds the marker name, not a genotype.  Give it a defined
   // "missing" value so a stale entry can never be counted as a genotype.
   genotypes[0] = 0;

   for (int i = 1; i < tokens.Length(); i++)
     {
     if (tokens[i].Length() != 2)
       {
       isValid = false;
       failure.printf("Some genotype codes were %d characters long [should be 2]", tokens[i].Length());
       return;
       }

     int al1 = GetAllele(tokens[i][0]);
     int al2 = GetAllele(tokens[i][1]);

     // -2 is the code GetAllele() returns for 'X'/'Y': such a character
     // copies the other allele, and two of them make the genotype missing.
     if (al1 == -2 && al2 == -2)
       al1 = al2 = 0;
     else if (al1 == -2)
       al1 = al2;
     else if (al2 == -2)
       al2 = al1;

     if (al1 > 2 || al2 > 2)
       {
       isValid = false;
       failure = "More than 2 different marker alleles listed";
       return;
       }

     // Exactly one missing allele is not an acceptable genotype.
     if ((al1 == 0) ^ (al2 == 0))
       {
       isValid = false;
       failure = "Invalid genotype code found";
       return;
       }

     if ((al1 == -1) || (al2 == -1))
       {
       isValid = false;
       failure = "Invalid allele code found";
       return;
       }

     // Allele codes 1 and 2 act as bit flags: 1|1 = 1, 2|2 = 2, 1|2 = 3.
     genotypes[i] = al1 | al2;
     }

   if (CountGenotypes() == 0)
     {
     isValid = false;
     isEmptyMarker = true;
     failure = "No valid genotypes";
     return;
     }
   isMonomorphic = (!alleles[0] || !alleles[1]);
   }

// Recomputes isMonomorphic from the current genotype codes, looking only at
// columns that are not flagged as duplicate samples.
void GenotypeLoader::setMonomorphic()
{
   // Tally of genotype codes 0..3 (0 missing, 1 and 2 homozygous, 3 heterozygous).
   int tally[4] = { 0, 0, 0, 0 };

   for (int col = 1; col < genotypes.Length(); col++)
   {
      if (isDuplicateSample[col] != 0)
         continue;
      tally[genotypes[col]]++;
   }

   // Monomorphic: at least one called genotype, no heterozygotes, and at
   // most one of the two homozygous classes present.
   const bool anyCalled = (tally[1] + tally[2] + tally[3]) != 0;
   const bool noHeterozygote = (tally[3] == 0);
   const bool oneHomozygousClass = (tally[1] == 0) || (tally[2] == 0);

   isMonomorphic = anyCalled && noHeterozygote && oneHomozygousClass;
}

// Maps one allele character (case-insensitive) to an allele code, recording
// newly seen alleles in alleles[0] and alleles[1].
//
// Returns:
//    0  for 'N'
//   -2  for 'X' or 'Y'
//    1  when the character is (or becomes) the first allele of this marker
//    2  when the character is (or becomes) the second allele of this marker
//    3  for a valid character when two other alleles are already recorded
//   -1  for any character other than A, C, G, T or '-'
int GenotypeLoader::GetAllele(char al)
   {
   // toupper() is only defined for values representable as unsigned char
   // (or EOF); a negative plain char would be undefined behaviour.
   al = (char) toupper((unsigned char) al);

   if (al == 'N')
     return 0;

   if (al == 'X' || al == 'Y')
     return -2;

   // A NUL character would otherwise compare equal to an empty allele slot.
   if (al == '\0')
     return -1;

   if (alleles[0] == al)
     return 1;

   if (alleles[1] == al)
     return 2;

   if (!(al == 'A' || al == 'C' || al == 'T' || al == 'G' || al == '-'))
     return -1;

   if (alleles[0] == 0)
     {
     alleles[0] = al;
     return 1;
     }

   if (alleles[1] == 0)
     {
     alleles[1] = al;
     return 2;
     }

   // A valid character, but both allele slots already hold other alleles.
   return 3;
   }

// Returns the number of sample columns that carry a non-missing genotype
// for the current marker, or 0 when the marker is not valid.
int GenotypeLoader::CountGenotypes()
   {
   if (!isValid) return 0;

   int count = 0;

   // Start at column 1: column 0 is the marker-name column and LoadMarker()
   // only assigns genotypes to columns 1 and up, matching the other
   // genotype loops in this file.
   for (int i = 1; i < genotypes.Length(); i++)
      if (genotypes[i])
         count++;

   return count;
   }

// Parent/child consistency test for a single parent.
//
// child, father - column indices, or -1 when the individual is not available
//
// Returns true when the pair cannot be tested (a column is -1 or a genotype
// is missing) or when the two genotype codes share at least one allele bit.
bool GenotypeLoader::MendelCheck(int child, int father)
   {
   const bool bothPresent = (child != -1) && (father != -1);
   if (!bothPresent)
      return true;

   const int childCode = genotypes[child];
   const int parentCode = genotypes[father];

   // A missing genotype on either side makes the pair uninformative.
   if (childCode == 0 || parentCode == 0)
      return true;

   return (childCode & parentCode) != 0;
   }

// Trio consistency test.
//
// child, father, mother - column indices, or -1 when the individual is not
//                         available
//
// Returns true when the trio cannot be tested (a column is -1 or a genotype
// is missing) or when every allele bit of the child is present in at least
// one parent.
bool GenotypeLoader::MendelCheck(int child, int father, int mother)
   {
   const bool trioPresent = (child != -1) && (father != -1) && (mother != -1);
   if (!trioPresent)
      return true;

   const int childCode = genotypes[child];
   const int fatherCode = genotypes[father];
   const int motherCode = genotypes[mother];

   if (childCode == 0 || fatherCode == 0 || motherCode == 0)
      return true;

   // The child is consistent when it has no allele bit outside the union
   // of the parental allele bits.
   const int parentalBits = fatherCode | motherCode;
   return (childCode & ~parentalBits) == 0;
   }

// Fills the per-column pedigree tables (sexCodes, fatherColumn, motherColumn,
// individualMask) by looking each column's sample up in the pedigree.
//
// ped - pedigree searched with the fixed family id "FORGOTTEN" and the
//       column label with any ".suffix" removed
//
// Columns whose sample is not found get individualMask 1 and a log entry;
// found samples get individualMask 0.  Column 0 keeps the initial values
// (sex 0, parents -1, mask -1).
void GenotypeLoader::PopulatePedigreeInformation(Pedigree & ped)
   {
   const int columns = columnToSampleId.Length();

   // Size every table and give it its "unknown" value.
   sexCodes.Dimension(columns);
   fatherColumn.Dimension(columns);
   motherColumn.Dimension(columns);
   individualMask.Dimension(columns);

   sexCodes.Set(0);
   fatherColumn.Set(-1);
   motherColumn.Set(-1);
   individualMask.Set(-1);

   failure = "";

   for (int col = 1; col < columns; col++)
      {
      // Reduce "NAME.suffix" to NAME before the pedigree lookup.
      String sampleName = columnLabels[col];
      const int dotAt = sampleName.FindChar('.');
      if (dotAt > -1)
         sampleName = sampleName.Left(dotAt);

      Person * person = ped.FindPerson("FORGOTTEN", sampleName);

      if (person != NULL)
         {
         individualMask[col] = 0;
         sexCodes[col] = person->sex;

         // A parent column is recorded only when that parent is itself a
         // sample in the genotype file.
         if (person->father != NULL && sampleIds.Find(person->father->pid) >= 0)
            fatherColumn[col] = sampleIds.Integer(person->father->pid);

         if (person->mother != NULL && sampleIds.Find(person->mother->pid) >= 0)
            motherColumn[col] = sampleIds.Integer(person->mother->pid);
         }
      else
         {
         // TODO: Flag persons with unknown pedigree information
         // TODO: Perhaps these people should simply be treated as unrelated
         individualMask[col] = 1;
         failure.catprintf("Person %s is not in the pedigree file\n", (const char *) columnLabels[col]);
         logMgr.WriteToLog(failure, 13);
         failure.Clear();
         }
      }
   }

// Prepares the loader and the stream for another pass over the markers.
void GenotypeLoader::Rewind(FILE * input)
{
   // Forget the markers recorded so far so they are not reported as duplicates.
   markerIds.Clear();

   // Return to the start of the file, then read and discard its first line.
   rewind(input);
   buffer.ReadLine(input);
}


// Rebuilds mendelErrors for the current marker and logs every inconsistent
// child through OutputErrorTrios().
//
// mendelLog   - stream passed on to OutputErrorTrios()
// isSexLinked - when true, individuals with sex code 1 are tested against
//               the mother only and the father is not charged with the error
//
// An inconsistency adds 1 to the child and to each parent column involved.
void GenotypeLoader::UpdateMendelErrors(FILE * mendelLog, bool isSexLinked)
{
  mendelErrors.Clear();
  mendelErrors.Dimension(genotypes.Length());
  mendelErrors.Zero();

  for (int col = 1; col < genotypes.Length(); col++)
  {
    const int dad = fatherColumn[col];
    const int mom = motherColumn[col];

    // For a sex-linked marker, a male only needs to be compatible with his mother.
    const bool motherOnly = isSexLinked && sexCodes[col] == 1;

    bool consistent;
    if (motherOnly)
      consistent = MendelCheckMother(col, mom);
    else
      consistent = MendelCheck(col, dad, mom) &&
                   MendelCheckMother(col, mom) &&
                   MendelCheckFather(col, dad);

    const int inconsistency = consistent ? 0 : 1;

    mendelErrors[col] += inconsistency;
    if (!motherOnly && dad != -1)
      mendelErrors[dad] += inconsistency;
    if (mom != -1)
      mendelErrors[mom] += inconsistency;

    if (inconsistency)
      OutputErrorTrios(mendelLog, col, dad, mom, isSexLinked);
  }
}

// Builds duplicateSampleSets: one IntArray per entry of duplicatedSampleIds,
// listing every column that maps to that sample.
void GenotypeLoader::buildDuplicateList()
{
   // Release the table from any earlier header load before replacing it;
   // otherwise the previous array is leaked.  delete [] on NULL is a no-op.
   delete [] duplicateSampleSets;
   duplicateSampleSets = new IntArray[duplicatedSampleIds.Length()];

   for (int i = 1; i < columnToSampleId.Length(); i++)
   {
      int index = duplicatedSampleIds.Find(columnToSampleId[i]);
      if (index != -1)
         duplicateSampleSets[index].Push(i);
   }
}

// Counts Mendelian inconsistencies for the current marker.
//
// mask        - per-column flags; a column is tested only when its entry is 0
// isSexLinked - when true, individuals with sex code 1 are tested against
//               the mother only
//
// Columns flagged as duplicate samples are skipped.  Sets mendelErrors
// (0 or 1 per column), totalMendelErrors and mendelErrorRate (errors divided
// by the number of tests performed).
void GenotypeLoader::countMendelErrors(IntArray & mask, bool isSexLinked)
{
   mendelErrors.Clear();
   mendelErrors.Dimension(genotypes.Length());
   mendelErrors.Zero();

   totalMendelErrors = 0;
   int testsPerformed = 0;

   for (int col = 1; col < genotypes.Length(); col++)
   {
      if (isDuplicateSample[col] != 0)
         continue;

      int inconsistency = 0;

      if (!mask[col])
      {
         const int dad = fatherColumn[col];
         const int mom = motherColumn[col];

         if (isSexLinked && sexCodes[col] == 1)
         {
            // Male on a sex-linked marker: only the mother is informative.
            inconsistency = MendelCheckMother(col, mom) ? 0 : 1;
            totalMendelErrors += inconsistency;

            if (mom != -1 && genotypes[col] != 0)
               testsPerformed++;
         }
         else
         {
            const bool consistent = MendelCheck(col, dad, mom) &&
                                    MendelCheckMother(col, mom) &&
                                    MendelCheckFather(col, dad);
            inconsistency = consistent ? 0 : 1;
            totalMendelErrors += inconsistency;

            // A test counts when the child is genotyped and has a parent column.
            if (genotypes[col] != 0 && (dad != -1 || mom != -1))
               testsPerformed++;
         }
      }

      mendelErrors[col] = inconsistency;
   }

   // The tiny constant keeps the division defined when no test was performed.
   mendelErrorRate = (totalMendelErrors*1.0)/(testsPerformed + 1e-30);
}


// Empties the header, pedigree, genotype and marker containers listed below.
void GenotypeLoader::ReleaseMemory()
{
   // Header-derived column tables.
   columnToSampleId.Clear();
   columnLabels.Clear();
   sampleIds.Clear();
   duplicatedSampleIds.Clear();

   // Pedigree-derived per-column tables.
   sexCodes.Clear();
   motherColumn.Clear();
   fatherColumn.Clear();

   // Marker-level data.
   markerIds.Clear();
   genotypes.Clear();
}


// Writes one tab-separated row describing a Mendelian inconsistency:
//   marker, sample label, sex, father label, mother label,
//   sample genotype, father genotype, mother genotype, sex-linked flag
//
// mendelLog    - destination stream
// sampleColumn - column of the inconsistent individual
// fatherColumn - column of the father, or -1 when not available
// motherColumn - column of the mother, or -1 when not available
// isSexLinked  - written as "YES" or "NO" in the last field
//
// A missing parent is written as "-" in both its label and genotype fields.
// This also holds when both parents are missing, so the row always has the
// same number of columns.  Note that the fatherColumn and motherColumn
// parameters hide the members of the same name inside this function.
void GenotypeLoader::OutputErrorTrios(FILE * mendelLog, int sampleColumn, int fatherColumn, int motherColumn, bool isSexLinked)
{
   char sex = '-';
   switch(sexCodes[sampleColumn])
   {
      case 1:  sex = 'M';
               break;
      case 2:  sex = 'F';
               break;
      default: sex = '-';
               break;
   }

   const bool hasFather = (fatherColumn != -1);
   const bool hasMother = (motherColumn != -1);

   const char * fatherLabel = hasFather ? (const char *) columnLabels[fatherColumn] : "-";
   const char * motherLabel = hasMother ? (const char *) columnLabels[motherColumn] : "-";

   fprintf(mendelLog, "%s\t%s\t%c\t%s\t%s\t%d\t", (const char *) currMarker,
           (const char *) columnLabels[sampleColumn], sex, fatherLabel, motherLabel,
           genotypes[sampleColumn]);

   if (hasFather)
      fprintf(mendelLog, "%d\t", genotypes[fatherColumn]);
   else
      fprintf(mendelLog, "%s\t", "-");

   if (hasMother)
      fprintf(mendelLog, "%d\t", genotypes[motherColumn]);
   else
      fprintf(mendelLog, "%s\t", "-");

   fprintf(mendelLog, "%s\n", isSexLinked ? "YES" : "NO");
}
