#include "haplotype.h"
#include "QuickIndex.h"
#include "MathStats.h"
#include <math.h>

// Steps `sequence` forward in place to its successor.
//
// The right-most element is allowed to grow up to `maxCount`; each element to
// its left is capped one lower than its right neighbour's cap.  The right-most
// element that is still below its cap is incremented and every element after
// it is reset to "previous element + 1".
// (Presumably used to enumerate increasing index combinations - confirm
// against the callers.)
//
// Returns 1 when the sequence was advanced, 0 when every element already sits
// at its cap (the sequence is left unchanged in that case).
int Increment(IntArray& sequence, int maxCount)
{
   const int length = sequence.Length();
   int ceiling = maxCount;

   // Walk right-to-left past every slot that has already reached its cap
   int pos = length - 1;
   while(pos >= 0 && sequence[pos] >= ceiling){
      pos--;
      ceiling--;
   }

   // Nothing left to advance
   if(pos < 0) return 0;

   // Bump the chosen slot and rebuild the tail as consecutive values
   sequence[pos]++;
   for(int next = pos + 1; next < length; next++)
      sequence[next] = sequence[next-1] + 1;
   return 1;
}

// Binds the analysis to `pedigree` (kept by reference in `ped`) and installs
// the default settings.
HaplotypeAnalysis::HaplotypeAnalysis(Pedigree& pedigree)
   : ped(pedigree)
{
   // Any non-zero value enables the printed reports; 2 additionally enables
   // the extra tables guarded by `printFlag==2`
   printFlag = 1;

   // Frequency threshold compared against haplotype frequencies in PrepareHap()
   rareCutoff = 0.01;
}

void HaplotypeAnalysis::ReadData(const char* filename)
{
   FILE *fp = fopen(filename, "rb");
   if(fp==NULL) error("File %s cannot be opened.", filename);

   String line;
   StringArray tokens;
   StringArray tID;
   StringArray famID(0);
   StringArray pID(0);
   for(int i = 0; i < 2; i++) Haplotype[i].Dimension(0);
   while(!feof(fp)){
      line.ReadLine(fp);
      tokens.ReplaceColumns(line, ' ');
      if(tokens.Length()>2){
         Haplotype[tokens[1]=="HAPLO2"].Push(tokens[2]);
         tID.ReplaceTokens(tokens[0], "->");
         famID.Push(tID[0]);
         pID.Push(tID[1]);
      }
   }
   fclose(fp);

   count = Haplotype[0].Length();
   PedToHap.Dimension(ped.count);
   PedToHap.Set(-1);
   for(int i = 0; i < count; i++)
      for(int j = 0; j < ped.count; j++)
         if((ped[j].famid == famID[i*2]) && (ped[j].pid == pID[i*2]))
            PedToHap[j] = i;
}


// Computes, for the markers in `markerList`, the minor-allele frequency and
// the pairwise linkage disequilibrium between minor alleles.
//
// Results:
//    alleleFreq[i]   frequency of the minor allele of marker i
//    minorAllele[i]  which allele label (1 or 2) is the minor one
//    Rsquare[i][j]   r^2 between the minor alleles of markers i and j
//    D[i][j]         D' (D divided by its maximum attainable magnitude)
// Only the upper triangle (i < j) of D and Rsquare is filled.
//
// Frequencies are taken over the 2*count haplotypes read by ReadData().  A
// marker whose minor-allele frequency is 0 makes the r^2 and D' denominators
// zero; no special handling is applied to that case.
void HaplotypeAnalysis::ExploreMarker()
{
   const int markerCount = markerList.Length();
   alleleFreq.Dimension(markerCount);
   minorAllele.Dimension(markerCount);

   // Single-marker frequencies: count haplotypes whose character at this
   // marker equals the first character of allele label 2
   for(int i = 0; i < markerCount; i++){
      double sum = 0;
      for(int p = 0; p < count; p++)
         for(int k = 0; k < 2; k++)
            if(Haplotype[k][p][markerList[i]] == ped.markerInfo[markerList[i]]->alleleLabels[2][0])
               sum ++;
      sum /= (count*2);
      if(sum < 0.5){
         alleleFreq[i] = sum;
         minorAllele[i] = 2;
      }else{
         alleleFreq[i] = 1 - sum;
         minorAllele[i] = 1;
      }
   }

   // Joint frequency of carrying both minor alleles on the same haplotype
   Matrix freq2(markerCount, markerCount);
   freq2.Zero();
   for(int i = 0; i < markerCount; i++)
      for(int j = i+1; j < markerCount; j++){
         for(int k = 0; k < 2; k++)
            for(int p = 0; p < count; p++)
               if( (Haplotype[k][p][markerList[i]] == ped.markerInfo[markerList[i]]->alleleLabels[minorAllele[i]][0])
                  && (Haplotype[k][p][markerList[j]] == ped.markerInfo[markerList[j]]->alleleLabels[minorAllele[j]][0]) )
                  freq2[i][j] ++;
         freq2[i][j] /= (count*2);
   }

   D.Dimension(markerCount, markerCount);
   Rsquare.Dimension(markerCount, markerCount);
   for(int i = 0; i < markerCount; i++)
      for(int j = i+1; j < markerCount; j++){
         const double pi = alleleFreq[i];
         const double pj = alleleFreq[j];
         const double t1 = (1-pi)*pj;
         const double t2 = pi*(1-pj);
         const double d = freq2[i][j] - pi * pj;

         // t1*t2 == pi(1-pi)pj(1-pj)
         Rsquare[i][j] = d * d / (t1 * t2);

         // The bound on |D| depends on its sign:
         //    D >= 0 : min( pi(1-pj), (1-pi)pj )
         //    D <  0 : min( pi*pj, (1-pi)(1-pj) )
         // Using the first bound for a negative D understates |D'|.
         // A negative d implies pi*pj > 0, so this branch never divides by 0.
         double dmax;
         if(d < 0){
            const double t3 = pi*pj;
            const double t4 = (1-pi)*(1-pj);
            dmax = (t3<t4 ? t3: t4);
         }else
            dmax = (t1<t2 ? t1: t2);
         D[i][j] = d / dmax;
      }

}

void HaplotypeAnalysis::PrintSNP()
{
   printf("%15s%10s%10s", "MarkerName", "Position", "Freq(MA)");
   if( (SNPforHAP.Length()<8) || (printFlag==2) ) printf("%7s%7s", "(R2)", "(D')");
   printf("\n");

   for(int i = 0; i < SNPforHAP.Length(); i++){
      printf("%15s%10lf%7.3lf(%s)", (const char*)ped.markerNames[markerList[SNPforHAP[i]]],
         ped.markerInfo[markerList[SNPforHAP[i]]]->position*100, alleleFreq[SNPforHAP[i]],
         (const char*)ped.markerInfo[markerList[SNPforHAP[i]]]->alleleLabels[minorAllele[SNPforHAP[i]]]);
      if( (SNPforHAP.Length()<8) || (printFlag==2) )
         for(int j = 0; j < SNPforHAP.Length(); j++){
            if(i < j)   // upper
               printf("%7.3lf", D[SNPforHAP[i]][SNPforHAP[j]]);
            else if(i > j) // lower
               printf("%7.3lf", Rsquare[SNPforHAP[j]][SNPforHAP[i]]);
            else printf("%7s", "");
         }
      printf("\n");
   }
   printf("\n");
}

// Builds the haplotype coding over the SNPs in SNPforHAP.
//
// Steps:
//   1. Each individual's two haplotypes are reduced to the characters at the
//      selected SNPs; every distinct string gets a raw code stored in
//      hapCode[h][p] (numbered in order of first appearance).
//   2. Raw-code frequencies are computed over the 2*count haplotypes.
//   3. Raw codes are renumbered into groups, stored in `map` (raw code ->
//      group): group 0 is the most frequent haplotype, and all haplotypes
//      with frequency below rareCutoff share the single last group.  If that
//      pooled group is itself below rareCutoff, the least frequent remaining
//      haplotype is merged into it as well.
//   4. When printFlag is set, a table of groups, their frequencies and member
//      haplotypes is printed.
//
// On return hapCount is the number of groups (it temporarily holds the number
// of raw codes while the function runs).
void HaplotypeAnalysis::PrepareHap()
{
   // --- 1. raw codes ------------------------------------------------------
   StringIntHash hash;
   for(int i = 0; i < 2; i++) hapCode[i].Dimension(count);
   hapCount = 0;
   String shortHaplotype;
   StringArray haplotypeList(0);   // raw code -> haplotype string
   for(int p = 0; p < count; p++)
      for(int h = 0; h < 2; h++){
         shortHaplotype = "";
         for(int i = 0; i < SNPforHAP.Length(); i++)
            shortHaplotype += Haplotype[h][p][markerList[SNPforHAP[i]]];
         hapCode[h][p] = hash.Integer(shortHaplotype);
         if(hapCode[h][p]==-1) {   // -1 is treated as "not seen yet"
            hapCode[h][p] = hapCount;
            haplotypeList.Push(shortHaplotype);
            hash.SetInteger(shortHaplotype, hapCount++);
         }
      }

   // --- 2. raw-code frequencies --------------------------------------------
   Vector hapFreq(hapCount);
   hapFreq.Zero();
   for(int h = 0; h < 2; h++)
      for(int p = 0; p < count; p++)
         hapFreq[hapCode[h][p]] ++;
   for(int i = 0; i < hapCount; i++)
      hapFreq[i] /= (count*2);

   // One sort is enough: hapFreq does not change below.
   // NOTE(review): the grouping relies on index[] ordering hapFreq from
   // smallest to largest - confirm against QuickIndex.
   QuickIndex index;
   index.Index(hapFreq);

   // --- 3. grouping --------------------------------------------------------
   Vector freq;                    // group -> frequency
   freq.Dimension(0);
   map.Dimension(hapCount);

   // Highest sorted position that is still rare; every position at or below
   // it is pooled.  -1 means there is nothing to pool.
   int combinedHap = -1;
   for(int i = hapCount-1; i >= 0; i--)
      if(hapFreq[index[i]] < rareCutoff){
         combinedHap = i;
         break;
      }

   double pooledFreq = 0.0;
   if(combinedHap >= 0){
      for(int i = combinedHap; i >= 0; i--)
         pooledFreq += hapFreq[index[i]];
      // Pool still too rare: absorb the next haplotype, if one is left
      // (without the bound the index would run past the last haplotype)
      if( (pooledFreq < rareCutoff) && (combinedHap < hapCount-1) ){
         combinedHap ++;
         pooledFreq += hapFreq[index[combinedHap]];
      }
   }

   const int groupCount = (combinedHap < 0) ? hapCount : hapCount - combinedHap;
   IntArray *oldCodes = new IntArray[groupCount];   // group -> raw codes in it

   // Haplotypes kept on their own, most frequent first
   for(int i = hapCount-1; i > combinedHap; i--){
      const int group = hapCount - 1 - i;
      freq.Push(hapFreq[index[i]]);
      map[index[i]] = group;
      oldCodes[group].Dimension(1);
      oldCodes[group][0] = index[i];
   }

   // The pooled group, always last
   if(combinedHap >= 0){
      const int group = hapCount - 1 - combinedHap;
      freq.Push(pooledFreq);
      oldCodes[group].Dimension(0);
      for(int i = combinedHap; i >= 0; i--){
         map[index[i]] = group;
         oldCodes[group].Push(index[i]);
      }
   }

   // --- 4. report ----------------------------------------------------------
   if(printFlag){
      // Haplotype column is at least 10 wide, or the haplotype length plus 2
      int printLength = SNPforHAP.Length() > 9 ? SNPforHAP.Length()+2 : 10;
      printf("%5s%10s%*s%10s\n", "Code", "Frequency", printLength, "Haplotype", "Freqs");

      for(int i = 0; i < freq.Length(); i++){
         // First member shares the line with the group code and frequency
         printf("%5d%10.3lf%*s%10.3lf\n", i+1, freq[i], printLength,
            (const char*)haplotypeList[oldCodes[i][0]], hapFreq[oldCodes[i][0]]);
         // Further members of a pooled group follow on their own lines
         for(int j = 1; j < oldCodes[i].Length(); j++)
            printf("%5s%10s%*s%10.3lf\n", "", "", printLength,
               (const char*)haplotypeList[oldCodes[i][j]], hapFreq[oldCodes[i][j]]);
      }
   }
   delete []oldCodes;
   hapCount = freq.Length();
}

void HaplotypeAnalysis::regression(int trait)
{
   int covariateCount = covariates.Length();
   IntArray pheno(0);
   for(int i = 0; i < ped.count; i++){
      if(!ped[i].isPhenotyped(traits[trait])) continue;
      int valid = 1;
      for(int c = 0; c < covariates.Length(); c++)
         if(!ped[i].isControlled(covariates[c])){
            valid = 0;
            break;
         }
      if(valid) pheno.Push(i);
   }
   Vector Y;
   Matrix X;
   Y.Dimension(pheno.Length());
   X.Dimension(pheno.Length(), covariates.Length()+hapCount-1);
   X.Zero();
   for(int i = 0; i < pheno.Length(); i++){
      Y[i] = ped[pheno[i]].traits[traits[trait]];
      for(int c = 0; c < covariates.Length(); c++)
         X[i][c] = ped[pheno[i]].covariates[covariates[c]];
      int m1 = map[hapCode[0][PedToHap[pheno[i]]]];
      int m2 = map[hapCode[1][PedToHap[pheno[i]]]];
      if( (m1==m2) && (m1 < hapCount-1) )
         X[i][covariates.Length()+m1] = 2;
      else{
         if(m1 < hapCount-1)
            X[i][covariates.Length()+m1] = 1;
         if(m2 < hapCount-1)
            X[i][covariates.Length()+m2] = 1;
      }
   }                     
   for(int c = 0; c < covariateCount; c++){
      double mean = 0;
      for(int i = 0; i < pheno.Length(); i++)
         mean += X[i][c];
      mean /= pheno.Length();
      for(int i = 0; i < pheno.Length(); i++)
         X[i][c] -= mean;
   }
   ols.run(Y, X);
   loglik[trait] = ols.loglik;
   double stat = 2*(ols.loglik-loglik0[trait]);
   pvalue[trait] = chidist(stat,hapCount-1);

   if(printFlag){
      if(printFlag == 2){
         printf("\nGlobal Haplotype Model for Trait %s (N=%d, loglik=%.2lf)\n",
            (const char*)ped.traitNames[traits[trait]], pheno.Length(), ols.loglik);
         printf("===========================================================\n");
         printf("%10s%7s%10s%10s%7s%7s\n", "COV", "t", "pvalue", "Effect", "h2", "h2_adj");
         printf("%10s%7.2f%10.2G%10.2G\n",
            "mu", ols.t_statistic[0], ols.pvalue[0], ols.beta[0]);
         double tempR2 = 0;
         for(int c = 0; c < covariates.Length(); c++){
            printf("%10s%7.2f%10.2G%10.2G%6.1f%%\n",
               (const char*)ped.covariateNames[covariates[c]], ols.t_statistic[c+1],
               ols.pvalue[c+1], ols.beta[c+1], ols.R2[c]*100);
            tempR2 += ols.R2[c];
         }
         for(int m = 0; m < hapCount-1; m++)
            printf("      HAP%d%7.2f%10.2G%10.2G%6.1f%%%6.1f%%\n",
               m+1, ols.t_statistic[m+covariateCount+1],
               ols.pvalue[m+covariateCount+1], ols.beta[covariateCount+m+1],
               ols.R2[m+covariateCount]*100,
               ols.R2[m+covariateCount] / (1-tempR2) * 100 );
         printf("LRT for haplotype effect: %.2lf (df=%d)   p-value: %.3G\n",
            stat, hapCount-1, pvalue[trait]);
         if(ols.failure) printf("Collinearity exits. Be careful to interpret results\n");
         printf("\n");
      }
      printf("\nGlobal Haplotype Model for Trait %s (N=%d, loglik=%.2lf)\n",
            (const char*)ped.traitNames[traits[trait]], pheno.Length(), ols.loglik);
      printf("=======================================================\n");
      printf("%10s%7s%10s%10s%7s%7s\n", "COV", "t", "pvalue", "Effect", "h2", "h2_adj");
      double tempR2 = 0;
      for(int c = 0; c < covariateCount; c++){
         printf("%10s%7.2f%10.2G%10.2G%6.1f%%\n",
            (const char*)ped.covariateNames[covariates[c]], ols.t_statistic[c+1],
            ols.pvalue[c+1], ols.beta[c+1], ols.R2[c]*100);
         tempR2 += ols.R2[c];
      }
      Vector R2(hapCount);
      R2.Zero();
      for(int m = 0; m < hapCount-1; m++){
         double tstat = (ols.beta[covariateCount+m+1]+ols.beta[0]*0.5) /
            sqrt(ols.Cov[covariateCount+m+1][covariateCount+m+1]
            + ols.Cov[0][0]/4 + ols.Cov[covariateCount+m+1][0]);
         double tpvalue = tdist(fabs(tstat), ols.N - ols.P -1);
         R2[m] = ols.R2[m+covariateCount] *
            (1 + ols.beta[0] * 0.5 / ols.beta[covariateCount+m+1]);
         printf("      HAP%d%7.2f%10.2G%10.2G%6.1f%%%6.1f%%\n",
            m+1, tstat, tpvalue,
            ols.beta[covariateCount+m+1] + ols.beta[0]*0.5,
            R2[m] * 100,
            R2[m] / (1-tempR2) * 100 );
      }
      double tstat = ols.beta[0] / sqrt(ols.Cov[0][0]);
      double tpvalue = tdist(fabs(tstat), ols.N - ols.P -1);
      for(int m = 0; m < hapCount-1; m++)
         R2[hapCount-1] += ols.R2[covariateCount+m] / ols.beta[covariateCount+m+1];
      R2[hapCount-1] *= -ols.beta[0]*0.5;
      printf("      HAP%d%7.2f%10.2G%10.2G%6.1f%%%6.1f%%\n",
            hapCount, tstat, tpvalue,
            ols.beta[0]*0.5,
            R2[hapCount-1] * 100,
            R2[hapCount-1] / (1-tempR2) * 100 );
      printf("LRT for haplotype effect: %.2lf (df=%d)   p-value: %.3G\n",
         stat, hapCount-1, pvalue[trait]);
      printf("All covariates explain %.1f%% of total variance.\n", tempR2 * 100);
      tempR2 = 0;
      for(int m = 0; m < hapCount; m++)
         tempR2 += R2[m];
      printf("All haplotypes explain %.1f%% of total variance.\n", tempR2 * 100);
      if(ols.failure) printf("Collinearity exits. Be careful to interpret results\n");
      printf("\n");
   }
}

void HaplotypeAnalysis::regression1(int trait, int tHap)
{
   IntArray pheno(0);
   for(int i = 0; i < ped.count; i++){
      if(!ped[i].isPhenotyped(traits[trait])) continue;
      int valid = 1;
      for(int c = 0; c < covariates.Length(); c++)
         if(!ped[i].isControlled(covariates[c])){
            valid = 0;
            break;
         }
      if(valid) pheno.Push(i);
   }
   Vector Y;
   Matrix X;
   Y.Dimension(pheno.Length());
   X.Dimension(pheno.Length(), covariates.Length()+1);
   X.Zero();
   for(int i = 0; i < pheno.Length(); i++){
      Y[i] = ped[pheno[i]].traits[traits[trait]];
      for(int c = 0; c < covariates.Length(); c++)
         X[i][c] = ped[pheno[i]].covariates[covariates[c]];
      X[i][covariates.Length()] = (map[hapCode[0][PedToHap[pheno[i]]]] == tHap)
         + (map[hapCode[1][PedToHap[pheno[i]]]] == tHap);
   }
   ols.run(Y, X);
   loglik[trait] = ols.loglik;
   double stat = 2*(ols.loglik-loglik0[trait]);
   pvalue2[tHap] = chidist(stat,1);
   if(printFlag){
      printf("\nSingle Haplotype Model for Trait %s (N=%d, loglik=%.2lf)\n",
            (const char*)ped.traitNames[traits[trait]], pheno.Length(), ols.loglik);
      printf("===========================================================\n");
      printf("%10s%7s%10s%10s%7s%7s\n", "COV", "t", "pvalue", "Effect", "h2", "h2_adj");
      printf("%10s%7.2f%10.2G%10.2G\n",
         "mu", ols.t_statistic[0], ols.pvalue[0], ols.beta[0]);
      double tempR2 = 0;
      for(int c = 0; c < covariates.Length(); c++){
         printf("%10s%7.2f%10.2G%10.2G%6.1f%%\n",
            (const char*)ped.covariateNames[covariates[c]], ols.t_statistic[c+1],
            ols.pvalue[c+1], ols.beta[c+1], ols.R2[c]*100);
         tempR2 += ols.R2[c];
      }
      printf("      HAP%d%7.2f%10.2G%10.2G%6.1f%%%6.1f%%\n",
            tHap+1, ols.t_statistic[covariates.Length()+1],
            ols.pvalue[covariates.Length()+1],
            ols.beta[covariates.Length()+1],
            ols.R2[covariates.Length()]*100,
            ols.R2[covariates.Length()]/(1-tempR2)*100 );
      printf("LRT for haplotype effect: %.2lf (df=%d)   p-value: %.3G\n\n",
         stat, 1, pvalue2[tHap]);
   }
}

void HaplotypeAnalysis::regression0(int trait)
{
   IntArray pheno(0);
   for(int i = 0; i < ped.count; i++){
      if(!ped[i].isPhenotyped(traits[trait])) continue;
      int valid = 1;
      for(int c = 0; c < covariates.Length(); c++)
         if(!ped[i].isControlled(covariates[c])){
            valid = 0;
            break;
         }
      if(valid) pheno.Push(i);
   }
   Vector Y;
   Matrix X;
   Y.Dimension(pheno.Length());
   X.Dimension(pheno.Length(), covariates.Length());
   X.Zero();
   for(int i = 0; i < pheno.Length(); i++){
      Y[i] = ped[pheno[i]].traits[traits[trait]];
      for(int c = 0; c < covariates.Length(); c++)
         X[i][c] = ped[pheno[i]].covariates[covariates[c]];
   }
   ols.run(Y, X);
   loglik0[trait] = ols.loglik;
}















/*

   // Regression of trait values on haplotypes
   OLS_REGRESSION ols;
   Vector Y;
   Matrix X;

   for(int k = 0; k < traits.Length(); k++){
      IntArray pheno(0);
      for(int i = 0; i < ped.count; i++){
         if(!ped[i].isPhenotyped(traits[k])) continue;
         int valid = 1;
         for(int c = 0; c < covariates.Length(); c++)
            if(!ped[i].isControlled(covariates[c])){
               valid = 0;
               break;
            }
         if(valid) pheno.Push(i);
      }

      Y.Dimension(pheno.Length());
      X.Dimension(pheno.Length(), covariates.Length()+1);
      X.Zero();

      printf("\nSingle-SNP Analysis for Trait %s (N=%d)\n",
         (const char*)ped.traitNames[traits[k]], pheno.Length());
      printf("============================================\n");
      printf("%15s%10s%7s%7s%7s%10s%10s%7s\n",
         "MarkerName", "Position", "Allele", "Freq",
         "t", "pvalue", "Effect", "h2");
      for(int m = 0; m < markerList.Length(); m++){
         for(int i = 0; i < pheno.Length(); i++){
            Y[i] = ped[pheno[i]].traits[traits[k]];
            for(int c = 0; c < covariates.Length(); c++)
               X[i][c] = ped[pheno[i]].covariates[covariates[c]];
            int m1 = (Haplotype[0][PedToHap[pheno[i]]][markerList[m]]==
               ped.markerInfo[markerList[m]]->alleleLabels[minorAllele[m]][0]);
            int m2 = (Haplotype[1][PedToHap[pheno[i]]][markerList[m]]==
               ped.markerInfo[markerList[m]]->alleleLabels[minorAllele[m]][0]);
            if((m1==m2) && (m1==1))
               X[i][covariates.Length()] = 2;
            else if( (m1==1) || (m2==1) )
                  X[i][covariates.Length()] = 1;
            else X[i][covariates.Length()] = 0;
         }
         ols.run(Y, X);
         printf("%15s%10lf%7s%7.3lf%7.2lf%10.2G%10.2G%6.1lf%%\n",
            (const char*)ped.markerNames[markerList[m]],
            ped.markerInfo[markerList[m]]->position*100,
            (const char*)ped.markerInfo[markerList[m]]->alleleLabels[minorAllele[m]],
            alleleFreq[m], ols.t_statistic[covariates.Length()],
            ols.pvalue[covariates.Length()], ols.beta[covariates.Length()+1],
            ols.R2[covariates.Length()]*100);
      }
//   }
//   for(int k = 0; k < traits.Length(); k++){
      Y.Dimension(pheno.Length());
      X.Dimension(pheno.Length(), covariates.Length()+freq.Length()-1);
      X.Zero();
      for(int i = 0; i < pheno.Length(); i++){
         Y[i] = ped[pheno[i]].traits[traits[k]];
         for(int c = 0; c < covariates.Length(); c++)
            X[i][c] = ped[pheno[i]].covariates[covariates[c]];
         int m1 = hapCode[0][PedToHap[pheno[i]]];
         int m2 = hapCode[1][PedToHap[pheno[i]]];
         if( (m1==m2) && (m1 < freq.Length()-1) )
            X[i][covariates.Length()+m1] = 2;
         else{
            if(m1 < freq.Length()-1)
               X[i][covariates.Length()+m1] = 1;
            if(m2 < freq.Length()-1)
               X[i][covariates.Length()+m2] = 1;
         }
      }
      ols.run(Y, X);
      printf("\nHaplotype Analysis for Trait %s (N=%d)\n",
         (const char*)ped.traitNames[traits[k]], pheno.Length());
      printf("============================================\n");
      printf("%10s%7s%10s%10s%7s\n", "COV", "t", "pvalue", "Effect", "h2");
      for(int c = 0; c < covariates.Length(); c++)
         printf("%10s%7.2f%10.2G%10.2G%6.1f%%\n",
            (const char*)ped.covariateNames[covariates[c]], ols.t_statistic[c],
            ols.pvalue[c], ols.beta[c+1], ols.R2[c]*100);
      for(int m = 0; m < freq.Length()-1; m++)
         printf("      HAP%d%7.2f%10.2G%10.2G%6.1f%%\n",
            m+1, ols.t_statistic[m+covariates.Length()],
            ols.pvalue[m+covariates.Length()], ols.beta[covariates.Length()+m+1],
            ols.R2[m+covariates.Length()]*100);
   }

//   HapToPed.Dimension(count);
//   HapToPed.Set(-1);
//            HapToPed[i] = j;

   */

/*
   minorAllele.Set(2);
   for(int i = 0; i < markerList.Length(); i++){
      double sum = 0;
      int tcount = 0;
      for(int p = 0; p < ped.count; p++){
         if(ped[p].markers[markerList[i]].isKnown()){
            sum += ped[p].markers[markerList[i]].countAlleles(2);
            tcount += 2;
         }
      }
      sum /= tcount;
      if(sum < 0.5)
         alleleFreq[i] = sum;
      else{
         alleleFreq[i] = 1 - sum;
         minorAllele[i] = 1;
      }
   }
 */


