/* 
    FTEST.cxx

    Apply the F-test to the data for testing the two-lines regression vs one-line regression. The full model is the two-lines regression and the reduced model is the one-line regression (the null-hypothesis). The equation is:

          (RSS_R - RSS_F) / 2
     F = -----------------------
            RSS_F / (N - 4)


    where RSS = sum (a + b X_i - Y_i)^2. Both a and b is the linear fit of Y = a + b X. 
	    
    Input file: <<input.dat>>, with format of X_i Y_i

    compile: g++ -o ftest ftest.cxx
    
    Written: C. Ngeow (4/22/04)
    
*/

#include <iostream.h>
#include <fstream.h>
#include <math.h>
#include <stdlib.h>
#include <stdio.h>

/* Use in betacf */ 
#define MAXIT 100
#define EPS 3.0e-7
#define FPMIN 1.0e-30

// define global parameters
double a, b;           // ZP and slope
double siga, sigb;

void fitline(double x[], double y[], int M);
double betai(double a, double b, double x);
double betacf(double a, double b, double x);
double gammln(double xx);

void fitline(double x[], double y[], int M)
{
  double S, Sx, Sy;
  double Sxx, Sxy, delta;
  double chi2, sigma;

  S = 0.0;
  Sx = 0.0;
  Sy = 0.0;
  Sxx = 0.0;
  Sxy = 0.0;

  for(int i=1; i<=M; i++) {
    Sx += x[i];
    Sy += y[i];
    Sxx += x[i] * x[i];
    Sxy += x[i] * y[i];
    S += 1;
  }
  delta = S * Sxx - Sx * Sx;
 
  a = (Sxx * Sy - Sx * Sxy) / delta;
  b = (S * Sxy - Sx * Sy) / delta;

  siga = sqrt(Sxx/delta);
  sigb = sqrt(S/delta);

  chi2 = 0.0;
  for(int i=1; i<=M; i++) 
    chi2 += (y[i] - a - b * x[i]) * (y[i] - a - b * x[i]);

  siga = siga * sqrt(chi2/(float(M-2)));
  sigb = sigb * sqrt(chi2/(float(M-2)));
}

main()
{
  cout<<"  *** F-test for linear regressions *** \n\n";

  /* count data points */
  
  int count, N;
  double temp;

  ifstream inc("input.dat");
  
  count = 0;
  while(inc>>temp>>temp)
    count++;

  if(count == 0)
    {
      cout<<" Check input.dat, something wrong with the file. Aborting!\n";
      exit(0);
    }
  else
    N = count;

  inc.close();

  /* input the cut applied to X for two regressions */

  double xcut;

  cout<<" Input Xcut: ";
  cin>>xcut;

  /* input data */
  
  int Ml, Ms;
  double x[N+1], y[N+1];
  
  ifstream in("input.dat");

  Ml = 0;
  Ms = 0;

  for(int i=1; i<=N; i++) {
    in>>x[i]>>y[i];
    
    if(x[i] >= xcut)
      Ml++;
    else
      Ms++;
  }

  in.close();

  /* separate the data */

  int tl, ts;
  double xl[Ml+1], yl[Ml+1];
  double xs[Ms+1], ys[Ms+1];
  
  tl = 1; 
  ts = 1;

  for(int i=1; i<=N; i++) {
       
    if(x[i] >= xcut)
      {
	xl[tl] = x[i];
	yl[tl] = y[i];
	tl++;
      }
    else
      {
	xs[ts] = x[i];
	ys[ts] = y[i];
	ts++;
      }
  }

  /* calculate the regressions */

  double slope, zp;
  double slopel, zpl;
  double slopes, zps;

  cout<<"\n Results -- \n";
  printf("                Slope             ZP         N\n");

  fitline(x, y, N);
  slope = b;
  zp = a;
  printf(" All     : %2.3f +- %2.3f   %2.3f +- %2.3f  %d\n", slope, sigb, zp, siga, N);

  fitline(xs, ys, Ms);
  slopes = b;
  zps = a;
  printf(" X < %2.1f : %2.3f +- %2.3f   %2.3f +- %2.3f  %d\n", xcut, slopes, sigb, zps, siga, Ms);

  fitline(xl, yl, Ml);
  slopel = b;
  zpl = a;
  printf(" X > %2.1f : %2.3f +- %2.3f   %2.3f +- %2.3f  %d\n", xcut, slopel, sigb, zpl, siga, Ml);

  /* F test here */

  double SSE_l, SSE_s;
  double SSE, SSE2;   // SSE2 = A.H, SSE = Null
  double F1, F2, F;

  SSE_l = 0.0;
  for(int i=1; i<=Ml; i++) 
    SSE_l += (yl[i] - slopel*xl[i] - zpl) * (yl[i] - slopel*xl[i] - zpl);

  SSE_s = 0.0;
  for(int i=1; i<=Ms; i++) 
    SSE_s += (ys[i] - slopes*xs[i] - zps) * (ys[i] - slopes*xs[i] - zps);

  SSE = 0.0;
  for(int i=1; i<=N; i++) 
    SSE += (y[i] - slope*x[i] - zp) * (y[i] - slope*x[i] - zp);

  SSE2 = SSE_l + SSE_s;

  F1 = (SSE - SSE2) / 2.0;
  F2 = SSE2 / float(N - 4.0);

  F = F1 / F2;

  cout<<"\n";
  cout<<" The F value is: "<<F<<"\n";

  /* calculate the probability */

  double prob;
  double df1, df2;
  
  df1 = 0.5 * 2.0;
  df2 = 0.5* float(N-4.0);

  prob = betai(df2, df1, df2/(df2+df1*F));

  if(prob > 1.0)
    prob = 2.0 - prob;
  else
    prob = prob;

  cout<<" The P(F) value is: "<<prob<<"\n\n";
}

/* The following codes are taken from Numerical Recipes */

double betai(double a, double b, double x)
{
       
  double bt;

  if (x < 0.0 || x > 1.0) cout<<"Bad x in routine betai\n";
  if (x == 0.0 || x == 1.0) bt=0.0;
  else
    bt=exp(gammln(a+b)-gammln(a)-gammln(b)+a*log(x)+b*log(1.0-x));
  if (x < (a+1.0)/(a+b+2.0))
    return bt*betacf(a,b,x)/a;
  else
    return 1.0-bt*betacf(b,a,1.0-x)/b;
}

double gammln(double xx)
{
  double x,y,tmp,ser;
  static double cof[6]={76.18009172947146,-86.50532032941677,
			24.01409824083091,-1.231739572450155,
			0.1208650973866179e-2,-0.5395239384953e-5};
  int j;
  
  y=x=xx;
  tmp=x+5.5;
  tmp -= (x+0.5)*log(tmp);
  ser=1.000000000190015;
  for (j=0;j<=5;j++) ser += cof[j]/++y;
  return -tmp+log(2.5066282746310005*ser/x);
}

double betacf(double a, double b, double x)
{
  
  int m,m2;
  double aa,c,d,del,h,qab,qam,qap;
  
  qab=a+b;
  qap=a+1.0;
  qam=a-1.0;
  c=1.0;
  d=1.0-qab*x/qap;
  if (fabs(d) < FPMIN) d=FPMIN;
  d=1.0/d;
  h=d;
  for (m=1;m<=MAXIT;m++) {
    m2=2*m;
    aa=m*(b-m)*x/((qam+m2)*(a+m2));
    d=1.0+aa*d;
    if (fabs(d) < FPMIN) d=FPMIN;
    c=1.0+aa/c;
    if (fabs(c) < FPMIN) c=FPMIN;
    d=1.0/d;
    h *= d*c;
    aa = -(a+m)*(qab+m)*x/((a+m2)*(qap+m2));
    d=1.0+aa*d;
    if (fabs(d) < FPMIN) d=FPMIN;
    c=1.0+aa/c;
    if (fabs(c) < FPMIN) c=FPMIN;
    d=1.0/d;
    del=d*c;
    h *= del;
    if (fabs(del-1.0) < EPS) break;
  }
  if (m > MAXIT) cout<<"a or b too big, or MAXIT too small in betacf\n";
  return h;
}

#undef MAXIT
#undef EPS
#undef FPMIN



