#!/usr/bin/perl -w
use CGI qw(:standard :html3);
print header(), start_html("Perl's Student's T Test");
print h1("Results of Student's T Test");
my $data1 = param("set_1");
my $data2 = param("set_2");


$data1 =~ s/,/ /g; #replace "," with " "
$data2 =~ s/,/ /g;


@set_a = split(/\s+/, $data1);
@set_b = split(/\s+/, $data2);

$view_set_a = join(", ",@set_a);
$view_set_b = join(", ",@set_b);


#print p("Heres set 1 (containing ",@set_a+0,," data points):",br(),"$view_set_a"); 
#print p("Heres set 2 (containing ",@set_b+0,," data points):",br(),"$view_set_b"); 


print table({-border=>undef},
        caption(strong('First Data Set '),@set_a+0,' values'),
        Tr({-align=>CENTER,-valign=>TOP},
        [
           td([@set_a]),
        ]
      )
);
print p();
print table({-border=>undef},
        caption(strong('Second Data Set '),@set_b+0,' values'),
        Tr({-align=>CENTER,-valign=>TOP},
        [
           td([@set_b]),
        ]
      )
);





foreach (@set_a) { $sum_a += $_ };
foreach (@set_b) { $sum_b += $_ };

$avg_a = $sum_a / @set_a;
$avg_b = $sum_b / @set_b;

foreach (@set_a) { $var_a += ( $_ - $avg_a )**2 }; 
$var_a /= (@set_a-1);

foreach (@set_b) { $var_b += ( $_ - $avg_b )**2 }; 
$var_b /= (@set_b-1);

$df = @set_a + @set_b -2;

# is the below var*n or var*(n-1)? I think the latter.  see pp 90,
# 194, 232 of Schaum's outline

$sigma = sqrt( ((@set_a-1)*$var_a + (@set_b-1)*$var_b) / $df );

$t = ($avg_a - $avg_b) / ($sigma * sqrt( 1/@set_a + 1/@set_b));


#print p($sum_a, " ", @set_a+0, " ", $avg_a, " ", $var_a, "\n") ;
#print p($sum_b, " ", @set_b+0, " ", $avg_b, " ", $var_b, "\n") ;
#print p(" t is $t\ndf is $df\n");
$p= tdist($t,$df);

if ($p > 0.5) {

$p = 1.0 - $p;

}

if ($p < 0.0001) {
  $neat_p = "<0.0001";
  } else {
  $neat_p = sprintf("%.4f",$p);
}


#print p("and the p is $p\n");


print table({-border=>undef},
        caption(strong('Summary Statistics')),
        Tr({-align=>CENTER,-valign=>TOP},
        [
           th(['','n ','sum','avg','variance','std. dev.']),

           th('Set 1').td([@set_a+0,$sum_a,sprintf("%.4f",$avg_a),
                           sprintf("%.4f",$var_a),sprintf("%.4f",sqrt($var_a))]),
           th('Set 2').td([@set_b+0,$sum_b,sprintf("%.4f",$avg_b),
                           sprintf("%.4f",$var_b),sprintf("%.4f",sqrt($var_b))]),
        ]
      )
);

print p();

print table({-border=>undef},
        caption(strong('Student\'s T Test Results')),
        Tr({-align=>CENTER,-valign=>TOP},
        [
           th(['t value','df','P value']),
           td([sprintf("%.4f",$t),$df,$neat_p]),
        ]
      )
);




print end_html();

# tdist, which finds the probability that corresponds to a given value
# of t with k degrees of freedom.  This algorithm is translated from a
# pascal function on p81 of "Statistical Computing in Pascal" by D
# Cooke, A H Craven & G M Clark (1985: Edward Arnold (Pubs.) Ltd:
# London).  The above Pascal algorithm is itself a translation of the
# fortran algoritm "AS 3" by B E Cooper of the Atlas Computer
# Laboratory as reported in (among other places) "Applied Statistics
# Algorithms", editied by P Griffiths and I D Hill (1985; Ellis
# Horwood Ltd.; W. Sussex, England).  I used the Pascal algorithm as
# it was a much more enjoyable read and the authors put up the actual
# equations, so you can see what's going on.  Anyway, I wanted to give
# credit where credit is due and since the algorithm's so short, I
# have to do something with all the room left over...


# arguments it takes are: tdist(t_value, degrees_of_freedom)

sub tdist {

my ($tt, $tk, $ta, $tc, $ts, $ti, $tsum, $tterm, $ttheta);

($tt, $tk) = @_ ;
$ta = 0.636619772; # this is 2/pi, roughly
$tterm = $tk;
$ttheta = atan2($tt,sqrt($tterm));
$tc = cos($ttheta);
$ts = sin($ttheta);
$tsum = 0;

if ($tk%2 == 1) {
  $ti = 3;
  $tterm = $tc;
} else {
  $ti = 2;
  $tterm = 1;
}

$tsum = $tterm;

while ( $ti < $tk) {
  $tterm *= $tc*$tc*($ti-1)/$ti;
  $tsum += $tterm;
  $ti += 2;
}

$tsum *= $ts;
$tsum = $ta*($tsum+$ttheta) if (($tk % 2) == 1);
return (0.5*(1+$tsum));

}