#!/usr/bin/perl -w use CGI qw(:standard :html3); print header(), start_html("Perl's Student's T Test"); print h1("Results of Student's T Test"); my $data1 = param("set_1"); my $data2 = param("set_2"); $data1 =~ s/,/ /g; #replace "," with " " $data2 =~ s/,/ /g; @set_a = split(/\s+/, $data1); @set_b = split(/\s+/, $data2); $view_set_a = join(", ",@set_a); $view_set_b = join(", ",@set_b); #print p("Heres set 1 (containing ",@set_a+0,," data points):",br(),"$view_set_a"); #print p("Heres set 2 (containing ",@set_b+0,," data points):",br(),"$view_set_b"); print table({-border=>undef}, caption(strong('First Data Set '),@set_a+0,' values'), Tr({-align=>CENTER,-valign=>TOP}, [ td([@set_a]), ] ) ); print p(); print table({-border=>undef}, caption(strong('Second Data Set '),@set_b+0,' values'), Tr({-align=>CENTER,-valign=>TOP}, [ td([@set_b]), ] ) ); foreach (@set_a) { $sum_a += $_ }; foreach (@set_b) { $sum_b += $_ }; $avg_a = $sum_a / @set_a; $avg_b = $sum_b / @set_b; foreach (@set_a) { $var_a += ( $_ - $avg_a )**2 }; $var_a /= (@set_a-1); foreach (@set_b) { $var_b += ( $_ - $avg_b )**2 }; $var_b /= (@set_b-1); $df = @set_a + @set_b -2; # is the below var*n or var*(n-1)? I think the latter. see pp 90, # 194, 232 of Schaum's outline $sigma = sqrt( ((@set_a-1)*$var_a + (@set_b-1)*$var_b) / $df ); $t = ($avg_a - $avg_b) / ($sigma * sqrt( 1/@set_a + 1/@set_b)); #print p($sum_a, " ", @set_a+0, " ", $avg_a, " ", $var_a, "\n") ; #print p($sum_b, " ", @set_b+0, " ", $avg_b, " ", $var_b, "\n") ; #print p(" t is $t\ndf is $df\n"); $p= tdist($t,$df); if ($p > 0.5) { $p = 1.0 - $p; } if ($p < 0.0001) { $neat_p = "<0.0001"; } else { $neat_p = sprintf("%.4f",$p); } #print p("and the p is $p\n"); print table({-border=>undef}, caption(strong('Summary Statistics')), Tr({-align=>CENTER,-valign=>TOP}, [ th(['','n ','sum','avg','variance','std. dev.']), th('Set 1').td([@set_a+0,$sum_a,sprintf("%.4f",$avg_a), sprintf("%.4f",$var_a),sprintf("%.4f",sqrt($var_a))]), th('Set 2').td([@set_b+0,$sum_b,sprintf("%.4f",$avg_b), sprintf("%.4f",$var_b),sprintf("%.4f",sqrt($var_b))]), ] ) ); print p(); print table({-border=>undef}, caption(strong('Student\'s T Test Results')), Tr({-align=>CENTER,-valign=>TOP}, [ th(['t value','df','P value']), td([sprintf("%.4f",$t),$df,$neat_p]), ] ) ); print end_html(); # tdist, which finds the probability that corresponds to a given value # of t with k degrees of freedom. This algorithm is translated from a # pascal function on p81 of "Statistical Computing in Pascal" by D # Cooke, A H Craven & G M Clark (1985: Edward Arnold (Pubs.) Ltd: # London). The above Pascal algorithm is itself a translation of the # fortran algoritm "AS 3" by B E Cooper of the Atlas Computer # Laboratory as reported in (among other places) "Applied Statistics # Algorithms", editied by P Griffiths and I D Hill (1985; Ellis # Horwood Ltd.; W. Sussex, England). I used the Pascal algorithm as # it was a much more enjoyable read and the authors put up the actual # equations, so you can see what's going on. Anyway, I wanted to give # credit where credit is due and since the algorithm's so short, I # have to do something with all the room left over... # arguments it takes are: tdist(t_value, degrees_of_freedom) sub tdist { my ($tt, $tk, $ta, $tc, $ts, $ti, $tsum, $tterm, $ttheta); ($tt, $tk) = @_ ; $ta = 0.636619772; # this is 2/pi, roughly $tterm = $tk; $ttheta = atan2($tt,sqrt($tterm)); $tc = cos($ttheta); $ts = sin($ttheta); $tsum = 0; if ($tk%2 == 1) { $ti = 3; $tterm = $tc; } else { $ti = 2; $tterm = 1; } $tsum = $tterm; while ( $ti < $tk) { $tterm *= $tc*$tc*($ti-1)/$ti; $tsum += $tterm; $ti += 2; } $tsum *= $ts; $tsum = $ta*($tsum+$ttheta) if (($tk % 2) == 1); return (0.5*(1+$tsum)); }