Question

Currently I'm working on an awk script to do some statistical analysis on measurement data. I'm using linear regression to get parameter estimates, standard errors etc. and would also like to compute the p-value for a null-hypothesis test (t-test).

This is my script so far, any idea how to compute the p-value?

BEGIN {
    ybar = 0.0
    xbar = 0.0
    n = 0
    a0 = 0.0
    b0 = 0.0
    qtinf0975 = 1.960 # 5% n = inf
}

{ # y_i is in $1, x_i has to be counted
    n = n + 1
    yi[n] = $1*1.0
    xi[n] = n*1.0
}

END {
    for ( i = 1; i <= n ; i++ ) {
        ybar = ybar + yi[i]
        xbar = xbar + xi[i]
    }
    ybar = ybar/(n*1.0)
    xbar = xbar/(n*1.0)

    bhat = 0.0
    ssqx = 0.0
    for ( i = 1; i <= n; i++ ) {
        bhat = bhat + (yi[i] - ybar)*(xi[i] - xbar)
        ssqx = ssqx + (xi[i] - xbar)*(xi[i] - xbar)
    }
    bhat = bhat/ssqx
    ahat = ybar - bhat*xbar

    print "n: ", n
    print "alpha-hat: ", ahat
    print "beta-hat: ", bhat

    sigmahat2 = 0.0
    for ( i = 1; i <= n; i++ ) {
        ri[i] = yi[i] - (ahat + bhat*xi[i])
        sigmahat2 = sigmahat2 + ri[i]*ri[i]
    }
    sigmahat2 = sigmahat2 / ( n*1.0 - 2.0 )

    print "sigma-hat square: ", sigmahat2

    seb = sqrt(sigmahat2/ssqx)

    print "se(b): ", seb

    sigmahat = sqrt((seb*seb)*ssqx)
    print "sigma-hat: ", sigma
    sea = sqrt(sigmahat*sigmahat * ( 1 /(n*1.0) + xbar*xbar/ssqx))

    print "se(a): ", sea


    # Tests

    print "q(inf)(97.5%): ", qtinf0975

    Tb = (bhat - b0) / seb
    if ( qtinf0975 > Tb )
        print "T(b) plausible: ", Tb, " < ", qtinf0975
    else
        print "T(b) NOT plausible: ", Tb, " > ", qtinf0975

    print "confidence(b): [", bhat - seb * qtinf0975,", ", bhat + seb * qtinf0975 ,"]"

    Ta = (ahat - a0) / sea
    if ( qtinf0975 > Ta )
        print "T(a) plausible: ", Ta, " < ", qtinf0975
    else
        print "T(a) NOT plausible: ", Ta, " > ", qtinf0975

    print "confidence(a): [", ahat - seb * qtinf0975,", ", ahat + seb * qtinf0975 ,"]"
}
Was it helpful?

Solution 2

OK, I've found a javascript implementation and ported it to awk this are the functions used to compute the p-value:

function statcom ( mq, mi, mj, mb )
{
    zz = 1
    mz = zz
    mk = mi
    while ( mk <= mj ) {
        zz = zz * mq * mk / ( mk - mb)
        mz = mz + zz
        mk = mk + 2
    }
    return mz
}

function studpval ( mt , mn )
{
    PI = 3.1415926535897932384626433832795028841971693993751058209749445923078164062862089986280348253421170679 # thank you wikipedia
    if ( mt < 0 )
        mt = -mt
    mw = mt / sqrt(mn)
    th = atan2(mw, 1)
    if ( mn == 1 )
        return 1.0 - th / (PI/2.0)
    sth = sin(th)
    cth = cos(th)
    if ( mn % 2 == 1 )
        return 1.0 - (th+sth*cth*statcom(cth*cth, 2, mn-3, -1))/(PI/2.0)
    else
        return 1.0 - sth * statcom(cth*cth, 1, mn-3, -1)
}

I've integrated them like this:

    pvalb = studpval(Tb, n)
    if ( pvalb > 0.05 )
        print "p-value(b) plausible: ", pvalb, " > 0.05"
    else
        print "p-value(b) NOT plausible: ", pvalb, " < 0.05"

    pvala = studpval(Ta, n)
    if ( pvala > 0.05 )
        print "p-value(a) plausible: ", pvala, " > 0.05"
    else
        print "p-value(a) NOT plausible: ", pvala, " < 0.05"

OTHER TIPS

You're probably trying to do a paired t-test under the assumption of variance equality. I suggest you have a look at the corresponding entry in the excellent MathWorld website.

Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top