view hexagram-6ae12361157c/hexagram/statistics.js @ 0:1407e3634bcf draft default tip

Uploaded r11 from test tool shed.
author adam-novak
date Tue, 22 Oct 2013 14:17:59 -0400
parents
children
line wrap: on
line source

// statistics.js: Web Worker file to run statistical tests in the background.

// Constants:
// How many pseudocount trials should we use for the binomial test? These
// smooth the background success probability, so an all-zero background can
// still yield a usable p value.
var BINOMIAL_PSEUDOCOUNTS = 5;

// Should we log information about suspicious p values (exactly 0 or 1) to the
// console for manual spot checking?
var LOG_SUSPICIOUS = false;

// Go get jStat. Hope it's happy in Worker-land.
importScripts("jstat-1.0.0.js");

// Make a fake console to catch jStat warnings, so they don't crash the script.
// Warnings are forwarded to the parent page via print() (defined below;
// function declarations are hoisted, so it is available here).
console = {
    warn: print
}

onmessage = function(message) {
    // Handle incoming messages from the page. Each message's data is an RPC
    // request, with "name" set to a global function name, "args" set to an
    // array of arguments, and "id" set to an opaque job ID that is returned
    // with the return value in a reply message. If the function call throws,
    // an error string is sent back instead of a return value.
    
    try {
        // Go get the specified global function, and apply it on the given
        // arguments. Use the global scope ("self") as its "this".
        var return_value = self[message.data.name].apply(self, 
            message.data.args);
        
    } catch(exception) {
    
        // Send the error back to the page instead of a return value.
        // Unfortunately, Error objects themselves can't be structured-cloned,
        // so we do all the message making here and send back a string.
        
        // First we build a string with all the parts of the error we can get.
        var error_message = "Error in web worker doing job " + message.data.id;
        error_message += "\n";
        error_message += exception.name + ": " + exception.message;
        error_message += "\n";
        error_message += "Full details:\n";
        // Declare the loop variable with var; the original leaked "field" as
        // an implicit global.
        for(var field in exception) {
            if(field == "name" || field == "message") {
                // Already got these.
                continue;
            }
            
            // Copy the field into the message as a string.
            error_message += field + ": " + exception[field] + "\n";
        }
        
        // Describe the call that failed, e.g. "func(arg1, arg2)".
        error_message += "Call: " + message.data.name + "(";
        for(var i = 0; i < message.data.args.length; i++) {
            error_message += message.data.args[i];
            if(i + 1 < message.data.args.length) {
                // Have an argument after this one, so separate them.
                error_message += ", ";
            }
        }
        error_message += ")";

        postMessage({
            id: message.data.id,
            error: error_message
        });
        
        return;
    }
    
    // Send the return value back with the job id.
    postMessage({
        id: message.data.id,
        return_value: return_value
    });
};

function print(message) {
    // Relay a log line to the parent page's console. The page watches for
    // messages carrying only a "log" field (no job id) and prints them.
    postMessage({ log: message });
}

function statistics_for_matrix(matrix_url, in_list, out_list, all_list) {
    // Download the given score matrix, do stats between in_list and out_list
    // for each layer (column) in it, and return an object from layer name to
    // the per-layer result of statistics_for_layer. all_list specifies the
    // names of all signatures that figure into the analysis at all.
    
    // Download the matrix synchronously. 
    // See https://developer.mozilla.org/en-US/docs/Web/API/XMLHttpRequest/Synch
    // ronous_and_Asynchronous_Requests
    // A side effect of this is that we won't have more simultaneous downloads 
    // than workers, which is probably good.
    // This holds the request.
    var request = new XMLHttpRequest();
    // Get the matrix data by GET. The false makes it synchronous.
    request.open("GET", matrix_url, false);
    request.send(null);
    
    // Now we have the matrix TSV.
    // But we don't have our fancy jQuery TSV parser here. Parse it manually.
    
    // This holds an object of layer data objects (from signature to float) by
    // layer name. Declared with var; the original leaked this as an implicit
    // global, which could let concurrent jobs in one worker stomp each other.
    var layers = {};

    // This holds the array of lines.
    // Split on newlines (as seen in jQuery.tsv.js).
    var lines = request.responseText.split(/\r?\n/);
    
    // Line 0 gives all the layer names, but the first thing isn't a layer name
    // (since it's above the signature column), so start at column 1.
    var layer_names = lines[0].split(/\t/);
    for(var i = 1; i < layer_names.length; i++) {
        // Make sure we have an object for this layer.
        layers[layer_names[i]] = {};
    }
    
    // The rest give values per layer for the signature named in column 0.
    for(var i = 1; i < lines.length; i++) {
        // This holds the tab-separated parts of each line.
        var parts = lines[i].split(/\t/);
        
        if(parts[0]) {
            // We actually have data (this skips blank/trailing lines).
            
            // Get the signature name.
            var signature = parts[0];
            
            for(var j = 1; j < parts.length; j++) {
                // Go through each non-signature entry and set the appropriate
                // layer's value for this signature.
                layers[layer_names[j]][signature] = parseFloat(parts[j]);
            }
        }
    }
    
    // Now we've parsed the matrix.
    // Go do stats for each layer.
    // This holds our calculated p values by layer name.
    var p_values = {};
    
    print("Running statistics for (up to) " + layer_names.length + 
        " layers from matrix " + matrix_url);
    
    for(var i = 1; i < layer_names.length; i++) {
        // Pass the layer data to the per-layer statistics, and get the p value
        // back. It's probably easier to do this in this worker than to go
        // invoke more workers.
        p_values[layer_names[i]] = statistics_for_layer(layers[layer_names[i]],
            in_list, out_list, all_list);
    }
    
    // We've now calculated a p value for every layer in the matrix. Return the
    // calculated p values labeled by layer.
    return p_values;
    
}

function statistics_for_layer(layer_data, in_list, out_list, all_list) {
    // Dispatch to the statistical test appropriate for the passed layer data,
    // between the given in and out arrays of signatures. all_list specifies
    // the names of all signatures that figure into the analysis at all.
    // Return the test's result for the layer, or NaN if no test applies.

    // Scan the values once and note which data-type assumptions they break.
    // Any fractional part means the layer is continuous; any value outside
    // [0, 1] means it is at least not binary.
    var saw_fraction = false;
    var saw_out_of_range = false;
    
    for(var signature in layer_data) {
        var value = layer_data[signature];
        
        if(value > 1 || value < 0) {
            saw_out_of_range = true;
        }
        
        if(value % 1 !== 0) {
            saw_fraction = true;
        }
    }
    
    if(saw_fraction) {
        // This is a continuous layer, so run a t test.
        return t_compare(layer_data, in_list, out_list, all_list);
    }
    
    if(saw_out_of_range) {
        // Integers outside 0/1: a multinomial/categorical layer.
        // TODO: statistics for discrete non-binary layers
        return NaN;
    }
    
    // Only 0s and 1s: a binary/dichotomous layer, so run a binomial test.
    return binomial_compare(layer_data, in_list, out_list, all_list);
}

function statistics_for_url(layer_url, in_list, out_list, all_list) {
    // Run the stats for the layer with the given url, between the given in
    // and out arrays of signatures. all_list specifies the names of all
    // signatures that figure into the analysis at all. Return the p value for
    // the layer, or NaN if no p value could be calculated.
    
    print("Running statistics for individual layer " + layer_url);
    
    // Fetch the layer TSV synchronously (see
    // https://developer.mozilla.org/en-US/docs/Web/API/XMLHttpRequest/Synch
    // ronous_and_Asynchronous_Requests). This conveniently also bounds the
    // number of simultaneous downloads by the number of workers.
    var request = new XMLHttpRequest();
    request.open("GET", layer_url, false);
    request.send(null);
    
    // We don't have our fancy jQuery TSV parser in the worker, so parse the
    // "signature<TAB>value" lines by hand into an object from signature name
    // to float.
    var layer_data = {};
    
    // Split on newlines (as seen in jQuery.tsv.js).
    var lines = request.responseText.split(/\r?\n/);
    
    for(var line_number = 0; line_number < lines.length; line_number++) {
        // Break the line into its tab-separated fields.
        var fields = lines[line_number].split(/\t/);
        
        if(fields[0]) {
            // A non-empty signature name means real data: record its value.
            layer_data[fields[0]] = parseFloat(fields[1]);
        }
    }
    
    // Run stats on the downloaded data.
    return statistics_for_layer(layer_data, in_list, out_list, all_list);
}

function t_compare(layer_data, in_list, out_list, all_list) {
    // Given the data of a continuous layer object (an object from signature
    // name to float (or undefined)), and arrays of the names of "in" and "out"
    // signatures, do a t test for whether the in signatures differ from
    // the out signatures. Returns an object of metadata, with "p_value" set to
    // the p value of the test (two-tailed), or returns NaN directly if the
    // test cannot be performed (due to, e.g. fewer than 2 samples in one
    // category, or zero variance in both). all_list is accepted for interface
    // consistency with binomial_compare but is not used here.
    
    // Go through the in list and calculate all the summary statistics.
    // How many non-NaN values? (Global isNaN is used deliberately: missing
    // signatures give undefined, which it also treats as NaN.)
    var number_in = 0;
    // What is the sum?
    var sum_in = 0;
    
    for(var i = 0; i < in_list.length; i++) {
        if(!isNaN(layer_data[in_list[i]])) {
            number_in++;
            sum_in += layer_data[in_list[i]];
        }
    }
    
    // We've done one pass, so we know if we have any in list data actually
    if(number_in < 2) {
        // Not enough to run the t test
        return NaN;
    }
    
    // What is the mean?
    var mean_in = sum_in / number_in;
    
    // What is the second moment (sum of squares of differences from the mean)
    var second_moment_in = 0;
    for(var i = 0; i < in_list.length; i++) {
        if(!isNaN(layer_data[in_list[i]])) {
            second_moment_in += Math.pow(layer_data[in_list[i]] - mean_in, 2);
        }
    }
    
    // What is the unbiased variance? Declared with var; the original leaked
    // this as an implicit global, which concurrent jobs could stomp.
    var unbiased_variance_in = second_moment_in / (number_in - 1);
    
    // Now go through the same process for the out list
    // How many non-NaN values?
    var number_out = 0;
    // What is the sum?
    var sum_out = 0;
    
    for(var i = 0; i < out_list.length; i++) {
        if(!isNaN(layer_data[out_list[i]])) {
            number_out++;
            sum_out += layer_data[out_list[i]];
        }
    }
    
    // We've done one pass, so we know if we have any out list data actually
    if(number_out < 2) {
        // Not enough to run the t test
        return NaN;
    }
    
    // What is the mean?
    var mean_out = sum_out / number_out;
    
    // What is the second moment (sum of squares of differences from the mean)
    var second_moment_out = 0;
    for(var i = 0; i < out_list.length; i++) {
        if(!isNaN(layer_data[out_list[i]])) {
            second_moment_out += Math.pow(layer_data[out_list[i]] - mean_out, 
                2);
        }
    }
    
    // What is the unbiased variance? Also declared with var (was an implicit
    // global).
    var unbiased_variance_out = second_moment_out / (number_out - 1);
    
    // We can't do the test if both variances are 0 (the t statistic would
    // divide by zero).
    if(unbiased_variance_in == 0 && unbiased_variance_out == 0) {
        return NaN;
    }
    
    // Now we can calculate the t test two-tailed p value
    var p_value = t_test(mean_in, unbiased_variance_in, number_in, mean_out, 
        unbiased_variance_out, number_out);
        
    // And return it in a dict with other metadata.
    // We don't really have any other metadata.
    return {
        p_value: p_value
    };
}

function t_test(mean_in, unbiased_variance_in, number_in, mean_out, 
    unbiased_variance_out, number_out) {

    // Given the mean, unbiased variance, and number of samples for both the
    // in group and the out group, compute the p value for a two-tailed
    // Welch's t test (unequal sample sizes, unequal variances), testing
    // whether the means differ. See
    // https://en.wikipedia.org/wiki/Student%27s_t-test
    // Assumes we have enough samples to actually perform the test.
    
    // Each group contributes the variance of its mean estimate.
    var variance_term_in = unbiased_variance_in / number_in;
    var variance_term_out = unbiased_variance_out / number_out;
    
    // The t statistic: the difference of the means, in units of its standard
    // error.
    var t_statistic = (mean_in - mean_out) /
        Math.sqrt(variance_term_in + variance_term_out);
    
    // Welch-Satterthwaite approximation for the degrees of freedom of the t
    // distribution to compare the statistic against.
    var degrees_of_freedom =
        Math.pow(variance_term_in + variance_term_out, 2) /
        ((Math.pow(variance_term_in, 2) / (number_in - 1)) +
        (Math.pow(variance_term_out, 2) / (number_out - 1)));

    // Compare the t statistic to the t distribution CDF, available via the
    // totally undocumented jstat.pt = function(q, df, ncp, lower_tail, log):
    // q is the value to evaluate the CDF at; df is the degrees of freedom;
    // ncp is the t distribution's "mu" parameter (OK to leave blank);
    // lower_tail selects which tail (defaults to true); log asks for log
    // probability (defaults to false).
    
    // Put the statistic on the low side of the distribution and take the
    // lower tail's area.
    var one_tail_probability = jstat.pt(0 - Math.abs(t_statistic),
        degrees_of_freedom);
    
    // The t distribution is symmetric, so the two-tailed p value is just
    // twice the single-tail probability.
    return 2 * one_tail_probability;
}

function binomial_compare(layer_data, in_list, out_list, all_list) {
    // Given the data of a binary layer object (an object from signature name
    // to 0 or 1 (or undefined)), and arrays of the names of "in" and "out"
    // signatures, do a binomial test for whether the in signatures differ
    // from the out signatures, using BINOMIAL_PSEUDOCOUNTS pseudocount
    // trials. Returns an object of metadata with "p_value" set to the test's
    // p value (two-tailed), or NaN directly if the test cannot be performed.
    // all_list specifies the names of all signatures that figure into the
    // analysis at all (i.e. those the user hasn't filtered out), which sets
    // the success rate used for the pseudocounts. Signature names appearing
    // in all_list but with no data in layer_data are not counted.
    
    // Tally how many signatures in a list are 1s and how many are 0s.
    // Signatures with no data (undefined) count as neither.
    var count_values = function(list) {
        var counts = { yes: 0, no: 0 };
        for(var i = 0; i < list.length; i++) {
            var value = layer_data[list[i]];
            if(value === 1) {
                counts.yes++;
            } else if(value === 0) {
                counts.no++;
            }
        }
        return counts;
    };
    
    // Work out the background distribution from the out list. It's OK for
    // all the outside hexes to be 0; pseudocounts can still give us a p
    // value.
    var outside = count_values(out_list);
    
    // Tally the whole filtered set, to set the pseudocount success rate.
    var all = count_values(all_list);
    
    if(all.yes + all.no == 0) {
        // It's not OK for there to be no hexes with data in the all set.
        // Maybe the user filtered out all the ones with any data?
        // TODO: Sure wish we had layer names here.
        print("No signatures were available with data for this layer.");
        return NaN;
    }
    
    // Calculate how many pseudo-yeses we should have: match the success
    // frequency across all signatures. (Pseudo-trials is just
    // BINOMIAL_PSEUDOCOUNTS.)
    var pseudo_yes = BINOMIAL_PSEUDOCOUNTS * (all.yes / (all.yes + all.no));
    
    // This holds the probability of being a 1 for the out list, smoothed by
    // the pseudocounts. We test whether the in list differs from this.
    var background_probability = (outside.yes + pseudo_yes) /
        (outside.yes + outside.no + BINOMIAL_PSEUDOCOUNTS);

    if(background_probability == 0) {
        // Can't do the binomial test in this case. Somehow there were no
        // yeses anywhere.
        return NaN;
    }
    
    // Count the 1s and 0s in the in list; these are the test's trials.
    var inside = count_values(in_list);

    // Get the p value for rejecting the null hypothesis that the in
    // signatures follow the background distribution.
    var p = binomial_test(inside.yes + inside.no, inside.yes,
        background_probability);
        
    if(LOG_SUSPICIOUS && (p == 0 || p == 1)) {
        // We got an odd p value. Complain about it.
        print("Got suspicious p value " + p);
        print("Was binomial test for " + inside.yes + " successes in " + 
            (inside.yes + inside.no) + " trials at probability " + 
            background_probability);
        print("Background was " + outside.yes + " out of " + (outside.yes + 
            outside.no) + " with " + pseudo_yes + " out of " + 
            BINOMIAL_PSEUDOCOUNTS + " pseudocounts.");
    }
     
    // Return our p value as "p_value", and also how many non-pseudocount
    // successes were in the in_list and the out_list.
    return {
        p_value: p,
        inside_yes: inside.yes,
        outside_yes: outside.yes
    };
}
    
function binomial_test(trials, successes, success_probability) {
    // Return the p value for rejecting the null hypothesis that the observed
    // number of successes happened in the observed number of trials when the
    // probability of success was success_probability. Does a two-tailed exact
    // Binomial test: the p value is the total probability of every outcome at
    // most as likely as the one observed.
    
    if(trials < successes) {
        // This should be impossible; log it for debugging. Note that we carry
        // on anyway and will produce a probably-meaningless result.
        print("Trying to test " + trials + " trials with " + successes + 
            " successes!");
    }

    // Calculate the P value
    // This must be terribly complicated since nobody seems to have written up 
    // how to do it as anything other than an arcane stats ritual.
    // Something close: http://www.johnmyleswhite.com/notebook/2012/04/14/implem
    // enting-the-exact-binomial-test-in-julia/
    // How scipy.stats does it (x = successes, n = trials, p = supposed 
    // probability):
    // SourceForge says Scipy is BSD licensed, so we can steal this code for our
    // comments.
    /*
        d = distributions.binom.pmf(x,n,p)
        rerr = 1+1e-7
        if (x < p*n):
            i = np.arange(np.ceil(p*n),n+1)
            y = np.sum(distributions.binom.pmf(i,n,p) <= d*rerr,axis=0)
            pval = distributions.binom.cdf(x,n,p) + distributions.binom.sf(n-y,
                n,p)
        else:
            i = np.arange(np.floor(p*n))
            y = np.sum(distributions.binom.pmf(i,n,p) <= d*rerr,axis=0)
            pval = distributions.binom.cdf(y-1,n,p) + distributions.binom.sf(
                x-1,n,p)
    */
    // There is of course no justification for why this would work.
    // What it's actually doing is a complicated Numpy vectorized operation to 
    // find the boundary of the tail we don't have, and then adding the CDF of 
    // the lower tail boundary and (1-CDF) of the upper tail boundary (which is 
    // the P value by definition).
    
    // This holds the probability, under the null hypothesis, of exactly what
    // we've observed. Any outcome with probability at most this counts as
    // "at least as extreme" for the two-tailed test below.
    var observed_probability = binomial_pmf(trials, successes, 
        success_probability);
    
    if(successes < trials * success_probability) {
        // The observation is below the expected success count, so it sits in
        // the lower tail.
        // We know anything with fewer successes than this is more extreme. But
        // how many successes would we need to have an equally extreme but
        // higher than expected number of successes?
        // We should sum down from all successes. (We'll sum from small to large
        // so it's OK numerically.)
        
        // This holds the total probability of everything more extremely
        // successful than what we've observed (the opposite, upper tail).
        var other_tail_total_probability = 0;
        
        // TODO: implement some better sort of search thing and use CDF
        for(var other_tail_start = trials; other_tail_start >= 
            Math.ceil(trials * success_probability); other_tail_start--) {
            
            // Get the probability for this particular success count.
            var case_probability = binomial_pmf(trials, other_tail_start, 
                success_probability);
            
            if(case_probability > observed_probability) {
                // This case is actually less extreme than what we've observed, 
                // so our summation is complete.
                
                break;
            } else {
                // This case is more extreme than what we've observed, so use it
                other_tail_total_probability += case_probability;
            }
        }
        
        // This holds the probability in this (lower) tail: everything up to
        // and including the observed success count.
        var this_tail_probability = binomial_cdf(trials, successes, 
            success_probability)
        
        
        // Return the total probability from both tails, clamped to 1.
        return Math.min(this_tail_probability + other_tail_total_probability, 
            1.0);
    } else {
        // The observation is at or above the expected success count, so it
        // sits in the upper tail.
        // We know anything with more successes than this is more extreme. But
        // how few successes would we need to have an equally extreme but lower
        // than expected number of successes?
        // We will sum up from 0 successes. We really ought to use the CDF 
        // somehow, but I can't think of how we would do it.
        
        // This holds the total probability of everything more extremely
        // failureful than what we've observed (the opposite, lower tail).
        var other_tail_total_probability = 0;
        
        for(var other_tail_end = 0; other_tail_end < 
            Math.floor(trials * success_probability); other_tail_end++) {
            // We only have to iterate up to the peak (most likely) value.
        
            // Get the probability for this particular success count.
            var case_probability = binomial_pmf(trials, other_tail_end, 
                success_probability);
            
            if(case_probability > observed_probability) {
                // This case is actually less extreme than what we've observed, 
                // so our summation is complete.
                break;
            } else {
                // This case is more extreme than what we've observed, so use it
                other_tail_total_probability += case_probability;
            }     
            
        }
        
        // This holds the probability in this tail. It is equal to the
        // probability up to, but not including, where this tail starts. So even
        // if the tail starts at the highest possible number of successes, it
        // has some probability. successes can't be 0 here (since then we'd be
        // below any nonzero expected probability and take the other branch).
        // Since it's a positive integer, it must be 1 or more, so we can
        // subtract 1 safely.
        var this_tail_probability = 1 - binomial_cdf(trials, successes - 1, 
            success_probability);
        
        // Return the total probability from both tails, clamped to 1
        return Math.min(this_tail_probability + other_tail_total_probability, 
            1.0);
    }
        
    
}

function binomial_cdf(trials, successes, success_probability) {
    // The Binomial distribution's cumulative distribution function. Given a
    // number of trials, a number of successes, and a success probability,
    // return the probability of having observed that many successes or fewer.
    
    if(trials == successes) {
        // All trials succeeded, so "this many or fewer" is certain. jStat
        // also doesn't want a 0 alpha for its beta distribution (no
        // failures), so handle this case by hand (it's easy).
        return 1;
    }
    
    if(trials < successes) {
        // Impossible input. This should never happen.
        // TODO: Debug when it happens.
        print("Error: trials (" + trials + ") < successes (" + successes + 
            ")!");
        return NaN;
    }
    
    // Compute the CDF efficiently via the "regularized incomplete beta
    // function", AKA the beta distribution CDF, which jStat provides as
    // jstat.pbeta. Evaluate it at 1 - p with alpha = failures and
    // beta = successes + 1; that value happens to equal our CDF.
    // See http://en.wikipedia.org/wiki/Binomial_distribution#Cumulative_distrib
    // ution_function and http://en.wikipedia.org/wiki/Regularized_incomplete_be
    // ta_function#Incomplete_beta_function
    return jstat.pbeta(1 - success_probability, trials - successes,
        successes + 1);
}

function binomial_pmf(trials, successes, success_probability) {
    // The Binomial distribution's probability mass function. Given a number
    // of trials, a number of successes, and the probability of success on
    // each trial, calculate the probability of observing that many successes
    // in that many trials with the given success rate.
    
    // How many mutually exclusive arrangements of successes and failures are
    // there?
    var arrangements = choose(trials, successes);
    
    // The probability of any one specific arrangement is the probability of
    // succeeding so many times...
    var success_part = Math.pow(success_probability, successes);
    
    // ...and failing so many times.
    var failure_part = Math.pow(1 - success_probability, trials - successes);
    
    // Sum the arrangement probability over all the arrangements.
    return arrangements * success_part * failure_part;
}

function choose(available, selected) {
    // The choose function: from available distinct objects, how many ways are
    // there to select selected of them? Returns "available choose selected".
    // Works with large input numbers that are too big to take the factorials
    // of. Out-of-range requests (selected negative or greater than available)
    // have no ways, so they return 0.
    
    if(selected < 0 || selected > available) {
        // There's no way to choose a negative number of items, or more items
        // than are available. (The old fall-through behavior wrongly returned
        // 1 here.)
        return 0;
    }
    
    if(selected < available - selected) {
        // It would be faster to think about choosing what we don't include
        // (the loop below runs available - selected times). So do that
        // instead.
        return choose(available, available - selected);
    }
    
    // We use a neat overflow-robust algorithm that eliminates the factorials 
    // and makes the computation a multiplication of numbers greater than one:
    // C(n, k) = product for i = 1 .. n - k of (1 + k / i).
    // So, no overflow unless the result itself is too big.
    // See http://arantxa.ii.uam.es/~ssantini/writing/notes/s667_binomial.pdf
    
    // This holds the result we are accumulating. Initialize to the 
    // multiplicative identity.
    var result = 1;
    
    for(var i = 1; i < available - selected + 1; i++) {
        result *= (1 + (selected / i));
    }
    
    // The result is mathematically an integer, but floating-point division
    // above can leave it fractionally off; round it back (resolving the old
    // TODO).
    return Math.round(result);
}