hexagram: hexagram-6ae12361157c/hexagram/statistics.js comparison

comparison hexagram-6ae12361157c/hexagram/statistics.js @ 0:1407e3634bcf draft default tip

Uploaded r11 from test tool shed.

author	adam-novak
date	Tue, 22 Oct 2013 14:17:59 -0400
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:1407e3634bcf
// statistics.js: Web Worker file to run statistical tests in the background.

// Constants:

// How many pseudocount trials should we use for the binomial test?
var BINOMIAL_PSEUDOCOUNTS = 5;

// Should we log information about suspicious p values to the console for manual
// spot checking?
var LOG_SUSPICIOUS = false;

// Go get jStat. Hope it's happy in Worker-land.
importScripts("jstat-1.0.0.js");

// Make a fake console to catch jStat's console output, so it doesn't crash the
// script. The whole console object is replaced, so every common logging
// method has to be provided here: a method that is left out would be
// undefined and calling it would throw. All of them forward to print(), which
// relays the message to the parent page.
console = {
    log: print,
    info: print,
    warn: print,
    error: print,
    debug: print
};
onmessage = function(message) {
    // Handle incoming messages from the page. Each message's data is an RPC
    // request, with "name" set to a global function name, "args" set to an
    // array of arguments, and "id" set to an ID that is echoed back in the
    // reply. On success the reply is {id, return_value}; if the call throws,
    // the reply is {id, error}, where error is a human-readable string.
    var request = message.data;

    // This holds whatever the requested function returned.
    var return_value;

    try {
        // Go get the specified global function, and apply it on the given
        // arguments. Use the global scope ("self") as its "this".
        return_value = self[request.name].apply(self, request.args);
    } catch(exception) {
        // Send the error back to the page instead of a return value.
        // Unfortunately, errors themselves can't be cloned, so we do all the
        // message making here and send back a string.
        postMessage({
            id: request.id,
            error: describe_rpc_failure(request, exception)
        });
        return;
    }

    // Send the return value back with the id.
    postMessage({
        id: request.id,
        return_value: return_value
    });
};

function describe_rpc_failure(request, exception) {
    // Build the error string reported to the page for a failed RPC request.
    // request is the message data ("id", "name", "args"); exception is
    // whatever value was thrown. Never throws on a missing "args" array or on
    // a thrown value that is not an object.
    var description = "Error in web worker doing job " + request.id + "\n";

    if(exception !== null && typeof exception === "object") {
        description += exception.name + ": " + exception.message + "\n";
        description += "Full details:\n";
        // "field" is declared locally so it does not leak into the worker's
        // global scope.
        for(var field in exception) {
            if(field == "name" || field == "message") {
                // Already got these.
                continue;
            }
            // Copy the field into the message as a string.
            description += field + ": " + exception[field] + "\n";
        }
    } else {
        // Something other than an object was thrown (a string, null, ...),
        // so there are no fields to enumerate.
        description += "Thrown value: " + String(exception) + "\n";
    }

    // Describe the call that failed. The arguments may be absent entirely
    // (apply() accepts that), so fall back to an empty list.
    var args = request.args || [];
    var rendered_args = [];
    for(var i = 0; i < args.length; i++) {
        rendered_args.push("" + args[i]);
    }
    description += "Call: " + request.name + "(" + rendered_args.join(", ") +
        ")";

    return description;
}
function print(message) {
    // Relay a log message to the parent page, which is expected to show it on
    // its own console. The message travels in the "log" field of the posted
    // object.
    var envelope = {};
    envelope.log = message;
    postMessage(envelope);
}
function statistics_for_matrix(matrix_url, in_list, out_list, all_list) {
    // Download the given score matrix (a TSV whose first row holds layer
    // names and whose first column holds signature names), do stats between
    // in_list and out_list for each layer in it, and return an object from
    // layer name to whatever statistics_for_layer returns for that layer.
    // all_list specifies the names of all signatures that figure into the
    // analysis at all. Returns an empty object if the download fails.

    // Download the matrix synchronously.
    // See https://developer.mozilla.org/en-US/docs/Web/API/XMLHttpRequest/Synch
    // ronous_and_Asynchronous_Requests
    // A side effect of this is that we won't have more simultaneous downloads
    // than workers, which is probably good.
    var request = new XMLHttpRequest();
    // Get the layer data by GET. The false makes it synchronous.
    request.open("GET", matrix_url, false);
    request.send(null);

    if(request.status >= 400) {
        // The server answered with an error page, which is not a matrix.
        print("Could not download matrix " + matrix_url + " (HTTP status " +
            request.status + ")");
        return {};
    }

    // Now we have the layer TSV.
    // But we don't have our fancy jQuery TSV parser. Parse it manually.

    // This holds an object of layer data objects (from signature to float) by
    // layer name. It is local so that it cannot leak into the worker's global
    // scope.
    var layers = {};

    // This holds the array of lines.
    // Split on newlines (as seen in jQuery.tsv.js)
    var lines = request.responseText.split(/\r?\n/);

    // Line 0 gives all the layer names, but the first thing isn't a layer name
    // (since it's above the signature column).
    var layer_names = lines[0].split(/\t/);

    var i;
    var j;

    for(i = 1; i < layer_names.length; i++) {
        // Make sure we have an object for this layer
        layers[layer_names[i]] = {};
    }

    // The rest give values per layer for the signature in column 0.
    for(i = 1; i < lines.length; i++) {
        // This holds the parts of each line
        var parts = lines[i].split(/\t/);

        if(!parts[0]) {
            // Blank line (or no signature name): nothing to record.
            continue;
        }

        // Get the signature
        var signature = parts[0];

        // Ignore any cells beyond the last named layer; there is no layer to
        // put them in.
        var column_count = Math.min(parts.length, layer_names.length);

        for(j = 1; j < column_count; j++) {
            // Go through each non-signature entry and set the appropriate
            // layer's value for this signature.
            var value = parseFloat(parts[j]);
            if(!isNaN(value)) {
                // Cells that don't hold a number are treated as "no data",
                // the same as a signature that is absent from the layer.
                layers[layer_names[j]][signature] = value;
            }
        }
    }

    // Now we've parsed the matrix.
    // Go do stats for each layer.

    // This holds our calculated statistics by layer name.
    var p_values = {};

    print("Running statistics for (up to) " + layer_names.length +
        " layers from matrix " + matrix_url);

    for(i = 1; i < layer_names.length; i++) {
        // Pass the layer data to the per-layer statistics, and get the result
        // back. It's probably easier to do this in this worker than to go
        // invoke more workers.
        p_values[layer_names[i]] = statistics_for_layer(layers[layer_names[i]],
            in_list, out_list, all_list);
    }

    // We've now calculated a result for every layer in the matrix. Return
    // them labeled by layer.
    return p_values;
}
function statistics_for_layer(layer_data, in_list, out_list, all_list) {
    // Run the appropriate statistical test for the passed layer data (an
    // object from signature name to number), between the given in and out
    // arrays of signatures. all_list specifies the names of all signatures
    // that figure into the analysis at all.
    //
    // Returns whatever the chosen comparison returns: binomial_compare for a
    // binary layer, t_compare for a continuous one. Both return an object
    // with "p_value" set, or NaN if no p value could be calculated. Discrete
    // non-binary layers are not supported yet and always yield NaN.

    // This holds whether the layer is discrete
    var is_discrete = true;

    // This holds whether the layer is binary
    var is_binary = true;

    for(var signature in layer_data) {
        var value = layer_data[signature];

        if(isNaN(value)) {
            // Missing data says nothing about what kind of layer this is.
            // (NaN % 1 is NaN, so without this check a single missing value
            // would make the whole layer look continuous.)
            continue;
        }

        if(value > 1 || value < 0) {
            // Not a binary layer
            is_binary = false;
        }

        if(value % 1 !== 0) {
            // It's a float, which settles the question.
            is_binary = false;
            is_discrete = false;
            break;
        }
    }

    if(is_binary) {
        // This is a binary/dichotomous layer, so run a binomial test.
        return binomial_compare(layer_data, in_list, out_list, all_list);
    } else if(is_discrete) {
        // This is a multinomial/categorical layer
        // TODO: statistics for discrete non-binary layers
        return NaN;
    } else {
        // This is a continuous layer, so run a t test
        return t_compare(layer_data, in_list, out_list, all_list);
    }
}
function statistics_for_url(layer_url, in_list, out_list, all_list) {
    // Run the stats for the layer with the given url, between the given in and
    // out arrays of signatures. all_list specifies the names of all signatures
    // that figure into the analysis at all. The layer file is a TSV with the
    // signature name in the first column and its value in the second.
    //
    // Returns whatever statistics_for_layer returns for the downloaded data,
    // or NaN if the download fails.
    print("Running statistics for individual layer " + layer_url);

    // Download the layer data synchronously.
    // See https://developer.mozilla.org/en-US/docs/Web/API/XMLHttpRequest/Synch
    // ronous_and_Asynchronous_Requests
    // A side effect of this is that we won't have more simultaneous downloads
    // than workers, which is probably good.
    var request = new XMLHttpRequest();
    // Get the layer data by GET. The false makes it synchronous.
    request.open("GET", layer_url, false);
    request.send(null);

    if(request.status >= 400) {
        // The server answered with an error page, which is not layer data.
        print("Could not download layer " + layer_url + " (HTTP status " +
            request.status + ")");
        return NaN;
    }

    // Now we have the layer TSV.
    // But we don't have our fancy jQuery TSV parser. Parse it manually.

    // This holds the layer data (signature to float)
    var layer_data = {};

    // This holds the array of lines.
    // Split on newlines (as seen in jQuery.tsv.js)
    var lines = request.responseText.split(/\r?\n/);

    for(var i = 0; i < lines.length; i++) {
        // This holds the parts of each line
        var parts = lines[i].split(/\t/);

        if(!parts[0]) {
            // Blank line (or no signature name): nothing to record.
            continue;
        }

        // Parse the layer value for this signature
        var value = parseFloat(parts[1]);

        if(isNaN(value)) {
            // Header rows and empty cells carry no number. Treat them as "no
            // data" instead of storing NaN, which would make the layer look
            // continuous to statistics_for_layer.
            continue;
        }

        // Store the value in the layer data
        layer_data[parts[0]] = value;
    }

    // Run stats on the downloaded data
    return statistics_for_layer(layer_data, in_list, out_list, all_list);
}
function t_compare(layer_data, in_list, out_list, all_list) {
    // Given the data of a continuous layer object (an object from signature
    // name to float (or undefined)), and arrays of the names of "in" and "out"
    // signatures, do a t test for whether the in signatures differ from the
    // out signatures. all_list is accepted for symmetry with binomial_compare
    // but is not used.
    //
    // Returns an object of metadata with "p_value" set to the two-tailed p
    // value of the test, or a bare NaN if the test cannot be performed (fewer
    // than 2 usable samples in either group, or no variance in both groups).

    // Summarize the in list first; bail out early if it is too small.
    var in_summary = summarize_layer_samples(layer_data, in_list);
    if(in_summary.count < 2) {
        // Not enough to run the t test
        return NaN;
    }

    // Now go through the same process for the out list
    var out_summary = summarize_layer_samples(layer_data, out_list);
    if(out_summary.count < 2) {
        // Not enough to run the t test
        return NaN;
    }

    // We can't do the test if both variances are 0
    if(in_summary.variance === 0 && out_summary.variance === 0) {
        return NaN;
    }

    // Now we can calculate the t test two-tailed p value
    var p_value = t_test(in_summary.mean, in_summary.variance,
        in_summary.count, out_summary.mean, out_summary.variance,
        out_summary.count);

    // And return it in a dict with other metadata.
    // We don't really have any other metadata.
    return {
        p_value: p_value
    };
}

function summarize_layer_samples(layer_data, signatures) {
    // Helper for t_compare: compute summary statistics over the values that
    // layer_data holds for the given signature names, skipping signatures
    // whose value is missing or NaN. Returns an object with "count" (number
    // of usable values), "mean", and "variance" (the unbiased sample
    // variance). mean and variance are NaN when count is below 2.
    var i;

    // How many non-NaN values, and what is their sum?
    var count = 0;
    var sum = 0;
    for(i = 0; i < signatures.length; i++) {
        if(!isNaN(layer_data[signatures[i]])) {
            count++;
            sum += layer_data[signatures[i]];
        }
    }

    if(count < 2) {
        // A variance needs at least two samples.
        return {
            count: count,
            mean: NaN,
            variance: NaN
        };
    }

    var mean = sum / count;

    // What is the second moment (sum of squares of differences from the mean)?
    var second_moment = 0;
    for(i = 0; i < signatures.length; i++) {
        if(!isNaN(layer_data[signatures[i]])) {
            second_moment += Math.pow(layer_data[signatures[i]] - mean, 2);
        }
    }

    return {
        count: count,
        mean: mean,
        variance: second_moment / (count - 1)
    };
}
function t_test(mean_in, unbiased_variance_in, number_in, mean_out,
    unbiased_variance_out, number_out) {
    // Two-tailed Welch's t test (unequal sample sizes, unequal variances) on
    // whether the means of the in group and the out group differ. Takes the
    // mean, unbiased variance, and sample count of each group and returns the
    // p value. See https://en.wikipedia.org/wiki/Student%27s_t-test
    // Assumes there are enough samples to actually perform the test.

    // Each group's contribution to the squared standard error of the
    // difference between the means.
    var in_term = unbiased_variance_in / number_in;
    var out_term = unbiased_variance_out / number_out;

    // The t statistic: where our observations fall on the t distribution.
    var standard_error = Math.sqrt(in_term + out_term);
    var t_statistic = (mean_in - mean_out) / standard_error;

    // The degrees of freedom of the particular t distribution that we ought
    // to compare the statistic against.
    var freedom_numerator = Math.pow(in_term + out_term, 2);
    var freedom_denominator = (Math.pow(in_term, 2) / (number_in - 1)) +
        (Math.pow(out_term, 2) / (number_out - 1));
    var degrees_of_freedom = freedom_numerator / freedom_denominator;

    // The CDF comes from the undocumented
    // jstat.pt = function(q, df, ncp, lower_tail, log), where:
    // q is the t statistic value to calculate the CDF at
    // df is the degrees of freedom
    // ncp is the "mu" parameter of the t distribution; it seems to set the
    // mean, and is left blank here.
    // lower_tail presumably selects the lower or upper tail (default true).
    // log selects the log probability (default false).
    // Put the statistic on the low side of the distribution so the lower
    // tail's area is the one-tail probability.
    var mirrored_statistic = 0 - Math.abs(t_statistic);
    var one_tail_probability = jstat.pt(mirrored_statistic, degrees_of_freedom);

    // The t distribution is symmetric, so the two-tailed p value is just
    // twice the single-tail probability.
    return 2 * one_tail_probability;
}
function binomial_compare(layer_data, in_list, out_list, all_list) {
    // Binomial comparison for a binary layer. layer_data maps signature name
    // to 0 or 1 (or undefined); in_list and out_list name the "in" and "out"
    // signatures. The out signatures, smoothed with BINOMIAL_PSEUDOCOUNTS
    // pseudocount trials, give a background success probability, and the in
    // signatures are tested against it. all_list names every signature that
    // figures into the analysis (those the user hasn't filtered out); the
    // share of the pseudocounts that are successes matches the success
    // frequency among them. Signatures with no data in layer_data are never
    // counted.
    //
    // Returns an object with "p_value" (two-tailed), "inside_yes" and
    // "outside_yes" (real, non-pseudocount successes in each list), or a bare
    // NaN if the test cannot be performed.

    // Work out the distribution from the out list. It's OK for all the
    // outside hexes to be 0; pseudocounts can still give us a p value.
    var outside = tally_binary_values(layer_data, out_list);

    // Work out the distribution over everything, for the pseudocounts.
    var overall = tally_binary_values(layer_data, all_list);

    if(overall.yes + overall.no === 0) {
        // It's not OK for there to be no hexes with data in the all set.
        // Maybe they filtered out all the ones with any data?
        // TODO: Sure wish we had layer names here.
        print("No signatures were available with data for this layer.");
        return NaN;
    }

    // How many of the BINOMIAL_PSEUDOCOUNTS pseudo-trials are pseudo-yeses?
    // Match the frequency in all signatures.
    var pseudo_yes = BINOMIAL_PSEUDOCOUNTS * (overall.yes / (overall.yes +
        overall.no));

    // The probability of being a 1 for the out list. We want to test if the
    // in list differs significantly from this.
    var background_probability = (outside.yes + pseudo_yes) / (outside.yes +
        outside.no + BINOMIAL_PSEUDOCOUNTS);

    if(background_probability === 0) {
        // Can't do the binomial test in this case. Somehow there were no yeses
        // anywhere.
        return NaN;
    }

    // Count up the in list.
    var inside = tally_binary_values(layer_data, in_list);
    var inside_trials = inside.yes + inside.no;

    // The p value for rejecting the null hypothesis that the in signatures
    // follow the background distribution.
    var p = binomial_test(inside_trials, inside.yes, background_probability);

    if(LOG_SUSPICIOUS && (p === 0 || p === 1)) {
        // We got an odd p value. Complain about it.
        print("Got suspicious p value " + p);
        print("Was binomial test for " + inside.yes + " successes in " +
            inside_trials + " trials at probability " +
            background_probability);
        print("Background was " + outside.yes + " out of " + (outside.yes +
            outside.no) + " with " + pseudo_yes + " out of " +
            BINOMIAL_PSEUDOCOUNTS + " pseudocounts.");
    }

    return {
        p_value: p,
        inside_yes: inside.yes,
        outside_yes: outside.yes
    };
}

function tally_binary_values(layer_data, signatures) {
    // Helper for binomial_compare: count how many of the named signatures
    // have the value 1 ("yes") and how many have the value 0 ("no") in
    // layer_data. Anything else (no data, NaN, other numbers) is ignored.
    var tally = {
        yes: 0,
        no: 0
    };

    for(var index = 0; index < signatures.length; index++) {
        var value = layer_data[signatures[index]];
        if(value === 1) {
            tally.yes++;
        } else if(value === 0) {
            tally.no++;
        }
    }

    return tally;
}
function binomial_test(trials, successes, success_probability) {
    // Exact two-tailed binomial test. Returns the p value for rejecting the
    // null hypothesis that the observed number of successes happened in the
    // observed number of trials when the probability of success was
    // success_probability. Returns NaN (after logging a complaint) if there
    // are more successes than trials.
    //
    // The p value is the total probability of every outcome that is no more
    // likely than the observed one. That is one whole tail on the observed
    // side of the expected count, plus however much of the opposite tail is
    // at most as likely as the observation. This mirrors how scipy.stats and
    // R do the test, including their small relative tolerance for ties.
    if(trials < successes) {
        print("Trying to test " + trials + " trials with " + successes +
            " successes!");
        // There is no meaningful answer for an impossible observation.
        return NaN;
    }

    // This holds the probability of exactly what we've observed under the null
    // hypothesis.
    var observed_probability = binomial_pmf(trials, successes,
        success_probability);

    // Outcomes up to this likely count as "at least as extreme". The tiny
    // relative slack keeps floating point noise from excluding an outcome
    // that is mathematically tied with the observation.
    var extreme_threshold = observed_probability * (1 + 1e-7);

    // How many successes would we expect under the null hypothesis?
    var expected_successes = trials * success_probability;

    // This holds the total probability of the extreme part of the tail on the
    // other side of the expected count from our observation.
    var other_tail_total_probability = 0;

    // This holds the probability of the whole tail on our side.
    var this_tail_probability;

    var candidate;
    var case_probability;

    if(successes < expected_successes) {
        // Anything with fewer successes than this is more extreme. Find the
        // equally-or-more extreme outcomes with higher than expected counts
        // by summing down from all successes. (We sum from small to large so
        // it's OK numerically.) Every candidate is above the expected count
        // and so above the observed count too.
        // TODO: implement some better sort of search thing and use CDF
        for(candidate = trials; candidate >= Math.ceil(expected_successes);
            candidate--) {

            case_probability = binomial_pmf(trials, candidate,
                success_probability);

            if(case_probability > extreme_threshold) {
                // This case is less extreme than what we've observed, and the
                // cases only get more likely towards the peak, so we're done.
                break;
            }
            other_tail_total_probability += case_probability;
        }

        // Our tail is the observed count and everything below it.
        this_tail_probability = binomial_cdf(trials, successes,
            success_probability);
    } else {
        // Anything with more successes than this is more extreme. Find the
        // equally-or-more extreme outcomes with lower than expected counts by
        // summing up from 0 successes. The search has to include
        // floor(expected) itself, which can be less likely than the
        // observation, but it must stop short of the observed count, which
        // already belongs to our own tail.
        var last_candidate = Math.min(Math.floor(expected_successes),
            successes - 1);

        for(candidate = 0; candidate <= last_candidate; candidate++) {
            case_probability = binomial_pmf(trials, candidate,
                success_probability);

            if(case_probability > extreme_threshold) {
                // This case is less extreme than what we've observed, and the
                // cases only get more likely towards the peak, so we're done.
                break;
            }
            other_tail_total_probability += case_probability;
        }

        if(successes === 0) {
            // Only reachable when nothing is expected either (for example
            // with zero trials). The tail from 0 successes up is everything.
            this_tail_probability = 1;
        } else {
            // Our tail is the observed count and everything above it, which
            // is everything except the counts strictly below it.
            this_tail_probability = 1 - binomial_cdf(trials, successes - 1,
                success_probability);
        }
    }

    // Return the total probability from both tails, clamped to 1.
    return Math.min(this_tail_probability + other_tail_total_probability, 1.0);
}
function binomial_cdf(trials, successes, success_probability) {
    // The Binomial distribution's cumulative distribution function. Given a
    // number of trials, a number of successes, and a success probability,
    // return the probability of having observed that many successes or fewer.
    // Returns NaN (after logging a complaint) if successes exceeds trials.
    //
    // We compute this efficiently using the "regularized incomplete beta
    // function", AKA the beta distribution CDF, which we get from jStat.
    // See http://en.wikipedia.org/wiki/Binomial_distribution#Cumulative_distrib
    // ution_function and http://en.wikipedia.org/wiki/Regularized_incomplete_be
    // ta_function#Incomplete_beta_function
    if(trials == successes) {
        // jStat doesn't want a 0 alpha for its beta distribution (no failures)
        // Calculate this one by hand (it's easy)
        return 1;
    }

    if(trials < successes) {
        // This should never happen. TODO: Debug when it happens.
        print("Error: trials (" + trials + ") < successes (" + successes +
            ")!");
        return NaN;
    }

    if(successes < 0) {
        // Fewer than zero successes is impossible, so the probability is 0.
        // Handling it here also keeps a non-positive beta parameter away from
        // jStat.
        return 0;
    }

    // This is the observation that we want the beta distribution CDF before
    var beta_observation = 1 - success_probability;

    // These are the parameters of the relevant beta distribution
    var beta_alpha = trials - successes;
    var beta_beta = successes + 1;

    // Return the beta distribution CDF value, which happens to also be our CDF.
    return jstat.pbeta(beta_observation, beta_alpha, beta_beta);
}
function binomial_pmf(trials, successes, success_probability) {
    // The Binomial distribution's probability mass function. Given a number of
    // trials, a number of successes, and the probability of success on each
    // trial, calculate the probability of observing that many successes in that
    // many trials with the given success rate. Success counts that cannot
    // happen (negative, or more than the number of trials) have probability 0.
    if(successes < 0 || successes > trials) {
        // Outside the support of the distribution. Without this check the
        // formula below would raise a probability to a negative power and
        // report a nonzero result.
        return 0;
    }

    // The probability of this many successes in this many trials at this
    // success rate is the probability of succeeding so many times and failing
    // so many times, summed over all the mutually exclusive arrangements of
    // successes and failures.
    return (choose(trials, successes) *
        Math.pow(success_probability, successes) *
        Math.pow(1 - success_probability, trials - successes));
}
function choose(available, selected) {
    // The choose function: from available distinct objects, how many ways are
    // there to select selected of them. Returns "available choose selected",
    // which is 0 when selected is negative or larger than available.
    // Works with large input numbers that are too big to take the factorials
    // of.
    //
    // We use a neat overflow-robust algorithm that eliminates the factorials
    // and makes the computation a multiplication of numbers greater than one.
    // So, no overflow unless the result itself is too big.
    // See http://arantxa.ii.uam.es/~ssantini/writing/notes/s667_binomial.pdf
    if(selected < 0 || selected > available) {
        // There is no way to pick fewer than nothing or more than everything.
        return 0;
    }

    // Choosing what to take is the same as choosing what to leave behind.
    // Work with the larger of the two so the loop below is as short as
    // possible.
    var larger_part = Math.max(selected, available - selected);
    var smaller_part = available - larger_part;

    // This holds the result we are accumulating. Initialize to the
    // multiplicative identity.
    var result = 1;
    for(var i = 1; i <= smaller_part; i++) {
        result *= (1 + (larger_part / i));
    }

    // The true answer is always an integer, but the floating point factors
    // above can leave the product a hair off (e.g. 120.00000000000001).
    // Rounding snaps it back. Values too big to have a fractional part, and
    // Infinity, pass through unchanged.
    return Math.round(result);
}