Mercurial > repos > adam-novak > hexagram
view hexagram-6ae12361157c/hexagram/statistics.js @ 0:1407e3634bcf draft default tip
Uploaded r11 from test tool shed.
author | adam-novak |
---|---|
date | Tue, 22 Oct 2013 14:17:59 -0400 |
parents | |
children |
line wrap: on
line source
// statistics.js: Web Worker file to run statistical tests in the background.

// Constants:

// How many pseudocount trials should we use for the binomial test?
var BINOMIAL_PSEUDOCOUNTS = 5;

// Should we log information about suspicious p values to the console for manual
// spot checking?
var LOG_SUSPICIOUS = false;

// Go get jStat. Hope it's happy in Worker-land.
importScripts("jstat-1.0.0.js");

// Make a fake console to catch jstat warnings, so they don't crash the script.
// (Worker scope may lack a real console; forward warnings to the parent page.)
console = {
    warn: print
};

onmessage = function(message) {
    // Handle incoming messages from the page. Each message's data is an RPC
    // request, with "name" set to a function name, "args" set to an array of
    // arguments, and "id" set to an ID that should be returned with the return
    // value in a reply message. If the function call fails, an error is sent
    // back.
    try {
        // Go get the specified global function, and apply it on the given
        // arguments. Use the global scope ("self") as its "this".
        var return_value = self[message.data.name].apply(self,
            message.data.args);
    } catch(exception) {
        // Send the error back to the page instead of a return value.
        // Unfortunately, errors themselves can't be cloned, so we do all the
        // message making here and send back a string.

        // First we build a string with all the parts of the error we can get.
        var error_message = "Error in web worker doing job " + message.data.id;
        error_message += "\n";
        error_message += exception.name + ": " + exception.message;
        error_message += "\n";
        error_message += "Full details:\n";

        // Fix: declare the for-in loop variable so it isn't an implicit
        // global.
        for(var field in exception) {
            if(field == "name" || field == "message") {
                // Already got these.
                continue;
            }

            // Copy the field into the message as a string.
            error_message += field + ": " + exception[field] + "\n";
        }

        error_message += "Call: " + message.data.name + "(";
        for(var i = 0; i < message.data.args.length; i++) {
            error_message += message.data.args[i];
            if(i + 1 < message.data.args.length) {
                // Have an argument after this.
                error_message += ", ";
            }
        }
        error_message += ")";

        postMessage({
            id: message.data.id,
            error: error_message
        });

        return;
    }

    // Send the return value back with the id.
    postMessage({
        id: message.data.id,
        return_value: return_value
    });
};

function print(message) {
    // Print a message to the console of the parent page.
    postMessage({
        log: message
    });
}

function statistics_for_matrix(matrix_url, in_list, out_list, all_list) {
    // Download the given score matrix, do stats between in_list and out_list
    // for each layer in it, and return an object from layer name to p value.
    // all_list specifies the names of all signatures that figure into the
    // analysis at all.

    // Download the matrix synchronously.
    // See https://developer.mozilla.org/en-US/docs/Web/API/XMLHttpRequest/Synch
    // ronous_and_Asynchronous_Requests
    // A side effect of this is that we won't have more simultaneous downloads
    // than workers, which is probably good.

    // This holds the request.
    var request = new XMLHttpRequest();

    // Get the layer data by GET. The false makes it synchronous.
    request.open("GET", matrix_url, false);
    request.send(null);

    // Now we have the layer TSV
    // But we don't have our fancy jQuery TSV parser. Parse it manually.

    // This holds an object of layer data objects (from signature to float) by
    // layer name.
    // Fix: declared with var (was an accidental implicit global).
    var layers = {};

    // This holds the array of lines
    // Split on newlines (as seen in jQuery.tsv.js)
    var lines = request.responseText.split(/\r?\n/);

    // Line 0 gives all the layer names, but the first thing isn't a layer name
    // (since it's above the signature column).
    var layer_names = lines[0].split(/\t/);
    for(var i = 1; i < layer_names.length; i++) {
        // Make sure we have an object for this layer
        layers[layer_names[i]] = {};
    }

    // The rest give values per layer for the hex in column 1.
    for(var i = 1; i < lines.length; i++) {
        // This holds the parts of each line
        var parts = lines[i].split(/\t/);

        if(parts[0]) {
            // We actually have data

            // Get the signature
            var signature = parts[0];

            for(var j = 1; j < parts.length; j++) {
                // Go through each non-signature entry and set the appropriate
                // layer's value for this signature.
                layers[layer_names[j]][signature] = parseFloat(parts[j]);
            }
        }
    }

    // Now we've parsed the matrix.
    // Go do stats for each layer.

    // This holds our calculated p valued by layer name.
    var p_values = {};

    print("Running statistics for (up to) " + layer_names.length +
        " layers from matrix " + matrix_url);

    for(var i = 1; i < layer_names.length; i++) {
        // Pass the layer data to the per-layer statistics, and get the p value
        // back. It's probably easier to do this in this worker than to go
        // invoke more workers.
        p_values[layer_names[i]] = statistics_for_layer(layers[layer_names[i]],
            in_list, out_list, all_list);
    }

    // We've now calculated a p value for every layer in the matrix. Return the
    // calculated p values labeled by layer.
    return p_values;
}

function statistics_for_layer(layer_data, in_list, out_list, all_list) {
    // Run the appropriate statistical test for the passed layer data, between
    // the given in and out arrays of signatures. all_list specifies the names
    // of all signatures that figure into the analysis at all. Return the p
    // value for the layer, or NaN if no p value could be calculated.

    // This holds whether the layer is discrete
    var is_discrete = true;

    // This holds whether the layer is binary
    var is_binary = true;

    for(var signature in layer_data) {
        if(layer_data[signature] > 1 || layer_data[signature] < 0) {
            // Not a binary layer
            is_binary = false;
        }

        if(layer_data[signature] % 1 !== 0) {
            // It's a float
            is_binary = false;
            is_discrete = false;
        }
    }

    if(is_binary) {
        // This is a binary/dichotomous layer, so run a binomial test.
        return binomial_compare(layer_data, in_list, out_list, all_list);
    } else if (is_discrete) {
        // This is a multinomial/categorical layer
        // TODO: statistics for discrete non-binary layers
        return NaN;
    } else {
        // This is a continuous layer, so run a t test
        return t_compare(layer_data, in_list, out_list, all_list);
    }
}

function statistics_for_url(layer_url, in_list, out_list, all_list) {
    // Run the stats for the layer with the given url, between the given in and
    // out arrays of signatures. all_list specifies the names of all signatures
    // that figure into the analysis at all. Return the p value for the layer,
    // or NaN if no p value could be calculated.

    print("Running statistics for individual layer " + layer_url);

    // Download the layer data synchronously.
    // See https://developer.mozilla.org/en-US/docs/Web/API/XMLHttpRequest/Synch
    // ronous_and_Asynchronous_Requests
    // A side effect of this is that we won't have more simultaneous downloads
    // than workers, which is probably good.

    // This holds the request.
    var request = new XMLHttpRequest();

    // Get the layer data by GET. The false makes it synchronous.
    request.open("GET", layer_url, false);
    request.send(null);

    // Now we have the layer TSV
    // But we don't have our fancy jQuery TSV parser. Parse it manually.

    // This holds the layer data (signature to float)
    var layer_data = {};

    // This holds the array of lines
    // Split on newlines (as seen in jQuery.tsv.js)
    var lines = request.responseText.split(/\r?\n/);

    for(var i = 0; i < lines.length; i++) {
        // This holds the parts of each line
        var parts = lines[i].split(/\t/);

        if(parts[0]) {
            // We actually have data

            // Parse the layer value for this signature
            var value = parseFloat(parts[1]);

            // Store the value in the layer data
            layer_data[parts[0]] = value;
        }
    }

    // Run stats on the downloaded data
    return statistics_for_layer(layer_data, in_list, out_list, all_list);
}

function t_compare(layer_data, in_list, out_list, all_list) {
    // Given the data of a continuous layer object (an object from signature
    // name to float (or undefined)), and arrays of the names of "in" and "out"
    // signatures, do a t test test for whether the in signatures differ from
    // the out signatures. Returns an object of metadata, with "p_value" set to
    // the p value of the test (two-tailed), or bare NaN if the test cannot be
    // performed (due to, e.g. fewer than 2 samples in one category).

    // Go through the in list and calculate all the summary statistics

    // How many non-NaN values?
    var number_in = 0;

    // What is the sum?
    var sum_in = 0;

    for(var i = 0; i < in_list.length; i++) {
        if(!isNaN(layer_data[in_list[i]])) {
            number_in++;
            sum_in += layer_data[in_list[i]];
        }
    }

    // We've done one pass, so we know if we have any in list data actually
    if(number_in < 2) {
        // Not enough to run the t test
        return NaN;
    }

    // What is the mean?
    var mean_in = sum_in / number_in;

    // What is the second moment (sum of squares of differences from the mean)
    var second_moment_in = 0;

    for(var i = 0; i < in_list.length; i++) {
        if(!isNaN(layer_data[in_list[i]])) {
            second_moment_in += Math.pow(layer_data[in_list[i]] - mean_in, 2);
        }
    }

    // What is the unbiased variance?
    // Fix: declared with var (was an accidental implicit global, which is
    // unsafe shared state in a worker that handles many jobs).
    var unbiased_variance_in = second_moment_in / (number_in - 1);

    // Now go through the same process for the out list

    // How many non-NaN values?
    var number_out = 0;

    // What is the sum?
    var sum_out = 0;

    for(var i = 0; i < out_list.length; i++) {
        if(!isNaN(layer_data[out_list[i]])) {
            number_out++;
            sum_out += layer_data[out_list[i]];
        }
    }

    // We've done one pass, so we know if we have any out list data actually
    if(number_out < 2) {
        // Not enough to run the t test
        return NaN;
    }

    // What is the mean?
    var mean_out = sum_out / number_out;

    // What is the second moment (sum of squares of differences from the mean)
    var second_moment_out = 0;

    for(var i = 0; i < out_list.length; i++) {
        if(!isNaN(layer_data[out_list[i]])) {
            second_moment_out += Math.pow(layer_data[out_list[i]] - mean_out,
                2);
        }
    }

    // What is the unbiased variance?
    // Fix: declared with var (was an accidental implicit global).
    var unbiased_variance_out = second_moment_out / (number_out - 1);

    // We can't do the test if both variances are 0
    if(unbiased_variance_in == 0 && unbiased_variance_out == 0) {
        return NaN;
    }

    // Now we can calculate the t test two-tailed p value
    var p_value = t_test(mean_in, unbiased_variance_in, number_in, mean_out,
        unbiased_variance_out, number_out);

    // And return it in a dict with other metadata.
    // We don't really have any other metadata.
    return {
        p_value: p_value
    };
}

function t_test(mean_in, unbiased_variance_in, number_in, mean_out,
    unbiased_variance_out, number_out) {

    // Given the mean, unbiased variance, and number of samples for both the in
    // group and the out group, compute the p value for the t test with unequal
    // sample sizes and unequal variances, testing to see whether the means
    // differ (a two-tailed "Welch's" t test). See
    // https://en.wikipedia.org/wiki/Student%27s_t-test
    // Assumes we have enough samples to actually perform the test.

    // First, calculate the t statistic, which is where our observations fall on
    // the t distribution.
    var t_statistic = (mean_in - mean_out) /
        Math.sqrt((unbiased_variance_in / number_in) +
        (unbiased_variance_out / number_out));

    // Calculate the degrees of freedom for the particular t distribution that
    // we ought to compare the statistic against (Welch-Satterthwaite).
    var degrees_of_freedom = Math.pow((unbiased_variance_in / number_in) +
        (unbiased_variance_out / number_out), 2) /
        ((Math.pow(unbiased_variance_in / number_in, 2) / (number_in - 1)) +
        (Math.pow(unbiased_variance_out / number_out, 2) / (number_out - 1)));

    // Now we have to compare the t statistic to the t test CDF available via
    // the totally undocumented jstat.pt = function(q, df, ncp, lower_tail, log)
    // where:
    // q is the t statistic value to calculate the cdf at
    // df is the degrees of freedom
    // ncp is the "mu" parameter for the t distributiuon. I think this sets the
    // mean, and it's OK to leave blank.
    // lower_tail presumably specifies if we want the lower or upper tail of the
    // CDF. Defaults to true.
    // Log specifies if we want the log probability. Defaults to false.

    // Make the t statistic be on the low side of the distribution, and
    // calculate the lower tail's area using the CDF.
    var one_tail_probability = jstat.pt(0 - Math.abs(t_statistic),
        degrees_of_freedom);

    // Return the two-tailed p value, which, since the t distribution is
    // symmetric, is just twice the single-tail probability
    return 2 * one_tail_probability;
}

function binomial_compare(layer_data, in_list, out_list, all_list) {
    // Given the data of a binary layer object (an object from signature name to
    // 0 or 1 (or undefined)), and arrays of the names of "in" and "out"
    // signatures, do a binomial test for whether the in signatures differ from
    // the out signatures. Uses a number of pseudocount trials as specified in
    // the global constant BINOMIAL_PSEUDOCOUNTS. Returns an object of metadata,
    // with "p_value" set to the p value of the test (two-tailed), or bare NaN
    // if the test cannot be performed. all_list specifies the names of all
    // signatures that figure into the analysis at all (i.e. those which the
    // user hasn't filtered out), which we use when calculating how many of our
    // pseudocounts should be successes. Signature names appearing in all_list
    // but with no data in layer_data are not counted.

    // Work out the distribution from the out list

    // How many out signatures are 1?
    var outside_yes = 0;

    // And are 0?
    var outside_no = 0;

    for(var i = 0; i < out_list.length; i++) {
        if(layer_data[out_list[i]] === 1) {
            // This is a yes and it's outside.
            outside_yes++;
        } else if(layer_data[out_list[i]] === 0) {
            // A no and outside
            outside_no++;
        }
    }

    // It's OK for all the outside hexes to be 0 now. Pseudocounts can give us a
    // p value.

    // Now work out our pseudocounts.

    // How many signatures in all_list are successes?
    var all_yes = 0;

    // And how many are failures (as opposed to undef)
    var all_no = 0;

    for(var i = 0; i < all_list.length; i++) {
        if(layer_data[all_list[i]] === 1) {
            // A yes anywhere
            all_yes++;
        } else if(layer_data[all_list[i]] === 0) {
            // A real no (not a no-data) anywhere
            all_no++;
        }
    }

    // It't not OK for there to be no hexes in the all set. Maybe they filtered
    // out all the ones with any data?
    if(all_yes + all_no == 0) {
        // TODO: Sure wish we had layer names here.
        print("No signatures were available with data for this layer.");
        return NaN;
    }

    // Calculate how many pseudo-yeses we should have.
    // Match the frequency in all signatures.
    var pseudo_yes = BINOMIAL_PSEUDOCOUNTS * (all_yes / (all_yes + all_no));

    // pseudo-trials is just BINOMIAL_PSEUDOCOUNTS

    // This holds the probability of being a 1 for the out list.
    // We want to test if the in list differs significantly from this.
    var background_probability = (outside_yes + pseudo_yes) /
        (outside_yes + outside_no + BINOMIAL_PSEUDOCOUNTS);

    if(background_probability == 0) {
        // Can't do the binomial test in this case. Somehow there were no yeses
        // anywhere.
        return NaN;
    }

    // How many 1s are in the in list?
    var inside_yes = 0;

    // And how many 0s?
    var inside_no = 0;

    for(var i = 0; i < in_list.length; i++) {
        if(layer_data[in_list[i]] === 1) {
            // This is a yes and it's inside.
            inside_yes++;
        } else if(layer_data[in_list[i]] === 0) {
            // A no and it's inside
            inside_no++;
        }
    }

    // Return the p value for rejecting the null hypothesis that the in
    // signatures follow the background distribution.
    var p = binomial_test(inside_yes + inside_no, inside_yes,
        background_probability);

    if(LOG_SUSPICIOUS && (p == 0 || p == 1)) {
        // We got an odd p value. Complain about it.
        print("Got suspicious p value " + p);
        print("Was binomial test for " + inside_yes + " successes in " +
            (inside_yes + inside_no) + " trials at probability " +
            background_probability);
        print("Background was " + outside_yes + " out of " +
            (outside_yes + outside_no) + " with " + pseudo_yes + " out of " +
            BINOMIAL_PSEUDOCOUNTS + " pseudocounts.");
    }

    // Return our p value as "p_value", and also how many non-pseudocount
    // successes were in the in_list and the out_list.
    return {
        p_value: p,
        inside_yes: inside_yes,
        outside_yes: outside_yes
    };
}

function binomial_test(trials, successes, success_probability) {
    // Return the p value for rejecting the null hypothesis that the observed
    // number of successes happened in the observed number of trials when the
    // probability of success was success_probability. Does a two-tailed
    // Binomial test.

    if(trials < successes) {
        print("Trying to test " + trials + " trials with " + successes +
            " successes!");
    }

    // Calculate the P value
    // This must be terribly complicated since nobody seems to have written up
    // how to do it as anything other than an arcane stats ritual.
    // Something close: http://www.johnmyleswhite.com/notebook/2012/04/14/implem
    // enting-the-exact-binomial-test-in-julia/
    // How scipy.stats does it (x = successes, n = trials, p = supposed
    // probability):
    // SourceForge says Scipy is BSD licensed, so we can steal this code for our
    // comments.
    /*
    d = distributions.binom.pmf(x,n,p)
    rerr = 1+1e-7
    if (x < p*n):
        i = np.arange(np.ceil(p*n),n+1)
        y = np.sum(distributions.binom.pmf(i,n,p) <= d*rerr,axis=0)
        pval = distributions.binom.cdf(x,n,p) + distributions.binom.sf(n-y,
            n,p)
    else:
        i = np.arange(np.floor(p*n))
        y = np.sum(distributions.binom.pmf(i,n,p) <= d*rerr,axis=0)
        pval = distributions.binom.cdf(y-1,n,p) + distributions.binom.sf(
            x-1,n,p)
    */
    // What it's actually doing is a complicated Numpy vectorized operation to
    // find the boundary of the tail we don't have, and then adding the CDF of
    // the lower tail boundary and (1-CDF) of the upper tail boundary (which is
    // the P value by definition).

    // This holds the probability of exactly what we've observed under the null
    // hypothesis.
    var observed_probability = binomial_pmf(trials, successes,
        success_probability);

    if(successes < trials * success_probability) {
        // We know anything with fewer successes than this is more extreme. But
        // how many successes would we need to have an equally extreme but
        // higher than expected number of successes?

        // We should sum down from all successes. (We'll sum from small to large
        // so it's OK numerically.)

        // This holds the total probability of everything more extremely
        // successful than what we've observed.
        var other_tail_total_probability = 0;

        // TODO: implement some better sort of search thing and use CDF
        for(var other_tail_start = trials;
            other_tail_start >= Math.ceil(trials * success_probability);
            other_tail_start--) {

            // Get the probability for this particular case
            var case_probability = binomial_pmf(trials, other_tail_start,
                success_probability);

            if(case_probability > observed_probability) {
                // This case is actually less extreme than what we've observed,
                // so our summation is complete.
                break;
            } else {
                // This case is more extreme than what we've observed, so use it
                other_tail_total_probability += case_probability;
            }
        }

        // This holds the probability in this tail
        var this_tail_probability = binomial_cdf(trials, successes,
            success_probability);

        // Return the total probability from both tails, clamped to 1.
        return Math.min(this_tail_probability + other_tail_total_probability,
            1.0);
    } else {
        // We know anything with more successes than this is more extreme. But
        // how few successes would we need to have an equally extreme but lower
        // than expected number of successes?

        // We will sum up from 0 successes. We really ought to use the CDF
        // somehow, but I can't think of how we would do it.

        // This holds the total probability of everything more extremely
        // failureful than what we've observed.
        var other_tail_total_probability = 0;

        for(var other_tail_end = 0;
            other_tail_end < Math.floor(trials * success_probability);
            other_tail_end++) {

            // We only have to iterate up to the peak (most likely) value.

            // Get the probability for this particular case
            var case_probability = binomial_pmf(trials, other_tail_end,
                success_probability);

            if(case_probability > observed_probability) {
                // This case is actually less extreme than what we've observed,
                // so our summation is complete.
                break;
            } else {
                // This case is more extreme than what we've observed, so use it
                other_tail_total_probability += case_probability;
            }
        }

        // This holds the probability in this tail. It is equal to the
        // probability up to, but not including, where this tail starts. So even
        // if the tail starts at the highest possible number of successes, it
        // has some probability. successes can't be 0 here (since then we'd be
        // below any nonzero expected probability and take the other branch.
        // Since it's a positive integer, it must be 1 or more, so we can
        // subtract 1 safely.
        var this_tail_probability = 1 - binomial_cdf(trials, successes - 1,
            success_probability);

        // Return the total probability from both tails, clamped to 1
        return Math.min(this_tail_probability + other_tail_total_probability,
            1.0);
    }
}

function binomial_cdf(trials, successes, success_probability) {
    // The Binomial distribution's cumulative distribution function. Given a
    // number of trials, a number of successes, and a success probability,
    // return the probability of having observed that many successes or fewer.

    // We compute this efficiently using the "regularized incomplete beta
    // function", AKA the beta distribution cdf, which we get from jstat.
    // See http://en.wikipedia.org/wiki/Binomial_distribution#Cumulative_distrib
    // ution_function and http://en.wikipedia.org/wiki/Regularized_incomplete_be
    // ta_function#Incomplete_beta_function

    if(trials == successes) {
        // jStat doesn't want a 0 alpha for its beta distribution (no failures)
        // Calculate this one by hand (it's easy)
        return 1;
    }

    if(trials < successes) {
        // This should never happen. TODO: Debug when it happens.
        print("Error: trials (" + trials + ") < successes (" + successes +
            ")!");
        return NaN;
    }

    // This is the observation that we want the beta distribution CDF before
    var beta_observation = 1 - success_probability;

    // These are the parameters of the relavent beta distribution
    var beta_alpha = trials - successes;
    var beta_beta = successes + 1;

    // Return the beta distribution CDF value, which happens to also be our CDF.
    return jstat.pbeta(beta_observation, beta_alpha, beta_beta);
}

function binomial_pmf(trials, successes, success_probability) {
    // The Binomial distribution's probability mass function. Given a number of
    // trials, a number of successes, and the probability of success on each
    // trial, calculate the probability of observing that many successes in that
    // many trials with the given success rate.

    // The probability of this many successes in this many trials at this
    // success rate is the probability of succeeding so many times and failing
    // so many times, summed over all the mutually exclusive arrangements of
    // successes and failures.
    return (choose(trials, successes) *
        Math.pow(success_probability, successes) *
        Math.pow(1 - success_probability, trials - successes));
}

function choose(available, selected) {
    // The choose function: from available distinct objects, how many ways are
    // there to select selected of them. Returns "available choose selected".
    // Works with large input numbers that are too big to take the factorials
    // of.

    // We use a neat overflow-robust algorithm that eliminates the factorials
    // and makes the computation a multiplication of numbers greater than one.
    // So, no overflow unless the result itself is too big.
    // See http://arantxa.ii.uam.es/~ssantini/writing/notes/s667_binomial.pdf

    if(selected < available - selected) {
        // It would be faster to think about choosing what we don't include. So
        // do that instead.
        return choose(available, available - selected);
    }

    // This holds the result we are accumulating. Initialize to the
    // multiplicative identity.
    var result = 1;

    for(var i = 1; i < available - selected + 1; i++) {
        result *= (1 + (selected / i));
    }

    // TODO: The result ought always to be an integer. Ensure this.
    return result;
}