Mercurial > repos > adam-novak > hexagram
diff hexagram-6ae12361157c/hexagram/statistics.js~ @ 0:1407e3634bcf draft default tip
Uploaded r11 from test tool shed.
author | adam-novak |
---|---|
date | Tue, 22 Oct 2013 14:17:59 -0400 |
parents | |
children |
line wrap: on
line diff
// statistics.js: Web Worker file to run statistical tests in the background.

// Constants:
// How many pseudocount trials should we use for the binomial test?
var BINOMIAL_PSEUDOCOUNTS = 5;

// Should we log information about suspicious p values to the console for manual
// spot checking?
var LOG_SUSPICIOUS = false;

// Go get jStat. Hope it's happy in Worker-land.
importScripts("jstat-1.0.0.js");

// Make a fake console to catch jStat warnings, so they don't crash the script.
// This deliberately replaces the worker-global console, so the assignment is
// made explicitly on self rather than as an implicit global.
self.console = {
    warn: print
};

onmessage = function(message) {
    // Handle incoming messages from the page. Each message's data is an RPC
    // request, with "name" set to a function name, "args" set to an array of
    // arguments, and "id" set to an ID that should be returned with the return
    // value in a reply message. If the function call fails, an error is sent
    // back.

    try {
        // Go get the specified global function, and apply it on the given
        // arguments. Use the global scope ("self") as its "this".
        var return_value = self[message.data.name].apply(self,
            message.data.args);

    } catch(exception) {

        // Send the error back to the page instead of a return value.
        // Unfortunately, errors themselves can't be cloned, so we do all the
        // message making here and send back a string.

        // First we build a string with all the parts of the error we can get.
        var error_message = "Error in web worker doing job " + message.data.id;
        error_message += "\n";
        error_message += exception.name + ": " + exception.message;
        error_message += "\n";
        error_message += "Full details:\n";
        for(var field in exception) {
            if(field == "name" || field == "message") {
                // Already got these.
                continue;
            }

            // Copy the field into the message as a string.
            error_message += field + ": " + exception[field] + "\n";
        }
        error_message += "Call: " + message.data.name + "(";
        for(var i = 0; i < message.data.args.length; i++) {
            error_message += message.data.args[i];
            if(i + 1 < message.data.args.length) {
                // Have an argument after this.
                error_message += ", ";
            }
        }
        error_message += ")";

        postMessage({
            id: message.data.id,
            error: error_message
        });

        return;
    }

    // Send the return value back with the id.
    postMessage({
        id: message.data.id,
        return_value: return_value
    });
}

function print(message) {
    // Print a message to the console of the parent page.
    postMessage({
        log: message
    });
}

function statistics_for_matrix(matrix_url, in_list, out_list, all_list) {
    // Download the given score matrix, do stats between in_list and out_list
    // for each layer in it, and return an object from layer name to p value
    // (or per-layer result object). all_list specifies the names of all
    // signatures that figure into the analysis at all.

    // Download the matrix synchronously.
    // See https://developer.mozilla.org/en-US/docs/Web/API/XMLHttpRequest/Synch
    // ronous_and_Asynchronous_Requests
    // A side effect of this is that we won't have more simultaneous downloads
    // than workers, which is probably good.
    // This holds the request.
    var request = new XMLHttpRequest();
    // Get the layer data by GET. The false makes it synchronous.
    request.open("GET", matrix_url, false);
    request.send(null);

    // Now we have the layer TSV
    // But we don't have our fancy jQuery TSV parser. Parse it manually.

    // This holds an object of layer data objects (from signature to float) by
    // layer name.
    var layers = {};

    // This holds the array of lines
    // Split on newlines (as seen in jQuery.tsv.js)
    var lines = request.responseText.split(/\r?\n/);

    // Line 0 gives all the layer names, but the first thing isn't a layer name
    // (since it's above the signature column).
    var layer_names = lines[0].split(/\t/);
    for(var i = 1; i < layer_names.length; i++) {
        // Make sure we have an object for this layer
        layers[layer_names[i]] = {};
    }

    // The rest give values per layer for the hex in column 1.
    for(var i = 1; i < lines.length; i++) {
        // This holds the parts of each line
        var parts = lines[i].split(/\t/);

        if(parts[0]) {
            // We actually have data

            // Get the signature
            var signature = parts[0];

            for(var j = 1; j < parts.length; j++) {
                // Go through each non-signature entry and set the appropriate
                // layer's value for this signature.
                layers[layer_names[j]][signature] = parseFloat(parts[j]);
            }
        }
    }

    // Now we've parsed the matrix.
    // Go do stats for each layer.
    // This holds our calculated p values by layer name.
    var p_values = {};

    print("Running statistics for (up to) " + layer_names.length +
        " layers from matrix " + matrix_url);

    for(var i = 1; i < layer_names.length; i++) {
        // Pass the layer data to the per-layer statistics, and get the p value
        // back. It's probably easier to do this in this worker than to go
        // invoke more workers.
        p_values[layer_names[i]] = statistics_for_layer(layers[layer_names[i]],
            in_list, out_list, all_list);
    }

    // We've now calculated a p value for every layer in the matrix. Return the
    // calculated p values labeled by layer.
    return p_values;

}

function statistics_for_layer(layer_data, in_list, out_list, all_list) {
    // Run the appropriate statistical test for the passed layer data, between
    // the given in and out arrays of signatures. all_list specifies the names
    // of all signatures that figure into the analysis at all. Returns NaN if
    // no p value could be calculated; otherwise returns whatever the
    // underlying test returns (a metadata object with a "p_value" field for
    // the t and binomial tests).

    // This holds whether the layer is discrete
    var is_discrete = true;

    // This holds whether the layer is binary
    var is_binary = true;

    for(var signature in layer_data) {
        if(layer_data[signature] > 1 || layer_data[signature] < 0) {
            // Not a binary layer
            is_binary = false;
        }

        if(layer_data[signature] % 1 !== 0) {
            // It's a float
            is_binary = false;
            is_discrete = false;
        }
    }

    if(is_binary) {
        // This is a binary/dichotomous layer, so run a binomial test.
        return binomial_compare(layer_data, in_list, out_list, all_list);
    } else if (is_discrete) {
        // This is a multinomial/categorical layer
        // TODO: statistics for discrete non-binary layers
        return NaN;
    } else {
        // This is a continuous layer, so run a t test
        return t_compare(layer_data, in_list, out_list, all_list);
    }

}

function statistics_for_url(layer_url, in_list, out_list, all_list) {
    // Run the stats for the layer with the given url, between the given in and
    // out arrays of signatures. all_list specifies the names of all signatures
    // that figure into the analysis at all. Return the result of
    // statistics_for_layer for the downloaded layer (NaN if no p value could
    // be calculated).

    print("Running statistics for individual layer " + layer_url);

    // Download the layer data synchronously.
    // See https://developer.mozilla.org/en-US/docs/Web/API/XMLHttpRequest/Synch
    // ronous_and_Asynchronous_Requests
    // A side effect of this is that we won't have more simultaneous downloads
    // than workers, which is probably good.
    // This holds the request.
    var request = new XMLHttpRequest();
    // Get the layer data by GET. The false makes it synchronous.
    request.open("GET", layer_url, false);
    request.send(null);

    // Now we have the layer TSV
    // But we don't have our fancy jQuery TSV parser. Parse it manually.

    // This holds the layer data (signature to float)
    var layer_data = {};

    // This holds the array of lines
    // Split on newlines (as seen in jQuery.tsv.js)
    var lines = request.responseText.split(/\r?\n/);

    for(var i = 0; i < lines.length; i++) {
        // This holds the parts of each line
        var parts = lines[i].split(/\t/);

        if(parts[0]) {
            // We actually have data
            // Parse the layer value for this signature
            var value = parseFloat(parts[1]);

            // Store the value in the layer data
            layer_data[parts[0]] = value;
        }
    }

    // Run stats on the downloaded data
    return statistics_for_layer(layer_data, in_list, out_list, all_list);
}

function t_compare(layer_data, in_list, out_list, all_list) {
    // Given the data of a continuous layer object (an object from signature
    // name to float (or undefined)), and arrays of the names of "in" and "out"
    // signatures, do a t test for whether the in signatures differ from
    // the out signatures. Returns an object of metadata, with "p_value" set to
    // the p value of the test (two-tailed), or bare NaN if the test cannot be
    // performed (due to, e.g. fewer than 2 samples in one category).

    // Go through the in list and calculate all the summary statistics
    // How many non-NaN values?
    var number_in = 0;
    // What is the sum?
    var sum_in = 0;

    for(var i = 0; i < in_list.length; i++) {
        if(!isNaN(layer_data[in_list[i]])) {
            number_in++;
            sum_in += layer_data[in_list[i]];
        }
    }

    // We've done one pass, so we know if we have any in list data actually
    if(number_in < 2) {
        // Not enough to run the t test
        return NaN;
    }

    // What is the mean?
    var mean_in = sum_in / number_in;

    // What is the second moment (sum of squares of differences from the mean)
    var second_moment_in = 0;
    for(var i = 0; i < in_list.length; i++) {
        if(!isNaN(layer_data[in_list[i]])) {
            second_moment_in += Math.pow(layer_data[in_list[i]] - mean_in, 2);
        }
    }

    // What is the unbiased variance?
    var unbiased_variance_in = second_moment_in / (number_in - 1);

    // Now go through the same process for the out list
    // How many non-NaN values?
    var number_out = 0;
    // What is the sum?
    var sum_out = 0;

    for(var i = 0; i < out_list.length; i++) {
        if(!isNaN(layer_data[out_list[i]])) {
            number_out++;
            sum_out += layer_data[out_list[i]];
        }
    }

    // We've done one pass, so we know if we have any out list data actually
    if(number_out < 2) {
        // Not enough to run the t test
        return NaN;
    }

    // What is the mean?
    var mean_out = sum_out / number_out;

    // What is the second moment (sum of squares of differences from the mean)
    var second_moment_out = 0;
    for(var i = 0; i < out_list.length; i++) {
        if(!isNaN(layer_data[out_list[i]])) {
            second_moment_out += Math.pow(layer_data[out_list[i]] - mean_out,
                2);
        }
    }

    // What is the unbiased variance?
    var unbiased_variance_out = second_moment_out / (number_out - 1);

    // We can't do the test if both variances are 0
    if(unbiased_variance_in == 0 && unbiased_variance_out == 0) {
        return NaN;
    }

    // Now we can calculate the t test two-tailed p value
    var p_value = t_test(mean_in, unbiased_variance_in, number_in, mean_out,
        unbiased_variance_out, number_out);

    // And return it in a dict with other metadata.
    // We don't really have any other metadata.
    return {
        p_value: p_value
    };
}

function t_test(mean_in, unbiased_variance_in, number_in, mean_out,
    unbiased_variance_out, number_out) {

    // Given the mean, unbiased variance, and number of samples for both the in
    // group and the out group, compute the p value for the t test with unequal
    // sample sizes and unequal variances, testing to see whether the means
    // differ (a two-tailed "Welch's" t test). See
    // https://en.wikipedia.org/wiki/Student%27s_t-test
    // Assumes we have enough samples to actually perform the test.

    // First, calculate the t statistic, which is where our observations fall on
    // the t distribution.
    var t_statistic = (mean_in - mean_out) / Math.sqrt((unbiased_variance_in /
        number_in) + (unbiased_variance_out / number_out));

    // Calculate the degrees of freedom for the particular t distribution that
    // we ought to compare the statistic against (the Welch-Satterthwaite
    // approximation).
    var degrees_of_freedom = Math.pow((unbiased_variance_in / number_in) +
        (unbiased_variance_out / number_out), 2) /
        ((Math.pow(unbiased_variance_in / number_in, 2) / (number_in - 1)) +
        (Math.pow(unbiased_variance_out / number_out, 2) / (number_out - 1)));

    // Now we have to compare the t statistic to the t test CDF available via
    // the totally undocumented jstat.pt = function(q, df, ncp, lower_tail, log)
    // where:
    // q is the t statistic value to calculate the cdf at
    // df is the degrees of freedom
    // ncp is the "mu" parameter for the t distribution. I think this sets the
    // mean, and it's OK to leave blank.
    // lower_tail presumably specifies if we want the lower or upper tail of the
    // CDF. Defaults to true.
    // Log specifies if we want the log probability. Defaults to false.

    // Make the t statistic be on the low side of the distribution, and
    // calculate the lower tail's area using the CDF.
    var one_tail_probability = jstat.pt(0 - Math.abs(t_statistic),
        degrees_of_freedom);

    // Return the two-tailed p value, which, since the t distribution is
    // symmetric, is just twice the single-tail probability
    return 2 * one_tail_probability;

}

function binomial_compare(layer_data, in_list, out_list, all_list) {
    // Given the data of a binary layer object (an object from signature name to
    // 0 or 1 (or undefined)), and arrays of the names of "in" and "out"
    // signatures, do a binomial test for whether the in signatures differ from
    // the out signatures. Uses a number of pseudocount trials as specified in
    // the global constant BINOMIAL_PSEUDOCOUNTS. Returns an object of metadata,
    // with "p_value" set to the p value of the test (two-tailed), or bare NaN
    // if the test cannot be performed. all_list specifies the names of all
    // signatures that figure into the analysis at all (i.e. those which the
    // user hasn't filtered out), which we use when calculating how many of our
    // pseudocounts should be successes. Signature names appearing in all_list
    // but with no data in layer_data are not counted.

    // Work out the distribution from the out list
    // How many out signatures are 1?
    var outside_yes = 0;
    // And are 0?
    var outside_no = 0;

    for(var i = 0; i < out_list.length; i++) {
        if(layer_data[out_list[i]] === 1) {
            // This is a yes and it's outside.
            outside_yes++;
        } else if(layer_data[out_list[i]] === 0) {
            // A no and outside
            outside_no++;
        }
    }

    // It's OK for all the outside hexes to be 0 now. Pseudocounts can give us a
    // p value.

    // Now work out our pseudocounts.
    // How many signatures in all_list are successes?
    var all_yes = 0;
    // And how many are failures (as opposed to undef)
    var all_no = 0;

    for(var i = 0; i < all_list.length; i++) {
        if(layer_data[all_list[i]] === 1) {
            // A yes anywhere
            all_yes++;
        } else if(layer_data[all_list[i]] === 0) {
            // A real no (not a no-data) anywhere
            all_no++;
        }
    }

    // It's not OK for there to be no hexes in the all set. Maybe they filtered
    // out all the ones with any data?
    if(all_yes + all_no == 0) {
        // TODO: Sure wish we had layer names here.
        print("No signatures were available with data for this layer.");
        return NaN;
    }

    // Calculate how many pseudo-yeses we should have.
    // Match the frequency in all signatures.
    var pseudo_yes = BINOMIAL_PSEUDOCOUNTS * (all_yes / (all_yes + all_no));

    // pseudo-trials is just BINOMIAL_PSEUDOCOUNTS

    // This holds the probability of being a 1 for the out list.
    // We want to test if the in list differs significantly from this.
    var background_probability = (outside_yes + pseudo_yes) / (outside_yes +
        outside_no + BINOMIAL_PSEUDOCOUNTS);

    if(background_probability == 0) {
        // Can't do the binomial test in this case. Somehow there were no yeses
        // anywhere.
        return NaN;
    }

    // How many 1s are in the in list?
    var inside_yes = 0;
    // And how many 0s?
    var inside_no = 0;

    for(var i = 0; i < in_list.length; i++) {
        if(layer_data[in_list[i]] === 1) {
            // This is a yes and it's inside.
            inside_yes++;
        } else if(layer_data[in_list[i]] === 0) {
            // A no and it's inside
            inside_no++;
        }
    }

    // Return the p value for rejecting the null hypothesis that the in
    // signatures follow the background distribution.
    var p = binomial_test(inside_yes + inside_no, inside_yes,
        background_probability);

    if(LOG_SUSPICIOUS && (p == 0 || p == 1)) {
        // We got an odd p value. Complain about it.
        print("Got suspicious p value " + p);
        print("Was binomial test for " + inside_yes + " successes in " +
            (inside_yes + inside_no) + " trials at probability " +
            background_probability);
        print("Background was " + outside_yes + " out of " + (outside_yes +
            outside_no) + " with " + pseudo_yes + " out of " +
            BINOMIAL_PSEUDOCOUNTS + " pseudocounts.");
    }

    // Return our p value as "p_value", and also how many non-pseudocount
    // successes were in the in_list and the out_list.
    return {
        p_value: p,
        "1s in A": inside_yes,
        "1s in background": outside_yes
    };
}

function binomial_test(trials, successes, success_probability) {
    if(trials < successes) {
        print("Trying to test " + trials + " trials with " + successes +
            " successes!");
    }

    // Return the p value for rejecting the null hypothesis that the observed
    // number of successes happened in the observed number of trials when the
    // probability of success was success_probability. Does a Binomial
    // test.

    // Calculate the P value
    // This must be terribly complicated since nobody seems to have written up
    // how to do it as anything other than an arcane stats ritual.
    // Something close: http://www.johnmyleswhite.com/notebook/2012/04/14/implem
    // enting-the-exact-binomial-test-in-julia/
    // How scipy.stats does it (x = successes, n = trials, p = supposed
    // probability):
    // SourceForge says Scipy is BSD licensed, so we can steal this code for our
    // comments.
    /*
    d = distributions.binom.pmf(x,n,p)
    rerr = 1+1e-7
    if (x < p*n):
        i = np.arange(np.ceil(p*n),n+1)
        y = np.sum(distributions.binom.pmf(i,n,p) <= d*rerr,axis=0)
        pval = distributions.binom.cdf(x,n,p) + distributions.binom.sf(n-y,
            n,p)
    else:
        i = np.arange(np.floor(p*n))
        y = np.sum(distributions.binom.pmf(i,n,p) <= d*rerr,axis=0)
        pval = distributions.binom.cdf(y-1,n,p) + distributions.binom.sf(
            x-1,n,p)
    */
    // There is of course no justification for why this would work.
    // What it's actually doing is a complicated Numpy vectorized operation to
    // find the boundary of the tail we don't have, and then adding the CDF of
    // the lower tail boundary and (1-CDF) of the upper tail boundary (which is
    // the P value by definition).

    // This holds the probability of exactly what we've observed under the null
    // hypothesis.
    var observed_probability = binomial_pmf(trials, successes,
        success_probability);

    if(successes < trials * success_probability) {
        // We know anything with fewer successes than this is more extreme. But
        // how many successes would we need to have an equally extreme but
        // higher than expected number of successes?
        // We should sum down from all successes. (We'll sum from small to large
        // so it's OK numerically.)

        // This holds the total probability of everything more extremely
        // successful than what we've observed.
        var other_tail_total_probability = 0;

        // TODO: implement some better sort of search thing and use CDF
        for(var other_tail_start = trials; other_tail_start >=
            Math.ceil(trials * success_probability); other_tail_start--) {

            // Get the probability for this particular case
            var case_probability = binomial_pmf(trials, other_tail_start,
                success_probability);

            if(case_probability > observed_probability) {
                // This case is actually less extreme than what we've observed,
                // so our summation is complete.
                break;
            } else {
                // This case is more extreme than what we've observed, so use it
                other_tail_total_probability += case_probability;
            }
        }

        // This holds the probability in this tail
        var this_tail_probability = binomial_cdf(trials, successes,
            success_probability);

        // Return the total probability from both tails, clamped to 1.
        return Math.min(this_tail_probability + other_tail_total_probability,
            1.0);
    } else {
        // We know anything with more successes than this is more extreme. But
        // how few successes would we need to have an equally extreme but lower
        // than expected number of successes?
        // We will sum up from 0 successes. We really ought to use the CDF
        // somehow, but I can't think of how we would do it.

        // This holds the total probability of everything more extremely
        // failureful than what we've observed.
        var other_tail_total_probability = 0;

        for(var other_tail_end = 0; other_tail_end <
            Math.floor(trials * success_probability); other_tail_end++) {
            // We only have to iterate up to the peak (most likely) value.

            // Get the probability for this particular case
            var case_probability = binomial_pmf(trials, other_tail_end,
                success_probability);

            if(case_probability > observed_probability) {
                // This case is actually less extreme than what we've observed,
                // so our summation is complete.
                break;
            } else {
                // This case is more extreme than what we've observed, so use it
                other_tail_total_probability += case_probability;
            }

        }

        // This holds the probability in this tail. It is equal to the
        // probability up to, but not including, where this tail starts. So even
        // if the tail starts at the highest possible number of successes, it
        // has some probability. successes can't be 0 here (since then we'd be
        // below any nonzero expected probability and take the other branch).
        // Since it's a positive integer, it must be 1 or more, so we can
        // subtract 1 safely.
        var this_tail_probability = 1 - binomial_cdf(trials, successes - 1,
            success_probability);

        // Return the total probability from both tails, clamped to 1
        return Math.min(this_tail_probability + other_tail_total_probability,
            1.0);
    }

}

function binomial_cdf(trials, successes, success_probability) {
    // The Binomial distribution's cumulative distribution function. Given a
    // number of trials, a number of successes, and a success probability,
    // return the probability of having observed that many successes or fewer.

    // We compute this efficiently using the "regularized incomplete beta
    // function", AKA the beta distribution cdf, which we get from jstat.
    // See http://en.wikipedia.org/wiki/Binomial_distribution#Cumulative_distrib
    // ution_function and http://en.wikipedia.org/wiki/Regularized_incomplete_be
    // ta_function#Incomplete_beta_function

    if(trials == successes) {
        // jStat doesn't want a 0 alpha for its beta distribution (no failures)
        // Calculate this one by hand (it's easy)
        return 1;
    }

    if(trials < successes) {
        // This should never happen. TODO: Debug when it happens.
        print("Error: trials (" + trials + ") < successes (" + successes +
            ")!");
        return NaN;
    }

    // This is the observation that we want the beta distribution CDF before
    var beta_observation = 1 - success_probability;

    // These are the parameters of the relevant beta distribution
    var beta_alpha = trials - successes;
    var beta_beta = successes + 1;

    // Return the beta distribution CDF value, which happens to also be our CDF.
    return jstat.pbeta(beta_observation, beta_alpha, beta_beta);
}

function binomial_pmf(trials, successes, success_probability) {
    // The Binomial distribution's probability mass function. Given a number of
    // trials, a number of successes, and the probability of success on each
    // trial, calculate the probability of observing that many successes in that
    // many trials with the given success rate.

    // The probability of this many successes in this many trials at this
    // success rate is the probability of succeeding so many times and failing
    // so many times, summed over all the mutually exclusive arrangements of
    // successes and failures.
    return (choose(trials, successes) *
        Math.pow(success_probability, successes) *
        Math.pow(1 - success_probability, trials - successes));

}

function choose(available, selected) {
    // The choose function: from available distinct objects, how many ways are
    // there to select selected of them. Returns "available choose selected".
    // Works with large input numbers that are too big to take the factorials
    // of.

    // We use a neat overflow-robust algorithm that eliminates the factorials
    // and makes the computation a multiplication of numbers greater than one.
    // So, no overflow unless the result itself is too big.
    // See http://arantxa.ii.uam.es/~ssantini/writing/notes/s667_binomial.pdf

    if(selected < available - selected) {
        // It would be faster to think about choosing what we don't include. So
        // do that instead.
        return choose(available, available - selected);
    }

    // This holds the result we are accumulating. Initialize to the
    // multiplicative identity.
    var result = 1;

    for(var i = 1; i < available - selected + 1; i++) {
        result *= (1 + (selected / i));
    }

    // TODO: The result ought always to be an integer. Ensure this.
    return result;
}