// statistics.js: Web Worker file to run statistical tests in the background.

// Constants:

// How many pseudocount trials should we use for the binomial test?
var BINOMIAL_PSEUDOCOUNTS = 5;

// Should we log information about suspicious p values to the console for manual
// spot checking?
var LOG_SUSPICIOUS = false;

// Go get jStat. Hope it's happy in Worker-land.
importScripts("jstat-1.0.0.js");

// Make a fake console to catch jstat warnings, so they don't crash the script.
// print() is safe to reference here because function declarations are hoisted.
// NOTE(review): only "warn" is stubbed; if jstat ever calls console.log or
// console.error this will still throw — confirm jstat only uses warn.
console = {
    warn: print
}
onmessage = function(message) {
    // Handle incoming messages from the page. Each message's data is an RPC
    // request, with "name" set to a function name, "args" set to an array of
    // arguments, and "id" set to an ID that should be returned with the return
    // value in a reply message. If the function call fails, an error is sent
    // back instead of a return value.

    try {
        // Go get the specified global function, and apply it on the given
        // arguments. Use the global scope ("self") as its "this".
        var return_value = self[message.data.name].apply(self,
            message.data.args);

    } catch(exception) {

        // Send the error back to the page instead of a return value.
        // Unfortunately, errors themselves can't be cloned, so we do all the
        // message making here and send back a string.

        // First we build a string with all the parts of the error we can get.
        var error_message = "Error in web worker doing job " + message.data.id;
        error_message += "\n";
        error_message += exception.name + ": " + exception.message;
        error_message += "\n";
        error_message += "Full details:\n";
        // Declared with "var": the original leaked "field" as an implicit
        // global, which could be clobbered by re-entrant message handling.
        for(var field in exception) {
            if(field == "name" || field == "message") {
                // Already got these.
                continue;
            }

            // Copy the field into the message as a string.
            error_message += field + ": " + exception[field] + "\n";
        }
        // Also record the call that failed, so the page log is actionable.
        error_message += "Call: " + message.data.name + "(";
        for(var i = 0; i < message.data.args.length; i++) {
            error_message += message.data.args[i];
            if(i + 1 < message.data.args.length) {
                // Have an argument after this.
                error_message += ", ";
            }
        }
        error_message += ")";

        postMessage({
            id: message.data.id,
            error: error_message
        });

        return;
    }

    // Send the return value back with the id.
    postMessage({
        id: message.data.id,
        return_value: return_value
    });
}
|
|
79
|
|
function print(message) {
    // Forward a log message to the parent page's console by posting a
    // message that carries only a "log" field (no "id", so the page can
    // tell it apart from an RPC reply).
    postMessage({ log: message });
}
|
|
86
|
|
function statistics_for_matrix(matrix_url, in_list, out_list, all_list) {
    // Download the given score matrix, do stats between in_list and out_list
    // for each layer in it, and return an object from layer name to p value.
    // all_list specifies the names of all signatures that figure into the
    // analysis at all.

    // Download the matrix synchronously.
    // See https://developer.mozilla.org/en-US/docs/Web/API/XMLHttpRequest/Synch
    // ronous_and_Asynchronous_Requests
    // A side effect of this is that we won't have more simultaneous downloads
    // than workers, which is probably good.
    // This holds the request.
    var request = new XMLHttpRequest();
    // Get the layer data by GET. The false makes it synchronous.
    request.open("GET", matrix_url, false);
    request.send(null);

    // Now we have the matrix TSV.
    // But we don't have our fancy jQuery TSV parser. Parse it manually.

    // This holds an object of layer data objects (from signature to float) by
    // layer name. Declared with "var": the original leaked this as an
    // implicit global, so concurrent calls in the same worker would have
    // shared (and corrupted) it.
    var layers = {};

    // This holds the array of lines.
    // Split on newlines (as seen in jQuery.tsv.js).
    var lines = request.responseText.split(/\r?\n/);

    // Line 0 gives all the layer names, but the first thing isn't a layer name
    // (since it's above the signature column).
    var layer_names = lines[0].split(/\t/);
    for(var i = 1; i < layer_names.length; i++) {
        // Make sure we have an object for this layer.
        layers[layer_names[i]] = {};
    }

    // The rest of the lines give values per layer for the signature in
    // column 0.
    for(var i = 1; i < lines.length; i++) {
        // This holds the parts of each line.
        var parts = lines[i].split(/\t/);

        if(parts[0]) {
            // We actually have data.

            // Get the signature.
            var signature = parts[0];

            for(var j = 1; j < parts.length; j++) {
                // Go through each non-signature entry and set the appropriate
                // layer's value for this signature.
                layers[layer_names[j]][signature] = parseFloat(parts[j]);
            }
        }
    }

    // Now we've parsed the matrix.
    // Go do stats for each layer.
    // This holds our calculated p values by layer name.
    var p_values = {};

    print("Running statistics for (up to) " + layer_names.length +
        " layers from matrix " + matrix_url);

    for(var i = 1; i < layer_names.length; i++) {
        // Pass the layer data to the per-layer statistics, and get the p value
        // back. It's probably easier to do this in this worker than to go
        // invoke more workers.
        p_values[layer_names[i]] = statistics_for_layer(layers[layer_names[i]],
            in_list, out_list, all_list);
    }

    // We've now calculated a p value for every layer in the matrix. Return the
    // calculated p values labeled by layer.
    return p_values;
}
|
|
163
|
|
function statistics_for_layer(layer_data, in_list, out_list, all_list) {
    // Dispatch to the appropriate statistical test for the passed layer data,
    // between the given in and out arrays of signatures. all_list specifies
    // the names of all signatures that figure into the analysis at all.
    // Returns the test's result, or NaN if no p value could be calculated.

    // Classify the layer by scanning its values: assume it is binary (all
    // values 0 or 1) and discrete (all integers) until a value proves
    // otherwise.
    var is_discrete = true;
    var is_binary = true;

    for(var signature in layer_data) {
        var value = layer_data[signature];

        if(value > 1 || value < 0) {
            // Outside [0, 1], so not a binary layer.
            is_binary = false;
        }

        if(value % 1 !== 0) {
            // Has a fractional part, so it's a continuous (float) layer.
            is_binary = false;
            is_discrete = false;
        }
    }

    if(is_binary) {
        // Binary/dichotomous layer: run a binomial test.
        return binomial_compare(layer_data, in_list, out_list, all_list);
    }

    if(is_discrete) {
        // Multinomial/categorical layer.
        // TODO: statistics for discrete non-binary layers
        return NaN;
    }

    // Continuous layer: run a t test.
    return t_compare(layer_data, in_list, out_list, all_list);
}
|
|
202
|
|
function statistics_for_url(layer_url, in_list, out_list, all_list) {
    // Fetch the layer with the given URL and run the appropriate statistical
    // test between the in and out arrays of signatures. all_list specifies
    // the names of all signatures that figure into the analysis at all.
    // Returns the test result for the layer, or NaN if no p value could be
    // calculated.

    print("Running statistics for individual layer " + layer_url);

    // Fetch the layer TSV with a synchronous GET (the "false" flag); a side
    // effect is that simultaneous downloads are capped at the worker count,
    // which is probably good. See
    // https://developer.mozilla.org/en-US/docs/Web/API/XMLHttpRequest/Synch
    // ronous_and_Asynchronous_Requests
    var request = new XMLHttpRequest();
    request.open("GET", layer_url, false);
    request.send(null);

    // Parse the TSV by hand (our fancy jQuery TSV parser isn't available in
    // a worker): one "signature<TAB>value" record per line.
    // This holds the layer data (signature to float).
    var layer_data = {};

    // Split on newlines (as seen in jQuery.tsv.js).
    var rows = request.responseText.split(/\r?\n/);

    for(var row_number = 0; row_number < rows.length; row_number++) {
        var fields = rows[row_number].split(/\t/);

        if(fields[0]) {
            // Non-empty signature name: record its parsed float value.
            layer_data[fields[0]] = parseFloat(fields[1]);
        }
    }

    // Run stats on the downloaded data.
    return statistics_for_layer(layer_data, in_list, out_list, all_list);
}
|
|
249
|
|
function t_compare(layer_data, in_list, out_list, all_list) {
    // Given the data of a continuous layer object (an object from signature
    // name to float (or undefined)), and arrays of the names of "in" and
    // "out" signatures, do a t test for whether the in signatures differ from
    // the out signatures. Returns an object of metadata, with "p_value" set
    // to the two-tailed p value of the test, or bare NaN if the test cannot
    // be performed (fewer than 2 usable samples in a category, or zero
    // variance in both). all_list is unused here but kept for a signature
    // parallel with binomial_compare.

    // Compute count, mean, and unbiased variance of the usable layer values
    // for a signature list. Missing signatures give undefined, and
    // isNaN(undefined) is true, so they are skipped along with NaN values.
    // Returns null when fewer than two usable values exist.
    function summarize(list) {
        // First pass: count and sum the usable values.
        var count = 0;
        var sum = 0;
        for(var i = 0; i < list.length; i++) {
            if(!isNaN(layer_data[list[i]])) {
                count++;
                sum += layer_data[list[i]];
            }
        }

        if(count < 2) {
            // Not enough data to estimate a variance.
            return null;
        }

        var mean = sum / count;

        // Second pass: the second moment (sum of squared deviations from the
        // mean). Declared with "var" throughout: the original leaked
        // unbiased_variance_in/out as implicit globals.
        var second_moment = 0;
        for(var i = 0; i < list.length; i++) {
            if(!isNaN(layer_data[list[i]])) {
                second_moment += Math.pow(layer_data[list[i]] - mean, 2);
            }
        }

        return {
            count: count,
            mean: mean,
            // Unbiased (n - 1 denominator) sample variance.
            variance: second_moment / (count - 1)
        };
    }

    // Summarize the in group first; bail out before touching the out group
    // if it is too small, matching the original control flow.
    var in_stats = summarize(in_list);
    if(in_stats === null) {
        // Not enough to run the t test.
        return NaN;
    }

    var out_stats = summarize(out_list);
    if(out_stats === null) {
        // Not enough to run the t test.
        return NaN;
    }

    // We can't do the test if both variances are 0.
    if(in_stats.variance == 0 && out_stats.variance == 0) {
        return NaN;
    }

    // Now we can calculate the t test two-tailed p value.
    var p_value = t_test(in_stats.mean, in_stats.variance, in_stats.count,
        out_stats.mean, out_stats.variance, out_stats.count);

    // Return it in a dict with other metadata.
    // We don't really have any other metadata.
    return {
        p_value: p_value
    };
}
|
|
340
|
|
function t_test(mean_in, unbiased_variance_in, number_in, mean_out,
    unbiased_variance_out, number_out) {

    // Given the mean, unbiased variance, and number of samples for both the
    // in group and the out group, compute the two-tailed p value for Welch's
    // t test (unequal sample sizes, unequal variances), testing whether the
    // means differ. See https://en.wikipedia.org/wiki/Student%27s_t-test
    // Assumes we have enough samples to actually perform the test.

    // Each group's variance-of-the-mean term; both the statistic and the
    // degrees of freedom are built from these.
    var variance_term_in = unbiased_variance_in / number_in;
    var variance_term_out = unbiased_variance_out / number_out;

    // The t statistic: where our observed difference of means falls on the
    // t distribution, in units of its standard error.
    var standard_error = Math.sqrt(variance_term_in + variance_term_out);
    var t_statistic = (mean_in - mean_out) / standard_error;

    // The Welch–Satterthwaite degrees of freedom for the particular t
    // distribution that we ought to compare the statistic against.
    var degrees_of_freedom = Math.pow(variance_term_in + variance_term_out, 2) /
        ((Math.pow(variance_term_in, 2) / (number_in - 1)) +
        (Math.pow(variance_term_out, 2) / (number_out - 1)));

    // Compare the statistic to the t CDF via the (undocumented)
    // jstat.pt = function(q, df, ncp, lower_tail, log), where:
    // q is the value to evaluate the CDF at,
    // df is the degrees of freedom,
    // ncp is the "mu" parameter (OK to leave blank),
    // lower_tail selects which tail (defaults to true),
    // log requests log probability (defaults to false).

    // Flip the statistic to the low side of the distribution and take the
    // lower tail's area.
    var lower_tail_area = jstat.pt(0 - Math.abs(t_statistic),
        degrees_of_freedom);

    // The t distribution is symmetric, so the two-tailed p value is just
    // twice the single-tail probability.
    return 2 * lower_tail_area;
}
|
|
385
|
|
function binomial_compare(layer_data, in_list, out_list, all_list) {
    // Given the data of a binary layer object (an object from signature name
    // to 0 or 1 (or undefined)), and arrays of the names of "in" and "out"
    // signatures, do a binomial test for whether the in signatures differ
    // from the out signatures. Uses BINOMIAL_PSEUDOCOUNTS pseudocount trials.
    // Returns an object of metadata with "p_value" set to the two-tailed p
    // value, or bare NaN if the test cannot be performed. all_list specifies
    // the names of all signatures in the analysis at all (i.e. those the
    // user hasn't filtered out), used to decide how many pseudocounts are
    // successes. Signatures in all_list with no data in layer_data are not
    // counted.

    // Tally how many signatures in a list have an explicit 1 (yes) or an
    // explicit 0 (no) in this layer; undefined/no-data signatures count as
    // neither.
    function count_yes_no(list) {
        var yes = 0;
        var no = 0;
        for(var i = 0; i < list.length; i++) {
            var value = layer_data[list[i]];
            if(value === 1) {
                yes++;
            } else if(value === 0) {
                no++;
            }
        }
        return { yes: yes, no: no };
    }

    // Work out the distribution from the out list. It's OK for all the
    // outside hexes to be 0: pseudocounts can still give us a p value.
    var outside = count_yes_no(out_list);

    // Tally the whole analysis set to set the pseudocount success rate.
    var all = count_yes_no(all_list);

    // It's not OK for there to be no hexes in the all set. Maybe they
    // filtered out all the ones with any data?
    if(all.yes + all.no == 0) {
        // TODO: Sure wish we had layer names here.
        print("No signatures were available with data for this layer.");
        return NaN;
    }

    // How many pseudo-yeses we should have: match the frequency in all
    // signatures. (Pseudo-trials is just BINOMIAL_PSEUDOCOUNTS.)
    var pseudo_yes = BINOMIAL_PSEUDOCOUNTS * (all.yes / (all.yes + all.no));

    // The probability of being a 1 for the out list; we test whether the in
    // list differs significantly from this.
    var background_probability = (outside.yes + pseudo_yes) / (outside.yes +
        outside.no + BINOMIAL_PSEUDOCOUNTS);

    if(background_probability == 0) {
        // Can't do the binomial test in this case. Somehow there were no
        // yeses anywhere.
        return NaN;
    }

    // Count the 1s and 0s in the in list.
    var inside = count_yes_no(in_list);

    // The p value for rejecting the null hypothesis that the in signatures
    // follow the background distribution.
    var p = binomial_test(inside.yes + inside.no, inside.yes,
        background_probability);

    if(LOG_SUSPICIOUS && (p == 0 || p == 1)) {
        // We got an odd p value. Complain about it.
        print("Got suspicious p value " + p);
        print("Was binomial test for " + inside.yes + " successes in " +
            (inside.yes + inside.no) + " trials at probability " +
            background_probability);
        print("Background was " + outside.yes + " out of " + (outside.yes +
            outside.no) + " with " + pseudo_yes + " out of " +
            BINOMIAL_PSEUDOCOUNTS + " pseudocounts.");
    }

    // Return the p value as "p_value", plus the raw (non-pseudocount)
    // success counts for the in list and the background.
    return {
        p_value: p,
        "1s in A": inside.yes,
        "1s in background": outside.yes
    };
}
|
|
499
|
|
function binomial_test(trials, successes, success_probability) {
    // Return the p value for rejecting the null hypothesis that the observed
    // number of successes happened in the observed number of trials when the
    // probability of success was success_probability. Does a two-tailed
    // exact Binomial test.

    // Sanity check: more successes than trials is impossible. Log it (but
    // continue; binomial_cdf will return NaN for this case downstream).
    if(trials < successes) {
        print("Trying to test " + trials + " trials with " + successes +
            " successes!");
    }

    // Calculate the P value
    // This must be terribly complicated since nobody seems to have written up
    // how to do it as anything other than an arcane stats ritual.
    // Something close: http://www.johnmyleswhite.com/notebook/2012/04/14/implem
    // enting-the-exact-binomial-test-in-julia/
    // How scipy.stats does it (x = successes, n = trials, p = supposed
    // probability):
    // SourceForge says Scipy is BSD licensed, so we can steal this code for our
    // comments.
    /*
    d = distributions.binom.pmf(x,n,p)
    rerr = 1+1e-7
    if (x < p*n):
        i = np.arange(np.ceil(p*n),n+1)
        y = np.sum(distributions.binom.pmf(i,n,p) <= d*rerr,axis=0)
        pval = distributions.binom.cdf(x,n,p) + distributions.binom.sf(n-y,
            n,p)
    else:
        i = np.arange(np.floor(p*n))
        y = np.sum(distributions.binom.pmf(i,n,p) <= d*rerr,axis=0)
        pval = distributions.binom.cdf(y-1,n,p) + distributions.binom.sf(
            x-1,n,p)
    */
    // There is of course no justification for why this would work.
    // What it's actually doing is a complicated Numpy vectorized operation to
    // find the boundary of the tail we don't have, and then adding the CDF of
    // the lower tail boundary and (1-CDF) of the upper tail boundary (which is
    // the P value by definition).

    // This holds the probability of exactly what we've observed under the null
    // hypothesis. Any outcome at most this likely counts as "at least as
    // extreme" for the opposite tail.
    var observed_probability = binomial_pmf(trials, successes,
        success_probability);

    if(successes < trials * success_probability) {
        // Observation is below the expectation, so this is the lower tail.
        // We know anything with fewer successes than this is more extreme. But
        // how many successes would we need to have an equally extreme but
        // higher than expected number of successes?
        // We should sum down from all successes. (We'll sum from small to large
        // so it's OK numerically.)

        // This holds the total probability of everything more extremely
        // successful than what we've observed.
        var other_tail_total_probability = 0;

        // Walk down from the maximum success count toward the expected
        // value, accumulating pmf mass while each case is at least as
        // unlikely as the observation.
        // TODO: implement some better sort of search thing and use CDF
        for(var other_tail_start = trials; other_tail_start >=
            Math.ceil(trials * success_probability); other_tail_start--) {

            // Get the probability for this particular case
            var case_probability = binomial_pmf(trials, other_tail_start,
                success_probability);

            if(case_probability > observed_probability) {
                // This case is actually less extreme than what we've observed,
                // so our summation is complete.

                break;
            } else {
                // This case is more extreme than what we've observed, so use it
                other_tail_total_probability += case_probability;
            }
        }

        // This holds the probability in this (lower) tail: everything at or
        // below the observed success count.
        var this_tail_probability = binomial_cdf(trials, successes,
            success_probability)


        // Return the total probability from both tails, clamped to 1.
        return Math.min(this_tail_probability + other_tail_total_probability,
            1.0);
    } else {
        // Observation is at or above the expectation: the upper tail.
        // We know anything with more successes than this is more extreme. But
        // how few successes would we need to have an equally extreme but lower
        // than expected number of successes?
        // We will sum up from 0 successes. We really ought to use the CDF
        // somehow, but I can't think of how we would do it.

        // This holds the total probability of everything more extremely
        // failureful than what we've observed.
        var other_tail_total_probability = 0;

        for(var other_tail_end = 0; other_tail_end <
            Math.floor(trials * success_probability); other_tail_end++) {
            // We only have to iterate up to the peak (most likely) value.

            // Get the probability for this particular case
            var case_probability = binomial_pmf(trials, other_tail_end,
                success_probability);

            if(case_probability > observed_probability) {
                // This case is actually less extreme than what we've observed,
                // so our summation is complete.
                break;
            } else {
                // This case is more extreme than what we've observed, so use it
                other_tail_total_probability += case_probability;
            }

        }

        // This holds the probability in this tail. It is equal to the
        // probability up to, but not including, where this tail starts. So even
        // if the tail starts at the highest possible number of successes, it
        // has some probability. successes can't be 0 here (since then we'd be
        // below any nonzero expected probability and take the other branch.
        // Since it's a positive integer, it must be 1 or more, so we can
        // subtract 1 safely.
        var this_tail_probability = 1 - binomial_cdf(trials, successes - 1,
            success_probability);

        // Return the total probability from both tails, clamped to 1
        return Math.min(this_tail_probability + other_tail_total_probability,
            1.0);
    }

}
|
|
630
|
|
function binomial_cdf(trials, successes, success_probability) {
    // The Binomial distribution's cumulative distribution function. Given a
    // number of trials, a number of successes, and a success probability,
    // return the probability of having observed that many successes or fewer.
    //
    // Computed via the regularized incomplete beta function (AKA the beta
    // distribution CDF), which jstat provides. See
    // http://en.wikipedia.org/wiki/Binomial_distribution#Cumulative_distrib
    // ution_function and http://en.wikipedia.org/wiki/Regularized_incomplete_be
    // ta_function#Incomplete_beta_function

    if(trials == successes) {
        // All outcomes have this many successes or fewer, so the answer is
        // trivially 1. (This also sidesteps jStat's refusal to take a 0
        // alpha for its beta distribution, which a zero failure count would
        // produce below.)
        return 1;
    }

    if(trials < successes) {
        // This should never happen. TODO: Debug when it happens.
        print("Error: trials (" + trials + ") < successes (" + successes +
            ")!");
        return NaN;
    }

    // Evaluate the beta distribution CDF at 1 - p with alpha = failures and
    // beta = successes + 1; that value happens to also be our CDF.
    return jstat.pbeta(1 - success_probability, trials - successes,
        successes + 1);
}
|
|
665
|
|
function binomial_pmf(trials, successes, success_probability) {
    // The Binomial distribution's probability mass function. Given a number
    // of trials, a number of successes, and the probability of success on
    // each trial, calculate the probability of observing exactly that many
    // successes in that many trials at that success rate.

    // Sum the probability of one particular arrangement of successes and
    // failures over all the mutually exclusive arrangements, i.e. multiply
    // by the number of ways to place the successes among the trials.
    var failures = trials - successes;
    var arrangements = choose(trials, successes);

    return arrangements *
        Math.pow(success_probability, successes) *
        Math.pow(1 - success_probability, failures);
}
|
|
681
|
|
function choose(available, selected) {
    // The choose function: from available distinct objects, how many ways are
    // there to select selected of them. Returns "available choose selected".
    // Works with large input numbers that are too big to take the factorials
    // of. Returns 0 for out-of-range requests (selected negative or greater
    // than available), and always returns an integer.

    if(selected < 0 || selected > available) {
        // There are no ways to choose a negative number of objects, or more
        // objects than are available. (The original fell through to the
        // product loop and incorrectly returned 1 here.)
        return 0;
    }

    // We use a neat overflow-robust algorithm that eliminates the factorials
    // and makes the computation a multiplication of numbers greater than one.
    // So, no overflow unless the result itself is too big.
    // See http://arantxa.ii.uam.es/~ssantini/writing/notes/s667_binomial.pdf

    if(selected < available - selected) {
        // By symmetry, C(n, k) == C(n, n - k). It would be faster to think
        // about choosing what we don't include (the loop below runs
        // available - selected times), so do that instead.
        return choose(available, available - selected);
    }

    // Accumulate prod_{i = 1}^{available - selected} (1 + selected / i),
    // which equals C(available, selected). Initialize to the multiplicative
    // identity.
    var result = 1;

    for(var i = 1; i < available - selected + 1; i++) {
        result *= (1 + (selected / i));
    }

    // The true value is always an integer; round away the floating-point
    // error accumulated by the divisions above.
    return Math.round(result);
}
|