Mercurial > repos > adam-novak > hexagram
comparison hexagram-6ae12361157c/hexagram/statistics.js @ 0:1407e3634bcf draft default tip
Uploaded r11 from test tool shed.
author | adam-novak |
---|---|
date | Tue, 22 Oct 2013 14:17:59 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1407e3634bcf |
---|---|
1 // statistics.js: Web Worker file to run statistical tests in the background. | |
2 | |
3 // Constants: | |
4 // How many pseudocount trials should we use for the binomial test? | |
5 var BINOMIAL_PSEUDOCOUNTS = 5; | |
6 | |
7 // Should we log information about suspicious p values to the console for manual | |
8 // spot checking? | |
9 var LOG_SUSPICIOUS = false; | |
10 | |
11 // Go get jStat. Hope it's happy in Worker-land. | |
12 importScripts("jstat-1.0.0.js"); | |
13 | |
14 // Make a fake console to catch jstat warnings, so they don't crash the script. | |
15 console = { | |
16 warn: print | |
17 } | |
18 | |
19 onmessage = function(message) { | |
20 // Handle incoming messages from the page. Each message's data is an RPC | |
21 // request, with "name" set to a function name, "args" set to an array of | |
22 // arguments, and "id" set to an ID that should be returned with the return | |
23 // value in a reply message. If the function call fails, an error is sent | |
24 // back. | |
25 | |
26 | |
27 try { | |
28 // Go get the specified global function, and apply it on the given | |
29 // arguments. Use the global scope ("self") as its "this". | |
30 var return_value = self[message.data.name].apply(self, | |
31 message.data.args); | |
32 | |
33 } catch(exception) { | |
34 | |
35 // Send the error back to the page instead of a return value. | |
36 // Unfortunately, errors themselves can't be cloned, so we do all the | |
37 // message making here and send back a string. | |
38 | |
39 // First we build a string with all the parts of the error we can get. | |
40 var error_message = "Error in web worker doing job " + message.data.id; | |
41 error_message += "\n"; | |
42 error_message += exception.name + ": " + exception.message; | |
43 error_message += "\n"; | |
44 error_message += "Full details:\n"; | |
45 for(field in exception) { | |
46 if(field == "name" || field == "message") { | |
47 // Already got these. | |
48 continue; | |
49 } | |
50 | |
51 // Copy the field into the message as a string. | |
52 error_message += field + ": " + exception[field] + "\n"; | |
53 } | |
54 error_message += "Call: " + message.data.name + "("; | |
55 for(var i = 0; i < message.data.args.length; i++) { | |
56 error_message += message.data.args[i]; | |
57 if(i + 1 < message.data.args.length) { | |
58 // Have an argument after this. | |
59 error_message += ", "; | |
60 } | |
61 } | |
62 error_message += ")"; | |
63 | |
64 postMessage({ | |
65 id: message.data.id, | |
66 error: error_message | |
67 }); | |
68 | |
69 return; | |
70 } | |
71 | |
72 | |
73 // Send the return value back with the id. | |
74 postMessage({ | |
75 id: message.data.id, | |
76 return_value: return_value | |
77 }); | |
78 } | |
79 | |
80 function print(message) { | |
81 // Print a message to the console of the parent page. | |
82 postMessage({ | |
83 log: message | |
84 }); | |
85 } | |
86 | |
87 function statistics_for_matrix(matrix_url, in_list, out_list, all_list) { | |
88 // Download the given score matrix, do stats between in_list and out_list | |
89 // for each layer in it, and return an object from layer name to p value. | |
90 // all_list specifies the names of all signatures that figure into the | |
91 // analysis at all. | |
92 | |
93 // Download the matrix synchronously. | |
94 // See https://developer.mozilla.org/en-US/docs/Web/API/XMLHttpRequest/Synch | |
95 // ronous_and_Asynchronous_Requests | |
96 // A side effect of this is that we won't have more simultaneous downloads | |
97 // than workers, which is probably good. | |
98 // This holds the request. | |
99 var request = new XMLHttpRequest(); | |
100 // Get the layer data by GET. The false makes it synchronous. | |
101 request.open("GET", matrix_url, false); | |
102 request.send(null); | |
103 | |
104 // Now we have the layer TSV | |
105 // But we don't have our fancy jQuery TSV parser. Parse it manually. | |
106 | |
107 // This holds an object of layer data objects (from signature to float) by | |
108 // layer name. | |
109 layers = {}; | |
110 | |
111 // This holds the array of lines | |
112 // Split on newlines (as seen in jQuery.tsv.js) | |
113 var lines = request.responseText.split(/\r?\n/); | |
114 | |
115 // Line 0 gives all the layer names, but the first thing isn't a layer name | |
116 // (since it's above the signature column). | |
117 var layer_names = lines[0].split(/\t/); | |
118 for(var i = 1; i < layer_names.length; i++) { | |
119 // Make sure we have an object for this layer | |
120 layers[layer_names[i]] = {}; | |
121 } | |
122 | |
123 // The rest give values per layer for the hex in column 1. | |
124 for(var i = 1; i < lines.length; i++) { | |
125 // This holds the parts of each line | |
126 var parts = lines[i].split(/\t/); | |
127 | |
128 if(parts[0]) { | |
129 // We actually have data | |
130 | |
131 // Get the singature | |
132 var signature = parts[0]; | |
133 | |
134 for(var j = 1; j < parts.length; j++) { | |
135 // Go through each non-signature entry and set the appropriate | |
136 // layer's value for this signature. | |
137 layers[layer_names[j]][signature] = parseFloat(parts[j]); | |
138 } | |
139 } | |
140 } | |
141 | |
142 // Now we've parsed the matrix. | |
143 // Go do stats for each layer. | |
144 // This holds our calculated p valued by layer name. | |
145 var p_values = {}; | |
146 | |
147 print("Running statistics for (up to) " + layer_names.length + | |
148 " layers from matrix " + matrix_url); | |
149 | |
150 for(var i = 1; i < layer_names.length; i++) { | |
151 // Pass the layer data to the per-layer statistics, and get the p value | |
152 // back. It's probably easier to do this in this worker than to go | |
153 // invoke more workers. | |
154 p_values[layer_names[i]] = statistics_for_layer(layers[layer_names[i]], | |
155 in_list, out_list, all_list); | |
156 } | |
157 | |
158 // We've now calculated a p value for every layer in the matrix. Return the | |
159 // calculated p values labeled by layer. | |
160 return p_values; | |
161 | |
162 } | |
163 | |
164 function statistics_for_layer(layer_data, in_list, out_list, all_list) { | |
165 // Run the appropriate statistical test for the passed layer data, between | |
166 // the given in and out arrays of signatures. all_list specifies the names | |
167 // of all signatures that figure into the analysis at all. Return the p | |
168 // value for the layer, or NaN if no p value could be calculated. | |
169 | |
170 // This holds whether the layer is discrete | |
171 var is_discrete = true; | |
172 | |
173 // This holds whether the layer is binary | |
174 var is_binary = true; | |
175 | |
176 for(var signature in layer_data) { | |
177 if(layer_data[signature] > 1 || layer_data[signature] < 0) { | |
178 // Not a binary layer | |
179 is_binary = false; | |
180 } | |
181 | |
182 if(layer_data[signature] % 1 !== 0) { | |
183 // It's a float | |
184 is_binary = false; | |
185 is_discrete = false; | |
186 } | |
187 } | |
188 | |
189 if(is_binary) { | |
190 // This is a binary/dichotomous layer, so run a binomial test. | |
191 return binomial_compare(layer_data, in_list, out_list, all_list); | |
192 } else if (is_discrete) { | |
193 // This is a multinomial/categorical layer | |
194 // TODO: statistics for discrete non-binary layers | |
195 return NaN; | |
196 } else { | |
197 // This is a continuous layer, so run a t test | |
198 return t_compare(layer_data, in_list, out_list, all_list); | |
199 } | |
200 | |
201 } | |
202 | |
203 function statistics_for_url(layer_url, in_list, out_list, all_list) { | |
204 // Run the stats for the layer with the given url, between the given in and | |
205 // out arrays of signatures. all_list specifies the names of all signatures | |
206 // that figure into the analysis at all. Return the p value for the layer, | |
207 // or NaN if no p value could be calculated. | |
208 | |
209 print("Running statistics for individual layer " + layer_url); | |
210 | |
211 // Download the layer data synchronously. | |
212 // See https://developer.mozilla.org/en-US/docs/Web/API/XMLHttpRequest/Synch | |
213 // ronous_and_Asynchronous_Requests | |
214 // A side effect of this is that we won't have more simultaneous downloads | |
215 // than workers, which is probably good. | |
216 // This holds the request. | |
217 var request = new XMLHttpRequest(); | |
218 // Get the layer data by GET. The false makes it synchronous. | |
219 request.open("GET", layer_url, false); | |
220 request.send(null); | |
221 | |
222 // Now we have the layer TSV | |
223 // But we don't have our fancy jQuery TSV parser. Parse it manually. | |
224 | |
225 // This holds the layer data (signature to float) | |
226 var layer_data = {} | |
227 | |
228 // This holds the array of lines | |
229 // Split on newlines (as seen in jQuery.tsv.js) | |
230 var lines = request.responseText.split(/\r?\n/); | |
231 | |
232 for(var i = 0; i < lines.length; i++) { | |
233 // This holds the parts of each line | |
234 var parts = lines[i].split(/\t/); | |
235 | |
236 if(parts[0]) { | |
237 // We actually have data | |
238 // Parse the layer value for this signature | |
239 var value = parseFloat(parts[1]); | |
240 | |
241 // Store the value in the layer data | |
242 layer_data[parts[0]] = value; | |
243 } | |
244 } | |
245 | |
246 // Run stats on the downloaded data | |
247 return statistics_for_layer(layer_data, in_list, out_list, all_list); | |
248 } | |
249 | |
250 function t_compare(layer_data, in_list, out_list, all_list) { | |
251 // Given the data of a continuous layer object (an object from signature | |
252 // name to float (or undefined)), and arrays of the names of "in" and "out" | |
253 // signatures, do a t test test for whether the in signatures differ from | |
254 // the out signatures. Returns an object of metadata, with "p_value" set to | |
255 // either the p value of the test (two-tailed), or NaN if the test cannot be | |
256 // performed (due to, e.g. fewer than 2 samples in one category). | |
257 | |
258 // Go through the in list and calculate all the summary statistics | |
259 // How many non-NaN values? | |
260 var number_in = 0; | |
261 // What is the sum? | |
262 var sum_in = 0; | |
263 | |
264 for(var i = 0; i < in_list.length; i++) { | |
265 if(!isNaN(layer_data[in_list[i]])) { | |
266 number_in++; | |
267 sum_in += layer_data[in_list[i]]; | |
268 } | |
269 } | |
270 | |
271 // We've done one pass, so we know if we have any in list data actually | |
272 if(number_in < 2) { | |
273 // Not enough to run the t test | |
274 return NaN; | |
275 } | |
276 | |
277 // What is the mean? | |
278 var mean_in = sum_in / number_in; | |
279 | |
280 // What is the second moment (sum of squares of differences from the mean) | |
281 var second_moment_in = 0; | |
282 for(var i = 0; i < in_list.length; i++) { | |
283 if(!isNaN(layer_data[in_list[i]])) { | |
284 second_moment_in += Math.pow(layer_data[in_list[i]] - mean_in, 2); | |
285 } | |
286 } | |
287 | |
288 // What is the unbiased variance? | |
289 unbiased_variance_in = second_moment_in / (number_in - 1); | |
290 | |
291 // Now go through the same process for the out list | |
292 // How many non-NaN values? | |
293 var number_out = 0; | |
294 // What is the sum? | |
295 var sum_out = 0; | |
296 | |
297 for(var i = 0; i < out_list.length; i++) { | |
298 if(!isNaN(layer_data[out_list[i]])) { | |
299 number_out++; | |
300 sum_out += layer_data[out_list[i]]; | |
301 } | |
302 } | |
303 | |
304 // We've done one pass, so we know if we have any out list data actually | |
305 if(number_out < 2) { | |
306 // Not enough to run the t test | |
307 return NaN; | |
308 } | |
309 | |
310 // What is the mean? | |
311 var mean_out = sum_out / number_out; | |
312 | |
313 // What is the second moment (sum of squares of differences from the mean) | |
314 var second_moment_out = 0; | |
315 for(var i = 0; i < out_list.length; i++) { | |
316 if(!isNaN(layer_data[out_list[i]])) { | |
317 second_moment_out += Math.pow(layer_data[out_list[i]] - mean_out, | |
318 2); | |
319 } | |
320 } | |
321 | |
322 // What is the unbiased variance? | |
323 unbiased_variance_out = second_moment_out / (number_out - 1); | |
324 | |
325 // We can't do the test if both variances are 0 | |
326 if(unbiased_variance_in == 0 && unbiased_variance_out == 0) { | |
327 return NaN; | |
328 } | |
329 | |
330 // Now we can calculate the t test two-tailed p value | |
331 var p_value = t_test(mean_in, unbiased_variance_in, number_in, mean_out, | |
332 unbiased_variance_out, number_out); | |
333 | |
334 // And return it in a dict with other metadata. | |
335 // We don't really have any other metadata. | |
336 return { | |
337 p_value: p_value | |
338 }; | |
339 } | |
340 | |
341 function t_test(mean_in, unbiased_variance_in, number_in, mean_out, | |
342 unbiased_variance_out, number_out) { | |
343 | |
344 // Given the mean, unbiased variance, and number of samples for both the in | |
345 // group and the out group, compute the p value for the t test with unequal | |
346 // sample sizes and unequal variances, testing to see whether the means | |
347 // differ (a two-tailed "Welch's" t test). See | |
348 // https://en.wikipedia.org/wiki/Student%27s_t-test | |
349 // Assumes we have enough samples to actually perform the test. | |
350 | |
351 // First, calculate the t statistic, which is where our observations fall on | |
352 // the t distribution. | |
353 var t_statistic = (mean_in - mean_out) / Math.sqrt((unbiased_variance_in / | |
354 number_in) + (unbiased_variance_out / number_out)); | |
355 | |
356 | |
357 // Calculate the degrees of freedom for the particular t distribution that | |
358 // we ought to compare the statistic against | |
359 var degrees_of_freedom = Math.pow((unbiased_variance_in / number_in) + | |
360 (unbiased_variance_out / number_out), 2) / | |
361 ((Math.pow(unbiased_variance_in / number_in, 2) / (number_in - 1)) + | |
362 (Math.pow(unbiased_variance_out / number_out, 2) / (number_out - 1))); | |
363 | |
364 // Now we have to compare the t statistic to the t test CDF available via | |
365 // the totally undocumented jstat.pt = function(q, df, ncp, lower_tail, log) | |
366 // where: | |
367 // q is the t statistic value to calculate the cdf at | |
368 // df is the degrees of freedom | |
369 // ncp is the "mu" parameter for the t distributiuon. I think this sets the | |
370 // mean, and it's OK to leave blank. | |
371 // lower_tail presumably specifies if we want the lower or upper tail of the | |
372 // CDF. Defaults to true. | |
373 // Log specifies if we want the log probability. Defaults to false. | |
374 | |
375 // Make the t statistic be on the low side of the distribution, and | |
376 // calculate the lower tail's area using the CDF. | |
377 var one_tail_probability = jstat.pt(0 - Math.abs(t_statistic), | |
378 degrees_of_freedom); | |
379 | |
380 // Return the two-tailed p value, which, since the t distribution is | |
381 // symmetric, is just twice the single-tail probability | |
382 return 2 * one_tail_probability; | |
383 | |
384 } | |
385 | |
386 function binomial_compare(layer_data, in_list, out_list, all_list) { | |
387 // Given the data of a binary layer object (an object from signature name to | |
388 // 0 or 1 (or undefined)), and arrays of the names of "in" and "out" | |
389 // signatures, do a binomial test for whether the in signatures differ from | |
390 // the out signatures. Uses a number of pseudocount trials as specified in | |
391 // the global constant BINOMIAL_PSEUDOCOUNTS Returns an object of metadata, | |
392 // with "p_value" set to either the p value of the test (two-tailed), or NaN | |
393 // if the test cannot be performed. all_list specifies the names of all | |
394 // signatures that figure into the analysis at all (i.e. those which the | |
395 // user hasn't filtered out), which we use when calculating how many of our | |
396 // pseudocounts should be successes. Signature names appearing in all_list | |
397 // but with no data in layer_data are not counted. | |
398 | |
399 | |
400 // Work out the distribution from the out list | |
401 // How many out signatures are 1? | |
402 var outside_yes = 0; | |
403 // And are 0? | |
404 var outside_no = 0; | |
405 | |
406 for(var i = 0; i < out_list.length; i++) { | |
407 if(layer_data[out_list[i]] === 1) { | |
408 // This is a yes and it's outside. | |
409 outside_yes++; | |
410 } else if(layer_data[out_list[i]] === 0) { | |
411 // A no and outside | |
412 outside_no++; | |
413 } | |
414 } | |
415 | |
416 // It's OK for all the outside hexes to be 0 now. Pseudocounts can give us a | |
417 // p value. | |
418 | |
419 // Now work out our pseudocounts. | |
420 // How many signatures in all_list are successes? | |
421 var all_yes = 0; | |
422 // And how many are failures (as opposed to undef) | |
423 var all_no = 0; | |
424 | |
425 for(var i = 0; i < all_list.length; i++) { | |
426 if(layer_data[all_list[i]] === 1) { | |
427 // A yes anywhere | |
428 all_yes++; | |
429 } else if(layer_data[all_list[i]] === 0) { | |
430 // A real no (not a no-data) anywhere | |
431 all_no++; | |
432 } | |
433 } | |
434 | |
435 // It't not OK for there to be no hexes in the all set. Maybe they filtered | |
436 // out all the ones with any data? | |
437 if(all_yes + all_no == 0) { | |
438 // TODO: Sure wish we had layer names here. | |
439 print("No signatures were available with data for this layer."); | |
440 return NaN; | |
441 } | |
442 | |
443 // Calculate how many pseudo-yeses we should have. | |
444 // Match the frequency in all signatures. | |
445 var pseudo_yes = BINOMIAL_PSEUDOCOUNTS * (all_yes / (all_yes + all_no)); | |
446 | |
447 // pseudo-trials is just BINOMIAL_PSEUDOCOUNTS | |
448 | |
449 // This holds the probability of being a 1 for the out list. | |
450 // We want to test if the in list differs significantly from this. | |
451 var background_probability = (outside_yes + pseudo_yes) / (outside_yes + | |
452 outside_no + BINOMIAL_PSEUDOCOUNTS); | |
453 | |
454 if(background_probability == 0) { | |
455 // Can't do the binomial test in this case. Somehow there were no yeses | |
456 // anywhere. | |
457 return NaN; | |
458 } | |
459 | |
460 // How many 1s are in the in list? | |
461 var inside_yes = 0; | |
462 // And how many 0s? | |
463 var inside_no = 0; | |
464 | |
465 for(var i = 0; i < in_list.length; i++) { | |
466 if(layer_data[in_list[i]] === 1) { | |
467 // This is a yes and it's inside. | |
468 inside_yes++; | |
469 } else if(layer_data[in_list[i]] === 0) { | |
470 // A no and it's inside | |
471 inside_no++; | |
472 } | |
473 } | |
474 | |
475 // Return the p value for rejecting the null hypothesis that the in | |
476 // signatures follow the background distribution. | |
477 var p = binomial_test(inside_yes + inside_no, inside_yes, | |
478 background_probability); | |
479 | |
480 if(LOG_SUSPICIOUS && (p == 0 || p == 1)) { | |
481 // We got an odd p value. Complain about it. | |
482 print("Got suspicious p value " + p); | |
483 print("Was binomial test for " + inside_yes + " successes in " + | |
484 (inside_yes + inside_no) + " trials at probability " + | |
485 background_probability); | |
486 print("Background was " + outside_yes + " out of " + (outside_yes + | |
487 outside_no) + " with " + pseudo_yes + " out of " + | |
488 BINOMIAL_PSEUDOCOUNTS + " pseudocounts."); | |
489 } | |
490 | |
491 // Return our p value as "p_value", and also how many non-pseudocount | |
492 // successes were in the in_list and the out_list. | |
493 return { | |
494 p_value: p, | |
495 inside_yes: inside_yes, | |
496 outside_yes: outside_yes | |
497 }; | |
498 } | |
499 | |
500 function binomial_test(trials, successes, success_probability) { | |
501 if(trials < successes) { | |
502 print("Trying to test " + trials + " trials with " + successes + | |
503 " successes!"); | |
504 } | |
505 | |
506 // Return the p value for rejecting the null hypothesis that the observed | |
507 // number of successes happened in the observed number of trials when the | |
508 // probability of success was success_probability. Does a Binomial | |
509 // test. | |
510 | |
511 // Calculate the P value | |
512 // This must be terribly complicated since nobody seems to have written up | |
513 // how to do it as anything other than an arcane stats ritual. | |
514 // Something close: http://www.johnmyleswhite.com/notebook/2012/04/14/implem | |
515 // enting-the-exact-binomial-test-in-julia/ | |
516 // How scipy.stats does it (x = successes, n = trials, p = supposed | |
517 // probability): | |
518 // SourceForge says Scipy is BSD licensed, so we can steal this code for our | |
519 // comments. | |
520 /* | |
521 d = distributions.binom.pmf(x,n,p) | |
522 rerr = 1+1e-7 | |
523 if (x < p*n): | |
524 i = np.arange(np.ceil(p*n),n+1) | |
525 y = np.sum(distributions.binom.pmf(i,n,p) <= d*rerr,axis=0) | |
526 pval = distributions.binom.cdf(x,n,p) + distributions.binom.sf(n-y, | |
527 n,p) | |
528 else: | |
529 i = np.arange(np.floor(p*n)) | |
530 y = np.sum(distributions.binom.pmf(i,n,p) <= d*rerr,axis=0) | |
531 pval = distributions.binom.cdf(y-1,n,p) + distributions.binom.sf( | |
532 x-1,n,p) | |
533 */ | |
534 // There is of course no justification for why this would work. | |
535 // What it's actually doing is a complicated Numpy vectorized operation to | |
536 // find the boundary of the tail we don't have, and then adding the CDF of | |
537 // the lower tail boundary and (1-CDF) of the upper tail boundary (which is | |
538 // the P value by definition). | |
539 | |
540 // This holds the probability of exactly what we've observed under the null | |
541 // hypothesis. | |
542 var observed_probability = binomial_pmf(trials, successes, | |
543 success_probability); | |
544 | |
545 if(successes < trials * success_probability) { | |
546 // We know anything with fewer successes than this is more extreme. But | |
547 // how many successes would we need to have an equally extreme but | |
548 // higher than expected number of successes? | |
549 // We should sum down from all successes. (We'll sum from small to large | |
550 // so it's OK numerically.) | |
551 | |
552 // This holds the total probability of everything more extremely | |
553 // successful than what we've observed. | |
554 var other_tail_total_probability = 0; | |
555 | |
556 // TODO: implement some better sort of search thing and use CDF | |
557 for(var other_tail_start = trials; other_tail_start >= | |
558 Math.ceil(trials * success_probability); other_tail_start--) { | |
559 | |
560 // Get the probability for this particular case | |
561 var case_probability = binomial_pmf(trials, other_tail_start, | |
562 success_probability); | |
563 | |
564 if(case_probability > observed_probability) { | |
565 // This case is actually less extreme than what we've observed, | |
566 // so our summation is complete. | |
567 | |
568 break; | |
569 } else { | |
570 // This case is more extreme than what we've observed, so use it | |
571 other_tail_total_probability += case_probability; | |
572 } | |
573 } | |
574 | |
575 // This holds the probability in this tail | |
576 var this_tail_probability = binomial_cdf(trials, successes, | |
577 success_probability) | |
578 | |
579 | |
580 // Return the total probability from both tails, clamped to 1. | |
581 return Math.min(this_tail_probability + other_tail_total_probability, | |
582 1.0); | |
583 } else { | |
584 // We know anything with more successes than this is more extreme. But | |
585 // how few successes would we need to have an equally extreme but lower | |
586 // than expected number of successes? | |
587 // We will sum up from 0 successes. We really ought to use the CDF | |
588 // somehow, but I can't think of how we would do it. | |
589 | |
590 // This holds the total probability of everything more extremely | |
591 // failureful than what we've observed. | |
592 var other_tail_total_probability = 0; | |
593 | |
594 for(var other_tail_end = 0; other_tail_end < | |
595 Math.floor(trials * success_probability); other_tail_end++) { | |
596 // We only have to iterate up to the peak (most likely) value. | |
597 | |
598 // Get the probability for this particular case | |
599 var case_probability = binomial_pmf(trials, other_tail_end, | |
600 success_probability); | |
601 | |
602 if(case_probability > observed_probability) { | |
603 // This case is actually less extreme than what we've observed, | |
604 // so our summation is complete. | |
605 break; | |
606 } else { | |
607 // This case is more extreme than what we've observed, so use it | |
608 other_tail_total_probability += case_probability; | |
609 } | |
610 | |
611 } | |
612 | |
613 // This holds the probability in this tail. It is equal to the | |
614 // probability up to, but not including, where this tail starts. So even | |
615 // if the tail starts at the highest possible number of successes, it | |
616 // has some probability. successes can't be 0 here (since then we'd be | |
617 // below any nonzero expected probability and take the other branch. | |
618 // Since it's a positive integer, it must be 1 or more, so we can | |
619 // subtract 1 safely. | |
620 var this_tail_probability = 1 - binomial_cdf(trials, successes - 1, | |
621 success_probability); | |
622 | |
623 // Return the total probability from both tails, clamped to 1 | |
624 return Math.min(this_tail_probability + other_tail_total_probability, | |
625 1.0); | |
626 } | |
627 | |
628 | |
629 } | |
630 | |
631 function binomial_cdf(trials, successes, success_probability) { | |
632 // The Binomial distribution's cumulative distribution function. Given a | |
633 // number of trials, a number of successes, and a success probability, | |
634 // return the probability of having observed that many successes or fewer. | |
635 | |
636 // We compute this efficiently using the "regularized incomplete beta | |
637 // function", AKA the beta distribution cdf, which we get from jstat. | |
638 // See http://en.wikipedia.org/wiki/Binomial_distribution#Cumulative_distrib | |
639 // ution_function and http://en.wikipedia.org/wiki/Regularized_incomplete_be | |
640 // ta_function#Incomplete_beta_function | |
641 | |
642 if(trials == successes) { | |
643 // jStat doesn't want a 0 alpha for its beta distribution (no failures) | |
644 // Calculate this one by hand (it's easy) | |
645 return 1; | |
646 } | |
647 | |
648 if(trials < successes) { | |
649 // This should never happen. TODO: Debug when it happens. | |
650 print("Error: trials (" + trials + ") < successes (" + successes + | |
651 ")!"); | |
652 return NaN; | |
653 } | |
654 | |
655 // This is the observation that we want the beta distribution CDF before | |
656 var beta_observation = 1 - success_probability; | |
657 | |
658 // These are the parameters of the relavent beta distribution | |
659 var beta_alpha = trials - successes; | |
660 var beta_beta = successes + 1; | |
661 | |
662 // Return the beta distribution CDF value, which happens to also be our CDF. | |
663 return jstat.pbeta(beta_observation, beta_alpha, beta_beta); | |
664 } | |
665 | |
666 function binomial_pmf(trials, successes, success_probability) { | |
667 // The Binomial distribution's probability mass function. Given a number of | |
668 // trials, a number of successes, and the probability of success on each | |
669 // trial, calculate the probability of observing that many successes in that | |
670 // many trials with the given success rate. | |
671 | |
672 // The probability of this many successes in this many trials at this | |
673 // success rate is the probability of succeeding so many times and failing | |
674 // so many times, summed over all the mutually exclusive arrangements of | |
675 // successes and failures. | |
676 return (choose(trials, successes) * | |
677 Math.pow(success_probability, successes) * | |
678 Math.pow(1 - success_probability, trials - successes)); | |
679 | |
680 } | |
681 | |
682 function choose(available, selected) { | |
683 // The choose function: from available distinct objects, how many ways are | |
684 // there to select selected of them. Returns "available choose selected". | |
685 // Works with large input numbers that are too big to take the factorials | |
686 // of. | |
687 | |
688 // We use a neat overflow-robust algorithm that eliminates the factorials | |
689 // and makes the computation a multiplication of numbers greater than one. | |
690 // So, no overflow unless the result itself is too big. | |
691 // See http://arantxa.ii.uam.es/~ssantini/writing/notes/s667_binomial.pdf | |
692 | |
693 if(selected < available - selected) { | |
694 // It would be faster to think about choosing what we don't include. So | |
695 // do that instead. | |
696 return choose(available, available - selected); | |
697 } | |
698 | |
699 // This holds the result we are accumulating. Initialize to the | |
700 // multiplicative identity. | |
701 var result = 1; | |
702 | |
703 for(var i = 1; i < available - selected + 1; i++) { | |
704 result *= (1 + (selected / i)); | |
705 } | |
706 | |
707 // TODO: The result ought always to be an integer. Ensure this. | |
708 return result; | |
709 } |