annotate hexagram-6ae12361157c/hexagram/statistics.js~ @ 0:1407e3634bcf draft default tip

Uploaded r11 from test tool shed.
author adam-novak
date Tue, 22 Oct 2013 14:17:59 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
1 // statistics.js: Web Worker file to run statistical tests in the background.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
2
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
3 // Constants:
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
4 // How many pseudocount trials should we use for the binomial test?
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
5 var BINOMIAL_PSEUDOCOUNTS = 5;
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
6
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
7 // Should we log information about suspicious p values to the console for manual
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
8 // spot checking?
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
9 var LOG_SUSPICIOUS = false;
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
10
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
11 // Go get jStat. Hope it's happy in Worker-land.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
12 importScripts("jstat-1.0.0.js");
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
13
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
14 // Make a fake console to catch jstat warnings, so they don't crash the script.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
15 console = {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
16 warn: print
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
17 }
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
18
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
19 onmessage = function(message) {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
20 // Handle incoming messages from the page. Each message's data is an RPC
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
21 // request, with "name" set to a function name, "args" set to an array of
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
22 // arguments, and "id" set to an ID that should be returned with the return
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
23 // value in a reply message. If the function call fails, an error is sent
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
24 // back.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
25
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
26
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
27 try {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
28 // Go get the specified global function, and apply it on the given
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
29 // arguments. Use the global scope ("self") as its "this".
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
30 var return_value = self[message.data.name].apply(self,
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
31 message.data.args);
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
32
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
33 } catch(exception) {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
34
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
35 // Send the error back to the page instead of a return value.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
36 // Unfortunately, errors themselves can't be cloned, so we do all the
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
37 // message making here and send back a string.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
38
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
39 // First we build a string with all the parts of the error we can get.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
40 var error_message = "Error in web worker doing job " + message.data.id;
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
41 error_message += "\n";
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
42 error_message += exception.name + ": " + exception.message;
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
43 error_message += "\n";
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
44 error_message += "Full details:\n";
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
45 for(field in exception) {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
46 if(field == "name" || field == "message") {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
47 // Already got these.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
48 continue;
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
49 }
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
50
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
51 // Copy the field into the message as a string.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
52 error_message += field + ": " + exception[field] + "\n";
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
53 }
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
54 error_message += "Call: " + message.data.name + "(";
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
55 for(var i = 0; i < message.data.args.length; i++) {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
56 error_message += message.data.args[i];
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
57 if(i + 1 < message.data.args.length) {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
58 // Have an argument after this.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
59 error_message += ", ";
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
60 }
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
61 }
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
62 error_message += ")";
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
63
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
64 postMessage({
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
65 id: message.data.id,
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
66 error: error_message
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
67 });
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
68
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
69 return;
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
70 }
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
71
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
72
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
73 // Send the return value back with the id.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
74 postMessage({
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
75 id: message.data.id,
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
76 return_value: return_value
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
77 });
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
78 }
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
79
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
80 function print(message) {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
81 // Print a message to the console of the parent page.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
82 postMessage({
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
83 log: message
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
84 });
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
85 }
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
86
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
87 function statistics_for_matrix(matrix_url, in_list, out_list, all_list) {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
88 // Download the given score matrix, do stats between in_list and out_list
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
89 // for each layer in it, and return an object from layer name to p value.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
90 // all_list specifies the names of all signatures that figure into the
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
91 // analysis at all.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
92
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
93 // Download the matrix synchronously.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
94 // See https://developer.mozilla.org/en-US/docs/Web/API/XMLHttpRequest/Synch
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
95 // ronous_and_Asynchronous_Requests
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
96 // A side effect of this is that we won't have more simultaneous downloads
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
97 // than workers, which is probably good.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
98 // This holds the request.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
99 var request = new XMLHttpRequest();
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
100 // Get the layer data by GET. The false makes it synchronous.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
101 request.open("GET", matrix_url, false);
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
102 request.send(null);
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
103
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
104 // Now we have the layer TSV
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
105 // But we don't have our fancy jQuery TSV parser. Parse it manually.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
106
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
107 // This holds an object of layer data objects (from signature to float) by
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
108 // layer name.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
109 layers = {};
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
110
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
111 // This holds the array of lines
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
112 // Split on newlines (as seen in jQuery.tsv.js)
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
113 var lines = request.responseText.split(/\r?\n/);
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
114
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
115 // Line 0 gives all the layer names, but the first thing isn't a layer name
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
116 // (since it's above the signature column).
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
117 var layer_names = lines[0].split(/\t/);
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
118 for(var i = 1; i < layer_names.length; i++) {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
119 // Make sure we have an object for this layer
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
120 layers[layer_names[i]] = {};
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
121 }
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
122
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
123 // The rest give values per layer for the hex in column 1.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
124 for(var i = 1; i < lines.length; i++) {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
125 // This holds the parts of each line
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
126 var parts = lines[i].split(/\t/);
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
127
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
128 if(parts[0]) {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
129 // We actually have data
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
130
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
131 // Get the singature
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
132 var signature = parts[0];
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
133
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
134 for(var j = 1; j < parts.length; j++) {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
135 // Go through each non-signature entry and set the appropriate
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
136 // layer's value for this signature.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
137 layers[layer_names[j]][signature] = parseFloat(parts[j]);
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
138 }
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
139 }
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
140 }
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
141
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
142 // Now we've parsed the matrix.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
143 // Go do stats for each layer.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
144 // This holds our calculated p valued by layer name.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
145 var p_values = {};
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
146
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
147 print("Running statistics for (up to) " + layer_names.length +
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
148 " layers from matrix " + matrix_url);
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
149
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
150 for(var i = 1; i < layer_names.length; i++) {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
151 // Pass the layer data to the per-layer statistics, and get the p value
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
152 // back. It's probably easier to do this in this worker than to go
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
153 // invoke more workers.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
154 p_values[layer_names[i]] = statistics_for_layer(layers[layer_names[i]],
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
155 in_list, out_list, all_list);
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
156 }
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
157
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
158 // We've now calculated a p value for every layer in the matrix. Return the
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
159 // calculated p values labeled by layer.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
160 return p_values;
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
161
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
162 }
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
163
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
164 function statistics_for_layer(layer_data, in_list, out_list, all_list) {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
165 // Run the appropriate statistical test for the passed layer data, between
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
166 // the given in and out arrays of signatures. all_list specifies the names
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
167 // of all signatures that figure into the analysis at all. Return the p
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
168 // value for the layer, or NaN if no p value could be calculated.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
169
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
170 // This holds whether the layer is discrete
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
171 var is_discrete = true;
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
172
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
173 // This holds whether the layer is binary
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
174 var is_binary = true;
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
175
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
176 for(var signature in layer_data) {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
177 if(layer_data[signature] > 1 || layer_data[signature] < 0) {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
178 // Not a binary layer
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
179 is_binary = false;
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
180 }
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
181
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
182 if(layer_data[signature] % 1 !== 0) {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
183 // It's a float
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
184 is_binary = false;
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
185 is_discrete = false;
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
186 }
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
187 }
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
188
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
189 if(is_binary) {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
190 // This is a binary/dichotomous layer, so run a binomial test.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
191 return binomial_compare(layer_data, in_list, out_list, all_list);
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
192 } else if (is_discrete) {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
193 // This is a multinomial/categorical layer
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
194 // TODO: statistics for discrete non-binary layers
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
195 return NaN;
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
196 } else {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
197 // This is a continuous layer, so run a t test
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
198 return t_compare(layer_data, in_list, out_list, all_list);
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
199 }
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
200
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
201 }
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
202
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
203 function statistics_for_url(layer_url, in_list, out_list, all_list) {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
204 // Run the stats for the layer with the given url, between the given in and
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
205 // out arrays of signatures. all_list specifies the names of all signatures
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
206 // that figure into the analysis at all. Return the p value for the layer,
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
207 // or NaN if no p value could be calculated.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
208
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
209 print("Running statistics for individual layer " + layer_url);
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
210
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
211 // Download the layer data synchronously.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
212 // See https://developer.mozilla.org/en-US/docs/Web/API/XMLHttpRequest/Synch
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
213 // ronous_and_Asynchronous_Requests
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
214 // A side effect of this is that we won't have more simultaneous downloads
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
215 // than workers, which is probably good.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
216 // This holds the request.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
217 var request = new XMLHttpRequest();
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
218 // Get the layer data by GET. The false makes it synchronous.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
219 request.open("GET", layer_url, false);
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
220 request.send(null);
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
221
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
222 // Now we have the layer TSV
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
223 // But we don't have our fancy jQuery TSV parser. Parse it manually.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
224
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
225 // This holds the layer data (signature to float)
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
226 var layer_data = {}
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
227
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
228 // This holds the array of lines
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
229 // Split on newlines (as seen in jQuery.tsv.js)
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
230 var lines = request.responseText.split(/\r?\n/);
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
231
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
232 for(var i = 0; i < lines.length; i++) {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
233 // This holds the parts of each line
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
234 var parts = lines[i].split(/\t/);
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
235
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
236 if(parts[0]) {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
237 // We actually have data
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
238 // Parse the layer value for this signature
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
239 var value = parseFloat(parts[1]);
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
240
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
241 // Store the value in the layer data
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
242 layer_data[parts[0]] = value;
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
243 }
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
244 }
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
245
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
246 // Run stats on the downloaded data
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
247 return statistics_for_layer(layer_data, in_list, out_list, all_list);
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
248 }
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
249
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
250 function t_compare(layer_data, in_list, out_list, all_list) {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
251 // Given the data of a continuous layer object (an object from signature
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
252 // name to float (or undefined)), and arrays of the names of "in" and "out"
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
253 // signatures, do a t test test for whether the in signatures differ from
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
254 // the out signatures. Returns an object of metadata, with "p_value" set to
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
255 // either the p value of the test (two-tailed), or NaN if the test cannot be
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
256 // performed (due to, e.g. fewer than 2 samples in one category).
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
257
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
258 // Go through the in list and calculate all the summary statistics
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
259 // How many non-NaN values?
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
260 var number_in = 0;
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
261 // What is the sum?
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
262 var sum_in = 0;
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
263
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
264 for(var i = 0; i < in_list.length; i++) {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
265 if(!isNaN(layer_data[in_list[i]])) {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
266 number_in++;
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
267 sum_in += layer_data[in_list[i]];
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
268 }
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
269 }
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
270
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
271 // We've done one pass, so we know if we have any in list data actually
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
272 if(number_in < 2) {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
273 // Not enough to run the t test
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
274 return NaN;
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
275 }
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
276
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
277 // What is the mean?
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
278 var mean_in = sum_in / number_in;
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
279
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
280 // What is the second moment (sum of squares of differences from the mean)
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
281 var second_moment_in = 0;
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
282 for(var i = 0; i < in_list.length; i++) {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
283 if(!isNaN(layer_data[in_list[i]])) {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
284 second_moment_in += Math.pow(layer_data[in_list[i]] - mean_in, 2);
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
285 }
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
286 }
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
287
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
288 // What is the unbiased variance?
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
289 unbiased_variance_in = second_moment_in / (number_in - 1);
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
290
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
291 // Now go through the same process for the out list
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
292 // How many non-NaN values?
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
293 var number_out = 0;
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
294 // What is the sum?
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
295 var sum_out = 0;
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
296
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
297 for(var i = 0; i < out_list.length; i++) {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
298 if(!isNaN(layer_data[out_list[i]])) {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
299 number_out++;
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
300 sum_out += layer_data[out_list[i]];
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
301 }
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
302 }
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
303
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
304 // We've done one pass, so we know if we have any out list data actually
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
305 if(number_out < 2) {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
306 // Not enough to run the t test
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
307 return NaN;
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
308 }
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
309
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
310 // What is the mean?
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
311 var mean_out = sum_out / number_out;
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
312
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
313 // What is the second moment (sum of squares of differences from the mean)
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
314 var second_moment_out = 0;
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
315 for(var i = 0; i < out_list.length; i++) {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
316 if(!isNaN(layer_data[out_list[i]])) {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
317 second_moment_out += Math.pow(layer_data[out_list[i]] - mean_out,
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
318 2);
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
319 }
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
320 }
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
321
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
322 // What is the unbiased variance?
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
323 unbiased_variance_out = second_moment_out / (number_out - 1);
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
324
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
325 // We can't do the test if both variances are 0
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
326 if(unbiased_variance_in == 0 && unbiased_variance_out == 0) {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
327 return NaN;
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
328 }
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
329
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
330 // Now we can calculate the t test two-tailed p value
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
331 var p_value = t_test(mean_in, unbiased_variance_in, number_in, mean_out,
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
332 unbiased_variance_out, number_out);
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
333
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
334 // And return it in a dict with other metadata.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
335 // We don't really have any other metadata.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
336 return {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
337 p_value: p_value
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
338 };
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
339 }
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
340
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
341 function t_test(mean_in, unbiased_variance_in, number_in, mean_out,
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
342 unbiased_variance_out, number_out) {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
343
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
344 // Given the mean, unbiased variance, and number of samples for both the in
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
345 // group and the out group, compute the p value for the t test with unequal
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
346 // sample sizes and unequal variances, testing to see whether the means
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
347 // differ (a two-tailed "Welch's" t test). See
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
348 // https://en.wikipedia.org/wiki/Student%27s_t-test
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
349 // Assumes we have enough samples to actually perform the test.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
350
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
351 // First, calculate the t statistic, which is where our observations fall on
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
352 // the t distribution.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
353 var t_statistic = (mean_in - mean_out) / Math.sqrt((unbiased_variance_in /
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
354 number_in) + (unbiased_variance_out / number_out));
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
355
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
356
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
357 // Calculate the degrees of freedom for the particular t distribution that
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
358 // we ought to compare the statistic against
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
359 var degrees_of_freedom = Math.pow((unbiased_variance_in / number_in) +
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
360 (unbiased_variance_out / number_out), 2) /
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
361 ((Math.pow(unbiased_variance_in / number_in, 2) / (number_in - 1)) +
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
362 (Math.pow(unbiased_variance_out / number_out, 2) / (number_out - 1)));
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
363
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
364 // Now we have to compare the t statistic to the t test CDF available via
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
365 // the totally undocumented jstat.pt = function(q, df, ncp, lower_tail, log)
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
366 // where:
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
367 // q is the t statistic value to calculate the cdf at
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
368 // df is the degrees of freedom
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
369 // ncp is the "mu" parameter for the t distributiuon. I think this sets the
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
370 // mean, and it's OK to leave blank.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
371 // lower_tail presumably specifies if we want the lower or upper tail of the
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
372 // CDF. Defaults to true.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
373 // Log specifies if we want the log probability. Defaults to false.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
374
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
375 // Make the t statistic be on the low side of the distribution, and
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
376 // calculate the lower tail's area using the CDF.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
377 var one_tail_probability = jstat.pt(0 - Math.abs(t_statistic),
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
378 degrees_of_freedom);
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
379
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
380 // Return the two-tailed p value, which, since the t distribution is
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
381 // symmetric, is just twice the single-tail probability
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
382 return 2 * one_tail_probability;
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
383
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
384 }
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
385
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
386 function binomial_compare(layer_data, in_list, out_list, all_list) {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
387 // Given the data of a binary layer object (an object from signature name to
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
388 // 0 or 1 (or undefined)), and arrays of the names of "in" and "out"
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
389 // signatures, do a binomial test for whether the in signatures differ from
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
390 // the out signatures. Uses a number of pseudocount trials as specified in
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
391 // the global constant BINOMIAL_PSEUDOCOUNTS Returns an object of metadata,
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
392 // with "p_value" set to either the p value of the test (two-tailed), or NaN
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
393 // if the test cannot be performed. all_list specifies the names of all
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
394 // signatures that figure into the analysis at all (i.e. those which the
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
395 // user hasn't filtered out), which we use when calculating how many of our
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
396 // pseudocounts should be successes. Signature names appearing in all_list
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
397 // but with no data in layer_data are not counted.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
398
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
399
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
400 // Work out the distribution from the out list
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
401 // How many out signatures are 1?
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
402 var outside_yes = 0;
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
403 // And are 0?
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
404 var outside_no = 0;
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
405
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
406 for(var i = 0; i < out_list.length; i++) {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
407 if(layer_data[out_list[i]] === 1) {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
408 // This is a yes and it's outside.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
409 outside_yes++;
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
410 } else if(layer_data[out_list[i]] === 0) {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
411 // A no and outside
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
412 outside_no++;
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
413 }
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
414 }
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
415
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
416 // It's OK for all the outside hexes to be 0 now. Pseudocounts can give us a
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
417 // p value.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
418
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
419 // Now work out our pseudocounts.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
420 // How many signatures in all_list are successes?
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
421 var all_yes = 0;
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
422 // And how many are failures (as opposed to undef)
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
423 var all_no = 0;
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
424
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
425 for(var i = 0; i < all_list.length; i++) {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
426 if(layer_data[all_list[i]] === 1) {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
427 // A yes anywhere
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
428 all_yes++;
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
429 } else if(layer_data[all_list[i]] === 0) {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
430 // A real no (not a no-data) anywhere
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
431 all_no++;
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
432 }
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
433 }
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
434
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
435 // It't not OK for there to be no hexes in the all set. Maybe they filtered
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
436 // out all the ones with any data?
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
437 if(all_yes + all_no == 0) {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
438 // TODO: Sure wish we had layer names here.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
439 print("No signatures were available with data for this layer.");
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
440 return NaN;
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
441 }
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
442
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
443 // Calculate how many pseudo-yeses we should have.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
444 // Match the frequency in all signatures.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
445 var pseudo_yes = BINOMIAL_PSEUDOCOUNTS * (all_yes / (all_yes + all_no));
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
446
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
447 // pseudo-trials is just BINOMIAL_PSEUDOCOUNTS
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
448
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
449 // This holds the probability of being a 1 for the out list.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
450 // We want to test if the in list differs significantly from this.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
451 var background_probability = (outside_yes + pseudo_yes) / (outside_yes +
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
452 outside_no + BINOMIAL_PSEUDOCOUNTS);
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
453
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
454 if(background_probability == 0) {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
455 // Can't do the binomial test in this case. Somehow there were no yeses
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
456 // anywhere.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
457 return NaN;
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
458 }
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
459
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
460 // How many 1s are in the in list?
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
461 var inside_yes = 0;
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
462 // And how many 0s?
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
463 var inside_no = 0;
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
464
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
465 for(var i = 0; i < in_list.length; i++) {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
466 if(layer_data[in_list[i]] === 1) {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
467 // This is a yes and it's inside.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
468 inside_yes++;
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
469 } else if(layer_data[in_list[i]] === 0) {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
470 // A no and it's inside
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
471 inside_no++;
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
472 }
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
473 }
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
474
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
475 // Return the p value for rejecting the null hypothesis that the in
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
476 // signatures follow the background distribution.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
477 var p = binomial_test(inside_yes + inside_no, inside_yes,
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
478 background_probability);
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
479
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
480 if(LOG_SUSPICIOUS && (p == 0 || p == 1)) {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
481 // We got an odd p value. Complain about it.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
482 print("Got suspicious p value " + p);
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
483 print("Was binomial test for " + inside_yes + " successes in " +
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
484 (inside_yes + inside_no) + " trials at probability " +
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
485 background_probability);
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
486 print("Background was " + outside_yes + " out of " + (outside_yes +
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
487 outside_no) + " with " + pseudo_yes + " out of " +
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
488 BINOMIAL_PSEUDOCOUNTS + " pseudocounts.");
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
489 }
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
490
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
491 // Return our p value as "p_value", and also how many non-pseudocount
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
492 // successes were in the in_list and the out_list.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
493 return {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
494 p_value: p,
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
495 "1s in A": inside_yes,
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
496 "1s in background": outside_yes
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
497 };
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
498 }
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
499
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
500 function binomial_test(trials, successes, success_probability) {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
501 if(trials < successes) {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
502 print("Trying to test " + trials + " trials with " + successes +
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
503 " successes!");
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
504 }
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
505
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
506 // Return the p value for rejecting the null hypothesis that the observed
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
507 // number of successes happened in the observed number of trials when the
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
508 // probability of success was success_probability. Does a Binomial
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
509 // test.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
510
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
511 // Calculate the P value
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
512 // This must be terribly complicated since nobody seems to have written up
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
513 // how to do it as anything other than an arcane stats ritual.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
514 // Something close: http://www.johnmyleswhite.com/notebook/2012/04/14/implem
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
515 // enting-the-exact-binomial-test-in-julia/
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
516 // How scipy.stats does it (x = successes, n = trials, p = supposed
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
517 // probability):
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
518 // SourceForge says Scipy is BSD licensed, so we can steal this code for our
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
519 // comments.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
520 /*
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
521 d = distributions.binom.pmf(x,n,p)
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
522 rerr = 1+1e-7
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
523 if (x < p*n):
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
524 i = np.arange(np.ceil(p*n),n+1)
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
525 y = np.sum(distributions.binom.pmf(i,n,p) <= d*rerr,axis=0)
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
526 pval = distributions.binom.cdf(x,n,p) + distributions.binom.sf(n-y,
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
527 n,p)
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
528 else:
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
529 i = np.arange(np.floor(p*n))
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
530 y = np.sum(distributions.binom.pmf(i,n,p) <= d*rerr,axis=0)
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
531 pval = distributions.binom.cdf(y-1,n,p) + distributions.binom.sf(
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
532 x-1,n,p)
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
533 */
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
534 // There is of course no justification for why this would work.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
535 // What it's actually doing is a complicated Numpy vectorized operation to
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
536 // find the boundary of the tail we don't have, and then adding the CDF of
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
537 // the lower tail boundary and (1-CDF) of the upper tail boundary (which is
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
538 // the P value by definition).
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
539
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
540 // This holds the probability of exactly what we've observed under the null
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
541 // hypothesis.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
542 var observed_probability = binomial_pmf(trials, successes,
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
543 success_probability);
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
544
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
545 if(successes < trials * success_probability) {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
546 // We know anything with fewer successes than this is more extreme. But
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
547 // how many successes would we need to have an equally extreme but
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
548 // higher than expected number of successes?
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
549 // We should sum down from all successes. (We'll sum from small to large
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
550 // so it's OK numerically.)
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
551
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
552 // This holds the total probability of everything more extremely
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
553 // successful than what we've observed.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
554 var other_tail_total_probability = 0;
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
555
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
556 // TODO: implement some better sort of search thing and use CDF
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
557 for(var other_tail_start = trials; other_tail_start >=
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
558 Math.ceil(trials * success_probability); other_tail_start--) {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
559
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
560 // Get the probability for this particular case
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
561 var case_probability = binomial_pmf(trials, other_tail_start,
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
562 success_probability);
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
563
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
564 if(case_probability > observed_probability) {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
565 // This case is actually less extreme than what we've observed,
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
566 // so our summation is complete.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
567
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
568 break;
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
569 } else {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
570 // This case is more extreme than what we've observed, so use it
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
571 other_tail_total_probability += case_probability;
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
572 }
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
573 }
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
574
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
575 // This holds the probability in this tail
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
576 var this_tail_probability = binomial_cdf(trials, successes,
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
577 success_probability)
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
578
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
579
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
580 // Return the total probability from both tails, clamped to 1.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
581 return Math.min(this_tail_probability + other_tail_total_probability,
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
582 1.0);
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
583 } else {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
584 // We know anything with more successes than this is more extreme. But
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
585 // how few successes would we need to have an equally extreme but lower
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
586 // than expected number of successes?
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
587 // We will sum up from 0 successes. We really ought to use the CDF
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
588 // somehow, but I can't think of how we would do it.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
589
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
590 // This holds the total probability of everything more extremely
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
591 // failureful than what we've observed.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
592 var other_tail_total_probability = 0;
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
593
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
594 for(var other_tail_end = 0; other_tail_end <
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
595 Math.floor(trials * success_probability); other_tail_end++) {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
596 // We only have to iterate up to the peak (most likely) value.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
597
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
598 // Get the probability for this particular case
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
599 var case_probability = binomial_pmf(trials, other_tail_end,
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
600 success_probability);
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
601
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
602 if(case_probability > observed_probability) {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
603 // This case is actually less extreme than what we've observed,
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
604 // so our summation is complete.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
605 break;
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
606 } else {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
607 // This case is more extreme than what we've observed, so use it
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
608 other_tail_total_probability += case_probability;
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
609 }
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
610
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
611 }
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
612
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
613 // This holds the probability in this tail. It is equal to the
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
614 // probability up to, but not including, where this tail starts. So even
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
615 // if the tail starts at the highest possible number of successes, it
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
616 // has some probability. successes can't be 0 here (since then we'd be
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
617 // below any nonzero expected probability and take the other branch.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
618 // Since it's a positive integer, it must be 1 or more, so we can
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
619 // subtract 1 safely.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
620 var this_tail_probability = 1 - binomial_cdf(trials, successes - 1,
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
621 success_probability);
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
622
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
623 // Return the total probability from both tails, clamped to 1
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
624 return Math.min(this_tail_probability + other_tail_total_probability,
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
625 1.0);
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
626 }
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
627
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
628
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
629 }
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
630
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
631 function binomial_cdf(trials, successes, success_probability) {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
632 // The Binomial distribution's cumulative distribution function. Given a
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
633 // number of trials, a number of successes, and a success probability,
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
634 // return the probability of having observed that many successes or fewer.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
635
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
636 // We compute this efficiently using the "regularized incomplete beta
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
637 // function", AKA the beta distribution cdf, which we get from jstat.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
638 // See http://en.wikipedia.org/wiki/Binomial_distribution#Cumulative_distrib
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
639 // ution_function and http://en.wikipedia.org/wiki/Regularized_incomplete_be
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
640 // ta_function#Incomplete_beta_function
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
641
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
642 if(trials == successes) {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
643 // jStat doesn't want a 0 alpha for its beta distribution (no failures)
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
644 // Calculate this one by hand (it's easy)
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
645 return 1;
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
646 }
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
647
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
648 if(trials < successes) {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
649 // This should never happen. TODO: Debug when it happens.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
650 print("Error: trials (" + trials + ") < successes (" + successes +
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
651 ")!");
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
652 return NaN;
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
653 }
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
654
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
655 // This is the observation that we want the beta distribution CDF before
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
656 var beta_observation = 1 - success_probability;
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
657
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
658 // These are the parameters of the relavent beta distribution
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
659 var beta_alpha = trials - successes;
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
660 var beta_beta = successes + 1;
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
661
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
662 // Return the beta distribution CDF value, which happens to also be our CDF.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
663 return jstat.pbeta(beta_observation, beta_alpha, beta_beta);
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
664 }
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
665
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
666 function binomial_pmf(trials, successes, success_probability) {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
667 // The Binomial distribution's probability mass function. Given a number of
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
668 // trials, a number of successes, and the probability of success on each
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
669 // trial, calculate the probability of observing that many successes in that
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
670 // many trials with the given success rate.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
671
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
672 // The probability of this many successes in this many trials at this
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
673 // success rate is the probability of succeeding so many times and failing
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
674 // so many times, summed over all the mutually exclusive arrangements of
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
675 // successes and failures.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
676 return (choose(trials, successes) *
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
677 Math.pow(success_probability, successes) *
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
678 Math.pow(1 - success_probability, trials - successes));
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
679
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
680 }
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
681
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
682 function choose(available, selected) {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
683 // The choose function: from available distinct objects, how many ways are
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
684 // there to select selected of them. Returns "available choose selected".
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
685 // Works with large input numbers that are too big to take the factorials
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
686 // of.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
687
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
688 // We use a neat overflow-robust algorithm that eliminates the factorials
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
689 // and makes the computation a multiplication of numbers greater than one.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
690 // So, no overflow unless the result itself is too big.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
691 // See http://arantxa.ii.uam.es/~ssantini/writing/notes/s667_binomial.pdf
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
692
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
693 if(selected < available - selected) {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
694 // It would be faster to think about choosing what we don't include. So
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
695 // do that instead.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
696 return choose(available, available - selected);
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
697 }
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
698
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
699 // This holds the result we are accumulating. Initialize to the
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
700 // multiplicative identity.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
701 var result = 1;
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
702
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
703 for(var i = 1; i < available - selected + 1; i++) {
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
704 result *= (1 + (selected / i));
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
705 }
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
706
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
707 // TODO: The result ought always to be an integer. Ensure this.
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
708 return result;
1407e3634bcf Uploaded r11 from test tool shed.
adam-novak
parents:
diff changeset
709 }