tools/rgenetics/rgGRR.py @ changeset 0:9071e359b9a3 ("Uploaded")
author:   xuebing
date:     Fri, 09 Mar 2012 19:37:19 -0500
parents:  (none)
children: (none)
1 """ | |
2 # july 2009: Need to see outliers so need to draw them last? | |
3 # could use clustering on the zscores to guess real relationships for unrelateds | |
4 # but definitely need to draw last | |
5 # added MAX_SHOW_ROWS to limit the length of the main report page | |
6 # Changes for Galaxy integration | |
7 # added more robust knuth method for one pass mean and sd | |
8 # no difference really - let's use scipy.mean() and scipy.std() instead... | |
9 # fixed labels and changed to .xls for outlier reports so can open in excel | |
10 # interesting - with a few hundred subjects, 5k gives good resolution | |
11 # and 100k gives better but not by much | |
12 # TODO remove non autosomal markers | |
13 # TODO it would be best if label had the zmean and zsd as these are what matter for | |
14 # outliers rather than the group mean/sd | |
15 # mods to rgGRR.py from channing CVS which John Ziniti has rewritten to produce SVG plots | |
16 # to make a Galaxy tool - we need the table of mean and SD for interesting pairs, the SVG and the log | |
17 # so the result should be an HTML file | |
18 | |
19 # rgIBS.py | |
20 # use a random subset of markers for a quick ibs | |
21 # to identify sample dups and closely related subjects | |
22 # try snpMatrix and plink and see which one works best for us? | |
23 # abecasis grr plots mean*sd for every subject to show clusters | |
24 # mods june 23 rml to avoid non-autosomal markers | |
25 # we seem to be distinguishing parent-child by gender - 2 clouds! | |
26 | |
27 | |
28 snpMatrix from David Clayton has: | |
29 ibs.stats function to calculate the identity-by-state stats of a group of samples | |
30 Description | |
31 Given a snp.matrix-class or a X.snp.matrix-class object with N samples, calculates some statistics | |
32 about the relatedness of every pair of samples within. | |
33 | |
34 Usage | |
35 ibs.stats(x) | |
36 8 ibs.stats | |
37 Arguments | |
38 x a snp.matrix-class or a X.snp.matrix-class object containing N samples | |
39 Details | |
40 No-calls are excluded from consideration here. | |
41 Value | |
42 A data.frame containing N(N - 1)/2 rows, where the row names are the sample name pairs separated | |
43 by a comma, and the columns are: | |
44 Count count of identical calls, exclusing no-calls | |
45 Fraction fraction of identical calls comparied to actual calls being made in both samples | |
46 Warning | |
47 In some applications, it may be preferable to subset a (random) selection of SNPs first - the | |
48 calculation | |
49 time increases as N(N - 1)M/2 . Typically for N = 800 samples and M = 3000 SNPs, the | |
50 calculation time is about 1 minute. A full GWA scan could take hours, and quite unnecessary for | |
51 simple applications such as checking for duplicate or related samples. | |
52 Note | |
53 This is mostly written to find mislabelled and/or duplicate samples. | |
54 Illumina indexes their SNPs in alphabetical order so the mitochondria SNPs comes first - for most | |
55 purpose it is undesirable to use these SNPs for IBS purposes. | |
56 TODO: Worst-case S4 subsetting seems to make 2 copies of a large object, so one might want to | |
57 subset before rbind(), etc; a future version of this routine may contain a built-in subsetting facility | |
58 """ | |
import sys,os,time,random,string,copy,optparse
import math # needed by distance() below

try:
    set
except NameError:
    from sets import Set as set

from rgutils import timenow,plinke

import plinkbinJZ


opts = None
verbose = False

showPolygons = False

class NullDevice:
    def write(self, s):
        pass

tempstderr = sys.stderr # save
#sys.stderr = NullDevice()
# need to avoid blather about deprecation and other strange stuff from scipy
# the current galaxy job runner assumes that
# the job is in error if anything appears on sys.stderr
# grrrrr. James wants to keep it that way instead of using the
# status flag for some strange reason. Presumably he doesn't use R or (in this case, scipy)
import numpy
import scipy
from scipy import weave


sys.stderr=tempstderr


PROGNAME = os.path.split(sys.argv[0])[-1]
X_AXIS_LABEL = 'Mean Alleles Shared'
Y_AXIS_LABEL = 'SD Alleles Shared'
LEGEND_ALIGN = 'topleft'
LEGEND_TITLE = 'Relationship'
DEFAULT_SYMBOL_SIZE = 1.0 # default symbol size
DEFAULT_SYMBOL_SIZE = 0.5 # default symbol size

### Some colors for R/rpy
R_BLACK = 1
R_RED = 2
R_GREEN = 3
R_BLUE = 4
R_CYAN = 5
R_PURPLE = 6
R_YELLOW = 7
R_GRAY = 8

### ... and some point-styles

###
PLOT_HEIGHT = 600
PLOT_WIDTH = 1150


#SVG_COLORS = ('black', 'darkblue', 'blue', 'deepskyblue', 'firebrick','maroon','crimson')
#SVG_COLORS = ('cyan','dodgerblue','mediumpurple', 'fuchsia', 'red','gold','gray')
SVG_COLORS = ('cyan','dodgerblue','mediumpurple','forestgreen', 'lightgreen','gold','gray')
# dupe,parentchild,sibpair,halfsib,parents,unrel,unkn
#('orange', 'red', 'green', 'chartreuse', 'blue', 'purple', 'gray')

OUTLIERS_HEADER_list = ['Mean','Sdev','ZMean','ZSdev','FID1','IID1','FID2','IID2','RelMean_M','RelMean_SD','RelSD_M','RelSD_SD','PID1','MID1','PID2','MID2','Ped']
OUTLIERS_HEADER = '\t'.join(OUTLIERS_HEADER_list)
TABLE_HEADER='fid1_iid1\tfid2_iid2\tmean\tsdev\tzmean\tzsdev\tgeno\trelcode\tpid1\tmid1\tpid2\tmid2\n'
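# An example data row as written under TABLE_HEADER to the <title>_table.xls output
# (tab separated; the values here are illustrative only, not from any real run):
#   f37_1001    f99_2002    1.52    0.71    2.70    0.35    4632    unrelated    0    0    0    0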


### Relationship codes, text, and lookups/mappings
N_RELATIONSHIP_TYPES = 7
REL_DUPE, REL_PARENTCHILD, REL_SIBS, REL_HALFSIBS, REL_RELATED, REL_UNRELATED, REL_UNKNOWN = range(N_RELATIONSHIP_TYPES)
REL_LOOKUP = {
    REL_DUPE: ('dupe', R_BLUE, 1),
    REL_PARENTCHILD: ('parentchild', R_YELLOW, 1),
    REL_SIBS: ('sibpairs', R_RED, 1),
    REL_HALFSIBS: ('halfsibs', R_GREEN, 1),
    REL_RELATED: ('parents', R_PURPLE, 1),
    REL_UNRELATED: ('unrelated', R_CYAN, 1),
    REL_UNKNOWN: ('unknown', R_GRAY, 1),
    }
OUTLIER_STDEVS = {
    REL_DUPE: 2,
    REL_PARENTCHILD: 2,
    REL_SIBS: 2,
    REL_HALFSIBS: 2,
    REL_RELATED: 2,
    REL_UNRELATED: 3,
    REL_UNKNOWN: 2,
    }
# note now Z can be passed in

REL_STATES = [REL_LOOKUP[r][0] for r in range(N_RELATIONSHIP_TYPES)]
REL_COLORS = SVG_COLORS
REL_POINTS = [REL_LOOKUP[r][2] for r in range(N_RELATIONSHIP_TYPES)]

DEFAULT_MAX_SAMPLE_SIZE = 10000

REF_COUNT_HOM1 = 3
REF_COUNT_HET = 2
REF_COUNT_HOM2 = 1
MISSING = 0
MAX_SHOW_ROWS = 100 # framingham has millions - delays showing output page - so truncate and explain
MARKER_PAIRS_PER_SECOND_SLOW = 15000000.0
MARKER_PAIRS_PER_SECOND_FAST = 70000000.0


galhtmlprefix = """<?xml version="1.0" encoding="utf-8" ?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name="generator" content="Galaxy %s tool output - see http://g2.trac.bx.psu.edu/" />
<title></title>
<link rel="stylesheet" href="/static/style/base.css" type="text/css" />
</head>
<body>
<div class="document">
"""


SVG_HEADER = '''<?xml version="1.0" standalone="no"?>
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.2//EN" "http://www.w3.org/Graphics/SVG/1.2/DTD/svg12.dtd">

<svg width="1280" height="800"
xmlns="http://www.w3.org/2000/svg" version="1.2"
xmlns:xlink="http://www.w3.org/1999/xlink" viewBox="0 0 1280 800" onload="init()">

<script type="text/ecmascript" xlink:href="/static/scripts/checkbox_and_radiobutton.js"/>
<script type="text/ecmascript" xlink:href="/static/scripts/helper_functions.js"/>
<script type="text/ecmascript" xlink:href="/static/scripts/timer.js"/>
<script type="text/ecmascript">
<![CDATA[
var checkBoxes = new Array();
var radioGroupBandwidth;
var colours = ['%s','%s','%s','%s','%s','%s','%s'];
function init() {
    var style = {"font-family":"Arial,Helvetica", "fill":"black", "font-size":12};
    var dist = 12;
    var yOffset = 4;

    //A checkBox for each relationship type dupe,parentchild,sibpair,halfsib,parents,unrel,unkn
    checkBoxes["dupe"] = new checkBox("dupe","checkboxes",20,40,"cbRect","cbCross",true,"Duplicate",style,dist,yOffset,undefined,hideShowLayer);
    checkBoxes["parentchild"] = new checkBox("parentchild","checkboxes",20,60,"cbRect","cbCross",true,"Parent-Child",style,dist,yOffset,undefined,hideShowLayer);
    checkBoxes["sibpairs"] = new checkBox("sibpairs","checkboxes",20,80,"cbRect","cbCross",true,"Sib-pairs",style,dist,yOffset,undefined,hideShowLayer);
    checkBoxes["halfsibs"] = new checkBox("halfsibs","checkboxes",20,100,"cbRect","cbCross",true,"Half-sibs",style,dist,yOffset,undefined,hideShowLayer);
    checkBoxes["parents"] = new checkBox("parents","checkboxes",20,120,"cbRect","cbCross",true,"Parents",style,dist,yOffset,undefined,hideShowLayer);
    checkBoxes["unrelated"] = new checkBox("unrelated","checkboxes",20,140,"cbRect","cbCross",true,"Unrelated",style,dist,yOffset,undefined,hideShowLayer);
    checkBoxes["unknown"] = new checkBox("unknown","checkboxes",20,160,"cbRect","cbCross",true,"Unknown",style,dist,yOffset,undefined,hideShowLayer);

}

function hideShowLayer(id, status, label) {
    var vis = "hidden";
    if (status) {
        vis = "visible";
    }
    document.getElementById(id).setAttributeNS(null, 'visibility', vis);
}

function showBTT(evt, rel, mm, dm, md, dd, n, mg, dg, lg, hg) {
    var x = parseInt(evt.pageX)-250;
    var y = parseInt(evt.pageY)-110;
    switch(rel) {
        case 0:
            fill = colours[rel];
            relt = "dupe";
            break;
        case 1:
            fill = colours[rel];
            relt = "parentchild";
            break;
        case 2:
            fill = colours[rel];
            relt = "sibpairs";
            break;
        case 3:
            fill = colours[rel];
            relt = "halfsibs";
            break;
        case 4:
            fill = colours[rel];
            relt = "parents";
            break;
        case 5:
            fill = colours[rel];
            relt = "unrelated";
            break;
        case 6:
            fill = colours[rel];
            relt = "unknown";
            break;
        default:
            fill = "cyan";
            relt = "ERROR_CODE: "+rel;
    }

    document.getElementById("btRel").textContent = "GROUP: "+relt;
    document.getElementById("btMean").textContent = "mean="+mm+" +/- "+dm;
    document.getElementById("btSdev").textContent = "sdev="+md+" +/- "+dd;
    document.getElementById("btPair").textContent = "npairs="+n;
    document.getElementById("btGeno").textContent = "ngenos="+mg+" +/- "+dg+" (min="+lg+", max="+hg+")";
    document.getElementById("btHead").setAttribute('fill', fill);

    var tt = document.getElementById("btTip");
    tt.setAttribute("transform", "translate("+x+","+y+")");
    tt.setAttribute('visibility', 'visible');
}

function showOTT(evt, rel, s1, s2, mean, sdev, ngeno, rmean, rsdev) {
    var x = parseInt(evt.pageX)-150;
    var y = parseInt(evt.pageY)-180;

    switch(rel) {
        case 0:
            fill = colours[rel];
            relt = "dupe";
            break;
        case 1:
            fill = colours[rel];
            relt = "parentchild";
            break;
        case 2:
            fill = colours[rel];
            relt = "sibpairs";
            break;
        case 3:
            fill = colours[rel];
            relt = "halfsibs";
            break;
        case 4:
            fill = colours[rel];
            relt = "parents";
            break;
        case 5:
            fill = colours[rel];
            relt = "unrelated";
            break;
        case 6:
            fill = colours[rel];
            relt = "unknown";
            break;
        default:
            fill = "cyan";
            relt = "ERROR_CODE: "+rel;
    }

    document.getElementById("otRel").textContent = "PAIR: "+relt;
    document.getElementById("otS1").textContent = "s1="+s1;
    document.getElementById("otS2").textContent = "s2="+s2;
    document.getElementById("otMean").textContent = "mean="+mean;
    document.getElementById("otSdev").textContent = "sdev="+sdev;
    document.getElementById("otGeno").textContent = "ngenos="+ngeno;
    document.getElementById("otRmean").textContent = "relmean="+rmean;
    document.getElementById("otRsdev").textContent = "relsdev="+rsdev;
    document.getElementById("otHead").setAttribute('fill', fill);

    var tt = document.getElementById("otTip");
    tt.setAttribute("transform", "translate("+x+","+y+")");
    tt.setAttribute('visibility', 'visible');
}

function hideBTT(evt) {
    document.getElementById("btTip").setAttributeNS(null, 'visibility', 'hidden');
}

function hideOTT(evt) {
    document.getElementById("otTip").setAttributeNS(null, 'visibility', 'hidden');
}

]]>
</script>
<defs>
<!-- symbols for check boxes -->
<symbol id="cbRect" overflow="visible">
<rect x="-5" y="-5" width="10" height="10" fill="white" stroke="dimgray" stroke-width="1" cursor="pointer"/>
</symbol>
<symbol id="cbCross" overflow="visible">
<g pointer-events="none" stroke="black" stroke-width="1">
<line x1="-3" y1="-3" x2="3" y2="3"/>
<line x1="3" y1="-3" x2="-3" y2="3"/>
</g>
</symbol>
</defs>

<desc>Developer Works Dynamic Scatter Graph Scaling Example</desc>

<!-- Now Draw the main X and Y axis -->
<g style="stroke-width:1.0; stroke:black; shape-rendering:crispEdges">
<!-- X Axis top and bottom -->
<path d="M 100 100 L 1250 100 Z"/>
<path d="M 100 700 L 1250 700 Z"/>

<!-- Y Axis left and right -->
<path d="M 100 100 L 100 700 Z"/>
<path d="M 1250 100 L 1250 700 Z"/>
</g>

<g transform="translate(100,100)">

<!-- Grid Lines -->
<g style="fill:none; stroke:#dddddd; stroke-width:1; stroke-dasharray:2,2; text-anchor:end; shape-rendering:crispEdges">

<!-- Vertical grid lines -->
<line x1="115" y1="0" x2="115" y2="600" />
<line x1="230" y1="0" x2="230" y2="600" />
<line x1="345" y1="0" x2="345" y2="600" />
<line x1="460" y1="0" x2="460" y2="600" />
<line x1="575" y1="0" x2="575" y2="600" style="stroke-dasharray:none;" />
<line x1="690" y1="0" x2="690" y2="600" />
<line x1="805" y1="0" x2="805" y2="600" />
<line x1="920" y1="0" x2="920" y2="600" />
<line x1="1035" y1="0" x2="1035" y2="600" />

<!-- Horizontal grid lines -->
<line x1="0" y1="60" x2="1150" y2="60" />
<line x1="0" y1="120" x2="1150" y2="120" />
<line x1="0" y1="180" x2="1150" y2="180" />
<line x1="0" y1="240" x2="1150" y2="240" />
<line x1="0" y1="300" x2="1150" y2="300" style="stroke-dasharray:none;" />
<line x1="0" y1="360" x2="1150" y2="360" />
<line x1="0" y1="420" x2="1150" y2="420" />
<line x1="0" y1="480" x2="1150" y2="480" />
<line x1="0" y1="540" x2="1150" y2="540" />
</g>

<!-- Legend -->
<g style="fill:black; stroke:none" font-size="12" font-family="Arial" transform="translate(25,25)">
<rect width="160" height="270" style="fill:none; stroke:black; shape-rendering:crispEdges" />
<text x="5" y="20" style="fill:black; stroke:none;" font-size="13" font-weight="bold">Given Pair Relationship</text>
<rect x="120" y="35" width="10" height="10" fill="%s" stroke="%s" stroke-width="1" cursor="pointer"/>
<rect x="120" y="55" width="10" height="10" fill="%s" stroke="%s" stroke-width="1" cursor="pointer"/>
<rect x="120" y="75" width="10" height="10" fill="%s" stroke="%s" stroke-width="1" cursor="pointer"/>
<rect x="120" y="95" width="10" height="10" fill="%s" stroke="%s" stroke-width="1" cursor="pointer"/>
<rect x="120" y="115" width="10" height="10" fill="%s" stroke="%s" stroke-width="1" cursor="pointer"/>
<rect x="120" y="135" width="10" height="10" fill="%s" stroke="%s" stroke-width="1" cursor="pointer"/>
<rect x="120" y="155" width="10" height="10" fill="%s" stroke="%s" stroke-width="1" cursor="pointer"/>
<text x="15" y="195" style="fill:black; stroke:none" font-size="12" font-family="Arial" >Zscore gt 15</text>
<circle cx="125" cy="192" r="6" style="stroke:red; fill:gold; fill-opacity:1.0; stroke-width:1;"/>
<text x="15" y="215" style="fill:black; stroke:none" font-size="12" font-family="Arial" >Zscore 4 to 15</text>
<circle cx="125" cy="212" r="3" style="stroke:gold; fill:gold; fill-opacity:1.0; stroke-width:1;"/>
<text x="15" y="235" style="fill:black; stroke:none" font-size="12" font-family="Arial" >Zscore lt 4</text>
<circle cx="125" cy="232" r="2" style="stroke:gold; fill:gold; fill-opacity:1.0; stroke-width:1;"/>
<g id="checkboxes">
</g>
</g>


<g style='fill:black; stroke:none' font-size="17" font-family="Arial">
<!-- X Axis Labels -->
<text x="480" y="660">Mean Alleles Shared</text>
<text x="0" y="630" >1.0</text>
<text x="277" y="630" >1.25</text>
<text x="564" y="630" >1.5</text>
<text x="842" y="630" >1.75</text>
<text x="1140" y="630" >2.0</text>
</g>

<g transform="rotate(270)" style="fill:black; stroke:none" font-size="17" font-family="Arial">
<!-- Y Axis Labels -->
<text x="-350" y="-40">SD Alleles Shared</text>
<text x="-20" y="-10" >1.0</text>
<text x="-165" y="-10" >0.75</text>
<text x="-310" y="-10" >0.5</text>
<text x="-455" y="-10" >0.25</text>
<text x="-600" y="-10" >0.0</text>
</g>

<!-- Plot Title -->
<g style="fill:black; stroke:none" font-size="18" font-family="Arial">
<text x="425" y="-30">%s</text>
</g>

<!-- One group/layer of points for each relationship type -->
'''

SVG_FOOTER = '''
<!-- End of Data -->
</g>
<g id="btTip" visibility="hidden" style="stroke-width:1.0; fill:black; stroke:none;" font-size="10" font-family="Arial">
<rect width="250" height="110" style="fill:silver" rx="2" ry="2"/>
<rect id="btHead" width="250" height="20" rx="2" ry="2" />
<text id="btRel" y="14" x="85">unrelated</text>
<text id="btMean" y="40" x="4">mean=1.5 +/- 0.04</text>
<text id="btSdev" y="60" x="4">sdev=0.7 +/- 0.03</text>
<text id="btPair" y="80" x="4">npairs=1152</text>
<text id="btGeno" y="100" x="4">ngenos=4783 +/- 24 (min=1000, max=5000)</text>
</g>

<g id="otTip" visibility="hidden" style="stroke-width:1.0; fill:black; stroke:none;" font-size="10" font-family="Arial">
<rect width="150" height="180" style="fill:silver" rx="2" ry="2"/>
<rect id="otHead" width="150" height="20" rx="2" ry="2" />
<text id="otRel" y="14" x="40">sibpairs</text>
<text id="otS1" y="40" x="4">s1=fid1,iid1</text>
<text id="otS2" y="60" x="4">s2=fid2,iid2</text>
<text id="otMean" y="80" x="4">mean=1.82</text>
<text id="otSdev" y="100" x="4">sdev=0.7</text>
<text id="otGeno" y="120" x="4">ngeno=4487</text>
<text id="otRmean" y="140" x="4">relmean=1.85</text>
<text id="otRsdev" y="160" x="4">relsdev=0.65</text>
</g>
</svg>
'''


DEFAULT_MAX_SAMPLE_SIZE = 5000

REF_COUNT_HOM1 = 3
REF_COUNT_HET = 2
REF_COUNT_HOM2 = 1
MISSING = 0

MARKER_PAIRS_PER_SECOND_SLOW = 15000000
MARKER_PAIRS_PER_SECOND_FAST = 70000000

POLYGONS = {
    REL_UNRELATED: ((1.360, 0.655), (1.385, 0.730), (1.620, 0.575), (1.610, 0.505)),
    REL_HALFSIBS: ((1.630, 0.500), (1.630, 0.550), (1.648, 0.540), (1.648, 0.490)),
    REL_SIBS: ((1.660, 0.510), (1.665, 0.560), (1.820, 0.410), (1.820, 0.390)),
    REL_PARENTCHILD: ((1.650, 0.470), (1.650, 0.490), (1.750, 0.440), (1.750, 0.420)),
    REL_DUPE: ((1.970, 0.000), (1.970, 0.150), (2.000, 0.150), (2.000, 0.000)),
    }

def distance(point1, point2):
    """ Calculate the distance between two points
    """
    (x1,y1) = [float(d) for d in point1]
    (x2,y2) = [float(d) for d in point2]
    dx = abs(x1 - x2)
    dy = abs(y1 - y2)
    return math.sqrt(dx**2 + dy**2)

def point_inside_polygon(x, y, poly):
    """ Determine if a point (x,y) is inside a given polygon or not
        poly is a list of (x,y) pairs.

        Taken from: http://www.ariel.com.au/a/python-point-int-poly.html
    """

    n = len(poly)
    inside = False

    p1x,p1y = poly[0]
    for i in range(n+1):
        p2x,p2y = poly[i % n]
        if y > min(p1y,p2y):
            if y <= max(p1y,p2y):
                if x <= max(p1x,p2x):
                    if p1y != p2y:
                        xinters = (y-p1y)*(p2x-p1x)/(p2y-p1y)+p1x
                    if p1x == p2x or x <= xinters:
                        inside = not inside
        p1x,p1y = p2x,p2y
    return inside
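# Example (illustrative only, not used by the tool): with the REL_DUPE polygon
# defined above, point_inside_polygon(1.99, 0.05, POLYGONS[REL_DUPE]) returns True,
# while point_inside_polygon(1.90, 0.05, POLYGONS[REL_DUPE]) returns False, since
# the polygons are expressed in data units of (mean alleles shared, sd alleles shared).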

def readMap(pedfile):
    """
    """
    mapfile = pedfile.replace('.ped', '.map')
    marker_list = []
    if os.path.exists(mapfile):
        print 'readMap: %s' % (mapfile)
        fh = file(mapfile, 'r')
        for line in fh:
            marker_list.append(line.strip().split())
        fh.close()
    print 'readMap: %s markers' % (len(marker_list))
    return marker_list
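# A PLINK .map file has one marker per line: chromosome, rs name, genetic distance
# and bp position, so each marker_list element is a list like (illustrative values)
#   ['1', 'rs3094315', '0', '742429']
# Note that readMap() is not called by the Galaxy entry point below, which reads a
# binary pbed via plinkbinJZ.BPed instead; it is kept for the ped/map workflow.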

def calcMeanSD(useme):
    """
    A numerically stable algorithm is given below. It also computes the mean.
    This algorithm is due to Knuth,[1] who cites Welford.[2]
    n = 0
    mean = 0
    M2 = 0

    foreach x in data:
        n = n + 1
        delta = x - mean
        mean = mean + delta/n
        M2 = M2 + delta*(x - mean)  // This expression uses the new value of mean
    end for

    variance_n = M2/n
    variance = M2/(n - 1)
    """
    mean = 0.0
    M2 = 0.0
    sd = 0.0
    n = len(useme)
    if n > 1:
        for i,x in enumerate(useme):
            delta = x - mean
            mean = mean + delta/(i+1) # knuth uses n+=1 at start
            M2 = M2 + delta*(x - mean) # This expression uses the new value of mean
        variance = M2/(n-1) # assume is sample so lose 1 DOF
        sd = pow(variance,0.5)
    return mean,sd
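# Quick sanity check (illustrative only): calcMeanSD([1.0, 2.0, 3.0, 4.0]) returns
# (2.5, 1.2909...), i.e. the mean and the sample standard deviation (n-1 denominator),
# computed in a single pass without accumulating a large sum of squares.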


def doIBSpy(ped=None,basename='',outdir=None,logf=None,
            nrsSamples=10000,title='title',pdftoo=0,Zcutoff=2.0):
#def doIBS(pedName, title, nrsSamples=None, pdftoo=False):
    """ started with snpmatrix but GRR uses actual IBS counts and sd's
    """
    repOut = [] # text strings to add to the html display
    refallele = {}
    tblf = '%s_table.xls' % (title)
    tbl = file(os.path.join(outdir,tblf), 'w')
    tbl.write(TABLE_HEADER)
    svgf = '%s.svg' % (title)
    svg = file(os.path.join(outdir,svgf), 'w')

    nMarkers = len(ped._markers)
    if nMarkers < 5:
        print >> sys.stderr, '### ERROR - %d is too few markers for reliable estimation in %s - terminating' % (nMarkers,PROGNAME)
        sys.exit(1)
    nSubjects = len(ped._subjects)
    nrsSamples = min(nMarkers, nrsSamples)
    if opts and opts.use_mito:
        markers = range(nMarkers)
        nrsSamples = min(len(markers), nrsSamples)
        sampleIndexes = sorted(random.sample(markers, nrsSamples))
    else:
        autosomals = ped.autosomal_indices()
        nrsSamples = min(len(autosomals), nrsSamples)
        sampleIndexes = sorted(random.sample(autosomals, nrsSamples))

    print ''
    print 'Getting random.sample of %s from %s total' % (nrsSamples, nMarkers)
    npairs = (nSubjects*(nSubjects-1))/2 # total rows in table
    newfiles=[svgf,tblf]
    explanations = ['rgGRR Plot (requires SVG)','Mean by SD alleles shared - %d rows' % npairs]
    # these go with the output file links in the html file
    s = 'Reading genotypes for %s subjects and %s markers\n' % (nSubjects, nrsSamples)
    logf.write(s)
    minUsegenos = nrsSamples/2 # must have half?
    nGenotypes = nSubjects*nrsSamples
    stime = time.time()
    emptyRows = set()
    genos = numpy.zeros((nSubjects, nrsSamples), dtype=int)
    for s in xrange(nSubjects):
        nValid = 0
        #getGenotypesByIndices(self, s, mlist, format)
        genos[s] = ped.getGenotypesByIndices(s, sampleIndexes, format='ref')
        nValid = sum([1 for g in genos[s] if g])
        if not nValid:
            emptyRows.add(s)
            sub = ped.getSubject(s)
            print 'All missing for row %d (%s)' % (s, sub)
            logf.write('All missing for row %d (%s)\n' % (s, sub))
    rtime = time.time() - stime
    if verbose:
        print '@@Read %s genotypes in %s seconds' % (nGenotypes, rtime)


    ### Now the expensive part. For each pair of subjects, we get the mean number
    ### and standard deviation of shared alleles over all of the markers where both
    ### subjects have a known genotype. Identical subjects should have mean shared
    ### alleles very close to 2.0 with a standard deviation very close to 0.0.
    tot = nSubjects*(nSubjects-1)/2
    nprog = tot/10
    nMarkerpairs = tot * nrsSamples
    estimatedTimeSlow = nMarkerpairs/MARKER_PAIRS_PER_SECOND_SLOW
    estimatedTimeFast = nMarkerpairs/MARKER_PAIRS_PER_SECOND_FAST
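    # Worked example of this estimate (illustrative numbers only): for
    # nSubjects = 2000 and nrsSamples = 5000 there are 2000*1999/2 = 1,999,000
    # pairs, so nMarkerpairs is roughly 1.0e10 and the expected run time is about
    # 1.0e10/7.0e7 ~ 143 s (fast case) to 1.0e10/1.5e7 ~ 666 s (slow case).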

    pairs = []
    pair_data = {}
    means = []   ## Mean IBS for each pair
    ngenoL = []  ## Count of comparable genotypes for each pair
    sdevs = []   ## Standard dev for each pair
    rels = []    ## A relationship code for each pair
    zmeans = [0.0 for x in xrange(tot)] ## zmean score for each pair for the relgroup
    zstds = [0.0 for x in xrange(tot)]  ## zstd score for each pair for the relgrp
    skip = set()
    ndone = 0    ## How many have been done so far

    logf.write('Calculating %d pairs...\n' % (tot))
    logf.write('Estimated time is %2.2f to %2.2f seconds ...\n' % (estimatedTimeFast, estimatedTimeSlow))

    t1sum = 0
    t2sum = 0
    t3sum = 0
    now = time.time()
    scache = {}
    _founder_cache = {}
    C_CODE = """
    #include "math.h"
    int i;
    int sumibs = 0;
    int ssqibs = 0;
    int ngeno = 0;
    float mean = 0;
    float M2 = 0;
    float delta = 0;
    float sdev=0;
    float variance=0;
    for (i=0; i<nrsSamples; i++) {
        int a1 = g1[i];
        int a2 = g2[i];
        if (a1 != 0 && a2 != 0) {
            ngeno += 1;
            int shared = 2-abs(a1-a2);
            delta = shared - mean;
            mean = mean + delta/ngeno;
            M2 += delta*(shared-mean);
            // yes that second time, the updated mean is used see calcmeansd above;
            //printf("%d %d %d %d %d %d\\n", i, a1, a2, ngeno, shared, squared);
        }
    }
    if (ngeno > 1) {
        variance = M2/(ngeno-1);
        sdev = sqrt(variance);
        //printf("OK: %d %3.2f %3.2f\\n", ngeno, mean, sdev);
    }
    //printf("%d %d %d %1.2f %1.2f\\n", ngeno, sumibs, ssqibs, mean, sdev);
    result[0] = ngeno;
    result[1] = mean;
    result[2] = sdev;
    return_val = ngeno;
    """
    started = time.time()
    for s1 in xrange(nSubjects):
        if s1 in emptyRows:
            continue
        (fid1,iid1,did1,mid1,sex1,phe1,iid1,d_sid1,m_sid1) = scache.setdefault(s1, ped.getSubject(s1))

        isFounder1 = _founder_cache.setdefault(s1, (did1==mid1))
        g1 = genos[s1]

        for s2 in xrange(s1+1, nSubjects):
            if s2 in emptyRows:
                continue
            t1s = time.time()

            (fid2,iid2,did2,mid2,sex2,phe2,iid2,d_sid2,m_sid2) = scache.setdefault(s2, ped.getSubject(s2))

            g2 = genos[s2]
            isFounder2 = _founder_cache.setdefault(s2, (did2==mid2))

            # Determine the relationship for this pair
            relcode = REL_UNKNOWN
            if (fid2 == fid1):
                if iid1 == iid2:
                    relcode = REL_DUPE
                elif (did2 == did1) and (mid2 == mid1) and did1 != mid1:
                    relcode = REL_SIBS
                elif (iid1 == mid2) or (iid1 == did2) or (iid2 == mid1) or (iid2 == did1):
                    relcode = REL_PARENTCHILD
                elif (str(did1) != '0' and (did2 == did1)) or (str(mid1) != '0' and (mid2 == mid1)):
                    relcode = REL_HALFSIBS
                else:
                    # People in the same family should be marked as some other
                    # form of related. In general, these people will have a
                    # pretty random spread of similarity. This distinction is
                    # probably not very useful most of the time
                    relcode = REL_RELATED
            else:
                ### Different families
                relcode = REL_UNRELATED
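            # Summary of how the given-relationship codes follow from the PED
            # fields in the branches above (illustrative, all but the last case
            # require the same fid):
            #   same fid and same iid                         -> REL_DUPE
            #   same (non-identical) did and mid              -> REL_SIBS
            #   one subject's iid is the other's did or mid   -> REL_PARENTCHILD
            #   one shared non-missing parent id              -> REL_HALFSIBS
            #   same fid but none of the above                -> REL_RELATED
            #   different fid                                 -> REL_UNRELATED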

            t1e = time.time()
            t1sum += t1e-t1s


            ### Calculate sum(2-abs(a1-a2)) and sum((2-abs(a1-a2))**2) and count
            ### the number of contributing genotypes. These values are not actually
            ### calculated here, but instead are looked up in a table for speed.
            ### FIXME: This is still too slow ...
            result = [0.0, 0.0, 0.0]
            ngeno = weave.inline(C_CODE, ['g1', 'g2', 'nrsSamples', 'result'])
            if ngeno >= minUsegenos:
                _, mean, sdev = result
                means.append(mean)
                sdevs.append(sdev)
                ngenoL.append(ngeno)
                pairs.append((s1, s2))
                rels.append(relcode)
            else:
                skip.add(ndone) # signal no comparable genotypes for this pair
            ndone += 1
            t2e = time.time()
            t2sum += t2e-t1e
            t3e = time.time()
            t3sum += t3e-t2e

    logme = [ 'T1: %s' % (t1sum), 'T2: %s' % (t2sum), 'T3: %s' % (t3sum),'TOT: %s' % (t3e-now),
        '%s pairs with no (or not enough) comparable genotypes (%3.1f%%)' % (len(skip),
        float(len(skip))/float(tot)*100)]
    logf.write('%s\n' % '\t'.join(logme))
    ### Calculate mean and standard deviation of scores on a per relationship
    ### type basis, allowing us to flag outliers for each particular relationship
    ### type
    relstats = {}
    relCounts = {}
    outlierFiles = {}
    for relCode, relInfo in REL_LOOKUP.items():
        relName, relColor, relStyle = relInfo
        useme = [means[x] for x in xrange(len(means)) if rels[x] == relCode]
        relCounts[relCode] = len(useme)
        mm = scipy.mean(useme)
        ms = scipy.std(useme)
        useme = [sdevs[x] for x in xrange(len(sdevs)) if rels[x] == relCode]
        sm = scipy.mean(useme)
        ss = scipy.std(useme)
        relstats[relCode] = {'sd':(sm,ss), 'mean':(mm,ms)}
        s = 'Relstate %s (n=%d): mean(mean)=%3.2f sdev(mean)=%3.2f, mean(sdev)=%3.2f sdev(sdev)=%3.2f\n' % \
            (relName,relCounts[relCode], mm, ms, sm, ss)
        logf.write(s)

    ### now fake z scores for each subject like abecasis recommends max(|zmu|,|zsd|)
    ### within each group, for each pair, z=(groupmean-pairmean)/groupsd
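    ### For example (illustrative numbers only): if the sibpairs group has
    ### mean(mean)=1.66 and sdev(mean)=0.02, a sib pair whose own mean is 1.74
    ### gets zmean = (1.74 - 1.66)/0.02 = 4.0, which exceeds the default
    ### Zcutoff of 2.0, so that pair is written to the sibpairs outlier report.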
    available = len(means)
    logf.write('%d pairs are available of %d\n' % (available, tot))
    ### s = '\nOutliers:\nrelationship\tzmean\tzsd\tped1\tped2\tmean\tsd\trmeanmean\trmeansd\trsdmean\trsdsd\n'
    ### logf.write(s)
    pairnum = 0
    offset = 0
    nOutliers = 0
    cexs = []
    outlierRecords = dict([(r, []) for r in range(N_RELATIONSHIP_TYPES)])
    zsdmax = 0
    for s1 in range(nSubjects):
        if s1 in emptyRows:
            continue
        (fid1,iid1,did1,mid1,sex1,aff1,ok1,d_sid1,m_sid1) = scache[s1]
        for s2 in range(s1+1, nSubjects):
            if s2 in emptyRows:
                continue
            if pairnum not in skip:
                ### Get group stats for this relationship
                (fid2,iid2,did2,mid2,sex2,aff2,ok2,d_sid2,m_sid2) = scache[s2]
                try:
                    r = rels[offset]
                except IndexError:
                    logf.write('###OOPS offset %d available %d pairnum %d len(rels) %d\n' % (offset, available, pairnum, len(rels)))
                notfound = ('?',('?','0','0'))
                relInfo = REL_LOOKUP.get(r,notfound)
                relName, relColor, relStyle = relInfo
                rmm,rmd = relstats[r]['mean'] # group mean, group meansd alleles shared
                rdm,rdd = relstats[r]['sd'] # group sdmean, group sdsd alleles shared

                try:
                    zsd = (sdevs[offset] - rdm)/rdd # distance from group mean in group sd units
                except:
                    zsd = 1
                if abs(zsd) > zsdmax:
                    zsdmax = abs(zsd) # keep for sort scaling
                try:
                    zmean = (means[offset] - rmm)/rmd # distance from group mean
                except:
                    zmean = 1
                zmeans[offset] = zmean
                zstds[offset] = zsd
                pid=(s1,s2)
                zrad = max(zsd,zmean)
                if zrad < 4:
                    zrad = 2
                elif 4 < zrad < 15:
                    zrad = 3 # to 9
                else: # > 15 6=24+
                    zrad = zrad/4
                    zrad = min(zrad,6) # scale limit
                zrad = max(2,max(zsd,zmean)) # as > 2, z grows
                pair_data[pid] = (zmean,zsd,r,zrad)
                if max(zsd,zmean) > Zcutoff: # is potentially interesting
                    mean = means[offset]
                    sdev = sdevs[offset]
                    outlierRecords[r].append((mean, sdev, zmean, zsd, fid1, iid1, fid2, iid2, rmm, rmd, rdm, rdd,did1,mid1,did2,mid2))
                    nOutliers += 1
                    tbl.write('%s_%s\t%s_%s\t%f\t%f\t%f\t%f\t%d\t%s\t%s\t%s\t%s\t%s\n' % \
                        (fid1, iid1, fid2, iid2, mean, sdev, zmean, zsd, ngenoL[offset], relName, did1,mid1,did2,mid2)) # ngenoL[offset] is this pair's genotype count
                offset += 1
            pairnum += 1
    logf.write( 'Outliers: %s\n' % (nOutliers))

    ### Write outlier files for each relationship type
    repOut.append('<h2>Outliers in tab delimited files linked above are also listed below</h2>')
    lzsd = round(numpy.log10(zsdmax)) + 1
    scalefactor = 10**lzsd
    for relCode, relInfo in REL_LOOKUP.items():
        relName, _, _ = relInfo
        outliers = outlierRecords[relCode]
        if not outliers:
            continue
        outliers = [(scalefactor*int(abs(x[3]))+ int(abs(x[2])),x) for x in outliers] # decorate
        outliers.sort()
        outliers.reverse() # largest deviation first
        outliers = [x[1] for x in outliers] # undecorate
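        # The decorate/sort/undecorate above orders pairs by |zsd| first with |zmean|
        # as a tie-breaker: e.g. (illustrative) with scalefactor=100, a pair with
        # zsd=3.4 and zmean=1.2 gets sort key 100*3 + 1 = 301, so it typically sorts
        # ahead of any pair whose |zsd| truncates to 2, whatever its zmean.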
        nrows = len(outliers)
        truncated = 0
        if nrows > MAX_SHOW_ROWS:
            s = '<h3>%s outlying pairs (top %d of %d) from %s</h3><table border="0" cellpadding="3">' % \
                (relName,MAX_SHOW_ROWS,nrows,title)
            truncated = nrows - MAX_SHOW_ROWS
        else:
            s = '<h3>%s outlying pairs (n=%d) from %s</h3><table border="0" cellpadding="3">' % (relName,nrows,title)
        repOut.append(s)
        fhname = '%s_rgGRR_%s_outliers.xls' % (title, relName)
        fhpath = os.path.join(outdir,fhname)
        fh = open(fhpath, 'w')
        newfiles.append(fhname)
        explanations.append('%s Outlier Pairs %s, N=%d, Cutoff SD=%f' % (relName,title,len(outliers),Zcutoff))
        fh.write(OUTLIERS_HEADER + '\n')
        s = ''.join(['<th>%s</th>' % x for x in OUTLIERS_HEADER_list])
        repOut.append('<tr align="center">%s</tr>' % s)
        for n,rec in enumerate(outliers):
            #(mean, sdev, zmean, zsd, fid1, iid1, fid2, iid2, rmm, rmd, rdm, rdd) = rec
            s = '%f\t%f\t%f\t%f\t%s\t%s\t%s\t%s\t%f\t%f\t%f\t%f\t%s\t%s\t%s\t%s\t' % tuple(rec)
            fh.write('%s%s\n' % (s,relName))
            # (mean, sdev, zmean, zsd, fid1, iid1, fid2, iid2, rmm, rmd, rdm, rdd, did1,mid1,did2,mid2))
            s = '''<td>%f</td><td>%f</td><td>%f</td><td>%f</td><td>%s</td><td>%s</td>
            <td>%s</td><td>%s</td><td>%f</td><td>%f</td><td>%f</td><td>%f</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td>''' % tuple(rec)
            s = '%s<td>%s</td>' % (s,relName)
            if n < MAX_SHOW_ROWS:
                repOut.append('<tr align="center">%s</tr>' % s)
        if truncated > 0:
            repOut.append('<H2>WARNING: %d rows truncated - see outlier file for all %d rows</H2>' % (truncated,
                nrows))
        fh.close()
        repOut.append('</table><p>')

    ### Now, draw the plot in jpeg and svg formats, and optionally in the PDF format
    ### if requested
    logf.write('Plotting ...')
    pointColors = [REL_COLORS[rel] for rel in rels]
    pointStyles = [REL_POINTS[rel] for rel in rels]

    mainTitle = '%s (%s subjects, %d snp)' % (title, nSubjects, nrsSamples)
    svg.write(SVG_HEADER % (SVG_COLORS[0],SVG_COLORS[1],SVG_COLORS[2],SVG_COLORS[3],SVG_COLORS[4],
        SVG_COLORS[5],SVG_COLORS[6],SVG_COLORS[0],SVG_COLORS[0],SVG_COLORS[1],SVG_COLORS[1],
        SVG_COLORS[2],SVG_COLORS[2],SVG_COLORS[3],SVG_COLORS[3],SVG_COLORS[4],SVG_COLORS[4],
        SVG_COLORS[5],SVG_COLORS[5],SVG_COLORS[6],SVG_COLORS[6],mainTitle))
    #rpy.r.jpeg(filename='%s.jpg' % (title), width=1600, height=1200, pointsize=12, quality=100, bg='white')
    #rpy.r.par(mai=(1,1,1,0.5))
    #rpy.r('par(xaxs="i",yaxs="i")')
    #rpy.r.plot(means, sdevs, main=mainTitle, ylab=Y_AXIS_LABEL, xlab=X_AXIS_LABEL, cex=cexs, col=pointColors, pch=pointStyles, xlim=(0,2), ylim=(0,2))
    #rpy.r.legend(LEGEND_ALIGN, legend=REL_STATES, pch=REL_POINTS, col=REL_COLORS, title=LEGEND_TITLE)
    #rpy.r.grid(nx=10, ny=10, col='lightgray', lty='dotted')
    #rpy.r.dev_off()

    ### We will now go through each relationship type to partition plot points
    ### into "bulk" and "outlier" groups. Bulk points will represent common
    ### mean/sdev pairs and will cover the majority of the points in the plot --
    ### they will use generic tooltip information about all of the pairs
    ### represented by that point. "Outlier" points will be uncommon pairs,
    ### with very specific information in their tooltips. It would be nice to
    ### keep the total number of plotted points in the SVG representation to
    ### ~10000 (certainly less than 100000?)
    pointMap = {}
    orderedRels = [y[1] for y in reversed(sorted([(relCounts.get(x, 0),x) for x in REL_LOOKUP.keys()]))]
    # do we really want this? I want out of zone points last and big
    for relCode in orderedRels:
        svgColor = SVG_COLORS[relCode]
        relName, relColor, relStyle = REL_LOOKUP[relCode]
        svg.write('<g id="%s" style="stroke:%s; fill:%s; fill-opacity:1.0; stroke-width:1;" cursor="pointer">\n' % (relName, svgColor, svgColor))
        pMap = pointMap.setdefault(relCode, {})
        nPoints = 0
        rpairs=[]
        rgenos=[]
        rmeans=[]
        rsdevs=[]
        rz = []
        for x,rel in enumerate(rels): # all pairs
            if rel == relCode:
                s1,s2 = pairs[x]
                pid=(s1,s2)
                zmean,zsd,r,zrad = pair_data[pid][:4]
                rpairs.append(pairs[x])
                rgenos.append(ngenoL[x])
                rmeans.append(means[x])
                rsdevs.append(sdevs[x])
                rz.append(zrad)
        ### Now add the svg point group for this relationship to the svg file
        for x in range(len(rmeans)):
            svgX = '%d' % ((rmeans[x] - 1.0) * PLOT_WIDTH) # changed so mean scale is 1-2
            svgY = '%d' % (PLOT_HEIGHT - (rsdevs[x] * PLOT_HEIGHT)) # changed so sd scale is 0-1
            s1, s2 = rpairs[x]
            (fid1,uid1,did1,mid1,sex1,phe1,iid1,d_sid1,m_sid1) = scache[s1]
            (fid2,uid2,did2,mid2,sex2,phe2,iid2,d_sid2,m_sid2) = scache[s2]
            ngenos = rgenos[x]
            nPoints += 1
            point = pMap.setdefault((svgX, svgY), [])
            point.append((rmeans[x], rsdevs[x], fid1, iid1, did1, mid1, fid2, iid2, did2, mid2, ngenos,rz[x]))
        for (svgX, svgY) in pMap:
            points = pMap[(svgX, svgY)]
            svgX = int(svgX)
            svgY = int(svgY)
            if len(points) > 1:
                mmean,dmean = calcMeanSD([p[0] for p in points])
                msdev,dsdev = calcMeanSD([p[1] for p in points])
                mgeno,dgeno = calcMeanSD([p[-2] for p in points]) # p[-2] is the genotype count; p[-1] is the z radius
                mingeno = min([p[-2] for p in points])
                maxgeno = max([p[-2] for p in points])
                svg.write("""<circle cx="%d" cy="%d" r="2"
                    onmouseover="showBTT(evt, %d, %1.2f, %1.2f, %1.2f, %1.2f, %d, %d, %d, %d, %d)"
                    onmouseout="hideBTT(evt)" />\n""" % (svgX, svgY, relCode, mmean, dmean, msdev, dsdev, len(points), mgeno, dgeno, mingeno, maxgeno))
            else:
                mean, sdev, fid1, iid1, did1, mid1, fid2, iid2, did2, mid2, ngenos, zrad = points[0][:12]
                rmean = float(relstats[relCode]['mean'][0])
                rsdev = float(relstats[relCode]['sd'][0])
                if zrad < 4:
                    zrad = 2
                elif 4 < zrad < 9:
                    zrad = 3 # to 9
                else: # > 9 5=15+
                    zrad = zrad/3
                    zrad = min(zrad,5) # scale limit
                if zrad <= 3:
                    svg.write('<circle cx="%d" cy="%d" r="%s" onmouseover="showOTT(evt, %d, \'%s,%s,%s,%s\', \'%s,%s,%s,%s\', %1.2f, %1.2f, %s, %1.2f, %1.2f)" onmouseout="hideOTT(evt)" />\n' % (svgX, svgY, zrad, relCode, fid1, iid1, did1, mid1, fid2, iid2, did2, mid2, mean, sdev, ngenos, rmean, rsdev))
                else: # highlight pairs a long way from expectation by outlining circle in red
                    svg.write("""<circle cx="%d" cy="%d" r="%s" style="stroke:red; fill:%s; fill-opacity:1.0; stroke-width:1;"
                        onmouseover="showOTT(evt, %d, \'%s,%s,%s,%s\', \'%s,%s,%s,%s\', %1.2f, %1.2f, %s, %1.2f, %1.2f)"
                        onmouseout="hideOTT(evt)" />\n""" % \
                        (svgX, svgY, zrad, svgColor, relCode, fid1, iid1, did1, mid1, fid2, iid2, did2, mid2, mean, sdev, ngenos, rmean, rsdev))
        svg.write('</g>\n')

    ### Create a pdf as well if indicated on the command line
    ### WARNING! for framingham share, with about 50M pairs, this is a 5.5GB pdf!
    ## if pdftoo:
    ##     pdfname = '%s.pdf' % (title)
    ##     rpy.r.pdf(pdfname, 6, 6)
    ##     rpy.r.par(mai=(1,1,1,0.5))
    ##     rpy.r('par(xaxs="i",yaxs="i")')
    ##     rpy.r.plot(means, sdevs, main='%s, %d snp' % (title, nSamples), ylab=Y_AXIS_LABEL, xlab=X_AXIS_LABEL, cex=cexs, col=pointColors, pch=pointStyles, xlim=(0,2), ylim=(0,2))
    ##     rpy.r.legend(LEGEND_ALIGN, legend=REL_STATES, pch=REL_POINTS, col=REL_COLORS, title=LEGEND_TITLE)
    ##     rpy.r.grid(nx=10, ny=10, col='lightgray', lty='dotted')
    ##     rpy.r.dev_off()

    ### Draw polygons
    if showPolygons:
        svg.write('<g id="polygons" cursor="pointer">\n')
        for rel, poly in POLYGONS.items():
            points = ' '.join(['%s,%s' % ((p[0]-1.0)*float(PLOT_WIDTH), (PLOT_HEIGHT - p[1]*PLOT_HEIGHT)) for p in poly])
            svg.write('<polygon points="%s" fill="transparent" style="stroke:%s; stroke-width:1"/>\n' % (points, SVG_COLORS[rel]))
        svg.write('</g>\n')


    svg.write(SVG_FOOTER)
    svg.close()
    return newfiles,explanations,repOut

def doIBS(n=100):
    """parse parameters from galaxy
    expect 'input pbed path' 'basename' 'outpath' 'title' 'logpath' 'n'
    <command interpreter="python">
        rgGRR.py $i.extra_files_path/$i.metadata.base_name "$i.metadata.base_name"
        '$out_file1' '$out_file1.files_path' "$title1" '$n' '$Z'
    </command>

    """
    u="""<command interpreter="python">
        rgGRR.py $i.extra_files_path/$i.metadata.base_name "$i.metadata.base_name"
        '$out_file1' '$out_file1.files_path' "$title1" '$n' '$Z'
    </command>
    """


    if len(sys.argv) < 7:
        print >> sys.stdout, 'Need pbed inpath, basename, out_htmlname, outpath, title, logpath, nSNP, Zcutoff on command line please'
        print >> sys.stdout, u
        sys.exit(1)
    ts = '%s%s' % (string.punctuation,string.whitespace)
    ptran = string.maketrans(ts,'_'*len(ts))
    inpath = sys.argv[1]
    ldinpath = os.path.split(inpath)[0]
    basename = sys.argv[2]
    outhtml = sys.argv[3]
    newfilepath = sys.argv[4]
    title = sys.argv[5].translate(ptran)
    logfname = 'Log_%s.txt' % title
    logpath = os.path.join(newfilepath,logfname) # log was a child - make part of html extra_files_path zoo
    n = int(sys.argv[6])
    try:
        Zcutoff = float(sys.argv[7])
    except:
        Zcutoff = 2.0
    try:
        os.makedirs(newfilepath)
    except:
        pass
    logf = file(logpath,'w')
    efp,ibase_name = os.path.split(inpath) # need to use these for outputs in files_path
    ped = plinkbinJZ.BPed(inpath)
    ped.parse(quick=True)
    if ped == None:
        print >> sys.stderr, '## doIBSpy problem - cannot open %s or %s - cannot run' % (inpath,basename)
        sys.exit(1)
    newfiles,explanations,repOut = doIBSpy(ped=ped,basename=basename,outdir=newfilepath,
        logf=logf,nrsSamples=n,title=title,pdftoo=0,Zcutoff=Zcutoff)
    logf.close()
    logfs = file(logpath,'r').readlines()
    lf = file(outhtml,'w')
    lf.write(galhtmlprefix % PROGNAME)
    # this is a mess. todo clean up - should each datatype have its own directory? Yes
    # probably. Then titles are universal - but userId libraries are separate.
    s = '<div>Output from %s run at %s<br>\n' % (PROGNAME,timenow())
    lf.write('<h4>%s</h4>\n' % s)
    fixed = ["'%s'" % x for x in sys.argv] # add quotes just in case
    s = 'If you need to rerun this analysis, the command line was\n<pre>%s</pre>\n</div>' % (' '.join(fixed))
    lf.write(s)
    # various ways of displaying svg - experiments related to missing svg mimetype on test (!)
    #s = """<object data="%s" type="image/svg+xml" width="%d" height="%d">
    #       <embed src="%s" type="image/svg+xml" width="%d" height="%d" />
    #       </object>""" % (newfiles[0],PLOT_WIDTH,PLOT_HEIGHT,newfiles[0],PLOT_WIDTH,PLOT_HEIGHT)
    s = """ <embed src="%s" type="image/svg+xml" width="%d" height="%d" />""" % (newfiles[0],PLOT_WIDTH,PLOT_HEIGHT)
    #s = """ <iframe src="%s" type="image/svg+xml" width="%d" height="%d" />""" % (newfiles[0],PLOT_WIDTH,PLOT_HEIGHT)
    lf.write(s)
    lf.write('<div><h4>Click the links below to save output files and plots</h4><br><ol>\n')
    for i in range(len(newfiles)):
        if i == 0:
            lf.write('<li><a href="%s" type="image/svg+xml" >%s</a></li>\n' % (newfiles[i],explanations[i]))
        else:
            lf.write('<li><a href="%s">%s</a></li>\n' % (newfiles[i],explanations[i]))
    flist = os.listdir(newfilepath)
    for fname in flist:
        if not fname in newfiles:
            lf.write('<li><a href="%s">%s</a></li>\n' % (fname,fname))
    lf.write('</ol></div>')
    lf.write('<div>%s</div>' % ('\n'.join(repOut))) # repOut is a list of tables
    lf.write('<div><hr><h3>Log from this job (also stored in %s)</h3><pre>%s</pre><hr></div>' % (logfname,''.join(logfs)))
    lf.write('</body></html>\n')
    lf.close()
    logf.close()

if __name__ == '__main__':
    doIBS()