comparison tools/rgenetics/rgGRR.py @ 0:9071e359b9a3

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:37:19 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:9071e359b9a3
1 """
2 # july 2009: Need to see outliers so need to draw them last?
3 # could use clustering on the zscores to guess real relationships for unrelateds
4 # but definitely need to draw last
5 # added MAX_SHOW_ROWS to limit the length of the main report page
6 # Changes for Galaxy integration
7 # added more robust knuth method for one pass mean and sd
8 # no difference really - let's use scipy.mean() and scipy.std() instead...
9 # fixed labels and changed to .xls for outlier reports so can open in excel
10 # interesting - with a few hundred subjects, 5k gives good resolution
11 # and 100k gives better but not by much
12 # TODO remove non autosomal markers
13 # TODO it would be best if label had the zmean and zsd as these are what matter for
14 # outliers rather than the group mean/sd
15 # mods to rgGRR.py from channing CVS which John Ziniti has rewritten to produce SVG plots
16 # to make a Galaxy tool - we need the table of mean and SD for interesting pairs, the SVG and the log
17 # so the result should be an HTML file
18
19 # rgIBS.py
20 # use a random subset of markers for a quick ibs
21 # to identify sample dups and closely related subjects
22 # try snpMatrix and plink and see which one works best for us?
23 # abecasis grr plots mean*sd for every subject to show clusters
24 # mods june 23 rml to avoid non-autosomal markers
25 # we seem to be distinguishing parent-child by gender - 2 clouds!
26
27
28 snpMatrix from David Clayton has:
29 ibs.stats function to calculate the identity-by-state stats of a group of samples
30 Description
31 Given a snp.matrix-class or a X.snp.matrix-class object with N samples, calculates some statistics
32 about the relatedness of every pair of samples within.
33
34 Usage
35 ibs.stats(x)
36 8 ibs.stats
37 Arguments
38 x a snp.matrix-class or a X.snp.matrix-class object containing N samples
39 Details
40 No-calls are excluded from consideration here.
41 Value
42 A data.frame containing N(N - 1)/2 rows, where the row names are the sample name pairs separated
43 by a comma, and the columns are:
44 Count count of identical calls, exclusing no-calls
45 Fraction fraction of identical calls comparied to actual calls being made in both samples
46 Warning
47 In some applications, it may be preferable to subset a (random) selection of SNPs first - the
48 calculation
49 time increases as N(N - 1)M/2 . Typically for N = 800 samples and M = 3000 SNPs, the
50 calculation time is about 1 minute. A full GWA scan could take hours, and quite unnecessary for
51 simple applications such as checking for duplicate or related samples.
52 Note
53 This is mostly written to find mislabelled and/or duplicate samples.
54 Illumina indexes their SNPs in alphabetical order so the mitochondria SNPs comes first - for most
55 purpose it is undesirable to use these SNPs for IBS purposes.
56 TODO: Worst-case S4 subsetting seems to make 2 copies of a large object, so one might want to
57 subset before rbind(), etc; a future version of this routine may contain a built-in subsetting facility
58 """
59 import sys,os,time,random,string,copy,optparse
60
61 try:
62 set
63 except NameError:
64 from Sets import Set as set
65
66 from rgutils import timenow,plinke
67
68 import plinkbinJZ
69
70
71 opts = None
72 verbose = False
73
74 showPolygons = False
75
76 class NullDevice:
77 def write(self, s):
78 pass
79
80 tempstderr = sys.stderr # save
81 #sys.stderr = NullDevice()
82 # need to avoid blather about deprecation and other strange stuff from scipy
83 # the current galaxy job runner assumes that
84 # the job is in error if anything appears on sys.stderr
85 # grrrrr. James wants to keep it that way instead of using the
86 # status flag for some strange reason. Presumably he doesn't use R or (in this case, scipy)
87 import numpy
88 import scipy
89 from scipy import weave
90
91
92 sys.stderr=tempstderr
93
94
95 PROGNAME = os.path.split(sys.argv[0])[-1]
96 X_AXIS_LABEL = 'Mean Alleles Shared'
97 Y_AXIS_LABEL = 'SD Alleles Shared'
98 LEGEND_ALIGN = 'topleft'
99 LEGEND_TITLE = 'Relationship'
100 DEFAULT_SYMBOL_SIZE = 1.0 # default symbol size
101 DEFAULT_SYMBOL_SIZE = 0.5 # default symbol size
102
103 ### Some colors for R/rpy
104 R_BLACK = 1
105 R_RED = 2
106 R_GREEN = 3
107 R_BLUE = 4
108 R_CYAN = 5
109 R_PURPLE = 6
110 R_YELLOW = 7
111 R_GRAY = 8
112
113 ### ... and some point-styles
114
115 ###
116 PLOT_HEIGHT = 600
117 PLOT_WIDTH = 1150
118
119
120 #SVG_COLORS = ('black', 'darkblue', 'blue', 'deepskyblue', 'firebrick','maroon','crimson')
121 #SVG_COLORS = ('cyan','dodgerblue','mediumpurple', 'fuchsia', 'red','gold','gray')
122 SVG_COLORS = ('cyan','dodgerblue','mediumpurple','forestgreen', 'lightgreen','gold','gray')
123 # dupe,parentchild,sibpair,halfsib,parents,unrel,unkn
124 #('orange', 'red', 'green', 'chartreuse', 'blue', 'purple', 'gray')
125
126 OUTLIERS_HEADER_list = ['Mean','Sdev','ZMean','ZSdev','FID1','IID1','FID2','IID2','RelMean_M','RelMean_SD','RelSD_M','RelSD_SD','PID1','MID1','PID2','MID2','Ped']
127 OUTLIERS_HEADER = '\t'.join(OUTLIERS_HEADER_list)
128 TABLE_HEADER='fid1_iid1\tfid2_iid2\tmean\tsdev\tzmean\tzsdev\tgeno\trelcode\tpid1\tmid1\tpid2\tmid2\n'
129
130
131 ### Relationship codes, text, and lookups/mappings
132 N_RELATIONSHIP_TYPES = 7
133 REL_DUPE, REL_PARENTCHILD, REL_SIBS, REL_HALFSIBS, REL_RELATED, REL_UNRELATED, REL_UNKNOWN = range(N_RELATIONSHIP_TYPES)
134 REL_LOOKUP = {
135 REL_DUPE: ('dupe', R_BLUE, 1),
136 REL_PARENTCHILD: ('parentchild', R_YELLOW, 1),
137 REL_SIBS: ('sibpairs', R_RED, 1),
138 REL_HALFSIBS: ('halfsibs', R_GREEN, 1),
139 REL_RELATED: ('parents', R_PURPLE, 1),
140 REL_UNRELATED: ('unrelated', R_CYAN, 1),
141 REL_UNKNOWN: ('unknown', R_GRAY, 1),
142 }
143 OUTLIER_STDEVS = {
144 REL_DUPE: 2,
145 REL_PARENTCHILD: 2,
146 REL_SIBS: 2,
147 REL_HALFSIBS: 2,
148 REL_RELATED: 2,
149 REL_UNRELATED: 3,
150 REL_UNKNOWN: 2,
151 }
152 # note now Z can be passed in
153
154 REL_STATES = [REL_LOOKUP[r][0] for r in range(N_RELATIONSHIP_TYPES)]
155 REL_COLORS = SVG_COLORS
156 REL_POINTS = [REL_LOOKUP[r][2] for r in range(N_RELATIONSHIP_TYPES)]
157
158 DEFAULT_MAX_SAMPLE_SIZE = 10000
159
160 REF_COUNT_HOM1 = 3
161 REF_COUNT_HET = 2
162 REF_COUNT_HOM2 = 1
163 MISSING = 0
164 MAX_SHOW_ROWS = 100 # framingham has millions - delays showing output page - so truncate and explain
165 MARKER_PAIRS_PER_SECOND_SLOW = 15000000.0
166 MARKER_PAIRS_PER_SECOND_FAST = 70000000.0
167
168
169 galhtmlprefix = """<?xml version="1.0" encoding="utf-8" ?>
170 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
171 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
172 <head>
173 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
174 <meta name="generator" content="Galaxy %s tool output - see http://g2.trac.bx.psu.edu/" />
175 <title></title>
176 <link rel="stylesheet" href="/static/style/base.css" type="text/css" />
177 </head>
178 <body>
179 <div class="document">
180 """
181
182
183 SVG_HEADER = '''<?xml version="1.0" standalone="no"?>
184 <!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.2//EN" "http://www.w3.org/Graphics/SVG/1.2/DTD/svg12.dtd">
185
186 <svg width="1280" height="800"
187 xmlns="http://www.w3.org/2000/svg" version="1.2"
188 xmlns:xlink="http://www.w3.org/1999/xlink" viewBox="0 0 1280 800" onload="init()">
189
190 <script type="text/ecmascript" xlink:href="/static/scripts/checkbox_and_radiobutton.js"/>
191 <script type="text/ecmascript" xlink:href="/static/scripts/helper_functions.js"/>
192 <script type="text/ecmascript" xlink:href="/static/scripts/timer.js"/>
193 <script type="text/ecmascript">
194 <![CDATA[
195 var checkBoxes = new Array();
196 var radioGroupBandwidth;
197 var colours = ['%s','%s','%s','%s','%s','%s','%s'];
198 function init() {
199 var style = {"font-family":"Arial,Helvetica", "fill":"black", "font-size":12};
200 var dist = 12;
201 var yOffset = 4;
202
203 //A checkBox for each relationship type dupe,parentchild,sibpair,halfsib,parents,unrel,unkn
204 checkBoxes["dupe"] = new checkBox("dupe","checkboxes",20,40,"cbRect","cbCross",true,"Duplicate",style,dist,yOffset,undefined,hideShowLayer);
205 checkBoxes["parentchild"] = new checkBox("parentchild","checkboxes",20,60,"cbRect","cbCross",true,"Parent-Child",style,dist,yOffset,undefined,hideShowLayer);
206 checkBoxes["sibpairs"] = new checkBox("sibpairs","checkboxes",20,80,"cbRect","cbCross",true,"Sib-pairs",style,dist,yOffset,undefined,hideShowLayer);
207 checkBoxes["halfsibs"] = new checkBox("halfsibs","checkboxes",20,100,"cbRect","cbCross",true,"Half-sibs",style,dist,yOffset,undefined,hideShowLayer);
208 checkBoxes["parents"] = new checkBox("parents","checkboxes",20,120,"cbRect","cbCross",true,"Parents",style,dist,yOffset,undefined,hideShowLayer);
209 checkBoxes["unrelated"] = new checkBox("unrelated","checkboxes",20,140,"cbRect","cbCross",true,"Unrelated",style,dist,yOffset,undefined,hideShowLayer);
210 checkBoxes["unknown"] = new checkBox("unknown","checkboxes",20,160,"cbRect","cbCross",true,"Unknown",style,dist,yOffset,undefined,hideShowLayer);
211
212 }
213
214 function hideShowLayer(id, status, label) {
215 var vis = "hidden";
216 if (status) {
217 vis = "visible";
218 }
219 document.getElementById(id).setAttributeNS(null, 'visibility', vis);
220 }
221
222 function showBTT(evt, rel, mm, dm, md, dd, n, mg, dg, lg, hg) {
223 var x = parseInt(evt.pageX)-250;
224 var y = parseInt(evt.pageY)-110;
225 switch(rel) {
226 case 0:
227 fill = colours[rel];
228 relt = "dupe";
229 break;
230 case 1:
231 fill = colours[rel];
232 relt = "parentchild";
233 break;
234 case 2:
235 fill = colours[rel];
236 relt = "sibpairs";
237 break;
238 case 3:
239 fill = colours[rel];
240 relt = "halfsibs";
241 break;
242 case 4:
243 fill = colours[rel];
244 relt = "parents";
245 break;
246 case 5:
247 fill = colours[rel];
248 relt = "unrelated";
249 break;
250 case 6:
251 fill = colours[rel];
252 relt = "unknown";
253 break;
254 default:
255 fill = "cyan";
256 relt = "ERROR_CODE: "+rel;
257 }
258
259 document.getElementById("btRel").textContent = "GROUP: "+relt;
260 document.getElementById("btMean").textContent = "mean="+mm+" +/- "+dm;
261 document.getElementById("btSdev").textContent = "sdev="+dm+" +/- "+dd;
262 document.getElementById("btPair").textContent = "npairs="+n;
263 document.getElementById("btGeno").textContent = "ngenos="+mg+" +/- "+dg+" (min="+lg+", max="+hg+")";
264 document.getElementById("btHead").setAttribute('fill', fill);
265
266 var tt = document.getElementById("btTip");
267 tt.setAttribute("transform", "translate("+x+","+y+")");
268 tt.setAttribute('visibility', 'visible');
269 }
270
271 function showOTT(evt, rel, s1, s2, mean, sdev, ngeno, rmean, rsdev) {
272 var x = parseInt(evt.pageX)-150;
273 var y = parseInt(evt.pageY)-180;
274
275 switch(rel) {
276 case 0:
277 fill = colours[rel];
278 relt = "dupe";
279 break;
280 case 1:
281 fill = colours[rel];
282 relt = "parentchild";
283 break;
284 case 2:
285 fill = colours[rel];
286 relt = "sibpairs";
287 break;
288 case 3:
289 fill = colours[rel];
290 relt = "halfsibs";
291 break;
292 case 4:
293 fill = colours[rel];
294 relt = "parents";
295 break;
296 case 5:
297 fill = colours[rel];
298 relt = "unrelated";
299 break;
300 case 6:
301 fill = colours[rel];
302 relt = "unknown";
303 break;
304 default:
305 fill = "cyan";
306 relt = "ERROR_CODE: "+rel;
307 }
308
309 document.getElementById("otRel").textContent = "PAIR: "+relt;
310 document.getElementById("otS1").textContent = "s1="+s1;
311 document.getElementById("otS2").textContent = "s2="+s2;
312 document.getElementById("otMean").textContent = "mean="+mean;
313 document.getElementById("otSdev").textContent = "sdev="+sdev;
314 document.getElementById("otGeno").textContent = "ngenos="+ngeno;
315 document.getElementById("otRmean").textContent = "relmean="+rmean;
316 document.getElementById("otRsdev").textContent = "relsdev="+rsdev;
317 document.getElementById("otHead").setAttribute('fill', fill);
318
319 var tt = document.getElementById("otTip");
320 tt.setAttribute("transform", "translate("+x+","+y+")");
321 tt.setAttribute('visibility', 'visible');
322 }
323
324 function hideBTT(evt) {
325 document.getElementById("btTip").setAttributeNS(null, 'visibility', 'hidden');
326 }
327
328 function hideOTT(evt) {
329 document.getElementById("otTip").setAttributeNS(null, 'visibility', 'hidden');
330 }
331
332 ]]>
333 </script>
334 <defs>
335 <!-- symbols for check boxes -->
336 <symbol id="cbRect" overflow="visible">
337 <rect x="-5" y="-5" width="10" height="10" fill="white" stroke="dimgray" stroke-width="1" cursor="pointer"/>
338 </symbol>
339 <symbol id="cbCross" overflow="visible">
340 <g pointer-events="none" stroke="black" stroke-width="1">
341 <line x1="-3" y1="-3" x2="3" y2="3"/>
342 <line x1="3" y1="-3" x2="-3" y2="3"/>
343 </g>
344 </symbol>
345 </defs>
346
347 <desc>Developer Works Dynamic Scatter Graph Scaling Example</desc>
348
349 <!-- Now Draw the main X and Y axis -->
350 <g style="stroke-width:1.0; stroke:black; shape-rendering:crispEdges">
351 <!-- X Axis top and bottom -->
352 <path d="M 100 100 L 1250 100 Z"/>
353 <path d="M 100 700 L 1250 700 Z"/>
354
355 <!-- Y Axis left and right -->
356 <path d="M 100 100 L 100 700 Z"/>
357 <path d="M 1250 100 L 1250 700 Z"/>
358 </g>
359
360 <g transform="translate(100,100)">
361
362 <!-- Grid Lines -->
363 <g style="fill:none; stroke:#dddddd; stroke-width:1; stroke-dasharray:2,2; text-anchor:end; shape-rendering:crispEdges">
364
365 <!-- Vertical grid lines -->
366 <line x1="125" y1="0" x2="115" y2="600" />
367 <line x1="230" y1="0" x2="230" y2="600" />
368 <line x1="345" y1="0" x2="345" y2="600" />
369 <line x1="460" y1="0" x2="460" y2="600" />
370 <line x1="575" y1="0" x2="575" y2="600" style="stroke-dasharray:none;" />
371 <line x1="690" y1="0" x2="690" y2="600" />
372 <line x1="805" y1="0" x2="805" y2="600" />
373 <line x1="920" y1="0" x2="920" y2="600" />
374 <line x1="1035" y1="0" x2="1035" y2="600" />
375
376 <!-- Horizontal grid lines -->
377 <line x1="0" y1="60" x2="1150" y2="60" />
378 <line x1="0" y1="120" x2="1150" y2="120" />
379 <line x1="0" y1="180" x2="1150" y2="180" />
380 <line x1="0" y1="240" x2="1150" y2="240" />
381 <line x1="0" y1="300" x2="1150" y2="300" style="stroke-dasharray:none;" />
382 <line x1="0" y1="360" x2="1150" y2="360" />
383 <line x1="0" y1="420" x2="1150" y2="420" />
384 <line x1="0" y1="480" x2="1150" y2="480" />
385 <line x1="0" y1="540" x2="1150" y2="540" />
386 </g>
387
388 <!-- Legend -->
389 <g style="fill:black; stroke:none" font-size="12" font-family="Arial" transform="translate(25,25)">
390 <rect width="160" height="270" style="fill:none; stroke:black; shape-rendering:crispEdges" />
391 <text x="5" y="20" style="fill:black; stroke:none;" font-size="13" font-weight="bold">Given Pair Relationship</text>
392 <rect x="120" y="35" width="10" height="10" fill="%s" stroke="%s" stroke-width="1" cursor="pointer"/>
393 <rect x="120" y="55" width="10" height="10" fill="%s" stroke="%s" stroke-width="1" cursor="pointer"/>
394 <rect x="120" y="75" width="10" height="10" fill="%s" stroke="%s" stroke-width="1" cursor="pointer"/>
395 <rect x="120" y="95" width="10" height="10" fill="%s" stroke="%s" stroke-width="1" cursor="pointer"/>
396 <rect x="120" y="115" width="10" height="10" fill="%s" stroke="%s" stroke-width="1" cursor="pointer"/>
397 <rect x="120" y="135" width="10" height="10" fill="%s" stroke="%s" stroke-width="1" cursor="pointer"/>
398 <rect x="120" y="155" width="10" height="10" fill="%s" stroke="%s" stroke-width="1" cursor="pointer"/>
399 <text x="15" y="195" style="fill:black; stroke:none" font-size="12" font-family="Arial" >Zscore gt 15</text>
400 <circle cx="125" cy="192" r="6" style="stroke:red; fill:gold; fill-opacity:1.0; stroke-width:1;"/>
401 <text x="15" y="215" style="fill:black; stroke:none" font-size="12" font-family="Arial" >Zscore 4 to 15</text>
402 <circle cx="125" cy="212" r="3" style="stroke:gold; fill:gold; fill-opacity:1.0; stroke-width:1;"/>
403 <text x="15" y="235" style="fill:black; stroke:none" font-size="12" font-family="Arial" >Zscore lt 4</text>
404 <circle cx="125" cy="232" r="2" style="stroke:gold; fill:gold; fill-opacity:1.0; stroke-width:1;"/>
405 <g id="checkboxes">
406 </g>
407 </g>
408
409
410 <g style='fill:black; stroke:none' font-size="17" font-family="Arial">
411 <!-- X Axis Labels -->
412 <text x="480" y="660">Mean Alleles Shared</text>
413 <text x="0" y="630" >1.0</text>
414 <text x="277" y="630" >1.25</text>
415 <text x="564" y="630" >1.5</text>
416 <text x="842" y="630" >1.75</text>
417 <text x="1140" y="630" >2.0</text>
418 </g>
419
420 <g transform="rotate(270)" style="fill:black; stroke:none" font-size="17" font-family="Arial">
421 <!-- Y Axis Labels -->
422 <text x="-350" y="-40">SD Alleles Shared</text>
423 <text x="-20" y="-10" >1.0</text>
424 <text x="-165" y="-10" >0.75</text>
425 <text x="-310" y="-10" >0.5</text>
426 <text x="-455" y="-10" >0.25</text>
427 <text x="-600" y="-10" >0.0</text>
428 </g>
429
430 <!-- Plot Title -->
431 <g style="fill:black; stroke:none" font-size="18" font-family="Arial">
432 <text x="425" y="-30">%s</text>
433 </g>
434
435 <!-- One group/layer of points for each relationship type -->
436 '''
437
438 SVG_FOOTER = '''
439 <!-- End of Data -->
440 </g>
441 <g id="btTip" visibility="hidden" style="stroke-width:1.0; fill:black; stroke:none;" font-size="10" font-family="Arial">
442 <rect width="250" height="110" style="fill:silver" rx="2" ry="2"/>
443 <rect id="btHead" width="250" height="20" rx="2" ry="2" />
444 <text id="btRel" y="14" x="85">unrelated</text>
445 <text id="btMean" y="40" x="4">mean=1.5 +/- 0.04</text>
446 <text id="btSdev" y="60" x="4">sdev=0.7 +/- 0.03</text>
447 <text id="btPair" y="80" x="4">npairs=1152</text>
448 <text id="btGeno" y="100" x="4">ngenos=4783 +/- 24 (min=1000, max=5000)</text>
449 </g>
450
451 <g id="otTip" visibility="hidden" style="stroke-width:1.0; fill:black; stroke:none;" font-size="10" font-family="Arial">
452 <rect width="150" height="180" style="fill:silver" rx="2" ry="2"/>
453 <rect id="otHead" width="150" height="20" rx="2" ry="2" />
454 <text id="otRel" y="14" x="40">sibpairs</text>
455 <text id="otS1" y="40" x="4">s1=fid1,iid1</text>
456 <text id="otS2" y="60" x="4">s2=fid2,iid2</text>
457 <text id="otMean" y="80" x="4">mean=1.82</text>
458 <text id="otSdev" y="100" x="4">sdev=0.7</text>
459 <text id="otGeno" y="120" x="4">ngeno=4487</text>
460 <text id="otRmean" y="140" x="4">relmean=1.85</text>
461 <text id="otRsdev" y="160" x="4">relsdev=0.65</text>
462 </g>
463 </svg>
464 '''
465
466
467 DEFAULT_MAX_SAMPLE_SIZE = 5000
468
469 REF_COUNT_HOM1 = 3
470 REF_COUNT_HET = 2
471 REF_COUNT_HOM2 = 1
472 MISSING = 0
473
474 MARKER_PAIRS_PER_SECOND_SLOW = 15000000
475 MARKER_PAIRS_PER_SECOND_FAST = 70000000
476
477 POLYGONS = {
478 REL_UNRELATED: ((1.360, 0.655), (1.385, 0.730), (1.620, 0.575), (1.610, 0.505)),
479 REL_HALFSIBS: ((1.630, 0.500), (1.630, 0.550), (1.648, 0.540), (1.648, 0.490)),
480 REL_SIBS: ((1.660, 0.510), (1.665, 0.560), (1.820, 0.410), (1.820, 0.390)),
481 REL_PARENTCHILD: ((1.650, 0.470), (1.650, 0.490), (1.750, 0.440), (1.750, 0.420)),
482 REL_DUPE: ((1.970, 0.000), (1.970, 0.150), (2.000, 0.150), (2.000, 0.000)),
483 }
484
485 def distance(point1, point2):
486 """ Calculate the distance between two points
487 """
488 (x1,y1) = [float(d) for d in point1]
489 (x2,y2) = [float(d) for d in point2]
490 dx = abs(x1 - x2)
491 dy = abs(y1 - y2)
492 return math.sqrt(dx**2 + dy**2)
493
494 def point_inside_polygon(x, y, poly):
495 """ Determine if a point (x,y) is inside a given polygon or not
496 poly is a list of (x,y) pairs.
497
498 Taken from: http://www.ariel.com.au/a/python-point-int-poly.html
499 """
500
501 n = len(poly)
502 inside = False
503
504 p1x,p1y = poly[0]
505 for i in range(n+1):
506 p2x,p2y = poly[i % n]
507 if y > min(p1y,p2y):
508 if y <= max(p1y,p2y):
509 if x <= max(p1x,p2x):
510 if p1y != p2y:
511 xinters = (y-p1y)*(p2x-p1x)/(p2y-p1y)+p1x
512 if p1x == p2x or x <= xinters:
513 inside = not inside
514 p1x,p1y = p2x,p2y
515 return inside
516
517 def readMap(pedfile):
518 """
519 """
520 mapfile = pedfile.replace('.ped', '.map')
521 marker_list = []
522 if os.path.exists(mapfile):
523 print 'readMap: %s' % (mapfile)
524 fh = file(mapfile, 'r')
525 for line in fh:
526 marker_list.append(line.strip().split())
527 fh.close()
528 print 'readMap: %s markers' % (len(marker_list))
529 return marker_list
530
531 def calcMeanSD(useme):
532 """
533 A numerically stable algorithm is given below. It also computes the mean.
534 This algorithm is due to Knuth,[1] who cites Welford.[2]
535 n = 0
536 mean = 0
537 M2 = 0
538
539 foreach x in data:
540 n = n + 1
541 delta = x - mean
542 mean = mean + delta/n
543 M2 = M2 + delta*(x - mean) // This expression uses the new value of mean
544 end for
545
546 variance_n = M2/n
547 variance = M2/(n - 1)
548 """
549 mean = 0.0
550 M2 = 0.0
551 sd = 0.0
552 n = len(useme)
553 if n > 1:
554 for i,x in enumerate(useme):
555 delta = x - mean
556 mean = mean + delta/(i+1) # knuth uses n+=1 at start
557 M2 = M2 + delta*(x - mean) # This expression uses the new value of mean
558 variance = M2/(n-1) # assume is sample so lose 1 DOF
559 sd = pow(variance,0.5)
560 return mean,sd
561
562
563 def doIBSpy(ped=None,basename='',outdir=None,logf=None,
564 nrsSamples=10000,title='title',pdftoo=0,Zcutoff=2.0):
565 #def doIBS(pedName, title, nrsSamples=None, pdftoo=False):
566 """ started with snpmatrix but GRR uses actual IBS counts and sd's
567 """
568 repOut = [] # text strings to add to the html display
569 refallele = {}
570 tblf = '%s_table.xls' % (title)
571 tbl = file(os.path.join(outdir,tblf), 'w')
572 tbl.write(TABLE_HEADER)
573 svgf = '%s.svg' % (title)
574 svg = file(os.path.join(outdir,svgf), 'w')
575
576 nMarkers = len(ped._markers)
577 if nMarkers < 5:
578 print sys.stderr, '### ERROR - %d is too few markers for reliable estimation in %s - terminating' % (nMarkers,PROGNAME)
579 sys.exit(1)
580 nSubjects = len(ped._subjects)
581 nrsSamples = min(nMarkers, nrsSamples)
582 if opts and opts.use_mito:
583 markers = range(nMarkers)
584 nrsSamples = min(len(markers), nrsSamples)
585 sampleIndexes = sorted(random.sample(markers, nrsSamples))
586 else:
587 autosomals = ped.autosomal_indices()
588 nrsSamples = min(len(autosomals), nrsSamples)
589 sampleIndexes = sorted(random.sample(autosomals, nrsSamples))
590
591 print ''
592 print 'Getting random.sample of %s from %s total' % (nrsSamples, nMarkers)
593 npairs = (nSubjects*(nSubjects-1))/2 # total rows in table
594 newfiles=[svgf,tblf]
595 explanations = ['rgGRR Plot (requires SVG)','Mean by SD alleles shared - %d rows' % npairs]
596 # these go with the output file links in the html file
597 s = 'Reading genotypes for %s subjects and %s markers\n' % (nSubjects, nrsSamples)
598 logf.write(s)
599 minUsegenos = nrsSamples/2 # must have half?
600 nGenotypes = nSubjects*nrsSamples
601 stime = time.time()
602 emptyRows = set()
603 genos = numpy.zeros((nSubjects, nrsSamples), dtype=int)
604 for s in xrange(nSubjects):
605 nValid = 0
606 #getGenotypesByIndices(self, s, mlist, format)
607 genos[s] = ped.getGenotypesByIndices(s, sampleIndexes, format='ref')
608 nValid = sum([1 for g in genos[s] if g])
609 if not nValid:
610 emptyRows.add(s)
611 sub = ped.getSubject(s)
612 print 'All missing for row %d (%s)' % (s, sub)
613 logf.write('All missing for row %d (%s)\n' % (s, sub))
614 rtime = time.time() - stime
615 if verbose:
616 print '@@Read %s genotypes in %s seconds' % (nGenotypes, rtime)
617
618
619 ### Now the expensive part. For each pair of subjects, we get the mean number
620 ### and standard deviation of shared alleles over all of the markers where both
621 ### subjects have a known genotype. Identical subjects should have mean shared
622 ### alleles very close to 2.0 with a standard deviation very close to 0.0.
623 tot = nSubjects*(nSubjects-1)/2
624 nprog = tot/10
625 nMarkerpairs = tot * nrsSamples
626 estimatedTimeSlow = nMarkerpairs/MARKER_PAIRS_PER_SECOND_SLOW
627 estimatedTimeFast = nMarkerpairs/MARKER_PAIRS_PER_SECOND_FAST
628
629 pairs = []
630 pair_data = {}
631 means = [] ## Mean IBS for each pair
632 ngenoL = [] ## Count of comparable genotypes for each pair
633 sdevs = [] ## Standard dev for each pair
634 rels = [] ## A relationship code for each pair
635 zmeans = [0.0 for x in xrange(tot)] ## zmean score for each pair for the relgroup
636 zstds = [0.0 for x in xrange(tot)] ## zstd score for each pair for the relgrp
637 skip = set()
638 ndone = 0 ## How many have been done so far
639
640 logf.write('Calculating %d pairs...\n' % (tot))
641 logf.write('Estimated time is %2.2f to %2.2f seconds ...\n' % (estimatedTimeFast, estimatedTimeSlow))
642
643 t1sum = 0
644 t2sum = 0
645 t3sum = 0
646 now = time.time()
647 scache = {}
648 _founder_cache = {}
649 C_CODE = """
650 #include "math.h"
651 int i;
652 int sumibs = 0;
653 int ssqibs = 0;
654 int ngeno = 0;
655 float mean = 0;
656 float M2 = 0;
657 float delta = 0;
658 float sdev=0;
659 float variance=0;
660 for (i=0; i<nrsSamples; i++) {
661 int a1 = g1[i];
662 int a2 = g2[i];
663 if (a1 != 0 && a2 != 0) {
664 ngeno += 1;
665 int shared = 2-abs(a1-a2);
666 delta = shared - mean;
667 mean = mean + delta/ngeno;
668 M2 += delta*(shared-mean);
669 // yes that second time, the updated mean is used see calcmeansd above;
670 //printf("%d %d %d %d %d %d\\n", i, a1, a2, ngeno, shared, squared);
671 }
672 }
673 if (ngeno > 1) {
674 variance = M2/(ngeno-1);
675 sdev = sqrt(variance);
676 //printf("OK: %d %3.2f %3.2f\\n", ngeno, mean, sdev);
677 }
678 //printf("%d %d %d %1.2f %1.2f\\n", ngeno, sumibs, ssqibs, mean, sdev);
679 result[0] = ngeno;
680 result[1] = mean;
681 result[2] = sdev;
682 return_val = ngeno;
683 """
684 started = time.time()
685 for s1 in xrange(nSubjects):
686 if s1 in emptyRows:
687 continue
688 (fid1,iid1,did1,mid1,sex1,phe1,iid1,d_sid1,m_sid1) = scache.setdefault(s1, ped.getSubject(s1))
689
690 isFounder1 = _founder_cache.setdefault(s1, (did1==mid1))
691 g1 = genos[s1]
692
693 for s2 in xrange(s1+1, nSubjects):
694 if s2 in emptyRows:
695 continue
696 t1s = time.time()
697
698 (fid2,iid2,did2,mid2,sex2,phe2,iid2,d_sid2,m_sid2) = scache.setdefault(s2, ped.getSubject(s2))
699
700 g2 = genos[s2]
701 isFounder2 = _founder_cache.setdefault(s2, (did2==mid2))
702
703 # Determine the relationship for this pair
704 relcode = REL_UNKNOWN
705 if (fid2 == fid1):
706 if iid1 == iid2:
707 relcode = REL_DUPE
708 elif (did2 == did1) and (mid2 == mid1) and did1 != mid1:
709 relcode = REL_SIBS
710 elif (iid1 == mid2) or (iid1 == did2) or (iid2 == mid1) or (iid2 == did1):
711 relcode = REL_PARENTCHILD
712 elif (str(did1) != '0' and (did2 == did1)) or (str(mid1) != '0' and (mid2 == mid1)):
713 relcode = REL_HALFSIBS
714 else:
715 # People in the same family should be marked as some other
716 # form of related. In general, these people will have a
717 # pretty random spread of similarity. This distinction is
718 # probably not very useful most of the time
719 relcode = REL_RELATED
720 else:
721 ### Different families
722 relcode = REL_UNRELATED
723
724 t1e = time.time()
725 t1sum += t1e-t1s
726
727
728 ### Calculate sum(2-abs(a1-a2)) and sum((2-abs(a1-a2))**2) and count
729 ### the number of contributing genotypes. These values are not actually
730 ### calculated here, but instead are looked up in a table for speed.
731 ### FIXME: This is still too slow ...
732 result = [0.0, 0.0, 0.0]
733 ngeno = weave.inline(C_CODE, ['g1', 'g2', 'nrsSamples', 'result'])
734 if ngeno >= minUsegenos:
735 _, mean, sdev = result
736 means.append(mean)
737 sdevs.append(sdev)
738 ngenoL.append(ngeno)
739 pairs.append((s1, s2))
740 rels.append(relcode)
741 else:
742 skip.add(ndone) # signal no comparable genotypes for this pair
743 ndone += 1
744 t2e = time.time()
745 t2sum += t2e-t1e
746 t3e = time.time()
747 t3sum += t3e-t2e
748
749 logme = [ 'T1: %s' % (t1sum), 'T2: %s' % (t2sum), 'T3: %s' % (t3sum),'TOT: %s' % (t3e-now),
750 '%s pairs with no (or not enough) comparable genotypes (%3.1f%%)' % (len(skip),
751 float(len(skip))/float(tot)*100)]
752 logf.write('%s\n' % '\t'.join(logme))
753 ### Calculate mean and standard deviation of scores on a per relationship
754 ### type basis, allowing us to flag outliers for each particular relationship
755 ### type
756 relstats = {}
757 relCounts = {}
758 outlierFiles = {}
759 for relCode, relInfo in REL_LOOKUP.items():
760 relName, relColor, relStyle = relInfo
761 useme = [means[x] for x in xrange(len(means)) if rels[x] == relCode]
762 relCounts[relCode] = len(useme)
763 mm = scipy.mean(useme)
764 ms = scipy.std(useme)
765 useme = [sdevs[x] for x in xrange(len(sdevs)) if rels[x] == relCode]
766 sm = scipy.mean(useme)
767 ss = scipy.std(useme)
768 relstats[relCode] = {'sd':(sm,ss), 'mean':(mm,ms)}
769 s = 'Relstate %s (n=%d): mean(mean)=%3.2f sdev(mean)=%3.2f, mean(sdev)=%3.2f sdev(sdev)=%3.2f\n' % \
770 (relName,relCounts[relCode], mm, ms, sm, ss)
771 logf.write(s)
772
773 ### now fake z scores for each subject like abecasis recommends max(|zmu|,|zsd|)
774 ### within each group, for each pair, z=(groupmean-pairmean)/groupsd
775 available = len(means)
776 logf.write('%d pairs are available of %d\n' % (available, tot))
777 ### s = '\nOutliers:\nrelationship\tzmean\tzsd\tped1\tped2\tmean\tsd\trmeanmean\trmeansd\trsdmean\trsdsd\n'
778 ### logf.write(s)
779 pairnum = 0
780 offset = 0
781 nOutliers = 0
782 cexs = []
783 outlierRecords = dict([(r, []) for r in range(N_RELATIONSHIP_TYPES)])
784 zsdmax = 0
785 for s1 in range(nSubjects):
786 if s1 in emptyRows:
787 continue
788 (fid1,iid1,did1,mid1,sex1,aff1,ok1,d_sid1,m_sid1) = scache[s1]
789 for s2 in range(s1+1, nSubjects):
790 if s2 in emptyRows:
791 continue
792 if pairnum not in skip:
793 ### Get group stats for this relationship
794 (fid2,iid2,did2,mid2,sex2,aff2,ok2,d_sid2,m_sid2) = scache[s2]
795 try:
796 r = rels[offset]
797 except IndexError:
798 logf.write('###OOPS offset %d available %d pairnum %d len(rels) %d', offset, available, pairnum, len(rels))
799 notfound = ('?',('?','0','0'))
800 relInfo = REL_LOOKUP.get(r,notfound)
801 relName, relColor, relStyle = relInfo
802 rmm,rmd = relstats[r]['mean'] # group mean, group meansd alleles shared
803 rdm,rdd = relstats[r]['sd'] # group sdmean, group sdsd alleles shared
804
805 try:
806 zsd = (sdevs[offset] - rdm)/rdd # distance from group mean in group sd units
807 except:
808 zsd = 1
809 if abs(zsd) > zsdmax:
810 zsdmax = zsd # keep for sort scaling
811 try:
812 zmean = (means[offset] - rmm)/rmd # distance from group mean
813 except:
814 zmean = 1
815 zmeans[offset] = zmean
816 zstds[offset] = zsd
817 pid=(s1,s2)
818 zrad = max(zsd,zmean)
819 if zrad < 4:
820 zrad = 2
821 elif 4 < zrad < 15:
822 zrad = 3 # to 9
823 else: # > 15 6=24+
824 zrad=zrad/4
825 zrad = min(zrad,6) # scale limit
826 zrad = max(2,max(zsd,zmean)) # as > 2, z grows
827 pair_data[pid] = (zmean,zsd,r,zrad)
828 if max(zsd,zmean) > Zcutoff: # is potentially interesting
829 mean = means[offset]
830 sdev = sdevs[offset]
831 outlierRecords[r].append((mean, sdev, zmean, zsd, fid1, iid1, fid2, iid2, rmm, rmd, rdm, rdd,did1,mid1,did2,mid2))
832 nOutliers += 1
833 tbl.write('%s_%s\t%s_%s\t%f\t%f\t%f\t%f\t%d\t%s\t%s\t%s\t%s\t%s\n' % \
834 (fid1, iid1, fid2, iid2, mean, sdev, zmean,zsd, ngeno, relName, did1,mid1,did2,mid2))
835 offset += 1
836 pairnum += 1
837 logf.write( 'Outliers: %s\n' % (nOutliers))
838
839 ### Write outlier files for each relationship type
840 repOut.append('<h2>Outliers in tab delimited files linked above are also listed below</h2>')
841 lzsd = round(numpy.log10(zsdmax)) + 1
842 scalefactor = 10**lzsd
843 for relCode, relInfo in REL_LOOKUP.items():
844 relName, _, _ = relInfo
845 outliers = outlierRecords[relCode]
846 if not outliers:
847 continue
848 outliers = [(scalefactor*int(abs(x[3]))+ int(abs(x[2])),x) for x in outliers] # decorate
849 outliers.sort()
850 outliers.reverse() # largest deviation first
851 outliers = [x[1] for x in outliers] # undecorate
852 nrows = len(outliers)
853 truncated = 0
854 if nrows > MAX_SHOW_ROWS:
855 s = '<h3>%s outlying pairs (top %d of %d) from %s</h3><table border="0" cellpadding="3">' % \
856 (relName,MAX_SHOW_ROWS,nrows,title)
857 truncated = nrows - MAX_SHOW_ROWS
858 else:
859 s = '<h3>%s outlying pairs (n=%d) from %s</h3><table border="0" cellpadding="3">' % (relName,nrows,title)
860 repOut.append(s)
861 fhname = '%s_rgGRR_%s_outliers.xls' % (title, relName)
862 fhpath = os.path.join(outdir,fhname)
863 fh = open(fhpath, 'w')
864 newfiles.append(fhname)
865 explanations.append('%s Outlier Pairs %s, N=%d, Cutoff SD=%f' % (relName,title,len(outliers),Zcutoff))
866 fh.write(OUTLIERS_HEADER)
867 s = ''.join(['<th>%s</th>' % x for x in OUTLIERS_HEADER_list])
868 repOut.append('<tr align="center">%s</tr>' % s)
869 for n,rec in enumerate(outliers):
870 #(mean, sdev, zmean, zsd, fid1, iid1, fid2, iid2, rmm, rmd, rdm, rdd) = rec
871 s = '%f\t%f\t%f\t%f\t%s\t%s\t%s\t%s\t%f\t%f\t%f\t%f\t%s\t%s\t%s\t%s\t' % tuple(rec)
872 fh.write('%s%s\n' % (s,relName))
873 # (mean, sdev, zmean, zsd, fid1, iid1, fid2, iid2, rmm, rmd, rdm, rdd, did1,mid1,did2,mid2))
874 s = '''<td>%f</td><td>%f</td><td>%f</td><td>%f</td><td>%s</td><td>%s</td>
875 <td>%s</td><td>%s</td><td>%f</td><td>%f</td><td>%f</td><td>%f</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td>''' % tuple(rec)
876 s = '%s<td>%s</td>' % (s,relName)
877 if n < MAX_SHOW_ROWS:
878 repOut.append('<tr align="center">%s</tr>' % s)
879 if truncated > 0:
880 repOut.append('<H2>WARNING: %d rows truncated - see outlier file for all %d rows</H2>' % (truncated,
881 nrows))
882 fh.close()
883 repOut.append('</table><p>')
884
885 ### Now, draw the plot in jpeg and svg formats, and optionally in the PDF format
886 ### if requested
887 logf.write('Plotting ...')
888 pointColors = [REL_COLORS[rel] for rel in rels]
889 pointStyles = [REL_POINTS[rel] for rel in rels]
890
891 mainTitle = '%s (%s subjects, %d snp)' % (title, nSubjects, nrsSamples)
892 svg.write(SVG_HEADER % (SVG_COLORS[0],SVG_COLORS[1],SVG_COLORS[2],SVG_COLORS[3],SVG_COLORS[4],
893 SVG_COLORS[5],SVG_COLORS[6],SVG_COLORS[0],SVG_COLORS[0],SVG_COLORS[1],SVG_COLORS[1],
894 SVG_COLORS[2],SVG_COLORS[2],SVG_COLORS[3],SVG_COLORS[3],SVG_COLORS[4],SVG_COLORS[4],
895 SVG_COLORS[5],SVG_COLORS[5],SVG_COLORS[6],SVG_COLORS[6],mainTitle))
896 #rpy.r.jpeg(filename='%s.jpg' % (title), width=1600, height=1200, pointsize=12, quality=100, bg='white')
897 #rpy.r.par(mai=(1,1,1,0.5))
898 #rpy.r('par(xaxs="i",yaxs="i")')
899 #rpy.r.plot(means, sdevs, main=mainTitle, ylab=Y_AXIS_LABEL, xlab=X_AXIS_LABEL, cex=cexs, col=pointColors, pch=pointStyles, xlim=(0,2), ylim=(0,2))
900 #rpy.r.legend(LEGEND_ALIGN, legend=REL_STATES, pch=REL_POINTS, col=REL_COLORS, title=LEGEND_TITLE)
901 #rpy.r.grid(nx=10, ny=10, col='lightgray', lty='dotted')
902 #rpy.r.dev_off()
903
904 ### We will now go through each relationship type to partition plot points
905 ### into "bulk" and "outlier" groups. Bulk points will represent common
906 ### mean/sdev pairs and will cover the majority of the points in the plot --
907 ### they will use generic tooltip informtion about all of the pairs
908 ### represented by that point. "Outlier" points will be uncommon pairs,
909 ### with very specific information in their tooltips. It would be nice to
910 ### keep hte total number of plotted points in the SVG representation to
911 ### ~10000 (certainly less than 100000?)
912 pointMap = {}
913 orderedRels = [y[1] for y in reversed(sorted([(relCounts.get(x, 0),x) for x in REL_LOOKUP.keys()]))]
914 # do we really want this? I want out of zone points last and big
915 for relCode in orderedRels:
916 svgColor = SVG_COLORS[relCode]
917 relName, relColor, relStyle = REL_LOOKUP[relCode]
918 svg.write('<g id="%s" style="stroke:%s; fill:%s; fill-opacity:1.0; stroke-width:1;" cursor="pointer">\n' % (relName, svgColor, svgColor))
919 pMap = pointMap.setdefault(relCode, {})
920 nPoints = 0
921 rpairs=[]
922 rgenos=[]
923 rmeans=[]
924 rsdevs=[]
925 rz = []
926 for x,rel in enumerate(rels): # all pairs
927 if rel == relCode:
928 s1,s2 = pairs[x]
929 pid=(s1,s2)
930 zmean,zsd,r,zrad = pair_data[pid][:4]
931 rpairs.append(pairs[x])
932 rgenos.append(ngenoL[x])
933 rmeans.append(means[x])
934 rsdevs.append(sdevs[x])
935 rz.append(zrad)
936 ### Now add the svg point group for this relationship to the svg file
937 for x in range(len(rmeans)):
938 svgX = '%d' % ((rmeans[x] - 1.0) * PLOT_WIDTH) # changed so mean scale is 1-2
939 svgY = '%d' % (PLOT_HEIGHT - (rsdevs[x] * PLOT_HEIGHT)) # changed so sd scale is 0-1
940 s1, s2 = rpairs[x]
941 (fid1,uid1,did1,mid1,sex1,phe1,iid1,d_sid1,m_sid1) = scache[s1]
942 (fid2,uid2,did2,mid2,sex2,phe2,iid2,d_sid2,m_sid2) = scache[s2]
943 ngenos = rgenos[x]
944 nPoints += 1
945 point = pMap.setdefault((svgX, svgY), [])
946 point.append((rmeans[x], rsdevs[x], fid1, iid1, did1, mid1, fid2, iid2, did2, mid2, ngenos,rz[x]))
947 for (svgX, svgY) in pMap:
948 points = pMap[(svgX, svgY)]
949 svgX = int(svgX)
950 svgY = int(svgY)
951 if len(points) > 1:
952 mmean,dmean = calcMeanSD([p[0] for p in points])
953 msdev,dsdev = calcMeanSD([p[1] for p in points])
954 mgeno,dgeno = calcMeanSD([p[-1] for p in points])
955 mingeno = min([p[-1] for p in points])
956 maxgeno = max([p[-1] for p in points])
957 svg.write("""<circle cx="%d" cy="%d" r="2"
958 onmouseover="showBTT(evt, %d, %1.2f, %1.2f, %1.2f, %1.2f, %d, %d, %d, %d, %d)"
959 onmouseout="hideBTT(evt)" />\n""" % (svgX, svgY, relCode, mmean, dmean, msdev, dsdev, len(points), mgeno, dgeno, mingeno, maxgeno))
960 else:
961 mean, sdev, fid1, iid1, did1, mid1, fid2, iid2, did2, mid2, ngenos, zrad = points[0][:12]
962 rmean = float(relstats[relCode]['mean'][0])
963 rsdev = float(relstats[relCode]['sd'][0])
964 if zrad < 4:
965 zrad = 2
966 elif 4 < zrad < 9:
967 zrad = 3 # to 9
968 else: # > 9 5=15+
969 zrad=zrad/3
970 zrad = min(zrad,5) # scale limit
971 if zrad <= 3:
972 svg.write('<circle cx="%d" cy="%d" r="%s" onmouseover="showOTT(evt, %d, \'%s,%s,%s,%s\', \'%s,%s,%s,%s\', %1.2f, %1.2f, %s, %1.2f, %1.2f)" onmouseout="hideOTT(evt)" />\n' % (svgX, svgY, zrad, relCode, fid1, iid1, did1, mid1, fid2, iid2, did2, mid2, mean, sdev, ngenos, rmean, rsdev))
973 else: # highlight pairs a long way from expectation by outlining circle in red
974 svg.write("""<circle cx="%d" cy="%d" r="%s" style="stroke:red; fill:%s; fill-opacity:1.0; stroke-width:1;"
975 onmouseover="showOTT(evt, %d, \'%s,%s,%s,%s\', \'%s,%s,%s,%s\', %1.2f, %1.2f, %s, %1.2f, %1.2f)"
976 onmouseout="hideOTT(evt)" />\n""" % \
977 (svgX, svgY, zrad, svgColor, relCode, fid1, iid1, did1, mid1, fid2, iid2, did2, mid2, mean, sdev, ngenos, rmean, rsdev))
978 svg.write('</g>\n')
979
980 ### Create a pdf as well if indicated on the command line
981 ### WARNING! for framingham share, with about 50M pairs, this is a 5.5GB pdf!
982 ## if pdftoo:
983 ## pdfname = '%s.pdf' % (title)
984 ## rpy.r.pdf(pdfname, 6, 6)
985 ## rpy.r.par(mai=(1,1,1,0.5))
986 ## rpy.r('par(xaxs="i",yaxs="i")')
987 ## rpy.r.plot(means, sdevs, main='%s, %d snp' % (title, nSamples), ylab=Y_AXIS_LABEL, xlab=X_AXIS_LABEL, cex=cexs, col=pointColors, pch=pointStyles, xlim=(0,2), ylim=(0,2))
988 ## rpy.r.legend(LEGEND_ALIGN, legend=REL_STATES, pch=REL_POINTS, col=REL_COLORS, title=LEGEND_TITLE)
989 ## rpy.r.grid(nx=10, ny=10, col='lightgray', lty='dotted')
990 ## rpy.r.dev_off()
991
992 ### Draw polygons
993 if showPolygons:
994 svg.write('<g id="polygons" cursor="pointer">\n')
995 for rel, poly in POLYGONS.items():
996 points = ' '.join(['%s,%s' % ((p[0]-1.0)*float(PLOT_WIDTH), (PLOT_HEIGHT - p[1]*PLOT_HEIGHT)) for p in poly])
997 svg.write('<polygon points="%s" fill="transparent" style="stroke:%s; stroke-width:1"/>\n' % (points, SVG_COLORS[rel]))
998 svg.write('</g>\n')
999
1000
1001 svg.write(SVG_FOOTER)
1002 svg.close()
1003 return newfiles,explanations,repOut
1004
1005 def doIBS(n=100):
1006 """parse parameters from galaxy
1007 expect 'input pbed path' 'basename' 'outpath' 'title' 'logpath' 'n'
1008 <command interpreter="python">
1009 rgGRR.py $i.extra_files_path/$i.metadata.base_name "$i.metadata.base_name"
1010 '$out_file1' '$out_file1.files_path' "$title1" '$n' '$Z'
1011 </command>
1012
1013 """
1014 u="""<command interpreter="python">
1015 rgGRR.py $i.extra_files_path/$i.metadata.base_name "$i.metadata.base_name"
1016 '$out_file1' '$out_file1.files_path' "$title1" '$n' '$Z'
1017 </command>
1018 """
1019
1020
1021 if len(sys.argv) < 7:
1022 print >> sys.stdout, 'Need pbed inpath, basename, out_htmlname, outpath, title, logpath, nSNP, Zcutoff on command line please'
1023 print >> sys.stdout, u
1024 sys.exit(1)
1025 ts = '%s%s' % (string.punctuation,string.whitespace)
1026 ptran = string.maketrans(ts,'_'*len(ts))
1027 inpath = sys.argv[1]
1028 ldinpath = os.path.split(inpath)[0]
1029 basename = sys.argv[2]
1030 outhtml = sys.argv[3]
1031 newfilepath = sys.argv[4]
1032 title = sys.argv[5].translate(ptran)
1033 logfname = 'Log_%s.txt' % title
1034 logpath = os.path.join(newfilepath,logfname) # log was a child - make part of html extra_files_path zoo
1035 n = int(sys.argv[6])
1036 try:
1037 Zcutoff = float(sys.argv[7])
1038 except:
1039 Zcutoff = 2.0
1040 try:
1041 os.makedirs(newfilepath)
1042 except:
1043 pass
1044 logf = file(logpath,'w')
1045 efp,ibase_name = os.path.split(inpath) # need to use these for outputs in files_path
1046 ped = plinkbinJZ.BPed(inpath)
1047 ped.parse(quick=True)
1048 if ped == None:
1049 print >> sys.stderr, '## doIBSpy problem - cannot open %s or %s - cannot run' % (ldreduced,basename)
1050 sys.exit(1)
1051 newfiles,explanations,repOut = doIBSpy(ped=ped,basename=basename,outdir=newfilepath,
1052 logf=logf,nrsSamples=n,title=title,pdftoo=0,Zcutoff=Zcutoff)
1053 logf.close()
1054 logfs = file(logpath,'r').readlines()
1055 lf = file(outhtml,'w')
1056 lf.write(galhtmlprefix % PROGNAME)
1057 # this is a mess. todo clean up - should each datatype have it's own directory? Yes
1058 # probably. Then titles are universal - but userId libraries are separate.
1059 s = '<div>Output from %s run at %s<br>\n' % (PROGNAME,timenow())
1060 lf.write('<h4>%s</h4>\n' % s)
1061 fixed = ["'%s'" % x for x in sys.argv] # add quotes just in case
1062 s = 'If you need to rerun this analysis, the command line was\n<pre>%s</pre>\n</div>' % (' '.join(fixed))
1063 lf.write(s)
1064 # various ways of displaying svg - experiments related to missing svg mimetype on test (!)
1065 #s = """<object data="%s" type="image/svg+xml" width="%d" height="%d">
1066 # <embed src="%s" type="image/svg+xml" width="%d" height="%d" />
1067 # </object>""" % (newfiles[0],PLOT_WIDTH,PLOT_HEIGHT,newfiles[0],PLOT_WIDTH,PLOT_HEIGHT)
1068 s = """ <embed src="%s" type="image/svg+xml" width="%d" height="%d" />""" % (newfiles[0],PLOT_WIDTH,PLOT_HEIGHT)
1069 #s = """ <iframe src="%s" type="image/svg+xml" width="%d" height="%d" />""" % (newfiles[0],PLOT_WIDTH,PLOT_HEIGHT)
1070 lf.write(s)
1071 lf.write('<div><h4>Click the links below to save output files and plots</h4><br><ol>\n')
1072 for i in range(len(newfiles)):
1073 if i == 0:
1074 lf.write('<li><a href="%s" type="image/svg+xml" >%s</a></li>\n' % (newfiles[i],explanations[i]))
1075 else:
1076 lf.write('<li><a href="%s">%s</a></li>\n' % (newfiles[i],explanations[i]))
1077 flist = os.listdir(newfilepath)
1078 for fname in flist:
1079 if not fname in newfiles:
1080 lf.write('<li><a href="%s">%s</a></li>\n' % (fname,fname))
1081 lf.write('</ol></div>')
1082 lf.write('<div>%s</div>' % ('\n'.join(repOut))) # repOut is a list of tables
1083 lf.write('<div><hr><h3>Log from this job (also stored in %s)</h3><pre>%s</pre><hr></div>' % (logfname,''.join(logfs)))
1084 lf.write('</body></html>\n')
1085 lf.close()
1086 logf.close()
1087
1088 if __name__ == '__main__':
1089 doIBS()