sharplabtool (Mercurial repository, user xuebing)
tools/rgenetics/rgutils.py @ changeset 0:9071e359b9a3 ("Uploaded")

author:   xuebing
date:     Fri, 09 Mar 2012 19:37:19 -0500
parents:  (none)
children: (none)
# utilities for rgenetics
#
# copyright 2009 ross lazarus
# released under the LGPL
#

import subprocess, os, sys, time, tempfile, string, plinkbinJZ
import datetime

galhtmlprefix = """<?xml version="1.0" encoding="utf-8" ?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name="generator" content="Galaxy %s tool output - see http://g2.trac.bx.psu.edu/" />
<title></title>
<link rel="stylesheet" href="/static/style/base.css" type="text/css" />
</head>
<body>
<div class="document">
"""
galhtmlattr = """<h3><a href="http://rgenetics.org">Rgenetics</a> tool %s run at %s</h3>"""
galhtmlpostfix = """</div></body></html>\n"""

plinke = 'plink' # changed jan 2010 - all exes must be on path
rexe = 'R'       # to avoid cluster/platform dependencies
smartpca = 'smartpca.perl'

def timenow():
    """return current time as a string
    """
    return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time()))

def timestamp():
    return datetime.datetime.now().strftime('%Y%m%d%H%M%S')

def fail( message ):
    print >> sys.stderr, message
    return -1

def whereis(program):
    for path in os.environ.get('PATH', '').split(':'):
        if os.path.exists(os.path.join(path, program)) and \
           not os.path.isdir(os.path.join(path, program)):
            return os.path.join(path, program)
    return None

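# Illustrative only: a minimal sketch (not part of the original module) showing how
# whereis() can be used to confirm that the executables named above are on PATH
# before a tool run; the helper name _checkExes is an assumption, not Galaxy API.
def _checkExes(programs=(plinke, rexe)):
    """return a list of (program, resolved path or None) pairs"""
    return [(p, whereis(p)) for p in programs]
# example result: [('plink', '/usr/local/bin/plink'), ('R', None)] if R is missing
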

def bedToPicInterval(infile=None):
    """
    Picard tools requiring targets want
    a sam style header which incidentally, MUST be sorted in natural order - not lexicographic order:

    @SQ     SN:chrM LN:16571
    @SQ     SN:chr1 LN:247249719
    @SQ     SN:chr2 LN:242951149
    @SQ     SN:chr3 LN:199501827
    @SQ     SN:chr4 LN:191273063
    added to the start of what looks like a bed style file
    chr1    67052400    67052451    -    CCDS635.1_cds_0_0_chr1_67052401_r
    chr1    67060631    67060788    -    CCDS635.1_cds_1_0_chr1_67060632_r
    chr1    67065090    67065317    -    CCDS635.1_cds_2_0_chr1_67065091_r
    chr1    67066082    67066181    -    CCDS635.1_cds_3_0_chr1_67066083_r


    see http://genome.ucsc.edu/FAQ/FAQtracks.html#tracks1
    we need to add 1 to start coordinates on the way through - but length calculations are easier
    """
    # bedToPicard.py
    # ross lazarus October 2010
    # LGPL
    # for Rgenetics

    def getFlen(bedfname=None):
        """
        find all features in a BED file and sum their lengths
        """
        features = {}
        try:
            infile = open(bedfname,'r')
        except:
            print '###ERROR: getFlen unable to open bedfile %s' % bedfname
            sys.exit(1)
        for i,row in enumerate(infile):
            if row[0] == '@': # shouldn't happen given a bed file!
                print 'row %d=%s - should NOT start with @!' % (i,row)
                sys.exit(1)
            row = row.strip()
            if len(row) > 0:
                srow = row.split('\t')
                f = srow[0]
                spos = srow[1] # zero based from UCSC so no need to add 1 - eg 0-100 is 100 bases numbered 0-99 (!)
                epos = srow[2] # see http://genome.ucsc.edu/FAQ/FAQtracks.html#tracks1
                flen = int(epos) - int(spos)
                features.setdefault(f,0)
                features[f] += flen
        infile.close()
        return features

    def keynat(string):
        '''
        borrowed from http://code.activestate.com/recipes/285264-natural-string-sorting/
        A natural sort helper function for sort() and sorted()
        without using regular expressions or exceptions.

        >>> items = ('Z', 'a', '10th', '1st', '9')
        >>> sorted(items)
        ['10th', '1st', '9', 'Z', 'a']
        >>> sorted(items, key=keynat)
        ['1st', '9', '10th', 'a', 'Z']
        '''
        it = type(1)
        r = []
        for c in string:
            if c.isdigit():
                d = int(c)
                if r and type( r[-1] ) == it:
                    r[-1] = r[-1] * 10 + d
                else:
                    r.append(d)
            else:
                r.append(c.lower())
        return r

    def writePic(outfname=None,bedfname=None):
        """
        collect header info and rewrite bed with header for picard
        """
        featlen = getFlen(bedfname=bedfname)
        try:
            outf = open(outfname,'w')
        except:
            print '###ERROR: writePic unable to open output picard file %s' % outfname
            sys.exit(1)
        infile = open(bedfname,'r') # already tested in getFlen
        k = featlen.keys()
        fk = sorted(k, key=keynat)
        header = ['@SQ\tSN:%s\tLN:%d' % (x,featlen[x]) for x in fk]
        outf.write('\n'.join(header))
        outf.write('\n')
        for row in infile:
            row = row.strip()
            if len(row) > 0: # convert zero based start coordinate to 1 based
                srow = row.split('\t')
                srow[1] = '%d' % (int(srow[1])+1) # see http://genome.ucsc.edu/FAQ/FAQtracks.html#tracks1
                outf.write('\t'.join(srow))
                outf.write('\n')
        outf.close()
        infile.close()


    # bedToPicInterval starts here
    fd,outf = tempfile.mkstemp(prefix='rgPicardHsMetrics')
    os.close(fd) # close the low level handle - writePic reopens the path by name
    writePic(outfname=outf,bedfname=infile)
    return outf

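# NOTE (editorial comment, not in the original source): this single-argument
# bedToPicInterval, which writes to a tempfile and returns its path, is shadowed
# by the two-argument bedToPicInterval(infile, outfile) defined further down in
# this module, so callers of rgutils.bedToPicInterval get the later definition.
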

def getFileString(fpath, outpath):
    """
    format a nice file size string
    """
    size = ''
    fp = os.path.join(outpath, fpath)
    s = '? ?'
    if os.path.isfile(fp):
        n = float(os.path.getsize(fp))
        if n > 2**20:
            size = ' (%1.1f MB)' % (n/2**20)
        elif n > 2**10:
            size = ' (%1.1f KB)' % (n/2**10)
        elif n > 0:
            size = ' (%d B)' % (int(n))
        s = '%s %s' % (fpath, size)
    return s

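# Illustrative only: a small sketch (not part of the original module) of how
# getFileString is typically used when labelling output files for an HTML
# report; the helper name _exampleFileString is hypothetical.
def _exampleFileString():
    """hypothetical demo: build the size label for this module's own file"""
    here = os.path.dirname(os.path.abspath(__file__))
    return getFileString(os.path.basename(__file__), here)
    # -> e.g. "rgutils.py  (18.2 KB)" when the file exists, otherwise "? ?"
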

def fixPicardOutputs(tempout=None,output_dir=None,log_file=None,html_output=None,progname=None,cl=[],transpose=True):
    """
    picard produces long hard to read tab header files
    make them available but present them transposed for readability
    """
    rstyle="""<style type="text/css">
    tr.d0 td {background-color: oldlace; color: black;}
    tr.d1 td {background-color: aliceblue; color: black;}
    </style>"""
    cruft = []
    dat = []
    try:
        r = open(tempout,'r').readlines()
    except:
        r = []
    for row in r:
        if row.strip() > '':
            srow = row.split('\t')
            if row[0] == '#':
                cruft.append(row.strip()) # want strings
            else:
                dat.append(srow) # want lists

    res = [rstyle,]
    res.append(galhtmlprefix % progname)
    res.append(galhtmlattr % (progname,timenow()))
    flist = os.listdir(output_dir)
    pdflist = [x for x in flist if os.path.splitext(x)[-1].lower() == '.pdf']
    if len(pdflist) > 0: # assumes all pdfs come with thumbnail .jpgs
        for p in pdflist:
            imghref = '%s.jpg' % os.path.splitext(p)[0] # removes .pdf
            res.append('<table cellpadding="10"><tr><td>\n')
            res.append('<a href="%s"><img src="%s" alt="Click thumbnail to download %s" hspace="10" align="middle"></a>\n' % (p,imghref,p))
            res.append('</td></tr></table>\n')
    res.append('<b>Your job produced the following output files.</b><hr/>\n')
    res.append('<table>\n')
    for i,f in enumerate(flist):
        fn = os.path.split(f)[-1]
        res.append('<tr><td><a href="%s">%s</a></td></tr>\n' % (fn,fn))
    res.append('</table><p/>\n')
    if len(cruft) + len(dat) > 0:
        res.append('<b>Picard on line resources</b><ul>\n')
        res.append('<li><a href="http://picard.sourceforge.net/index.shtml">Click here for Picard Documentation</a></li>\n')
        res.append('<li><a href="http://picard.sourceforge.net/picard-metric-definitions.shtml">Click here for Picard Metrics definitions</a></li></ul><hr/>\n')
        if transpose:
            res.append('<b>Picard output (transposed for readability)</b><hr/>\n')
        else:
            res.append('<b>Picard output</b><hr/>\n')
        res.append('<table cellpadding="3" >\n')
        if len(cruft) > 0:
            cres = ['<tr class="d%d"><td>%s</td></tr>' % (i % 2,x) for i,x in enumerate(cruft)]
            res += cres
        if len(dat) > 0:
            maxrows = 100
            if transpose:
                tdat = map(None,*dat) # transpose an arbitrary list of lists
                missing = len(tdat) - maxrows
                tdat = ['<tr class="d%d"><td>%s</td><td>%s</td></tr>\n' % ((i+len(cruft)) % 2,x[0],x[1]) for i,x in enumerate(tdat) if i < maxrows]
                if missing > 0: # tdat was just truncated to maxrows, so test the count saved above
                    tdat.append('<tr><td colspan="2">...WARNING: %d rows deleted for sanity...see raw files for all rows</td></tr>' % missing)
            else:
                tdat = ['<tr class="d%d"><td>%s</td></tr>\n' % ((i+len(cruft)) % 2,x) for i,x in enumerate(dat) if i < maxrows]
                if len(dat) > maxrows:
                    missing = len(dat) - maxrows
                    tdat.append('<tr><td>...WARNING: %d rows deleted for sanity...see raw files for all rows</td></tr>' % missing)
            res += tdat
        res.append('</table>\n')
    else:
        res.append('<b>No Picard output found - please consult the Picard log above for an explanation</b>')
    l = open(log_file,'r').readlines()
    if len(l) > 0:
        res.append('<b>Picard log</b><hr/>\n')
        rlog = ['<pre>',]
        rlog += l
        rlog.append('</pre>')
        res += rlog
    else:
        res.append("Odd, Picard left no log file %s - must have really barfed badly?" % log_file)
    res.append('<hr/>The freely available <a href="http://picard.sourceforge.net/command-line-overview.shtml">Picard software</a> \n')
    res.append( 'generated all outputs reported here, using this command line:<br/>\n<pre>%s</pre>\n' % ' '.join(cl)) # join with spaces so the reported command line is readable
    res.append(galhtmlpostfix)
    outf = open(html_output,'w')
    outf.write(''.join(res))
    outf.write('\n')
    outf.close()

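# Illustrative only: a hedged sketch (not from the original module) of how a
# Galaxy Picard wrapper might hand its raw metrics file to fixPicardOutputs;
# every path and the command-line list below are made-up placeholder values.
def _exampleFixPicard():
    """hypothetical demo: render a Picard metrics file as a transposed HTML report"""
    cl = ['java', '-jar', 'CollectInsertSizeMetrics.jar', 'I=in.bam', 'O=metrics.txt']
    fixPicardOutputs(tempout='metrics.txt', output_dir='picard_out',
                     log_file='picard_out/picard.log', html_output='picard_out/report.html',
                     progname='rgPicardExample', cl=cl, transpose=True)
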
def keynat(string):
    '''
    borrowed from http://code.activestate.com/recipes/285264-natural-string-sorting/
    A natural sort helper function for sort() and sorted()
    without using regular expressions or exceptions.

    >>> items = ('Z', 'a', '10th', '1st', '9')
    >>> sorted(items)
    ['10th', '1st', '9', 'Z', 'a']
    >>> sorted(items, key=keynat)
    ['1st', '9', '10th', 'a', 'Z']
    '''
    it = type(1)
    r = []
    for c in string:
        if c.isdigit():
            d = int(c)
            if r and type( r[-1] ) == it:
                r[-1] = r[-1] * 10 + d
            else:
                r.append(d)
        else:
            r.append(c.lower())
    return r

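# Illustrative only: a minimal sketch (not in the original source) of why keynat
# matters for SAM/Picard headers - contig names sort in natural order, so chr2
# comes before chr10; keynat only lowercases the sort key, the names returned by
# sorted() are unchanged. The helper name _exampleKeynat is hypothetical.
def _exampleKeynat():
    """hypothetical demo of natural-order chromosome sorting"""
    return sorted(['chr10', 'chr2', 'chr1', 'chrM'], key=keynat)
    # -> ['chr1', 'chr2', 'chr10', 'chrM'] rather than ['chr1', 'chr10', 'chr2', 'chrM']
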
def getFlen(bedfname=None):
    """
    find all features in a BED file and sum their lengths
    """
    features = {}
    otherHeaders = []
    try:
        infile = open(bedfname,'r')
    except:
        print '###ERROR: getFlen unable to open bedfile %s' % bedfname
        sys.exit(1)
    for i,row in enumerate(infile):
        if row.startswith('@'): # add to headers if not @SQ
            if not row.startswith('@SQ'):
                otherHeaders.append(row)
        else:
            row = row.strip()
            if row.startswith('#') or row.lower().startswith('browser') or row.lower().startswith('track'):
                continue # ignore headers
            srow = row.split('\t')
            if len(srow) > 3:
                f = srow[0]
                spos = srow[1] # zero based from UCSC so no need to add 1 - eg 0-100 is 100 bases numbered 0-99 (!)
                epos = srow[2] # see http://genome.ucsc.edu/FAQ/FAQtracks.html#tracks1
                flen = int(epos) - int(spos)
                features.setdefault(f,0)
                features[f] += flen
    infile.close()
    fk = features.keys()
    fk = sorted(fk, key=keynat)
    return features,fk,otherHeaders

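# Illustrative only: a hedged sketch (not in the original module) that unpacks
# getFlen's return value; 'targets.bed' and _exampleGetFlen are made-up names.
def _exampleGetFlen(bedfname='targets.bed'):
    """hypothetical demo: inspect the (features, fk, otherHeaders) result"""
    features, fk, otherHeaders = getFlen(bedfname=bedfname)
    # features maps each contig to its summed interval length, e.g. {'chr1': 380}
    # fk is the contig list in natural (keynat) order
    # otherHeaders holds any non-@SQ '@' header lines passed through from the input
    return features, fk, otherHeaders
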
def bedToPicInterval(infile=None,outfile=None):
    """
    Picard tools requiring targets want
    a sam style header which incidentally, MUST be sorted in natural order - not lexicographic order:

    @SQ     SN:chrM LN:16571
    @SQ     SN:chr1 LN:247249719
    @SQ     SN:chr2 LN:242951149
    @SQ     SN:chr3 LN:199501827
    @SQ     SN:chr4 LN:191273063
    added to the start of what looks like a bed style file
    chr1    67052400    67052451    -    CCDS635.1_cds_0_0_chr1_67052401_r
    chr1    67060631    67060788    -    CCDS635.1_cds_1_0_chr1_67060632_r
    chr1    67065090    67065317    -    CCDS635.1_cds_2_0_chr1_67065091_r
    chr1    67066082    67066181    -    CCDS635.1_cds_3_0_chr1_67066083_r

    see http://genome.ucsc.edu/FAQ/FAQtracks.html#tracks1
    we need to add 1 to start coordinates on the way through - but length calculations are easier
    """
    # bedToPicard.py
    # ross lazarus October 2010
    # LGPL
    # for Rgenetics
    # collect header info and rewrite bed with header for picard
    featlen,fk,otherHeaders = getFlen(bedfname=infile)
    try:
        outf = open(outfile,'w')
    except:
        print '###ERROR: writePic unable to open output picard file %s' % outfile
        sys.exit(1)
    inf = open(infile,'r') # already tested in getFlen
    header = ['@SQ\tSN:%s\tLN:%d' % (x,featlen[x]) for x in fk]
    if len(otherHeaders) > 0:
        header += otherHeaders
    outf.write('\n'.join(header))
    outf.write('\n')
    for row in inf:
        row = row.strip()
        if len(row) > 0: # convert zero based start coordinate to 1 based
            if row.startswith('@'):
                continue
            else:
                srow = row.split('\t')
                srow[1] = '%d' % (int(srow[1])+1) # see http://genome.ucsc.edu/FAQ/FAQtracks.html#tracks1
                outf.write('\t'.join(srow))
                outf.write('\n')
    outf.close()
    inf.close()

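# Illustrative only: a hedged sketch (not part of the original module) of the
# usual call made when preparing Picard interval lists; both file names are
# made-up placeholders.
def _exampleBedToPic():
    """hypothetical demo: convert a BED target file to a Picard interval list"""
    bedToPicInterval(infile='targets.bed', outfile='targets.picard.interval_list')
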

def oRRun(rcmd=[],outdir=None,title='myR',rexe='R'):
    """
    run an r script, lines in rcmd,
    in a temporary directory
    move everything, r script and all back to outdir which will be an html file


    # test
    RRun(rcmd=['print("hello cruel world")','q()'],title='test')
    """
    rlog = []
    print '### rexe = %s' % rexe
    assert os.path.isfile(rexe)
    rname = '%s.R' % title
    stoname = '%s.R.log' % title
    rfname = rname
    stofname = stoname
    rmoutdir = False # only remove outdir if we made a temporary one below
    if outdir: # want a specific path
        rfname = os.path.join(outdir,rname)
        stofname = os.path.join(outdir,stoname)
        try:
            os.makedirs(outdir) # might not be there yet...
        except:
            pass
    else:
        outdir = tempfile.mkdtemp(prefix=title)
        rfname = os.path.join(outdir,rname)
        stofname = os.path.join(outdir,stoname)
        rmoutdir = True
    f = open(rfname,'w')
    if type(rcmd) == type([]):
        f.write('\n'.join(rcmd))
    else: # string
        f.write(rcmd)
    f.write('\n')
    f.close()
    sto = file(stofname,'w')
    vcl = [rexe,"--vanilla --slave", '<', rfname ]
    x = subprocess.Popen(' '.join(vcl),shell=True,stderr=sto,stdout=sto,cwd=outdir)
    retval = x.wait()
    sto.close()
    rlog = file(stofname,'r').readlines()
    rlog.insert(0,'## found R at %s' % rexe)
    if outdir != None:
        flist = os.listdir(outdir)
    else:
        flist = os.listdir('.')
    flist.sort() # the bare flist.sort attribute reference in the original never sorted
    flist = [(x,x) for x in flist]
    for i,x in enumerate(flist):
        if x[0] == rname: # x is a (name,label) tuple, so compare its first element
            flist[i] = (x[0],'R script for %s' % title)
        elif x[0] == stoname:
            flist[i] = (x[0],'R log for %s' % title)
    if False and rmoutdir:
        os.removedirs(outdir)
    return rlog,flist # for html layout



def RRun(rcmd=[],outdir=None,title='myR',tidy=True):
    """
    run an r script, lines in rcmd,
    in a temporary directory
    move everything, r script and all back to outdir which will be an html file


    # test
    RRun(rcmd=['print("hello cruel world")','q()'],title='test')
    echo "a <- c(5, 5); b <- c(0.5, 0.5)" | cat - RScript.R | R --slave --vanilla
    suggested by http://tolstoy.newcastle.edu.au/R/devel/05/09/2448.html
    """
    killme = string.punctuation + string.whitespace
    trantab = string.maketrans(killme,'_'*len(killme))
    title = title.translate(trantab)
    rlog = []
    tempout = False
    rname = '%s.R' % title
    stoname = '%s.R.log' % title
    cwd = os.getcwd()
    if outdir: # want a specific path
        try:
            os.makedirs(outdir) # might not be there yet...
        except:
            pass
        os.chdir(outdir)
    if type(rcmd) == type([]):
        script = '\n'.join(rcmd)
    else: # string
        script = rcmd
    sto = file(stoname,'w')
    rscript = file(rname,'w')
    rscript.write(script)
    rscript.write('\n#R script autogenerated by rgenetics/rgutils.py on %s\n' % timenow())
    rscript.close()
    vcl = '%s --slave --vanilla < %s' % (rexe,rname)
    if outdir:
        x = subprocess.Popen(vcl,shell=True,stderr=sto,stdout=sto,cwd=outdir)
    else:
        x = subprocess.Popen(vcl,shell=True,stderr=sto,stdout=sto)
    retval = x.wait()
    sto.close()
    rlog = file(stoname,'r').readlines()
    if retval != 0:
        rlog.insert(0,'Nonzero exit code = %d' % retval) # indicate failure
    if outdir:
        flist = os.listdir(outdir)
    else:
        flist = os.listdir(os.getcwd())
    flist.sort() # the bare flist.sort attribute reference in the original never sorted
    flist = [(x,x) for x in flist]
    for i,x in enumerate(flist):
        if x[0] == rname: # x is a (name,label) tuple, so compare its first element
            flist[i] = (x[0],'R script for %s' % title)
        elif x[0] == stoname:
            flist[i] = (x[0],'R log for %s' % title)
    if outdir:
        os.chdir(cwd)
    return rlog,flist # for html layout

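# Illustrative only: a hedged sketch (not in the original module) showing the
# RRun call pattern its own docstring describes; 'rplots' is a made-up outdir
# and _exampleRRun is a hypothetical helper name.
def _exampleRRun():
    """hypothetical demo: run a two-line R script and collect its log and files"""
    rlog, flist = RRun(rcmd=['print("hello cruel world")', 'q()'],
                       outdir='rplots', title='rrun_demo')
    return rlog, flist
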
def runPlink(bfn='bar',ofn='foo',logf=None,plinktasks=[],cd='./',vclbase = []):
    """run a series of plink tasks and append log results to stdout
    vcl has a list of parameters for the spawnv
    common settings can all go in the vclbase list and are added to each plinktask
    """
    # root for all
    fplog,plog = tempfile.mkstemp()
    os.close(fplog) # close the low level handle - plog is reopened by name below
    if type(logf) == type(' '): # open otherwise assume is file - ugh I'm in a hurry
        mylog = file(logf,'a+')
    else:
        mylog = logf
    mylog.write('## Rgenetics: http://rgenetics.org Galaxy Tools rgQC.py Plink runner\n')
    for task in plinktasks: # each is a list
        vcl = vclbase + task
        sto = file(plog,'w')
        x = subprocess.Popen(' '.join(vcl),shell=True,stdout=sto,stderr=sto,cwd=cd)
        retval = x.wait()
        sto.close()
        try:
            lplog = file(plog,'r').read()
            mylog.write(lplog)
            os.unlink(plog) # no longer needed
        except:
            mylog.write('### %s Strange - no std out from plink when running command line\n%s' % (timenow(),' '.join(vcl)))

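# Illustrative only: a hedged sketch (not in the original source) of driving
# runPlink with two tasks sharing a common vclbase; all file roots below are
# placeholders and _exampleRunPlink is a hypothetical name.
def _exampleRunPlink():
    """hypothetical demo: run plink --missing and --freq on the same binary fileset"""
    vclbase = [plinke, '--noweb', '--bfile', 'mydata', '--out', 'mydata_qc']
    plinktasks = [['--missing'], ['--freq']]
    runPlink(logf='mydata_qc.log', plinktasks=plinktasks, cd='./', vclbase=vclbase)
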
def pruneLD(plinktasks=[],cd='./',vclbase = []):
    """
    plink blathers when doing pruning - ignore
    Linkage disequilibrium based SNP pruning
    if a million snps in 3 billion base pairs, have mean 3k spacing
    assume 40-60k of ld in ceu, a window of 120k width is about 40 snps
    so lots more is perhaps less efficient - each window computational cost is
    O(N^2) unless the code is smart enough to avoid unnecessary computation where
    allele frequencies make it impossible to see ld > the r^2 cutoff threshold
    So, do a window and move forward 20?
    The fine Plink docs at http://pngu.mgh.harvard.edu/~purcell/plink/summary.shtml#prune
    reproduced below

    Sometimes it is useful to generate a pruned subset of SNPs that are in approximate linkage equilibrium with each other. This can be achieved via two commands:
    --indep which prunes based on the variance inflation factor (VIF), which recursively removes SNPs within a sliding window; second, --indep-pairwise which is
    similar, except it is based only on pairwise genotypic correlation.

    Hint The output of either of these commands is two lists of SNPs: those that are pruned out and those that are not. A separate command using the --extract or
    --exclude option is necessary to actually perform the pruning.

    The VIF pruning routine is performed:
    plink --file data --indep 50 5 2

    will create files

    plink.prune.in
    plink.prune.out

    Each is a simple list of SNP IDs; both these files can subsequently be specified as the argument for
    a --extract or --exclude command.

    The parameters for --indep are: window size in SNPs (e.g. 50), the number of SNPs to shift the
    window at each step (e.g. 5), the VIF threshold. The VIF is 1/(1-R^2) where R^2 is the multiple correlation coefficient for a SNP being regressed on all other
    SNPs simultaneously. That is, this considers the correlations between SNPs but also between linear combinations of SNPs. A VIF of 10 is often taken to represent
    near collinearity problems in standard multiple regression analyses (i.e. implies R^2 of 0.9). A VIF of 1 would imply that the SNP is completely independent of
    all other SNPs. Practically, values between 1.5 and 2 should probably be used; particularly in small samples, if this threshold is too low and/or the window
    size is too large, too many SNPs may be removed.

    The second procedure is performed:
    plink --file data --indep-pairwise 50 5 0.5

    This generates the same output files as the first version; the only difference is that a
    simple pairwise threshold is used. The first two parameters (50 and 5) are the same as above (window size and step); the third parameter represents the r^2
    threshold. Note: this represents the pairwise SNP-SNP metric now, not the multiple correlation coefficient; also note, this is based on the genotypic
    correlation, i.e. it does not involve phasing.

    To give a concrete example: the command above that specifies 50 5 0.5 would a) consider a
    window of 50 SNPs, b) calculate LD between each pair of SNPs in the window, c) remove one of a pair of SNPs if the LD is greater than 0.5, d) shift the window 5
    SNPs forward and repeat the procedure.

    To make a new, pruned file, then use something like (in this example, we also convert the
    standard PED fileset to a binary one):
    plink --file data --extract plink.prune.in --make-bed --out pruneddata
    """
    fplog,plog = tempfile.mkstemp()
    os.close(fplog) # close the low level handle - plog is reopened by name below
    alog = []
    alog.append('## Rgenetics: http://rgenetics.org Galaxy Tools rgQC.py Plink pruneLD runner\n')
    for task in plinktasks: # each is a list
        vcl = vclbase + task
        sto = file(plog,'w')
        x = subprocess.Popen(' '.join(vcl),shell=True,stdout=sto,stderr=sto,cwd=cd)
        retval = x.wait()
        sto.close()
        try:
            lplog = file(plog,'r').readlines()
            lplog = [x for x in lplog if x.find('Pruning SNP') == -1]
            alog += lplog
            alog.append('\n')
            os.unlink(plog) # no longer needed
        except:
            alog.append('### %s Strange - no std out from plink when running command line\n%s\n' % (timenow(),' '.join(vcl)))
    return alog

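# Illustrative only: a hedged sketch (not in the original module) of an LD
# pruning call built from the plink documentation quoted in the docstring;
# 'mydata' and 'pruned' are placeholder file roots and _examplePruneLD is a
# hypothetical name.
def _examplePruneLD():
    """hypothetical demo: pairwise r^2 pruning with a 50 SNP window, step 5, r^2 0.5"""
    vclbase = [plinke, '--noweb', '--file', 'mydata']
    plinktasks = [['--indep-pairwise', '50', '5', '0.5', '--out', 'pruned']]
    return pruneLD(plinktasks=plinktasks, cd='./', vclbase=vclbase)
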
def readMap(mapfile=None,allmarkers=False,rsdict={},c=None,spos=None,epos=None):
    """abstract out - keeps reappearing
    """
    mfile = open(mapfile, 'r')
    markers = []
    snpcols = {}
    snpIndex = 0 # in case empty or comment lines
    for rownum,row in enumerate(mfile):
        line = row.strip()
        if not line or line[0]=='#': continue
        chrom, snp, genpos, abspos = line.split()[:4] # just in case more cols
        try:
            abspos = int(abspos)
        except:
            abspos = 0 # stupid framingham data grumble grumble
        if allmarkers or rsdict.get(snp,None) or (chrom == c and (spos <= abspos <= epos)):
            markers.append((chrom,abspos,snp)) # decorate for sort into genomic
            snpcols[snp] = snpIndex # so we know which col to find genos for this marker
        snpIndex += 1
    markers.sort()
    rslist = [x[2] for x in markers] # drop decoration
    rsdict = dict(zip(rslist,rslist))
    mfile.close()
    return markers,snpcols,rslist,rsdict

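# Illustrative only: a hedged sketch (not part of the original source) of the
# two common readMap call patterns; 'mydata.map' is a made-up file name and
# _exampleReadMap is a hypothetical helper.
def _exampleReadMap():
    """hypothetical demo: read all markers, then only chr22:15-20Mb markers"""
    allm = readMap(mapfile='mydata.map', allmarkers=True)
    region = readMap(mapfile='mydata.map', allmarkers=False,
                     c='22', spos=15000000, epos=20000000)
    # each call returns (markers, snpcols, rslist, rsdict)
    return allm, region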
618 |