0
|
1 # utilities for rgenetics
|
|
2 #
|
|
3 # copyright 2009 ross lazarus
|
|
4 # released under the LGPL
|
|
5 #
|
|
6
|
|
7 import subprocess, os, sys, time, tempfile,string,plinkbinJZ
|
|
8 import datetime
|
|
9
|
|
10 galhtmlprefix = """<?xml version="1.0" encoding="utf-8" ?>
|
|
11 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
|
12 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
|
|
13 <head>
|
|
14 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
|
|
15 <meta name="generator" content="Galaxy %s tool output - see http://g2.trac.bx.psu.edu/" />
|
|
16 <title></title>
|
|
17 <link rel="stylesheet" href="/static/style/base.css" type="text/css" />
|
|
18 </head>
|
|
19 <body>
|
|
20 <div class="document">
|
|
21 """
|
|
22 galhtmlattr = """<h3><a href="http://rgenetics.org">Rgenetics</a> tool %s run at %s</h3>"""
|
|
23 galhtmlpostfix = """</div></body></html>\n"""
|
|
24
|
|
25 plinke = 'plink' # changed jan 2010 - all exes must be on path
|
|
26 rexe = 'R' # to avoid cluster/platform dependencies
|
|
27 smartpca = 'smartpca.perl'
|
|
28
|
|
29 def timenow():
|
|
30 """return current time as a string
|
|
31 """
|
|
32 return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time()))
|
|
33
|
|
34 def timestamp():
|
|
35 return datetime.datetime.now().strftime('%Y%m%d%H%M%S')
|
|
36
|
|
37 def fail( message ):
|
|
38 print >> sys.stderr, message
|
|
39 return -1
|
|
40
|
|
41 def whereis(program):
|
|
42 for path in os.environ.get('PATH', '').split(':'):
|
|
43 if os.path.exists(os.path.join(path, program)) and \
|
|
44 not os.path.isdir(os.path.join(path, program)):
|
|
45 return os.path.join(path, program)
|
|
46 return None
|
|
47
|
|
48
|
|
49 def bedToPicInterval(infile=None):
|
|
50 """
|
|
51 Picard tools requiring targets want
|
|
52 a sam style header which incidentally, MUST be sorted in natural order - not lexicographic order:
|
|
53
|
|
54 @SQ SN:chrM LN:16571
|
|
55 @SQ SN:chr1 LN:247249719
|
|
56 @SQ SN:chr2 LN:242951149
|
|
57 @SQ SN:chr3 LN:199501827
|
|
58 @SQ SN:chr4 LN:191273063
|
|
59 added to the start of what looks like a bed style file
|
|
60 chr1 67052400 67052451 - CCDS635.1_cds_0_0_chr1_67052401_r
|
|
61 chr1 67060631 67060788 - CCDS635.1_cds_1_0_chr1_67060632_r
|
|
62 chr1 67065090 67065317 - CCDS635.1_cds_2_0_chr1_67065091_r
|
|
63 chr1 67066082 67066181 - CCDS635.1_cds_3_0_chr1_67066083_r
|
|
64
|
|
65
|
|
66 see http://genome.ucsc.edu/FAQ/FAQtracks.html#tracks1
|
|
67 we need to add 1 to start coordinates on the way through - but length calculations are easier
|
|
68 """
|
|
69 # bedToPicard.py
|
|
70 # ross lazarus October 2010
|
|
71 # LGPL
|
|
72 # for Rgenetics
|
|
73
|
|
74 def getFlen(bedfname=None):
|
|
75 """
|
|
76 find all features in a BED file and sum their lengths
|
|
77 """
|
|
78 features = {}
|
|
79 try:
|
|
80 infile = open(bedfname,'r')
|
|
81 except:
|
|
82 print '###ERROR: getFlen unable to open bedfile %s' % bedfname
|
|
83 sys.exit(1)
|
|
84 for i,row in enumerate(infile):
|
|
85 if row[0] == '@': # shouldn't happen given a bed file!
|
|
86 print 'row %d=%s - should NOT start with @!' % (i,row)
|
|
87 sys.exit(1)
|
|
88 row = row.strip()
|
|
89 if len(row) > 0:
|
|
90 srow = row.split('\t')
|
|
91 f = srow[0]
|
|
92 spos = srow[1] # zero based from UCSC so no need to add 1 - eg 0-100 is 100 bases numbered 0-99 (!)
|
|
93 epos = srow[2] # see http://genome.ucsc.edu/FAQ/FAQtracks.html#tracks1
|
|
94 flen = int(epos) - int(spos)
|
|
95 features.setdefault(f,0)
|
|
96 features[f] += flen
|
|
97 infile.close()
|
|
98 return features
|
|
99
|
|
100 def keynat(string):
|
|
101 '''
|
|
102 borrowed from http://code.activestate.com/recipes/285264-natural-string-sorting/
|
|
103 A natural sort helper function for sort() and sorted()
|
|
104 without using regular expressions or exceptions.
|
|
105
|
|
106 >>> items = ('Z', 'a', '10th', '1st', '9')
|
|
107 >>> sorted(items)
|
|
108 ['10th', '1st', '9', 'Z', 'a']
|
|
109 >>> sorted(items, key=keynat)
|
|
110 ['1st', '9', '10th', 'a', 'Z']
|
|
111 '''
|
|
112 it = type(1)
|
|
113 r = []
|
|
114 for c in string:
|
|
115 if c.isdigit():
|
|
116 d = int(c)
|
|
117 if r and type( r[-1] ) == it:
|
|
118 r[-1] = r[-1] * 10 + d
|
|
119 else:
|
|
120 r.append(d)
|
|
121 else:
|
|
122 r.append(c.lower())
|
|
123 return r
|
|
124
|
|
125 def writePic(outfname=None,bedfname=None):
|
|
126 """
|
|
127 collect header info and rewrite bed with header for picard
|
|
128 """
|
|
129 featlen = getFlen(bedfname=bedfname)
|
|
130 try:
|
|
131 outf = open(outfname,'w')
|
|
132 except:
|
|
133 print '###ERROR: writePic unable to open output picard file %s' % outfname
|
|
134 sys.exit(1)
|
|
135 infile = open(bedfname,'r') # already tested in getFlen
|
|
136 k = featlen.keys()
|
|
137 fk = sorted(k, key=keynat)
|
|
138 header = ['@SQ\tSN:%s\tLN:%d' % (x,featlen[x]) for x in fk]
|
|
139 outf.write('\n'.join(header))
|
|
140 outf.write('\n')
|
|
141 for row in infile:
|
|
142 row = row.strip()
|
|
143 if len(row) > 0: # convert zero based start coordinate to 1 based
|
|
144 srow = row.split('\t')
|
|
145 srow[1] = '%d' % (int(srow[1])+1) # see http://genome.ucsc.edu/FAQ/FAQtracks.html#tracks1
|
|
146 outf.write('\t'.join(srow))
|
|
147 outf.write('\n')
|
|
148 outf.close()
|
|
149 infile.close()
|
|
150
|
|
151
|
|
152
|
|
153 # bedToPicInterval starts here
|
|
154 fd,outf = tempfile.mkstemp(prefix='rgPicardHsMetrics')
|
|
155 writePic(outfname=outf,bedfname=infile)
|
|
156 return outf
|
|
157
|
|
158
|
|
159 def getFileString(fpath, outpath):
|
|
160 """
|
|
161 format a nice file size string
|
|
162 """
|
|
163 size = ''
|
|
164 fp = os.path.join(outpath, fpath)
|
|
165 s = '? ?'
|
|
166 if os.path.isfile(fp):
|
|
167 n = float(os.path.getsize(fp))
|
|
168 if n > 2**20:
|
|
169 size = ' (%1.1f MB)' % (n/2**20)
|
|
170 elif n > 2**10:
|
|
171 size = ' (%1.1f KB)' % (n/2**10)
|
|
172 elif n > 0:
|
|
173 size = ' (%d B)' % (int(n))
|
|
174 s = '%s %s' % (fpath, size)
|
|
175 return s
|
|
176
|
|
177
|
|
178 def fixPicardOutputs(tempout=None,output_dir=None,log_file=None,html_output=None,progname=None,cl=[],transpose=True):
|
|
179 """
|
|
180 picard produces long hard to read tab header files
|
|
181 make them available but present them transposed for readability
|
|
182 """
|
|
183 rstyle="""<style type="text/css">
|
|
184 tr.d0 td {background-color: oldlace; color: black;}
|
|
185 tr.d1 td {background-color: aliceblue; color: black;}
|
|
186 </style>"""
|
|
187 cruft = []
|
|
188 dat = []
|
|
189 try:
|
|
190 r = open(tempout,'r').readlines()
|
|
191 except:
|
|
192 r = []
|
|
193 for row in r:
|
|
194 if row.strip() > '':
|
|
195 srow = row.split('\t')
|
|
196 if row[0] == '#':
|
|
197 cruft.append(row.strip()) # want strings
|
|
198 else:
|
|
199 dat.append(srow) # want lists
|
|
200
|
|
201 res = [rstyle,]
|
|
202 res.append(galhtmlprefix % progname)
|
|
203 res.append(galhtmlattr % (progname,timenow()))
|
|
204 flist = os.listdir(output_dir)
|
|
205 pdflist = [x for x in flist if os.path.splitext(x)[-1].lower() == '.pdf']
|
|
206 if len(pdflist) > 0: # assumes all pdfs come with thumbnail .jpgs
|
|
207 for p in pdflist:
|
|
208 imghref = '%s.jpg' % os.path.splitext(p)[0] # removes .pdf
|
|
209 res.append('<table cellpadding="10"><tr><td>\n')
|
|
210 res.append('<a href="%s"><img src="%s" alt="Click thumbnail to download %s" hspace="10" align="middle"></a>\n' % (p,imghref,p))
|
|
211 res.append('</tr></td></table>\n')
|
|
212 res.append('<b>Your job produced the following output files.</b><hr/>\n')
|
|
213 res.append('<table>\n')
|
|
214 for i,f in enumerate(flist):
|
|
215 fn = os.path.split(f)[-1]
|
|
216 res.append('<tr><td><a href="%s">%s</a></td></tr>\n' % (fn,fn))
|
|
217 res.append('</table><p/>\n')
|
|
218 if len(cruft) + len(dat) > 0:
|
|
219 res.append('<b>Picard on line resources</b><ul>\n')
|
|
220 res.append('<li><a href="http://picard.sourceforge.net/index.shtml">Click here for Picard Documentation</a></li>\n')
|
|
221 res.append('<li><a href="http://picard.sourceforge.net/picard-metric-definitions.shtml">Click here for Picard Metrics definitions</a></li></ul><hr/>\n')
|
|
222 if transpose:
|
|
223 res.append('<b>Picard output (transposed for readability)</b><hr/>\n')
|
|
224 else:
|
|
225 res.append('<b>Picard output</b><hr/>\n')
|
|
226 res.append('<table cellpadding="3" >\n')
|
|
227 if len(cruft) > 0:
|
|
228 cres = ['<tr class="d%d"><td>%s</td></tr>' % (i % 2,x) for i,x in enumerate(cruft)]
|
|
229 res += cres
|
|
230 if len(dat) > 0:
|
|
231 maxrows = 100
|
|
232 if transpose:
|
|
233 tdat = map(None,*dat) # transpose an arbitrary list of lists
|
|
234 missing = len(tdat) - maxrows
|
|
235 tdat = ['<tr class="d%d"><td>%s</td><td>%s</td></tr>\n' % ((i+len(cruft)) % 2,x[0],x[1]) for i,x in enumerate(tdat) if i < maxrows]
|
|
236 if len(tdat) > maxrows:
|
|
237 tdat.append('<tr><td colspan="2">...WARNING: %d rows deleted for sanity...see raw files for all rows</td></tr>' % missing)
|
|
238 else:
|
|
239 tdat = ['<tr class="d%d"><td>%s</td></tr>\n' % ((i+len(cruft)) % 2,x) for i,x in enumerate(dat) if i < maxrows]
|
|
240 if len(dat) > maxrows:
|
|
241 missing = len(dat) - maxrows
|
|
242 tdat.append('<tr><td>...WARNING: %d rows deleted for sanity...see raw files for all rows</td></tr>' % missing)
|
|
243 res += tdat
|
|
244 res.append('</table>\n')
|
|
245 else:
|
|
246 res.append('<b>No Picard output found - please consult the Picard log above for an explanation</b>')
|
|
247 l = open(log_file,'r').readlines()
|
|
248 if len(l) > 0:
|
|
249 res.append('<b>Picard log</b><hr/>\n')
|
|
250 rlog = ['<pre>',]
|
|
251 rlog += l
|
|
252 rlog.append('</pre>')
|
|
253 res += rlog
|
|
254 else:
|
|
255 res.append("Odd, Picard left no log file %s - must have really barfed badly?" % log_file)
|
|
256 res.append('<hr/>The freely available <a href="http://picard.sourceforge.net/command-line-overview.shtml">Picard software</a> \n')
|
|
257 res.append( 'generated all outputs reported here, using this command line:<br/>\n<pre>%s</pre>\n' % ''.join(cl))
|
|
258 res.append(galhtmlpostfix)
|
|
259 outf = open(html_output,'w')
|
|
260 outf.write(''.join(res))
|
|
261 outf.write('\n')
|
|
262 outf.close()
|
|
263
|
|
264 def keynat(string):
|
|
265 '''
|
|
266 borrowed from http://code.activestate.com/recipes/285264-natural-string-sorting/
|
|
267 A natural sort helper function for sort() and sorted()
|
|
268 without using regular expressions or exceptions.
|
|
269
|
|
270 >>> items = ('Z', 'a', '10th', '1st', '9')
|
|
271 >>> sorted(items)
|
|
272 ['10th', '1st', '9', 'Z', 'a']
|
|
273 >>> sorted(items, key=keynat)
|
|
274 ['1st', '9', '10th', 'a', 'Z']
|
|
275 '''
|
|
276 it = type(1)
|
|
277 r = []
|
|
278 for c in string:
|
|
279 if c.isdigit():
|
|
280 d = int(c)
|
|
281 if r and type( r[-1] ) == it:
|
|
282 r[-1] = r[-1] * 10 + d
|
|
283 else:
|
|
284 r.append(d)
|
|
285 else:
|
|
286 r.append(c.lower())
|
|
287 return r
|
|
288
|
|
289 def getFlen(bedfname=None):
|
|
290 """
|
|
291 find all features in a BED file and sum their lengths
|
|
292 """
|
|
293 features = {}
|
|
294 otherHeaders = []
|
|
295 try:
|
|
296 infile = open(bedfname,'r')
|
|
297 except:
|
|
298 print '###ERROR: getFlen unable to open bedfile %s' % bedfname
|
|
299 sys.exit(1)
|
|
300 for i,row in enumerate(infile):
|
|
301 if row.startswith('@'): # add to headers if not @SQ
|
|
302 if not row.startswith('@SQ'):
|
|
303 otherHeaders.append(row)
|
|
304 else:
|
|
305 row = row.strip()
|
|
306 if row.startswith('#') or row.lower().startswith('browser') or row.lower().startswith('track'):
|
|
307 continue # ignore headers
|
|
308 srow = row.split('\t')
|
|
309 if len(srow) > 3:
|
|
310 srow = row.split('\t')
|
|
311 f = srow[0]
|
|
312 spos = srow[1] # zero based from UCSC so no need to add 1 - eg 0-100 is 100 bases numbered 0-99 (!)
|
|
313 epos = srow[2] # see http://genome.ucsc.edu/FAQ/FAQtracks.html#tracks1
|
|
314 flen = int(epos) - int(spos)
|
|
315 features.setdefault(f,0)
|
|
316 features[f] += flen
|
|
317 infile.close()
|
|
318 fk = features.keys()
|
|
319 fk = sorted(fk, key=keynat)
|
|
320 return features,fk,otherHeaders
|
|
321
|
|
322 def bedToPicInterval(infile=None,outfile=None):
|
|
323 """
|
|
324 Picard tools requiring targets want
|
|
325 a sam style header which incidentally, MUST be sorted in natural order - not lexicographic order:
|
|
326
|
|
327 @SQ SN:chrM LN:16571
|
|
328 @SQ SN:chr1 LN:247249719
|
|
329 @SQ SN:chr2 LN:242951149
|
|
330 @SQ SN:chr3 LN:199501827
|
|
331 @SQ SN:chr4 LN:191273063
|
|
332 added to the start of what looks like a bed style file
|
|
333 chr1 67052400 67052451 - CCDS635.1_cds_0_0_chr1_67052401_r
|
|
334 chr1 67060631 67060788 - CCDS635.1_cds_1_0_chr1_67060632_r
|
|
335 chr1 67065090 67065317 - CCDS635.1_cds_2_0_chr1_67065091_r
|
|
336 chr1 67066082 67066181 - CCDS635.1_cds_3_0_chr1_67066083_r
|
|
337
|
|
338 see http://genome.ucsc.edu/FAQ/FAQtracks.html#tracks1
|
|
339 we need to add 1 to start coordinates on the way through - but length calculations are easier
|
|
340 """
|
|
341 # bedToPicard.py
|
|
342 # ross lazarus October 2010
|
|
343 # LGPL
|
|
344 # for Rgenetics
|
|
345 """
|
|
346 collect header info and rewrite bed with header for picard
|
|
347 """
|
|
348 featlen,fk,otherHeaders = getFlen(bedfname=infile)
|
|
349 try:
|
|
350 outf = open(outfile,'w')
|
|
351 except:
|
|
352 print '###ERROR: writePic unable to open output picard file %s' % outfile
|
|
353 sys.exit(1)
|
|
354 inf = open(infile,'r') # already tested in getFlen
|
|
355 header = ['@SQ\tSN:%s\tLN:%d' % (x,featlen[x]) for x in fk]
|
|
356 if len(otherHeaders) > 0:
|
|
357 header += otherHeaders
|
|
358 outf.write('\n'.join(header))
|
|
359 outf.write('\n')
|
|
360 for row in inf:
|
|
361 row = row.strip()
|
|
362 if len(row) > 0: # convert zero based start coordinate to 1 based
|
|
363 if row.startswith('@'):
|
|
364 continue
|
|
365 else:
|
|
366 srow = row.split('\t')
|
|
367 srow[1] = '%d' % (int(srow[1])+1) # see http://genome.ucsc.edu/FAQ/FAQtracks.html#tracks1
|
|
368 outf.write('\t'.join(srow))
|
|
369 outf.write('\n')
|
|
370 outf.close()
|
|
371 inf.close()
|
|
372
|
|
373
|
|
374 def oRRun(rcmd=[],outdir=None,title='myR',rexe='R'):
|
|
375 """
|
|
376 run an r script, lines in rcmd,
|
|
377 in a temporary directory
|
|
378 move everything, r script and all back to outdir which will be an html file
|
|
379
|
|
380
|
|
381 # test
|
|
382 RRun(rcmd=['print("hello cruel world")','q()'],title='test')
|
|
383 """
|
|
384 rlog = []
|
|
385 print '### rexe = %s' % rexe
|
|
386 assert os.path.isfile(rexe)
|
|
387 rname = '%s.R' % title
|
|
388 stoname = '%s.R.log' % title
|
|
389 rfname = rname
|
|
390 stofname = stoname
|
|
391 if outdir: # want a specific path
|
|
392 rfname = os.path.join(outdir,rname)
|
|
393 stofname = os.path.join(outdir,stoname)
|
|
394 try:
|
|
395 os.makedirs(outdir) # might not be there yet...
|
|
396 except:
|
|
397 pass
|
|
398 else:
|
|
399 outdir = tempfile.mkdtemp(prefix=title)
|
|
400 rfname = os.path.join(outdir,rname)
|
|
401 stofname = os.path.join(outdir,stoname)
|
|
402 rmoutdir = True
|
|
403 f = open(rfname,'w')
|
|
404 if type(rcmd) == type([]):
|
|
405 f.write('\n'.join(rcmd))
|
|
406 else: # string
|
|
407 f.write(rcmd)
|
|
408 f.write('\n')
|
|
409 f.close()
|
|
410 sto = file(stofname,'w')
|
|
411 vcl = [rexe,"--vanilla --slave", '<', rfname ]
|
|
412 x = subprocess.Popen(' '.join(vcl),shell=True,stderr=sto,stdout=sto,cwd=outdir)
|
|
413 retval = x.wait()
|
|
414 sto.close()
|
|
415 rlog = file(stofname,'r').readlines()
|
|
416 rlog.insert(0,'## found R at %s' % rexe)
|
|
417 if outdir <> None:
|
|
418 flist = os.listdir(outdir)
|
|
419 else:
|
|
420 flist = os.listdir('.')
|
|
421 flist.sort
|
|
422 flist = [(x,x) for x in flist]
|
|
423 for i,x in enumerate(flist):
|
|
424 if x == rname:
|
|
425 flist[i] = (x,'R script for %s' % title)
|
|
426 elif x == stoname:
|
|
427 flist[i] = (x,'R log for %s' % title)
|
|
428 if False and rmoutdir:
|
|
429 os.removedirs(outdir)
|
|
430 return rlog,flist # for html layout
|
|
431
|
|
432
|
|
433
|
|
434
|
|
435 def RRun(rcmd=[],outdir=None,title='myR',tidy=True):
|
|
436 """
|
|
437 run an r script, lines in rcmd,
|
|
438 in a temporary directory
|
|
439 move everything, r script and all back to outdir which will be an html file
|
|
440
|
|
441
|
|
442 # test
|
|
443 RRun(rcmd=['print("hello cruel world")','q()'],title='test')
|
|
444 echo "a <- c(5, 5); b <- c(0.5, 0.5)" | cat - RScript.R | R --slave \ --vanilla
|
|
445 suggested by http://tolstoy.newcastle.edu.au/R/devel/05/09/2448.html
|
|
446 """
|
|
447 killme = string.punctuation + string.whitespace
|
|
448 trantab = string.maketrans(killme,'_'*len(killme))
|
|
449 title = title.translate(trantab)
|
|
450 rlog = []
|
|
451 tempout=False
|
|
452 rname = '%s.R' % title
|
|
453 stoname = '%s.R.log' % title
|
|
454 cwd = os.getcwd()
|
|
455 if outdir: # want a specific path
|
|
456 try:
|
|
457 os.makedirs(outdir) # might not be there yet...
|
|
458 except:
|
|
459 pass
|
|
460 os.chdir(outdir)
|
|
461 if type(rcmd) == type([]):
|
|
462 script = '\n'.join(rcmd)
|
|
463 else: # string
|
|
464 script = rcmd
|
|
465 sto = file(stoname,'w')
|
|
466 rscript = file(rname,'w')
|
|
467 rscript.write(script)
|
|
468 rscript.write('\n#R script autogenerated by rgenetics/rgutils.py on %s\n' % timenow())
|
|
469 rscript.close()
|
|
470 vcl = '%s --slave --vanilla < %s' % (rexe,rname)
|
|
471 if outdir:
|
|
472 x = subprocess.Popen(vcl,shell=True,stderr=sto,stdout=sto,cwd=outdir)
|
|
473 else:
|
|
474 x = subprocess.Popen(vcl,shell=True,stderr=sto,stdout=sto)
|
|
475 retval = x.wait()
|
|
476 sto.close()
|
|
477 rlog = file(stoname,'r').readlines()
|
|
478 if retval <> 0:
|
|
479 rlog.insert(0,'Nonzero exit code = %d' % retval) # indicate failure
|
|
480 if outdir:
|
|
481 flist = os.listdir(outdir)
|
|
482 else:
|
|
483 flist = os.listdir(os.getcwd())
|
|
484 flist.sort
|
|
485 flist = [(x,x) for x in flist]
|
|
486 for i,x in enumerate(flist):
|
|
487 if x == rname:
|
|
488 flist[i] = (x,'R script for %s' % title)
|
|
489 elif x == stoname:
|
|
490 flist[i] = (x,'R log for %s' % title)
|
|
491 if outdir:
|
|
492 os.chdir(cwd)
|
|
493 return rlog,flist # for html layout
|
|
494
|
|
495 def runPlink(bfn='bar',ofn='foo',logf=None,plinktasks=[],cd='./',vclbase = []):
|
|
496 """run a series of plink tasks and append log results to stdout
|
|
497 vcl has a list of parameters for the spawnv
|
|
498 common settings can all go in the vclbase list and are added to each plinktask
|
|
499 """
|
|
500 # root for all
|
|
501 fplog,plog = tempfile.mkstemp()
|
|
502 if type(logf) == type(' '): # open otherwise assume is file - ugh I'm in a hurry
|
|
503 mylog = file(logf,'a+')
|
|
504 else:
|
|
505 mylog = logf
|
|
506 mylog.write('## Rgenetics: http://rgenetics.org Galaxy Tools rgQC.py Plink runner\n')
|
|
507 for task in plinktasks: # each is a list
|
|
508 vcl = vclbase + task
|
|
509 sto = file(plog,'w')
|
|
510 x = subprocess.Popen(' '.join(vcl),shell=True,stdout=sto,stderr=sto,cwd=cd)
|
|
511 retval = x.wait()
|
|
512 sto.close()
|
|
513 try:
|
|
514 lplog = file(plog,'r').read()
|
|
515 mylog.write(lplog)
|
|
516 os.unlink(plog) # no longer needed
|
|
517 except:
|
|
518 mylog.write('### %s Strange - no std out from plink when running command line\n%s' % (timenow(),' '.join(vcl)))
|
|
519
|
|
520 def pruneLD(plinktasks=[],cd='./',vclbase = []):
|
|
521 """
|
|
522 plink blathers when doing pruning - ignore
|
|
523 Linkage disequilibrium based SNP pruning
|
|
524 if a million snps in 3 billion base pairs, have mean 3k spacing
|
|
525 assume 40-60k of ld in ceu, a window of 120k width is about 40 snps
|
|
526 so lots more is perhaps less efficient - each window computational cost is
|
|
527 ON^2 unless the code is smart enough to avoid unecessary computation where
|
|
528 allele frequencies make it impossible to see ld > the r^2 cutoff threshold
|
|
529 So, do a window and move forward 20?
|
|
530 The fine Plink docs at http://pngu.mgh.harvard.edu/~purcell/plink/summary.shtml#prune
|
|
531 reproduced below
|
|
532
|
|
533 Sometimes it is useful to generate a pruned subset of SNPs that are in approximate linkage equilibrium with each other. This can be achieved via two commands:
|
|
534 --indep which prunes based on the variance inflation factor (VIF), which recursively removes SNPs within a sliding window; second, --indep-pairwise which is
|
|
535 similar, except it is based only on pairwise genotypic correlation.
|
|
536
|
|
537 Hint The output of either of these commands is two lists of SNPs: those that are pruned out and those that are not. A separate command using the --extract or
|
|
538 --exclude option is necessary to actually perform the pruning.
|
|
539
|
|
540 The VIF pruning routine is performed:
|
|
541 plink --file data --indep 50 5 2
|
|
542
|
|
543 will create files
|
|
544
|
|
545 plink.prune.in
|
|
546 plink.prune.out
|
|
547
|
|
548 Each is a simlpe list of SNP IDs; both these files can subsequently be specified as the argument for
|
|
549 a --extract or --exclude command.
|
|
550
|
|
551 The parameters for --indep are: window size in SNPs (e.g. 50), the number of SNPs to shift the
|
|
552 window at each step (e.g. 5), the VIF threshold. The VIF is 1/(1-R^2) where R^2 is the multiple correlation coefficient for a SNP being regressed on all other
|
|
553 SNPs simultaneously. That is, this considers the correlations between SNPs but also between linear combinations of SNPs. A VIF of 10 is often taken to represent
|
|
554 near collinearity problems in standard multiple regression analyses (i.e. implies R^2 of 0.9). A VIF of 1 would imply that the SNP is completely independent of
|
|
555 all other SNPs. Practically, values between 1.5 and 2 should probably be used; particularly in small samples, if this threshold is too low and/or the window
|
|
556 size is too large, too many SNPs may be removed.
|
|
557
|
|
558 The second procedure is performed:
|
|
559 plink --file data --indep-pairwise 50 5 0.5
|
|
560
|
|
561 This generates the same output files as the first version; the only difference is that a
|
|
562 simple pairwise threshold is used. The first two parameters (50 and 5) are the same as above (window size and step); the third parameter represents the r^2
|
|
563 threshold. Note: this represents the pairwise SNP-SNP metric now, not the multiple correlation coefficient; also note, this is based on the genotypic
|
|
564 correlation, i.e. it does not involve phasing.
|
|
565
|
|
566 To give a concrete example: the command above that specifies 50 5 0.5 would a) consider a
|
|
567 window of 50 SNPs, b) calculate LD between each pair of SNPs in the window, b) remove one of a pair of SNPs if the LD is greater than 0.5, c) shift the window 5
|
|
568 SNPs forward and repeat the procedure.
|
|
569
|
|
570 To make a new, pruned file, then use something like (in this example, we also convert the
|
|
571 standard PED fileset to a binary one):
|
|
572 plink --file data --extract plink.prune.in --make-bed --out pruneddata
|
|
573 """
|
|
574 fplog,plog = tempfile.mkstemp()
|
|
575 alog = []
|
|
576 alog.append('## Rgenetics: http://rgenetics.org Galaxy Tools rgQC.py Plink pruneLD runner\n')
|
|
577 for task in plinktasks: # each is a list
|
|
578 vcl = vclbase + task
|
|
579 sto = file(plog,'w')
|
|
580 x = subprocess.Popen(' '.join(vcl),shell=True,stdout=sto,stderr=sto,cwd=cd)
|
|
581 retval = x.wait()
|
|
582 sto.close()
|
|
583 try:
|
|
584 lplog = file(plog,'r').readlines()
|
|
585 lplog = [x for x in lplog if x.find('Pruning SNP') == -1]
|
|
586 alog += lplog
|
|
587 alog.append('\n')
|
|
588 os.unlink(plog) # no longer needed
|
|
589 except:
|
|
590 alog.append('### %s Strange - no std out from plink when running command line\n%s\n' % (timenow(),' '.join(vcl)))
|
|
591 return alog
|
|
592
|
|
593 def readMap(mapfile=None,allmarkers=False,rsdict={},c=None,spos=None,epos=None):
|
|
594 """abstract out - keeps reappearing
|
|
595 """
|
|
596 mfile = open(mapfile, 'r')
|
|
597 markers = []
|
|
598 snpcols = {}
|
|
599 snpIndex = 0 # in case empty or comment lines
|
|
600 for rownum,row in enumerate(mfile):
|
|
601 line = row.strip()
|
|
602 if not line or line[0]=='#': continue
|
|
603 chrom, snp, genpos, abspos = line.split()[:4] # just in case more cols
|
|
604 try:
|
|
605 abspos = int(abspos)
|
|
606 except:
|
|
607 abspos = 0 # stupid framingham data grumble grumble
|
|
608 if allmarkers or rsdict.get(snp,None) or (chrom == c and (spos <= abspos <= epos)):
|
|
609 markers.append((chrom,abspos,snp)) # decorate for sort into genomic
|
|
610 snpcols[snp] = snpIndex # so we know which col to find genos for this marker
|
|
611 snpIndex += 1
|
|
612 markers.sort()
|
|
613 rslist = [x[2] for x in markers] # drop decoration
|
|
614 rsdict = dict(zip(rslist,rslist))
|
|
615 mfile.close()
|
|
616 return markers,snpcols,rslist,rsdict
|
|
617
|
|
618
|