Mercurial > repos > xuebing > sharplabtool
comparison tools/rgenetics/rgutils.py @ 0:9071e359b9a3
Uploaded
| author | xuebing |
|---|---|
| date | Fri, 09 Mar 2012 19:37:19 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:9071e359b9a3 |
|---|---|
| 1 # utilities for rgenetics | |
| 2 # | |
| 3 # copyright 2009 ross lazarus | |
| 4 # released under the LGPL | |
| 5 # | |
| 6 | |
| 7 import subprocess, os, sys, time, tempfile,string,plinkbinJZ | |
| 8 import datetime | |
| 9 | |
| 10 galhtmlprefix = """<?xml version="1.0" encoding="utf-8" ?> | |
| 11 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> | |
| 12 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | |
| 13 <head> | |
| 14 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | |
| 15 <meta name="generator" content="Galaxy %s tool output - see http://g2.trac.bx.psu.edu/" /> | |
| 16 <title></title> | |
| 17 <link rel="stylesheet" href="/static/style/base.css" type="text/css" /> | |
| 18 </head> | |
| 19 <body> | |
| 20 <div class="document"> | |
| 21 """ | |
| 22 galhtmlattr = """<h3><a href="http://rgenetics.org">Rgenetics</a> tool %s run at %s</h3>""" | |
| 23 galhtmlpostfix = """</div></body></html>\n""" | |
| 24 | |
| 25 plinke = 'plink' # changed jan 2010 - all exes must be on path | |
| 26 rexe = 'R' # to avoid cluster/platform dependencies | |
| 27 smartpca = 'smartpca.perl' | |
| 28 | |
| 29 def timenow(): | |
| 30 """return current time as a string | |
| 31 """ | |
| 32 return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time())) | |
| 33 | |
| 34 def timestamp(): | |
| 35 return datetime.datetime.now().strftime('%Y%m%d%H%M%S') | |
| 36 | |
| 37 def fail( message ): | |
| 38 print >> sys.stderr, message | |
| 39 return -1 | |
| 40 | |
| 41 def whereis(program): | |
| 42 for path in os.environ.get('PATH', '').split(':'): | |
| 43 if os.path.exists(os.path.join(path, program)) and \ | |
| 44 not os.path.isdir(os.path.join(path, program)): | |
| 45 return os.path.join(path, program) | |
| 46 return None | |
| 47 | |
| 48 | |
| 49 def bedToPicInterval(infile=None): | |
| 50 """ | |
| 51 Picard tools requiring targets want | |
| 52 a sam style header which incidentally, MUST be sorted in natural order - not lexicographic order: | |
| 53 | |
| 54 @SQ SN:chrM LN:16571 | |
| 55 @SQ SN:chr1 LN:247249719 | |
| 56 @SQ SN:chr2 LN:242951149 | |
| 57 @SQ SN:chr3 LN:199501827 | |
| 58 @SQ SN:chr4 LN:191273063 | |
| 59 added to the start of what looks like a bed style file | |
| 60 chr1 67052400 67052451 - CCDS635.1_cds_0_0_chr1_67052401_r | |
| 61 chr1 67060631 67060788 - CCDS635.1_cds_1_0_chr1_67060632_r | |
| 62 chr1 67065090 67065317 - CCDS635.1_cds_2_0_chr1_67065091_r | |
| 63 chr1 67066082 67066181 - CCDS635.1_cds_3_0_chr1_67066083_r | |
| 64 | |
| 65 | |
| 66 see http://genome.ucsc.edu/FAQ/FAQtracks.html#tracks1 | |
| 67 we need to add 1 to start coordinates on the way through - but length calculations are easier | |
| 68 """ | |
| 69 # bedToPicard.py | |
| 70 # ross lazarus October 2010 | |
| 71 # LGPL | |
| 72 # for Rgenetics | |
| 73 | |
| 74 def getFlen(bedfname=None): | |
| 75 """ | |
| 76 find all features in a BED file and sum their lengths | |
| 77 """ | |
| 78 features = {} | |
| 79 try: | |
| 80 infile = open(bedfname,'r') | |
| 81 except: | |
| 82 print '###ERROR: getFlen unable to open bedfile %s' % bedfname | |
| 83 sys.exit(1) | |
| 84 for i,row in enumerate(infile): | |
| 85 if row[0] == '@': # shouldn't happen given a bed file! | |
| 86 print 'row %d=%s - should NOT start with @!' % (i,row) | |
| 87 sys.exit(1) | |
| 88 row = row.strip() | |
| 89 if len(row) > 0: | |
| 90 srow = row.split('\t') | |
| 91 f = srow[0] | |
| 92 spos = srow[1] # zero based from UCSC so no need to add 1 - eg 0-100 is 100 bases numbered 0-99 (!) | |
| 93 epos = srow[2] # see http://genome.ucsc.edu/FAQ/FAQtracks.html#tracks1 | |
| 94 flen = int(epos) - int(spos) | |
| 95 features.setdefault(f,0) | |
| 96 features[f] += flen | |
| 97 infile.close() | |
| 98 return features | |
| 99 | |
| 100 def keynat(string): | |
| 101 ''' | |
| 102 borrowed from http://code.activestate.com/recipes/285264-natural-string-sorting/ | |
| 103 A natural sort helper function for sort() and sorted() | |
| 104 without using regular expressions or exceptions. | |
| 105 | |
| 106 >>> items = ('Z', 'a', '10th', '1st', '9') | |
| 107 >>> sorted(items) | |
| 108 ['10th', '1st', '9', 'Z', 'a'] | |
| 109 >>> sorted(items, key=keynat) | |
| 110 ['1st', '9', '10th', 'a', 'Z'] | |
| 111 ''' | |
| 112 it = type(1) | |
| 113 r = [] | |
| 114 for c in string: | |
| 115 if c.isdigit(): | |
| 116 d = int(c) | |
| 117 if r and type( r[-1] ) == it: | |
| 118 r[-1] = r[-1] * 10 + d | |
| 119 else: | |
| 120 r.append(d) | |
| 121 else: | |
| 122 r.append(c.lower()) | |
| 123 return r | |
| 124 | |
| 125 def writePic(outfname=None,bedfname=None): | |
| 126 """ | |
| 127 collect header info and rewrite bed with header for picard | |
| 128 """ | |
| 129 featlen = getFlen(bedfname=bedfname) | |
| 130 try: | |
| 131 outf = open(outfname,'w') | |
| 132 except: | |
| 133 print '###ERROR: writePic unable to open output picard file %s' % outfname | |
| 134 sys.exit(1) | |
| 135 infile = open(bedfname,'r') # already tested in getFlen | |
| 136 k = featlen.keys() | |
| 137 fk = sorted(k, key=keynat) | |
| 138 header = ['@SQ\tSN:%s\tLN:%d' % (x,featlen[x]) for x in fk] | |
| 139 outf.write('\n'.join(header)) | |
| 140 outf.write('\n') | |
| 141 for row in infile: | |
| 142 row = row.strip() | |
| 143 if len(row) > 0: # convert zero based start coordinate to 1 based | |
| 144 srow = row.split('\t') | |
| 145 srow[1] = '%d' % (int(srow[1])+1) # see http://genome.ucsc.edu/FAQ/FAQtracks.html#tracks1 | |
| 146 outf.write('\t'.join(srow)) | |
| 147 outf.write('\n') | |
| 148 outf.close() | |
| 149 infile.close() | |
| 150 | |
| 151 | |
| 152 | |
| 153 # bedToPicInterval starts here | |
| 154 fd,outf = tempfile.mkstemp(prefix='rgPicardHsMetrics') | |
| 155 writePic(outfname=outf,bedfname=infile) | |
| 156 return outf | |
| 157 | |
| 158 | |
| 159 def getFileString(fpath, outpath): | |
| 160 """ | |
| 161 format a nice file size string | |
| 162 """ | |
| 163 size = '' | |
| 164 fp = os.path.join(outpath, fpath) | |
| 165 s = '? ?' | |
| 166 if os.path.isfile(fp): | |
| 167 n = float(os.path.getsize(fp)) | |
| 168 if n > 2**20: | |
| 169 size = ' (%1.1f MB)' % (n/2**20) | |
| 170 elif n > 2**10: | |
| 171 size = ' (%1.1f KB)' % (n/2**10) | |
| 172 elif n > 0: | |
| 173 size = ' (%d B)' % (int(n)) | |
| 174 s = '%s %s' % (fpath, size) | |
| 175 return s | |
| 176 | |
| 177 | |
| 178 def fixPicardOutputs(tempout=None,output_dir=None,log_file=None,html_output=None,progname=None,cl=[],transpose=True): | |
| 179 """ | |
| 180 picard produces long hard to read tab header files | |
| 181 make them available but present them transposed for readability | |
| 182 """ | |
| 183 rstyle="""<style type="text/css"> | |
| 184 tr.d0 td {background-color: oldlace; color: black;} | |
| 185 tr.d1 td {background-color: aliceblue; color: black;} | |
| 186 </style>""" | |
| 187 cruft = [] | |
| 188 dat = [] | |
| 189 try: | |
| 190 r = open(tempout,'r').readlines() | |
| 191 except: | |
| 192 r = [] | |
| 193 for row in r: | |
| 194 if row.strip() > '': | |
| 195 srow = row.split('\t') | |
| 196 if row[0] == '#': | |
| 197 cruft.append(row.strip()) # want strings | |
| 198 else: | |
| 199 dat.append(srow) # want lists | |
| 200 | |
| 201 res = [rstyle,] | |
| 202 res.append(galhtmlprefix % progname) | |
| 203 res.append(galhtmlattr % (progname,timenow())) | |
| 204 flist = os.listdir(output_dir) | |
| 205 pdflist = [x for x in flist if os.path.splitext(x)[-1].lower() == '.pdf'] | |
| 206 if len(pdflist) > 0: # assumes all pdfs come with thumbnail .jpgs | |
| 207 for p in pdflist: | |
| 208 imghref = '%s.jpg' % os.path.splitext(p)[0] # removes .pdf | |
| 209 res.append('<table cellpadding="10"><tr><td>\n') | |
| 210 res.append('<a href="%s"><img src="%s" alt="Click thumbnail to download %s" hspace="10" align="middle"></a>\n' % (p,imghref,p)) | |
| 211 res.append('</tr></td></table>\n') | |
| 212 res.append('<b>Your job produced the following output files.</b><hr/>\n') | |
| 213 res.append('<table>\n') | |
| 214 for i,f in enumerate(flist): | |
| 215 fn = os.path.split(f)[-1] | |
| 216 res.append('<tr><td><a href="%s">%s</a></td></tr>\n' % (fn,fn)) | |
| 217 res.append('</table><p/>\n') | |
| 218 if len(cruft) + len(dat) > 0: | |
| 219 res.append('<b>Picard on line resources</b><ul>\n') | |
| 220 res.append('<li><a href="http://picard.sourceforge.net/index.shtml">Click here for Picard Documentation</a></li>\n') | |
| 221 res.append('<li><a href="http://picard.sourceforge.net/picard-metric-definitions.shtml">Click here for Picard Metrics definitions</a></li></ul><hr/>\n') | |
| 222 if transpose: | |
| 223 res.append('<b>Picard output (transposed for readability)</b><hr/>\n') | |
| 224 else: | |
| 225 res.append('<b>Picard output</b><hr/>\n') | |
| 226 res.append('<table cellpadding="3" >\n') | |
| 227 if len(cruft) > 0: | |
| 228 cres = ['<tr class="d%d"><td>%s</td></tr>' % (i % 2,x) for i,x in enumerate(cruft)] | |
| 229 res += cres | |
| 230 if len(dat) > 0: | |
| 231 maxrows = 100 | |
| 232 if transpose: | |
| 233 tdat = map(None,*dat) # transpose an arbitrary list of lists | |
| 234 missing = len(tdat) - maxrows | |
| 235 tdat = ['<tr class="d%d"><td>%s</td><td>%s</td></tr>\n' % ((i+len(cruft)) % 2,x[0],x[1]) for i,x in enumerate(tdat) if i < maxrows] | |
| 236 if len(tdat) > maxrows: | |
| 237 tdat.append('<tr><td colspan="2">...WARNING: %d rows deleted for sanity...see raw files for all rows</td></tr>' % missing) | |
| 238 else: | |
| 239 tdat = ['<tr class="d%d"><td>%s</td></tr>\n' % ((i+len(cruft)) % 2,x) for i,x in enumerate(dat) if i < maxrows] | |
| 240 if len(dat) > maxrows: | |
| 241 missing = len(dat) - maxrows | |
| 242 tdat.append('<tr><td>...WARNING: %d rows deleted for sanity...see raw files for all rows</td></tr>' % missing) | |
| 243 res += tdat | |
| 244 res.append('</table>\n') | |
| 245 else: | |
| 246 res.append('<b>No Picard output found - please consult the Picard log above for an explanation</b>') | |
| 247 l = open(log_file,'r').readlines() | |
| 248 if len(l) > 0: | |
| 249 res.append('<b>Picard log</b><hr/>\n') | |
| 250 rlog = ['<pre>',] | |
| 251 rlog += l | |
| 252 rlog.append('</pre>') | |
| 253 res += rlog | |
| 254 else: | |
| 255 res.append("Odd, Picard left no log file %s - must have really barfed badly?" % log_file) | |
| 256 res.append('<hr/>The freely available <a href="http://picard.sourceforge.net/command-line-overview.shtml">Picard software</a> \n') | |
| 257 res.append( 'generated all outputs reported here, using this command line:<br/>\n<pre>%s</pre>\n' % ''.join(cl)) | |
| 258 res.append(galhtmlpostfix) | |
| 259 outf = open(html_output,'w') | |
| 260 outf.write(''.join(res)) | |
| 261 outf.write('\n') | |
| 262 outf.close() | |
| 263 | |
| 264 def keynat(string): | |
| 265 ''' | |
| 266 borrowed from http://code.activestate.com/recipes/285264-natural-string-sorting/ | |
| 267 A natural sort helper function for sort() and sorted() | |
| 268 without using regular expressions or exceptions. | |
| 269 | |
| 270 >>> items = ('Z', 'a', '10th', '1st', '9') | |
| 271 >>> sorted(items) | |
| 272 ['10th', '1st', '9', 'Z', 'a'] | |
| 273 >>> sorted(items, key=keynat) | |
| 274 ['1st', '9', '10th', 'a', 'Z'] | |
| 275 ''' | |
| 276 it = type(1) | |
| 277 r = [] | |
| 278 for c in string: | |
| 279 if c.isdigit(): | |
| 280 d = int(c) | |
| 281 if r and type( r[-1] ) == it: | |
| 282 r[-1] = r[-1] * 10 + d | |
| 283 else: | |
| 284 r.append(d) | |
| 285 else: | |
| 286 r.append(c.lower()) | |
| 287 return r | |
| 288 | |
| 289 def getFlen(bedfname=None): | |
| 290 """ | |
| 291 find all features in a BED file and sum their lengths | |
| 292 """ | |
| 293 features = {} | |
| 294 otherHeaders = [] | |
| 295 try: | |
| 296 infile = open(bedfname,'r') | |
| 297 except: | |
| 298 print '###ERROR: getFlen unable to open bedfile %s' % bedfname | |
| 299 sys.exit(1) | |
| 300 for i,row in enumerate(infile): | |
| 301 if row.startswith('@'): # add to headers if not @SQ | |
| 302 if not row.startswith('@SQ'): | |
| 303 otherHeaders.append(row) | |
| 304 else: | |
| 305 row = row.strip() | |
| 306 if row.startswith('#') or row.lower().startswith('browser') or row.lower().startswith('track'): | |
| 307 continue # ignore headers | |
| 308 srow = row.split('\t') | |
| 309 if len(srow) > 3: | |
| 310 srow = row.split('\t') | |
| 311 f = srow[0] | |
| 312 spos = srow[1] # zero based from UCSC so no need to add 1 - eg 0-100 is 100 bases numbered 0-99 (!) | |
| 313 epos = srow[2] # see http://genome.ucsc.edu/FAQ/FAQtracks.html#tracks1 | |
| 314 flen = int(epos) - int(spos) | |
| 315 features.setdefault(f,0) | |
| 316 features[f] += flen | |
| 317 infile.close() | |
| 318 fk = features.keys() | |
| 319 fk = sorted(fk, key=keynat) | |
| 320 return features,fk,otherHeaders | |
| 321 | |
| 322 def bedToPicInterval(infile=None,outfile=None): | |
| 323 """ | |
| 324 Picard tools requiring targets want | |
| 325 a sam style header which incidentally, MUST be sorted in natural order - not lexicographic order: | |
| 326 | |
| 327 @SQ SN:chrM LN:16571 | |
| 328 @SQ SN:chr1 LN:247249719 | |
| 329 @SQ SN:chr2 LN:242951149 | |
| 330 @SQ SN:chr3 LN:199501827 | |
| 331 @SQ SN:chr4 LN:191273063 | |
| 332 added to the start of what looks like a bed style file | |
| 333 chr1 67052400 67052451 - CCDS635.1_cds_0_0_chr1_67052401_r | |
| 334 chr1 67060631 67060788 - CCDS635.1_cds_1_0_chr1_67060632_r | |
| 335 chr1 67065090 67065317 - CCDS635.1_cds_2_0_chr1_67065091_r | |
| 336 chr1 67066082 67066181 - CCDS635.1_cds_3_0_chr1_67066083_r | |
| 337 | |
| 338 see http://genome.ucsc.edu/FAQ/FAQtracks.html#tracks1 | |
| 339 we need to add 1 to start coordinates on the way through - but length calculations are easier | |
| 340 """ | |
| 341 # bedToPicard.py | |
| 342 # ross lazarus October 2010 | |
| 343 # LGPL | |
| 344 # for Rgenetics | |
| 345 """ | |
| 346 collect header info and rewrite bed with header for picard | |
| 347 """ | |
| 348 featlen,fk,otherHeaders = getFlen(bedfname=infile) | |
| 349 try: | |
| 350 outf = open(outfile,'w') | |
| 351 except: | |
| 352 print '###ERROR: writePic unable to open output picard file %s' % outfile | |
| 353 sys.exit(1) | |
| 354 inf = open(infile,'r') # already tested in getFlen | |
| 355 header = ['@SQ\tSN:%s\tLN:%d' % (x,featlen[x]) for x in fk] | |
| 356 if len(otherHeaders) > 0: | |
| 357 header += otherHeaders | |
| 358 outf.write('\n'.join(header)) | |
| 359 outf.write('\n') | |
| 360 for row in inf: | |
| 361 row = row.strip() | |
| 362 if len(row) > 0: # convert zero based start coordinate to 1 based | |
| 363 if row.startswith('@'): | |
| 364 continue | |
| 365 else: | |
| 366 srow = row.split('\t') | |
| 367 srow[1] = '%d' % (int(srow[1])+1) # see http://genome.ucsc.edu/FAQ/FAQtracks.html#tracks1 | |
| 368 outf.write('\t'.join(srow)) | |
| 369 outf.write('\n') | |
| 370 outf.close() | |
| 371 inf.close() | |
| 372 | |
| 373 | |
| 374 def oRRun(rcmd=[],outdir=None,title='myR',rexe='R'): | |
| 375 """ | |
| 376 run an r script, lines in rcmd, | |
| 377 in a temporary directory | |
| 378 move everything, r script and all back to outdir which will be an html file | |
| 379 | |
| 380 | |
| 381 # test | |
| 382 RRun(rcmd=['print("hello cruel world")','q()'],title='test') | |
| 383 """ | |
| 384 rlog = [] | |
| 385 print '### rexe = %s' % rexe | |
| 386 assert os.path.isfile(rexe) | |
| 387 rname = '%s.R' % title | |
| 388 stoname = '%s.R.log' % title | |
| 389 rfname = rname | |
| 390 stofname = stoname | |
| 391 if outdir: # want a specific path | |
| 392 rfname = os.path.join(outdir,rname) | |
| 393 stofname = os.path.join(outdir,stoname) | |
| 394 try: | |
| 395 os.makedirs(outdir) # might not be there yet... | |
| 396 except: | |
| 397 pass | |
| 398 else: | |
| 399 outdir = tempfile.mkdtemp(prefix=title) | |
| 400 rfname = os.path.join(outdir,rname) | |
| 401 stofname = os.path.join(outdir,stoname) | |
| 402 rmoutdir = True | |
| 403 f = open(rfname,'w') | |
| 404 if type(rcmd) == type([]): | |
| 405 f.write('\n'.join(rcmd)) | |
| 406 else: # string | |
| 407 f.write(rcmd) | |
| 408 f.write('\n') | |
| 409 f.close() | |
| 410 sto = file(stofname,'w') | |
| 411 vcl = [rexe,"--vanilla --slave", '<', rfname ] | |
| 412 x = subprocess.Popen(' '.join(vcl),shell=True,stderr=sto,stdout=sto,cwd=outdir) | |
| 413 retval = x.wait() | |
| 414 sto.close() | |
| 415 rlog = file(stofname,'r').readlines() | |
| 416 rlog.insert(0,'## found R at %s' % rexe) | |
| 417 if outdir <> None: | |
| 418 flist = os.listdir(outdir) | |
| 419 else: | |
| 420 flist = os.listdir('.') | |
| 421 flist.sort | |
| 422 flist = [(x,x) for x in flist] | |
| 423 for i,x in enumerate(flist): | |
| 424 if x == rname: | |
| 425 flist[i] = (x,'R script for %s' % title) | |
| 426 elif x == stoname: | |
| 427 flist[i] = (x,'R log for %s' % title) | |
| 428 if False and rmoutdir: | |
| 429 os.removedirs(outdir) | |
| 430 return rlog,flist # for html layout | |
| 431 | |
| 432 | |
| 433 | |
| 434 | |
| 435 def RRun(rcmd=[],outdir=None,title='myR',tidy=True): | |
| 436 """ | |
| 437 run an r script, lines in rcmd, | |
| 438 in a temporary directory | |
| 439 move everything, r script and all back to outdir which will be an html file | |
| 440 | |
| 441 | |
| 442 # test | |
| 443 RRun(rcmd=['print("hello cruel world")','q()'],title='test') | |
| 444 echo "a <- c(5, 5); b <- c(0.5, 0.5)" | cat - RScript.R | R --slave \ --vanilla | |
| 445 suggested by http://tolstoy.newcastle.edu.au/R/devel/05/09/2448.html | |
| 446 """ | |
| 447 killme = string.punctuation + string.whitespace | |
| 448 trantab = string.maketrans(killme,'_'*len(killme)) | |
| 449 title = title.translate(trantab) | |
| 450 rlog = [] | |
| 451 tempout=False | |
| 452 rname = '%s.R' % title | |
| 453 stoname = '%s.R.log' % title | |
| 454 cwd = os.getcwd() | |
| 455 if outdir: # want a specific path | |
| 456 try: | |
| 457 os.makedirs(outdir) # might not be there yet... | |
| 458 except: | |
| 459 pass | |
| 460 os.chdir(outdir) | |
| 461 if type(rcmd) == type([]): | |
| 462 script = '\n'.join(rcmd) | |
| 463 else: # string | |
| 464 script = rcmd | |
| 465 sto = file(stoname,'w') | |
| 466 rscript = file(rname,'w') | |
| 467 rscript.write(script) | |
| 468 rscript.write('\n#R script autogenerated by rgenetics/rgutils.py on %s\n' % timenow()) | |
| 469 rscript.close() | |
| 470 vcl = '%s --slave --vanilla < %s' % (rexe,rname) | |
| 471 if outdir: | |
| 472 x = subprocess.Popen(vcl,shell=True,stderr=sto,stdout=sto,cwd=outdir) | |
| 473 else: | |
| 474 x = subprocess.Popen(vcl,shell=True,stderr=sto,stdout=sto) | |
| 475 retval = x.wait() | |
| 476 sto.close() | |
| 477 rlog = file(stoname,'r').readlines() | |
| 478 if retval <> 0: | |
| 479 rlog.insert(0,'Nonzero exit code = %d' % retval) # indicate failure | |
| 480 if outdir: | |
| 481 flist = os.listdir(outdir) | |
| 482 else: | |
| 483 flist = os.listdir(os.getcwd()) | |
| 484 flist.sort | |
| 485 flist = [(x,x) for x in flist] | |
| 486 for i,x in enumerate(flist): | |
| 487 if x == rname: | |
| 488 flist[i] = (x,'R script for %s' % title) | |
| 489 elif x == stoname: | |
| 490 flist[i] = (x,'R log for %s' % title) | |
| 491 if outdir: | |
| 492 os.chdir(cwd) | |
| 493 return rlog,flist # for html layout | |
| 494 | |
| 495 def runPlink(bfn='bar',ofn='foo',logf=None,plinktasks=[],cd='./',vclbase = []): | |
| 496 """run a series of plink tasks and append log results to stdout | |
| 497 vcl has a list of parameters for the spawnv | |
| 498 common settings can all go in the vclbase list and are added to each plinktask | |
| 499 """ | |
| 500 # root for all | |
| 501 fplog,plog = tempfile.mkstemp() | |
| 502 if type(logf) == type(' '): # open otherwise assume is file - ugh I'm in a hurry | |
| 503 mylog = file(logf,'a+') | |
| 504 else: | |
| 505 mylog = logf | |
| 506 mylog.write('## Rgenetics: http://rgenetics.org Galaxy Tools rgQC.py Plink runner\n') | |
| 507 for task in plinktasks: # each is a list | |
| 508 vcl = vclbase + task | |
| 509 sto = file(plog,'w') | |
| 510 x = subprocess.Popen(' '.join(vcl),shell=True,stdout=sto,stderr=sto,cwd=cd) | |
| 511 retval = x.wait() | |
| 512 sto.close() | |
| 513 try: | |
| 514 lplog = file(plog,'r').read() | |
| 515 mylog.write(lplog) | |
| 516 os.unlink(plog) # no longer needed | |
| 517 except: | |
| 518 mylog.write('### %s Strange - no std out from plink when running command line\n%s' % (timenow(),' '.join(vcl))) | |
| 519 | |
| 520 def pruneLD(plinktasks=[],cd='./',vclbase = []): | |
| 521 """ | |
| 522 plink blathers when doing pruning - ignore | |
| 523 Linkage disequilibrium based SNP pruning | |
| 524 if a million snps in 3 billion base pairs, have mean 3k spacing | |
| 525 assume 40-60k of ld in ceu, a window of 120k width is about 40 snps | |
| 526 so lots more is perhaps less efficient - each window computational cost is | |
| 527 ON^2 unless the code is smart enough to avoid unecessary computation where | |
| 528 allele frequencies make it impossible to see ld > the r^2 cutoff threshold | |
| 529 So, do a window and move forward 20? | |
| 530 The fine Plink docs at http://pngu.mgh.harvard.edu/~purcell/plink/summary.shtml#prune | |
| 531 reproduced below | |
| 532 | |
| 533 Sometimes it is useful to generate a pruned subset of SNPs that are in approximate linkage equilibrium with each other. This can be achieved via two commands: | |
| 534 --indep which prunes based on the variance inflation factor (VIF), which recursively removes SNPs within a sliding window; second, --indep-pairwise which is | |
| 535 similar, except it is based only on pairwise genotypic correlation. | |
| 536 | |
| 537 Hint The output of either of these commands is two lists of SNPs: those that are pruned out and those that are not. A separate command using the --extract or | |
| 538 --exclude option is necessary to actually perform the pruning. | |
| 539 | |
| 540 The VIF pruning routine is performed: | |
| 541 plink --file data --indep 50 5 2 | |
| 542 | |
| 543 will create files | |
| 544 | |
| 545 plink.prune.in | |
| 546 plink.prune.out | |
| 547 | |
| 548 Each is a simlpe list of SNP IDs; both these files can subsequently be specified as the argument for | |
| 549 a --extract or --exclude command. | |
| 550 | |
| 551 The parameters for --indep are: window size in SNPs (e.g. 50), the number of SNPs to shift the | |
| 552 window at each step (e.g. 5), the VIF threshold. The VIF is 1/(1-R^2) where R^2 is the multiple correlation coefficient for a SNP being regressed on all other | |
| 553 SNPs simultaneously. That is, this considers the correlations between SNPs but also between linear combinations of SNPs. A VIF of 10 is often taken to represent | |
| 554 near collinearity problems in standard multiple regression analyses (i.e. implies R^2 of 0.9). A VIF of 1 would imply that the SNP is completely independent of | |
| 555 all other SNPs. Practically, values between 1.5 and 2 should probably be used; particularly in small samples, if this threshold is too low and/or the window | |
| 556 size is too large, too many SNPs may be removed. | |
| 557 | |
| 558 The second procedure is performed: | |
| 559 plink --file data --indep-pairwise 50 5 0.5 | |
| 560 | |
| 561 This generates the same output files as the first version; the only difference is that a | |
| 562 simple pairwise threshold is used. The first two parameters (50 and 5) are the same as above (window size and step); the third parameter represents the r^2 | |
| 563 threshold. Note: this represents the pairwise SNP-SNP metric now, not the multiple correlation coefficient; also note, this is based on the genotypic | |
| 564 correlation, i.e. it does not involve phasing. | |
| 565 | |
| 566 To give a concrete example: the command above that specifies 50 5 0.5 would a) consider a | |
| 567 window of 50 SNPs, b) calculate LD between each pair of SNPs in the window, b) remove one of a pair of SNPs if the LD is greater than 0.5, c) shift the window 5 | |
| 568 SNPs forward and repeat the procedure. | |
| 569 | |
| 570 To make a new, pruned file, then use something like (in this example, we also convert the | |
| 571 standard PED fileset to a binary one): | |
| 572 plink --file data --extract plink.prune.in --make-bed --out pruneddata | |
| 573 """ | |
| 574 fplog,plog = tempfile.mkstemp() | |
| 575 alog = [] | |
| 576 alog.append('## Rgenetics: http://rgenetics.org Galaxy Tools rgQC.py Plink pruneLD runner\n') | |
| 577 for task in plinktasks: # each is a list | |
| 578 vcl = vclbase + task | |
| 579 sto = file(plog,'w') | |
| 580 x = subprocess.Popen(' '.join(vcl),shell=True,stdout=sto,stderr=sto,cwd=cd) | |
| 581 retval = x.wait() | |
| 582 sto.close() | |
| 583 try: | |
| 584 lplog = file(plog,'r').readlines() | |
| 585 lplog = [x for x in lplog if x.find('Pruning SNP') == -1] | |
| 586 alog += lplog | |
| 587 alog.append('\n') | |
| 588 os.unlink(plog) # no longer needed | |
| 589 except: | |
| 590 alog.append('### %s Strange - no std out from plink when running command line\n%s\n' % (timenow(),' '.join(vcl))) | |
| 591 return alog | |
| 592 | |
| 593 def readMap(mapfile=None,allmarkers=False,rsdict={},c=None,spos=None,epos=None): | |
| 594 """abstract out - keeps reappearing | |
| 595 """ | |
| 596 mfile = open(mapfile, 'r') | |
| 597 markers = [] | |
| 598 snpcols = {} | |
| 599 snpIndex = 0 # in case empty or comment lines | |
| 600 for rownum,row in enumerate(mfile): | |
| 601 line = row.strip() | |
| 602 if not line or line[0]=='#': continue | |
| 603 chrom, snp, genpos, abspos = line.split()[:4] # just in case more cols | |
| 604 try: | |
| 605 abspos = int(abspos) | |
| 606 except: | |
| 607 abspos = 0 # stupid framingham data grumble grumble | |
| 608 if allmarkers or rsdict.get(snp,None) or (chrom == c and (spos <= abspos <= epos)): | |
| 609 markers.append((chrom,abspos,snp)) # decorate for sort into genomic | |
| 610 snpcols[snp] = snpIndex # so we know which col to find genos for this marker | |
| 611 snpIndex += 1 | |
| 612 markers.sort() | |
| 613 rslist = [x[2] for x in markers] # drop decoration | |
| 614 rsdict = dict(zip(rslist,rslist)) | |
| 615 mfile.close() | |
| 616 return markers,snpcols,rslist,rsdict | |
| 617 | |
| 618 |
