diff tools/rgenetics/rgutils.py @ 0:9071e359b9a3
Uploaded
| author | xuebing |
| --- | --- |
| date | Fri, 09 Mar 2012 19:37:19 -0500 |
| parents | |
| children | |
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/rgenetics/rgutils.py	Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,618 @@
+# utilities for rgenetics
+#
+# copyright 2009 ross lazarus
+# released under the LGPL
+#
+
+import subprocess, os, sys, time, tempfile, string, plinkbinJZ
+import datetime
+
+galhtmlprefix = """<?xml version="1.0" encoding="utf-8" ?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+<head>
+<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+<meta name="generator" content="Galaxy %s tool output - see http://g2.trac.bx.psu.edu/" />
+<title></title>
+<link rel="stylesheet" href="/static/style/base.css" type="text/css" />
+</head>
+<body>
+<div class="document">
+"""
+galhtmlattr = """<h3><a href="http://rgenetics.org">Rgenetics</a> tool %s run at %s</h3>"""
+galhtmlpostfix = """</div></body></html>\n"""
+
+plinke = 'plink' # changed jan 2010 - all exes must be on path
+rexe = 'R' # to avoid cluster/platform dependencies
+smartpca = 'smartpca.perl'
+
+def timenow():
+    """return current time as a string
+    """
+    return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time()))
+
+def timestamp():
+    return datetime.datetime.now().strftime('%Y%m%d%H%M%S')
+
+def fail(message):
+    print >> sys.stderr, message
+    return -1
+
+def whereis(program):
+    for path in os.environ.get('PATH', '').split(':'):
+        if os.path.exists(os.path.join(path, program)) and \
+           not os.path.isdir(os.path.join(path, program)):
+            return os.path.join(path, program)
+    return None
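The constants above assume plink, R and smartpca.perl are reachable on PATH. Not part of the uploaded file, but a minimal sketch (assuming rgutils.py imports cleanly, i.e. plinkbinJZ is importable, so these names are in scope) of how whereis() could verify that before any tool runs:

for exe in (plinke, rexe, smartpca):
    path = whereis(exe)  # full path to the executable, or None if not found
    if path is None:
        print 'WARNING: %s is not on PATH' % exe
    else:
        print '%s found at %s' % (exe, path)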
+
+
+def bedToPicInterval(infile=None):
+    """
+    Picard tools requiring targets want
+    a sam style header which incidentally, MUST be sorted in natural order - not lexicographic order:
+
+    @SQ	SN:chrM	LN:16571
+    @SQ	SN:chr1	LN:247249719
+    @SQ	SN:chr2	LN:242951149
+    @SQ	SN:chr3	LN:199501827
+    @SQ	SN:chr4	LN:191273063
+    added to the start of what looks like a bed style file
+    chr1	67052400	67052451	-	CCDS635.1_cds_0_0_chr1_67052401_r
+    chr1	67060631	67060788	-	CCDS635.1_cds_1_0_chr1_67060632_r
+    chr1	67065090	67065317	-	CCDS635.1_cds_2_0_chr1_67065091_r
+    chr1	67066082	67066181	-	CCDS635.1_cds_3_0_chr1_67066083_r
+
+    see http://genome.ucsc.edu/FAQ/FAQtracks.html#tracks1
+    we need to add 1 to start coordinates on the way through - but length calculations are easier
+    """
+    # bedToPicard.py
+    # ross lazarus October 2010
+    # LGPL
+    # for Rgenetics
+
+    def getFlen(bedfname=None):
+        """
+        find all features in a BED file and sum their lengths
+        """
+        features = {}
+        try:
+            infile = open(bedfname,'r')
+        except:
+            print '###ERROR: getFlen unable to open bedfile %s' % bedfname
+            sys.exit(1)
+        for i,row in enumerate(infile):
+            if row[0] == '@': # shouldn't happen given a bed file!
+                print 'row %d=%s - should NOT start with @!' % (i,row)
+                sys.exit(1)
+            row = row.strip()
+            if len(row) > 0:
+                srow = row.split('\t')
+                f = srow[0]
+                spos = srow[1] # zero based from UCSC so no need to add 1 - eg 0-100 is 100 bases numbered 0-99 (!)
+                epos = srow[2] # see http://genome.ucsc.edu/FAQ/FAQtracks.html#tracks1
+                flen = int(epos) - int(spos)
+                features.setdefault(f,0)
+                features[f] += flen
+        infile.close()
+        return features
+
+    def keynat(string):
+        '''
+        borrowed from http://code.activestate.com/recipes/285264-natural-string-sorting/
+        A natural sort helper function for sort() and sorted()
+        without using regular expressions or exceptions.
+
+        >>> items = ('Z', 'a', '10th', '1st', '9')
+        >>> sorted(items)
+        ['10th', '1st', '9', 'Z', 'a']
+        >>> sorted(items, key=keynat)
+        ['1st', '9', '10th', 'a', 'Z']
+        '''
+        it = type(1)
+        r = []
+        for c in string:
+            if c.isdigit():
+                d = int(c)
+                if r and type(r[-1]) == it:
+                    r[-1] = r[-1] * 10 + d
+                else:
+                    r.append(d)
+            else:
+                r.append(c.lower())
+        return r
+
+    def writePic(outfname=None,bedfname=None):
+        """
+        collect header info and rewrite bed with header for picard
+        """
+        featlen = getFlen(bedfname=bedfname)
+        try:
+            outf = open(outfname,'w')
+        except:
+            print '###ERROR: writePic unable to open output picard file %s' % outfname
+            sys.exit(1)
+        infile = open(bedfname,'r') # already tested in getFlen
+        k = featlen.keys()
+        fk = sorted(k, key=keynat)
+        header = ['@SQ\tSN:%s\tLN:%d' % (x,featlen[x]) for x in fk]
+        outf.write('\n'.join(header))
+        outf.write('\n')
+        for row in infile:
+            row = row.strip()
+            if len(row) > 0: # convert zero based start coordinate to 1 based
+                srow = row.split('\t')
+                srow[1] = '%d' % (int(srow[1])+1) # see http://genome.ucsc.edu/FAQ/FAQtracks.html#tracks1
+                outf.write('\t'.join(srow))
+                outf.write('\n')
+        outf.close()
+        infile.close()
+
+    # bedToPicInterval starts here
+    fd,outf = tempfile.mkstemp(prefix='rgPicardHsMetrics')
+    writePic(outfname=outf,bedfname=infile)
+    return outf
+
+
+def getFileString(fpath, outpath):
+    """
+    format a nice file size string
+    """
+    size = ''
+    fp = os.path.join(outpath, fpath)
+    s = '? ?'
+    if os.path.isfile(fp):
+        n = float(os.path.getsize(fp))
+        if n > 2**20:
+            size = ' (%1.1f MB)' % (n/2**20)
+        elif n > 2**10:
+            size = ' (%1.1f KB)' % (n/2**10)
+        elif n > 0:
+            size = ' (%d B)' % (int(n))
+        s = '%s %s' % (fpath, size)
+    return s
+
+
+def fixPicardOutputs(tempout=None,output_dir=None,log_file=None,html_output=None,progname=None,cl=[],transpose=True):
+    """
+    picard produces long hard to read tab header files
+    make them available but present them transposed for readability
+    """
+    rstyle="""<style type="text/css">
+    tr.d0 td {background-color: oldlace; color: black;}
+    tr.d1 td {background-color: aliceblue; color: black;}
+    </style>"""
+    cruft = []
+    dat = []
+    try:
+        r = open(tempout,'r').readlines()
+    except:
+        r = []
+    for row in r:
+        if row.strip() > '':
+            srow = row.split('\t')
+            if row[0] == '#':
+                cruft.append(row.strip()) # want strings
+            else:
+                dat.append(srow) # want lists
+
+    res = [rstyle,]
+    res.append(galhtmlprefix % progname)
+    res.append(galhtmlattr % (progname,timenow()))
+    flist = os.listdir(output_dir)
+    pdflist = [x for x in flist if os.path.splitext(x)[-1].lower() == '.pdf']
+    if len(pdflist) > 0: # assumes all pdfs come with thumbnail .jpgs
+        for p in pdflist:
+            imghref = '%s.jpg' % os.path.splitext(p)[0] # removes .pdf
+            res.append('<table cellpadding="10"><tr><td>\n')
+            res.append('<a href="%s"><img src="%s" alt="Click thumbnail to download %s" hspace="10" align="middle"></a>\n' % (p,imghref,p))
+            res.append('</td></tr></table>\n')
+    res.append('<b>Your job produced the following output files.</b><hr/>\n')
+    res.append('<table>\n')
+    for i,f in enumerate(flist):
+        fn = os.path.split(f)[-1]
+        res.append('<tr><td><a href="%s">%s</a></td></tr>\n' % (fn,fn))
+    res.append('</table><p/>\n')
+    if len(cruft) + len(dat) > 0:
+        res.append('<b>Picard on line resources</b><ul>\n')
+        res.append('<li><a href="http://picard.sourceforge.net/index.shtml">Click here for Picard Documentation</a></li>\n')
+        res.append('<li><a href="http://picard.sourceforge.net/picard-metric-definitions.shtml">Click here for Picard Metrics definitions</a></li></ul><hr/>\n')
+        if transpose:
+            res.append('<b>Picard output (transposed for readability)</b><hr/>\n')
+        else:
+            res.append('<b>Picard output</b><hr/>\n')
+        res.append('<table cellpadding="3" >\n')
+        if len(cruft) > 0:
+            cres = ['<tr class="d%d"><td>%s</td></tr>' % (i % 2,x) for i,x in enumerate(cruft)]
+            res += cres
+        if len(dat) > 0:
+            maxrows = 100
+            if transpose:
+                tdat = map(None,*dat) # transpose an arbitrary list of lists
+                missing = len(tdat) - maxrows
+                tdat = ['<tr class="d%d"><td>%s</td><td>%s</td></tr>\n' % ((i+len(cruft)) % 2,x[0],x[1]) for i,x in enumerate(tdat) if i < maxrows]
+                if missing > 0: # rows were dropped by the maxrows cap
+                    tdat.append('<tr><td colspan="2">...WARNING: %d rows deleted for sanity...see raw files for all rows</td></tr>' % missing)
+            else:
+                tdat = ['<tr class="d%d"><td>%s</td></tr>\n' % ((i+len(cruft)) % 2,x) for i,x in enumerate(dat) if i < maxrows]
+                if len(dat) > maxrows:
+                    missing = len(dat) - maxrows
+                    tdat.append('<tr><td>...WARNING: %d rows deleted for sanity...see raw files for all rows</td></tr>' % missing)
+            res += tdat
+        res.append('</table>\n')
+    else:
+        res.append('<b>No Picard output found - please consult the Picard log above for an explanation</b>')
+    l = open(log_file,'r').readlines()
+    if len(l) > 0:
+        res.append('<b>Picard log</b><hr/>\n')
+        rlog = ['<pre>',]
+        rlog += l
+        rlog.append('</pre>')
+        res += rlog
+    else:
+        res.append("Odd, Picard left no log file %s - must have really barfed badly?" % log_file)
+    res.append('<hr/>The freely available <a href="http://picard.sourceforge.net/command-line-overview.shtml">Picard software</a> \n')
+    res.append('generated all outputs reported here, using this command line:<br/>\n<pre>%s</pre>\n' % ''.join(cl))
+    res.append(galhtmlpostfix)
+    outf = open(html_output,'w')
+    outf.write(''.join(res))
+    outf.write('\n')
+    outf.close()
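Editorial sketch, not part of the uploaded file: a hypothetical call to fixPicardOutputs after a Picard run. All paths and the command line below are invented for illustration; tempout is the raw Picard metrics file and the HTML report is written to html_output.

fixPicardOutputs(tempout='/tmp/hsmetrics.txt',             # hypothetical raw Picard metrics file
                 output_dir='/tmp/picard_job',
                 log_file='/tmp/picard_job/picard.log',
                 html_output='/tmp/picard_job/report.html',
                 progname='rgPicardHsMetrics',
                 cl=['java -jar CalculateHsMetrics.jar ...'],  # joined and echoed into the report
                 transpose=True)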
+
+def keynat(string):
+    '''
+    borrowed from http://code.activestate.com/recipes/285264-natural-string-sorting/
+    A natural sort helper function for sort() and sorted()
+    without using regular expressions or exceptions.
+
+    >>> items = ('Z', 'a', '10th', '1st', '9')
+    >>> sorted(items)
+    ['10th', '1st', '9', 'Z', 'a']
+    >>> sorted(items, key=keynat)
+    ['1st', '9', '10th', 'a', 'Z']
+    '''
+    it = type(1)
+    r = []
+    for c in string:
+        if c.isdigit():
+            d = int(c)
+            if r and type(r[-1]) == it:
+                r[-1] = r[-1] * 10 + d
+            else:
+                r.append(d)
+        else:
+            r.append(c.lower())
+    return r
+
+def getFlen(bedfname=None):
+    """
+    find all features in a BED file and sum their lengths
+    """
+    features = {}
+    otherHeaders = []
+    try:
+        infile = open(bedfname,'r')
+    except:
+        print '###ERROR: getFlen unable to open bedfile %s' % bedfname
+        sys.exit(1)
+    for i,row in enumerate(infile):
+        if row.startswith('@'): # add to headers if not @SQ
+            if not row.startswith('@SQ'):
+                otherHeaders.append(row)
+        else:
+            row = row.strip()
+            if row.startswith('#') or row.lower().startswith('browser') or row.lower().startswith('track'):
+                continue # ignore headers
+            srow = row.split('\t')
+            if len(srow) > 3:
+                f = srow[0]
+                spos = srow[1] # zero based from UCSC so no need to add 1 - eg 0-100 is 100 bases numbered 0-99 (!)
+                epos = srow[2] # see http://genome.ucsc.edu/FAQ/FAQtracks.html#tracks1
+                flen = int(epos) - int(spos)
+                features.setdefault(f,0)
+                features[f] += flen
+    infile.close()
+    fk = features.keys()
+    fk = sorted(fk, key=keynat)
+    return features,fk,otherHeaders
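A quick illustration (not part of the uploaded file) of why keynat matters for contig names: plain lexicographic sorting puts chr10 before chr2, while the natural sort gives the ordering the Picard @SQ header expects.

chroms = ['chr10', 'chr2', 'chr1', 'chrM', 'chrX']
print sorted(chroms)              # ['chr1', 'chr10', 'chr2', 'chrM', 'chrX'] - lexicographic
print sorted(chroms, key=keynat)  # ['chr1', 'chr2', 'chr10', 'chrM', 'chrX'] - natural order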
+
+def bedToPicInterval(infile=None,outfile=None):
+    """
+    Picard tools requiring targets want
+    a sam style header which incidentally, MUST be sorted in natural order - not lexicographic order:
+
+    @SQ	SN:chrM	LN:16571
+    @SQ	SN:chr1	LN:247249719
+    @SQ	SN:chr2	LN:242951149
+    @SQ	SN:chr3	LN:199501827
+    @SQ	SN:chr4	LN:191273063
+    added to the start of what looks like a bed style file
+    chr1	67052400	67052451	-	CCDS635.1_cds_0_0_chr1_67052401_r
+    chr1	67060631	67060788	-	CCDS635.1_cds_1_0_chr1_67060632_r
+    chr1	67065090	67065317	-	CCDS635.1_cds_2_0_chr1_67065091_r
+    chr1	67066082	67066181	-	CCDS635.1_cds_3_0_chr1_67066083_r
+
+    see http://genome.ucsc.edu/FAQ/FAQtracks.html#tracks1
+    we need to add 1 to start coordinates on the way through - but length calculations are easier
+    """
+    # bedToPicard.py
+    # ross lazarus October 2010
+    # LGPL
+    # for Rgenetics
+    #
+    # collect header info and rewrite bed with header for picard
+    featlen,fk,otherHeaders = getFlen(bedfname=infile)
+    try:
+        outf = open(outfile,'w')
+    except:
+        print '###ERROR: writePic unable to open output picard file %s' % outfile
+        sys.exit(1)
+    inf = open(infile,'r') # already tested in getFlen
+    header = ['@SQ\tSN:%s\tLN:%d' % (x,featlen[x]) for x in fk]
+    if len(otherHeaders) > 0:
+        header += otherHeaders
+    outf.write('\n'.join(header))
+    outf.write('\n')
+    for row in inf:
+        row = row.strip()
+        if len(row) > 0: # convert zero based start coordinate to 1 based
+            if row.startswith('@'):
+                continue
+            else:
+                srow = row.split('\t')
+                srow[1] = '%d' % (int(srow[1])+1) # see http://genome.ucsc.edu/FAQ/FAQtracks.html#tracks1
+                outf.write('\t'.join(srow))
+                outf.write('\n')
+    outf.close()
+    inf.close()
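Editorial sketch, not part of the uploaded file: converting a hypothetical BED file of target regions into a Picard-style interval list with the function above. As the code shows, the @SQ LN values are the summed feature lengths per contig taken from the BED itself (via getFlen), and start coordinates are shifted from 0-based to 1-based.

# 'targets.bed' and the output name are made up for illustration
bedToPicInterval(infile='targets.bed', outfile='targets.picard.interval_list')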
+
+
+def oRRun(rcmd=[],outdir=None,title='myR',rexe='R'):
+    """
+    run an r script, lines in rcmd,
+    in a temporary directory
+    move everything, r script and all back to outdir which will be an html file
+
+    # test
+    RRun(rcmd=['print("hello cruel world")','q()'],title='test')
+    """
+    rlog = []
+    print '### rexe = %s' % rexe
+    assert os.path.isfile(rexe)
+    rname = '%s.R' % title
+    stoname = '%s.R.log' % title
+    rfname = rname
+    stofname = stoname
+    if outdir: # want a specific path
+        rfname = os.path.join(outdir,rname)
+        stofname = os.path.join(outdir,stoname)
+        try:
+            os.makedirs(outdir) # might not be there yet...
+        except:
+            pass
+    else:
+        outdir = tempfile.mkdtemp(prefix=title)
+        rfname = os.path.join(outdir,rname)
+        stofname = os.path.join(outdir,stoname)
+        rmoutdir = True
+    f = open(rfname,'w')
+    if type(rcmd) == type([]):
+        f.write('\n'.join(rcmd))
+    else: # string
+        f.write(rcmd)
+    f.write('\n')
+    f.close()
+    sto = file(stofname,'w')
+    vcl = [rexe,"--vanilla --slave", '<', rfname]
+    x = subprocess.Popen(' '.join(vcl),shell=True,stderr=sto,stdout=sto,cwd=outdir)
+    retval = x.wait()
+    sto.close()
+    rlog = file(stofname,'r').readlines()
+    rlog.insert(0,'## found R at %s' % rexe)
+    if outdir != None:
+        flist = os.listdir(outdir)
+    else:
+        flist = os.listdir('.')
+    flist.sort()
+    flist = [(x,x) for x in flist]
+    for i,x in enumerate(flist):
+        if x[0] == rname:
+            flist[i] = (x[0],'R script for %s' % title)
+        elif x[0] == stoname:
+            flist[i] = (x[0],'R log for %s' % title)
+    if False and rmoutdir:
+        os.removedirs(outdir)
+    return rlog,flist # for html layout
+
+
+def RRun(rcmd=[],outdir=None,title='myR',tidy=True):
+    """
+    run an r script, lines in rcmd,
+    in a temporary directory
+    move everything, r script and all back to outdir which will be an html file
+
+    # test
+    RRun(rcmd=['print("hello cruel world")','q()'],title='test')
+    echo "a <- c(5, 5); b <- c(0.5, 0.5)" | cat - RScript.R | R --slave --vanilla
+    suggested by http://tolstoy.newcastle.edu.au/R/devel/05/09/2448.html
+    """
+    killme = string.punctuation + string.whitespace
+    trantab = string.maketrans(killme,'_'*len(killme))
+    title = title.translate(trantab)
+    rlog = []
+    tempout = False
+    rname = '%s.R' % title
+    stoname = '%s.R.log' % title
+    cwd = os.getcwd()
+    if outdir: # want a specific path
+        try:
+            os.makedirs(outdir) # might not be there yet...
+        except:
+            pass
+        os.chdir(outdir)
+    if type(rcmd) == type([]):
+        script = '\n'.join(rcmd)
+    else: # string
+        script = rcmd
+    sto = file(stoname,'w')
+    rscript = file(rname,'w')
+    rscript.write(script)
+    rscript.write('\n#R script autogenerated by rgenetics/rgutils.py on %s\n' % timenow())
+    rscript.close()
+    vcl = '%s --slave --vanilla < %s' % (rexe,rname)
+    if outdir:
+        x = subprocess.Popen(vcl,shell=True,stderr=sto,stdout=sto,cwd=outdir)
+    else:
+        x = subprocess.Popen(vcl,shell=True,stderr=sto,stdout=sto)
+    retval = x.wait()
+    sto.close()
+    rlog = file(stoname,'r').readlines()
+    if retval != 0:
+        rlog.insert(0,'Nonzero exit code = %d' % retval) # indicate failure
+    if outdir:
+        flist = os.listdir(outdir)
+    else:
+        flist = os.listdir(os.getcwd())
+    flist.sort()
+    flist = [(x,x) for x in flist]
+    for i,x in enumerate(flist):
+        if x[0] == rname:
+            flist[i] = (x[0],'R script for %s' % title)
+        elif x[0] == stoname:
+            flist[i] = (x[0],'R log for %s' % title)
+    if outdir:
+        os.chdir(cwd)
+    return rlog,flist # for html layout
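Editorial sketch, not part of the uploaded file: running a throwaway R script with RRun, assuming an executable named by the module-level rexe ('R') is on PATH. The script and log file names are derived from the title after punctuation and whitespace are replaced with underscores.

rlog, flist = RRun(rcmd=['x <- rnorm(10)', 'print(summary(x))', 'q()'],
                   outdir='rrun_test', title='rrun test')  # hypothetical output directory
print ''.join(rlog)   # captured R console output from rrun_test.R.log
print flist           # [(filename, description), ...] pairs for HTML layout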
+
+def runPlink(bfn='bar',ofn='foo',logf=None,plinktasks=[],cd='./',vclbase=[]):
+    """run a series of plink tasks and append log results to stdout
+    vcl has a list of parameters for the spawnv
+    common settings can all go in the vclbase list and are added to each plinktask
+    """
+    # root for all
+    fplog,plog = tempfile.mkstemp()
+    if type(logf) == type(' '): # open otherwise assume is file - ugh I'm in a hurry
+        mylog = file(logf,'a+')
+    else:
+        mylog = logf
+    mylog.write('## Rgenetics: http://rgenetics.org Galaxy Tools rgQC.py Plink runner\n')
+    for task in plinktasks: # each is a list
+        vcl = vclbase + task
+        sto = file(plog,'w')
+        x = subprocess.Popen(' '.join(vcl),shell=True,stdout=sto,stderr=sto,cwd=cd)
+        retval = x.wait()
+        sto.close()
+        try:
+            lplog = file(plog,'r').read()
+            mylog.write(lplog)
+            os.unlink(plog) # no longer needed
+        except:
+            mylog.write('### %s Strange - no std out from plink when running command line\n%s' % (timenow(),' '.join(vcl)))
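Editorial sketch, not part of the uploaded file: one way plinktasks and vclbase might be assembled for runPlink. vclbase carries the options shared by every task; each task list is appended to it and the joined command is run through the shell, with plink's output appended to the named log. The dataset and file names are hypothetical.

vclbase = [plinke, '--noweb', '--bfile', 'mydata']
plinktasks = [['--missing', '--out', 'mydata_missing'],
              ['--freq', '--out', 'mydata_freq']]
runPlink(logf='plink_runner.log', plinktasks=plinktasks, cd='./', vclbase=vclbase)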
+
+def pruneLD(plinktasks=[],cd='./',vclbase=[]):
+    """
+    plink blathers when doing pruning - ignore
+    Linkage disequilibrium based SNP pruning
+    if a million snps in 3 billion base pairs, have mean 3k spacing
+    assume 40-60k of ld in ceu, a window of 120k width is about 40 snps
+    so lots more is perhaps less efficient - each window computational cost is
+    O(N^2) unless the code is smart enough to avoid unnecessary computation where
+    allele frequencies make it impossible to see ld > the r^2 cutoff threshold
+    So, do a window and move forward 20?
+    The fine Plink docs at http://pngu.mgh.harvard.edu/~purcell/plink/summary.shtml#prune
+    are reproduced below
+
+    Sometimes it is useful to generate a pruned subset of SNPs that are in approximate
+    linkage equilibrium with each other. This can be achieved via two commands: --indep,
+    which prunes based on the variance inflation factor (VIF) and recursively removes
+    SNPs within a sliding window; and --indep-pairwise, which is similar except that it
+    is based only on pairwise genotypic correlation.
+
+    Hint: The output of either of these commands is two lists of SNPs: those that are
+    pruned out and those that are not. A separate command using the --extract or
+    --exclude option is necessary to actually perform the pruning.
+
+    The VIF pruning routine is performed:
+    plink --file data --indep 50 5 2
+
+    will create files
+
+        plink.prune.in
+        plink.prune.out
+
+    Each is a simple list of SNP IDs; both these files can subsequently be specified
+    as the argument for a --extract or --exclude command.
+
+    The parameters for --indep are: window size in SNPs (e.g. 50), the number of SNPs
+    to shift the window at each step (e.g. 5), and the VIF threshold. The VIF is
+    1/(1-R^2) where R^2 is the multiple correlation coefficient for a SNP being
+    regressed on all other SNPs simultaneously. That is, this considers the
+    correlations between SNPs but also between linear combinations of SNPs. A VIF of
+    10 is often taken to represent near collinearity problems in standard multiple
+    regression analyses (i.e. implies R^2 of 0.9). A VIF of 1 would imply that the SNP
+    is completely independent of all other SNPs. Practically, values between 1.5 and 2
+    should probably be used; particularly in small samples, if this threshold is too
+    low and/or the window size is too large, too many SNPs may be removed.
+
+    The second procedure is performed:
+    plink --file data --indep-pairwise 50 5 0.5
+
+    This generates the same output files as the first version; the only difference is
+    that a simple pairwise threshold is used. The first two parameters (50 and 5) are
+    the same as above (window size and step); the third parameter represents the r^2
+    threshold. Note: this represents the pairwise SNP-SNP metric now, not the multiple
+    correlation coefficient; also note, this is based on the genotypic correlation,
+    i.e. it does not involve phasing.
+
+    To give a concrete example: the command above that specifies 50 5 0.5 would
+    a) consider a window of 50 SNPs, b) calculate LD between each pair of SNPs in the
+    window, c) remove one of a pair of SNPs if the LD is greater than 0.5, and
+    d) shift the window 5 SNPs forward and repeat the procedure.
+
+    To make a new, pruned file, then use something like (in this example, we also
+    convert the standard PED fileset to a binary one):
+    plink --file data --extract plink.prune.in --make-bed --out pruneddata
+    """
+    fplog,plog = tempfile.mkstemp()
+    alog = []
+    alog.append('## Rgenetics: http://rgenetics.org Galaxy Tools rgQC.py Plink pruneLD runner\n')
+    for task in plinktasks: # each is a list
+        vcl = vclbase + task
+        sto = file(plog,'w')
+        x = subprocess.Popen(' '.join(vcl),shell=True,stdout=sto,stderr=sto,cwd=cd)
+        retval = x.wait()
+        sto.close()
+        try:
+            lplog = file(plog,'r').readlines()
+            lplog = [x for x in lplog if x.find('Pruning SNP') == -1]
+            alog += lplog
+            alog.append('\n')
+            os.unlink(plog) # no longer needed
+        except:
+            alog.append('### %s Strange - no std out from plink when running command line\n%s\n' % (timenow(),' '.join(vcl)))
+    return alog
+
+def readMap(mapfile=None,allmarkers=False,rsdict={},c=None,spos=None,epos=None):
+    """abstract out - keeps reappearing
+    """
+    mfile = open(mapfile, 'r')
+    markers = []
+    snpcols = {}
+    snpIndex = 0 # in case empty or comment lines
+    for rownum,row in enumerate(mfile):
+        line = row.strip()
+        if not line or line[0] == '#':
+            continue
+        chrom, snp, genpos, abspos = line.split()[:4] # just in case more cols
+        try:
+            abspos = int(abspos)
+        except:
+            abspos = 0 # stupid framingham data grumble grumble
+        if allmarkers or rsdict.get(snp,None) or (chrom == c and (spos <= abspos <= epos)):
+            markers.append((chrom,abspos,snp)) # decorate for sort into genomic
+            snpcols[snp] = snpIndex # so we know which col to find genos for this marker
+            snpIndex += 1
+    markers.sort()
+    rslist = [x[2] for x in markers] # drop decoration
+    rsdict = dict(zip(rslist,rslist))
+    mfile.close()
+    return markers,snpcols,rslist,rsdict
+
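Editorial sketch, not part of the uploaded file: LD pruning along the lines described in the pruneLD docstring, using a 50-SNP window, a 5-SNP step and an r^2 threshold of 0.5, followed by a second pass that extracts the retained SNPs into a new binary fileset. The dataset name is hypothetical; with --out mydata, plink writes mydata.prune.in and mydata.prune.out.

vclbase = [plinke, '--noweb']
plinktasks = [['--bfile', 'mydata', '--indep-pairwise', '50', '5', '0.5', '--out', 'mydata'],
              ['--bfile', 'mydata', '--extract', 'mydata.prune.in', '--make-bed', '--out', 'mydata_pruned']]
log = pruneLD(plinktasks=plinktasks, cd='./', vclbase=vclbase)
print ''.join(log)   # plink log with the 'Pruning SNP...' chatter already filtered out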