Mercurial > repos > xuebing > sharplabtool
diff tools/rgenetics/rgCaCo.py @ 0:9071e359b9a3
Uploaded
author | xuebing |
---|---|
date | Fri, 09 Mar 2012 19:37:19 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/rgenetics/rgCaCo.py Fri Mar 09 19:37:19 2012 -0500 @@ -0,0 +1,271 @@ +#!/usr/local/bin/python +# hack to run and process a plink case control association +# expects args as +# bfilepath outname jobname outformat (wig,xls) +# ross lazarus +# for wig files, we need annotation so look for map file or complain +""" +Parameters for wiggle track definition lines +All options are placed in a single line separated by spaces: + + track type=wiggle_0 name=track_label description=center_label \ + visibility=display_mode color=r,g,b altColor=r,g,b \ + priority=priority autoScale=on|off \ + gridDefault=on|off maxHeightPixels=max:default:min \ + graphType=bar|points viewLimits=lower:upper \ + yLineMark=real-value yLineOnOff=on|off \ + windowingFunction=maximum|mean|minimum smoothingWindow=off|2-16 +""" + +import sys,math,shutil,subprocess,os,time,tempfile,string +from os.path import abspath +from rgutils import timenow, plinke +imagedir = '/static/rg' # if needed for images +myversion = 'V000.1 April 2007' +verbose = False + +def makeGFF(resf='',outfname='',logf=None,twd='.',name='track name',description='track description',topn=1000): + """ + score must be scaled to 0-1000 + + Want to make some wig tracks from each analysis + Best n -log10(p). Make top hit the window. + we use our tab output which has + rs chrom offset ADD_stat ADD_p ADD_log10p + rs3094315 1 792429 1.151 0.2528 0.597223 + + """ + + def is_number(s): + try: + float(s) + return True + except ValueError: + return False + header = 'track name=%s description="%s" visibility=2 useScore=1 color=0,60,120\n' % (name,description) + column_names = [ 'Seqname', 'Source', 'Feature', 'Start', 'End', 'Score', 'Strand', 'Frame', 'Group' ] + halfwidth=100 + resfpath = os.path.join(twd,resf) + resf = open(resfpath,'r') + resfl = resf.readlines() # dumb but convenient for millions of rows + resfl = [x.split() for x in resfl] + headl = resfl[0] + resfl = resfl[1:] + headl = [x.strip().upper() for x in headl] + headIndex = dict(zip(headl,range(0,len(headl)))) + whatwewant = ['CHR','RS','OFFSET','LOG10ARMITAGEP'] + wewant = [headIndex.get(x,None) for x in whatwewant] + if None in wewant: # missing something + logf.write('### Error missing a required header from %s in makeGFF - headIndex=%s\n' % (whatwewant,headIndex)) + return + ppos = wewant[3] # last in list + resfl = [x for x in resfl if x[ppos] > '' and x[ppos] <> 'NA'] + resfl = [(float(x[ppos]),x) for x in resfl] # decorate + resfl.sort() + resfl.reverse() # using -log10 so larger is better + pvals = [x[0] for x in resfl] # need to scale + resfl = [x[1] for x in resfl] # drop decoration + resfl = resfl[:topn] # truncate + maxp = max(pvals) # need to scale + minp = min(pvals) + prange = abs(maxp-minp) + 0.5 # fudge + scalefact = 1000.0/prange + logf.write('###maxp=%f,minp=%f,prange=%f,scalefact=%f\n' % (maxp,minp,prange,scalefact)) + for i,row in enumerate(resfl): + row[ppos] = '%d' % (int(scalefact*pvals[i])) + resfl[i] = row # replace + outf = file(outfname,'w') + outf.write(header) + outres = [] # need to resort into chrom offset order + for i,lrow in enumerate(resfl): + chrom,snp,offset,p, = [lrow[x] for x in wewant] + gff = ('chr%s' % chrom,'rgCaCo','variation','%d' % (int(offset)-halfwidth), + '%d' % (int(offset)+halfwidth),p,'.','.','%s logp=%1.2f' % (snp,pvals[i])) + outres.append(gff) + outres = [(x[0],int(x[3]),x) for x in outres] # decorate + outres.sort() # into chrom offset + outres=[x[2] for x in outres] # undecorate + outres = ['\t'.join(x) for x in outres] + outf.write('\n'.join(outres)) + outf.write('\n') + outf.close() + + +def plink_assocToGG(plinkout="hm",tag='test'): + """ plink --assoc output looks like this + # CHR SNP A1 F_A F_U A2 CHISQ P OR + # 1 rs3094315 G 0.6685 0.1364 A 104.1 1.929e-24 12.77 + # write as a genegraph input file + """ + inf = file('%s.assoc' % plinkout,'r') + outf = file('%sassoc.xls' % plinkout,'w') + res = ['rs\tlog10p%s\tFakeInvOR%s\tRealOR%s' % (tag,tag,tag),] # output header for ucsc genome graphs + head = inf.next() + for l in inf: + ll = l.split() + if len(ll) >= 8: + p = float(ll[7]) + if p <> 'NA': # eesh + logp = '%9.9f' % -math.log10(p) + else: + logp = 'NA' + try: + orat = ll[8] + except: + orat = 'NA' + orat2 = orat + # invert large negative odds ratios + if float(orat) < 1 and float(orat) > 0.0: + orat2 = '%9.9f' % (1.0/float(orat)) + outl = [ll[1],logp, orat2, orat] + res.append('\t'.join(outl)) + outf.write('\n'.join(res)) + outf.write('\n') + outf.close() + inf.close() + +def xformModel(infname='',resf='',outfname='', + name='foo',mapf='/usr/local/galaxy/data/rg/ped/x.bim',flog=None): + """munge a plink .model file into either a ucsc track or an xls file + rerla@meme ~/plink]$ head hmYRI_CEU.model + CHR SNP TEST AFF UNAFF CHISQ DF P + 1 rs3094315 GENO 41/37/11 0/24/64 NA NA NA + 1 rs3094315 TREND 119/59 24/152 81.05 1 2.201e-19 + 1 rs3094315 ALLELIC 119/59 24/152 104.1 1 1.929e-24 + 1 rs3094315 DOM 78/11 24/64 NA NA NA + + bim file has +[rerla@beast pbed]$ head plink_wgas1_example.bim +1 rs3094315 0.792429 792429 G A +1 rs6672353 0.817376 817376 A G + """ + if verbose: + print 'Rgenetics rgCaCo.xformModel got resf=%s, outfname=%s' % (resf,outfname) + res = [] + rsdict = {} + map = file(mapf,'r') + for l in map: # plink map + ll = l.strip().split() + if len(ll) >= 3: + rs=ll[1].strip() + chrom = ll[0] + if chrom.lower() == 'x': + chrom='23' + elif chrom.lower() == 'y': + chrom = 24 + elif chrom.lower() == 'mito': + chrom = 25 + offset = ll[3] + rsdict[rs] = (chrom,offset) + res.append('rs\tChr\tOffset\tGenop\tlog10Genop\tArmitagep\tlog10Armitagep\tAllelep\tlog10Allelep\tDomp\tlog10Domp') + f = open(resf,'r') + headl = f.readline() + if headl.find('\t') <> -1: + headl = headl.split('\t') + delim = '\t' + else: + headl = headl.split() + delim = None + whatwewant = ['CHR','SNP','TEST','AFF','UNAFF','CHISQ','P'] + wewant = [headl.index(x) for x in whatwewant] + llen = len(headl) + lnum = anum = 0 + lastsnp = None # so we know when to write out a gg line + outl = {} + f.seek(0) + for lnum,l in enumerate(f): + if lnum == 0: + continue + ll = l.split() + if delim: + ll = l.split(delim) + if len(ll) >= llen: # valid line + chr,snp,test,naff,nuaff,chi,p = [ll[x] for x in wewant] + snp = snp.strip() + chrom,offset = rsdict.get(snp,(None,None)) + anum += 1 + fp = 1.0 # if NA + lp = 0.0 + try: + fp = float(p) + if fp > 0: + lp = -math.log10(fp) + else: + fp = 9e-100 + flog.write('### WARNING - Plink calculated %s for %s p value!!! 9e-100 substituted!\n' % (p,test)) + flog.write('### offending line #%d in %s = %s' % (lnum,l)) + except: + pass + if snp <> lastsnp: + if len(outl.keys()) > 3: + sl = [outl.get(x,'?') for x in ('snp','chrom','offset','GENO','TREND','ALLELIC','DOM')] + res.append('\t'.join(sl)) # last snp line + outl = {'snp':snp,'chrom':chrom,'offset':offset} # first 3 cols for gg line + lastsnp = snp # reset for next marker + #if p == 'NA': + # p = 1.0 + # let's pass downstream for handling R is fine? + outl[test] = '%s\t%f' % (p,lp) + if len(outl.keys()) > 3: + l = [outl.get(x,'?') for x in ('snp','chrom','offset','GENO','TREND','ALLELIC','DOM')] + res.append('\t'.join(l)) # last snp line + f = file(outfname,'w') + res.append('') + f.write('\n'.join(res)) + f.close() + + + + +if __name__ == "__main__": + """ + # called as + <command interpreter="python"> + rgCaCo.py '$i.extra_files_path/$i.metadata.base_name' "$name" + '$out_file1' '$logf' '$logf.files_path' '$gffout' + </command> </command> + """ + if len(sys.argv) < 7: + s = 'rgCaCo.py needs 6 params - got %s \n' % (sys.argv) + print >> sys.stdout, s + sys.exit(0) + bfname = sys.argv[1] + name = sys.argv[2] + killme = string.punctuation + string.whitespace + trantab = string.maketrans(killme,'_'*len(killme)) + name = name.translate(trantab) + outfname = sys.argv[3] + logf = sys.argv[4] + logoutdir = sys.argv[5] + gffout = sys.argv[6] + topn = 1000 + try: + os.makedirs(logoutdir) + except: + pass + map_file = None + me = sys.argv[0] + amapf = '%s.bim' % bfname # to decode map in xformModel + flog = file(logf,'w') + logme = [] + cdir = os.getcwd() + s = 'Rgenetics %s http://rgenetics.org Galaxy Tools, rgCaCo.py started %s\n' % (myversion,timenow()) + print >> sys.stdout, s # so will appear as blurb for file + logme.append(s) + if verbose: + s = 'rgCaCo.py: bfname=%s, logf=%s, argv = %s\n' % (bfname, logf, sys.argv) + print >> sys.stdout, s # so will appear as blurb for file + logme.append(s) + twd = tempfile.mkdtemp(suffix='rgCaCo') # make sure plink doesn't spew log file into the root! + tname = os.path.join(twd,name) + vcl = [plinke,'--noweb','--bfile',bfname,'--out',name,'--model'] + p=subprocess.Popen(' '.join(vcl),shell=True,stdout=flog,cwd=twd) + retval = p.wait() + resf = '%s.model' % tname # plink output is here we hope + xformModel(bfname,resf,outfname,name,amapf,flog) # leaves the desired summary file + makeGFF(resf=outfname,outfname=gffout,logf=flog,twd=twd,name='rgCaCo_TopTable',description=name,topn=topn) + flog.write('\n'.join(logme)) + flog.close() # close the log used + #shutil.copytree(twd,logoutdir) + shutil.rmtree(twd) # clean up +