Mercurial > repos > xuebing > sharplabtool
diff tools/rgenetics/rgTDT.py @ 0:9071e359b9a3
Uploaded
author | xuebing |
---|---|
date | Fri, 09 Mar 2012 19:37:19 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/rgenetics/rgTDT.py Fri Mar 09 19:37:19 2012 -0500 @@ -0,0 +1,264 @@ +#!/usr/local/bin/python +# hack to run and process a plink tdt +# expects args as +# bfilepath outname jobname outformat (wig,xls) +# ross lazarus +# for wig files, we need annotation so look for map file or complain + +""" +Parameters for wiggle track definition lines +All options are placed in a single line separated by spaces: + + track type=wiggle_0 name=track_label description=center_label \ + visibility=display_mode color=r,g,b altColor=r,g,b \ + priority=priority autoScale=on|off \ + gridDefault=on|off maxHeightPixels=max:default:min \ + graphType=bar|points viewLimits=lower:upper \ + yLineMark=real-value yLineOnOff=on|off \ + windowingFunction=maximum|mean|minimum smoothingWindow=off|2-16 +""" + +import sys,math,shutil,subprocess,os,time,tempfile,shutil,string +from os.path import abspath +from optparse import OptionParser +from rgutils import timenow, plinke +myversion = 'v0.003 January 2010' +verbose = False + + + +def makeGFF(resf='',outfname='',logf=None,twd='.',name='track name',description='track description',topn=1000): + """ + score must be scaled to 0-1000 + + Want to make some wig tracks from each analysis + Best n -log10(p). Make top hit the window. + we use our tab output which has + rs chrom offset ADD_stat ADD_p ADD_log10p + rs3094315 1 792429 1.151 0.2528 0.597223 + + """ + + def is_number(s): + try: + float(s) + return True + except ValueError: + return False + header = 'track name=%s description="%s" visibility=2 useScore=1 color=0,60,120\n' % (name,description) + column_names = [ 'Seqname', 'Source', 'Feature', 'Start', 'End', 'Score', 'Strand', 'Frame', 'Group' ] + halfwidth=100 + resfpath = os.path.join(twd,resf) + resf = open(resfpath,'r') + resfl = resf.readlines() # dumb but convenient for millions of rows + resfl = [x.split() for x in resfl] + headl = resfl[0] + resfl = resfl[1:] + headl = [x.strip().upper() for x in headl] + headIndex = dict(zip(headl,range(0,len(headl)))) + # s = 'rs\tchrom\toffset\ta1\ta2\ttransmitted\tuntransmitted\tTDTChiSq\tTDTp\t-log10TDTp\tAbsTDTOR\tTDTOR' + chrpos = headIndex.get('CHROM',None) + rspos = headIndex.get('RS',None) + offspos = headIndex.get('OFFSET',None) + ppos = headIndex.get('-LOG10TDTP',None) + wewant = [chrpos,rspos,offspos,ppos] + if None in wewant: # missing something + logf.write('### Error missing a required header in makeGFF - headIndex=%s\n' % headIndex) + return + resfl = [x for x in resfl if x[ppos] > ''] + resfl = [(float(x[ppos]),x) for x in resfl] # decorate + resfl.sort() + resfl.reverse() # using -log10 so larger is better + resfl = resfl[:topn] # truncate + pvals = [x[0] for x in resfl] # need to scale + resfl = [x[1] for x in resfl] # drop decoration + maxp = max(pvals) # need to scale + minp = min(pvals) + prange = abs(maxp-minp) + 0.5 # fudge + scalefact = 1000.0/prange + logf.write('###maxp=%f,minp=%f,prange=%f,scalefact=%f\n' % (maxp,minp,prange,scalefact)) + for i,row in enumerate(resfl): + row[ppos] = '%d' % (int(scalefact*pvals[i])) + resfl[i] = row # replace + outf = file(outfname,'w') + outf.write(header) + outres = [] # need to resort into chrom offset order + for i,lrow in enumerate(resfl): + chrom,snp,offset,p, = [lrow[x] for x in wewant] + gff = ('chr%s' % chrom,'rgTDT','variation','%d' % (int(offset)-halfwidth), + '%d' % (int(offset)+halfwidth),p,'.','.','%s logp=%1.2f' % (snp,pvals[i])) + outres.append(gff) + outres = [(x[0],int(x[3]),x) for x in outres] # decorate + outres.sort() # into chrom offset + outres=[x[2] for x in outres] # undecorate + outres = ['\t'.join(x) for x in outres] + outf.write('\n'.join(outres)) + outf.write('\n') + outf.close() + + + +def xformTDT(infname='',resf='',outfname='',name='foo',mapf='/usr/local/galaxy/data/rg/lped/x.bim'): + """munge a plink .tdt file into either a ucsc track or an xls file + CHR SNP A1:A2 T:U_TDT OR_TDT CHISQ_TDT P_TDT + 0 MitoT217C 2:3 0:0 NA NA NA + 0 MitoG228A 1:4 0:0 NA NA NA + 0 MitoT250C 2:3 0:0 NA NA NA + map file has + 1 rs4378174 0 003980745 + 1 rs10796404 0 005465256 + 1 rs2697965 0 014023092 + + grrr! + Changed in 1.01 to + [rerla@hg fresh]$ head database/job_working_directory/445/rgTDT.tdt + CHR SNP BP A1 A2 T U OR CHISQ P + 1 rs12562034 758311 1 3 71 79 0.8987 0.4267 0.5136 + 1 rs3934834 995669 4 2 98 129 0.7597 4.233 0.03963 + + + """ + if verbose: + print 'Rgenetics Galaxy Tools, rgTDT.py.xformTDT got resf=%s, outtype=%s, outfname=%s' % (resf,outtype,outfname) + wewantcols = ['SNP','CHR','BP','A1','A2','T','U','OR','CHISQ','P'] + res = [] + s = 'rs\tchrom\toffset\ta1\ta2\ttransmitted\tuntransmitted\tTDTChiSq\tTDTp\t-log10TDTp\tAbsTDTOR\tTDTOR' # header + res.append(s) + rsdict = {} + if not mapf: + sys.stderr.write('rgTDT called but no map file provided - cannot determine locations') + sys.exit(1) + map = file(mapf,'r') + for l in map: # plink map + ll = l.strip().split() + if len(ll) >= 3: + rs=ll[1].strip() + chrom = ll[0] + if chrom.lower() == 'x': + chrom = '23' + if chrom.lower() == 'y': + chrom = '24' + if chrom.lower() == 'mito': + chrom = '25' + offset = ll[3] + rsdict[rs] = (chrom,offset) + f = open(resf,'r') + headl = f.next().strip() + headl = headl.split() + wewant = [headl.index(x) for x in wewantcols] + llen = len(headl) + lnum = anum = 0 + for l in f: + lnum += 1 + ll = l.strip().split() + if len(ll) >= llen: # valid line + snp,chrom,offset,a1,a2,t,u,orat,chisq,p = [ll[x] for x in wewant] + if chisq == 'NA' or p == 'NA' or orat == 'NA': + continue # can't use these lines - gg gets unhappy + snp = snp.strip() + lp = '0.0' + fp = '1.0' + fakeorat = '1.0' + if p.upper().strip() <> 'NA': + try: + fp = float(p) + if fp <> 0: + lp = '%6f' % -math.log10(fp) + fp = '%6f' % fp + except: + pass + else: + p = '1.0' + if orat.upper().strip() <> 'NA': + try: + fakeorat = orat + if float(orat) < 1.0: + fakeorat = '%6f' % (1.0/float(orat)) # invert so large values big + except: + pass + else: + orat = '1.0' + outl = '\t'.join([snp,chrom,offset,a1,a2,t,u,chisq,p,lp,fakeorat,orat]) + res.append(outl) + f = file(outfname,'w') + res.append('') + f.write('\n'.join(res)) + f.close() + + +if __name__ == "__main__": + """ called as + <command interpreter="python"> + rgTDT.py -i '$infile.extra_files_path/$infile.metadata.base_name' -o '$title' -f '$outformat' -r '$out_file1' -l '$logf' -x '${GALAXY_DATA_INDEX_DIR}/rg/bin/pl$ + + </command> + + """ + u = """ called in xml as + <command interpreter="python2.4"> + rgTDT.py -i $i -o $out_prefix -f $outformat -r $out_file1 -l $logf + </command> + """ + if len(sys.argv) < 6: + s = '## Error rgTDT.py needs 5 command line params - got %s \n' % (sys.argv) + if verbose: + print >> sys.stdout, s + sys.exit(0) + parser = OptionParser(usage=u, version="%prog 0.01") + a = parser.add_option + a("-i","--infile",dest="bfname") + a("-o","--oprefix",dest="oprefix") + a("-f","--formatOut",dest="outformat") + a("-r","--results",dest="outfname") + a("-l","--logfile",dest="logf") + a("-d","--du",dest="uId") + a("-e","--email",dest="uEmail") + a("-g","--gff",dest="gffout",default="") + (options,args) = parser.parse_args() + killme = string.punctuation + string.whitespace + trantab = string.maketrans(killme,'_'*len(killme)) + title = options.oprefix + title = title.translate(trantab) + map_file = '%s.bim' % (options.bfname) # + me = sys.argv[0] + alogf = options.logf # absolute paths + od = os.path.split(alogf)[0] + try: + os.path.makedirs(od) + except: + pass + aoutf = options.outfname # absolute paths + od = os.path.split(aoutf)[0] + try: + os.path.makedirs(od) + except: + pass + vcl = [plinke,'--noweb', '--bfile',options.bfname,'--out',title,'--mind','0.5','--tdt'] + logme = [] + if verbose: + s = 'Rgenetics %s http://rgenetics.org Galaxy Tools rgTDT.py started %s\n' % (myversion,timenow()) + print >> sys.stdout,s + logme.append(s) + s ='rgTDT.py: bfname=%s, logf=%s, argv = %s\n' % (options.bfname,alogf, sys.argv) + print >> sys.stdout,s + logme.append(s) + s = 'rgTDT.py: vcl=%s\n' % (' '.join(vcl)) + print >> sys.stdout,s + logme.append(s) + twd = tempfile.mkdtemp(suffix='rgTDT') # make sure plink doesn't spew log file into the root! + tname = os.path.join(twd,title) + p=subprocess.Popen(' '.join(vcl),shell=True,cwd=twd) + retval = p.wait() + shutil.copy('%s.log' % tname,alogf) + sto = file(alogf,'a') + sto.write('\n'.join(logme)) + resf = '%s.tdt' % tname # plink output is here we hope + xformTDT(options.bfname,resf,aoutf,title,map_file) # leaves the desired summary file + gffout = options.gffout + if gffout > '': + makeGFF(resf=aoutf,outfname=gffout,logf=sto,twd='.',name='rgTDT_Top_Table',description=title,topn=1000) + shutil.rmtree(twd) + sto.close() + + +