0
|
1 #!/usr/local/bin/python
|
|
2 # hack to run and process a plink tdt
|
|
3 # expects args as
|
|
4 # bfilepath outname jobname outformat (wig,xls)
|
|
5 # ross lazarus
|
|
6 # for wig files, we need annotation so look for map file or complain
|
|
7
|
|
8 """
|
|
9 Parameters for wiggle track definition lines
|
|
10 All options are placed in a single line separated by spaces:
|
|
11
|
|
12 track type=wiggle_0 name=track_label description=center_label \
|
|
13 visibility=display_mode color=r,g,b altColor=r,g,b \
|
|
14 priority=priority autoScale=on|off \
|
|
15 gridDefault=on|off maxHeightPixels=max:default:min \
|
|
16 graphType=bar|points viewLimits=lower:upper \
|
|
17 yLineMark=real-value yLineOnOff=on|off \
|
|
18 windowingFunction=maximum|mean|minimum smoothingWindow=off|2-16
|
|
19 """
|
|
20
|
|
21 import sys,math,shutil,subprocess,os,time,tempfile,shutil,string
|
|
22 from os.path import abspath
|
|
23 from optparse import OptionParser
|
|
24 from rgutils import timenow, plinke
|
|
25 myversion = 'v0.003 January 2010'
|
|
26 verbose = False
|
|
27
|
|
28
|
|
29
|
|
30 def makeGFF(resf='',outfname='',logf=None,twd='.',name='track name',description='track description',topn=1000):
|
|
31 """
|
|
32 score must be scaled to 0-1000
|
|
33
|
|
34 Want to make some wig tracks from each analysis
|
|
35 Best n -log10(p). Make top hit the window.
|
|
36 we use our tab output which has
|
|
37 rs chrom offset ADD_stat ADD_p ADD_log10p
|
|
38 rs3094315 1 792429 1.151 0.2528 0.597223
|
|
39
|
|
40 """
|
|
41
|
|
42 def is_number(s):
|
|
43 try:
|
|
44 float(s)
|
|
45 return True
|
|
46 except ValueError:
|
|
47 return False
|
|
48 header = 'track name=%s description="%s" visibility=2 useScore=1 color=0,60,120\n' % (name,description)
|
|
49 column_names = [ 'Seqname', 'Source', 'Feature', 'Start', 'End', 'Score', 'Strand', 'Frame', 'Group' ]
|
|
50 halfwidth=100
|
|
51 resfpath = os.path.join(twd,resf)
|
|
52 resf = open(resfpath,'r')
|
|
53 resfl = resf.readlines() # dumb but convenient for millions of rows
|
|
54 resfl = [x.split() for x in resfl]
|
|
55 headl = resfl[0]
|
|
56 resfl = resfl[1:]
|
|
57 headl = [x.strip().upper() for x in headl]
|
|
58 headIndex = dict(zip(headl,range(0,len(headl))))
|
|
59 # s = 'rs\tchrom\toffset\ta1\ta2\ttransmitted\tuntransmitted\tTDTChiSq\tTDTp\t-log10TDTp\tAbsTDTOR\tTDTOR'
|
|
60 chrpos = headIndex.get('CHROM',None)
|
|
61 rspos = headIndex.get('RS',None)
|
|
62 offspos = headIndex.get('OFFSET',None)
|
|
63 ppos = headIndex.get('-LOG10TDTP',None)
|
|
64 wewant = [chrpos,rspos,offspos,ppos]
|
|
65 if None in wewant: # missing something
|
|
66 logf.write('### Error missing a required header in makeGFF - headIndex=%s\n' % headIndex)
|
|
67 return
|
|
68 resfl = [x for x in resfl if x[ppos] > '']
|
|
69 resfl = [(float(x[ppos]),x) for x in resfl] # decorate
|
|
70 resfl.sort()
|
|
71 resfl.reverse() # using -log10 so larger is better
|
|
72 resfl = resfl[:topn] # truncate
|
|
73 pvals = [x[0] for x in resfl] # need to scale
|
|
74 resfl = [x[1] for x in resfl] # drop decoration
|
|
75 maxp = max(pvals) # need to scale
|
|
76 minp = min(pvals)
|
|
77 prange = abs(maxp-minp) + 0.5 # fudge
|
|
78 scalefact = 1000.0/prange
|
|
79 logf.write('###maxp=%f,minp=%f,prange=%f,scalefact=%f\n' % (maxp,minp,prange,scalefact))
|
|
80 for i,row in enumerate(resfl):
|
|
81 row[ppos] = '%d' % (int(scalefact*pvals[i]))
|
|
82 resfl[i] = row # replace
|
|
83 outf = file(outfname,'w')
|
|
84 outf.write(header)
|
|
85 outres = [] # need to resort into chrom offset order
|
|
86 for i,lrow in enumerate(resfl):
|
|
87 chrom,snp,offset,p, = [lrow[x] for x in wewant]
|
|
88 gff = ('chr%s' % chrom,'rgTDT','variation','%d' % (int(offset)-halfwidth),
|
|
89 '%d' % (int(offset)+halfwidth),p,'.','.','%s logp=%1.2f' % (snp,pvals[i]))
|
|
90 outres.append(gff)
|
|
91 outres = [(x[0],int(x[3]),x) for x in outres] # decorate
|
|
92 outres.sort() # into chrom offset
|
|
93 outres=[x[2] for x in outres] # undecorate
|
|
94 outres = ['\t'.join(x) for x in outres]
|
|
95 outf.write('\n'.join(outres))
|
|
96 outf.write('\n')
|
|
97 outf.close()
|
|
98
|
|
99
|
|
100
|
|
101 def xformTDT(infname='',resf='',outfname='',name='foo',mapf='/usr/local/galaxy/data/rg/lped/x.bim'):
|
|
102 """munge a plink .tdt file into either a ucsc track or an xls file
|
|
103 CHR SNP A1:A2 T:U_TDT OR_TDT CHISQ_TDT P_TDT
|
|
104 0 MitoT217C 2:3 0:0 NA NA NA
|
|
105 0 MitoG228A 1:4 0:0 NA NA NA
|
|
106 0 MitoT250C 2:3 0:0 NA NA NA
|
|
107 map file has
|
|
108 1 rs4378174 0 003980745
|
|
109 1 rs10796404 0 005465256
|
|
110 1 rs2697965 0 014023092
|
|
111
|
|
112 grrr!
|
|
113 Changed in 1.01 to
|
|
114 [rerla@hg fresh]$ head database/job_working_directory/445/rgTDT.tdt
|
|
115 CHR SNP BP A1 A2 T U OR CHISQ P
|
|
116 1 rs12562034 758311 1 3 71 79 0.8987 0.4267 0.5136
|
|
117 1 rs3934834 995669 4 2 98 129 0.7597 4.233 0.03963
|
|
118
|
|
119
|
|
120 """
|
|
121 if verbose:
|
|
122 print 'Rgenetics Galaxy Tools, rgTDT.py.xformTDT got resf=%s, outtype=%s, outfname=%s' % (resf,outtype,outfname)
|
|
123 wewantcols = ['SNP','CHR','BP','A1','A2','T','U','OR','CHISQ','P']
|
|
124 res = []
|
|
125 s = 'rs\tchrom\toffset\ta1\ta2\ttransmitted\tuntransmitted\tTDTChiSq\tTDTp\t-log10TDTp\tAbsTDTOR\tTDTOR' # header
|
|
126 res.append(s)
|
|
127 rsdict = {}
|
|
128 if not mapf:
|
|
129 sys.stderr.write('rgTDT called but no map file provided - cannot determine locations')
|
|
130 sys.exit(1)
|
|
131 map = file(mapf,'r')
|
|
132 for l in map: # plink map
|
|
133 ll = l.strip().split()
|
|
134 if len(ll) >= 3:
|
|
135 rs=ll[1].strip()
|
|
136 chrom = ll[0]
|
|
137 if chrom.lower() == 'x':
|
|
138 chrom = '23'
|
|
139 if chrom.lower() == 'y':
|
|
140 chrom = '24'
|
|
141 if chrom.lower() == 'mito':
|
|
142 chrom = '25'
|
|
143 offset = ll[3]
|
|
144 rsdict[rs] = (chrom,offset)
|
|
145 f = open(resf,'r')
|
|
146 headl = f.next().strip()
|
|
147 headl = headl.split()
|
|
148 wewant = [headl.index(x) for x in wewantcols]
|
|
149 llen = len(headl)
|
|
150 lnum = anum = 0
|
|
151 for l in f:
|
|
152 lnum += 1
|
|
153 ll = l.strip().split()
|
|
154 if len(ll) >= llen: # valid line
|
|
155 snp,chrom,offset,a1,a2,t,u,orat,chisq,p = [ll[x] for x in wewant]
|
|
156 if chisq == 'NA' or p == 'NA' or orat == 'NA':
|
|
157 continue # can't use these lines - gg gets unhappy
|
|
158 snp = snp.strip()
|
|
159 lp = '0.0'
|
|
160 fp = '1.0'
|
|
161 fakeorat = '1.0'
|
|
162 if p.upper().strip() <> 'NA':
|
|
163 try:
|
|
164 fp = float(p)
|
|
165 if fp <> 0:
|
|
166 lp = '%6f' % -math.log10(fp)
|
|
167 fp = '%6f' % fp
|
|
168 except:
|
|
169 pass
|
|
170 else:
|
|
171 p = '1.0'
|
|
172 if orat.upper().strip() <> 'NA':
|
|
173 try:
|
|
174 fakeorat = orat
|
|
175 if float(orat) < 1.0:
|
|
176 fakeorat = '%6f' % (1.0/float(orat)) # invert so large values big
|
|
177 except:
|
|
178 pass
|
|
179 else:
|
|
180 orat = '1.0'
|
|
181 outl = '\t'.join([snp,chrom,offset,a1,a2,t,u,chisq,p,lp,fakeorat,orat])
|
|
182 res.append(outl)
|
|
183 f = file(outfname,'w')
|
|
184 res.append('')
|
|
185 f.write('\n'.join(res))
|
|
186 f.close()
|
|
187
|
|
188
|
|
189 if __name__ == "__main__":
|
|
190 """ called as
|
|
191 <command interpreter="python">
|
|
192 rgTDT.py -i '$infile.extra_files_path/$infile.metadata.base_name' -o '$title' -f '$outformat' -r '$out_file1' -l '$logf' -x '${GALAXY_DATA_INDEX_DIR}/rg/bin/pl$
|
|
193
|
|
194 </command>
|
|
195
|
|
196 """
|
|
197 u = """ called in xml as
|
|
198 <command interpreter="python2.4">
|
|
199 rgTDT.py -i $i -o $out_prefix -f $outformat -r $out_file1 -l $logf
|
|
200 </command>
|
|
201 """
|
|
202 if len(sys.argv) < 6:
|
|
203 s = '## Error rgTDT.py needs 5 command line params - got %s \n' % (sys.argv)
|
|
204 if verbose:
|
|
205 print >> sys.stdout, s
|
|
206 sys.exit(0)
|
|
207 parser = OptionParser(usage=u, version="%prog 0.01")
|
|
208 a = parser.add_option
|
|
209 a("-i","--infile",dest="bfname")
|
|
210 a("-o","--oprefix",dest="oprefix")
|
|
211 a("-f","--formatOut",dest="outformat")
|
|
212 a("-r","--results",dest="outfname")
|
|
213 a("-l","--logfile",dest="logf")
|
|
214 a("-d","--du",dest="uId")
|
|
215 a("-e","--email",dest="uEmail")
|
|
216 a("-g","--gff",dest="gffout",default="")
|
|
217 (options,args) = parser.parse_args()
|
|
218 killme = string.punctuation + string.whitespace
|
|
219 trantab = string.maketrans(killme,'_'*len(killme))
|
|
220 title = options.oprefix
|
|
221 title = title.translate(trantab)
|
|
222 map_file = '%s.bim' % (options.bfname) #
|
|
223 me = sys.argv[0]
|
|
224 alogf = options.logf # absolute paths
|
|
225 od = os.path.split(alogf)[0]
|
|
226 try:
|
|
227 os.path.makedirs(od)
|
|
228 except:
|
|
229 pass
|
|
230 aoutf = options.outfname # absolute paths
|
|
231 od = os.path.split(aoutf)[0]
|
|
232 try:
|
|
233 os.path.makedirs(od)
|
|
234 except:
|
|
235 pass
|
|
236 vcl = [plinke,'--noweb', '--bfile',options.bfname,'--out',title,'--mind','0.5','--tdt']
|
|
237 logme = []
|
|
238 if verbose:
|
|
239 s = 'Rgenetics %s http://rgenetics.org Galaxy Tools rgTDT.py started %s\n' % (myversion,timenow())
|
|
240 print >> sys.stdout,s
|
|
241 logme.append(s)
|
|
242 s ='rgTDT.py: bfname=%s, logf=%s, argv = %s\n' % (options.bfname,alogf, sys.argv)
|
|
243 print >> sys.stdout,s
|
|
244 logme.append(s)
|
|
245 s = 'rgTDT.py: vcl=%s\n' % (' '.join(vcl))
|
|
246 print >> sys.stdout,s
|
|
247 logme.append(s)
|
|
248 twd = tempfile.mkdtemp(suffix='rgTDT') # make sure plink doesn't spew log file into the root!
|
|
249 tname = os.path.join(twd,title)
|
|
250 p=subprocess.Popen(' '.join(vcl),shell=True,cwd=twd)
|
|
251 retval = p.wait()
|
|
252 shutil.copy('%s.log' % tname,alogf)
|
|
253 sto = file(alogf,'a')
|
|
254 sto.write('\n'.join(logme))
|
|
255 resf = '%s.tdt' % tname # plink output is here we hope
|
|
256 xformTDT(options.bfname,resf,aoutf,title,map_file) # leaves the desired summary file
|
|
257 gffout = options.gffout
|
|
258 if gffout > '':
|
|
259 makeGFF(resf=aoutf,outfname=gffout,logf=sto,twd='.',name='rgTDT_Top_Table',description=title,topn=1000)
|
|
260 shutil.rmtree(twd)
|
|
261 sto.close()
|
|
262
|
|
263
|
|
264
|