0
|
1 #!/usr/local/bin/python
|
|
2 """
|
|
3 # added most of the available options for linear models
|
|
4 # june 2009 rml
|
|
5 # hack to run and process a plink quantitative trait
|
|
6 #
|
|
7
|
|
8 This is a wrapper for Shaun Purcell's Plink linear/logistic models for
|
|
9 traits, covariates and genotypes.
|
|
10
|
|
11 It requires some judgement to interpret the findings
|
|
12 We need some better visualizations - manhattan plots are good.
|
|
13 svg with rs numbers for top 1%?
|
|
14
|
|
15 toptable tools - truncate a gg file down to some low percentile
|
|
16
|
|
17 intersect with other tables - eg gene expression regressions on snps
|
|
18
|
|
19
|
|
20
|
|
21 """
|
|
22
|
|
23 import sys,math,shutil,subprocess,os,string,tempfile,shutil,commands
|
|
24 from rgutils import plinke
|
|
25
|
|
26 def makeGFF(resf='',outfname='',logf=None,twd='.',name='track name',description='track description',topn=1000):
|
|
27 """
|
|
28 score must be scaled to 0-1000
|
|
29
|
|
30 Want to make some wig tracks from each analysis
|
|
31 Best n -log10(p). Make top hit the window.
|
|
32 we use our tab output which has
|
|
33 rs chrom offset ADD_stat ADD_p ADD_log10p
|
|
34 rs3094315 1 792429 1.151 0.2528 0.597223
|
|
35
|
|
36 """
|
|
37
|
|
38 def is_number(s):
|
|
39 try:
|
|
40 float(s)
|
|
41 return True
|
|
42 except ValueError:
|
|
43 return False
|
|
44 header = 'track name=%s description="%s" visibility=2 useScore=1 color=0,60,120\n' % (name,description)
|
|
45 column_names = [ 'Seqname', 'Source', 'Feature', 'Start', 'End', 'Score', 'Strand', 'Frame', 'Group' ]
|
|
46 halfwidth=100
|
|
47 resfpath = os.path.join(twd,resf)
|
|
48 resf = open(resfpath,'r')
|
|
49 resfl = resf.readlines() # dumb but convenient for millions of rows
|
|
50 resfl = [x.split() for x in resfl]
|
|
51 headl = resfl[0]
|
|
52 resfl = resfl[1:]
|
|
53 headl = [x.strip().upper() for x in headl]
|
|
54 headIndex = dict(zip(headl,range(0,len(headl))))
|
|
55 chrpos = headIndex.get('CHROM',None)
|
|
56 rspos = headIndex.get('RS',None)
|
|
57 offspos = headIndex.get('OFFSET',None)
|
|
58 ppos = headIndex.get('ADD_LOG10P',None)
|
|
59 wewant = [chrpos,rspos,offspos,ppos]
|
|
60 if None in wewant: # missing something
|
|
61 logf.write('### Error missing a required header in makeGFF - headIndex=%s\n' % headIndex)
|
|
62 return
|
|
63 resfl = [x for x in resfl if x[ppos] > '']
|
|
64 resfl = [(float(x[ppos]),x) for x in resfl] # decorate
|
|
65 resfl.sort()
|
|
66 resfl.reverse() # using -log10 so larger is better
|
|
67 resfl = resfl[:topn] # truncate
|
|
68 pvals = [x[0] for x in resfl] # need to scale
|
|
69 resfl = [x[1] for x in resfl] # drop decoration
|
|
70 if len(pvals) == 0:
|
|
71 logf.write('### no pvalues found in resfl - %s' % (resfl[:3]))
|
|
72 sys.exit(1)
|
|
73 maxp = max(pvals) # need to scale
|
|
74 minp = min(pvals)
|
|
75 prange = abs(maxp-minp) + 0.5 # fudge
|
|
76 scalefact = 1000.0/prange
|
|
77 logf.write('###maxp=%f,minp=%f,prange=%f,scalefact=%f\n' % (maxp,minp,prange,scalefact))
|
|
78 for i,row in enumerate(resfl):
|
|
79 row[ppos] = '%d' % (int(scalefact*pvals[i]))
|
|
80 resfl[i] = row # replace
|
|
81 outf = file(outfname,'w')
|
|
82 outf.write(header)
|
|
83 outres = [] # need to resort into chrom offset order
|
|
84 for i,lrow in enumerate(resfl):
|
|
85 chrom,snp,offset,p, = [lrow[x] for x in wewant]
|
|
86 gff = ('chr%s' % chrom,'rgGLM','variation','%d' % (int(offset)-halfwidth),
|
|
87 '%d' % (int(offset)+halfwidth),p,'.','.','%s logp=%1.2f' % (snp,pvals[i]))
|
|
88 outres.append(gff)
|
|
89 outres = [(x[0],int(x[3]),x) for x in outres] # decorate
|
|
90 outres.sort() # into chrom offset
|
|
91 outres=[x[2] for x in outres] # undecorate
|
|
92 outres = ['\t'.join(x) for x in outres]
|
|
93 outf.write('\n'.join(outres))
|
|
94 outf.write('\n')
|
|
95 outf.close()
|
|
96
|
|
97
|
|
98
|
|
99 def xformQassoc(resf='',outfname='',logf=None,twd='.'):
|
|
100 """ plink.assoc.linear to gg file
|
|
101 from the docs
|
|
102 The output per each SNP might look something like:
|
|
103
|
|
104 CHR SNP BP A1 TEST NMISS OR STAT P
|
|
105 5 rs000001 10001 A ADD 664 0.7806 -1.942 0.05216
|
|
106 5 rs000001 10001 A DOMDEV 664 0.9395 -0.3562 0.7217
|
|
107 5 rs000001 10001 A COV1 664 0.9723 -0.7894 0.4299
|
|
108 5 rs000001 10001 A COV2 664 1.159 0.5132 0.6078
|
|
109 5 rs000001 10001 A GENO_2DF 664 NA 5.059 0.0797
|
|
110 need to transform into gg columns for each distinct test
|
|
111 or bed for tracks?
|
|
112
|
|
113 """
|
|
114 logf.write('xformQassoc got resf=%s, outfname=%s\n' % (resf,outfname))
|
|
115 resdict = {}
|
|
116 rsdict = {}
|
|
117 markerlist = []
|
|
118 # plink is "clever" - will run logistic if only 2 categories such as gender
|
|
119 resfs = resf.split('.')
|
|
120 if resfs[-1] == 'logistic':
|
|
121 resfs[-1] = 'linear'
|
|
122 else:
|
|
123 resfs[-1] = 'logistic'
|
|
124 altresf = '.'.join(resfs)
|
|
125
|
|
126 altresfpath = os.path.join(twd,altresf)
|
|
127 resfpath = os.path.join(twd,resf)
|
|
128 try:
|
|
129 resf = open(resfpath,'r')
|
|
130 except:
|
|
131 try:
|
|
132 resf = open(altresfpath,'r')
|
|
133 except:
|
|
134 print >> sys.stderr, '## error - no file plink output %s or %s found - cannot continue' % (resfpath, altresfpath)
|
|
135 sys.exit(1)
|
|
136 for lnum,row in enumerate(resf):
|
|
137 if lnum == 0:
|
|
138 headl = row.split()
|
|
139 headl = [x.strip().upper() for x in headl]
|
|
140 headIndex = dict(zip(headl,range(0,len(headl))))
|
|
141 chrpos = headIndex.get('CHR',None)
|
|
142 rspos = headIndex.get('SNP',None)
|
|
143 offspos = headIndex.get('BP',None)
|
|
144 nmisspos = headIndex.get('NMISS',None)
|
|
145 testpos = headIndex.get('TEST',None)
|
|
146 ppos = headIndex.get('P',None)
|
|
147 coeffpos = headIndex.get('OR',None)
|
|
148 if not coeffpos:
|
|
149 coeffpos = headIndex.get('BETA',None)
|
|
150 apos = headIndex.get('A1',None)
|
|
151 statpos = headIndex.get('STAT',None)
|
|
152 wewant = [chrpos,rspos,offspos,testpos,statpos,ppos,coeffpos,apos]
|
|
153 if None in wewant: # missing something
|
|
154 logf.write('missing a required header in xformQassoc - headIndex=%s\n' % headIndex)
|
|
155 return
|
|
156 llen = len(headl)
|
|
157 else: # no Nones!
|
|
158 ll = row.split()
|
|
159 if len(ll) >= llen: # valid line
|
|
160 chrom,snp,offset,test,stat,p,coeff,allele = [ll[x] for x in wewant]
|
|
161 snp = snp.strip()
|
|
162 if p <> 'NA' :
|
|
163 try:
|
|
164 ffp = float(p)
|
|
165 if ffp <> 0:
|
|
166 lp = -math.log10(ffp)
|
|
167 except:
|
|
168 lp = 0.0
|
|
169 resdict.setdefault(test,{})
|
|
170 resdict[test][snp] = (stat,p,'%f' % lp)
|
|
171 if rsdict.get(snp,None) == None:
|
|
172 rsdict[snp] = (chrom,offset)
|
|
173 markerlist.append(snp)
|
|
174 # now have various tests indexed by rs
|
|
175 tk = resdict.keys()
|
|
176 tk.sort() # tests
|
|
177 ohead = ['rs','chrom','offset']
|
|
178 for t in tk: # add headers
|
|
179 ohead.append('%s_stat' % t)
|
|
180 ohead.append('%s_p' % t)
|
|
181 ohead.append('%s_log10p' % t)
|
|
182 oheads = '\t'.join(ohead)
|
|
183 res = [oheads,]
|
|
184 for snp in markerlist: # retain original order
|
|
185 chrom,offset = rsdict[snp]
|
|
186 outl = [snp,chrom,offset]
|
|
187 for t in tk:
|
|
188 outl += resdict[t][snp] # add stat,p for this test
|
|
189 outs = '\t'.join(outl)
|
|
190 res.append(outs)
|
|
191 f = file(outfname,'w')
|
|
192 res.append('')
|
|
193 f.write('\n'.join(res))
|
|
194 f.close()
|
|
195
|
|
196
|
|
197 if __name__ == "__main__":
|
|
198 """
|
|
199
|
|
200 <command interpreter="python">
|
|
201 rgGLM.py '$i.extra_files_path/$i.metadata.base_name' '$phef.extra_files_path/$phef.metadata.base_name'
|
|
202 "$title1" '$predvar' '$covar' '$out_file1' '$logf' '$i.metadata.base_name'
|
|
203 '$inter' '$cond' '$gender' '$mind' '$geno' '$maf' '$logistic' '$wigout'
|
|
204 </command>
|
|
205 """
|
|
206 topn = 1000
|
|
207 killme = string.punctuation+string.whitespace
|
|
208 trantab = string.maketrans(killme,'_'*len(killme))
|
|
209 if len(sys.argv) < 17:
|
|
210 s = 'rgGLM.py needs 17 params - got %s \n' % (sys.argv)
|
|
211 sys.stderr.write(s) # print >>,s would probably also work?
|
|
212 sys.exit(0)
|
|
213 blurb = 'rgGLM.py called with %s' % sys.argv
|
|
214 print >> sys.stdout,blurb
|
|
215 bfname = sys.argv[1]
|
|
216 phename = sys.argv[2]
|
|
217 title = sys.argv[3]
|
|
218 title.translate(trantab)
|
|
219 predvar = sys.argv[4]
|
|
220 covar = sys.argv[5].strip()
|
|
221 outfname = sys.argv[6]
|
|
222 logfname = sys.argv[7]
|
|
223 op = os.path.split(logfname)[0]
|
|
224 try: # for test - needs this done
|
|
225 os.makedirs(op)
|
|
226 except:
|
|
227 pass
|
|
228 basename = sys.argv[8].translate(trantab)
|
|
229 inter = sys.argv[9] == '1'
|
|
230 cond = sys.argv[10].strip()
|
|
231 if cond == 'None':
|
|
232 cond = ''
|
|
233 gender = sys.argv[11] == '1'
|
|
234 mind = sys.argv[12]
|
|
235 geno = sys.argv[13]
|
|
236 maf = sys.argv[14]
|
|
237 logistic = sys.argv[15].strip()=='1'
|
|
238 gffout = sys.argv[16]
|
|
239 me = sys.argv[0]
|
|
240 phepath = '%s.pphe' % phename
|
|
241 twd = tempfile.mkdtemp(suffix='rgGLM') # make sure plink doesn't spew log file into the root!
|
|
242 tplog = os.path.join(twd,'%s.log' % basename) # should be path to plink log
|
|
243 vcl = [plinke,'--noweb','--bfile',bfname,'--pheno-name','"%s"' % predvar,'--pheno',
|
|
244 phepath,'--out',basename,'--mind %s' % mind, '--geno %s' % geno,
|
|
245 '--maf %s' % maf]
|
|
246 if logistic:
|
|
247 vcl.append('--logistic')
|
|
248 resf = '%s.assoc.logistic' % basename # plink output is here we hope
|
|
249 else:
|
|
250 vcl.append('--linear')
|
|
251 resf = '%s.assoc.linear' % basename # plink output is here we hope
|
|
252 resf = os.path.join(twd,resf)
|
|
253 if gender:
|
|
254 vcl.append('--sex')
|
|
255 if inter:
|
|
256 vcl.append('--interaction')
|
|
257 if covar > 'None':
|
|
258 vcl += ['--covar',phepath,'--covar-name',covar] # comma sep list of covariates
|
|
259 tcfile = None
|
|
260 if len(cond) > 0: # plink wants these in a file..
|
|
261 dummy,tcfile = tempfile.mkstemp(suffix='condlist') #
|
|
262 f = open(tcfile,'w')
|
|
263 cl = cond.split()
|
|
264 f.write('\n'.join(cl))
|
|
265 f.write('\n')
|
|
266 f.close()
|
|
267 vcl.append('--condition-list %s' % tcfile)
|
|
268 p=subprocess.Popen(' '.join(vcl),shell=True,cwd=twd)
|
|
269 retval = p.wait()
|
|
270 if tcfile:
|
|
271 os.unlink(tcfile)
|
|
272 plinklog = file(tplog,'r').read()
|
|
273 logf = file(logfname,'w')
|
|
274 logf.write(blurb)
|
|
275 logf.write('\n')
|
|
276 logf.write('vcl=%s\n' % vcl)
|
|
277 xformQassoc(resf=resf,outfname=outfname,logf=logf,twd=twd) # leaves the desired summary file
|
|
278 makeGFF(resf=outfname,outfname=gffout,logf=logf,twd=twd,name='rgGLM_TopTable',description=title,topn=topn)
|
|
279 logf.write('\n')
|
|
280 logf.write(plinklog)
|
|
281 logf.close()
|
|
282 #shutil.rmtree(twd) # clean up
|
|
283
|
|
284
|
|
285
|
|
286
|
|
287
|