Mercurial > repos > xuebing > sharplabtool
comparison tools/rgenetics/rgfakePhe.py @ 0:9071e359b9a3
Uploaded
author | xuebing |
---|---|
date | Fri, 09 Mar 2012 19:37:19 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:9071e359b9a3 |
---|---|
1 """ | |
2 fakephe.py | |
3 ross lazarus sept 30 2007 | |
4 This is available under the LGPL as defined then. | |
5 | |
6 use the pedigree data for ids | |
7 | |
8 use pythons generators to literally generate a bunch of random phenotype measurements | |
9 | |
10 Plink format at http://pngu.mgh.harvard.edu/~purcell/plink/data.shtml#pheno | |
11 is | |
12 | |
13 To specify an alternate phenotype for analysis, i.e. other than the one in the *.ped file (or, if using a binary fileset, the | |
14 *.fam file), use the --pheno option: | |
15 plink --file mydata --pheno pheno.txt | |
16 | |
17 where pheno.txt is a file that contains 3 columns (one row per individual): | |
18 | |
19 Family ID | |
20 Individual ID | |
21 Phenotype | |
22 | |
23 NOTE The original PED file must still contain a phenotype in column 6 (even if this is a dummy phenotype, e.g. all missing), | |
24 unless the --no-pheno flag is given. | |
25 | |
26 If an individual is in the original file but not listed in the alternate phenotype file, that person's phenotype will be set to | |
27 missing. If a person is in the alternate phenotype file but not in the original file, that entry will be ignored. The order of | |
28 the alternate phenotype file need not be the same as for the original file. | |
29 | |
30 If the phenotype file contains more than one phenotype, then use the --mpheno N option to specify the Nth phenotype is the one | |
31 to be used: | |
32 plink --file mydata --pheno pheno2.txt --mpheno 4 | |
33 | |
34 where pheno2.txt contains 5 different phenotypes (i.e. 7 columns in total), this command will use the 4th for analysis | |
35 (phenotype D): | |
36 | |
37 Family ID | |
38 Individual ID | |
39 Phenotype A | |
40 Phenotype B | |
41 Phenotype C | |
42 Phenotype D | |
43 Phenotype E | |
44 | |
45 Alternatively, your alternate phenotype file can have a header row, in which case you can use variable names to specify which | |
46 phenotype to use. If you have a header row, the first two variables must be labelled FID and IID. All subsequent variable names | |
47 cannot have any whitespace in them. For example, | |
48 | |
49 FID IID qt1 bmi site | |
50 F1 1110 2.3 22.22 2 | |
51 F2 2202 34.12 18.23 1 | |
52 ... | |
53 | |
54 then | |
55 plink --file mydata --pheno pheno2.txt --pheno-name bmi --assoc | |
56 | |
57 will select the second phenotype labelled "bmi", for analysis | |
58 | |
59 Finally, if there is more than one phenotype, then for basic association tests, it is possible to specify that all phenotypes | |
60 be tested, sequentially, with the output sent to different files: e.g. if bigpheno.raw contains 10,000 phenotypes, then | |
61 plink --bfile mydata --assoc --pheno bigpheno.raw --all-pheno | |
62 | |
63 will loop over all of these, one at a time testing for association with SNP, generating a lot of output. You might want to use | |
64 the --pfilter command in this case, to only report results with a p-value less than a certain value, e.g. --pfilter 1e-3. | |
65 | |
66 WARNING Currently, all phenotypes must be numerically coded, including missing values, in the alternate phenotype file. The | |
67 default missing value is -9, change this with --missing-phenotype, but it must be a numeric value still (in contrast to the | |
68 main phenotype in the PED/FAM file. This issue will be fixed in future releases. | |
69 Covariate files | |
70 | |
71 =========================== | |
72 rgfakePhe.xml | |
73 <tool id="fakePhe1" name="Fake phenos"> | |
74 <description>for multiple null fake phenotype</description> | |
75 <command interpreter="python2.4">rgfakePhe.py $input1 '$title1' $out_file1 $log_file1 $script_file</command> | |
76 <inputs> | |
77 <page> | |
78 <param name="input1" | |
79 type="library" format="lped" | |
80 label="Pedigree from Dataset"/> | |
81 <param name="title1" type="text" | |
82 value="My fake phenos" size="60" | |
83 label="Title for outputs"/> | |
84 </page> | |
85 <page> | |
86 <repeat name="fakePhe" title="Phenotypes to Fake"> | |
87 <param name="pName" type="text" label="Phenotype Name"> | |
88 </param> | |
89 <conditional name="series"> | |
90 <param name="phetype" type="select" label="Phenotype Type"> | |
91 <option value="rnorm" selected="true">Random normal variate</option> | |
92 <option value="cat">Random categorical</option> | |
93 </param> | |
94 <when value="rnorm"> | |
95 <param name="Mean" type="float" value="0.0" label="Mean"> | |
96 </param> | |
97 <param name="SD" type="float" label="SD" value="1.0"> | |
98 </param> | |
99 </when> | |
100 <when value="cat"> | |
101 <param name="values" type="text" value="1,2,3,fiddle" label="comma separated values to choose from"> | |
102 </param> | |
103 </when> | |
104 </conditional> | |
105 </repeat> | |
106 </page> | |
107 </inputs> | |
108 <configfiles> | |
109 <configfile name="script_file"> | |
110 #for $n, $f in enumerate($fakePhe) | |
111 #if $f.series.phetype=='rnorm' | |
112 {'pN':'$f.pName','pT':'$f.series.phetype','pP':"{'Mean':'$f.series.Mean', 'SD':'$f.series.SD'}"} | |
113 #elif $f.series.phetype=='cat' | |
114 {'pN':'$f.pName','pT':'$f.series.phetype','pP':"{'values':'$f.series.values'}"} | |
115 #end if | |
116 #end for | |
117 </configfile> | |
118 </configfiles> | |
119 | |
120 <outputs> | |
121 <data format="pphe" name="out_file1" /> | |
122 <data format="text" name="log_file1" parent="out_file1"/> | |
123 </outputs> | |
124 | |
125 <help> | |
126 .. class:: infomark | |
127 | |
128 This tool allows you to generate an arbitrary (sort of) | |
129 synthetic phenotype file with measurements drawn from normal, | |
130 gamma, or categorical distributions. These are for testing under | |
131 the null hypothesis of no association - the values are random but | |
132 from user specified distributions. | |
133 | |
134 ----- | |
135 | |
136 .. class:: warningmark | |
137 | |
138 This tool is very experimental | |
139 | |
140 ----- | |
141 | |
142 - **Pedigree** is a library pedigree file - the id's will be used in the synthetic null phenotypes | |
143 - **Title** is a name to give to the output phenotype file | |
144 | |
145 On the next page, you can add an unlimited number of various kinds of phenotypes including choices for | |
146 categorical ones or distributions with specific parameters | |
147 | |
148 Just keep adding new ones until you're done then use the Execute button to run the generation | |
149 | |
150 | |
151 | |
152 | |
153 **Example from another tool to keep you busy and in awe** | |
154 | |
155 Input file:: | |
156 | |
157 1 68 4.1 | |
158 2 71 4.6 | |
159 3 62 3.8 | |
160 4 75 4.4 | |
161 5 58 3.2 | |
162 6 60 3.1 | |
163 7 67 3.8 | |
164 8 68 4.1 | |
165 9 71 4.3 | |
166 10 69 3.7 | |
167 | |
168 Create a two series XY plot on the above data: | |
169 | |
170 - Series 1: Red Dashed-Line plot between columns 1 and 2 | |
171 - Series 2: Blue Circular-Point plot between columns 3 and 2 | |
172 | |
173 .. image:: ./static/images/xy_example.jpg | |
174 </help> | |
175 </tool> | |
176 | |
177 | |
178 | |
179 """ | |
180 | |
181 import random,copy,sys,os,time,string,math | |
182 | |
183 doFbatphe = False | |
184 | |
185 galhtmlprefix = """<?xml version="1.0" encoding="utf-8" ?> | |
186 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> | |
187 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | |
188 <head> | |
189 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | |
190 <meta name="generator" content="Galaxy %s tool output - see http://g2.trac.bx.psu.edu/" /> | |
191 <title></title> | |
192 <link rel="stylesheet" href="/static/style/base.css" type="text/css" /> | |
193 </head> | |
194 <body> | |
195 <div class="document"> | |
196 """ | |
197 | |
198 | |
199 def timenow(): | |
200 """return current time as a string | |
201 """ | |
202 return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time())) | |
203 | |
204 | |
205 def poisson(lamb=2): | |
206 """ | |
207 algorithm poisson random number (Knuth): | |
208 init: | |
209 Let L = e^-lamb, k = 0 and p = 1. | |
210 do: | |
211 k = k + 1. | |
212 Generate uniform random number u in [0,1] and let p = p u. | |
213 while p >= L. | |
214 return k - 1. | |
215 """ | |
216 lamb = float(lamb) | |
217 l = math.e**(-lamb) | |
218 k=0 | |
219 p=1 | |
220 while 1: | |
221 while p >= l: | |
222 k += 1 | |
223 u = random.uniform(0.0,1.0) | |
224 p *= u | |
225 yield '%e' % (k - 1) | |
226 | |
227 def gammaPhe(alpha=1,beta=1): | |
228 """generator for random values from a gamma | |
229 """ | |
230 while 1: # an iterator for a random phenotype | |
231 dat = random.gammavariate(float(alpha),float(beta)) | |
232 yield '%e' % dat | |
233 | |
234 def weibullPhe(alpha=1,beta=1): | |
235 """generator for random values from a weibull distribution | |
236 """ | |
237 while 1: # an iterator for a random phenotype | |
238 dat = random.weibullvariate(float(alpha),float(beta)) | |
239 yield '%e' % dat | |
240 | |
241 def normPhe(mean=0,sd=1): | |
242 """generator for random values from a normal distribution | |
243 """ | |
244 while 1:# an iterator for a random phenotype | |
245 dat = random.normalvariate(float(mean),float(sd)) | |
246 yield '%e' % dat | |
247 | |
248 def expoPhe(mean=1): | |
249 """generator for random values from an exponential distribution | |
250 """ | |
251 lamb = 1.0/float(mean) | |
252 while 1:# an iterator for a random phenotype | |
253 dat = random.expovariate(lamb) | |
254 yield '%e' % dat | |
255 | |
256 | |
257 def catPhe(values='1,2,3'): | |
258 """ Schrodinger's of course. | |
259 """ | |
260 v = values.split(',') | |
261 while 1:# an iterator for a random phenotype | |
262 dat = random.choice(v) | |
263 yield dat | |
264 | |
265 def uniPhe(low=0.0,hi=1.0): | |
266 """generator for a uniform distribution | |
267 what if low=-5 and hi=-2 | |
268 """ | |
269 low = float(low) | |
270 hi = float(hi) | |
271 while 1: # unif phenotype | |
272 v = random.uniform(low,hi) # 0-1 | |
273 yield '%e' % v | |
274 | |
275 def getIds(indir='foo'): | |
276 """read identifiers - first 2 cols from a pedigree file or fam file | |
277 """ | |
278 inpath = os.path.split(indir)[0] # get root | |
279 infam = '%s.fam' % indir | |
280 inped = '%s.ped' % indir # if there's a ped | |
281 flist = os.listdir(inpath) | |
282 if len(flist) == 0: | |
283 print >> sys.stderr, '### Error - input path %s is empty' % indir | |
284 sys.exit(1) | |
285 if os.path.exists(infam): | |
286 pfname = infam | |
287 elif os.path.exists(inped): | |
288 pfname = inped | |
289 else: | |
290 print >> sys.stderr, '### Error - input path %s contains no ped or fam' % indir | |
291 sys.exit(1) | |
292 f = file(pfname,'r') | |
293 ids = [] | |
294 for l in f: | |
295 ll = l.split() | |
296 if len(ll) > 5: # ok line? | |
297 ids.append(ll[:2]) | |
298 return ids | |
299 | |
300 def makePhe(phes = [],ids=[]): | |
301 """Create the null phenotype values and append them to the case id | |
302 phes is the (generator, headername) for each column | |
303 for a phe file, ids are the case identifiers for the phenotype file | |
304 res contains the final strings for the file | |
305 each column is derived by iterating | |
306 over the generator actions set up by makePhes | |
307 """ | |
308 header = ['FID','IID'] # for plink | |
309 res = copy.copy(ids) | |
310 for (f,fname) in phes: | |
311 header.append(fname) | |
312 for n,subject in enumerate(ids): | |
313 res[n].append(f.next()) # generate the right value | |
314 res.insert(0,header) | |
315 res = [' '.join(x) for x in res] # must be space delim for fbat | |
316 return res | |
317 | |
318 def makePhes(pheTypes=[], pheNames=[], pheParams=[]): | |
319 """set up phes for makePhe | |
320 each has to have an iterator (action) and a name | |
321 """ | |
322 action = None | |
323 phes = [] | |
324 for n,pt in enumerate(pheTypes): | |
325 name = pheNames[n] | |
326 if pt == 'rnorm': | |
327 m = pheParams[n].get('Mean',0.0) | |
328 s = pheParams[n].get('SD',1.0) | |
329 action = normPhe(mean=m,sd=s) # so we can just iterate over action | |
330 elif pt == 'rgamma': | |
331 m = pheParams[n].get('Alpha',0.0) | |
332 s = pheParams[n].get('Beta',1.0) | |
333 action = gammaPhe(alpha=m,beta=s) | |
334 if pt == 'exponential': | |
335 m = pheParams[n].get('Mean',1.0) | |
336 action = expoPhe(mean=m) # so we can just iterate over action | |
337 elif pt == 'weibull': | |
338 m = pheParams[n].get('Alpha',0.0) | |
339 s = pheParams[n].get('Beta',1.0) | |
340 action = weibullPhe(alpha=m,beta=s) | |
341 elif pt == 'cat': | |
342 v = pheParams[n].get('values',['?',]) | |
343 action = catPhe(values=v) | |
344 elif pt == 'unif': | |
345 low = pheParams[n].get('low',0.0) | |
346 hi = pheParams[n].get('hi',1.0) | |
347 action = uniPhe(low=low,hi=hi) | |
348 elif pt == 'poisson': | |
349 lamb = pheParams[n].get('lamb',1.0) | |
350 action = poisson(lamb=lamb) | |
351 phes.append((action,name)) | |
352 return phes | |
353 | |
354 def doImport(outfile='test',flist=[],expl='',mylog=[]): | |
355 """ import into one of the new html composite data types for Rgenetics | |
356 Dan Blankenberg with mods by Ross Lazarus | |
357 October 2007 | |
358 """ | |
359 outf = open(outfile,'w') | |
360 progname = os.path.basename(sys.argv[0]) | |
361 outf.write(galhtmlprefix % progname) | |
362 if len(flist) > 0: | |
363 outf.write('<ol>\n') | |
364 for i, data in enumerate( flist ): | |
365 outf.write('<li><a href="%s">%s %s</a></li>\n' % (os.path.split(data)[-1],os.path.split(data)[-1],expl)) | |
366 outf.write('</ol>\n') | |
367 else: | |
368 outf.write('No files found') | |
369 outf.write('<hr/><h4>Log of run follows:</h4><br/><pre>%s\n</pre><br/><hr/>' % ('\n'.join(mylog))) | |
370 outf.write("</div></body></html>\n") | |
371 outf.close() | |
372 | |
373 | |
374 def test(): | |
375 """test case | |
376 need to get these out of a galaxy form - series of pages - get types | |
377 on first screen, names on second, params on third? | |
378 holy shit. this actually works I think | |
379 """ | |
380 pT = ['rnorm','rnorm','rnorm','rnorm','cat','unif'] | |
381 pN = ['SysBP','DiaBP','HtCM','WtKG','Race','age'] | |
382 pP = [{'Mean':'120','SD':'10'},{'Mean':'90','SD':'15'},{'Mean':'160','SD':'20'},{'Mean':'60','SD':'20'}, \ | |
383 {'values':'Blink,What,Yours,green'},{'low':16,'hi':99}] | |
384 phes = makePhes(pheTypes=pT, pheNames=pN, pheParams=pP) | |
385 ids = [] | |
386 for i in range(10): | |
387 ids.append(['fid%d' % i,'iid%d' % i]) | |
388 pheres = makePhe(phes=phes,ids=ids) | |
389 res = [''.join(x) for x in pheres] | |
390 print '\n'.join(res) | |
391 | |
392 | |
393 | |
394 if __name__ == "__main__": | |
395 """ | |
396 <command interpreter="python">rgfakePhe.py '$infile1.extra_files_path/$infile1.metadata.base_name' | |
397 "$title1" '$ppheout' '$ppheout.files_path' '$script_file ' | |
398 </command> | |
399 The xml file for this tool is complex, and allows any arbitrary number of | |
400 phenotype columns to be specified from a couple of optional types - rnorm, cat | |
401 are working now. | |
402 | |
403 Note that we create new files in their respective library directories and link to them in the output file | |
404 so they can be displayed and downloaded separately | |
405 | |
406 """ | |
407 killme = string.punctuation + string.whitespace | |
408 trantab = string.maketrans(killme,'_'*len(killme)) | |
409 progname = os.path.basename(sys.argv[0]) | |
410 cl = '## at %s, %s got cl= %s' % (timenow(),progname,' '.join(sys.argv)) | |
411 print >> sys.stdout,cl | |
412 if len(sys.argv) < 5: | |
413 test() | |
414 else: | |
415 inped = sys.argv[1] | |
416 title = sys.argv[2].translate(trantab) | |
417 ppheout = sys.argv[3] | |
418 pphe_path = sys.argv[4] | |
419 scriptfile = sys.argv[5] | |
420 ind = file(scriptfile,'r').readlines() | |
421 mylog = [] | |
422 s = '## %s starting at %s<br/>\n' % (progname,timenow()) | |
423 mylog.append(s) | |
424 mylog.append(cl) | |
425 s = '## params = %s<br/>\n' % (' '.join(sys.argv[1:])) | |
426 mylog.append(s) | |
427 s = '\n'.join(ind) | |
428 mylog.append('Script file %s contained %s<br/>\n' % (scriptfile,s)) | |
429 pT = [] | |
430 pN = [] | |
431 pP = [] | |
432 for l in ind: | |
433 l = l.strip() | |
434 if len(l) > 1: | |
435 adict = eval(l) | |
436 pT.append(adict.get('pT',None)) | |
437 pN.append(adict.get('pN',None)) | |
438 pP.append(eval(adict.get('pP',None))) | |
439 s = '## pt,pn,pp=%s,%s,%s<br/>\n' % (str(pT),str(pN),str(pP)) | |
440 mylog.append(s) | |
441 phes = makePhes(pheTypes=pT, pheNames=pN, pheParams=pP) | |
442 ids = getIds(indir=inped) # for labelling rows | |
443 pheres = makePhe(phes=phes,ids=ids) # random values from given distributions | |
444 try: | |
445 os.makedirs(pphe_path) | |
446 except: | |
447 pass | |
448 outname = os.path.join(pphe_path,title) | |
449 pphefname = '%s.pphe' % outname | |
450 f = file(pphefname, 'w') | |
451 f.write('\n'.join(pheres)) | |
452 f.write('\n') | |
453 f.close() | |
454 if doFbatphe: | |
455 try: | |
456 os.makedirs(fphe_path) | |
457 except: | |
458 pass | |
459 outname = os.path.join(fphe_path,title) | |
460 fphefname = '%s.phe' % outname | |
461 f = file(fphefname, 'w') | |
462 header = pheres[0].split() | |
463 pheres[0] = ' '.join(header[2:])# remove iid fid from header for fbat | |
464 f.write('\n'.join(pheres)) | |
465 f.close() | |
466 doImport(outfile=fpheout,flist=[fphefname,],expl='(FBAT phenotype format)',mylog=mylog) | |
467 #doImport(outfile='test',flist=[],expl='',mylog=[]): | |
468 doImport(outfile=ppheout,flist=[pphefname,],expl='(Plink phenotype format)',mylog=mylog) | |
469 |