comparison tools/rgenetics/rgfakePhe.py @ 0:9071e359b9a3

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:37:19 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:9071e359b9a3
1 """
2 fakephe.py
3 ross lazarus sept 30 2007
4 This is available under the LGPL as defined then.
5
6 use the pedigree data for ids
7
8 use pythons generators to literally generate a bunch of random phenotype measurements
9
10 Plink format at http://pngu.mgh.harvard.edu/~purcell/plink/data.shtml#pheno
11 is
12
13 To specify an alternate phenotype for analysis, i.e. other than the one in the *.ped file (or, if using a binary fileset, the
14 *.fam file), use the --pheno option:
15 plink --file mydata --pheno pheno.txt
16
17 where pheno.txt is a file that contains 3 columns (one row per individual):
18
19 Family ID
20 Individual ID
21 Phenotype
22
23 NOTE The original PED file must still contain a phenotype in column 6 (even if this is a dummy phenotype, e.g. all missing),
24 unless the --no-pheno flag is given.
25
26 If an individual is in the original file but not listed in the alternate phenotype file, that person's phenotype will be set to
27 missing. If a person is in the alternate phenotype file but not in the original file, that entry will be ignored. The order of
28 the alternate phenotype file need not be the same as for the original file.
29
30 If the phenotype file contains more than one phenotype, then use the --mpheno N option to specify the Nth phenotype is the one
31 to be used:
32 plink --file mydata --pheno pheno2.txt --mpheno 4
33
34 where pheno2.txt contains 5 different phenotypes (i.e. 7 columns in total), this command will use the 4th for analysis
35 (phenotype D):
36
37 Family ID
38 Individual ID
39 Phenotype A
40 Phenotype B
41 Phenotype C
42 Phenotype D
43 Phenotype E
44
45 Alternatively, your alternate phenotype file can have a header row, in which case you can use variable names to specify which
46 phenotype to use. If you have a header row, the first two variables must be labelled FID and IID. All subsequent variable names
47 cannot have any whitespace in them. For example,
48
49 FID IID qt1 bmi site
50 F1 1110 2.3 22.22 2
51 F2 2202 34.12 18.23 1
52 ...
53
54 then
55 plink --file mydata --pheno pheno2.txt --pheno-name bmi --assoc
56
57 will select the second phenotype labelled "bmi", for analysis
58
59 Finally, if there is more than one phenotype, then for basic association tests, it is possible to specify that all phenotypes
60 be tested, sequentially, with the output sent to different files: e.g. if bigpheno.raw contains 10,000 phenotypes, then
61 plink --bfile mydata --assoc --pheno bigpheno.raw --all-pheno
62
63 will loop over all of these, one at a time testing for association with SNP, generating a lot of output. You might want to use
64 the --pfilter command in this case, to only report results with a p-value less than a certain value, e.g. --pfilter 1e-3.
65
66 WARNING Currently, all phenotypes must be numerically coded, including missing values, in the alternate phenotype file. The
67 default missing value is -9, change this with --missing-phenotype, but it must be a numeric value still (in contrast to the
68 main phenotype in the PED/FAM file. This issue will be fixed in future releases.
69 Covariate files
70
71 ===========================
72 rgfakePhe.xml
73 <tool id="fakePhe1" name="Fake phenos">
74 <description>for multiple null fake phenotype</description>
75 <command interpreter="python2.4">rgfakePhe.py $input1 '$title1' $out_file1 $log_file1 $script_file</command>
76 <inputs>
77 <page>
78 <param name="input1"
79 type="library" format="lped"
80 label="Pedigree from Dataset"/>
81 <param name="title1" type="text"
82 value="My fake phenos" size="60"
83 label="Title for outputs"/>
84 </page>
85 <page>
86 <repeat name="fakePhe" title="Phenotypes to Fake">
87 <param name="pName" type="text" label="Phenotype Name">
88 </param>
89 <conditional name="series">
90 <param name="phetype" type="select" label="Phenotype Type">
91 <option value="rnorm" selected="true">Random normal variate</option>
92 <option value="cat">Random categorical</option>
93 </param>
94 <when value="rnorm">
95 <param name="Mean" type="float" value="0.0" label="Mean">
96 </param>
97 <param name="SD" type="float" label="SD" value="1.0">
98 </param>
99 </when>
100 <when value="cat">
101 <param name="values" type="text" value="1,2,3,fiddle" label="comma separated values to choose from">
102 </param>
103 </when>
104 </conditional>
105 </repeat>
106 </page>
107 </inputs>
108 <configfiles>
109 <configfile name="script_file">
110 #for $n, $f in enumerate($fakePhe)
111 #if $f.series.phetype=='rnorm'
112 {'pN':'$f.pName','pT':'$f.series.phetype','pP':"{'Mean':'$f.series.Mean', 'SD':'$f.series.SD'}"}
113 #elif $f.series.phetype=='cat'
114 {'pN':'$f.pName','pT':'$f.series.phetype','pP':"{'values':'$f.series.values'}"}
115 #end if
116 #end for
117 </configfile>
118 </configfiles>
119
120 <outputs>
121 <data format="pphe" name="out_file1" />
122 <data format="text" name="log_file1" parent="out_file1"/>
123 </outputs>
124
125 <help>
126 .. class:: infomark
127
128 This tool allows you to generate an arbitrary (sort of)
129 synthetic phenotype file with measurements drawn from normal,
130 gamma, or categorical distributions. These are for testing under
131 the null hypothesis of no association - the values are random but
132 from user specified distributions.
133
134 -----
135
136 .. class:: warningmark
137
138 This tool is very experimental
139
140 -----
141
142 - **Pedigree** is a library pedigree file - the id's will be used in the synthetic null phenotypes
143 - **Title** is a name to give to the output phenotype file
144
145 On the next page, you can add an unlimited number of various kinds of phenotypes including choices for
146 categorical ones or distributions with specific parameters
147
148 Just keep adding new ones until you're done then use the Execute button to run the generation
149
150
151
152
153 **Example from another tool to keep you busy and in awe**
154
155 Input file::
156
157 1 68 4.1
158 2 71 4.6
159 3 62 3.8
160 4 75 4.4
161 5 58 3.2
162 6 60 3.1
163 7 67 3.8
164 8 68 4.1
165 9 71 4.3
166 10 69 3.7
167
168 Create a two series XY plot on the above data:
169
170 - Series 1: Red Dashed-Line plot between columns 1 and 2
171 - Series 2: Blue Circular-Point plot between columns 3 and 2
172
173 .. image:: ./static/images/xy_example.jpg
174 </help>
175 </tool>
176
177
178
179 """
180
181 import random,copy,sys,os,time,string,math
182
183 doFbatphe = False
184
185 galhtmlprefix = """<?xml version="1.0" encoding="utf-8" ?>
186 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
187 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
188 <head>
189 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
190 <meta name="generator" content="Galaxy %s tool output - see http://g2.trac.bx.psu.edu/" />
191 <title></title>
192 <link rel="stylesheet" href="/static/style/base.css" type="text/css" />
193 </head>
194 <body>
195 <div class="document">
196 """
197
198
199 def timenow():
200 """return current time as a string
201 """
202 return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time()))
203
204
205 def poisson(lamb=2):
206 """
207 algorithm poisson random number (Knuth):
208 init:
209 Let L = e^-lamb, k = 0 and p = 1.
210 do:
211 k = k + 1.
212 Generate uniform random number u in [0,1] and let p = p u.
213 while p >= L.
214 return k - 1.
215 """
216 lamb = float(lamb)
217 l = math.e**(-lamb)
218 k=0
219 p=1
220 while 1:
221 while p >= l:
222 k += 1
223 u = random.uniform(0.0,1.0)
224 p *= u
225 yield '%e' % (k - 1)
226
227 def gammaPhe(alpha=1,beta=1):
228 """generator for random values from a gamma
229 """
230 while 1: # an iterator for a random phenotype
231 dat = random.gammavariate(float(alpha),float(beta))
232 yield '%e' % dat
233
234 def weibullPhe(alpha=1,beta=1):
235 """generator for random values from a weibull distribution
236 """
237 while 1: # an iterator for a random phenotype
238 dat = random.weibullvariate(float(alpha),float(beta))
239 yield '%e' % dat
240
241 def normPhe(mean=0,sd=1):
242 """generator for random values from a normal distribution
243 """
244 while 1:# an iterator for a random phenotype
245 dat = random.normalvariate(float(mean),float(sd))
246 yield '%e' % dat
247
248 def expoPhe(mean=1):
249 """generator for random values from an exponential distribution
250 """
251 lamb = 1.0/float(mean)
252 while 1:# an iterator for a random phenotype
253 dat = random.expovariate(lamb)
254 yield '%e' % dat
255
256
257 def catPhe(values='1,2,3'):
258 """ Schrodinger's of course.
259 """
260 v = values.split(',')
261 while 1:# an iterator for a random phenotype
262 dat = random.choice(v)
263 yield dat
264
265 def uniPhe(low=0.0,hi=1.0):
266 """generator for a uniform distribution
267 what if low=-5 and hi=-2
268 """
269 low = float(low)
270 hi = float(hi)
271 while 1: # unif phenotype
272 v = random.uniform(low,hi) # 0-1
273 yield '%e' % v
274
275 def getIds(indir='foo'):
276 """read identifiers - first 2 cols from a pedigree file or fam file
277 """
278 inpath = os.path.split(indir)[0] # get root
279 infam = '%s.fam' % indir
280 inped = '%s.ped' % indir # if there's a ped
281 flist = os.listdir(inpath)
282 if len(flist) == 0:
283 print >> sys.stderr, '### Error - input path %s is empty' % indir
284 sys.exit(1)
285 if os.path.exists(infam):
286 pfname = infam
287 elif os.path.exists(inped):
288 pfname = inped
289 else:
290 print >> sys.stderr, '### Error - input path %s contains no ped or fam' % indir
291 sys.exit(1)
292 f = file(pfname,'r')
293 ids = []
294 for l in f:
295 ll = l.split()
296 if len(ll) > 5: # ok line?
297 ids.append(ll[:2])
298 return ids
299
300 def makePhe(phes = [],ids=[]):
301 """Create the null phenotype values and append them to the case id
302 phes is the (generator, headername) for each column
303 for a phe file, ids are the case identifiers for the phenotype file
304 res contains the final strings for the file
305 each column is derived by iterating
306 over the generator actions set up by makePhes
307 """
308 header = ['FID','IID'] # for plink
309 res = copy.copy(ids)
310 for (f,fname) in phes:
311 header.append(fname)
312 for n,subject in enumerate(ids):
313 res[n].append(f.next()) # generate the right value
314 res.insert(0,header)
315 res = [' '.join(x) for x in res] # must be space delim for fbat
316 return res
317
318 def makePhes(pheTypes=[], pheNames=[], pheParams=[]):
319 """set up phes for makePhe
320 each has to have an iterator (action) and a name
321 """
322 action = None
323 phes = []
324 for n,pt in enumerate(pheTypes):
325 name = pheNames[n]
326 if pt == 'rnorm':
327 m = pheParams[n].get('Mean',0.0)
328 s = pheParams[n].get('SD',1.0)
329 action = normPhe(mean=m,sd=s) # so we can just iterate over action
330 elif pt == 'rgamma':
331 m = pheParams[n].get('Alpha',0.0)
332 s = pheParams[n].get('Beta',1.0)
333 action = gammaPhe(alpha=m,beta=s)
334 if pt == 'exponential':
335 m = pheParams[n].get('Mean',1.0)
336 action = expoPhe(mean=m) # so we can just iterate over action
337 elif pt == 'weibull':
338 m = pheParams[n].get('Alpha',0.0)
339 s = pheParams[n].get('Beta',1.0)
340 action = weibullPhe(alpha=m,beta=s)
341 elif pt == 'cat':
342 v = pheParams[n].get('values',['?',])
343 action = catPhe(values=v)
344 elif pt == 'unif':
345 low = pheParams[n].get('low',0.0)
346 hi = pheParams[n].get('hi',1.0)
347 action = uniPhe(low=low,hi=hi)
348 elif pt == 'poisson':
349 lamb = pheParams[n].get('lamb',1.0)
350 action = poisson(lamb=lamb)
351 phes.append((action,name))
352 return phes
353
354 def doImport(outfile='test',flist=[],expl='',mylog=[]):
355 """ import into one of the new html composite data types for Rgenetics
356 Dan Blankenberg with mods by Ross Lazarus
357 October 2007
358 """
359 outf = open(outfile,'w')
360 progname = os.path.basename(sys.argv[0])
361 outf.write(galhtmlprefix % progname)
362 if len(flist) > 0:
363 outf.write('<ol>\n')
364 for i, data in enumerate( flist ):
365 outf.write('<li><a href="%s">%s %s</a></li>\n' % (os.path.split(data)[-1],os.path.split(data)[-1],expl))
366 outf.write('</ol>\n')
367 else:
368 outf.write('No files found')
369 outf.write('<hr/><h4>Log of run follows:</h4><br/><pre>%s\n</pre><br/><hr/>' % ('\n'.join(mylog)))
370 outf.write("</div></body></html>\n")
371 outf.close()
372
373
374 def test():
375 """test case
376 need to get these out of a galaxy form - series of pages - get types
377 on first screen, names on second, params on third?
378 holy shit. this actually works I think
379 """
380 pT = ['rnorm','rnorm','rnorm','rnorm','cat','unif']
381 pN = ['SysBP','DiaBP','HtCM','WtKG','Race','age']
382 pP = [{'Mean':'120','SD':'10'},{'Mean':'90','SD':'15'},{'Mean':'160','SD':'20'},{'Mean':'60','SD':'20'}, \
383 {'values':'Blink,What,Yours,green'},{'low':16,'hi':99}]
384 phes = makePhes(pheTypes=pT, pheNames=pN, pheParams=pP)
385 ids = []
386 for i in range(10):
387 ids.append(['fid%d' % i,'iid%d' % i])
388 pheres = makePhe(phes=phes,ids=ids)
389 res = [''.join(x) for x in pheres]
390 print '\n'.join(res)
391
392
393
394 if __name__ == "__main__":
395 """
396 <command interpreter="python">rgfakePhe.py '$infile1.extra_files_path/$infile1.metadata.base_name'
397 "$title1" '$ppheout' '$ppheout.files_path' '$script_file '
398 </command>
399 The xml file for this tool is complex, and allows any arbitrary number of
400 phenotype columns to be specified from a couple of optional types - rnorm, cat
401 are working now.
402
403 Note that we create new files in their respective library directories and link to them in the output file
404 so they can be displayed and downloaded separately
405
406 """
407 killme = string.punctuation + string.whitespace
408 trantab = string.maketrans(killme,'_'*len(killme))
409 progname = os.path.basename(sys.argv[0])
410 cl = '## at %s, %s got cl= %s' % (timenow(),progname,' '.join(sys.argv))
411 print >> sys.stdout,cl
412 if len(sys.argv) < 5:
413 test()
414 else:
415 inped = sys.argv[1]
416 title = sys.argv[2].translate(trantab)
417 ppheout = sys.argv[3]
418 pphe_path = sys.argv[4]
419 scriptfile = sys.argv[5]
420 ind = file(scriptfile,'r').readlines()
421 mylog = []
422 s = '## %s starting at %s<br/>\n' % (progname,timenow())
423 mylog.append(s)
424 mylog.append(cl)
425 s = '## params = %s<br/>\n' % (' '.join(sys.argv[1:]))
426 mylog.append(s)
427 s = '\n'.join(ind)
428 mylog.append('Script file %s contained %s<br/>\n' % (scriptfile,s))
429 pT = []
430 pN = []
431 pP = []
432 for l in ind:
433 l = l.strip()
434 if len(l) > 1:
435 adict = eval(l)
436 pT.append(adict.get('pT',None))
437 pN.append(adict.get('pN',None))
438 pP.append(eval(adict.get('pP',None)))
439 s = '## pt,pn,pp=%s,%s,%s<br/>\n' % (str(pT),str(pN),str(pP))
440 mylog.append(s)
441 phes = makePhes(pheTypes=pT, pheNames=pN, pheParams=pP)
442 ids = getIds(indir=inped) # for labelling rows
443 pheres = makePhe(phes=phes,ids=ids) # random values from given distributions
444 try:
445 os.makedirs(pphe_path)
446 except:
447 pass
448 outname = os.path.join(pphe_path,title)
449 pphefname = '%s.pphe' % outname
450 f = file(pphefname, 'w')
451 f.write('\n'.join(pheres))
452 f.write('\n')
453 f.close()
454 if doFbatphe:
455 try:
456 os.makedirs(fphe_path)
457 except:
458 pass
459 outname = os.path.join(fphe_path,title)
460 fphefname = '%s.phe' % outname
461 f = file(fphefname, 'w')
462 header = pheres[0].split()
463 pheres[0] = ' '.join(header[2:])# remove iid fid from header for fbat
464 f.write('\n'.join(pheres))
465 f.close()
466 doImport(outfile=fpheout,flist=[fphefname,],expl='(FBAT phenotype format)',mylog=mylog)
467 #doImport(outfile='test',flist=[],expl='',mylog=[]):
468 doImport(outfile=ppheout,flist=[pphefname,],expl='(Plink phenotype format)',mylog=mylog)
469