0
|
1 """
|
|
2 fakephe.py
|
|
3 ross lazarus sept 30 2007
|
|
4 This is available under the LGPL as defined then.
|
|
5
|
|
6 use the pedigree data for ids
|
|
7
|
|
8 use pythons generators to literally generate a bunch of random phenotype measurements
|
|
9
|
|
10 Plink format at http://pngu.mgh.harvard.edu/~purcell/plink/data.shtml#pheno
|
|
11 is
|
|
12
|
|
13 To specify an alternate phenotype for analysis, i.e. other than the one in the *.ped file (or, if using a binary fileset, the
|
|
14 *.fam file), use the --pheno option:
|
|
15 plink --file mydata --pheno pheno.txt
|
|
16
|
|
17 where pheno.txt is a file that contains 3 columns (one row per individual):
|
|
18
|
|
19 Family ID
|
|
20 Individual ID
|
|
21 Phenotype
|
|
22
|
|
23 NOTE The original PED file must still contain a phenotype in column 6 (even if this is a dummy phenotype, e.g. all missing),
|
|
24 unless the --no-pheno flag is given.
|
|
25
|
|
26 If an individual is in the original file but not listed in the alternate phenotype file, that person's phenotype will be set to
|
|
27 missing. If a person is in the alternate phenotype file but not in the original file, that entry will be ignored. The order of
|
|
28 the alternate phenotype file need not be the same as for the original file.
|
|
29
|
|
30 If the phenotype file contains more than one phenotype, then use the --mpheno N option to specify the Nth phenotype is the one
|
|
31 to be used:
|
|
32 plink --file mydata --pheno pheno2.txt --mpheno 4
|
|
33
|
|
34 where pheno2.txt contains 5 different phenotypes (i.e. 7 columns in total), this command will use the 4th for analysis
|
|
35 (phenotype D):
|
|
36
|
|
37 Family ID
|
|
38 Individual ID
|
|
39 Phenotype A
|
|
40 Phenotype B
|
|
41 Phenotype C
|
|
42 Phenotype D
|
|
43 Phenotype E
|
|
44
|
|
45 Alternatively, your alternate phenotype file can have a header row, in which case you can use variable names to specify which
|
|
46 phenotype to use. If you have a header row, the first two variables must be labelled FID and IID. All subsequent variable names
|
|
47 cannot have any whitespace in them. For example,
|
|
48
|
|
49 FID IID qt1 bmi site
|
|
50 F1 1110 2.3 22.22 2
|
|
51 F2 2202 34.12 18.23 1
|
|
52 ...
|
|
53
|
|
54 then
|
|
55 plink --file mydata --pheno pheno2.txt --pheno-name bmi --assoc
|
|
56
|
|
57 will select the second phenotype labelled "bmi", for analysis
|
|
58
|
|
59 Finally, if there is more than one phenotype, then for basic association tests, it is possible to specify that all phenotypes
|
|
60 be tested, sequentially, with the output sent to different files: e.g. if bigpheno.raw contains 10,000 phenotypes, then
|
|
61 plink --bfile mydata --assoc --pheno bigpheno.raw --all-pheno
|
|
62
|
|
63 will loop over all of these, one at a time testing for association with SNP, generating a lot of output. You might want to use
|
|
64 the --pfilter command in this case, to only report results with a p-value less than a certain value, e.g. --pfilter 1e-3.
|
|
65
|
|
66 WARNING Currently, all phenotypes must be numerically coded, including missing values, in the alternate phenotype file. The
|
|
67 default missing value is -9, change this with --missing-phenotype, but it must be a numeric value still (in contrast to the
|
|
68 main phenotype in the PED/FAM file. This issue will be fixed in future releases.
|
|
69 Covariate files
|
|
70
|
|
71 ===========================
|
|
72 rgfakePhe.xml
|
|
73 <tool id="fakePhe1" name="Fake phenos">
|
|
74 <description>for multiple null fake phenotype</description>
|
|
75 <command interpreter="python2.4">rgfakePhe.py $input1 '$title1' $out_file1 $log_file1 $script_file</command>
|
|
76 <inputs>
|
|
77 <page>
|
|
78 <param name="input1"
|
|
79 type="library" format="lped"
|
|
80 label="Pedigree from Dataset"/>
|
|
81 <param name="title1" type="text"
|
|
82 value="My fake phenos" size="60"
|
|
83 label="Title for outputs"/>
|
|
84 </page>
|
|
85 <page>
|
|
86 <repeat name="fakePhe" title="Phenotypes to Fake">
|
|
87 <param name="pName" type="text" label="Phenotype Name">
|
|
88 </param>
|
|
89 <conditional name="series">
|
|
90 <param name="phetype" type="select" label="Phenotype Type">
|
|
91 <option value="rnorm" selected="true">Random normal variate</option>
|
|
92 <option value="cat">Random categorical</option>
|
|
93 </param>
|
|
94 <when value="rnorm">
|
|
95 <param name="Mean" type="float" value="0.0" label="Mean">
|
|
96 </param>
|
|
97 <param name="SD" type="float" label="SD" value="1.0">
|
|
98 </param>
|
|
99 </when>
|
|
100 <when value="cat">
|
|
101 <param name="values" type="text" value="1,2,3,fiddle" label="comma separated values to choose from">
|
|
102 </param>
|
|
103 </when>
|
|
104 </conditional>
|
|
105 </repeat>
|
|
106 </page>
|
|
107 </inputs>
|
|
108 <configfiles>
|
|
109 <configfile name="script_file">
|
|
110 #for $n, $f in enumerate($fakePhe)
|
|
111 #if $f.series.phetype=='rnorm'
|
|
112 {'pN':'$f.pName','pT':'$f.series.phetype','pP':"{'Mean':'$f.series.Mean', 'SD':'$f.series.SD'}"}
|
|
113 #elif $f.series.phetype=='cat'
|
|
114 {'pN':'$f.pName','pT':'$f.series.phetype','pP':"{'values':'$f.series.values'}"}
|
|
115 #end if
|
|
116 #end for
|
|
117 </configfile>
|
|
118 </configfiles>
|
|
119
|
|
120 <outputs>
|
|
121 <data format="pphe" name="out_file1" />
|
|
122 <data format="text" name="log_file1" parent="out_file1"/>
|
|
123 </outputs>
|
|
124
|
|
125 <help>
|
|
126 .. class:: infomark
|
|
127
|
|
128 This tool allows you to generate an arbitrary (sort of)
|
|
129 synthetic phenotype file with measurements drawn from normal,
|
|
130 gamma, or categorical distributions. These are for testing under
|
|
131 the null hypothesis of no association - the values are random but
|
|
132 from user specified distributions.
|
|
133
|
|
134 -----
|
|
135
|
|
136 .. class:: warningmark
|
|
137
|
|
138 This tool is very experimental
|
|
139
|
|
140 -----
|
|
141
|
|
142 - **Pedigree** is a library pedigree file - the id's will be used in the synthetic null phenotypes
|
|
143 - **Title** is a name to give to the output phenotype file
|
|
144
|
|
145 On the next page, you can add an unlimited number of various kinds of phenotypes including choices for
|
|
146 categorical ones or distributions with specific parameters
|
|
147
|
|
148 Just keep adding new ones until you're done then use the Execute button to run the generation
|
|
149
|
|
150
|
|
151
|
|
152
|
|
153 **Example from another tool to keep you busy and in awe**
|
|
154
|
|
155 Input file::
|
|
156
|
|
157 1 68 4.1
|
|
158 2 71 4.6
|
|
159 3 62 3.8
|
|
160 4 75 4.4
|
|
161 5 58 3.2
|
|
162 6 60 3.1
|
|
163 7 67 3.8
|
|
164 8 68 4.1
|
|
165 9 71 4.3
|
|
166 10 69 3.7
|
|
167
|
|
168 Create a two series XY plot on the above data:
|
|
169
|
|
170 - Series 1: Red Dashed-Line plot between columns 1 and 2
|
|
171 - Series 2: Blue Circular-Point plot between columns 3 and 2
|
|
172
|
|
173 .. image:: ./static/images/xy_example.jpg
|
|
174 </help>
|
|
175 </tool>
|
|
176
|
|
177
|
|
178
|
|
179 """
|
|
180
|
|
181 import random,copy,sys,os,time,string,math
|
|
182
|
|
183 doFbatphe = False
|
|
184
|
|
185 galhtmlprefix = """<?xml version="1.0" encoding="utf-8" ?>
|
|
186 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
|
187 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
|
|
188 <head>
|
|
189 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
|
|
190 <meta name="generator" content="Galaxy %s tool output - see http://g2.trac.bx.psu.edu/" />
|
|
191 <title></title>
|
|
192 <link rel="stylesheet" href="/static/style/base.css" type="text/css" />
|
|
193 </head>
|
|
194 <body>
|
|
195 <div class="document">
|
|
196 """
|
|
197
|
|
198
|
|
199 def timenow():
|
|
200 """return current time as a string
|
|
201 """
|
|
202 return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time()))
|
|
203
|
|
204
|
|
205 def poisson(lamb=2):
|
|
206 """
|
|
207 algorithm poisson random number (Knuth):
|
|
208 init:
|
|
209 Let L = e^-lamb, k = 0 and p = 1.
|
|
210 do:
|
|
211 k = k + 1.
|
|
212 Generate uniform random number u in [0,1] and let p = p u.
|
|
213 while p >= L.
|
|
214 return k - 1.
|
|
215 """
|
|
216 lamb = float(lamb)
|
|
217 l = math.e**(-lamb)
|
|
218 k=0
|
|
219 p=1
|
|
220 while 1:
|
|
221 while p >= l:
|
|
222 k += 1
|
|
223 u = random.uniform(0.0,1.0)
|
|
224 p *= u
|
|
225 yield '%e' % (k - 1)
|
|
226
|
|
227 def gammaPhe(alpha=1,beta=1):
|
|
228 """generator for random values from a gamma
|
|
229 """
|
|
230 while 1: # an iterator for a random phenotype
|
|
231 dat = random.gammavariate(float(alpha),float(beta))
|
|
232 yield '%e' % dat
|
|
233
|
|
234 def weibullPhe(alpha=1,beta=1):
|
|
235 """generator for random values from a weibull distribution
|
|
236 """
|
|
237 while 1: # an iterator for a random phenotype
|
|
238 dat = random.weibullvariate(float(alpha),float(beta))
|
|
239 yield '%e' % dat
|
|
240
|
|
241 def normPhe(mean=0,sd=1):
|
|
242 """generator for random values from a normal distribution
|
|
243 """
|
|
244 while 1:# an iterator for a random phenotype
|
|
245 dat = random.normalvariate(float(mean),float(sd))
|
|
246 yield '%e' % dat
|
|
247
|
|
248 def expoPhe(mean=1):
|
|
249 """generator for random values from an exponential distribution
|
|
250 """
|
|
251 lamb = 1.0/float(mean)
|
|
252 while 1:# an iterator for a random phenotype
|
|
253 dat = random.expovariate(lamb)
|
|
254 yield '%e' % dat
|
|
255
|
|
256
|
|
257 def catPhe(values='1,2,3'):
|
|
258 """ Schrodinger's of course.
|
|
259 """
|
|
260 v = values.split(',')
|
|
261 while 1:# an iterator for a random phenotype
|
|
262 dat = random.choice(v)
|
|
263 yield dat
|
|
264
|
|
265 def uniPhe(low=0.0,hi=1.0):
|
|
266 """generator for a uniform distribution
|
|
267 what if low=-5 and hi=-2
|
|
268 """
|
|
269 low = float(low)
|
|
270 hi = float(hi)
|
|
271 while 1: # unif phenotype
|
|
272 v = random.uniform(low,hi) # 0-1
|
|
273 yield '%e' % v
|
|
274
|
|
275 def getIds(indir='foo'):
|
|
276 """read identifiers - first 2 cols from a pedigree file or fam file
|
|
277 """
|
|
278 inpath = os.path.split(indir)[0] # get root
|
|
279 infam = '%s.fam' % indir
|
|
280 inped = '%s.ped' % indir # if there's a ped
|
|
281 flist = os.listdir(inpath)
|
|
282 if len(flist) == 0:
|
|
283 print >> sys.stderr, '### Error - input path %s is empty' % indir
|
|
284 sys.exit(1)
|
|
285 if os.path.exists(infam):
|
|
286 pfname = infam
|
|
287 elif os.path.exists(inped):
|
|
288 pfname = inped
|
|
289 else:
|
|
290 print >> sys.stderr, '### Error - input path %s contains no ped or fam' % indir
|
|
291 sys.exit(1)
|
|
292 f = file(pfname,'r')
|
|
293 ids = []
|
|
294 for l in f:
|
|
295 ll = l.split()
|
|
296 if len(ll) > 5: # ok line?
|
|
297 ids.append(ll[:2])
|
|
298 return ids
|
|
299
|
|
300 def makePhe(phes = [],ids=[]):
|
|
301 """Create the null phenotype values and append them to the case id
|
|
302 phes is the (generator, headername) for each column
|
|
303 for a phe file, ids are the case identifiers for the phenotype file
|
|
304 res contains the final strings for the file
|
|
305 each column is derived by iterating
|
|
306 over the generator actions set up by makePhes
|
|
307 """
|
|
308 header = ['FID','IID'] # for plink
|
|
309 res = copy.copy(ids)
|
|
310 for (f,fname) in phes:
|
|
311 header.append(fname)
|
|
312 for n,subject in enumerate(ids):
|
|
313 res[n].append(f.next()) # generate the right value
|
|
314 res.insert(0,header)
|
|
315 res = [' '.join(x) for x in res] # must be space delim for fbat
|
|
316 return res
|
|
317
|
|
318 def makePhes(pheTypes=[], pheNames=[], pheParams=[]):
|
|
319 """set up phes for makePhe
|
|
320 each has to have an iterator (action) and a name
|
|
321 """
|
|
322 action = None
|
|
323 phes = []
|
|
324 for n,pt in enumerate(pheTypes):
|
|
325 name = pheNames[n]
|
|
326 if pt == 'rnorm':
|
|
327 m = pheParams[n].get('Mean',0.0)
|
|
328 s = pheParams[n].get('SD',1.0)
|
|
329 action = normPhe(mean=m,sd=s) # so we can just iterate over action
|
|
330 elif pt == 'rgamma':
|
|
331 m = pheParams[n].get('Alpha',0.0)
|
|
332 s = pheParams[n].get('Beta',1.0)
|
|
333 action = gammaPhe(alpha=m,beta=s)
|
|
334 if pt == 'exponential':
|
|
335 m = pheParams[n].get('Mean',1.0)
|
|
336 action = expoPhe(mean=m) # so we can just iterate over action
|
|
337 elif pt == 'weibull':
|
|
338 m = pheParams[n].get('Alpha',0.0)
|
|
339 s = pheParams[n].get('Beta',1.0)
|
|
340 action = weibullPhe(alpha=m,beta=s)
|
|
341 elif pt == 'cat':
|
|
342 v = pheParams[n].get('values',['?',])
|
|
343 action = catPhe(values=v)
|
|
344 elif pt == 'unif':
|
|
345 low = pheParams[n].get('low',0.0)
|
|
346 hi = pheParams[n].get('hi',1.0)
|
|
347 action = uniPhe(low=low,hi=hi)
|
|
348 elif pt == 'poisson':
|
|
349 lamb = pheParams[n].get('lamb',1.0)
|
|
350 action = poisson(lamb=lamb)
|
|
351 phes.append((action,name))
|
|
352 return phes
|
|
353
|
|
354 def doImport(outfile='test',flist=[],expl='',mylog=[]):
|
|
355 """ import into one of the new html composite data types for Rgenetics
|
|
356 Dan Blankenberg with mods by Ross Lazarus
|
|
357 October 2007
|
|
358 """
|
|
359 outf = open(outfile,'w')
|
|
360 progname = os.path.basename(sys.argv[0])
|
|
361 outf.write(galhtmlprefix % progname)
|
|
362 if len(flist) > 0:
|
|
363 outf.write('<ol>\n')
|
|
364 for i, data in enumerate( flist ):
|
|
365 outf.write('<li><a href="%s">%s %s</a></li>\n' % (os.path.split(data)[-1],os.path.split(data)[-1],expl))
|
|
366 outf.write('</ol>\n')
|
|
367 else:
|
|
368 outf.write('No files found')
|
|
369 outf.write('<hr/><h4>Log of run follows:</h4><br/><pre>%s\n</pre><br/><hr/>' % ('\n'.join(mylog)))
|
|
370 outf.write("</div></body></html>\n")
|
|
371 outf.close()
|
|
372
|
|
373
|
|
374 def test():
|
|
375 """test case
|
|
376 need to get these out of a galaxy form - series of pages - get types
|
|
377 on first screen, names on second, params on third?
|
|
378 holy shit. this actually works I think
|
|
379 """
|
|
380 pT = ['rnorm','rnorm','rnorm','rnorm','cat','unif']
|
|
381 pN = ['SysBP','DiaBP','HtCM','WtKG','Race','age']
|
|
382 pP = [{'Mean':'120','SD':'10'},{'Mean':'90','SD':'15'},{'Mean':'160','SD':'20'},{'Mean':'60','SD':'20'}, \
|
|
383 {'values':'Blink,What,Yours,green'},{'low':16,'hi':99}]
|
|
384 phes = makePhes(pheTypes=pT, pheNames=pN, pheParams=pP)
|
|
385 ids = []
|
|
386 for i in range(10):
|
|
387 ids.append(['fid%d' % i,'iid%d' % i])
|
|
388 pheres = makePhe(phes=phes,ids=ids)
|
|
389 res = [''.join(x) for x in pheres]
|
|
390 print '\n'.join(res)
|
|
391
|
|
392
|
|
393
|
|
394 if __name__ == "__main__":
|
|
395 """
|
|
396 <command interpreter="python">rgfakePhe.py '$infile1.extra_files_path/$infile1.metadata.base_name'
|
|
397 "$title1" '$ppheout' '$ppheout.files_path' '$script_file '
|
|
398 </command>
|
|
399 The xml file for this tool is complex, and allows any arbitrary number of
|
|
400 phenotype columns to be specified from a couple of optional types - rnorm, cat
|
|
401 are working now.
|
|
402
|
|
403 Note that we create new files in their respective library directories and link to them in the output file
|
|
404 so they can be displayed and downloaded separately
|
|
405
|
|
406 """
|
|
407 killme = string.punctuation + string.whitespace
|
|
408 trantab = string.maketrans(killme,'_'*len(killme))
|
|
409 progname = os.path.basename(sys.argv[0])
|
|
410 cl = '## at %s, %s got cl= %s' % (timenow(),progname,' '.join(sys.argv))
|
|
411 print >> sys.stdout,cl
|
|
412 if len(sys.argv) < 5:
|
|
413 test()
|
|
414 else:
|
|
415 inped = sys.argv[1]
|
|
416 title = sys.argv[2].translate(trantab)
|
|
417 ppheout = sys.argv[3]
|
|
418 pphe_path = sys.argv[4]
|
|
419 scriptfile = sys.argv[5]
|
|
420 ind = file(scriptfile,'r').readlines()
|
|
421 mylog = []
|
|
422 s = '## %s starting at %s<br/>\n' % (progname,timenow())
|
|
423 mylog.append(s)
|
|
424 mylog.append(cl)
|
|
425 s = '## params = %s<br/>\n' % (' '.join(sys.argv[1:]))
|
|
426 mylog.append(s)
|
|
427 s = '\n'.join(ind)
|
|
428 mylog.append('Script file %s contained %s<br/>\n' % (scriptfile,s))
|
|
429 pT = []
|
|
430 pN = []
|
|
431 pP = []
|
|
432 for l in ind:
|
|
433 l = l.strip()
|
|
434 if len(l) > 1:
|
|
435 adict = eval(l)
|
|
436 pT.append(adict.get('pT',None))
|
|
437 pN.append(adict.get('pN',None))
|
|
438 pP.append(eval(adict.get('pP',None)))
|
|
439 s = '## pt,pn,pp=%s,%s,%s<br/>\n' % (str(pT),str(pN),str(pP))
|
|
440 mylog.append(s)
|
|
441 phes = makePhes(pheTypes=pT, pheNames=pN, pheParams=pP)
|
|
442 ids = getIds(indir=inped) # for labelling rows
|
|
443 pheres = makePhe(phes=phes,ids=ids) # random values from given distributions
|
|
444 try:
|
|
445 os.makedirs(pphe_path)
|
|
446 except:
|
|
447 pass
|
|
448 outname = os.path.join(pphe_path,title)
|
|
449 pphefname = '%s.pphe' % outname
|
|
450 f = file(pphefname, 'w')
|
|
451 f.write('\n'.join(pheres))
|
|
452 f.write('\n')
|
|
453 f.close()
|
|
454 if doFbatphe:
|
|
455 try:
|
|
456 os.makedirs(fphe_path)
|
|
457 except:
|
|
458 pass
|
|
459 outname = os.path.join(fphe_path,title)
|
|
460 fphefname = '%s.phe' % outname
|
|
461 f = file(fphefname, 'w')
|
|
462 header = pheres[0].split()
|
|
463 pheres[0] = ' '.join(header[2:])# remove iid fid from header for fbat
|
|
464 f.write('\n'.join(pheres))
|
|
465 f.close()
|
|
466 doImport(outfile=fpheout,flist=[fphefname,],expl='(FBAT phenotype format)',mylog=mylog)
|
|
467 #doImport(outfile='test',flist=[],expl='',mylog=[]):
|
|
468 doImport(outfile=ppheout,flist=[pphefname,],expl='(Plink phenotype format)',mylog=mylog)
|
|
469
|