Mercurial > repos > xuebing > sharplabtool
comparison tools/rgenetics/rgfakePed.py @ 0:9071e359b9a3
Uploaded
author | xuebing |
---|---|
date | Fri, 09 Mar 2012 19:37:19 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:9071e359b9a3 |
---|---|
1 # modified may 2011 to name components (map/ped) as RgeneticsData to align with default base_name | |
2 # otherwise downstream tools fail | |
3 # modified march 2011 to remove post execution hook | |
4 # pedigree data faker | |
5 # specifically designed for scalability testing of | |
6 # Shaun Purcel's PLINK package | |
7 # derived from John Ziniti's original suggestion | |
8 # allele frequency spectrum and random mating added | |
9 # ross lazarus me fecit january 13 2007 | |
10 # copyright ross lazarus 2007 | |
11 # without psyco | |
12 # generates about 10k snp genotypes in 2k subjects (666 trios) per minute or so. | |
13 # so 500k (a billion genotypes), at about 4 trios/min will a couple of hours to generate | |
14 # psyco makes it literally twice as quick!! | |
15 # all rights reserved except as granted under the terms of the LGPL | |
16 # see http://www.gnu.org/licenses/lgpl.html | |
17 # for a copy of the license you receive with this software | |
18 # and for your rights and obligations | |
19 # especially if you wish to modify or redistribute this code | |
20 # january 19 added random missingness inducer | |
21 # currently about 15M genos/minute without psyco, 30M/minute with | |
22 # so a billion genos should take about 40 minutes with psyco or 80 without... | |
23 # added mendel error generator jan 23 rml | |
24 | |
25 | |
26 import random,sys,time,os,string | |
27 | |
28 from optparse import OptionParser | |
29 | |
30 defbasename="RgeneticsData" | |
31 width = 500000 | |
32 ALLELES = ['1','2','3','4'] | |
33 prog = os.path.split(sys.argv[0])[-1] | |
34 debug = 0 | |
35 | |
36 """Natural-order sorting, supporting embedded numbers. | |
37 # found at http://lists.canonical.org/pipermail/kragen-hacks/2005-October/000419.html | |
38 note test code there removed to conserve brain space | |
39 foo9bar2 < foo10bar2 < foo10bar10 | |
40 | |
41 """ | |
42 import random, re, sys | |
43 | |
44 def natsort_key(item): | |
45 chunks = re.split('(\d+(?:\.\d+)?)', item) | |
46 for ii in range(len(chunks)): | |
47 if chunks[ii] and chunks[ii][0] in '0123456789': | |
48 if '.' in chunks[ii]: numtype = float | |
49 else: numtype = int | |
50 # wrap in tuple with '0' to explicitly specify numbers come first | |
51 chunks[ii] = (0, numtype(chunks[ii])) | |
52 else: | |
53 chunks[ii] = (1, chunks[ii]) | |
54 return (chunks, item) | |
55 | |
56 def natsort(seq): | |
57 "Sort a sequence of text strings in a reasonable order." | |
58 alist = [item for item in seq] | |
59 alist.sort(key=natsort_key) | |
60 return alist | |
61 | |
62 | |
63 def makeUniformMAFdist(low=0.02, high=0.5): | |
64 """Fake a non-uniform maf distribution to make the data | |
65 more interesting. Provide uniform 0.02-0.5 distribution""" | |
66 MAFdistribution = [] | |
67 for i in xrange(int(100*low),int(100*high)+1): | |
68 freq = i/100.0 # uniform | |
69 MAFdistribution.append(freq) | |
70 return MAFdistribution | |
71 | |
72 def makeTriangularMAFdist(low=0.02, high=0.5, beta=5): | |
73 """Fake a non-uniform maf distribution to make the data | |
74 more interesting - more rare alleles """ | |
75 MAFdistribution = [] | |
76 for i in xrange(int(100*low),int(100*high)+1): | |
77 freq = (51 - i)/100.0 # large numbers of small allele freqs | |
78 for j in range(beta*i): # or i*i for crude exponential distribution | |
79 MAFdistribution.append(freq) | |
80 return MAFdistribution | |
81 | |
82 def makeFbathead(rslist=[], chromlist=[], poslist=[], width=100000): | |
83 """header row | |
84 """ | |
85 res = ['%s_%s_%s' % (chromlist[x], poslist[x], rslist[x]) for x in range(len(rslist))] | |
86 return ' '.join(res) | |
87 | |
88 def makeMap( width=500000, MAFdistribution=[], useGP=False): | |
89 """make snp allele and frequency tables for consistent generation""" | |
90 usegp = 1 | |
91 snpdb = 'snp126' | |
92 hgdb = 'hg18' | |
93 alleles = [] | |
94 freqs = [] | |
95 rslist = [] | |
96 chromlist = [] | |
97 poslist = [] | |
98 for snp in range(width): | |
99 random.shuffle(ALLELES) | |
100 alleles.append(ALLELES[0:2]) # need two DIFFERENT alleles! | |
101 freqs.append(random.choice(MAFdistribution)) # more rare alleles | |
102 if useGP: | |
103 try: | |
104 import MySQLdb | |
105 genome = MySQLdb.Connect('localhost', 'hg18', 'G3gn0m3') | |
106 curs = genome.cursor() # use default cursor | |
107 except: | |
108 if debug: | |
109 print 'cannot connect to local copy of golden path' | |
110 usegp = 0 | |
111 if usegp and useGP: # urrrgghh getting snps into chrom offset order is complicated.... | |
112 curs.execute('use %s' % hgdb) | |
113 print 'Collecting %d real rs numbers - this may take a while' % width | |
114 # get a random draw of enough reasonable (hapmap) snps with frequency data | |
115 s = '''select distinct chrom,chromEnd, name from %s where avHet > 0 and chrom not like '%%random' | |
116 group by name order by rand() limit %d''' % (snpdb,width) | |
117 curs.execute(s) | |
118 reslist = curs.fetchall() | |
119 reslist = ['%s\t%09d\t%s' % (x[3:],y,z) for x,y,z in reslist] # get rid of chr | |
120 reslist = natsort(reslist) | |
121 for s in reslist: | |
122 chrom,pos,rs = s.split('\t') | |
123 rslist.append(rs) | |
124 chromlist.append(chrom) | |
125 poslist.append(pos) | |
126 else: | |
127 chrom = '1' | |
128 for snp in range(width): | |
129 pos = '%d' % (1000*snp) | |
130 rs = 'rs00%d' % snp | |
131 rslist.append(rs) | |
132 chromlist.append(chrom) | |
133 poslist.append(pos) | |
134 return alleles,freqs, rslist, chromlist, poslist | |
135 | |
136 def writeMap(fprefix = '', fpath='./', rslist=[], chromlist=[], poslist=[], width = 500000): | |
137 """make a faked plink compatible map file - fbat files | |
138 have the map written as a header line""" | |
139 outf = '%s.map'% (fprefix) | |
140 outf = os.path.join(fpath,outf) | |
141 amap = open(outf, 'w') | |
142 res = ['%s\t%s\t0\t%s' % (chromlist[x],rslist[x],poslist[x]) for x in range(len(rslist))] | |
143 res.append('') | |
144 amap.write('\n'.join(res)) | |
145 amap.close() | |
146 | |
147 def makeMissing(genos=[], missrate = 0.03, missval = '0'): | |
148 """impose some random missingness""" | |
149 nsnps = len(genos) | |
150 for snp in range(nsnps): # ignore first 6 columns | |
151 if random.random() <= missrate: | |
152 genos[snp] = '%s %s' % (missval,missval) | |
153 return genos | |
154 | |
155 def makeTriomissing(genos=[], missrate = 0.03, missval = '0'): | |
156 """impose some random missingness on a trio - moth eaten like real data""" | |
157 for person in (0,1): | |
158 nsnps = len(genos[person]) | |
159 for snp in range(nsnps): | |
160 for person in [0,1,2]: | |
161 if random.random() <= missrate: | |
162 genos[person][snp] = '%s %s' % (missval,missval) | |
163 return genos | |
164 | |
165 | |
166 def makeTriomendel(p1g=(0,0),p2g=(0,0), kiddip = (0,0)): | |
167 """impose some random mendels on a trio | |
168 there are 8 of the 9 mating types we can simulate reasonable errors for | |
169 Note, since random mating dual het parents can produce any genotype we can't generate an interesting | |
170 error for them, so the overall mendel rate will be lower than mendrate, depending on | |
171 allele frequency...""" | |
172 if p1g[0] <> p1g[1] and p2g[0] <> p2g[1]: # both parents het | |
173 return kiddip # cannot simulate a mendel error - anything is legal! | |
174 elif (p1g[0] <> p1g[1]): # p1 is het parent so p2 must be hom | |
175 if p2g[0] == 0: # - make child p2 opposite hom for error | |
176 kiddip = (1,1) | |
177 else: | |
178 kiddip = (0,0) | |
179 elif (p2g[0] <> p2g[1]): # p2 is het parent so p1 must be hom | |
180 if p1g[0] == 0: # - make child p1 opposite hom for error | |
181 kiddip = (1,1) | |
182 else: | |
183 kiddip = (0,0) | |
184 elif (p1g[0] == p1g[1]): # p1 is hom parent and if we get here p2 must also be hom | |
185 if p1g[0] == p2g[0]: # both parents are same hom - make child either het or opposite hom for error | |
186 if random.random() <= 0.5: | |
187 kiddip = (0,1) | |
188 else: | |
189 if p1g[0] == 0: | |
190 kiddip = (1,1) | |
191 else: | |
192 kiddip = (0,0) | |
193 else: # parents are opposite hom - return any hom as an error | |
194 if random.random() <= 0.5: | |
195 kiddip = (0,0) | |
196 else: | |
197 kiddip = (1,1) | |
198 return kiddip | |
199 | |
200 | |
201 | |
202 | |
203 def makeFam(width=100, freqs={}, alleles={}, trio=1, missrate=0.03, missval='0', mendrate=0.0): | |
204 """this family is a simple trio, constructed by random mating two random genotypes | |
205 TODO: why not generate from chromosomes - eg hapmap | |
206 set each haplotype locus according to the conditional | |
207 probability implied by the surrounding loci - eg use both neighboring pairs, triplets | |
208 and quads as observed in hapmap ceu""" | |
209 dadped = '%d 1 0 0 1 1 %s' | |
210 mumped = '%d 2 0 0 2 1 %s' # a mother is a mum where I come from :) | |
211 kidped = '%d 3 1 2 %d %d %s' | |
212 family = [] # result accumulator | |
213 sex = random.choice((1,2)) # for the kid | |
214 affected = random.choice((1,2)) | |
215 genos = [[],[],[]] # dad, mum, kid - 0/1 for common,rare initially, then xform to alleles | |
216 # parent1...kidn lists of 0/1 for common,rare initially, then xformed to alleles | |
217 for snp in xrange(width): | |
218 f = freqs[snp] | |
219 for i in range(2): # do dad and mum | |
220 p = random.random() | |
221 a1 = a2 = 0 | |
222 if p <= f: # a rare allele | |
223 a1 = 1 | |
224 p = random.random() | |
225 if p <= f: # a rare allele | |
226 a2 = 1 | |
227 if a1 > a2: | |
228 a1,a2 = a2,a1 # so ordering consistent - 00,01,11 | |
229 dip = (a1,a2) | |
230 genos[i].append(dip) # tuples of 0,1 | |
231 a1 = random.choice(genos[0][snp]) # dad gamete | |
232 a2 = random.choice(genos[1][snp]) # mum gamete | |
233 if a1 > a2: | |
234 a1,a2 = a2,a1 # so ordering consistent - 00,01,11 | |
235 kiddip = (a1,a2) # NSFW mating! | |
236 genos[2].append(kiddip) | |
237 if mendrate > 0: | |
238 if random.random() <= mendrate: | |
239 genos[2][snp] = makeTriomendel(genos[0][snp],genos[1][snp], kiddip) | |
240 achoice = alleles[snp] | |
241 for g in genos: # now convert to alleles using allele dict | |
242 a1 = achoice[g[snp][0]] # get allele letter | |
243 a2 = achoice[g[snp][1]] | |
244 g[snp] = '%s %s' % (a1,a2) | |
245 if missrate > 0: | |
246 genos = makeTriomissing(genos=genos,missrate=missrate, missval=missval) | |
247 family.append(dadped % (trio,' '.join(genos[0]))) # create a row for each member of trio | |
248 family.append(mumped % (trio,' '.join(genos[1]))) | |
249 family.append(kidped % (trio,sex,affected,' '.join(genos[2]))) | |
250 return family | |
251 | |
252 def makePerson(width=100, aff=1, freqs={}, alleles={}, id=1, missrate = 0.03, missval='0'): | |
253 """make an entire genotype vector for an independent subject""" | |
254 sex = random.choice((1,2)) | |
255 if not aff: | |
256 aff = random.choice((1,2)) | |
257 genos = [] #0/1 for common,rare initially, then xform to alleles | |
258 family = [] | |
259 personped = '%d 1 0 0 %d %d %s' | |
260 poly = (0,1) | |
261 for snp in xrange(width): | |
262 achoice = alleles[snp] | |
263 f = freqs[snp] | |
264 p = random.random() | |
265 a1 = a2 = 0 | |
266 if p <= f: # a rare allele | |
267 a1 = 1 | |
268 p = random.random() | |
269 if p <= f: # a rare allele | |
270 a2 = 1 | |
271 if a1 > a2: | |
272 a1,a2 = a2,a1 # so ordering consistent - 00,01,11 | |
273 a1 = achoice[a1] # get allele letter | |
274 a2 = achoice[a2] | |
275 g = '%s %s' % (a1,a2) | |
276 genos.append(g) | |
277 if missrate > 0.0: | |
278 genos = makeMissing(genos=genos,missrate=missrate, missval=missval) | |
279 family.append(personped % (id,sex,aff,' '.join(genos))) | |
280 return family | |
281 | |
282 def makeHapmap(fprefix= 'fakebigped',width=100, aff=[], freqs={}, | |
283 alleles={}, nsubj = 2000, trios = True, mendrate=0.03, missrate = 0.03, missval='0'): | |
284 """ fake a hapmap file and a pedigree file for eg haploview | |
285 this is arranged as the transpose of a ped file - cols are subjects, rows are markers | |
286 so we need to generate differently since we can't do the transpose in ram reliably for | |
287 a few billion genotypes... | |
288 """ | |
289 outheadprefix = 'rs# alleles chrom pos strand assembly# center protLSID assayLSID panelLSID QCcode %s' | |
290 cfake5 = ["illumina","urn:LSID:illumina.hapmap.org:Protocol:Golden_Gate_1.0.0:1", | |
291 "urn:LSID:illumina.hapmap.org:Assay:27741:1","urn:lsid:dcc.hapmap.org:Panel:CEPH-30-trios:1","QC+"] | |
292 yfake5 = ["illumina","urn:LSID:illumina.hapmap.org:Protocol:Golden_Gate_1.0.0:1", | |
293 "urn:LSID:illumina.hapmap.org:Assay:27741:1","urn:LSID:dcc.hapmap.org:Panel:Yoruba-30-trios:1","QC+"] | |
294 sampids = ids | |
295 if trios: | |
296 ts = '%d trios' % int(nsubj/3.) | |
297 else: | |
298 ts = '%d unrelated subjects' % nsubj | |
299 res = ['#%s fake hapmap file %d snps and %s, faked by %s' % (timenow(), width, ts, prog),] | |
300 res.append('# ross lazarus me fecit') | |
301 res.append(outheadprefix % ' '.join(sampids)) # make a header compatible with hapmap extracts | |
302 outf = open('%s.hmap' % (fprefix), 'w') | |
303 started = time.time() | |
304 if trios: | |
305 ntrios = int(nsubj/3.) | |
306 for n in ntrios: # each is a dict | |
307 row = copy.copy(cfake5) # get first fields | |
308 row = map(str,row) | |
309 if race == "YRI": | |
310 row += yfake5 | |
311 elif race == 'CEU': | |
312 row += cfake5 | |
313 else: | |
314 row += ['NA' for x in range(5)] # 5 dummy fields = center protLSID assayLSID panelLSID QCcode | |
315 row += [''.join(sorted(line[x])) for x in sampids] # the genotypes in header (sorted) sample id order | |
316 res.append(' '.join(row)) | |
317 res.append('') | |
318 outfname = '%s_%s_%s_%dkb.geno' % (gene,probeid,race,2*flank/1000) | |
319 f = file(outfname,'w') | |
320 f.write('\n'.join(res)) | |
321 f.close() | |
322 print '### %s: Wrote %d lines to %s' % (timenow(), len(res),outfname) | |
323 | |
324 | |
325 def makePed(fprefix= 'fakebigped', fpath='./', | |
326 width=500000, nsubj=2000, MAFdistribution=[],alleles={}, | |
327 freqs={}, fbatstyle=True, mendrate = 0.0, missrate = 0.03, missval='0',fbathead=''): | |
328 """fake trios with mendel consistent random mating genotypes in offspring | |
329 with consistent alleles and MAFs for the sample""" | |
330 res = [] | |
331 if fbatstyle: # add a header row with the marker names | |
332 res.append(fbathead) # header row for fbat | |
333 outfname = '%s.ped'% (fprefix) | |
334 outfname = os.path.join(fpath,outfname) | |
335 outf = open(outfname,'w') | |
336 ntrios = int(nsubj/3.) | |
337 outf = open(outfile, 'w') | |
338 started = time.time() | |
339 for trio in xrange(ntrios): | |
340 family = makeFam(width=width, freqs=freqs, alleles=alleles, trio=trio, | |
341 missrate = missrate, mendrate=mendrate, missval=missval) | |
342 res += family | |
343 if (trio + 1) % 10 == 0: # write out to keep ram requirements reasonable | |
344 if (trio + 1) % 50 == 0: # show progress | |
345 dur = time.time() - started | |
346 if dur == 0: | |
347 dur = 1.0 | |
348 print 'Trio: %d, %4.1f genos/sec at %6.1f sec' % (trio + 1, width*trio*3/dur, dur) | |
349 outf.write('\n'.join(res)) | |
350 outf.write('\n') | |
351 res = [] | |
352 if len(res) > 0: # some left | |
353 outf.write('\n'.join(res)) | |
354 outf.write('\n') | |
355 outf.close() | |
356 if debug: | |
357 print '##makeped : %6.1f seconds total runtime' % (time.time() - started) | |
358 | |
359 def makeIndep(fprefix = 'fakebigped', fpath='./', | |
360 width=500000, Nunaff=1000, Naff=1000, MAFdistribution=[], | |
361 alleles={}, freqs={}, fbatstyle=True, missrate = 0.03, missval='0',fbathead=''): | |
362 """fake a random sample from a random mating sample | |
363 with consistent alleles and MAFs""" | |
364 res = [] | |
365 Ntot = Nunaff + Naff | |
366 status = [1,]*Nunaff | |
367 status += [2,]*Nunaff | |
368 outf = '%s.ped' % (fprefix) | |
369 outf = os.path.join(fpath,outf) | |
370 outf = open(outf, 'w') | |
371 started = time.time() | |
372 #sample = personMaker(width=width, affs=status, freqs=freqs, alleles=alleles, Ntomake=Ntot) | |
373 if fbatstyle: # add a header row with the marker names | |
374 res.append(fbathead) # header row for fbat | |
375 for id in xrange(Ntot): | |
376 if id < Nunaff: | |
377 aff = 1 | |
378 else: | |
379 aff = 2 | |
380 family = makePerson(width=width, aff=aff, freqs=freqs, alleles=alleles, id=id+1) | |
381 res += family | |
382 if (id % 50 == 0): # write out to keep ram requirements reasonable | |
383 if (id % 200 == 0): # show progress | |
384 dur = time.time() - started | |
385 if dur == 0: | |
386 dur = 1.0 | |
387 print 'Id: %d, %4.1f genos/sec at %6.1f sec' % (id, width*id/dur, dur) | |
388 outf.write('\n'.join(res)) | |
389 outf.write('\n') | |
390 res = [] | |
391 if len(res) > 0: # some left | |
392 outf.write('\n'.join(res)) | |
393 outf.write('\n') | |
394 outf.close() | |
395 print '## makeindep: %6.1f seconds total runtime' % (time.time() - started) | |
396 | |
397 u = """ | |
398 Generate either trios or independent subjects with a prespecified | |
399 number of random alleles and a uniform or triangular MAF distribution for | |
400 stress testing. No LD is simulated - alleles are random. Offspring for | |
401 trios are generated by random mating the random parental alleles so there are | |
402 no Mendelian errors unless the -M option is used. Mendelian errors are generated | |
403 randomly according to the possible errors given the parental mating type although | |
404 this is fresh code and not guaranteed to work quite right yet - comments welcomed | |
405 | |
406 Enquiries to ross.lazarus@gmail.com | |
407 | |
408 eg to generate 700 trios with 500k snps, use: | |
409 fakebigped.py -n 2100 -s 500000 | |
410 or to generate 500 independent cases and 500 controls with 100k snps and 0.02 missingness (MCAR), use: | |
411 fakebigped.py -c 500 -n 1000 -s 100000 -m 0.02 | |
412 | |
413 fakebigped.py -o myfake -m 0.05 -s 100000 -n 2000 | |
414 will make fbat compatible myfake.ped with 100k markers in | |
415 666 trios (total close to 2000 subjects), a uniform MAF distribution and about 5% MCAR missing | |
416 | |
417 fakebigped.py -o myfake -m 0.05 -s 100000 -n 2000 -M 0.05 | |
418 will make fbat compatible myfake.ped with 100k markers in | |
419 666 trios (total close to 2000 subjects), a uniform MAF distribution, | |
420 about 5% Mendelian errors and about 5% MCAR missing | |
421 | |
422 | |
423 fakebigped.py -o myfakecc -m 0.05 -s 100000 -n 2000 -c 1000 -l | |
424 will make plink compatible myfakecc.ped and myfakecc.map (that's what the -l option does), | |
425 with 100k markers in 1000 cases and 1000 controls (affection status 2 and 1 respectively), | |
426 a triangular MAF distribution (more rare alleles) and about 5% MCAR missing | |
427 | |
428 You should see about 1/4 million genotypes/second so about an hour for a | |
429 500k snps in 2k subjects and about a 4GB ped file - these are BIG!! | |
430 | |
431 """ | |
432 | |
433 import sys, os, glob | |
434 | |
435 galhtmlprefix = """<?xml version="1.0" encoding="utf-8" ?> | |
436 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> | |
437 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | |
438 <head> | |
439 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | |
440 <meta name="generator" content="Galaxy %s tool output - see http://g2.trac.bx.psu.edu/" /> | |
441 <title></title> | |
442 <link rel="stylesheet" href="/static/style/base.css" type="text/css" /> | |
443 </head> | |
444 <body> | |
445 <div class="document"> | |
446 """ | |
447 | |
448 | |
449 def doImport(outfile=None,outpath=None): | |
450 """ import into one of the new html composite data types for Rgenetics | |
451 Dan Blankenberg with mods by Ross Lazarus | |
452 October 2007 | |
453 """ | |
454 flist = glob.glob(os.path.join(outpath,'*')) | |
455 outf = open(outfile,'w') | |
456 outf.write(galhtmlprefix % prog) | |
457 for i, data in enumerate( flist ): | |
458 outf.write('<li><a href="%s">%s</a></li>\n' % (os.path.split(data)[-1],os.path.split(data)[-1])) | |
459 outf.write('<br><h3>This is simulated null genotype data generated by Rgenetics!</h3>') | |
460 outf.write('%s called with command line:<br><pre>' % prog) | |
461 outf.write(' '.join(sys.argv)) | |
462 outf.write('\n</pre>\n') | |
463 outf.write("</div></body></html>") | |
464 outf.close() | |
465 | |
466 | |
467 | |
468 if __name__ == "__main__": | |
469 """ | |
470 """ | |
471 parser = OptionParser(usage=u, version="%prog 0.01") | |
472 a = parser.add_option | |
473 a("-n","--nsubjects",type="int",dest="Ntot", | |
474 help="nsubj: total number of subjects",default=2000) | |
475 a("-t","--title",dest="title", | |
476 help="title: file basename for outputs",default='fakeped') | |
477 a("-c","--cases",type="int",dest="Naff", | |
478 help="number of cases: independent subjects with status set to 2 (ie cases). If not set, NTOT/3 trios will be generated", default = 0) | |
479 a("-s","--snps",dest="width",type="int", | |
480 help="snps: total number of snps per subject", default=1000) | |
481 a("-d","--distribution",dest="MAFdist",default="Uniform", | |
482 help="MAF distribution - default is Uniform, can be Triangular") | |
483 a("-o","--outf",dest="outf", | |
484 help="Output file", default = 'fakeped') | |
485 a("-p","--outpath",dest="outpath", | |
486 help="Path for output files", default = './') | |
487 a("-l","--pLink",dest="outstyle", default='L', | |
488 help="Ped files as for Plink - no header, separate Map file - default is Plink style") | |
489 a("-w","--loWmaf", type="float", dest="lowmaf", default=0.01, help="Lower limit for SNP MAF (minor allele freq)") | |
490 a("-m","--missing",dest="missrate",type="float", | |
491 help="missing: probability of missing MCAR - default 0.0", default=0.0) | |
492 a("-v","--valmiss",dest="missval", | |
493 help="missing character: Missing allele code - usually 0 or N - default 0", default="0") | |
494 a("-M","--Mendelrate",dest="mendrate",type="float", | |
495 help="Mendelian error rate: probability of a mendel error per trio, default=0.0", default=0.0) | |
496 a("-H","--noHGRS",dest="useHG",type="int", | |
497 help="Use local copy of UCSC snp126 database to generate real rs numbers", default=True) | |
498 (options,args) = parser.parse_args() | |
499 low = options.lowmaf | |
500 try: | |
501 os.makedirs(options.outpath) | |
502 except: | |
503 pass | |
504 if options.MAFdist.upper() == 'U': | |
505 mafDist = makeUniformMAFdist(low=low, high=0.5) | |
506 else: | |
507 mafDist = makeTriangularMAFdist(low=low, high=0.5, beta=5) | |
508 alleles,freqs, rslist, chromlist, poslist = makeMap(width=int(options.width), | |
509 MAFdistribution=mafDist, useGP=False) | |
510 fbathead = [] | |
511 s = string.whitespace+string.punctuation | |
512 trantab = string.maketrans(s,'_'*len(s)) | |
513 title = string.translate(options.title,trantab) | |
514 | |
515 if options.outstyle == 'F': | |
516 fbatstyle = True | |
517 fbathead = makeFbathead(rslist=rslist, chromlist=chromlist, poslist=poslist, width=options.width) | |
518 else: | |
519 fbatstyle = False | |
520 writeMap(fprefix=defbasename, rslist=rslist, fpath=options.outpath, | |
521 chromlist=chromlist, poslist=poslist, width=options.width) | |
522 if options.Naff > 0: # make case control data | |
523 makeIndep(fprefix = defbasename, fpath=options.outpath, | |
524 width=options.width, Nunaff=options.Ntot-options.Naff, | |
525 Naff=options.Naff, MAFdistribution=mafDist,alleles=alleles, freqs=freqs, | |
526 fbatstyle=fbatstyle, missrate=options.missrate, missval=options.missval, | |
527 fbathead=fbathead) | |
528 else: | |
529 makePed(fprefix=defbasename, fpath=options.fpath, | |
530 width=options.width, MAFdistribution=mafDist, nsubj=options.Ntot, | |
531 alleles=alleles, freqs=freqs, fbatstyle=fbatstyle, missrate=options.missrate, | |
532 mendrate=options.mendrate, missval=options.missval, | |
533 fbathead=fbathead) | |
534 doImport(outfile=options.outf,outpath=options.outpath) | |
535 | |
536 | |
537 |