0
|
1 # modified may 2011 to name components (map/ped) as RgeneticsData to align with default base_name
|
|
2 # otherwise downstream tools fail
|
|
3 # modified march 2011 to remove post execution hook
|
|
4 # pedigree data faker
|
|
5 # specifically designed for scalability testing of
|
|
6 # Shaun Purcel's PLINK package
|
|
7 # derived from John Ziniti's original suggestion
|
|
8 # allele frequency spectrum and random mating added
|
|
9 # ross lazarus me fecit january 13 2007
|
|
10 # copyright ross lazarus 2007
|
|
11 # without psyco
|
|
12 # generates about 10k snp genotypes in 2k subjects (666 trios) per minute or so.
|
|
13 # so 500k (a billion genotypes), at about 4 trios/min will a couple of hours to generate
|
|
14 # psyco makes it literally twice as quick!!
|
|
15 # all rights reserved except as granted under the terms of the LGPL
|
|
16 # see http://www.gnu.org/licenses/lgpl.html
|
|
17 # for a copy of the license you receive with this software
|
|
18 # and for your rights and obligations
|
|
19 # especially if you wish to modify or redistribute this code
|
|
20 # january 19 added random missingness inducer
|
|
21 # currently about 15M genos/minute without psyco, 30M/minute with
|
|
22 # so a billion genos should take about 40 minutes with psyco or 80 without...
|
|
23 # added mendel error generator jan 23 rml
|
|
24
|
|
25
|
|
26 import random,sys,time,os,string
|
|
27
|
|
28 from optparse import OptionParser
|
|
29
|
|
30 defbasename="RgeneticsData"
|
|
31 width = 500000
|
|
32 ALLELES = ['1','2','3','4']
|
|
33 prog = os.path.split(sys.argv[0])[-1]
|
|
34 debug = 0
|
|
35
|
|
36 """Natural-order sorting, supporting embedded numbers.
|
|
37 # found at http://lists.canonical.org/pipermail/kragen-hacks/2005-October/000419.html
|
|
38 note test code there removed to conserve brain space
|
|
39 foo9bar2 < foo10bar2 < foo10bar10
|
|
40
|
|
41 """
|
|
42 import random, re, sys
|
|
43
|
|
44 def natsort_key(item):
|
|
45 chunks = re.split('(\d+(?:\.\d+)?)', item)
|
|
46 for ii in range(len(chunks)):
|
|
47 if chunks[ii] and chunks[ii][0] in '0123456789':
|
|
48 if '.' in chunks[ii]: numtype = float
|
|
49 else: numtype = int
|
|
50 # wrap in tuple with '0' to explicitly specify numbers come first
|
|
51 chunks[ii] = (0, numtype(chunks[ii]))
|
|
52 else:
|
|
53 chunks[ii] = (1, chunks[ii])
|
|
54 return (chunks, item)
|
|
55
|
|
56 def natsort(seq):
|
|
57 "Sort a sequence of text strings in a reasonable order."
|
|
58 alist = [item for item in seq]
|
|
59 alist.sort(key=natsort_key)
|
|
60 return alist
|
|
61
|
|
62
|
|
63 def makeUniformMAFdist(low=0.02, high=0.5):
|
|
64 """Fake a non-uniform maf distribution to make the data
|
|
65 more interesting. Provide uniform 0.02-0.5 distribution"""
|
|
66 MAFdistribution = []
|
|
67 for i in xrange(int(100*low),int(100*high)+1):
|
|
68 freq = i/100.0 # uniform
|
|
69 MAFdistribution.append(freq)
|
|
70 return MAFdistribution
|
|
71
|
|
72 def makeTriangularMAFdist(low=0.02, high=0.5, beta=5):
|
|
73 """Fake a non-uniform maf distribution to make the data
|
|
74 more interesting - more rare alleles """
|
|
75 MAFdistribution = []
|
|
76 for i in xrange(int(100*low),int(100*high)+1):
|
|
77 freq = (51 - i)/100.0 # large numbers of small allele freqs
|
|
78 for j in range(beta*i): # or i*i for crude exponential distribution
|
|
79 MAFdistribution.append(freq)
|
|
80 return MAFdistribution
|
|
81
|
|
82 def makeFbathead(rslist=[], chromlist=[], poslist=[], width=100000):
|
|
83 """header row
|
|
84 """
|
|
85 res = ['%s_%s_%s' % (chromlist[x], poslist[x], rslist[x]) for x in range(len(rslist))]
|
|
86 return ' '.join(res)
|
|
87
|
|
88 def makeMap( width=500000, MAFdistribution=[], useGP=False):
|
|
89 """make snp allele and frequency tables for consistent generation"""
|
|
90 usegp = 1
|
|
91 snpdb = 'snp126'
|
|
92 hgdb = 'hg18'
|
|
93 alleles = []
|
|
94 freqs = []
|
|
95 rslist = []
|
|
96 chromlist = []
|
|
97 poslist = []
|
|
98 for snp in range(width):
|
|
99 random.shuffle(ALLELES)
|
|
100 alleles.append(ALLELES[0:2]) # need two DIFFERENT alleles!
|
|
101 freqs.append(random.choice(MAFdistribution)) # more rare alleles
|
|
102 if useGP:
|
|
103 try:
|
|
104 import MySQLdb
|
|
105 genome = MySQLdb.Connect('localhost', 'hg18', 'G3gn0m3')
|
|
106 curs = genome.cursor() # use default cursor
|
|
107 except:
|
|
108 if debug:
|
|
109 print 'cannot connect to local copy of golden path'
|
|
110 usegp = 0
|
|
111 if usegp and useGP: # urrrgghh getting snps into chrom offset order is complicated....
|
|
112 curs.execute('use %s' % hgdb)
|
|
113 print 'Collecting %d real rs numbers - this may take a while' % width
|
|
114 # get a random draw of enough reasonable (hapmap) snps with frequency data
|
|
115 s = '''select distinct chrom,chromEnd, name from %s where avHet > 0 and chrom not like '%%random'
|
|
116 group by name order by rand() limit %d''' % (snpdb,width)
|
|
117 curs.execute(s)
|
|
118 reslist = curs.fetchall()
|
|
119 reslist = ['%s\t%09d\t%s' % (x[3:],y,z) for x,y,z in reslist] # get rid of chr
|
|
120 reslist = natsort(reslist)
|
|
121 for s in reslist:
|
|
122 chrom,pos,rs = s.split('\t')
|
|
123 rslist.append(rs)
|
|
124 chromlist.append(chrom)
|
|
125 poslist.append(pos)
|
|
126 else:
|
|
127 chrom = '1'
|
|
128 for snp in range(width):
|
|
129 pos = '%d' % (1000*snp)
|
|
130 rs = 'rs00%d' % snp
|
|
131 rslist.append(rs)
|
|
132 chromlist.append(chrom)
|
|
133 poslist.append(pos)
|
|
134 return alleles,freqs, rslist, chromlist, poslist
|
|
135
|
|
136 def writeMap(fprefix = '', fpath='./', rslist=[], chromlist=[], poslist=[], width = 500000):
|
|
137 """make a faked plink compatible map file - fbat files
|
|
138 have the map written as a header line"""
|
|
139 outf = '%s.map'% (fprefix)
|
|
140 outf = os.path.join(fpath,outf)
|
|
141 amap = open(outf, 'w')
|
|
142 res = ['%s\t%s\t0\t%s' % (chromlist[x],rslist[x],poslist[x]) for x in range(len(rslist))]
|
|
143 res.append('')
|
|
144 amap.write('\n'.join(res))
|
|
145 amap.close()
|
|
146
|
|
147 def makeMissing(genos=[], missrate = 0.03, missval = '0'):
|
|
148 """impose some random missingness"""
|
|
149 nsnps = len(genos)
|
|
150 for snp in range(nsnps): # ignore first 6 columns
|
|
151 if random.random() <= missrate:
|
|
152 genos[snp] = '%s %s' % (missval,missval)
|
|
153 return genos
|
|
154
|
|
155 def makeTriomissing(genos=[], missrate = 0.03, missval = '0'):
|
|
156 """impose some random missingness on a trio - moth eaten like real data"""
|
|
157 for person in (0,1):
|
|
158 nsnps = len(genos[person])
|
|
159 for snp in range(nsnps):
|
|
160 for person in [0,1,2]:
|
|
161 if random.random() <= missrate:
|
|
162 genos[person][snp] = '%s %s' % (missval,missval)
|
|
163 return genos
|
|
164
|
|
165
|
|
166 def makeTriomendel(p1g=(0,0),p2g=(0,0), kiddip = (0,0)):
|
|
167 """impose some random mendels on a trio
|
|
168 there are 8 of the 9 mating types we can simulate reasonable errors for
|
|
169 Note, since random mating dual het parents can produce any genotype we can't generate an interesting
|
|
170 error for them, so the overall mendel rate will be lower than mendrate, depending on
|
|
171 allele frequency..."""
|
|
172 if p1g[0] <> p1g[1] and p2g[0] <> p2g[1]: # both parents het
|
|
173 return kiddip # cannot simulate a mendel error - anything is legal!
|
|
174 elif (p1g[0] <> p1g[1]): # p1 is het parent so p2 must be hom
|
|
175 if p2g[0] == 0: # - make child p2 opposite hom for error
|
|
176 kiddip = (1,1)
|
|
177 else:
|
|
178 kiddip = (0,0)
|
|
179 elif (p2g[0] <> p2g[1]): # p2 is het parent so p1 must be hom
|
|
180 if p1g[0] == 0: # - make child p1 opposite hom for error
|
|
181 kiddip = (1,1)
|
|
182 else:
|
|
183 kiddip = (0,0)
|
|
184 elif (p1g[0] == p1g[1]): # p1 is hom parent and if we get here p2 must also be hom
|
|
185 if p1g[0] == p2g[0]: # both parents are same hom - make child either het or opposite hom for error
|
|
186 if random.random() <= 0.5:
|
|
187 kiddip = (0,1)
|
|
188 else:
|
|
189 if p1g[0] == 0:
|
|
190 kiddip = (1,1)
|
|
191 else:
|
|
192 kiddip = (0,0)
|
|
193 else: # parents are opposite hom - return any hom as an error
|
|
194 if random.random() <= 0.5:
|
|
195 kiddip = (0,0)
|
|
196 else:
|
|
197 kiddip = (1,1)
|
|
198 return kiddip
|
|
199
|
|
200
|
|
201
|
|
202
|
|
203 def makeFam(width=100, freqs={}, alleles={}, trio=1, missrate=0.03, missval='0', mendrate=0.0):
|
|
204 """this family is a simple trio, constructed by random mating two random genotypes
|
|
205 TODO: why not generate from chromosomes - eg hapmap
|
|
206 set each haplotype locus according to the conditional
|
|
207 probability implied by the surrounding loci - eg use both neighboring pairs, triplets
|
|
208 and quads as observed in hapmap ceu"""
|
|
209 dadped = '%d 1 0 0 1 1 %s'
|
|
210 mumped = '%d 2 0 0 2 1 %s' # a mother is a mum where I come from :)
|
|
211 kidped = '%d 3 1 2 %d %d %s'
|
|
212 family = [] # result accumulator
|
|
213 sex = random.choice((1,2)) # for the kid
|
|
214 affected = random.choice((1,2))
|
|
215 genos = [[],[],[]] # dad, mum, kid - 0/1 for common,rare initially, then xform to alleles
|
|
216 # parent1...kidn lists of 0/1 for common,rare initially, then xformed to alleles
|
|
217 for snp in xrange(width):
|
|
218 f = freqs[snp]
|
|
219 for i in range(2): # do dad and mum
|
|
220 p = random.random()
|
|
221 a1 = a2 = 0
|
|
222 if p <= f: # a rare allele
|
|
223 a1 = 1
|
|
224 p = random.random()
|
|
225 if p <= f: # a rare allele
|
|
226 a2 = 1
|
|
227 if a1 > a2:
|
|
228 a1,a2 = a2,a1 # so ordering consistent - 00,01,11
|
|
229 dip = (a1,a2)
|
|
230 genos[i].append(dip) # tuples of 0,1
|
|
231 a1 = random.choice(genos[0][snp]) # dad gamete
|
|
232 a2 = random.choice(genos[1][snp]) # mum gamete
|
|
233 if a1 > a2:
|
|
234 a1,a2 = a2,a1 # so ordering consistent - 00,01,11
|
|
235 kiddip = (a1,a2) # NSFW mating!
|
|
236 genos[2].append(kiddip)
|
|
237 if mendrate > 0:
|
|
238 if random.random() <= mendrate:
|
|
239 genos[2][snp] = makeTriomendel(genos[0][snp],genos[1][snp], kiddip)
|
|
240 achoice = alleles[snp]
|
|
241 for g in genos: # now convert to alleles using allele dict
|
|
242 a1 = achoice[g[snp][0]] # get allele letter
|
|
243 a2 = achoice[g[snp][1]]
|
|
244 g[snp] = '%s %s' % (a1,a2)
|
|
245 if missrate > 0:
|
|
246 genos = makeTriomissing(genos=genos,missrate=missrate, missval=missval)
|
|
247 family.append(dadped % (trio,' '.join(genos[0]))) # create a row for each member of trio
|
|
248 family.append(mumped % (trio,' '.join(genos[1])))
|
|
249 family.append(kidped % (trio,sex,affected,' '.join(genos[2])))
|
|
250 return family
|
|
251
|
|
252 def makePerson(width=100, aff=1, freqs={}, alleles={}, id=1, missrate = 0.03, missval='0'):
|
|
253 """make an entire genotype vector for an independent subject"""
|
|
254 sex = random.choice((1,2))
|
|
255 if not aff:
|
|
256 aff = random.choice((1,2))
|
|
257 genos = [] #0/1 for common,rare initially, then xform to alleles
|
|
258 family = []
|
|
259 personped = '%d 1 0 0 %d %d %s'
|
|
260 poly = (0,1)
|
|
261 for snp in xrange(width):
|
|
262 achoice = alleles[snp]
|
|
263 f = freqs[snp]
|
|
264 p = random.random()
|
|
265 a1 = a2 = 0
|
|
266 if p <= f: # a rare allele
|
|
267 a1 = 1
|
|
268 p = random.random()
|
|
269 if p <= f: # a rare allele
|
|
270 a2 = 1
|
|
271 if a1 > a2:
|
|
272 a1,a2 = a2,a1 # so ordering consistent - 00,01,11
|
|
273 a1 = achoice[a1] # get allele letter
|
|
274 a2 = achoice[a2]
|
|
275 g = '%s %s' % (a1,a2)
|
|
276 genos.append(g)
|
|
277 if missrate > 0.0:
|
|
278 genos = makeMissing(genos=genos,missrate=missrate, missval=missval)
|
|
279 family.append(personped % (id,sex,aff,' '.join(genos)))
|
|
280 return family
|
|
281
|
|
282 def makeHapmap(fprefix= 'fakebigped',width=100, aff=[], freqs={},
|
|
283 alleles={}, nsubj = 2000, trios = True, mendrate=0.03, missrate = 0.03, missval='0'):
|
|
284 """ fake a hapmap file and a pedigree file for eg haploview
|
|
285 this is arranged as the transpose of a ped file - cols are subjects, rows are markers
|
|
286 so we need to generate differently since we can't do the transpose in ram reliably for
|
|
287 a few billion genotypes...
|
|
288 """
|
|
289 outheadprefix = 'rs# alleles chrom pos strand assembly# center protLSID assayLSID panelLSID QCcode %s'
|
|
290 cfake5 = ["illumina","urn:LSID:illumina.hapmap.org:Protocol:Golden_Gate_1.0.0:1",
|
|
291 "urn:LSID:illumina.hapmap.org:Assay:27741:1","urn:lsid:dcc.hapmap.org:Panel:CEPH-30-trios:1","QC+"]
|
|
292 yfake5 = ["illumina","urn:LSID:illumina.hapmap.org:Protocol:Golden_Gate_1.0.0:1",
|
|
293 "urn:LSID:illumina.hapmap.org:Assay:27741:1","urn:LSID:dcc.hapmap.org:Panel:Yoruba-30-trios:1","QC+"]
|
|
294 sampids = ids
|
|
295 if trios:
|
|
296 ts = '%d trios' % int(nsubj/3.)
|
|
297 else:
|
|
298 ts = '%d unrelated subjects' % nsubj
|
|
299 res = ['#%s fake hapmap file %d snps and %s, faked by %s' % (timenow(), width, ts, prog),]
|
|
300 res.append('# ross lazarus me fecit')
|
|
301 res.append(outheadprefix % ' '.join(sampids)) # make a header compatible with hapmap extracts
|
|
302 outf = open('%s.hmap' % (fprefix), 'w')
|
|
303 started = time.time()
|
|
304 if trios:
|
|
305 ntrios = int(nsubj/3.)
|
|
306 for n in ntrios: # each is a dict
|
|
307 row = copy.copy(cfake5) # get first fields
|
|
308 row = map(str,row)
|
|
309 if race == "YRI":
|
|
310 row += yfake5
|
|
311 elif race == 'CEU':
|
|
312 row += cfake5
|
|
313 else:
|
|
314 row += ['NA' for x in range(5)] # 5 dummy fields = center protLSID assayLSID panelLSID QCcode
|
|
315 row += [''.join(sorted(line[x])) for x in sampids] # the genotypes in header (sorted) sample id order
|
|
316 res.append(' '.join(row))
|
|
317 res.append('')
|
|
318 outfname = '%s_%s_%s_%dkb.geno' % (gene,probeid,race,2*flank/1000)
|
|
319 f = file(outfname,'w')
|
|
320 f.write('\n'.join(res))
|
|
321 f.close()
|
|
322 print '### %s: Wrote %d lines to %s' % (timenow(), len(res),outfname)
|
|
323
|
|
324
|
|
325 def makePed(fprefix= 'fakebigped', fpath='./',
|
|
326 width=500000, nsubj=2000, MAFdistribution=[],alleles={},
|
|
327 freqs={}, fbatstyle=True, mendrate = 0.0, missrate = 0.03, missval='0',fbathead=''):
|
|
328 """fake trios with mendel consistent random mating genotypes in offspring
|
|
329 with consistent alleles and MAFs for the sample"""
|
|
330 res = []
|
|
331 if fbatstyle: # add a header row with the marker names
|
|
332 res.append(fbathead) # header row for fbat
|
|
333 outfname = '%s.ped'% (fprefix)
|
|
334 outfname = os.path.join(fpath,outfname)
|
|
335 outf = open(outfname,'w')
|
|
336 ntrios = int(nsubj/3.)
|
|
337 outf = open(outfile, 'w')
|
|
338 started = time.time()
|
|
339 for trio in xrange(ntrios):
|
|
340 family = makeFam(width=width, freqs=freqs, alleles=alleles, trio=trio,
|
|
341 missrate = missrate, mendrate=mendrate, missval=missval)
|
|
342 res += family
|
|
343 if (trio + 1) % 10 == 0: # write out to keep ram requirements reasonable
|
|
344 if (trio + 1) % 50 == 0: # show progress
|
|
345 dur = time.time() - started
|
|
346 if dur == 0:
|
|
347 dur = 1.0
|
|
348 print 'Trio: %d, %4.1f genos/sec at %6.1f sec' % (trio + 1, width*trio*3/dur, dur)
|
|
349 outf.write('\n'.join(res))
|
|
350 outf.write('\n')
|
|
351 res = []
|
|
352 if len(res) > 0: # some left
|
|
353 outf.write('\n'.join(res))
|
|
354 outf.write('\n')
|
|
355 outf.close()
|
|
356 if debug:
|
|
357 print '##makeped : %6.1f seconds total runtime' % (time.time() - started)
|
|
358
|
|
359 def makeIndep(fprefix = 'fakebigped', fpath='./',
|
|
360 width=500000, Nunaff=1000, Naff=1000, MAFdistribution=[],
|
|
361 alleles={}, freqs={}, fbatstyle=True, missrate = 0.03, missval='0',fbathead=''):
|
|
362 """fake a random sample from a random mating sample
|
|
363 with consistent alleles and MAFs"""
|
|
364 res = []
|
|
365 Ntot = Nunaff + Naff
|
|
366 status = [1,]*Nunaff
|
|
367 status += [2,]*Nunaff
|
|
368 outf = '%s.ped' % (fprefix)
|
|
369 outf = os.path.join(fpath,outf)
|
|
370 outf = open(outf, 'w')
|
|
371 started = time.time()
|
|
372 #sample = personMaker(width=width, affs=status, freqs=freqs, alleles=alleles, Ntomake=Ntot)
|
|
373 if fbatstyle: # add a header row with the marker names
|
|
374 res.append(fbathead) # header row for fbat
|
|
375 for id in xrange(Ntot):
|
|
376 if id < Nunaff:
|
|
377 aff = 1
|
|
378 else:
|
|
379 aff = 2
|
|
380 family = makePerson(width=width, aff=aff, freqs=freqs, alleles=alleles, id=id+1)
|
|
381 res += family
|
|
382 if (id % 50 == 0): # write out to keep ram requirements reasonable
|
|
383 if (id % 200 == 0): # show progress
|
|
384 dur = time.time() - started
|
|
385 if dur == 0:
|
|
386 dur = 1.0
|
|
387 print 'Id: %d, %4.1f genos/sec at %6.1f sec' % (id, width*id/dur, dur)
|
|
388 outf.write('\n'.join(res))
|
|
389 outf.write('\n')
|
|
390 res = []
|
|
391 if len(res) > 0: # some left
|
|
392 outf.write('\n'.join(res))
|
|
393 outf.write('\n')
|
|
394 outf.close()
|
|
395 print '## makeindep: %6.1f seconds total runtime' % (time.time() - started)
|
|
396
|
|
397 u = """
|
|
398 Generate either trios or independent subjects with a prespecified
|
|
399 number of random alleles and a uniform or triangular MAF distribution for
|
|
400 stress testing. No LD is simulated - alleles are random. Offspring for
|
|
401 trios are generated by random mating the random parental alleles so there are
|
|
402 no Mendelian errors unless the -M option is used. Mendelian errors are generated
|
|
403 randomly according to the possible errors given the parental mating type although
|
|
404 this is fresh code and not guaranteed to work quite right yet - comments welcomed
|
|
405
|
|
406 Enquiries to ross.lazarus@gmail.com
|
|
407
|
|
408 eg to generate 700 trios with 500k snps, use:
|
|
409 fakebigped.py -n 2100 -s 500000
|
|
410 or to generate 500 independent cases and 500 controls with 100k snps and 0.02 missingness (MCAR), use:
|
|
411 fakebigped.py -c 500 -n 1000 -s 100000 -m 0.02
|
|
412
|
|
413 fakebigped.py -o myfake -m 0.05 -s 100000 -n 2000
|
|
414 will make fbat compatible myfake.ped with 100k markers in
|
|
415 666 trios (total close to 2000 subjects), a uniform MAF distribution and about 5% MCAR missing
|
|
416
|
|
417 fakebigped.py -o myfake -m 0.05 -s 100000 -n 2000 -M 0.05
|
|
418 will make fbat compatible myfake.ped with 100k markers in
|
|
419 666 trios (total close to 2000 subjects), a uniform MAF distribution,
|
|
420 about 5% Mendelian errors and about 5% MCAR missing
|
|
421
|
|
422
|
|
423 fakebigped.py -o myfakecc -m 0.05 -s 100000 -n 2000 -c 1000 -l
|
|
424 will make plink compatible myfakecc.ped and myfakecc.map (that's what the -l option does),
|
|
425 with 100k markers in 1000 cases and 1000 controls (affection status 2 and 1 respectively),
|
|
426 a triangular MAF distribution (more rare alleles) and about 5% MCAR missing
|
|
427
|
|
428 You should see about 1/4 million genotypes/second so about an hour for a
|
|
429 500k snps in 2k subjects and about a 4GB ped file - these are BIG!!
|
|
430
|
|
431 """
|
|
432
|
|
433 import sys, os, glob
|
|
434
|
|
435 galhtmlprefix = """<?xml version="1.0" encoding="utf-8" ?>
|
|
436 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
|
437 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
|
|
438 <head>
|
|
439 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
|
|
440 <meta name="generator" content="Galaxy %s tool output - see http://g2.trac.bx.psu.edu/" />
|
|
441 <title></title>
|
|
442 <link rel="stylesheet" href="/static/style/base.css" type="text/css" />
|
|
443 </head>
|
|
444 <body>
|
|
445 <div class="document">
|
|
446 """
|
|
447
|
|
448
|
|
449 def doImport(outfile=None,outpath=None):
|
|
450 """ import into one of the new html composite data types for Rgenetics
|
|
451 Dan Blankenberg with mods by Ross Lazarus
|
|
452 October 2007
|
|
453 """
|
|
454 flist = glob.glob(os.path.join(outpath,'*'))
|
|
455 outf = open(outfile,'w')
|
|
456 outf.write(galhtmlprefix % prog)
|
|
457 for i, data in enumerate( flist ):
|
|
458 outf.write('<li><a href="%s">%s</a></li>\n' % (os.path.split(data)[-1],os.path.split(data)[-1]))
|
|
459 outf.write('<br><h3>This is simulated null genotype data generated by Rgenetics!</h3>')
|
|
460 outf.write('%s called with command line:<br><pre>' % prog)
|
|
461 outf.write(' '.join(sys.argv))
|
|
462 outf.write('\n</pre>\n')
|
|
463 outf.write("</div></body></html>")
|
|
464 outf.close()
|
|
465
|
|
466
|
|
467
|
|
468 if __name__ == "__main__":
|
|
469 """
|
|
470 """
|
|
471 parser = OptionParser(usage=u, version="%prog 0.01")
|
|
472 a = parser.add_option
|
|
473 a("-n","--nsubjects",type="int",dest="Ntot",
|
|
474 help="nsubj: total number of subjects",default=2000)
|
|
475 a("-t","--title",dest="title",
|
|
476 help="title: file basename for outputs",default='fakeped')
|
|
477 a("-c","--cases",type="int",dest="Naff",
|
|
478 help="number of cases: independent subjects with status set to 2 (ie cases). If not set, NTOT/3 trios will be generated", default = 0)
|
|
479 a("-s","--snps",dest="width",type="int",
|
|
480 help="snps: total number of snps per subject", default=1000)
|
|
481 a("-d","--distribution",dest="MAFdist",default="Uniform",
|
|
482 help="MAF distribution - default is Uniform, can be Triangular")
|
|
483 a("-o","--outf",dest="outf",
|
|
484 help="Output file", default = 'fakeped')
|
|
485 a("-p","--outpath",dest="outpath",
|
|
486 help="Path for output files", default = './')
|
|
487 a("-l","--pLink",dest="outstyle", default='L',
|
|
488 help="Ped files as for Plink - no header, separate Map file - default is Plink style")
|
|
489 a("-w","--loWmaf", type="float", dest="lowmaf", default=0.01, help="Lower limit for SNP MAF (minor allele freq)")
|
|
490 a("-m","--missing",dest="missrate",type="float",
|
|
491 help="missing: probability of missing MCAR - default 0.0", default=0.0)
|
|
492 a("-v","--valmiss",dest="missval",
|
|
493 help="missing character: Missing allele code - usually 0 or N - default 0", default="0")
|
|
494 a("-M","--Mendelrate",dest="mendrate",type="float",
|
|
495 help="Mendelian error rate: probability of a mendel error per trio, default=0.0", default=0.0)
|
|
496 a("-H","--noHGRS",dest="useHG",type="int",
|
|
497 help="Use local copy of UCSC snp126 database to generate real rs numbers", default=True)
|
|
498 (options,args) = parser.parse_args()
|
|
499 low = options.lowmaf
|
|
500 try:
|
|
501 os.makedirs(options.outpath)
|
|
502 except:
|
|
503 pass
|
|
504 if options.MAFdist.upper() == 'U':
|
|
505 mafDist = makeUniformMAFdist(low=low, high=0.5)
|
|
506 else:
|
|
507 mafDist = makeTriangularMAFdist(low=low, high=0.5, beta=5)
|
|
508 alleles,freqs, rslist, chromlist, poslist = makeMap(width=int(options.width),
|
|
509 MAFdistribution=mafDist, useGP=False)
|
|
510 fbathead = []
|
|
511 s = string.whitespace+string.punctuation
|
|
512 trantab = string.maketrans(s,'_'*len(s))
|
|
513 title = string.translate(options.title,trantab)
|
|
514
|
|
515 if options.outstyle == 'F':
|
|
516 fbatstyle = True
|
|
517 fbathead = makeFbathead(rslist=rslist, chromlist=chromlist, poslist=poslist, width=options.width)
|
|
518 else:
|
|
519 fbatstyle = False
|
|
520 writeMap(fprefix=defbasename, rslist=rslist, fpath=options.outpath,
|
|
521 chromlist=chromlist, poslist=poslist, width=options.width)
|
|
522 if options.Naff > 0: # make case control data
|
|
523 makeIndep(fprefix = defbasename, fpath=options.outpath,
|
|
524 width=options.width, Nunaff=options.Ntot-options.Naff,
|
|
525 Naff=options.Naff, MAFdistribution=mafDist,alleles=alleles, freqs=freqs,
|
|
526 fbatstyle=fbatstyle, missrate=options.missrate, missval=options.missval,
|
|
527 fbathead=fbathead)
|
|
528 else:
|
|
529 makePed(fprefix=defbasename, fpath=options.fpath,
|
|
530 width=options.width, MAFdistribution=mafDist, nsubj=options.Ntot,
|
|
531 alleles=alleles, freqs=freqs, fbatstyle=fbatstyle, missrate=options.missrate,
|
|
532 mendrate=options.mendrate, missval=options.missval,
|
|
533 fbathead=fbathead)
|
|
534 doImport(outfile=options.outf,outpath=options.outpath)
|
|
535
|
|
536
|
|
537
|