Mercurial > repos > xuebing > sharplabtool
comparison tools/rgenetics/rgPedSub.py @ 0:9071e359b9a3
Uploaded
author | xuebing |
---|---|
date | Fri, 09 Mar 2012 19:37:19 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:9071e359b9a3 |
---|---|
1 """ | |
2 July 1 2009 added relatedness filter - fo/oo or all | |
3 released under the terms of the LGPL | |
4 copyright ross lazarus August 2007 | |
5 for the rgenetics project | |
6 | |
7 Special galaxy tool for the camp2007 data | |
8 Allows grabbing genotypes from an arbitrary region | |
9 | |
10 Needs a mongo results file in the location hardwired below or could be passed in as | |
11 a library parameter - but this file must have a very specific structure | |
12 rs chrom offset float1...floatn | |
13 | |
14 called as | |
15 | |
16 <command interpreter="python2.4"> | |
17 campRGeno2.py $region "$rslist" "$title" $output1 $log_file $userId "$lpedIn" "$lhistIn" | |
18 </command> | |
19 | |
20 | |
21 """ | |
22 | |
23 | |
24 import sys, array, os, string | |
25 from rgutils import galhtmlprefix,plinke,readMap | |
26 | |
27 progname = os.path.split(sys.argv[0])[1] | |
28 | |
29 | |
30 atrandic = {'A':'1','C':'2','G':'3','T':'4','N':'0','-':'0','1':'1','2':'2','3':'3','4':'4','0':'0'} | |
31 | |
32 def doImport(outfile='test',flist=[]): | |
33 """ import into one of the new html composite data types for Rgenetics | |
34 Dan Blankenberg with mods by Ross Lazarus | |
35 October 2007 | |
36 """ | |
37 out = open(outfile,'w') | |
38 out.write(galhtmlprefix % progname) | |
39 | |
40 if len(flist) > 0: | |
41 out.write('<ol>\n') | |
42 for i, data in enumerate( flist ): | |
43 out.write('<li><a href="%s">%s</a></li>\n' % (os.path.split(data)[-1],os.path.split(data)[-1])) | |
44 out.write('</ol>\n') | |
45 else: | |
46 out.write('No files found') | |
47 out.write("</div></body></html>") | |
48 out.close() | |
49 | |
50 def setupPedFilter(relfilter='oo',dfile=None): | |
51 """ figure out who to drop to satisfy relative filtering | |
52 note single offspring only from each family id | |
53 ordering of pdict keys makes this 'random' as the first one only is | |
54 kept if there are multiple sibs from same familyid. | |
55 """ | |
56 dropId = {} | |
57 keepoff = (relfilter == 'oo') | |
58 keepfounder = (relfilter == 'fo') | |
59 pdict = {} | |
60 for row in dfile: | |
61 rowl = row.strip().split() | |
62 if len(rowl) > 6: | |
63 idk = (rowl[0],rowl[1]) | |
64 pa = (rowl[0],rowl[2]) # key for father | |
65 ma = (rowl[0],rowl[3]) # and mother | |
66 pdict[idk] = (pa,ma) | |
67 dfile.seek(0) # rewind | |
68 pk = pdict.keys() | |
69 for p in pk: | |
70 parents = pdict[p] | |
71 if pdict.get(parents[0],None) or pdict.get(parents[1],None): # parents are in this file | |
72 if keepfounder: | |
73 dropId[p] = 1 # flag for removal | |
74 elif keepoff: | |
75 dropId[p] = 1 # flag for removal | |
76 if keepoff: # TODO keep only a random offspring if many - rely on pdict keys being randomly ordered...! | |
77 famseen = {} | |
78 for p in pk: # look for multiples from same family - drop all but first | |
79 famid = p[0] | |
80 if famseen.get(famid,None): | |
81 dropId[p] = 1 # already got one from this family | |
82 famseen.setdefault(famid,1) | |
83 return dropId | |
84 | |
85 def writeFped(rslist=[],outdir=None,title='Title',basename='',dfile=None,wewant=[],dropId={},outfile=None,logfile=None): | |
86 """ fbat format version | |
87 """ | |
88 outname = os.path.join(outdir,basename) | |
89 pedfname = '%s.ped' % outname | |
90 ofile = file(pedfname, 'w') | |
91 rsl = ' '.join(rslist) # rslist for fbat | |
92 ofile.write(rsl) | |
93 s = 'wrote %d marker header to %s - %s\n' % (len(rslist),pedfname,rsl[:50]) | |
94 lf.write(s) | |
95 ofile.write('\n') | |
96 nrows = 0 | |
97 for line in dfile: | |
98 line = line.strip() | |
99 if not line: | |
100 continue | |
101 line = line.replace('D','N') | |
102 fields = line.split() | |
103 preamble = fields[:6] | |
104 idk = (preamble[0],preamble[1]) | |
105 dropme = dropId.get(idk,None) | |
106 if not dropme: | |
107 g = ['%s %s' % (fields[snpcol], fields[snpcol+1]) for snpcol in wewant] | |
108 g = ' '.join(g) | |
109 g = g.split() # we'll get there | |
110 g = [atrandic.get(x,'0') for x in g] # numeric alleles... | |
111 # hack for framingham ND | |
112 ofile.write('%s %s\n' % (' '.join(preamble), ' '.join(g))) | |
113 nrows += 1 | |
114 ofile.close() | |
115 loglist = open(logfile,'r').readlines() # retrieve log to add to html file | |
116 doImport(outfile,[pedfname,],loglist=loglist) | |
117 return nrows,pedfname | |
118 | |
119 def writePed(markers=[],outdir=None,title='Title',basename='',dfile=None,wewant=[],dropId={},outfile=None,logfile=None): | |
120 """ split out | |
121 """ | |
122 outname = os.path.join(outdir,basename) | |
123 mapfname = '%s.map' % outname | |
124 pedfname = '%s.ped' % outname | |
125 ofile = file(pedfname, 'w') | |
126 # make a map file in the lped library | |
127 mf = file(mapfname,'w') | |
128 map = ['%s\t%s\t0\t%d' % (x[0],x[2],x[1]) for x in markers] # chrom,abspos,snp in genomic order | |
129 mf.write('%s\n' % '\n'.join(map)) | |
130 mf.close() | |
131 nrows = 0 | |
132 for line in dfile: | |
133 line = line.strip() | |
134 if not line: | |
135 continue | |
136 #line = line.replace('D','N') | |
137 fields = line.split() | |
138 preamble = fields[:6] | |
139 idk = (preamble[0],preamble[1]) | |
140 dropme = dropId.get(idk,None) | |
141 if not dropme: | |
142 g = ['%s %s' % (fields[snpcol], fields[snpcol+1]) for snpcol in wewant] | |
143 g = ' '.join(g) | |
144 g = g.split() # we'll get there | |
145 g = [atrandic.get(x,'0') for x in g] # numeric alleles... | |
146 # hack for framingham ND | |
147 ofile.write('%s %s\n' % (' '.join(preamble), ' '.join(g))) | |
148 nrows += 1 | |
149 ofile.close() | |
150 loglist = open(logfile,'r').readlines() # retrieve log to add to html file | |
151 doImport(outfile,[mapfname,pedfname,logfile]) | |
152 return nrows,pedfname | |
153 | |
154 def subset(): | |
155 """ ### Sanity check the arguments | |
156 now passed in as | |
157 <command interpreter="python"> | |
158 rgPedSub.py $script_file | |
159 </command> | |
160 | |
161 with | |
162 <configfiles> | |
163 <configfile name="script_file"> | |
164 title~~~~$title | |
165 output1~~~~$output1 | |
166 log_file~~~~$log_file | |
167 userId~~~~$userId | |
168 outformat~~~~$outformat | |
169 outdir~~~~$output1.extra_files_path | |
170 relfilter~~~~$relfilter | |
171 #if $d.source=='library' | |
172 inped~~~~$d.lpedIn | |
173 #else | |
174 inped~~~~$d.lhistIn.extra_files_path/$d.lhistIn.metadata.base_name | |
175 #end if | |
176 #if $m.mtype=='grslist' | |
177 rslist~~~~$m.rslist | |
178 region~~~~ | |
179 #else | |
180 rslist~~~~ | |
181 region~~~~$m.region | |
182 #end if | |
183 </configfile> | |
184 </configfiles> | |
185 """ | |
186 sep = '~~~~' # arbitrary choice | |
187 conf = {} | |
188 if len(sys.argv) < 2: | |
189 print >> sys.stderr, "No configuration file passed as a parameter - cannot run" | |
190 sys.exit(1) | |
191 configf = sys.argv[1] | |
192 config = file(configf,'r').readlines() | |
193 for row in config: | |
194 row = row.strip() | |
195 if len(row) > 0: | |
196 try: | |
197 key,valu = row.split(sep) | |
198 conf[key] = valu | |
199 except: | |
200 pass | |
201 ss = '%s%s' % (string.punctuation,string.whitespace) | |
202 ptran = string.maketrans(ss,'_'*len(ss)) | |
203 ### Figure out what genomic region we are interested in | |
204 region = conf.get('region','') | |
205 orslist = conf.get('rslist','').replace('X',' ').lower() | |
206 orslist = orslist.replace(',',' ').lower() | |
207 # galaxy replaces newlines with XX - go figure | |
208 title = conf.get('title','').translate(ptran) # for outputs | |
209 outfile = conf.get('output1','') | |
210 outdir = conf.get('outdir','') | |
211 try: | |
212 os.makedirs(outdir) | |
213 except: | |
214 pass | |
215 outformat = conf.get('outformat','lped') | |
216 basename = conf.get('basename',title) | |
217 logfile = os.path.join(outdir,'%s.log' % title) | |
218 userId = conf.get('userId','') # for library | |
219 pedFileBase = conf.get('inped','') | |
220 relfilter = conf.get('relfilter','') | |
221 MAP_FILE = '%s.map' % pedFileBase | |
222 DATA_FILE = '%s.ped' % pedFileBase | |
223 title = conf.get('title','lped subset') | |
224 lf = file(logfile,'w') | |
225 lf.write('config file %s = \n' % configf) | |
226 lf.write(''.join(config)) | |
227 c = '' | |
228 spos = epos = 0 | |
229 rslist = [] | |
230 rsdict = {} | |
231 if region > '': | |
232 try: # TODO make a regexp? | |
233 c,rest = region.split(':') | |
234 c = c.replace('chr','') | |
235 rest = rest.replace(',','') # remove commas | |
236 spos,epos = rest.split('-') | |
237 spos = int(spos) | |
238 epos = int(epos) | |
239 s = '## %s parsing chrom %s from %d to %d\n' % (progname,c,spos,epos) | |
240 lf.write(s) | |
241 except: | |
242 s = '##! %s unable to parse region %s - MUST look like "chr8:10,000-100,000\n' % (progname,region) | |
243 lf.write(s) | |
244 lf.close() | |
245 sys.exit(1) | |
246 else: | |
247 rslist = orslist.split() # galaxy replaces newlines with XX - go figure | |
248 rsdict = dict(zip(rslist,rslist)) | |
249 allmarkers = False | |
250 if len(rslist) == 0 and epos == 0: # must be a full extract - presumably remove relateds or something | |
251 allmarkers = True | |
252 ### Figure out which markers are in this region | |
253 markers,snpcols,rslist,rsdict = readMap(mapfile=MAP_FILE,allmarkers=allmarkers,rsdict=rsdict,c=c,spos=spos,epos=epos) | |
254 if len(rslist) == 0: | |
255 s = '##! %s found no rs numbers in %s\n' % (progname,sys.argv[1:3]) | |
256 lf.write(s) | |
257 lf.write('\n') | |
258 lf.close() | |
259 sys.exit(1) | |
260 s = '## %s looking for %d rs (%s....etc)\n' % (progname,len(rslist),rslist[:5]) | |
261 lf.write(s) | |
262 try: | |
263 dfile = open(DATA_FILE, 'r') | |
264 except: # bad input file name? | |
265 s = '##! rgPedSub unable to open file %s\n' % (DATA_FILE) | |
266 lf.write(s) | |
267 lf.write('\n') | |
268 lf.close() | |
269 print >> sys.stdout, s | |
270 raise | |
271 sys.exit(1) | |
272 if relfilter <> 'all': # must read pedigree and figure out who to drop | |
273 dropId = setupPedFilter(relfilter=relfilter,dfile=dfile) | |
274 else: | |
275 dropId = {} | |
276 wewant = [(6+(2*snpcols[x])) for x in rslist] # | |
277 # column indices of first geno of each marker pair to get the markers into genomic | |
278 ### ... and then parse the rest of the ped file to pull out | |
279 ### the genotypes for all subjects for those markers | |
280 # /usr/local/galaxy/data/rg/1/lped/ | |
281 if len(dropId.keys()) > 0: | |
282 s = '## dropped the following subjects to satisfy requirement that relfilter = %s\n' % relfilter | |
283 lf.write(s) | |
284 if relfilter == 'oo': | |
285 s = '## note that one random offspring from each family was kept if there were multiple offspring\n' | |
286 lf.write(s) | |
287 s = 'FamilyId\tSubjectId\n' | |
288 lf.write(s) | |
289 dk = dropId.keys() | |
290 dk.sort() | |
291 for k in dk: | |
292 s = '%s\t%s\n' % (k[0],k[1]) | |
293 lf.write(s) | |
294 lf.write('\n') | |
295 lf.close() | |
296 if outformat == 'lped': | |
297 nrows,pedfname=writePed(markers=markers,outdir=outdir,title=title,basename=basename,dfile=dfile, | |
298 wewant=wewant,dropId=dropId,outfile=outfile,logfile=logfile) | |
299 elif outformat == 'fped': | |
300 nrows,pedfname=writeFped(rslist=rslist,outdir=outdir,title=title,basename=basename,dfile=dfile, | |
301 wewant=wewant,dropId=dropId,outfile=outfile,logfile=logfile) | |
302 dfile.close() | |
303 | |
304 if __name__ == "__main__": | |
305 subset() |