comparison tools/rgenetics/rgPedSub.py @ 0:9071e359b9a3

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:37:19 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:9071e359b9a3
1 """
2 July 1 2009 added relatedness filter - fo/oo or all
3 released under the terms of the LGPL
4 copyright ross lazarus August 2007
5 for the rgenetics project
6
7 Special galaxy tool for the camp2007 data
8 Allows grabbing genotypes from an arbitrary region
9
10 Needs a mongo results file in the location hardwired below or could be passed in as
11 a library parameter - but this file must have a very specific structure
12 rs chrom offset float1...floatn
13
14 called as
15
16 <command interpreter="python2.4">
17 campRGeno2.py $region "$rslist" "$title" $output1 $log_file $userId "$lpedIn" "$lhistIn"
18 </command>
19
20
21 """
22
23
24 import sys, array, os, string
25 from rgutils import galhtmlprefix,plinke,readMap
26
27 progname = os.path.split(sys.argv[0])[1]
28
29
30 atrandic = {'A':'1','C':'2','G':'3','T':'4','N':'0','-':'0','1':'1','2':'2','3':'3','4':'4','0':'0'}
31
32 def doImport(outfile='test',flist=[]):
33 """ import into one of the new html composite data types for Rgenetics
34 Dan Blankenberg with mods by Ross Lazarus
35 October 2007
36 """
37 out = open(outfile,'w')
38 out.write(galhtmlprefix % progname)
39
40 if len(flist) > 0:
41 out.write('<ol>\n')
42 for i, data in enumerate( flist ):
43 out.write('<li><a href="%s">%s</a></li>\n' % (os.path.split(data)[-1],os.path.split(data)[-1]))
44 out.write('</ol>\n')
45 else:
46 out.write('No files found')
47 out.write("</div></body></html>")
48 out.close()
49
50 def setupPedFilter(relfilter='oo',dfile=None):
51 """ figure out who to drop to satisfy relative filtering
52 note single offspring only from each family id
53 ordering of pdict keys makes this 'random' as the first one only is
54 kept if there are multiple sibs from same familyid.
55 """
56 dropId = {}
57 keepoff = (relfilter == 'oo')
58 keepfounder = (relfilter == 'fo')
59 pdict = {}
60 for row in dfile:
61 rowl = row.strip().split()
62 if len(rowl) > 6:
63 idk = (rowl[0],rowl[1])
64 pa = (rowl[0],rowl[2]) # key for father
65 ma = (rowl[0],rowl[3]) # and mother
66 pdict[idk] = (pa,ma)
67 dfile.seek(0) # rewind
68 pk = pdict.keys()
69 for p in pk:
70 parents = pdict[p]
71 if pdict.get(parents[0],None) or pdict.get(parents[1],None): # parents are in this file
72 if keepfounder:
73 dropId[p] = 1 # flag for removal
74 elif keepoff:
75 dropId[p] = 1 # flag for removal
76 if keepoff: # TODO keep only a random offspring if many - rely on pdict keys being randomly ordered...!
77 famseen = {}
78 for p in pk: # look for multiples from same family - drop all but first
79 famid = p[0]
80 if famseen.get(famid,None):
81 dropId[p] = 1 # already got one from this family
82 famseen.setdefault(famid,1)
83 return dropId
84
85 def writeFped(rslist=[],outdir=None,title='Title',basename='',dfile=None,wewant=[],dropId={},outfile=None,logfile=None):
86 """ fbat format version
87 """
88 outname = os.path.join(outdir,basename)
89 pedfname = '%s.ped' % outname
90 ofile = file(pedfname, 'w')
91 rsl = ' '.join(rslist) # rslist for fbat
92 ofile.write(rsl)
93 s = 'wrote %d marker header to %s - %s\n' % (len(rslist),pedfname,rsl[:50])
94 lf.write(s)
95 ofile.write('\n')
96 nrows = 0
97 for line in dfile:
98 line = line.strip()
99 if not line:
100 continue
101 line = line.replace('D','N')
102 fields = line.split()
103 preamble = fields[:6]
104 idk = (preamble[0],preamble[1])
105 dropme = dropId.get(idk,None)
106 if not dropme:
107 g = ['%s %s' % (fields[snpcol], fields[snpcol+1]) for snpcol in wewant]
108 g = ' '.join(g)
109 g = g.split() # we'll get there
110 g = [atrandic.get(x,'0') for x in g] # numeric alleles...
111 # hack for framingham ND
112 ofile.write('%s %s\n' % (' '.join(preamble), ' '.join(g)))
113 nrows += 1
114 ofile.close()
115 loglist = open(logfile,'r').readlines() # retrieve log to add to html file
116 doImport(outfile,[pedfname,],loglist=loglist)
117 return nrows,pedfname
118
119 def writePed(markers=[],outdir=None,title='Title',basename='',dfile=None,wewant=[],dropId={},outfile=None,logfile=None):
120 """ split out
121 """
122 outname = os.path.join(outdir,basename)
123 mapfname = '%s.map' % outname
124 pedfname = '%s.ped' % outname
125 ofile = file(pedfname, 'w')
126 # make a map file in the lped library
127 mf = file(mapfname,'w')
128 map = ['%s\t%s\t0\t%d' % (x[0],x[2],x[1]) for x in markers] # chrom,abspos,snp in genomic order
129 mf.write('%s\n' % '\n'.join(map))
130 mf.close()
131 nrows = 0
132 for line in dfile:
133 line = line.strip()
134 if not line:
135 continue
136 #line = line.replace('D','N')
137 fields = line.split()
138 preamble = fields[:6]
139 idk = (preamble[0],preamble[1])
140 dropme = dropId.get(idk,None)
141 if not dropme:
142 g = ['%s %s' % (fields[snpcol], fields[snpcol+1]) for snpcol in wewant]
143 g = ' '.join(g)
144 g = g.split() # we'll get there
145 g = [atrandic.get(x,'0') for x in g] # numeric alleles...
146 # hack for framingham ND
147 ofile.write('%s %s\n' % (' '.join(preamble), ' '.join(g)))
148 nrows += 1
149 ofile.close()
150 loglist = open(logfile,'r').readlines() # retrieve log to add to html file
151 doImport(outfile,[mapfname,pedfname,logfile])
152 return nrows,pedfname
153
154 def subset():
155 """ ### Sanity check the arguments
156 now passed in as
157 <command interpreter="python">
158 rgPedSub.py $script_file
159 </command>
160
161 with
162 <configfiles>
163 <configfile name="script_file">
164 title~~~~$title
165 output1~~~~$output1
166 log_file~~~~$log_file
167 userId~~~~$userId
168 outformat~~~~$outformat
169 outdir~~~~$output1.extra_files_path
170 relfilter~~~~$relfilter
171 #if $d.source=='library'
172 inped~~~~$d.lpedIn
173 #else
174 inped~~~~$d.lhistIn.extra_files_path/$d.lhistIn.metadata.base_name
175 #end if
176 #if $m.mtype=='grslist'
177 rslist~~~~$m.rslist
178 region~~~~
179 #else
180 rslist~~~~
181 region~~~~$m.region
182 #end if
183 </configfile>
184 </configfiles>
185 """
186 sep = '~~~~' # arbitrary choice
187 conf = {}
188 if len(sys.argv) < 2:
189 print >> sys.stderr, "No configuration file passed as a parameter - cannot run"
190 sys.exit(1)
191 configf = sys.argv[1]
192 config = file(configf,'r').readlines()
193 for row in config:
194 row = row.strip()
195 if len(row) > 0:
196 try:
197 key,valu = row.split(sep)
198 conf[key] = valu
199 except:
200 pass
201 ss = '%s%s' % (string.punctuation,string.whitespace)
202 ptran = string.maketrans(ss,'_'*len(ss))
203 ### Figure out what genomic region we are interested in
204 region = conf.get('region','')
205 orslist = conf.get('rslist','').replace('X',' ').lower()
206 orslist = orslist.replace(',',' ').lower()
207 # galaxy replaces newlines with XX - go figure
208 title = conf.get('title','').translate(ptran) # for outputs
209 outfile = conf.get('output1','')
210 outdir = conf.get('outdir','')
211 try:
212 os.makedirs(outdir)
213 except:
214 pass
215 outformat = conf.get('outformat','lped')
216 basename = conf.get('basename',title)
217 logfile = os.path.join(outdir,'%s.log' % title)
218 userId = conf.get('userId','') # for library
219 pedFileBase = conf.get('inped','')
220 relfilter = conf.get('relfilter','')
221 MAP_FILE = '%s.map' % pedFileBase
222 DATA_FILE = '%s.ped' % pedFileBase
223 title = conf.get('title','lped subset')
224 lf = file(logfile,'w')
225 lf.write('config file %s = \n' % configf)
226 lf.write(''.join(config))
227 c = ''
228 spos = epos = 0
229 rslist = []
230 rsdict = {}
231 if region > '':
232 try: # TODO make a regexp?
233 c,rest = region.split(':')
234 c = c.replace('chr','')
235 rest = rest.replace(',','') # remove commas
236 spos,epos = rest.split('-')
237 spos = int(spos)
238 epos = int(epos)
239 s = '## %s parsing chrom %s from %d to %d\n' % (progname,c,spos,epos)
240 lf.write(s)
241 except:
242 s = '##! %s unable to parse region %s - MUST look like "chr8:10,000-100,000\n' % (progname,region)
243 lf.write(s)
244 lf.close()
245 sys.exit(1)
246 else:
247 rslist = orslist.split() # galaxy replaces newlines with XX - go figure
248 rsdict = dict(zip(rslist,rslist))
249 allmarkers = False
250 if len(rslist) == 0 and epos == 0: # must be a full extract - presumably remove relateds or something
251 allmarkers = True
252 ### Figure out which markers are in this region
253 markers,snpcols,rslist,rsdict = readMap(mapfile=MAP_FILE,allmarkers=allmarkers,rsdict=rsdict,c=c,spos=spos,epos=epos)
254 if len(rslist) == 0:
255 s = '##! %s found no rs numbers in %s\n' % (progname,sys.argv[1:3])
256 lf.write(s)
257 lf.write('\n')
258 lf.close()
259 sys.exit(1)
260 s = '## %s looking for %d rs (%s....etc)\n' % (progname,len(rslist),rslist[:5])
261 lf.write(s)
262 try:
263 dfile = open(DATA_FILE, 'r')
264 except: # bad input file name?
265 s = '##! rgPedSub unable to open file %s\n' % (DATA_FILE)
266 lf.write(s)
267 lf.write('\n')
268 lf.close()
269 print >> sys.stdout, s
270 raise
271 sys.exit(1)
272 if relfilter <> 'all': # must read pedigree and figure out who to drop
273 dropId = setupPedFilter(relfilter=relfilter,dfile=dfile)
274 else:
275 dropId = {}
276 wewant = [(6+(2*snpcols[x])) for x in rslist] #
277 # column indices of first geno of each marker pair to get the markers into genomic
278 ### ... and then parse the rest of the ped file to pull out
279 ### the genotypes for all subjects for those markers
280 # /usr/local/galaxy/data/rg/1/lped/
281 if len(dropId.keys()) > 0:
282 s = '## dropped the following subjects to satisfy requirement that relfilter = %s\n' % relfilter
283 lf.write(s)
284 if relfilter == 'oo':
285 s = '## note that one random offspring from each family was kept if there were multiple offspring\n'
286 lf.write(s)
287 s = 'FamilyId\tSubjectId\n'
288 lf.write(s)
289 dk = dropId.keys()
290 dk.sort()
291 for k in dk:
292 s = '%s\t%s\n' % (k[0],k[1])
293 lf.write(s)
294 lf.write('\n')
295 lf.close()
296 if outformat == 'lped':
297 nrows,pedfname=writePed(markers=markers,outdir=outdir,title=title,basename=basename,dfile=dfile,
298 wewant=wewant,dropId=dropId,outfile=outfile,logfile=logfile)
299 elif outformat == 'fped':
300 nrows,pedfname=writeFped(rslist=rslist,outdir=outdir,title=title,basename=basename,dfile=dfile,
301 wewant=wewant,dropId=dropId,outfile=outfile,logfile=logfile)
302 dfile.close()
303
304 if __name__ == "__main__":
305 subset()