0
|
1 """
|
|
2 July 1 2009 added relatedness filter - fo/oo or all
|
|
3 released under the terms of the LGPL
|
|
4 copyright ross lazarus August 2007
|
|
5 for the rgenetics project
|
|
6
|
|
7 Special galaxy tool for the camp2007 data
|
|
8 Allows grabbing genotypes from an arbitrary region
|
|
9
|
|
10 Needs a mongo results file in the location hardwired below or could be passed in as
|
|
11 a library parameter - but this file must have a very specific structure
|
|
12 rs chrom offset float1...floatn
|
|
13
|
|
14 called as
|
|
15
|
|
16 <command interpreter="python2.4">
|
|
17 campRGeno2.py $region "$rslist" "$title" $output1 $log_file $userId "$lpedIn" "$lhistIn"
|
|
18 </command>
|
|
19
|
|
20
|
|
21 """
|
|
22
|
|
23
|
|
24 import sys, array, os, string
|
|
25 from rgutils import galhtmlprefix,plinke,readMap
|
|
26
|
|
27 progname = os.path.split(sys.argv[0])[1]
|
|
28
|
|
29
|
|
30 atrandic = {'A':'1','C':'2','G':'3','T':'4','N':'0','-':'0','1':'1','2':'2','3':'3','4':'4','0':'0'}
|
|
31
|
|
32 def doImport(outfile='test',flist=[]):
|
|
33 """ import into one of the new html composite data types for Rgenetics
|
|
34 Dan Blankenberg with mods by Ross Lazarus
|
|
35 October 2007
|
|
36 """
|
|
37 out = open(outfile,'w')
|
|
38 out.write(galhtmlprefix % progname)
|
|
39
|
|
40 if len(flist) > 0:
|
|
41 out.write('<ol>\n')
|
|
42 for i, data in enumerate( flist ):
|
|
43 out.write('<li><a href="%s">%s</a></li>\n' % (os.path.split(data)[-1],os.path.split(data)[-1]))
|
|
44 out.write('</ol>\n')
|
|
45 else:
|
|
46 out.write('No files found')
|
|
47 out.write("</div></body></html>")
|
|
48 out.close()
|
|
49
|
|
50 def setupPedFilter(relfilter='oo',dfile=None):
|
|
51 """ figure out who to drop to satisfy relative filtering
|
|
52 note single offspring only from each family id
|
|
53 ordering of pdict keys makes this 'random' as the first one only is
|
|
54 kept if there are multiple sibs from same familyid.
|
|
55 """
|
|
56 dropId = {}
|
|
57 keepoff = (relfilter == 'oo')
|
|
58 keepfounder = (relfilter == 'fo')
|
|
59 pdict = {}
|
|
60 for row in dfile:
|
|
61 rowl = row.strip().split()
|
|
62 if len(rowl) > 6:
|
|
63 idk = (rowl[0],rowl[1])
|
|
64 pa = (rowl[0],rowl[2]) # key for father
|
|
65 ma = (rowl[0],rowl[3]) # and mother
|
|
66 pdict[idk] = (pa,ma)
|
|
67 dfile.seek(0) # rewind
|
|
68 pk = pdict.keys()
|
|
69 for p in pk:
|
|
70 parents = pdict[p]
|
|
71 if pdict.get(parents[0],None) or pdict.get(parents[1],None): # parents are in this file
|
|
72 if keepfounder:
|
|
73 dropId[p] = 1 # flag for removal
|
|
74 elif keepoff:
|
|
75 dropId[p] = 1 # flag for removal
|
|
76 if keepoff: # TODO keep only a random offspring if many - rely on pdict keys being randomly ordered...!
|
|
77 famseen = {}
|
|
78 for p in pk: # look for multiples from same family - drop all but first
|
|
79 famid = p[0]
|
|
80 if famseen.get(famid,None):
|
|
81 dropId[p] = 1 # already got one from this family
|
|
82 famseen.setdefault(famid,1)
|
|
83 return dropId
|
|
84
|
|
85 def writeFped(rslist=[],outdir=None,title='Title',basename='',dfile=None,wewant=[],dropId={},outfile=None,logfile=None):
|
|
86 """ fbat format version
|
|
87 """
|
|
88 outname = os.path.join(outdir,basename)
|
|
89 pedfname = '%s.ped' % outname
|
|
90 ofile = file(pedfname, 'w')
|
|
91 rsl = ' '.join(rslist) # rslist for fbat
|
|
92 ofile.write(rsl)
|
|
93 s = 'wrote %d marker header to %s - %s\n' % (len(rslist),pedfname,rsl[:50])
|
|
94 lf.write(s)
|
|
95 ofile.write('\n')
|
|
96 nrows = 0
|
|
97 for line in dfile:
|
|
98 line = line.strip()
|
|
99 if not line:
|
|
100 continue
|
|
101 line = line.replace('D','N')
|
|
102 fields = line.split()
|
|
103 preamble = fields[:6]
|
|
104 idk = (preamble[0],preamble[1])
|
|
105 dropme = dropId.get(idk,None)
|
|
106 if not dropme:
|
|
107 g = ['%s %s' % (fields[snpcol], fields[snpcol+1]) for snpcol in wewant]
|
|
108 g = ' '.join(g)
|
|
109 g = g.split() # we'll get there
|
|
110 g = [atrandic.get(x,'0') for x in g] # numeric alleles...
|
|
111 # hack for framingham ND
|
|
112 ofile.write('%s %s\n' % (' '.join(preamble), ' '.join(g)))
|
|
113 nrows += 1
|
|
114 ofile.close()
|
|
115 loglist = open(logfile,'r').readlines() # retrieve log to add to html file
|
|
116 doImport(outfile,[pedfname,],loglist=loglist)
|
|
117 return nrows,pedfname
|
|
118
|
|
119 def writePed(markers=[],outdir=None,title='Title',basename='',dfile=None,wewant=[],dropId={},outfile=None,logfile=None):
|
|
120 """ split out
|
|
121 """
|
|
122 outname = os.path.join(outdir,basename)
|
|
123 mapfname = '%s.map' % outname
|
|
124 pedfname = '%s.ped' % outname
|
|
125 ofile = file(pedfname, 'w')
|
|
126 # make a map file in the lped library
|
|
127 mf = file(mapfname,'w')
|
|
128 map = ['%s\t%s\t0\t%d' % (x[0],x[2],x[1]) for x in markers] # chrom,abspos,snp in genomic order
|
|
129 mf.write('%s\n' % '\n'.join(map))
|
|
130 mf.close()
|
|
131 nrows = 0
|
|
132 for line in dfile:
|
|
133 line = line.strip()
|
|
134 if not line:
|
|
135 continue
|
|
136 #line = line.replace('D','N')
|
|
137 fields = line.split()
|
|
138 preamble = fields[:6]
|
|
139 idk = (preamble[0],preamble[1])
|
|
140 dropme = dropId.get(idk,None)
|
|
141 if not dropme:
|
|
142 g = ['%s %s' % (fields[snpcol], fields[snpcol+1]) for snpcol in wewant]
|
|
143 g = ' '.join(g)
|
|
144 g = g.split() # we'll get there
|
|
145 g = [atrandic.get(x,'0') for x in g] # numeric alleles...
|
|
146 # hack for framingham ND
|
|
147 ofile.write('%s %s\n' % (' '.join(preamble), ' '.join(g)))
|
|
148 nrows += 1
|
|
149 ofile.close()
|
|
150 loglist = open(logfile,'r').readlines() # retrieve log to add to html file
|
|
151 doImport(outfile,[mapfname,pedfname,logfile])
|
|
152 return nrows,pedfname
|
|
153
|
|
154 def subset():
|
|
155 """ ### Sanity check the arguments
|
|
156 now passed in as
|
|
157 <command interpreter="python">
|
|
158 rgPedSub.py $script_file
|
|
159 </command>
|
|
160
|
|
161 with
|
|
162 <configfiles>
|
|
163 <configfile name="script_file">
|
|
164 title~~~~$title
|
|
165 output1~~~~$output1
|
|
166 log_file~~~~$log_file
|
|
167 userId~~~~$userId
|
|
168 outformat~~~~$outformat
|
|
169 outdir~~~~$output1.extra_files_path
|
|
170 relfilter~~~~$relfilter
|
|
171 #if $d.source=='library'
|
|
172 inped~~~~$d.lpedIn
|
|
173 #else
|
|
174 inped~~~~$d.lhistIn.extra_files_path/$d.lhistIn.metadata.base_name
|
|
175 #end if
|
|
176 #if $m.mtype=='grslist'
|
|
177 rslist~~~~$m.rslist
|
|
178 region~~~~
|
|
179 #else
|
|
180 rslist~~~~
|
|
181 region~~~~$m.region
|
|
182 #end if
|
|
183 </configfile>
|
|
184 </configfiles>
|
|
185 """
|
|
186 sep = '~~~~' # arbitrary choice
|
|
187 conf = {}
|
|
188 if len(sys.argv) < 2:
|
|
189 print >> sys.stderr, "No configuration file passed as a parameter - cannot run"
|
|
190 sys.exit(1)
|
|
191 configf = sys.argv[1]
|
|
192 config = file(configf,'r').readlines()
|
|
193 for row in config:
|
|
194 row = row.strip()
|
|
195 if len(row) > 0:
|
|
196 try:
|
|
197 key,valu = row.split(sep)
|
|
198 conf[key] = valu
|
|
199 except:
|
|
200 pass
|
|
201 ss = '%s%s' % (string.punctuation,string.whitespace)
|
|
202 ptran = string.maketrans(ss,'_'*len(ss))
|
|
203 ### Figure out what genomic region we are interested in
|
|
204 region = conf.get('region','')
|
|
205 orslist = conf.get('rslist','').replace('X',' ').lower()
|
|
206 orslist = orslist.replace(',',' ').lower()
|
|
207 # galaxy replaces newlines with XX - go figure
|
|
208 title = conf.get('title','').translate(ptran) # for outputs
|
|
209 outfile = conf.get('output1','')
|
|
210 outdir = conf.get('outdir','')
|
|
211 try:
|
|
212 os.makedirs(outdir)
|
|
213 except:
|
|
214 pass
|
|
215 outformat = conf.get('outformat','lped')
|
|
216 basename = conf.get('basename',title)
|
|
217 logfile = os.path.join(outdir,'%s.log' % title)
|
|
218 userId = conf.get('userId','') # for library
|
|
219 pedFileBase = conf.get('inped','')
|
|
220 relfilter = conf.get('relfilter','')
|
|
221 MAP_FILE = '%s.map' % pedFileBase
|
|
222 DATA_FILE = '%s.ped' % pedFileBase
|
|
223 title = conf.get('title','lped subset')
|
|
224 lf = file(logfile,'w')
|
|
225 lf.write('config file %s = \n' % configf)
|
|
226 lf.write(''.join(config))
|
|
227 c = ''
|
|
228 spos = epos = 0
|
|
229 rslist = []
|
|
230 rsdict = {}
|
|
231 if region > '':
|
|
232 try: # TODO make a regexp?
|
|
233 c,rest = region.split(':')
|
|
234 c = c.replace('chr','')
|
|
235 rest = rest.replace(',','') # remove commas
|
|
236 spos,epos = rest.split('-')
|
|
237 spos = int(spos)
|
|
238 epos = int(epos)
|
|
239 s = '## %s parsing chrom %s from %d to %d\n' % (progname,c,spos,epos)
|
|
240 lf.write(s)
|
|
241 except:
|
|
242 s = '##! %s unable to parse region %s - MUST look like "chr8:10,000-100,000\n' % (progname,region)
|
|
243 lf.write(s)
|
|
244 lf.close()
|
|
245 sys.exit(1)
|
|
246 else:
|
|
247 rslist = orslist.split() # galaxy replaces newlines with XX - go figure
|
|
248 rsdict = dict(zip(rslist,rslist))
|
|
249 allmarkers = False
|
|
250 if len(rslist) == 0 and epos == 0: # must be a full extract - presumably remove relateds or something
|
|
251 allmarkers = True
|
|
252 ### Figure out which markers are in this region
|
|
253 markers,snpcols,rslist,rsdict = readMap(mapfile=MAP_FILE,allmarkers=allmarkers,rsdict=rsdict,c=c,spos=spos,epos=epos)
|
|
254 if len(rslist) == 0:
|
|
255 s = '##! %s found no rs numbers in %s\n' % (progname,sys.argv[1:3])
|
|
256 lf.write(s)
|
|
257 lf.write('\n')
|
|
258 lf.close()
|
|
259 sys.exit(1)
|
|
260 s = '## %s looking for %d rs (%s....etc)\n' % (progname,len(rslist),rslist[:5])
|
|
261 lf.write(s)
|
|
262 try:
|
|
263 dfile = open(DATA_FILE, 'r')
|
|
264 except: # bad input file name?
|
|
265 s = '##! rgPedSub unable to open file %s\n' % (DATA_FILE)
|
|
266 lf.write(s)
|
|
267 lf.write('\n')
|
|
268 lf.close()
|
|
269 print >> sys.stdout, s
|
|
270 raise
|
|
271 sys.exit(1)
|
|
272 if relfilter <> 'all': # must read pedigree and figure out who to drop
|
|
273 dropId = setupPedFilter(relfilter=relfilter,dfile=dfile)
|
|
274 else:
|
|
275 dropId = {}
|
|
276 wewant = [(6+(2*snpcols[x])) for x in rslist] #
|
|
277 # column indices of first geno of each marker pair to get the markers into genomic
|
|
278 ### ... and then parse the rest of the ped file to pull out
|
|
279 ### the genotypes for all subjects for those markers
|
|
280 # /usr/local/galaxy/data/rg/1/lped/
|
|
281 if len(dropId.keys()) > 0:
|
|
282 s = '## dropped the following subjects to satisfy requirement that relfilter = %s\n' % relfilter
|
|
283 lf.write(s)
|
|
284 if relfilter == 'oo':
|
|
285 s = '## note that one random offspring from each family was kept if there were multiple offspring\n'
|
|
286 lf.write(s)
|
|
287 s = 'FamilyId\tSubjectId\n'
|
|
288 lf.write(s)
|
|
289 dk = dropId.keys()
|
|
290 dk.sort()
|
|
291 for k in dk:
|
|
292 s = '%s\t%s\n' % (k[0],k[1])
|
|
293 lf.write(s)
|
|
294 lf.write('\n')
|
|
295 lf.close()
|
|
296 if outformat == 'lped':
|
|
297 nrows,pedfname=writePed(markers=markers,outdir=outdir,title=title,basename=basename,dfile=dfile,
|
|
298 wewant=wewant,dropId=dropId,outfile=outfile,logfile=logfile)
|
|
299 elif outformat == 'fped':
|
|
300 nrows,pedfname=writeFped(rslist=rslist,outdir=outdir,title=title,basename=basename,dfile=dfile,
|
|
301 wewant=wewant,dropId=dropId,outfile=outfile,logfile=logfile)
|
|
302 dfile.close()
|
|
303
|
|
304 if __name__ == "__main__":
|
|
305 subset()
|