0
|
1 """
|
|
2 @summary: Calculate the frequency of eQTLs and genes per sliding window bin
|
|
3 @author: nanette.coetzer@gmail.com
|
|
4 @version 5
|
|
5
|
|
6 """
|
|
7 import optparse, sys
|
|
8 import subprocess
|
|
9 import tempfile
|
|
10 import os, re
|
|
11
|
|
12 def stop_err( msg ):
|
|
13 sys.stderr.write( "%s\n" % msg )
|
|
14 sys.exit()
|
|
15
|
|
16 def __main__():
|
|
17 #Parse Command Line
|
|
18 parser = optparse.OptionParser()
|
|
19 parser.add_option("-m", "--rscript", default=None, dest="rscript",
|
|
20 help="R script")
|
|
21 parser.add_option("-i", "--input1", default=None, dest="input1",
|
|
22 help="Frequency per bin file")
|
|
23 parser.add_option("-j", "--input2", default=None, dest="input2",
|
|
24 help="Lookup summary file")
|
|
25 parser.add_option("-k", "--input3", default=None, dest="input3",
|
|
26 help="Frequency summary file")
|
|
27 parser.add_option("-o", "--output1", default=None, dest="output1",
|
|
28 help="Sliding frequency file")
|
|
29 parser.add_option("-p", "--output2", default=None, dest="output2",
|
|
30 help="Sliding lookup file")
|
|
31 parser.add_option("-q", "--output3", default=None, dest="output3",
|
|
32 help="eQTL and genes polygon plot")
|
|
33 (options, args) = parser.parse_args()
|
|
34
|
|
35 try:
|
|
36 open(options.input1, "r").close()
|
|
37 except TypeError, e:
|
|
38 stop_err("You need to supply the Lookup frequency file:\n" + str(e))
|
|
39 except IOError, e:
|
|
40 stop_err("Can not open the Lookup frequency file:\n" + str(e))
|
|
41
|
|
42 try:
|
|
43 open(options.input2, "r").close()
|
|
44 except TypeError, e:
|
|
45 stop_err("You need to supply the Lookup summary file:\n" + str(e))
|
|
46 except IOError, e:
|
|
47 stop_err("Can not open the Lookup summary file:\n" + str(e))
|
|
48
|
|
49 try:
|
|
50 open(options.input3, "r").close()
|
|
51 except TypeError, e:
|
|
52 stop_err("You need to supply the Frequency summary file:\n" + str(e))
|
|
53 except IOError, e:
|
|
54 stop_err("Can not open the Frequency summary file:\n" + str(e))
|
|
55
|
|
56 ##############################################
|
|
57 ##############################################
|
|
58
|
|
59 infile3 = open(options.input3,'r') # Frequency summary file
|
|
60 for line in infile3:
|
|
61 if line.startswith("Number"):
|
|
62 num_intervals = int(line.strip().split("\t")[1])
|
|
63 #print num_intervals
|
|
64 infile3.close()
|
|
65
|
|
66 chr_second_last = []
|
|
67 chr_end = []
|
|
68 tot = 0
|
|
69 infile2 = open(options.input2,'r') # Lookup summary file
|
|
70 for line in infile2:
|
|
71 l = line.strip().split("\t")
|
|
72 if not l[4].startswith("i"):
|
|
73 tot += int(l[4])
|
|
74 chr_second_last.append(tot-2)
|
|
75 chr_end.append(tot)
|
|
76
|
|
77 freq_dict = {}
|
|
78 sliding_id = 0
|
|
79 prev_size = 0
|
|
80 idlist = []
|
|
81 slidingdict = {}
|
|
82 cur_size = 0
|
|
83 add_last = 0
|
|
84 i = 0
|
|
85 sizelist = []
|
|
86
|
|
87 infile1 = open(options.input1,'r') # Frequency.txt
|
|
88
|
|
89 # 1b is where an interval always starts with a 2cM bin (plus a small bin if available) --> never split 2 bins on either sides of a marker
|
|
90 if num_intervals == 1:
|
|
91 for line in infile1:
|
|
92 l2 = line.strip().split("\t")
|
|
93 if not l2[0].startswith("int"):
|
|
94 i += 1
|
|
95 freq_dict[l2[0]] = l2
|
|
96 if prev_size == 0:
|
|
97 # end of chromosome
|
|
98 if int(l2[0]) in chr_end:
|
|
99 pass
|
|
100 else:
|
|
101 prev_size = float(l2[6])
|
|
102 idlist.append(int(l2[0]))
|
|
103 sizelist.append(float(l2[6]))
|
|
104 cur_size = float(l2[6])
|
|
105 add_last = 0
|
|
106 else:
|
|
107 prev_size = float(l2[6])
|
|
108
|
|
109 if prev_size < 2 and add_last == 0:
|
|
110 cur_size = float(cur_size) + float(l2[6])
|
|
111 idlist.append(int(l2[0]))
|
|
112 sizelist.append(float(l2[6]))
|
|
113 elif prev_size == 2:
|
|
114 sliding_id = sliding_id + 1
|
|
115 slidingdict[int(sliding_id)] = idlist
|
|
116 idlist = [int(l2[0])]
|
|
117 sizelist = [float(l2[6])]
|
|
118 prev_size = float(l2[6])
|
|
119 cur_size = float(l2[6])
|
|
120
|
|
121 if add_last == 1:
|
|
122 cur_size = float(cur_size) + float(l2[6])
|
|
123 idlist.append(int(l2[0]))
|
|
124 sizelist.append(float(l2[6]))
|
|
125 sliding_id = sliding_id + 1
|
|
126 slidingdict[sliding_id] = idlist
|
|
127 prev_size = 0
|
|
128 idlist = []
|
|
129 sizelist = []
|
|
130 cur_size = 0
|
|
131
|
|
132 if int(l2[0]) in chr_second_last:
|
|
133 add_last = 1
|
|
134
|
|
135 # 1 is where an interval starts with a small bin if available (plus a 2cM bin)
|
|
136 if num_intervals == 9:
|
|
137 for line in infile1:
|
|
138 l2 = line.strip().split("\t")
|
|
139 if not l2[0].startswith("int"):
|
|
140 i += 1
|
|
141 freq_dict[l2[0]] = l2
|
|
142 print slidingdict
|
|
143 if prev_size == 0:
|
|
144 # end of chromosome
|
|
145 if int(l2[0]) in chr_end:
|
|
146 pass
|
|
147 else:
|
|
148 prev_size = float(l2[6])
|
|
149 idlist.append(int(l2[0]))
|
|
150 cur_size = float(l2[6])
|
|
151 add_last = 0
|
|
152 if float(cur_size) >= 2 and add_last == 0:
|
|
153 sliding_id = sliding_id + 1
|
|
154 slidingdict[int(sliding_id)] = idlist
|
|
155
|
|
156 prev_size = float(l2[6])
|
|
157 idlist = []
|
|
158 cur_size = float(0)
|
|
159 else:
|
|
160 cur_size = float(cur_size) + float(l2[6])
|
|
161 idlist.append(int(l2[0]))
|
|
162 prev_size = float(l2[6])
|
|
163 if add_last == 1:
|
|
164 sliding_id = sliding_id + 1
|
|
165 slidingdict[sliding_id] = idlist
|
|
166 prev_size = 0
|
|
167 idlist = []
|
|
168 cur_size = 0
|
|
169
|
|
170 if int(l2[0]) in chr_second_last:
|
|
171 add_last = 1
|
|
172 if float(cur_size) >= 2 and add_last == 0:
|
|
173 sliding_id = sliding_id + 1
|
|
174 slidingdict[int(sliding_id)] = idlist
|
|
175 prev_size = float(l2[6])
|
|
176 idlist = []
|
|
177 cur_size = float(0)
|
|
178
|
|
179
|
|
180 if num_intervals == 2:
|
|
181 for line in infile1:
|
|
182 l2 = line.strip().split("\t")
|
|
183 if not l2[0].startswith("int"):
|
|
184 i += 1
|
|
185 freq_dict[l2[0]] = l2
|
|
186 if prev_size == 0:
|
|
187 # end of chromosome
|
|
188 if int(l2[0]) in chr_end:
|
|
189 pass
|
|
190 else:
|
|
191 prev_size = float(l2[6])
|
|
192 idlist.append(int(l2[0]))
|
|
193 cur_size = float(l2[6])
|
|
194 add_last = 0
|
|
195 else:
|
|
196 cur_size = float(cur_size) + float(l2[6])
|
|
197 idlist.append(int(l2[0]))
|
|
198 prev_size = float(l2[6])
|
|
199 if add_last == 1:
|
|
200 sliding_id = sliding_id + 1
|
|
201 slidingdict[sliding_id] = idlist
|
|
202 prev_size = 0
|
|
203 idlist = []
|
|
204 cur_size = 0
|
|
205
|
|
206 if int(l2[0]) in chr_second_last:
|
|
207 add_last = 1
|
|
208 if float(cur_size) >= 4 and add_last == 0:
|
|
209 sliding_id = sliding_id + 1
|
|
210 slidingdict[int(sliding_id)] = idlist
|
|
211
|
|
212 prev_size = float(l2[6])
|
|
213 idlist = [int(l2[0])]
|
|
214 cur_size = float(prev_size)
|
|
215
|
|
216 if num_intervals == 3:
|
|
217 for line in infile1:
|
|
218 l2 = line.strip().split("\t")
|
|
219 if not l2[0].startswith("int"):
|
|
220 i += 1
|
|
221 freq_dict[l2[0]] = l2
|
|
222 if prev_size == 0:
|
|
223 # end of chromosome
|
|
224 if int(l2[0]) in chr_end:
|
|
225 pass
|
|
226 else:
|
|
227 prev_size = float(l2[6])
|
|
228 idlist.append(int(l2[0]))
|
|
229 sizelist.append(float(l2[6]))
|
|
230 cur_size = float(l2[6])
|
|
231 add_last = 0
|
|
232 else:
|
|
233 cur_size = float(cur_size) + float(l2[6])
|
|
234 idlist.append(int(l2[0]))
|
|
235 sizelist.append(float(l2[6]))
|
|
236 prev_size = float(l2[6])
|
|
237 if add_last == 1:
|
|
238 sliding_id = sliding_id + 1
|
|
239 slidingdict[sliding_id] = idlist
|
|
240 prev_size = 0
|
|
241 idlist = []
|
|
242 sizelist = []
|
|
243 cur_size = 0
|
|
244
|
|
245 if int(l2[0]) in chr_second_last:
|
|
246 add_last = 1
|
|
247 if float(cur_size) >= 6 and add_last == 0:
|
|
248 sliding_id = sliding_id + 1
|
|
249 slidingdict[int(sliding_id)] = idlist
|
|
250 prev_size = float(l2[6])
|
|
251 newsizelist = []
|
|
252 newidlist = []
|
|
253 newstart = 0
|
|
254 c = 0
|
|
255 cur_size = 0
|
|
256 for s in range(len(sizelist)):
|
|
257 if newstart == 1:
|
|
258 newsizelist.append(sizelist[s])
|
|
259 newidlist.append(idlist[s])
|
|
260 cur_size += float(sizelist[s])
|
|
261 if sizelist[s] == 2.0 and newstart == 0:
|
|
262 c += 1
|
|
263 if c == 2:
|
|
264 newsizelist.append(sizelist[s])
|
|
265 newidlist.append(idlist[s])
|
|
266 cur_size += float(sizelist[s])
|
|
267 newstart = 1
|
|
268 idlist = newidlist
|
|
269 sizelist = newsizelist
|
|
270
|
|
271 out1 = open(options.output1,'w') # Frequency.txt
|
|
272 out2 = open(options.output2,'w')
|
|
273 #out1.write("sliding.id\tchr\tsliding.cM\tsliding.eQTL\tsliding.cis.eQTL\tsliding.trans.eQTL\tsliding.genes\tsliding.eQTL/cM\tsliding.eQTL/cM/10genes\n")
|
|
274 out1.write("sliding.id\tchr\tsliding.cM\tsliding.all.eQTL\tsliding.cis.eQTL\tsliding.trans.eQTL\tsliding.genes\tsliding.all.eQTL/cM\tsliding.cis.eQTL/cM\tsliding.trans.eQTL/cM\tsliding.genes/cM\n")
|
|
275 tcM = 0
|
|
276 tEQTL = 0
|
|
277 tGENES = 0
|
|
278
|
|
279 for j in range(len(slidingdict.keys())):
|
|
280 totcM = 0
|
|
281 toteqtl = 0
|
|
282 totcis = 0
|
|
283 tottrans = 0
|
|
284 totgenes = 0
|
|
285 totEcM_all = 0
|
|
286 totEcM_cis = 0
|
|
287 totEcM_trans = 0
|
|
288 totGcM = 0
|
|
289 #totEcM10G = 0
|
|
290 out2.write(str(j+1)+"\t"+str(slidingdict[j+1])+"\n")
|
|
291 for k in slidingdict[j+1]:
|
|
292 totcM += float(freq_dict[str(k)][6])
|
|
293 toteqtl += float(freq_dict[str(k)][7])
|
|
294 totcis += float(freq_dict[str(k)][8])
|
|
295 tottrans += float(freq_dict[str(k)][9])
|
|
296 totgenes += float(freq_dict[str(k)][10])
|
|
297
|
|
298 chr = freq_dict[str(k)][1]
|
|
299 if totgenes == 0:
|
|
300 totgenes = 1
|
|
301
|
|
302 totEcM_all += round((toteqtl/totcM),2)
|
|
303 totEcM_cis += round((totcis/totcM),2)
|
|
304 totEcM_trans += round((tottrans/totcM),2)
|
|
305 totGcM += round((totgenes/totcM),2)
|
|
306 out1.write(str(j+1)+"\t"+str(chr)+"\t"+str(totcM)+"\t"+str(toteqtl)+"\t"+str(totcis)+"\t"+str(tottrans)+"\t"+str(totgenes)+"\t"+str(totEcM_all)+"\t"+str(totEcM_cis)+"\t"+str(totEcM_trans)+"\t"+str(totGcM)+"\n")
|
|
307
|
|
308 out1.close()
|
|
309 infile1.close()
|
|
310 infile2.close()
|
|
311 out2.close()
|
|
312
|
|
313
|
|
314 # Create temp direcotry
|
|
315 tempdir = tempfile.mkdtemp()
|
|
316
|
|
317 # copy INPUT file to the temp directory
|
|
318 s = "cp %s %s/sliding_frequency.txt" %(options.output1, tempdir)
|
|
319 subprocess.call(s, shell=True)
|
|
320
|
|
321 # create R script => save in temp directory
|
|
322 # generate new header
|
|
323 new_script = open(tempdir+"/new_script.r","w")
|
|
324 header = "setwd(\"%s\")" %tempdir
|
|
325 new_script.write(header+"\n")
|
|
326 # add script body
|
|
327 script = open(options.rscript,"r")
|
|
328 for line in script:
|
|
329 new_script.write(line.strip()+"\n")
|
|
330 new_script.close()
|
|
331
|
|
332 # run R script from temp directory
|
|
333 s = "R CMD BATCH %s/new_script.r out.txt" %tempdir
|
|
334 subprocess.call(s, shell=True)
|
|
335
|
|
336 # run R script from temp directory
|
|
337 s = "R CMD BATCH %s/new_script.r out.txt" %tempdir
|
|
338 subprocess.call(s, shell=True)
|
|
339
|
|
340 os.system("mv %s/Rplot_eQTL_genes_polygon.pdf %s" %(tempdir,options.output3))
|
|
341
|
|
342 ##############################################
|
|
343
|
|
344 if __name__=="__main__":
|
|
345 __main__()
|