4
|
1 from mirgene_functions import *
|
|
2 from mirgene_graphs import *
|
|
3 import time
|
|
4 from multiprocessing import Process, Queue, Lock, Pool, Manager, Value
|
|
5 import subprocess
|
|
6 import argparse
|
31
|
7 import sys
|
4
|
8
|
|
9 subprocess.call(['mkdir','-p','split1','split2','split3','split4','Counts','Diff/temp_con','Diff/temp_tre','Diff/n_temp_con','Diff/n_temp_tre'])
|
|
10
|
|
11 parser = argparse.ArgumentParser()
|
|
12 parser.add_argument("-analysis", "--anal", help="choose type of analysis", action="store")
|
|
13 parser.add_argument("-con", "--control", help="input fastq file (controls)", nargs='+', default=[])
|
|
14 parser.add_argument("-tre", "--treated", help="input fastq file (treated)", nargs='+', default=[] )
|
|
15 parser.add_argument("-tool_dir", "--tool_directory", help="tool directory path", action="store")
|
|
16 parser.add_argument("-gen", "--org_name", help="Organism", action="store")
|
|
17 parser.add_argument("-f", "--flag", help="choose the database (MirBase,MirGene)", action="store")
|
|
18 parser.add_argument("-percentage", "--per", help="Percentage of Samples", action="store")
|
|
19 parser.add_argument("-counts", "--count", help="Counts for filtering", action="store")
|
|
20 parser.add_argument("-name1", "--group1", help="Samples group 1", action="store")
|
|
21 parser.add_argument("-name2", "--group2", help="Samples group 2", action="store")
|
|
22 args = parser.parse_args()
|
|
23
|
|
24
|
|
25 #########################################################################3###############################################################################################################################################################################################
|
|
26
|
|
27 if __name__ == '__main__':
|
|
28
|
|
29 starttime = time.time()
|
|
30
|
|
31 lock = Lock()
|
|
32 manager = Manager()
|
|
33
|
|
34 # Download reference miRNA sequences from MirBase
|
|
35 mature_mirnas=manager.list()
|
|
36 ps_mature=Process(target=download_matures,args=(mature_mirnas,args.org_name))
|
|
37 ps_mature.start()
|
|
38
|
|
39 # Keep the names of the files and location paths
|
|
40 args.control[0]=args.control[0][1:]
|
|
41 args.control[len(args.control)-1][:-1]
|
|
42 control = [(args.control[i:i+2]) for i in range(0, len(args.control), 2)]
|
|
43
|
|
44 args.treated[0]=args.treated[0][1:]
|
|
45 args.treated[len(args.treated)-1][:-1]
|
|
46 treated = [(args.treated[i:i+2]) for i in range(0, len(args.treated), 2)]
|
|
47
|
|
48
|
|
49 ############## Detection of templated isoforms ################
|
|
50
|
|
51 #radar = manager.list([0,0,0,0])
|
|
52 con_samples = manager.list()
|
|
53 con_data= manager.list()
|
|
54 con_file_order=manager.list()
|
|
55 con_names_seqs=manager.list()
|
|
56 deseq=manager.list()
|
|
57 con_unmap_seq=manager.Value('i',0)
|
|
58 con_unmap_counts=manager.Value('i',0)
|
|
59 con_mirna_names=manager.list()
|
|
60 ini_con_samples = manager.list()
|
|
61
|
|
62 #radar1 = manager.list([0,0,0,0])
|
|
63 tre_samples = manager.list()
|
|
64 tre_data = manager.list()
|
|
65 tre_file_order = manager.list()
|
|
66 tre_names_seqs=manager.list()
|
|
67 deseq1=manager.list()
|
|
68 tre_unmap_seq = manager.Value('i',0)
|
|
69 tre_unmap_counts = manager.Value('i',0)
|
|
70 tre_mirna_names=manager.list()
|
|
71 ini_tre_samples = manager.list()
|
|
72
|
|
73 # Wait for the download of reference miRNA sequences
|
|
74 ps_mature.join()
|
|
75 mature_mirnas=list(mature_mirnas)
|
|
76
|
|
77 # Processing of the detected miRNAs from SAM files
|
|
78 ps_sam = [Process(target=sam_edit,args=(mature_mirnas,path[1][:-1],path[0].split(",")[0],"c",lock,con_samples,con_data,con_file_order,con_unmap_seq,con_names_seqs,deseq,con_mirna_names,ini_con_samples,con_unmap_counts)) for path in control]
|
|
79 ps_sam.extend([Process(target=sam_edit,args=(mature_mirnas,path[1][:-1],path[0].split(",")[0],"t",lock,tre_samples,tre_data,tre_file_order,tre_unmap_seq,tre_names_seqs,deseq1,tre_mirna_names,ini_tre_samples,tre_unmap_counts)) for path in treated])
|
|
80
|
|
81 # Wait for processing of SAM files to finish
|
|
82 [p.start() for p in ps_sam]
|
|
83 [p.join() for p in ps_sam]
|
|
84
|
|
85 # Generate a histogram
|
|
86 ps_hist=[Process(target=hist_red,args=(ini_con_samples,'c',args.group1))]
|
|
87 ps_hist.extend([Process(target=hist_red,args=(ini_tre_samples,'t',args.group2))])
|
|
88 [x.start() for x in ps_hist]
|
|
89
|
|
90 # Convert managers to lists
|
|
91 con_samples = list(con_samples)
|
|
92 tre_samples = list(tre_samples)
|
|
93 con_file_order=list(con_file_order)
|
|
94 tre_file_order=list(tre_file_order)
|
|
95 deseq=list(deseq)
|
|
96 deseq1=list(deseq1)
|
|
97
|
|
98 # Remove duplicates and sorting
|
|
99 con_names_seqs=list(con_names_seqs)
|
|
100 con_names_seqs.sort()
|
|
101 con_names_seqs=list(con_names_seqs for con_names_seqs,_ in itertools.groupby(con_names_seqs))
|
|
102
|
|
103 tre_names_seqs=list(tre_names_seqs)
|
|
104 tre_names_seqs.sort()
|
|
105 tre_names_seqs=list(tre_names_seqs for tre_names_seqs,_ in itertools.groupby(tre_names_seqs))
|
|
106
|
|
107 # initialization of new managers
|
|
108 new_con_file_order=manager.list()
|
|
109 new_tre_file_order=manager.list()
|
|
110 new_deseq=manager.list()
|
|
111 new_deseq1=manager.list()
|
|
112
|
|
113 # add uncommon detected mirnas among the samples
|
|
114 ps_un_mirnas=[Process(target=uncommon_mirnas,args=(sampp,con_names_seqs,lock,new_deseq,con_file_order[i],new_con_file_order)) for i,sampp in enumerate(deseq)]
|
|
115 ps_un_mirnas.extend([Process(target=uncommon_mirnas,args=(sampp,tre_names_seqs,lock,new_deseq1,tre_file_order[i],new_tre_file_order)) for i,sampp in enumerate(deseq1)])
|
|
116
|
|
117 # Wait for processing of uncommon detected mirnas to finish
|
|
118 [z.start() for z in ps_un_mirnas]
|
|
119 [z.join() for z in ps_un_mirnas]
|
|
120
|
|
121 # Convert managers to lists
|
|
122 new_deseq=list(new_deseq)
|
|
123 new_deseq1=list(new_deseq1)
|
|
124 con_file_order=list(new_con_file_order)
|
|
125 tre_file_order=list(new_tre_file_order)
|
|
126
|
|
127 # Genereation of count matrices per group (controls - treated)
|
|
128 control_group=[[x[0],x[2]] for x in new_deseq[0]]
|
|
129 [control_group[i].append(y[i][1]) for i,_ in enumerate(control_group) for y in new_deseq]
|
|
130
|
|
131 treated_group=[[x[0],x[2]] for x in new_deseq1[0]]
|
|
132 [treated_group[i].append(y[i][1]) for i,_ in enumerate(treated_group) for y in new_deseq1]
|
|
133
|
|
134 # Keep a copy of count matrices
|
|
135 control_group_copy=copy.deepcopy(list(control_group))
|
|
136 treated_group_copy=copy.deepcopy(list(treated_group))
|
|
137
|
|
138 # Initialization of managers
|
|
139 merg_nam_control_group=manager.list()
|
|
140 merg_nam_treated_group=manager.list()
|
|
141
|
|
142 # Merging of names different names for the same mirna sequence per group (controls, treated) to avoid duplicates
|
|
143 ps_merge = [Process(target=merging_names,args=(control_group_copy,merg_nam_control_group))]
|
|
144 ps_merge.extend([Process(target=merging_names,args=(treated_group_copy,merg_nam_treated_group))])
|
|
145 [x.start() for x in ps_merge]
|
|
146
|
|
147 # Add unique mirna sequences between groups (all groups will have the same amount of sequences)
|
|
148 con_list=manager.list()
|
|
149 tre_list=manager.list()
|
|
150
|
|
151 ps_bw = [Process(target=black_white,args=(con_names_seqs,tre_names_seqs,treated_group,tre_list))]
|
|
152 ps_bw.extend([Process(target=black_white,args=(tre_names_seqs,con_names_seqs,control_group,con_list))])
|
|
153 [x.start() for x in ps_bw]
|
|
154 [x.join() for x in ps_bw]
|
|
155
|
|
156 control_group=list(con_list)
|
|
157 treated_group=list(tre_list)
|
|
158
|
|
159 # Detection of duplications
|
|
160 dupes=manager.list()
|
|
161
|
|
162 ps_dupes = Process(target=merging_dupes,args=(control_group,dupes))
|
|
163 ps_dupes.start()
|
|
164 ps_dupes.join()
|
|
165
|
|
166 dupes=list(dupes)
|
|
167
|
|
168 # Merging the duplications in one entry with all different names
|
|
169 con_list=manager.list()
|
|
170 tre_list=manager.list()
|
|
171
|
|
172 ps_ap_merg_dupes = [Process(target=apply_merging_dupes,args=(control_group,dupes,con_list))]
|
|
173 ps_ap_merg_dupes.extend([Process(target=apply_merging_dupes,args=(treated_group,dupes,tre_list))])
|
|
174 [x.start() for x in ps_ap_merg_dupes]
|
|
175
|
|
176 # Preparation of reference sequences (isodforms) for the detection of non template mirnas
|
|
177 if args.anal=="2":
|
|
178 all_iso = manager.list()
|
|
179 ps_non_iso = Process(target=non_template_ref,args=(con_samples,tre_samples,all_iso))
|
|
180 ps_non_iso.start()
|
|
181
|
|
182 # Finishing the process for merging
|
|
183 [x.join() for x in ps_merge]
|
|
184 merg_nam_control_group=list(merg_nam_control_group)
|
|
185 merg_nam_treated_group=list(merg_nam_treated_group)
|
|
186
|
|
187 # Export the database and the graphs
|
|
188 procs = [Process(target=DB_write,args=(x[0],x[1],x[2],x[3],1)) for x in con_data]
|
|
189 procs.extend([Process(target=DB_write,args=(x[0],x[1],x[2],x[3],1)) for x in tre_data])
|
|
190 procs.extend([Process(target=make_spider,args=(merg_nam_control_group,merg_nam_treated_group,args.group1,args.group2))])
|
|
191
|
|
192 if args.anal == "1":
|
|
193 procs.extend([Process(target=pie_temp,args=(merg_nam_control_group,con_unmap_seq.value,con_unmap_counts.value,merg_nam_treated_group,tre_unmap_seq.value,tre_unmap_counts.value,args.group1,args.group2))])
|
|
194
|
|
195 [p.start() for p in procs]
|
|
196
|
|
197 # Export the pdf report file
|
|
198 if args.anal=="1":
|
|
199 [x.join() for x in ps_hist]
|
|
200 [p.join() for p in procs]
|
|
201 ps_pdf = Process(target=pdf_before_DE,args=(args.anal,args.group1,args.group2))
|
|
202 ps_pdf.start()
|
|
203
|
|
204 [x.join() for x in ps_ap_merg_dupes]
|
|
205 control_group=list(con_list)
|
|
206 treated_group=list(tre_list)
|
|
207
|
|
208 # Filters low count mirnas (otpional)
|
|
209 if int(args.per)!=-1:
|
31
|
210 if int(args.per)>0 and int(args.per)<=100 and int(args.count)>0:
|
30
|
211 fil_con_group=manager.list()
|
|
212 fil_tre_group=manager.list()
|
4
|
213
|
30
|
214 ps_low_counts = Process(target=filter_low_counts,args=(control_group,treated_group,fil_con_group,fil_tre_group,args.per,args.count))
|
|
215 ps_low_counts.start()
|
|
216 ps_low_counts.join()
|
4
|
217
|
30
|
218 fil_con_group=list(fil_con_group)
|
|
219 fil_tre_group=list(fil_tre_group)
|
|
220
|
|
221 else:
|
|
222 sys.exit("Not acceptable values for filter")
|
4
|
223
|
|
224 if "fil_con_group" not in locals() or "fil_con_group" not in globals():
|
|
225 fil_con_group=control_group
|
|
226 fil_tre_group=treated_group
|
|
227
|
|
228 # export count matrices
|
|
229 ps_write = Process(target=write_main,args=(control_group, treated_group, fil_con_group, fil_tre_group, con_file_order,tre_file_order,1,args.group1,args.group2,args.per))
|
|
230 ps_write.start()
|
|
231
|
|
232 # export counts files compatible with Deseq2 and EdgeR
|
|
233 ps1_matrix = [Process(target=temp_counts_to_diff,args=(con_file_order,fil_con_group,"Diff/temp_con/",0))]
|
|
234 ps1_matrix.extend([Process(target=temp_counts_to_diff,args=(tre_file_order,fil_tre_group,"Diff/temp_tre/",0))])
|
|
235 [p.start() for p in ps1_matrix]
|
|
236
|
|
237 if args.anal=="1":
|
|
238 ps_pdf.join()
|
|
239 if args.anal=="2":
|
|
240 [p.join() for p in procs]
|
|
241 [x.join() for x in ps_hist]
|
|
242
|
|
243 ps_write.join()
|
|
244 [p.join() for p in ps1_matrix]
|
|
245
|
|
246 ############################## Detection of Both #######################################
|
|
247
|
|
248 if args.anal == "2":
|
|
249
|
|
250 # Initialization of the managers between the proccesses
|
|
251 # First group of samples (controls)
|
|
252 n_con_data= manager.list()
|
|
253 n_con_file_order=manager.list()
|
|
254 n_con_names_seqs=manager.list()
|
|
255 n_deseq=manager.list()
|
|
256
|
|
257 # Second group of samples (treated)
|
|
258 n_tre_data = manager.list()
|
|
259 n_tre_file_order = manager.list()
|
|
260 n_tre_names_seqs=manager.list()
|
|
261 n_deseq1=manager.list()
|
|
262
|
|
263 # Preparation of reference sequences
|
|
264 new_ref_mirnas = list(mature_mirnas)
|
|
265 ps_non_iso.join()
|
|
266
|
|
267
|
|
268 all_iso=list(all_iso)
|
|
269 new_ref_mirnas.extend(all_iso)
|
|
270
|
|
271 # Processing of non template miRNAs from SAM files
|
|
272 ps_sam = [Process(target=non_sam_edit,args=(new_ref_mirnas,path[1][:-1],path[0].split(",")[0],"c",lock,n_con_data,n_con_file_order,n_deseq,n_con_names_seqs)) for path in control]
|
|
273 ps_sam.extend([Process(target=non_sam_edit,args=(new_ref_mirnas,path[1][:-1],path[0].split(",")[0],"t",lock,n_tre_data,n_tre_file_order,n_deseq1,n_tre_names_seqs)) for path in treated])
|
|
274
|
|
275 [p.start() for p in ps_sam]
|
|
276 [p.join() for p in ps_sam]
|
|
277
|
|
278 # Convert managers to lists
|
|
279 n_con_file_order=list(n_con_file_order)
|
|
280 n_tre_file_order=list(n_tre_file_order)
|
|
281 n_deseq=list(n_deseq)
|
|
282 n_deseq1=list(n_deseq1)
|
|
283
|
|
284 # Remove duplicates and sorting
|
|
285 n_con_names_seqs=list(n_con_names_seqs)
|
|
286 n_con_names_seqs.sort()
|
|
287 n_con_names_seqs=list(n_con_names_seqs for n_con_names_seqs,_ in itertools.groupby(n_con_names_seqs))
|
|
288
|
|
289 n_tre_names_seqs=list(n_tre_names_seqs)
|
|
290 n_tre_names_seqs.sort()
|
|
291 n_tre_names_seqs=list(n_tre_names_seqs for n_tre_names_seqs,_ in itertools.groupby(n_tre_names_seqs))
|
|
292
|
|
293 # initialization of new managers
|
|
294 new_n_con_file_order=manager.list()
|
|
295 new_n_tre_file_order=manager.list()
|
|
296 n_new_deseq=manager.list()
|
|
297 n_new_deseq1=manager.list()
|
|
298
|
|
299 # add uncommon detected mirnas among the samples
|
|
300 ps_deseq=[Process(target=uncommon_mirnas,args=(sampp,n_con_names_seqs,lock,n_new_deseq,n_con_file_order[i],new_n_con_file_order)) for i,sampp in enumerate(n_deseq)]
|
|
301 ps_deseq.extend([Process(target=uncommon_mirnas,args=(sampp,n_tre_names_seqs,lock,n_new_deseq1,n_tre_file_order[i],new_n_tre_file_order)) for i,sampp in enumerate(n_deseq1)])
|
|
302
|
|
303 # Wait for processing of uncommon detected mirnas to finish
|
|
304 [x.start() for x in ps_deseq]
|
|
305 [x.join() for x in ps_deseq]
|
|
306
|
|
307 # Convert managers to lists
|
|
308 n_new_deseq=list(n_new_deseq)
|
|
309 n_new_deseq1=list(n_new_deseq1)
|
|
310 n_con_file_order=list(new_n_con_file_order)
|
|
311 n_tre_file_order=list(new_n_tre_file_order)
|
|
312
|
|
313 # Genereation of count matrices per group (controls - treated)
|
|
314 n_control_group=[[x[0],x[2]] for x in n_new_deseq[0]]
|
|
315 [n_control_group[i].append(y[i][1]) for i,_ in enumerate(n_control_group) for y in n_new_deseq]
|
|
316
|
|
317 n_treated_group=[[x[0],x[2]] for x in n_new_deseq1[0]]
|
|
318 [n_treated_group[i].append(y[i][1]) for i,_ in enumerate(n_treated_group) for y in n_new_deseq1]
|
|
319
|
|
320 # Keep a copy of count matrices
|
|
321 n_control_group_copy=copy.deepcopy(list(n_control_group))
|
|
322 n_treated_group_copy=copy.deepcopy(list(n_treated_group))
|
|
323
|
|
324 # Initialization of managers
|
|
325 merg_nam_n_control_group=manager.list()
|
|
326 merg_nam_n_treated_group=manager.list()
|
|
327
|
|
328 # Merging of different names for the same mirna sequence per group (controls, treated) to avoid duplicates
|
|
329 ps_merge = [Process(target=merging_names,args=(n_control_group_copy,merg_nam_n_control_group))]
|
|
330 ps_merge.extend([Process(target=merging_names,args=(n_treated_group_copy,merg_nam_n_treated_group))])
|
|
331 [x.start() for x in ps_merge]
|
|
332
|
|
333 # Add unique mirna sequences between groups (all groups will have the same amount of sequences)
|
|
334 n_con_list=manager.list()
|
|
335 n_tre_list=manager.list()
|
|
336
|
|
337 ps_bw = [Process(target=black_white,args=(n_con_names_seqs,n_tre_names_seqs,n_treated_group,n_tre_list))]
|
|
338 ps_bw.extend([Process(target=black_white,args=(n_tre_names_seqs,n_con_names_seqs,n_control_group,n_con_list))])
|
|
339 [x.start() for x in ps_bw]
|
|
340 [x.join() for x in ps_bw]
|
|
341
|
|
342 n_control_group=list(n_con_list)
|
|
343 n_treated_group=list(n_tre_list)
|
|
344
|
|
345 # Detection of duplications
|
|
346 n_dupes=manager.list()
|
|
347
|
|
348 ps_dupes = Process(target=merging_dupes,args=(n_control_group,n_dupes))
|
|
349 ps_dupes.start()
|
|
350 ps_dupes.join()
|
|
351
|
|
352 n_dupes=list(n_dupes)
|
|
353
|
|
354 # Merging the duplications in one entry with all different names
|
|
355 n_con_list=manager.list()
|
|
356 n_tre_list=manager.list()
|
|
357
|
|
358 ps_ap_merg_dupes = [Process(target=apply_merging_dupes,args=(n_control_group,n_dupes,n_con_list))]
|
|
359 ps_ap_merg_dupes.extend([Process(target=apply_merging_dupes,args=(n_treated_group,n_dupes,n_tre_list))])
|
|
360 [x.start() for x in ps_ap_merg_dupes]
|
|
361
|
|
362 # Finishing the process for merging
|
|
363 [x.join() for x in ps_merge]
|
|
364 merg_nam_n_control_group=list(merg_nam_n_control_group)
|
|
365 merg_nam_n_treated_group=list(merg_nam_n_treated_group)
|
|
366
|
|
367 # Export the database and the graphs
|
|
368 procs = [Process(target=DB_write,args=(x[0],x[1],x[2],x[3],2)) for x in n_con_data]
|
|
369 procs.extend([Process(target=DB_write,args=(x[0],x[1],x[2],x[3],2)) for x in n_tre_data])
|
|
370 procs.extend([Process(target=logo_seq_red,args=(merg_nam_n_control_group,'c',args.group1))])
|
|
371 procs.extend([Process(target=logo_seq_red,args=(merg_nam_n_treated_group,'t',args.group2))])
|
|
372 procs.extend([Process(target=pie_non_temp,args=(merg_nam_control_group,merg_nam_n_control_group,merg_nam_treated_group,merg_nam_n_treated_group,con_unmap_seq.value,tre_unmap_seq.value,con_unmap_counts.value,tre_unmap_counts.value,args.group1,args.group2))])
|
|
373
|
|
374 [p.start() for p in procs]
|
|
375 [p.join() for p in procs]
|
|
376
|
|
377 procs1 = Process(target=pdf_before_DE,args=(args.anal,args.group1,args.group2))
|
|
378 procs1.start()
|
|
379
|
|
380 [x.join() for x in ps_ap_merg_dupes]
|
|
381 n_control_group=list(n_con_list)
|
|
382 n_treated_group=list(n_tre_list)
|
|
383
|
|
384 # Filters low count mirnas (otpional)
|
|
385 if int(args.per)!=-1:
|
31
|
386 if int(args.per)>0 and int(args.per)<=100 and int(args.count)>0:
|
4
|
387
|
30
|
388 n_fil_con_group=manager.list()
|
|
389 n_fil_tre_group=manager.list()
|
4
|
390
|
30
|
391 ps_low_counts = Process(target=filter_low_counts,args=(n_control_group,n_treated_group,n_fil_con_group,n_fil_tre_group,args.per,args.count))
|
|
392 ps_low_counts.start()
|
|
393 ps_low_counts.join()
|
4
|
394
|
30
|
395 n_fil_con_group=list(n_fil_con_group)
|
|
396 n_fil_tre_group=list(n_fil_tre_group)
|
|
397 else:
|
|
398 sys.exit("Not acceptable values for filter")
|
4
|
399
|
|
400 if "n_fil_con_group" not in locals() or "n_fil_con_group"not in globals():
|
|
401 n_fil_con_group=n_control_group
|
|
402 n_fil_tre_group=n_treated_group
|
|
403
|
|
404
|
|
405 ps_write = Process(target=write_main,args=(n_control_group, n_treated_group,n_fil_con_group, n_fil_tre_group, n_con_file_order, n_tre_file_order,2,args.group1,args.group2,args.per))
|
|
406 ps_write.start()
|
|
407
|
|
408 ps1_matrix = [Process(target=nontemp_counts_to_diff,args=(n_con_file_order,n_fil_con_group,con_file_order,fil_con_group,"Diff/n_temp_con/",0))]
|
|
409 ps1_matrix.extend([Process(target=nontemp_counts_to_diff,args=(n_tre_file_order,n_fil_tre_group,tre_file_order,fil_tre_group,"Diff/n_temp_tre/",0))])
|
|
410 [p.start() for p in ps1_matrix]
|
|
411
|
|
412 ps_write.join()
|
|
413 [p.join() for p in ps1_matrix]
|
|
414 procs1.join()
|
28
|
415 print('Running time: {} seconds'.format(round(time.time() - starttime,2)))
|
4
|
416
|
|
417
|
|
418
|
|
419
|
|
420
|
|
421
|
|
422
|
|
423
|