comparison mirgene.py @ 4:f44185f616bc draft

Uploaded
author glogobyte
date Wed, 13 Oct 2021 16:04:40 +0000
parents
children 476121298816
comparison
equal deleted inserted replaced
3:d77b33e65501 4:f44185f616bc
1 from mirgene_functions import *
2 from mirgene_graphs import *
3 import time
4 from multiprocessing import Process, Queue, Lock, Pool, Manager, Value
5 import subprocess
6 import argparse
7
8 subprocess.call(['mkdir','-p','split1','split2','split3','split4','Counts','Diff/temp_con','Diff/temp_tre','Diff/n_temp_con','Diff/n_temp_tre'])
9
10 parser = argparse.ArgumentParser()
11 parser.add_argument("-analysis", "--anal", help="choose type of analysis", action="store")
12 parser.add_argument("-con", "--control", help="input fastq file (controls)", nargs='+', default=[])
13 parser.add_argument("-tre", "--treated", help="input fastq file (treated)", nargs='+', default=[] )
14 parser.add_argument("-tool_dir", "--tool_directory", help="tool directory path", action="store")
15 parser.add_argument("-gen", "--org_name", help="Organism", action="store")
16 parser.add_argument("-f", "--flag", help="choose the database (MirBase,MirGene)", action="store")
17 parser.add_argument("-percentage", "--per", help="Percentage of Samples", action="store")
18 parser.add_argument("-counts", "--count", help="Counts for filtering", action="store")
19 parser.add_argument("-name1", "--group1", help="Samples group 1", action="store")
20 parser.add_argument("-name2", "--group2", help="Samples group 2", action="store")
21 args = parser.parse_args()
22
23
24 #########################################################################3###############################################################################################################################################################################################
25
26 if __name__ == '__main__':
27
28 starttime = time.time()
29
30 lock = Lock()
31 manager = Manager()
32
33 # Download reference miRNA sequences from MirBase
34 mature_mirnas=manager.list()
35 ps_mature=Process(target=download_matures,args=(mature_mirnas,args.org_name))
36 ps_mature.start()
37
38 # Keep the names of the files and location paths
39 args.control[0]=args.control[0][1:]
40 args.control[len(args.control)-1][:-1]
41 control = [(args.control[i:i+2]) for i in range(0, len(args.control), 2)]
42
43 args.treated[0]=args.treated[0][1:]
44 args.treated[len(args.treated)-1][:-1]
45 treated = [(args.treated[i:i+2]) for i in range(0, len(args.treated), 2)]
46
47
48 ############## Detection of templated isoforms ################
49
50 #radar = manager.list([0,0,0,0])
51 con_samples = manager.list()
52 con_data= manager.list()
53 con_file_order=manager.list()
54 con_names_seqs=manager.list()
55 deseq=manager.list()
56 con_unmap_seq=manager.Value('i',0)
57 con_unmap_counts=manager.Value('i',0)
58 con_mirna_names=manager.list()
59 ini_con_samples = manager.list()
60
61 #radar1 = manager.list([0,0,0,0])
62 tre_samples = manager.list()
63 tre_data = manager.list()
64 tre_file_order = manager.list()
65 tre_names_seqs=manager.list()
66 deseq1=manager.list()
67 tre_unmap_seq = manager.Value('i',0)
68 tre_unmap_counts = manager.Value('i',0)
69 tre_mirna_names=manager.list()
70 ini_tre_samples = manager.list()
71
72 # Wait for the download of reference miRNA sequences
73 ps_mature.join()
74 mature_mirnas=list(mature_mirnas)
75
76 # Processing of the detected miRNAs from SAM files
77 ps_sam = [Process(target=sam_edit,args=(mature_mirnas,path[1][:-1],path[0].split(",")[0],"c",lock,con_samples,con_data,con_file_order,con_unmap_seq,con_names_seqs,deseq,con_mirna_names,ini_con_samples,con_unmap_counts)) for path in control]
78 ps_sam.extend([Process(target=sam_edit,args=(mature_mirnas,path[1][:-1],path[0].split(",")[0],"t",lock,tre_samples,tre_data,tre_file_order,tre_unmap_seq,tre_names_seqs,deseq1,tre_mirna_names,ini_tre_samples,tre_unmap_counts)) for path in treated])
79
80 # Wait for processing of SAM files to finish
81 [p.start() for p in ps_sam]
82 [p.join() for p in ps_sam]
83
84 # Generate a histogram
85 ps_hist=[Process(target=hist_red,args=(ini_con_samples,'c',args.group1))]
86 ps_hist.extend([Process(target=hist_red,args=(ini_tre_samples,'t',args.group2))])
87 [x.start() for x in ps_hist]
88
89 # Convert managers to lists
90 con_samples = list(con_samples)
91 tre_samples = list(tre_samples)
92 con_file_order=list(con_file_order)
93 tre_file_order=list(tre_file_order)
94 deseq=list(deseq)
95 deseq1=list(deseq1)
96
97 # Remove duplicates and sorting
98 con_names_seqs=list(con_names_seqs)
99 con_names_seqs.sort()
100 con_names_seqs=list(con_names_seqs for con_names_seqs,_ in itertools.groupby(con_names_seqs))
101
102 tre_names_seqs=list(tre_names_seqs)
103 tre_names_seqs.sort()
104 tre_names_seqs=list(tre_names_seqs for tre_names_seqs,_ in itertools.groupby(tre_names_seqs))
105
106 # initialization of new managers
107 new_con_file_order=manager.list()
108 new_tre_file_order=manager.list()
109 new_deseq=manager.list()
110 new_deseq1=manager.list()
111
112 # add uncommon detected mirnas among the samples
113 ps_un_mirnas=[Process(target=uncommon_mirnas,args=(sampp,con_names_seqs,lock,new_deseq,con_file_order[i],new_con_file_order)) for i,sampp in enumerate(deseq)]
114 ps_un_mirnas.extend([Process(target=uncommon_mirnas,args=(sampp,tre_names_seqs,lock,new_deseq1,tre_file_order[i],new_tre_file_order)) for i,sampp in enumerate(deseq1)])
115
116 # Wait for processing of uncommon detected mirnas to finish
117 [z.start() for z in ps_un_mirnas]
118 [z.join() for z in ps_un_mirnas]
119
120 # Convert managers to lists
121 new_deseq=list(new_deseq)
122 new_deseq1=list(new_deseq1)
123 con_file_order=list(new_con_file_order)
124 tre_file_order=list(new_tre_file_order)
125
126 # Genereation of count matrices per group (controls - treated)
127 control_group=[[x[0],x[2]] for x in new_deseq[0]]
128 [control_group[i].append(y[i][1]) for i,_ in enumerate(control_group) for y in new_deseq]
129
130 treated_group=[[x[0],x[2]] for x in new_deseq1[0]]
131 [treated_group[i].append(y[i][1]) for i,_ in enumerate(treated_group) for y in new_deseq1]
132
133 # Keep a copy of count matrices
134 control_group_copy=copy.deepcopy(list(control_group))
135 treated_group_copy=copy.deepcopy(list(treated_group))
136
137 # Initialization of managers
138 merg_nam_control_group=manager.list()
139 merg_nam_treated_group=manager.list()
140
141 # Merging of names different names for the same mirna sequence per group (controls, treated) to avoid duplicates
142 ps_merge = [Process(target=merging_names,args=(control_group_copy,merg_nam_control_group))]
143 ps_merge.extend([Process(target=merging_names,args=(treated_group_copy,merg_nam_treated_group))])
144 [x.start() for x in ps_merge]
145
146 # Add unique mirna sequences between groups (all groups will have the same amount of sequences)
147 con_list=manager.list()
148 tre_list=manager.list()
149
150 ps_bw = [Process(target=black_white,args=(con_names_seqs,tre_names_seqs,treated_group,tre_list))]
151 ps_bw.extend([Process(target=black_white,args=(tre_names_seqs,con_names_seqs,control_group,con_list))])
152 [x.start() for x in ps_bw]
153 [x.join() for x in ps_bw]
154
155 control_group=list(con_list)
156 treated_group=list(tre_list)
157
158 # Detection of duplications
159 dupes=manager.list()
160
161 ps_dupes = Process(target=merging_dupes,args=(control_group,dupes))
162 ps_dupes.start()
163 ps_dupes.join()
164
165 dupes=list(dupes)
166
167 # Merging the duplications in one entry with all different names
168 con_list=manager.list()
169 tre_list=manager.list()
170
171 ps_ap_merg_dupes = [Process(target=apply_merging_dupes,args=(control_group,dupes,con_list))]
172 ps_ap_merg_dupes.extend([Process(target=apply_merging_dupes,args=(treated_group,dupes,tre_list))])
173 [x.start() for x in ps_ap_merg_dupes]
174
175 # Preparation of reference sequences (isodforms) for the detection of non template mirnas
176 if args.anal=="2":
177 all_iso = manager.list()
178 ps_non_iso = Process(target=non_template_ref,args=(con_samples,tre_samples,all_iso))
179 ps_non_iso.start()
180
181 # Finishing the process for merging
182 [x.join() for x in ps_merge]
183 merg_nam_control_group=list(merg_nam_control_group)
184 merg_nam_treated_group=list(merg_nam_treated_group)
185
186 # Export the database and the graphs
187 procs = [Process(target=DB_write,args=(x[0],x[1],x[2],x[3],1)) for x in con_data]
188 procs.extend([Process(target=DB_write,args=(x[0],x[1],x[2],x[3],1)) for x in tre_data])
189 procs.extend([Process(target=make_spider,args=(merg_nam_control_group,merg_nam_treated_group,args.group1,args.group2))])
190
191 if args.anal == "1":
192 procs.extend([Process(target=pie_temp,args=(merg_nam_control_group,con_unmap_seq.value,con_unmap_counts.value,merg_nam_treated_group,tre_unmap_seq.value,tre_unmap_counts.value,args.group1,args.group2))])
193
194 [p.start() for p in procs]
195
196 # Export the pdf report file
197 if args.anal=="1":
198 [x.join() for x in ps_hist]
199 [p.join() for p in procs]
200 ps_pdf = Process(target=pdf_before_DE,args=(args.anal,args.group1,args.group2))
201 ps_pdf.start()
202
203 [x.join() for x in ps_ap_merg_dupes]
204 control_group=list(con_list)
205 treated_group=list(tre_list)
206
207 # Filters low count mirnas (otpional)
208 if int(args.per)!=-1:
209
210 fil_con_group=manager.list()
211 fil_tre_group=manager.list()
212
213 ps_low_counts = Process(target=filter_low_counts,args=(control_group,treated_group,fil_con_group,fil_tre_group,args.per,args.count))
214 ps_low_counts.start()
215 ps_low_counts.join()
216
217 fil_con_group=list(fil_con_group)
218 fil_tre_group=list(fil_tre_group)
219
220 if "fil_con_group" not in locals() or "fil_con_group" not in globals():
221 fil_con_group=control_group
222 fil_tre_group=treated_group
223
224 # export count matrices
225 ps_write = Process(target=write_main,args=(control_group, treated_group, fil_con_group, fil_tre_group, con_file_order,tre_file_order,1,args.group1,args.group2,args.per))
226 ps_write.start()
227
228 # export counts files compatible with Deseq2 and EdgeR
229 ps1_matrix = [Process(target=temp_counts_to_diff,args=(con_file_order,fil_con_group,"Diff/temp_con/",0))]
230 ps1_matrix.extend([Process(target=temp_counts_to_diff,args=(tre_file_order,fil_tre_group,"Diff/temp_tre/",0))])
231 [p.start() for p in ps1_matrix]
232
233 if args.anal=="1":
234 ps_pdf.join()
235 if args.anal=="2":
236 [p.join() for p in procs]
237 [x.join() for x in ps_hist]
238
239 ps_write.join()
240 [p.join() for p in ps1_matrix]
241
242 ############################## Detection of Both #######################################
243
244 half = time.time()
245
246 if args.anal == "2":
247
248 # Initialization of the managers between the proccesses
249 # First group of samples (controls)
250 n_con_data= manager.list()
251 n_con_file_order=manager.list()
252 n_con_names_seqs=manager.list()
253 n_deseq=manager.list()
254
255 # Second group of samples (treated)
256 n_tre_data = manager.list()
257 n_tre_file_order = manager.list()
258 n_tre_names_seqs=manager.list()
259 n_deseq1=manager.list()
260
261 # Preparation of reference sequences
262 new_ref_mirnas = list(mature_mirnas)
263 ps_non_iso.join()
264
265
266 all_iso=list(all_iso)
267 new_ref_mirnas.extend(all_iso)
268
269 # Processing of non template miRNAs from SAM files
270 ps_sam = [Process(target=non_sam_edit,args=(new_ref_mirnas,path[1][:-1],path[0].split(",")[0],"c",lock,n_con_data,n_con_file_order,n_deseq,n_con_names_seqs)) for path in control]
271 ps_sam.extend([Process(target=non_sam_edit,args=(new_ref_mirnas,path[1][:-1],path[0].split(",")[0],"t",lock,n_tre_data,n_tre_file_order,n_deseq1,n_tre_names_seqs)) for path in treated])
272
273 [p.start() for p in ps_sam]
274 [p.join() for p in ps_sam]
275
276 # Convert managers to lists
277 n_con_file_order=list(n_con_file_order)
278 n_tre_file_order=list(n_tre_file_order)
279 n_deseq=list(n_deseq)
280 n_deseq1=list(n_deseq1)
281
282 # Remove duplicates and sorting
283 n_con_names_seqs=list(n_con_names_seqs)
284 n_con_names_seqs.sort()
285 n_con_names_seqs=list(n_con_names_seqs for n_con_names_seqs,_ in itertools.groupby(n_con_names_seqs))
286
287 n_tre_names_seqs=list(n_tre_names_seqs)
288 n_tre_names_seqs.sort()
289 n_tre_names_seqs=list(n_tre_names_seqs for n_tre_names_seqs,_ in itertools.groupby(n_tre_names_seqs))
290
291 # initialization of new managers
292 new_n_con_file_order=manager.list()
293 new_n_tre_file_order=manager.list()
294 n_new_deseq=manager.list()
295 n_new_deseq1=manager.list()
296
297 # add uncommon detected mirnas among the samples
298 ps_deseq=[Process(target=uncommon_mirnas,args=(sampp,n_con_names_seqs,lock,n_new_deseq,n_con_file_order[i],new_n_con_file_order)) for i,sampp in enumerate(n_deseq)]
299 ps_deseq.extend([Process(target=uncommon_mirnas,args=(sampp,n_tre_names_seqs,lock,n_new_deseq1,n_tre_file_order[i],new_n_tre_file_order)) for i,sampp in enumerate(n_deseq1)])
300
301 # Wait for processing of uncommon detected mirnas to finish
302 [x.start() for x in ps_deseq]
303 [x.join() for x in ps_deseq]
304
305 # Convert managers to lists
306 n_new_deseq=list(n_new_deseq)
307 n_new_deseq1=list(n_new_deseq1)
308 n_con_file_order=list(new_n_con_file_order)
309 n_tre_file_order=list(new_n_tre_file_order)
310
311 # Genereation of count matrices per group (controls - treated)
312 n_control_group=[[x[0],x[2]] for x in n_new_deseq[0]]
313 [n_control_group[i].append(y[i][1]) for i,_ in enumerate(n_control_group) for y in n_new_deseq]
314
315 n_treated_group=[[x[0],x[2]] for x in n_new_deseq1[0]]
316 [n_treated_group[i].append(y[i][1]) for i,_ in enumerate(n_treated_group) for y in n_new_deseq1]
317
318 # Keep a copy of count matrices
319 n_control_group_copy=copy.deepcopy(list(n_control_group))
320 n_treated_group_copy=copy.deepcopy(list(n_treated_group))
321
322 # Initialization of managers
323 merg_nam_n_control_group=manager.list()
324 merg_nam_n_treated_group=manager.list()
325
326 # Merging of different names for the same mirna sequence per group (controls, treated) to avoid duplicates
327 ps_merge = [Process(target=merging_names,args=(n_control_group_copy,merg_nam_n_control_group))]
328 ps_merge.extend([Process(target=merging_names,args=(n_treated_group_copy,merg_nam_n_treated_group))])
329 [x.start() for x in ps_merge]
330
331 # Add unique mirna sequences between groups (all groups will have the same amount of sequences)
332 n_con_list=manager.list()
333 n_tre_list=manager.list()
334
335 ps_bw = [Process(target=black_white,args=(n_con_names_seqs,n_tre_names_seqs,n_treated_group,n_tre_list))]
336 ps_bw.extend([Process(target=black_white,args=(n_tre_names_seqs,n_con_names_seqs,n_control_group,n_con_list))])
337 [x.start() for x in ps_bw]
338 [x.join() for x in ps_bw]
339
340 n_control_group=list(n_con_list)
341 n_treated_group=list(n_tre_list)
342
343 # Detection of duplications
344 n_dupes=manager.list()
345
346 ps_dupes = Process(target=merging_dupes,args=(n_control_group,n_dupes))
347 ps_dupes.start()
348 ps_dupes.join()
349
350 n_dupes=list(n_dupes)
351
352 # Merging the duplications in one entry with all different names
353 n_con_list=manager.list()
354 n_tre_list=manager.list()
355
356 ps_ap_merg_dupes = [Process(target=apply_merging_dupes,args=(n_control_group,n_dupes,n_con_list))]
357 ps_ap_merg_dupes.extend([Process(target=apply_merging_dupes,args=(n_treated_group,n_dupes,n_tre_list))])
358 [x.start() for x in ps_ap_merg_dupes]
359
360 # Finishing the process for merging
361 [x.join() for x in ps_merge]
362 merg_nam_n_control_group=list(merg_nam_n_control_group)
363 merg_nam_n_treated_group=list(merg_nam_n_treated_group)
364
365 # Export the database and the graphs
366 procs = [Process(target=DB_write,args=(x[0],x[1],x[2],x[3],2)) for x in n_con_data]
367 procs.extend([Process(target=DB_write,args=(x[0],x[1],x[2],x[3],2)) for x in n_tre_data])
368 procs.extend([Process(target=logo_seq_red,args=(merg_nam_n_control_group,'c',args.group1))])
369 procs.extend([Process(target=logo_seq_red,args=(merg_nam_n_treated_group,'t',args.group2))])
370 procs.extend([Process(target=pie_non_temp,args=(merg_nam_control_group,merg_nam_n_control_group,merg_nam_treated_group,merg_nam_n_treated_group,con_unmap_seq.value,tre_unmap_seq.value,con_unmap_counts.value,tre_unmap_counts.value,args.group1,args.group2))])
371
372 [p.start() for p in procs]
373 [p.join() for p in procs]
374
375 procs1 = Process(target=pdf_before_DE,args=(args.anal,args.group1,args.group2))
376 procs1.start()
377
378 [x.join() for x in ps_ap_merg_dupes]
379 n_control_group=list(n_con_list)
380 n_treated_group=list(n_tre_list)
381
382 # Filters low count mirnas (otpional)
383 if int(args.per)!=-1:
384
385 n_fil_con_group=manager.list()
386 n_fil_tre_group=manager.list()
387
388 ps_low_counts = Process(target=filter_low_counts,args=(n_control_group,n_treated_group,n_fil_con_group,n_fil_tre_group,args.per,args.count))
389 ps_low_counts.start()
390 ps_low_counts.join()
391
392 n_fil_con_group=list(n_fil_con_group)
393 n_fil_tre_group=list(n_fil_tre_group)
394
395 if "n_fil_con_group" not in locals() or "n_fil_con_group"not in globals():
396 n_fil_con_group=n_control_group
397 n_fil_tre_group=n_treated_group
398
399
400 ps_write = Process(target=write_main,args=(n_control_group, n_treated_group,n_fil_con_group, n_fil_tre_group, n_con_file_order, n_tre_file_order,2,args.group1,args.group2,args.per))
401 ps_write.start()
402
403 ps1_matrix = [Process(target=nontemp_counts_to_diff,args=(n_con_file_order,n_fil_con_group,con_file_order,fil_con_group,"Diff/n_temp_con/",0))]
404 ps1_matrix.extend([Process(target=nontemp_counts_to_diff,args=(n_tre_file_order,n_fil_tre_group,tre_file_order,fil_tre_group,"Diff/n_temp_tre/",0))])
405 [p.start() for p in ps1_matrix]
406
407 ps_write.join()
408 [p.join() for p in ps1_matrix]
409 procs1.join()
410 print('That took {} seconds'.format(time.time() - half))
411 print('That took {} seconds'.format(time.time() - starttime))
412
413
414
415
416
417
418
419