comparison mirbase.py @ 1:f10c8f43f010 draft

Uploaded
author glogobyte
date Wed, 13 Oct 2021 16:03:50 +0000
parents
children 6d3abc45aa49
comparison
equal deleted inserted replaced
0:63ae92b7f425 1:f10c8f43f010
1 from mirbase_functions import *
2 from mirbase_graphs import *
3 import time
4 from multiprocessing import Process, Queue, Lock, Pool, Manager, Value
5 import subprocess
6 import argparse
7 import sys
8
9 subprocess.call(['mkdir','-p', 'split1','split2','split3','split4','Counts','Diff/temp_con','Diff/temp_tre','Diff/n_temp_con','Diff/n_temp_tre'])
10
11 parser = argparse.ArgumentParser()
12 parser.add_argument("-analysis", "--anal", help="choose type of analysis", action="store")
13 parser.add_argument("-con", "--control", help="input fastq file (controls)", nargs='+', default=[])
14 parser.add_argument("-tre", "--treated", help="input fastq file (treated)", nargs='+', default=[] )
15 parser.add_argument("-tool_dir", "--tool_directory", help="tool directory path", action="store")
16 parser.add_argument("-gen", "--org_name", help="Organism", action="store")
17 parser.add_argument("-f", "--flag", help="choose the database (MirBase,MirGene)", action="store")
18 parser.add_argument("-percentage", "--per", help="Percentage of Samples", action="store")
19 parser.add_argument("-counts", "--count", help="Counts for filtering", action="store")
20 parser.add_argument("-name1", "--group1", help="Samples group 1", action="store")
21 parser.add_argument("-name2", "--group2", help="Samples group 2", action="store")
22 args = parser.parse_args()
23
24
25 #################################################################################################################################################################################################################
26
27 if __name__ == '__main__':
28
29 starttime = time.time()
30
31 lock = Lock()
32 manager = Manager()
33
34 # Download reference miRNA sequences from MirBase
35 mature_mirnas=manager.list()
36 ps_mature=Process(target=download_matures,args=(mature_mirnas,args.org_name))
37 ps_mature.start()
38
39
40 # Keep the names of the files and location paths
41 args.control[0]=args.control[0][1:]
42 args.control[len(args.control)-1][:-1]
43 control = [(args.control[i:i+2]) for i in range(0, len(args.control), 2)]
44
45
46 args.treated[0]=args.treated[0][1:]
47 args.treated[len(args.treated)-1][:-1]
48 treated = [(args.treated[i:i+2]) for i in range(0, len(args.treated), 2)]
49
50
51 ############## Detection of templated isoforms ################
52
53
54 # Initialization of the managers between the proccesses
55 # First group of samples (controls)
56 con_samples = manager.list() # Collapsed mirnas with the new names
57 con_data= manager.list() # keeps all necessary data for the Database
58 con_file_order=manager.list() # files' names ordered by processes
59 con_names_seqs=manager.list() # keeps only mirna names and sequences
60 deseq=manager.list()
61 con_unmap_seq=manager.Value('i',0) # keeps unmap unique sequnces for the generation of a graph
62 con_unmap_counts=manager.Value('i',0) # keeps unmap counts of sequnces for the generation of a graph
63 con_mirna_names=manager.list() # keeps the names of mirnas
64 ini_con_samples = manager.list() # filtered SAM files
65
66 # Second group of samples (treated)
67 tre_samples = manager.list()
68 tre_data = manager.list()
69 tre_file_order = manager.list()
70 tre_names_seqs=manager.list()
71 deseq1=manager.list()
72 tre_unmap_seq = manager.Value('i',0)
73 tre_unmap_counts = manager.Value('i',0)
74 tre_mirna_names=manager.list()
75 ini_tre_samples = manager.list()
76
77 # Wait for the download of reference miRNA sequences
78 ps_mature.join()
79 mature_mirnas=list(mature_mirnas)
80
81 # Processing of the detected miRNAs from SAM files
82 ps_sam = [Process(target=sam_edit,args=(mature_mirnas,path[1][:-1],path[0].split(",")[0],"c",lock,con_samples,con_data,con_file_order,con_unmap_seq,con_names_seqs,deseq,con_mirna_names,ini_con_samples,con_unmap_counts)) for path in control]
83 ps_sam.extend([Process(target=sam_edit,args=(mature_mirnas,path[1][:-1],path[0].split(",")[0],"t",lock,tre_samples,tre_data,tre_file_order,tre_unmap_seq,tre_names_seqs,deseq1,tre_mirna_names,ini_tre_samples,tre_unmap_counts)) for path in treated])
84
85 # Wait for processing of SAM files to finish
86 [p.start() for p in ps_sam]
87 [p.join() for p in ps_sam]
88
89 # Generate a histogram
90 ps_hist=[Process(target=hist_red,args=(ini_con_samples,'c',args.group1))]
91 ps_hist.extend([Process(target=hist_red,args=(ini_tre_samples,'t',args.group2))])
92 [x.start() for x in ps_hist]
93
94
95 # Convert managers to lists
96 con_samples = list(con_samples)
97 tre_samples = list(tre_samples)
98 con_file_order=list(con_file_order)
99 tre_file_order=list(tre_file_order)
100 deseq=list(deseq)
101 deseq1=list(deseq1)
102
103 # Remove duplicates and sorting
104 con_names_seqs=list(con_names_seqs)
105 con_names_seqs.sort()
106 con_names_seqs=list(con_names_seqs for con_names_seqs,_ in itertools.groupby(con_names_seqs))
107
108 tre_names_seqs=list(tre_names_seqs)
109 tre_names_seqs.sort()
110 tre_names_seqs=list(tre_names_seqs for tre_names_seqs,_ in itertools.groupby(tre_names_seqs))
111
112 # initialization of new managers
113 new_con_file_order=manager.list()
114 new_tre_file_order=manager.list()
115 new_deseq=manager.list()
116 new_deseq1=manager.list()
117
118 # add uncommon detected mirnas among the samples
119 ps_un_mirnas=[Process(target=uncommon_mirnas,args=(sampp,con_names_seqs,lock,new_deseq,con_file_order[i],new_con_file_order)) for i,sampp in enumerate(deseq)]
120 ps_un_mirnas.extend([Process(target=uncommon_mirnas,args=(sampp,tre_names_seqs,lock,new_deseq1,tre_file_order[i],new_tre_file_order)) for i,sampp in enumerate(deseq1)])
121
122 # Wait for processing of uncommon detected mirnas to finish
123 [z.start() for z in ps_un_mirnas]
124 [z.join() for z in ps_un_mirnas]
125
126 # Convert managers to lists
127 new_deseq=list(new_deseq)
128 new_deseq1=list(new_deseq1)
129 con_file_order=list(new_con_file_order)
130 tre_file_order=list(new_tre_file_order)
131
132 # Genereation of count matrices per group (controls - treated)
133 control_group=[[x[0],x[2]] for x in new_deseq[0]]
134 [control_group[i].append(y[i][1]) for i,_ in enumerate(control_group) for y in new_deseq]
135
136 treated_group=[[x[0],x[2]] for x in new_deseq1[0]]
137 [treated_group[i].append(y[i][1]) for i,_ in enumerate(treated_group) for y in new_deseq1]
138
139 # Keep a copy of count matrices
140 control_group_copy=copy.deepcopy(list(control_group))
141 treated_group_copy=copy.deepcopy(list(treated_group))
142
143 # Initialization of managers
144 merg_nam_control_group=manager.list()
145 merg_nam_treated_group=manager.list()
146
147 # Merging of names different names for the same mirna sequence per group (controls, treated) to avoid duplicates
148 ps_merge = [Process(target=merging_names,args=(control_group_copy,merg_nam_control_group))]
149 ps_merge.extend([Process(target=merging_names,args=(treated_group_copy,merg_nam_treated_group))])
150 [x.start() for x in ps_merge]
151
152
153 # Add unique mirna sequences between groups (all groups will have the same amount of sequences)
154 con_list=manager.list()
155 tre_list=manager.list()
156
157 ps_bw = [Process(target=black_white,args=(con_names_seqs,tre_names_seqs,treated_group,tre_list))]
158 ps_bw.extend([Process(target=black_white,args=(tre_names_seqs,con_names_seqs,control_group,con_list))])
159 [x.start() for x in ps_bw]
160 [x.join() for x in ps_bw]
161
162 control_group=list(con_list)
163 treated_group=list(tre_list)
164
165 # Detection of duplications
166 dupes=manager.list()
167
168 ps_dupes = Process(target=merging_dupes,args=(control_group,dupes))
169 ps_dupes.start()
170 ps_dupes.join()
171
172 dupes=list(dupes)
173
174 # Merging the duplications in one entry with all different names
175 con_list=manager.list()
176 tre_list=manager.list()
177
178 ps_ap_merg_dupes = [Process(target=apply_merging_dupes,args=(control_group,dupes,con_list))]
179 ps_ap_merg_dupes.extend([Process(target=apply_merging_dupes,args=(treated_group,dupes,tre_list))])
180 [x.start() for x in ps_ap_merg_dupes]
181
182 # Preparation of reference sequences (isodforms) for the detection of non template mirnas
183 if args.anal=="2":
184 all_iso = manager.list()
185 ps_non_iso = Process(target=non_template_ref,args=(con_samples,tre_samples,all_iso))
186 ps_non_iso.start()
187
188 # Finishing the process for merging
189 [x.join() for x in ps_merge]
190 merg_nam_control_group=list(merg_nam_control_group)
191 merg_nam_treated_group=list(merg_nam_treated_group)
192
193 # Export the database and the graphs
194 procs = [Process(target=DB_write,args=(x[0],x[1],x[2],x[3],1)) for x in con_data]
195 procs.extend([Process(target=DB_write,args=(x[0],x[1],x[2],x[3],1)) for x in tre_data])
196 procs.extend([Process(target=make_spider,args=(merg_nam_control_group,merg_nam_treated_group,args.group1,args.group2))])
197
198 if args.anal == "1":
199 procs.extend([Process(target=pie_temp,args=(merg_nam_control_group,con_unmap_seq.value,con_unmap_counts.value,merg_nam_treated_group,tre_unmap_seq.value,tre_unmap_counts.value,group_name1,group_name2))])
200
201 [p.start() for p in procs]
202
203 # Export the pdf report file
204 if args.anal=="1":
205 [x.join() for x in ps_hist]
206 [p.join() for p in procs]
207 ps_pdf = Process(target=pdf_before_DE,args=(args.anal,args.group1,args.group2))
208 ps_pdf.start()
209
210
211 [x.join() for x in ps_ap_merg_dupes]
212 control_group=list(con_list)
213 treated_group=list(tre_list)
214
215 # Filters low count mirnas (otpional)
216 if int(args.per)!=-1:
217
218 fil_con_group=manager.list()
219 fil_tre_group=manager.list()
220
221 ps_low_counts = Process(target=filter_low_counts,args=(control_group,treated_group,fil_con_group,fil_tre_group,args.per,args.count))
222 ps_low_counts.start()
223 ps_low_counts.join()
224
225 fil_con_group=list(fil_con_group)
226 fil_tre_group=list(fil_tre_group)
227
228 if "fil_con_group" not in locals() or "fil_con_group" not in globals():
229 fil_con_group=control_group
230 fil_tre_group=treated_group
231
232 # export count matrices
233 ps_write = Process(target=write_main,args=(control_group, treated_group, fil_con_group, fil_tre_group, con_file_order,tre_file_order,1,args.group1,args.group2,args.per))
234 ps_write.start()
235
236 # export counts files compatible with Deseq2 and EdgeR
237 ps1_matrix = [Process(target=temp_counts_to_diff,args=(con_file_order,fil_con_group,"Diff/temp_con/"))]
238 ps1_matrix.extend([Process(target=temp_counts_to_diff,args=(tre_file_order,fil_tre_group,"Diff/temp_tre/"))])
239 [p.start() for p in ps1_matrix]
240
241 if args.anal=="1":
242 ps_pdf.join()
243 if args.anal=="2":
244 [p.join() for p in procs]
245 [x.join() for x in ps_hist]
246
247 ps_write.join()
248 [p.join() for p in ps1_matrix]
249
250 ############################## Detection of non-template #######################################
251
252 starttime10 = time.time()
253
254 if args.anal == "2":
255
256 # Initialization of the managers between the proccesses
257 # First group of samples (controls)
258 n_con_data= manager.list()
259 n_con_file_order=manager.list()
260 n_con_names_seqs=manager.list()
261 n_deseq=manager.list()
262
263 # Second group of samples (treated)
264 n_tre_data = manager.list()
265 n_tre_file_order = manager.list()
266 n_tre_names_seqs=manager.list()
267 n_deseq1=manager.list()
268
269 # Preparation of reference sequences
270 new_ref_mirnas = list(mature_mirnas)
271 ps_non_iso.join()
272
273 all_iso=list(all_iso)
274 new_ref_mirnas.extend(all_iso)
275
276 # Processing of non template miRNAs from SAM files
277 ps_sam = [Process(target=non_sam_edit,args=(new_ref_mirnas,path[1][:-1],path[0].split(",")[0],"c",lock,n_con_data,n_con_file_order,n_deseq,n_con_names_seqs)) for path in control]
278 ps_sam.extend([Process(target=non_sam_edit,args=(new_ref_mirnas,path[1][:-1],path[0].split(",")[0],"t",lock,n_tre_data,n_tre_file_order,n_deseq1,n_tre_names_seqs)) for path in treated])
279
280 [p.start() for p in ps_sam]
281 [p.join() for p in ps_sam]
282
283 # Convert managers to lists
284 n_con_file_order=list(n_con_file_order)
285 n_tre_file_order=list(n_tre_file_order)
286 n_deseq=list(n_deseq)
287 n_deseq1=list(n_deseq1)
288
289 # Remove duplicates and sorting
290 n_con_names_seqs=list(n_con_names_seqs)
291 n_con_names_seqs.sort()
292 n_con_names_seqs=list(n_con_names_seqs for n_con_names_seqs,_ in itertools.groupby(n_con_names_seqs))
293
294 n_tre_names_seqs=list(n_tre_names_seqs)
295 n_tre_names_seqs.sort()
296 n_tre_names_seqs=list(n_tre_names_seqs for n_tre_names_seqs,_ in itertools.groupby(n_tre_names_seqs))
297
298 # initialization of new managers
299 new_n_con_file_order=manager.list()
300 new_n_tre_file_order=manager.list()
301 n_new_deseq=manager.list()
302 n_new_deseq1=manager.list()
303
304 # add uncommon detected mirnas among the samples
305 ps_deseq=[Process(target=uncommon_mirnas,args=(sampp,n_con_names_seqs,lock,n_new_deseq,n_con_file_order[i],new_n_con_file_order)) for i,sampp in enumerate(n_deseq)]
306 ps_deseq.extend([Process(target=uncommon_mirnas,args=(sampp,n_tre_names_seqs,lock,n_new_deseq1,n_tre_file_order[i],new_n_tre_file_order)) for i,sampp in enumerate(n_deseq1)])
307
308 # Wait for processing of uncommon detected mirnas to finish
309 [x.start() for x in ps_deseq]
310 [x.join() for x in ps_deseq]
311
312 # Convert managers to lists
313 n_new_deseq=list(n_new_deseq)
314 n_new_deseq1=list(n_new_deseq1)
315 n_con_file_order=list(new_n_con_file_order)
316 n_tre_file_order=list(new_n_tre_file_order)
317
318 # Genereation of count matrices per group (controls - treated)
319 n_control_group=[[x[0],x[2]] for x in n_new_deseq[0]]
320 [n_control_group[i].append(y[i][1]) for i,_ in enumerate(n_control_group) for y in n_new_deseq]
321
322 n_treated_group=[[x[0],x[2]] for x in n_new_deseq1[0]]
323 [n_treated_group[i].append(y[i][1]) for i,_ in enumerate(n_treated_group) for y in n_new_deseq1]
324
325 # Keep a copy of count matrices
326 n_control_group_copy=copy.deepcopy(list(n_control_group))
327 n_treated_group_copy=copy.deepcopy(list(n_treated_group))
328
329 # Initialization of managers
330 merg_nam_n_control_group=manager.list()
331 merg_nam_n_treated_group=manager.list()
332
333 # Merging of names different names for the same mirna sequence per group (controls, treated) to avoid duplicates\
334 ps_merge = [Process(target=merging_names,args=(n_control_group_copy,merg_nam_n_control_group))]
335 ps_merge.extend([Process(target=merging_names,args=(n_treated_group_copy,merg_nam_n_treated_group))])
336 [x.start() for x in ps_merge]
337
338 # Add unique mirna sequences between groups (all groups will have the same amount of sequences)
339 n_con_list=manager.list()
340 n_tre_list=manager.list()
341
342 ps_bw = [Process(target=black_white,args=(n_con_names_seqs,n_tre_names_seqs,n_treated_group,n_tre_list))]
343 ps_bw.extend([Process(target=black_white,args=(n_tre_names_seqs,n_con_names_seqs,n_control_group,n_con_list))])
344 [x.start() for x in ps_bw]
345 [x.join() for x in ps_bw]
346
347 n_control_group=list(n_con_list)
348 n_treated_group=list(n_tre_list)
349
350 # Detection of duplications
351 n_dupes=manager.list()
352
353 ps_dupes = Process(target=merging_dupes,args=(n_control_group,n_dupes))
354 ps_dupes.start()
355 ps_dupes.join()
356
357 n_dupes=list(n_dupes)
358
359 # Merging the duplications in one entry with all different names
360 n_con_list=manager.list()
361 n_tre_list=manager.list()
362
363 ps_ap_merg_dupes = [Process(target=apply_merging_dupes,args=(n_control_group,n_dupes,n_con_list))]
364 ps_ap_merg_dupes.extend([Process(target=apply_merging_dupes,args=(n_treated_group,n_dupes,n_tre_list))])
365 [x.start() for x in ps_ap_merg_dupes]
366
367 # Finishing the process for merging
368 [x.join() for x in ps_merge]
369 merg_nam_n_control_group=list(merg_nam_n_control_group)
370 merg_nam_n_treated_group=list(merg_nam_n_treated_group)
371
372 # Export the database and the graphs
373 procs = [Process(target=DB_write,args=(x[0],x[1],x[2],x[3],2)) for x in n_con_data]
374 procs.extend([Process(target=DB_write,args=(x[0],x[1],x[2],x[3],2)) for x in n_tre_data])
375 procs.extend([Process(target=logo_seq_red,args=(merg_nam_n_control_group,'c',args.group1))])
376 procs.extend([Process(target=logo_seq_red,args=(merg_nam_n_treated_group,'t',args.group2))])
377 procs.extend([Process(target=pie_non_temp,args=(merg_nam_control_group,merg_nam_n_control_group,merg_nam_treated_group,merg_nam_n_treated_group,con_unmap_seq.value,tre_unmap_seq.value,con_unmap_counts.value,tre_unmap_counts.value,args.group1,args.group2))])
378
379 [p.start() for p in procs]
380 [p.join() for p in procs]
381
382 procs1 = Process(target=pdf_before_DE,args=(args.anal,args.group1,args.group2))
383 procs1.start()
384
385 [x.join() for x in ps_ap_merg_dupes]
386 n_control_group=list(n_con_list)
387 n_treated_group=list(n_tre_list)
388
389
390 # Filters low count mirnas (otpional)
391 if int(args.per)!=-1:
392
393 n_fil_con_group=manager.list()
394 n_fil_tre_group=manager.list()
395
396 ps_low_counts = Process(target=filter_low_counts,args=(n_control_group,n_treated_group,n_fil_con_group,n_fil_tre_group,args.per,args.count))
397 ps_low_counts.start()
398 ps_low_counts.join()
399
400 n_fil_con_group=list(n_fil_con_group)
401 n_fil_tre_group=list(n_fil_tre_group)
402
403 if "n_fil_con_group" not in locals() or "n_fil_con_group" not in globals():
404 n_fil_con_group=n_control_group
405 n_fil_tre_group=n_treated_group
406
407 ps_write = Process(target=write_main,args=(n_control_group, n_treated_group,n_fil_con_group, n_fil_tre_group, n_con_file_order, n_tre_file_order,2,args.group1,args.group2,args.per))
408 ps_write.start()
409
410 ps1_matrix = [Process(target=nontemp_counts_to_diff,args=(n_con_file_order,n_fil_con_group,con_file_order,fil_con_group,"Diff/n_temp_con/"))]
411 ps1_matrix.extend([Process(target=nontemp_counts_to_diff,args=(n_tre_file_order,n_fil_tre_group,tre_file_order,fil_tre_group,"Diff/n_temp_tre/"))])
412 [p.start() for p in ps1_matrix]
413
414 ps_write.join()
415 [p.join() for p in ps1_matrix]
416 procs1.join()
417 print('That took {} seconds'.format(time.time() - starttime10))
418 print('That took {} seconds'.format(time.time() - starttime))
419