mirgene.py @ 4:f44185f616bc draft
"Uploaded"
author:   glogobyte
date:     Wed, 13 Oct 2021 16:04:40 +0000
parents:
children: 476121298816
comparison: 3:d77b33e65501 vs 4:f44185f616bc
from mirgene_functions import *
from mirgene_graphs import *
import time
import itertools   # used below for duplicate removal (sort + groupby); may also be provided via the star imports
import copy        # used below for deep copies of the count matrices; may also be provided via the star imports
from multiprocessing import Process, Queue, Lock, Pool, Manager, Value
import subprocess
import argparse

subprocess.call(['mkdir','-p','split1','split2','split3','split4','Counts','Diff/temp_con','Diff/temp_tre','Diff/n_temp_con','Diff/n_temp_tre'])

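# Note: the call above shells out to 'mkdir -p' to create the working directories.
# A pure-Python equivalent (not used here, shown only for comparison) would be:
#   for d in ('split1', 'split2', 'split3', 'split4', 'Counts',
#             'Diff/temp_con', 'Diff/temp_tre', 'Diff/n_temp_con', 'Diff/n_temp_tre'):
#       os.makedirs(d, exist_ok=True)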
parser = argparse.ArgumentParser()
parser.add_argument("-analysis", "--anal", help="choose type of analysis", action="store")
parser.add_argument("-con", "--control", help="input fastq file (controls)", nargs='+', default=[])
parser.add_argument("-tre", "--treated", help="input fastq file (treated)", nargs='+', default=[])
parser.add_argument("-tool_dir", "--tool_directory", help="tool directory path", action="store")
parser.add_argument("-gen", "--org_name", help="Organism", action="store")
parser.add_argument("-f", "--flag", help="choose the database (MirBase,MirGene)", action="store")
parser.add_argument("-percentage", "--per", help="Percentage of Samples", action="store")
parser.add_argument("-counts", "--count", help="Counts for filtering", action="store")
parser.add_argument("-name1", "--group1", help="Samples group 1", action="store")
parser.add_argument("-name2", "--group2", help="Samples group 2", action="store")
args = parser.parse_args()

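# A hypothetical invocation (illustrative values only; in practice the Galaxy
# wrapper builds this command line, and the exact quoting of -con/-tre is
# dictated by that wrapper):
#   python mirgene.py -analysis 2 \
#       -con "[c1,Controls sam/c1.sam, c2,Controls sam/c2.sam]" \
#       -tre "[t1,Treated sam/t1.sam, t2,Treated sam/t2.sam]" \
#       -tool_dir /path/to/tool -gen hsa -f MirGene \
#       -percentage 50 -counts 10 -name1 Controls -name2 Treated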

#######################################################################################################################################################################

if __name__ == '__main__':

    starttime = time.time()

    lock = Lock()
    manager = Manager()

    # Download reference miRNA sequences from MirBase
    mature_mirnas=manager.list()
    ps_mature=Process(target=download_matures,args=(mature_mirnas,args.org_name))
    ps_mature.start()

    # Keep the names of the files and location paths.
    # NOTE: the bare slice expressions below (...[len(...)-1][:-1]) have no effect
    # because their result is never assigned; the trailing character of each path
    # is actually stripped later, where the paths are used as path[1][:-1].
    args.control[0]=args.control[0][1:]
    args.control[len(args.control)-1][:-1]
    control = [(args.control[i:i+2]) for i in range(0, len(args.control), 2)]

    args.treated[0]=args.treated[0][1:]
    args.treated[len(args.treated)-1][:-1]
    treated = [(args.treated[i:i+2]) for i in range(0, len(args.treated), 2)]

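    # After the pairing above, 'control' and 'treated' are lists of two-element
    # lists, presumably [<"name,label">, <path to SAM file>], e.g. something like
    #   control == [['c1,Controls', 'sam/c1.sam,'], ['c2,Controls', 'sam/c2.sam]']]
    # (illustrative values only; the exact element format depends on the wrapper's quoting).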

    ############## Detection of templated isoforms ################

    #radar = manager.list([0,0,0,0])
    con_samples = manager.list()
    con_data= manager.list()
    con_file_order=manager.list()
    con_names_seqs=manager.list()
    deseq=manager.list()
    con_unmap_seq=manager.Value('i',0)
    con_unmap_counts=manager.Value('i',0)
    con_mirna_names=manager.list()
    ini_con_samples = manager.list()

    #radar1 = manager.list([0,0,0,0])
    tre_samples = manager.list()
    tre_data = manager.list()
    tre_file_order = manager.list()
    tre_names_seqs=manager.list()
    deseq1=manager.list()
    tre_unmap_seq = manager.Value('i',0)
    tre_unmap_counts = manager.Value('i',0)
    tre_mirna_names=manager.list()
    ini_tre_samples = manager.list()

    # Wait for the download of reference miRNA sequences
    ps_mature.join()
    mature_mirnas=list(mature_mirnas)

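    # Converting the manager-backed list to a plain list here presumably avoids
    # proxy-object overhead in the workers started below: once the reference
    # sequences are downloaded they are only read, so each child process can be
    # handed an ordinary list instead of going through the Manager.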
    # Processing of the detected miRNAs from SAM files
    ps_sam = [Process(target=sam_edit,args=(mature_mirnas,path[1][:-1],path[0].split(",")[0],"c",lock,con_samples,con_data,con_file_order,con_unmap_seq,con_names_seqs,deseq,con_mirna_names,ini_con_samples,con_unmap_counts)) for path in control]
    ps_sam.extend([Process(target=sam_edit,args=(mature_mirnas,path[1][:-1],path[0].split(",")[0],"t",lock,tre_samples,tre_data,tre_file_order,tre_unmap_seq,tre_names_seqs,deseq1,tre_mirna_names,ini_tre_samples,tre_unmap_counts)) for path in treated])

    # Start the SAM-processing workers and wait for them to finish
    [p.start() for p in ps_sam]
    [p.join() for p in ps_sam]

    # Generate a histogram
    ps_hist=[Process(target=hist_red,args=(ini_con_samples,'c',args.group1))]
    ps_hist.extend([Process(target=hist_red,args=(ini_tre_samples,'t',args.group2))])
    [x.start() for x in ps_hist]

    # Convert managers to lists
    con_samples = list(con_samples)
    tre_samples = list(tre_samples)
    con_file_order=list(con_file_order)
    tre_file_order=list(tre_file_order)
    deseq=list(deseq)
    deseq1=list(deseq1)

    # Sort and remove duplicates
    con_names_seqs=list(con_names_seqs)
    con_names_seqs.sort()
    con_names_seqs=list(con_names_seqs for con_names_seqs,_ in itertools.groupby(con_names_seqs))

    tre_names_seqs=list(tre_names_seqs)
    tre_names_seqs.sort()
    tre_names_seqs=list(tre_names_seqs for tre_names_seqs,_ in itertools.groupby(tre_names_seqs))

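    # The sort() before itertools.groupby() is what makes the deduplication work:
    # groupby only merges adjacent equal items. An equivalent one-liner would be
    #   unique = [key for key, _ in itertools.groupby(sorted(entries))]
    # (shown for illustration only; the code above keeps the original idiom).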
    # Initialization of new managers
    new_con_file_order=manager.list()
    new_tre_file_order=manager.list()
    new_deseq=manager.list()
    new_deseq1=manager.list()

    # Add the uncommonly detected miRNAs to every sample (so that all samples share the same miRNA set)
    ps_un_mirnas=[Process(target=uncommon_mirnas,args=(sampp,con_names_seqs,lock,new_deseq,con_file_order[i],new_con_file_order)) for i,sampp in enumerate(deseq)]
    ps_un_mirnas.extend([Process(target=uncommon_mirnas,args=(sampp,tre_names_seqs,lock,new_deseq1,tre_file_order[i],new_tre_file_order)) for i,sampp in enumerate(deseq1)])

    # Wait for processing of uncommonly detected miRNAs to finish
    [z.start() for z in ps_un_mirnas]
    [z.join() for z in ps_un_mirnas]

    # Convert managers to lists
    new_deseq=list(new_deseq)
    new_deseq1=list(new_deseq1)
    con_file_order=list(new_con_file_order)
    tre_file_order=list(new_tre_file_order)

    # Generation of count matrices per group (controls - treated)
    control_group=[[x[0],x[2]] for x in new_deseq[0]]
    [control_group[i].append(y[i][1]) for i,_ in enumerate(control_group) for y in new_deseq]

    treated_group=[[x[0],x[2]] for x in new_deseq1[0]]
    [treated_group[i].append(y[i][1]) for i,_ in enumerate(treated_group) for y in new_deseq1]

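    # The two comprehensions above build one row per miRNA. Judging from the
    # indices used (x[0], x[2], y[i][1]), each row presumably ends up as
    #   [miRNA name, sequence, count_sample1, count_sample2, ...]
    # with one count column appended per sample of the group (an assumption based
    # on how new_deseq / new_deseq1 are indexed, not on the helper's source).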
    # Keep a copy of count matrices
    control_group_copy=copy.deepcopy(list(control_group))
    treated_group_copy=copy.deepcopy(list(treated_group))

    # Initialization of managers
    merg_nam_control_group=manager.list()
    merg_nam_treated_group=manager.list()

    # Merge different names for the same miRNA sequence per group (controls, treated) to avoid duplicates
    ps_merge = [Process(target=merging_names,args=(control_group_copy,merg_nam_control_group))]
    ps_merge.extend([Process(target=merging_names,args=(treated_group_copy,merg_nam_treated_group))])
    [x.start() for x in ps_merge]

    # Add the miRNA sequences that are unique to one group to the other group as well
    # (so that both groups contain the same number of sequences)
    con_list=manager.list()
    tre_list=manager.list()

    ps_bw = [Process(target=black_white,args=(con_names_seqs,tre_names_seqs,treated_group,tre_list))]
    ps_bw.extend([Process(target=black_white,args=(tre_names_seqs,con_names_seqs,control_group,con_list))])
    [x.start() for x in ps_bw]
    [x.join() for x in ps_bw]

    control_group=list(con_list)
    treated_group=list(tre_list)

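    # Making both groups carry the same set of sequences is presumably what allows
    # the per-group count matrices to be combined later into the count files for
    # DESeq2/edgeR, where every feature (miRNA) needs a count in every sample.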
    # Detection of duplications
    dupes=manager.list()

    ps_dupes = Process(target=merging_dupes,args=(control_group,dupes))
    ps_dupes.start()
    ps_dupes.join()

    dupes=list(dupes)

    # Merge each set of duplicates into one entry that carries all the different names
    con_list=manager.list()
    tre_list=manager.list()

    ps_ap_merg_dupes = [Process(target=apply_merging_dupes,args=(control_group,dupes,con_list))]
    ps_ap_merg_dupes.extend([Process(target=apply_merging_dupes,args=(treated_group,dupes,tre_list))])
    [x.start() for x in ps_ap_merg_dupes]

    # Preparation of reference sequences (isoforms) for the detection of non-templated miRNAs
    if args.anal=="2":
        all_iso = manager.list()
        ps_non_iso = Process(target=non_template_ref,args=(con_samples,tre_samples,all_iso))
        ps_non_iso.start()

    # Wait for the merging processes to finish
    [x.join() for x in ps_merge]
    merg_nam_control_group=list(merg_nam_control_group)
    merg_nam_treated_group=list(merg_nam_treated_group)

    # Export the database and the graphs
    procs = [Process(target=DB_write,args=(x[0],x[1],x[2],x[3],1)) for x in con_data]
    procs.extend([Process(target=DB_write,args=(x[0],x[1],x[2],x[3],1)) for x in tre_data])
    procs.extend([Process(target=make_spider,args=(merg_nam_control_group,merg_nam_treated_group,args.group1,args.group2))])

    if args.anal == "1":
        procs.extend([Process(target=pie_temp,args=(merg_nam_control_group,con_unmap_seq.value,con_unmap_counts.value,merg_nam_treated_group,tre_unmap_seq.value,tre_unmap_counts.value,args.group1,args.group2))])

    [p.start() for p in procs]

    # Export the pdf report file
    if args.anal=="1":
        [x.join() for x in ps_hist]
        [p.join() for p in procs]
        ps_pdf = Process(target=pdf_before_DE,args=(args.anal,args.group1,args.group2))
        ps_pdf.start()

    [x.join() for x in ps_ap_merg_dupes]
    control_group=list(con_list)
    treated_group=list(tre_list)

    # Filter low-count miRNAs (optional)
    if int(args.per)!=-1:

        fil_con_group=manager.list()
        fil_tre_group=manager.list()

        ps_low_counts = Process(target=filter_low_counts,args=(control_group,treated_group,fil_con_group,fil_tre_group,args.per,args.count))
        ps_low_counts.start()
        ps_low_counts.join()

        fil_con_group=list(fil_con_group)
        fil_tre_group=list(fil_tre_group)

    # Fall back to the unfiltered matrices when filtering was skipped
    # (the original check used 'or'; at module scope locals() and globals() are the
    # same mapping, so 'and' preserves the behaviour while stating the intent)
    if "fil_con_group" not in locals() and "fil_con_group" not in globals():
        fil_con_group=control_group
        fil_tre_group=treated_group

    # Export count matrices
    ps_write = Process(target=write_main,args=(control_group, treated_group, fil_con_group, fil_tre_group, con_file_order,tre_file_order,1,args.group1,args.group2,args.per))
    ps_write.start()

    # Export count files compatible with DESeq2 and edgeR
    ps1_matrix = [Process(target=temp_counts_to_diff,args=(con_file_order,fil_con_group,"Diff/temp_con/",0))]
    ps1_matrix.extend([Process(target=temp_counts_to_diff,args=(tre_file_order,fil_tre_group,"Diff/temp_tre/",0))])
    [p.start() for p in ps1_matrix]

    if args.anal=="1":
        ps_pdf.join()
    if args.anal=="2":
        [p.join() for p in procs]
        [x.join() for x in ps_hist]

    ps_write.join()
    [p.join() for p in ps1_matrix]

    ############################## Detection of Both #######################################

    half = time.time()

    if args.anal == "2":

        # Initialization of the managers shared between the processes
        # First group of samples (controls)
        n_con_data= manager.list()
        n_con_file_order=manager.list()
        n_con_names_seqs=manager.list()
        n_deseq=manager.list()

        # Second group of samples (treated)
        n_tre_data = manager.list()
        n_tre_file_order = manager.list()
        n_tre_names_seqs=manager.list()
        n_deseq1=manager.list()

        # Preparation of reference sequences
        new_ref_mirnas = list(mature_mirnas)
        ps_non_iso.join()

        all_iso=list(all_iso)
        new_ref_mirnas.extend(all_iso)

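        # At this point new_ref_mirnas presumably holds the canonical mature miRNA
        # sequences plus the templated isoforms collected by non_template_ref(),
        # so that the SAM re-processing below (non_sam_edit) can identify
        # non-templated variants relative to this extended reference set
        # (an inference from the surrounding comments; the helper's source lives
        # in mirgene_functions).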
        # Processing of non-templated miRNAs from SAM files
        ps_sam = [Process(target=non_sam_edit,args=(new_ref_mirnas,path[1][:-1],path[0].split(",")[0],"c",lock,n_con_data,n_con_file_order,n_deseq,n_con_names_seqs)) for path in control]
        ps_sam.extend([Process(target=non_sam_edit,args=(new_ref_mirnas,path[1][:-1],path[0].split(",")[0],"t",lock,n_tre_data,n_tre_file_order,n_deseq1,n_tre_names_seqs)) for path in treated])

        [p.start() for p in ps_sam]
        [p.join() for p in ps_sam]

        # Convert managers to lists
        n_con_file_order=list(n_con_file_order)
        n_tre_file_order=list(n_tre_file_order)
        n_deseq=list(n_deseq)
        n_deseq1=list(n_deseq1)

        # Sort and remove duplicates
        n_con_names_seqs=list(n_con_names_seqs)
        n_con_names_seqs.sort()
        n_con_names_seqs=list(n_con_names_seqs for n_con_names_seqs,_ in itertools.groupby(n_con_names_seqs))

        n_tre_names_seqs=list(n_tre_names_seqs)
        n_tre_names_seqs.sort()
        n_tre_names_seqs=list(n_tre_names_seqs for n_tre_names_seqs,_ in itertools.groupby(n_tre_names_seqs))

        # Initialization of new managers
        new_n_con_file_order=manager.list()
        new_n_tre_file_order=manager.list()
        n_new_deseq=manager.list()
        n_new_deseq1=manager.list()

        # Add the uncommonly detected miRNAs to every sample
        ps_deseq=[Process(target=uncommon_mirnas,args=(sampp,n_con_names_seqs,lock,n_new_deseq,n_con_file_order[i],new_n_con_file_order)) for i,sampp in enumerate(n_deseq)]
        ps_deseq.extend([Process(target=uncommon_mirnas,args=(sampp,n_tre_names_seqs,lock,n_new_deseq1,n_tre_file_order[i],new_n_tre_file_order)) for i,sampp in enumerate(n_deseq1)])

        # Wait for processing of uncommonly detected miRNAs to finish
        [x.start() for x in ps_deseq]
        [x.join() for x in ps_deseq]

        # Convert managers to lists
        n_new_deseq=list(n_new_deseq)
        n_new_deseq1=list(n_new_deseq1)
        n_con_file_order=list(new_n_con_file_order)
        n_tre_file_order=list(new_n_tre_file_order)

        # Generation of count matrices per group (controls - treated)
        n_control_group=[[x[0],x[2]] for x in n_new_deseq[0]]
        [n_control_group[i].append(y[i][1]) for i,_ in enumerate(n_control_group) for y in n_new_deseq]

        n_treated_group=[[x[0],x[2]] for x in n_new_deseq1[0]]
        [n_treated_group[i].append(y[i][1]) for i,_ in enumerate(n_treated_group) for y in n_new_deseq1]

        # Keep a copy of count matrices
        n_control_group_copy=copy.deepcopy(list(n_control_group))
        n_treated_group_copy=copy.deepcopy(list(n_treated_group))

        # Initialization of managers
        merg_nam_n_control_group=manager.list()
        merg_nam_n_treated_group=manager.list()

        # Merge different names for the same miRNA sequence per group (controls, treated) to avoid duplicates
        ps_merge = [Process(target=merging_names,args=(n_control_group_copy,merg_nam_n_control_group))]
        ps_merge.extend([Process(target=merging_names,args=(n_treated_group_copy,merg_nam_n_treated_group))])
        [x.start() for x in ps_merge]

        # Add the miRNA sequences that are unique to one group to the other group as well
        # (so that both groups contain the same number of sequences)
        n_con_list=manager.list()
        n_tre_list=manager.list()

        ps_bw = [Process(target=black_white,args=(n_con_names_seqs,n_tre_names_seqs,n_treated_group,n_tre_list))]
        ps_bw.extend([Process(target=black_white,args=(n_tre_names_seqs,n_con_names_seqs,n_control_group,n_con_list))])
        [x.start() for x in ps_bw]
        [x.join() for x in ps_bw]

        n_control_group=list(n_con_list)
        n_treated_group=list(n_tre_list)

        # Detection of duplications
        n_dupes=manager.list()

        ps_dupes = Process(target=merging_dupes,args=(n_control_group,n_dupes))
        ps_dupes.start()
        ps_dupes.join()

        n_dupes=list(n_dupes)

        # Merge each set of duplicates into one entry that carries all the different names
        n_con_list=manager.list()
        n_tre_list=manager.list()

        ps_ap_merg_dupes = [Process(target=apply_merging_dupes,args=(n_control_group,n_dupes,n_con_list))]
        ps_ap_merg_dupes.extend([Process(target=apply_merging_dupes,args=(n_treated_group,n_dupes,n_tre_list))])
        [x.start() for x in ps_ap_merg_dupes]

        # Wait for the merging processes to finish
        [x.join() for x in ps_merge]
        merg_nam_n_control_group=list(merg_nam_n_control_group)
        merg_nam_n_treated_group=list(merg_nam_n_treated_group)

        # Export the database and the graphs
        procs = [Process(target=DB_write,args=(x[0],x[1],x[2],x[3],2)) for x in n_con_data]
        procs.extend([Process(target=DB_write,args=(x[0],x[1],x[2],x[3],2)) for x in n_tre_data])
        procs.extend([Process(target=logo_seq_red,args=(merg_nam_n_control_group,'c',args.group1))])
        procs.extend([Process(target=logo_seq_red,args=(merg_nam_n_treated_group,'t',args.group2))])
        procs.extend([Process(target=pie_non_temp,args=(merg_nam_control_group,merg_nam_n_control_group,merg_nam_treated_group,merg_nam_n_treated_group,con_unmap_seq.value,tre_unmap_seq.value,con_unmap_counts.value,tre_unmap_counts.value,args.group1,args.group2))])

        [p.start() for p in procs]
        [p.join() for p in procs]

        procs1 = Process(target=pdf_before_DE,args=(args.anal,args.group1,args.group2))
        procs1.start()

        [x.join() for x in ps_ap_merg_dupes]
        n_control_group=list(n_con_list)
        n_treated_group=list(n_tre_list)

        # Filter low-count miRNAs (optional)
        if int(args.per)!=-1:

            n_fil_con_group=manager.list()
            n_fil_tre_group=manager.list()

            ps_low_counts = Process(target=filter_low_counts,args=(n_control_group,n_treated_group,n_fil_con_group,n_fil_tre_group,args.per,args.count))
            ps_low_counts.start()
            ps_low_counts.join()

            n_fil_con_group=list(n_fil_con_group)
            n_fil_tre_group=list(n_fil_tre_group)

        # Fall back to the unfiltered matrices when filtering was skipped
        if "n_fil_con_group" not in locals() and "n_fil_con_group" not in globals():
            n_fil_con_group=n_control_group
            n_fil_tre_group=n_treated_group


        ps_write = Process(target=write_main,args=(n_control_group, n_treated_group,n_fil_con_group, n_fil_tre_group, n_con_file_order, n_tre_file_order,2,args.group1,args.group2,args.per))
        ps_write.start()

        ps1_matrix = [Process(target=nontemp_counts_to_diff,args=(n_con_file_order,n_fil_con_group,con_file_order,fil_con_group,"Diff/n_temp_con/",0))]
        ps1_matrix.extend([Process(target=nontemp_counts_to_diff,args=(n_tre_file_order,n_fil_tre_group,tre_file_order,fil_tre_group,"Diff/n_temp_tre/",0))])
        [p.start() for p in ps1_matrix]

        ps_write.join()
        [p.join() for p in ps1_matrix]
        procs1.join()
        print('That took {} seconds'.format(time.time() - half))

    print('That took {} seconds'.format(time.time() - starttime))