comparison: mirbase.py @ 1:f10c8f43f010 draft ("Uploaded")
author:    glogobyte
date:      Wed, 13 Oct 2021 16:03:50 +0000
parents:   (none)
children:  6d3abc45aa49
compared revisions: 0:63ae92b7f425 -> 1:f10c8f43f010

from mirbase_functions import *
from mirbase_graphs import *
import time
import itertools  # used below for duplicate removal; imported explicitly in case the star imports above do not re-export it
import copy       # used below for deep copies of the count matrices; same reason as itertools
from multiprocessing import Process, Queue, Lock, Pool, Manager, Value
import subprocess
import argparse
import sys

subprocess.call(['mkdir','-p', 'split1','split2','split3','split4','Counts','Diff/temp_con','Diff/temp_tre','Diff/n_temp_con','Diff/n_temp_tre'])
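# The call above creates the working directory tree used for intermediate and final outputs
# (the -p flag creates any missing parent directories and ignores directories that already exist).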

parser = argparse.ArgumentParser()
parser.add_argument("-analysis", "--anal", help="choose type of analysis", action="store")
parser.add_argument("-con", "--control", help="input fastq file (controls)", nargs='+', default=[])
parser.add_argument("-tre", "--treated", help="input fastq file (treated)", nargs='+', default=[])
parser.add_argument("-tool_dir", "--tool_directory", help="tool directory path", action="store")
parser.add_argument("-gen", "--org_name", help="Organism", action="store")
parser.add_argument("-f", "--flag", help="choose the database (MirBase,MirGene)", action="store")
parser.add_argument("-percentage", "--per", help="Percentage of Samples", action="store")
parser.add_argument("-counts", "--count", help="Counts for filtering", action="store")
parser.add_argument("-name1", "--group1", help="Samples group 1", action="store")
parser.add_argument("-name2", "--group2", help="Samples group 2", action="store")
args = parser.parse_args()
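# Hypothetical invocation (all values below are placeholders, shown only to illustrate the flags
# defined above; in practice the arguments are supplied by the wrapping tool):
#   python mirbase.py -analysis 2 -con <control name/path list> -tre <treated name/path list> \
#       -tool_dir <tool directory> -gen hsa -f MirBase -percentage 50 -counts 10 \
#       -name1 controls -name2 treated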


#################################################################################################################################################################################################################

if __name__ == '__main__':

    starttime = time.time()

    lock = Lock()
    manager = Manager()

    # Download the reference miRNA sequences from miRBase
    mature_mirnas=manager.list()
    ps_mature=Process(target=download_matures,args=(mature_mirnas,args.org_name))
    ps_mature.start()


    # Keep the names of the files and their location paths
    args.control[0]=args.control[0][1:]
    args.control[len(args.control)-1][:-1]   # note: this expression has no effect (its result is never assigned); the trailing character is stripped later, at the point of use
    control = [(args.control[i:i+2]) for i in range(0, len(args.control), 2)]


    args.treated[0]=args.treated[0][1:]
    args.treated[len(args.treated)-1][:-1]   # same as above: the result is discarded
    treated = [(args.treated[i:i+2]) for i in range(0, len(args.treated), 2)]
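    # Hypothetical shape of the input (illustrative only): args.control arrives as a flat list such as
    # ['[c1.sam,', 'path/c1 ', 'c2.sam,', 'path/c2]']; after the leading '[' is stripped it is grouped
    # into consecutive [name, path] pairs, e.g. [['c1.sam,', 'path/c1 '], ['c2.sam,', 'path/c2]']].
    # The trailing character of each path is removed later, where the pair is used (path[1][:-1]).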


    ############## Detection of templated isoforms ################


    # Initialization of the managers shared between the processes
    # First group of samples (controls)
    con_samples = manager.list()            # collapsed mirnas with the new names
    con_data= manager.list()                # keeps all necessary data for the database
    con_file_order=manager.list()           # files' names, ordered by process
    con_names_seqs=manager.list()           # keeps only mirna names and sequences
    deseq=manager.list()
    con_unmap_seq=manager.Value('i',0)      # number of unmapped unique sequences (used for a graph)
    con_unmap_counts=manager.Value('i',0)   # counts of unmapped sequences (used for a graph)
    con_mirna_names=manager.list()          # keeps the names of the mirnas
    ini_con_samples = manager.list()        # filtered SAM files

    # Second group of samples (treated)
    tre_samples = manager.list()
    tre_data = manager.list()
    tre_file_order = manager.list()
    tre_names_seqs=manager.list()
    deseq1=manager.list()
    tre_unmap_seq = manager.Value('i',0)
    tre_unmap_counts = manager.Value('i',0)
    tre_mirna_names=manager.list()
    ini_tre_samples = manager.list()

    # Wait for the download of the reference miRNA sequences to finish
    ps_mature.join()
    mature_mirnas=list(mature_mirnas)

    # Processing of the detected miRNAs from the SAM files
    ps_sam = [Process(target=sam_edit,args=(mature_mirnas,path[1][:-1],path[0].split(",")[0],"c",lock,con_samples,con_data,con_file_order,con_unmap_seq,con_names_seqs,deseq,con_mirna_names,ini_con_samples,con_unmap_counts)) for path in control]
    ps_sam.extend([Process(target=sam_edit,args=(mature_mirnas,path[1][:-1],path[0].split(",")[0],"t",lock,tre_samples,tre_data,tre_file_order,tre_unmap_seq,tre_names_seqs,deseq1,tre_mirna_names,ini_tre_samples,tre_unmap_counts)) for path in treated])
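    # For every [name, path] pair, path[0].split(",")[0] gives the sample name and path[1][:-1] the SAM
    # file location with its trailing character dropped; "c"/"t" marks the group (control/treated),
    # presumably so that sam_edit (defined in mirbase_functions) can tell the two groups apart.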

    # Start the SAM-file processing jobs and wait for them to finish
    [p.start() for p in ps_sam]
    [p.join() for p in ps_sam]

    # Generate a histogram per group
    ps_hist=[Process(target=hist_red,args=(ini_con_samples,'c',args.group1))]
    ps_hist.extend([Process(target=hist_red,args=(ini_tre_samples,'t',args.group2))])
    [x.start() for x in ps_hist]


    # Convert managers to lists
    con_samples = list(con_samples)
    tre_samples = list(tre_samples)
    con_file_order=list(con_file_order)
    tre_file_order=list(tre_file_order)
    deseq=list(deseq)
    deseq1=list(deseq1)

    # Remove duplicates and sort
    con_names_seqs=list(con_names_seqs)
    con_names_seqs.sort()
    con_names_seqs=list(con_names_seqs for con_names_seqs,_ in itertools.groupby(con_names_seqs))

    tre_names_seqs=list(tre_names_seqs)
    tre_names_seqs.sort()
    tre_names_seqs=list(tre_names_seqs for tre_names_seqs,_ in itertools.groupby(tre_names_seqs))
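    # sort() followed by itertools.groupby() keeps one copy of each [name, sequence] entry, e.g.
    # [['mir-1','ACG'], ['mir-1','ACG'], ['mir-2','GGU']] -> [['mir-1','ACG'], ['mir-2','GGU']]
    # (groupby only collapses adjacent equal items, which is why the lists are sorted first).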

    # Initialization of new managers
    new_con_file_order=manager.list()
    new_tre_file_order=manager.list()
    new_deseq=manager.list()
    new_deseq1=manager.list()

    # Add the mirnas that were not detected in every sample, so that all samples share the same entries
    ps_un_mirnas=[Process(target=uncommon_mirnas,args=(sampp,con_names_seqs,lock,new_deseq,con_file_order[i],new_con_file_order)) for i,sampp in enumerate(deseq)]
    ps_un_mirnas.extend([Process(target=uncommon_mirnas,args=(sampp,tre_names_seqs,lock,new_deseq1,tre_file_order[i],new_tre_file_order)) for i,sampp in enumerate(deseq1)])

    # Wait for the processing of the uncommon mirnas to finish
    [z.start() for z in ps_un_mirnas]
    [z.join() for z in ps_un_mirnas]

    # Convert managers to lists
    new_deseq=list(new_deseq)
    new_deseq1=list(new_deseq1)
    con_file_order=list(new_con_file_order)
    tre_file_order=list(new_tre_file_order)

    # Generation of count matrices per group (controls - treated)
    control_group=[[x[0],x[2]] for x in new_deseq[0]]
    [control_group[i].append(y[i][1]) for i,_ in enumerate(control_group) for y in new_deseq]

    treated_group=[[x[0],x[2]] for x in new_deseq1[0]]
    [treated_group[i].append(y[i][1]) for i,_ in enumerate(treated_group) for y in new_deseq1]
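    # Each row of control_group / treated_group now has the form
    # [name, sequence, counts in sample 1, counts in sample 2, ...]
    # (x[0] and x[2] are presumably the miRNA name and its sequence; one count column is appended
    # per sample held in new_deseq / new_deseq1).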

    # Keep a copy of the count matrices
    control_group_copy=copy.deepcopy(list(control_group))
    treated_group_copy=copy.deepcopy(list(treated_group))

    # Initialization of managers
    merg_nam_control_group=manager.list()
    merg_nam_treated_group=manager.list()

    # Merge the different names given to the same mirna sequence, per group (controls, treated), to avoid duplicates
    ps_merge = [Process(target=merging_names,args=(control_group_copy,merg_nam_control_group))]
    ps_merge.extend([Process(target=merging_names,args=(treated_group_copy,merg_nam_treated_group))])
    [x.start() for x in ps_merge]


    # Add the mirna sequences that are unique to one group to the other group (both groups end up with the same set of sequences)
    con_list=manager.list()
    tre_list=manager.list()

    ps_bw = [Process(target=black_white,args=(con_names_seqs,tre_names_seqs,treated_group,tre_list))]
    ps_bw.extend([Process(target=black_white,args=(tre_names_seqs,con_names_seqs,control_group,con_list))])
    [x.start() for x in ps_bw]
    [x.join() for x in ps_bw]
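    # black_white() (defined in mirbase_functions) presumably appends the sequences missing from a group,
    # so the control and treated matrices stay aligned row for row for the later merging steps.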

    control_group=list(con_list)
    treated_group=list(tre_list)

    # Detection of duplications
    dupes=manager.list()

    ps_dupes = Process(target=merging_dupes,args=(control_group,dupes))
    ps_dupes.start()
    ps_dupes.join()

    dupes=list(dupes)

    # Merge the duplications into one entry carrying all of the different names
    con_list=manager.list()
    tre_list=manager.list()

    ps_ap_merg_dupes = [Process(target=apply_merging_dupes,args=(control_group,dupes,con_list))]
    ps_ap_merg_dupes.extend([Process(target=apply_merging_dupes,args=(treated_group,dupes,tre_list))])
    [x.start() for x in ps_ap_merg_dupes]

    # Preparation of reference sequences (isoforms) for the detection of non-templated mirnas
    if args.anal=="2":
        all_iso = manager.list()
        ps_non_iso = Process(target=non_template_ref,args=(con_samples,tre_samples,all_iso))
        ps_non_iso.start()

    # Finish the name-merging processes
    [x.join() for x in ps_merge]
    merg_nam_control_group=list(merg_nam_control_group)
    merg_nam_treated_group=list(merg_nam_treated_group)

    # Export the database and the graphs
    procs = [Process(target=DB_write,args=(x[0],x[1],x[2],x[3],1)) for x in con_data]
    procs.extend([Process(target=DB_write,args=(x[0],x[1],x[2],x[3],1)) for x in tre_data])
    procs.extend([Process(target=make_spider,args=(merg_nam_control_group,merg_nam_treated_group,args.group1,args.group2))])

    if args.anal == "1":
        procs.extend([Process(target=pie_temp,args=(merg_nam_control_group,con_unmap_seq.value,con_unmap_counts.value,merg_nam_treated_group,tre_unmap_seq.value,tre_unmap_counts.value,args.group1,args.group2))])

    [p.start() for p in procs]

    # Export the pdf report file
    if args.anal=="1":
        [x.join() for x in ps_hist]
        [p.join() for p in procs]
        ps_pdf = Process(target=pdf_before_DE,args=(args.anal,args.group1,args.group2))
        ps_pdf.start()


    [x.join() for x in ps_ap_merg_dupes]
    control_group=list(con_list)
    treated_group=list(tre_list)

    # Filter low-count mirnas (optional)
    if int(args.per)!=-1:

        fil_con_group=manager.list()
        fil_tre_group=manager.list()

        ps_low_counts = Process(target=filter_low_counts,args=(control_group,treated_group,fil_con_group,fil_tre_group,args.per,args.count))
        ps_low_counts.start()
        ps_low_counts.join()
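        # filter_low_counts() (defined in mirbase_functions) presumably keeps a miRNA only if it reaches
        # args.count reads in at least args.per percent of the samples; the filtered matrices are
        # collected in the two managers and converted back to plain lists below.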

        fil_con_group=list(fil_con_group)
        fil_tre_group=list(fil_tre_group)

    # Fallback when the filtering step was skipped (args.per == -1) and fil_con_group was never defined
    if "fil_con_group" not in locals() or "fil_con_group" not in globals():
        fil_con_group=control_group
        fil_tre_group=treated_group

    # Export the count matrices
    ps_write = Process(target=write_main,args=(control_group, treated_group, fil_con_group, fil_tre_group, con_file_order,tre_file_order,1,args.group1,args.group2,args.per))
    ps_write.start()

    # Export count files compatible with DESeq2 and edgeR
    ps1_matrix = [Process(target=temp_counts_to_diff,args=(con_file_order,fil_con_group,"Diff/temp_con/"))]
    ps1_matrix.extend([Process(target=temp_counts_to_diff,args=(tre_file_order,fil_tre_group,"Diff/temp_tre/"))])
    [p.start() for p in ps1_matrix]

    if args.anal=="1":
        ps_pdf.join()
    if args.anal=="2":
        [p.join() for p in procs]
        [x.join() for x in ps_hist]

    ps_write.join()
    [p.join() for p in ps1_matrix]

    ############################## Detection of non-templated miRNAs #######################################

    starttime10 = time.time()

    if args.anal == "2":

        # Initialization of the managers shared between the processes
        # First group of samples (controls)
        n_con_data= manager.list()
        n_con_file_order=manager.list()
        n_con_names_seqs=manager.list()
        n_deseq=manager.list()

        # Second group of samples (treated)
        n_tre_data = manager.list()
        n_tre_file_order = manager.list()
        n_tre_names_seqs=manager.list()
        n_deseq1=manager.list()

        # Preparation of the reference sequences
        new_ref_mirnas = list(mature_mirnas)
        ps_non_iso.join()

        all_iso=list(all_iso)
        new_ref_mirnas.extend(all_iso)
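        # The reference for this pass is the set of downloaded mature miRNAs extended with the templated
        # isoforms collected by non_template_ref above; non-templated variants are then presumably called
        # against this extended reference.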

        # Processing of non-templated miRNAs from the SAM files
        ps_sam = [Process(target=non_sam_edit,args=(new_ref_mirnas,path[1][:-1],path[0].split(",")[0],"c",lock,n_con_data,n_con_file_order,n_deseq,n_con_names_seqs)) for path in control]
        ps_sam.extend([Process(target=non_sam_edit,args=(new_ref_mirnas,path[1][:-1],path[0].split(",")[0],"t",lock,n_tre_data,n_tre_file_order,n_deseq1,n_tre_names_seqs)) for path in treated])

        [p.start() for p in ps_sam]
        [p.join() for p in ps_sam]

        # Convert managers to lists
        n_con_file_order=list(n_con_file_order)
        n_tre_file_order=list(n_tre_file_order)
        n_deseq=list(n_deseq)
        n_deseq1=list(n_deseq1)

        # Remove duplicates and sort
        n_con_names_seqs=list(n_con_names_seqs)
        n_con_names_seqs.sort()
        n_con_names_seqs=list(n_con_names_seqs for n_con_names_seqs,_ in itertools.groupby(n_con_names_seqs))

        n_tre_names_seqs=list(n_tre_names_seqs)
        n_tre_names_seqs.sort()
        n_tre_names_seqs=list(n_tre_names_seqs for n_tre_names_seqs,_ in itertools.groupby(n_tre_names_seqs))

        # Initialization of new managers
        new_n_con_file_order=manager.list()
        new_n_tre_file_order=manager.list()
        n_new_deseq=manager.list()
        n_new_deseq1=manager.list()

        # Add the mirnas that were not detected in every sample, so that all samples share the same entries
        ps_deseq=[Process(target=uncommon_mirnas,args=(sampp,n_con_names_seqs,lock,n_new_deseq,n_con_file_order[i],new_n_con_file_order)) for i,sampp in enumerate(n_deseq)]
        ps_deseq.extend([Process(target=uncommon_mirnas,args=(sampp,n_tre_names_seqs,lock,n_new_deseq1,n_tre_file_order[i],new_n_tre_file_order)) for i,sampp in enumerate(n_deseq1)])

        # Wait for the processing of the uncommon mirnas to finish
        [x.start() for x in ps_deseq]
        [x.join() for x in ps_deseq]

        # Convert managers to lists
        n_new_deseq=list(n_new_deseq)
        n_new_deseq1=list(n_new_deseq1)
        n_con_file_order=list(new_n_con_file_order)
        n_tre_file_order=list(new_n_tre_file_order)

        # Generation of count matrices per group (controls - treated)
        n_control_group=[[x[0],x[2]] for x in n_new_deseq[0]]
        [n_control_group[i].append(y[i][1]) for i,_ in enumerate(n_control_group) for y in n_new_deseq]

        n_treated_group=[[x[0],x[2]] for x in n_new_deseq1[0]]
        [n_treated_group[i].append(y[i][1]) for i,_ in enumerate(n_treated_group) for y in n_new_deseq1]

        # Keep a copy of the count matrices
        n_control_group_copy=copy.deepcopy(list(n_control_group))
        n_treated_group_copy=copy.deepcopy(list(n_treated_group))

        # Initialization of managers
        merg_nam_n_control_group=manager.list()
        merg_nam_n_treated_group=manager.list()

        # Merge the different names given to the same mirna sequence, per group (controls, treated), to avoid duplicates
        ps_merge = [Process(target=merging_names,args=(n_control_group_copy,merg_nam_n_control_group))]
        ps_merge.extend([Process(target=merging_names,args=(n_treated_group_copy,merg_nam_n_treated_group))])
        [x.start() for x in ps_merge]

        # Add the mirna sequences that are unique to one group to the other group (both groups end up with the same set of sequences)
        n_con_list=manager.list()
        n_tre_list=manager.list()

        ps_bw = [Process(target=black_white,args=(n_con_names_seqs,n_tre_names_seqs,n_treated_group,n_tre_list))]
        ps_bw.extend([Process(target=black_white,args=(n_tre_names_seqs,n_con_names_seqs,n_control_group,n_con_list))])
        [x.start() for x in ps_bw]
        [x.join() for x in ps_bw]

        n_control_group=list(n_con_list)
        n_treated_group=list(n_tre_list)

        # Detection of duplications
        n_dupes=manager.list()

        ps_dupes = Process(target=merging_dupes,args=(n_control_group,n_dupes))
        ps_dupes.start()
        ps_dupes.join()

        n_dupes=list(n_dupes)

        # Merge the duplications into one entry carrying all of the different names
        n_con_list=manager.list()
        n_tre_list=manager.list()

        ps_ap_merg_dupes = [Process(target=apply_merging_dupes,args=(n_control_group,n_dupes,n_con_list))]
        ps_ap_merg_dupes.extend([Process(target=apply_merging_dupes,args=(n_treated_group,n_dupes,n_tre_list))])
        [x.start() for x in ps_ap_merg_dupes]

        # Finish the name-merging processes
        [x.join() for x in ps_merge]
        merg_nam_n_control_group=list(merg_nam_n_control_group)
        merg_nam_n_treated_group=list(merg_nam_n_treated_group)

        # Export the database and the graphs
        procs = [Process(target=DB_write,args=(x[0],x[1],x[2],x[3],2)) for x in n_con_data]
        procs.extend([Process(target=DB_write,args=(x[0],x[1],x[2],x[3],2)) for x in n_tre_data])
        procs.extend([Process(target=logo_seq_red,args=(merg_nam_n_control_group,'c',args.group1))])
        procs.extend([Process(target=logo_seq_red,args=(merg_nam_n_treated_group,'t',args.group2))])
        procs.extend([Process(target=pie_non_temp,args=(merg_nam_control_group,merg_nam_n_control_group,merg_nam_treated_group,merg_nam_n_treated_group,con_unmap_seq.value,tre_unmap_seq.value,con_unmap_counts.value,tre_unmap_counts.value,args.group1,args.group2))])

        [p.start() for p in procs]
        [p.join() for p in procs]

        procs1 = Process(target=pdf_before_DE,args=(args.anal,args.group1,args.group2))
        procs1.start()

        [x.join() for x in ps_ap_merg_dupes]
        n_control_group=list(n_con_list)
        n_treated_group=list(n_tre_list)


        # Filter low-count mirnas (optional)
        if int(args.per)!=-1:

            n_fil_con_group=manager.list()
            n_fil_tre_group=manager.list()

            ps_low_counts = Process(target=filter_low_counts,args=(n_control_group,n_treated_group,n_fil_con_group,n_fil_tre_group,args.per,args.count))
            ps_low_counts.start()
            ps_low_counts.join()

            n_fil_con_group=list(n_fil_con_group)
            n_fil_tre_group=list(n_fil_tre_group)

        # Fallback when the filtering step was skipped (args.per == -1) and n_fil_con_group was never defined
        if "n_fil_con_group" not in locals() or "n_fil_con_group" not in globals():
            n_fil_con_group=n_control_group
            n_fil_tre_group=n_treated_group

        ps_write = Process(target=write_main,args=(n_control_group, n_treated_group,n_fil_con_group, n_fil_tre_group, n_con_file_order, n_tre_file_order,2,args.group1,args.group2,args.per))
        ps_write.start()

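        # nontemp_counts_to_diff() (defined in mirbase_functions) presumably merges each sample's
        # templated and non-templated counts into the DESeq2/edgeR-compatible files written under
        # Diff/n_temp_con/ and Diff/n_temp_tre/.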
        ps1_matrix = [Process(target=nontemp_counts_to_diff,args=(n_con_file_order,n_fil_con_group,con_file_order,fil_con_group,"Diff/n_temp_con/"))]
        ps1_matrix.extend([Process(target=nontemp_counts_to_diff,args=(n_tre_file_order,n_fil_tre_group,tre_file_order,fil_tre_group,"Diff/n_temp_tre/"))])
        [p.start() for p in ps1_matrix]

        ps_write.join()
        [p.join() for p in ps1_matrix]
        procs1.join()

        print('The non-templated analysis took {} seconds'.format(time.time() - starttime10))

    print('The whole run took {} seconds'.format(time.time() - starttime))