diff mirbase.py @ 1:f10c8f43f010 draft

Uploaded
author glogobyte
date Wed, 13 Oct 2021 16:03:50 +0000
parents
children 6d3abc45aa49
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mirbase.py	Wed Oct 13 16:03:50 2021 +0000
@@ -0,0 +1,419 @@
+from mirbase_functions import *
+from mirbase_graphs import *
+import time
+from multiprocessing import Process, Queue, Lock, Pool, Manager, Value
+import subprocess
+import argparse
+import sys
+
+subprocess.call(['mkdir','-p', 'split1','split2','split3','split4','Counts','Diff/temp_con','Diff/temp_tre','Diff/n_temp_con','Diff/n_temp_tre'])
+
+parser = argparse.ArgumentParser()
+parser.add_argument("-analysis", "--anal", help="choose type of analysis", action="store")
+parser.add_argument("-con", "--control", help="input fastq file (controls)", nargs='+', default=[])
+parser.add_argument("-tre", "--treated", help="input fastq file (treated)", nargs='+', default=[] )
+parser.add_argument("-tool_dir", "--tool_directory", help="tool directory path", action="store")
+parser.add_argument("-gen", "--org_name", help="Organism", action="store")
+parser.add_argument("-f", "--flag", help="choose the database (MirBase,MirGene)", action="store")
+parser.add_argument("-percentage", "--per", help="Percentage of Samples", action="store")
+parser.add_argument("-counts", "--count", help="Counts for filtering", action="store")
+parser.add_argument("-name1", "--group1", help="Samples group 1", action="store")
+parser.add_argument("-name2", "--group2", help="Samples group 2", action="store")
+args = parser.parse_args()
+
+
+#################################################################################################################################################################################################################
+
+if __name__ == '__main__':
+
+ starttime = time.time()
+
+ lock = Lock()
+ manager = Manager()
+
+ # Download reference miRNA sequences from MirBase
+ mature_mirnas=manager.list()
+ ps_mature=Process(target=download_matures,args=(mature_mirnas,args.org_name))
+ ps_mature.start()
+
+
+ # Keep the names of the files and location paths
+ args.control[0]=args.control[0][1:]
+ args.control[len(args.control)-1][:-1]
+ control = [(args.control[i:i+2]) for i in range(0, len(args.control), 2)]
+
+
+ args.treated[0]=args.treated[0][1:]
+ args.treated[len(args.treated)-1][:-1]
+ treated = [(args.treated[i:i+2]) for i in range(0, len(args.treated), 2)]
+
+
+##############  Detection of templated isoforms ################
+
+
+ # Initialization of the managers between the proccesses
+ # First group of samples (controls)
+ con_samples = manager.list()           # Collapsed mirnas with the new names
+ con_data= manager.list()	        # keeps all necessary data for the Database
+ con_file_order=manager.list()          # files' names ordered by processes
+ con_names_seqs=manager.list()          # keeps only mirna names and sequences
+ deseq=manager.list()
+ con_unmap_seq=manager.Value('i',0)             # keeps unmap unique sequnces for the generation of a graph
+ con_unmap_counts=manager.Value('i',0)  # keeps unmap counts of sequnces for the generation of a graph
+ con_mirna_names=manager.list()         # keeps the names of mirnas
+ ini_con_samples = manager.list()       # filtered SAM files
+
+ # Second group of samples (treated)
+ tre_samples = manager.list()
+ tre_data = manager.list()
+ tre_file_order = manager.list()
+ tre_names_seqs=manager.list()
+ deseq1=manager.list()
+ tre_unmap_seq = manager.Value('i',0)
+ tre_unmap_counts = manager.Value('i',0)
+ tre_mirna_names=manager.list()
+ ini_tre_samples = manager.list()
+
+ # Wait for the download of reference miRNA sequences
+ ps_mature.join()
+ mature_mirnas=list(mature_mirnas)
+
+ # Processing of the detected miRNAs from SAM files
+ ps_sam = [Process(target=sam_edit,args=(mature_mirnas,path[1][:-1],path[0].split(",")[0],"c",lock,con_samples,con_data,con_file_order,con_unmap_seq,con_names_seqs,deseq,con_mirna_names,ini_con_samples,con_unmap_counts)) for path in control]
+ ps_sam.extend([Process(target=sam_edit,args=(mature_mirnas,path[1][:-1],path[0].split(",")[0],"t",lock,tre_samples,tre_data,tre_file_order,tre_unmap_seq,tre_names_seqs,deseq1,tre_mirna_names,ini_tre_samples,tre_unmap_counts)) for path in treated])
+
+ # Wait for processing of SAM files to finish
+ [p.start() for p in ps_sam]
+ [p.join() for p in ps_sam]
+
+ # Generate a histogram
+ ps_hist=[Process(target=hist_red,args=(ini_con_samples,'c',args.group1))]
+ ps_hist.extend([Process(target=hist_red,args=(ini_tre_samples,'t',args.group2))])
+ [x.start() for x in ps_hist]
+
+
+ # Convert managers to lists
+ con_samples = list(con_samples)
+ tre_samples = list(tre_samples)
+ con_file_order=list(con_file_order)
+ tre_file_order=list(tre_file_order)
+ deseq=list(deseq)
+ deseq1=list(deseq1)
+
+ # Remove duplicates and sorting
+ con_names_seqs=list(con_names_seqs)
+ con_names_seqs.sort()
+ con_names_seqs=list(con_names_seqs for con_names_seqs,_ in itertools.groupby(con_names_seqs))
+
+ tre_names_seqs=list(tre_names_seqs)
+ tre_names_seqs.sort()
+ tre_names_seqs=list(tre_names_seqs for tre_names_seqs,_ in itertools.groupby(tre_names_seqs))
+
+ # initialization of new managers
+ new_con_file_order=manager.list()
+ new_tre_file_order=manager.list()
+ new_deseq=manager.list()
+ new_deseq1=manager.list()
+
+ # add uncommon detected mirnas among the samples
+ ps_un_mirnas=[Process(target=uncommon_mirnas,args=(sampp,con_names_seqs,lock,new_deseq,con_file_order[i],new_con_file_order)) for i,sampp in enumerate(deseq)]
+ ps_un_mirnas.extend([Process(target=uncommon_mirnas,args=(sampp,tre_names_seqs,lock,new_deseq1,tre_file_order[i],new_tre_file_order)) for i,sampp in enumerate(deseq1)])
+
+ # Wait for processing of uncommon detected mirnas to finish
+ [z.start() for z in ps_un_mirnas]
+ [z.join() for z in ps_un_mirnas]
+
+ # Convert managers to lists
+ new_deseq=list(new_deseq)
+ new_deseq1=list(new_deseq1)
+ con_file_order=list(new_con_file_order)
+ tre_file_order=list(new_tre_file_order)
+
+ # Genereation of count matrices per group (controls - treated)
+ control_group=[[x[0],x[2]] for x in new_deseq[0]]
+ [control_group[i].append(y[i][1]) for i,_ in enumerate(control_group) for y in new_deseq]
+
+ treated_group=[[x[0],x[2]] for x in new_deseq1[0]]
+ [treated_group[i].append(y[i][1]) for i,_ in enumerate(treated_group) for y in new_deseq1]
+
+ # Keep a copy of count matrices
+ control_group_copy=copy.deepcopy(list(control_group))
+ treated_group_copy=copy.deepcopy(list(treated_group))
+
+ # Initialization of managers
+ merg_nam_control_group=manager.list()
+ merg_nam_treated_group=manager.list()
+
+ # Merging of names different names for the same mirna sequence per group (controls, treated) to avoid duplicates
+ ps_merge = [Process(target=merging_names,args=(control_group_copy,merg_nam_control_group))]
+ ps_merge.extend([Process(target=merging_names,args=(treated_group_copy,merg_nam_treated_group))])
+ [x.start() for x in ps_merge]
+
+
+ # Add unique mirna sequences between groups (all groups will have the same amount of sequences)
+ con_list=manager.list()
+ tre_list=manager.list()
+
+ ps_bw = [Process(target=black_white,args=(con_names_seqs,tre_names_seqs,treated_group,tre_list))]
+ ps_bw.extend([Process(target=black_white,args=(tre_names_seqs,con_names_seqs,control_group,con_list))])
+ [x.start() for x in ps_bw]
+ [x.join() for x in ps_bw]
+
+ control_group=list(con_list)
+ treated_group=list(tre_list)
+
+ # Detection of duplications
+ dupes=manager.list()
+
+ ps_dupes = Process(target=merging_dupes,args=(control_group,dupes))
+ ps_dupes.start()
+ ps_dupes.join()
+
+ dupes=list(dupes)
+
+ # Merging the duplications in one entry with all different names
+ con_list=manager.list()
+ tre_list=manager.list()
+
+ ps_ap_merg_dupes = [Process(target=apply_merging_dupes,args=(control_group,dupes,con_list))]
+ ps_ap_merg_dupes.extend([Process(target=apply_merging_dupes,args=(treated_group,dupes,tre_list))])
+ [x.start() for x in ps_ap_merg_dupes]
+
+ # Preparation of reference sequences (isodforms) for the detection of non template mirnas
+ if args.anal=="2":
+    all_iso = manager.list()
+    ps_non_iso = Process(target=non_template_ref,args=(con_samples,tre_samples,all_iso))
+    ps_non_iso.start()
+
+ # Finishing the process for merging
+ [x.join() for x in ps_merge]
+ merg_nam_control_group=list(merg_nam_control_group)
+ merg_nam_treated_group=list(merg_nam_treated_group)
+
+ # Export the database and the graphs
+ procs = [Process(target=DB_write,args=(x[0],x[1],x[2],x[3],1)) for x in con_data]
+ procs.extend([Process(target=DB_write,args=(x[0],x[1],x[2],x[3],1)) for x in tre_data])
+ procs.extend([Process(target=make_spider,args=(merg_nam_control_group,merg_nam_treated_group,args.group1,args.group2))])
+
+ if args.anal == "1":
+    procs.extend([Process(target=pie_temp,args=(merg_nam_control_group,con_unmap_seq.value,con_unmap_counts.value,merg_nam_treated_group,tre_unmap_seq.value,tre_unmap_counts.value,group_name1,group_name2))])
+
+ [p.start() for p in procs]
+
+ # Export the pdf report file
+ if args.anal=="1":
+    [x.join() for x in ps_hist]
+    [p.join() for p in procs]
+    ps_pdf = Process(target=pdf_before_DE,args=(args.anal,args.group1,args.group2))
+    ps_pdf.start()
+
+
+ [x.join() for x in ps_ap_merg_dupes]
+ control_group=list(con_list)
+ treated_group=list(tre_list)
+
+ # Filters low count mirnas (otpional)
+ if int(args.per)!=-1:
+
+    fil_con_group=manager.list()
+    fil_tre_group=manager.list()
+
+    ps_low_counts = Process(target=filter_low_counts,args=(control_group,treated_group,fil_con_group,fil_tre_group,args.per,args.count))
+    ps_low_counts.start()
+    ps_low_counts.join()
+
+    fil_con_group=list(fil_con_group)
+    fil_tre_group=list(fil_tre_group)
+
+ if "fil_con_group" not in locals() or "fil_con_group" not in globals():
+    fil_con_group=control_group
+    fil_tre_group=treated_group
+
+ # export count matrices
+ ps_write = Process(target=write_main,args=(control_group, treated_group, fil_con_group, fil_tre_group, con_file_order,tre_file_order,1,args.group1,args.group2,args.per))
+ ps_write.start()
+
+ # export counts files compatible with Deseq2 and EdgeR
+ ps1_matrix = [Process(target=temp_counts_to_diff,args=(con_file_order,fil_con_group,"Diff/temp_con/"))]
+ ps1_matrix.extend([Process(target=temp_counts_to_diff,args=(tre_file_order,fil_tre_group,"Diff/temp_tre/"))])
+ [p.start() for p in ps1_matrix]
+
+ if args.anal=="1":
+    ps_pdf.join()
+ if args.anal=="2":
+    [p.join() for p in procs]
+    [x.join() for x in ps_hist]
+
+ ps_write.join()
+ [p.join() for p in ps1_matrix]
+
+############################## Detection of non-template  #######################################
+
+ starttime10 = time.time()
+
+ if args.anal == "2":
+
+  # Initialization of the managers between the proccesses
+  # First group of samples (controls)
+  n_con_data= manager.list()
+  n_con_file_order=manager.list()
+  n_con_names_seqs=manager.list()
+  n_deseq=manager.list()
+
+  # Second group of samples (treated)
+  n_tre_data = manager.list()
+  n_tre_file_order = manager.list()
+  n_tre_names_seqs=manager.list()
+  n_deseq1=manager.list()
+
+  # Preparation of reference sequences
+  new_ref_mirnas = list(mature_mirnas)
+  ps_non_iso.join()
+
+  all_iso=list(all_iso)
+  new_ref_mirnas.extend(all_iso)
+
+  # Processing of non template miRNAs from SAM files
+  ps_sam = [Process(target=non_sam_edit,args=(new_ref_mirnas,path[1][:-1],path[0].split(",")[0],"c",lock,n_con_data,n_con_file_order,n_deseq,n_con_names_seqs)) for path in control]
+  ps_sam.extend([Process(target=non_sam_edit,args=(new_ref_mirnas,path[1][:-1],path[0].split(",")[0],"t",lock,n_tre_data,n_tre_file_order,n_deseq1,n_tre_names_seqs)) for path in treated])
+
+  [p.start() for p in ps_sam]
+  [p.join() for p in ps_sam]
+
+  # Convert managers to lists
+  n_con_file_order=list(n_con_file_order)
+  n_tre_file_order=list(n_tre_file_order)
+  n_deseq=list(n_deseq)
+  n_deseq1=list(n_deseq1)
+
+  # Remove duplicates and sorting
+  n_con_names_seqs=list(n_con_names_seqs)
+  n_con_names_seqs.sort()
+  n_con_names_seqs=list(n_con_names_seqs for n_con_names_seqs,_ in itertools.groupby(n_con_names_seqs))
+
+  n_tre_names_seqs=list(n_tre_names_seqs)
+  n_tre_names_seqs.sort()
+  n_tre_names_seqs=list(n_tre_names_seqs for n_tre_names_seqs,_ in itertools.groupby(n_tre_names_seqs))
+
+  # initialization of new managers
+  new_n_con_file_order=manager.list()
+  new_n_tre_file_order=manager.list()
+  n_new_deseq=manager.list()
+  n_new_deseq1=manager.list()
+
+  # add uncommon detected mirnas among the samples
+  ps_deseq=[Process(target=uncommon_mirnas,args=(sampp,n_con_names_seqs,lock,n_new_deseq,n_con_file_order[i],new_n_con_file_order)) for i,sampp in enumerate(n_deseq)]
+  ps_deseq.extend([Process(target=uncommon_mirnas,args=(sampp,n_tre_names_seqs,lock,n_new_deseq1,n_tre_file_order[i],new_n_tre_file_order)) for i,sampp in enumerate(n_deseq1)])
+
+  # Wait for processing of uncommon detected mirnas to finish
+  [x.start() for x in ps_deseq]
+  [x.join() for x in ps_deseq]
+
+  # Convert managers to lists
+  n_new_deseq=list(n_new_deseq)
+  n_new_deseq1=list(n_new_deseq1)
+  n_con_file_order=list(new_n_con_file_order)
+  n_tre_file_order=list(new_n_tre_file_order)
+
+  # Genereation of count matrices per group (controls - treated)
+  n_control_group=[[x[0],x[2]] for x in n_new_deseq[0]]
+  [n_control_group[i].append(y[i][1]) for i,_ in enumerate(n_control_group) for y in n_new_deseq]
+
+  n_treated_group=[[x[0],x[2]] for x in n_new_deseq1[0]]
+  [n_treated_group[i].append(y[i][1]) for i,_ in enumerate(n_treated_group) for y in n_new_deseq1]
+
+  # Keep a copy of count matrices
+  n_control_group_copy=copy.deepcopy(list(n_control_group))
+  n_treated_group_copy=copy.deepcopy(list(n_treated_group))
+
+  # Initialization of managers
+  merg_nam_n_control_group=manager.list()
+  merg_nam_n_treated_group=manager.list()
+
+  # Merging of names different names for the same mirna sequence per group (controls, treated) to avoid duplicates\
+  ps_merge = [Process(target=merging_names,args=(n_control_group_copy,merg_nam_n_control_group))]
+  ps_merge.extend([Process(target=merging_names,args=(n_treated_group_copy,merg_nam_n_treated_group))])
+  [x.start() for x in ps_merge]
+
+  # Add unique mirna sequences between groups (all groups will have the same amount of sequences)
+  n_con_list=manager.list()
+  n_tre_list=manager.list()
+
+  ps_bw = [Process(target=black_white,args=(n_con_names_seqs,n_tre_names_seqs,n_treated_group,n_tre_list))]
+  ps_bw.extend([Process(target=black_white,args=(n_tre_names_seqs,n_con_names_seqs,n_control_group,n_con_list))])
+  [x.start() for x in ps_bw]
+  [x.join() for x in ps_bw]
+
+  n_control_group=list(n_con_list)
+  n_treated_group=list(n_tre_list)
+
+  # Detection of duplications
+  n_dupes=manager.list()
+
+  ps_dupes = Process(target=merging_dupes,args=(n_control_group,n_dupes))
+  ps_dupes.start()
+  ps_dupes.join()
+
+  n_dupes=list(n_dupes)
+
+  # Merging the duplications in one entry with all different names
+  n_con_list=manager.list()
+  n_tre_list=manager.list()
+
+  ps_ap_merg_dupes = [Process(target=apply_merging_dupes,args=(n_control_group,n_dupes,n_con_list))]
+  ps_ap_merg_dupes.extend([Process(target=apply_merging_dupes,args=(n_treated_group,n_dupes,n_tre_list))])
+  [x.start() for x in ps_ap_merg_dupes]
+
+  # Finishing the process for merging
+  [x.join() for x in ps_merge]
+  merg_nam_n_control_group=list(merg_nam_n_control_group)
+  merg_nam_n_treated_group=list(merg_nam_n_treated_group)
+
+  # Export the database and the graphs
+  procs = [Process(target=DB_write,args=(x[0],x[1],x[2],x[3],2)) for x in n_con_data]
+  procs.extend([Process(target=DB_write,args=(x[0],x[1],x[2],x[3],2)) for x in n_tre_data])
+  procs.extend([Process(target=logo_seq_red,args=(merg_nam_n_control_group,'c',args.group1))])
+  procs.extend([Process(target=logo_seq_red,args=(merg_nam_n_treated_group,'t',args.group2))])
+  procs.extend([Process(target=pie_non_temp,args=(merg_nam_control_group,merg_nam_n_control_group,merg_nam_treated_group,merg_nam_n_treated_group,con_unmap_seq.value,tre_unmap_seq.value,con_unmap_counts.value,tre_unmap_counts.value,args.group1,args.group2))])
+
+  [p.start() for p in procs]
+  [p.join() for p in procs]
+
+  procs1 = Process(target=pdf_before_DE,args=(args.anal,args.group1,args.group2))
+  procs1.start()
+
+  [x.join() for x in ps_ap_merg_dupes]
+  n_control_group=list(n_con_list)
+  n_treated_group=list(n_tre_list)
+
+
+  # Filters low count mirnas (otpional)
+  if int(args.per)!=-1:
+
+     n_fil_con_group=manager.list()
+     n_fil_tre_group=manager.list()
+
+     ps_low_counts = Process(target=filter_low_counts,args=(n_control_group,n_treated_group,n_fil_con_group,n_fil_tre_group,args.per,args.count))
+     ps_low_counts.start()
+     ps_low_counts.join()
+
+     n_fil_con_group=list(n_fil_con_group)
+     n_fil_tre_group=list(n_fil_tre_group)
+
+  if "n_fil_con_group" not in locals() or "n_fil_con_group" not in globals():
+      n_fil_con_group=n_control_group
+      n_fil_tre_group=n_treated_group
+
+  ps_write = Process(target=write_main,args=(n_control_group, n_treated_group,n_fil_con_group, n_fil_tre_group, n_con_file_order, n_tre_file_order,2,args.group1,args.group2,args.per))
+  ps_write.start()
+
+  ps1_matrix = [Process(target=nontemp_counts_to_diff,args=(n_con_file_order,n_fil_con_group,con_file_order,fil_con_group,"Diff/n_temp_con/"))]
+  ps1_matrix.extend([Process(target=nontemp_counts_to_diff,args=(n_tre_file_order,n_fil_tre_group,tre_file_order,fil_tre_group,"Diff/n_temp_tre/"))])
+  [p.start() for p in ps1_matrix]
+
+  ps_write.join()
+  [p.join() for p in ps1_matrix]
+  procs1.join()
+ print('That took {} seconds'.format(time.time() - starttime10))
+ print('That took {} seconds'.format(time.time() - starttime))
+