Mercurial > repos > glogobyte > isoread
diff mirbase.py @ 1:f10c8f43f010 draft
Uploaded
author | glogobyte |
---|---|
date | Wed, 13 Oct 2021 16:03:50 +0000 |
parents | |
children | 6d3abc45aa49 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mirbase.py Wed Oct 13 16:03:50 2021 +0000 @@ -0,0 +1,419 @@ +from mirbase_functions import * +from mirbase_graphs import * +import time +from multiprocessing import Process, Queue, Lock, Pool, Manager, Value +import subprocess +import argparse +import sys + +subprocess.call(['mkdir','-p', 'split1','split2','split3','split4','Counts','Diff/temp_con','Diff/temp_tre','Diff/n_temp_con','Diff/n_temp_tre']) + +parser = argparse.ArgumentParser() +parser.add_argument("-analysis", "--anal", help="choose type of analysis", action="store") +parser.add_argument("-con", "--control", help="input fastq file (controls)", nargs='+', default=[]) +parser.add_argument("-tre", "--treated", help="input fastq file (treated)", nargs='+', default=[] ) +parser.add_argument("-tool_dir", "--tool_directory", help="tool directory path", action="store") +parser.add_argument("-gen", "--org_name", help="Organism", action="store") +parser.add_argument("-f", "--flag", help="choose the database (MirBase,MirGene)", action="store") +parser.add_argument("-percentage", "--per", help="Percentage of Samples", action="store") +parser.add_argument("-counts", "--count", help="Counts for filtering", action="store") +parser.add_argument("-name1", "--group1", help="Samples group 1", action="store") +parser.add_argument("-name2", "--group2", help="Samples group 2", action="store") +args = parser.parse_args() + + +################################################################################################################################################################################################################# + +if __name__ == '__main__': + + starttime = time.time() + + lock = Lock() + manager = Manager() + + # Download reference miRNA sequences from MirBase + mature_mirnas=manager.list() + ps_mature=Process(target=download_matures,args=(mature_mirnas,args.org_name)) + ps_mature.start() + + + # Keep the names of the files and location paths + args.control[0]=args.control[0][1:] + args.control[len(args.control)-1][:-1] + control = [(args.control[i:i+2]) for i in range(0, len(args.control), 2)] + + + args.treated[0]=args.treated[0][1:] + args.treated[len(args.treated)-1][:-1] + treated = [(args.treated[i:i+2]) for i in range(0, len(args.treated), 2)] + + +############## Detection of templated isoforms ################ + + + # Initialization of the managers between the proccesses + # First group of samples (controls) + con_samples = manager.list() # Collapsed mirnas with the new names + con_data= manager.list() # keeps all necessary data for the Database + con_file_order=manager.list() # files' names ordered by processes + con_names_seqs=manager.list() # keeps only mirna names and sequences + deseq=manager.list() + con_unmap_seq=manager.Value('i',0) # keeps unmap unique sequnces for the generation of a graph + con_unmap_counts=manager.Value('i',0) # keeps unmap counts of sequnces for the generation of a graph + con_mirna_names=manager.list() # keeps the names of mirnas + ini_con_samples = manager.list() # filtered SAM files + + # Second group of samples (treated) + tre_samples = manager.list() + tre_data = manager.list() + tre_file_order = manager.list() + tre_names_seqs=manager.list() + deseq1=manager.list() + tre_unmap_seq = manager.Value('i',0) + tre_unmap_counts = manager.Value('i',0) + tre_mirna_names=manager.list() + ini_tre_samples = manager.list() + + # Wait for the download of reference miRNA sequences + ps_mature.join() + mature_mirnas=list(mature_mirnas) + + # Processing of the detected miRNAs from SAM files + ps_sam = [Process(target=sam_edit,args=(mature_mirnas,path[1][:-1],path[0].split(",")[0],"c",lock,con_samples,con_data,con_file_order,con_unmap_seq,con_names_seqs,deseq,con_mirna_names,ini_con_samples,con_unmap_counts)) for path in control] + ps_sam.extend([Process(target=sam_edit,args=(mature_mirnas,path[1][:-1],path[0].split(",")[0],"t",lock,tre_samples,tre_data,tre_file_order,tre_unmap_seq,tre_names_seqs,deseq1,tre_mirna_names,ini_tre_samples,tre_unmap_counts)) for path in treated]) + + # Wait for processing of SAM files to finish + [p.start() for p in ps_sam] + [p.join() for p in ps_sam] + + # Generate a histogram + ps_hist=[Process(target=hist_red,args=(ini_con_samples,'c',args.group1))] + ps_hist.extend([Process(target=hist_red,args=(ini_tre_samples,'t',args.group2))]) + [x.start() for x in ps_hist] + + + # Convert managers to lists + con_samples = list(con_samples) + tre_samples = list(tre_samples) + con_file_order=list(con_file_order) + tre_file_order=list(tre_file_order) + deseq=list(deseq) + deseq1=list(deseq1) + + # Remove duplicates and sorting + con_names_seqs=list(con_names_seqs) + con_names_seqs.sort() + con_names_seqs=list(con_names_seqs for con_names_seqs,_ in itertools.groupby(con_names_seqs)) + + tre_names_seqs=list(tre_names_seqs) + tre_names_seqs.sort() + tre_names_seqs=list(tre_names_seqs for tre_names_seqs,_ in itertools.groupby(tre_names_seqs)) + + # initialization of new managers + new_con_file_order=manager.list() + new_tre_file_order=manager.list() + new_deseq=manager.list() + new_deseq1=manager.list() + + # add uncommon detected mirnas among the samples + ps_un_mirnas=[Process(target=uncommon_mirnas,args=(sampp,con_names_seqs,lock,new_deseq,con_file_order[i],new_con_file_order)) for i,sampp in enumerate(deseq)] + ps_un_mirnas.extend([Process(target=uncommon_mirnas,args=(sampp,tre_names_seqs,lock,new_deseq1,tre_file_order[i],new_tre_file_order)) for i,sampp in enumerate(deseq1)]) + + # Wait for processing of uncommon detected mirnas to finish + [z.start() for z in ps_un_mirnas] + [z.join() for z in ps_un_mirnas] + + # Convert managers to lists + new_deseq=list(new_deseq) + new_deseq1=list(new_deseq1) + con_file_order=list(new_con_file_order) + tre_file_order=list(new_tre_file_order) + + # Genereation of count matrices per group (controls - treated) + control_group=[[x[0],x[2]] for x in new_deseq[0]] + [control_group[i].append(y[i][1]) for i,_ in enumerate(control_group) for y in new_deseq] + + treated_group=[[x[0],x[2]] for x in new_deseq1[0]] + [treated_group[i].append(y[i][1]) for i,_ in enumerate(treated_group) for y in new_deseq1] + + # Keep a copy of count matrices + control_group_copy=copy.deepcopy(list(control_group)) + treated_group_copy=copy.deepcopy(list(treated_group)) + + # Initialization of managers + merg_nam_control_group=manager.list() + merg_nam_treated_group=manager.list() + + # Merging of names different names for the same mirna sequence per group (controls, treated) to avoid duplicates + ps_merge = [Process(target=merging_names,args=(control_group_copy,merg_nam_control_group))] + ps_merge.extend([Process(target=merging_names,args=(treated_group_copy,merg_nam_treated_group))]) + [x.start() for x in ps_merge] + + + # Add unique mirna sequences between groups (all groups will have the same amount of sequences) + con_list=manager.list() + tre_list=manager.list() + + ps_bw = [Process(target=black_white,args=(con_names_seqs,tre_names_seqs,treated_group,tre_list))] + ps_bw.extend([Process(target=black_white,args=(tre_names_seqs,con_names_seqs,control_group,con_list))]) + [x.start() for x in ps_bw] + [x.join() for x in ps_bw] + + control_group=list(con_list) + treated_group=list(tre_list) + + # Detection of duplications + dupes=manager.list() + + ps_dupes = Process(target=merging_dupes,args=(control_group,dupes)) + ps_dupes.start() + ps_dupes.join() + + dupes=list(dupes) + + # Merging the duplications in one entry with all different names + con_list=manager.list() + tre_list=manager.list() + + ps_ap_merg_dupes = [Process(target=apply_merging_dupes,args=(control_group,dupes,con_list))] + ps_ap_merg_dupes.extend([Process(target=apply_merging_dupes,args=(treated_group,dupes,tre_list))]) + [x.start() for x in ps_ap_merg_dupes] + + # Preparation of reference sequences (isodforms) for the detection of non template mirnas + if args.anal=="2": + all_iso = manager.list() + ps_non_iso = Process(target=non_template_ref,args=(con_samples,tre_samples,all_iso)) + ps_non_iso.start() + + # Finishing the process for merging + [x.join() for x in ps_merge] + merg_nam_control_group=list(merg_nam_control_group) + merg_nam_treated_group=list(merg_nam_treated_group) + + # Export the database and the graphs + procs = [Process(target=DB_write,args=(x[0],x[1],x[2],x[3],1)) for x in con_data] + procs.extend([Process(target=DB_write,args=(x[0],x[1],x[2],x[3],1)) for x in tre_data]) + procs.extend([Process(target=make_spider,args=(merg_nam_control_group,merg_nam_treated_group,args.group1,args.group2))]) + + if args.anal == "1": + procs.extend([Process(target=pie_temp,args=(merg_nam_control_group,con_unmap_seq.value,con_unmap_counts.value,merg_nam_treated_group,tre_unmap_seq.value,tre_unmap_counts.value,group_name1,group_name2))]) + + [p.start() for p in procs] + + # Export the pdf report file + if args.anal=="1": + [x.join() for x in ps_hist] + [p.join() for p in procs] + ps_pdf = Process(target=pdf_before_DE,args=(args.anal,args.group1,args.group2)) + ps_pdf.start() + + + [x.join() for x in ps_ap_merg_dupes] + control_group=list(con_list) + treated_group=list(tre_list) + + # Filters low count mirnas (otpional) + if int(args.per)!=-1: + + fil_con_group=manager.list() + fil_tre_group=manager.list() + + ps_low_counts = Process(target=filter_low_counts,args=(control_group,treated_group,fil_con_group,fil_tre_group,args.per,args.count)) + ps_low_counts.start() + ps_low_counts.join() + + fil_con_group=list(fil_con_group) + fil_tre_group=list(fil_tre_group) + + if "fil_con_group" not in locals() or "fil_con_group" not in globals(): + fil_con_group=control_group + fil_tre_group=treated_group + + # export count matrices + ps_write = Process(target=write_main,args=(control_group, treated_group, fil_con_group, fil_tre_group, con_file_order,tre_file_order,1,args.group1,args.group2,args.per)) + ps_write.start() + + # export counts files compatible with Deseq2 and EdgeR + ps1_matrix = [Process(target=temp_counts_to_diff,args=(con_file_order,fil_con_group,"Diff/temp_con/"))] + ps1_matrix.extend([Process(target=temp_counts_to_diff,args=(tre_file_order,fil_tre_group,"Diff/temp_tre/"))]) + [p.start() for p in ps1_matrix] + + if args.anal=="1": + ps_pdf.join() + if args.anal=="2": + [p.join() for p in procs] + [x.join() for x in ps_hist] + + ps_write.join() + [p.join() for p in ps1_matrix] + +############################## Detection of non-template ####################################### + + starttime10 = time.time() + + if args.anal == "2": + + # Initialization of the managers between the proccesses + # First group of samples (controls) + n_con_data= manager.list() + n_con_file_order=manager.list() + n_con_names_seqs=manager.list() + n_deseq=manager.list() + + # Second group of samples (treated) + n_tre_data = manager.list() + n_tre_file_order = manager.list() + n_tre_names_seqs=manager.list() + n_deseq1=manager.list() + + # Preparation of reference sequences + new_ref_mirnas = list(mature_mirnas) + ps_non_iso.join() + + all_iso=list(all_iso) + new_ref_mirnas.extend(all_iso) + + # Processing of non template miRNAs from SAM files + ps_sam = [Process(target=non_sam_edit,args=(new_ref_mirnas,path[1][:-1],path[0].split(",")[0],"c",lock,n_con_data,n_con_file_order,n_deseq,n_con_names_seqs)) for path in control] + ps_sam.extend([Process(target=non_sam_edit,args=(new_ref_mirnas,path[1][:-1],path[0].split(",")[0],"t",lock,n_tre_data,n_tre_file_order,n_deseq1,n_tre_names_seqs)) for path in treated]) + + [p.start() for p in ps_sam] + [p.join() for p in ps_sam] + + # Convert managers to lists + n_con_file_order=list(n_con_file_order) + n_tre_file_order=list(n_tre_file_order) + n_deseq=list(n_deseq) + n_deseq1=list(n_deseq1) + + # Remove duplicates and sorting + n_con_names_seqs=list(n_con_names_seqs) + n_con_names_seqs.sort() + n_con_names_seqs=list(n_con_names_seqs for n_con_names_seqs,_ in itertools.groupby(n_con_names_seqs)) + + n_tre_names_seqs=list(n_tre_names_seqs) + n_tre_names_seqs.sort() + n_tre_names_seqs=list(n_tre_names_seqs for n_tre_names_seqs,_ in itertools.groupby(n_tre_names_seqs)) + + # initialization of new managers + new_n_con_file_order=manager.list() + new_n_tre_file_order=manager.list() + n_new_deseq=manager.list() + n_new_deseq1=manager.list() + + # add uncommon detected mirnas among the samples + ps_deseq=[Process(target=uncommon_mirnas,args=(sampp,n_con_names_seqs,lock,n_new_deseq,n_con_file_order[i],new_n_con_file_order)) for i,sampp in enumerate(n_deseq)] + ps_deseq.extend([Process(target=uncommon_mirnas,args=(sampp,n_tre_names_seqs,lock,n_new_deseq1,n_tre_file_order[i],new_n_tre_file_order)) for i,sampp in enumerate(n_deseq1)]) + + # Wait for processing of uncommon detected mirnas to finish + [x.start() for x in ps_deseq] + [x.join() for x in ps_deseq] + + # Convert managers to lists + n_new_deseq=list(n_new_deseq) + n_new_deseq1=list(n_new_deseq1) + n_con_file_order=list(new_n_con_file_order) + n_tre_file_order=list(new_n_tre_file_order) + + # Genereation of count matrices per group (controls - treated) + n_control_group=[[x[0],x[2]] for x in n_new_deseq[0]] + [n_control_group[i].append(y[i][1]) for i,_ in enumerate(n_control_group) for y in n_new_deseq] + + n_treated_group=[[x[0],x[2]] for x in n_new_deseq1[0]] + [n_treated_group[i].append(y[i][1]) for i,_ in enumerate(n_treated_group) for y in n_new_deseq1] + + # Keep a copy of count matrices + n_control_group_copy=copy.deepcopy(list(n_control_group)) + n_treated_group_copy=copy.deepcopy(list(n_treated_group)) + + # Initialization of managers + merg_nam_n_control_group=manager.list() + merg_nam_n_treated_group=manager.list() + + # Merging of names different names for the same mirna sequence per group (controls, treated) to avoid duplicates\ + ps_merge = [Process(target=merging_names,args=(n_control_group_copy,merg_nam_n_control_group))] + ps_merge.extend([Process(target=merging_names,args=(n_treated_group_copy,merg_nam_n_treated_group))]) + [x.start() for x in ps_merge] + + # Add unique mirna sequences between groups (all groups will have the same amount of sequences) + n_con_list=manager.list() + n_tre_list=manager.list() + + ps_bw = [Process(target=black_white,args=(n_con_names_seqs,n_tre_names_seqs,n_treated_group,n_tre_list))] + ps_bw.extend([Process(target=black_white,args=(n_tre_names_seqs,n_con_names_seqs,n_control_group,n_con_list))]) + [x.start() for x in ps_bw] + [x.join() for x in ps_bw] + + n_control_group=list(n_con_list) + n_treated_group=list(n_tre_list) + + # Detection of duplications + n_dupes=manager.list() + + ps_dupes = Process(target=merging_dupes,args=(n_control_group,n_dupes)) + ps_dupes.start() + ps_dupes.join() + + n_dupes=list(n_dupes) + + # Merging the duplications in one entry with all different names + n_con_list=manager.list() + n_tre_list=manager.list() + + ps_ap_merg_dupes = [Process(target=apply_merging_dupes,args=(n_control_group,n_dupes,n_con_list))] + ps_ap_merg_dupes.extend([Process(target=apply_merging_dupes,args=(n_treated_group,n_dupes,n_tre_list))]) + [x.start() for x in ps_ap_merg_dupes] + + # Finishing the process for merging + [x.join() for x in ps_merge] + merg_nam_n_control_group=list(merg_nam_n_control_group) + merg_nam_n_treated_group=list(merg_nam_n_treated_group) + + # Export the database and the graphs + procs = [Process(target=DB_write,args=(x[0],x[1],x[2],x[3],2)) for x in n_con_data] + procs.extend([Process(target=DB_write,args=(x[0],x[1],x[2],x[3],2)) for x in n_tre_data]) + procs.extend([Process(target=logo_seq_red,args=(merg_nam_n_control_group,'c',args.group1))]) + procs.extend([Process(target=logo_seq_red,args=(merg_nam_n_treated_group,'t',args.group2))]) + procs.extend([Process(target=pie_non_temp,args=(merg_nam_control_group,merg_nam_n_control_group,merg_nam_treated_group,merg_nam_n_treated_group,con_unmap_seq.value,tre_unmap_seq.value,con_unmap_counts.value,tre_unmap_counts.value,args.group1,args.group2))]) + + [p.start() for p in procs] + [p.join() for p in procs] + + procs1 = Process(target=pdf_before_DE,args=(args.anal,args.group1,args.group2)) + procs1.start() + + [x.join() for x in ps_ap_merg_dupes] + n_control_group=list(n_con_list) + n_treated_group=list(n_tre_list) + + + # Filters low count mirnas (otpional) + if int(args.per)!=-1: + + n_fil_con_group=manager.list() + n_fil_tre_group=manager.list() + + ps_low_counts = Process(target=filter_low_counts,args=(n_control_group,n_treated_group,n_fil_con_group,n_fil_tre_group,args.per,args.count)) + ps_low_counts.start() + ps_low_counts.join() + + n_fil_con_group=list(n_fil_con_group) + n_fil_tre_group=list(n_fil_tre_group) + + if "n_fil_con_group" not in locals() or "n_fil_con_group" not in globals(): + n_fil_con_group=n_control_group + n_fil_tre_group=n_treated_group + + ps_write = Process(target=write_main,args=(n_control_group, n_treated_group,n_fil_con_group, n_fil_tre_group, n_con_file_order, n_tre_file_order,2,args.group1,args.group2,args.per)) + ps_write.start() + + ps1_matrix = [Process(target=nontemp_counts_to_diff,args=(n_con_file_order,n_fil_con_group,con_file_order,fil_con_group,"Diff/n_temp_con/"))] + ps1_matrix.extend([Process(target=nontemp_counts_to_diff,args=(n_tre_file_order,n_fil_tre_group,tre_file_order,fil_tre_group,"Diff/n_temp_tre/"))]) + [p.start() for p in ps1_matrix] + + ps_write.join() + [p.join() for p in ps1_matrix] + procs1.join() + print('That took {} seconds'.format(time.time() - starttime10)) + print('That took {} seconds'.format(time.time() - starttime)) +