Mercurial > repos > glogobyte > isoread
view mirbase.py @ 5:810e789ffeab draft
Uploaded
author | glogobyte |
---|---|
date | Wed, 13 Oct 2021 16:04:54 +0000 |
parents | f10c8f43f010 |
children | 6d3abc45aa49 |
line wrap: on
line source
from mirbase_functions import * from mirbase_graphs import * import time from multiprocessing import Process, Queue, Lock, Pool, Manager, Value import subprocess import argparse import sys subprocess.call(['mkdir','-p', 'split1','split2','split3','split4','Counts','Diff/temp_con','Diff/temp_tre','Diff/n_temp_con','Diff/n_temp_tre']) parser = argparse.ArgumentParser() parser.add_argument("-analysis", "--anal", help="choose type of analysis", action="store") parser.add_argument("-con", "--control", help="input fastq file (controls)", nargs='+', default=[]) parser.add_argument("-tre", "--treated", help="input fastq file (treated)", nargs='+', default=[] ) parser.add_argument("-tool_dir", "--tool_directory", help="tool directory path", action="store") parser.add_argument("-gen", "--org_name", help="Organism", action="store") parser.add_argument("-f", "--flag", help="choose the database (MirBase,MirGene)", action="store") parser.add_argument("-percentage", "--per", help="Percentage of Samples", action="store") parser.add_argument("-counts", "--count", help="Counts for filtering", action="store") parser.add_argument("-name1", "--group1", help="Samples group 1", action="store") parser.add_argument("-name2", "--group2", help="Samples group 2", action="store") args = parser.parse_args() ################################################################################################################################################################################################################# if __name__ == '__main__': starttime = time.time() lock = Lock() manager = Manager() # Download reference miRNA sequences from MirBase mature_mirnas=manager.list() ps_mature=Process(target=download_matures,args=(mature_mirnas,args.org_name)) ps_mature.start() # Keep the names of the files and location paths args.control[0]=args.control[0][1:] args.control[len(args.control)-1][:-1] control = [(args.control[i:i+2]) for i in range(0, len(args.control), 2)] args.treated[0]=args.treated[0][1:] args.treated[len(args.treated)-1][:-1] treated = [(args.treated[i:i+2]) for i in range(0, len(args.treated), 2)] ############## Detection of templated isoforms ################ # Initialization of the managers between the proccesses # First group of samples (controls) con_samples = manager.list() # Collapsed mirnas with the new names con_data= manager.list() # keeps all necessary data for the Database con_file_order=manager.list() # files' names ordered by processes con_names_seqs=manager.list() # keeps only mirna names and sequences deseq=manager.list() con_unmap_seq=manager.Value('i',0) # keeps unmap unique sequnces for the generation of a graph con_unmap_counts=manager.Value('i',0) # keeps unmap counts of sequnces for the generation of a graph con_mirna_names=manager.list() # keeps the names of mirnas ini_con_samples = manager.list() # filtered SAM files # Second group of samples (treated) tre_samples = manager.list() tre_data = manager.list() tre_file_order = manager.list() tre_names_seqs=manager.list() deseq1=manager.list() tre_unmap_seq = manager.Value('i',0) tre_unmap_counts = manager.Value('i',0) tre_mirna_names=manager.list() ini_tre_samples = manager.list() # Wait for the download of reference miRNA sequences ps_mature.join() mature_mirnas=list(mature_mirnas) # Processing of the detected miRNAs from SAM files ps_sam = [Process(target=sam_edit,args=(mature_mirnas,path[1][:-1],path[0].split(",")[0],"c",lock,con_samples,con_data,con_file_order,con_unmap_seq,con_names_seqs,deseq,con_mirna_names,ini_con_samples,con_unmap_counts)) for path in control] ps_sam.extend([Process(target=sam_edit,args=(mature_mirnas,path[1][:-1],path[0].split(",")[0],"t",lock,tre_samples,tre_data,tre_file_order,tre_unmap_seq,tre_names_seqs,deseq1,tre_mirna_names,ini_tre_samples,tre_unmap_counts)) for path in treated]) # Wait for processing of SAM files to finish [p.start() for p in ps_sam] [p.join() for p in ps_sam] # Generate a histogram ps_hist=[Process(target=hist_red,args=(ini_con_samples,'c',args.group1))] ps_hist.extend([Process(target=hist_red,args=(ini_tre_samples,'t',args.group2))]) [x.start() for x in ps_hist] # Convert managers to lists con_samples = list(con_samples) tre_samples = list(tre_samples) con_file_order=list(con_file_order) tre_file_order=list(tre_file_order) deseq=list(deseq) deseq1=list(deseq1) # Remove duplicates and sorting con_names_seqs=list(con_names_seqs) con_names_seqs.sort() con_names_seqs=list(con_names_seqs for con_names_seqs,_ in itertools.groupby(con_names_seqs)) tre_names_seqs=list(tre_names_seqs) tre_names_seqs.sort() tre_names_seqs=list(tre_names_seqs for tre_names_seqs,_ in itertools.groupby(tre_names_seqs)) # initialization of new managers new_con_file_order=manager.list() new_tre_file_order=manager.list() new_deseq=manager.list() new_deseq1=manager.list() # add uncommon detected mirnas among the samples ps_un_mirnas=[Process(target=uncommon_mirnas,args=(sampp,con_names_seqs,lock,new_deseq,con_file_order[i],new_con_file_order)) for i,sampp in enumerate(deseq)] ps_un_mirnas.extend([Process(target=uncommon_mirnas,args=(sampp,tre_names_seqs,lock,new_deseq1,tre_file_order[i],new_tre_file_order)) for i,sampp in enumerate(deseq1)]) # Wait for processing of uncommon detected mirnas to finish [z.start() for z in ps_un_mirnas] [z.join() for z in ps_un_mirnas] # Convert managers to lists new_deseq=list(new_deseq) new_deseq1=list(new_deseq1) con_file_order=list(new_con_file_order) tre_file_order=list(new_tre_file_order) # Genereation of count matrices per group (controls - treated) control_group=[[x[0],x[2]] for x in new_deseq[0]] [control_group[i].append(y[i][1]) for i,_ in enumerate(control_group) for y in new_deseq] treated_group=[[x[0],x[2]] for x in new_deseq1[0]] [treated_group[i].append(y[i][1]) for i,_ in enumerate(treated_group) for y in new_deseq1] # Keep a copy of count matrices control_group_copy=copy.deepcopy(list(control_group)) treated_group_copy=copy.deepcopy(list(treated_group)) # Initialization of managers merg_nam_control_group=manager.list() merg_nam_treated_group=manager.list() # Merging of names different names for the same mirna sequence per group (controls, treated) to avoid duplicates ps_merge = [Process(target=merging_names,args=(control_group_copy,merg_nam_control_group))] ps_merge.extend([Process(target=merging_names,args=(treated_group_copy,merg_nam_treated_group))]) [x.start() for x in ps_merge] # Add unique mirna sequences between groups (all groups will have the same amount of sequences) con_list=manager.list() tre_list=manager.list() ps_bw = [Process(target=black_white,args=(con_names_seqs,tre_names_seqs,treated_group,tre_list))] ps_bw.extend([Process(target=black_white,args=(tre_names_seqs,con_names_seqs,control_group,con_list))]) [x.start() for x in ps_bw] [x.join() for x in ps_bw] control_group=list(con_list) treated_group=list(tre_list) # Detection of duplications dupes=manager.list() ps_dupes = Process(target=merging_dupes,args=(control_group,dupes)) ps_dupes.start() ps_dupes.join() dupes=list(dupes) # Merging the duplications in one entry with all different names con_list=manager.list() tre_list=manager.list() ps_ap_merg_dupes = [Process(target=apply_merging_dupes,args=(control_group,dupes,con_list))] ps_ap_merg_dupes.extend([Process(target=apply_merging_dupes,args=(treated_group,dupes,tre_list))]) [x.start() for x in ps_ap_merg_dupes] # Preparation of reference sequences (isodforms) for the detection of non template mirnas if args.anal=="2": all_iso = manager.list() ps_non_iso = Process(target=non_template_ref,args=(con_samples,tre_samples,all_iso)) ps_non_iso.start() # Finishing the process for merging [x.join() for x in ps_merge] merg_nam_control_group=list(merg_nam_control_group) merg_nam_treated_group=list(merg_nam_treated_group) # Export the database and the graphs procs = [Process(target=DB_write,args=(x[0],x[1],x[2],x[3],1)) for x in con_data] procs.extend([Process(target=DB_write,args=(x[0],x[1],x[2],x[3],1)) for x in tre_data]) procs.extend([Process(target=make_spider,args=(merg_nam_control_group,merg_nam_treated_group,args.group1,args.group2))]) if args.anal == "1": procs.extend([Process(target=pie_temp,args=(merg_nam_control_group,con_unmap_seq.value,con_unmap_counts.value,merg_nam_treated_group,tre_unmap_seq.value,tre_unmap_counts.value,group_name1,group_name2))]) [p.start() for p in procs] # Export the pdf report file if args.anal=="1": [x.join() for x in ps_hist] [p.join() for p in procs] ps_pdf = Process(target=pdf_before_DE,args=(args.anal,args.group1,args.group2)) ps_pdf.start() [x.join() for x in ps_ap_merg_dupes] control_group=list(con_list) treated_group=list(tre_list) # Filters low count mirnas (otpional) if int(args.per)!=-1: fil_con_group=manager.list() fil_tre_group=manager.list() ps_low_counts = Process(target=filter_low_counts,args=(control_group,treated_group,fil_con_group,fil_tre_group,args.per,args.count)) ps_low_counts.start() ps_low_counts.join() fil_con_group=list(fil_con_group) fil_tre_group=list(fil_tre_group) if "fil_con_group" not in locals() or "fil_con_group" not in globals(): fil_con_group=control_group fil_tre_group=treated_group # export count matrices ps_write = Process(target=write_main,args=(control_group, treated_group, fil_con_group, fil_tre_group, con_file_order,tre_file_order,1,args.group1,args.group2,args.per)) ps_write.start() # export counts files compatible with Deseq2 and EdgeR ps1_matrix = [Process(target=temp_counts_to_diff,args=(con_file_order,fil_con_group,"Diff/temp_con/"))] ps1_matrix.extend([Process(target=temp_counts_to_diff,args=(tre_file_order,fil_tre_group,"Diff/temp_tre/"))]) [p.start() for p in ps1_matrix] if args.anal=="1": ps_pdf.join() if args.anal=="2": [p.join() for p in procs] [x.join() for x in ps_hist] ps_write.join() [p.join() for p in ps1_matrix] ############################## Detection of non-template ####################################### starttime10 = time.time() if args.anal == "2": # Initialization of the managers between the proccesses # First group of samples (controls) n_con_data= manager.list() n_con_file_order=manager.list() n_con_names_seqs=manager.list() n_deseq=manager.list() # Second group of samples (treated) n_tre_data = manager.list() n_tre_file_order = manager.list() n_tre_names_seqs=manager.list() n_deseq1=manager.list() # Preparation of reference sequences new_ref_mirnas = list(mature_mirnas) ps_non_iso.join() all_iso=list(all_iso) new_ref_mirnas.extend(all_iso) # Processing of non template miRNAs from SAM files ps_sam = [Process(target=non_sam_edit,args=(new_ref_mirnas,path[1][:-1],path[0].split(",")[0],"c",lock,n_con_data,n_con_file_order,n_deseq,n_con_names_seqs)) for path in control] ps_sam.extend([Process(target=non_sam_edit,args=(new_ref_mirnas,path[1][:-1],path[0].split(",")[0],"t",lock,n_tre_data,n_tre_file_order,n_deseq1,n_tre_names_seqs)) for path in treated]) [p.start() for p in ps_sam] [p.join() for p in ps_sam] # Convert managers to lists n_con_file_order=list(n_con_file_order) n_tre_file_order=list(n_tre_file_order) n_deseq=list(n_deseq) n_deseq1=list(n_deseq1) # Remove duplicates and sorting n_con_names_seqs=list(n_con_names_seqs) n_con_names_seqs.sort() n_con_names_seqs=list(n_con_names_seqs for n_con_names_seqs,_ in itertools.groupby(n_con_names_seqs)) n_tre_names_seqs=list(n_tre_names_seqs) n_tre_names_seqs.sort() n_tre_names_seqs=list(n_tre_names_seqs for n_tre_names_seqs,_ in itertools.groupby(n_tre_names_seqs)) # initialization of new managers new_n_con_file_order=manager.list() new_n_tre_file_order=manager.list() n_new_deseq=manager.list() n_new_deseq1=manager.list() # add uncommon detected mirnas among the samples ps_deseq=[Process(target=uncommon_mirnas,args=(sampp,n_con_names_seqs,lock,n_new_deseq,n_con_file_order[i],new_n_con_file_order)) for i,sampp in enumerate(n_deseq)] ps_deseq.extend([Process(target=uncommon_mirnas,args=(sampp,n_tre_names_seqs,lock,n_new_deseq1,n_tre_file_order[i],new_n_tre_file_order)) for i,sampp in enumerate(n_deseq1)]) # Wait for processing of uncommon detected mirnas to finish [x.start() for x in ps_deseq] [x.join() for x in ps_deseq] # Convert managers to lists n_new_deseq=list(n_new_deseq) n_new_deseq1=list(n_new_deseq1) n_con_file_order=list(new_n_con_file_order) n_tre_file_order=list(new_n_tre_file_order) # Genereation of count matrices per group (controls - treated) n_control_group=[[x[0],x[2]] for x in n_new_deseq[0]] [n_control_group[i].append(y[i][1]) for i,_ in enumerate(n_control_group) for y in n_new_deseq] n_treated_group=[[x[0],x[2]] for x in n_new_deseq1[0]] [n_treated_group[i].append(y[i][1]) for i,_ in enumerate(n_treated_group) for y in n_new_deseq1] # Keep a copy of count matrices n_control_group_copy=copy.deepcopy(list(n_control_group)) n_treated_group_copy=copy.deepcopy(list(n_treated_group)) # Initialization of managers merg_nam_n_control_group=manager.list() merg_nam_n_treated_group=manager.list() # Merging of names different names for the same mirna sequence per group (controls, treated) to avoid duplicates\ ps_merge = [Process(target=merging_names,args=(n_control_group_copy,merg_nam_n_control_group))] ps_merge.extend([Process(target=merging_names,args=(n_treated_group_copy,merg_nam_n_treated_group))]) [x.start() for x in ps_merge] # Add unique mirna sequences between groups (all groups will have the same amount of sequences) n_con_list=manager.list() n_tre_list=manager.list() ps_bw = [Process(target=black_white,args=(n_con_names_seqs,n_tre_names_seqs,n_treated_group,n_tre_list))] ps_bw.extend([Process(target=black_white,args=(n_tre_names_seqs,n_con_names_seqs,n_control_group,n_con_list))]) [x.start() for x in ps_bw] [x.join() for x in ps_bw] n_control_group=list(n_con_list) n_treated_group=list(n_tre_list) # Detection of duplications n_dupes=manager.list() ps_dupes = Process(target=merging_dupes,args=(n_control_group,n_dupes)) ps_dupes.start() ps_dupes.join() n_dupes=list(n_dupes) # Merging the duplications in one entry with all different names n_con_list=manager.list() n_tre_list=manager.list() ps_ap_merg_dupes = [Process(target=apply_merging_dupes,args=(n_control_group,n_dupes,n_con_list))] ps_ap_merg_dupes.extend([Process(target=apply_merging_dupes,args=(n_treated_group,n_dupes,n_tre_list))]) [x.start() for x in ps_ap_merg_dupes] # Finishing the process for merging [x.join() for x in ps_merge] merg_nam_n_control_group=list(merg_nam_n_control_group) merg_nam_n_treated_group=list(merg_nam_n_treated_group) # Export the database and the graphs procs = [Process(target=DB_write,args=(x[0],x[1],x[2],x[3],2)) for x in n_con_data] procs.extend([Process(target=DB_write,args=(x[0],x[1],x[2],x[3],2)) for x in n_tre_data]) procs.extend([Process(target=logo_seq_red,args=(merg_nam_n_control_group,'c',args.group1))]) procs.extend([Process(target=logo_seq_red,args=(merg_nam_n_treated_group,'t',args.group2))]) procs.extend([Process(target=pie_non_temp,args=(merg_nam_control_group,merg_nam_n_control_group,merg_nam_treated_group,merg_nam_n_treated_group,con_unmap_seq.value,tre_unmap_seq.value,con_unmap_counts.value,tre_unmap_counts.value,args.group1,args.group2))]) [p.start() for p in procs] [p.join() for p in procs] procs1 = Process(target=pdf_before_DE,args=(args.anal,args.group1,args.group2)) procs1.start() [x.join() for x in ps_ap_merg_dupes] n_control_group=list(n_con_list) n_treated_group=list(n_tre_list) # Filters low count mirnas (otpional) if int(args.per)!=-1: n_fil_con_group=manager.list() n_fil_tre_group=manager.list() ps_low_counts = Process(target=filter_low_counts,args=(n_control_group,n_treated_group,n_fil_con_group,n_fil_tre_group,args.per,args.count)) ps_low_counts.start() ps_low_counts.join() n_fil_con_group=list(n_fil_con_group) n_fil_tre_group=list(n_fil_tre_group) if "n_fil_con_group" not in locals() or "n_fil_con_group" not in globals(): n_fil_con_group=n_control_group n_fil_tre_group=n_treated_group ps_write = Process(target=write_main,args=(n_control_group, n_treated_group,n_fil_con_group, n_fil_tre_group, n_con_file_order, n_tre_file_order,2,args.group1,args.group2,args.per)) ps_write.start() ps1_matrix = [Process(target=nontemp_counts_to_diff,args=(n_con_file_order,n_fil_con_group,con_file_order,fil_con_group,"Diff/n_temp_con/"))] ps1_matrix.extend([Process(target=nontemp_counts_to_diff,args=(n_tre_file_order,n_fil_tre_group,tre_file_order,fil_tre_group,"Diff/n_temp_tre/"))]) [p.start() for p in ps1_matrix] ps_write.join() [p.join() for p in ps1_matrix] procs1.join() print('That took {} seconds'.format(time.time() - starttime10)) print('That took {} seconds'.format(time.time() - starttime))