view mirbase.py @ 31:23a88a3ec37d draft

Uploaded
author glogobyte
date Wed, 20 Oct 2021 16:39:59 +0000
parents 0e7bd3f72c2c
children 5d232330e81f
line wrap: on
line source

from mirbase_functions import *
from mirbase_graphs import *
import time
from multiprocessing import Process, Queue, Lock, Pool, Manager, Value
import subprocess
import argparse
import sys

subprocess.call(['mkdir','-p', 'split1','split2','split3','split4','Counts','Diff/temp_con','Diff/temp_tre','Diff/n_temp_con','Diff/n_temp_tre'])

parser = argparse.ArgumentParser()
parser.add_argument("-analysis", "--anal", help="choose type of analysis", action="store")
parser.add_argument("-con", "--control", help="input fastq file (controls)", nargs='+', default=[])
parser.add_argument("-tre", "--treated", help="input fastq file (treated)", nargs='+', default=[] )
parser.add_argument("-tool_dir", "--tool_directory", help="tool directory path", action="store")
parser.add_argument("-gen", "--org_name", help="Organism", action="store")
parser.add_argument("-f", "--flag", help="choose the database (MirBase,MirGene)", action="store")
parser.add_argument("-percentage", "--per", help="Percentage of Samples", action="store")
parser.add_argument("-counts", "--count", help="Counts for filtering", action="store")
parser.add_argument("-name1", "--group1", help="Samples group 1", action="store")
parser.add_argument("-name2", "--group2", help="Samples group 2", action="store")
args = parser.parse_args()


#################################################################################################################################################################################################################

if __name__ == '__main__':

 starttime = time.time()

 lock = Lock()
 manager = Manager()

 # Download reference miRNA sequences from MirBase
 mature_mirnas=manager.list()
 ps_mature=Process(target=download_matures,args=(mature_mirnas,args.org_name))
 ps_mature.start()


 # Keep the names of the files and location paths
 args.control[0]=args.control[0][1:]
 args.control[len(args.control)-1][:-1]
 control = [(args.control[i:i+2]) for i in range(0, len(args.control), 2)]


 args.treated[0]=args.treated[0][1:]
 args.treated[len(args.treated)-1][:-1]
 treated = [(args.treated[i:i+2]) for i in range(0, len(args.treated), 2)]


##############  Detection of templated isoforms ################


 # Initialization of the managers between the proccesses
 # First group of samples (controls)
 con_samples = manager.list()           # Collapsed mirnas with the new names
 con_data= manager.list()	        # keeps all necessary data for the Database
 con_file_order=manager.list()          # files' names ordered by processes
 con_names_seqs=manager.list()          # keeps only mirna names and sequences
 deseq=manager.list()
 con_unmap_seq=manager.Value('i',0)             # keeps unmap unique sequnces for the generation of a graph
 con_unmap_counts=manager.Value('i',0)  # keeps unmap counts of sequnces for the generation of a graph
 con_mirna_names=manager.list()         # keeps the names of mirnas
 ini_con_samples = manager.list()       # filtered SAM files

 # Second group of samples (treated)
 tre_samples = manager.list()
 tre_data = manager.list()
 tre_file_order = manager.list()
 tre_names_seqs=manager.list()
 deseq1=manager.list()
 tre_unmap_seq = manager.Value('i',0)
 tre_unmap_counts = manager.Value('i',0)
 tre_mirna_names=manager.list()
 ini_tre_samples = manager.list()

 # Wait for the download of reference miRNA sequences
 ps_mature.join()
 mature_mirnas=list(mature_mirnas)

 # Processing of the detected miRNAs from SAM files
 ps_sam = [Process(target=sam_edit,args=(mature_mirnas,path[1][:-1],path[0].split(",")[0],"c",lock,con_samples,con_data,con_file_order,con_unmap_seq,con_names_seqs,deseq,con_mirna_names,ini_con_samples,con_unmap_counts)) for path in control]
 ps_sam.extend([Process(target=sam_edit,args=(mature_mirnas,path[1][:-1],path[0].split(",")[0],"t",lock,tre_samples,tre_data,tre_file_order,tre_unmap_seq,tre_names_seqs,deseq1,tre_mirna_names,ini_tre_samples,tre_unmap_counts)) for path in treated])

 # Wait for processing of SAM files to finish
 [p.start() for p in ps_sam]
 [p.join() for p in ps_sam]

 # Generate a histogram
 ps_hist=[Process(target=hist_red,args=(ini_con_samples,'c',args.group1))]
 ps_hist.extend([Process(target=hist_red,args=(ini_tre_samples,'t',args.group2))])
 [x.start() for x in ps_hist]


 # Convert managers to lists
 con_samples = list(con_samples)
 tre_samples = list(tre_samples)
 con_file_order=list(con_file_order)
 tre_file_order=list(tre_file_order)
 deseq=list(deseq)
 deseq1=list(deseq1)

 # Remove duplicates and sorting
 con_names_seqs=list(con_names_seqs)
 con_names_seqs.sort()
 con_names_seqs=list(con_names_seqs for con_names_seqs,_ in itertools.groupby(con_names_seqs))

 tre_names_seqs=list(tre_names_seqs)
 tre_names_seqs.sort()
 tre_names_seqs=list(tre_names_seqs for tre_names_seqs,_ in itertools.groupby(tre_names_seqs))

 # initialization of new managers
 new_con_file_order=manager.list()
 new_tre_file_order=manager.list()
 new_deseq=manager.list()
 new_deseq1=manager.list()

 # add uncommon detected mirnas among the samples
 ps_un_mirnas=[Process(target=uncommon_mirnas,args=(sampp,con_names_seqs,lock,new_deseq,con_file_order[i],new_con_file_order)) for i,sampp in enumerate(deseq)]
 ps_un_mirnas.extend([Process(target=uncommon_mirnas,args=(sampp,tre_names_seqs,lock,new_deseq1,tre_file_order[i],new_tre_file_order)) for i,sampp in enumerate(deseq1)])

 # Wait for processing of uncommon detected mirnas to finish
 [z.start() for z in ps_un_mirnas]
 [z.join() for z in ps_un_mirnas]

 # Convert managers to lists
 new_deseq=list(new_deseq)
 new_deseq1=list(new_deseq1)
 con_file_order=list(new_con_file_order)
 tre_file_order=list(new_tre_file_order)

 # Genereation of count matrices per group (controls - treated)
 control_group=[[x[0],x[2]] for x in new_deseq[0]]
 [control_group[i].append(y[i][1]) for i,_ in enumerate(control_group) for y in new_deseq]

 treated_group=[[x[0],x[2]] for x in new_deseq1[0]]
 [treated_group[i].append(y[i][1]) for i,_ in enumerate(treated_group) for y in new_deseq1]

 # Keep a copy of count matrices
 control_group_copy=copy.deepcopy(list(control_group))
 treated_group_copy=copy.deepcopy(list(treated_group))

 # Initialization of managers
 merg_nam_control_group=manager.list()
 merg_nam_treated_group=manager.list()

 # Merging of names different names for the same mirna sequence per group (controls, treated) to avoid duplicates
 ps_merge = [Process(target=merging_names,args=(control_group_copy,merg_nam_control_group))]
 ps_merge.extend([Process(target=merging_names,args=(treated_group_copy,merg_nam_treated_group))])
 [x.start() for x in ps_merge]


 # Add unique mirna sequences between groups (all groups will have the same amount of sequences)
 con_list=manager.list()
 tre_list=manager.list()

 ps_bw = [Process(target=black_white,args=(con_names_seqs,tre_names_seqs,treated_group,tre_list))]
 ps_bw.extend([Process(target=black_white,args=(tre_names_seqs,con_names_seqs,control_group,con_list))])
 [x.start() for x in ps_bw]
 [x.join() for x in ps_bw]

 control_group=list(con_list)
 treated_group=list(tre_list)

 # Detection of duplications
 dupes=manager.list()

 ps_dupes = Process(target=merging_dupes,args=(control_group,dupes))
 ps_dupes.start()
 ps_dupes.join()

 dupes=list(dupes)

 # Merging the duplications in one entry with all different names
 con_list=manager.list()
 tre_list=manager.list()

 ps_ap_merg_dupes = [Process(target=apply_merging_dupes,args=(control_group,dupes,con_list))]
 ps_ap_merg_dupes.extend([Process(target=apply_merging_dupes,args=(treated_group,dupes,tre_list))])
 [x.start() for x in ps_ap_merg_dupes]

 # Preparation of reference sequences (isodforms) for the detection of non template mirnas
 if args.anal=="2":
    all_iso = manager.list()
    ps_non_iso = Process(target=non_template_ref,args=(con_samples,tre_samples,all_iso))
    ps_non_iso.start()

 # Finishing the process for merging
 [x.join() for x in ps_merge]
 merg_nam_control_group=list(merg_nam_control_group)
 merg_nam_treated_group=list(merg_nam_treated_group)

 # Export the database and the graphs
 procs = [Process(target=DB_write,args=(x[0],x[1],x[2],x[3],1)) for x in con_data]
 procs.extend([Process(target=DB_write,args=(x[0],x[1],x[2],x[3],1)) for x in tre_data])
 procs.extend([Process(target=make_spider,args=(merg_nam_control_group,merg_nam_treated_group,args.group1,args.group2))])

 if args.anal == "1":
    procs.extend([Process(target=pie_temp,args=(merg_nam_control_group,con_unmap_seq.value,con_unmap_counts.value,merg_nam_treated_group,tre_unmap_seq.value,tre_unmap_counts.value,args.group1,args.group2))])

 [p.start() for p in procs]

 # Export the pdf report file
 if args.anal=="1":
    [x.join() for x in ps_hist]
    [p.join() for p in procs]
    ps_pdf = Process(target=pdf_before_DE,args=(args.anal,args.group1,args.group2))
    ps_pdf.start()


 [x.join() for x in ps_ap_merg_dupes]
 control_group=list(con_list)
 treated_group=list(tre_list)

 # Filters low count mirnas (otpional)
 if int(args.per)!=-1:
    if int(args.per)>0 and float(args.per)<=100 and int(args.count)>0:

       fil_con_group=manager.list()
       fil_tre_group=manager.list()

       ps_low_counts = Process(target=filter_low_counts,args=(control_group,treated_group,fil_con_group,fil_tre_group,args.per,args.count))
       ps_low_counts.start()
       ps_low_counts.join()

       fil_con_group=list(fil_con_group)
       fil_tre_group=list(fil_tre_group)
    else:
        sys.exit("Not acceptable values for filter")
    

 if "fil_con_group" not in locals() or "fil_con_group" not in globals():
    fil_con_group=control_group
    fil_tre_group=treated_group

 # export count matrices
 ps_write = Process(target=write_main,args=(control_group, treated_group, fil_con_group, fil_tre_group, con_file_order,tre_file_order,1,args.group1,args.group2,args.per))
 ps_write.start()

 # export counts files compatible with Deseq2 and EdgeR
 ps1_matrix = [Process(target=temp_counts_to_diff,args=(con_file_order,fil_con_group,"Diff/temp_con/"))]
 ps1_matrix.extend([Process(target=temp_counts_to_diff,args=(tre_file_order,fil_tre_group,"Diff/temp_tre/"))])
 [p.start() for p in ps1_matrix]

 if args.anal=="1":
    ps_pdf.join()
 if args.anal=="2":
    [p.join() for p in procs]
    [x.join() for x in ps_hist]

 ps_write.join()
 [p.join() for p in ps1_matrix]

############################## Detection of non-template  #######################################

 if args.anal == "2":

  # Initialization of the managers between the proccesses
  # First group of samples (controls)
  n_con_data= manager.list()
  n_con_file_order=manager.list()
  n_con_names_seqs=manager.list()
  n_deseq=manager.list()

  # Second group of samples (treated)
  n_tre_data = manager.list()
  n_tre_file_order = manager.list()
  n_tre_names_seqs=manager.list()
  n_deseq1=manager.list()

  # Preparation of reference sequences
  new_ref_mirnas = list(mature_mirnas)
  ps_non_iso.join()

  all_iso=list(all_iso)
  new_ref_mirnas.extend(all_iso)

  # Processing of non template miRNAs from SAM files
  ps_sam = [Process(target=non_sam_edit,args=(new_ref_mirnas,path[1][:-1],path[0].split(",")[0],"c",lock,n_con_data,n_con_file_order,n_deseq,n_con_names_seqs)) for path in control]
  ps_sam.extend([Process(target=non_sam_edit,args=(new_ref_mirnas,path[1][:-1],path[0].split(",")[0],"t",lock,n_tre_data,n_tre_file_order,n_deseq1,n_tre_names_seqs)) for path in treated])

  [p.start() for p in ps_sam]
  [p.join() for p in ps_sam]

  # Convert managers to lists
  n_con_file_order=list(n_con_file_order)
  n_tre_file_order=list(n_tre_file_order)
  n_deseq=list(n_deseq)
  n_deseq1=list(n_deseq1)

  # Remove duplicates and sorting
  n_con_names_seqs=list(n_con_names_seqs)
  n_con_names_seqs.sort()
  n_con_names_seqs=list(n_con_names_seqs for n_con_names_seqs,_ in itertools.groupby(n_con_names_seqs))

  n_tre_names_seqs=list(n_tre_names_seqs)
  n_tre_names_seqs.sort()
  n_tre_names_seqs=list(n_tre_names_seqs for n_tre_names_seqs,_ in itertools.groupby(n_tre_names_seqs))

  # initialization of new managers
  new_n_con_file_order=manager.list()
  new_n_tre_file_order=manager.list()
  n_new_deseq=manager.list()
  n_new_deseq1=manager.list()

  # add uncommon detected mirnas among the samples
  ps_deseq=[Process(target=uncommon_mirnas,args=(sampp,n_con_names_seqs,lock,n_new_deseq,n_con_file_order[i],new_n_con_file_order)) for i,sampp in enumerate(n_deseq)]
  ps_deseq.extend([Process(target=uncommon_mirnas,args=(sampp,n_tre_names_seqs,lock,n_new_deseq1,n_tre_file_order[i],new_n_tre_file_order)) for i,sampp in enumerate(n_deseq1)])

  # Wait for processing of uncommon detected mirnas to finish
  [x.start() for x in ps_deseq]
  [x.join() for x in ps_deseq]

  # Convert managers to lists
  n_new_deseq=list(n_new_deseq)
  n_new_deseq1=list(n_new_deseq1)
  n_con_file_order=list(new_n_con_file_order)
  n_tre_file_order=list(new_n_tre_file_order)

  # Genereation of count matrices per group (controls - treated)
  n_control_group=[[x[0],x[2]] for x in n_new_deseq[0]]
  [n_control_group[i].append(y[i][1]) for i,_ in enumerate(n_control_group) for y in n_new_deseq]

  n_treated_group=[[x[0],x[2]] for x in n_new_deseq1[0]]
  [n_treated_group[i].append(y[i][1]) for i,_ in enumerate(n_treated_group) for y in n_new_deseq1]

  # Keep a copy of count matrices
  n_control_group_copy=copy.deepcopy(list(n_control_group))
  n_treated_group_copy=copy.deepcopy(list(n_treated_group))

  # Initialization of managers
  merg_nam_n_control_group=manager.list()
  merg_nam_n_treated_group=manager.list()

  # Merging of names different names for the same mirna sequence per group (controls, treated) to avoid duplicates\
  ps_merge = [Process(target=merging_names,args=(n_control_group_copy,merg_nam_n_control_group))]
  ps_merge.extend([Process(target=merging_names,args=(n_treated_group_copy,merg_nam_n_treated_group))])
  [x.start() for x in ps_merge]

  # Add unique mirna sequences between groups (all groups will have the same amount of sequences)
  n_con_list=manager.list()
  n_tre_list=manager.list()

  ps_bw = [Process(target=black_white,args=(n_con_names_seqs,n_tre_names_seqs,n_treated_group,n_tre_list))]
  ps_bw.extend([Process(target=black_white,args=(n_tre_names_seqs,n_con_names_seqs,n_control_group,n_con_list))])
  [x.start() for x in ps_bw]
  [x.join() for x in ps_bw]

  n_control_group=list(n_con_list)
  n_treated_group=list(n_tre_list)

  # Detection of duplications
  n_dupes=manager.list()

  ps_dupes = Process(target=merging_dupes,args=(n_control_group,n_dupes))
  ps_dupes.start()
  ps_dupes.join()

  n_dupes=list(n_dupes)

  # Merging the duplications in one entry with all different names
  n_con_list=manager.list()
  n_tre_list=manager.list()

  ps_ap_merg_dupes = [Process(target=apply_merging_dupes,args=(n_control_group,n_dupes,n_con_list))]
  ps_ap_merg_dupes.extend([Process(target=apply_merging_dupes,args=(n_treated_group,n_dupes,n_tre_list))])
  [x.start() for x in ps_ap_merg_dupes]

  # Finishing the process for merging
  [x.join() for x in ps_merge]
  merg_nam_n_control_group=list(merg_nam_n_control_group)
  merg_nam_n_treated_group=list(merg_nam_n_treated_group)

  # Export the database and the graphs
  procs = [Process(target=DB_write,args=(x[0],x[1],x[2],x[3],2)) for x in n_con_data]
  procs.extend([Process(target=DB_write,args=(x[0],x[1],x[2],x[3],2)) for x in n_tre_data])
  procs.extend([Process(target=logo_seq_red,args=(merg_nam_n_control_group,'c',args.group1))])
  procs.extend([Process(target=logo_seq_red,args=(merg_nam_n_treated_group,'t',args.group2))])
  procs.extend([Process(target=pie_non_temp,args=(merg_nam_control_group,merg_nam_n_control_group,merg_nam_treated_group,merg_nam_n_treated_group,con_unmap_seq.value,tre_unmap_seq.value,con_unmap_counts.value,tre_unmap_counts.value,args.group1,args.group2))])

  [p.start() for p in procs]
  [p.join() for p in procs]

  procs1 = Process(target=pdf_before_DE,args=(args.anal,args.group1,args.group2))
  procs1.start()

  [x.join() for x in ps_ap_merg_dupes]
  n_control_group=list(n_con_list)
  n_treated_group=list(n_tre_list)


  # Filters low count mirnas (otpional)
  if int(args.per)!=-1:
     if int(args.per)>0 and float(args.per)<=100 and int(args.count)>0:

        n_fil_con_group=manager.list()
        n_fil_tre_group=manager.list()

        ps_low_counts = Process(target=filter_low_counts,args=(n_control_group,n_treated_group,n_fil_con_group,n_fil_tre_group,args.per,args.count))
        ps_low_counts.start()
        ps_low_counts.join()

        n_fil_con_group=list(n_fil_con_group)
        n_fil_tre_group=list(n_fil_tre_group)
     
     else:
        sys.exit("Not acceptable values for filter")
  
  if "n_fil_con_group" not in locals() or "n_fil_con_group" not in globals():
      n_fil_con_group=n_control_group
      n_fil_tre_group=n_treated_group

  ps_write = Process(target=write_main,args=(n_control_group, n_treated_group,n_fil_con_group, n_fil_tre_group, n_con_file_order, n_tre_file_order,2,args.group1,args.group2,args.per))
  ps_write.start()

  ps1_matrix = [Process(target=nontemp_counts_to_diff,args=(n_con_file_order,n_fil_con_group,con_file_order,fil_con_group,"Diff/n_temp_con/"))]
  ps1_matrix.extend([Process(target=nontemp_counts_to_diff,args=(n_tre_file_order,n_fil_tre_group,tre_file_order,fil_tre_group,"Diff/n_temp_tre/"))])
  [p.start() for p in ps1_matrix]

  ps_write.join()
  [p.join() for p in ps1_matrix]
  procs1.join()
 print('Running time: {} seconds'.format(round(time.time() - starttime,2)))