Mercurial > repos > glogobyte > isoread
view mirgene_graphs.py @ 17:0cdc0c3c821e draft
Uploaded
author | glogobyte |
---|---|
date | Wed, 20 Oct 2021 08:44:20 +0000 |
parents | d58f050acd18 |
children | 01679b6e886a |
line wrap: on
line source
import itertools import pandas as pd from math import pi import numpy as np import matplotlib.pyplot as plt import math import logomaker as lm from fpdf import FPDF, fpdf import glob ############################################################################################################################################################################### def pie_non_temp(merge_con,merge_non_con,merge_tre,merge_non_tre,c_unmap,t_unmap,c_unmap_counts,t_unmap_counts,group_name1,group_name2): c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_con] t_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_tre] c_non_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_non_con] t_non_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_non_tre] c_templ = 0 c_tem_counts = 0 c_mature = 0 c_mat_counts = 0 t_templ = 0 t_tem_counts = 0 t_mature = 0 t_mat_counts = 0 c_non = len(c_non_samples) c_non_counts = sum(x[2] for x in c_non_samples) t_non = len(t_non_samples) t_non_counts = sum(x[2] for x in t_non_samples) c_unmap = c_unmap - c_non t_unmap = c_unmap - t_non c_unmap_counts=c_unmap_counts - c_non_counts t_unmap_counts=t_unmap_counts - t_non_counts for x in c_samples: if "/" not in x[0]: if len(x[0].split("_"))==2: c_mature+=1 c_mat_counts += x[2] else: c_templ+=1 c_tem_counts += x[2] else: f=0 for y in x[0].split("/"): if len(y.split("_"))==2: c_mature+=1 c_mat_counts += x[2] f=1 break if f==0: c_templ+=1 c_tem_counts += x[2] for x in t_samples: if "/" not in x[0]: if len(x[0].split("_"))==2: t_mature+=1 t_mat_counts += x[2] else: t_templ+=1 t_tem_counts += x[2] else: f=0 for y in x[0].split("/"): if len(y.split("_"))==2: t_mature+=1 t_mat_counts += x[2] f=1 break if f==0: t_templ+=1 t_tem_counts += x[2] fig = plt.figure(figsize=(7,5)) labels = 'miRNA RefSeq','templated', 'unassigned' ,'non-templated' sizes = [c_mat_counts, c_tem_counts, c_unmap_counts,c_non_counts] colors = ['gold', 'yellowgreen', 'lightcoral', 'lightskyblue'] # Plot ax1 = plt.subplot2grid((1,2),(0,0)) patches, texts, autotexts=plt.pie(sizes, labels=labels, colors=colors, startangle=140,autopct='%1.1f%%',radius=0.8) [x.set_fontsize(10) for x in texts] plt.title(group_name1.capitalize() + ' group (reads)',fontsize=12) labels = 'miRNA RefSeq','templated', 'unassigned','non-templated' sizes = [t_mat_counts, t_tem_counts, t_unmap_counts, t_non_counts] colors = ['gold', 'yellowgreen', 'lightcoral', 'lightskyblue'] # Plot ax2 = plt.subplot2grid((1,2),(0,1)) patches, texts, autotexts=plt.pie(sizes, labels=labels, colors=colors, startangle=140,autopct='%1.1f%%',radius=0.8) [x.set_fontsize(10) for x in texts] plt.title(group_name2.capitalize() + ' group (reads)', fontsize=12) plt.savefig('pie_non.png',dpi=300) #################################################################################################################################################################################################################### def pie_temp(merge_con,c_unmap,c_unmap_counts,merge_tre,t_unmap,t_unmap_counts,group_name1,group_name2): c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_con] t_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_tre] c_templ = 0 c_tem_counts = 0 c_mature = 0 c_mat_counts = 0 t_templ = 0 t_tem_counts = 0 t_mature = 0 t_mat_counts = 0 for x in c_samples: if "/" not in x[0]: if len(x[0].split("_"))==2: c_mature+=1 c_mat_counts += x[2] else: c_templ+=1 c_tem_counts += x[2] else: f=0 for y in x[0].split("/"): if len(y.split("_"))==2: c_mature+=1 c_mat_counts += x[2] f=1 break if f==0: c_templ+=1 c_tem_counts += x[2] for x in t_samples: if "/" not in x[0]: if len(x[0].split("_"))==2: t_mature+=1 t_mat_counts += x[2] else: t_templ+=1 t_tem_counts += x[2] else: f=0 for y in x[0].split("/"): if len(y.split("_"))==2: t_mature+=1 t_mat_counts += x[2] f=1 break if f==0: t_templ+=1 t_tem_counts += x[2] fig = plt.figure() labels = 'miRNA RefSeq','templated', 'unassigned' sizes = [c_mat_counts, c_tem_counts, c_unmap_counts] colors = ['gold', 'yellowgreen', 'lightskyblue'] explode = (0.2, 0.05, 0.1) # Plot ax1 = plt.subplot2grid((1,2),(0,0)) patches, texts, autotexts=plt.pie(sizes, labels=labels, colors=colors, startangle=140,autopct='%1.1f%%',radius=0.8) [x.set_fontsize(10) for x in texts] plt.title(group_name1.capitalize() + ' group (reads)', fontsize=12) labels = 'miRNA RefSeq','templated', 'Unassigned' sizes = [t_mat_counts, t_tem_counts, t_unmap_counts] colors = ['gold', 'yellowgreen', 'lightskyblue'] explode = (0.2, 0.05, 0.1) # Plot ax2 = plt.subplot2grid((1,2),(0,1)) patches, texts, autotexts=plt.pie(sizes, labels=labels, colors=colors, startangle=140,autopct='%1.1f%%',radius=0.8) [x.set_fontsize(10) for x in texts] plt.title(group_name2.capitalize() + ' group (reads)',fontsize = 12) plt.savefig('pie_tem.png',dpi=300) ################################################################################################################################################################################################################### def make_spider(merge_con,merge_tre,group_name1,group_name2): c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_con] t_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_tre] c_5 = 0 c_5_counts = 0 c_3 = 0 c_3_counts = 0 c_both =0 c_both_counts=0 c_mature = 0 c_mat_counts = 0 c_exception=0 c_exception_counts=0 t_5 = 0 t_5_counts = 0 t_3 = 0 t_3_counts = 0 t_both = 0 t_both_counts = 0 t_mature = 0 t_mat_counts = 0 t_exception = 0 t_exception_counts=0 for x in c_samples: if "/" not in x[0]: if len(x[0].split("_"))==2: c_mature+=1 c_mat_counts += x[2] elif 0 == int(x[0].split("_")[3]): c_5+=1 c_5_counts += x[2] elif 0 == int(x[0].split("_")[4]): c_3+=1 c_3_counts += x[2] else: c_both+=1 c_both_counts+=x[2] else: f=0 for y in x[0].split("/"): if len(y.split("_"))==2: c_mature+=1 c_mat_counts += x[2] f=1 break if f==0: part=x[0].split("/")[0].split("_")[-2]+"_"+x[0].split("/")[0].split("_")[-1] num=0 for y in x[0].split("/"): if y.split("_")[-2]+"_"+y.split("_")[-1]==part: num+=1 if num==len(x[0].split("/")): if 0 == int(x[0].split("/")[0].split("_")[3]): c_5+=1 c_5_counts += x[2] elif 0 == int(x[0].split("/")[0].split("_")[4]): c_3+=1 c_3_counts += x[2] else: c_both+=1 c_both_counts+=x[2] else: c_exception+=1 c_exception_counts += x[2] for x in t_samples: if "/" not in x[0]: if len(x[0].split("_"))==2: t_mature+=1 t_mat_counts += x[2] elif 0 == int(x[0].split("_")[3]): t_5+=1 t_5_counts += x[2] elif 0 == int(x[0].split("_")[4]): t_3+=1 t_3_counts += x[2] else: t_both+=1 t_both_counts+=x[2] else: f=0 for y in x[0].split("/"): if len(y.split("_"))==2: t_mature+=1 t_mat_counts += x[2] f=1 break if f==0: part=x[0].split("/")[0].split("_")[-2]+"_"+x[0].split("/")[0].split("_")[-1] num=0 for y in x[0].split("/"): if y.split("_")[-2]+"_"+y.split("_")[-1]==part: num+=1 if num==len(x[0].split("/")): if 0 == int(x[0].split("/")[0].split("_")[3]): t_5+=1 t_5_counts += x[2] elif 0 == int(x[0].split("/")[0].split("_")[4]): t_3+=1 t_3_counts += x[2] else: t_both+=1 t_both_counts+=x[2] else: t_exception+=1 t_exception_counts += x[2] c_all = c_5+c_3+c_both+c_mature+c_exception c_all_counts = c_5_counts + c_3_counts + c_both_counts + c_mat_counts + c_exception_counts t_all = t_5+t_3+t_both+t_mature + t_exception t_all_counts = t_5_counts + t_3_counts + t_both_counts + t_mat_counts + t_exception_counts c_5 = round(c_5/c_all*100,2) c_3 = round(c_3/c_all*100,2) c_both = round(c_both/c_all*100,2) c_mature = round(c_mature/c_all*100,2) c_exception = round(c_exception/c_all*100,2) c_5_counts = round(c_5_counts/c_all_counts*100,2) c_3_counts = round(c_3_counts/c_all_counts*100,2) c_both_counts = round(c_both_counts/c_all_counts*100,2) c_mat_counts = round(c_mat_counts/c_all_counts*100,2) c_exception_counts = round(c_exception_counts/c_all_counts*100,2) t_5 = round(t_5/t_all*100,2) t_3 = round(t_3/t_all*100,2) t_both = round(t_both/t_all*100,2) t_mature = round(t_mature/t_all*100,2) t_exception = round(t_exception/t_all*100,2) t_5_counts = round(t_5_counts/t_all_counts*100,2) t_3_counts = round(t_3_counts/t_all_counts*100,2) t_both_counts = round(t_both_counts/t_all_counts*100,2) t_mat_counts = round(t_mat_counts/t_all_counts*100,2) t_exception_counts = round(t_exception_counts/t_all_counts*100,2) radar_max = max(c_5, c_3, c_both,c_mature,c_exception,t_5,t_3,t_both,t_mature,t_exception) radar_max_counts = max(c_5_counts,c_3_counts,c_both_counts,c_mat_counts,c_exception_counts,t_5_counts,t_3_counts,t_both_counts,t_mat_counts,t_exception_counts) df=pd.DataFrame({ 'group':[group_name1.capitalize(),group_name2.capitalize()], """5'3'-isomiRs""":[c_both,t_both], """3'-isomiRs""":[c_3,t_3], 'RefSeq miRNA':[c_mature,t_mature], """5'-isomiRs""":[c_5,t_5], 'others*':[c_exception,t_exception]}) df1=pd.DataFrame({ 'group':[group_name1.capitalize(),group_name2.capitalize()], """5'-3'-isomiRs""":[c_both_counts,t_both_counts], """3'-isomiRs""":[c_3_counts,t_3_counts], 'RefSeq miRNA':[c_mat_counts,t_mat_counts], """5'-isomiRs""":[c_5_counts,t_5_counts], 'others*':[c_exception_counts,t_exception_counts]}) spider_last(df,radar_max,1,group_name1,group_name2) spider_last(df1,radar_max_counts,2,group_name1,group_name2) ############################################################################################################################################################################################################## def spider_last(df,radar_max,flag,group_name1,group_name2): # ------- PART 1: Create background fig = plt.figure() # number of variable categories=list(df)[1:] N = len(categories) # What will be the angle of each axis in the plot? (we divide the plot / number of variable) angles = [n / float(N) * 2 * pi for n in range(N)] angles += angles[:1] # Initialise the spider plot ax = plt.subplot(111, polar=True) # If you want the first axis to be on top: ax.set_theta_offset(pi/2) ax.set_theta_direction(-1) # Draw one axe per variable + add labels labels yet plt.xticks(angles[:-1], categories, fontsize=13) # Draw ylabels radar_max=round(radar_max+radar_max*0.1) mul=len(str(radar_max))-1 maxi=int(math.ceil(radar_max / pow(10,mul))) * pow(10,mul) sep = round(maxi/4) plt.yticks([sep, 2*sep, 3*sep, 4*sep, 5*sep], [str(sep)+'%', str(2*sep)+'%', str(3*sep)+'%', str(4*sep)+'%', str(5*sep)+'%'], color="grey", size=12) plt.ylim(0, maxi) # ------- PART 2: Add plots # Plot each individual = each line of the data # I don't do a loop, because plotting more than 3 groups makes the chart unreadable # Ind1 values=df.loc[0].drop('group').values.flatten().tolist() values += values[:1] ax.plot(angles, values,'-o', linewidth=1, linestyle='solid', label=group_name1.capitalize()) ax.fill(angles, values, 'b', alpha=0.1) # Ind2 values=df.loc[1].drop('group').values.flatten().tolist() values += values[:1] ax.plot(angles, values, '-o' ,linewidth=1, linestyle='solid', label=group_name2.capitalize()) ax.fill(angles, values, 'r', alpha=0.1) # Add legend if flag==1: plt.legend(loc='upper right', prop={'size': 11}, bbox_to_anchor=(0.0, 0.1)) plt.savefig('spider_non_red.png',dpi=300) else: plt.legend(loc='upper right', prop={'size': 11}, bbox_to_anchor=(0.0, 0.1)) plt.savefig('spider_red.png',dpi=300) ############################################################################################################################################################################################################# def hist_red(samples,flag,group): lengths=[] cat=[] total_reads=0 seq=[] if flag == "c": title = "Length distribution of "+group.lower()+" group (redundant reads)" if flag == "t": title = "Length distribution of "+group.lower()+" group (redundant reads)" for i in samples: for x in i: lengths.append(x[3]) if x[1]=="0": seq.append([x[3],x[0].split("-")[1],"Mapped"]) cat.append("Mapped") if x[1] == "4": seq.append([x[3],x[0].split("-")[1],"Unassigned"]) cat.append("Unassigned") uni_len=list(set(lengths)) uni_len=[x for x in uni_len if x<=35] low=min(lengths) up=max(lengths) seq.sort() uni_seq=list(seq for seq,_ in itertools.groupby(seq)) dim=up-low if dim>20: s=5 else: s=8 total_reads+=sum([int(x[1]) for x in uni_seq]) map_reads=[] unmap_reads=[] length=[] for y in uni_len: map_temp=0 unmap_temp=0 for x in uni_seq: if x[0]==y and x[2]=="Mapped": map_temp+=int(x[1]) if x[0]==y and x[2]=="Unassigned": unmap_temp+=int(x[1]) if y<=35: length.append(y) map_reads.append(round(map_temp/total_reads*100,2)) unmap_reads.append(round(unmap_temp/total_reads*100,2)) ylim=max([sum(x) for x in zip(unmap_reads, map_reads)]) ylim=ylim+ylim*20/100 fig, ax = plt.subplots() width=0.8 ax.bar(length, unmap_reads, width, label='Unassigned') h=ax.bar(length, map_reads, width, bottom=unmap_reads, label='Mapped') plt.xticks(np.arange(length[0], length[-1]+1, 1)) plt.yticks(np.arange(0, ylim, 5)) plt.xlabel('Length (nt)',fontsize=14) plt.ylabel('Percentage',fontsize=14) plt.title(title,fontsize=14) ax.legend() plt.ylim([0, ylim]) ax.grid(axis='y',linewidth=0.2) if flag=='c': plt.savefig('c_hist_red.png',dpi=300) if flag=='t': plt.savefig('t_hist_red.png',dpi=300) ############################################################################################################################################################################################################################################## def logo_seq_red(merge, flag, group): if flag=="c": titlos = group + " group (redundant)" file_logo = "c_logo.png" file_bar = "c_bar.png" if flag=="t": titlos = group + " group (redundant)" file_logo = "t_logo.png" file_bar = "t_bar.png" c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge] A=[0]*3 C=[0]*3 G=[0]*3 T=[0]*3 total_reads=0 for y in c_samples: if "/" in y[0]: length=[] for x in y[0].split("/"): length.append([len(x.split("_")[-1]),x.split("_")[-1],y[2]]) best=min(length) total_reads+=best[2] for i in range(3): if i<len(best[1]): if best[1][i] == "A": A[i]+=best[2] elif best[1][i] == "C": C[i]+=best[2] elif best[1][i] == "G": G[i]+=best[2] else: T[i]+=best[2] else: total_reads+=y[2] for i in range(3): if i<len(y[0].split("_")[-1]): if y[0].split("_")[-1][i] == "A": A[i]+=(y[2]) elif y[0].split("_")[-1][i] == "C": C[i]+=(y[2]) elif y[0].split("_")[-1][i] == "G": G[i]+=(y[2]) else: T[i]+=y[2] A[:] = [round(x*100,1) / total_reads for x in A] C[:] = [round(x*100,1) / total_reads for x in C] G[:] = [round(x*100,1) / total_reads for x in G] T[:] = [round(x*100,1) / total_reads for x in T] data = {'A':A,'C':C,'G':G,'T':T} df = pd.DataFrame(data, index=[1,2,3]) h=df.plot.bar(color=tuple(["g", "b","gold","r"]) ) h.grid(axis='y',linewidth=0.2) plt.xticks(rotation=0, ha="right") plt.ylabel("Counts (%)",fontsize=18) plt.xlabel("Numbers of additional nucleotides",fontsize=18) plt.title(titlos,fontsize=20) plt.tight_layout() plt.savefig(file_bar, dpi=300) crp_logo = lm.Logo(df, font_name = 'DejaVu Sans') crp_logo.style_spines(visible=False) crp_logo.style_spines(spines=['left', 'bottom'], visible=True) crp_logo.style_xticks(rotation=0, fmt='%d', anchor=0) crp_logo.ax.set_title(titlos,fontsize=18) crp_logo.ax.set_ylabel("Counts (%)", fontsize=16,labelpad=5) crp_logo.ax.set_xlabel("Numbers of additional nucleotides",fontsize=16, labelpad=5) crp_logo.ax.xaxis.set_ticks_position('none') crp_logo.ax.xaxis.set_tick_params(pad=-1) figure = plt.gcf() figure.set_size_inches(6, 4) crp_logo.fig.savefig(file_logo,dpi=300) ########################################################################################################################################################################################################## def pdf_before_DE(analysis,group_name1,group_name2): # Image extensions if analysis=="2": image_extensions = ("c_hist_red.png","t_hist_red.png","pie_non.png","spider_red.png","spider_non_red.png","c_logo.png","t_logo.png","c_bar.png","t_bar.png") else: image_extensions = ("c_hist_red.png","t_hist_red.png","pie_tem.png","spider_red.png","spider_non_red.png") # This list will hold the images file names images = [] # Build the image list by merging the glob results (a list of files) # for each extension. We are taking images from current folder. for extension in image_extensions: images.extend(glob.glob(extension)) #sys.exit(images) # Create instance of FPDF class pdf = FPDF('P', 'in', 'A4') # Add new page. Without this you cannot create the document. pdf.add_page() # Set font to Arial, 'B'old, 16 pts pdf.set_font('Arial', 'B', 20.0) # Page header pdf.cell(pdf.w-0.5, 0.5, 'IsomiR profile report',align='C') pdf.ln(0.7) pdf.set_font('Arial','', 16.0) pdf.cell(pdf.w-0.5, 0.5, 'sRNA length distribution',align='C') # Smaller font for image captions pdf.set_font('Arial', '', 11.0) # Image caption pdf.ln(0.5) yh=FPDF.get_y(pdf) pdf.image(images[0],x=0.3,w=4, h=3) pdf.image(images[1],x=4,y=yh, w=4, h=3) pdf.ln(0.3) # Image caption pdf.cell(0.2) pdf.cell(3.0, 0.0, " Mapped and unmapped reads to custom precursor arm reference DB (5p and 3p arms) in "+group_name1.lower()) pdf.ln(0.2) pdf.cell(0.2) pdf.cell(3.0, 0.0, " (left) and "+group_name2.lower()+" (right) groups") pdf.ln(0.5) h1=FPDF.get_y(pdf) pdf.image(images[2],x=1, w=6.5, h=5) h2=FPDF.get_y(pdf) FPDF.set_y(pdf,h1+0.2) pdf.set_font('Arial','', 16.0) pdf.cell(pdf.w-0.5, 0.5, 'Template and non-template isomiRs',align='C') pdf.set_font('Arial', '', 11.0) FPDF.set_y(pdf,h2) FPDF.set_y(pdf,9.5) # Image caption pdf.cell(0.2) if analysis=="2": pdf.cell(3.0, 0.0, " RefSeq miRNAs, templated isomiRs, non-templated isomiRs and unassigned sequences as percentage") pdf.ln(0.2) pdf.cell(0.2) pdf.cell(3.0, 0.0, " of total sRNA reads in "+group_name1.lower()+" (left) and "+group_name2.lower()+" (right) groups") else: pdf.cell(3.0, 0.0, " RefSeq miRNAS, Templated isomiRs and unassigned sequences as percentage of total sRNA reads in") pdf.ln(0.2) pdf.cell(0.2) pdf.cell(3.0, 0.0, " "+group_name1.lower()+" (left) and "+group_name2.lower()+" (right) groups") pdf.add_page() pdf.set_font('Arial', 'B', 18.0) pdf.cell(pdf.w-0.5, 0.5, "Templated isomiR subtypes",align='C') pdf.ln(0.7) pdf.set_font('Arial', 'B', 14.0) pdf.cell(pdf.w-0.5, 0.5, "Templated isomiR profile (redundant reads)",align='C') pdf.ln(0.5) pdf.image(images[3],x=1.5, w=5.5, h=4) pdf.ln(0.6) pdf.cell(pdf.w-0.5, 0.0, "Templated isomiR profile (non-redundant reads)",align='C') pdf.set_font('Arial', '', 12.0) pdf.ln(0.2) pdf.image(images[4],x=1.5, w=5.5, h=4) pdf.ln(0.3) pdf.set_font('Arial', '', 11.0) pdf.cell(0.2) pdf.cell(3.0, 0.0, " * IsomiRs potentially generated from multiple loci") if analysis=="2": pdf.add_page('L') pdf.set_font('Arial', 'B', 18.0) pdf.cell(pdf.w-0.5, 0.5, "Non-templated isomiRs",align='C') pdf.ln(0.5) pdf.set_font('Arial', 'B', 14.0) pdf.cell(pdf.w-0.5, 0.5, "3'-end additions to RefSeq miRNAs and templated isomiRs",align='C') pdf.ln(0.7) yh=FPDF.get_y(pdf) pdf.image(images[5],x=1.5,w=3.65, h=2.65) pdf.image(images[7],x=6.5,y=yh, w=3.65, h=2.65) pdf.ln(0.5) yh=FPDF.get_y(pdf) pdf.image(images[6],x=1.5,w=3.65, h=2.65) pdf.image(images[8],x=6.5,y=yh, w=3.65, h=2.65) pdf.close() pdf.output('report1.pdf','F') #############################################################################################################################################################3