Mercurial > repos > glogobyte > isoread
view mirbase_graphs.py @ 4:f44185f616bc draft
Uploaded
author | glogobyte |
---|---|
date | Wed, 13 Oct 2021 16:04:40 +0000 |
parents | d77b33e65501 |
children | fa48ad87ae3e |
line wrap: on
line source
import itertools import pandas as pd from math import pi import numpy as np import matplotlib.pyplot as plt import math import logomaker as lm from fpdf import FPDF, fpdf import glob ################################################################################################################################################################# def pie_non_temp(merge_con,merge_non_con,merge_tre,merge_non_tre,c_unmap,t_unmap,c_unmap_counts,t_unmap_counts,group_name1,group_name2): c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_con] t_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_tre] c_non_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_non_con] t_non_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_non_tre] c_templ = 0 c_tem_counts = 0 c_mature = 0 c_mat_counts = 0 t_templ = 0 t_tem_counts = 0 t_mature = 0 t_mat_counts = 0 c_non = len(c_non_samples) c_non_counts = sum(x[2] for x in c_non_samples) t_non = len(t_non_samples) t_non_counts = sum(x[2] for x in t_non_samples) c_unmap = c_unmap - c_non t_unmap = c_unmap - t_non c_unmap_counts=c_unmap_counts - c_non_counts t_unmap_counts=t_unmap_counts - t_non_counts for x in c_samples: if "/" not in x[0]: if "chr" in x[0].split("_")[-1]: c_mature+=1 c_mat_counts += x[2] else: c_templ+=1 c_tem_counts += x[2] else: f=0 for y in x[0].split("/"): if "chr" in y.split("_")[-1]: c_mature+=1 c_mat_counts += x[2] f=1 break if f==0: c_templ+=1 c_tem_counts += x[2] for x in t_samples: if "/" not in x[0]: if "chr" in x[0].split("_")[-1]: t_mature+=1 t_mat_counts += x[2] else: t_templ+=1 t_tem_counts += x[2] else: f=0 for y in x[0].split("/"): if "chr" in y.split("_")[-1]: t_mature+=1 t_mat_counts += x[2] f=1 break if f==0: t_templ+=1 t_tem_counts += x[2] fig = plt.figure(figsize=(7,5)) labels = 'miRNA RefSeq','Template', 'Unassigned','Non-template' sizes = [c_mat_counts, c_tem_counts, c_unmap_counts,c_non_counts] colors = ['gold', 'yellowgreen', 'lightcoral', 'lightskyblue'] ax1 = plt.subplot2grid((1,2),(0,0)) patches, texts, autotexts=plt.pie(sizes, labels=labels, colors=colors, startangle=140,autopct='%1.1f%%',radius=0.8) [x.set_fontsize(8) for x in texts] plt.title(group_name1 + ' Group (reads)',fontsize=12) labels = 'miRNA RefSeq','Template', 'Unassigned','non-template' sizes = [t_mat_counts, t_tem_counts, t_unmap_counts, t_non_counts] colors = ['gold', 'yellowgreen', 'lightcoral', 'lightskyblue'] ax2 = plt.subplot2grid((1,2),(0,1)) patches, texts, autotexts=plt.pie(sizes, labels=labels, colors=colors, startangle=140,autopct='%1.1f%%',radius=0.8) [x.set_fontsize(8) for x in texts] plt.title(group_name2 + ' Group (reads)', fontsize=12) plt.savefig('pie_non.png',dpi=300) ###################################################################################################################################################### def pie_temp(merge_con,c_unmap,c_unmap_counts,merge_tre,t_unmap,t_unmap_counts,group_name1,group_name2): c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_con] t_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_tre] c_templ = 0 c_tem_counts = 0 c_mature = 0 c_mat_counts = 0 t_templ = 0 t_tem_counts = 0 t_mature = 0 t_mat_counts = 0 for x in c_samples: if "/" not in x[0]: if "chr" in x[0].split("_")[-1]: c_mature+=1 c_mat_counts += x[2] else: c_templ+=1 c_tem_counts += x[2] else: f=0 for y in x[0].split("/"): if "chr" in y.split("_")[-1]: c_mature+=1 c_mat_counts += x[2] f=1 break if f==0: c_templ+=1 c_tem_counts += x[2] for x in t_samples: if "/" not in x[0]: if "chr" in x[0].split("_")[-1]: t_mature+=1 t_mat_counts += x[2] else: t_templ+=1 t_tem_counts += x[2] else: f=0 for y in x[0].split("/"): if "chr" in y.split("_")[-1]: t_mature+=1 t_mat_counts += x[2] f=1 break if f==0: t_templ+=1 t_tem_counts += x[2] fig = plt.figure() labels = 'miRNA RefSeq','Template', 'Unassigned' sizes = [c_mat_counts, c_tem_counts, c_unmap_counts] colors = ['gold', 'yellowgreen', 'lightskyblue'] explode = (0.2, 0.05, 0.1) ax1 = plt.subplot2grid((1,2),(0,0)) patches, texts, autotexts=plt.pie(sizes, labels=labels, colors=colors, startangle=140,autopct='%1.1f%%',radius=0.8) [x.set_fontsize(8) for x in texts] plt.title(group_name1 + ' group (reads)', fontsize=12) labels = 'miRNA RefSeq','Template', 'Unassigned' sizes = [t_mat_counts, t_tem_counts, t_unmap_counts] colors = ['gold', 'yellowgreen', 'lightskyblue'] explode = (0.2, 0.05, 0.1) ax2 = plt.subplot2grid((1,2),(0,1)) patches, texts, autotexts=plt.pie(sizes, labels=labels, colors=colors, startangle=140,autopct='%1.1f%%',radius=0.8) [x.set_fontsize(8) for x in texts] plt.title(group_name2 + ' group (reads)',fontsize = 12) plt.savefig('pie_tem.png',dpi=300) ################################################################################################################################################################################################################### def make_spider(merge_con,merge_tre,group_name1,group_name2): c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_con] t_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_tre] c_5 = 0 c_5_counts = 0 c_3 = 0 c_3_counts = 0 c_both =0 c_both_counts=0 c_mature = 0 c_mat_counts = 0 c_exception=0 c_exception_counts=0 t_5 = 0 t_5_counts = 0 t_3 = 0 t_3_counts = 0 t_both = 0 t_both_counts = 0 t_mature = 0 t_mat_counts = 0 t_exception = 0 t_exception_counts=0 for x in c_samples: if "/" not in x[0]: if "chr" in x[0].split("_")[-1]: c_mature+=1 c_mat_counts += x[2] elif 0 == int(x[0].split("_")[-1]): c_5+=1 c_5_counts += x[2] elif 0 == int(x[0].split("_")[-2]): c_3+=1 c_3_counts += x[2] else: c_both+=1 c_both_counts+=x[2] else: f=0 for y in x[0].split("/"): if "chr" in y.split("_")[-1]: c_mature+=1 c_mat_counts += x[2] f=1 break if f==0: for y in x[0].split("/"): c_exception+=1 c_exception_counts += x[2] for x in t_samples: if "/" not in x[0]: if "chr" in x[0].split("_")[-1]: t_mature+=1 t_mat_counts += x[2] elif 0 == int(x[0].split("_")[-1]): t_5+=1 t_5_counts += x[2] elif 0 == int(x[0].split("_")[-2]): t_3+=1 t_3_counts += x[2] else: t_both+=1 t_both_counts+=x[2] else: f=0 for y in x[0].split("/"): if "chr" in y.split("_")[-1]: t_mature+=1 t_mat_counts += x[2] f=1 break if f==0: for y in x[0].split("/"): t_exception+=1 t_exception_counts += x[2] c_all = c_5+c_3+c_both+c_mature+c_exception c_all_counts = c_5_counts + c_3_counts + c_both_counts + c_mat_counts + c_exception_counts t_all = t_5+t_3+t_both+t_mature + t_exception t_all_counts = t_5_counts + t_3_counts + t_both_counts + t_mat_counts + t_exception_counts c_5 = round(c_5/c_all*100,2) c_3 = round(c_3/c_all*100,2) c_both = round(c_both/c_all*100,2) c_mature = round(c_mature/c_all*100,2) c_exception = round(c_exception/c_all*100,2) c_5_counts = round(c_5_counts/c_all_counts*100,2) c_3_counts = round(c_3_counts/c_all_counts*100,2) c_both_counts = round(c_both_counts/c_all_counts*100,2) c_mat_counts = round(c_mat_counts/c_all_counts*100,2) c_exception_counts = round(c_exception_counts/c_all_counts*100,2) t_5 = round(t_5/t_all*100,2) t_3 = round(t_3/t_all*100,2) t_both = round(t_both/t_all*100,2) t_mature = round(t_mature/t_all*100,2) t_exception = round(t_exception/t_all*100,2) t_5_counts = round(t_5_counts/t_all_counts*100,2) t_3_counts = round(t_3_counts/t_all_counts*100,2) t_both_counts = round(t_both_counts/t_all_counts*100,2) t_mat_counts = round(t_mat_counts/t_all_counts*100,2) t_exception_counts = round(t_exception_counts/t_all_counts*100,2) radar_max = max(c_5, c_3, c_both,c_mature,c_exception,t_5,t_3,t_both,t_mature,t_exception) radar_max_counts = max(c_5_counts,c_3_counts,c_both_counts,c_mat_counts,c_exception_counts,t_5_counts,t_3_counts,t_both_counts,t_mat_counts,t_exception_counts) df=pd.DataFrame({ 'group':[group_name1,group_name2], """5'3'-isomiRs""":[c_both,t_both], """3'-isomiRs""":[c_3,t_3], 'RefSeq miRNA':[c_mature,t_mature], """5'-isomiRs""":[c_5,t_5], 'Others*':[c_exception,t_exception]}) df1=pd.DataFrame({ 'group':[group_name1,group_name2], """5'3'-isomiRs""":[c_both_counts,t_both_counts], """3'-isomiRs""":[c_3_counts,t_3_counts], 'RefSeq miRNA':[c_mat_counts,t_mat_counts], """5'-isomiRs""":[c_5_counts,t_5_counts], 'Others*':[c_exception_counts,t_exception_counts]}) spider_last(df,radar_max,1,group_name1,group_name2) spider_last(df1,radar_max_counts,2,group_name1,group_name2) ##################################################################################################################################################### def spider_last(df,radar_max,flag,group_name1,group_name2): # ------- PART 1: Create background fig = plt.figure() # number of variable categories=list(df)[1:] N = len(categories) # What will be the angle of each axis in the plot? (we divide the plot / number of variable) angles = [n / float(N) * 2 * pi for n in range(N)] angles += angles[:1] # Initialise the spider plot ax = plt.subplot(111, polar=True) # If you want the first axis to be on top: ax.set_theta_offset(pi/2) ax.set_theta_direction(-1) # Draw one axe per variable + add labels labels yet plt.xticks(angles[:-1], categories, fontsize=11) # Draw ylabels radar_max=round(radar_max+radar_max*0.1) mul=len(str(radar_max))-1 maxi=int(math.ceil(radar_max / pow(10,mul))) * pow(10,mul) sep = round(maxi/4) plt.yticks([sep, 2*sep, 3*sep, 4*sep, 5*sep], [str(sep)+'%', str(2*sep)+'%', str(3*sep)+'%', str(4*sep)+'%', str(5*sep)+'%'], color="grey", size=10) plt.ylim(0, maxi) # ------- PART 2: Add plots # Plot each individual = each line of the data # I don't do a loop, because plotting more than 3 groups makes the chart unreadable # Ind1 values=df.loc[0].drop('group').values.flatten().tolist() values += values[:1] ax.plot(angles, values,'-o', linewidth=1, linestyle='solid', label=group_name1) ax.fill(angles, values, 'b', alpha=0.1) # Ind2 values=df.loc[1].drop('group').values.flatten().tolist() values += values[:1] ax.plot(angles, values, '-o' ,linewidth=1, linestyle='solid', label=group_name2) ax.fill(angles, values, 'r', alpha=0.1) # Add legend if flag==1: plt.legend(loc='upper right', bbox_to_anchor=(0.0, 0.1)) plt.savefig('spider_non_red.png',dpi=300) else: plt.legend(loc='upper right', bbox_to_anchor=(0.0, 0.1)) plt.savefig('spider_red.png',dpi=300) ############################################################################################################################################################################################################# def hist_red(samples,flag,group_name): lengths=[] cat=[] total_reads=0 seq=[] if flag == "c": title = "Length Distribution of "+ group_name +" group (Redudant reads)" if flag == "t": title = "Length Distribution of "+ group_name +" group (Redudant reads)" # classification of the sequences on two categories mapped or unmapped for i in samples: for x in i: lengths.append(x[3]) if x[1]=="0": seq.append([x[3],x[0].split("-")[1],"Mapped"]) cat.append("Mapped") if x[1] == "4": seq.append([x[3],x[0].split("-")[1],"Unassigned"]) cat.append("Unassigned") # Keep lengths below 35nts uni_len=list(set(lengths)) uni_len=[x for x in uni_len if x<=35] # Remove duplicates from sequences seq.sort() uni_seq=list(seq for seq,_ in itertools.groupby(seq)) # Calculation of the reads per group (mapped or unmapped) total_reads+=sum([int(x[1]) for x in uni_seq]) map_reads=[] unmap_reads=[] length=[] for y in uni_len: map_temp=0 unmap_temp=0 for x in uni_seq: if x[0]==y and x[2]=="Mapped": map_temp+=int(x[1]) if x[0]==y and x[2]=="Unassigned": unmap_temp+=int(x[1]) length.append(y) map_reads.append(round(map_temp/total_reads*100,2)) # percentage of mapped reads over total number of sequences unmap_reads.append(round(unmap_temp/total_reads*100,2)) # percentage of unmapped reads over total number of sequences # Generation of the graph ylim=max([sum(x) for x in zip(unmap_reads, map_reads)]) ylim=ylim+ylim*20/100 fig, ax = plt.subplots() width=0.8 ax.bar(length, unmap_reads, width, label='Unassigned') h=ax.bar(length, map_reads, width, bottom=unmap_reads, label='Mapped') plt.xticks(np.arange(length[0], length[-1]+1, 1)) plt.yticks(np.arange(0, ylim, 5)) plt.xlabel('Length (nt)',fontsize=14) plt.ylabel('Percentage',fontsize=14) plt.title(title,fontsize=14) ax.legend() plt.ylim([0, ylim]) ax.grid(axis='y',linewidth=0.2) # Save of the graph if flag=='c': plt.savefig('c_hist_red.png',dpi=300) if flag=='t': plt.savefig('t_hist_red.png',dpi=300) ################################################################################################################# def logo_seq_red(merge, flag, group_name): if flag=="c": titlos= group_name + " group (Redundant)" file_logo="c_logo.png" file_bar="c_bar.png" if flag=="t": titlos= group_name + " group (Redundant)" file_logo="t_logo.png" file_bar="t_bar.png" c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge] A=[0]*3 C=[0]*3 G=[0]*3 T=[0]*3 total_reads=0 for y in c_samples: if "/" in y[0]: length=[] for x in y[0].split("/"): length.append([len(x.split("_")[-1]),x.split("_")[-1],y[2]]) best=min(length) total_reads+=best[2] for i in range(3): if i<len(best[1]): if best[1][i] == "A": A[i]+=best[2] elif best[1][i] == "C": C[i]+=best[2] elif best[1][i] == "G": G[i]+=best[2] else: T[i]+=best[2] else: total_reads+=y[2] for i in range(3): if i<len(y[0].split("_")[-1]): if y[0].split("_")[-1][i] == "A": A[i]+=(y[2]) elif y[0].split("_")[-1][i] == "C": C[i]+=(y[2]) elif y[0].split("_")[-1][i] == "G": G[i]+=(y[2]) else: T[i]+=y[2] A[:] = [round(x*100,1) / total_reads for x in A] C[:] = [round(x*100,1) / total_reads for x in C] G[:] = [round(x*100,1) / total_reads for x in G] T[:] = [round(x*100,1) / total_reads for x in T] data = {'A':A,'C':C,'G':G,'T':T} df = pd.DataFrame(data, index=[1,2,3]) h=df.plot.bar(color=tuple(["g", "b","gold","r"]) ) h.grid(axis='y',linewidth=0.2) plt.xticks(rotation=0, ha="right") plt.ylabel("Counts (%)",fontsize=18) plt.xlabel("Numbers of additional nucleotides",fontsize=18) plt.title(titlos,fontsize=20) plt.tight_layout() plt.savefig(file_bar, dpi=300) crp_logo = lm.Logo(df, font_name = 'DejaVu Sans') crp_logo.style_spines(visible=False) crp_logo.style_spines(spines=['left', 'bottom'], visible=True) crp_logo.style_xticks(rotation=0, fmt='%d', anchor=0) # style using Axes methods crp_logo.ax.set_title(titlos,fontsize=18) crp_logo.ax.set_ylabel("Counts (%)", fontsize=16,labelpad=5) crp_logo.ax.set_xlabel("Numbers of additional nucleotides",fontsize=16, labelpad=5) crp_logo.ax.xaxis.set_ticks_position('none') crp_logo.ax.xaxis.set_tick_params(pad=-1) figure = plt.gcf() figure.set_size_inches(6, 4) crp_logo.fig.savefig(file_logo,dpi=300) ########################################################################################################################################################################################################## def pdf_before_DE(analysis,group_name1,group_name2): # Image extensions if analysis=="2": image_extensions = ("c_hist_red.png","t_hist_red.png","pie_non.png","spider_red.png","spider_non_red.png","c_logo.png","t_logo.png","c_bar.png","t_bar.png") else: image_extensions = ("c_hist_red.png","t_hist_red.png","pie_tem.png","spider_red.png","spider_non_red.png") # This list will hold the images file names images = [] # Build the image list by merging the glob results (a list of files) # for each extension. We are taking images from current folder. for extension in image_extensions: images.extend(glob.glob(extension)) # Create instance of FPDF class pdf = FPDF('P', 'in', 'A4') # Add new page. Without this you cannot create the document. pdf.add_page() # Set font to Arial, 'B'old, 16 pts pdf.set_font('Arial', 'B', 20.0) # Page header pdf.cell(pdf.w-0.5, 0.5, 'IsomiR Profile Report',align='C') pdf.ln(0.7) pdf.set_font('Arial','B', 16.0) pdf.cell(pdf.w-0.5, 0.5, 'sRNA length distribution',align='C') # Smaller font for image captions pdf.set_font('Arial', '', 11.0) pdf.ln(0.5) yh=FPDF.get_y(pdf) pdf.image(images[0],x=0.3,w=4, h=3) pdf.image(images[1],x=4,y=yh, w=4, h=3) pdf.ln(0.3) pdf.cell(0.2) pdf.cell(3.0, 0.0, " Mapped and unmapped reads to custom precussor arm reference DB (5p and 3p arms) in "+group_name1) pdf.ln(0.2) pdf.cell(0.2) pdf.cell(3.0, 0.0, " (left) and "+group_name2+" (right) groups") pdf.ln(0.5) h1=FPDF.get_y(pdf) pdf.image(images[2],x=1, w=6.5, h=5) h2=FPDF.get_y(pdf) FPDF.set_y(pdf,h1+0.2) pdf.set_font('Arial','B', 16.0) pdf.cell(pdf.w-0.5, 0.5, 'Templated and non-templated isomiRs',align='C') pdf.set_font('Arial', '', 11.0) FPDF.set_y(pdf,h2) FPDF.set_y(pdf,9.5) pdf.cell(0.2) if analysis=="2": pdf.cell(3.0, 0.0, " RefSeq miRNAs, templated isomiRs, non-templated isomiRs and unassigned sequences as percentage") pdf.ln(0.2) pdf.cell(0.2) pdf.cell(3.0, 0.0, " of total sRNA reads in "+group_name1+" (left) and "+group_name2+" (right) groups") else: pdf.cell(3.0, 0.0, " RefSeq miRNAS, Templated isomiRs and unassigned sequences as percentage of total sRNA reads in") pdf.ln(0.2) pdf.cell(0.2) pdf.cell(3.0, 0.0, " "+group_name1+" (left) and "+group_name2 + " (right) groups") pdf.add_page() pdf.set_font('Arial', 'B', 18.0) pdf.cell(pdf.w-0.5, 0.5, "Templated isomiR subtypes",align='C') pdf.ln(0.7) pdf.set_font('Arial', 'B', 14.0) pdf.cell(pdf.w-0.5, 0.5, "Templated isomiR profile (redundant reads)",align='C') pdf.ln(0.5) pdf.image(images[3],x=1.5, w=5.5, h=4) pdf.ln(0.6) pdf.cell(pdf.w-0.5, 0.0, "Templated isomiR profile (non-redundant reads)",align='C') pdf.set_font('Arial', '', 12.0) pdf.ln(0.2) pdf.image(images[4],x=1.5, w=5.5, h=4) pdf.ln(0.3) pdf.set_font('Arial', '', 11.0) pdf.cell(0.2) pdf.cell(3.0, 0.0, " * IsomiRs potentially generated from multiple loci") if analysis=="2": pdf.add_page('L') pdf.set_font('Arial', 'B', 18.0) pdf.cell(pdf.w-0.5, 0.5, "Non-templated isomiRs",align='C') pdf.ln(0.5) pdf.set_font('Arial', 'B', 14.0) pdf.cell(pdf.w-0.5, 0.5, "3'-end additions to RefSeq miRNAs and templated isomiRs",align='C') pdf.ln(0.7) yh=FPDF.get_y(pdf) pdf.image(images[5],x=1.5,w=3.65, h=2.65) pdf.image(images[7],x=6.5,y=yh, w=3.65, h=2.65) pdf.ln(0.5) yh=FPDF.get_y(pdf) pdf.image(images[6],x=1.5,w=3.65, h=2.65) pdf.image(images[8],x=6.5,y=yh, w=3.65, h=2.65) pdf.close() pdf.output('report1.pdf','F') #############################################################################################################################################################3