view mirbase_graphs.py @ 4:f44185f616bc draft

Uploaded
author glogobyte
date Wed, 13 Oct 2021 16:04:40 +0000
parents d77b33e65501
children fa48ad87ae3e
line wrap: on
line source

import itertools
import pandas as pd
from math import pi
import numpy as np
import matplotlib.pyplot as plt
import math
import logomaker as lm
from fpdf import FPDF, fpdf
import glob

#################################################################################################################################################################

def pie_non_temp(merge_con,merge_non_con,merge_tre,merge_non_tre,c_unmap,t_unmap,c_unmap_counts,t_unmap_counts,group_name1,group_name2):

    c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_con]
    t_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_tre]
    c_non_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_non_con]
    t_non_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_non_tre]

    c_templ = 0
    c_tem_counts = 0
    c_mature = 0
    c_mat_counts = 0
    t_templ = 0
    t_tem_counts = 0
    t_mature = 0
    t_mat_counts = 0

    c_non = len(c_non_samples)
    c_non_counts = sum(x[2] for x in c_non_samples)
    t_non = len(t_non_samples)
    t_non_counts = sum(x[2] for x in t_non_samples)

    c_unmap = c_unmap - c_non
    t_unmap = c_unmap - t_non

    c_unmap_counts=c_unmap_counts - c_non_counts
    t_unmap_counts=t_unmap_counts - t_non_counts


    for x in c_samples:

        if "/" not in x[0]:
            if "chr" in  x[0].split("_")[-1]:
               c_mature+=1
               c_mat_counts += x[2]
            else:
               c_templ+=1
               c_tem_counts += x[2]
        else:
           f=0
           for y in x[0].split("/"):
               if "chr" in y.split("_")[-1]:
                  c_mature+=1
                  c_mat_counts += x[2]
                  f=1
                  break
           if f==0:
              c_templ+=1
              c_tem_counts += x[2]

    for x in t_samples:

        if "/" not in x[0]:
            if "chr" in  x[0].split("_")[-1]:
               t_mature+=1
               t_mat_counts += x[2]
            else:
               t_templ+=1
               t_tem_counts += x[2]
        else:
           f=0
           for y in x[0].split("/"):
               if "chr" in y.split("_")[-1]:
                  t_mature+=1
                  t_mat_counts += x[2]
                  f=1
                  break
           if f==0:
              t_templ+=1
              t_tem_counts += x[2]

    fig = plt.figure(figsize=(7,5))
    labels = 'miRNA RefSeq','Template', 'Unassigned','Non-template'
    sizes = [c_mat_counts, c_tem_counts, c_unmap_counts,c_non_counts]
    colors = ['gold', 'yellowgreen', 'lightcoral', 'lightskyblue']
    ax1 = plt.subplot2grid((1,2),(0,0))
    patches, texts, autotexts=plt.pie(sizes, labels=labels, colors=colors, startangle=140,autopct='%1.1f%%',radius=0.8)
    [x.set_fontsize(8) for x in texts]
    plt.title(group_name1 + ' Group (reads)',fontsize=12)
    labels = 'miRNA RefSeq','Template', 'Unassigned','non-template'
    sizes = [t_mat_counts, t_tem_counts, t_unmap_counts, t_non_counts]
    colors = ['gold', 'yellowgreen', 'lightcoral', 'lightskyblue']
    ax2 = plt.subplot2grid((1,2),(0,1))
    patches, texts, autotexts=plt.pie(sizes, labels=labels, colors=colors, startangle=140,autopct='%1.1f%%',radius=0.8)
    [x.set_fontsize(8) for x in texts]
    plt.title(group_name2 + ' Group (reads)', fontsize=12)
    plt.savefig('pie_non.png',dpi=300)

######################################################################################################################################################


def pie_temp(merge_con,c_unmap,c_unmap_counts,merge_tre,t_unmap,t_unmap_counts,group_name1,group_name2):

    c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_con]
    t_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_tre]

    c_templ = 0
    c_tem_counts = 0
    c_mature = 0
    c_mat_counts = 0
    t_templ = 0
    t_tem_counts = 0
    t_mature = 0
    t_mat_counts = 0

    for x in c_samples:

        if "/" not in x[0]:
            if "chr" in  x[0].split("_")[-1]:
               c_mature+=1
               c_mat_counts += x[2]
            else:
               c_templ+=1
               c_tem_counts += x[2]
        else:
           f=0
           for y in x[0].split("/"):
               if "chr" in y.split("_")[-1]:
                  c_mature+=1
                  c_mat_counts += x[2]
                  f=1
                  break
           if f==0:
              c_templ+=1
              c_tem_counts += x[2]

    for x in t_samples:

        if "/" not in x[0]:
            if "chr" in  x[0].split("_")[-1]:
               t_mature+=1
               t_mat_counts += x[2]
            else:
               t_templ+=1
               t_tem_counts += x[2]
        else:
           f=0
           for y in x[0].split("/"):
               if "chr" in y.split("_")[-1]:
                  t_mature+=1
                  t_mat_counts += x[2]
                  f=1
                  break
           if f==0:
              t_templ+=1
              t_tem_counts += x[2]


    fig = plt.figure()
    labels = 'miRNA RefSeq','Template', 'Unassigned'
    sizes = [c_mat_counts, c_tem_counts, c_unmap_counts]
    colors = ['gold', 'yellowgreen', 'lightskyblue']
    explode = (0.2, 0.05, 0.1)
    ax1 = plt.subplot2grid((1,2),(0,0))
    patches, texts, autotexts=plt.pie(sizes, labels=labels, colors=colors, startangle=140,autopct='%1.1f%%',radius=0.8)
    [x.set_fontsize(8) for x in texts]
    plt.title(group_name1 + ' group (reads)', fontsize=12)
    labels = 'miRNA RefSeq','Template', 'Unassigned'
    sizes = [t_mat_counts, t_tem_counts, t_unmap_counts]
    colors = ['gold', 'yellowgreen', 'lightskyblue']
    explode = (0.2, 0.05, 0.1)
    ax2 = plt.subplot2grid((1,2),(0,1))
    patches, texts, autotexts=plt.pie(sizes, labels=labels, colors=colors, startangle=140,autopct='%1.1f%%',radius=0.8)
    [x.set_fontsize(8) for x in texts]
    plt.title(group_name2 + ' group (reads)',fontsize = 12)
    plt.savefig('pie_tem.png',dpi=300)

###################################################################################################################################################################################################################


def make_spider(merge_con,merge_tre,group_name1,group_name2):

  c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_con]
  t_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_tre]

  c_5 = 0
  c_5_counts = 0
  c_3 = 0
  c_3_counts = 0
  c_both =0
  c_both_counts=0
  c_mature = 0
  c_mat_counts = 0
  c_exception=0
  c_exception_counts=0


  t_5 = 0
  t_5_counts = 0
  t_3 = 0
  t_3_counts = 0
  t_both = 0
  t_both_counts = 0
  t_mature = 0
  t_mat_counts = 0
  t_exception = 0
  t_exception_counts=0

  for x in c_samples:

      if "/" not in x[0]:
          if "chr" in  x[0].split("_")[-1]:
             c_mature+=1
             c_mat_counts += x[2]
          elif 0 == int(x[0].split("_")[-1]):
             c_5+=1
             c_5_counts += x[2]
          elif 0 == int(x[0].split("_")[-2]):
             c_3+=1
             c_3_counts += x[2]
          else:
             c_both+=1
             c_both_counts+=x[2]

      else:
         f=0
         for y in x[0].split("/"):
             if "chr" in y.split("_")[-1]:
                c_mature+=1
                c_mat_counts += x[2]
                f=1
                break
         if f==0:
            for y in x[0].split("/"):
                c_exception+=1
                c_exception_counts += x[2]


  for x in t_samples:

      if "/" not in x[0]:
          if "chr" in  x[0].split("_")[-1]:
             t_mature+=1
             t_mat_counts += x[2]
          elif 0 == int(x[0].split("_")[-1]):
             t_5+=1
             t_5_counts += x[2]
          elif 0 == int(x[0].split("_")[-2]):
             t_3+=1
             t_3_counts += x[2]
          else:
             t_both+=1
             t_both_counts+=x[2]

      else:
         f=0
         for y in x[0].split("/"):
             if "chr" in y.split("_")[-1]:
                t_mature+=1
                t_mat_counts += x[2]
                f=1
                break
         if f==0:
            for y in x[0].split("/"):
                t_exception+=1
                t_exception_counts += x[2]


  c_all = c_5+c_3+c_both+c_mature+c_exception
  c_all_counts = c_5_counts + c_3_counts + c_both_counts + c_mat_counts + c_exception_counts

  t_all = t_5+t_3+t_both+t_mature + t_exception
  t_all_counts = t_5_counts + t_3_counts + t_both_counts + t_mat_counts + t_exception_counts

  c_5 = round(c_5/c_all*100,2)
  c_3 = round(c_3/c_all*100,2)
  c_both = round(c_both/c_all*100,2)
  c_mature = round(c_mature/c_all*100,2)
  c_exception = round(c_exception/c_all*100,2)

  c_5_counts = round(c_5_counts/c_all_counts*100,2)
  c_3_counts = round(c_3_counts/c_all_counts*100,2)
  c_both_counts = round(c_both_counts/c_all_counts*100,2)
  c_mat_counts = round(c_mat_counts/c_all_counts*100,2)
  c_exception_counts = round(c_exception_counts/c_all_counts*100,2)

  t_5 = round(t_5/t_all*100,2)
  t_3 = round(t_3/t_all*100,2)
  t_both = round(t_both/t_all*100,2)
  t_mature = round(t_mature/t_all*100,2)
  t_exception = round(t_exception/t_all*100,2)

  t_5_counts = round(t_5_counts/t_all_counts*100,2)
  t_3_counts = round(t_3_counts/t_all_counts*100,2)
  t_both_counts = round(t_both_counts/t_all_counts*100,2)
  t_mat_counts = round(t_mat_counts/t_all_counts*100,2)
  t_exception_counts = round(t_exception_counts/t_all_counts*100,2)

  radar_max = max(c_5, c_3, c_both,c_mature,c_exception,t_5,t_3,t_both,t_mature,t_exception)
  radar_max_counts = max(c_5_counts,c_3_counts,c_both_counts,c_mat_counts,c_exception_counts,t_5_counts,t_3_counts,t_both_counts,t_mat_counts,t_exception_counts)

  df=pd.DataFrame({
  'group':[group_name1,group_name2],
  """5'3'-isomiRs""":[c_both,t_both],
  """3'-isomiRs""":[c_3,t_3],
  'RefSeq miRNA':[c_mature,t_mature],
  """5'-isomiRs""":[c_5,t_5],
  'Others*':[c_exception,t_exception]})

  df1=pd.DataFrame({
  'group':[group_name1,group_name2],
  """5'3'-isomiRs""":[c_both_counts,t_both_counts],
  """3'-isomiRs""":[c_3_counts,t_3_counts],
  'RefSeq miRNA':[c_mat_counts,t_mat_counts],
  """5'-isomiRs""":[c_5_counts,t_5_counts],
  'Others*':[c_exception_counts,t_exception_counts]})

  spider_last(df,radar_max,1,group_name1,group_name2)
  spider_last(df1,radar_max_counts,2,group_name1,group_name2)

#####################################################################################################################################################

def spider_last(df,radar_max,flag,group_name1,group_name2):
  # ------- PART 1: Create background
  fig = plt.figure()
  # number of variable
  categories=list(df)[1:]
  N = len(categories)

  # What will be the angle of each axis in the plot? (we divide the plot / number of variable)
  angles = [n / float(N) * 2 * pi for n in range(N)]
  angles += angles[:1]

  # Initialise the spider plot
  ax = plt.subplot(111, polar=True)

  # If you want the first axis to be on top:
  ax.set_theta_offset(pi/2)
  ax.set_theta_direction(-1)

  # Draw one axe per variable + add labels labels yet
  plt.xticks(angles[:-1], categories, fontsize=11)

  # Draw ylabels
  radar_max=round(radar_max+radar_max*0.1)
  mul=len(str(radar_max))-1
  maxi=int(math.ceil(radar_max / pow(10,mul))) * pow(10,mul)
  sep = round(maxi/4)
  plt.yticks([sep, 2*sep, 3*sep, 4*sep, 5*sep], [str(sep)+'%', str(2*sep)+'%', str(3*sep)+'%', str(4*sep)+'%', str(5*sep)+'%'], color="grey", size=10)
  plt.ylim(0, maxi)

  # ------- PART 2: Add plots

  # Plot each individual = each line of the data
  # I don't do a loop, because plotting more than 3 groups makes the chart unreadable

  # Ind1
  values=df.loc[0].drop('group').values.flatten().tolist()
  values += values[:1]
  ax.plot(angles, values,'-o', linewidth=1, linestyle='solid', label=group_name1)
  ax.fill(angles, values, 'b', alpha=0.1)

  # Ind2
  values=df.loc[1].drop('group').values.flatten().tolist()
  values += values[:1]
  ax.plot(angles, values, '-o' ,linewidth=1, linestyle='solid', label=group_name2)
  ax.fill(angles, values, 'r', alpha=0.1)

  # Add legend
  if flag==1:
     plt.legend(loc='upper right', bbox_to_anchor=(0.0, 0.1))
     plt.savefig('spider_non_red.png',dpi=300)
  else:
     plt.legend(loc='upper right', bbox_to_anchor=(0.0, 0.1))
     plt.savefig('spider_red.png',dpi=300)


#############################################################################################################################################################################################################

def hist_red(samples,flag,group_name):

    lengths=[]
    cat=[]
    total_reads=0
    seq=[]

    if flag == "c":
        title = "Length Distribution of "+ group_name +" group (Redudant reads)"
    if flag == "t":
        title = "Length Distribution of "+ group_name +" group (Redudant reads)"

    # classification of the sequences on two categories mapped or unmapped
    for i in samples:
        for x in i:
            lengths.append(x[3])
            if x[1]=="0":
               seq.append([x[3],x[0].split("-")[1],"Mapped"])
               cat.append("Mapped")
            if x[1] == "4":
               seq.append([x[3],x[0].split("-")[1],"Unassigned"])
               cat.append("Unassigned")

    # Keep lengths below 35nts
    uni_len=list(set(lengths))
    uni_len=[x for x in uni_len if x<=35]

    # Remove duplicates from sequences
    seq.sort()
    uni_seq=list(seq for seq,_ in itertools.groupby(seq))

    # Calculation of the reads per group (mapped or unmapped)
    total_reads+=sum([int(x[1]) for x in uni_seq])
    map_reads=[]
    unmap_reads=[]
    length=[]
    for y in uni_len:
        map_temp=0
        unmap_temp=0
        for x in uni_seq:
            if x[0]==y and x[2]=="Mapped":
               map_temp+=int(x[1])
            if x[0]==y and x[2]=="Unassigned":
               unmap_temp+=int(x[1])
        length.append(y)
        map_reads.append(round(map_temp/total_reads*100,2))      # percentage of mapped reads over total number of sequences
        unmap_reads.append(round(unmap_temp/total_reads*100,2))  # percentage of unmapped reads over total number of sequences

    # Generation of the graph
    ylim=max([sum(x) for x in zip(unmap_reads, map_reads)])
    ylim=ylim+ylim*20/100
    fig, ax = plt.subplots()
    width=0.8
    ax.bar(length, unmap_reads, width, label='Unassigned')
    h=ax.bar(length, map_reads, width, bottom=unmap_reads, label='Mapped')
    plt.xticks(np.arange(length[0], length[-1]+1, 1))
    plt.yticks(np.arange(0, ylim, 5))
    plt.xlabel('Length (nt)',fontsize=14)
    plt.ylabel('Percentage',fontsize=14)
    plt.title(title,fontsize=14)
    ax.legend()
    plt.ylim([0, ylim])
    ax.grid(axis='y',linewidth=0.2)

    # Save of the graph
    if flag=='c':
       plt.savefig('c_hist_red.png',dpi=300)

    if flag=='t':
       plt.savefig('t_hist_red.png',dpi=300)

#################################################################################################################

def logo_seq_red(merge, flag, group_name):

    if flag=="c":
       titlos= group_name + " group (Redundant)"
       file_logo="c_logo.png"
       file_bar="c_bar.png"
    if flag=="t":
       titlos= group_name + " group (Redundant)"
       file_logo="t_logo.png"
       file_bar="t_bar.png"

    c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge]

    A=[0]*3
    C=[0]*3
    G=[0]*3
    T=[0]*3
    total_reads=0

    for y in c_samples:
        if "/" in y[0]:
            length=[]
            for x in y[0].split("/"):
                length.append([len(x.split("_")[-1]),x.split("_")[-1],y[2]])

            best=min(length)
            total_reads+=best[2]
            for i in range(3):
                if i<len(best[1]):
                    if best[1][i] == "A":
                        A[i]+=best[2]
                    elif best[1][i] == "C":
                        C[i]+=best[2]
                    elif best[1][i] == "G":
                        G[i]+=best[2]
                    else:
                        T[i]+=best[2]
        else:
            total_reads+=y[2]
            for i in range(3):
                if i<len(y[0].split("_")[-1]):
                    if y[0].split("_")[-1][i] == "A":
                        A[i]+=(y[2])
                    elif y[0].split("_")[-1][i] == "C":
                        C[i]+=(y[2])
                    elif y[0].split("_")[-1][i] == "G":
                        G[i]+=(y[2])
                    else:
                        T[i]+=y[2]

    A[:] = [round(x*100,1) / total_reads for x in A]
    C[:] = [round(x*100,1) / total_reads for x in C]
    G[:] = [round(x*100,1) / total_reads for x in G]
    T[:] = [round(x*100,1) / total_reads for x in T]



    data = {'A':A,'C':C,'G':G,'T':T}
    df = pd.DataFrame(data, index=[1,2,3])
    h=df.plot.bar(color=tuple(["g", "b","gold","r"]) )
    h.grid(axis='y',linewidth=0.2)
    plt.xticks(rotation=0, ha="right")
    plt.ylabel("Counts (%)",fontsize=18)
    plt.xlabel("Numbers of additional nucleotides",fontsize=18)
    plt.title(titlos,fontsize=20)
    plt.tight_layout()
    plt.savefig(file_bar, dpi=300)

    crp_logo = lm.Logo(df, font_name = 'DejaVu Sans')
    crp_logo.style_spines(visible=False)
    crp_logo.style_spines(spines=['left', 'bottom'], visible=True)
    crp_logo.style_xticks(rotation=0, fmt='%d', anchor=0)

    # style using Axes methods
    crp_logo.ax.set_title(titlos,fontsize=18)
    crp_logo.ax.set_ylabel("Counts (%)", fontsize=16,labelpad=5)
    crp_logo.ax.set_xlabel("Numbers of additional nucleotides",fontsize=16, labelpad=5)
    crp_logo.ax.xaxis.set_ticks_position('none')
    crp_logo.ax.xaxis.set_tick_params(pad=-1)
    figure = plt.gcf()
    figure.set_size_inches(6, 4)
    crp_logo.fig.savefig(file_logo,dpi=300)

##########################################################################################################################################################################################################

def pdf_before_DE(analysis,group_name1,group_name2):

  # Image extensions
  if analysis=="2":
     image_extensions = ("c_hist_red.png","t_hist_red.png","pie_non.png","spider_red.png","spider_non_red.png","c_logo.png","t_logo.png","c_bar.png","t_bar.png")
  else:
     image_extensions = ("c_hist_red.png","t_hist_red.png","pie_tem.png","spider_red.png","spider_non_red.png")
  # This list will hold the images file names
  images = []

  # Build the image list by merging the glob results (a list of files)
  # for each extension. We are taking images from current folder.
  for extension in image_extensions:
      images.extend(glob.glob(extension))

  # Create instance of FPDF class
  pdf = FPDF('P', 'in', 'A4')
  # Add new page. Without this you cannot create the document.
  pdf.add_page()
  # Set font to Arial, 'B'old, 16 pts
  pdf.set_font('Arial', 'B', 20.0)

  # Page header
  pdf.cell(pdf.w-0.5, 0.5, 'IsomiR Profile Report',align='C')
  pdf.ln(0.7)
  pdf.set_font('Arial','B', 16.0)
  pdf.cell(pdf.w-0.5, 0.5, 'sRNA length distribution',align='C')

  # Smaller font for image captions
  pdf.set_font('Arial', '', 11.0)
  pdf.ln(0.5)

  yh=FPDF.get_y(pdf)
  pdf.image(images[0],x=0.3,w=4, h=3)
  pdf.image(images[1],x=4,y=yh, w=4, h=3)
  pdf.ln(0.3)

  pdf.cell(0.2)
  pdf.cell(3.0, 0.0, "  Mapped and unmapped reads to custom precussor arm reference DB (5p and 3p arms) in "+group_name1)
  pdf.ln(0.2)
  pdf.cell(0.2)
  pdf.cell(3.0, 0.0, "  (left) and "+group_name2+" (right) groups")

  pdf.ln(0.5)
  h1=FPDF.get_y(pdf)
  pdf.image(images[2],x=1, w=6.5, h=5)
  h2=FPDF.get_y(pdf)
  FPDF.set_y(pdf,h1+0.2)
  pdf.set_font('Arial','B', 16.0)
  pdf.cell(pdf.w-0.5, 0.5, 'Templated and non-templated isomiRs',align='C')
  pdf.set_font('Arial', '', 11.0)
  FPDF.set_y(pdf,h2)
  FPDF.set_y(pdf,9.5)
  pdf.cell(0.2)

  if analysis=="2":
     pdf.cell(3.0, 0.0, "  RefSeq miRNAs, templated isomiRs, non-templated isomiRs and unassigned sequences as percentage")
     pdf.ln(0.2)
     pdf.cell(0.2)
     pdf.cell(3.0, 0.0, "  of total sRNA reads in "+group_name1+" (left) and "+group_name2+" (right) groups")
  else:
     pdf.cell(3.0, 0.0, "  RefSeq miRNAS, Templated isomiRs and unassigned sequences as percentage of total sRNA reads in")
     pdf.ln(0.2)
     pdf.cell(0.2)
     pdf.cell(3.0, 0.0, "  "+group_name1+" (left) and "+group_name2 + " (right) groups")

  pdf.add_page()
  pdf.set_font('Arial', 'B', 18.0)
  pdf.cell(pdf.w-0.5, 0.5, "Templated isomiR subtypes",align='C')
  pdf.ln(0.7)
  pdf.set_font('Arial', 'B', 14.0)
  pdf.cell(pdf.w-0.5, 0.5, "Templated isomiR profile (redundant reads)",align='C')
  pdf.ln(0.5)
  pdf.image(images[3],x=1.5, w=5.5, h=4)
  pdf.ln(0.6)
  pdf.cell(pdf.w-0.5, 0.0, "Templated isomiR profile (non-redundant reads)",align='C')
  pdf.set_font('Arial', '', 12.0)
  pdf.ln(0.2)
  pdf.image(images[4],x=1.5, w=5.5, h=4)
  pdf.ln(0.3)
  pdf.set_font('Arial', '', 11.0)
  pdf.cell(0.2)
  pdf.cell(3.0, 0.0, "  * IsomiRs potentially generated from multiple loci")


  if analysis=="2":
     pdf.add_page('L')

     pdf.set_font('Arial', 'B', 18.0)
     pdf.cell(pdf.w-0.5, 0.5, "Non-templated isomiRs",align='C')
     pdf.ln(0.5)
     pdf.set_font('Arial', 'B', 14.0)
     pdf.cell(pdf.w-0.5, 0.5, "3'-end additions to RefSeq miRNAs and templated isomiRs",align='C')
     pdf.ln(0.7)

     yh=FPDF.get_y(pdf)
     pdf.image(images[5],x=1.5,w=3.65, h=2.65)
     pdf.image(images[7],x=6.5,y=yh, w=3.65, h=2.65)
     pdf.ln(0.5)
     yh=FPDF.get_y(pdf)
     pdf.image(images[6],x=1.5,w=3.65, h=2.65)
     pdf.image(images[8],x=6.5,y=yh, w=3.65, h=2.65)

  pdf.close()
  pdf.output('report1.pdf','F')

#############################################################################################################################################################3