diff Roary/contrib/roary_plots/roary_plots.py @ 0:c47a5f61bc9f draft

Uploaded
author dereeper
date Fri, 14 May 2021 20:27:06 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Roary/contrib/roary_plots/roary_plots.py	Fri May 14 20:27:06 2021 +0000
@@ -0,0 +1,191 @@
+#!/usr/bin/env python
+# Copyright (C) <2015> EMBL-European Bioinformatics Institute
+
+# This program is free software: you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation, either version 3 of
+# the License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# Neither the institution name nor the name roary_plots
+# can be used to endorse or promote products derived from
+# this software without prior written permission.
+# For written permission, please contact <marco@ebi.ac.uk>.
+
+# Products derived from this software may not be called roary_plots
+# nor may roary_plots appear in their names without prior written
+# permission of the developers. You should have received a copy
+# of the GNU General Public License along with this program.
+# If not, see <http://www.gnu.org/licenses/>.
+
+__author__ = "Marco Galardini"
+__version__ = '0.1.0'
+
+def get_options():
+    import argparse
+
+    # create the top-level parser
+    description = "Create plots from roary outputs"
+    parser = argparse.ArgumentParser(description = description,
+                                     prog = 'roary_plots.py')
+
+    parser.add_argument('tree', action='store',
+                        help='Newick Tree file', default='accessory_binary_genes.fa.newick')
+    parser.add_argument('spreadsheet', action='store',
+                        help='Roary gene presence/absence spreadsheet', default='gene_presence_absence.csv')
+
+    parser.add_argument('--labels', action='store_true',
+                        default=False,
+                        help='Add node labels to the tree (up to 10 chars)')
+    parser.add_argument('--format',
+                        choices=('png',
+                                 'tiff',
+                                 'pdf',
+                                 'svg'),
+                        default='png',
+                        help='Output format [Default: png]')
+    parser.add_argument('-N', '--skipped-columns', action='store',
+                        type=int,
+                        default=14,
+                        help='First N columns of Roary\'s output to exclude [Default: 14]')
+    
+    parser.add_argument('--version', action='version',
+                         version='%(prog)s '+__version__)
+
+    return parser.parse_args()
+
+if __name__ == "__main__":
+    options = get_options()
+
+    import matplotlib
+    matplotlib.use('Agg')
+
+    import matplotlib.pyplot as plt
+    import seaborn as sns
+
+    sns.set_style('white')
+
+    import os
+    import pandas as pd
+    import numpy as np
+    from Bio import Phylo
+
+    t = Phylo.read(options.tree, 'newick')
+
+    # Max distance to create better plots
+    mdist = max([t.distance(t.root, x) for x in t.get_terminals()])
+
+    # Load roary
+    roary = pd.read_csv(options.spreadsheet, low_memory=False)
+    # Set index (group name)
+    roary.set_index('Gene', inplace=True)
+    # Drop the other info columns
+    roary.drop(list(roary.columns[:options.skipped_columns-1]), axis=1, inplace=True)
+
+    # Transform it in a presence/absence matrix (1/0)
+    roary.replace('.{2,100}', 1, regex=True, inplace=True)
+    roary.replace(np.nan, 0, regex=True, inplace=True)
+
+    # Sort the matrix by the sum of strains presence
+    idx = roary.sum(axis=1).sort_values(ascending=False).index
+    roary_sorted = roary.loc[idx]
+
+    # Pangenome frequency plot
+    plt.figure(figsize=(7, 5))
+
+    plt.hist(roary.sum(axis=1), roary.shape[1],
+             histtype="stepfilled", alpha=.7)
+
+    plt.xlabel('No. of genomes')
+    plt.ylabel('No. of genes')
+
+    sns.despine(left=True,
+                bottom=True)
+    plt.savefig('pangenome_frequency.%s'%options.format, dpi=300)
+    plt.clf()
+
+    # Sort the matrix according to tip labels in the tree
+    roary_sorted = roary_sorted[[x.name for x in t.get_terminals()]]
+
+    # Plot presence/absence matrix against the tree
+    with sns.axes_style('whitegrid'):
+        fig = plt.figure(figsize=(17, 10))
+
+        ax1=plt.subplot2grid((1,40), (0, 10), colspan=30)
+        a=ax1.matshow(roary_sorted.T, cmap=plt.cm.Blues,
+                   vmin=0, vmax=1,
+                   aspect='auto',
+                   interpolation='none',
+                    )
+        ax1.set_yticks([])
+        ax1.set_xticks([])
+        ax1.axis('off')
+
+        ax = fig.add_subplot(1,2,1)
+        # matplotlib v1/2 workaround
+        try:
+            ax=plt.subplot2grid((1,40), (0, 0), colspan=10, facecolor='white')
+        except AttributeError:
+            ax=plt.subplot2grid((1,40), (0, 0), colspan=10, axisbg='white')
+
+        fig.subplots_adjust(wspace=0, hspace=0)
+
+        ax1.set_title('Roary matrix\n(%d gene clusters)'%roary.shape[0])
+
+        if options.labels:
+            fsize = 12 - 0.1*roary.shape[1]
+            if fsize < 7:
+                fsize = 7
+            with plt.rc_context({'font.size': fsize}):
+                Phylo.draw(t, axes=ax, 
+                           show_confidence=False,
+                           label_func=lambda x: str(x)[:10],
+                           xticks=([],), yticks=([],),
+                           ylabel=('',), xlabel=('',),
+                           xlim=(-mdist*0.1,mdist+mdist*0.45-mdist*roary.shape[1]*0.001),
+                           axis=('off',),
+                           title=('Tree\n(%d strains)'%roary.shape[1],), 
+                           do_show=False,
+                          )
+        else:
+            Phylo.draw(t, axes=ax, 
+                       show_confidence=False,
+                       label_func=lambda x: None,
+                       xticks=([],), yticks=([],),
+                       ylabel=('',), xlabel=('',),
+                       xlim=(-mdist*0.1,mdist+mdist*0.1),
+                       axis=('off',),
+                       title=('Tree\n(%d strains)'%roary.shape[1],),
+                       do_show=False,
+                      )
+        plt.savefig('pangenome_matrix.%s'%options.format, dpi=300)
+        plt.clf()
+
+    # Plot the pangenome pie chart
+    plt.figure(figsize=(10, 10))
+
+    core     = roary[(roary.sum(axis=1) >= roary.shape[1]*0.99) & (roary.sum(axis=1) <= roary.shape[1]     )].shape[0]
+    softcore = roary[(roary.sum(axis=1) >= roary.shape[1]*0.95) & (roary.sum(axis=1) <  roary.shape[1]*0.99)].shape[0]
+    shell    = roary[(roary.sum(axis=1) >= roary.shape[1]*0.15) & (roary.sum(axis=1) <  roary.shape[1]*0.95)].shape[0]
+    cloud    = roary[roary.sum(axis=1)  < roary.shape[1]*0.15].shape[0]
+
+    total = roary.shape[0]
+    
+    def my_autopct(pct):
+        val=int(round(pct*total/100.0))
+        return '{v:d}'.format(v=val)
+
+    a=plt.pie([core, softcore, shell, cloud],
+          labels=['core\n(%d <= strains <= %d)'%(roary.shape[1]*.99,roary.shape[1]),
+                  'soft-core\n(%d <= strains < %d)'%(roary.shape[1]*.95,roary.shape[1]*.99),
+                  'shell\n(%d <= strains < %d)'%(roary.shape[1]*.15,roary.shape[1]*.95),
+                  'cloud\n(strains < %d)'%(roary.shape[1]*.15)],
+          explode=[0.1, 0.05, 0.02, 0], radius=0.9,
+          colors=[(0, 0, 1, float(x)/total) for x in (core, softcore, shell, cloud)],
+          autopct=my_autopct)
+    plt.savefig('pangenome_pie.%s'%options.format, dpi=300)
+    plt.clf()