Mercurial > repos > dereeper > roary_plots
comparison Roary/contrib/roary_plots/roary_plots.py @ 0:c47a5f61bc9f draft
Uploaded
| author | dereeper |
|---|---|
| date | Fri, 14 May 2021 20:27:06 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:c47a5f61bc9f |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 # Copyright (C) <2015> EMBL-European Bioinformatics Institute | |
| 3 | |
| 4 # This program is free software: you can redistribute it and/or | |
| 5 # modify it under the terms of the GNU General Public License as | |
| 6 # published by the Free Software Foundation, either version 3 of | |
| 7 # the License, or (at your option) any later version. | |
| 8 | |
| 9 # This program is distributed in the hope that it will be useful, | |
| 10 # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 12 # GNU General Public License for more details. | |
| 13 | |
| 14 # Neither the institution name nor the name roary_plots | |
| 15 # can be used to endorse or promote products derived from | |
| 16 # this software without prior written permission. | |
| 17 # For written permission, please contact <marco@ebi.ac.uk>. | |
| 18 | |
| 19 # Products derived from this software may not be called roary_plots | |
| 20 # nor may roary_plots appear in their names without prior written | |
| 21 # permission of the developers. You should have received a copy | |
| 22 # of the GNU General Public License along with this program. | |
| 23 # If not, see <http://www.gnu.org/licenses/>. | |
| 24 | |
__author__ = "Marco Galardini"
__version__ = '0.1.0'


def get_options(argv=None):
    """Parse the command-line options of roary_plots.py.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse reads ``sys.argv[1:]``.

    Returns:
        argparse.Namespace with the attributes ``tree``, ``spreadsheet``,
        ``labels``, ``format`` and ``skipped_columns``.
    """
    import argparse

    # create the top-level parser
    description = "Create plots from roary outputs"
    parser = argparse.ArgumentParser(description=description,
                                     prog='roary_plots.py')

    # nargs='?' makes each positional optional; without it argparse treats
    # the positional as required and the declared default is never used.
    parser.add_argument('tree', action='store', nargs='?',
                        help='Newick Tree file',
                        default='accessory_binary_genes.fa.newick')
    parser.add_argument('spreadsheet', action='store', nargs='?',
                        help='Roary gene presence/absence spreadsheet',
                        default='gene_presence_absence.csv')

    parser.add_argument('--labels', action='store_true',
                        default=False,
                        help='Add node labels to the tree (up to 10 chars)')
    parser.add_argument('--format',
                        choices=('png',
                                 'tiff',
                                 'pdf',
                                 'svg'),
                        default='png',
                        help='Output format [Default: png]')
    parser.add_argument('-N', '--skipped-columns', action='store',
                        type=int,
                        default=14,
                        help='First N columns of Roary\'s output to exclude [Default: 14]')

    parser.add_argument('--version', action='version',
                        version='%(prog)s '+__version__)

    return parser.parse_args(argv)
| 60 | |
if __name__ == "__main__":
    # Script entry point: reads the tree and the Roary spreadsheet named on
    # the command line and writes three figures to the working directory:
    # pangenome_frequency.<fmt>, pangenome_matrix.<fmt>, pangenome_pie.<fmt>.
    options = get_options()

    import matplotlib
    # The backend is selected before pyplot is imported
    matplotlib.use('Agg')

    import matplotlib.pyplot as plt
    import seaborn as sns

    sns.set_style('white')

    import os
    import pandas as pd
    import numpy as np
    from Bio import Phylo

    t = Phylo.read(options.tree, 'newick')

    # Max distance to create better plots
    mdist = max(t.distance(t.root, x) for x in t.get_terminals())

    # Load roary
    roary = pd.read_csv(options.spreadsheet, low_memory=False)
    # Set index (group name)
    roary.set_index('Gene', inplace=True)
    # Drop the other info columns ('Gene' is now the index, hence the -1)
    roary.drop(list(roary.columns[:options.skipped_columns-1]), axis=1, inplace=True)

    # Transform it in a presence/absence matrix (1/0): every non-empty cell
    # counts as present. Testing for null directly yields an integer matrix
    # whatever the cell contents are, instead of relying on a regex match
    # and on pandas downcasting object columns after replace().
    roary = roary.notnull().astype(int)

    n_genes, n_strains = roary.shape
    # Number of strains carrying each gene cluster
    gene_counts = roary.sum(axis=1)

    # Sort the matrix by the sum of strains presence
    idx = gene_counts.sort_values(ascending=False).index
    roary_sorted = roary.loc[idx]

    # Pangenome frequency plot
    plt.figure(figsize=(7, 5))

    plt.hist(gene_counts, n_strains,
             histtype="stepfilled", alpha=.7)

    plt.xlabel('No. of genomes')
    plt.ylabel('No. of genes')

    sns.despine(left=True,
                bottom=True)
    plt.savefig('pangenome_frequency.%s'%options.format, dpi=300)
    plt.clf()

    # Sort the matrix according to tip labels in the tree
    roary_sorted = roary_sorted[[x.name for x in t.get_terminals()]]

    # Plot presence/absence matrix against the tree
    with sns.axes_style('whitegrid'):
        fig = plt.figure(figsize=(17, 10))

        # Matrix panel: right-hand 30 of the 40 grid columns
        ax1 = plt.subplot2grid((1, 40), (0, 10), colspan=30)
        ax1.matshow(roary_sorted.T, cmap=plt.cm.Blues,
                    vmin=0, vmax=1,
                    aspect='auto',
                    interpolation='none',
                    )
        ax1.set_yticks([])
        ax1.set_xticks([])
        ax1.axis('off')

        # Tree panel: left-hand 10 grid columns. No placeholder axes is
        # created first, as it would stay in the figure behind the tree.
        # matplotlib v1/2 workaround: fall back to 'axisbg' when
        # 'facecolor' is rejected
        try:
            ax = plt.subplot2grid((1, 40), (0, 0), colspan=10, facecolor='white')
        except AttributeError:
            ax = plt.subplot2grid((1, 40), (0, 0), colspan=10, axisbg='white')

        fig.subplots_adjust(wspace=0, hspace=0)

        ax1.set_title('Roary matrix\n(%d gene clusters)'%n_genes)

        if options.labels:
            # Shrink the label font as strains are added, but not below 7
            fsize = 12 - 0.1*n_strains
            if fsize < 7:
                fsize = 7
            with plt.rc_context({'font.size': fsize}):
                Phylo.draw(t, axes=ax,
                           show_confidence=False,
                           label_func=lambda x: str(x)[:10],
                           xticks=([],), yticks=([],),
                           ylabel=('',), xlabel=('',),
                           xlim=(-mdist*0.1, mdist+mdist*0.45-mdist*n_strains*0.001),
                           axis=('off',),
                           title=('Tree\n(%d strains)'%n_strains,),
                           do_show=False,
                           )
        else:
            Phylo.draw(t, axes=ax,
                       show_confidence=False,
                       label_func=lambda x: None,
                       xticks=([],), yticks=([],),
                       ylabel=('',), xlabel=('',),
                       xlim=(-mdist*0.1, mdist+mdist*0.1),
                       axis=('off',),
                       title=('Tree\n(%d strains)'%n_strains,),
                       do_show=False,
                       )
        plt.savefig('pangenome_matrix.%s'%options.format, dpi=300)
        plt.clf()

    # Plot the pangenome pie chart
    plt.figure(figsize=(10, 10))

    # Gene clusters per class, by the fraction of strains carrying them
    core = int(((gene_counts >= n_strains*0.99) & (gene_counts <= n_strains)).sum())
    softcore = int(((gene_counts >= n_strains*0.95) & (gene_counts < n_strains*0.99)).sum())
    shell = int(((gene_counts >= n_strains*0.15) & (gene_counts < n_strains*0.95)).sum())
    cloud = int((gene_counts < n_strains*0.15).sum())

    total = n_genes

    def my_autopct(pct):
        # Convert the wedge percentage back into a gene-cluster count
        val = int(round(pct*total/100.0))
        return '{v:d}'.format(v=val)

    plt.pie([core, softcore, shell, cloud],
            labels=['core\n(%d <= strains <= %d)'%(n_strains*.99, n_strains),
                    'soft-core\n(%d <= strains < %d)'%(n_strains*.95, n_strains*.99),
                    'shell\n(%d <= strains < %d)'%(n_strains*.15, n_strains*.95),
                    'cloud\n(strains < %d)'%(n_strains*.15)],
            explode=[0.1, 0.05, 0.02, 0], radius=0.9,
            # Blue wedges whose opacity is the class's share of all clusters
            colors=[(0, 0, 1, float(x)/total) for x in (core, softcore, shell, cloud)],
            autopct=my_autopct)
    plt.savefig('pangenome_pie.%s'%options.format, dpi=300)
    plt.clf()
