annotate Roary/contrib/roary_plots/roary_plots.py @ 0:c47a5f61bc9f draft

Uploaded
author dereeper
date Fri, 14 May 2021 20:27:06 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
1 #!/usr/bin/env python
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
2 # Copyright (C) <2015> EMBL-European Bioinformatics Institute
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
3
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
4 # This program is free software: you can redistribute it and/or
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
5 # modify it under the terms of the GNU General Public License as
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
6 # published by the Free Software Foundation, either version 3 of
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
7 # the License, or (at your option) any later version.
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
8
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
9 # This program is distributed in the hope that it will be useful,
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
12 # GNU General Public License for more details.
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
13
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
14 # Neither the institution name nor the name roary_plots
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
15 # can be used to endorse or promote products derived from
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
16 # this software without prior written permission.
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
17 # For written permission, please contact <marco@ebi.ac.uk>.
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
18
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
19 # Products derived from this software may not be called roary_plots
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
20 # nor may roary_plots appear in their names without prior written
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
21 # permission of the developers. You should have received a copy
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
22 # of the GNU General Public License along with this program.
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
23 # If not, see <http://www.gnu.org/licenses/>.
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
24
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
__author__ = "Marco Galardini"
__version__ = '0.1.0'


def get_options():
    """Parse the command-line arguments of roary_plots.py.

    Positional arguments (both optional, falling back to their defaults):
        tree         Newick tree file
        spreadsheet  Roary gene presence/absence spreadsheet

    Options: --labels, --format {png,tiff,pdf,svg}, -N/--skipped-columns
    and --version.

    Returns the ``argparse.Namespace`` produced from ``sys.argv``.
    """
    # Imported here so that importing this module stays cheap
    import argparse

    # create the top-level parser
    description = "Create plots from roary outputs"
    parser = argparse.ArgumentParser(description=description,
                                     prog='roary_plots.py')

    # nargs='?' is what lets argparse apply `default` to a positional;
    # without it both arguments are mandatory and the defaults are dead.
    parser.add_argument('tree', action='store', nargs='?',
                        help='Newick Tree file',
                        default='accessory_binary_genes.fa.newick')
    parser.add_argument('spreadsheet', action='store', nargs='?',
                        help='Roary gene presence/absence spreadsheet',
                        default='gene_presence_absence.csv')

    parser.add_argument('--labels', action='store_true',
                        default=False,
                        help='Add node labels to the tree (up to 10 chars)')
    parser.add_argument('--format',
                        choices=('png',
                                 'tiff',
                                 'pdf',
                                 'svg'),
                        default='png',
                        help='Output format [Default: png]')
    parser.add_argument('-N', '--skipped-columns', action='store',
                        type=int,
                        default=14,
                        help='First N columns of Roary\'s output to exclude [Default: 14]')

    parser.add_argument('--version', action='version',
                        version='%(prog)s '+__version__)

    return parser.parse_args()
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
60
c47a5f61bc9f Uploaded
dereeper
parents:
diff changeset
def presence_absence_matrix(table, skipped_columns):
    """Turn a raw Roary spreadsheet into a 1/0 presence/absence matrix.

    table is the DataFrame read from the spreadsheet; it must contain a
    'Gene' column, which becomes the index.  skipped_columns is the number
    of leading spreadsheet columns (counting 'Gene') to discard; every
    remaining column is kept.

    Cells holding a string of at least two characters become 1 and missing
    cells become 0.  The result is a new integer DataFrame; the input is
    left untouched.
    """
    # Set index (group name)
    table = table.set_index('Gene')
    # Drop the other info columns; 'Gene' is no longer a column, hence the -1
    table = table.drop(list(table.columns[:skipped_columns - 1]), axis=1)

    # Transform it in a presence/absence matrix (1/0)
    matrix = table.replace('.{2,100}', 1, regex=True)
    matrix = matrix.fillna(0)
    # Depending on the pandas version the columns may still be `object`
    # here; force a numeric dtype so sums and image plotting work.
    return matrix.astype(int)


def pangenome_class_counts(strain_counts, n_strains):
    """Count the gene clusters in each pangenome class.

    strain_counts holds, per gene cluster, the number of strains carrying
    it; n_strains is the total number of strains.

    Returns the tuple (core, softcore, shell, cloud) using the 99% / 95% /
    15% thresholds on n_strains.
    """
    core = int(((strain_counts >= n_strains * 0.99) &
                (strain_counts <= n_strains)).sum())
    softcore = int(((strain_counts >= n_strains * 0.95) &
                    (strain_counts < n_strains * 0.99)).sum())
    shell = int(((strain_counts >= n_strains * 0.15) &
                 (strain_counts < n_strains * 0.95)).sum())
    cloud = int((strain_counts < n_strains * 0.15).sum())
    return core, softcore, shell, cloud


if __name__ == "__main__":
    options = get_options()

    import matplotlib
    # The backend is selected before pyplot is imported
    matplotlib.use('Agg')

    import matplotlib.pyplot as plt
    import seaborn as sns

    sns.set_style('white')

    import pandas as pd
    from Bio import Phylo

    t = Phylo.read(options.tree, 'newick')

    # Max distance to create better plots
    mdist = max([t.distance(t.root, x) for x in t.get_terminals()])

    # Load roary and reduce it to a 1/0 matrix (rows: genes, columns: strains)
    roary = presence_absence_matrix(
        pd.read_csv(options.spreadsheet, low_memory=False),
        options.skipped_columns)

    # Number of strains carrying each gene; computed once and reused below
    strain_counts = roary.sum(axis=1)
    n_genes, n_strains = roary.shape

    # Sort the matrix by the sum of strains presence
    idx = strain_counts.sort_values(ascending=False).index
    roary_sorted = roary.loc[idx]

    # Pangenome frequency plot
    plt.figure(figsize=(7, 5))

    plt.hist(strain_counts, n_strains,
             histtype="stepfilled", alpha=.7)

    plt.xlabel('No. of genomes')
    plt.ylabel('No. of genes')

    sns.despine(left=True,
                bottom=True)
    plt.savefig('pangenome_frequency.%s'%options.format, dpi=300)
    plt.clf()

    # Sort the matrix according to tip labels in the tree
    roary_sorted = roary_sorted[[x.name for x in t.get_terminals()]]

    # Plot presence/absence matrix against the tree
    with sns.axes_style('whitegrid'):
        fig = plt.figure(figsize=(17, 10))

        # Right-hand 30/40 of the figure: the matrix
        ax1 = plt.subplot2grid((1,40), (0, 10), colspan=30)
        ax1.matshow(roary_sorted.T, cmap=plt.cm.Blues,
                    vmin=0, vmax=1,
                    aspect='auto',
                    interpolation='none',
                    )
        ax1.set_yticks([])
        ax1.set_xticks([])
        ax1.axis('off')

        # Left-hand 10/40 of the figure: the tree.  Only this one axes is
        # created here: an extra overlapping axes would be kept (and drawn)
        # by Matplotlib versions that no longer auto-remove overlaps.
        # matplotlib v1/2 workaround
        try:
            ax = plt.subplot2grid((1,40), (0, 0), colspan=10, facecolor='white')
        except AttributeError:
            ax = plt.subplot2grid((1,40), (0, 0), colspan=10, axisbg='white')

        fig.subplots_adjust(wspace=0, hspace=0)

        ax1.set_title('Roary matrix\n(%d gene clusters)'%n_genes)

        # Keyword arguments shared by the labelled and unlabelled tree
        draw_kwargs = dict(axes=ax,
                           show_confidence=False,
                           xticks=([],), yticks=([],),
                           ylabel=('',), xlabel=('',),
                           axis=('off',),
                           title=('Tree\n(%d strains)'%n_strains,),
                           do_show=False,
                           )

        if options.labels:
            # Shrink the font as the number of strains grows, down to 7
            fsize = max(12 - 0.1*n_strains, 7)
            with plt.rc_context({'font.size': fsize}):
                Phylo.draw(t,
                           label_func=lambda x: str(x)[:10],
                           # extra room on the right for the tip labels
                           xlim=(-mdist*0.1,mdist+mdist*0.45-mdist*n_strains*0.001),
                           **draw_kwargs)
        else:
            Phylo.draw(t,
                       label_func=lambda x: None,
                       xlim=(-mdist*0.1,mdist+mdist*0.1),
                       **draw_kwargs)
        plt.savefig('pangenome_matrix.%s'%options.format, dpi=300)
        plt.clf()

    # Plot the pangenome pie chart
    plt.figure(figsize=(10, 10))

    core, softcore, shell, cloud = pangenome_class_counts(strain_counts,
                                                          n_strains)

    total = n_genes

    def my_autopct(pct):
        # Convert the wedge percentage back to a number of gene clusters
        val=int(round(pct*total/100.0))
        return '{v:d}'.format(v=val)

    plt.pie([core, softcore, shell, cloud],
            labels=['core\n(%d <= strains <= %d)'%(n_strains*.99,n_strains),
                    'soft-core\n(%d <= strains < %d)'%(n_strains*.95,n_strains*.99),
                    'shell\n(%d <= strains < %d)'%(n_strains*.15,n_strains*.95),
                    'cloud\n(strains < %d)'%(n_strains*.15)],
            explode=[0.1, 0.05, 0.02, 0], radius=0.9,
            # wedge opacity is proportional to the size of the class
            colors=[(0, 0, 1, float(x)/total) for x in (core, softcore, shell, cloud)],
            autopct=my_autopct)
    plt.savefig('pangenome_pie.%s'%options.format, dpi=300)
    plt.clf()