0
|
1 #!/usr/bin/env python
|
|
2
|
|
3 import argparse
|
|
4 import os
|
7
|
5 import random
|
0
|
6
|
7
|
7 import matplotlib.pyplot as pyplot
|
0
|
8 import pandas
|
|
9 from Bio import SeqIO
|
|
10 from dna_features_viewer import GraphicFeature, GraphicRecord
|
|
11
|
|
12
|
|
13 AMR_COLOR = '#FED976'
|
|
14 INC_GROUPS_COLOR = '#0570B0'
|
|
15 FEATURE_COLORS = [AMR_COLOR, INC_GROUPS_COLOR]
|
|
16 FIGURE_WIDTH = 13
|
|
17
|
|
18
|
7
|
19 def get_random_color():
|
|
20 number_of_colors = 16
|
|
21 colors = ['#%s' % ' '.join([random.choice('0123456789ABCDEF') for j in range(6)]) for i in range(number_of_colors)]
|
|
22 return random.choice(colors)
|
|
23
|
|
24
|
0
|
25 def draw_features(feature_hits_files, contigs, output_dir):
|
|
26 ofh = open('process_log', 'w')
|
|
27 # Read feature_hits_files.
|
|
28 feature_hits = pandas.Series(dtype=object)
|
|
29 feature_plots = pandas.Series(dtype=object)
|
|
30 for feature_hits_file in feature_hits_files:
|
|
31 feature_name = os.path.basename(feature_hits_file)
|
|
32 # Make sure the file is not empty.
|
|
33 if os.path.isfile(feature_hits_file) and os.path.getsize(feature_hits_file) > 0:
|
|
34 best_hits = pandas.read_csv(filepath_or_buffer=feature_hits_file, sep='\t', header=None)
|
1
|
35 ofh.write("\nFeature file %s will be processed\n" % os.path.basename(feature_hits_file))
|
0
|
36 else:
|
1
|
37 ofh.write("\nEmpty feature file %s will NOT be processed\n" % os.path.basename(feature_hits_file))
|
0
|
38 best_hits = None
|
|
39 feature_hits[feature_name] = best_hits
|
|
40
|
|
41 # Draw one plot per contig for simplicity.
|
|
42 ofh.write("\nProcessing contigs file: %s\n" % str(contigs))
|
|
43 for contig in SeqIO.parse(contigs, 'fasta'):
|
|
44 ofh.write("Processing contig: %s\n" % str(contig))
|
|
45 contig_plot_png = os.path.join(output_dir, '%s.png' % str(contig.id))
|
|
46 feature_sets_to_plot = pandas.Series(dtype=object)
|
|
47 for feature_number in range(len(feature_hits)):
|
|
48 feature_name = feature_hits.index.to_list()[feature_number]
|
|
49 ofh.write("Processing feature name: %s\n" % str(feature_name))
|
|
50 these_features = feature_hits[feature_name]
|
|
51 if these_features is None or these_features.shape[0] == 0:
|
|
52 # No features.
|
|
53 continue
|
|
54 contig_features = these_features.loc[these_features.iloc[:, 0] == contig.id, :]
|
|
55 if contig_features is None or contig_features.shape[0] == 0:
|
|
56 # No features.
|
|
57 continue
|
|
58 features_to_plot = []
|
|
59 for i in range(contig_features.shape[0]):
|
|
60 i = contig_features.iloc[i, :]
|
7
|
61 if feature_number <= len(FEATURE_COLORS):
|
|
62 color = FEATURE_COLORS[feature_number]
|
|
63 else:
|
|
64 color = get_random_color()
|
|
65 features_to_plot += [GraphicFeature(start=i[1], end=i[2], label=i[3], strand=1 * i[5], color=color)]
|
0
|
66 feature_sets_to_plot[feature_name] = features_to_plot
|
|
67 ofh.write("Number of features to plot: %d\n" % len(feature_sets_to_plot))
|
|
68 if len(feature_sets_to_plot) == 0:
|
|
69 # No features.
|
|
70 continue
|
|
71 # Determine each plot height for later scaling
|
|
72 expected_plot_heights = []
|
|
73 for i in range(len(feature_sets_to_plot)):
|
|
74 record = GraphicRecord(sequence_length=len(contig), features=feature_sets_to_plot[i])
|
|
75 if i == len(feature_sets_to_plot) - 1:
|
|
76 with_ruler = True
|
|
77 else:
|
|
78 with_ruler = False
|
|
79 plot, _ = record.plot(figure_width=FIGURE_WIDTH, with_ruler=with_ruler)
|
|
80 expected_plot_heights += [plot.figure.get_size_inches()[1]]
|
|
81 plot_height_sum = sum(expected_plot_heights)
|
|
82 # Make a figure with separate plots for each feature class.
|
|
83 plots = pyplot.subplots(nrows=len(feature_sets_to_plot),
|
|
84 ncols=1,
|
|
85 sharex=True,
|
|
86 figsize=(FIGURE_WIDTH, plot_height_sum * .66666),
|
|
87 gridspec_kw={"height_ratios": expected_plot_heights})
|
|
88 figure = plots[0]
|
|
89 plots = plots[1]
|
|
90 if len(feature_sets_to_plot) == 1:
|
|
91 plots = [plots]
|
|
92 # Add each feature class's plot with the pre-determined height.
|
|
93 for i in range(len(feature_sets_to_plot)):
|
|
94 record = GraphicRecord(sequence_length=len(contig), features=feature_sets_to_plot[i])
|
|
95 if i == len(feature_sets_to_plot) - 1:
|
|
96 with_ruler = True
|
|
97 else:
|
|
98 with_ruler = False
|
|
99 plot, _ = record.plot(ax=plots[i], with_ruler=with_ruler, figure_width=FIGURE_WIDTH)
|
|
100 ymin, ymax = plot.figure.axes[0].get_ylim()
|
|
101 if i == 0:
|
|
102 plot.text(x=0, y=ymax, s=contig.id)
|
|
103 figure.tight_layout()
|
|
104 ofh.write("Saving PNG plot file: %s\n" % str(contig_plot_png))
|
|
105 figure.savefig(contig_plot_png)
|
|
106 feature_plots[contig.id] = contig_plot_png
|
|
107 ofh.close()
|
|
108
|
|
109
|
|
110 if __name__ == '__main__':
|
|
111 parser = argparse.ArgumentParser()
|
|
112
|
|
113 parser.add_argument('--feature_hits_dir', action='store', dest='feature_hits_dir', help='Directory of tabular files containing feature hits')
|
|
114 parser.add_argument('--contigs', action='store', dest='contigs', help='Fasta file of contigs')
|
|
115 parser.add_argument('--output_dir', action='store', dest='output_dir', help='Output directory')
|
|
116
|
|
117 args = parser.parse_args()
|
|
118
|
|
119 # Get thge collection of feature hits files. The collection
|
|
120 # will be sorted alphabetically and will contain 2 files
|
|
121 # named something like AMR_CDS_311_2022_12_20.fasta and
|
|
122 # Incompatibility_Groups_2023_01_01.fasta.
|
|
123 feature_hits_files = []
|
|
124 for file_name in sorted(os.listdir(args.feature_hits_dir)):
|
|
125 file_path = os.path.abspath(os.path.join(args.feature_hits_dir, file_name))
|
|
126 feature_hits_files.append(file_path)
|
|
127
|
|
128 draw_features(feature_hits_files, args.contigs, args.output_dir)
|