annotate draw_features.py @ 8:67520145696f draft default tip

Uploaded
author greg
date Thu, 23 Mar 2023 19:57:44 +0000
parents c5555f1c8f49
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
1 #!/usr/bin/env python
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
2
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
3 import argparse
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
4 import os
7
c5555f1c8f49 Uploaded
greg
parents: 1
diff changeset
5 import random
0
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
6
7
c5555f1c8f49 Uploaded
greg
parents: 1
diff changeset
7 import matplotlib.pyplot as pyplot
0
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
8 import pandas
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
9 from Bio import SeqIO
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
10 from dna_features_viewer import GraphicFeature, GraphicRecord
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
11
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
12
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
13 AMR_COLOR = '#FED976'
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
14 INC_GROUPS_COLOR = '#0570B0'
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
15 FEATURE_COLORS = [AMR_COLOR, INC_GROUPS_COLOR]
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
16 FIGURE_WIDTH = 13
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
17
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
18
7
c5555f1c8f49 Uploaded
greg
parents: 1
diff changeset
19 def get_random_color():
8
67520145696f Uploaded
greg
parents: 7
diff changeset
20 r = lambda: random.randint(0,255)
67520145696f Uploaded
greg
parents: 7
diff changeset
21 return '#%02X%02X%02X' % (r(),r(),r())
7
c5555f1c8f49 Uploaded
greg
parents: 1
diff changeset
22
c5555f1c8f49 Uploaded
greg
parents: 1
diff changeset
23
0
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
24 def draw_features(feature_hits_files, contigs, output_dir):
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
25 ofh = open('process_log', 'w')
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
26 # Read feature_hits_files.
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
27 feature_hits = pandas.Series(dtype=object)
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
28 feature_plots = pandas.Series(dtype=object)
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
29 for feature_hits_file in feature_hits_files:
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
30 feature_name = os.path.basename(feature_hits_file)
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
31 # Make sure the file is not empty.
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
32 if os.path.isfile(feature_hits_file) and os.path.getsize(feature_hits_file) > 0:
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
33 best_hits = pandas.read_csv(filepath_or_buffer=feature_hits_file, sep='\t', header=None)
1
7d47800ee5ac Uploaded
greg
parents: 0
diff changeset
34 ofh.write("\nFeature file %s will be processed\n" % os.path.basename(feature_hits_file))
0
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
35 else:
1
7d47800ee5ac Uploaded
greg
parents: 0
diff changeset
36 ofh.write("\nEmpty feature file %s will NOT be processed\n" % os.path.basename(feature_hits_file))
0
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
37 best_hits = None
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
38 feature_hits[feature_name] = best_hits
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
39
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
40 # Draw one plot per contig for simplicity.
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
41 ofh.write("\nProcessing contigs file: %s\n" % str(contigs))
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
42 for contig in SeqIO.parse(contigs, 'fasta'):
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
43 ofh.write("Processing contig: %s\n" % str(contig))
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
44 contig_plot_png = os.path.join(output_dir, '%s.png' % str(contig.id))
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
45 feature_sets_to_plot = pandas.Series(dtype=object)
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
46 for feature_number in range(len(feature_hits)):
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
47 feature_name = feature_hits.index.to_list()[feature_number]
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
48 ofh.write("Processing feature name: %s\n" % str(feature_name))
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
49 these_features = feature_hits[feature_name]
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
50 if these_features is None or these_features.shape[0] == 0:
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
51 # No features.
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
52 continue
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
53 contig_features = these_features.loc[these_features.iloc[:, 0] == contig.id, :]
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
54 if contig_features is None or contig_features.shape[0] == 0:
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
55 # No features.
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
56 continue
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
57 features_to_plot = []
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
58 for i in range(contig_features.shape[0]):
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
59 i = contig_features.iloc[i, :]
8
67520145696f Uploaded
greg
parents: 7
diff changeset
60 if feature_number < len(FEATURE_COLORS):
7
c5555f1c8f49 Uploaded
greg
parents: 1
diff changeset
61 color = FEATURE_COLORS[feature_number]
c5555f1c8f49 Uploaded
greg
parents: 1
diff changeset
62 else:
c5555f1c8f49 Uploaded
greg
parents: 1
diff changeset
63 color = get_random_color()
c5555f1c8f49 Uploaded
greg
parents: 1
diff changeset
64 features_to_plot += [GraphicFeature(start=i[1], end=i[2], label=i[3], strand=1 * i[5], color=color)]
0
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
65 feature_sets_to_plot[feature_name] = features_to_plot
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
66 ofh.write("Number of features to plot: %d\n" % len(feature_sets_to_plot))
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
67 if len(feature_sets_to_plot) == 0:
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
68 # No features.
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
69 continue
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
70 # Determine each plot height for later scaling
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
71 expected_plot_heights = []
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
72 for i in range(len(feature_sets_to_plot)):
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
73 record = GraphicRecord(sequence_length=len(contig), features=feature_sets_to_plot[i])
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
74 if i == len(feature_sets_to_plot) - 1:
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
75 with_ruler = True
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
76 else:
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
77 with_ruler = False
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
78 plot, _ = record.plot(figure_width=FIGURE_WIDTH, with_ruler=with_ruler)
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
79 expected_plot_heights += [plot.figure.get_size_inches()[1]]
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
80 plot_height_sum = sum(expected_plot_heights)
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
81 # Make a figure with separate plots for each feature class.
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
82 plots = pyplot.subplots(nrows=len(feature_sets_to_plot),
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
83 ncols=1,
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
84 sharex=True,
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
85 figsize=(FIGURE_WIDTH, plot_height_sum * .66666),
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
86 gridspec_kw={"height_ratios": expected_plot_heights})
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
87 figure = plots[0]
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
88 plots = plots[1]
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
89 if len(feature_sets_to_plot) == 1:
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
90 plots = [plots]
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
91 # Add each feature class's plot with the pre-determined height.
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
92 for i in range(len(feature_sets_to_plot)):
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
93 record = GraphicRecord(sequence_length=len(contig), features=feature_sets_to_plot[i])
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
94 if i == len(feature_sets_to_plot) - 1:
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
95 with_ruler = True
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
96 else:
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
97 with_ruler = False
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
98 plot, _ = record.plot(ax=plots[i], with_ruler=with_ruler, figure_width=FIGURE_WIDTH)
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
99 ymin, ymax = plot.figure.axes[0].get_ylim()
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
100 if i == 0:
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
101 plot.text(x=0, y=ymax, s=contig.id)
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
102 figure.tight_layout()
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
103 ofh.write("Saving PNG plot file: %s\n" % str(contig_plot_png))
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
104 figure.savefig(contig_plot_png)
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
105 feature_plots[contig.id] = contig_plot_png
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
106 ofh.close()
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
107
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
108
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
109 if __name__ == '__main__':
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
110 parser = argparse.ArgumentParser()
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
111
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
112 parser.add_argument('--feature_hits_dir', action='store', dest='feature_hits_dir', help='Directory of tabular files containing feature hits')
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
113 parser.add_argument('--contigs', action='store', dest='contigs', help='Fasta file of contigs')
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
114 parser.add_argument('--output_dir', action='store', dest='output_dir', help='Output directory')
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
115
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
116 args = parser.parse_args()
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
117
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
118 # Get thge collection of feature hits files. The collection
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
119 # will be sorted alphabetically and will contain 2 files
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
120 # named something like AMR_CDS_311_2022_12_20.fasta and
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
121 # Incompatibility_Groups_2023_01_01.fasta.
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
122 feature_hits_files = []
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
123 for file_name in sorted(os.listdir(args.feature_hits_dir)):
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
124 file_path = os.path.abspath(os.path.join(args.feature_hits_dir, file_name))
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
125 feature_hits_files.append(file_path)
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
126
636eeb4fb9ac Uploaded
greg
parents:
diff changeset
127 draw_features(feature_hits_files, args.contigs, args.output_dir)