Mercurial > repos > greg > draw_features
comparison draw_features.py @ 0:636eeb4fb9ac draft
Uploaded
author | greg |
---|---|
date | Wed, 25 Jan 2023 20:38:21 +0000 |
parents | |
children | 7d47800ee5ac |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:636eeb4fb9ac |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 import argparse | |
4 import os | |
5 | |
6 import pandas | |
7 import matplotlib.pyplot as pyplot | |
8 from Bio import SeqIO | |
9 from dna_features_viewer import GraphicFeature, GraphicRecord | |
10 | |
11 | |
12 AMR_COLOR = '#FED976' | |
13 INC_GROUPS_COLOR = '#0570B0' | |
14 FEATURE_COLORS = [AMR_COLOR, INC_GROUPS_COLOR] | |
15 FIGURE_WIDTH = 13 | |
16 | |
17 | |
18 def draw_features(feature_hits_files, contigs, output_dir): | |
19 ofh = open('process_log', 'w') | |
20 # Read feature_hits_files. | |
21 feature_hits = pandas.Series(dtype=object) | |
22 feature_plots = pandas.Series(dtype=object) | |
23 for feature_hits_file in feature_hits_files: | |
24 feature_name = os.path.basename(feature_hits_file) | |
25 # Make sure the file is not empty. | |
26 if os.path.isfile(feature_hits_file) and os.path.getsize(feature_hits_file) > 0: | |
27 best_hits = pandas.read_csv(filepath_or_buffer=feature_hits_file, sep='\t', header=None) | |
28 ofh.write("\nFeature file %s will be processed\n" % str(feature_hits_file)) | |
29 else: | |
30 ofh.write("\nEmpty feature file %s will NOT be processed\n" % str(feature_hits_file)) | |
31 best_hits = None | |
32 feature_hits[feature_name] = best_hits | |
33 | |
34 # Draw one plot per contig for simplicity. | |
35 ofh.write("\nProcessing contigs file: %s\n" % str(contigs)) | |
36 for contig in SeqIO.parse(contigs, 'fasta'): | |
37 ofh.write("Processing contig: %s\n" % str(contig)) | |
38 contig_plot_png = os.path.join(output_dir, '%s.png' % str(contig.id)) | |
39 feature_sets_to_plot = pandas.Series(dtype=object) | |
40 for feature_number in range(len(feature_hits)): | |
41 feature_name = feature_hits.index.to_list()[feature_number] | |
42 ofh.write("Processing feature name: %s\n" % str(feature_name)) | |
43 these_features = feature_hits[feature_name] | |
44 if these_features is None or these_features.shape[0] == 0: | |
45 # No features. | |
46 continue | |
47 contig_features = these_features.loc[these_features.iloc[:, 0] == contig.id, :] | |
48 if contig_features is None or contig_features.shape[0] == 0: | |
49 # No features. | |
50 continue | |
51 features_to_plot = [] | |
52 for i in range(contig_features.shape[0]): | |
53 i = contig_features.iloc[i, :] | |
54 features_to_plot += [GraphicFeature(start=i[1], end=i[2], label=i[3], strand=1 * i[5], color=FEATURE_COLORS[feature_number])] | |
55 feature_sets_to_plot[feature_name] = features_to_plot | |
56 ofh.write("Number of features to plot: %d\n" % len(feature_sets_to_plot)) | |
57 if len(feature_sets_to_plot) == 0: | |
58 # No features. | |
59 continue | |
60 # Determine each plot height for later scaling | |
61 expected_plot_heights = [] | |
62 for i in range(len(feature_sets_to_plot)): | |
63 record = GraphicRecord(sequence_length=len(contig), features=feature_sets_to_plot[i]) | |
64 if i == len(feature_sets_to_plot) - 1: | |
65 with_ruler = True | |
66 else: | |
67 with_ruler = False | |
68 plot, _ = record.plot(figure_width=FIGURE_WIDTH, with_ruler=with_ruler) | |
69 expected_plot_heights += [plot.figure.get_size_inches()[1]] | |
70 plot_height_sum = sum(expected_plot_heights) | |
71 # Make a figure with separate plots for each feature class. | |
72 plots = pyplot.subplots(nrows=len(feature_sets_to_plot), | |
73 ncols=1, | |
74 sharex=True, | |
75 figsize=(FIGURE_WIDTH, plot_height_sum * .66666), | |
76 gridspec_kw={"height_ratios": expected_plot_heights}) | |
77 figure = plots[0] | |
78 plots = plots[1] | |
79 if len(feature_sets_to_plot) == 1: | |
80 plots = [plots] | |
81 # Add each feature class's plot with the pre-determined height. | |
82 for i in range(len(feature_sets_to_plot)): | |
83 record = GraphicRecord(sequence_length=len(contig), features=feature_sets_to_plot[i]) | |
84 if i == len(feature_sets_to_plot) - 1: | |
85 with_ruler = True | |
86 else: | |
87 with_ruler = False | |
88 plot, _ = record.plot(ax=plots[i], with_ruler=with_ruler, figure_width=FIGURE_WIDTH) | |
89 ymin, ymax = plot.figure.axes[0].get_ylim() | |
90 if i == 0: | |
91 plot.text(x=0, y=ymax, s=contig.id) | |
92 figure.tight_layout() | |
93 ofh.write("Saving PNG plot file: %s\n" % str(contig_plot_png)) | |
94 figure.savefig(contig_plot_png) | |
95 feature_plots[contig.id] = contig_plot_png | |
96 ofh.close() | |
97 | |
98 | |
99 if __name__ == '__main__': | |
100 parser = argparse.ArgumentParser() | |
101 | |
102 parser.add_argument('--feature_hits_dir', action='store', dest='feature_hits_dir', help='Directory of tabular files containing feature hits') | |
103 parser.add_argument('--contigs', action='store', dest='contigs', help='Fasta file of contigs') | |
104 parser.add_argument('--output_dir', action='store', dest='output_dir', help='Output directory') | |
105 | |
106 args = parser.parse_args() | |
107 | |
108 # Get thge collection of feature hits files. The collection | |
109 # will be sorted alphabetically and will contain 2 files | |
110 # named something like AMR_CDS_311_2022_12_20.fasta and | |
111 # Incompatibility_Groups_2023_01_01.fasta. | |
112 feature_hits_files = [] | |
113 for file_name in sorted(os.listdir(args.feature_hits_dir)): | |
114 file_path = os.path.abspath(os.path.join(args.feature_hits_dir, file_name)) | |
115 feature_hits_files.append(file_path) | |
116 | |
117 draw_features(feature_hits_files, args.contigs, args.output_dir) |