changeset 0:636eeb4fb9ac draft

Uploaded
author greg
date Wed, 25 Jan 2023 20:38:21 +0000
parents
children 7d47800ee5ac
files .shed.yml draw_features.py draw_features.xml macros.xml test-data/PS01519_contigs.fasta.gz test-data/amr_cds.bed
diffstat 6 files changed, 235 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/.shed.yml	Wed Jan 25 20:38:21 2023 +0000
@@ -0,0 +1,9 @@
+name: draw_features
+owner: greg
+description: Plots contigs and features of high-quality annotated assemblies
+long_description: Plots contigs and features of high-quality annotated assemblies
+categories: 
+- Visualization
+remote_repository_url: https://github.com/gregvonkuster/galaxy_tools/tree/master/tools/pima/draw_features
+homepage_url: https://github.com/gregvonkuster/galaxy_tools
+type: unrestricted
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/draw_features.py	Wed Jan 25 20:38:21 2023 +0000
@@ -0,0 +1,117 @@
+#!/usr/bin/env python
+
+import argparse
+import os
+
+import pandas
+import matplotlib.pyplot as pyplot
+from Bio import SeqIO
+from dna_features_viewer import GraphicFeature, GraphicRecord
+
+
+AMR_COLOR = '#FED976'
+INC_GROUPS_COLOR = '#0570B0'
+FEATURE_COLORS = [AMR_COLOR, INC_GROUPS_COLOR]
+FIGURE_WIDTH = 13
+
+
+def draw_features(feature_hits_files, contigs, output_dir):
+    ofh = open('process_log', 'w')
+    # Read feature_hits_files.
+    feature_hits = pandas.Series(dtype=object)
+    feature_plots = pandas.Series(dtype=object)
+    for feature_hits_file in feature_hits_files:
+        feature_name = os.path.basename(feature_hits_file)
+        # Make sure the file is not empty.
+        if os.path.isfile(feature_hits_file) and os.path.getsize(feature_hits_file) > 0:
+            best_hits = pandas.read_csv(filepath_or_buffer=feature_hits_file, sep='\t', header=None)
+            ofh.write("\nFeature file %s will be processed\n" % str(feature_hits_file))
+        else:
+            ofh.write("\nEmpty feature file %s will NOT be processed\n" % str(feature_hits_file))
+            best_hits = None
+        feature_hits[feature_name] = best_hits
+
+    # Draw one plot per contig for simplicity.
+    ofh.write("\nProcessing contigs file: %s\n" % str(contigs))
+    for contig in SeqIO.parse(contigs, 'fasta'):
+        ofh.write("Processing contig: %s\n" % str(contig))
+        contig_plot_png = os.path.join(output_dir, '%s.png' % str(contig.id))
+        feature_sets_to_plot = pandas.Series(dtype=object)
+        for feature_number in range(len(feature_hits)):
+            feature_name = feature_hits.index.to_list()[feature_number]
+            ofh.write("Processing feature name: %s\n" % str(feature_name))
+            these_features = feature_hits[feature_name]
+            if these_features is None or these_features.shape[0] == 0:
+                # No features.
+                continue
+            contig_features = these_features.loc[these_features.iloc[:, 0] == contig.id, :]
+            if contig_features is None or contig_features.shape[0] == 0:
+                # No features.
+                continue
+            features_to_plot = []
+            for i in range(contig_features.shape[0]):
+                i = contig_features.iloc[i, :]
+                features_to_plot += [GraphicFeature(start=i[1], end=i[2], label=i[3], strand=1 * i[5], color=FEATURE_COLORS[feature_number])]
+            feature_sets_to_plot[feature_name] = features_to_plot
+        ofh.write("Number of features to plot: %d\n" % len(feature_sets_to_plot))
+        if len(feature_sets_to_plot) == 0:
+            # No features.
+            continue
+        # Determine each plot height for later scaling
+        expected_plot_heights = []
+        for i in range(len(feature_sets_to_plot)):
+            record = GraphicRecord(sequence_length=len(contig), features=feature_sets_to_plot[i])
+            if i == len(feature_sets_to_plot) - 1:
+                with_ruler = True
+            else:
+                with_ruler = False
+            plot, _ = record.plot(figure_width=FIGURE_WIDTH, with_ruler=with_ruler)
+            expected_plot_heights += [plot.figure.get_size_inches()[1]]
+        plot_height_sum = sum(expected_plot_heights)
+        # Make a figure with separate plots for each feature class.
+        plots = pyplot.subplots(nrows=len(feature_sets_to_plot),
+                                ncols=1,
+                                sharex=True,
+                                figsize=(FIGURE_WIDTH, plot_height_sum * .66666),
+                                gridspec_kw={"height_ratios": expected_plot_heights})
+        figure = plots[0]
+        plots = plots[1]
+        if len(feature_sets_to_plot) == 1:
+            plots = [plots]
+        # Add each feature class's plot with the pre-determined height.
+        for i in range(len(feature_sets_to_plot)):
+            record = GraphicRecord(sequence_length=len(contig), features=feature_sets_to_plot[i])
+            if i == len(feature_sets_to_plot) - 1:
+                with_ruler = True
+            else:
+                with_ruler = False
+            plot, _ = record.plot(ax=plots[i], with_ruler=with_ruler, figure_width=FIGURE_WIDTH)
+            ymin, ymax = plot.figure.axes[0].get_ylim()
+            if i == 0:
+                plot.text(x=0, y=ymax, s=contig.id)
+        figure.tight_layout()
+        ofh.write("Saving PNG plot file: %s\n" % str(contig_plot_png))
+        figure.savefig(contig_plot_png)
+        feature_plots[contig.id] = contig_plot_png
+    ofh.close()
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument('--feature_hits_dir', action='store', dest='feature_hits_dir', help='Directory of tabular files containing feature hits')
+    parser.add_argument('--contigs', action='store', dest='contigs', help='Fasta file of contigs')
+    parser.add_argument('--output_dir', action='store', dest='output_dir', help='Output directory')
+
+    args = parser.parse_args()
+
+    # Get thge collection of feature hits files.  The collection
+    # will be sorted alphabetically and will contain 2 files
+    # named something like AMR_CDS_311_2022_12_20.fasta and
+    # Incompatibility_Groups_2023_01_01.fasta.
+    feature_hits_files = []
+    for file_name in sorted(os.listdir(args.feature_hits_dir)):
+        file_path = os.path.abspath(os.path.join(args.feature_hits_dir, file_name))
+        feature_hits_files.append(file_path)
+
+    draw_features(feature_hits_files, args.contigs, args.output_dir)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/draw_features.xml	Wed Jan 25 20:38:21 2023 +0000
@@ -0,0 +1,74 @@
+<tool id="draw_features" name="PIMA: draw features" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
+    <description>of annotated assemblies</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements"/>
+    <command detect_errors="exit_code"><![CDATA[
+#import re
+
+#if $contigs.is_of_type('fasta.gz'):
+    gunzip -c '$contigs' > 'contigs.fasta' &&
+#else:
+    ln -s '$contigs' 'contigs.fasta' &&
+#end if
+
+mkdir feature_hits_dir &&
+mkdir output_dir &&
+
+#for $i in $feature_hits:
+    #set file_name = $i.file_name
+    #set identifier = re.sub('[^\s\w\-\\.]', '_', str($i.element_identifier))
+    ln -s '$file_name' 'feature_hits_dir/$identifier' &&
+#end for
+
+python '$__tool_directory__/draw_features.py'
+--contigs 'contigs.fasta'
+--feature_hits_dir 'feature_hits_dir'
+--output_dir 'output_dir'
+#if str($output_process_log) == 'yes':
+    && mv 'process_log' '$process_log'
+#end if
+]]></command>
+    <inputs>
+        <param name="contigs" type="data" format="fasta,fasta.gz" label="Fasta file of assembled contigs"/>
+        <param name="feature_hits" format="bed" type="data_collection" collection_type="list" label="Collection of feature hits BED files"/>
+        <param name="output_process_log" type="select" display="radio" label="Output process log file?">
+            <option value="no" selected="true">No</option>
+            <option value="yes">Yes</option>
+        </param>
+    </inputs>
+    <outputs>
+        <data name="process_log" format="txt" label="${tool.name} on ${on_string} (Process log)">
+            <filter>output_process_log == 'yes'</filter>
+        </data>
+        <collection name="features_png" type="list" format="png">
+            <discover_datasets pattern="(?P&lt;designation&gt;.+)\.(?P&lt;ext&gt;png)" directory="output_dir"/>
+        </collection>
+    </outputs>
+    <tests>
+        <test>
+            <param name="contigs" value="PS01519_contigs.fasta.gz" ftype="fasta.gz"/>
+            <param name="feature_hits">
+                <collection type="list">
+                    <element name="amr_cds.bed" value="amr_cds.bed"/>
+                </collection>
+            </param>
+            <output_collection name="features_png" type="list" count="1">
+                <element name="contig_1" ftype="png">
+                    <assert_contents>
+                        <has_size value="25383" delta="100"/>
+                    </assert_contents>
+                </element>
+            </output_collection>
+        </test>
+    </tests>
+    <help>
+**What it does**
+
+Accepts a FASTA file of assembled contigs and a collection of 2 BED files containing the feature hits and plots
+the features.
+    </help>
+    <expand macro="citations"/>
+</tool>
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml	Wed Jan 25 20:38:21 2023 +0000
@@ -0,0 +1,19 @@
+<macros>
+    <token name="@TOOL_VERSION@">1.0.0</token>
+    <token name="@VERSION_SUFFIX@">0</token>
+    <token name="@PROFILE@">21.01</token>
+    <xml name="requirements">
+        <requirements>
+            <requirement type="package" version="1.79">biopython</requirement>
+            <requirement type="package" version="3.1.2">dna_features_viewer</requirement>
+            <requirement type="package" version="3.6.2">matplotlib</requirement>
+            <requirement type="package" version="1.5.2">pandas</requirement>
+        </requirements>
+    </xml>
+    <xml name="citations">
+        <citations>
+            <citation type="doi">10.1101/011650</citation>
+        </citations>
+    </xml>
+</macros>
+
Binary file test-data/PS01519_contigs.fasta.gz has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/amr_cds.bed	Wed Jan 25 20:38:21 2023 +0000
@@ -0,0 +1,16 @@
+contig_1	484392	485921	62550724|AAX88798.1|AY818309.1|1|1|alo|alo|anthrolysin_O/cereolysin_O_family_cholesterol-dependent_cytolysin_Alo	0.984	+
+contig_1	4673373	4675772	29894390|AAP07680.1|AE016877.1|1|1|inhA2|inhA2|M6_family_metalloprotease_immune_inhibitor_InhA2	1.000	-
+contig_1	4253739	4254749	29894808|AAP08097.1|AE016877.1|1|1|cytK2|cytK2|beta-channel_forming_cytolysin_CytK2	1.000	+
+contig_1	232577	233434	5305719|AAD41788.1|AF132087.1|1|1|plcR|plcR|transcriptional_regulator_PlcR	1.000	+
+contig_1	4667636	4668636	505581118|AGL98063.1|KC527544.1|1|1|cerB|cerB|sphingomyelinase_CerB	0.983	-
+contig_1	4095082	4097472	296323016|ADH05944.1|CP001903.1|1|1|inhA1|inhA1|M6_family_metalloprotease_immune_inhibitor_InhA1	0.998	-
+contig_1	2421494	2422813	29896770|AAP10048.1|AE016877.1|1|1|hblC|hblC|hemolytic_enterotoxin_HBL_lytic_component_L2	0.977	+
+contig_1	3520286	3521566	29895642|AAP08924.1|AE016877.1|1|1|entFM|entFM|enterotoxin_EntFM	0.988	-
+contig_1	2423992	2425119	29896768|AAP10046.1|AE016877.1|1|1|hblB|hblB|hemolytic_enterotoxin_HBL_binding_subunit_HblB	0.996	+
+contig_1	1750170	1751159	29897410|AAP10686.1|AE016877.1|1|1|plcA|plcA|phosphatidylinositol_diacylglycerol-lyase	1.000	+
+contig_1	3448511	3448927	446866507|WP_000943763.1|NG_050591.1|1|1|fosBx1|fosB_gen|fosfomycin_resistance_bacillithiol_transferase_FosBx1	0.998	-
+contig_1	1994015	1995253	29897180|AAP10457.1|AE016877.1|1|1|hlyII|hlyII|hemolysin_II_HlyII	1.000	+
+contig_1	2968066	2968986	1028083092|WP_063842248.1|NG_047482.1|1|1|bla1|bla1|class_A_beta-lactamase_Bla1	0.976	-
+contig_1	2079369	2080142	1028078998|WP_063839879.1|NG_056058.1|1|1|BcII|BcII|BcII_family_subclass_B1_metallo-beta-lactamase	0.963	+
+contig_1	3591341	3592420	29895498|AAP08785.1|AE016877.1|1|1|nheC|nheC|non-hemolytic_enterotoxin_NHE_subunit_C	0.999	-
+contig_1	2425495	2426895	6686906|CAB64770.1|AJ007794.1|1|1|hblA|hblA|hemolytic_enterotoxin_HBL_binding_subunit_HblA	0.988	+