Mercurial > repos > greg > draw_features
changeset 0:636eeb4fb9ac draft
Uploaded
author | greg |
---|---|
date | Wed, 25 Jan 2023 20:38:21 +0000 |
parents | |
children | 7d47800ee5ac |
files | .shed.yml draw_features.py draw_features.xml macros.xml test-data/PS01519_contigs.fasta.gz test-data/amr_cds.bed |
diffstat | 6 files changed, 235 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/.shed.yml Wed Jan 25 20:38:21 2023 +0000 @@ -0,0 +1,9 @@ +name: draw_features +owner: greg +description: Plots contigs and features of high-quality annotated assemblies +long_description: Plots contigs and features of high-quality annotated assemblies +categories: +- Visualization +remote_repository_url: https://github.com/gregvonkuster/galaxy_tools/tree/master/tools/pima/draw_features +homepage_url: https://github.com/gregvonkuster/galaxy_tools +type: unrestricted
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/draw_features.py Wed Jan 25 20:38:21 2023 +0000 @@ -0,0 +1,117 @@ +#!/usr/bin/env python + +import argparse +import os + +import pandas +import matplotlib.pyplot as pyplot +from Bio import SeqIO +from dna_features_viewer import GraphicFeature, GraphicRecord + + +AMR_COLOR = '#FED976' +INC_GROUPS_COLOR = '#0570B0' +FEATURE_COLORS = [AMR_COLOR, INC_GROUPS_COLOR] +FIGURE_WIDTH = 13 + + +def draw_features(feature_hits_files, contigs, output_dir): + ofh = open('process_log', 'w') + # Read feature_hits_files. + feature_hits = pandas.Series(dtype=object) + feature_plots = pandas.Series(dtype=object) + for feature_hits_file in feature_hits_files: + feature_name = os.path.basename(feature_hits_file) + # Make sure the file is not empty. + if os.path.isfile(feature_hits_file) and os.path.getsize(feature_hits_file) > 0: + best_hits = pandas.read_csv(filepath_or_buffer=feature_hits_file, sep='\t', header=None) + ofh.write("\nFeature file %s will be processed\n" % str(feature_hits_file)) + else: + ofh.write("\nEmpty feature file %s will NOT be processed\n" % str(feature_hits_file)) + best_hits = None + feature_hits[feature_name] = best_hits + + # Draw one plot per contig for simplicity. + ofh.write("\nProcessing contigs file: %s\n" % str(contigs)) + for contig in SeqIO.parse(contigs, 'fasta'): + ofh.write("Processing contig: %s\n" % str(contig)) + contig_plot_png = os.path.join(output_dir, '%s.png' % str(contig.id)) + feature_sets_to_plot = pandas.Series(dtype=object) + for feature_number in range(len(feature_hits)): + feature_name = feature_hits.index.to_list()[feature_number] + ofh.write("Processing feature name: %s\n" % str(feature_name)) + these_features = feature_hits[feature_name] + if these_features is None or these_features.shape[0] == 0: + # No features. + continue + contig_features = these_features.loc[these_features.iloc[:, 0] == contig.id, :] + if contig_features is None or contig_features.shape[0] == 0: + # No features. + continue + features_to_plot = [] + for i in range(contig_features.shape[0]): + i = contig_features.iloc[i, :] + features_to_plot += [GraphicFeature(start=i[1], end=i[2], label=i[3], strand=1 * i[5], color=FEATURE_COLORS[feature_number])] + feature_sets_to_plot[feature_name] = features_to_plot + ofh.write("Number of features to plot: %d\n" % len(feature_sets_to_plot)) + if len(feature_sets_to_plot) == 0: + # No features. + continue + # Determine each plot height for later scaling + expected_plot_heights = [] + for i in range(len(feature_sets_to_plot)): + record = GraphicRecord(sequence_length=len(contig), features=feature_sets_to_plot[i]) + if i == len(feature_sets_to_plot) - 1: + with_ruler = True + else: + with_ruler = False + plot, _ = record.plot(figure_width=FIGURE_WIDTH, with_ruler=with_ruler) + expected_plot_heights += [plot.figure.get_size_inches()[1]] + plot_height_sum = sum(expected_plot_heights) + # Make a figure with separate plots for each feature class. + plots = pyplot.subplots(nrows=len(feature_sets_to_plot), + ncols=1, + sharex=True, + figsize=(FIGURE_WIDTH, plot_height_sum * .66666), + gridspec_kw={"height_ratios": expected_plot_heights}) + figure = plots[0] + plots = plots[1] + if len(feature_sets_to_plot) == 1: + plots = [plots] + # Add each feature class's plot with the pre-determined height. + for i in range(len(feature_sets_to_plot)): + record = GraphicRecord(sequence_length=len(contig), features=feature_sets_to_plot[i]) + if i == len(feature_sets_to_plot) - 1: + with_ruler = True + else: + with_ruler = False + plot, _ = record.plot(ax=plots[i], with_ruler=with_ruler, figure_width=FIGURE_WIDTH) + ymin, ymax = plot.figure.axes[0].get_ylim() + if i == 0: + plot.text(x=0, y=ymax, s=contig.id) + figure.tight_layout() + ofh.write("Saving PNG plot file: %s\n" % str(contig_plot_png)) + figure.savefig(contig_plot_png) + feature_plots[contig.id] = contig_plot_png + ofh.close() + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + + parser.add_argument('--feature_hits_dir', action='store', dest='feature_hits_dir', help='Directory of tabular files containing feature hits') + parser.add_argument('--contigs', action='store', dest='contigs', help='Fasta file of contigs') + parser.add_argument('--output_dir', action='store', dest='output_dir', help='Output directory') + + args = parser.parse_args() + + # Get thge collection of feature hits files. The collection + # will be sorted alphabetically and will contain 2 files + # named something like AMR_CDS_311_2022_12_20.fasta and + # Incompatibility_Groups_2023_01_01.fasta. + feature_hits_files = [] + for file_name in sorted(os.listdir(args.feature_hits_dir)): + file_path = os.path.abspath(os.path.join(args.feature_hits_dir, file_name)) + feature_hits_files.append(file_path) + + draw_features(feature_hits_files, args.contigs, args.output_dir)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/draw_features.xml Wed Jan 25 20:38:21 2023 +0000 @@ -0,0 +1,74 @@ +<tool id="draw_features" name="PIMA: draw features" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@"> + <description>of annotated assemblies</description> + <macros> + <import>macros.xml</import> + </macros> + <expand macro="requirements"/> + <command detect_errors="exit_code"><![CDATA[ +#import re + +#if $contigs.is_of_type('fasta.gz'): + gunzip -c '$contigs' > 'contigs.fasta' && +#else: + ln -s '$contigs' 'contigs.fasta' && +#end if + +mkdir feature_hits_dir && +mkdir output_dir && + +#for $i in $feature_hits: + #set file_name = $i.file_name + #set identifier = re.sub('[^\s\w\-\\.]', '_', str($i.element_identifier)) + ln -s '$file_name' 'feature_hits_dir/$identifier' && +#end for + +python '$__tool_directory__/draw_features.py' +--contigs 'contigs.fasta' +--feature_hits_dir 'feature_hits_dir' +--output_dir 'output_dir' +#if str($output_process_log) == 'yes': + && mv 'process_log' '$process_log' +#end if +]]></command> + <inputs> + <param name="contigs" type="data" format="fasta,fasta.gz" label="Fasta file of assembled contigs"/> + <param name="feature_hits" format="bed" type="data_collection" collection_type="list" label="Collection of feature hits BED files"/> + <param name="output_process_log" type="select" display="radio" label="Output process log file?"> + <option value="no" selected="true">No</option> + <option value="yes">Yes</option> + </param> + </inputs> + <outputs> + <data name="process_log" format="txt" label="${tool.name} on ${on_string} (Process log)"> + <filter>output_process_log == 'yes'</filter> + </data> + <collection name="features_png" type="list" format="png"> + <discover_datasets pattern="(?P<designation>.+)\.(?P<ext>png)" directory="output_dir"/> + </collection> + </outputs> + <tests> + <test> + <param name="contigs" value="PS01519_contigs.fasta.gz" ftype="fasta.gz"/> + <param name="feature_hits"> + <collection type="list"> + <element name="amr_cds.bed" value="amr_cds.bed"/> + </collection> + </param> + <output_collection name="features_png" type="list" count="1"> + <element name="contig_1" ftype="png"> + <assert_contents> + <has_size value="25383" delta="100"/> + </assert_contents> + </element> + </output_collection> + </test> + </tests> + <help> +**What it does** + +Accepts a FASTA file of assembled contigs and a collection of 2 BED files containing the feature hits and plots +the features. + </help> + <expand macro="citations"/> +</tool> +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/macros.xml Wed Jan 25 20:38:21 2023 +0000 @@ -0,0 +1,19 @@ +<macros> + <token name="@TOOL_VERSION@">1.0.0</token> + <token name="@VERSION_SUFFIX@">0</token> + <token name="@PROFILE@">21.01</token> + <xml name="requirements"> + <requirements> + <requirement type="package" version="1.79">biopython</requirement> + <requirement type="package" version="3.1.2">dna_features_viewer</requirement> + <requirement type="package" version="3.6.2">matplotlib</requirement> + <requirement type="package" version="1.5.2">pandas</requirement> + </requirements> + </xml> + <xml name="citations"> + <citations> + <citation type="doi">10.1101/011650</citation> + </citations> + </xml> +</macros> +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/amr_cds.bed Wed Jan 25 20:38:21 2023 +0000 @@ -0,0 +1,16 @@ +contig_1 484392 485921 62550724|AAX88798.1|AY818309.1|1|1|alo|alo|anthrolysin_O/cereolysin_O_family_cholesterol-dependent_cytolysin_Alo 0.984 + +contig_1 4673373 4675772 29894390|AAP07680.1|AE016877.1|1|1|inhA2|inhA2|M6_family_metalloprotease_immune_inhibitor_InhA2 1.000 - +contig_1 4253739 4254749 29894808|AAP08097.1|AE016877.1|1|1|cytK2|cytK2|beta-channel_forming_cytolysin_CytK2 1.000 + +contig_1 232577 233434 5305719|AAD41788.1|AF132087.1|1|1|plcR|plcR|transcriptional_regulator_PlcR 1.000 + +contig_1 4667636 4668636 505581118|AGL98063.1|KC527544.1|1|1|cerB|cerB|sphingomyelinase_CerB 0.983 - +contig_1 4095082 4097472 296323016|ADH05944.1|CP001903.1|1|1|inhA1|inhA1|M6_family_metalloprotease_immune_inhibitor_InhA1 0.998 - +contig_1 2421494 2422813 29896770|AAP10048.1|AE016877.1|1|1|hblC|hblC|hemolytic_enterotoxin_HBL_lytic_component_L2 0.977 + +contig_1 3520286 3521566 29895642|AAP08924.1|AE016877.1|1|1|entFM|entFM|enterotoxin_EntFM 0.988 - +contig_1 2423992 2425119 29896768|AAP10046.1|AE016877.1|1|1|hblB|hblB|hemolytic_enterotoxin_HBL_binding_subunit_HblB 0.996 + +contig_1 1750170 1751159 29897410|AAP10686.1|AE016877.1|1|1|plcA|plcA|phosphatidylinositol_diacylglycerol-lyase 1.000 + +contig_1 3448511 3448927 446866507|WP_000943763.1|NG_050591.1|1|1|fosBx1|fosB_gen|fosfomycin_resistance_bacillithiol_transferase_FosBx1 0.998 - +contig_1 1994015 1995253 29897180|AAP10457.1|AE016877.1|1|1|hlyII|hlyII|hemolysin_II_HlyII 1.000 + +contig_1 2968066 2968986 1028083092|WP_063842248.1|NG_047482.1|1|1|bla1|bla1|class_A_beta-lactamase_Bla1 0.976 - +contig_1 2079369 2080142 1028078998|WP_063839879.1|NG_056058.1|1|1|BcII|BcII|BcII_family_subclass_B1_metallo-beta-lactamase 0.963 + +contig_1 3591341 3592420 29895498|AAP08785.1|AE016877.1|1|1|nheC|nheC|non-hemolytic_enterotoxin_NHE_subunit_C 0.999 - +contig_1 2425495 2426895 6686906|CAB64770.1|AJ007794.1|1|1|hblA|hblA|hemolytic_enterotoxin_HBL_binding_subunit_HblA 0.988 +