changeset 0:cacb90cde53e draft

First version of b2btools for single sequences in Galaxy
author adrian.diaz
date Wed, 06 Jul 2022 11:01:15 +0000
parents
children 891ccfd22633
files singleSeq/b2btools_single_sequence.xml singleSeq/script.py
diffstat 2 files changed, 273 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/singleSeq/b2btools_single_sequence.xml	Wed Jul 06 11:01:15 2022 +0000
@@ -0,0 +1,113 @@
+<tool
+    id="b2btools_single_sequence"
+    name="bio2Byte: biophysical predictors (b2bTools) for single sequences"
+    version="3.0.4+galaxy0"
+    license="GPL-3.0"
+    python_template_version="3.5"
+    profile="21.05">
+    <description>predicts protein biophysical properties from their amino-acid sequences</description>
+    <requirements>
+        <requirement type="package" version="3.0.4">b2btools</requirement>
+    </requirements>
+    <xrefs>
+        <xref type="bio.tools">b2btools</xref>
+    </xrefs>
+    <command detect_errors="exit_code"><![CDATA[
+        mkdir tabular;
+        mkdir plots;
+        python '$__tool_directory__/script.py' --file '$section_input_file.input'
+                                               $section_predictors.dynamine
+                                               $section_predictors.disomine
+                                               $section_predictors.efoldmine
+                                               $section_predictors.agmata
+                                               $section_plot.highlight
+                                               $section_plot.plot
+                                               $section_plot.plot_all
+                                               --output ./tabular
+                                               --plot-output ./plots
+                                               --json '$predictions_output'
+    ]]></command>
+    <inputs>
+        <section name="section_input_file" title="Input file" help="Configure this section to plug a valid input in FASTA format">
+            <param type="data" name="input" format="fasta" label="Protein sequences in FASTA format" help="FASTA file containing up to 10 valid protein sequences"/>
+        </section>
+        <section name="section_predictors" title="Biophyisical predictors" help="Configure this section to select the predictions to be executed">
+            <param
+                name="dynamine"
+                type="boolean"
+                label="DynaMine: Dynamics"
+                truevalue="--dynamine"
+                falsevalue=""
+                help="Fast predictor of protein backbone dynamics using only sequence information as input. The version here also predicts side-chain dynamics and secondary structure predictors using the same principle." />
+            <param
+                name="disomine"
+                type="boolean"
+                label="DisoMine: Disorder"
+                truevalue="--disomine"
+                falsevalue=""
+                help="Predicts protein disorder with recurrent neural networks not directly from the amino acid sequence, but instead from more generic predictions of key biophysical properties, here protein dynamics, secondary structure and early folding."/>
+            <param
+                name="efoldmine"
+                type="boolean"
+                label="EFoldMine: Early folding"
+                truevalue="--efoldmine"
+                falsevalue=""
+                help="Predicts from the primary amino acid sequence of a protein, which amino acids are likely involved in early folding events."/>
+            <param
+                name="agmata"
+                type="boolean"
+                label="Agmata: Beta aggregation"
+                truevalue="--agmata"
+                falsevalue=""
+                help="Single-sequence based predictor of protein regions that are likely to cause beta-aggregation."/>
+        </section>
+        <section name="section_plot" title="Plot options" help="Configure plot output">
+            <param name="plot" type="boolean" label="Plot prediction values" truevalue="--plot" falsevalue=""/>
+            <param name="plot_all" type="boolean" label="Plot all sequences together" truevalue="--plot_all" falsevalue="" help="This tool can plot all sequences together in order to compare predicted values of different sequences"/>
+            <param name="highlight" type="boolean" label="Highlight regions of interest" truevalue="--highlight" falsevalue="" help="Highlight biophysical regions on the background of the plots"/>
+        </section>
+    </inputs>
+    <outputs>
+        <data name="predictions_output" format="json" />
+        <collection name="split_output" type="list" label="Tabular predictions by sequence">
+            <discover_datasets pattern="__name_and_ext__" format="tabular" directory="tabular" visible="true" />
+        </collection>
+        <collection name="split_output_plots" type="list" label="Plots">
+            <discover_datasets pattern="__name_and_ext__" format="png" directory="plots" visible="true" />
+        </collection>
+    </outputs>
+    <tests>
+        <test>
+            <param name="input" value="example.fasta" />
+            <param name="dynamine" value="true"/>
+            <param name="disomine" value="true"/>
+            <param name="efoldmine" value="true"/>
+            <param name="agmata" value="true"/>
+            <output name="predictions_output" file="test_output.json" ftype="json" />
+        </test>
+    </tests>
+    <help><![CDATA[
+        Bio2byte tools (b2btools) offer the following single protein sequence based predictions:
+
+        - Backbone and sidechain dynamics (DynaMine) - Helix, sheet, coil and polyproline-II propensity
+        - Early folding propensity (EFoldMine)
+        - Disorder (DisoMine)
+        - Beta-sheet aggregation (Agmata)
+
+        This tool is available on the Python Package Index (PyPI): https://pypi.org/project/b2bTools/
+    ]]>
+    </help>
+    <creator>
+        <organization name="bio2Byte" url="https://bio2byte.be" email=""/>
+        <organization name="Vrije Universiteit Brussel" url="https://vub.be" alternateName="VUB"/>
+        <person honorificPrefix="Prof." givenName="Wim" familyName="Vranken" email="Wim.Vranken@vub.be" identifier="http://orcid.org/0000-0001-7470-4324" />
+        <person givenName="Jose" familyName="Gavalda-Garcia" email="Jose.Gavalda.Garcia@vub.be" identifier="http://orcid.org/0000-0001-6431-3442" />
+        <person givenName="Adrian" familyName="Diaz" email="Adrian.Diaz@vub.be" identifier="http://orcid.org/0000-0003-0165-1318" />
+    </creator>
+    <citations>
+        <citation type="doi">10.1038/ncomms3741</citation>
+        <citation type="doi">10.1101&#47;2020.05.25.115253</citation>
+        <citation type="doi">10.1038&#47;s41598-017-08366-3</citation>
+        <citation type="doi">10.1093/bioinformatics/btz912</citation>
+    </citations>
+</tool>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/singleSeq/script.py	Wed Jul 06 11:01:15 2022 +0000
@@ -0,0 +1,160 @@
+import optparse
+import os.path
+import unicodedata
+import re
+import pandas as pd
+from b2bTools import SingleSeq
+import matplotlib.pyplot as plt
+
+
+def slugify(value):
+    """
+    Taken from https://github.com/django/django/blob/master/django/utils/text.py
+    Convert to ASCII if 'allow_unicode'. Convert spaces or repeated
+    dashes to single dashes. Remove characters that aren't alphanumerics,
+    underscores, or hyphens. Convert to lowercase. Also strip leading and
+    trailing whitespace, dashes, and underscores.
+    """
+    value = str(value)
+    value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
+    value = re.sub(r'[^\w\s-]', '', value.lower())
+    return re.sub(r'[-\s]+', '-', value).strip('-_')
+
+
+def plot_prediction(prediction_name, highlighting_regions, predicted_values, seq_name):
+    thresholds_dict = {'backbone': {'membrane spanning': [1., 1.5],
+                                    'rigid': [0.8, 1.],
+                                    'context-dependent': [0.69, 0.8],
+                                    'flexible': [-1.0, 0.69]},
+                       'earlyFolding': {'early folds': [0.169, 2.], 'late folds': [-1., 0.169]},
+                       'disoMine': {'ordered': [-1., 0.5], 'disordered': [0.5, 2.]},
+                       }
+    ordered_regions_dict = {'backbone': ['flexible', 'context-dependent', 'rigid', 'membrane spanning'],
+                            'earlyFolding': ['late folds', 'early folds'],
+                            'disoMine': ['ordered', 'disordered'],
+                            }
+    colors = ['yellow', 'orange', 'pink', 'red']
+    ranges_dict = {
+        'backbone': [-0.2, 1.2],
+        'sidechain': [-0.2, 1.2],
+        'ppII': [-0.2, 1.2],
+        'earlyFolding': [-0.2, 1.2],
+        'disoMine': [-0.2, 1.2],
+        'agmata': [-0.2, 1.2],
+        'helix': [-1., 1.],
+        'sheet': [-1., 1.],
+        'coil': [-1., 1.],
+    }
+    fig, ax = plt.subplots(1, 1)
+    fig.set_figwidth(10)
+    fig.set_figheight(5)
+    ax.set_title(prediction_name + ' ' + 'prediction')
+    plt.tight_layout(rect=[0, 0, 0.75, 1])
+    if seq_name == 'all':
+        max_len = 0
+        for seq in predicted_values.keys():
+            predictions = predicted_values[seq]
+            ax.plot(range(len(predictions)), predictions, label=seq)
+            if len(predictions)>max_len:
+                max_len = len(predictions)
+            ax.set_xlim([0, max_len - 1])
+    else:
+        predictions = predicted_values
+        ax.plot(range(len(predictions)), predictions, label=seq_name)
+        ax.set_xlim([0, len(predictions) - 1])
+    legend_lines = plt.legend(bbox_to_anchor=(1.04,1), loc="upper left", fancybox=True, shadow=True)
+    ax.add_artist(legend_lines)
+    # Define regions
+    if highlighting_regions:
+        if prediction_name in ordered_regions_dict.keys():
+            for i, prediction in enumerate(ordered_regions_dict[prediction_name]):
+                lower = thresholds_dict[prediction_name][prediction][0]
+                upper = thresholds_dict[prediction_name][prediction][1]
+                color = colors[i]
+                ax.axhspan(lower, upper, alpha=0.3, color=color, label=prediction)
+            included_in_regions_legend = list(reversed(
+                [prediction for prediction in ordered_regions_dict[prediction_name]]))  # to sort it "from up to low"
+            # Get handles and labels
+            handles, labels = plt.gca().get_legend_handles_labels()
+            handles_dict = {label: handles[idx] for idx, label in enumerate(labels)}
+            # Add legend for regions, if available
+            region_legend = ax.legend([handles_dict[region] for region in included_in_regions_legend],
+                                      [region for region in included_in_regions_legend], fancybox=True, shadow=True,
+                                      loc='lower left', bbox_to_anchor=(1.04,0))
+            ax.add_artist(region_legend)
+    ax.set_ylim(ranges_dict[prediction_name])
+    ax.set_xlabel('residue index')
+    ax.set_ylabel('prediction values')
+    ax.grid(axis='y')
+    plt.savefig(os.path.join(options.plot_output, "{0}_{1}.png".format(slugify(seq_name), prediction_name)), bbox_inches="tight")
+    plt.close()
+
+def df_dict_to_dict_of_values(df_dict, predictor):
+    results_dict = {}
+    for seq in df_dict.keys():
+        df = pd.read_csv(df_dict[seq], sep='\t')
+        results_dict[seq] = df[predictor]
+    return results_dict
+
+def main(options):
+    single_seq = SingleSeq(options.input_fasta)
+    b2b_tools = []
+    if options.dynamine:
+        b2b_tools.append('dynamine')
+    if options.disomine:
+        b2b_tools.append('disomine')
+    if options.efoldmine:
+        b2b_tools.append('efoldmine')
+    if options.agmata:
+        b2b_tools.append('agmata')
+
+    single_seq.predict(b2b_tools)
+    predictions = single_seq.get_all_predictions()
+    results_json = single_seq.get_all_predictions_json('all')
+    with open(options.json_output, 'w') as f:
+        f.write(results_json)
+    first_sequence_key = next(iter(predictions))
+    prediction_keys = predictions[first_sequence_key].keys()
+    df_dictionary = {}
+    for sequence_key, sequence_predictions in predictions.items():
+        residues = sequence_predictions['seq']
+        residues_count = len(residues)
+        sequence_df = pd.DataFrame(columns=prediction_keys, index=range(residues_count))
+        sequence_df.index.name = 'residue_index'
+        for predictor in prediction_keys:
+            sequence_df[predictor] = sequence_predictions[predictor]
+        sequence_df = sequence_df.rename(columns={"seq": "residue"})
+        sequence_df = sequence_df.round(decimals=2)
+        filename = f'{options.output}/{slugify(sequence_key)}.tsv'
+        df_dictionary[sequence_key] = filename
+        sequence_df.to_csv(filename, sep="\t")
+        # Plot each individual plot (compatible with plot all)
+        if options.plot:
+            for predictor in prediction_keys:
+                if predictor != 'seq':
+                    plot_prediction(prediction_name=predictor, highlighting_regions=True,
+                                    predicted_values=sequence_predictions[predictor], seq_name=sequence_key)
+    # Plot all together (compatible with plot individual)
+    if options.plot_all:
+        for predictor in prediction_keys:
+            if predictor != 'seq':
+                results_dictionary = df_dict_to_dict_of_values(df_dict=df_dictionary, predictor=predictor)
+                plot_prediction(prediction_name=predictor, highlighting_regions=True,
+                                predicted_values=results_dictionary, seq_name='all')
+
+if __name__ == "__main__":
+    parser = optparse.OptionParser()
+    parser.add_option("--dynamine", action="store_true", default=False)
+    parser.add_option("--disomine", action="store_true", default=False)
+    parser.add_option("--efoldmine", action="store_true", default=False)
+    parser.add_option("--agmata", action="store_true", default=False)
+    parser.add_option("--file", dest="input_fasta", default=False)
+    parser.add_option("--output", dest="output", default=False)
+    parser.add_option("--plot-output", dest="plot_output", default=False)
+
+    parser.add_option("--json", dest="json_output", default=False)
+    parser.add_option("--plot", action="store_true", default=False)
+    parser.add_option("--plot_all", action="store_true", default=False)
+    parser.add_option("--highlight", action="store_true", default=False)
+    options, _args = parser.parse_args()
+    main(options)
\ No newline at end of file