Mercurial > repos > adrian.diaz > b2btools_single_sequence
changeset 2:a9db23ac113f draft default tip
Uploaded new version formatted.
author | adrian.diaz |
---|---|
date | Tue, 02 Aug 2022 09:44:33 +0000 |
parents | 891ccfd22633 |
children | |
files | b2btools_single_sequence.xml b2btools_single_sequence/b2btools_single_sequence.xml b2btools_single_sequence/script.py script.py test-data/input.fasta test-data/test_output.json |
diffstat | 6 files changed, 1177 insertions(+), 289 deletions(-) [+] |
line wrap: on
line diff
<!-- File: b2btools_single_sequence.xml
     Galaxy tool wrapper that runs script.py (shipped next to this file) on a
     protein FASTA file and collects a JSON file, per-sequence tables and plots. -->
<tool
    id="b2btools_single_sequence"
    name="b2bTools: Biophysical predictors for single sequences"
    version="3.0.4+galaxy0"
    license="GPL-3.0"
    python_template_version="3.5"
    profile="21.05">
    <description>from their amino-acid sequences</description>
    <xrefs>
        <xref type="bio.tools">b2btools</xref>
    </xrefs>
    <requirements>
        <requirement type="package" version="3.0.4">b2btools</requirement>
    </requirements>
    <!-- The two working directories are the ones scanned by the
         discover_datasets elements below. Steps are chained with the shell
         AND operator so that a failing mkdir stops the job instead of being
         masked by the exit code of the next command. Each boolean parameter
         expands either to its command-line flag or to an empty string. -->
    <command detect_errors="exit_code"><![CDATA[
        mkdir -p tabular plots &&
        python '$__tool_directory__/script.py' --file '$section_input_file.input'
        $section_predictors.dynamine
        $section_predictors.disomine
        $section_predictors.efoldmine
        $section_predictors.agmata
        $section_plot.highlight
        $section_plot.plot
        $section_plot.plot_all
        --output ./tabular
        --plot-output ./plots
        --json '$predictions_output'
    ]]></command>
    <inputs>
        <section name="section_input_file" title="Input file" help="Configure this section to plug a valid input in FASTA format">
            <param type="data" name="input" format="fasta" label="Protein sequences in FASTA format" help="FASTA file of protein sequences (up to 10 sequences if AgMata is selected)"/>
        </section>
        <section name="section_predictors" title="Biophysical predictors" help="Configure this section to select the predictions to be executed">
            <param
                name="dynamine"
                type="boolean"
                label="DynaMine: Dynamics"
                truevalue="--dynamine"
                falsevalue=""
                help="Fast predictor of protein backbone dynamics using only sequence information as input. The version here also predicts side-chain dynamics and secondary structure predictors using the same principle."/>
            <param
                name="disomine"
                type="boolean"
                label="DisoMine: Disorder"
                truevalue="--disomine"
                falsevalue=""
                help="Predicts protein disorder with recurrent neural networks not directly from the amino acid sequence, but instead from more generic predictions of key biophysical properties, here protein dynamics, secondary structure and early folding."/>
            <param
                name="efoldmine"
                type="boolean"
                label="EFoldMine: Early folding"
                truevalue="--efoldmine"
                falsevalue=""
                help="Predicts from the primary amino acid sequence of a protein, which amino acids are likely involved in early folding events."/>
            <param
                name="agmata"
                type="boolean"
                label="Agmata: Beta aggregation"
                truevalue="--agmata"
                falsevalue=""
                help="(Max. 10 sequences) Single-sequence based predictor of protein regions that are likely to cause beta-aggregation."/>
        </section>
        <section name="section_plot" title="Plot options" help="Configure plot output">
            <param name="plot" type="boolean" label="Plot predicted values by sequence" truevalue="--plot" falsevalue="" help="This option plots predicted values in different files"/>
            <param name="plot_all" type="boolean" label="Plot all sequences together" truevalue="--plot_all" falsevalue="" help="This option plots all sequences together in order to compare predicted values of different sequences"/>
            <param name="highlight" type="boolean" label="Highlight regions of interest" truevalue="--highlight" falsevalue="" help="Highlight biophysical regions on the background of the plots"/>
        </section>
    </inputs>
    <outputs>
        <data name="predictions_output" label="Predictions in JSON format" format="json" />
        <!-- One element per file that the script wrote into ./tabular -->
        <collection name="split_output" type="list" label="Tabular predictions by sequence">
            <discover_datasets pattern="__name_and_ext__" format="tabular" directory="tabular" visible="true" />
        </collection>
        <!-- One element per file that the script wrote into ./plots -->
        <collection name="split_output_plots" type="list" label="Plots">
            <discover_datasets pattern="__name_and_ext__" format="png" directory="plots" visible="true" />
        </collection>
    </outputs>
    <tests>
        <test>
            <param name="input" value="input.fasta" ftype="fasta"/>
            <param name="dynamine" value="true"/>
            <param name="disomine" value="false"/>
            <param name="efoldmine" value="false"/>
            <param name="agmata" value="false"/>
            <param name="plot" value="true"/>
            <param name="plot_all" value="true"/>
            <param name="highlight" value="true"/>
            <assert_command>
                <has_text text="--dynamine" />
                <!-- Unselected predictors must expand to nothing -->
                <not_has_text text="--disomine" />
                <not_has_text text="--efoldmine" />
                <not_has_text text="--agmata" />
                <has_text text="--json" />
                <!-- A plain substring check for the per-sequence plot flag would
                     also be satisfied by the plot_all and plot-output options,
                     so require whitespace right after the flag. -->
                <has_text_matching expression="--plot\s" />
                <has_text text="--plot_all" />
                <has_text text="--highlight" />
            </assert_command>
            <!-- The JSON embeds a creation_date timestamp, so it cannot be
                 compared byte for byte with test-data/test_output.json; check
                 for stable content instead. -->
            <output name="predictions_output" ftype="json">
                <assert_contents>
                    <has_text text="random_sequence_1_consisting_of_10_residues" />
                    <has_text text="backbone" />
                </assert_contents>
            </output>
        </test>
    </tests>
    <help><![CDATA[
        Bio2byte tools (b2btools) offer the following single protein sequence based predictions:

        - Backbone and sidechain dynamics (DynaMine) - Helix, sheet, coil and polyproline-II propensity
        - Early folding propensity (EFoldMine)
        - Disorder (DisoMine)
        - Beta-sheet aggregation (Agmata)

        This tool is available on the Python Package Index (PyPI): https://pypi.org/project/b2bTools/
    ]]>
    </help>
    <creator>
        <organization name="bio2Byte" url="https://bio2byte.be" email=""/>
        <organization name="Vrije Universiteit Brussel" url="https://vub.be" alternateName="VUB"/>
        <person honorificPrefix="Prof." givenName="Wim" familyName="Vranken" email="Wim.Vranken@vub.be" identifier="http://orcid.org/0000-0001-7470-4324" />
        <person givenName="Jose" familyName="Gavalda-Garcia" email="Jose.Gavalda.Garcia@vub.be" identifier="http://orcid.org/0000-0001-6431-3442" />
        <person givenName="Adrian" familyName="Diaz" email="Adrian.Diaz@vub.be" identifier="http://orcid.org/0000-0003-0165-1318" />
    </creator>
    <citations>
        <citation type="doi">10.1038/ncomms3741</citation>
        <citation type="doi">10.1101/2020.05.25.115253</citation>
        <citation type="doi">10.1038/s41598-017-08366-3</citation>
        <citation type="doi">10.1093/bioinformatics/btz912</citation>
    </citations>
</tool>
--- a/b2btools_single_sequence/b2btools_single_sequence.xml Tue Aug 02 08:54:19 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,113 +0,0 @@ -<tool - id="b2btools_single_sequence" - name="bio2Byte: biophysical predictors (b2bTools) for single sequences" - version="3.0.4+galaxy0" - license="GPL-3.0" - python_template_version="3.5" - profile="21.05"> - <description>predicts protein biophysical properties from their amino-acid sequences</description> - <requirements> - <requirement type="package" version="3.0.4">b2btools</requirement> - </requirements> - <xrefs> - <xref type="bio.tools">b2btools</xref> - </xrefs> - <command detect_errors="exit_code"><![CDATA[ - mkdir tabular; - mkdir plots; - python '$__tool_directory__/script.py' --file '$section_input_file.input' - $section_predictors.dynamine - $section_predictors.disomine - $section_predictors.efoldmine - $section_predictors.agmata - $section_plot.highlight - $section_plot.plot - $section_plot.plot_all - --output ./tabular - --plot-output ./plots - --json '$predictions_output' - ]]></command> - <inputs> - <section name="section_input_file" title="Input file" help="Configure this section to plug a valid input in FASTA format"> - <param type="data" name="input" format="fasta" label="Protein sequences in FASTA format" help="FASTA file containing up to 10 valid protein sequences"/> - </section> - <section name="section_predictors" title="Biophyisical predictors" help="Configure this section to select the predictions to be executed"> - <param - name="dynamine" - type="boolean" - label="DynaMine: Dynamics" - truevalue="--dynamine" - falsevalue="" - help="Fast predictor of protein backbone dynamics using only sequence information as input. The version here also predicts side-chain dynamics and secondary structure predictors using the same principle." 
/> - <param - name="disomine" - type="boolean" - label="DisoMine: Disorder" - truevalue="--disomine" - falsevalue="" - help="Predicts protein disorder with recurrent neural networks not directly from the amino acid sequence, but instead from more generic predictions of key biophysical properties, here protein dynamics, secondary structure and early folding."/> - <param - name="efoldmine" - type="boolean" - label="EFoldMine: Early folding" - truevalue="--efoldmine" - falsevalue="" - help="Predicts from the primary amino acid sequence of a protein, which amino acids are likely involved in early folding events."/> - <param - name="agmata" - type="boolean" - label="Agmata: Beta aggregation" - truevalue="--agmata" - falsevalue="" - help="Single-sequence based predictor of protein regions that are likely to cause beta-aggregation."/> - </section> - <section name="section_plot" title="Plot options" help="Configure plot output"> - <param name="plot" type="boolean" label="Plot prediction values" truevalue="--plot" falsevalue=""/> - <param name="plot_all" type="boolean" label="Plot all sequences together" truevalue="--plot_all" falsevalue="" help="This tool can plot all sequences together in order to compare predicted values of different sequences"/> - <param name="highlight" type="boolean" label="Highlight regions of interest" truevalue="--highlight" falsevalue="" help="Highlight biophysical regions on the background of the plots"/> - </section> - </inputs> - <outputs> - <data name="predictions_output" format="json" /> - <collection name="split_output" type="list" label="Tabular predictions by sequence"> - <discover_datasets pattern="__name_and_ext__" format="tabular" directory="tabular" visible="true" /> - </collection> - <collection name="split_output_plots" type="list" label="Plots"> - <discover_datasets pattern="__name_and_ext__" format="png" directory="plots" visible="true" /> - </collection> - </outputs> - <tests> - <test> - <param name="input" value="example.fasta" 
/> - <param name="dynamine" value="true"/> - <param name="disomine" value="true"/> - <param name="efoldmine" value="true"/> - <param name="agmata" value="true"/> - <output name="predictions_output" file="test_output.json" ftype="json" /> - </test> - </tests> - <help><![CDATA[ - Bio2byte tools (b2btools) offer the following single protein sequence based predictions: - - - Backbone and sidechain dynamics (DynaMine) - Helix, sheet, coil and polyproline-II propensity - - Early folding propensity (EFoldMine) - - Disorder (DisoMine) - - Beta-sheet aggregation (Agmata) - - This tool is available on the Python Package Index (PyPI): https://pypi.org/project/b2bTools/ - ]]> - </help> - <creator> - <organization name="bio2Byte" url="https://bio2byte.be" email=""/> - <organization name="Vrije Universiteit Brussel" url="https://vub.be" alternateName="VUB"/> - <person honorificPrefix="Prof." givenName="Wim" familyName="Vranken" email="Wim.Vranken@vub.be" identifier="http://orcid.org/0000-0001-7470-4324" /> - <person givenName="Jose" familyName="Gavalda-Garcia" email="Jose.Gavalda.Garcia@vub.be" identifier="http://orcid.org/0000-0001-6431-3442" /> - <person givenName="Adrian" familyName="Diaz" email="Adrian.Diaz@vub.be" identifier="http://orcid.org/0000-0003-0165-1318" /> - </creator> - <citations> - <citation type="doi">10.1038/ncomms3741</citation> - <citation type="doi">10.1101/2020.05.25.115253</citation> - <citation type="doi">10.1038/s41598-017-08366-3</citation> - <citation type="doi">10.1093/bioinformatics/btz912</citation> - </citations> -</tool> \ No newline at end of file
--- a/b2btools_single_sequence/script.py Tue Aug 02 08:54:19 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,176 +0,0 @@ -import optparse -import os.path -import unicodedata -import re -import numpy as np -import pandas as pd -from b2bTools import SingleSeq -import matplotlib.pyplot as plt - - -def slugify(value): - """ - Taken from https://github.com/django/django/blob/master/django/utils/text.py - Convert to ASCII if 'allow_unicode'. Convert spaces or repeated - dashes to single dashes. Remove characters that aren't alphanumerics, - underscores, or hyphens. Convert to lowercase. Also strip leading and - trailing whitespace, dashes, and underscores. - """ - value = str(value) - value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii') - value = re.sub(r'[^\w\s-]', '', value.lower()) - return re.sub(r'[-\s]+', '-', value).strip('-_') - - -def check_min_max(predicted_values, former_min, former_max): - seq_max = max(predicted_values) - seq_min = min(predicted_values) - if seq_max + 0.1 > former_max and not np.isnan(seq_max) and not np.isinf(seq_max): - former_max = seq_max + 0.1 - if seq_min - 0.1 < former_min and not np.isnan(seq_min) and not np.isinf(seq_min): - former_min = seq_min - 0.1 - return former_min, former_max - - -def plot_prediction(prediction_name, highlighting_regions, predicted_values, seq_name): - thresholds_dict = {'backbone': {'membrane spanning': [1., 1.5], - 'rigid': [0.8, 1.], - 'context-dependent': [0.69, 0.8], - 'flexible': [-1.0, 0.69]}, - 'earlyFolding': {'early folds': [0.169, 2.], 'late folds': [-1., 0.169]}, - 'disoMine': {'ordered': [-1., 0.5], 'disordered': [0.5, 2.]}, - } - ordered_regions_dict = {'backbone': ['flexible', 'context-dependent', 'rigid', 'membrane spanning'], - 'earlyFolding': ['late folds', 'early folds'], - 'disoMine': ['ordered', 'disordered'], - } - colors = ['yellow', 'orange', 'pink', 'red'] - ranges_dict = { - 'backbone': [-0.2, 1.2], - 'sidechain': [-0.2, 1.2], - 'ppII': 
[-0.2, 1.2], - 'earlyFolding': [-0.2, 1.2], - 'disoMine': [-0.2, 1.2], - 'agmata': [-0.2, 1.2], - 'helix': [-1., 1.], - 'sheet': [-1., 1.], - 'coil': [-1., 1.], - } - fig, ax = plt.subplots(1, 1) - fig.set_figwidth(10) - fig.set_figheight(5) - ax.set_title(prediction_name + ' ' + 'prediction') - min_value, max_value = ranges_dict[prediction_name] - if seq_name == 'all': - max_len = 0 - for seq in predicted_values.keys(): - predictions = predicted_values[seq] - min_value, max_value = check_min_max(predictions, min_value, max_value) - ax.plot(range(len(predictions)), predictions, label=seq) - if len(predictions) > max_len: - max_len = len(predictions) - ax.set_xlim([0, max_len - 1]) - else: - predictions = predicted_values - min_value, max_value = check_min_max(predictions, min_value, max_value) - ax.plot(range(len(predictions)), predictions, label=seq_name) - ax.set_xlim([0, len(predictions) - 1]) - legend_lines = plt.legend(bbox_to_anchor=(1.04, 1), loc="upper left", fancybox=True, shadow=True) - ax.add_artist(legend_lines) - # Define regions - if highlighting_regions: - if prediction_name in ordered_regions_dict.keys(): - for i, prediction in enumerate(ordered_regions_dict[prediction_name]): - lower = thresholds_dict[prediction_name][prediction][0] - upper = thresholds_dict[prediction_name][prediction][1] - color = colors[i] - ax.axhspan(lower, upper, alpha=0.3, color=color, label=prediction) - included_in_regions_legend = list(reversed( - [prediction for prediction in ordered_regions_dict[prediction_name]])) # to sort it "from up to low" - # Get handles and labels - handles, labels = plt.gca().get_legend_handles_labels() - handles_dict = {label: handles[idx] for idx, label in enumerate(labels)} - # Add legend for regions, if available - region_legend = ax.legend([handles_dict[region] for region in included_in_regions_legend], - [region for region in included_in_regions_legend], fancybox=True, shadow=True, - loc='lower left', bbox_to_anchor=(1.04, 0)) - 
ax.add_artist(region_legend) - ax.set_ylim([min_value, max_value]) - ax.set_xlabel('residue index') - ax.set_ylabel('prediction values') - ax.grid(axis='y') - plt.savefig(os.path.join(options.plot_output, "{0}_{1}.png".format(slugify(seq_name), prediction_name)), bbox_inches="tight") - plt.close() - - -def df_dict_to_dict_of_values(df_dict, predictor): - results_dict = {} - for seq in df_dict.keys(): - df = pd.read_csv(df_dict[seq], sep='\t') - results_dict[seq] = df[predictor] - return results_dict - - -def main(options): - single_seq = SingleSeq(options.input_fasta) - b2b_tools = [] - if options.dynamine: - b2b_tools.append('dynamine') - if options.disomine: - b2b_tools.append('disomine') - if options.efoldmine: - b2b_tools.append('efoldmine') - if options.agmata: - b2b_tools.append('agmata') - - single_seq.predict(b2b_tools) - predictions = single_seq.get_all_predictions() - results_json = single_seq.get_all_predictions_json('all') - with open(options.json_output, 'w') as f: - f.write(results_json) - first_sequence_key = next(iter(predictions)) - prediction_keys = predictions[first_sequence_key].keys() - df_dictionary = {} - for sequence_key, sequence_predictions in predictions.items(): - residues = sequence_predictions['seq'] - residues_count = len(residues) - sequence_df = pd.DataFrame(columns=prediction_keys, index=range(residues_count)) - sequence_df.index.name = 'residue_index' - for predictor in prediction_keys: - sequence_df[predictor] = sequence_predictions[predictor] - sequence_df = sequence_df.rename(columns={"seq": "residue"}) - sequence_df = sequence_df.round(decimals=2) - filename = f'{options.output}/{slugify(sequence_key)}.tsv' - df_dictionary[sequence_key] = filename - sequence_df.to_csv(filename, sep="\t") - # Plot each individual plot (compatible with plot all) - if options.plot: - for predictor in prediction_keys: - if predictor != 'seq': - plot_prediction(prediction_name=predictor, highlighting_regions=True, - 
predicted_values=sequence_predictions[predictor], seq_name=sequence_key) - # Plot all together (compatible with plot individual) - if options.plot_all: - for predictor in prediction_keys: - if predictor != 'seq': - results_dictionary = df_dict_to_dict_of_values(df_dict=df_dictionary, predictor=predictor) - plot_prediction(prediction_name=predictor, highlighting_regions=True, - predicted_values=results_dictionary, seq_name='all') - - -if __name__ == "__main__": - parser = optparse.OptionParser() - parser.add_option("--dynamine", action="store_true", default=False) - parser.add_option("--disomine", action="store_true", default=False) - parser.add_option("--efoldmine", action="store_true", default=False) - parser.add_option("--agmata", action="store_true", default=False) - parser.add_option("--file", dest="input_fasta", default=False) - parser.add_option("--output", dest="output", default=False) - parser.add_option("--plot-output", dest="plot_output", default=False) - - parser.add_option("--json", dest="json_output", default=False) - parser.add_option("--plot", action="store_true", default=False) - parser.add_option("--plot_all", action="store_true", default=False) - parser.add_option("--highlight", action="store_true", default=False) - options, _args = parser.parse_args() - main(options)
"""script.py - run the b2bTools single-sequence predictors on a FASTA file.

Outputs:
  * one JSON file holding every prediction (``--json``),
  * one tab-separated table per sequence inside ``--output``,
  * optionally PNG plots per sequence (``--plot``) and/or for all sequences
    together (``--plot_all``) inside ``--plot-output``.
"""
import optparse
import os.path
import re
import unicodedata

import numpy as np
import pandas as pd

# NOTE: b2bTools and matplotlib are imported inside the functions that need
# them, so the pure helpers below (slugify, check_min_max, ...) can be imported
# and tested without those packages being installed.

# Command-line flags that select a predictor; each flag name is also the tool
# name handed to SingleSeq.predict().
PREDICTOR_FLAGS = ('dynamine', 'disomine', 'efoldmine', 'agmata')

# [lower, upper] value band of every named region, per prediction.
THRESHOLDS = {
    'backbone': {'membrane spanning': [1., 1.5],
                 'rigid': [0.8, 1.],
                 'context-dependent': [0.69, 0.8],
                 'flexible': [-1.0, 0.69]},
    'earlyFolding': {'early folds': [0.169, 2.],
                     'late folds': [-1., 0.169]},
    'disoMine': {'ordered': [-1., 0.5],
                 'disordered': [0.5, 2.]},
}

# Region names from the lowest to the highest band; the position in the list
# selects the colour from REGION_COLORS.
ORDERED_REGIONS = {
    'backbone': ['flexible', 'context-dependent', 'rigid',
                 'membrane spanning'],
    'earlyFolding': ['late folds', 'early folds'],
    'disoMine': ['ordered', 'disordered'],
}
REGION_COLORS = ['yellow', 'orange', 'pink', 'red']

# Default y-axis limits per prediction; widened by check_min_max() when the
# data does not fit.
Y_RANGES = {
    'backbone': [-0.2, 1.2],
    'sidechain': [-0.2, 1.2],
    'ppII': [-0.2, 1.2],
    'earlyFolding': [-0.2, 1.2],
    'disoMine': [-0.2, 1.2],
    'agmata': [-0.2, 1.2],
    'helix': [-1., 1.],
    'sheet': [-1., 1.],
    'coil': [-1., 1.],
}


def slugify(value):
    """Turn ``value`` into a lowercase, ASCII-only, file-name friendly slug.

    Adapted from
    https://github.com/django/django/blob/master/django/utils/text.py
    Non-ASCII characters are dropped after NFKD normalisation, characters
    other than alphanumerics, underscores, hyphens and whitespace are
    removed, runs of whitespace/hyphens collapse to a single hyphen, and
    leading/trailing hyphens and underscores are stripped.
    """
    value = str(value)
    value = unicodedata.normalize(
        'NFKD', value).encode('ascii', 'ignore').decode('ascii')
    value = re.sub(r'[^\w\s-]', '', value.lower())
    return re.sub(r'[-\s]+', '-', value).strip('-_')


def check_min_max(predicted_values, former_min, former_max):
    """Widen ``[former_min, former_max]`` so the values fit with a 0.1 margin.

    NaN and infinite entries are ignored. (The builtin ``max``/``min`` used
    previously return NaN or not depending on where the NaN sits in the
    sequence, and raise on empty input.) When no finite value is present the
    limits are returned unchanged.

    Returns the tuple ``(new_min, new_max)``.
    """
    values = np.asarray(predicted_values, dtype=float)
    finite = values[np.isfinite(values)]
    if finite.size == 0:
        return former_min, former_max
    seq_max = finite.max()
    seq_min = finite.min()
    if seq_max + 0.1 > former_max:
        former_max = seq_max + 0.1
    if seq_min - 0.1 < former_min:
        former_min = seq_min - 0.1
    return former_min, former_max


def _load_pyplot():
    """Import and return ``matplotlib.pyplot`` using the Agg backend.

    The script only saves figures to PNG files and never shows a window,
    so a non-interactive backend is sufficient.
    """
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt
    return plt


def plot_prediction(prediction_name, highlighting_regions,
                    pred_vals, seq_name, plot_output=None):
    """Save one prediction as ``<slug(seq_name)>_<prediction_name>.png``.

    prediction_name      -- key of Y_RANGES (KeyError otherwise).
    highlighting_regions -- when true and regions are defined for this
                            prediction, shade them and add a second legend.
    pred_vals            -- sequence of values for one protein, or, when
                            ``seq_name == 'all'``, a mapping from sequence
                            name to such a sequence.
    seq_name             -- sequence name, or ``'all'`` for the combined plot.
    plot_output          -- destination directory. When omitted, falls back
                            to the module-level ``options.plot_output`` (the
                            historical behaviour), which exists only when
                            the file is run as a script.
    """
    if plot_output is None:
        plot_output = options.plot_output
    plt = _load_pyplot()

    fig, ax = plt.subplots(1, 1)
    fig.set_figwidth(10)
    fig.set_figheight(5)
    ax.set_title(prediction_name + ' ' + 'prediction')
    min_value, max_value = Y_RANGES[prediction_name]

    if seq_name == 'all':
        max_len = 0
        for seq, predictions in pred_vals.items():
            min_value, max_value = check_min_max(
                predictions, min_value, max_value)
            ax.plot(range(len(predictions)), predictions, label=seq)
            max_len = max(max_len, len(predictions))
        ax.set_xlim([0, max_len - 1])
    else:
        predictions = pred_vals
        min_value, max_value = check_min_max(
            predictions, min_value, max_value)
        ax.plot(range(len(predictions)), predictions, label=seq_name)
        ax.set_xlim([0, len(predictions) - 1])

    # Legend of the plotted lines. It is registered as a plain artist so it
    # stays on the figure when the region legend below replaces the axes'
    # own legend.
    legend_lines = ax.legend(bbox_to_anchor=(1.04, 1),
                             loc="upper left",
                             fancybox=True,
                             shadow=True)
    ax.add_artist(legend_lines)

    if highlighting_regions and prediction_name in ORDERED_REGIONS:
        regions = ORDERED_REGIONS[prediction_name]
        for i, region in enumerate(regions):
            lower, upper = THRESHOLDS[prediction_name][region]
            ax.axhspan(lower, upper, alpha=0.3,
                       color=REGION_COLORS[i], label=region)
        # List the regions from the highest band to the lowest one.
        legend_regions = list(reversed(regions))
        handles, labels = ax.get_legend_handles_labels()
        handles_dict = dict(zip(labels, handles))
        region_legend = ax.legend(
            [handles_dict[region] for region in legend_regions],
            legend_regions,
            fancybox=True,
            shadow=True,
            loc='lower left',
            bbox_to_anchor=(1.04, 0))
        ax.add_artist(region_legend)

    ax.set_ylim([min_value, max_value])
    ax.set_xlabel('residue index')
    ax.set_ylabel('prediction values')
    ax.grid(axis='y')
    plt.savefig(
        os.path.join(
            plot_output,
            "{0}_{1}.png".format(slugify(seq_name), prediction_name)),
        bbox_inches="tight")
    plt.close()


def df_dict_to_dict_of_values(df_dict, predictor):
    """Read column ``predictor`` back from every per-sequence TSV file.

    df_dict maps a sequence name to the path of its TSV file; the result
    maps the same names to the corresponding pandas Series.
    """
    results_dict = {}
    for seq, path in df_dict.items():
        df = pd.read_csv(path, sep='\t')
        results_dict[seq] = df[predictor]
    return results_dict


def main(options):
    """Run the selected predictors and write JSON, tables and plots.

    ``options`` is the object returned by the option parser below; it
    must provide input_fasta, json_output, output, plot_output and the
    boolean flags dynamine, disomine, efoldmine, agmata, plot, plot_all
    and highlight.
    """
    from b2bTools import SingleSeq

    single_seq = SingleSeq(options.input_fasta)
    b2b_tools = [tool for tool in PREDICTOR_FLAGS if getattr(options, tool)]

    single_seq.predict(b2b_tools)
    predictions = single_seq.get_all_predictions()
    results_json = single_seq.get_all_predictions_json('all')
    with open(options.json_output, 'w') as f:
        f.write(results_json)

    # The keys of the first sequence define the table columns for every
    # sequence (assumes all sequences carry the same keys).
    first_sequence_key = next(iter(predictions))
    prediction_keys = list(predictions[first_sequence_key].keys())
    plotted_keys = [key for key in prediction_keys if key != 'seq']

    df_dictionary = {}
    for sequence_key, sequence_predictions in predictions.items():
        residues_count = len(sequence_predictions['seq'])
        sequence_df = pd.DataFrame(
            {key: sequence_predictions[key] for key in prediction_keys},
            columns=prediction_keys,
            index=range(residues_count))
        sequence_df.index.name = 'residue_index'
        sequence_df = sequence_df.rename(columns={"seq": "residue"})
        sequence_df = sequence_df.round(decimals=2)
        filename = f'{options.output}/{slugify(sequence_key)}.tsv'
        df_dictionary[sequence_key] = filename
        sequence_df.to_csv(filename, sep="\t")

        # One plot per sequence and prediction (compatible with plot all).
        if options.plot:
            for predictor in plotted_keys:
                plot_prediction(prediction_name=predictor,
                                # Honour --highlight (it used to be ignored
                                # and regions were always drawn).
                                highlighting_regions=options.highlight,
                                pred_vals=sequence_predictions[predictor],
                                seq_name=sequence_key,
                                plot_output=options.plot_output)

    # One plot per prediction with all sequences overlaid (compatible with
    # the individual plots). Values are read back from the TSV files, i.e.
    # after rounding to two decimals.
    if options.plot_all:
        for predictor in plotted_keys:
            results_dictionary = df_dict_to_dict_of_values(
                df_dict=df_dictionary, predictor=predictor)
            plot_prediction(prediction_name=predictor,
                            highlighting_regions=options.highlight,
                            pred_vals=results_dictionary,
                            seq_name='all',
                            plot_output=options.plot_output)


if __name__ == "__main__":
    parser = optparse.OptionParser()
    parser.add_option("--dynamine", action="store_true", default=False)
    parser.add_option("--disomine", action="store_true", default=False)
    parser.add_option("--efoldmine", action="store_true", default=False)
    parser.add_option("--agmata", action="store_true", default=False)
    parser.add_option("--file", dest="input_fasta", default=False)
    parser.add_option("--output", dest="output", default=False)
    parser.add_option("--plot-output", dest="plot_output", default=False)

    parser.add_option("--json", dest="json_output", default=False)
    parser.add_option("--plot", action="store_true", default=False)
    parser.add_option("--plot_all", action="store_true", default=False)
    parser.add_option("--highlight", action="store_true", default=False)
    options, _args = parser.parse_args()

    # Fail early with a usage message instead of passing False on as a path.
    for flag, dest in (("--file", "input_fasta"),
                       ("--output", "output"),
                       ("--json", "json_output")):
        if not getattr(options, dest):
            parser.error(flag + " is required")
    if (options.plot or options.plot_all) and not options.plot_output:
        parser.error("--plot-output is required with --plot or --plot_all")

    main(options)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/input.fasta Tue Aug 02 09:44:33 2022 +0000 @@ -0,0 +1,29 @@ +>random_sequence_1_consisting_of_10_residues +MDRHDPVQKS + +>random_sequence_2_consisting_of_10_residues +SQRAMWSMWR + +>random_sequence_3_consisting_of_10_residues +YWCELTYWRV + +>random_sequence_4_consisting_of_10_residues +SWTHYELKAV + +>random_sequence_5_consisting_of_10_residues +NCPIEPEDQY + +>random_sequence_6_consisting_of_10_residues +YACLFQKPYI + +>random_sequence_7_consisting_of_10_residues +FVPGKQEPDS + +>random_sequence_8_consisting_of_10_residues +HHLCANKMDL + +>random_sequence_9_consisting_of_10_residues +GNKTPFMKMH + +>random_sequence_10_consisting_of_10_residues +PMSKMWQLDN
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_output.json Tue Aug 02 09:44:33 2022 +0000 @@ -0,0 +1,803 @@ +{ + "creation_date": "2022-07-06T13:54:39", + "id": "all", + "information": { + "Agmata": { + "info": "Generated by Agmata;See http://bio2byte.be", + "origin": "b2bTools.singleSeq.Agmata.Predictor", + "references": [ + "doi: 10.1093/bioinformatics/btz912 (2020)" + ], + "version": "1.0" + }, + "DisoMine": { + "info": "Generated by DisoMine;See http://bio2byte.be", + "origin": "python.b2bTools.singleSeq.DisoMine.Predictor", + "references": [ + "doi: 10.1101/2020.05.25.115253 (2020)" + ], + "version": "2.0" + }, + "DynaMine suite": { + "info": "Generated by the DynaMine suite;See http://bio2byte.be", + "origin": "python.b2bTools.singleSeq.DynaMine.Predictor", + "references": [ + "doi: 10.1038/ncomms3741 (2013)", + "doi: 10.1093/nar/gku270 (2014)" + ], + "version": "4.0" + }, + "EFoldMine": { + "info": "Generated by EFoldMine;Based on the DynaMine project;See http://bio2byte.be", + "origin": "python.b2bTools.singleSeq.EFoldMine.Predictor", + "references": [ + "doi: 10.1038/ncomms3741 (2013)", + "doi: 10.1093/nar/gku270 (2014)", + "doi: 10.1038/s41598-017-08366-3 (2017)" + ], + "version": "2.0" + } + }, + "results": [ + { + "backbone": [ + 0.724, + 0.749, + 0.745, + 0.777, + 0.792, + 0.794, + 0.817, + 0.846, + 0.82, + 0.815 + ], + "coil": [ + 0.411, + 0.427, + 0.419, + 0.362, + 0.346, + 0.316, + 0.313, + 0.351, + 0.42, + 0.449 + ], + "helix": [ + 0.379, + 0.418, + 0.445, + 0.505, + 0.466, + 0.465, + 0.452, + 0.447, + 0.416, + 0.401 + ], + "ppII": [ + 0.121, + 0.099, + 0.088, + 0.07, + 0.069, + 0.071, + 0.068, + 0.058, + 0.067, + 0.06 + ], + "proteinID": "random_sequence_10_consisting_of_10_residues", + "sequence": "PMSKMWQLDN", + "sheet": [ + 0.131, + 0.117, + 0.115, + 0.143, + 0.213, + 0.262, + 0.287, + 0.213, + 0.137, + 0.09 + ], + "sidechain": [ + 0.508, + 0.523, + 0.494, + 0.367, + 0.541, + 0.62, + 0.41, + 0.599, + 0.327, + 0.538 + ] + }, + 
{ + "backbone": [ + 0.717, + 0.697, + 0.729, + 0.747, + 0.742, + 0.723, + 0.713, + 0.709, + 0.732, + 0.724 + ], + "coil": [ + 0.458, + 0.541, + 0.572, + 0.555, + 0.521, + 0.475, + 0.459, + 0.434, + 0.406, + 0.407 + ], + "helix": [ + 0.309, + 0.187, + 0.144, + 0.144, + 0.186, + 0.237, + 0.256, + 0.276, + 0.318, + 0.317 + ], + "ppII": [ + 0.105, + 0.129, + 0.132, + 0.13, + 0.143, + 0.15, + 0.137, + 0.123, + 0.106, + 0.104 + ], + "proteinID": "random_sequence_1_consisting_of_10_residues", + "sequence": "MDRHDPVQKS", + "sheet": [ + 0.152, + 0.121, + 0.108, + 0.113, + 0.137, + 0.194, + 0.205, + 0.218, + 0.212, + 0.2 + ], + "sidechain": [ + 0.487, + 0.282, + 0.324, + 0.503, + 0.3, + 0.503, + 0.514, + 0.339, + 0.338, + 0.507 + ] + }, + { + "backbone": [ + 0.743, + 0.763, + 0.743, + 0.764, + 0.802, + 0.846, + 0.854, + 0.854, + 0.832, + 0.845 + ], + "coil": [ + 0.34, + 0.321, + 0.338, + 0.325, + 0.323, + 0.287, + 0.282, + 0.283, + 0.303, + 0.295 + ], + "helix": [ + 0.54, + 0.567, + 0.536, + 0.565, + 0.564, + 0.571, + 0.533, + 0.515, + 0.448, + 0.44 + ], + "ppII": [ + 0.079, + 0.073, + 0.079, + 0.07, + 0.058, + 0.042, + 0.037, + 0.034, + 0.052, + 0.048 + ], + "proteinID": "random_sequence_2_consisting_of_10_residues", + "sequence": "SQRAMWSMWR", + "sheet": [ + 0.104, + 0.14, + 0.164, + 0.163, + 0.206, + 0.239, + 0.267, + 0.266, + 0.287, + 0.292 + ], + "sidechain": [ + 0.502, + 0.396, + 0.32, + 0.561, + 0.557, + 0.637, + 0.578, + 0.589, + 0.65, + 0.433 + ] + }, + { + "backbone": [ + 0.92, + 0.938, + 0.976, + 0.997, + 0.992, + 0.981, + 0.955, + 0.937, + 0.936, + 0.912 + ], + "coil": [ + 0.255, + 0.254, + 0.228, + 0.225, + 0.233, + 0.238, + 0.243, + 0.221, + 0.209, + 0.213 + ], + "helix": [ + 0.434, + 0.383, + 0.406, + 0.458, + 0.483, + 0.443, + 0.413, + 0.404, + 0.398, + 0.389 + ], + "ppII": [ + 0.041, + 0.038, + 0.025, + 0.016, + 0.01, + 0.017, + 0.028, + 0.032, + 0.035, + 0.04 + ], + "proteinID": "random_sequence_3_consisting_of_10_residues", + "sequence": "YWCELTYWRV", + 
"sheet": [ + 0.363, + 0.436, + 0.473, + 0.484, + 0.46, + 0.443, + 0.455, + 0.473, + 0.505, + 0.494 + ], + "sidechain": [ + 0.63, + 0.699, + 0.732, + 0.409, + 0.701, + 0.713, + 0.646, + 0.686, + 0.436, + 0.622 + ] + }, + { + "backbone": [ + 0.748, + 0.766, + 0.828, + 0.859, + 0.874, + 0.908, + 0.863, + 0.84, + 0.853, + 0.824 + ], + "coil": [ + 0.419, + 0.403, + 0.338, + 0.278, + 0.231, + 0.237, + 0.239, + 0.26, + 0.257, + 0.267 + ], + "helix": [ + 0.227, + 0.224, + 0.336, + 0.437, + 0.548, + 0.54, + 0.52, + 0.502, + 0.516, + 0.501 + ], + "ppII": [ + 0.101, + 0.096, + 0.068, + 0.048, + 0.039, + 0.038, + 0.05, + 0.05, + 0.048, + 0.06 + ], + "proteinID": "random_sequence_4_consisting_of_10_residues", + "sequence": "SWTHYELKAV", + "sheet": [ + 0.281, + 0.312, + 0.317, + 0.329, + 0.326, + 0.358, + 0.355, + 0.333, + 0.317, + 0.286 + ], + "sidechain": [ + 0.534, + 0.654, + 0.622, + 0.612, + 0.593, + 0.387, + 0.611, + 0.374, + 0.59, + 0.567 + ] + }, + { + "backbone": [ + 0.775, + 0.754, + 0.753, + 0.754, + 0.754, + 0.783, + 0.771, + 0.798, + 0.773, + 0.777 + ], + "coil": [ + 0.505, + 0.538, + 0.497, + 0.495, + 0.468, + 0.451, + 0.46, + 0.449, + 0.409, + 0.372 + ], + "helix": [ + 0.156, + 0.028, + 0.096, + 0.086, + 0.156, + 0.267, + 0.34, + 0.408, + 0.411, + 0.394 + ], + "ppII": [ + 0.107, + 0.135, + 0.152, + 0.157, + 0.154, + 0.141, + 0.124, + 0.093, + 0.09, + 0.085 + ], + "proteinID": "random_sequence_5_consisting_of_10_residues", + "sequence": "NCPIEPEDQY", + "sheet": [ + 0.223, + 0.288, + 0.278, + 0.281, + 0.236, + 0.165, + 0.097, + 0.057, + 0.111, + 0.178 + ], + "sidechain": [ + 0.49, + 0.635, + 0.528, + 0.572, + 0.299, + 0.54, + 0.28, + 0.321, + 0.375, + 0.549 + ] + }, + { + "backbone": [ + 0.911, + 0.923, + 0.931, + 0.883, + 0.891, + 0.881, + 0.886, + 0.855, + 0.837, + 0.82 + ], + "coil": [ + 0.217, + 0.199, + 0.2, + 0.273, + 0.33, + 0.342, + 0.351, + 0.332, + 0.338, + 0.318 + ], + "helix": [ + 0.546, + 0.566, + 0.586, + 0.493, + 0.43, + 0.394, + 0.324, + 0.342, + 
0.351, + 0.352 + ], + "ppII": [ + 0.033, + 0.025, + 0.019, + 0.035, + 0.045, + 0.066, + 0.079, + 0.091, + 0.087, + 0.081 + ], + "proteinID": "random_sequence_6_consisting_of_10_residues", + "sequence": "YACLFQKPYI", + "sheet": [ + 0.33, + 0.372, + 0.376, + 0.358, + 0.316, + 0.298, + 0.333, + 0.327, + 0.343, + 0.332 + ], + "sidechain": [ + 0.617, + 0.622, + 0.689, + 0.632, + 0.638, + 0.445, + 0.432, + 0.593, + 0.596, + 0.618 + ] + }, + { + "backbone": [ + 0.789, + 0.776, + 0.744, + 0.689, + 0.678, + 0.635, + 0.64, + 0.672, + 0.701, + 0.691 + ], + "coil": [ + 0.373, + 0.395, + 0.422, + 0.51, + 0.581, + 0.601, + 0.583, + 0.539, + 0.544, + 0.512 + ], + "helix": [ + 0.163, + 0.147, + 0.175, + 0.105, + 0.027, + -0.041, + 0.018, + 0.152, + 0.273, + 0.307 + ], + "ppII": [ + 0.106, + 0.126, + 0.151, + 0.171, + 0.169, + 0.19, + 0.193, + 0.172, + 0.143, + 0.122 + ], + "proteinID": "random_sequence_7_consisting_of_10_residues", + "sequence": "FVPGKQEPDS", + "sheet": [ + 0.412, + 0.369, + 0.278, + 0.208, + 0.167, + 0.175, + 0.159, + 0.111, + 0.008, + 0.01 + ], + "sidechain": [ + 0.582, + 0.577, + 0.526, + 0.444, + 0.344, + 0.325, + 0.268, + 0.491, + 0.269, + 0.468 + ] + }, + { + "backbone": [ + 0.806, + 0.83, + 0.829, + 0.835, + 0.797, + 0.816, + 0.835, + 0.828, + 0.808, + 0.806 + ], + "coil": [ + 0.331, + 0.313, + 0.316, + 0.336, + 0.393, + 0.406, + 0.41, + 0.4, + 0.374, + 0.346 + ], + "helix": [ + 0.46, + 0.477, + 0.475, + 0.488, + 0.402, + 0.468, + 0.498, + 0.48, + 0.496, + 0.469 + ], + "ppII": [ + 0.058, + 0.046, + 0.051, + 0.054, + 0.071, + 0.062, + 0.055, + 0.055, + 0.068, + 0.067 + ], + "proteinID": "random_sequence_8_consisting_of_10_residues", + "sequence": "HHLCANKMDL", + "sheet": [ + 0.219, + 0.279, + 0.278, + 0.251, + 0.22, + 0.128, + 0.093, + 0.114, + 0.136, + 0.18 + ], + "sidechain": [ + 0.563, + 0.555, + 0.601, + 0.647, + 0.562, + 0.493, + 0.376, + 0.523, + 0.312, + 0.572 + ] + }, + { + "backbone": [ + 0.64, + 0.674, + 0.721, + 0.758, + 0.791, + 0.753, + 0.762, + 
0.772, + 0.77, + 0.796 + ], + "coil": [ + 0.578, + 0.611, + 0.552, + 0.459, + 0.363, + 0.378, + 0.383, + 0.372, + 0.391, + 0.37 + ], + "helix": [ + 0.043, + -0.002, + 0.039, + 0.143, + 0.329, + 0.405, + 0.425, + 0.401, + 0.406, + 0.431 + ], + "ppII": [ + 0.158, + 0.152, + 0.14, + 0.13, + 0.11, + 0.099, + 0.081, + 0.073, + 0.071, + 0.06 + ], + "proteinID": "random_sequence_9_consisting_of_10_residues", + "sequence": "GNKTPFMKMH", + "sheet": [ + 0.143, + 0.165, + 0.21, + 0.269, + 0.291, + 0.218, + 0.224, + 0.238, + 0.2, + 0.182 + ], + "sidechain": [ + 0.431, + 0.477, + 0.361, + 0.588, + 0.562, + 0.551, + 0.514, + 0.344, + 0.526, + 0.573 + ] + } + ] +} \ No newline at end of file