# HG changeset patch # User adrian.diaz # Date 1659430459 0 # Node ID 891ccfd226332445cd9572b42bca2127c77c17ab # Parent cacb90cde53e559ba4f66ba7e40d7a9869b3c727 Uploaded new version which meets formatting requirements diff -r cacb90cde53e -r 891ccfd22633 b2btools_single_sequence/b2btools_single_sequence.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/b2btools_single_sequence/b2btools_single_sequence.xml Tue Aug 02 08:54:19 2022 +0000 @@ -0,0 +1,113 @@ + + predicts protein biophysical properties from their amino-acid sequences + + b2btools + + + b2btools + + + +
+ +
+
+ + + + +
+
+ + + +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 10.1038/ncomms3741 + 10.1101/2020.05.25.115253 + 10.1038/s41598-017-08366-3 + 10.1093/bioinformatics/btz912 + +
\ No newline at end of file diff -r cacb90cde53e -r 891ccfd22633 b2btools_single_sequence/script.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/b2btools_single_sequence/script.py Tue Aug 02 08:54:19 2022 +0000 @@ -0,0 +1,176 @@ +import optparse +import os.path +import unicodedata +import re +import numpy as np +import pandas as pd +from b2bTools import SingleSeq +import matplotlib.pyplot as plt + + +def slugify(value): + """ + Taken from https://github.com/django/django/blob/master/django/utils/text.py + Convert to ASCII if 'allow_unicode'. Convert spaces or repeated + dashes to single dashes. Remove characters that aren't alphanumerics, + underscores, or hyphens. Convert to lowercase. Also strip leading and + trailing whitespace, dashes, and underscores. + """ + value = str(value) + value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii') + value = re.sub(r'[^\w\s-]', '', value.lower()) + return re.sub(r'[-\s]+', '-', value).strip('-_') + + +def check_min_max(predicted_values, former_min, former_max): + seq_max = max(predicted_values) + seq_min = min(predicted_values) + if seq_max + 0.1 > former_max and not np.isnan(seq_max) and not np.isinf(seq_max): + former_max = seq_max + 0.1 + if seq_min - 0.1 < former_min and not np.isnan(seq_min) and not np.isinf(seq_min): + former_min = seq_min - 0.1 + return former_min, former_max + + +def plot_prediction(prediction_name, highlighting_regions, predicted_values, seq_name): + thresholds_dict = {'backbone': {'membrane spanning': [1., 1.5], + 'rigid': [0.8, 1.], + 'context-dependent': [0.69, 0.8], + 'flexible': [-1.0, 0.69]}, + 'earlyFolding': {'early folds': [0.169, 2.], 'late folds': [-1., 0.169]}, + 'disoMine': {'ordered': [-1., 0.5], 'disordered': [0.5, 2.]}, + } + ordered_regions_dict = {'backbone': ['flexible', 'context-dependent', 'rigid', 'membrane spanning'], + 'earlyFolding': ['late folds', 'early folds'], + 'disoMine': ['ordered', 'disordered'], + } + colors = ['yellow', 'orange', 'pink', 'red'] + ranges_dict = { + 'backbone': [-0.2, 1.2], + 'sidechain': [-0.2, 1.2], + 'ppII': [-0.2, 1.2], + 'earlyFolding': [-0.2, 1.2], + 'disoMine': [-0.2, 1.2], + 'agmata': [-0.2, 1.2], + 'helix': [-1., 1.], + 'sheet': [-1., 1.], + 'coil': [-1., 1.], + } + fig, ax = plt.subplots(1, 1) + fig.set_figwidth(10) + fig.set_figheight(5) + ax.set_title(prediction_name + ' ' + 'prediction') + min_value, max_value = ranges_dict[prediction_name] + if seq_name == 'all': + max_len = 0 + for seq in predicted_values.keys(): + predictions = predicted_values[seq] + min_value, max_value = check_min_max(predictions, min_value, max_value) + ax.plot(range(len(predictions)), predictions, label=seq) + if len(predictions) > max_len: + max_len = len(predictions) + ax.set_xlim([0, max_len - 1]) + else: + predictions = predicted_values + min_value, max_value = check_min_max(predictions, min_value, max_value) + ax.plot(range(len(predictions)), predictions, label=seq_name) + ax.set_xlim([0, len(predictions) - 1]) + legend_lines = plt.legend(bbox_to_anchor=(1.04, 1), loc="upper left", fancybox=True, shadow=True) + ax.add_artist(legend_lines) + # Define regions + if highlighting_regions: + if prediction_name in ordered_regions_dict.keys(): + for i, prediction in enumerate(ordered_regions_dict[prediction_name]): + lower = thresholds_dict[prediction_name][prediction][0] + upper = thresholds_dict[prediction_name][prediction][1] + color = colors[i] + ax.axhspan(lower, upper, alpha=0.3, color=color, label=prediction) + included_in_regions_legend = list(reversed( + [prediction for prediction in ordered_regions_dict[prediction_name]])) # to sort it "from up to low" + # Get handles and labels + handles, labels = plt.gca().get_legend_handles_labels() + handles_dict = {label: handles[idx] for idx, label in enumerate(labels)} + # Add legend for regions, if available + region_legend = ax.legend([handles_dict[region] for region in included_in_regions_legend], + [region for region in included_in_regions_legend], fancybox=True, shadow=True, + loc='lower left', bbox_to_anchor=(1.04, 0)) + ax.add_artist(region_legend) + ax.set_ylim([min_value, max_value]) + ax.set_xlabel('residue index') + ax.set_ylabel('prediction values') + ax.grid(axis='y') + plt.savefig(os.path.join(options.plot_output, "{0}_{1}.png".format(slugify(seq_name), prediction_name)), bbox_inches="tight") + plt.close() + + +def df_dict_to_dict_of_values(df_dict, predictor): + results_dict = {} + for seq in df_dict.keys(): + df = pd.read_csv(df_dict[seq], sep='\t') + results_dict[seq] = df[predictor] + return results_dict + + +def main(options): + single_seq = SingleSeq(options.input_fasta) + b2b_tools = [] + if options.dynamine: + b2b_tools.append('dynamine') + if options.disomine: + b2b_tools.append('disomine') + if options.efoldmine: + b2b_tools.append('efoldmine') + if options.agmata: + b2b_tools.append('agmata') + + single_seq.predict(b2b_tools) + predictions = single_seq.get_all_predictions() + results_json = single_seq.get_all_predictions_json('all') + with open(options.json_output, 'w') as f: + f.write(results_json) + first_sequence_key = next(iter(predictions)) + prediction_keys = predictions[first_sequence_key].keys() + df_dictionary = {} + for sequence_key, sequence_predictions in predictions.items(): + residues = sequence_predictions['seq'] + residues_count = len(residues) + sequence_df = pd.DataFrame(columns=prediction_keys, index=range(residues_count)) + sequence_df.index.name = 'residue_index' + for predictor in prediction_keys: + sequence_df[predictor] = sequence_predictions[predictor] + sequence_df = sequence_df.rename(columns={"seq": "residue"}) + sequence_df = sequence_df.round(decimals=2) + filename = f'{options.output}/{slugify(sequence_key)}.tsv' + df_dictionary[sequence_key] = filename + sequence_df.to_csv(filename, sep="\t") + # Plot each individual plot (compatible with plot all) + if options.plot: + for predictor in prediction_keys: + if predictor != 'seq': + plot_prediction(prediction_name=predictor, highlighting_regions=True, + predicted_values=sequence_predictions[predictor], seq_name=sequence_key) + # Plot all together (compatible with plot individual) + if options.plot_all: + for predictor in prediction_keys: + if predictor != 'seq': + results_dictionary = df_dict_to_dict_of_values(df_dict=df_dictionary, predictor=predictor) + plot_prediction(prediction_name=predictor, highlighting_regions=True, + predicted_values=results_dictionary, seq_name='all') + + +if __name__ == "__main__": + parser = optparse.OptionParser() + parser.add_option("--dynamine", action="store_true", default=False) + parser.add_option("--disomine", action="store_true", default=False) + parser.add_option("--efoldmine", action="store_true", default=False) + parser.add_option("--agmata", action="store_true", default=False) + parser.add_option("--file", dest="input_fasta", default=False) + parser.add_option("--output", dest="output", default=False) + parser.add_option("--plot-output", dest="plot_output", default=False) + + parser.add_option("--json", dest="json_output", default=False) + parser.add_option("--plot", action="store_true", default=False) + parser.add_option("--plot_all", action="store_true", default=False) + parser.add_option("--highlight", action="store_true", default=False) + options, _args = parser.parse_args() + main(options) diff -r cacb90cde53e -r 891ccfd22633 singleSeq/b2btools_single_sequence.xml --- a/singleSeq/b2btools_single_sequence.xml Wed Jul 06 11:01:15 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,113 +0,0 @@ - - predicts protein biophysical properties from their amino-acid sequences - - b2btools - - - b2btools - - - -
- -
-
- - - - -
-
- - - -
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 10.1038/ncomms3741 - 10.1101/2020.05.25.115253 - 10.1038/s41598-017-08366-3 - 10.1093/bioinformatics/btz912 - -
\ No newline at end of file diff -r cacb90cde53e -r 891ccfd22633 singleSeq/script.py --- a/singleSeq/script.py Wed Jul 06 11:01:15 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,160 +0,0 @@ -import optparse -import os.path -import unicodedata -import re -import pandas as pd -from b2bTools import SingleSeq -import matplotlib.pyplot as plt - - -def slugify(value): - """ - Taken from https://github.com/django/django/blob/master/django/utils/text.py - Convert to ASCII if 'allow_unicode'. Convert spaces or repeated - dashes to single dashes. Remove characters that aren't alphanumerics, - underscores, or hyphens. Convert to lowercase. Also strip leading and - trailing whitespace, dashes, and underscores. - """ - value = str(value) - value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii') - value = re.sub(r'[^\w\s-]', '', value.lower()) - return re.sub(r'[-\s]+', '-', value).strip('-_') - - -def plot_prediction(prediction_name, highlighting_regions, predicted_values, seq_name): - thresholds_dict = {'backbone': {'membrane spanning': [1., 1.5], - 'rigid': [0.8, 1.], - 'context-dependent': [0.69, 0.8], - 'flexible': [-1.0, 0.69]}, - 'earlyFolding': {'early folds': [0.169, 2.], 'late folds': [-1., 0.169]}, - 'disoMine': {'ordered': [-1., 0.5], 'disordered': [0.5, 2.]}, - } - ordered_regions_dict = {'backbone': ['flexible', 'context-dependent', 'rigid', 'membrane spanning'], - 'earlyFolding': ['late folds', 'early folds'], - 'disoMine': ['ordered', 'disordered'], - } - colors = ['yellow', 'orange', 'pink', 'red'] - ranges_dict = { - 'backbone': [-0.2, 1.2], - 'sidechain': [-0.2, 1.2], - 'ppII': [-0.2, 1.2], - 'earlyFolding': [-0.2, 1.2], - 'disoMine': [-0.2, 1.2], - 'agmata': [-0.2, 1.2], - 'helix': [-1., 1.], - 'sheet': [-1., 1.], - 'coil': [-1., 1.], - } - fig, ax = plt.subplots(1, 1) - fig.set_figwidth(10) - fig.set_figheight(5) - ax.set_title(prediction_name + ' ' + 'prediction') - plt.tight_layout(rect=[0, 0, 0.75, 1]) - if seq_name == 'all': - max_len = 0 - for seq in predicted_values.keys(): - predictions = predicted_values[seq] - ax.plot(range(len(predictions)), predictions, label=seq) - if len(predictions)>max_len: - max_len = len(predictions) - ax.set_xlim([0, max_len - 1]) - else: - predictions = predicted_values - ax.plot(range(len(predictions)), predictions, label=seq_name) - ax.set_xlim([0, len(predictions) - 1]) - legend_lines = plt.legend(bbox_to_anchor=(1.04,1), loc="upper left", fancybox=True, shadow=True) - ax.add_artist(legend_lines) - # Define regions - if highlighting_regions: - if prediction_name in ordered_regions_dict.keys(): - for i, prediction in enumerate(ordered_regions_dict[prediction_name]): - lower = thresholds_dict[prediction_name][prediction][0] - upper = thresholds_dict[prediction_name][prediction][1] - color = colors[i] - ax.axhspan(lower, upper, alpha=0.3, color=color, label=prediction) - included_in_regions_legend = list(reversed( - [prediction for prediction in ordered_regions_dict[prediction_name]])) # to sort it "from up to low" - # Get handles and labels - handles, labels = plt.gca().get_legend_handles_labels() - handles_dict = {label: handles[idx] for idx, label in enumerate(labels)} - # Add legend for regions, if available - region_legend = ax.legend([handles_dict[region] for region in included_in_regions_legend], - [region for region in included_in_regions_legend], fancybox=True, shadow=True, - loc='lower left', bbox_to_anchor=(1.04,0)) - ax.add_artist(region_legend) - ax.set_ylim(ranges_dict[prediction_name]) - ax.set_xlabel('residue index') - ax.set_ylabel('prediction values') - ax.grid(axis='y') - plt.savefig(os.path.join(options.plot_output, "{0}_{1}.png".format(slugify(seq_name), prediction_name)), bbox_inches="tight") - plt.close() - -def df_dict_to_dict_of_values(df_dict, predictor): - results_dict = {} - for seq in df_dict.keys(): - df = pd.read_csv(df_dict[seq], sep='\t') - results_dict[seq] = df[predictor] - return results_dict - -def main(options): - single_seq = SingleSeq(options.input_fasta) - b2b_tools = [] - if options.dynamine: - b2b_tools.append('dynamine') - if options.disomine: - b2b_tools.append('disomine') - if options.efoldmine: - b2b_tools.append('efoldmine') - if options.agmata: - b2b_tools.append('agmata') - - single_seq.predict(b2b_tools) - predictions = single_seq.get_all_predictions() - results_json = single_seq.get_all_predictions_json('all') - with open(options.json_output, 'w') as f: - f.write(results_json) - first_sequence_key = next(iter(predictions)) - prediction_keys = predictions[first_sequence_key].keys() - df_dictionary = {} - for sequence_key, sequence_predictions in predictions.items(): - residues = sequence_predictions['seq'] - residues_count = len(residues) - sequence_df = pd.DataFrame(columns=prediction_keys, index=range(residues_count)) - sequence_df.index.name = 'residue_index' - for predictor in prediction_keys: - sequence_df[predictor] = sequence_predictions[predictor] - sequence_df = sequence_df.rename(columns={"seq": "residue"}) - sequence_df = sequence_df.round(decimals=2) - filename = f'{options.output}/{slugify(sequence_key)}.tsv' - df_dictionary[sequence_key] = filename - sequence_df.to_csv(filename, sep="\t") - # Plot each individual plot (compatible with plot all) - if options.plot: - for predictor in prediction_keys: - if predictor != 'seq': - plot_prediction(prediction_name=predictor, highlighting_regions=True, - predicted_values=sequence_predictions[predictor], seq_name=sequence_key) - # Plot all together (compatible with plot individual) - if options.plot_all: - for predictor in prediction_keys: - if predictor != 'seq': - results_dictionary = df_dict_to_dict_of_values(df_dict=df_dictionary, predictor=predictor) - plot_prediction(prediction_name=predictor, highlighting_regions=True, - predicted_values=results_dictionary, seq_name='all') - -if __name__ == "__main__": - parser = optparse.OptionParser() - parser.add_option("--dynamine", action="store_true", default=False) - parser.add_option("--disomine", action="store_true", default=False) - parser.add_option("--efoldmine", action="store_true", default=False) - parser.add_option("--agmata", action="store_true", default=False) - parser.add_option("--file", dest="input_fasta", default=False) - parser.add_option("--output", dest="output", default=False) - parser.add_option("--plot-output", dest="plot_output", default=False) - - parser.add_option("--json", dest="json_output", default=False) - parser.add_option("--plot", action="store_true", default=False) - parser.add_option("--plot_all", action="store_true", default=False) - parser.add_option("--highlight", action="store_true", default=False) - options, _args = parser.parse_args() - main(options) \ No newline at end of file