Mercurial > repos > adrian.diaz > b2btools_single_sequence
diff singleSeq/script.py @ 0:cacb90cde53e draft
First version of b2btools for single sequences in Galaxy
author | adrian.diaz |
---|---|
date | Wed, 06 Jul 2022 11:01:15 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/singleSeq/script.py Wed Jul 06 11:01:15 2022 +0000 @@ -0,0 +1,160 @@ +import optparse +import os.path +import unicodedata +import re +import pandas as pd +from b2bTools import SingleSeq +import matplotlib.pyplot as plt + + +def slugify(value): + """ + Taken from https://github.com/django/django/blob/master/django/utils/text.py + Convert to ASCII if 'allow_unicode'. Convert spaces or repeated + dashes to single dashes. Remove characters that aren't alphanumerics, + underscores, or hyphens. Convert to lowercase. Also strip leading and + trailing whitespace, dashes, and underscores. + """ + value = str(value) + value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii') + value = re.sub(r'[^\w\s-]', '', value.lower()) + return re.sub(r'[-\s]+', '-', value).strip('-_') + + +def plot_prediction(prediction_name, highlighting_regions, predicted_values, seq_name): + thresholds_dict = {'backbone': {'membrane spanning': [1., 1.5], + 'rigid': [0.8, 1.], + 'context-dependent': [0.69, 0.8], + 'flexible': [-1.0, 0.69]}, + 'earlyFolding': {'early folds': [0.169, 2.], 'late folds': [-1., 0.169]}, + 'disoMine': {'ordered': [-1., 0.5], 'disordered': [0.5, 2.]}, + } + ordered_regions_dict = {'backbone': ['flexible', 'context-dependent', 'rigid', 'membrane spanning'], + 'earlyFolding': ['late folds', 'early folds'], + 'disoMine': ['ordered', 'disordered'], + } + colors = ['yellow', 'orange', 'pink', 'red'] + ranges_dict = { + 'backbone': [-0.2, 1.2], + 'sidechain': [-0.2, 1.2], + 'ppII': [-0.2, 1.2], + 'earlyFolding': [-0.2, 1.2], + 'disoMine': [-0.2, 1.2], + 'agmata': [-0.2, 1.2], + 'helix': [-1., 1.], + 'sheet': [-1., 1.], + 'coil': [-1., 1.], + } + fig, ax = plt.subplots(1, 1) + fig.set_figwidth(10) + fig.set_figheight(5) + ax.set_title(prediction_name + ' ' + 'prediction') + plt.tight_layout(rect=[0, 0, 0.75, 1]) + if seq_name == 'all': + max_len = 0 + for seq in predicted_values.keys(): + predictions = predicted_values[seq] + ax.plot(range(len(predictions)), predictions, label=seq) + if len(predictions)>max_len: + max_len = len(predictions) + ax.set_xlim([0, max_len - 1]) + else: + predictions = predicted_values + ax.plot(range(len(predictions)), predictions, label=seq_name) + ax.set_xlim([0, len(predictions) - 1]) + legend_lines = plt.legend(bbox_to_anchor=(1.04,1), loc="upper left", fancybox=True, shadow=True) + ax.add_artist(legend_lines) + # Define regions + if highlighting_regions: + if prediction_name in ordered_regions_dict.keys(): + for i, prediction in enumerate(ordered_regions_dict[prediction_name]): + lower = thresholds_dict[prediction_name][prediction][0] + upper = thresholds_dict[prediction_name][prediction][1] + color = colors[i] + ax.axhspan(lower, upper, alpha=0.3, color=color, label=prediction) + included_in_regions_legend = list(reversed( + [prediction for prediction in ordered_regions_dict[prediction_name]])) # to sort it "from up to low" + # Get handles and labels + handles, labels = plt.gca().get_legend_handles_labels() + handles_dict = {label: handles[idx] for idx, label in enumerate(labels)} + # Add legend for regions, if available + region_legend = ax.legend([handles_dict[region] for region in included_in_regions_legend], + [region for region in included_in_regions_legend], fancybox=True, shadow=True, + loc='lower left', bbox_to_anchor=(1.04,0)) + ax.add_artist(region_legend) + ax.set_ylim(ranges_dict[prediction_name]) + ax.set_xlabel('residue index') + ax.set_ylabel('prediction values') + ax.grid(axis='y') + plt.savefig(os.path.join(options.plot_output, "{0}_{1}.png".format(slugify(seq_name), prediction_name)), bbox_inches="tight") + plt.close() + +def df_dict_to_dict_of_values(df_dict, predictor): + results_dict = {} + for seq in df_dict.keys(): + df = pd.read_csv(df_dict[seq], sep='\t') + results_dict[seq] = df[predictor] + return results_dict + +def main(options): + single_seq = SingleSeq(options.input_fasta) + b2b_tools = [] + if options.dynamine: + b2b_tools.append('dynamine') + if options.disomine: + b2b_tools.append('disomine') + if options.efoldmine: + b2b_tools.append('efoldmine') + if options.agmata: + b2b_tools.append('agmata') + + single_seq.predict(b2b_tools) + predictions = single_seq.get_all_predictions() + results_json = single_seq.get_all_predictions_json('all') + with open(options.json_output, 'w') as f: + f.write(results_json) + first_sequence_key = next(iter(predictions)) + prediction_keys = predictions[first_sequence_key].keys() + df_dictionary = {} + for sequence_key, sequence_predictions in predictions.items(): + residues = sequence_predictions['seq'] + residues_count = len(residues) + sequence_df = pd.DataFrame(columns=prediction_keys, index=range(residues_count)) + sequence_df.index.name = 'residue_index' + for predictor in prediction_keys: + sequence_df[predictor] = sequence_predictions[predictor] + sequence_df = sequence_df.rename(columns={"seq": "residue"}) + sequence_df = sequence_df.round(decimals=2) + filename = f'{options.output}/{slugify(sequence_key)}.tsv' + df_dictionary[sequence_key] = filename + sequence_df.to_csv(filename, sep="\t") + # Plot each individual plot (compatible with plot all) + if options.plot: + for predictor in prediction_keys: + if predictor != 'seq': + plot_prediction(prediction_name=predictor, highlighting_regions=True, + predicted_values=sequence_predictions[predictor], seq_name=sequence_key) + # Plot all together (compatible with plot individual) + if options.plot_all: + for predictor in prediction_keys: + if predictor != 'seq': + results_dictionary = df_dict_to_dict_of_values(df_dict=df_dictionary, predictor=predictor) + plot_prediction(prediction_name=predictor, highlighting_regions=True, + predicted_values=results_dictionary, seq_name='all') + +if __name__ == "__main__": + parser = optparse.OptionParser() + parser.add_option("--dynamine", action="store_true", default=False) + parser.add_option("--disomine", action="store_true", default=False) + parser.add_option("--efoldmine", action="store_true", default=False) + parser.add_option("--agmata", action="store_true", default=False) + parser.add_option("--file", dest="input_fasta", default=False) + parser.add_option("--output", dest="output", default=False) + parser.add_option("--plot-output", dest="plot_output", default=False) + + parser.add_option("--json", dest="json_output", default=False) + parser.add_option("--plot", action="store_true", default=False) + parser.add_option("--plot_all", action="store_true", default=False) + parser.add_option("--highlight", action="store_true", default=False) + options, _args = parser.parse_args() + main(options) \ No newline at end of file