# HG changeset patch # User galaxy-australia # Date 1678416487 0 # Node ID a58f7eb0df2cd0c4df5ddb753fec518da2f9cb86 # Parent d00e15139065ff39933cfbaddf29c7d7735bdaf9 planemo upload for repository https://github.com/usegalaxy-au/tools-au commit fd45a857a71358e7e5375dcfb5043cdc8560c5a5 diff -r d00e15139065 -r a58f7eb0df2c alphafold.xml --- a/alphafold.xml Tue Feb 28 01:15:42 2023 +0000 +++ b/alphafold.xml Fri Mar 10 02:48:07 2023 +0000 @@ -2,7 +2,9 @@ - AI-guided 3D structural prediction of proteins 2.3.1 - 0 + 1 + macro_output.xml + macro_test_output.xml topic_0082 @@ -14,17 +16,20 @@ alphafold_2 - neoformit/alphafold:v2.3.1_1 + neoformit/alphafold:v2.3.1_2 input.fasta #end if @@ -32,55 +37,66 @@ && python3 '$__tool_directory__/validate_fasta.py' input.fasta --min_length \${ALPHAFOLD_AA_LENGTH_MIN:-0} --max_length \${ALPHAFOLD_AA_LENGTH_MAX:-0} -#if $multimer: +#if $model_preset == 'multimer': --multimer #end if > alphafold.fasta -## Env vars ------------------------------- +## Env vars ------------------------------------------------------------------- && export TF_FORCE_UNIFIED_MEMORY=1 && export XLA_PYTHON_CLIENT_MEM_FRACTION=4.0 && export TODAY=`date +"%Y-%m-%d"` -## Run alphafold ------------------------- -&& python /app/alphafold/run_alphafold.py - --fasta_paths alphafold.fasta - --output_dir output - --data_dir \${ALPHAFOLD_DB:-/data} - - ## Set reference database paths - --uniref90_database_path \${ALPHAFOLD_DB:-/data}/uniref90/uniref90.fasta - --mgnify_database_path \${ALPHAFOLD_DB:-/data}/mgnify/mgy_clusters_2022_05.fa - --template_mmcif_dir \${ALPHAFOLD_DB:-/data}/pdb_mmcif/mmcif_files - --obsolete_pdbs_path \${ALPHAFOLD_DB:-/data}/pdb_mmcif/obsolete.dat - #if $dbs == 'full': - --bfd_database_path \${ALPHAFOLD_DB:-/data}/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt - --uniref30_database_path \${ALPHAFOLD_DB:-/data}/uniref30/UniRef30_2021_03 - #else - --db_preset=reduced_dbs - --small_bfd_database_path \${ALPHAFOLD_DB:-/data}/small_bfd/bfd-first_non_consensus_sequences.fasta - #end if +## Run AlphaFold ------------------------------------------------------------- +#if os.environ.get('PLANEMO_TESTING'): + ## Run in testing mode (mocks a successful AlphaFold run by copying outputs) + && echo "Creating dummy outputs for model_preset=$model_preset..." + && bash '$__tool_directory__/mock_alphafold.sh' $model_preset +#else: + ## Run AlphaFold + && python /app/alphafold/run_alphafold.py + --fasta_paths alphafold.fasta + --output_dir output + --data_dir \${ALPHAFOLD_DB:-/data} + --model_preset=$model_preset - #if $max_template_date: - --max_template_date=$max_template_date - #else - --max_template_date=\$TODAY - #end if + ## Set reference database paths + --uniref90_database_path \${ALPHAFOLD_DB:-/data}/uniref90/uniref90.fasta + --mgnify_database_path \${ALPHAFOLD_DB:-/data}/mgnify/mgy_clusters_2022_05.fa + --template_mmcif_dir \${ALPHAFOLD_DB:-/data}/pdb_mmcif/mmcif_files + --obsolete_pdbs_path \${ALPHAFOLD_DB:-/data}/pdb_mmcif/obsolete.dat + #if $dbs == 'full': + --bfd_database_path \${ALPHAFOLD_DB:-/data}/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt + --uniref30_database_path \${ALPHAFOLD_DB:-/data}/uniref30/UniRef30_2021_03 + #else + --db_preset=reduced_dbs + --small_bfd_database_path \${ALPHAFOLD_DB:-/data}/small_bfd/bfd-first_non_consensus_sequences.fasta + #end if - --use_gpu_relax=\${ALPHAFOLD_USE_GPU:-True} ## introduced in v2.1.2 + #if $max_template_date: + --max_template_date=$max_template_date + #else + --max_template_date=\$TODAY + #end if - #if $multimer: - --model_preset=multimer - --pdb_seqres_database_path=\${ALPHAFOLD_DB:-/data}/pdb_seqres/pdb_seqres.txt - --uniprot_database_path=\${ALPHAFOLD_DB:-/data}/uniprot/uniprot.fasta - --num_multimer_predictions_per_model=1 ## introduced in v2.2.0 - #else - --pdb70_database_path \${ALPHAFOLD_DB:-/data}/pdb70/pdb70 - #end if + --use_gpu_relax=\${ALPHAFOLD_USE_GPU:-True} ## introduced in v2.1.2 -## Generate additional outputs ------------ -&& python3 '$__tool_directory__/outputs.py' output/alphafold $outputs.plddts -#if $multimer: + #if $model_preset == 'multimer': + --pdb_seqres_database_path=\${ALPHAFOLD_DB:-/data}/pdb_seqres/pdb_seqres.txt + --uniprot_database_path=\${ALPHAFOLD_DB:-/data}/uniprot/uniprot.fasta + --num_multimer_predictions_per_model=1 ## introduced in v2.2.0 + #else + --pdb70_database_path \${ALPHAFOLD_DB:-/data}/pdb70/pdb70 + #end if +#end if + +## Generate additional outputs ------------------------------------------------ +&& python3 '$__tool_directory__/outputs.py' output/alphafold +$outputs.plddts +$outputs.model_pkls +$outputs.pae_csv +$outputs.plots +#if $model_preset == 'multimer': --multimer #end if @@ -137,15 +153,31 @@ + name="model_preset" + type="select" + label="Model preset" + help="Select which prediction model to run. The monomer model is the most accurate for single protein prediction. The multimer model allows prediction of protein complexes." + > + + + +
+ + - - - - - + - - - outputs['confidence_scores'] - - - - outputs['plddts'] - - - - outputs['model_pkls'] - - - outputs['model_pkls'] - - - outputs['model_pkls'] - - - outputs['model_pkls'] - - - outputs['model_pkls'] - - - outputs['relax_json'] - + + + + + + - + + + + + + + + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + `_. - | This means that it runs with Amber relaxation enabled, with relaxed PDB models collected as output datasets. If there are additonal parameters that you would like to interact with, please `send a support request to Galaxy AU `_, or open an issue on `our GitHub `_. + | This means that it runs with Amber relaxation enabled, with relaxed PDB models collected as output datasets (ranked\_*.pdb files). If there are additonal parameters that you would like to interact with, please `send a support request to Galaxy AU `_, or open an issue on `our GitHub `_. | | diff -r d00e15139065 -r a58f7eb0df2c macro_output.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/macro_output.xml Fri Mar 10 02:48:07 2023 +0000 @@ -0,0 +1,176 @@ + + + + + + + + + + + + outputs['pae_csv'] + model_preset != "monomer" + + + outputs['pae_csv'] + model_preset != "monomer" + + + outputs['pae_csv'] + model_preset != "monomer" + + + outputs['pae_csv'] + model_preset != "monomer" + + + outputs['pae_csv'] + model_preset != "monomer" + + + + + + outputs['model_pkls'] + + + outputs['model_pkls'] + + + outputs['model_pkls'] + + + outputs['model_pkls'] + + + outputs['model_pkls'] + + + + + + outputs['plots'] + + + outputs['plots'] + + + outputs['plots'] + + + outputs['plots'] + + + outputs['plots'] + + + + + + outputs['confidence_scores'] + + + + + + outputs['plddts'] + + + + + + outputs['relax_json'] + + + diff -r d00e15139065 -r a58f7eb0df2c macro_test_output.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/macro_test_output.xml Fri Mar 10 02:48:07 2023 +0000 @@ -0,0 +1,199 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff -r d00e15139065 -r a58f7eb0df2c outputs.py --- a/outputs.py Tue Feb 28 01:15:42 2023 +0000 +++ b/outputs.py Fri Mar 10 02:48:07 2023 +0000 @@ -19,13 +19,16 @@ import os import pickle as pk import shutil +from matplotlib import pyplot as plt from pathlib import Path from typing import List -# Output file names +# Output file paths OUTPUT_DIR = 'extra' OUTPUTS = { 'model_pkl': OUTPUT_DIR + '/ranked_{rank}.pkl', + 'model_pae': OUTPUT_DIR + '/pae_ranked_{rank}.csv', + 'model_plot': OUTPUT_DIR + '/ranked_{rank}.png', 'model_confidence_scores': OUTPUT_DIR + '/model_confidence_scores.tsv', 'plddts': OUTPUT_DIR + '/plddts.tsv', 'relax': OUTPUT_DIR + '/relax_metrics_ranked.json', @@ -46,8 +49,9 @@ self.output_confidence_scores = True self.output_residue_scores = False self.is_multimer = False + self.parse() - def parse_settings(self) -> None: + def parse(self) -> None: parser = argparse.ArgumentParser() parser.add_argument( "workdir", @@ -67,15 +71,26 @@ action="store_true" ) parser.add_argument( - "--model-pkl", - dest="model_pkl", + "--pkl", help="rename model pkl outputs with rank order", action="store_true" ) + parser.add_argument( + "--pae", + help="extract PAE from pkl files to CSV format", + action="store_true" + ) + parser.add_argument( + "--plot", + help="Plot pLDDT and PAE for each model", + action="store_true" + ) args = parser.parse_args() self.workdir = Path(args.workdir.rstrip('/')) self.output_residue_scores = args.plddts - self.output_model_pkls = args.model_pkl + self.output_model_pkls = args.pkl + self.output_model_plots = args.plot + self.output_pae = args.pae self.is_multimer = args.multimer self.output_dir = self.workdir / OUTPUT_DIR os.makedirs(self.output_dir, exist_ok=True) @@ -212,6 +227,31 @@ shutil.copyfile(path, new_path) +def extract_pae_to_csv(ranking: ResultRanking, context: ExecutionContext): + """Extract predicted alignment error matrix from pickle files. + + Creates a CSV file for each of five ranked models. + """ + for path in context.model_pkl_paths: + model = ResultModelPrediction(path, context) + rank = ranking.get_rank_for_model(model.name) + with open(path, 'rb') as f: + data = pk.load(f) + if 'predicted_aligned_error' not in data: + print("Skipping PAE output" + f" - not found in {path}." + " Running with model_preset=monomer?") + return + pae = data['predicted_aligned_error'] + out_path = ( + context.settings.workdir + / OUTPUTS['model_pae'].format(rank=rank) + ) + with open(out_path, 'w') as f: + for row in pae: + f.write(','.join([str(x) for x in row]) + '\n') + + def rekey_relax_metrics(ranking: ResultRanking, context: ExecutionContext): """Replace keys in relax_metrics.json with 0-indexed rank.""" with open(context.relax_metrics) as f: @@ -224,10 +264,44 @@ json.dump(data, f) +def plddt_pae_plots(ranking: ResultRanking, context: ExecutionContext): + """Generate a pLDDT + PAE plot for each model.""" + for path in context.model_pkl_paths: + num_plots = 2 + model = ResultModelPrediction(path, context) + rank = ranking.get_rank_for_model(model.name) + png_path = ( + context.settings.workdir + / OUTPUTS['model_plot'].format(rank=rank) + ) + plddts = model.data['plddt'] + if 'predicted_aligned_error' in model.data: + pae = model.data['predicted_aligned_error'] + max_pae = model.data['max_predicted_aligned_error'] + else: + num_plots = 1 + + plt.figure(figsize=[8 * num_plots, 6]) + plt.subplot(1, num_plots, 1) + plt.plot(plddts) + plt.title('Predicted LDDT') + plt.xlabel('Residue') + plt.ylabel('pLDDT') + + if num_plots == 2: + plt.subplot(1, 2, 2) + plt.imshow(pae, vmin=0., vmax=max_pae, cmap='Greens_r') + plt.colorbar(fraction=0.046, pad=0.04) + plt.title('Predicted Aligned Error') + plt.xlabel('Scored residue') + plt.ylabel('Aligned residue') + + plt.savefig(png_path) + + def main(): """Parse output files and generate additional output files.""" settings = Settings() - settings.parse_settings() context = ExecutionContext(settings) ranking = ResultRanking(context) write_confidence_scores(ranking, context) @@ -236,7 +310,11 @@ # Optional outputs if settings.output_model_pkls: rename_model_pkls(ranking, context) - + if settings.output_model_plots: + plddt_pae_plots(ranking, context) + if settings.output_pae: + # Only created by monomer_ptm and multimer models + extract_pae_to_csv(ranking, context) if settings.output_residue_scores: write_per_residue_scores(ranking, context) diff -r d00e15139065 -r a58f7eb0df2c validate_fasta.py --- a/validate_fasta.py Tue Feb 28 01:15:42 2023 +0000 +++ b/validate_fasta.py Fri Mar 10 02:48:07 2023 +0000 @@ -205,6 +205,11 @@ for fas in clean_fastas: fw.write(fas) + sys.stderr.write("Validated FASTA sequence(s):\n\n") + for fas in clean_fastas: + sys.stderr.write(fas.header + '\n') + sys.stderr.write(fas.aa_seq + '\n\n') + except ValueError as exc: sys.stderr.write(f"{exc}\n\n") raise exc