# HG changeset patch # User galaxy-australia # Date 1744782418 0 # Node ID 2891385d6ace60574f45efa9667874452cf4e968 # Parent 3f188450ca4f2db72eb3d206c523a9d0b57ee053 planemo upload for repository https://github.com/usegalaxy-au/tools-au commit b347c6ccc82b14fcbff360b3357050d1d43e3ef5-dirty diff -r 3f188450ca4f -r 2891385d6ace alphafold.xml --- a/alphafold.xml Wed Oct 30 21:46:34 2024 +0000 +++ b/alphafold.xml Wed Apr 16 05:46:58 2025 +0000 @@ -3,7 +3,7 @@ 2.3.2 2.3 - 2 + 3 macro_output.xml macro_test_output.xml @@ -17,7 +17,7 @@ alphafold_2 - neoformit/alphafold:v2.3.2_0 + neoformit/alphafold:v2.3.2_2 @@ -52,6 +52,23 @@ #end if > alphafold.fasta +## Read MSA input ------------------------------------------------------------- + +#if $advanced.reuse_msa.selected and $advanced.reuse_msa.msas: + #for msa in $advanced.reuse_msa.msas: + #if $model_preset.selection == 'multimer': + && MSA_DIR=output/alphafold/msas/${msa.chain}/ + #else + && MSA_DIR=output/alphafold/msas/ + #end if + && mkdir -p \$MSA_DIR + && ln -s '$msa.file' ${msa.chain}.zip + && unzip ${msa.chain}.zip -d \$MSA_DIR + && rm ${msa.chain}.zip + #end for +#end if + + ## Env vars ------------------------------------------------------------------- && export TF_FORCE_UNIFIED_MEMORY=1 && export XLA_PYTHON_CLIENT_MEM_FRACTION=4.0 @@ -61,7 +78,7 @@ #if os.environ.get('PLANEMO_TESTING'): ## Run in testing mode (mocks a successful AlphaFold run by copying outputs) && echo "Creating dummy outputs for model_preset=$model_preset.selection..." 
- && bash '$__tool_directory__/scripts/mock_alphafold.sh' $model_preset + && bash '$__tool_directory__/scripts/mock_alphafold.sh' $model_preset.selection #else: ## Run AlphaFold && python /app/alphafold/run_alphafold.py @@ -99,15 +116,18 @@ --pdb70_database_path \${ALPHAFOLD_DB:-/data}/@TOOL_MINOR_VERSION@/pdb70/pdb70 #end if + #if $advanced.reuse_msa.selected and $advanced.reuse_msa.msas: + --use_precomputed_msas + #end if + ## Galaxy-specific options -------------------------------------------- ## See https://github.com/neoformit/alphafold/tree/release_2.3.2_galaxy - #if $advanced.disable_amber_relax: - --disable_amber_relax - #end if - #if $advanced.limit_model_outputs: --output_models=$limit_model_outputs #end if + + $advanced.disable_amber_relax + $advanced.exit_after_msa ## End Galaxy-specific options ---------------------------------------- #end if @@ -119,7 +139,10 @@ $outputs.pae_csv $outputs.plots $outputs.plot_msa +$outputs.msa +$advanced.exit_after_msa +#if not $advanced.exit_after_msa: ## HTML output && mkdir -p '${ html.files_path }' && cp output/alphafold/extra/alphafold.html '${html}' @@ -129,8 +152,17 @@ ## The working directory ends up two levels deep and the visualization html page ## fails to load the PDB files as static assets. && (([ -d working ] && cp -r working/* .) || true) +#end if ]]> + + + @@ -158,20 +190,20 @@ - - - - - + type="select" + label="Model preset" + help="Select which prediction model to run. The monomer model is the most accurate for single protein prediction. The multimer model allows prediction of protein complexes." + > + + + + @@ -187,6 +219,7 @@ +

@@ -221,13 +256,59 @@ min="1" max="5" /> + + + + + + + + + + + + + + + + + + + + + + +

- + + not advanced['exit_after_msa'] + @@ -305,6 +397,7 @@ + @@ -417,7 +510,7 @@ | - **Input** + **Inputs** *Amino acid sequence* @@ -502,6 +595,29 @@ | | + *MSAs (optional)* + + | A collection of multiple sequence alignments (MSAs) in ZIP format. + | For each sequence in the input FASTA file, a separate ZIP archive will be created and labelled in alphabetical sequence (e.g. A, B, C) with respect to the position of the sequence in the query FASTA file. + | If "MSAs only" is selected, this output will be collected automatically. + | + | + + **Advanced features** + + *Reusing MSAs* + + | You can now re-use multiple sequence alignments (MSAs) from a previous AlphaFold run to speed up processing time. This is only useful if you wish to compute models for the same protein multiple times, for example in a one-to-many series of multimers. In these cases, the MSA for a given protein chain can be computed once and then re-used in subsequent jobs. To do this, you must begin by enabling the "Multiple sequence alignments (MSAs)" output in the "Optional outputs" section. This should result in a collection of ZIP archives being collected as an output. + | + | Once you have a collection of MSAs that you can re-use, you can then select "Reuse MSAs" in the "Advanced options" section. This will allow you to select the MSA ZIP archive(s) from your history. It is important that you select an MSA archive corresponding to a specific chain in your input FASTA file. If your MSA archive relates to the first chain in your FASTA file, you should select position "A". If your MSA archive relates to the second chain in your FASTA file, you should select position "B", and so on. Note that an MSA archive labelled MSA-A corresponds to the sequence order in the generating run. This does not necessarily correspond to the current run! Be sure to check the order of your sequences/chains. 
To avoid confusion here, you could re-label the MSA archives in your History to match the name of the protein/chain they were generated for. + | + + *MSA-only mode* + + | To speed up generation of MSAs, you can run AlphaFold in "MSA generation only" mode, available under "Advanced options". Use this mode if you only want to obtain MSAs and don't want to waste time computing a model. If you want to generate MSAs for several sequences, you can put all of the sequences into one FASTA file and submit it as an MSA-only job in multimer mode. This will result in a collection of MSA archives - one for each sequence provided. To make things less confusing when you come to reuse these MSAs, you may wish to rename each archive in the collection to match the corresponding protein/chain in your input FASTA file - this makes it easy to select the right one when you come to reuse them in a later job! + | + | + **AlphaFold configuration** | We have configured AlphaFold to run with the parameters suggested by default on `AlphaFold's GitHub `_. 
diff -r 3f188450ca4f -r 2891385d6ace macro_output.xml --- a/macro_output.xml Wed Oct 30 21:46:34 2024 +0000 +++ b/macro_output.xml Wed Apr 16 05:46:58 2025 +0000 @@ -2,17 +2,23 @@ advanced['limit_model_outputs'] > 4 + not advanced['exit_after_msa'] advanced['limit_model_outputs'] > 3 + not advanced['exit_after_msa'] advanced['limit_model_outputs'] > 2 + not advanced['exit_after_msa'] advanced['limit_model_outputs'] > 1 + not advanced['exit_after_msa'] - + + not advanced['exit_after_msa'] + @@ -25,6 +31,7 @@ outputs['pae_csv'] model_preset != "monomer" advanced['limit_model_outputs'] > 4 + not advanced['exit_after_msa'] outputs['pae_csv'] model_preset != "monomer" advanced['limit_model_outputs'] > 3 + not advanced['exit_after_msa'] outputs['pae_csv'] model_preset != "monomer" advanced['limit_model_outputs'] > 2 + not advanced['exit_after_msa'] outputs['pae_csv'] model_preset != "monomer" advanced['limit_model_outputs'] > 1 + not advanced['exit_after_msa'] outputs['pae_csv'] model_preset != "monomer" + not advanced['exit_after_msa'] @@ -76,6 +87,7 @@ > outputs['model_pkls'] advanced['limit_model_outputs'] > 4 + not advanced['exit_after_msa'] outputs['model_pkls'] advanced['limit_model_outputs'] > 3 + not advanced['exit_after_msa'] outputs['model_pkls'] advanced['limit_model_outputs'] > 2 + not advanced['exit_after_msa'] outputs['model_pkls'] advanced['limit_model_outputs'] > 1 + not advanced['exit_after_msa'] outputs['model_pkls'] + not advanced['exit_after_msa'] @@ -123,6 +139,7 @@ > outputs['plots'] advanced['limit_model_outputs'] > 4 + not advanced['exit_after_msa'] outputs['plots'] advanced['limit_model_outputs'] > 3 + not advanced['exit_after_msa'] outputs['plots'] advanced['limit_model_outputs'] > 2 + not advanced['exit_after_msa'] outputs['plots'] advanced['limit_model_outputs'] > 1 + not advanced['exit_after_msa'] outputs['plots'] + not advanced['exit_after_msa'] @@ -169,6 +190,7 @@ label="${tool.name} on ${on_string}: Model confidence scores" > 
outputs['confidence_scores'] + not advanced['exit_after_msa'] @@ -180,6 +202,7 @@ label="${tool.name} on ${on_string}: MSA plot" > outputs['plot_msa'] + not advanced['exit_after_msa'] @@ -191,6 +214,7 @@ label="${tool.name} on ${on_string}: Per-residue confidence scores (plddts)" > outputs['plddts'] + not advanced['exit_after_msa'] @@ -202,6 +226,7 @@ label="${tool.name} on ${on_string}: relax_metrics_ranked.json" > outputs['relax_json'] + not advanced['exit_after_msa'] @@ -213,6 +238,19 @@ label="${tool.name} on ${on_string}: timings.json" > outputs['timings_json'] + not advanced['exit_after_msa'] + + + + + outputs['msa'] or advanced['exit_after_msa'] + + diff -r 3f188450ca4f -r 2891385d6ace scripts/outputs.py --- a/scripts/outputs.py Wed Oct 30 21:46:34 2024 +0000 +++ b/scripts/outputs.py Wed Apr 16 05:46:58 2025 +0000 @@ -20,12 +20,11 @@ import os import pickle as pk import shutil +import zipfile +from matplotlib import pyplot as plt from pathlib import Path from typing import Dict, List -from matplotlib import pyplot as plt - -# Output file paths OUTPUT_DIR = 'extra' OUTPUTS = { 'model_pkl': OUTPUT_DIR + '/ranked_{rank}.pkl', @@ -64,7 +63,6 @@ self.workdir = None self.output_confidence_scores = True self.output_residue_scores = False - self.is_multimer = False self.parse() def parse(self) -> None: @@ -100,6 +98,16 @@ help="Plot multiple-sequence alignment coverage as a heatmap", action="store_true", ) + parser.add_argument( + "--msa", + help="Collect multiple-sequence alignments as ZIP archives", + action="store_true", + ) + parser.add_argument( + "--msa_only", + help="Alphafold generated MSA files only - skip all other outputs", + action="store_true", + ) args = parser.parse_args() self.workdir = Path(args.workdir.rstrip('/')) self.output_residue_scores = args.confidence_scores @@ -107,8 +115,11 @@ self.output_model_plots = args.plot self.output_pae = args.pae self.plot_msa = args.plot_msa + self.collect_msas = args.msa self.model_preset = 
self._sniff_model_preset() + self.is_multimer = self.model_preset == PRESETS.multimer self.output_dir = self.workdir / OUTPUT_DIR + self.msa_only = args.msa_only os.makedirs(self.output_dir, exist_ok=True) def _sniff_model_preset(self) -> bool: @@ -120,13 +131,14 @@ if '_ptm_' in path.name: return PRESETS.monomer_ptm return PRESETS.monomer + return PRESETS.monomer class ExecutionContext: """Collect file paths etc.""" def __init__(self, settings: Settings): self.settings = settings - if settings.model_preset == PRESETS.multimer: + if settings.is_multimer: self.plddt_key = PLDDT_KEY.multimer else: self.plddt_key = PLDDT_KEY.monomer @@ -378,6 +390,53 @@ plt.close() +def collect_msas(settings: Settings): + """Collect MSA files into ZIP archive(s).""" + + def zip_dir(directory: Path, is_multimer: bool, name: str): + chain_id = directory.with_suffix('.zip').stem + msa_dir = settings.output_dir / 'msas' + msa_dir.mkdir(exist_ok=True) + zip_name = ( + f"MSA-{chain_id}-{name}.zip" + if is_multimer + else f"MSA-{name}.zip") + zip_path = msa_dir / zip_name + with zipfile.ZipFile(zip_path, 'w') as z: + for path in directory.glob('*'): + z.write(path, path.name) + + print("Collecting MSA archives...") + chain_names = get_input_sequence_ids( + settings.workdir.parent.parent / 'alphafold.fasta') + msa_dir = settings.workdir / 'msas' + is_multimer = (msa_dir / 'A').exists() + if is_multimer: + msa_dirs = sorted([ + path for path in msa_dir.glob('*') + if path.is_dir() + ]) + for i, path in enumerate(msa_dirs): + zip_dir(path, is_multimer, chain_names[i]) + else: + zip_dir(msa_dir, is_multimer, chain_names[0]) + + +def get_input_sequence_ids(fasta_file: Path) -> List[str]: + """Read headers from the input FASTA file. + Split them to get a sequence ID and truncate to 20 chars max. 
+ """ + headers = [] + for line in fasta_file.read_text().split('\n'): + if line.startswith('>'): + seq_id = line[1:].split(' ')[0] + seq_id_trunc = seq_id[:20].strip() + if len(seq_id) > 20: + seq_id_trunc += '...' + headers.append(seq_id_trunc) + return headers + + def template_html(context: ExecutionContext): """Template HTML file. @@ -397,24 +456,27 @@ def main(): """Parse output files and generate additional output files.""" settings = Settings() - context = ExecutionContext(settings) - ranking = ResultRanking(context) - write_confidence_scores(ranking, context) - rekey_relax_metrics(ranking, context) - template_html(context) + if not settings.msa_only: + context = ExecutionContext(settings) + ranking = ResultRanking(context) + write_confidence_scores(ranking, context) + rekey_relax_metrics(ranking, context) + template_html(context) - # Optional outputs - if settings.output_model_pkls: - rename_model_pkls(ranking, context) - if settings.output_model_plots: - plddt_pae_plots(ranking, context) - if settings.output_pae: - # Only created by monomer_ptm and multimer models - extract_pae_to_csv(ranking, context) - if settings.output_residue_scores: - write_per_residue_scores(ranking, context) - if settings.plot_msa: - plot_msa(context.settings.workdir) + # Optional outputs + if settings.output_model_pkls: + rename_model_pkls(ranking, context) + if settings.output_model_plots: + plddt_pae_plots(ranking, context) + if settings.output_pae: + # Only created by monomer_ptm and multimer models + extract_pae_to_csv(ranking, context) + if settings.output_residue_scores: + write_per_residue_scores(ranking, context) + if settings.plot_msa: + plot_msa(settings.workdir) + if settings.collect_msas or settings.msa_only: + collect_msas(settings) if __name__ == '__main__': diff -r 3f188450ca4f -r 2891385d6ace scripts/validate_fasta.py --- a/scripts/validate_fasta.py Wed Oct 30 21:46:34 2024 +0000 +++ b/scripts/validate_fasta.py Wed Apr 16 05:46:58 2025 +0000 @@ -12,7 +12,7 @@ class 
Fasta: def __init__(self, header_str: str, seq_str: str): self.header = header_str - self.aa_seq = seq_str + self.sequence = seq_str class FastaLoader: @@ -140,16 +140,16 @@ """Confirm whether sequence length is valid.""" fasta = self.fasta_list[0] if self.min_length: - if len(fasta.aa_seq) < self.min_length: + if len(fasta.sequence) < self.min_length: raise ValueError( 'Error encountered validating FASTA:\n Sequence too short' - f' ({len(fasta.aa_seq)}AA).' + f' ({len(fasta.sequence)}AA).' f' Minimum length is {self.min_length}AA.') if self.max_length: - if len(fasta.aa_seq) > self.max_length: + if len(fasta.sequence) > self.max_length: raise ValueError( 'Error encountered validating FASTA:\n' - f' Sequence too long ({len(fasta.aa_seq)}AA).' + f' Sequence too long ({len(fasta.sequence)}AA).' f' Maximum length is {self.max_length}AA.') def validate_alphabet(self): @@ -158,7 +158,7 @@ If not, report the offending character and its position. """ fasta = self.fasta_list[0] - for i, char in enumerate(fasta.aa_seq.upper()): + for i, char in enumerate(fasta.sequence.upper()): if char not in self.iupac_characters: raise ValueError( 'Error encountered validating FASTA:\n Invalid amino acid' @@ -167,7 +167,7 @@ def validate_x(self): """Check for X bases.""" fasta = self.fasta_list[0] - for i, char in enumerate(fasta.aa_seq.upper()): + for i, char in enumerate(fasta.sequence.upper()): if char == 'X': raise ValueError( 'Error encountered validating FASTA:\n Unsupported AA code' @@ -180,14 +180,14 @@ def write(self, fasta: Fasta): header = fasta.header - seq = self.format_sequence(fasta.aa_seq) + seq = self.format_sequence(fasta.sequence) sys.stdout.write(header + '\n') sys.stdout.write(seq) - def format_sequence(self, aa_seq: str): + def format_sequence(self, sequence: str): formatted_seq = '' - for i in range(0, len(aa_seq), self.line_wrap): - formatted_seq += aa_seq[i: i + self.line_wrap] + '\n' + for i in range(0, len(sequence), self.line_wrap): + formatted_seq += 
sequence[i: i + self.line_wrap] + '\n' return formatted_seq.upper() @@ -214,7 +214,7 @@ sys.stderr.write("Validated FASTA sequence(s):\n\n") for fas in clean_fastas: sys.stderr.write(fas.header + '\n') - sys.stderr.write(fas.aa_seq + '\n\n') + sys.stderr.write(fas.sequence + '\n\n') except ValueError as exc: sys.stderr.write(f"{exc}\n\n")