changeset 23:2891385d6ace draft default tip

planemo upload for repository https://github.com/usegalaxy-au/tools-au commit b347c6ccc82b14fcbff360b3357050d1d43e3ef5-dirty
author galaxy-australia
date Wed, 16 Apr 2025 05:46:58 +0000 (6 days ago)
parents 3f188450ca4f
children
files alphafold.xml macro_output.xml scripts/outputs.py scripts/validate_fasta.py
diffstat 4 files changed, 274 insertions(+), 58 deletions(-) [+]
line wrap: on
line diff
--- a/alphafold.xml	Wed Oct 30 21:46:34 2024 +0000
+++ b/alphafold.xml	Wed Apr 16 05:46:58 2025 +0000
@@ -3,7 +3,7 @@
     <macros>
       <token name="@TOOL_VERSION@">2.3.2</token>
       <token name="@TOOL_MINOR_VERSION@">2.3</token>
-      <token name="@VERSION_SUFFIX@">2</token>
+      <token name="@VERSION_SUFFIX@">3</token>
       <import>macro_output.xml</import>
       <import>macro_test_output.xml</import>
     </macros>
@@ -17,7 +17,7 @@
       <xref type="bio.tools">alphafold_2</xref>
     </xrefs>
     <requirements>
-        <container type="docker">neoformit/alphafold:v2.3.2_0</container>
+        <container type="docker">neoformit/alphafold:v2.3.2_2</container>
     </requirements>
     <required_files>
         <include path="scripts/outputs.py" />
@@ -52,6 +52,23 @@
 #end if
 > alphafold.fasta
 
+## Read MSA input -------------------------------------------------------------
+
+#if $advanced.reuse_msa.selected and $advanced.reuse_msa.msas:
+    #for msa in $advanced.reuse_msa.msas:
+        #if $model_preset.selection == 'multimer':
+        && MSA_DIR=output/alphafold/msas/${msa.chain}/
+        #else
+        && MSA_DIR=output/alphafold/msas/
+        #end if
+        && mkdir -p \$MSA_DIR
+        && ln -s '$msa.file' ${msa.chain}.zip
+        && unzip ${msa.chain}.zip -d \$MSA_DIR
+        && rm ${msa.chain}.zip
+    #end for
+#end if
+
+
 ## Env vars -------------------------------------------------------------------
 && export TF_FORCE_UNIFIED_MEMORY=1
 && export XLA_PYTHON_CLIENT_MEM_FRACTION=4.0
@@ -61,7 +78,7 @@
 #if os.environ.get('PLANEMO_TESTING'):
     ## Run in testing mode (mocks a successful AlphaFold run by copying outputs)
     && echo "Creating dummy outputs for model_preset=$model_preset.selection..."
-    && bash '$__tool_directory__/scripts/mock_alphafold.sh' $model_preset
+    && bash '$__tool_directory__/scripts/mock_alphafold.sh' $model_preset.selection
 #else:
     ## Run AlphaFold
     && python /app/alphafold/run_alphafold.py
@@ -99,15 +116,18 @@
         --pdb70_database_path \${ALPHAFOLD_DB:-/data}/@TOOL_MINOR_VERSION@/pdb70/pdb70
         #end if
 
+        #if $advanced.reuse_msa.selected and $advanced.reuse_msa.msas:
+        --use_precomputed_msas
+        #end if
+
         ## Galaxy-specific options --------------------------------------------
         ## See https://github.com/neoformit/alphafold/tree/release_2.3.2_galaxy
-        #if $advanced.disable_amber_relax:
-        --disable_amber_relax
-        #end if
-
         #if $advanced.limit_model_outputs:
         --output_models=$limit_model_outputs
         #end if
+
+        $advanced.disable_amber_relax
+        $advanced.exit_after_msa
         ## End Galaxy-specific options ----------------------------------------
 
 #end if
@@ -119,7 +139,10 @@
 $outputs.pae_csv
 $outputs.plots
 $outputs.plot_msa
+$outputs.msa
+$advanced.exit_after_msa
 
+#if not $advanced.exit_after_msa:
 ## HTML output
 && mkdir -p '${ html.files_path }'
 && cp output/alphafold/extra/alphafold.html '${html}'
@@ -129,8 +152,17 @@
 ## The working directory ends up two levels deep and the visualization html page
 ## fails to load the PDB files as static assets.
 && (([ -d working ] && cp -r working/* .) || true)
+#end if
 
     ]]></command>
+    <stdio>
+        <regex
+            match="concatenation axis must match exactly"
+            source="stderr"
+            level="fatal"
+            description="This error usually indicates that you are re-using an MSA that does not match the query sequence at the specified chain (e.g. position A should match the first sequence in the FASTA file)."
+        />
+    </stdio>
     <inputs>
         <conditional name="fasta_or_text">
             <param name="input_mode" type="select" label="Fasta Input" help="Protein sequence(s) to fold. Input can be fasta file from history, or text. Sequence must be valid IUPAC amino acid characters. We recommend submitting sequences with a maximum length of 3000AA, because run time scales exponentially with sequence length. If multiple-sequence FASTA file provided, multimer mode must be selected.">
@@ -158,20 +190,20 @@
         </param>
 
         <conditional name="model_preset">
-        <param
+            <param
                 name="selection"
-            type="select"
-            label="Model preset"
-            help="Select which prediction model to run. The monomer model is the most accurate for single protein prediction. The multimer model allows prediction of protein complexes."
-        >
-            <option value="monomer" selected="true">monomer - default prediction model</option>
-            <option value="monomer_ptm">
-                monomer_ptm - slightly less accurate version of the monomer model, but provides a pairwise alignment error (PAE) matrix
-            </option>
-            <option value="multimer">
-                multimer - model a protein complex (requires multi-sequence FASTA input)
-            </option>
-        </param>
+                type="select"
+                label="Model preset"
+                help="Select which prediction model to run. The monomer model is the most accurate for single protein prediction. The multimer model allows prediction of protein complexes."
+            >
+                <option value="monomer" selected="true">monomer - default prediction model</option>
+                <option value="monomer_ptm">
+                    monomer_ptm - slightly less accurate version of the monomer model, but provides a pairwise alignment error (PAE) matrix
+                </option>
+                <option value="multimer">
+                    multimer - model a protein complex (requires multi-sequence FASTA input)
+                </option>
+            </param>
             <when value="monomer"></when>
             <when value="monomer_ptm"></when>
             <when value="multimer">
@@ -187,6 +219,7 @@
             </when>
         </conditional>
 
+
         <section name="advanced" title="Advanced options" expanded="false">
             <param
                 name="max_template_date"
@@ -209,6 +242,8 @@
                 label="Disable Amber relaxation"
                 value="false"
                 optional="true"
+                truevalue="--disable_amber_relax"
+                falsevalue=""
                 help="Amber relaxation can be disabled to speed up processing time. Amber relaxation is used to refine predicted structures by removing stereochemical violations, resulting in more accurate prediction of side-chain geometry. Disabling this option with large proteins may lead to artefacts in the predicted structure. Disabling amber relax will result in the unrelaxed models being collected as PDB outputs."
             />
 
@@ -221,13 +256,59 @@
                 min="1"
                 max="5"
             />
+
+            <param
+                name="exit_after_msa"
+                type="boolean"
+                label="MSA generation only"
+                value="false"
+                optional="true"
+                truevalue="--msa_only"
+                falsevalue=""
+                help="If you only want to collect MSAs, this option will exit early. MSAs will be collected and no prediction will be made."
+            />
+
+            <conditional name="reuse_msa">
+                <param
+                    name="selected"
+                    type="boolean"
+                    checked="false"
+                    label="Reuse MSAs"
+                    help="Reuse the multiple sequence alignments (MSAs) from a previous AlphaFold run. This can be useful if you are modelling the same protein sequence(s) in numerous runs (as a different multimer complex, for example), as it eliminates redundant processing and speeds up the run time. MSA collection can be enabled below under &quot;Optional outputs&quot;."
+                />
+                <when value="true">
+                    <repeat name="msas" title="MSA archive(s)" help="The order of the MSAs provided here should match the order of your FASTA sequences. e.g. if you have an 'MSA-A' (collected from a previous job), and your first FASTA sequence matches the first sequence in the original job, then you should select that MSA and label it as chain 'A'. If the MSA matches the second sequence in your current FASTA file, you should label it as 'chain B', and so on.">
+                        <param name="chain" type="select" label="Chain" help="Which chain in your query FASTA does this MSA correspond to? Note that the letter denotes the position in your FASTA input e.g. 'A' would be the first sequence.">
+                            <option value="A">A</option>
+                            <option value="B">B</option>
+                            <option value="C">C</option>
+                            <option value="D">D</option>
+                            <option value="E">E</option>
+                            <option value="F">F</option>
+                            <option value="G">G</option>
+                            <option value="H">H</option>
+                            <option value="I">I</option>
+                            <option value="J">J</option>
+                        </param>
+                        <param
+                            name="file"
+                            type="data"
+                            multiple="false"
+                            format="zip"
+                            label="MSA ZIP archive"
+                            optional="false"
+                            help="ZIP archive extracted from a previous AlphaFold2 job. HINT - click the breadcrumbs to pick a dataset from within a collection."
+                        />
+                    </repeat>
+                </when>
+            </conditional>
         </section>
 
         <section name="outputs" title="Optional outputs" expanded="false">
             <param
                 name="plots"
                 type="boolean"
-                checked="false"
+                checked="true"
                 truevalue="--plot"
                 falsevalue=""
                 label="pLDDT and PAE matrix plots (per model)"
@@ -290,12 +371,23 @@
                 label="timings.json"
                 help="A JSON file with timings reported for each phase of the AlphaFold run."
             />
+            <param
+                name="msa"
+                type="boolean"
+                checked="false"
+                truevalue="--msa"
+                falsevalue=""
+                label="Multiple sequence alignments (MSAs)"
+                help="A ZIP archive of multiple sequence alignments which can be re-used in subsequent AlphaFold runs for increased efficiency. This is only useful if you will be modelling the same protein sequence again (in a different multimer complex, for example). For multimer runs, a ZIP archive will be created for each protein sequence input. The datasets will be labelled as an alphabetical sequence (e.g. A, B, C, ...) corresponding to the order of the FASTA sequence input."
+            />
         </section>
     </inputs>
 
     <outputs>
         <expand macro="output_pdb_models" />
-        <data name="html" format="html" label="${tool.name} on ${on_string}: Visualization" />
+        <data name="html" format="html" label="${tool.name} on ${on_string}: Visualization">
+            <filter>not advanced['exit_after_msa']</filter>
+        </data>
         <!-- Optional outputs -->
         <expand macro="output_plddts" />
         <expand macro="output_msa_plot" />
@@ -305,6 +397,7 @@
         <expand macro="output_plots" />
         <expand macro="output_relax_json" />
         <expand macro="output_timings_json" />
+        <expand macro="output_msa" />
     </outputs>
 
     <tests>
@@ -417,7 +510,7 @@
     |
 
 
-    **Input**
+    **Inputs**
 
     *Amino acid sequence*
 
@@ -502,6 +595,29 @@
     |
     |
 
+    *MSAs (optional)*
+
+    | A collection of multiple sequence alignments (MSAs) in ZIP format.
+    | For each sequence in the input FASTA file, a separate ZIP archive will be created and labelled in alphabetical sequence (e.g. A, B, C) with respect to the position of the sequence in the query FASTA file.
+    | If "MSA generation only" is selected, this output will be collected automatically.
+    |
+    |
+
+    **Advanced features**
+
+    *Reusing MSAs*
+
+    | You can now re-use multiple sequence alignments (MSAs) from a previous AlphaFold run to speed up processing time. This is only useful if you wish to compute models for the same protein multiple times, for example in a one-to-many series of multimers. In these cases, the MSA for a given protein chain can be computed once and then re-used in subsequent jobs. To do this, you must begin by enabling the "Multiple sequence alignments (MSAs)" output in the "Optional outputs" section. This should produce a collection of ZIP archives as an output.
+    |
+    | Once you have a collection of MSAs that you can re-use, you can then select "Reuse MSAs" in the "Advanced options" section. This will allow you to select the MSA ZIP archive(s) from your history. It is important that you select an MSA archive corresponding to a specific chain in your input FASTA file. If your MSA archive relates to the first chain in your FASTA file, you should select position "A". If your MSA archive relates to the second chain in your FASTA file, you should select position "B", and so on. Note that an MSA archive labelled MSA-A corresponds to the sequence order in the generating run. This does not necessarily correspond to the current run! Be sure to check the order of your sequences/chains. To avoid confusion here, you could re-label the MSA archives in your History to match the name of the protein/chain they were generated for.
+    |
+
+    *MSA-only mode*
+
+    | To speed up generation of MSAs, you can run AlphaFold in "MSA generation only" mode, available under "Advanced options". Use this mode if you only want to obtain MSAs and don't want to waste time computing a model. If you have several sequences that you want to generate MSAs for, you can put them all into one FASTA file and submit them as an MSA-only job in multimer mode. This will result in a collection of MSA archives - one for each sequence provided. To make things less confusing when you come to reuse these MSAs, you may wish to rename each archive in the collection to match the corresponding protein/chain in your input FASTA file - this makes it easy to select the right one when you come to reuse them in a later job!
+    |
+    |
+
     **AlphaFold configuration**
 
     | We have configured AlphaFold to run with the parameters suggested by default on `AlphaFold's GitHub <https://github.com/deepmind/alphafold>`_.
--- a/macro_output.xml	Wed Oct 30 21:46:34 2024 +0000
+++ b/macro_output.xml	Wed Apr 16 05:46:58 2025 +0000
@@ -2,17 +2,23 @@
     <xml name="output_pdb_models">
         <data name="model5" format="pdb" from_work_dir="output/alphafold/ranked_4.pdb" label="${tool.name} on ${on_string}: PDB ranked 4">
             <filter>advanced['limit_model_outputs'] > 4</filter>
+            <filter>not advanced['exit_after_msa']</filter>
         </data>
         <data name="model4" format="pdb" from_work_dir="output/alphafold/ranked_3.pdb" label="${tool.name} on ${on_string}: PDB ranked 3">
             <filter>advanced['limit_model_outputs'] > 3</filter>
+            <filter>not advanced['exit_after_msa']</filter>
         </data>
         <data name="model3" format="pdb" from_work_dir="output/alphafold/ranked_2.pdb" label="${tool.name} on ${on_string}: PDB ranked 2">
             <filter>advanced['limit_model_outputs'] > 2</filter>
+            <filter>not advanced['exit_after_msa']</filter>
         </data>
         <data name="model2" format="pdb" from_work_dir="output/alphafold/ranked_1.pdb" label="${tool.name} on ${on_string}: PDB ranked 1">
             <filter>advanced['limit_model_outputs'] > 1</filter>
+            <filter>not advanced['exit_after_msa']</filter>
         </data>
-        <data name="model1" format="pdb" from_work_dir="output/alphafold/ranked_0.pdb" label="${tool.name} on ${on_string}: PDB ranked 0"/>
+        <data name="model1" format="pdb" from_work_dir="output/alphafold/ranked_0.pdb" label="${tool.name} on ${on_string}: PDB ranked 0">
+            <filter>not advanced['exit_after_msa']</filter>
+        </data>
     </xml>
 
     <xml name="output_pae_csv">
@@ -25,6 +31,7 @@
             <filter>outputs['pae_csv']</filter>
             <filter>model_preset != "monomer"</filter>
             <filter>advanced['limit_model_outputs'] > 4</filter>
+            <filter>not advanced['exit_after_msa']</filter>
         </data>
         <data
             name="pae_ranked_3"
@@ -35,6 +42,7 @@
             <filter>outputs['pae_csv']</filter>
             <filter>model_preset != "monomer"</filter>
             <filter>advanced['limit_model_outputs'] > 3</filter>
+            <filter>not advanced['exit_after_msa']</filter>
         </data>
         <data
             name="pae_ranked_2"
@@ -45,6 +53,7 @@
             <filter>outputs['pae_csv']</filter>
             <filter>model_preset != "monomer"</filter>
             <filter>advanced['limit_model_outputs'] > 2</filter>
+            <filter>not advanced['exit_after_msa']</filter>
         </data>
         <data
             name="pae_ranked_1"
@@ -55,6 +64,7 @@
             <filter>outputs['pae_csv']</filter>
             <filter>model_preset != "monomer"</filter>
             <filter>advanced['limit_model_outputs'] > 1</filter>
+            <filter>not advanced['exit_after_msa']</filter>
         </data>
         <data
             name="pae_ranked_0"
@@ -64,6 +74,7 @@
         >
             <filter>outputs['pae_csv']</filter>
             <filter>model_preset != "monomer"</filter>
+            <filter>not advanced['exit_after_msa']</filter>
         </data>
     </xml>
 
@@ -76,6 +87,7 @@
         >
             <filter>outputs['model_pkls']</filter>
             <filter>advanced['limit_model_outputs'] > 4</filter>
+            <filter>not advanced['exit_after_msa']</filter>
         </data>
         <data
             name="output_ranked_3_pkl"
@@ -85,6 +97,7 @@
         >
             <filter>outputs['model_pkls']</filter>
             <filter>advanced['limit_model_outputs'] > 3</filter>
+            <filter>not advanced['exit_after_msa']</filter>
         </data>
         <data
             name="output_ranked_2_pkl"
@@ -94,6 +107,7 @@
         >
             <filter>outputs['model_pkls']</filter>
             <filter>advanced['limit_model_outputs'] > 2</filter>
+            <filter>not advanced['exit_after_msa']</filter>
         </data>
         <data
             name="output_ranked_1_pkl"
@@ -103,6 +117,7 @@
         >
             <filter>outputs['model_pkls']</filter>
             <filter>advanced['limit_model_outputs'] > 1</filter>
+            <filter>not advanced['exit_after_msa']</filter>
         </data>
         <data
             name="output_ranked_0_pkl"
@@ -111,6 +126,7 @@
             label="${tool.name} on ${on_string}: ranked_0.pkl"
         >
             <filter>outputs['model_pkls']</filter>
+            <filter>not advanced['exit_after_msa']</filter>
         </data>
     </xml>
 
@@ -123,6 +139,7 @@
         >
             <filter>outputs['plots']</filter>
             <filter>advanced['limit_model_outputs'] > 4</filter>
+            <filter>not advanced['exit_after_msa']</filter>
         </data>
         <data
             name="plot_ranked_3"
@@ -132,6 +149,7 @@
         >
             <filter>outputs['plots']</filter>
             <filter>advanced['limit_model_outputs'] > 3</filter>
+            <filter>not advanced['exit_after_msa']</filter>
         </data>
         <data
             name="plot_ranked_2"
@@ -141,6 +159,7 @@
         >
             <filter>outputs['plots']</filter>
             <filter>advanced['limit_model_outputs'] > 2</filter>
+            <filter>not advanced['exit_after_msa']</filter>
         </data>
         <data
             name="plot_ranked_1"
@@ -150,6 +169,7 @@
         >
             <filter>outputs['plots']</filter>
             <filter>advanced['limit_model_outputs'] > 1</filter>
+            <filter>not advanced['exit_after_msa']</filter>
         </data>
         <data
             name="plot_ranked_0"
@@ -158,6 +178,7 @@
             label="${tool.name} on ${on_string}: pLDDT/PAE plot ranked 0"
         >
             <filter>outputs['plots']</filter>
+            <filter>not advanced['exit_after_msa']</filter>
         </data>
     </xml>
 
@@ -169,6 +190,7 @@
             label="${tool.name} on ${on_string}: Model confidence scores"
         >
             <filter>outputs['confidence_scores']</filter>
+            <filter>not advanced['exit_after_msa']</filter>
         </data>
     </xml>
 
@@ -180,6 +202,7 @@
             label="${tool.name} on ${on_string}: MSA plot"
         >
             <filter>outputs['plot_msa']</filter>
+            <filter>not advanced['exit_after_msa']</filter>
         </data>
     </xml>
 
@@ -191,6 +214,7 @@
             label="${tool.name} on ${on_string}: Per-residue confidence scores (plddts)"
         >
             <filter>outputs['plddts']</filter>
+            <filter>not advanced['exit_after_msa']</filter>
         </data>
     </xml>
 
@@ -202,6 +226,7 @@
             label="${tool.name} on ${on_string}: relax_metrics_ranked.json"
         >
             <filter>outputs['relax_json']</filter>
+            <filter>not advanced['exit_after_msa']</filter>
         </data>
     </xml>
 
@@ -213,6 +238,19 @@
             label="${tool.name} on ${on_string}: timings.json"
         >
             <filter>outputs['timings_json']</filter>
+            <filter>not advanced['exit_after_msa']</filter>
         </data>
     </xml>
+
+    <xml name="output_msa">
+        <collection name="output_msa" type="list" label="${tool.name} on ${on_string}: MSAs">
+            <discover_datasets
+                pattern="__designation_and_ext__"
+                directory="output/alphafold/extra/msas"
+                format="zip"
+                visible="false"
+            />
+            <filter>outputs['msa'] or advanced['exit_after_msa']</filter>
+        </collection>
+    </xml>
 </macros>
--- a/scripts/outputs.py	Wed Oct 30 21:46:34 2024 +0000
+++ b/scripts/outputs.py	Wed Apr 16 05:46:58 2025 +0000
@@ -20,12 +20,11 @@
 import os
 import pickle as pk
 import shutil
+import zipfile
+from matplotlib import pyplot as plt
 from pathlib import Path
 from typing import Dict, List
 
-from matplotlib import pyplot as plt
-
-# Output file paths
 OUTPUT_DIR = 'extra'
 OUTPUTS = {
     'model_pkl': OUTPUT_DIR + '/ranked_{rank}.pkl',
@@ -64,7 +63,6 @@
         self.workdir = None
         self.output_confidence_scores = True
         self.output_residue_scores = False
-        self.is_multimer = False
         self.parse()
 
     def parse(self) -> None:
@@ -100,6 +98,16 @@
             help="Plot multiple-sequence alignment coverage as a heatmap",
             action="store_true",
         )
+        parser.add_argument(
+            "--msa",
+            help="Collect multiple-sequence alignments as ZIP archives",
+            action="store_true",
+        )
+        parser.add_argument(
+            "--msa_only",
+            help="Alphafold generated MSA files only - skip all other outputs",
+            action="store_true",
+        )
         args = parser.parse_args()
         self.workdir = Path(args.workdir.rstrip('/'))
         self.output_residue_scores = args.confidence_scores
@@ -107,8 +115,11 @@
         self.output_model_plots = args.plot
         self.output_pae = args.pae
         self.plot_msa = args.plot_msa
+        self.collect_msas = args.msa
         self.model_preset = self._sniff_model_preset()
+        self.is_multimer = self.model_preset == PRESETS.multimer
         self.output_dir = self.workdir / OUTPUT_DIR
+        self.msa_only = args.msa_only
         os.makedirs(self.output_dir, exist_ok=True)
 
     def _sniff_model_preset(self) -> bool:
@@ -120,13 +131,14 @@
                 if '_ptm_' in path.name:
                     return PRESETS.monomer_ptm
                 return PRESETS.monomer
+        return PRESETS.monomer
 
 
 class ExecutionContext:
     """Collect file paths etc."""
     def __init__(self, settings: Settings):
         self.settings = settings
-        if settings.model_preset == PRESETS.multimer:
+        if settings.is_multimer:
             self.plddt_key = PLDDT_KEY.multimer
         else:
             self.plddt_key = PLDDT_KEY.monomer
@@ -378,6 +390,53 @@
     plt.close()
 
 
+def collect_msas(settings: Settings):
+    """Zip the MSA files found under <workdir>/msas into <output_dir>/msas (one archive per chain dir, or a single archive)."""
+
+    def zip_dir(directory: Path, is_multimer: bool, name: str):
+        chain_id = directory.with_suffix('.zip').stem  # last path component minus any suffix, e.g. 'A'
+        msa_dir = settings.output_dir / 'msas'
+        msa_dir.mkdir(exist_ok=True)  # no parents=True: output_dir itself must already exist
+        zip_name = (
+            f"MSA-{chain_id}-{name}.zip"  # NOTE(review): name comes from a FASTA header and is not sanitised here
+            if is_multimer
+            else f"MSA-{name}.zip")
+        zip_path = msa_dir / zip_name
+        with zipfile.ZipFile(zip_path, 'w') as z:
+            for path in directory.glob('*'):  # direct children only (non-recursive)
+                z.write(path, path.name)  # stored flat under the bare file name
+
+    print("Collecting MSA archives...")
+    chain_names = get_input_sequence_ids(
+        settings.workdir.parent.parent / 'alphafold.fasta')  # FASTA is read from two levels above workdir
+    msa_dir = settings.workdir / 'msas'
+    is_multimer = (msa_dir / 'A').exists()  # an 'A' entry under msas/ is taken to mean per-chain subdirs
+    if is_multimer:
+        msa_dirs = sorted([
+            path for path in msa_dir.glob('*')
+            if path.is_dir()
+        ])
+        for i, path in enumerate(msa_dirs):  # sorted dirs are paired with FASTA headers by position
+            zip_dir(path, is_multimer, chain_names[i])  # NOTE(review): IndexError if fewer headers than chain dirs
+    else:
+        zip_dir(msa_dir, is_multimer, chain_names[0])  # NOTE(review): IndexError if the FASTA has no '>' header
+
+
+def get_input_sequence_ids(fasta_file: Path) -> List[str]:
+    """Return one ID per '>' header line of the FASTA file, in file order.
+    An ID is the header text before the first space; IDs over 20 chars are cut to 20 and get '...' appended.
+    """
+    headers = []
+    for line in fasta_file.read_text().split('\n'):
+        if line.startswith('>'):
+            seq_id = line[1:].split(' ')[0]  # drop '>' and keep the text before the first space
+            seq_id_trunc = seq_id[:20].strip()  # NOTE(review): empty when the header is '>' alone or '> ...'
+            if len(seq_id) > 20:
+                seq_id_trunc += '...'  # marks a truncated ID; result may be up to 23 chars
+            headers.append(seq_id_trunc)
+    return headers
+
+
 def template_html(context: ExecutionContext):
     """Template HTML file.
 
@@ -397,24 +456,27 @@
 def main():
     """Parse output files and generate additional output files."""
     settings = Settings()
-    context = ExecutionContext(settings)
-    ranking = ResultRanking(context)
-    write_confidence_scores(ranking, context)
-    rekey_relax_metrics(ranking, context)
-    template_html(context)
+    if not settings.msa_only:
+        context = ExecutionContext(settings)
+        ranking = ResultRanking(context)
+        write_confidence_scores(ranking, context)
+        rekey_relax_metrics(ranking, context)
+        template_html(context)
 
-    # Optional outputs
-    if settings.output_model_pkls:
-        rename_model_pkls(ranking, context)
-    if settings.output_model_plots:
-        plddt_pae_plots(ranking, context)
-    if settings.output_pae:
-        # Only created by monomer_ptm and multimer models
-        extract_pae_to_csv(ranking, context)
-    if settings.output_residue_scores:
-        write_per_residue_scores(ranking, context)
-    if settings.plot_msa:
-        plot_msa(context.settings.workdir)
+        # Optional outputs
+        if settings.output_model_pkls:
+            rename_model_pkls(ranking, context)
+        if settings.output_model_plots:
+            plddt_pae_plots(ranking, context)
+        if settings.output_pae:
+            # Only created by monomer_ptm and multimer models
+            extract_pae_to_csv(ranking, context)
+        if settings.output_residue_scores:
+            write_per_residue_scores(ranking, context)
+        if settings.plot_msa:
+            plot_msa(settings.workdir)
+    if settings.collect_msas or settings.msa_only:
+        collect_msas(settings)
 
 
 if __name__ == '__main__':
--- a/scripts/validate_fasta.py	Wed Oct 30 21:46:34 2024 +0000
+++ b/scripts/validate_fasta.py	Wed Apr 16 05:46:58 2025 +0000
@@ -12,7 +12,7 @@
 class Fasta:
     def __init__(self, header_str: str, seq_str: str):
         self.header = header_str
-        self.aa_seq = seq_str
+        self.sequence = seq_str
 
 
 class FastaLoader:
@@ -140,16 +140,16 @@
         """Confirm whether sequence length is valid."""
         fasta = self.fasta_list[0]
         if self.min_length:
-            if len(fasta.aa_seq) < self.min_length:
+            if len(fasta.sequence) < self.min_length:
                 raise ValueError(
                     'Error encountered validating FASTA:\n Sequence too short'
-                    f' ({len(fasta.aa_seq)}AA).'
+                    f' ({len(fasta.sequence)}AA).'
                     f' Minimum length is {self.min_length}AA.')
         if self.max_length:
-            if len(fasta.aa_seq) > self.max_length:
+            if len(fasta.sequence) > self.max_length:
                 raise ValueError(
                     'Error encountered validating FASTA:\n'
-                    f' Sequence too long ({len(fasta.aa_seq)}AA).'
+                    f' Sequence too long ({len(fasta.sequence)}AA).'
                     f' Maximum length is {self.max_length}AA.')
 
     def validate_alphabet(self):
@@ -158,7 +158,7 @@
         If not, report the offending character and its position.
         """
         fasta = self.fasta_list[0]
-        for i, char in enumerate(fasta.aa_seq.upper()):
+        for i, char in enumerate(fasta.sequence.upper()):
             if char not in self.iupac_characters:
                 raise ValueError(
                     'Error encountered validating FASTA:\n Invalid amino acid'
@@ -167,7 +167,7 @@
     def validate_x(self):
         """Check for X bases."""
         fasta = self.fasta_list[0]
-        for i, char in enumerate(fasta.aa_seq.upper()):
+        for i, char in enumerate(fasta.sequence.upper()):
             if char == 'X':
                 raise ValueError(
                     'Error encountered validating FASTA:\n Unsupported AA code'
@@ -180,14 +180,14 @@
 
     def write(self, fasta: Fasta):
         header = fasta.header
-        seq = self.format_sequence(fasta.aa_seq)
+        seq = self.format_sequence(fasta.sequence)
         sys.stdout.write(header + '\n')
         sys.stdout.write(seq)
 
-    def format_sequence(self, aa_seq: str):
+    def format_sequence(self, sequence: str):
         formatted_seq = ''
-        for i in range(0, len(aa_seq), self.line_wrap):
-            formatted_seq += aa_seq[i: i + self.line_wrap] + '\n'
+        for i in range(0, len(sequence), self.line_wrap):
+            formatted_seq += sequence[i: i + self.line_wrap] + '\n'
         return formatted_seq.upper()
 
 
@@ -214,7 +214,7 @@
         sys.stderr.write("Validated FASTA sequence(s):\n\n")
         for fas in clean_fastas:
             sys.stderr.write(fas.header + '\n')
-            sys.stderr.write(fas.aa_seq + '\n\n')
+            sys.stderr.write(fas.sequence + '\n\n')
 
     except ValueError as exc:
         sys.stderr.write(f"{exc}\n\n")