Mercurial > repos > galaxy-australia > alphafold2

--- a/README.rst	Wed Oct 12 22:25:20 2022 +0000
+++ b/README.rst	Tue Feb 28 01:15:42 2023 +0000
@@ -75,27 +75,41 @@
 ~~~~~~~~~~~~~~

 Alphafold needs reference data to run. The wrapper expects this data to
-be present at ``/data/alphafold_databases``. A custom DB root can be read from
-the ALPHAFOLD_DB environment variable, if set. To download the AlphaFold,
-reference data, run the following shell script command in the tool directory.
+be present at ``/data/alphafold_databases``. A custom path will be read from
+the ``ALPHAFOLD_DB`` environment variable, if set.
+
+To download the AlphaFold reference DBs:

 ::

-   # Set databases root
-   ALPHAFOLD_DB_ROOT=/data/alphafold_databases
+   # Set your AlphaFold DB path
+   ALPHAFOLD_DB=/data/alphafold_databases
+
+   # Set your target AlphaFold version
+   ALPHAFOLD_VERSION=  # e.g. 2.1.2
+
+   # Download repo
+   wget https://github.com/deepmind/alphafold/archive/refs/tags/v${ALPHAFOLD_VERSION}.tar.gz
+   tar xzf v${ALPHAFOLD_VERSION}.tar.gz

-   # make folders if needed
-   mkdir -p $ALPHAFOLD_DB_ROOT
+   # Ensure dirs
+   mkdir -p $ALPHAFOLD_DB

-   # download ref data
-   bash scripts/download_all_data.sh $ALPHAFOLD_DB_ROOT
+   # Download
+   bash alphafold*/scripts/download_all_data.sh $ALPHAFOLD_DB

-This will install the reference data to ``/data/alphafold_databases``.
+You will most likely want to run this as a background job, as it will take a
+very long time (7+ days in Australia).
+
+This will install the reference data to your ``$ALPHAFOLD_DB``.
 To check this has worked, ensure the final folder structure is as
 follows:

 ::

+   # NOTE: this structure will change between minor AlphaFold versions
+   # The tree shown below was updated for v2.3.1
+
    data/alphafold_databases
    ├── bfd
    │   ├── bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_a3m.ffdata
@@ -105,18 +119,23 @@
    │   ├── bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_hhm.ffdata
    │   └── bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_hhm.ffindex
    ├── mgnify
-   │   └── mgy_clusters_2018_12.fa
+   │   └── mgy_clusters_2022_05.fa
    ├── params
    │   ├── LICENSE
    │   ├── params_model_1.npz
+   │   ├── params_model_1_multimer_v3.npz
    │   ├── params_model_1_ptm.npz
    │   ├── params_model_2.npz
+   │   ├── params_model_2_multimer_v3.npz
    │   ├── params_model_2_ptm.npz
    │   ├── params_model_3.npz
+   │   ├── params_model_3_multimer_v3.npz
    │   ├── params_model_3_ptm.npz
    │   ├── params_model_4.npz
+   │   ├── params_model_4_multimer_v3.npz
    │   ├── params_model_4_ptm.npz
    │   ├── params_model_5.npz
+   │   ├── params_model_5_multimer_v3.npz
    │   └── params_model_5_ptm.npz
    ├── pdb70
    │   ├── md5sum
@@ -131,10 +150,20 @@
    ├── pdb_mmcif
    │   ├── mmcif_files
    │   └── obsolete.dat
-   ├── uniclust30
-   │   └── uniclust30_2018_08
+   ├── pdb_seqres
+   │   └── pdb_seqres.txt
+   ├── uniprot
+   │   └── uniprot.fasta
+   ├── uniref30
+   │   ├── UniRef30_2021_03.md5sums
+   │   ├── UniRef30_2021_03_a3m.ffdata
+   │   ├── UniRef30_2021_03_a3m.ffindex
+   │   ├── UniRef30_2021_03_cs219.ffdata
+   │   ├── UniRef30_2021_03_cs219.ffindex
+   │   ├── UniRef30_2021_03_hhm.ffdata
+   │   └── UniRef30_2021_03_hhm.ffindex
    └── uniref90
-       └── uniref90.fasta
+      └── uniref90.fasta

 In more recent releases of the AlphaFold tool, you will need to download an
 additional file to allow the ``reduced_dbs`` option:
@@ -152,6 +181,25 @@
    │   └── bfd-first_non_consensus_sequences.fasta


+**Upgrading database versions**
+
+When upgrading to a new minor version of AlphaFold, you will most likely have to
+upgrade the reference database. This can be a pain, due to the size of the
+databases and the obscurity around what has changed. The simplest way to do
+this is to simply create a new directory and download the DBs from scratch.
+However, you can save a considerable amount of time by downloading only the
+components that have changed.
+
+If you wish to continue hosting prior versions of the tool, you must maintain
+the reference DBs for each version. The ``ALPHAFOLD_DB`` environment variable
+must then be set respectively for each tool version in your job conf (on Galaxy
+AU this is currently `configured with TPV <https://github.com/usegalaxy-au/infrastructure/blob/master/files/galaxy/dynamic_job_rules/production/total_perspective_vortex/tools.yml#L1515-L1554>`_).
+
+To minimize redundancy between DB versions, we have symlinked the database
+components that are unchanged between versions. In ``v2.1.2 -> v2.3.1`` the BFD
+database is the only component that is persistent, but it is by far the
+largest on disk.
+

 JOB DESTINATION
 ~~~~~~~~~~~~~~~
--- a/alphafold.html	Wed Oct 12 22:25:20 2022 +0000
+++ b/alphafold.html	Tue Feb 28 01:15:42 2023 +0000
@@ -336,26 +336,26 @@
       <div class="flex col controls">
         <div class="box text-center">
           <h3> Select model </h3>
-          <p>The top five structures predicted by Alphafold</p>
+          <p>The top-ranked structures predicted by Alphafold</p>
           <div>
             <button class="btn selected" id="btn-ranked_0" onclick="setModel(0);">
-              Model 1
+              Ranked 0
             </button>

             <button class="btn" id="btn-ranked_1" onclick="setModel(1);">
-              Model 2
+              Ranked 1
             </button>

             <button class="btn" id="btn-ranked_2" onclick="setModel(2);">
-              Model 3
+              Ranked 2
             </button>

             <button class="btn" id="btn-ranked_3" onclick="setModel(3);">
-              Model 4
+              Ranked 3
             </button>

             <button class="btn" id="btn-ranked_4" onclick="setModel(4);">
-              Model 5
+              Ranked 4
             </button>
           </div>
         </div>
--- a/alphafold.xml	Wed Oct 12 22:25:20 2022 +0000
+++ b/alphafold.xml	Tue Feb 28 01:15:42 2023 +0000
@@ -1,8 +1,8 @@
 <tool id="alphafold" name="Alphafold 2" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="20.01">
     <description> - AI-guided 3D structural prediction of proteins</description>
     <macros>
-      <token name="@TOOL_VERSION@">2.1.2</token>
-      <token name="@VERSION_SUFFIX@">4</token>
+      <token name="@TOOL_VERSION@">2.3.1</token>
+      <token name="@VERSION_SUFFIX@">0</token>
     </macros>
     <edam_topics>
       <edam_topic>topic_0082</edam_topic>
@@ -11,92 +11,93 @@
       <edam_operation>operation_0474</edam_operation>
     </edam_operations>
     <xrefs>
-      <xref type="bio.tools">alphafold_2.0</xref>
+      <xref type="bio.tools">alphafold_2</xref>
     </xrefs>
     <requirements>
-        <container type="docker">neoformit/alphafold:v2.1.2_0</container>
+        <container type="docker">neoformit/alphafold:v2.3.1_1</container>
     </requirements>
     <command detect_errors="exit_code"><![CDATA[

 ## $ALPHAFOLD_DB variable should point to the location of the AlphaFold
 ## databases - defaults to /data

-## fasta setup ----------------------------
+## Read FASTA input ----------------------------
 #if $fasta_or_text.input_mode == 'history':
-    cp '$fasta_or_text.fasta_file' input.fasta &&
+    cp '$fasta_or_text.fasta_file' input.fasta

 #elif $fasta_or_text.input_mode == 'textbox':
-    echo '$fasta_or_text.fasta_text' > input.fasta &&
+    echo '$fasta_or_text.fasta_text' > input.fasta
 #end if

-python3 '$__tool_directory__/validate_fasta.py' input.fasta
+&& python3 '$__tool_directory__/validate_fasta.py' input.fasta
 --min_length \${ALPHAFOLD_AA_LENGTH_MIN:-0}
 --max_length \${ALPHAFOLD_AA_LENGTH_MAX:-0}
 #if $multimer:
 --multimer
 #end if
-> alphafold.fasta &&
-
-## env vars -------------------------------
-export TF_FORCE_UNIFIED_MEMORY=1 &&
-export XLA_PYTHON_CLIENT_MEM_FRACTION=4.0 &&
-export DATE=`date +"%Y-%m-%d"` &&
+> alphafold.fasta

-## run alphafold  -------------------------
-python /app/alphafold/run_alphafold.py
---fasta_paths alphafold.fasta
---output_dir output
---data_dir \${ALPHAFOLD_DB:-/data}
---max_template_date=\$DATE
+## Env vars -------------------------------
+&& export TF_FORCE_UNIFIED_MEMORY=1
+&& export XLA_PYTHON_CLIENT_MEM_FRACTION=4.0
+&& export TODAY=`date +"%Y-%m-%d"`
+
+## Run alphafold  -------------------------
+&& python /app/alphafold/run_alphafold.py
+    --fasta_paths alphafold.fasta
+    --output_dir output
+    --data_dir \${ALPHAFOLD_DB:-/data}

-## Set reference data explicitly
---uniref90_database_path   \${ALPHAFOLD_DB:-/data}/uniref90/uniref90.fasta
---mgnify_database_path     \${ALPHAFOLD_DB:-/data}/mgnify/mgy_clusters_2018_12.fa
---template_mmcif_dir       \${ALPHAFOLD_DB:-/data}/pdb_mmcif/mmcif_files
---obsolete_pdbs_path       \${ALPHAFOLD_DB:-/data}/pdb_mmcif/obsolete.dat
-#if $dbs == 'full':
---bfd_database_path        \${ALPHAFOLD_DB:-/data}/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt
---uniclust30_database_path \${ALPHAFOLD_DB:-/data}/uniclust30/uniclust30_2018_08/uniclust30_2018_08
-#else
---db_preset=reduced_dbs
---small_bfd_database_path  \${ALPHAFOLD_DB:-/data}/small_bfd/bfd-first_non_consensus_sequences.fasta
-#end if
+    ## Set reference database paths
+    --uniref90_database_path   \${ALPHAFOLD_DB:-/data}/uniref90/uniref90.fasta
+    --mgnify_database_path     \${ALPHAFOLD_DB:-/data}/mgnify/mgy_clusters_2022_05.fa
+    --template_mmcif_dir       \${ALPHAFOLD_DB:-/data}/pdb_mmcif/mmcif_files
+    --obsolete_pdbs_path       \${ALPHAFOLD_DB:-/data}/pdb_mmcif/obsolete.dat
+    #if $dbs == 'full':
+    --bfd_database_path        \${ALPHAFOLD_DB:-/data}/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt
+    --uniref30_database_path   \${ALPHAFOLD_DB:-/data}/uniref30/UniRef30_2021_03
+    #else
+    --db_preset=reduced_dbs
+    --small_bfd_database_path  \${ALPHAFOLD_DB:-/data}/small_bfd/bfd-first_non_consensus_sequences.fasta
+    #end if

-## Param introduced in AlphaFold v2.1.2:
---use_gpu_relax=\${ALPHAFOLD_USE_GPU:-True}
+    #if $max_template_date:
+    --max_template_date=$max_template_date
+    #else
+    --max_template_date=\$TODAY
+    #end if
+
+    --use_gpu_relax=\${ALPHAFOLD_USE_GPU:-True}  ## introduced in v2.1.2

-#if $multimer:
---model_preset=multimer
---pdb_seqres_database_path=\${ALPHAFOLD_DB:-/data}/pdb_seqres/pdb_seqres.txt
---uniprot_database_path=\${ALPHAFOLD_DB:-/data}/uniprot/uniprot.fasta
-##--num_multimer_predictions_per_model=1  ## introduced alphafold>=2.2.0
-
-#else
---pdb70_database_path \${ALPHAFOLD_DB:-/data}/pdb70/pdb70
-#end if
-&&
+    #if $multimer:
+    --model_preset=multimer
+    --pdb_seqres_database_path=\${ALPHAFOLD_DB:-/data}/pdb_seqres/pdb_seqres.txt
+    --uniprot_database_path=\${ALPHAFOLD_DB:-/data}/uniprot/uniprot.fasta
+    --num_multimer_predictions_per_model=1  ## introduced in v2.2.0
+    #else
+    --pdb70_database_path \${ALPHAFOLD_DB:-/data}/pdb70/pdb70
+    #end if

 ## Generate additional outputs ------------
-python3 '$__tool_directory__/gen_extra_outputs.py' output/alphafold $output_plddts
+&& python3 '$__tool_directory__/outputs.py' output/alphafold $outputs.plddts
 #if $multimer:
 --multimer
 #end if
-&&

 ## HTML output
-mkdir -p '${ html.files_path }' &&
-cp '$__tool_directory__/alphafold.html' '${html}' &&
-cp output/alphafold/ranked_*.pdb '${html.files_path}' &&
+&& mkdir -p '${ html.files_path }'
+&& cp '$__tool_directory__/alphafold.html' '${html}'
+&& cp output/alphafold/ranked_*.pdb '${html.files_path}'

 ## This is a (hacky) fix for a bug that has appeared in multiple Pulsar servers.
 ## The working directory ends up two levels deep and the visualization html page
 ## fails to load the PDB files as static assets.
-(([ -d working ] && cp -r working/* .) || true)
+&& (([ -d working ] && cp -r working/* .) || true)

     ]]></command>
     <inputs>
         <conditional name="fasta_or_text">
-            <param name="input_mode" type="select" label="Fasta Input" help="Protein sequence(s) to fold. Input can be fasta file from history, or text. Sequence must be valid IUPAC amino acid characters. If multiple sequences FASTA file provided, multimer mode must be selected.">
+            <param name="input_mode" type="select" label="Fasta Input" help="Protein sequence(s) to fold. Input can be fasta file from history, or text. Sequence must be valid IUPAC amino acid characters. If multiple-sequence FASTA file provided, multimer mode must be selected.">
                 <option value="history">Use fasta from history</option>
                 <option value="textbox">Paste sequence into textbox</option>
             </param>
@@ -109,6 +110,21 @@
         </conditional>

         <param
+            name="max_template_date"
+            type="text"
+            label="Max template date (yyyy-mm-dd) (optional)"
+            help="The model will reference PDB structures deposited before this date only. Defaults to today's date."
+            optional="true"
+        >
+            <sanitizer>
+                <valid initial="string.digits">
+                    <add value="-" />
+                </valid>
+            </sanitizer>
+            <validator type="regex">[0-9]{4}-[0-9]{2}-[0-9]{2}</validator>
+        </param>
+
+        <param
           name="dbs"
           type="select"
           display="radio"
@@ -125,39 +141,135 @@
           type="boolean"
           checked="false"
           label="Multimer mode"
-          help="Fold a protein multimer from multiple input sequences. You must input multiple sequences to run this mode."
+          help="Fold a protein multimer from multiple input sequences. You must input multiple sequences in FASTA to run this mode."
         />

-        <param name="output_plddts" type="boolean" checked="false" label="Output per-residue confidence scores" truevalue="--plddts" falsevalue="" help="Alphafold produces a pLDDT score between 0-100 for each residue in the folded models. High scores represent high confidence in placement for the residue, while low scoring residues have lower confidence. Sections of low confidence often occur in disordered regions. " />
+        <section name="outputs" title="Optional outputs" expanded="false">
+            <param
+                name="confidence_scores"
+                type="boolean"
+                checked="false"
+                label="Per-model confidence scores"
+                help="A tabular file showing average confidence score for each model (predicted template modelling (PTM) score; interface PTM is incorporated into this score for multimer predictions)."
+            />
+            <param
+                name="plddts"
+                type="boolean"
+                checked="false"
+                label="Per-residue confidence scores"
+                truevalue="--plddts"
+                falsevalue=""
+                help="Alphafold produces a pLDDT score between 0-100 for each residue in the folded models. High scores represent high confidence in placement for the residue, while low scoring residues have lower confidence. This output is a tabular file with five rows (one for each output PDB model), with each column providing a pLDDT score for a single residue. These data have been parsed from the model pickle files (below)."
+            />
+            <param
+                name="model_pkls"
+                type="boolean"
+                checked="false"
+                label="ranked_*.pkl"
+                help="A pickle file containing metrics used for the assessment of the model's accuracy. These include per-residue pLDDT scores (see above), predicted TM (Template Modelling) score, which is a global superposition metric and predicted aligned error (a matrix size (number of residues) x (number of residues) where each position describes the confidence of the residue's 3D position relative to another residue in the model; can be used for the interpretation of relative positions of domains). Pickle files can be read and processed using the Python 'pickle' library. Outputs are named respectively to PDB outputs."
+            />
+            <param
+                name="relax_json"
+                type="boolean"
+                checked="false"
+                label="relax_metrics.json"
+                help="A JSON-formatted text file containing relax metrics (mostly remaining violations)."
+            />
+        </section>
     </inputs>

     <outputs>
-        <data name="model5" format="pdb" from_work_dir="output/alphafold/ranked_4.pdb" label="${tool.name} on ${on_string}: Model 5"/>
-        <data name="model4" format="pdb" from_work_dir="output/alphafold/ranked_3.pdb" label="${tool.name} on ${on_string}: Model 4"/>
-        <data name="model3" format="pdb" from_work_dir="output/alphafold/ranked_2.pdb" label="${tool.name} on ${on_string}: Model 3"/>
-        <data name="model2" format="pdb" from_work_dir="output/alphafold/ranked_1.pdb" label="${tool.name} on ${on_string}: Model 2"/>
-        <data name="model1" format="pdb" from_work_dir="output/alphafold/ranked_0.pdb" label="${tool.name} on ${on_string}: Model 1"/>
-        <data name="confidence_scores" format="tsv" from_work_dir="output/alphafold/model_confidence_scores.tsv" label="${tool.name} on ${on_string}: Model confidence scores"/>
-        <data name="plddts" format="tsv" from_work_dir="output/alphafold/plddts.tsv" label="${tool.name} on ${on_string}: Per-residue confidence scores (plddts)">
-            <filter>(output_plddts)</filter>
+        <data name="model5" format="pdb" from_work_dir="output/alphafold/ranked_4.pdb" label="${tool.name} on ${on_string}: PDB ranked 4"/>
+        <data name="model4" format="pdb" from_work_dir="output/alphafold/ranked_3.pdb" label="${tool.name} on ${on_string}: PDB ranked 3"/>
+        <data name="model3" format="pdb" from_work_dir="output/alphafold/ranked_2.pdb" label="${tool.name} on ${on_string}: PDB ranked 2"/>
+        <data name="model2" format="pdb" from_work_dir="output/alphafold/ranked_1.pdb" label="${tool.name} on ${on_string}: PDB ranked 1"/>
+        <data name="model1" format="pdb" from_work_dir="output/alphafold/ranked_0.pdb" label="${tool.name} on ${on_string}: PDB ranked 0"/>
+        <data name="html" format="html" label="${tool.name} on ${on_string}: Visualization" />
+
+        <!-- Optional outputs -->
+        <data
+            name="output_confidence_scores"
+            format="tabular"
+            from_work_dir="output/alphafold/extra/model_confidence_scores.tsv"
+            label="${tool.name} on ${on_string}: Model confidence scores"
+        >
+            <filter>outputs['confidence_scores']</filter>
+        </data>
+
+        <data
+            name="output_plddts"
+            format="tabular"
+            from_work_dir="output/alphafold/extra/plddts.tsv"
+            label="${tool.name} on ${on_string}: Per-residue confidence scores (plddts)"
+        >
+            <filter>outputs['plddts']</filter>
+        </data>
+
+        <data
+            name="output_ranked_4_pkl"
+            format="binary"
+            from_work_dir="output/alphafold/extra/ranked_4.pkl"
+            label="${tool.name} on ${on_string}: ranked_4.pkl"
+        >
+            <filter>outputs['model_pkls']</filter>
         </data>
-        <data name="html" format="html" label="${tool.name} on ${on_string}: Visualization" />
+        <data
+            name="output_ranked_3_pkl"
+            format="binary"
+            from_work_dir="output/alphafold/extra/ranked_3.pkl"
+            label="${tool.name} on ${on_string}: ranked_3.pkl"
+        >
+            <filter>outputs['model_pkls']</filter>
+        </data>
+        <data
+            name="output_ranked_2_pkl"
+            format="binary"
+            from_work_dir="output/alphafold/extra/ranked_2.pkl"
+            label="${tool.name} on ${on_string}: ranked_2.pkl"
+        >
+            <filter>outputs['model_pkls']</filter>
+        </data>
+        <data
+            name="output_ranked_1_pkl"
+            format="binary"
+            from_work_dir="output/alphafold/extra/ranked_1.pkl"
+            label="${tool.name} on ${on_string}: ranked_1.pkl"
+        >
+            <filter>outputs['model_pkls']</filter>
+        </data>
+        <data
+            name="output_ranked_0_pkl"
+            format="binary"
+            from_work_dir="output/alphafold/extra/ranked_0.pkl"
+            label="${tool.name} on ${on_string}: ranked_0.pkl"
+        >
+            <filter>outputs['model_pkls']</filter>
+        </data>
+        <data
+            name="output_relax_json"
+            format="json"
+            from_work_dir="output/alphafold/extra/relax_metrics_ranked.json"
+            label="${tool.name} on ${on_string}: relax_metrics_ranked.json"
+        >
+            <filter>outputs['relax_json']</filter>
+        </data>
     </outputs>
+
     <tests>
         <test expect_num_outputs="8">
             <conditional name="fasta_or_text">
                 <param name="input_mode" value="history"/>
                 <param name="fasta_file" value="test1.fasta"/>
             </conditional>
-            <param name="output_plddts" value="true"/>
-            <output name="plddts">
+            <param name="plddts" value="true"/>
+            <output name="output_plddts">
                 <assert_contents>
                     <has_n_columns n="2"/>
                     <has_n_lines n="6"/>
                     <has_size value="2900" delta="300"/>
                 </assert_contents>
             </output>
-            <output name="confidence_scores">
+            <output name="output_confidence_scores">
                 <assert_contents>
                     <has_n_columns n="2"/>
                     <has_n_lines n="6"/>
@@ -205,43 +317,36 @@

     .. class:: infomark

-    **What it does**
+    | AlphaFold v2: AI-guided 3D structural prediction of proteins
+    |
+    | **NOTE: this tool packages AlphaFold v2.3.1.**
+    |
+    | This means that the neural network has been trained on PDBs with a release
+    | date before 2021-09-30 (the training cutoff was 2018-04-30 until ``v2.3.0``).
+    |
+    | Find out more in the technical and release notes:
+    |

-    | AlphaFold v2.1: AI-guided 3D structure prediction of proteins
+    - `Release notes for v2.3.1 <https://github.com/deepmind/alphafold/releases/tag/v2.3.1>`_
+    - `Technical notes for v2.3 <https://github.com/deepmind/alphafold/blob/main/docs/technical_note_v2.3.0.md>`_
+
+    | If you want to use AlphaFold trained against an older cutoff date, switch to Galaxy version ``2.1.2`` (which was trained on data up to 2018-04-30).
     |

+    **What it does**
+
     *What is AlphaFold?*

-    | AlphaFold is a program which uses neural networks to predict the tertiary (3D) structure of proteins. AlphaFold accepts an amino acid sequence (in Fasta format), then will 'fold' that sequence into a 3D model.
-    |
-    | **NOTE: AlphaFold has numerous versions - this tool uses AlphaFold v2.1.2.**
+    | AlphaFold is a program which uses neural networks to predict the tertiary (3D) structure of proteins. AlphaFold accepts an amino acid sequence in Fasta format, which will be "folded" into a 3D model.
     |

     *What makes AlphaFold different?*

     | The ability to use computers to predict 3D protein structures with high accuracy is desirable because it removes the time-consuming and costly process of determining structures experimentally.
-    | In-silico protein folding has been an active field of research for decades, but existing tools ran more slowly and with less reliability than AlphaFold.
+    | In-silico protein folding has been an active field of research for decades, but existing tools were slower and far less reliable than AlphaFold.
     | AlphaFold represents a leap forward by regularly predicting structures to atomic-level accuracy, even when no similar structures are known.
     |

-    *Downstream analysis*
-
-    | Obtaining a protein structure prediction is the first step in many analyses.
-    | The 3D models created by AlphaFold can be used in downstream analysis, including the following:
-    |
-
-    - Inspecting protein features
-        3D viewers (pymol, chimera, ngl, blender) can be used to inspect active sites, regulatory domains, binding sites.
-    - Molecular docking
-        3D structures can be used to predict the binding affinity of different compounds.
-        This is especially useful in screening drug candidates.
-    - Protein-protein interactions
-        Proteins associate in many biological processes, including intracellular signalling pathways and protein complex formation.
-        To predict these interactions, other programs may ingest 3D models predicted by AlphaFold. Proprietary softwares include `GOLD <https://www.ccdc.cam.ac.uk/solutions/csd-discovery/components/gold/>`_ and `SeeSAR <https://www.biosolveit.de/SeeSAR>`_, but many `free and open-source options <https://en.wikipedia.org/wiki/List_of_protein-ligand_docking_software>`_ are available such as `AutoDock <https://autodock.scripps.edu/>`_ and `SwissDock <http://www.swissdock.ch/>`_.
-
-    | Protein complex interactions are also commonly observed with AlphaFold's multimer prediction mode.
-    |
-    |

     **Input**

@@ -269,8 +374,9 @@

     *PDB files*

-    | Five PDB (Protein Data Bank) files will be created for the best ranking models predicted by AlphaFold.
+    | Five PDB (Protein Data Bank) files will be created, ordered by rank, as predicted by AlphaFold.
     | These files describe the molecular structures and can be used for downstream analysis. e.g. *in silico* molecular docking.
+    | **PLEASE NOTE** that all outputs have been renamed to their respective rank order, including model and model.pkl files.
     |

     *Model confidence scores (optional)*
@@ -280,16 +386,28 @@
     |
     |

+    *Model data files (ranked_n.pkl)*
+
+    | Per-model data stored in pickle files (a Python binary data format). These files can be used as inputs to downstream analysis software (such as Chimera X) for visualizing structures and computing kinetics between protein multimers and domains.
+    | The tool will produce one ``.pkl`` output for each of the PDB models.
+    |
+    |
+
+    *relax_metrics.json (optional)*
+
+    | A JSON-formatted text file containing relax metrics (mostly remaining violations).
+    |
+
     **AlphaFold configuration**

     | We have configured AlphaFold to run with the parameters suggested by default on `AlphaFold's GitHub <https://github.com/deepmind/alphafold>`_.
-    | This means that it runs against the full database with Amber relaxation, with ``max_template_date`` set to today's date. If there are additonal parameters that you would like to interact with, please `send a support request to Galaxy AU <https://site.usegalaxy.org.au/request/support>`_, or open an issue on `our GitHub <https://github.com/usegalaxy-au/tools-au>`_.
+    | This means that it runs with Amber relaxation enabled, with relaxed PDB models collected as output datasets. If there are additional parameters that you would like to interact with, please `send a support request to Galaxy AU <https://site.usegalaxy.org.au/request/support>`_, or open an issue on `our GitHub <https://github.com/usegalaxy-au/tools-au>`_.
     |
     |

     **External Resources**

-    We HIGHLY recommend checking out the
+    We highly recommend checking out the
     `Alphafold Protein Structure Database <https://alphafold.ebi.ac.uk/>`_,
     which contains pre-computed structures for over 200 million known proteins.
     See also:
@@ -297,6 +415,21 @@
     - `Google Deepmind's article on AlphaFold <https://deepmind.com/blog/article/alphafold-a-solution-to-a-50-year-old-grand-challenge-in-biology>`_
     - `AlphaFold source code on GitHub <https://github.com/deepmind/alphafold>`_

+    *Downstream analysis*
+
+    | Obtaining a protein structure prediction is the first step in many analyses.
+    | The 3D models created by AlphaFold can be used in downstream analysis, including the following:
+    |
+
+    - Inspecting protein features
+        3D viewers (pymol, chimera, ngl, blender) can be used to inspect active sites, regulatory domains, binding sites.
+    - Molecular docking
+        3D structures can be used to predict the binding affinity of different compounds.
+        This is especially useful in screening drug candidates.
+    - Protein-protein interactions
+        Proteins associate in many biological processes, including intracellular signalling pathways and protein complex formation.
+        To predict these interactions, other programs may ingest 3D models predicted by AlphaFold. Proprietary softwares include `GOLD <https://www.ccdc.cam.ac.uk/solutions/csd-discovery/components/gold/>`_ and `SeeSAR <https://www.biosolveit.de/SeeSAR>`_, but many `free and open-source options <https://en.wikipedia.org/wiki/List_of_protein-ligand_docking_software>`_ are available such as `AutoDock <https://autodock.scripps.edu/>`_, `SwissDock <http://www.swissdock.ch/>`_, `DockQ <https://github.com/bjornwallner/DockQ>`_, `MM-Align <https://zhanggroup.org/MM-align/>`_ and `TM-Align <https://zhanggroup.org/TM-align/>`_. Protein-protein interactions are often inferred from AlphaFold-Multimer predictions, which provide a level of confidence in binding affinity between homomer/heteromer subunits.
+
     ]]></help>
     <citations>
         <citation type="doi">https://doi.org/10.1038/s41586-021-03819-2</citation>
--- a/gen_extra_outputs.py	Wed Oct 12 22:25:20 2022 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,197 +0,0 @@
-"""Generate additional output files not produced by AlphaFold."""
-
-import json
-import pickle
-import argparse
-from typing import Any, Dict, List
-
-# Keys for accessing confidence data from JSON/pkl files
-# They change depending on whether the run was monomer or multimer
-CONTEXT_KEY = {
-    'monomer': 'plddts',
-    'multimer': 'iptm+ptm',
-}
-
-
-class Settings:
-    """parses then keeps track of program settings"""
-    def __init__(self):
-        self.workdir = None
-        self.output_confidence_scores = True
-        self.output_residue_scores = False
-        self.is_multimer = False
-
-    def parse_settings(self) -> None:
-        parser = argparse.ArgumentParser()
-        parser.add_argument(
-            "workdir",
-            help="alphafold output directory",
-            type=str
-        )
-        parser.add_argument(
-            "-p",
-            "--plddts",
-            help="output per-residue confidence scores (pLDDTs)",
-            action="store_true"
-        )
-        parser.add_argument(
-            "-m",
-            "--multimer",
-            help="parse output from AlphaFold multimer",
-            action="store_true"
-        )
-        args = parser.parse_args()
-        self.workdir = args.workdir.rstrip('/')
-        self.output_residue_scores = args.plddts
-        self.is_multimer = False
-        self.is_multimer = args.multimer
-
-
-class ExecutionContext:
-    """uses program settings to get paths to files etc"""
-    def __init__(self, settings: Settings):
-        self.settings = settings
-
-    def get_model_key(self, ix):
-        """Return json key for model index."""
-        if self.settings.is_multimer:
-            return f'model_{ix}_multimer'
-        return f'model_{ix}'
-
-    @property
-    def ranking_debug(self) -> str:
-        return f'{self.settings.workdir}/ranking_debug.json'
-
-    @property
-    def model_pkls(self) -> List[str]:
-        ext = '.pkl'
-        if self.settings.is_multimer:
-            ext = '_multimer.pkl'
-        return [
-            f'{self.settings.workdir}/result_model_{i}{ext}'
-            for i in range(1, 6)
-        ]
-
-    @property
-    def model_conf_score_output(self) -> str:
-        return f'{self.settings.workdir}/model_confidence_scores.tsv'
-
-    @property
-    def plddt_output(self) -> str:
-        return f'{self.settings.workdir}/plddts.tsv'
-
-
-class FileLoader:
-    """loads file data for use by other classes"""
-
-    def __init__(self, context: ExecutionContext):
-        self.context = context
-
-    @property
-    def confidence_key(self) -> str:
-        """Return the correct key for confidence data."""
-        if self.context.settings.is_multimer:
-            return CONTEXT_KEY['multimer']
-        return CONTEXT_KEY['monomer']
-
-    def get_model_mapping(self) -> Dict[str, int]:
-        data = self.load_ranking_debug()
-        return {name: int(rank) + 1
-                for (rank, name) in enumerate(data['order'])}
-
-    def get_conf_scores(self) -> Dict[str, float]:
-        data = self.load_ranking_debug()
-        return {
-            name: float(f'{score:.2f}')
-            for name, score in data[self.confidence_key].items()
-        }
-
-    def load_ranking_debug(self) -> Dict[str, Any]:
-        with open(self.context.ranking_debug, 'r') as fp:
-            return json.load(fp)
-
-    def get_model_plddts(self) -> Dict[str, List[float]]:
-        plddts: Dict[str, List[float]] = {}
-        model_pkls = self.context.model_pkls
-        for i in range(len(model_pkls)):
-            pklfile = model_pkls[i]
-            with open(pklfile, 'rb') as fp:
-                data = pickle.load(fp)
-            plddts[self.context.get_model_key(i+1)] = [
-                float(f'{x:.2f}')
-                for x in data['plddt']
-            ]
-        return plddts
-
-
-class OutputGenerator:
-    """generates the output data we are interested in creating"""
-    def __init__(self, loader: FileLoader):
-        self.loader = loader
-        self.context = loader.context
-
-    def gen_conf_scores(self):
-        mapping = self.loader.get_model_mapping()
-        scores = self.loader.get_conf_scores()
-        ranked = list(scores.items())
-        ranked.sort(key=lambda x: x[1], reverse=True)
-        return {
-            self.context.get_model_key(mapping[name]): score
-            for name, score in ranked
-        }
-
-    def gen_residue_scores(self) -> Dict[str, List[float]]:
-        mapping = self.loader.get_model_mapping()
-        model_plddts = self.loader.get_model_plddts()
-        return {
-            self.context.get_model_key(mapping[name]): plddts
-            for name, plddts in model_plddts.items()
-        }
-
-
-class OutputWriter:
-    """writes generated data to files"""
-    def __init__(self, context: ExecutionContext):
-        self.context = context
-
-    def write_conf_scores(self, data: Dict[str, float]) -> None:
-        outfile = self.context.model_conf_score_output
-        with open(outfile, 'w') as fp:
-            for model, score in data.items():
-                fp.write(f'{model}\t{score}\n')
-
-    def write_residue_scores(self, data: Dict[str, List[float]]) -> None:
-        outfile = self.context.plddt_output
-        model_plddts = list(data.items())
-        model_plddts.sort()
-
-        with open(outfile, 'w') as fp:
-            for model, plddts in model_plddts:
-                plddt_str_list = [str(x) for x in plddts]
-                plddt_str = ','.join(plddt_str_list)
-                fp.write(f'{model}\t{plddt_str}\n')
-
-
-def main():
-    # setup
-    settings = Settings()
-    settings.parse_settings()
-    context = ExecutionContext(settings)
-    loader = FileLoader(context)
-
-    # generate & write outputs
-    generator = OutputGenerator(loader)
-    writer = OutputWriter(context)
-
-    # confidence scores
-    conf_scores = generator.gen_conf_scores()
-    writer.write_conf_scores(conf_scores)
-
-    # per-residue plddts
-    if settings.output_residue_scores:
-        residue_scores = generator.gen_residue_scores()
-        writer.write_residue_scores(residue_scores)
-
-
-if __name__ == '__main__':
-    main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/outputs.py	Tue Feb 28 01:15:42 2023 +0000
@@ -0,0 +1,245 @@
+"""Generate additional output files not produced by AlphaFold.
+
+Currently this includes:
+- model confidence scores
+- per-residue confidence scores (pLDDTs - optional output)
+- model_*.pkl files renamed with rank order
+
+N.B. There have been issues with this script breaking between AlphaFold
+versions due to minor changes in the output directory structure across minor
+versions. It will likely need updating with future releases of AlphaFold.
+
+This code is more complex than you might expect due to the output files
+'moving around' considerably, depending on run parameters. You will see that
+several output paths are determined dynamically.
+"""
+
+import argparse
+import json
+import os
+import pickle as pk
+import shutil
+from pathlib import Path
+from typing import List
+
+# Output file names
+OUTPUT_DIR = 'extra'
+OUTPUTS = {
+    'model_pkl': OUTPUT_DIR + '/ranked_{rank}.pkl',
+    'model_confidence_scores': OUTPUT_DIR + '/model_confidence_scores.tsv',
+    'plddts': OUTPUT_DIR + '/plddts.tsv',
+    'relax': OUTPUT_DIR + '/relax_metrics_ranked.json',
+}
+
+# Keys for accessing confidence data from JSON/pkl files
+# They change depending on whether the run was monomer or multimer
+PLDDT_KEY = {
+    'monomer': 'plddts',
+    'multimer': 'iptm+ptm',
+}
+
+
+class Settings:
+    """Parse and store settings/config."""
+    def __init__(self):
+        self.workdir = None
+        self.output_confidence_scores = True
+        self.output_residue_scores = False
+        self.is_multimer = False
+
+    def parse_settings(self) -> None:
+        parser = argparse.ArgumentParser()
+        parser.add_argument(
+            "workdir",
+            help="alphafold output directory",
+            type=str
+        )
+        parser.add_argument(
+            "-p",
+            "--plddts",
+            help="output per-residue confidence scores (pLDDTs)",
+            action="store_true"
+        )
+        parser.add_argument(
+            "-m",
+            "--multimer",
+            help="parse output from AlphaFold multimer",
+            action="store_true"
+        )
+        parser.add_argument(
+            "--model-pkl",
+            dest="model_pkl",
+            help="rename model pkl outputs with rank order",
+            action="store_true"
+        )
+        args = parser.parse_args()
+        self.workdir = Path(args.workdir.rstrip('/'))
+        self.output_residue_scores = args.plddts
+        self.output_model_pkls = args.model_pkl
+        self.is_multimer = args.multimer
+        self.output_dir = self.workdir / OUTPUT_DIR
+        os.makedirs(self.output_dir, exist_ok=True)
+
+
+class ExecutionContext:
+    """Resolve input file paths and the confidence-score key for this run."""
+    def __init__(self, settings: Settings):
+        self.settings = settings
+        if settings.is_multimer:
+            self.plddt_key = PLDDT_KEY['multimer']
+        else:
+            self.plddt_key = PLDDT_KEY['monomer']
+
+    def get_model_key(self, ix: int) -> str:
+        """Return the ranking_debug.json key for model index ``ix``.
+
+        The key format changed between minor AlphaFold versions, so the key
+        is matched by prefix. Raises KeyError when no key matches.
+        """
+        with open(self.ranking_debug) as f:
+            data = json.load(f)
+        for k in data[self.plddt_key]:
+            if k.startswith(f"model_{ix}_"):
+                return k
+        # Raise (not return) so callers never receive an exception object.
+        raise KeyError(
+            f'Could not find key for index={ix} in'
+            ' ranking_debug.json')
+
+    @property
+    def ranking_debug(self) -> Path:
+        return self.settings.workdir / 'ranking_debug.json'
+
+    @property
+    def relax_metrics(self) -> Path:
+        return self.settings.workdir / 'relax_metrics.json'
+
+    @property
+    def relax_metrics_ranked(self) -> Path:
+        return self.settings.workdir / 'relax_metrics_ranked.json'
+
+    @property
+    def model_pkl_paths(self) -> List[Path]:
+        return sorted([
+            self.settings.workdir / f
+            for f in os.listdir(self.settings.workdir)
+            if f.startswith('result_model_') and f.endswith('.pkl')
+        ])
+
+
+class ResultModelPrediction:
+    """Load and manipulate data from result_model_*.pkl files."""
+    def __init__(self, path: str, context: ExecutionContext):
+        self.context = context
+        self.path = path
+        self.name = os.path.basename(path).replace('result_', '').split('.')[0]
+        with open(path, 'rb') as path:
+            self.data = pk.load(path)
+
+    @property
+    def plddts(self) -> List[float]:
+        """Return pLDDT scores for each residue."""
+        return list(self.data['plddt'])
+
+
+class ResultRanking:
+    """Load ranking_debug.json and answer rank/score lookups from it."""
+
+    def __init__(self, context: ExecutionContext):
+        self.path = context.ranking_debug
+        self.context = context
+        with open(self.path, 'r') as f:
+            self.data = json.load(f)
+
+    @property
+    def order(self) -> List[str]:
+        """Return the 'order' list of model names (list position = rank)."""
+        return self.data['order']
+
+    def get_plddt_for_rank(self, rank: int) -> float:
+        """Return the score under ``plddt_key`` for 1-indexed ``rank``."""
+        return self.data[self.context.plddt_key][self.data['order'][rank - 1]]
+
+    def get_rank_for_model(self, model_name: str) -> int:
+        """Return 0-indexed rank for given model name.
+
+        Model names are expressed in result_model_*.pkl file names.
+        """
+        return self.data['order'].index(model_name)
+
+
+def write_confidence_scores(ranking: ResultRanking, context: ExecutionContext):
+    """Write per-model confidence scores."""
+    path = context.settings.workdir / OUTPUTS['model_confidence_scores']
+    with open(path, 'w') as f:
+        for rank in range(1, 6):
+            score = ranking.get_plddt_for_rank(rank)
+            f.write(f'ranked_{rank - 1}\t{score:.2f}\n')
+
+
+def write_per_residue_scores(
+    ranking: ResultRanking,
+    context: ExecutionContext,
+):
+    """Write per-residue plddts for each model.
+
+    A row of plddt values is written for each model in tabular format.
+    """
+    model_plddts = {}
+    for i, path in enumerate(context.model_pkl_paths):
+        model = ResultModelPrediction(path, context)
+        rank = ranking.get_rank_for_model(model.name)
+        model_plddts[rank] = model.plddts
+
+    path = context.settings.workdir / OUTPUTS['plddts']
+    with open(path, 'w') as f:
+        for i in sorted(list(model_plddts.keys())):
+            row = [f'ranked_{i}'] + [
+                str(x) for x in model_plddts[i]
+            ]
+            f.write('\t'.join(row) + '\n')
+
+
+def rename_model_pkls(ranking: ResultRanking, context: ExecutionContext):
+    """Rename model.pkl files so the rank order is implicit."""
+    for path in context.model_pkl_paths:
+        model = ResultModelPrediction(path, context)
+        rank = ranking.get_rank_for_model(model.name)
+        new_path = (
+            context.settings.workdir
+            / OUTPUTS['model_pkl'].format(rank=rank)
+        )
+        shutil.copyfile(path, new_path)
+
+
+def rekey_relax_metrics(ranking: ResultRanking, context: ExecutionContext):
+    """Replace keys in relax_metrics.json with 0-indexed rank."""
+    with open(context.relax_metrics) as f:
+        data = json.load(f)
+        for k in list(data.keys()):
+            rank = ranking.get_rank_for_model(k)
+            data[f'ranked_{rank}'] = data.pop(k)
+    new_path = context.settings.workdir / OUTPUTS['relax']
+    with open(new_path, 'w') as f:
+        json.dump(data, f)
+
+
+def main():
+    """Parse output files and generate additional output files."""
+    settings = Settings()
+    settings.parse_settings()
+    context = ExecutionContext(settings)
+    ranking = ResultRanking(context)
+    write_confidence_scores(ranking, context)
+    rekey_relax_metrics(ranking, context)
+
+    # Optional outputs
+    if settings.output_model_pkls:
+        rename_model_pkls(ranking, context)
+
+    if settings.output_residue_scores:
+        write_per_residue_scores(ranking, context)
+
+
+if __name__ == '__main__':
+    main()
--- a/validate_fasta.py	Wed Oct 12 22:25:20 2022 +0000
+++ b/validate_fasta.py	Tue Feb 28 01:15:42 2023 +0000
@@ -1,8 +1,8 @@
 """Validate input FASTA sequence."""

+import argparse
 import re
 import sys
-import argparse
 from typing import List

 MULTIMER_MAX_SEQUENCE_COUNT = 10