Mercurial > repos > galaxy-australia > alphafold2

--- a/alphafold.xml	Tue Feb 28 01:15:42 2023 +0000
+++ b/alphafold.xml	Fri Mar 10 02:48:07 2023 +0000
@@ -2,7 +2,9 @@
     <description> - AI-guided 3D structural prediction of proteins</description>
     <macros>
       <token name="@TOOL_VERSION@">2.3.1</token>
-      <token name="@VERSION_SUFFIX@">0</token>
+      <token name="@VERSION_SUFFIX@">1</token>
+      <import>macro_output.xml</import>
+      <import>macro_test_output.xml</import>
     </macros>
     <edam_topics>
       <edam_topic>topic_0082</edam_topic>
@@ -14,17 +16,20 @@
       <xref type="bio.tools">alphafold_2</xref>
     </xrefs>
     <requirements>
-        <container type="docker">neoformit/alphafold:v2.3.1_1</container>
+        <container type="docker">neoformit/alphafold:v2.3.1_2</container>
     </requirements>
     <command detect_errors="exit_code"><![CDATA[

+## Developers: to test with mock alphafold run, set `export PLANEMO_TESTING=1`
+## in planemo's gx_venv_n/bin/activate script. AlphaFold outputs will be copied
+## from the test-data directory instead of running the tool.
+
 ## $ALPHAFOLD_DB variable should point to the location of the AlphaFold
 ## databases - defaults to /data

-## Read FASTA input ----------------------------
+## Read FASTA input -----------------------------------------------------------
 #if $fasta_or_text.input_mode == 'history':
     cp '$fasta_or_text.fasta_file' input.fasta
-
 #elif $fasta_or_text.input_mode == 'textbox':
     echo '$fasta_or_text.fasta_text' > input.fasta
 #end if
@@ -32,55 +37,66 @@
 && python3 '$__tool_directory__/validate_fasta.py' input.fasta
 --min_length \${ALPHAFOLD_AA_LENGTH_MIN:-0}
 --max_length \${ALPHAFOLD_AA_LENGTH_MAX:-0}
-#if $multimer:
+#if $model_preset == 'multimer':
 --multimer
 #end if
 > alphafold.fasta

-## Env vars -------------------------------
+## Env vars -------------------------------------------------------------------
 && export TF_FORCE_UNIFIED_MEMORY=1
 && export XLA_PYTHON_CLIENT_MEM_FRACTION=4.0
 && export TODAY=`date +"%Y-%m-%d"`

-## Run alphafold  -------------------------
-&& python /app/alphafold/run_alphafold.py
-    --fasta_paths alphafold.fasta
-    --output_dir output
-    --data_dir \${ALPHAFOLD_DB:-/data}
-
-    ## Set reference database paths
-    --uniref90_database_path   \${ALPHAFOLD_DB:-/data}/uniref90/uniref90.fasta
-    --mgnify_database_path     \${ALPHAFOLD_DB:-/data}/mgnify/mgy_clusters_2022_05.fa
-    --template_mmcif_dir       \${ALPHAFOLD_DB:-/data}/pdb_mmcif/mmcif_files
-    --obsolete_pdbs_path       \${ALPHAFOLD_DB:-/data}/pdb_mmcif/obsolete.dat
-    #if $dbs == 'full':
-    --bfd_database_path        \${ALPHAFOLD_DB:-/data}/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt
-    --uniref30_database_path   \${ALPHAFOLD_DB:-/data}/uniref30/UniRef30_2021_03
-    #else
-    --db_preset=reduced_dbs
-    --small_bfd_database_path  \${ALPHAFOLD_DB:-/data}/small_bfd/bfd-first_non_consensus_sequences.fasta
-    #end if
+## Run AlphaFold  -------------------------------------------------------------
+#if os.environ.get('PLANEMO_TESTING'):
+    ## Run in testing mode (mocks a successful AlphaFold run by copying outputs)
+    && echo "Creating dummy outputs for model_preset=$model_preset..."
+    && bash '$__tool_directory__/mock_alphafold.sh' $model_preset
+#else:
+    ## Run AlphaFold
+    && python /app/alphafold/run_alphafold.py
+        --fasta_paths alphafold.fasta
+        --output_dir output
+        --data_dir \${ALPHAFOLD_DB:-/data}
+        --model_preset=$model_preset

-    #if $max_template_date:
-    --max_template_date=$max_template_date
-    #else
-    --max_template_date=\$TODAY
-    #end if
+        ## Set reference database paths
+        --uniref90_database_path   \${ALPHAFOLD_DB:-/data}/uniref90/uniref90.fasta
+        --mgnify_database_path     \${ALPHAFOLD_DB:-/data}/mgnify/mgy_clusters_2022_05.fa
+        --template_mmcif_dir       \${ALPHAFOLD_DB:-/data}/pdb_mmcif/mmcif_files
+        --obsolete_pdbs_path       \${ALPHAFOLD_DB:-/data}/pdb_mmcif/obsolete.dat
+        #if $dbs == 'full':
+        --bfd_database_path        \${ALPHAFOLD_DB:-/data}/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt
+        --uniref30_database_path   \${ALPHAFOLD_DB:-/data}/uniref30/UniRef30_2021_03
+        #else
+        --db_preset=reduced_dbs
+        --small_bfd_database_path  \${ALPHAFOLD_DB:-/data}/small_bfd/bfd-first_non_consensus_sequences.fasta
+        #end if

-    --use_gpu_relax=\${ALPHAFOLD_USE_GPU:-True}  ## introduced in v2.1.2
+        #if $max_template_date:
+        --max_template_date=$max_template_date
+        #else
+        --max_template_date=\$TODAY
+        #end if

-    #if $multimer:
-    --model_preset=multimer
-    --pdb_seqres_database_path=\${ALPHAFOLD_DB:-/data}/pdb_seqres/pdb_seqres.txt
-    --uniprot_database_path=\${ALPHAFOLD_DB:-/data}/uniprot/uniprot.fasta
-    --num_multimer_predictions_per_model=1  ## introduced in v2.2.0
-    #else
-    --pdb70_database_path \${ALPHAFOLD_DB:-/data}/pdb70/pdb70
-    #end if
+        --use_gpu_relax=\${ALPHAFOLD_USE_GPU:-True}  ## introduced in v2.1.2

-## Generate additional outputs ------------
-&& python3 '$__tool_directory__/outputs.py' output/alphafold $outputs.plddts
-#if $multimer:
+        #if $model_preset == 'multimer':
+        --pdb_seqres_database_path=\${ALPHAFOLD_DB:-/data}/pdb_seqres/pdb_seqres.txt
+        --uniprot_database_path=\${ALPHAFOLD_DB:-/data}/uniprot/uniprot.fasta
+        --num_multimer_predictions_per_model=1  ## introduced in v2.2.0
+        #else
+        --pdb70_database_path \${ALPHAFOLD_DB:-/data}/pdb70/pdb70
+        #end if
+#end if
+
+## Generate additional outputs ------------------------------------------------
+&& python3 '$__tool_directory__/outputs.py' output/alphafold
+$outputs.plddts
+$outputs.model_pkls
+$outputs.pae_csv
+$outputs.plots
+#if $model_preset == 'multimer':
 --multimer
 #end if

@@ -137,15 +153,31 @@
         </param>

         <param
-          name="multimer"
-          type="boolean"
-          checked="false"
-          label="Multimer mode"
-          help="Fold a protein multimer from multiple input sequences. You must input multiple sequences in FASTA to run this mode."
-        />
+            name="model_preset"
+            type="select"
+            label="Model preset"
+            help="Select which prediction model to run. The monomer model is the most accurate for single protein prediction. The multimer model allows prediction of protein complexes."
+        >
+            <option value="monomer" selected="true">monomer - default prediction model</option>
+            <option value="monomer_ptm">
+                monomer_ptm - slightly less accurate version of the monomer model, but provides a pairwise alignment error (PAE) matrix
+            </option>
+            <option value="multimer">
+                multimer - model a protein complex (requires multi-sequence FASTA input)
+            </option>
+        </param>

         <section name="outputs" title="Optional outputs" expanded="false">
             <param
+                name="plots"
+                type="boolean"
+                checked="false"
+                truevalue="--plot"
+                falsevalue=""
+                label="pLDDT and PAE matrix plots (per model)"
+                help="A two-panel plot showing pLDDT against residue position (left) and PAE (predicted aligned error) as a heatmap image (right), with residue numbers running along the vertical and horizontal axes and color at each pixel indicating the PAE value for the corresponding pair of residues. The PAE heatmap is only produced with monomer_ptm and multimer model presets."
+            />
+            <param
                 name="confidence_scores"
                 type="boolean"
                 checked="false"
@@ -159,14 +191,25 @@
                 label="Per-residue confidence scores"
                 truevalue="--plddts"
                 falsevalue=""
-                help="Alphafold produces a pLDDT score between 0-100 for each residue in the folded models. High scores represent high confidence in placement for the residue, while low scoring residues have lower confidence. This output is a tabular file with five rows (one for each output PDB model), with each column providing a pLDDT score for a single residue. These data have been parsed from the model pickle files (below)."
+                help="Alphafold produces a pLDDT score between 0-100 for each residue in the folded models. High scores represent high confidence in placement for the residue, while low scoring residues have lower confidence. This output is a tabular file with five rows (one for each output PDB model), with each column providing a pLDDT score for a single residue."
+            />
+            <param
+                name="pae_csv"
+                type="boolean"
+                checked="false"
+                truevalue="--pae"
+                falsevalue=""
+                label="Predicted aligned error (PAE)"
+                help="A CSV-formatted matrix for each model. Only available for monomer_ptm and multimer model presets. Predicted aligned error (PAE) gives a distance error for every pair of residues. It gives AlphaFold's estimate of position error at residue X when the predicted and true structures are aligned on residue Y. Values range from 0 - 35 Angstroms."
             />
             <param
                 name="model_pkls"
                 type="boolean"
                 checked="false"
+                truevalue="--pkl"
+                falsevalue=""
                 label="ranked_*.pkl"
-                help="A pickle file containing metrics used for the assessment of the model's accuracy. These include per-residue pLDDT scores (see above), predicted TM (Template Modelling) score, which is a global superposition metric and predicted aligned error (a matrix size (number of residues) x (number of residues) where each position describes the confidence of the residue's 3D position relative to another residue in the model; can be used for the interpretation of relative positions of domains). Pickle files can be read and processed using the Python 'pickle' library. Outputs are named respectively to PDB outputs."
+                help="A pickle file containing metrics used for the assessment of the model's accuracy. These include per-residue pLDDT scores (see above), predicted TM (Template Modelling) score, which is a global superposition metric and predicted aligned error (a matrix size (number of residues) x (number of residues) where each position describes the confidence of the residue's 3D position relative to another residue in the model; can be used for the interpretation of relative positions of domains). Pickle files can be read and processed using the Python 'pickle' library (requires the jax Python library). Outputs are named respective to PDB outputs."
             />
             <param
                 name="relax_json"
@@ -179,138 +222,91 @@
     </inputs>

     <outputs>
-        <data name="model5" format="pdb" from_work_dir="output/alphafold/ranked_4.pdb" label="${tool.name} on ${on_string}: PDB ranked 4"/>
-        <data name="model4" format="pdb" from_work_dir="output/alphafold/ranked_3.pdb" label="${tool.name} on ${on_string}: PDB ranked 3"/>
-        <data name="model3" format="pdb" from_work_dir="output/alphafold/ranked_2.pdb" label="${tool.name} on ${on_string}: PDB ranked 2"/>
-        <data name="model2" format="pdb" from_work_dir="output/alphafold/ranked_1.pdb" label="${tool.name} on ${on_string}: PDB ranked 1"/>
-        <data name="model1" format="pdb" from_work_dir="output/alphafold/ranked_0.pdb" label="${tool.name} on ${on_string}: PDB ranked 0"/>
+        <expand macro="output_pdb_models" />
         <data name="html" format="html" label="${tool.name} on ${on_string}: Visualization" />
-
         <!-- Optional outputs -->
-        <data
-            name="output_confidence_scores"
-            format="tabular"
-            from_work_dir="output/alphafold/extra/model_confidence_scores.tsv"
-            label="${tool.name} on ${on_string}: Model confidence scores"
-        >
-            <filter>outputs['confidence_scores']</filter>
-        </data>
-
-        <data
-            name="output_plddts"
-            format="tabular"
-            from_work_dir="output/alphafold/extra/plddts.tsv"
-            label="${tool.name} on ${on_string}: Per-residue confidence scores (plddts)"
-        >
-            <filter>outputs['plddts']</filter>
-        </data>
-
-        <data
-            name="output_ranked_4_pkl"
-            format="binary"
-            from_work_dir="output/alphafold/extra/ranked_4.pkl"
-            label="${tool.name} on ${on_string}: ranked_4.pkl"
-        >
-            <filter>outputs['model_pkls']</filter>
-        </data>
-        <data
-            name="output_ranked_3_pkl"
-            format="binary"
-            from_work_dir="output/alphafold/extra/ranked_3.pkl"
-            label="${tool.name} on ${on_string}: ranked_3.pkl"
-        >
-            <filter>outputs['model_pkls']</filter>
-        </data>
-        <data
-            name="output_ranked_2_pkl"
-            format="binary"
-            from_work_dir="output/alphafold/extra/ranked_2.pkl"
-            label="${tool.name} on ${on_string}: ranked_2.pkl"
-        >
-            <filter>outputs['model_pkls']</filter>
-        </data>
-        <data
-            name="output_ranked_1_pkl"
-            format="binary"
-            from_work_dir="output/alphafold/extra/ranked_1.pkl"
-            label="${tool.name} on ${on_string}: ranked_1.pkl"
-        >
-            <filter>outputs['model_pkls']</filter>
-        </data>
-        <data
-            name="output_ranked_0_pkl"
-            format="binary"
-            from_work_dir="output/alphafold/extra/ranked_0.pkl"
-            label="${tool.name} on ${on_string}: ranked_0.pkl"
-        >
-            <filter>outputs['model_pkls']</filter>
-        </data>
-        <data
-            name="output_relax_json"
-            format="json"
-            from_work_dir="output/alphafold/extra/relax_metrics_ranked.json"
-            label="${tool.name} on ${on_string}: relax_metrics_ranked.json"
-        >
-            <filter>outputs['relax_json']</filter>
-        </data>
+        <expand macro="output_plddts" />
+        <expand macro="output_confidence_scores" />
+        <expand macro="output_pickles" />
+        <expand macro="output_pae_csv" />
+        <expand macro="output_plots" />
+        <expand macro="output_relax_json" />
     </outputs>

     <tests>
-        <test expect_num_outputs="8">
+        <!-- Test monomer with default outputs -->
+        <test expect_num_outputs="6">
+            <conditional name="fasta_or_text">
+                <param name="input_mode" value="history"/>
+                <param name="fasta_file" value="test1.fasta"/>
+            </conditional>
+            <param name="model_preset" value="monomer"/>
+            <expand macro="test_output_pdb_models" />
+        </test>
+
+        <!-- Test monomer with all outputs -->
+        <test expect_num_outputs="19">
             <conditional name="fasta_or_text">
                 <param name="input_mode" value="history"/>
                 <param name="fasta_file" value="test1.fasta"/>
             </conditional>
-            <param name="plddts" value="true"/>
-            <output name="output_plddts">
-                <assert_contents>
-                    <has_n_columns n="2"/>
-                    <has_n_lines n="6"/>
-                    <has_size value="2900" delta="300"/>
-                </assert_contents>
-            </output>
-            <output name="output_confidence_scores">
-                <assert_contents>
-                    <has_n_columns n="2"/>
-                    <has_n_lines n="6"/>
-                    <has_size value="70" delta="50"/>
-                </assert_contents>
-            </output>
-            <output name="model1">
-                <assert_contents>
-                    <has_n_columns n="12"/>
-                    <has_n_lines n="1517"/>
-                    <has_size value="123000" delta="10000"/>
-                </assert_contents>
-            </output>
-            <output name="model2">
-                <assert_contents>
-                    <has_n_columns n="12"/>
-                    <has_n_lines n="1517"/>
-                    <has_size value="123000" delta="10000"/>
-                </assert_contents>
-            </output>
-            <output name="model3">
-                <assert_contents>
-                    <has_n_columns n="12"/>
-                    <has_n_lines n="1517"/>
-                    <has_size value="123000" delta="10000"/>
-                </assert_contents>
-            </output>
-            <output name="model4">
-                <assert_contents>
-                    <has_n_columns n="12"/>
-                    <has_n_lines n="1517"/>
-                    <has_size value="123000" delta="10000"/>
-                </assert_contents>
-            </output>
-            <output name="model5">
-                <assert_contents>
-                    <has_n_columns n="12"/>
-                    <has_n_lines n="1517"/>
-                    <has_size value="123000" delta="10000"/>
-                </assert_contents>
-            </output>
+            <param name="model_preset" value="monomer"/>
+            <param name="outputs|plots" value="true"/>
+            <param name="outputs|confidence_scores" value="true"/>
+            <param name="outputs|plddts" value="true"/>
+            <param name="outputs|pae_csv" value="true"/>
+            <param name="outputs|model_pkls" value="true"/>
+            <param name="outputs|relax_json" value="true"/>
+            <expand macro="test_output_plots_1" />
+            <expand macro="test_output_confidence_scores" />
+            <expand macro="test_output_plddts" />
+            <expand macro="test_output_pdb_models" />
+            <expand macro="test_output_pickles" />
+            <expand macro="test_output_relax_json" />
+        </test>
+
+        <!-- Test monomer_ptm with all outputs -->
+        <test expect_num_outputs="24">
+            <conditional name="fasta_or_text">
+                <param name="input_mode" value="history"/>
+                <param name="fasta_file" value="test1.fasta"/>
+            </conditional>
+            <param name="model_preset" value="monomer_ptm"/>
+            <param name="outputs|plots" value="true"/>
+            <param name="outputs|confidence_scores" value="true"/>
+            <param name="outputs|plddts" value="true"/>
+            <param name="outputs|pae_csv" value="true"/>
+            <param name="outputs|model_pkls" value="true"/>
+            <param name="outputs|relax_json" value="true"/>
+            <expand macro="test_output_plots_2" />
+            <expand macro="test_output_confidence_scores" />
+            <expand macro="test_output_plddts" />
+            <expand macro="test_output_pdb_models" />
+            <expand macro="test_output_pickles" />
+            <expand macro="test_output_relax_json" />
+            <expand macro="test_output_pae_csv" />
+        </test>
+
+        <!-- Test multimer with all outputs -->
+        <test expect_num_outputs="24">
+            <conditional name="fasta_or_text">
+                <param name="input_mode" value="history"/>
+                <param name="fasta_file" value="multimer.fasta"/>
+            </conditional>
+            <param name="model_preset" value="multimer"/>
+            <param name="outputs|plots" value="true"/>
+            <param name="outputs|confidence_scores" value="true"/>
+            <param name="outputs|plddts" value="true"/>
+            <param name="outputs|pae_csv" value="true"/>
+            <param name="outputs|model_pkls" value="true"/>
+            <param name="outputs|relax_json" value="true"/>
+            <expand macro="test_output_plots_3" />
+            <expand macro="test_output_confidence_scores" />
+            <expand macro="test_output_plddts" />
+            <expand macro="test_output_pdb_models" />
+            <expand macro="test_output_pickles" />
+            <expand macro="test_output_relax_json" />
+            <expand macro="test_output_pae_csv" />
         </test>
     </tests>
     <help><![CDATA[
@@ -389,19 +385,36 @@
     *Model data files (ranked_n.pkl)*

     | Per-model data stored in pickle files (a Python binary data format). These files can be used as inputs to downstream analysis software (such as Chimera X) for visualizing structures and computing kinetics between protein multimers and domains.
-    | The tool will produce one ``.pkl`` output for each of the PDB models.
+    | The tool will produce one ``.pkl`` output for each PDB model.
+    |
+    |
+
+    *pLDDT + PAE plots (optional)*
+
+    | A two-panel figure in PNG format showing:
+    | a) pLDDT score plotted against residue position
+    | b) a heatmap of predicted aligned error (PAE) with residue position running along the vertical and horizontal axes and color at each pixel indicating the PAE value for the corresponding pair of residues.
+    | Panel b) is only produced for ``monomer_ptm`` and ``multimer`` model presets.
+    |
+    |
+
+    *Model predicted aligned error matrix (pae_ranked_n.csv)*
+
+    | Per-model predicted aligned error (PAE) matrix - only available with the ``monomer_ptm`` and ``multimer`` model presets.
+    | The tool will produce one ``.csv`` output for each PDB model.
     |
     |

     *relax_metrics.json (optional)*

-    | A JSON-formatted text file containing relax metrics (mostly remaining violations).
+    | A JSON-formatted text file containing relax metrics (primarily remaining violations).
+    |
     |

     **AlphaFold configuration**

     | We have configured AlphaFold to run with the parameters suggested by default on `AlphaFold's GitHub <https://github.com/deepmind/alphafold>`_.
-    | This means that it runs with Amber relaxation enabled, with relaxed PDB models collected as output datasets. If there are additonal parameters that you would like to interact with, please `send a support request to Galaxy AU <https://site.usegalaxy.org.au/request/support>`_, or open an issue on `our GitHub <https://github.com/usegalaxy-au/tools-au>`_.
+    | This means that it runs with Amber relaxation enabled, with relaxed PDB models collected as output datasets (ranked\_*.pdb files). If there are additional parameters that you would like to interact with, please `send a support request to Galaxy AU <https://site.usegalaxy.org.au/request/support>`_, or open an issue on `our GitHub <https://github.com/usegalaxy-au/tools-au>`_.
     |
     |
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/macro_output.xml	Fri Mar 10 02:48:07 2023 +0000
@@ -0,0 +1,176 @@
+<macros>
+    <xml name="output_pdb_models">
+        <data name="model5" format="pdb" from_work_dir="output/alphafold/ranked_4.pdb" label="${tool.name} on ${on_string}: PDB ranked 4"/>
+        <data name="model4" format="pdb" from_work_dir="output/alphafold/ranked_3.pdb" label="${tool.name} on ${on_string}: PDB ranked 3"/>
+        <data name="model3" format="pdb" from_work_dir="output/alphafold/ranked_2.pdb" label="${tool.name} on ${on_string}: PDB ranked 2"/>
+        <data name="model2" format="pdb" from_work_dir="output/alphafold/ranked_1.pdb" label="${tool.name} on ${on_string}: PDB ranked 1"/>
+        <data name="model1" format="pdb" from_work_dir="output/alphafold/ranked_0.pdb" label="${tool.name} on ${on_string}: PDB ranked 0"/>
+    </xml>
+
+    <xml name="output_pae_csv">
+        <data
+            name="pae_ranked_4"
+            format="csv"
+            from_work_dir="output/alphafold/extra/pae_ranked_4.csv"
+            label="${tool.name} on ${on_string}: pae_ranked_4.csv"
+        >
+            <filter>outputs['pae_csv']</filter>
+            <filter>model_preset != "monomer"</filter>
+        </data>
+        <data
+            name="pae_ranked_3"
+            format="csv"
+            from_work_dir="output/alphafold/extra/pae_ranked_3.csv"
+            label="${tool.name} on ${on_string}: pae_ranked_3.csv"
+        >
+            <filter>outputs['pae_csv']</filter>
+            <filter>model_preset != "monomer"</filter>
+        </data>
+        <data
+            name="pae_ranked_2"
+            format="csv"
+            from_work_dir="output/alphafold/extra/pae_ranked_2.csv"
+            label="${tool.name} on ${on_string}: pae_ranked_2.csv"
+        >
+            <filter>outputs['pae_csv']</filter>
+            <filter>model_preset != "monomer"</filter>
+        </data>
+        <data
+            name="pae_ranked_1"
+            format="csv"
+            from_work_dir="output/alphafold/extra/pae_ranked_1.csv"
+            label="${tool.name} on ${on_string}: pae_ranked_1.csv"
+        >
+            <filter>outputs['pae_csv']</filter>
+            <filter>model_preset != "monomer"</filter>
+        </data>
+        <data
+            name="pae_ranked_0"
+            format="csv"
+            from_work_dir="output/alphafold/extra/pae_ranked_0.csv"
+            label="${tool.name} on ${on_string}: pae_ranked_0.csv"
+        >
+            <filter>outputs['pae_csv']</filter>
+            <filter>model_preset != "monomer"</filter>
+        </data>
+    </xml>
+
+    <xml name="output_pickles">
+        <data
+            name="output_ranked_4_pkl"
+            format="binary"
+            from_work_dir="output/alphafold/extra/ranked_4.pkl"
+            label="${tool.name} on ${on_string}: ranked_4.pkl"
+        >
+            <filter>outputs['model_pkls']</filter>
+        </data>
+        <data
+            name="output_ranked_3_pkl"
+            format="binary"
+            from_work_dir="output/alphafold/extra/ranked_3.pkl"
+            label="${tool.name} on ${on_string}: ranked_3.pkl"
+        >
+            <filter>outputs['model_pkls']</filter>
+        </data>
+        <data
+            name="output_ranked_2_pkl"
+            format="binary"
+            from_work_dir="output/alphafold/extra/ranked_2.pkl"
+            label="${tool.name} on ${on_string}: ranked_2.pkl"
+        >
+            <filter>outputs['model_pkls']</filter>
+        </data>
+        <data
+            name="output_ranked_1_pkl"
+            format="binary"
+            from_work_dir="output/alphafold/extra/ranked_1.pkl"
+            label="${tool.name} on ${on_string}: ranked_1.pkl"
+        >
+            <filter>outputs['model_pkls']</filter>
+        </data>
+        <data
+            name="output_ranked_0_pkl"
+            format="binary"
+            from_work_dir="output/alphafold/extra/ranked_0.pkl"
+            label="${tool.name} on ${on_string}: ranked_0.pkl"
+        >
+            <filter>outputs['model_pkls']</filter>
+        </data>
+    </xml>
+
+    <xml name="output_plots">
+        <data
+            name="plot_ranked_4"
+            format="png"
+            from_work_dir="output/alphafold/extra/ranked_4.png"
+            label="${tool.name} on ${on_string}: pLDDT/PAE plot ranked 4"
+        >
+            <filter>outputs['plots']</filter>
+        </data>
+        <data
+            name="plot_ranked_3"
+            format="png"
+            from_work_dir="output/alphafold/extra/ranked_3.png"
+            label="${tool.name} on ${on_string}: pLDDT/PAE plot ranked 3"
+        >
+            <filter>outputs['plots']</filter>
+        </data>
+        <data
+            name="plot_ranked_2"
+            format="png"
+            from_work_dir="output/alphafold/extra/ranked_2.png"
+            label="${tool.name} on ${on_string}: pLDDT/PAE plot ranked 2"
+        >
+            <filter>outputs['plots']</filter>
+        </data>
+        <data
+            name="plot_ranked_1"
+            format="png"
+            from_work_dir="output/alphafold/extra/ranked_1.png"
+            label="${tool.name} on ${on_string}: pLDDT/PAE plot ranked 1"
+        >
+            <filter>outputs['plots']</filter>
+        </data>
+        <data
+            name="plot_ranked_0"
+            format="png"
+            from_work_dir="output/alphafold/extra/ranked_0.png"
+            label="${tool.name} on ${on_string}: pLDDT/PAE plot ranked 0"
+        >
+            <filter>outputs['plots']</filter>
+        </data>
+    </xml>
+
+    <xml name="output_confidence_scores">
+        <data
+            name="output_confidence_scores"
+            format="tabular"
+            from_work_dir="output/alphafold/extra/model_confidence_scores.tsv"
+            label="${tool.name} on ${on_string}: Model confidence scores"
+        >
+            <filter>outputs['confidence_scores']</filter>
+        </data>
+    </xml>
+
+    <xml name="output_plddts">
+        <data
+            name="output_plddts"
+            format="tabular"
+            from_work_dir="output/alphafold/extra/plddts.tsv"
+            label="${tool.name} on ${on_string}: Per-residue confidence scores (plddts)"
+        >
+            <filter>outputs['plddts']</filter>
+        </data>
+    </xml>
+
+    <xml name="output_relax_json">
+        <data
+            name="output_relax_json"
+            format="json"
+            from_work_dir="output/alphafold/extra/relax_metrics_ranked.json"
+            label="${tool.name} on ${on_string}: relax_metrics_ranked.json"
+        >
+            <filter>outputs['relax_json']</filter>
+        </data>
+    </xml>
+</macros>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/macro_test_output.xml	Fri Mar 10 02:48:07 2023 +0000
@@ -0,0 +1,199 @@
+<macros>
+    <xml name="test_output_plddts">
+        <!-- Expects output_plddts to contain exactly 5 lines and be at least 2000 bytes. -->
+        <output name="output_plddts">
+            <assert_contents>
+                <has_n_lines n="5"/>
+                <has_size min="2000" />
+            </assert_contents>
+        </output>
+    </xml>
+
+    <xml name="test_output_confidence_scores">
+        <!-- Expects output_confidence_scores to have 2 columns, exactly 5 lines, and a size of 70 bytes give or take 50. -->
+        <output name="output_confidence_scores">
+            <assert_contents>
+                <has_n_columns n="2"/>
+                <has_n_lines n="5"/>
+                <has_size value="70" delta="50"/>
+            </assert_contents>
+        </output>
+    </xml>
+
+    <xml name="test_output_relax_json">
+        <!-- Expects output_relax_json to be at least 500 bytes; its content is not inspected. -->
+        <output name="output_relax_json">
+            <assert_contents>
+                <has_size min="500" />
+            </assert_contents>
+        </output>
+    </xml>
+
+    <xml name="test_output_pdb_models">
+        <!-- Expects each of the outputs model1 to model5 to be at least 20000 bytes; content is not inspected. -->
+        <output name="model1">
+            <assert_contents>
+                <has_size min="20000"/>
+            </assert_contents>
+        </output>
+        <output name="model2">
+            <assert_contents>
+                <has_size min="20000"/>
+            </assert_contents>
+        </output>
+        <output name="model3">
+            <assert_contents>
+                <has_size min="20000"/>
+            </assert_contents>
+        </output>
+        <output name="model4">
+            <assert_contents>
+                <has_size min="20000"/>
+            </assert_contents>
+        </output>
+        <output name="model5">
+            <assert_contents>
+                <has_size min="20000"/>
+            </assert_contents>
+        </output>
+    </xml>
+
+    <xml name="test_output_plots_1">
+        <!-- For one-panel plot without PAE heatmap: each of plot_ranked_0 to plot_ranked_4 must be between 10000 and 50000 bytes. -->
+        <output name="plot_ranked_0">
+            <assert_contents>
+                <has_size min="10000" max="50000" />
+            </assert_contents>
+        </output>
+        <output name="plot_ranked_1">
+            <assert_contents>
+                <has_size min="10000" max="50000" />
+            </assert_contents>
+        </output>
+        <output name="plot_ranked_2">
+            <assert_contents>
+                <has_size min="10000" max="50000" />
+            </assert_contents>
+        </output>
+        <output name="plot_ranked_3">
+            <assert_contents>
+                <has_size min="10000" max="50000" />
+            </assert_contents>
+        </output>
+        <output name="plot_ranked_4">
+            <assert_contents>
+                <has_size min="10000" max="50000" />
+            </assert_contents>
+        </output>
+    </xml>
+
+    <xml name="test_output_plots_2">
+        <!-- For two-panel plot with PAE heatmap: each of plot_ranked_0 to plot_ranked_4 must be between 50000 and 63000 bytes. -->
+        <output name="plot_ranked_0">
+            <assert_contents>
+                <has_size min="50000" max="63000" />
+            </assert_contents>
+        </output>
+        <output name="plot_ranked_1">
+            <assert_contents>
+                <has_size min="50000" max="63000" />
+            </assert_contents>
+        </output>
+        <output name="plot_ranked_2">
+            <assert_contents>
+                <has_size min="50000" max="63000" />
+            </assert_contents>
+        </output>
+        <output name="plot_ranked_3">
+            <assert_contents>
+                <has_size min="50000" max="63000" />
+            </assert_contents>
+        </output>
+        <output name="plot_ranked_4">
+            <assert_contents>
+                <has_size min="50000" max="63000" />
+            </assert_contents>
+        </output>
+    </xml>
+
+    <xml name="test_output_plots_3">
+        <!-- For two-panel plot with PAE heatmap: each of plot_ranked_0 to plot_ranked_4 must be between 220000 and 270000 bytes. NOTE(review): described identically to test_output_plots_2 but with a much larger size range; confirm which run configuration this macro targets. -->
+        <output name="plot_ranked_0">
+            <assert_contents>
+                <has_size min="220000" max="270000" />
+            </assert_contents>
+        </output>
+        <output name="plot_ranked_1">
+            <assert_contents>
+                <has_size min="220000" max="270000" />
+            </assert_contents>
+        </output>
+        <output name="plot_ranked_2">
+            <assert_contents>
+                <has_size min="220000" max="270000" />
+            </assert_contents>
+        </output>
+        <output name="plot_ranked_3">
+            <assert_contents>
+                <has_size min="220000" max="270000" />
+            </assert_contents>
+        </output>
+        <output name="plot_ranked_4">
+            <assert_contents>
+                <has_size min="220000" max="270000" />
+            </assert_contents>
+        </output>
+    </xml>
+
+    <xml name="test_output_pae_csv">
+        <!-- Expects each of pae_ranked_0 to pae_ranked_4 to be at least 5000 bytes; content is not inspected. -->
+        <output name="pae_ranked_0">
+            <assert_contents>
+                <has_size min="5000"/>
+            </assert_contents>
+        </output>
+        <output name="pae_ranked_1">
+            <assert_contents>
+                <has_size min="5000"/>
+            </assert_contents>
+        </output>
+        <output name="pae_ranked_2">
+            <assert_contents>
+                <has_size min="5000"/>
+            </assert_contents>
+        </output>
+        <output name="pae_ranked_3">
+            <assert_contents>
+                <has_size min="5000"/>
+            </assert_contents>
+        </output>
+        <output name="pae_ranked_4">
+            <assert_contents>
+                <has_size min="5000"/>
+            </assert_contents>
+        </output>
+    </xml>
+
+    <xml name="test_output_pickles">
+        <!-- Expects each of output_ranked_0_pkl to output_ranked_4_pkl to be at least 1000000 bytes; content is not inspected. -->
+        <output name="output_ranked_4_pkl">
+            <assert_contents>
+                <has_size min="1000000"/>
+            </assert_contents>
+        </output>
+        <output name="output_ranked_3_pkl">
+            <assert_contents>
+                <has_size min="1000000"/>
+            </assert_contents>
+        </output>
+        <output name="output_ranked_2_pkl">
+            <assert_contents>
+                <has_size min="1000000"/>
+            </assert_contents>
+        </output>
+        <output name="output_ranked_1_pkl">
+            <assert_contents>
+                <has_size min="1000000"/>
+            </assert_contents>
+        </output>
+        <output name="output_ranked_0_pkl">
+            <assert_contents>
+                <has_size min="1000000"/>
+            </assert_contents>
+        </output>
+    </xml>
+</macros>
--- a/outputs.py	Tue Feb 28 01:15:42 2023 +0000
+++ b/outputs.py	Fri Mar 10 02:48:07 2023 +0000
@@ -19,13 +19,16 @@
 import os
 import pickle as pk
 import shutil
+from matplotlib import pyplot as plt
 from pathlib import Path
 from typing import List

-# Output file names
+# Output file paths
 OUTPUT_DIR = 'extra'
 OUTPUTS = {
     'model_pkl': OUTPUT_DIR + '/ranked_{rank}.pkl',
+    'model_pae': OUTPUT_DIR + '/pae_ranked_{rank}.csv',
+    'model_plot': OUTPUT_DIR + '/ranked_{rank}.png',
     'model_confidence_scores': OUTPUT_DIR + '/model_confidence_scores.tsv',
     'plddts': OUTPUT_DIR + '/plddts.tsv',
     'relax': OUTPUT_DIR + '/relax_metrics_ranked.json',
@@ -46,8 +49,9 @@
         self.output_confidence_scores = True
         self.output_residue_scores = False
         self.is_multimer = False
+        self.parse()

-    def parse_settings(self) -> None:
+    def parse(self) -> None:
         parser = argparse.ArgumentParser()
         parser.add_argument(
             "workdir",
@@ -67,15 +71,26 @@
             action="store_true"
         )
         parser.add_argument(
-            "--model-pkl",
-            dest="model_pkl",
+            "--pkl",
             help="rename model pkl outputs with rank order",
             action="store_true"
         )
+        parser.add_argument(
+            "--pae",
+            help="extract PAE from pkl files to CSV format",
+            action="store_true"
+        )
+        parser.add_argument(
+            "--plot",
+            help="Plot pLDDT and PAE for each model",
+            action="store_true"
+        )
         args = parser.parse_args()
         self.workdir = Path(args.workdir.rstrip('/'))
         self.output_residue_scores = args.plddts
-        self.output_model_pkls = args.model_pkl
+        self.output_model_pkls = args.pkl
+        self.output_model_plots = args.plot
+        self.output_pae = args.pae
         self.is_multimer = args.multimer
         self.output_dir = self.workdir / OUTPUT_DIR
         os.makedirs(self.output_dir, exist_ok=True)
@@ -212,6 +227,31 @@
         shutil.copyfile(path, new_path)


+def extract_pae_to_csv(ranking: ResultRanking, context: ExecutionContext):
+    """Write each model's predicted aligned error (PAE) matrix to CSV.
+
+    One file is written per model pickle, named by rank via
+    OUTPUTS['model_pae']. If a model has no PAE matrix, a notice is
+    printed and extraction stops.
+    """
+    for path in context.model_pkl_paths:
+        model = ResultModelPrediction(path, context)
+        rank = ranking.get_rank_for_model(model.name)
+        # Reuse model.data, as plddt_pae_plots does, rather than re-unpickling.
+        if 'predicted_aligned_error' not in model.data:
+            print("Skipping PAE output"
+                  f" - not found in {path}."
+                  " Running with model_preset=monomer?")
+            return
+        out_path = (
+            context.settings.workdir
+            / OUTPUTS['model_pae'].format(rank=rank)
+        )
+        with open(out_path, 'w') as f:
+            for row in model.data['predicted_aligned_error']:
+                f.write(','.join(str(x) for x in row) + '\n')
+
+
 def rekey_relax_metrics(ranking: ResultRanking, context: ExecutionContext):
     """Replace keys in relax_metrics.json with 0-indexed rank."""
     with open(context.relax_metrics) as f:
@@ -224,10 +264,44 @@
         json.dump(data, f)


+def plddt_pae_plots(ranking: ResultRanking, context: ExecutionContext):
+    """Save a pLDDT plot, plus a PAE heatmap when available, per model.
+
+    Each PNG is named by model rank using OUTPUTS['model_plot'].
+    """
+    for path in context.model_pkl_paths:
+        model = ResultModelPrediction(path, context)
+        rank = ranking.get_rank_for_model(model.name)
+        png_path = (
+            context.settings.workdir
+            / OUTPUTS['model_plot'].format(rank=rank)
+        )
+        has_pae = 'predicted_aligned_error' in model.data
+        num_plots = 2 if has_pae else 1  # single panel when PAE is absent
+        plt.figure(figsize=[8 * num_plots, 6])
+        plt.subplot(1, num_plots, 1)
+        plt.plot(model.data['plddt'])
+        plt.title('Predicted LDDT')
+        plt.xlabel('Residue')
+        plt.ylabel('pLDDT')
+
+        if has_pae:
+            pae = model.data['predicted_aligned_error']
+            max_pae = model.data['max_predicted_aligned_error']
+            plt.subplot(1, 2, 2)
+            plt.imshow(pae, vmin=0., vmax=max_pae, cmap='Greens_r')
+            plt.colorbar(fraction=0.046, pad=0.04)
+            plt.title('Predicted Aligned Error')
+            plt.xlabel('Scored residue')
+            plt.ylabel('Aligned residue')
+
+        plt.savefig(png_path)
+        plt.close()  # release this figure before plotting the next model
+
+
 def main():
     """Parse output files and generate additional output files."""
     settings = Settings()
-    settings.parse_settings()
     context = ExecutionContext(settings)
     ranking = ResultRanking(context)
     write_confidence_scores(ranking, context)
@@ -236,7 +310,11 @@
     # Optional outputs
     if settings.output_model_pkls:
         rename_model_pkls(ranking, context)
-
+    if settings.output_model_plots:
+        plddt_pae_plots(ranking, context)
+    if settings.output_pae:
+        # Only created by monomer_ptm and multimer models
+        extract_pae_to_csv(ranking, context)
     if settings.output_residue_scores:
         write_per_residue_scores(ranking, context)
--- a/validate_fasta.py	Tue Feb 28 01:15:42 2023 +0000
+++ b/validate_fasta.py	Fri Mar 10 02:48:07 2023 +0000
@@ -205,6 +205,11 @@
         for fas in clean_fastas:
             fw.write(fas)

+        sys.stderr.write("Validated FASTA sequence(s):\n\n")
+        for fas in clean_fastas:
+            sys.stderr.write(fas.header + '\n')
+            sys.stderr.write(fas.aa_seq + '\n\n')
+
     except ValueError as exc:
         sys.stderr.write(f"{exc}\n\n")
         raise exc