view alphafold.xml @ 19:2f7702fd0a4c draft

planemo upload for repository https://github.com/usegalaxy-au/tools-au commit cd0379c8ecc24977dffa462c1897d402c85fa4e6
author galaxy-australia
date Wed, 08 May 2024 06:26:55 +0000
parents e4a053d67e24
children 6ab1a261520a
line wrap: on
line source

<tool id="alphafold" name="Alphafold 2" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="20.01">
    <description> - AI-guided 3D structural prediction of proteins</description>
    <!--
        Version tokens and macro imports.
        @TOOL_VERSION@ and @VERSION_SUFFIX@ are combined into the tool's version
        attribute. @TOOL_MINOR_VERSION@ is used by the command section as the name
        of the versioned reference-data subdirectory below the database root.
        The output and test-output macros expanded further down are imported
        from the two macro files listed here.
    -->
    <macros>
      <token name="@TOOL_VERSION@">2.3.1</token>
      <token name="@TOOL_MINOR_VERSION@">2.3</token>
      <token name="@VERSION_SUFFIX@">5</token>
      <import>macro_output.xml</import>
      <import>macro_test_output.xml</import>
    </macros>
    <!-- EDAM ontology annotations (topic and operation) describing the tool -->
    <edam_topics>
      <edam_topic>topic_0082</edam_topic>
    </edam_topics>
    <edam_operations>
      <edam_operation>operation_0474</edam_operation>
    </edam_operations>
    <!-- Cross-reference to the tool's bio.tools registry entry -->
    <xrefs>
      <xref type="bio.tools">alphafold_2</xref>
    </xrefs>
    <!--
        The job runs inside this Docker image. The command section calls
        /app/alphafold/run_alphafold.py, so that path is expected to exist in
        the image.
    -->
    <requirements>
        <container type="docker">neoformit/alphafold:v2.3.1_2</container>
    </requirements>
    <!--
        Files from the tool directory that the command section reads through
        $__tool_directory__.
        NOTE(review): scripts/mock_alphafold.sh is referenced by the
        PLANEMO_TESTING branch of the command but is not listed here; confirm
        this is intentional (it is only needed for mock test runs).
    -->
    <required_files>
        <include path="scripts/outputs.py" />
        <include path="scripts/validate_fasta.py" />
        <include path="alphafold.html" />
    </required_files>
    <command detect_errors="exit_code"><![CDATA[

## Overview: stage and validate the FASTA input, run AlphaFold (or a mock run
## when testing), derive the optional outputs with scripts/outputs.py and
## assemble the HTML visualization dataset. All steps are chained with `&&`,
## so the job stops at the first step that fails.

## Developers: to test with a mock AlphaFold run, set `export PLANEMO_TESTING=1`
## in planemo's gx_venv_n/bin/activate script. AlphaFold outputs will be copied
## from the test-data directory instead of running the tool.

## The $ALPHAFOLD_DB environment variable should point to the directory that
## holds the versioned AlphaFold databases (defaults to /data). That directory
## must contain a subdirectory or symlink named exactly like the value of the
## TOOL_MINOR_VERSION token, which in turn contains the AlphaFold reference
## data for the corresponding version.

## Read FASTA input -----------------------------------------------------------
## The sequence(s) end up in input.fasta, either copied from the selected
## history dataset or written from the pasted text.
#if $fasta_or_text.input_mode == 'history':
    cp '$fasta_or_text.fasta_file' input.fasta
#elif $fasta_or_text.input_mode == 'textbox':
    echo '$fasta_or_text.fasta_text' > input.fasta
#end if

## Validate input.fasta; the script's stdout becomes alphafold.fasta, which is
## the file actually passed to AlphaFold. Length limits and the maximum number
## of multimer sequences are read from environment variables that the Galaxy
## admin may set (defaults: 0, 0 and 10).
## NOTE(review): a limit of 0 presumably means "no limit" - confirm in
## scripts/validate_fasta.py.
&& python3 '$__tool_directory__/scripts/validate_fasta.py' input.fasta
--min_length \${ALPHAFOLD_AA_LENGTH_MIN:-0}
--max_length \${ALPHAFOLD_AA_LENGTH_MAX:-0}
#if $model_preset == 'multimer':
--multimer
--max-sequences \${ALPHAFOLD_MAX_SEQUENCES:-10}
#end if
> alphafold.fasta

## Env vars -------------------------------------------------------------------
## The two memory settings use the same values as AlphaFold's own docker
## launcher (presumably to let the GPU run spill into host memory - confirm
## against the AlphaFold documentation). TODAY is the fallback value for
## --max_template_date when the user leaves that field empty.
&& export TF_FORCE_UNIFIED_MEMORY=1
&& export XLA_PYTHON_CLIENT_MEM_FRACTION=4.0
&& export TODAY=`date +"%Y-%m-%d"`

## Run AlphaFold  -------------------------------------------------------------
#if os.environ.get('PLANEMO_TESTING'):
    ## Run in testing mode (mocks a successful AlphaFold run by copying outputs)
    && echo "Creating dummy outputs for model_preset=$model_preset..."
    && bash '$__tool_directory__/scripts/mock_alphafold.sh' $model_preset
#else:
    ## Run AlphaFold
    && python /app/alphafold/run_alphafold.py
        --fasta_paths alphafold.fasta
        --output_dir output
        --data_dir \${ALPHAFOLD_DB:-/data}/@TOOL_MINOR_VERSION@/
        --model_preset=$model_preset

        ## Set reference database paths
        --uniref90_database_path   \${ALPHAFOLD_DB:-/data}/@TOOL_MINOR_VERSION@/uniref90/uniref90.fasta
        --mgnify_database_path     \${ALPHAFOLD_DB:-/data}/@TOOL_MINOR_VERSION@/mgnify/mgy_clusters_2022_05.fa
        --template_mmcif_dir       \${ALPHAFOLD_DB:-/data}/@TOOL_MINOR_VERSION@/pdb_mmcif/mmcif_files
        --obsolete_pdbs_path       \${ALPHAFOLD_DB:-/data}/@TOOL_MINOR_VERSION@/pdb_mmcif/obsolete.dat
        ## Full database selection uses BFD + UniRef30; anything else runs
        ## with the reduced_dbs preset and the small BFD
        #if $dbs == 'full':
        --bfd_database_path        \${ALPHAFOLD_DB:-/data}/@TOOL_MINOR_VERSION@/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt
        --uniref30_database_path   \${ALPHAFOLD_DB:-/data}/@TOOL_MINOR_VERSION@/uniref30/UniRef30_2021_03
        #else
        --db_preset=reduced_dbs
        --small_bfd_database_path  \${ALPHAFOLD_DB:-/data}/@TOOL_MINOR_VERSION@/small_bfd/bfd-first_non_consensus_sequences.fasta
        #end if

        ## Use the user-supplied cutoff date, or today's date when left empty
        #if $max_template_date:
        --max_template_date=$max_template_date
        #else
        --max_template_date=\$TODAY
        #end if

        --use_gpu_relax=\${ALPHAFOLD_USE_GPU:-True}  ## introduced in v2.1.2

        ## Multimer runs need the pdb_seqres and uniprot databases; the monomer
        ## presets use pdb70 instead
        #if $model_preset == 'multimer':
        --pdb_seqres_database_path=\${ALPHAFOLD_DB:-/data}/@TOOL_MINOR_VERSION@/pdb_seqres/pdb_seqres.txt
        --uniprot_database_path=\${ALPHAFOLD_DB:-/data}/@TOOL_MINOR_VERSION@/uniprot/uniprot.fasta
        --num_multimer_predictions_per_model=1  ## introduced in v2.2.0
        #else
        --pdb70_database_path \${ALPHAFOLD_DB:-/data}/@TOOL_MINOR_VERSION@/pdb70/pdb70
        #end if
#end if

## Generate additional outputs ------------------------------------------------
## Each $outputs.* placeholder below expands to the truevalue flag of the
## matching checkbox in the "Optional outputs" section, or to nothing when the
## checkbox is not ticked. output/alphafold is presumably the per-target
## directory AlphaFold derives from the alphafold.fasta file name.
&& python3 '$__tool_directory__/scripts/outputs.py' output/alphafold
$outputs.plddts
$outputs.model_pkls
$outputs.pae_csv
$outputs.plots
#if $model_preset == 'multimer':
--multimer
#end if

## HTML output
## The static viewer page becomes the html dataset and the ranked PDB models
## are copied into its extra-files directory.
&& mkdir -p '${ html.files_path }'
&& cp '$__tool_directory__/alphafold.html' '${html}'
&& cp output/alphafold/ranked_*.pdb '${html.files_path}'

## This is a (hacky) fix for a bug that has appeared in multiple Pulsar servers.
## The working directory ends up two levels deep and the visualization html page
## fails to load the PDB files as static assets.
## The `|| true` keeps the job from failing when no nested `working` directory
## exists.
&& (([ -d working ] && cp -r working/* .) || true)

    ]]></command>
    <inputs>
        <!--
            Sequence input: the selector decides whether the FASTA comes from a
            history dataset (fasta_file) or from pasted text (fasta_text). The
            command section writes either one to input.fasta.
        -->
        <conditional name="fasta_or_text">
            <param
                name="input_mode"
                type="select"
                label="Fasta Input"
                help="Protein sequence(s) to fold. Input can be fasta file from history, or text. Sequence must be valid IUPAC amino acid characters. If multiple-sequence FASTA file provided, multimer mode must be selected."
            >
                <option value="history">Use fasta from history</option>
                <option value="textbox">Paste sequence into textbox</option>
            </param>

            <!-- FASTA dataset picked from the user's history -->
            <when value="history">
                <param
                    name="fasta_file"
                    type="data"
                    multiple="false"
                    format="fasta"
                    label="Fasta file from history"
                    help="Select single FASTA protein file from your history. If you wish to fold multiple proteins, submit an individual job for each protein. If you wish to run AlphaFold multimer, please supply multiple sequences in this file."
                />
            </when>

            <!-- Sequence(s) pasted into a multi-line text area -->
            <when value="textbox">
                <param
                    name="fasta_text"
                    type="text"
                    area="true"
                    value=""
                    label="Paste sequence"
                    help="Paste single protein sequence into the textbox. If you wish to fold multiple proteins, submit individual jobs for each protein. If you wish to run AlphaFold multimer, please supply multiple sequences in FASTA format."
                />
            </when>
        </conditional>

        <!--
            Optional template cutoff date passed to AlphaFold as its
            max_template_date option. When left empty, the command section
            falls back to the date on which the job runs.
            The sanitizer only lets digits and "-" through. The validator
            pattern is anchored at both ends so that the whole value must be
            yyyy-mm-dd; without the end anchor, values with trailing
            characters (e.g. 2020-01-01-99) would pass validation and only
            fail once the job is running.
        -->
        <param
            name="max_template_date"
            type="text"
            label="Max template date (yyyy-mm-dd) (optional)"
            help="The model will reference PDB structures deposited before this date only. Defaults to today's date."
            optional="true"
        >
            <sanitizer>
                <valid initial="string.digits">
                    <add value="-" />
                </valid>
            </sanitizer>
            <validator type="regex" message="Date must be in the format yyyy-mm-dd, e.g. 2021-09-30">^[0-9]{4}-[0-9]{2}-[0-9]{2}$</validator>
        </param>

        <!--
            Reference database selection. "full" makes the command section pass
            the BFD and UniRef30 databases; any other value runs AlphaFold with
            the reduced_dbs preset and the small BFD.
            The help text is kept on a single line: a line break inside an
            attribute value turns the indentation into part of the text.
        -->
        <param
            name="dbs"
            type="select"
            display="radio"
            label="Select database"
            help="The reduced database allows significantly faster run time in exchange for a small loss in accuracy."
        >
            <option value="reduced" selected="true">Reduced database</option>
            <option value="full">Full database</option>
        </param>

        <!--
            AlphaFold model preset. The value is passed verbatim to AlphaFold's
            model_preset option; "multimer" additionally switches the FASTA
            validation, the database flags and scripts/outputs.py into multimer
            mode in the command section.
            Option labels are kept on one line so that no newline or
            indentation whitespace becomes part of the label text.
        -->
        <param
            name="model_preset"
            type="select"
            label="Model preset"
            help="Select which prediction model to run. The monomer model is the most accurate for single protein prediction. The multimer model allows prediction of protein complexes."
        >
            <option value="monomer" selected="true">monomer - default prediction model</option>
            <option value="monomer_ptm">monomer_ptm - slightly less accurate version of the monomer model, but provides a predicted aligned error (PAE) matrix</option>
            <option value="multimer">multimer - model a protein complex (requires multi-sequence FASTA input)</option>
        </param>

        <!--
            Optional outputs, all off by default.
            plots, plddts, pae_csv and model_pkls define a truevalue that the
            command section places on the scripts/outputs.py command line.
            confidence_scores and relax_json define no truevalue and are not
            referenced in the command section; presumably the imported output
            macros filter on them (not visible in this file, to be confirmed).
        -->
        <section name="outputs" title="Optional outputs" expanded="false">
            <param
                name="plots"
                type="boolean"
                checked="false"
                truevalue="--plot"
                falsevalue=""
                label="pLDDT and PAE matrix plots (per model)"
                help="A two-panel plot showing pLDDT against residue position (left) and PAE (predicted aligned error) as a heatmap image with residue numbers running along vertical and horizontal axes and color at each pixel indicating PAE value for the corresponding pair of residues (right). PAE heatmap is only produced with monomer_ptm and multimer model presets."
            />
            <param
                name="confidence_scores"
                type="boolean"
                checked="false"
                label="Per-model confidence scores"
                help="A tabular file showing average confidence score for each model (predicted template modelling (PTM) score; interface PTM is incorporated into this score for multimer predictions)."
            />
            <param
                name="plddts"
                type="boolean"
                checked="false"
                truevalue="--plddts"
                falsevalue=""
                label="Per-residue confidence scores"
                help="Alphafold produces a pLDDT score between 0-100 for each residue in the folded models. High scores represent high confidence in placement for the residue, while low scoring residues have lower confidence. This output is a tabular file with five rows (one for each output PDB model), with each column providing a pLDDT score for a single residue."
            />
            <param
                name="pae_csv"
                type="boolean"
                checked="false"
                truevalue="--pae"
                falsevalue=""
                label="Predicted aligned error (PAE)"
                help="A CSV-formatted matrix for each model. Only available for monomer_ptm and multimer model presets. Predicted aligned error (PAE) gives a distance error for every pair of residues. It gives AlphaFold's estimate of position error at residue X when the predicted and true structures are aligned on residue Y. Values range from 0 - 35 Angstroms."
            />
            <param
                name="model_pkls"
                type="boolean"
                checked="false"
                truevalue="--pkl"
                falsevalue=""
                label="ranked_*.pkl"
                help="A pickle file containing metrics used for the assessment of the model's accuracy. These include per-residue pLDDT scores (see above), predicted TM (Template Modelling) score, which is a global superposition metric and predicted aligned error (a matrix size (number of residues) x (number of residues) where each position describes the confidence of the residue's 3D position relative to another residue in the model; can be used for the interpretation of relative positions of domains). Pickle files can be read and processed using the Python 'pickle' library (requires the jax Python library). Outputs are named respective to PDB outputs."
            />
            <param
                name="relax_json"
                type="boolean"
                checked="false"
                label="relax_metrics.json"
                help="A JSON-formatted text file containing relax metrics (mostly remaining violations)."
            />
        </section>
    </inputs>

    <outputs>
        <!--
            Default outputs: the PDB models (declared in an imported macro) and
            the HTML visualization page. The command section copies
            alphafold.html to the html dataset and the ranked PDB files into
            its extra-files directory.
        -->
        <expand macro="output_pdb_models" />
        <data name="html" format="html" label="${tool.name} on ${on_string}: Visualization" />
        <!--
            Optional outputs. Each one is declared in an imported macro;
            presumably gated on the matching checkbox of the "Optional outputs"
            input section (macro definitions are not visible in this file).
        -->
        <expand macro="output_plddts" />
        <expand macro="output_confidence_scores" />
        <expand macro="output_pickles" />
        <expand macro="output_pae_csv" />
        <expand macro="output_plots" />
        <expand macro="output_relax_json" />
    </outputs>

    <tests>
        <!--
            These tests are meant to run with PLANEMO_TESTING set, in which case
            the command section copies mock AlphaFold outputs instead of running
            a real prediction (see the developer note at the top of the command).
            Output assertions come from the imported test-output macros; the
            expect_num_outputs values depend on the output macros, which are not
            visible in this file.
        -->

        <!-- Test monomer with default outputs -->
        <test expect_num_outputs="6">
            <conditional name="fasta_or_text">
                <param name="input_mode" value="history"/>
                <param name="fasta_file" value="test1.fasta"/>
            </conditional>
            <param name="model_preset" value="monomer"/>
            <expand macro="test_output_pdb_models" />
        </test>

        <!-- Test monomer with all outputs (every optional checkbox ticked; no PAE CSV assertions for this preset) -->
        <test expect_num_outputs="19">
            <conditional name="fasta_or_text">
                <param name="input_mode" value="history"/>
                <param name="fasta_file" value="test1.fasta"/>
            </conditional>
            <param name="model_preset" value="monomer"/>
            <param name="outputs|plots" value="true"/>
            <param name="outputs|confidence_scores" value="true"/>
            <param name="outputs|plddts" value="true"/>
            <param name="outputs|pae_csv" value="true"/>
            <param name="outputs|model_pkls" value="true"/>
            <param name="outputs|relax_json" value="true"/>
            <expand macro="test_output_plots_1" />
            <expand macro="test_output_confidence_scores" />
            <expand macro="test_output_plddts" />
            <expand macro="test_output_pdb_models" />
            <expand macro="test_output_pickles" />
            <expand macro="test_output_relax_json" />
        </test>

        <!-- Test monomer_ptm with all outputs (PAE CSV outputs are asserted as well) -->
        <test expect_num_outputs="24">
            <conditional name="fasta_or_text">
                <param name="input_mode" value="history"/>
                <param name="fasta_file" value="test1.fasta"/>
            </conditional>
            <param name="model_preset" value="monomer_ptm"/>
            <param name="outputs|plots" value="true"/>
            <param name="outputs|confidence_scores" value="true"/>
            <param name="outputs|plddts" value="true"/>
            <param name="outputs|pae_csv" value="true"/>
            <param name="outputs|model_pkls" value="true"/>
            <param name="outputs|relax_json" value="true"/>
            <expand macro="test_output_plots_2" />
            <expand macro="test_output_confidence_scores" />
            <expand macro="test_output_plddts" />
            <expand macro="test_output_pdb_models" />
            <expand macro="test_output_pickles" />
            <expand macro="test_output_relax_json" />
            <expand macro="test_output_pae_csv" />
        </test>

        <!-- Test multimer with all outputs (multi-sequence FASTA input; PAE CSV outputs are asserted as well) -->
        <test expect_num_outputs="24">
            <conditional name="fasta_or_text">
                <param name="input_mode" value="history"/>
                <param name="fasta_file" value="multimer.fasta"/>
            </conditional>
            <param name="model_preset" value="multimer"/>
            <param name="outputs|plots" value="true"/>
            <param name="outputs|confidence_scores" value="true"/>
            <param name="outputs|plddts" value="true"/>
            <param name="outputs|pae_csv" value="true"/>
            <param name="outputs|model_pkls" value="true"/>
            <param name="outputs|relax_json" value="true"/>
            <expand macro="test_output_plots_3" />
            <expand macro="test_output_confidence_scores" />
            <expand macro="test_output_plddts" />
            <expand macro="test_output_pdb_models" />
            <expand macro="test_output_pickles" />
            <expand macro="test_output_relax_json" />
            <expand macro="test_output_pae_csv" />
        </test>
    </tests>
    <help><![CDATA[

    .. class:: infomark

    | AlphaFold v2: AI-guided 3D structural prediction of proteins
    |
    | **NOTE: this tool packages AlphaFold v2.3.1.**
    |
    | This means that the neural network has been trained on PDBs with a release
    | date before 2021-09-30 (the training cutoff was 2018-04-30 until ``v2.3.0``).
    |
    | Find out more in the technical and release notes:
    |

    - `Release notes for v2.3.1 <https://github.com/deepmind/alphafold/releases/tag/v2.3.1>`_
    - `Technical notes for v2.3 <https://github.com/deepmind/alphafold/blob/main/docs/technical_note_v2.3.0.md>`_

    | If you want to use AlphaFold trained against an older cutoff date, switch to Galaxy version ``2.1.2`` (which was trained to data up to 2018-04-30).
    |

    **What it does**

    *What is AlphaFold?*

    | AlphaFold is a program which uses neural networks to predict the tertiary (3D) structure of proteins. AlphaFold accepts an amino acid sequence in Fasta format, which will be "folded" into a 3D model.
    |

    *What makes AlphaFold different?*

    | The ability to use computers to predict 3D protein structures with high accuracy is desirable because it removes the time-consuming and costly process of determining structures experimentally.
    | In-silico protein folding has been an active field of research for decades, but existing tools were slower and far less reliable than AlphaFold.
    | AlphaFold represents a leap forward by regularly predicting structures to atomic-level accuracy, even when no similar structures are known.
    |


    **Input**

    *Amino acid sequence*

    | AlphaFold monomer (default) accepts a **single amino acid sequence** in FASTA format.
    | You can choose to input either a file from your Galaxy history or paste a sequence into a text box.
    | If you choose the ``multimer`` option, you can supply a FASTA file containing **multiple sequences** to be folded concurrently into a multimer.
    |
    |

    **Outputs**

    *Visualization*

    An interactive 3D graphic of the best predicted molecular structures.
    This output can be opened in Galaxy to give a visual impression of the results, with different structural representations to choose from.
    Open the "Visualization" history output by clicking on the "view data" icon:

    .. image:: https://github.com/usegalaxy-au/galaxy-local-tools/blob/1a8d3e8daa7ccc5a345ca377697735ab95ed0666/tools/alphafold/static/img/alphafold-visualization.png?raw=true
        :height: 520
        :alt: Result visualization

    |

    *PDB files*

    | Five PDB (Protein Data Bank) files are be created, ordered by rank, as predicted by AlphaFold.
    | These files describe the molecular structures and can be used for downstream analysis. e.g. *in silico* molecular docking.
    | **PLEASE NOTE** that all outputs have been renamed to their respective rank order, including model and model.pkl files.
    |

    *Model confidence scores (optional)*

    | This optional output produces a file which describes the confidence scores for each model (based on `pLDDTs <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3799472/>`_, or the ``iptm+ptm`` score if run in multimer mode) which may be useful for downstream analysis.
    | Model confidence scores are also included as a column (replacing ``bFactor``) in the default PDB output.
    |
    |

    *Model data files (ranked_n.pkl)*

    | Per-model data stored in pickle files (a Python binary data format). These files can be used as inputs to downstream analysis software (such as Chimera X) for visualizing structures and computing kinetics between protein multimers and domains.
    | The tool will produce one ``.pkl`` output for each PDB model.
    |
    |

    *pLDDT + PAE plots (optional)*

    | A two-panel figure in PNG format showing:
    | a) pLDDT score plotted against residue position
    | b) a heatmap of predicted-alignment error (PAE) with residue position running along vertical and horizontal axes and color at each pixel indicating PAE value for the corresponding pair of residues.
    | Panel b) is only produced for ``monomer_ptm`` and ``multimer`` model presets.
    |
    |

    *Model predicted-alignment error matrix (pae_ranked_n.csv)*

    | Per-model predicted-alignment error (PAE) matrix - only available with the ``monomer_ptm`` and ``multimer`` model presets.
    | The tool will produce one ``.csv`` output for each PDB model.
    |
    |

    *relax_metrics.json (optional)*

    | A JSON-formatted text file containing relax metrics (primarily remaining violations).
    |
    |

    **AlphaFold configuration**

    | We have configured AlphaFold to run with the parameters suggested by default on `AlphaFold's GitHub <https://github.com/deepmind/alphafold>`_.
    | This means that it runs with Amber relaxation enabled, with relaxed PDB models collected as output datasets (ranked\_*.pdb files). If there are additonal parameters that you would like to interact with, please `send a support request to Galaxy AU <https://site.usegalaxy.org.au/request/support>`_, or open an issue on `our GitHub <https://github.com/usegalaxy-au/tools-au>`_.
    |
    |

    **External Resources**

    We highly recommend checking out the
    `Alphafold Protein Structure Database <https://alphafold.ebi.ac.uk/>`_,
    which contains pre-computed structures for over 200 million known proteins.
    See also:

    - `Google Deepmind's article on AlphaFold <https://deepmind.com/blog/article/alphafold-a-solution-to-a-50-year-old-grand-challenge-in-biology>`_
    - `AlphaFold source code on GitHub <https://github.com/deepmind/alphafold>`_

    *Downstream analysis*

    | Obtaining a protein structure prediction is the first step in many analyses.
    | The 3D models created by AlphaFold can be used in downstream analysis, including the following:
    |

    - Inspecting protein features
        3D viewers (pymol, chimera, ngl, blender) can be used to inspect active sites, regulatory domains, binding sites.
    - Molecular docking
        3D structures can be used to predict the binding affinity of different compounds.
        This is especially useful in screening drug candidates.
    - Protein-protein interactions
        Proteins associate in many biological processes, including intracellular signalling pathways and protein complex formation.
        To predict these interactions, other programs may ingest 3D models predicted by AlphaFold. Proprietary softwares include `GOLD <https://www.ccdc.cam.ac.uk/solutions/csd-discovery/components/gold/>`_ and `SeeSAR <https://www.biosolveit.de/SeeSAR>`_, but many `free and open-source options <https://en.wikipedia.org/wiki/List_of_protein-ligand_docking_software>`_ are available such as `AutoDock <https://autodock.scripps.edu/>`_, `SwissDock <http://www.swissdock.ch/>`_, `DockQ <https://github.com/bjornwallner/DockQ>`_, `MM-Align <https://zhanggroup.org/MM-align/>`_ and `TM-Align <https://zhanggroup.org/TM-align/>`_. Protein-protein interactions are often inferred from AlphaFold-Multimer predictions, which provide a level of confidence in binding affinity between homomer/heteromer subunits.

    ]]></help>
    <!--
        Citations for AlphaFold and AlphaFold-Multimer.
        Citations of type "doi" hold the bare DOI rather than a full
        https://doi.org/ URL, which is the form shown in the Galaxy tool
        schema documentation.
    -->
    <citations>
        <citation type="doi">10.1038/s41586-021-03819-2</citation>
        <citation type="doi">10.1101/2021.10.04.463034</citation>
    </citations>
</tool>