Mercurial > repos > galaxy-australia > alphafold2

<tool id="alphafold" name="Alphafold 2" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="22.05">
    <description> - AI-guided 3D structural prediction of proteins</description>
    <macros>
        <!-- Version tokens: combined into the tool's version attribute. -->
        <token name="@TOOL_VERSION@">2.3.2</token>
        <!-- Used by the command template to build the versioned reference data paths. -->
        <token name="@TOOL_MINOR_VERSION@">2.3</token>
        <token name="@VERSION_SUFFIX@">2</token>
        <!-- External macro files; presumably these define the output and test-output macros expanded further down. -->
        <import>macro_output.xml</import>
        <import>macro_test_output.xml</import>
    </macros>
    <!-- EDAM ontology annotations and bio.tools cross-reference for this tool. -->
    <edam_topics>
        <edam_topic>topic_0082</edam_topic>
    </edam_topics>
    <edam_operations>
        <edam_operation>operation_0474</edam_operation>
    </edam_operations>
    <xrefs>
        <xref type="bio.tools">alphafold_2</xref>
    </xrefs>
    <!-- Pinned Docker image for the job. The command calls /app/alphafold/run_alphafold.py, presumably shipped in this image. -->
    <requirements>
        <container type="docker">neoformit/alphafold:v2.3.2_0</container>
    </requirements>
    <!-- Tool-directory files declared as required by the job.
         NOTE(review): the command's PLANEMO_TESTING branch also runs scripts/mock_alphafold.sh,
         which is not listed here; confirm whether it needs to be included. -->
    <required_files>
        <include path="scripts/outputs.py" />
        <include path="scripts/validate_fasta.py" />
        <include path="scripts/alphafold.html" />
    </required_files>
    <command detect_errors="exit_code"><![CDATA[

## Overview: stage and validate the FASTA input, export runtime env vars, run
## AlphaFold (or the mock script in testing mode), build the additional outputs
## with outputs.py and finally publish the HTML visualization.

## Developers: to test with mock alphafold run, set `export PLANEMO_TESTING=1`
## in planemo's gx_venv_n/bin/activate script. AlphaFold outputs will be copied
## from the test-data directory instead of running the tool.

## $ALPHAFOLD_DB variable should point to the location containing the versioned
## AlphaFold databases - defaults to /data
## that is the directory should contain a subdir / symlink named identical as
## the value of the TOOL_MINOR_VERSION token which contains the AF reference data
## for the corresponding version

## Read FASTA input -----------------------------------------------------------
#if $fasta_or_text.input_mode == 'history':
    cp '$fasta_or_text.fasta_file' input.fasta
#elif $fasta_or_text.input_mode == 'textbox':
    echo '$fasta_or_text.fasta_text' > input.fasta
#end if

## Validate the sequence(s); the validated FASTA is written to alphafold.fasta
&& python3 '$__tool_directory__/scripts/validate_fasta.py' input.fasta
--min_length \${ALPHAFOLD_AA_LENGTH_MIN:-0}
--max_length \${ALPHAFOLD_AA_LENGTH_MAX:-0}
#if $model_preset.selection == 'multimer':
--multimer
--max-sequences \${ALPHAFOLD_MAX_SEQUENCES:-10}
#end if
> alphafold.fasta

## Env vars -------------------------------------------------------------------
&& export TF_FORCE_UNIFIED_MEMORY=1
&& export XLA_PYTHON_CLIENT_MEM_FRACTION=4.0
&& export TODAY=`date +"%Y-%m-%d"`

## Run AlphaFold  -------------------------------------------------------------
#if os.environ.get('PLANEMO_TESTING'):
    ## Run in testing mode (mocks a successful AlphaFold run by copying outputs)
    && echo "Creating dummy outputs for model_preset=$model_preset.selection..."
    ## Pass the selected preset value, not the enclosing conditional object
    && bash '$__tool_directory__/scripts/mock_alphafold.sh' $model_preset.selection
#else:
    ## Run AlphaFold
    && python /app/alphafold/run_alphafold.py
        --fasta_paths alphafold.fasta
        --output_dir output
        --data_dir \${ALPHAFOLD_DB:-/data}/@TOOL_MINOR_VERSION@/
        --model_preset=$model_preset.selection

        ## Set reference database paths
        --uniref90_database_path   \${ALPHAFOLD_DB:-/data}/@TOOL_MINOR_VERSION@/uniref90/uniref90.fasta
        --mgnify_database_path     \${ALPHAFOLD_DB:-/data}/@TOOL_MINOR_VERSION@/mgnify/mgy_clusters_2022_05.fa
        --template_mmcif_dir       \${ALPHAFOLD_DB:-/data}/@TOOL_MINOR_VERSION@/pdb_mmcif/mmcif_files
        --obsolete_pdbs_path       \${ALPHAFOLD_DB:-/data}/@TOOL_MINOR_VERSION@/pdb_mmcif/obsolete.dat
        #if $dbs == 'full':
        --bfd_database_path        \${ALPHAFOLD_DB:-/data}/@TOOL_MINOR_VERSION@/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt
        --uniref30_database_path   \${ALPHAFOLD_DB:-/data}/@TOOL_MINOR_VERSION@/uniref30/UniRef30_2021_03
        #else
        --db_preset=reduced_dbs
        --small_bfd_database_path  \${ALPHAFOLD_DB:-/data}/@TOOL_MINOR_VERSION@/small_bfd/bfd-first_non_consensus_sequences.fasta
        #end if

        ## Fall back to the job's run date when no template cutoff was supplied
        #if $advanced.max_template_date:
        --max_template_date=$advanced.max_template_date
        #else
        --max_template_date=\$TODAY
        #end if

        --use_gpu_relax=\${ALPHAFOLD_USE_GPU:-True}

        #if $model_preset.selection == 'multimer':
        --pdb_seqres_database_path=\${ALPHAFOLD_DB:-/data}/@TOOL_MINOR_VERSION@/pdb_seqres/pdb_seqres.txt
        --uniprot_database_path=\${ALPHAFOLD_DB:-/data}/@TOOL_MINOR_VERSION@/uniprot/uniprot.fasta
        --num_multimer_predictions_per_model=$model_preset.num_multimer_predictions_per_model
        #else
        --pdb70_database_path \${ALPHAFOLD_DB:-/data}/@TOOL_MINOR_VERSION@/pdb70/pdb70
        #end if

        ## Galaxy-specific options --------------------------------------------
        ## See https://github.com/neoformit/alphafold/tree/release_2.3.2_galaxy
        #if $advanced.disable_amber_relax:
        --disable_amber_relax
        #end if

        ## limit_model_outputs is declared inside the "advanced" section, so it
        ## has to be referenced through that section
        #if $advanced.limit_model_outputs:
        --output_models=$advanced.limit_model_outputs
        #end if
        ## End Galaxy-specific options ----------------------------------------

#end if

## Generate additional outputs ------------------------------------------------
## Each optional-output parameter below renders as its truevalue flag or as an
## empty string
&& python3 '$__tool_directory__/scripts/outputs.py' output/alphafold
$outputs.plddts
$outputs.model_pkls
$outputs.pae_csv
$outputs.plots
$outputs.plot_msa

## HTML output
&& mkdir -p '${ html.files_path }'
&& cp output/alphafold/extra/alphafold.html '${html}'
&& cp output/alphafold/ranked_*.pdb '${html.files_path}'

## This is a (hacky) fix for a bug that has appeared in multiple Pulsar servers.
## The working directory ends up two levels deep and the visualization html page
## fails to load the PDB files as static assets.
&& (([ -d working ] && cp -r working/* .) || true)

    ]]></command>
    <inputs>
        <!-- Sequence input: either a FASTA dataset from the history or text pasted into a textbox.
             The command template writes whichever is chosen to input.fasta. -->
        <conditional name="fasta_or_text">
            <param
                name="input_mode"
                type="select"
                label="Fasta Input"
                help="Protein sequence(s) to fold. Input can be fasta file from history, or text. Sequence must be valid IUPAC amino acid characters. We recommend submitting sequences with a maximum length of 3000AA, because run time scales exponentially with sequence length. If multiple-sequence FASTA file provided, multimer mode must be selected."
            >
                <option value="history">Use fasta from history</option>
                <option value="textbox">Paste sequence into textbox</option>
            </param>
            <when value="history">
                <param
                    name="fasta_file"
                    type="data"
                    format="fasta"
                    multiple="false"
                    label="Fasta file from history"
                    help="Select single FASTA protein file from your history. If you wish to fold multiple proteins, submit an individual job for each protein. If you wish to run AlphaFold multimer, please supply multiple sequences in this file."
                />
            </when>
            <when value="textbox">
                <param
                    name="fasta_text"
                    type="text"
                    area="true"
                    value=""
                    label="Paste sequence"
                    help="Paste single protein sequence into the textbox. If you wish to fold multiple proteins, submit individual jobs for each protein. If you wish to run AlphaFold multimer, please supply multiple sequences in FASTA format."
                />
            </when>
        </conditional>

        <!-- Reference database choice. In the command template, 'full' adds the BFD and
             UniRef30 database paths; any other value selects the reduced_dbs preset with
             the small BFD database. -->
        <param
          name="dbs"
          type="select"
          display="radio"
          label="Select database"
          help="The reduced database allows significantly faster run time in
            exchange for a small loss in accuracy."
        >
          <option value="reduced" selected="true">Reduced database</option>
          <option value="full">Full database</option>
        </param>

        <!-- Model preset selector. The selected value is handed to run_alphafold.py as its
             model_preset option. Choosing 'multimer' also switches FASTA validation to
             multimer mode, swaps the pdb70 database for the pdb_seqres and uniprot
             databases, and exposes the predictions-per-model setting below. -->
        <conditional name="model_preset">
        <param
                name="selection"
            type="select"
            label="Model preset"
            help="Select which prediction model to run. The monomer model is the most accurate for single protein prediction. The multimer model allows prediction of protein complexes."
        >
            <option value="monomer" selected="true">monomer - default prediction model</option>
            <option value="monomer_ptm">
                monomer_ptm - slightly less accurate version of the monomer model, but provides a pairwise alignment error (PAE) matrix
            </option>
            <option value="multimer">
                multimer - model a protein complex (requires multi-sequence FASTA input)
            </option>
        </param>
            <!-- The monomer presets take no extra parameters. -->
            <when value="monomer"></when>
            <when value="monomer_ptm"></when>
            <when value="multimer">
                <!-- Passed to run_alphafold.py as num_multimer_predictions_per_model; bounded to 1-10. -->
                <param
                    name="num_multimer_predictions_per_model"
                    type="integer"
                    value="5"
                    label="Multimer predictions per model"
                    help="How many predictions (each with a different random seed) will be generated per model. E.g. if this is 2 and there are 5 models then there will be 10 predictions per input. For a small drop in accuracy you may wish to run a single seed per model (default 5, max 10)."
                    min="1"
                    max="10"
                />
            </when>
        </conditional>

        <!-- Advanced options. The command template reads these through the section name
             (advanced.max_template_date, advanced.disable_amber_relax,
             advanced.limit_model_outputs). -->
        <section name="advanced" title="Advanced options" expanded="false">
            <!-- Optional cutoff date; when left empty the command falls back to the run date. -->
            <param
                name="max_template_date"
                type="text"
                label="Max template date (yyyy-mm-dd) (optional)"
                help="The model will reference PDB structures deposited before this date only. Defaults to today's date."
                optional="true"
            >
                <sanitizer>
                    <valid initial="string.digits">
                        <add value="-" />
                    </valid>
                </sanitizer>
                <!-- Anchored at both ends: the regex validator only matches from the start of
                     the value, so without the trailing anchor input such as 2020-01-01-99
                     would be accepted. -->
                <validator type="regex">^[0-9]{4}-[0-9]{2}-[0-9]{2}$</validator>
            </param>

            <!-- Unchecked by default; uses the "checked" attribute like the other boolean
                 parameters in this tool. -->
            <param
                name="disable_amber_relax"
                type="boolean"
                label="Disable Amber relaxation"
                checked="false"
                optional="true"
                help="Amber relaxation can be disabled to speed up processing time. Amber relaxation is used to refine predicted structures by removing stereochemical violations, resulting in more accurate prediction of side-chain geometry. Disabling this option with large proteins may lead to artefacts in the predicted structure. Disabling amber relax will result in the unrelaxed models being collected as PDB outputs."
            />

            <!-- Number of top-ranked models to output, bounded to 1-5. -->
            <param
                name="limit_model_outputs"
                type="integer"
                label="Limit model outputs"
                value="5"
                help="Limit the number of models to output. The top N models will be output, where N is the value entered here (default 5). Please note that the top-ranking model is not always the correct one, and it is usually recommended to inspect multiple models. Reducing the number of models will result in a slight reduction in run time."
                min="1"
                max="5"
            />
        </section>

        <!-- Optional output toggles.
             plots, plot_msa, plddts, pae_csv and model_pkls each carry a truevalue flag that
             the command template appends to the scripts/outputs.py call (empty when unchecked).
             confidence_scores, relax_json and timings_json define no truevalue and are not
             referenced in the command; presumably they only drive output filters defined in
             the imported output macros (confirm in macro_output.xml). -->
        <section name="outputs" title="Optional outputs" expanded="false">
            <param
                name="plots"
                type="boolean"
                checked="false"
                truevalue="--plot"
                falsevalue=""
                label="pLDDT and PAE matrix plots (per model)"
                help="A two-panel plot showing pLDDT against residue position (left) and PAE (paired-alignment error) as a heatmap image with residue numbers running along vertical and horizontal axes and color at each pixel indicating PAE value for the corresponding pair of residues. (right). PAE heatmap is only produced with monomer_ptm and multimer model presets."
            />
            <param
                name="plot_msa"
                type="boolean"
                checked="false"
                truevalue="--plot-msa"
                falsevalue=""
                label="MSA sequence coverage plot"
                help="A heatmap showing sequence coverage across the multiple sequence alignment (MSA). This plot can help you understand if regions of low confidence are due to poor sequence coverage."
            />
            <param
                name="confidence_scores"
                type="boolean"
                checked="false"
                label="Per-model confidence scores"
                help="A tabular file showing average confidence score for each model. The monomer preset is scored in plddt, the monomer_ptm preset is scored in predicted template modelling (PTM) and the multimer preset is scored in PTM+IPTM (interface PTM)."
            />
            <!-- NOTE(review): the flag emitted for this per-residue toggle is named
                 "confidence-scores", while the separate confidence_scores toggle above emits
                 no flag; confirm the naming against scripts/outputs.py. -->
            <param
                name="plddts"
                type="boolean"
                checked="false"
                label="Per-residue confidence scores"
                truevalue="--confidence-scores"
                falsevalue=""
                help="Alphafold produces a pLDDT score between 0-100 for each residue in the folded models. High scores represent high confidence in placement for the residue, while low scoring residues have lower confidence. This output is a tabular file with five rows (one for each output PDB model), with each column providing a pLDDT score for a single residue."
            />
            <param
                name="pae_csv"
                type="boolean"
                checked="false"
                truevalue="--pae"
                falsevalue=""
                label="Paired-alignment error (PAE)"
                help="A CSV-formatted matrix for each model. Only available for monomer_ptm and multimer model presets. Predicted aligned error (PAE) gives a distance error for every pair of residues. It gives AlphaFold's estimate of position error at residue X when the predicted and true structures are aligned on residue Y. Values range from 0 - 35 Angstroms."
            />
            <param
                name="model_pkls"
                type="boolean"
                checked="false"
                truevalue="--pkl"
                falsevalue=""
                label="ranked_*.pkl"
                help="A pickle file containing metrics used for the assessment of the model's accuracy. These include per-residue pLDDT scores (see above), predicted TM (Template Modelling) score, which is a global superposition metric and predicted aligned error (a matrix size (number of residues) x (number of residues) where each position describes the confidence of the residue's 3D position relative to another residue in the model; can be used for the interpretation of relative positions of domains). Pickle files can be read and processed using the Python 'pickle' library (requires the jax Python library). Outputs are named respective to PDB outputs."
            />
            <param
                name="relax_json"
                type="boolean"
                checked="false"
                label="relax_metrics.json"
                help="A JSON-formatted text file containing relax metrics (mostly remaining violations)."
            />
            <param
                name="timings_json"
                type="boolean"
                checked="false"
                label="timings.json"
                help="A JSON file with timings reported for each phase of the AlphaFold run."
            />
        </section>
    </inputs>

    <!-- Output datasets. Apart from the HTML visualization, every output is produced by
         expanding a macro; the macro bodies are not part of this file (presumably they
         come from the imported macro_output.xml). -->
    <outputs>
        <expand macro="output_pdb_models" />
        <!-- The command copies alphafold.html to this dataset and the ranked PDB files into its files_path. -->
        <data name="html" format="html" label="${tool.name} on ${on_string}: Visualization" />
        <!-- Optional outputs -->
        <expand macro="output_plddts" />
        <expand macro="output_msa_plot" />
        <expand macro="output_confidence_scores" />
        <expand macro="output_pickles" />
        <expand macro="output_pae_csv" />
        <expand macro="output_plots" />
        <expand macro="output_relax_json" />
        <expand macro="output_timings_json" />
    </outputs>

    <!-- Tool tests. Per the developer note in the command template, these are meant to run
         with PLANEMO_TESTING set so that AlphaFold outputs are mocked. Expected outputs are
         asserted through test macros defined outside this file. -->
    <tests>
        <!-- Test monomer with default outputs -->
        <test expect_num_outputs="6">
            <conditional name="fasta_or_text">
                <param name="input_mode" value="history"/>
                <param name="fasta_file" value="test1.fasta"/>
            </conditional>
            <param name="model_preset|selection" value="monomer"/>
            <expand macro="test_output_pdb_models" />
        </test>

        <!-- Test monomer with optional outputs enabled (plot_msa and timings_json are left at their defaults) -->
        <test expect_num_outputs="19">
            <conditional name="fasta_or_text">
                <param name="input_mode" value="history"/>
                <param name="fasta_file" value="test1.fasta"/>
            </conditional>
            <param name="model_preset|selection" value="monomer"/>
            <param name="outputs|plots" value="true"/>
            <param name="outputs|confidence_scores" value="true"/>
            <param name="outputs|plddts" value="true"/>
            <param name="outputs|pae_csv" value="true"/>
            <param name="outputs|model_pkls" value="true"/>
            <param name="outputs|relax_json" value="true"/>
            <expand macro="test_output_plots_1" />
            <expand macro="test_output_confidence_scores" />
            <expand macro="test_output_plddts" />
            <expand macro="test_output_pdb_models" />
            <expand macro="test_output_pickles" />
            <expand macro="test_output_relax_json" />
        </test>

        <!-- Test monomer_ptm with optional outputs enabled (plot_msa and timings_json are left at their defaults) -->
        <test expect_num_outputs="24">
            <conditional name="fasta_or_text">
                <param name="input_mode" value="history"/>
                <param name="fasta_file" value="test1.fasta"/>
            </conditional>
            <param name="model_preset|selection" value="monomer_ptm"/>
            <param name="outputs|plots" value="true"/>
            <param name="outputs|confidence_scores" value="true"/>
            <param name="outputs|plddts" value="true"/>
            <param name="outputs|pae_csv" value="true"/>
            <param name="outputs|model_pkls" value="true"/>
            <param name="outputs|relax_json" value="true"/>
            <expand macro="test_output_plots_2" />
            <expand macro="test_output_confidence_scores" />
            <expand macro="test_output_plddts" />
            <expand macro="test_output_pdb_models" />
            <expand macro="test_output_pickles" />
            <expand macro="test_output_relax_json" />
            <expand macro="test_output_pae_csv" />
        </test>

        <!-- Test multimer with optional outputs enabled, including timings_json (plot_msa is left at its default).
             NOTE(review): this test enables timings_json on top of the options used in the
             monomer_ptm test, yet expects the same number of outputs (24); confirm the count
             against the output macros. -->
        <test expect_num_outputs="24">
            <conditional name="fasta_or_text">
                <param name="input_mode" value="history"/>
                <param name="fasta_file" value="multimer.fasta"/>
            </conditional>
            <param name="model_preset|selection" value="multimer"/>
            <param name="outputs|plots" value="true"/>
            <param name="outputs|confidence_scores" value="true"/>
            <param name="outputs|plddts" value="true"/>
            <param name="outputs|pae_csv" value="true"/>
            <param name="outputs|model_pkls" value="true"/>
            <param name="outputs|relax_json" value="true"/>
            <param name="outputs|timings_json" value="true"/>
            <expand macro="test_output_plots_3" />
            <expand macro="test_output_confidence_scores" />
            <expand macro="test_output_plddts" />
            <expand macro="test_output_pdb_models" />
            <expand macro="test_output_pickles" />
            <expand macro="test_output_relax_json" />
            <expand macro="test_output_timings_json" />
            <expand macro="test_output_pae_csv" />
        </test>
    </tests>
    <help><![CDATA[

    .. class:: infomark

    | AlphaFold v2: AI-guided 3D structural prediction of proteins
    |
    | **NOTE: this tool packages** `a modified branch of AlphaFold v2.3.2. <https://github.com/neoformit/alphafold/tree/release_2.3.2_galaxy>`_
    |
    | This means that the neural network has been trained on PDBs with a release
    | date before 2021-09-30 (the training cutoff was 2018-04-30 until ``v2.3.0``).
    |
    | Find out more in the technical and release notes:
    |

    - `Release notes for v2.3.2 <https://github.com/deepmind/alphafold/releases/tag/v2.3.2>`_
    - `Technical notes for v2.3 <https://github.com/deepmind/alphafold/blob/main/docs/technical_note_v2.3.0.md>`_

    **What it does**

    *What is AlphaFold?*

    | AlphaFold is a program which uses neural networks to predict the tertiary (3D) structure of proteins. AlphaFold accepts an amino acid sequence in Fasta format, which will be "folded" into a 3D model.
    |

    *What makes AlphaFold different?*

    | The ability to use computers to predict 3D protein structures with high accuracy is desirable because it removes the time-consuming and costly process of determining structures experimentally.
    | In-silico protein folding has been an active field of research for decades, but existing tools were slower and far less reliable than AlphaFold.
    | AlphaFold represents a leap forward by regularly predicting structures to atomic-level accuracy, even when no similar structures are known.
    |


    **Input**

    *Amino acid sequence*

    | AlphaFold monomer (default) accepts a **single amino acid sequence** in FASTA format.
    | You can choose to input either a file from your Galaxy history or paste a sequence into a text box.
    | If you choose the ``multimer`` option, you can supply a FASTA file containing **multiple sequences** to be folded concurrently into a multimer.
    |
    | For pairwise screening of target-candidate with multimer, you can submit a list of paired protein sequences in batch mode (i.e. two protein sequences in each FASTA file).
    |

    **Outputs**

    *Visualization*

    An interactive 3D graphic of the best predicted molecular structures.
    This output can be opened in Galaxy to give a visual impression of the results, with different structural representations to choose from.
    Open the "Visualization" history output by clicking on the "view data" icon:

    .. image:: https://github.com/usegalaxy-au/galaxy-local-tools/blob/1a8d3e8daa7ccc5a345ca377697735ab95ed0666/tools/alphafold/static/img/alphafold-visualization.png?raw=true
        :height: 520
        :alt: Result visualization

    |

    *PDB files*

    | PDB (Protein Data Bank) files (5 by default) are created, ordered by rank, as predicted by AlphaFold. The tool produces 5 models by default, but this can be reduced with the "Limit model outputs" option for a reduced run time.
    | These files describe the molecular structures and can be used for downstream analysis. e.g. *in silico* molecular docking.
    | **PLEASE NOTE** that all outputs have been renamed to their respective rank order, including model and model.pkl files.
    |

    *Model confidence scores (optional)*

    | This optional output produces a file which describes the confidence scores for each model (based on `pLDDTs <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3799472/>`_, or the ``iptm+ptm`` score if run in multimer mode) which may be useful for downstream analysis.
    | Model confidence scores are also included as a column (replacing ``bFactor``) in the default PDB output.
    |
    |

    *Model data files (ranked_n.pkl)*

    | Per-model data stored in pickle files (a Python binary data format). These files can be used as inputs to downstream analysis software (such as Chimera X) for visualizing structures and computing kinetics between protein multimers and domains.
    | The tool will produce one ``.pkl`` output for each PDB model.
    |
    |

    *pLDDT + PAE plots (optional)*

    | A two-panel figure in PNG format showing:
    | a) pLDDT score plotted against residue position
    | b) a heatmap of predicted-alignment error (PAE) with residue position running along vertical and horizontal axes and color at each pixel indicating PAE value for the corresponding pair of residues.
    | Panel b) is only produced for ``monomer_ptm`` and ``multimer`` model presets.
    |
    |

    *MSA sequence coverage plot (optional)*

    | A heatmap in PNG format showing:
    | a) Per-position sequence identity to query as a heatmap
    | b) Per-position sequence coverage as a line plot
    |
    | This plot can help you understand if regions of low confidence are due to poor sequence coverage, rather than
    | limitations of the model or intrinsically unstable regions.
    |
    |

    *Model predicted-alignment error matrix (pae_ranked_n.csv)*

    | Per-model predicted-alignment error (PAE) matrix - only available with the ``monomer_ptm`` and ``multimer`` model presets.
    | The tool will produce one ``.csv`` output for each PDB model.
    |
    |

    *relax_metrics.json (optional)*

    | A JSON-formatted text file containing relax metrics (primarily remaining violations).
    |
    |

    *timings.json (optional)*

    | A JSON-formatted text file containing the timings for each phase of the prediction.
    |
    |

    **AlphaFold configuration**

    | We have configured AlphaFold to run with the parameters suggested by default on `AlphaFold's GitHub <https://github.com/deepmind/alphafold>`_.
    | This means that it runs with Amber relaxation enabled, with relaxed PDB models collected as output datasets (ranked\_*.pdb files). If there are additional parameters that you would like to interact with, please `send a support request to Galaxy AU <https://site.usegalaxy.org.au/request/support>`_, or open an issue on `our GitHub <https://github.com/usegalaxy-au/tools-au>`_.
    |
    |

    **External Resources**

    We highly recommend checking out the
    `Alphafold Protein Structure Database <https://alphafold.ebi.ac.uk/>`_,
    which contains pre-computed structures for over 200 million known proteins.
    See also:

    - `Google Deepmind's article on AlphaFold <https://deepmind.com/blog/article/alphafold-a-solution-to-a-50-year-old-grand-challenge-in-biology>`_
    - `AlphaFold source code on GitHub <https://github.com/deepmind/alphafold>`_

    *Downstream analysis*

    | Obtaining a protein structure prediction is the first step in many analyses.
    | The 3D models created by AlphaFold can be used in downstream analysis, including the following:
    |

    - Inspecting protein features
        3D viewers (pymol, chimera, ngl, blender) can be used to inspect active sites, regulatory domains, binding sites.
    - Molecular docking
        3D structures can be used to predict the binding affinity of different compounds.
        This is especially useful in screening drug candidates.
    - Protein-protein interactions
        Proteins associate in many biological processes, including intracellular signalling pathways and protein complex formation.
        To predict these interactions, other programs may ingest 3D models predicted by AlphaFold. Proprietary softwares include `GOLD <https://www.ccdc.cam.ac.uk/solutions/csd-discovery/components/gold/>`_ and `SeeSAR <https://www.biosolveit.de/SeeSAR>`_, but many `free and open-source options <https://en.wikipedia.org/wiki/List_of_protein-ligand_docking_software>`_ are available such as `AutoDock <https://autodock.scripps.edu/>`_, `SwissDock <http://www.swissdock.ch/>`_, `DockQ <https://github.com/bjornwallner/DockQ>`_, `MM-Align <https://zhanggroup.org/MM-align/>`_ and `TM-Align <https://zhanggroup.org/TM-align/>`_. Protein-protein interactions are often inferred from AlphaFold-Multimer predictions, which provide a level of confidence in binding affinity between homomer/heteromer subunits.

    ]]></help>
    <!-- Citations of type "doi" hold the bare DOI; the resolver URL prefix is not part of the identifier. -->
    <citations>
        <citation type="doi">10.1038/s41586-021-03819-2</citation>
        <citation type="doi">10.1101/2021.10.04.463034</citation>
    </citations>
</tool>
author	galaxy-australia
date	Wed, 30 Oct 2024 21:46:34 +0000
parents	e7f1b552a695
children