view normalize.xml @ 14:d844935c906c draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/scanpy/ commit 9d49b2a98de059ae9a053dc1c5a23537cf0311de
author iuc
date Sat, 18 May 2024 18:31:20 +0000
parents 7b9fafe32c86
children 5dada6f76047
line wrap: on
line source

<tool id="scanpy_normalize" name="Normalize" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@profile@">
    <description>and impute with scanpy</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="bio_tools"/>
    <expand macro="requirements"/>
    <expand macro="version_command"/>
    <command detect_errors="exit_code"><![CDATA[
@CMD@
      ]]></command>
    <configfiles>
        <configfile name="script_file"><![CDATA[
@CMD_imports@
@CMD_read_inputs@

#if $method.method == "pp.normalize_total"
sc.pp.normalize_total(
    adata,
    #if str($method.target_sum) != ''
    target_sum=$method.target_sum,
    #end if
    exclude_highly_expressed=$method.exclude_highly_expressed.exclude_highly_expressed,
    #if $method.exclude_highly_expressed.exclude_highly_expressed == "True"
    max_fraction=$method.exclude_highly_expressed.max_fraction,
    #end if
    #if $method.key_added
    key_added='$method.key_added',
    #end if
    #if $method.layers
        #if str($method.layers) != 'all'
    layers[str(x.strip()) for x in str($method.layers).split(',')],
        #else
    layers='$method.layers',
        #end if
    #end if
    #if str($method.layer_norm) != "None"
        layer_norm='$method.layer_norm',
    #end if
    inplace=True)

#else if $method.method == "pp.recipe_zheng17"
sc.pp.recipe_zheng17(
    adata=adata,
    n_top_genes=$method.n_top_genes,
    log=$method.log,
    plot=False,
    copy=False)

#else if $method.method == "pp.recipe_weinreb17"
sc.pp.recipe_weinreb17(
    adata=adata,
    log=$method.log,
    mean_threshold=$method.mean_threshold,
    cv_threshold=$method.cv_threshold,
    n_pcs=$method.n_pcs,
    svd_solver='$method.svd_solver',
    random_state=$method.random_state,
    copy=False)

#else if $method.method == "pp.recipe_seurat"
sc.pp.recipe_seurat(
    adata=adata,
    log=$method.log,
    plot=False,
    copy=False)

#else if $method.method == "external.pp.magic"
sc.external.pp.magic(
    adata=adata,
    name_list='$method.name_list',
    knn=$method.knn,
    #if str($method.decay) != ''
    decay=$method.decay,
    #end if
    #if str($method.knn_max) != ''
    knn_max=$method.knn_max,
    #end if
    #if $method.t == -1
    t='auto',
    #else
    t=$method.t,
    #end if
    #if str($method.n_pca) != ''
    n_pca=$method.n_pca,
    #end if
    solver='$method.solver',
    knn_dist='$method.knn_dist',
    random_state=$method.random_state,
    copy=False)
#end if

@CMD_anndata_write_outputs@

]]></configfile>
    </configfiles>
    <inputs>
        <expand macro="inputs_anndata"/>
        <conditional name="method">
            <param argument="method" type="select" label="Method used for normalization">
                <option value="pp.normalize_total">Normalize counts per cell, using 'pp.normalize_total'</option>
                <option value="pp.recipe_zheng17">Normalization and filtering as of Zheng et al. (2017), using 'pp.recipe_zheng17'</option>
                <option value="pp.recipe_weinreb17">Normalization and filtering as of Weinreb et al (2017), using 'pp.recipe_weinreb17'</option>
                <option value="pp.recipe_seurat">Normalization and filtering as of Seurat et al (2015), using 'pp.recipe_seurat'</option>
                <option value="external.pp.magic">Denoising using Markov Affinity-based Graph Imputation of Cells (MAGIC) API 'external.pp.magic'</option>
            </param>
            <when value="pp.normalize_total">
                <param argument="target_sum" type="float" value="" optional="true" label="Target sum" help="If not provided, after normalization, each observation (cell) has a total count equal to the median of the total counts (cells) before normalization."/>
                <conditional name="exclude_highly_expressed">
                    <param argument="exclude_highly_expressed" type="select" label="Exclude (very) highly expressed genes for the computation of the normalization factor (size factor) for each cell" help=" A gene is considered highly expressed, if it has more than max_fraction of the total counts in at least one cell. The not-excluded genes will sum up to target_sum">
                        <option value="True">Yes</option>
                        <option value="False" selected="true">No</option>
                    </param>
                    <when value="True">
                        <param argument="max_fraction" type="float" value="0.05" label="Target sum" help="If not provided, after normalization, each observation (cell) has a total count equal to the median of the total counts (cells) before normalization."/>
                    </when>
                    <when value="False"/>
                </conditional>
                <param argument="key_added" type="text" value="" optional="true" label="Name of the field in 'adata.obs' where the normalization factor is stored" help="">
                    <expand macro="sanitize_query" />
                </param>
                <param argument="layers" type="text" value="" optional="true" label="List of layers to normalize" help="'All' will normalize all layers. The list should be comma-separated.">
                    <expand macro="sanitize_query" />
                </param>
                <param argument="layer_norm" type="select" label="How to normalize layers?">
                    <option value="None">None: after normalization, for each layer in layers each cell has a total count equal to the median of the median of the total counts (cells) before normalization of the layer.</option>
                    <option value="after">After: for each layer in layers each cell has a total count equal to target_sum.</option>
                    <option value="X">X: for each layer in layers each cell has a total count equal to the median of total counts for observations (cells) of adata.X before normalization.</option>
                </param>
            </when>
            <when value="pp.recipe_zheng17">
                <param argument="n_top_genes" type="integer" min="0" value="1000" label="Number of genes to keep" help=""/>
                <expand macro="param_log"/>
            </when>
            <when value="pp.recipe_weinreb17">
                <expand macro="param_log"/>
                <param argument="mean_threshold" type="float" value="0.01" label="Mean threshold" help=""/>
                <param argument="cv_threshold" type="float" value="2" label="CV threshold" help=""/>
                <param argument="n_pcs" type="integer" min="0" value="50" label="Number of principal component" help=""/>
                <expand macro="svd_solver"/>
                <expand macro="pca_random_state"/>
            </when>
            <when value="pp.recipe_seurat">
                <expand macro="param_log"/>
            </when>
            <when value="external.pp.magic">
                <param name="name_list" type="select" label="Denoised genes to return" help="Selecting all genes may require a large amount of memory">
                    <option value="all_genes">All genes</option>
                    <option value="pca_only">PCA only</option>
                </param>
                <param argument="knn" type="integer" min="1" value="5" label="Number of nearest neighbors on which to build kernel" help=""/>
                <param argument="decay" type="integer" optional="true" value="1" label="Set decay rate of kernel tails" 
                    help="If not set, alpha decaying kernel is not used" />
                <param argument="knn_max" type="integer" min="1" optional="true" value="" label="Maximum number of nearest neighbors with nonzero connection"
                    help="If not set, will be set to 3 * knn" />
                <param argument="t" type="integer" min="-1" value="3" label="Power to which the diffusion operator is powered. This sets the level of diffusion"
                    help="If ‘-1’, this parameter is selected according to the Procrustes disparity of the diffused data." />
                <param argument="n_pca" type="integer" value="100" optional="true" label="Number of principal components to use for calculating neighborhoods"
                    help="For extremely large datasets, using n_pca less than 20 allows neighborhoods to be calculated in roughly log(n_samples) time. If not set, no PCA is performed." />
                <param name="solver" type="select" label="Which solver to use" help="Selecting all genes may require a large amount of memory">
                    <option value="exact">"exact", the implementation described in van Dijk et al. (2018) </option>
                    <option value="approximate">"approximate", is faster that performs imputation in the PCA space and then projects back to the gene space</option>
                </param>
                <param name="knn_dist" type="select" label="Distance metric to use for the data" help="See scipy.spatial.distance.pdist documentation for more options https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html">
                    <expand macro="distance_metric_options"/>
                </param>
                <expand macro="param_random_state"/>
            </when>
        </conditional>
        <expand macro="inputs_common_advanced"/>
    </inputs>
    <outputs>
        <expand macro="anndata_outputs"/>
    </outputs>
    <tests>
        <test expect_num_outputs="2">
            <!-- test 1 -->
            <param name="adata" value="krumsiek11.h5ad" />
            <conditional name="method">
                <param name="method" value="pp.normalize_total"/>
                <conditional name="exclude_highly_expressed">
                    <param name="exclude_highly_expressed" value="False"/>
                </conditional>
                <param name="key_added" value="n_counts"/>
                <param name="layers" value="all"/>
                <param name="layer_norm" value="None"/>
            </conditional>
            <section name="advanced_common">
                <param name="show_log" value="true" />
            </section>
            <output name="hidden_output">
                <assert_contents>
                    <has_text_matching expression="sc.pp.normalize_total"/>
                    <has_text_matching expression="exclude_highly_expressed=False"/>
                    <has_text_matching expression="key_added='n_counts'"/>
                    <has_text_matching expression="layers='all'"/>
                </assert_contents>
            </output>
            <output name="anndata_out" file="pp.normalize_total.krumsiek11.h5ad" ftype="h5ad" compare="sim_size"/>
        </test>
        <test expect_num_outputs="2">
            <!-- test 2 -->
            <param name="adata" value="random-randint.h5ad"/>
            <conditional name="method">
                <param name="method" value="pp.recipe_zheng17"/>
                <param name="n_top_genes" value="1000"/>
                <param name="log" value="True"/>
            </conditional>
            <section name="advanced_common">
                <param name="show_log" value="true" />
            </section>
            <output name="hidden_output">
                <assert_contents>
                    <has_text_matching expression="sc.pp.recipe_zheng17"/>
                    <has_text_matching expression="n_top_genes=1000"/>
                    <has_text_matching expression="log=True"/>
                </assert_contents>
            </output>
            <output name="anndata_out" file="pp.recipe_zheng17.random-randint.h5ad" ftype="h5ad" compare="sim_size" delta="1000000" delta_frac="0.15"/>
        </test>
        <test expect_num_outputs="2">
            <!-- test 3 -->
            <param name="adata" value="paul15_subsample.h5ad" />
            <conditional name="method">
                <param name="method" value="pp.recipe_weinreb17"/>
                <param name="log" value="True"/>
                <param name="mean_threshold" value="0.01"/>
                <param name="cv_threshold" value="2.0"/>
                <param name="n_pcs" value="50"/>
                <param name="svd_solver" value="randomized"/>
                <param name="random_state" value="0"/>
            </conditional>
            <section name="advanced_common">
                <param name="show_log" value="true" />
            </section>
            <output name="hidden_output">
                <assert_contents>
                    <has_text_matching expression="sc.pp.recipe_weinreb17"/>
                    <has_text_matching expression="log=True"/>
                    <has_text_matching expression="mean_threshold=0.01"/>
                    <has_text_matching expression="cv_threshold=2.0"/>
                    <has_text_matching expression="n_pcs=50"/>
                    <has_text_matching expression="svd_solver='randomized'"/>
                    <has_text_matching expression="random_state=0"/>
                </assert_contents>
            </output>
            <output name="anndata_out" file="pp.recipe_weinreb17.paul15_subsample.updated.h5ad" ftype="h5ad" compare="sim_size"/>
        </test>
        <test expect_num_outputs="2">
            <!-- test 4 -->
            <param name="adata" value="pp.recipe_zheng17.random-randint.h5ad" />
            <conditional name="method">
                <param name="method" value="pp.recipe_seurat"/>
                <param name="log" value="True"/>
            </conditional>
            <section name="advanced_common">
                <param name="show_log" value="true" />
            </section>
            <output name="hidden_output">
                <assert_contents>
                    <has_text_matching expression="sc.pp.recipe_seurat"/>
                    <has_text_matching expression="log=True"/>
                </assert_contents>
            </output>
            <output name="anndata_out" file="pp.recipe_seurat.recipe_zheng17.h5ad" ftype="h5ad" compare="sim_size" delta="1000000" delta_frac="0.25"/>
        </test>
        <test expect_num_outputs="2">
            <!-- test 5 -->
            <param name="adata" value="krumsiek11.h5ad" />
            <conditional name="method">
                <param name="method" value="external.pp.magic"/>
                <param name="name_list" value="all_genes"/>
                <param name="t" value="-1"/>
                <param name="n_pca" value="5"/>
            </conditional>
            <section name="advanced_common">
                <param name="show_log" value="true" />
            </section>
            <output name="hidden_output">
                <assert_contents>
                    <has_text_matching expression="external.pp.magic"/>
                    <has_text_matching expression="name_list='all_genes'"/>
                    <has_text_matching expression="t='auto'"/>
                    <has_text_matching expression="n_pca=5"/>
                </assert_contents>
            </output>
            <output name="anndata_out" file="external.pp.magic.all_genes.krumsiek11.h5ad" ftype="h5ad" compare="sim_size"/>
        </test>
        <test expect_num_outputs="2">
            <!-- test 6 -->
            <param name="adata" value="krumsiek11.h5ad" />
            <conditional name="method">
                <param name="method" value="external.pp.magic"/>
                <param name="name_list" value="pca_only"/>
                <param name="t" value="3"/>
                <param name="n_pca" value="5"/>
            </conditional>
            <section name="advanced_common">
                <param name="show_log" value="true" />
            </section>
            <output name="hidden_output">
                <assert_contents>
                    <has_text_matching expression="external.pp.magic"/>
                    <has_text_matching expression="name_list='pca_only'"/>
                    <has_text_matching expression="t=3"/>
                    <has_text_matching expression="n_pca=5"/>
                </assert_contents>
            </output>
            <output name="anndata_out" file="external.pp.magic.pca_only.krumsiek11.h5ad" ftype="h5ad" compare="sim_size"/>
            <assert_stdout>
                <has_text text="X_magic"/>
            </assert_stdout>
        </test>
    </tests>
    <help><![CDATA[
Normalize total counts per cell (`pp.normalize_per_cell`)
=========================================================

Normalize each cell by total counts over all genes, so that every cell has
the same total count after normalization.

Similar functions are used, for example, by Seurat, Cell Ranger or SPRING.

More details on the `scanpy documentation
<https://scanpy.readthedocs.io/en/stable/api/scanpy.pp.normalize_per_cell.html>`__


Normalization and filtering as of Zheng et al. (2017), the Cell Ranger R Kit of 10x Genomics (`pp.recipe_zheng17`)
==================================================================================================================

Expects non-logarithmized data. If using logarithmized data, pass `log=False`.

The recipe runs the following steps:

- only consider genes with more than 1 count
- normalize with total UMI count per cell
- select highly-variable genes
- subset the genes
- renormalize after filtering
- log transform (if needed)
- scale to unit variance and shift to zero mean

More details on the `scanpy documentation
<https://scanpy.readthedocs.io/en/stable/api/generated/scanpy.pp.recipe_zheng17.html>`__


Normalization and filtering as of Weinreb et al (2017) (`pp.recipe_weinreb17`)
==============================================================================

Expects non-logarithmized data. If using logarithmized data, pass `log=False`.

More details on the `scanpy documentation
<https://scanpy.readthedocs.io/en/stable/api/generated/scanpy.pp.recipe_weinreb17.html>`__


Normalization and filtering as of Seurat et al (2015) (`pp.recipe_seurat`)
==========================================================================

This uses a particular preprocessing.

Expects non-logarithmized data. If using logarithmized data, pass `log=False`.

More details on the `scanpy documentation
<https://scanpy.readthedocs.io/en/stable/api/generated/scanpy.pp.recipe_seurat.html>`__


Markov Affinity-based Graph Imputation of Cells (MAGIC) as of Van Dijk D et al. (2018) (`external.pp.magic`)
============================================================================================================

MAGIC is an algorithm for denoising and transcript recover of single cells applied to single-cell sequencing data. MAGIC builds a graph from the data and uses diffusion to smooth out noise and recover the data manifold.

The algorithm implemented here has changed primarily in two ways compared to the algorithm described in Van Dijk D et al. (2018). 

- Firstly, we use the adaptive kernel described in Moon et al, (2019) for improved stability.
- Secondly, data diffusion is applied in the PCA space, rather than the data space, for speed and memory improvements.

More details on the `scanpy documentation
<https://scanpy.readthedocs.io/en/stable/api/scanpy.external.pp.magic.html>`__

    ]]></help>
    <expand macro="citations"/>
</tool>