view normalize.xml @ 17:5dada6f76047 draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/scanpy/ commit 91121b1e72696f17478dae383badaa71e9f96dbb
author iuc
date Sat, 14 Sep 2024 12:42:55 +0000
parents d844935c906c
children
line wrap: on
line source

<tool id="scanpy_normalize" name="Scanpy normalize" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
    <description>and impute</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="bio_tools"/>
    <!-- magic-impute is added on top of whatever the shared "requirements" macro provides;
         presumably it backs the 'external.pp.magic' method offered below (confirm). -->
    <expand macro="requirements">
        <requirement type="package" version="3.0.0">magic-impute</requirement>
    </expand>
    <expand macro="version_command"/>
    <!-- The command line is supplied entirely by the CMD macro token, which is not defined in this file. -->
    <command detect_errors="exit_code"><![CDATA[
@CMD@
      ]]></command>
    <configfiles>
        <!-- Cheetah template that renders the Python script for the selected normalization method. -->
        <configfile name="script_file"><![CDATA[
@CMD_IMPORTS@
@CMD_READ_INPUTS@

## Exactly one scanpy call is rendered, selected by the "method" conditional.
#if str($method.method) == 'pp.normalize_total':
## Optional arguments are only rendered when the user supplied a value.
sc.pp.normalize_total(
    adata,
    #if str($method.target_sum) != '':
    target_sum=$method.target_sum,
    #end if
    exclude_highly_expressed=$method.exclude_highly_expressed.exclude_highly_expressed,
    #if str($method.exclude_highly_expressed.exclude_highly_expressed) == 'True':
    max_fraction=$method.exclude_highly_expressed.max_fraction,
    #end if
    #if str($method.key_added) != '':
    key_added='$method.key_added',
    #end if
    #if str($method.layer) != '':
    layer='$method.layer',
    #end if
    inplace=True)

#else if str($method.method) == 'pp.recipe_zheng17':
sc.pp.recipe_zheng17(
    adata=adata,
    n_top_genes=$method.n_top_genes,
    log=$method.log,
    plot=False,
    copy=False)

#else if str($method.method) == 'pp.recipe_weinreb17':
## Only "log" is user-configurable here; the remaining arguments are fixed.
sc.pp.recipe_weinreb17(
    adata=adata,
    log=$method.log,
    mean_threshold=0.01,
    cv_threshold=2,
    n_pcs=50,
    svd_solver='randomized',
    random_state=0,
    copy=False)

#else if str($method.method) == 'pp.recipe_seurat':
sc.pp.recipe_seurat(
    adata=adata,
    log=$method.log,
    plot=False,
    copy=False)

#else if str($method.method) == 'external.pp.magic':
print("stats before magic:", "min=", f"{adata.X.min():.5f}", "max=", f"{adata.X.max():.5f}", "mean=", f"{adata.X.mean():.5f}")

## decay and n_pca have non-None defaults in scanpy (1 and 100), so an empty
## field must be rendered as an explicit None; omitting the keyword would
## silently re-enable the default instead of disabling the feature as the
## parameter help describes.
sc.external.pp.magic(
    adata=adata,
    name_list='$method.name_list',
    knn=$method.knn,
    #if str($method.decay) != '':
    decay=$method.decay,
    #else
    decay=None,
    #end if
    #if str($method.knn_max) != '':
    knn_max=$method.knn_max,
    #end if
    ## The form uses -1 as the sentinel for the automatic choice of t.
    #if $method.t == -1:
    t='auto',
    #else
    t=$method.t,
    #end if
    #if str($method.n_pca) != '':
    n_pca=$method.n_pca,
    #else
    n_pca=None,
    #end if
    solver='$method.solver',
    knn_dist='$method.knn_dist',
    #if str($method.random_state) != '':
    random_state=$method.random_state,
    #else
    random_state=None,
    #end if
    copy=False)

    ## The after-statistics are only printed for the all_genes mode.
    #if str($method.name_list) == 'all_genes':
print("stats after magic:", "min=", f"{adata.X.min():.5f}", "max=", f"{adata.X.max():.5f}", "mean=", f"{adata.X.mean():.5f}")
    #end if
#end if

@CMD_ANNDATA_WRITE_OUTPUTS@
        ]]>
        </configfile>
    </configfiles>
    <inputs>
        <expand macro="inputs_anndata"/>
        <!-- The selected method decides which parameter set is shown and which scanpy call the script template renders. -->
        <conditional name="method">
            <param argument="method" type="select" label="Method used for normalization">
                <option value="pp.normalize_total">Normalize counts per cell, using 'pp.normalize_total'</option>
                <option value="pp.recipe_zheng17">Normalization and filtering as of Zheng et al. (2017), using 'pp.recipe_zheng17'</option>
                <option value="pp.recipe_weinreb17">Normalization and filtering as of Weinreb et al. (2017), using 'pp.recipe_weinreb17'</option>
                <option value="pp.recipe_seurat">Normalization and filtering as of Seurat, Satija et al. (2015), using 'pp.recipe_seurat'</option>
                <option value="external.pp.magic">Denoising using Markov Affinity-based Graph Imputation of Cells (MAGIC) API 'external.pp.magic'</option>
            </param>
            <when value="pp.normalize_total">
                <param argument="target_sum" type="float" value="" optional="true" label="Target sum" help="If not provided, after normalization, each observation (cell) has a total count equal to the median of the total counts (cells) before normalization."/>
                <conditional name="exclude_highly_expressed">
                    <param argument="exclude_highly_expressed" type="select" label="Exclude (very) highly expressed genes for the computation of the normalization factor (size factor) for each cell" help="A gene is considered highly expressed, if it has more than max_fraction of the total counts in at least one cell. The not-excluded genes will sum up to target_sum">
                        <option value="False" selected="true">No</option>
                        <option value="True">Yes</option>
                    </param>
                    <when value="True">
                        <!-- max_fraction is a fraction of a cell's total counts, hence the 0..1 bounds. -->
                        <param argument="max_fraction" type="float" min="0" max="1" value="0.05" label="Consider genes as highly expressed that have more counts than this fraction of the original total counts in at least one cell."/>
                    </when>
                    <when value="False"/>
                </conditional>
                <param argument="key_added" type="text" value="" optional="true" label="Name of the field in 'adata.obs' where the normalization factor is stored">
                    <expand macro="sanitize_query"/>
                </param>
                <!-- Optional like key_added: the script template only passes it when it is non-empty. -->
                <param argument="layer" type="text" value="" optional="true" label="Layer to normalize instead of X. If not provided, X is normalized.">
                    <expand macro="sanitize_query"/>
                </param>
            </when>
            <when value="pp.recipe_zheng17">
                <param argument="n_top_genes" type="integer" min="0" value="1000" label="Number of genes to keep"/>
                <expand macro="param_log" checked="true"/>
            </when>
            <when value="pp.recipe_weinreb17">
                <expand macro="param_log" checked="true"/>
            </when>
            <when value="pp.recipe_seurat">
                <expand macro="param_log" checked="true"/>
            </when>
            <when value="external.pp.magic">
                <param name="name_list" type="select" label="Denoised genes to return" help="Selecting all genes may require a large amount of memory">
                    <option value="all_genes" selected="true">All genes</option>
                    <option value="pca_only">PCA only</option>
                </param>
                <param argument="knn" type="integer" min="1" value="5" label="Number of nearest neighbors on which to build kernel"/>
                <param argument="decay" type="integer" optional="true" value="1" label="Set decay rate of kernel tails" help="If not set, alpha decaying kernel is not used"/>
                <param argument="knn_max" type="integer" min="1" optional="true" value="" label="Maximum number of nearest neighbors with nonzero connection" help="If not set, will be set to 3 * knn"/>
                <!-- -1 is the sentinel that the script template translates to t='auto'. -->
                <param argument="t" type="integer" min="-1" value="3" label="Power to which the diffusion operator is powered. This sets the level of diffusion" help="If ‘-1’, this parameter is selected according to the Procrustes disparity of the diffused data."/>
                <param argument="n_pca" type="integer" value="100" optional="true" label="Number of principal components to use for calculating neighborhoods"
                    help="For extremely large datasets, using n_pca less than 20 allows neighborhoods to be calculated in roughly log(n_samples) time. If not set, no PCA is performed."/>
                <param name="solver" type="select" label="Which solver to use" help="The approximate solver may return negative values">
                    <option value="exact" selected="true">"exact", the implementation described in van Dijk et al. (2018)</option>
                    <option value="approximate">"approximate", a faster implementation that performs imputation in the PCA space and then projects back to the gene space</option>
                </param>
                <param name="knn_dist" type="select" label="Distance metric to use for the data" help="See scipy.spatial.distance.pdist documentation for more options https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html">
                    <expand macro="distance_metric_options"/>
                </param>
                <param argument="random_state" type="integer" optional="true" label="Random seed" help="Defaults to the global numpy random number generator."/>
            </when>
        </conditional>
        <expand macro="inputs_common_advanced"/>
    </inputs>
    <outputs>
        <!-- Outputs are declared by the "anndata_outputs" macro; the tests in this file
             refer to them as "anndata_out" and "hidden_output". -->
        <expand macro="anndata_outputs"/>
    </outputs>
    <tests>

        <!-- test 1: pp.normalize_total with key_added=n_counts; checks the rendered call
             in hidden_output and that obs/n_counts exists in the written AnnData -->
        <test expect_num_outputs="2">
            <param name="adata" value="krumsiek11.h5ad"/>
            <conditional name="method">
                <param name="method" value="pp.normalize_total"/>
                <param name="key_added" value="n_counts"/>
            </conditional>
            <section name="advanced_common">
                <param name="show_log" value="true"/>
            </section>
            <output name="hidden_output">
                <assert_contents>
                    <has_text_matching expression="sc.pp.normalize_total"/>
                    <has_text_matching expression="exclude_highly_expressed=False"/>
                    <has_text_matching expression="key_added='n_counts'"/>
                </assert_contents>
            </output>
            <output name="anndata_out" ftype="h5ad">
                <assert_contents>
                    <has_h5_keys keys="obs/n_counts"/>
                </assert_contents>
            </output>
        </test>

        <!-- test 2: pp.recipe_zheng17 with the form defaults (n_top_genes=1000, log=True);
             checks the rendered call and the obs/var/uns keys present in the output -->
        <test expect_num_outputs="2">
            <param name="adata" value="random-randint.h5ad"/>
            <conditional name="method">
                <param name="method" value="pp.recipe_zheng17"/>
            </conditional>
            <section name="advanced_common">
                <param name="show_log" value="true"/>
            </section>
            <output name="hidden_output">
                <assert_contents>
                    <has_text_matching expression="sc.pp.recipe_zheng17"/>
                    <has_text_matching expression="n_top_genes=1000"/>
                    <has_text_matching expression="log=True"/>
                </assert_contents>
            </output>
            <output name="anndata_out" ftype="h5ad">
                <assert_contents>
                    <has_h5_keys keys="obs/n_counts_all"/>
                    <has_h5_keys keys="var/n_counts,var/mean,var/std"/>
                    <has_h5_keys keys="uns/log1p"/>
                </assert_contents>
            </output>
        </test>

        <!-- test 3: pp.recipe_weinreb17 with defaults; checks that the fixed arguments of the
             script template are rendered and that uns/log1p exists in the output -->
        <test expect_num_outputs="2">
            <param name="adata" value="paul15_subsample.h5ad"/>
            <conditional name="method">
                <param name="method" value="pp.recipe_weinreb17"/>
            </conditional>
            <section name="advanced_common">
                <param name="show_log" value="true"/>
            </section>
            <output name="hidden_output">
                <assert_contents>
                    <has_text_matching expression="sc.pp.recipe_weinreb17"/>
                    <has_text_matching expression="log=True"/>
                    <has_text_matching expression="mean_threshold=0.01"/>
                    <has_text_matching expression="cv_threshold=2"/>
                    <has_text_matching expression="n_pcs=50"/>
                    <has_text_matching expression="svd_solver='randomized'"/>
                    <has_text_matching expression="random_state=0"/>
                </assert_contents>
            </output>
            <output name="anndata_out" ftype="h5ad">
                <assert_contents>
                    <has_h5_keys keys="uns/log1p"/>
                </assert_contents>
            </output>
        </test>

        <!-- test 4: pp.recipe_seurat with defaults; checks the rendered call and the
             obs/var/uns keys present in the output -->
        <test expect_num_outputs="2">
            <param name="adata" value="pp.recipe_zheng17.random-randint.h5ad"/>
            <conditional name="method">
                <param name="method" value="pp.recipe_seurat"/>
            </conditional>
            <section name="advanced_common">
                <param name="show_log" value="true"/>
            </section>
            <output name="hidden_output">
                <assert_contents>
                    <has_text_matching expression="sc.pp.recipe_seurat"/>
                    <has_text_matching expression="log=True"/>
                </assert_contents>
            </output>
            <output name="anndata_out" ftype="h5ad">
                <assert_contents>
                    <has_h5_keys keys="obs/n_genes"/>
                    <has_h5_keys keys="var/n_cells"/>
                    <has_h5_keys keys="uns/log1p"/>
                </assert_contents>
            </output>
        </test>

        <!-- test 5: external.pp.magic returning all genes (the default name_list) with t=-1,
             which must be rendered as t='auto'; also pins the before/after statistics lines -->
        <test expect_num_outputs="2">
            <param name="adata" value="krumsiek11.h5ad"/>
            <conditional name="method">
                <param name="method" value="external.pp.magic"/>
                <param name="t" value="-1"/>
                <param name="n_pca" value="5"/>
            </conditional>
            <section name="advanced_common">
                <param name="show_log" value="true"/>
            </section>
            <output name="hidden_output">
                <assert_contents>
                    <has_text_matching expression="external.pp.magic"/>
                    <has_text_matching expression="name_list='all_genes'"/>
                    <has_text_matching expression="t='auto'"/>
                    <has_text_matching expression="n_pca=5"/>
                    <has_text_matching expression="stats before magic: min= -0.01630 max= 1.01060 mean= 0.28644"/>
                    <has_text_matching expression="stats after magic: min= -0.00857 max= 1.00546 mean= 0.28645"/>
                </assert_contents>
            </output>
            <output name="anndata_out" ftype="h5ad">
                <assert_contents>
                    <has_h5_keys keys="obs/cell_type"/>
                </assert_contents>
            </output>
        </test>

        <!-- test 6: external.pp.magic in pca_only mode with an explicit t=3; checks that
             obsm/X_magic is written and that "X_magic" appears on stdout -->
        <test expect_num_outputs="2">
            <param name="adata" value="krumsiek11.h5ad"/>
            <conditional name="method">
                <param name="method" value="external.pp.magic"/>
                <param name="name_list" value="pca_only"/>
                <param name="t" value="3"/>
                <param name="n_pca" value="5"/>
            </conditional>
            <section name="advanced_common">
                <param name="show_log" value="true"/>
            </section>
            <output name="hidden_output">
                <assert_contents>
                    <has_text_matching expression="external.pp.magic"/>
                    <has_text_matching expression="name_list='pca_only'"/>
                    <has_text_matching expression="t=3"/>
                    <has_text_matching expression="n_pca=5"/>
                </assert_contents>
            </output>
            <output name="anndata_out" ftype="h5ad">
                <assert_contents>
                    <has_h5_keys keys="obsm/X_magic"/>
                </assert_contents>
            </output>
            <assert_stdout>
                <has_text text="X_magic"/>
            </assert_stdout>
        </test>
    </tests>
    <help><![CDATA[
Normalize total counts per cell (`pp.normalize_total`)
======================================================

Normalize each cell by total counts over all genes, so that every cell has the same total count after normalization. If choosing target_sum=1e6, this is CPM normalization.

Similar functions are used, for example, by Seurat, Cell Ranger or SPRING.

More details on the `scanpy documentation
<https://scanpy.readthedocs.io/en/stable/api/scanpy.pp.normalize_total.html>`__


Normalization and filtering as of Zheng et al. (2017), the Cell Ranger R Kit of 10x Genomics (`pp.recipe_zheng17`)
==================================================================================================================

Expects non-logarithmized data. If using logarithmized data, pass `log=False`.

The recipe runs the following steps:

- only consider genes with more than 1 count
- normalize with total UMI count per cell
- select highly-variable genes
- subset the genes
- renormalize after filtering
- log transform (if needed)
- scale to unit variance and shift to zero mean

More details on the `scanpy documentation
<https://scanpy.readthedocs.io/en/stable/api/generated/scanpy.pp.recipe_zheng17.html>`__


Normalization and filtering as of Weinreb et al (2017) (`pp.recipe_weinreb17`)
==============================================================================

Expects non-logarithmized data. If using logarithmized data, pass `log=False`.

More details on the `scanpy documentation
<https://scanpy.readthedocs.io/en/stable/api/generated/scanpy.pp.recipe_weinreb17.html>`__


Normalization and filtering as of Seurat, Satija et al. (2015) (`pp.recipe_seurat`)
==========================================================================================

This uses a particular preprocessing.

Expects non-logarithmized data. If using logarithmized data, pass `log=False`.

More details on the `scanpy documentation
<https://scanpy.readthedocs.io/en/stable/api/generated/scanpy.pp.recipe_seurat.html>`__


Markov Affinity-based Graph Imputation of Cells (MAGIC) as of Van Dijk D et al. (2018) (`external.pp.magic`)
============================================================================================================

MAGIC is an algorithm for denoising and transcript recovery of single cells applied to single-cell sequencing data. MAGIC builds a graph from the data and uses diffusion to smooth out noise and recover the data manifold.

The algorithm implemented here has changed primarily in two ways compared to the algorithm described in Van Dijk D et al. (2018).

- Firstly, we use the adaptive kernel described in Moon et al. (2019) for improved stability.
- Secondly, data diffusion is applied in the PCA space, rather than the data space, for speed and memory improvements.

More details on the `scanpy documentation
<https://scanpy.readthedocs.io/en/stable/api/scanpy.external.pp.magic.html>`__

    ]]></help>
    <expand macro="citations"/>
</tool>