Mercurial > repos > iuc > scanpy_filter

<tool id="scanpy_filter" name="Filter" version="@galaxy_version@" profile="@profile@">
    <description>with scanpy</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="requirements"/>
    <expand macro="version_command"/>
    <command detect_errors="exit_code"><![CDATA[
@CMD@
      ]]></command>
    <configfiles>
        <configfile name="script_file"><![CDATA[
@CMD_imports@
@CMD_read_inputs@

#if $method.method == 'pp.filter_cells'
sc.pp.filter_cells(
    adata,
    #if $method.filter.filter == 'min_counts'
    min_counts=$method.filter.min_counts,
    #else if $method.filter.filter == 'max_counts'
    max_counts=$method.filter.max_counts,
    #else if $method.filter.filter == 'min_genes'
    min_genes=$method.filter.min_genes,
    #else if $method.filter.filter == 'max_genes'
    max_genes=$method.filter.max_genes,
    #end if
    copy=False)

#else if $method.method == 'pp.filter_genes'
sc.pp.filter_genes(
    adata,
    #if $method.filter.filter == 'min_counts'
    min_counts=$method.filter.min_counts,
    #else if $method.filter.filter == 'max_counts'
    max_counts=$method.filter.max_counts,
    #else if $method.filter.filter == 'min_cells'
    min_cells=$method.filter.min_cells,
    #else if $method.filter.filter == 'max_cells'
    max_cells=$method.filter.max_cells,
    #end if
    copy=False)

#else if $method.method == 'tl.filter_rank_genes_groups'
sc.tl.filter_rank_genes_groups(
    adata,
    #if str($method.key) != ''
    key='$method.key',
    #end if
    #if str($method.groupby) != ''
    groupby='$method.groupby',
    #end if
    use_raw=$method.use_raw,
    log=$method.log,
    key_added='$method.key_added',
    min_in_group_fraction=$method.min_in_group_fraction,
    max_out_group_fraction=$method.max_out_group_fraction,
    min_fold_change=$method.min_fold_change)

#else if $method.method == "pp.highly_variable_genes"
sc.pp.highly_variable_genes(
    adata=adata,
    flavor='$method.flavor.flavor',
    #if $method.flavor.flavor == 'seurat'
        #if str($method.flavor.min_mean) != ''
    min_mean=$method.flavor.min_mean,
        #end if
        #if str($method.flavor.max_mean) != ''
    max_mean=$method.flavor.max_mean,
        #end if
        #if str($method.flavor.min_disp) != ''
    min_disp=$method.flavor.min_disp,
        #end if
        #if str($method.flavor.max_disp) != ''
    max_disp=$method.flavor.max_disp,
        #end if
    #else if $method.flavor.flavor == 'cell_ranger'
    n_top_genes=$method.flavor.n_top_genes,
    #end if
    n_bins=$method.n_bins,
    subset=$method.subset,
    inplace=True)

#else if $method.method == 'pp.subsample'
sc.pp.subsample(
    data=adata,
    #if $method.type.type == 'fraction'
    fraction=$method.type.fraction,
    #else if $method.type.type == 'n_obs'
    n_obs=$method.type.n_obs,
    #end if
    random_state=$method.random_state,
    copy=False)

#else if $method.method == "pp.downsample_counts"
sc.pp.downsample_counts(
    adata=adata,
    #if str($method.counts_per_cell) != ''
    counts_per_cell=$method.counts_per_cell,
    #end if
    #if str($method.total_counts) != ''
    total_counts=$method.total_counts,
    #end if
    random_state=$method.random_state,
    replace=$method.replace,
    copy=False)
#end if

@CMD_anndata_write_outputs@
]]></configfile>
    </configfiles>
    <inputs>
        <expand macro="inputs_anndata"/>
        <conditional name="method">
            <param argument="method" type="select" label="Method used for filtering">
                <option value="pp.filter_cells">Filter cell outliers based on counts and numbers of genes expressed, using 'pp.filter_cells'</option>
                <option value="pp.filter_genes">Filter genes based on number of cells or counts, using 'pp.filter_genes'</option>
                <option value="tl.filter_rank_genes_groups">Filters out genes based on fold change and fraction of genes expressing the gene within and outside the groupby categories, using 'tl.filter_rank_genes_groups'</option>
                <option value="pp.highly_variable_genes">Annotate (and filter) highly variable genes, using 'pp.highly_variable_genes'</option>
                <option value="pp.subsample">Subsample to a fraction of the number of observations, using 'pp.subsample'</option>
                <option value="pp.downsample_counts">Downsample counts from count matrix, using 'pp.downsample_counts'</option>
            </param>
            <when value="pp.filter_cells">
                <conditional name="filter">
                    <param argument="filter" type="select" label="Filter">
                        <option value="min_counts">Minimum number of counts</option>
                        <option value="max_counts">Maximum number of counts</option>
                        <option value="min_genes">Minimum number of genes expressed</option>
                        <option value="max_genes">Maximum number of genes expressed</option>
                    </param>
                    <when value="min_counts">
                        <param argument="min_counts" type="integer" min="0" value="" label="Minimum number of counts required for a cell to pass filtering" help=""/>
                    </when>
                    <when value="max_counts">
                        <param argument="max_counts" type="integer" min="0" value="" label="Maximum number of counts required for a cell to pass filtering" help=""/>
                    </when>
                    <when value="min_genes">
                        <param argument="min_genes" type="integer" min="0" value="" label="Minimum number of genes expressed required for a cell to pass filtering" help=""/>
                    </when>
                    <when value="max_genes">
                        <param argument="max_genes" type="integer" min="0" value="" label="Maximum number of genes expressed required for a cell to pass filtering" help=""/>
                    </when>
                </conditional>
            </when>
            <when value="pp.filter_genes">
                <conditional name="filter">
                    <param argument="filter" type="select" label="Filter">
                        <option value="min_counts">Minimum number of counts</option>
                        <option value="max_counts">Maximum number of counts</option>
                        <option value="min_cells">Minimum number of cells expressed</option>
                        <option value="max_cells">Maximum number of cells expressed</option>
                    </param>
                    <when value="min_counts">
                        <param argument="min_counts" type="integer" min="0" value="" label="Minimum number of counts required for a gene to pass filtering"/>
                    </when>
                    <when value="max_counts">
                        <param argument="max_counts" type="integer" min="0" value="" label="Maximum number of counts required for a gene to pass filtering"/>
                    </when>
                    <when value="min_cells">
                        <param argument="min_cells" type="integer" min="0" value="" label="Minimum number of cells expressed required for a gene to pass filtering"/>
                    </when>
                    <when value="max_cells">
                        <param argument="max_cells" type="integer" min="0" value="" label="Maximum number of cells expressed required for a gene to pass filtering"/>
                    </when>
                </conditional>
            </when>
            <when value="tl.filter_rank_genes_groups">
                <param argument="key" type="text" optional="true" label="Key in adata.uns where the rank_genes_groups output is stored">
                    <expand macro="sanitize_query" />
                </param>
                <param argument="groupby" type="text" optional="true" label="The key of the observations grouping to consider">
                    <expand macro="sanitize_query" />
                </param>
                <expand macro="param_use_raw"/>
                <expand macro="param_log"/>
                <param argument="key_added" type="text" value="rank_genes_groups_filtered" label="Key that will contain new values">
                    <expand macro="sanitize_query" />
                </param>
                <param argument="min_in_group_fraction" type="float" min="0" max="1" value="0.25" label="Minimum fraction of genes expressing the gene within the categories"/>
                <param argument="max_out_group_fraction" type="float" min="0" max="1" value="0.5" label="Maximum fraction of genes expressing the gene outside the categories"/>
                <param argument="min_fold_change" type="integer" value="2" label="Minimum fold change"/>
            </when>
            <when value="pp.highly_variable_genes">
                <conditional name='flavor'>
                    <param argument="flavor" type="select" label="Flavor for computing normalized dispersion">
                        <option value="seurat">Seurat</option>
                        <option value="cell_ranger">Cell Ranger</option>
                    </param>
                    <when value="seurat">
                        <param argument="min_mean" type="float" value="0.0125" label="Minimal mean cutoff"/>
                        <param argument="max_mean" type="float" value="3" label="Maximal mean cutoff"/>
                        <param argument="min_disp" type="float" value="0.5" label="Minimal normalized dispersion cutoff"/>
                        <param argument="max_disp" type="float" value="" optional="true" label="Maximal normalized dispersion cutoff"/>
                    </when>
                    <when value="cell_ranger">
                        <param argument="n_top_genes" type="integer" value="" label="Number of highly-variable genes to keep"/>
                    </when>
                </conditional>
                <param argument="n_bins" type="integer" value="20" label="Number of bins for binning the mean gene expression" help="Normalization is done with respect to each bin. If just a single gene falls into a bin, the normalized dispersion is artificially set to 1"/>
                <param argument="subset" type="boolean" truevalue="True" falsevalue="False" checked="false" label="Inplace subset to highly-variable genes?" help="Otherwise it merely indicates highly variable genes."/>
            </when>
            <when value="pp.subsample">
                <conditional name="type">
                    <param name="type" type="select" label="Type of subsampling">
                        <option value="fraction">By fraction</option>
                        <option value="n_obs">By number of observation</option>
                    </param>
                    <when value="fraction">
                        <param argument="fraction" type="float" value="" label="Subsample to this 'fraction' of the number of observations"/>
                    </when>
                    <when value="n_obs">
                        <param argument="n_obs" type="integer" min="0" value="" label="Subsample to this number of observations"/>
                    </when>
                </conditional>
                <param argument="random_state" type="integer" value="0" label="Random seed to change subsampling"/>
            </when>
            <when value="pp.downsample_counts">
                <param argument="counts_per_cell" type="integer" min="0" optional="true" label="Target total counts per cell" help="If a cell has more than ‘counts_per_cell’, it will be downsampled to this number. Resulting counts can be specified on a per cell basis by passing an array."/>
                <param argument="total_counts" type="integer" min="0" optional="true" label="Target total counts" help="If the count matrix has more than total_counts it will be downsampled to have this number."/>
                <param argument="random_state" type="integer" value="0" label="Random seed to change subsampling"/>
                <param argument="replace" type="boolean" truevalue="True" falsevalue="False" checked="false" label="Sample the counts with replacement?"/>
            </when>
        </conditional>
        <expand macro="inputs_common_advanced"/>
    </inputs>
    <outputs>
        <expand macro="anndata_outputs"/>
    </outputs>
    <tests>
        <test>
            <!-- test 0 -->
            <param name="adata" value="krumsiek11.h5ad" />
            <conditional name="method">
                <param name="method" value="pp.filter_cells"/>
                <conditional name="filter">
                    <param name="filter" value="min_counts"/>
                    <param name="min_counts" value="3"/>
                </conditional>
            </conditional>
            <assert_stdout>
                <has_text_matching expression="336 × 11"/>
            </assert_stdout>
            <section name="advanced_common">
                <param name="show_log" value="true" />
            </section>
            <output name="hidden_output">
                <assert_contents>
                    <has_text_matching expression="sc.pp.filter_cells"/>
                    <has_text_matching expression="min_counts=3"/>
                </assert_contents>
            </output>
            <output name="anndata_out" file="pp.filter_cells.krumsiek11-min_counts.h5ad" ftype="h5ad" compare="sim_size"/>
        </test>
        <test>
            <!-- test 1 -->
            <param name="adata" value="krumsiek11.h5ad" />
            <conditional name="method">
                <param name="method" value="pp.filter_cells"/>
                <conditional name="filter">
                    <param name="filter" value="max_genes"/>
                    <param name="max_genes" value="100"/>
                </conditional>
            </conditional>
            <section name="advanced_common">
                <param name="show_log" value="true" />
            </section>
            <output name="hidden_output">
                <assert_contents>
                    <has_text_matching expression="sc.pp.filter_cells"/>
                    <has_text_matching expression="adata"/>
                    <has_text_matching expression="max_genes=100"/>
                </assert_contents>
            </output>
            <output name="anndata_out" file="pp.filter_cells.krumsiek11-max_genes.h5ad" ftype="h5ad" compare="sim_size"/>
        </test>
        <test>
            <!-- test 2 -->
            <param name="adata" value="krumsiek11.h5ad" />
            <conditional name="method">
                <param name="method" value="pp.filter_genes"/>
                <conditional name="filter">
                    <param name="filter" value="min_counts"/>
                    <param name="min_counts" value="3"/>
                </conditional>
            </conditional>
            <section name="advanced_common">
                <param name="show_log" value="true" />
            </section>
            <output name="hidden_output">
                <assert_contents>
                    <has_text_matching expression="sc.pp.filter_genes"/>
                    <has_text_matching expression="min_counts=3"/>
                </assert_contents>
            </output>
            <output name="anndata_out" file="pp.filter_genes.krumsiek11-min_counts.h5ad" ftype="h5ad" compare="sim_size"/>
        </test>
        <!-- <test> -->
        <!--     <!-\- test 3 -\-> -->
        <!--     <!-\- Input dataset appears to be missing rank_genes_groups key... -\-> -->
        <!--     <param name="adata" value="tl.rank_genes_groups.krumsiek11.h5ad" /> -->
        <!--     <conditional name="method"> -->
        <!--         <param name="method" value="tl.filter_rank_genes_groups"/> -->
        <!--         <param name="key" value="rank_genes_groups"/> -->
        <!--         <param name="use_raw" value="False"/> -->
        <!--         <param name="log" value="False"/> -->
        <!--         <param name="key_added" value="rank_genes_groups_filtered"/> -->
        <!--         <param name="min_in_group_fraction" value="0.25"/> -->
        <!--         <param name="max_out_group_fraction" value="0.5"/> -->
        <!--         <param name="min_fold_change" value="3"/> -->
        <!--     </conditional> -->
        <!--     <output name="hidden_output"> -->
        <!--         <assert_contents> -->
        <!--             <has_text_matching expression="tl.filter_rank_genes_groups"/> -->
        <!--             <has_text_matching expression="key='rank_genes_groups'"/> -->
        <!--             <has_text_matching expression="use_raw=False"/> -->
        <!--             <has_text_matching expression="log=False"/> -->
        <!--             <has_text_matching expression="key_added='rank_genes_groups_filtered'"/> -->
        <!--             <has_text_matching expression="min_in_group_fraction=0.25"/> -->
        <!--             <has_text_matching expression="max_out_group_fraction=0.5"/> -->
        <!--             <has_text_matching expression="min_fold_change=3"/> -->
        <!--         </assert_contents> -->
        <!--     </output> -->
        <!--     <output name="anndata_out" file="pp.filter_rank_genes_groups.h5ad" ftype="h5ad" compare="sim_size"/> -->
        <!-- </test> -->
        <test>
            <!-- test 4 -->
            <param name="adata" value="blobs.h5ad"/>
            <conditional name="method">
                <param name="method" value="pp.highly_variable_genes"/>
                <conditional name="flavor">
                    <param name="flavor" value="seurat"/>
                    <param name="min_mean" value="0.0125"/>
                    <param name="max_mean" value="3"/>
                    <param name="min_disp" value="0.5"/>
                </conditional>
                <param name="n_bins" value="20"/>
                <param name="subset" value="false"/>
            </conditional>
            <section name="advanced_common">
                <param name="show_log" value="true" />
            </section>
            <output name="hidden_output">
                <assert_contents>
                    <has_text_matching expression="sc.pp.highly_variable_genes"/>
                    <has_text_matching expression="flavor='seurat'"/>
                    <has_text_matching expression="min_mean=0.0125"/>
                    <has_text_matching expression="max_mean=3"/>
                    <has_text_matching expression="min_disp=0.5"/>
                    <has_text_matching expression="n_bins=20"/>
                    <has_text_matching expression="subset=False"/>
                </assert_contents>
            </output>
            <output name="anndata_out" file="pp.highly_variable_genes.seurat.blobs.h5ad" ftype="h5ad" compare="sim_size"/>
        </test>
        <test>
            <!-- test 5 -->
            <param name="adata" value="krumsiek11.h5ad" />
            <conditional name="method">
                <param name="method" value="pp.highly_variable_genes"/>
                <conditional name="flavor">
                    <param name="flavor" value="cell_ranger"/>
                    <param name="n_top_genes" value="2"/>
                </conditional>
                <param name="n_bins" value="20"/>
                <param name="subset" value="true"/>
            </conditional>
            <section name="advanced_common">
                <param name="show_log" value="true" />
            </section>
            <output name="hidden_output">
                <assert_contents>
                    <has_text_matching expression="sc.pp.highly_variable_genes"/>
                    <has_text_matching expression="flavor='cell_ranger'"/>
                    <has_text_matching expression="n_top_genes=2"/>
                    <has_text_matching expression="n_bins=20"/>
                    <has_text_matching expression="subset=True"/>
                </assert_contents>
            </output>
            <output name="anndata_out" file="pp.highly_variable_genes.krumsiek11-cell_ranger.h5ad" ftype="h5ad" compare="sim_size"/>
        </test>
        <test>
            <!-- test 6 -->
            <param name="adata" value="krumsiek11.h5ad" />
            <conditional name="method">
                <param name="method" value="pp.subsample"/>
                <conditional name="type">
                    <param name="type" value="fraction" />
                    <param name="fraction" value="0.5"/>
                </conditional>
                <param name="random_state" value="0"/>
            </conditional>
            <section name="advanced_common">
                <param name="show_log" value="true" />
            </section>
            <output name="hidden_output">
                <assert_contents>
                    <has_text_matching expression="sc.pp.subsample"/>
                    <has_text_matching expression="fraction=0.5"/>
                    <has_text_matching expression="random_state=0"/>
                </assert_contents>
            </output>
            <output name="anndata_out" file="pp.subsample.krumsiek11_fraction.h5ad" ftype="h5ad" compare="sim_size"/>
        </test>
        <test>
            <!-- test 7 -->
            <param name="adata" value="krumsiek11.h5ad" />
            <conditional name="method">
                <param name="method" value="pp.subsample"/>
                <conditional name="type">
                    <param name="type" value="n_obs" />
                    <param name="n_obs" value="10"/>
                </conditional>
                <param name="random_state" value="0"/>
            </conditional>
            <section name="advanced_common">
                <param name="show_log" value="true" />
            </section>
            <output name="hidden_output">
                <assert_contents>
                    <has_text_matching expression="sc.pp.subsample"/>
                    <has_text_matching expression="n_obs=10"/>
                    <has_text_matching expression="random_state=0"/>
                </assert_contents>
            </output>
            <output name="anndata_out" file="pp.subsample.krumsiek11_n_obs.h5ad" ftype="h5ad" compare="sim_size"/>
        </test>
        <test>
            <!-- test 8 -->
            <param name="adata" value="random-randint.h5ad" />
            <conditional name="method">
                <param name="method" value="pp.downsample_counts"/>
                <param name="total_counts" value="20000"/>
                <param name="random_state" value="0"/>
                <param name="replace" value="false"/>
            </conditional>
            <section name="advanced_common">
                <param name="show_log" value="true" />
            </section>
            <output name="hidden_output">
                <assert_contents>
                    <has_text_matching expression="sc.pp.downsample_counts"/>
                    <has_text_matching expression="total_counts=20000"/>
                    <has_text_matching expression="random_state=0"/>
                    <has_text_matching expression="replace=False"/>
                </assert_contents>
            </output>
            <output name="anndata_out" file="pp.downsample_counts.random-randint.h5ad" ftype="h5ad" compare="sim_size"/>
        </test>
    </tests>
    <help><![CDATA[

Filter cells outliers based on counts and numbers of genes expressed (`pp.filter_cells`)
========================================================================================

For instance, only keep cells with at least `min_counts` counts or
`min_genes` genes expressed. This is to filter measurement outliers, i.e.,
"unreliable" observations.

Only provide one of the optional parameters `min_counts`, `min_genes`,
`max_counts`, `max_genes` per call.

More details on the `scanpy documentation
<https://icb-scanpy.readthedocs-hosted.com/en/@version@/api/scanpy.pp.filter_cells.html>`__


Filter genes based on number of cells or counts (`pp.filter_genes`)
===================================================================

Keep genes that have at least `min_counts` counts or are expressed in at
least `min_cells` cells or have at most `max_counts` counts or are expressed
in at most `max_cells` cells.

Only provide one of the optional parameters `min_counts`, `min_cells`,
`max_counts`, `max_cells` per call.

More details on the `scanpy documentation
<https://icb-scanpy.readthedocs-hosted.com/en/@version@/api/scanpy.pp.filter_genes.html>`__


Filters out genes based on fold change and fraction of genes expressing the gene within and outside the groupby categories (`tl.filter_rank_genes_groups`)
==========================================================================================================================================================

More details on the `scanpy documentation
<https://icb-scanpy.readthedocs-hosted.com/en/@version@/api/scanpy.tl.filter_rank_genes_groups.html>`__


Annotate highly variable genes (`pp.highly_variable_genes`)
===========================================================

It expects logarithmized data.

Depending on flavor, this reproduces the R-implementations of Seurat or Cell Ranger. The normalized dispersion is obtained by scaling with the mean and standard deviation of the dispersions for genes falling into a given bin for mean expression of genes. This means that for each bin of mean expression, highly variable genes are selected.


Subsample to a fraction of the number of observations (`pp.subsample`)
======================================================================

More details on the `scanpy documentation
<https://icb-scanpy.readthedocs-hosted.com/en/@version@/api/scanpy.pp.subsample.html>`__

Downsample counts (`pp.downsample_counts`)
==========================================

Downsample counts so that each cell has no more than `target_counts`. Cells with fewer counts than `target_counts` are unaffected by this. This
has been implemented by M. D. Luecken.

More details on the `scanpy documentation
<https://icb-scanpy.readthedocs-hosted.com/en/@version@/api/scanpy.pp.downsample_counts.html>`__


    ]]></help>
    <expand macro="citations"/>
</tool>
author	iuc
date	Wed, 20 May 2020 16:12:00 -0400
parents	a97abb8cd15b
children	f87ed6503715