view inspect.xml @ 15:ca086f24422f draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/scanpy/ commit c21958f44b81d740191999fb6015d5ae69538ee0
author iuc
date Wed, 31 Jul 2024 18:12:22 +0000
parents f54f0f0598ad
children 18262103fa61
line wrap: on
line source

<tool id="scanpy_inspect" name="Inspect and manipulate" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@profile@">
    <description> with scanpy</description>
    <macros>
        <import>macros.xml</import>

        <!-- Inputs shared by the 'tl.score_genes' and 'tl.score_genes_cell_cycle' method branches -->
        <xml name="score_genes_params">
            <param argument="n_bins" type="integer" value="25"
                label="Number of expression level bins for sampling" help=""/>
            <param argument="random_state" type="integer" value="0"
                label="Random seed for sampling" help=""/>
            <expand macro="param_use_raw"/>
        </xml>

        <!-- Trailing keyword arguments of the generated scoring calls; reads n_bins, random_state and use_raw from the 'method' conditional -->
        <token name="@CMD_score_genes_inputs@"><![CDATA[
    n_bins=$method.n_bins,
    random_state=$method.random_state,
    use_raw=$method.use_raw,
    copy=False
        ]]></token>

        <!-- P-value correction choice, expanded for every non-logreg ranking method -->
        <xml name="corr_method">
            <param argument="corr_method" type="select" label="P-value correction method">
                <option value="benjamini-hochberg">Benjamini-Hochberg</option>
                <option value="bonferroni">Bonferroni</option>
            </param>
        </xml>

        <!-- Building blocks for the logistic regression solver branches -->
        <xml name="fit_intercept">
            <param argument="fit_intercept" type="boolean" checked="true" truevalue="True" falsevalue="False"
                label="Should a constant (a.k.a. bias or intercept) be added to the decision function?" help=""/>
        </xml>
        <xml name="max_iter">
            <param argument="max_iter" type="integer" min="0" value="100"
                label="Maximum number of iterations taken for the solvers to converge" help=""/>
        </xml>
        <xml name="multi_class">
            <param argument="multi_class" type="select" label="Multi class" help="">
                <option value="ovr">ovr: a binary problem is fit for each label</option>
                <option value="multinomial">multinomial: the multinomial loss fit across the entire probability distribution, even when the data is binary</option>
                <option value="auto">auto: selects ‘ovr’ if the data is binary and otherwise selects ‘multinomial’</option>
            </param>
        </xml>

        <!-- Test parameter of the 'penalty' conditionals; 'customized' reveals the free-text field below -->
        <xml name="penalty">
            <param argument="penalty" type="select" label="Norm used in the penalization" help="">
                <option value="l1">l1</option>
                <option value="l2">l2</option>
                <option value="customized">customized</option>
            </param>
        </xml>
        <xml name="custom_penalty">
            <param argument="pen" type="text" value="" label="Norm used in the penalization" help="">
                <expand macro="sanitize_query"/>
            </param>
        </xml>

        <!-- Optional seed: the template only emits random_state when a value is given -->
        <xml name="random_state">
            <param argument="random_state" type="integer" value="" optional="true"
                label="The seed of the pseudo random number generator to use when shuffling the data" help=""/>
        </xml>
    </macros>
    <!-- bio_tools, requirements and version_command are not defined in this file (presumably they come from the imported macros.xml) -->
    <expand macro="bio_tools"/>
    <expand macro="requirements"/>
    <expand macro="version_command"/>
    <!-- The whole command line is supplied by the CMD token, which is not defined in this file; failures are detected from the exit code -->
    <command detect_errors="exit_code"><![CDATA[
@CMD@
      ]]></command>
    <configfiles>
        <configfile name="script_file"><![CDATA[
@CMD_imports@
@CMD_read_inputs@

#if $method.method == "pp.calculate_qc_metrics"
## Comma-separated text inputs are expanded into Python list literals.
## Empty items (e.g. from a trailing or doubled comma) are dropped so that
## they are neither passed to int() nor handed to scanpy as empty keys.
sc.pp.calculate_qc_metrics(
    adata=adata,
    expr_type='$method.expr_type',
    var_type='$method.var_type',
    #if $method.qc_vars
        #set $qc_vars = [str(x.strip()) for x in str($method.qc_vars).split(',') if x.strip()]
    qc_vars=$qc_vars,
    #end if
    #if $method.percent_top
        #set $percent_top = [int(x.strip()) for x in str($method.percent_top).split(',') if x.strip()]
    percent_top=$percent_top,
    #end if
    inplace=True)

#else if $method.method == "tl.score_genes"
## gene_list and the optional gene_pool are comma-separated text fields.
## Both are converted with str() before splitting and empty items are dropped.
sc.tl.score_genes(
    adata=adata,
    #set $gene_list = [str(x.strip()) for x in str($method.gene_list).split(',') if x.strip()]
    gene_list=$gene_list,
    ctrl_size=$method.ctrl_size,
    score_name='$method.score_name',
    #if $method.gene_pool
        #set $gene_pool = [str(x.strip()) for x in str($method.gene_pool).split(',') if x.strip()]
    gene_pool=$gene_pool,
    #end if
    @CMD_score_genes_inputs@)

#else if $method.method == "tl.score_genes_cell_cycle"
## Each gene list comes either from a one-gene-per-line file, read at run time
## by the generated script, or from a comma-separated text field expanded here.
## Blank lines and empty items are skipped so no empty gene name is produced.
    #if str($method.s_genes.format) == 'file'
with open('$method.s_genes.file', 'r') as s_genes_f:
    s_genes = [x.strip() for x in s_genes_f if x.strip()]
print(s_genes)
    #end if

    #if str($method.g2m_genes.format) == 'file'
with open('$method.g2m_genes.file', 'r') as g2m_genes_f:
    g2m_genes = [x.strip() for x in g2m_genes_f if x.strip()]
print(g2m_genes)
    #end if

sc.tl.score_genes_cell_cycle(
    adata=adata,
    #if str($method.s_genes.format) == 'text'
        #set $s_genes = [str(x.strip()) for x in str($method.s_genes.text).split(',') if x.strip()]
    s_genes=$s_genes,
    #else if str($method.s_genes.format) == 'file'
    s_genes=s_genes,
    #end if
    #if str($method.g2m_genes.format) == 'text'
        #set $g2m_genes = [str(x.strip()) for x in str($method.g2m_genes.text).split(',') if x.strip()]
    g2m_genes=$g2m_genes,
    #else if str($method.g2m_genes.format) == 'file'
    g2m_genes=g2m_genes,
    #end if
    @CMD_score_genes_inputs@)

#else if $method.method == 'pp.neighbors'
## n_pcs and use_rep are optional inputs: each keyword argument is emitted
## only when a value was provided, otherwise it is left out of the call.
sc.pp.neighbors(
    adata=adata,
    n_neighbors=$method.n_neighbors,
    #if str($method.n_pcs) != ''
    n_pcs=$method.n_pcs,
    #end if
    #if $method.use_rep
    use_rep='$method.use_rep',
    #end if
    knn=$method.knn,
    random_state=$method.random_state,
    ## the input is named pp_neighbors_method but is passed as the 'method' argument
    method='$method.pp_neighbors_method',
    metric='$method.metric',
    copy=False)

#else if $method.method == 'tl.rank_genes_groups'
## Alias for the ranking-method conditional, used only to shorten the expressions below.
    #set $rgg = $method.tl_rank_genes_groups_method
sc.tl.rank_genes_groups(
    adata=adata,
    groupby='$method.groupby',
    #if $method.groups
        #set $group = [x.strip() for x in str($method.groups).split(',')]
    groups=$group,
    #end if
    #if $method.ref.rest == 'rest'
    reference='$method.ref.rest',
    #else
    reference='$method.ref.reference',
    #end if
    n_genes=$method.n_genes,
    method='$rgg.method',
    #if $rgg.method != 'logreg'
    corr_method='$rgg.corr_method',
    #else
        ## Logistic regression: emit the keyword arguments exposed for the chosen solver.
        #set $solver = $rgg.solver
    solver='$solver.solver',
        #if str($solver.solver) in ('newton-cg', 'lbfgs')
    penalty='l2',
    fit_intercept=$solver.fit_intercept,
    max_iter=$solver.max_iter,
    multi_class='$solver.multi_class',
        #else if $solver.solver == 'liblinear'
            #if $solver.penalty.penalty == 'l1'
    penalty='l1',
            #else if $solver.penalty.penalty == 'l2'
    penalty='l2',
    dual=$solver.penalty.dual,
            #else
    penalty='$solver.penalty.pen',
            #end if
    fit_intercept=$solver.intercept_scaling.fit_intercept,
            #if $solver.intercept_scaling.fit_intercept == 'True'
    intercept_scaling=$solver.intercept_scaling.intercept_scaling,
            #end if
            #if str($solver.random_state) != ''
    random_state=$solver.random_state,
            #end if
        #else if $solver.solver == 'sag'
    penalty='l2',
    fit_intercept=$solver.fit_intercept,
            #if str($solver.random_state) != ''
    random_state=$solver.random_state,
            #end if
    max_iter=$solver.max_iter,
    multi_class='$solver.multi_class',
        #else if $solver.solver == 'saga'
            #if $solver.penalty.penalty == 'l1'
    penalty='l1',
            #else if $solver.penalty.penalty == 'l2'
    penalty='l2',
            #else
    penalty='$solver.penalty.pen',
            #end if
    fit_intercept=$solver.fit_intercept,
    multi_class='$solver.multi_class',
        #end if
    tol=$rgg.tol,
    C=$rgg.c,
    #end if
    use_raw=$method.use_raw)

#else if $method.method == "tl.marker_gene_overlap"
## NOTE: this branch cannot currently be selected, because the matching
## select option and 'when' block in the inputs section are commented out.
## Builds a dict mapping each cell identity name to its list of marker genes.
reference_markers = {}
#for $i, $s in enumerate($method.reference_markers)
    #set $list=[x.strip() for x in str($s.values).split(',')]
reference_markers['$s.key'] = $list
#end for

sc.tl.marker_gene_overlap(
    adata,
    reference_markers,
    #if $method.key
    key='$method.key',
    #end if
    method='$method.overlap.method',
    ## normalize is only emitted for overlap_count, and only when it is not 'None'
    #if $method.overlap.method == 'overlap_count' and str($method.overlap.normalize) != 'None'
    normalize='$method.overlap.normalize',
    #end if
    #if str($method.top_n_markers) != ''
    top_n_markers=$method.top_n_markers,
    #end if
    #if str($method.adj_pval_threshold) != ''
    adj_pval_threshold=$method.adj_pval_threshold,
    #end if
    #if $method.key_added
    key_added='$method.key_added',
    #end if
    inplace=True)

#else if $method.method == "pp.log1p"
## The three transformations below are called with copy=False.
sc.pp.log1p(
    adata,
    copy=False)

#else if $method.method == "pp.scale"
sc.pp.scale(
    adata,
    zero_center=$method.zero_center,
    ## max_value is optional: the argument is omitted when the field is empty
    #if str($method.max_value) != ''
    max_value=$method.max_value,
    #end if
    copy=False)

#else if $method.method == "pp.sqrt"
sc.pp.sqrt(
    adata,
    copy=False)
#end if

@CMD_anndata_write_outputs@
]]></configfile>
    </configfiles>
    <inputs>
        <expand macro="inputs_anndata"/>
        <conditional name="method">
            <!-- Test parameter of the 'method' conditional: its value selects both the 'when' block shown to the user and the template branch that is rendered -->
            <param argument="method" type="select" label="Method used for inspecting">
                <option value="pp.calculate_qc_metrics">Calculate quality control metrics, using 'pp.calculate_qc_metrics'</option>
                <option value="pp.neighbors">Compute a neighborhood graph of observations, using 'pp.neighbors'</option>
                <option value="tl.score_genes">Score a set of genes, using 'tl.score_genes'</option>
                <option value="tl.score_genes_cell_cycle">Score cell cycle genes, using 'tl.score_genes_cell_cycle'</option>
                <option value="tl.rank_genes_groups">Rank genes for characterizing groups, using 'tl.rank_genes_groups'</option>
                <!--<option value="tl.marker_gene_overlap">Calculate an overlap score between data-derived marker genes and provided markers, using 'tl.marker_gene_overlap'</option>-->
                <option value="pp.log1p">Logarithmize the data matrix, using 'pp.log1p'</option>
                <option value="pp.scale">Scale data to unit variance and zero mean, using 'pp.scale'</option>
                <option value="pp.sqrt">Square root the data matrix, using 'pp.sqrt'</option>
            </param>
            <when value="pp.calculate_qc_metrics">
                <!-- Free-text names, inserted as quoted strings into the generated call -->
                <param argument="expr_type" type="text" value="counts"
                    label="Name of kind of values in X">
                    <expand macro="sanitize_query"/>
                </param>
                <param argument="var_type" type="text" value="genes"
                    label="The kind of thing the variables are">
                    <expand macro="sanitize_query"/>
                </param>
                <!-- Comma-separated lists; each keyword argument is omitted when its field is empty -->
                <param argument="qc_vars" type="text" value=""
                    label="Keys for boolean columns of '.var' which identify variables you could want to control for"
                    help="Keys separated by a comma">
                    <expand macro="sanitize_query"/>
                </param>
                <param argument="percent_top" type="text" value=""
                    label="Proportions of top genes to cover"
                    help=" Values (integers) are considered 1-indexed, '50' finds cumulative proportion to the 50th most expressed genes. Values separated by a comma. If empty don't calculate">
                    <expand macro="sanitize_vectors"/>
                </param>
            </when>
            <when value="pp.neighbors">
                <param argument="n_neighbors" type="integer" min="0" value="15"
                    label="The size of local neighborhood (in terms of number of neighboring data points) used for manifold approximation"
                    help="Larger values result in more global views of the manifold, while smaller values result in more local data being preserved. In general values should be in the range 2 to 100. If 'knn' is 'True', number of nearest neighbors to be searched. If 'knn' is 'False', a Gaussian kernel width is set to the distance of the 'n_neighbors' neighbor."/>
                <!-- Optional inputs: the template leaves the keyword argument out when the field is empty -->
                <param argument="n_pcs" type="integer" min="0" value="" optional="true"
                    label="Number of PCs to use" help=""/>
                <param argument="use_rep" type="text" value="" optional="true"
                    label="Indicated representation to use"
                    help="If not set, the representation is chosen automatically: for n_vars below 50, X is used, otherwise X_pca (uns) is used. If X_pca is not present, it's computed with default parameter">
                    <expand macro="sanitize_query"/>
                </param>
                <param argument="knn" type="boolean" checked="true" truevalue="True" falsevalue="False"
                    label="Use a hard threshold to restrict the number of neighbors to n_neighbors?"
                    help="If true, it considers a knn graph. Otherwise, it uses a Gaussian Kernel to assign low weights to neighbors more distant than the 'n_neighbors' nearest neighbor."/>
                <param argument="random_state" type="integer" value="0" label="Numpy random seed" help=""/>
                <!-- Named pp_neighbors_method so it does not clash with the 'method' test parameter of the enclosing conditional -->
                <param name="pp_neighbors_method" argument="method" type="select"
                    label="Method for computing connectivities" help="">
                    <option value="umap">umap (McInnes et al, 2018)</option>
                    <option value="gauss">gauss: Gauss kernel following (Coifman et al 2005) with adaptive width (Haghverdi et al 2016)</option>
                </param>
                <param argument="metric" type="select" label="Distance metric" help="">
                    <expand macro="distance_metric_options"/>
                </param>
            </when>
            <when value="tl.score_genes">
                <!-- Comma-separated gene names; the template splits them into a Python list -->
                <param argument="gene_list" type="text" value=""
                    label="The list of gene names used for score calculation"
                    help="Genes separated by a comma">
                    <expand macro="sanitize_query"/>
                </param>
                <param argument="ctrl_size" type="integer" value="50"
                    label="Number of reference genes to be sampled"
                    help="If 'len(gene_list)' is not too low, you can set 'ctrl_size=len(gene_list)'."/>
                <!-- Optional: gene_pool is only passed to the call when this field is filled in -->
                <param argument="gene_pool" type="text" value="" optional="true"
                    label="Genes for sampling the reference set"
                    help="Default is all genes. Genes separated by a comma">
                    <expand macro="sanitize_query"/>
                </param>
                <expand macro="score_genes_params"/>
                <param argument="score_name" type="text" value="score"
                    label="Name of the field to be added in '.obs'" help="">
                    <expand macro="sanitize_query"/>
                </param>
            </when>
            <when value="tl.score_genes_cell_cycle">
                <!-- S phase genes: typed as comma-separated text (default) or read from a one-gene-per-line file -->
                <conditional name="s_genes">
                    <param name="format" type="select"
                        label="Format for the list of genes associated with S phase">
                        <option value="file">File</option>
                        <option value="text" selected="true">Text</option>
                    </param>
                    <when value="text">
                        <param name="text" type="text" value=""
                            label="List of genes associated with S phase"
                            help="Genes separated by a comma">
                            <expand macro="sanitize_query"/>
                        </param>
                    </when>
                    <when value="file">
                        <param name="file" type="data" format="txt"
                            label="File with the list of genes associated with S phase"
                            help="One gene per line"/>
                    </when>
                </conditional>
                <!-- G2M phase genes: same two input formats as above -->
                <conditional name="g2m_genes">
                    <param name="format" type="select"
                        label="Format for the list of genes associated with G2M phase">
                        <option value="file">File</option>
                        <option value="text" selected="true">Text</option>
                    </param>
                    <when value="text">
                        <param name="text" type="text" value=""
                            label="List of genes associated with G2M phase"
                            help="Genes separated by a comma">
                            <expand macro="sanitize_query"/>
                        </param>
                    </when>
                    <when value="file">
                        <param name="file" type="data" format="txt"
                            label="File with the list of genes associated with G2M phase"
                            help="One gene per line"/>
                    </when>
                </conditional>
                <expand macro="score_genes_params"/>
            </when>
            <when value="tl.rank_genes_groups">
                <param argument="groupby" type="text" value="" label="The key of the observations grouping to consider" help="">
                    <expand macro="sanitize_query" />
                </param>
                <expand macro="param_use_raw"/>
                <!-- The template splits this value on commas and strips each item, so the help asks for a plain comma-separated list rather than a Python list literal -->
                <param argument="groups" type="text" value="" label="Subset of groups to which comparison shall be restricted" help="Group names separated by a comma, e.g. g1,g2,g3. If not passed, a ranking will be generated for all groups.">
                    <expand macro="sanitize_query" />
                </param>
                <!-- 'rest' is passed through as the reference; otherwise the group identifier typed below is used -->
                <conditional name="ref">
                    <param name="rest" type="select" label="Comparison">
                        <option value="rest">Compare each group to the union of the rest of the group</option>
                        <option value="group_id">Compare with respect to a specific group</option>
                    </param>
                    <when value="rest"/>
                    <when value="group_id">
                        <param argument="reference" type="text" value="" label="Group identifier with respect to which compare">
                            <expand macro="sanitize_query" />
                        </param>
                    </when>
                </conditional>
                <param argument="n_genes" type="integer" min="0" value="100" label="The number of genes that appear in the returned tables" help=""/>
                <!-- Ranking method: the three statistical tests only expose a p-value correction, while 'logreg' exposes solver-specific options -->
                <conditional name="tl_rank_genes_groups_method">
                    <param argument="method" type="select" label="Method">
                        <option value="t-test" selected="true">t-test</option>
                        <option value="wilcoxon">Wilcoxon-Rank-Sum</option>
                        <option value="t-test_overestim_var">t-test with overestimate of variance of each group</option>
                        <option value="logreg">Logistic regression</option>
                    </param>
                    <when value="t-test">
                        <expand macro="corr_method"/>
                    </when>
                    <when value="wilcoxon">
                        <expand macro="corr_method"/>
                    </when>
                    <when value="t-test_overestim_var">
                        <expand macro="corr_method"/>
                    </when>
                    <when value="logreg">
                        <!-- Each solver branch lists exactly the inputs that the template reads for that solver -->
                        <conditional name="solver">
                            <param argument="solver" type="select" label="Algorithm to use in the optimization problem" help="For small datasets, ‘liblinear’ is a good choice, whereas ‘sag’ and ‘saga’ are faster for large ones. For multiclass problems, only ‘newton-cg’, ‘sag’, ‘saga’ and ‘lbfgs’ handle multinomial loss; ‘liblinear’ is limited to one-versus-rest schemes. ‘newton-cg’, ‘lbfgs’ and ‘sag’ only handle L2 penalty, whereas ‘liblinear’ and ‘saga’ handle L1 penalty.">
                                <option value="newton-cg">newton-cg</option>
                                <option value="lbfgs">lbfgs</option>
                                <option value="liblinear">liblinear</option>
                                <option value="sag">sag</option>
                                <option value="saga">saga</option>
                            </param>
                            <when value="newton-cg">
                                <expand macro="fit_intercept"/>
                                <expand macro="max_iter"/>
                                <expand macro="multi_class"/>
                            </when>
                            <when value="lbfgs">
                                <expand macro="fit_intercept"/>
                                <expand macro="max_iter"/>
                                <expand macro="multi_class"/>
                            </when>
                            <when value="liblinear">
                                <!-- 'dual' is only offered together with the l2 penalty -->
                                <conditional name="penalty">
                                    <expand macro="penalty"/>
                                    <when value="l1"/>
                                    <when value="l2">
                                        <param argument="dual" type="boolean" truevalue="True" falsevalue="False" checked="false"
                                            label="Dual (not primal) formulation?" help="Prefer primal when n_samples > n_features"/>
                                    </when>
                                    <when value="customized">
                                        <expand macro="custom_penalty"/>
                                    </when>
                                </conditional>
                                <!-- Here fit_intercept is a select rather than the boolean macro because it is the test parameter that reveals intercept_scaling -->
                                <conditional name="intercept_scaling">
                                    <param argument="fit_intercept" type="select"
                                        label="Should a constant (a.k.a. bias or intercept) be added to the decision function?" help="">
                                        <option value="True">Yes</option>
                                        <option value="False">No</option>
                                    </param>
                                    <when value="True">
                                        <param argument="intercept_scaling" type="float" value="1.0"
                                            label="Intercept scaling"
                                            help="x becomes [x, self.intercept_scaling], i.e. a 'synthetic' feature with constant value equal to intercept_scaling is appended to the instance vector. The intercept becomes intercept_scaling * synthetic_feature_weight."/>
                                    </when>
                                    <when value="False"/>
                                </conditional>
                                <expand macro="random_state"/>
                            </when>
                            <when value="sag">
                                <expand macro="fit_intercept"/>
                                <expand macro="random_state"/>
                                <expand macro="max_iter"/>
                                <expand macro="multi_class"/>
                            </when>
                            <when value="saga">
                                <conditional name="penalty">
                                    <expand macro="penalty"/>
                                    <when value="l1"/>
                                    <when value="l2"/>
                                    <when value="customized">
                                        <expand macro="custom_penalty"/>
                                    </when>
                                </conditional>
                                <expand macro="fit_intercept"/>
                                <expand macro="multi_class"/>
                            </when>
                        </conditional>
                        <!-- Common to all solvers; the template passes 'c' as the keyword argument 'C' -->
                        <param argument="tol" type="float" value="1e-4" label="Tolerance for stopping criteria" help=""/>
                        <param argument="c" type="float" value="1.0" label="Inverse of regularization strength"
                            help="It must be a positive float. Like in support vector machines, smaller values specify stronger regularization."/>
                    </when>
                </conditional>
            </when>
            <!-- With inplace=True, NotImplementedError: Writing Pandas dataframes to h5ad is currently under development. Please use `inplace=False`. -->
            <!-- <when value="tl.marker_gene_overlap">
                <repeat name="reference_markers" title="Marker genes">
                    <param name="key" type="text" value="" label="Cell identity name" help=""/>
                    <param name="values" type="text" value="" label="List of genes" help="Comma-separated names from 'var'"/>
                </repeat>
                <param argument="key" type="text" value="rank_genes_groups" label="Key in adata.uns where the rank_genes_groups output is stored"/>
                <conditional name="overlap">
                    <param argument="method" type="select" label="Method to calculate marker gene overlap">
                        <option value="overlap_count">overlap_count: Intersection of the gene set</option>
                        <option value="overlap_coef">overlap_coef: Overlap coefficient</option>
                        <option value="jaccard">jaccard: Jaccard index</option>
                    </param>
                    <when value="overlap_count">
                        <param argument="normalize" type="select" label="Normalization option for the marker gene overlap output">
                            <option value="None">None</option>
                            <option value="reference">reference: Normalization of the data by the total number of marker genes given in the reference annotation per group</option>
                            <option value="data">data: Normalization of the data by the total number of marker genes used for each cluster</option>
                        </param>
                    </when>
                    <when value="overlap_coef"/>
                    <when value="jaccard"/>
                </conditional>
                <param argument="top_n_markers" type="integer" optional="true" label="Number of top data-derived marker genes to use" help="By default all calculated marker genes are used. If adj_pval_threshold is set along with top_n_markers, then adj_pval_threshold is ignored."/>
                <param argument="adj_pval_threshold" type="float" optional="true" label="Significance threshold on the adjusted p-values to select marker genes" help=" This can only be used when adjusted p-values are calculated by 'tl.rank_genes_groups'. If adj_pval_threshold is set along with top_n_markers, then adj_pval_threshold is ignored."/>
                <param argument="key_added" type="text" value="marker_gene_overlap" optional="true" label="Key that will contain the marker overlap scores in 'uns'"/>
            </when>-->
            <!-- pp.log1p and pp.sqrt take no extra inputs; pp.scale exposes centering and an optional clipping value -->
            <when value="pp.log1p"/>
            <when value="pp.scale">
                <param argument="zero_center" type="boolean" checked="true" truevalue="True" falsevalue="False"
                    label="Zero center?"
                    help="If not, it omits zero-centering variables, which allows to handle sparse input efficiently."/>
                <param argument="max_value" type="float" value="" optional="true"
                    label="Maximum value"
                    help="Clip (truncate) to this value after scaling. If not set, it does not clip."/>
            </when>
            <when value="pp.sqrt"/>
        </conditional>
        <expand macro="inputs_common_advanced"/>
    </inputs>
    <!-- Output definitions come from the anndata_outputs macro, which is not defined in this file; the tests below refer to outputs named anndata_out and hidden_output -->
    <outputs>
        <expand macro="anndata_outputs"/>
    </outputs>
    <tests>
        <test expect_num_outputs="2">
            <!-- test 1: pp.calculate_qc_metrics with two qc_vars and an empty percent_top -->
            <param name="adata" value="sparce_csr_matrix.h5ad"/>
            <conditional name="method">
                <param name="method" value="pp.calculate_qc_metrics"/>
                <param name="expr_type" value="counts"/>
                <param name="var_type" value="genes"/>
                <param name="qc_vars" value="mito,negative"/>
                <param name="percent_top" value=""/>
            </conditional>
            <section name="advanced_common">
                <param name="show_log" value="true"/>
            </section>
            <!-- hidden_output must contain the generated call with the expected arguments -->
            <output name="hidden_output">
                <assert_contents>
                    <has_text_matching expression="sc.pp.calculate_qc_metrics"/>
                    <has_text_matching expression="expr_type='counts'"/>
                    <has_text_matching expression="var_type='genes'"/>
                    <has_text_matching expression="qc_vars=\['mito', 'negative'\]"/>
                </assert_contents>
            </output>
            <!-- the AnnData result is only compared by size -->
            <output name="anndata_out" file="pp.calculate_qc_metrics.sparce_csr_matrix.h5ad" ftype="h5ad" compare="sim_size"/>
        </test>
        <test expect_num_outputs="2">
            <!-- test 2: pp.neighbors with the umap method and the euclidean metric -->
            <param name="adata" value="pp.recipe_weinreb17.paul15_subsample.h5ad"/>
            <conditional name="method">
                <param name="method" value="pp.neighbors"/>
                <param name="n_neighbors" value="15"/>
                <param name="knn" value="True"/>
                <param name="random_state" value="0"/>
                <param name="pp_neighbors_method" value="umap"/>
                <param name="metric" value="euclidean"/>
            </conditional>
            <section name="advanced_common">
                <param name="show_log" value="true"/>
            </section>
            <!-- hidden_output must contain the generated call with the expected arguments -->
            <output name="hidden_output">
                <assert_contents>
                    <has_text_matching expression="sc.pp.neighbors"/>
                    <has_text_matching expression="n_neighbors=15"/>
                    <has_text_matching expression="knn=True"/>
                    <has_text_matching expression="random_state=0"/>
                    <has_text_matching expression="method='umap'"/>
                    <has_text_matching expression="metric='euclidean'"/>
                </assert_contents>
            </output>
            <!-- compared by size, plus a check that the listed top-level keys are present -->
            <output name="anndata_out" file="pp.neighbors_umap_euclidean.recipe_weinreb17.paul15_subsample.h5ad" ftype="h5ad" compare="sim_size">
                <assert_contents>
                    <has_h5_keys keys="X, obs, obsm, uns, var"/>
                </assert_contents>
            </output>
        </test>
        <test expect_num_outputs="2">
            <!-- test 3: pp.neighbors with the gauss method and the braycurtis metric -->
            <param name="adata" value="pp.recipe_weinreb17.paul15_subsample.h5ad"/>
            <conditional name="method">
                <param name="method" value="pp.neighbors"/>
                <param name="n_neighbors" value="15"/>
                <param name="knn" value="True"/>
                <param name="pp_neighbors_method" value="gauss"/>
                <param name="metric" value="braycurtis"/>
            </conditional>
            <section name="advanced_common">
                <param name="show_log" value="true"/>
            </section>
            <!-- random_state is not set in this test, so the assertion below relies on the input's default value of 0 -->
            <output name="hidden_output">
                <assert_contents>
                    <has_text_matching expression="sc.pp.neighbors"/>
                    <has_text_matching expression="n_neighbors=15"/>
                    <has_text_matching expression="knn=True"/>
                    <has_text_matching expression="random_state=0"/>
                    <has_text_matching expression="method='gauss'"/>
                    <has_text_matching expression="metric='braycurtis'"/>
                </assert_contents>
            </output>
            <!-- the AnnData result is only compared by size -->
            <output name="anndata_out" file="pp.neighbors_gauss_braycurtis.recipe_weinreb17.paul15_subsample.h5ad" ftype="h5ad" compare="sim_size"/>
        </test>
        <test expect_num_outputs="2">
            <!-- test 4: tl.score_genes on krumsiek11.h5ad with a two-gene list.
                 The comma-separated gene_list value is expected to show up in the log as a
                 Python list (gene_list=['Gata2', 'Fog1']); the remaining assertions mirror
                 the parameters set below plus copy=False. -->
            <param name="adata" value="krumsiek11.h5ad" />
            <conditional name="method">
                <param name="method" value="tl.score_genes"/>
                <param name="gene_list" value="Gata2, Fog1"/>
                <param name="ctrl_size" value="2"/>
                <param name="n_bins" value="2"/>
                <param name="random_state" value="2"/>
                <param name="use_raw" value="False"/>
                <param name="score_name" value="score"/>
            </conditional>
            <section name="advanced_common">
                <param name="show_log" value="true" />
            </section>
            <output name="hidden_output">
                <assert_contents>
                    <has_text_matching expression="sc.tl.score_genes" />
                    <has_text_matching expression="gene_list=\['Gata2', 'Fog1'\]" />
                    <has_text_matching expression="ctrl_size=2" />
                    <has_text_matching expression="score_name='score'" />
                    <has_text_matching expression="n_bins=2" />
                    <has_text_matching expression="random_state=2" />
                    <has_text_matching expression="use_raw=False" />
                    <has_text_matching expression="copy=False" />
                </assert_contents>
            </output>
            <output name="anndata_out" file="tl.score_genes.krumsiek11.h5ad" ftype="h5ad" compare="sim_size"/>
        </test>
        <test expect_num_outputs="2">
            <!-- test 5: tl.score_genes_cell_cycle on krumsiek11.h5ad.
                 Both gene sets (s_genes and g2m_genes) are supplied through the "text"
                 format with the same three genes; the log is expected to contain them
                 as Python lists. -->
            <param name="adata" value="krumsiek11.h5ad" />
            <conditional name="method">
                <param name="method" value="tl.score_genes_cell_cycle"/>
                <conditional name='s_genes'>
                    <param name="format" value="text"/>
                    <param name="text" value="Gata2, Fog1, EgrNab"/>
                </conditional>
                <conditional name='g2m_genes'>
                    <param name="format" value="text"/>
                    <param name="text" value="Gata2, Fog1, EgrNab"/>
                </conditional>
                <param name="n_bins" value="2"/>
                <param name="random_state" value="1"/>
                <param name="use_raw" value="False"/>
            </conditional>
            <section name="advanced_common">
                <param name="show_log" value="true" />
            </section>
            <output name="hidden_output">
                <assert_contents>
                    <has_text_matching expression="sc.tl.score_genes_cell_cycle"/>
                    <has_text_matching expression="s_genes=\['Gata2', 'Fog1', 'EgrNab'\]"/>
                    <has_text_matching expression="g2m_genes=\['Gata2', 'Fog1', 'EgrNab'\]"/>
                    <has_text_matching expression="n_bins=2"/>
                    <has_text_matching expression="random_state=1"/>
                    <has_text_matching expression="use_raw=False"/>
                </assert_contents>
            </output>
            <output name="anndata_out" file="tl.score_genes_cell_cycle.krumsiek11.h5ad" ftype="h5ad" compare="sim_size"/>
        </test>
        <test expect_num_outputs="2">
            <!-- test 6: tl.rank_genes_groups on krumsiek11.h5ad, grouped by "cell_type",
                 compared against "rest", using the t-test_overestim_var method with
                 Benjamini-Hochberg p-value correction. -->
            <param name="adata" value="krumsiek11.h5ad" />
            <conditional name="method">
                <param name="method" value="tl.rank_genes_groups"/>
                <param name="groupby" value="cell_type"/>
                <param name="use_raw" value="False"/>
                <conditional name="ref">
                    <param name="rest" value="rest"/>
                </conditional>
                <param name="n_genes" value="100"/>
                <conditional name="tl_rank_genes_groups_method">
                    <param name="method" value="t-test_overestim_var"/>
                    <param name="corr_method" value="benjamini-hochberg"/>
                </conditional>
            </conditional>
            <section name="advanced_common">
                <param name="show_log" value="true" />
            </section>
            <output name="hidden_output">
                <assert_contents>
                    <has_text_matching expression="sc.tl.rank_genes_groups"/>
                    <has_text_matching expression="groupby='cell_type'"/>
                    <has_text_matching expression="use_raw=False"/>
                    <has_text_matching expression="reference='rest'"/>
                    <has_text_matching expression="n_genes=100"/>
                    <has_text_matching expression="method='t-test_overestim_var'"/>
                    <has_text_matching expression="corr_method='benjamini-hochberg'"/>
                </assert_contents>
            </output>
            <output name="anndata_out" file="tl.rank_genes_groups.krumsiek11.h5ad" ftype="h5ad" compare="sim_size"/>
        </test>
        <test expect_num_outputs="2">
            <!-- test 7: tl.rank_genes_groups on pbmc68k_reduced.h5ad with method "logreg"
                 and the newton-cg solver, grouped by "louvain" and using raw data.
                 tol is entered as 1e-4 and is expected in the log as tol=0.0001.
                 NOTE(review): penalty='l2' is asserted although no penalty param is set
                 here; presumably the tool emits it for this solver. Confirm against the
                 command template.
                 The output comparison allows a size difference (delta / delta_frac). -->
            <param name="adata" value="pbmc68k_reduced.h5ad" />
            <conditional name="method">
                <param name="method" value="tl.rank_genes_groups"/>
                <param name="groupby" value="louvain"/>
                <param name="use_raw" value="True"/>
                <conditional name="ref">
                    <param name="rest" value="rest"/>
                </conditional>
                <param name="n_genes" value="100"/>
                <conditional name="tl_rank_genes_groups_method">
                    <param name="method" value="logreg"/>
                    <conditional name="solver">
                        <param name="solver" value="newton-cg"/>
                        <param name="fit_intercept" value="True"/>
                        <param name="max_iter" value="100"/>
                        <param name="multi_class" value="auto"/>
                    </conditional>
                    <param name="tol" value="1e-4"/>
                    <param name="c" value="1.0"/>
                </conditional>
            </conditional>
            <section name="advanced_common">
                <param name="show_log" value="true" />
            </section>
            <output name="hidden_output">
                <assert_contents>
                    <has_text_matching expression="sc.tl.rank_genes_groups"/>
                    <has_text_matching expression="groupby='louvain'"/>
                    <has_text_matching expression="use_raw=True"/>
                    <has_text_matching expression="reference='rest'"/>
                    <has_text_matching expression="n_genes=100"/>
                    <has_text_matching expression="method='logreg'"/>
                    <has_text_matching expression="solver='newton-cg'"/>
                    <has_text_matching expression="penalty='l2'"/>
                    <has_text_matching expression="fit_intercept=True"/>
                    <has_text_matching expression="max_iter=100"/>
                    <has_text_matching expression="multi_class='auto'"/>
                    <has_text_matching expression="tol=0.0001"/>
                    <has_text_matching expression="C=1.0"/>
                </assert_contents>
            </output>
            <output name="anndata_out" file="tl.rank_genes_groups.newton-cg.pbmc68k_reduced.h5ad" ftype="h5ad" compare="sim_size" delta="1000000" delta_frac="0.15">
                <assert_contents>
                    <has_h5_keys keys="X, obs, obsm, raw/X, raw/var, uns, var" />
                </assert_contents>
            </output>
        </test>
        <test expect_num_outputs="2">
            <!-- test 8: tl.rank_genes_groups on pbmc68k_reduced.h5ad with method "logreg"
                 and the liblinear solver (l2 penalty, dual=False, intercept scaling 1.0).
                 NOTE(review): random_state is set to 1 below but is not asserted in the
                 log. Confirm whether the command emits it and add an assertion if so.
                 NOTE(review): the reference output file name mentions krumsiek11 although
                 the input is pbmc68k_reduced.h5ad; presumably a historical file name.
                 Verify before renaming. -->
            <param name="adata" value="pbmc68k_reduced.h5ad" />
            <conditional name="method">
                <param name="method" value="tl.rank_genes_groups"/>
                <param name="groupby" value="louvain"/>
                <param name="use_raw" value="True"/>
                <conditional name="ref">
                    <param name="rest" value="rest"/>
                </conditional>
                <param name="n_genes" value="100"/>
                <conditional name="tl_rank_genes_groups_method">
                    <param name="method" value="logreg"/>
                    <conditional name="solver">
                        <param name="solver" value="liblinear"/>
                        <conditional name="penalty">
                            <param name="penalty" value="l2"/>
                            <param name="dual" value="False"/>
                            <conditional name="intercept_scaling">
                                <param name="fit_intercept" value="True"/>
                                <param name="intercept_scaling" value="1.0" />
                            </conditional>
                            <param name="random_state" value="1"/>
                        </conditional>
                    </conditional>
                    <param name="tol" value="1e-4"/>
                    <param name="c" value="1.0"/>
                </conditional>
            </conditional>
            <section name="advanced_common">
                <param name="show_log" value="true" />
            </section>
            <output name="hidden_output">
                <assert_contents>
                    <has_text_matching expression="sc.tl.rank_genes_groups"/>
                    <has_text_matching expression="groupby='louvain'"/>
                    <has_text_matching expression="use_raw=True"/>
                    <has_text_matching expression="reference='rest'"/>
                    <has_text_matching expression="n_genes=100"/>
                    <has_text_matching expression="method='logreg'"/>
                    <has_text_matching expression="solver='liblinear'"/>
                    <has_text_matching expression="penalty='l2'"/>
                    <has_text_matching expression="dual=False"/>
                    <has_text_matching expression="fit_intercept=True"/>
                    <has_text_matching expression="intercept_scaling=1.0"/>
                    <has_text_matching expression="tol=0.0001"/>
                    <has_text_matching expression="C=1.0"/>
                </assert_contents>
            </output>
            <output name="anndata_out" file="tl.rank_genes_groups.liblinear.krumsiek11.h5ad" ftype="h5ad" compare="sim_size" delta="1000000" delta_frac="0.15">
                <assert_contents>
                    <has_h5_keys keys="X, obs, obsm, raw/X, raw/var, uns, var" />
                </assert_contents>
            </output>
        </test>
        <!-- test expect_num_outputs="2">
            < test 9: the tl.marker_gene_overlap test is commented out because inplace=True does not work >
            <param name="adata" value="tl.rank_genes_groups.newton-cg.pbmc68k_reduced.h5ad" />
            <conditional name="method">
                <param name="method" value="tl.marker_gene_overlap"/>
                <repeat name="reference_markers">
                    <param name="key" value="CD4 T cells"/>
                    <param name="value" value="IL7R"/>
                </repeat>
                <repeat name="reference_markers">
                    <param name="key" value="CD14+ Monocytes"/>
                    <param name="value" value="CD14,LYZ"/>
                </repeat>
                <repeat name="reference_markers">
                    <param name="key" value="B cells"/>
                    <param name="value" value="MS4A1"/>
                </repeat>
                <conditional name="overlap">
                    <param name="method" value="overlap_count"/>
                    <param name="normalize" value="None"/>
                </conditional>
            </conditional>
            <assert_stdout>
                <has_text_matching expression="tl.marker_gene_overlap"/>
                <has_text_matching expression="key='rank_genes_groups'"/>
                <has_text_matching expression="method='overlap_count'"/>
            </assert_stdout>
            <output name="anndata_out" file="tl.marker_gene_overlap.pbmc68k_reduced.h5ad" ftype="h5ad" compare="sim_size"/>
        </test> -->
        <test expect_num_outputs="2">
            <!-- test 10: pp.log1p on krumsiek11.h5ad with no extra parameters.
                 Only checks that the sc.pp.log1p call appears in the log and that the
                 output matches the reference file by size. -->
            <param name="adata" value="krumsiek11.h5ad" />
            <conditional name="method">
                <param name="method" value="pp.log1p"/>
            </conditional>
            <section name="advanced_common">
                <param name="show_log" value="true" />
            </section>
            <output name="hidden_output">
                <assert_contents>
                    <has_text_matching expression="sc.pp.log1p"/>
                </assert_contents>
            </output>
            <output name="anndata_out" file="pp.log1p.krumsiek11.h5ad" ftype="h5ad" compare="sim_size"/>
        </test>
        <test expect_num_outputs="2">
            <!-- test 11: pp.scale on krumsiek11.h5ad with zero_center enabled and no
                 max_value; the boolean is expected in the log as zero_center=True. -->
            <param name="adata" value="krumsiek11.h5ad" />
            <conditional name="method">
                <param name="method" value="pp.scale"/>
                <param name="zero_center" value="true"/>
            </conditional>
            <section name="advanced_common">
                <param name="show_log" value="true" />
            </section>
            <output name="hidden_output">
                <assert_contents>
                    <has_text_matching expression="sc.pp.scale"/>
                    <has_text_matching expression="zero_center=True"/>
                </assert_contents>
            </output>
            <output name="anndata_out" file="pp.scale.krumsiek11.h5ad" ftype="h5ad" compare="sim_size"/>
        </test>
        <test expect_num_outputs="2">
            <!-- test 12: pp.scale on krumsiek11.h5ad with zero_center enabled and a
                 max_value. The value is entered as 10 and is expected in the log as
                 max_value=10.0. -->
            <param name="adata" value="krumsiek11.h5ad" />
            <conditional name="method">
                <param name="method" value="pp.scale"/>
                <param name="zero_center" value="true"/>
                <param name="max_value" value="10"/>
            </conditional>
            <section name="advanced_common">
                <param name="show_log" value="true" />
            </section>
            <output name="hidden_output">
                <assert_contents>
                    <has_text_matching expression="sc.pp.scale"/>
                    <has_text_matching expression="zero_center=True"/>
                    <has_text_matching expression="max_value=10.0"/>
                </assert_contents>
            </output>
            <output name="anndata_out" file="pp.scale_max_value.krumsiek11.h5ad" ftype="h5ad" compare="sim_size"/>
        </test>
        <test expect_num_outputs="2">
            <!-- test 13: pp.sqrt on krumsiek11.h5ad with no extra parameters.
                 Only checks that the sc.pp.sqrt call appears in the log and that the
                 output matches the reference file by size. -->
            <param name="adata" value="krumsiek11.h5ad" />
            <conditional name="method">
                <param name="method" value="pp.sqrt"/>
            </conditional>
            <section name="advanced_common">
                <param name="show_log" value="true" />
            </section>
            <output name="hidden_output">
                <assert_contents>
                    <has_text_matching expression="sc.pp.sqrt"/>
                </assert_contents>
            </output>
            <output name="anndata_out" file="pp.sqrt.krumsiek11.h5ad" ftype="h5ad" compare="sim_size"/>
        </test>
        <test expect_num_outputs="2">
            <!-- test 14: pp.calculate_qc_metrics on sparce_csr_matrix.h5ad.
                 The comma-separated qc_vars and percent_top values are expected in the
                 log as Python lists (strings for qc_vars, integers for percent_top). -->
            <param name="adata" value="sparce_csr_matrix.h5ad" />
            <conditional name="method">
                <param name="method" value="pp.calculate_qc_metrics"/>
                <param name="expr_type" value="counts"/>
                <param name="var_type" value="genes"/>
                <param name="qc_vars" value="mito,negative"/>
                <param name="percent_top" value="50,100,200,300"/>
            </conditional>
            <section name="advanced_common">
                <param name="show_log" value="true" />
            </section>
            <output name="hidden_output">
                <assert_contents>
                    <has_text_matching expression="sc.pp.calculate_qc_metrics" />
                    <has_text_matching expression="expr_type='counts'" />
                    <has_text_matching expression="var_type='genes'" />
                    <has_text_matching expression="qc_vars=\['mito', 'negative'\]" />
                    <has_text_matching expression="percent_top=\[50, 100, 200, 300\]" />
                </assert_contents>
            </output>
            <output name="anndata_out" file="pp.calculate_qc_metrics.sparce_csr_matrix.h5ad" ftype="h5ad" compare="sim_size"/>
        </test>
    </tests>
    <help><![CDATA[
Calculate quality control metrics, using `pp.calculate_qc_metrics`
===================================================================

Calculates a number of qc metrics for an AnnData object, largely based on calculateQCMetrics from scater. 
Currently is most efficient on a sparse CSR or dense matrix.

It updates the observation level metrics with

- total_{var_type}_by_{expr_type} (e.g. "total_genes_by_counts", number of genes with positive counts in a cell)
- total_{expr_type} (e.g. "total_counts", total number of counts for a cell)
- pct_{expr_type}_in_top_{n}_{var_type} - for n in percent_top (e.g. "pct_counts_in_top_50_genes", cumulative percentage of counts for 50 most expressed genes in a cell)
- total_{expr_type}_{qc_var} - for qc_var in qc_vars (e.g. "total_counts_mito", total number of counts for variables in qc_vars)
- pct_{expr_type}_{qc_var} - for qc_var in qc_vars (e.g. "pct_counts_mito", proportion of total counts for a cell which are mitochondrial)

And also the variable level metrics:

- total_{expr_type} (e.g. "total_counts", sum of counts for a gene)
- mean_{expr_type} (e.g. "mean_counts", mean expression over all cells)
- n_cells_by_{expr_type} (e.g. "n_cells_by_counts", number of cells this expression is measured in)
- pct_dropout_by_{expr_type} (e.g. "pct_dropout_by_counts", percentage of cells this feature does not appear in)

More details on the `scanpy documentation
<https://scanpy.readthedocs.io/en/stable/api/scanpy.pp.calculate_qc_metrics.html>`__

Compute a neighborhood graph of observations, using `pp.neighbors`
==================================================================

The efficiency of the neighbor search relies heavily on UMAP (McInnes et al, 2018),
which also provides a method for estimating connectivities of data points -
the connectivity of the manifold (`method=='umap'`). If `method=='gauss'`,
connectivities are computed according to Coifman et al (2005), in the adaptation of
Haghverdi et al (2016).

The returned AnnData object contains:

- Weighted adjacency matrix of the neighborhood graph of data points (connectivities). Weights should be interpreted as connectivities.
- Distances for each pair of neighbors (distances)

These data are stored in the unstructured annotation (uns) and can be accessed using the inspect tool for AnnData objects.

More details on the `scanpy documentation
<https://scanpy.readthedocs.io/en/stable/api/generated/scanpy.pp.neighbors.html>`__

Score a set of genes, using `tl.score_genes`
============================================

The score is the average expression of a set of genes minus the
average expression of a reference set of genes. The reference set is
randomly sampled from the `gene_pool` for each binned expression value.

This reproduces the approach in Seurat (Satija et al, 2015) and has been implemented
for Scanpy by Davide Cittaro.

More details on the `scanpy documentation
<https://scanpy.readthedocs.io/en/stable/api/scanpy.tl.score_genes.html>`__

Score cell cycle genes, using `tl.score_genes_cell_cycle`
=========================================================

Given two lists of genes associated to S phase and G2M phase, calculates
scores and assigns a cell cycle phase (G1, S or G2M). See
`score_genes` for more explanation.

More details on the `scanpy documentation
<https://scanpy.readthedocs.io/en/stable/api/scanpy.tl.score_genes_cell_cycle.html>`__

Rank genes for characterizing groups, using `tl.rank_genes_groups`
==================================================================

The returned AnnData object contains:

- Gene names, ordered according to scores
- Z-score underlying the computation of a p-value for each gene for each group, ordered according to scores
- Log2 fold change for each gene for each group, ordered according to scores. It is only provided if method is ‘t-test’ like. This is an approximation calculated from mean-log values.
- P-values
- Adjusted p-values

These data are stored in the unstructured annotation (uns) and can be accessed using the inspect tool for AnnData objects.

More details on the `scanpy documentation
<https://scanpy.readthedocs.io/en/stable/api/scanpy.tl.rank_genes_groups.html>`__


Calculate an overlap score between data-derived marker genes and provided markers (`tl.marker_gene_overlap`)
============================================================================================================

Marker gene overlap scores can be quoted as overlap counts, overlap coefficients, or jaccard indices. The method returns a pandas dataframe which can be used to annotate clusters based on marker gene overlaps.


Logarithmize the data matrix (`pp.log1p`)
=========================================

More details on the `scanpy documentation
<https://scanpy.readthedocs.io/en/stable/api/scanpy.pp.log1p.html>`__

Scale data to unit variance and zero mean (`pp.scale`)
======================================================

More details on the `scanpy documentation
<https://scanpy.readthedocs.io/en/stable/api/scanpy.pp.scale.html>`__

Computes the square root of the data matrix (`pp.sqrt`)
=======================================================

`X = sqrt(X)`
    ]]></help>
    <expand macro="citations"/>
</tool>