<!-- Mercurial > repos > iuc > scanpy_inspect -->

<tool id="scanpy_inspect" name="Scanpy Inspect and manipulate" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
    <!-- Tool-local macros and tokens; shared ones are pulled in from macros.xml. -->
    <macros>
        <import>macros.xml</import>
        <!-- Inputs common to the tl.score_genes and tl.score_genes_cell_cycle branches of the method conditional. -->
        <xml name="params_score_genes">
            <param argument="n_bins" type="integer" value="25" label="Number of expression level bins for sampling"/>
            <param argument="random_state" type="integer" value="0" label="Random seed for sampling"/>
            <expand macro="param_use_raw"/>
        </xml>
        <!-- Script fragment that renders the three inputs above as trailing keyword arguments; it closes the argument list with copy=False. -->
        <token name="@CMD_PARAMS_SCORE_GENES@"><![CDATA[
    n_bins=$method.n_bins,
    random_state=$method.random_state,
    use_raw=$method.use_raw,
    copy=False
        ]]>
        </token>
        <!-- P-value correction selector, expanded by the t-test, wilcoxon and t-test_overestim_var branches. -->
        <xml name="corr_method">
            <param argument="corr_method" type="select" label="P-value correction method">
                <option value="benjamini-hochberg" selected="true">Benjamini-Hochberg</option>
                <option value="bonferroni">Bonferroni</option>
            </param>
        </xml>
        <!-- The macros below are expanded inside the logistic-regression solver branches. -->
        <xml name="fit_intercept">
            <param argument="fit_intercept" type="boolean" truevalue="True" falsevalue="False" checked="true" label="Should a constant (a.k.a. bias or intercept) be added to the decision function?"/>
        </xml>
        <xml name="max_iter">
            <param argument="max_iter" type="integer" min="0" value="100" label="Maximum number of iterations taken for the solvers to converge"/>
        </xml>
        <xml name="multi_class">
            <param argument="multi_class" type="select" label="Multi class">
                <option value="auto" selected="true">auto: selects ‘ovr’ if the data is binary and otherwise selects ‘multinomial’</option>
                <option value="ovr">ovr: a binary problem is fit for each label</option>
                <option value="multinomial">multinomial: the multinomial loss fit across the entire probability distribution, even when the data is binary</option>
            </param>
        </xml>
        <!-- Base l2/l1 selector; the yield lets a caller append further options (the saga branch adds elasticnet and None). -->
        <xml name="penalty">
            <param argument="penalty" type="select" label="Norm used in the penalization">
                <option value="l2" selected="true">l2</option>
                <option value="l1">l1</option>
                <yield/>
            </param>
        </xml>
        <!-- Optional seed; the script only emits random_state when a value was entered. -->
        <xml name="random_state">
            <param argument="random_state" type="integer" value="" optional="true" label="The seed of the pseudo random number generator to use when shuffling the data"/>
        </xml>
    </macros>
    <!-- bio_tools, requirements and version_command are not defined in this file (presumably they come from macros.xml). -->
    <expand macro="bio_tools"/>
    <!-- scikit-learn is pinned here in addition to whatever the shared requirements macro provides. -->
    <expand macro="requirements">
        <requirement type="package" version="1.5.1">scikit-learn</requirement>
    </expand>
    <expand macro="version_command"/>
    <!-- The whole command line is the CMD token, which is not defined in this file (presumably it comes from macros.xml). Failures are detected from the exit code. -->
    <command detect_errors="exit_code"><![CDATA[
@CMD@
      ]]></command>
    <!--
        Cheetah template rendered into the Python script run by the tool.
        Exactly one branch of the method conditional is emitted, between the
        shared import/read preamble and the shared write-outputs epilogue.
        Text inputs holding comma-separated values are split into Python lists
        at render time; optional inputs are only emitted when non-empty.
    -->
    <configfiles>
        <configfile name="script_file"><![CDATA[
@CMD_IMPORTS@
@CMD_READ_INPUTS@

#if str($method.method) == 'pp.calculate_qc_metrics':
sc.pp.calculate_qc_metrics(
    adata=adata,
    expr_type='$method.expr_type',
    var_type='$method.var_type',
    #if str($method.qc_vars) != '':
        #set $qc_vars = [str(x.strip()) for x in str($method.qc_vars).split(',')]
    qc_vars=$qc_vars,
    #end if
    #if str($method.percent_top) != '':
        #set $percent_top = [int(x.strip()) for x in str($method.percent_top).split(',')]
    percent_top=$percent_top,
    #end if
    #if str($method.layer) != '':
    layer='$method.layer',
    #end if
    use_raw=$method.use_raw,
    log1p=$method.log1p,
    inplace=True)

#else if str($method.method) == 'pp.neighbors':
sc.pp.neighbors(
    adata=adata,
    n_neighbors=$method.n_neighbors,
    #if str($method.n_pcs) != '':
    n_pcs=$method.n_pcs,
    #end if
    #if str($method.use_rep) != '':
    use_rep='$method.use_rep',
    #end if
    knn=$method.knn,
    method='$method.pp_neighbors_method',
    metric='$method.metric',
    random_state=$method.random_state,
    #if str($method.key_added) != '':
    key_added='$method.key_added',
    #end if
    copy=False)

#else if str($method.method) == 'tl.score_genes':
sc.tl.score_genes(
    adata=adata,
    #set $gene_list = [str(x.strip()) for x in str($method.gene_list).split(',')]
    gene_list=$gene_list,
    ctrl_size=$method.ctrl_size,
    #if str($method.gene_pool) != '':
        #set $gene_pool = [str(x.strip()) for x in str($method.gene_pool).split(',')]
    gene_pool=$gene_pool,
    #end if
    score_name='$method.score_name',
    @CMD_PARAMS_SCORE_GENES@)

#else if str($method.method) == 'tl.score_genes_cell_cycle':
    ## Gene lists supplied as files are read at run time (one gene per line);
    ## lists supplied as text are expanded at render time further down.
    #if str($method.s_genes.format) == 'file':
with open('$method.s_genes.file', 'r') as s_genes_f:
    s_genes = [str(x.strip()) for x in s_genes_f.readlines()]
print(s_genes)
    #end if

    #if str($method.g2m_genes.format) == 'file':
with open('$method.g2m_genes.file', 'r') as g2m_genes_f:
    g2m_genes = [str(x.strip()) for x in g2m_genes_f.readlines()]
print(g2m_genes)
    #end if

sc.tl.score_genes_cell_cycle(
    adata=adata,
    #if str($method.s_genes.format) == 'text':
        #set $s_genes = [str(x.strip()) for x in str($method.s_genes.text).split(',')]
    s_genes=$s_genes,
    #else if str($method.s_genes.format) == 'file':
    s_genes=s_genes,
    #end if
    #if str($method.g2m_genes.format) == 'text':
        #set $g2m_genes = [str(x.strip()) for x in str($method.g2m_genes.text).split(',')]
    g2m_genes=$g2m_genes,
    #else if str($method.g2m_genes.format) == 'file':
    g2m_genes=g2m_genes,
    #end if
    @CMD_PARAMS_SCORE_GENES@)

#else if str($method.method) == 'tl.rank_genes_groups':
sc.tl.rank_genes_groups(
    adata=adata,
    #if str($method.groupby) != '':
    groupby='$method.groupby',
    #end if
    use_raw=$method.use_raw,
    #if str($method.groups) != '':
    ## Rendered without surrounding quotes so that Python receives a list of
    ## group names; quoting the rendered list produced invalid Python.
    #set $group=[x.strip() for x in str($method.groups).split(',')]
    groups=$group,
    #end if
    #if str($method.layer) != '':
    layer='$method.layer',
    #end if
    #if str($method.ref.rest) == 'rest':
    reference='$method.ref.rest',
    #else
    reference='$method.ref.reference',
    #end if
    #if str($method.n_genes) != '':
    n_genes=$method.n_genes,
    #end if
    method='$method.tl_rank_genes_groups_method.method',
    #if str($method.tl_rank_genes_groups_method.method) != 'logreg':
    corr_method='$method.tl_rank_genes_groups_method.corr_method',
    #end if
    #if str($method.tl_rank_genes_groups_method.method) == 'wilcoxon':
    tie_correct=$method.tl_rank_genes_groups_method.tie_correct,
    #end if
    #if str($method.tl_rank_genes_groups_method.method) == 'logreg':
    ## Solver options below are extra keyword arguments for the logistic
    ## regression. The "no penalty" choice has to be rendered as the Python
    ## None object: the string 'None' is not an accepted penalty value.
    solver='$method.tl_rank_genes_groups_method.solver.solver',
        #if str($method.tl_rank_genes_groups_method.solver.solver) == 'lbfgs':
            #if str($method.tl_rank_genes_groups_method.solver.penalty) == 'None':
    penalty=None,
            #else:
    penalty='$method.tl_rank_genes_groups_method.solver.penalty',
            #end if
    fit_intercept=$method.tl_rank_genes_groups_method.solver.fit_intercept,
    max_iter=$method.tl_rank_genes_groups_method.solver.max_iter,
    multi_class='$method.tl_rank_genes_groups_method.solver.multi_class',
        #else if str($method.tl_rank_genes_groups_method.solver.solver) == 'newton-cg':
            #if str($method.tl_rank_genes_groups_method.solver.penalty) == 'None':
    penalty=None,
            #else:
    penalty='$method.tl_rank_genes_groups_method.solver.penalty',
            #end if
    fit_intercept=$method.tl_rank_genes_groups_method.solver.fit_intercept,
    max_iter=$method.tl_rank_genes_groups_method.solver.max_iter,
    multi_class='$method.tl_rank_genes_groups_method.solver.multi_class',
        #else if str($method.tl_rank_genes_groups_method.solver.solver) == 'liblinear':
            #if str($method.tl_rank_genes_groups_method.solver.penalty.penalty) == 'l1':
    penalty='l1',
            #else:
    penalty='l2',
    dual=$method.tl_rank_genes_groups_method.solver.penalty.dual,
            #end if
    fit_intercept=$method.tl_rank_genes_groups_method.solver.intercept_scaling.fit_intercept,
            #if str($method.tl_rank_genes_groups_method.solver.intercept_scaling.fit_intercept) == 'True':
    intercept_scaling=$method.tl_rank_genes_groups_method.solver.intercept_scaling.intercept_scaling,
            #end if
            #if str($method.tl_rank_genes_groups_method.solver.random_state) != '':
    random_state=$method.tl_rank_genes_groups_method.solver.random_state,
            #end if
        #else if str($method.tl_rank_genes_groups_method.solver.solver) == 'sag':
            ## For sag the penalty input is a plain boolean parameter, not a
            ## conditional, so it is read directly from the solver section.
            #if str($method.tl_rank_genes_groups_method.solver.penalty) == 'None':
    penalty=None,
            #else:
    penalty='$method.tl_rank_genes_groups_method.solver.penalty',
            #end if
    fit_intercept=$method.tl_rank_genes_groups_method.solver.fit_intercept,
            #if str($method.tl_rank_genes_groups_method.solver.random_state) != '':
    random_state=$method.tl_rank_genes_groups_method.solver.random_state,
            #end if
    max_iter=$method.tl_rank_genes_groups_method.solver.max_iter,
    multi_class='$method.tl_rank_genes_groups_method.solver.multi_class',
        #else if str($method.tl_rank_genes_groups_method.solver.solver) == 'saga':
            #if str($method.tl_rank_genes_groups_method.solver.penalty.penalty) == 'l1':
    penalty='l1',
            #else if str($method.tl_rank_genes_groups_method.solver.penalty.penalty) == 'l2':
    penalty='l2',
            #else if str($method.tl_rank_genes_groups_method.solver.penalty.penalty) == 'elasticnet':
    penalty='elasticnet',
            #else:
    penalty=None,
            #end if
    fit_intercept=$method.tl_rank_genes_groups_method.solver.fit_intercept,
    multi_class='$method.tl_rank_genes_groups_method.solver.multi_class',
        #end if
    tol=$method.tl_rank_genes_groups_method.tol,
    C=$method.tl_rank_genes_groups_method.c,
    #end if
    #if str($method.key_added) != '':
    key_added='$method.key_added',
    #end if
    copy=False)

#else if str($method.method) == "tl.marker_gene_overlap":
reference_markers = {}
#for $i, $s in enumerate($method.reference_markers)
    #set $list=[x.strip() for x in str($s.values).split(',')]
reference_markers['$s.key'] = $list
#end for

# Temporary fix for the issue with "inplace=True" for Pandas dataframes.
# see here: https://github.com/scverse/scanpy/blob/b6193502e11b84fc1b4a011ee9cf08a19da22ebf/src/scanpy/tools/_marker_gene_overlap.py#L167
marker_overlap_result = sc.tl.marker_gene_overlap(
                            adata,
                            reference_markers,
                            #if str($method.key) != '':
                            key='$method.key',
                            #end if
                            method='$method.overlap.method',
                            #if str($method.overlap.method) == 'overlap_count' and str($method.overlap.normalize) != 'None':
                            normalize='$method.overlap.normalize',
                            #end if
                            #if str($method.top_n_markers) != '':
                            top_n_markers=$method.top_n_markers,
                            #end if
                            #if str($method.adj_pval_threshold) != '':
                            adj_pval_threshold=$method.adj_pval_threshold,
                            #end if
                            #if str($method.key_added) != '':
                            key_added='$method.key_added',
                            #end if
                            inplace=False)

## The result is stored by this script rather than by scanpy, so the
## user-supplied key has to be honoured here; the previous fixed key remains
## the fallback when no key was given.
#if str($method.key_added) != '':
adata.uns['$method.key_added'] = marker_overlap_result
#else
adata.uns['marker_gene_overlap'] = marker_overlap_result
#end if

#else if str($method.method) == "pp.log1p":
sc.pp.log1p(
    adata,
    #if str($method.base) != '':
    base=$method.base,
    #end if
    #if str($method.layer) != '':
    layer='$method.layer',
    #end if
    #if str($method.obsm) != '':
    obsm='$method.obsm',
    #end if
    copy=False)

#else if str($method.method) == "pp.scale":
sc.pp.scale(
    adata,
    zero_center=$method.zero_center,
    #if str($method.max_value) != '':
    max_value=$method.max_value,
    #end if
    #if str($method.layer) != '':
    layer='$method.layer',
    #end if
    #if str($method.obsm) != '':
    obsm='$method.obsm',
    #end if
    #if str($method.mask_obs) != '':
    mask_obs='$method.mask_obs',
    #end if
    copy=False)

#else if str($method.method) == "pp.sqrt":

print("stats before sqrt:", "min=", adata.X.min(), "max=", adata.X.max(), "mean=", adata.X.mean())

sc.pp.sqrt(
    adata,
    copy=False)

## Kept inside the sqrt branch so the message is only logged when the square
## root was actually applied.
print("stats after sqrt:", "min=", adata.X.min(), "max=", adata.X.max(), "mean=", adata.X.mean())
#end if

@CMD_ANNDATA_WRITE_OUTPUTS@
        ]]>
        </configfile>
    </configfiles>
    <inputs>
        <expand macro="inputs_anndata"/>
        <conditional name="method">
            <!-- Selector driving the method conditional; each option value is compared verbatim in the script template to pick the scanpy call to render. -->
            <param argument="method" type="select" label="Method used for inspecting">
                <option value="pp.calculate_qc_metrics">Calculate quality control metrics, using 'pp.calculate_qc_metrics'</option>
                <option value="pp.neighbors">Compute a neighborhood graph of observations, using 'pp.neighbors'</option>
                <option value="tl.score_genes">Score a set of genes, using 'tl.score_genes'</option>
                <option value="tl.score_genes_cell_cycle">Score cell cycle genes, using 'tl.score_genes_cell_cycle'</option>
                <option value="tl.rank_genes_groups">Rank genes for characterizing groups, using 'tl.rank_genes_groups'</option>
                <option value="tl.marker_gene_overlap">Calculate an overlap score between data-derived marker genes and provided markers, using 'tl.marker_gene_overlap'</option>
                <option value="pp.log1p">Logarithmize the data matrix, using 'pp.log1p'</option>
                <option value="pp.scale">Scale data to unit variance and zero mean, using 'pp.scale'</option>
                <option value="pp.sqrt">Square root the data matrix, using 'pp.sqrt'</option>
            </param>
            <!-- Inputs for pp.calculate_qc_metrics. qc_vars and percent_top are comma-separated text that the script splits into lists (percent_top entries are converted to integers). -->
            <when value="pp.calculate_qc_metrics">
                <param argument="expr_type" type="text" value="counts" label="Name of kind of values in X">
                    <expand macro="sanitize_query"/>
                </param>
                <param argument="var_type" type="text" value="genes" label="The kind of thing the variables are">
                    <expand macro="sanitize_query"/>
                </param>
                <param argument="qc_vars" type="text" optional="true" value="" label="Keys for boolean columns of '.var' which identify variables you could want to control for" help="Keys separated by a comma">
                    <expand macro="sanitize_query"/>
                </param>
                <param argument="percent_top" type="text" value="" optional="true" label="Proportions of top genes to cover"
                    help=" Values (integers) are considered 1-indexed, '50' finds cumulative proportion to the 50th most expressed genes. Values separated by a comma. If empty don't calculate">
                    <expand macro="sanitize_vectors"/>
                </param>
                <expand macro="param_layer"/>
                <expand macro="param_use_raw"/>
                <param argument="log1p" type="boolean" truevalue="True" falsevalue="False" checked="true" label="Compute log1p transformed annotations"/>
            </when>
            <!-- Inputs for pp.neighbors. The connectivity method is named pp_neighbors_method so it does not clash with the outer method selector; the script passes it as the method argument. -->
            <when value="pp.neighbors">
                <param argument="n_neighbors" type="integer" min="0" value="15" label="The size of local neighborhood (in terms of number of neighboring data points) used for manifold approximation" help="Larger values result in more global views of the manifold, while smaller values result in more local data being preserved. In general values should be in the range 2 to 100. If 'knn' is 'True', number of nearest neighbors to be searched. If 'knn' is 'False', a Gaussian kernel width is set to the distance of the 'n_neighbors' neighbor."/>
                <param argument="n_pcs" type="integer" min="0" value="" optional="true" label="Number of PCs to use"/>
                <expand macro="param_use_rep"/>
                <param argument="knn" type="boolean" truevalue="True" falsevalue="False" checked="true" label="Use a hard threshold to restrict the number of neighbors to n_neighbors?" help="If true, it considers a knn graph. Otherwise, it uses a Gaussian Kernel to assign low weights to neighbors more distant than the 'n_neighbors' nearest neighbor."/>
                <param name="pp_neighbors_method" argument="method" type="select" label="Method for computing connectivities">
                    <option value="umap" selected="true">umap (McInnes et al, 2018)</option>
                    <option value="gauss">gauss: Gauss kernel following (Coifman et al 2005) with adaptive width (Haghverdi et al 2016)</option>
                </param>
                <param argument="metric" type="select" label="Distance metric">
                    <expand macro="distance_metric_options"/>
                </param>
                <param argument="random_state" type="integer" value="0" label="Numpy random seed"/>
                <param argument="key_added" type="text" value="" optional="true" label="Key to store neighbors, distances and connectivities" help="If specified, the neighbors data is added to .uns[key_added], distances are stored in .obsp[key_added+'_distances'] and connectivities in .obsp[key_added+'_connectivities']"/>
            </when>
            <!-- Inputs for tl.score_genes. gene_list and gene_pool are comma-separated text split into lists by the script; gene_pool is only passed when non-empty. -->
            <when value="tl.score_genes">
                <param argument="gene_list" type="text" value="" optional="false" label="The list of gene names used for score calculation" help="Genes separated by a comma">
                    <expand macro="sanitize_query"/>
                </param>
                <param argument="ctrl_size" type="integer" value="50" label="Number of reference genes to be sampled" help="If 'len(gene_list)' is not too low, you can set 'ctrl_size=len(gene_list)'."/>
                <param argument="gene_pool" type="text" value="" optional="true" label="Genes for sampling the reference set" help="Default is all genes. Genes separated by a comma">
                    <expand macro="sanitize_query"/>
                </param>
                <param argument="score_name" type="text" value="score" label="Name of the field to be added in '.obs'">
                    <expand macro="sanitize_query"/>
                </param>
                <expand macro="params_score_genes"/>
            </when>
            <!-- Inputs for tl.score_genes_cell_cycle. Each gene list (S phase, G2M phase) can be typed as comma-separated text or supplied as a dataset with one gene per line; the script handles both forms. -->
            <when value="tl.score_genes_cell_cycle">
                <conditional name='s_genes'>
                    <param name="format" type="select" label="Format for the list of genes associated with S phase">
                        <option value="text" selected="true">Text</option>
                        <option value="file">File</option>
                    </param>
                    <when value="text">
                        <param name="text" type="text" value="" label="List of genes associated with S phase" help="Genes separated by a comma">
                            <expand macro="sanitize_query"/>
                        </param>
                    </when>
                    <when value="file">
                        <param name="file" type="data" format="txt" label="File with the list of genes associated with S phase" help="One gene per line"/>
                    </when>
                </conditional>
                <conditional name='g2m_genes'>
                    <param name="format" type="select" label="Format for the list of genes associated with G2M phase">
                        <option value="text" selected="true">Text</option>
                        <option value="file">File</option>
                    </param>
                    <when value="text">
                        <param name="text" type="text" value="" label="List of genes associated with G2M phase" help="Genes separated by a comma">
                            <expand macro="sanitize_query"/>
                        </param>
                    </when>
                    <when value="file">
                        <param name="file" type="data" format="txt" label="File with the list of genes associated with G2M phase" help="One gene per line"/>
                    </when>
                </conditional>
                <expand macro="params_score_genes"/>
            </when>
            <!--
                Inputs for tl.rank_genes_groups.
                The groups value is split on commas by the script, so its help
                text describes a plain comma-separated list rather than a
                Python list literal.
                Solver-specific options only exist for the logreg method; the
                layout of each solver branch (plain penalty boolean versus
                penalty conditional) must match the paths read in the script.
            -->
            <when value="tl.rank_genes_groups">
                <param argument="groupby" type="text" value="" label="The key of the observations grouping to consider">
                    <expand macro="sanitize_query"/>
                </param>
                <expand macro="param_use_raw"/>
                <param argument="groups" type="text" value="" label="Subset of groups to which comparison shall be restricted" help="Group names separated by a comma, e.g. g1,g2,g3. If not passed, a ranking will be generated for all groups.">
                    <expand macro="sanitize_query"/>
                </param>
                <param argument="layer" type="text" value="" label="Key from adata.layers whose value will be used to perform tests on">
                    <expand macro="sanitize_query"/>
                </param>
                <conditional name="ref">
                    <param name="rest" type="select" label="Comparison">
                        <option value="rest" selected="true">Compare each group to the union of the rest of the group</option>
                        <option value="group_id">Compare with respect to a specific group</option>
                    </param>
                    <when value="rest"/>
                    <when value="group_id">
                        <param argument="reference" type="text" value="" label="Group identifier with respect to which compare">
                            <expand macro="sanitize_query"/>
                        </param>
                    </when>
                </conditional>
                <param argument="n_genes" type="integer" min="0" value="" optional="true" label="The number of genes that appear in the returned tables" help="Defaults to all genes"/>
                <conditional name="tl_rank_genes_groups_method">
                    <param argument="method" type="select" label="Method">
                        <option value="t-test" selected="true">t-test</option>
                        <option value="wilcoxon">Wilcoxon-Rank-Sum</option>
                        <option value="t-test_overestim_var">t-test with overestimate of variance of each group</option>
                        <option value="logreg">Logistic regression</option>
                    </param>
                    <when value="t-test">
                        <expand macro="corr_method"/>
                    </when>
                    <when value="wilcoxon">
                        <expand macro="corr_method"/>
                        <param argument="tie_correct" type="boolean" truevalue="True" falsevalue="False" checked="false" label="Use tie correction for 'wilcoxon' scores"/>
                    </when>
                    <when value="t-test_overestim_var">
                        <expand macro="corr_method"/>
                    </when>
                    <when value="logreg">
                        <conditional name="solver">
                            <param argument="solver" type="select" label="Algorithm to use in the optimization problem" help="For small datasets, ‘liblinear’ is a good choice, whereas ‘sag’ and ‘saga’ are faster for large ones. For multiclass problems, only ‘newton-cg’, ‘sag’, ‘saga’ and ‘lbfgs’ handle multinomial loss; ‘liblinear’ is limited to one-versus-rest schemes. ‘newton-cg’, ‘lbfgs’ and ‘sag’ only handle L2 penalty, whereas ‘liblinear’ and ‘saga’ handle L1 penalty.">
                                <option value="lbfgs" selected="true">lbfgs</option>
                                <option value="newton-cg">newton-cg</option>
                                <option value="liblinear">liblinear</option>
                                <option value="sag">sag</option>
                                <option value="saga">saga</option>
                            </param>
                            <!-- lbfgs, newton-cg and sag expose the penalty as a boolean whose values are the strings l2 and None. -->
                            <when value="lbfgs">
                                <param name="penalty" type="boolean" truevalue="l2" falsevalue="None" checked="true" label="use l2 penalty?"/>
                                <expand macro="fit_intercept"/>
                                <expand macro="max_iter"/>
                                <expand macro="multi_class"/>
                            </when>
                            <when value="newton-cg">
                                <param name="penalty" type="boolean" truevalue="l2" falsevalue="None" checked="true" label="use l2 penalty?"/>
                                <expand macro="fit_intercept"/>
                                <expand macro="max_iter"/>
                                <expand macro="multi_class"/>
                            </when>
                            <!-- liblinear and saga wrap the penalty selector in a conditional, so the script reads it one level deeper. -->
                            <when value="liblinear">
                                <conditional name="penalty">
                                    <expand macro="penalty"/>
                                    <when value="l1"/>
                                    <when value="l2">
                                        <param argument="dual" type="boolean" truevalue="True" falsevalue="False" checked="false"
                                            label="Dual (not primal) formulation?" help="Prefer primal when n_samples > n_features"/>
                                    </when>
                                </conditional>
                                <conditional name="intercept_scaling">
                                    <param argument="fit_intercept" type="select" label="Should a constant (a.k.a. bias or intercept) be added to the decision function?">
                                        <option value="True" selected="true">Yes</option>
                                        <option value="False">No</option>
                                    </param>
                                    <when value="True">
                                        <param argument="intercept_scaling" type="float" value="1.0" label="Intercept scaling" help="x becomes [x, self.intercept_scaling], i.e. a 'synthetic' feature with constant value equal to intercept_scaling is appended to the instance vector. The intercept becomes intercept_scaling * synthetic_feature_weight."/>
                                    </when>
                                    <when value="False"/>
                                </conditional>
                                <expand macro="random_state"/>
                            </when>
                            <when value="sag">
                                <param name="penalty" type="boolean" truevalue="l2" falsevalue="None" checked="true" label="use l2 penalty?"/>
                                <expand macro="fit_intercept"/>
                                <expand macro="random_state"/>
                                <expand macro="max_iter"/>
                                <expand macro="multi_class"/>
                            </when>
                            <when value="saga">
                                <conditional name="penalty">
                                    <expand macro="penalty">
                                        <option value="elasticnet">elasticnet</option>
                                        <option value="None">None</option>
                                    </expand>
                                    <when value="l1"/>
                                    <when value="l2"/>
                                    <when value="elasticnet"/>
                                    <when value="None"/>
                                </conditional>
                                <expand macro="fit_intercept"/>
                                <expand macro="multi_class"/>
                            </when>
                        </conditional>
                        <param argument="tol" type="float" value="1e-4" label="Tolerance for stopping criteria"/>
                        <!-- Named c on the Galaxy side; the script passes it on as the upper-case C keyword. -->
                        <param argument="c" type="float" value="1.0" label="Inverse of regularization strength" help="It must be a positive float. Like in support vector machines, smaller values specify stronger regularization."/>
                    </when>
                </conditional>
                <param argument="key_added" type="text" value="" optional="true" label="The key in adata.uns information is saved to"/>
            </when>
            <!-- Inputs for tl.marker_gene_overlap. Each repeat entry becomes one item of the reference_markers dictionary built by the script: the cell identity name is the key and the comma-separated genes are the list. -->
            <!-- With inplace=True, scanpy raises: NotImplementedError: Writing Pandas dataframes to h5ad is currently under development. Please use `inplace=False`. -->
            <!-- The script works around this by calling the function with inplace=False and storing the returned value in adata.uns itself. -->
            <when value="tl.marker_gene_overlap">
                <repeat name="reference_markers" title="Marker genes">
                    <param name="key" type="text" value="" label="Cell identity name"/>
                    <param name="values" type="text" value="" label="List of genes" help="Comma-separated names from 'var'"/>
                </repeat>
                <param argument="key" type="text" value="rank_genes_groups" label="Key in adata.uns where the rank_genes_groups output is stored"/>
                <conditional name="overlap">
                    <param argument="method" type="select" label="Method to calculate marker gene overlap">
                        <option value="overlap_count" selected="true">overlap_count: Intersection of the gene set</option>
                        <option value="overlap_coef">overlap_coef: Overlap coefficient</option>
                        <option value="jaccard">jaccard: Jaccard index</option>
                    </param>
                    <!-- Normalization is only offered for overlap_count; the script omits the argument when None is selected. -->
                    <when value="overlap_count">
                        <param argument="normalize" type="select" label="Normalization option for the marker gene overlap output">
                            <option value="None" selected="true">None</option>
                            <option value="reference">reference: Normalization of the data by the total number of marker genes given in the reference annotation per group</option>
                            <option value="data">data: Normalization of the data by the total number of marker genes used for each cluster</option>
                        </param>
                    </when>
                    <when value="overlap_coef"/>
                    <when value="jaccard"/>
                </conditional>
                <param argument="top_n_markers" type="integer" optional="true" value="" label="Number of top data-derived marker genes to use" help="By default all calculated marker genes are used. If adj_pval_threshold is set along with top_n_markers, then adj_pval_threshold is ignored."/>
                <param argument="adj_pval_threshold" type="float" optional="true" value="" label="Significance threshold on the adjusted p-values to select marker genes" help=" This can only be used when adjusted p-values are calculated by 'tl.rank_genes_groups'. If adj_pval_threshold is set along with top_n_markers, then adj_pval_threshold is ignored."/>
                <param argument="key_added" type="text" optional="true" value="" label="Key that will contain the marker overlap scores in 'uns'"/>
            </when>
            <!-- Inputs for pp.log1p. All three are optional and are only passed to the call when a value was entered. -->
            <when value="pp.log1p">
                <param argument="base" type="integer" value="" optional="true" label="Base of the logarithm." help="Natural logarithm is used by default."/>
                <param argument="layer" type="text" value="" optional="true" label="Entry of layers to transform">
                    <expand macro="sanitize_query"/>
                </param>
                <param argument="obsm" type="text" value="" optional="true" label="Entry of obsm to transform">
                    <expand macro="sanitize_query"/>
                </param>
            </when>
            <!-- Inputs for pp.scale. zero_center is always passed; max_value, layer, obsm and mask_obs are only passed when non-empty. -->
            <when value="pp.scale">
                <param argument="zero_center" type="boolean" truevalue="True" falsevalue="False" checked="true" label="Zero center?" help="If not, it omits zero-centering variables, which allows to handle sparse input efficiently."/>
                <param argument="max_value" type="float" value="" optional="true" label="Maximum value" help="Clip (truncate) to this value after scaling. If not set, it does not clip."/>
                <param argument="layer" type="text" value="" label="Which element of layers to scale">
                    <expand macro="sanitize_query"/>
                </param>
                <param argument="obsm" type="text" value="" label="Which element of obsm to scale">
                    <expand macro="sanitize_query"/>
                </param>
                <param argument="mask_obs" type="text" value="" label="Restrict both the derivation of scaling parameters and the scaling itself to a certain set of observations.">
                    <expand macro="sanitize_query"/>
                </param>
            </when>
            <!-- pp.sqrt exposes no method-specific parameters, hence the empty branch. -->
            <when value="pp.sqrt"/>
        </conditional>
        <expand macro="inputs_common_advanced"/>
    </inputs>
    <outputs>
        <!-- Output datasets come from the shared "anndata_outputs" macro; the tests in this file address them as "anndata_out" and "hidden_output" (macro body is not visible here, confirm in macros.xml). -->
        <expand macro="anndata_outputs"/>
    </outputs>
    <tests>

        <!-- test 1: pp.calculate_qc_metrics with qc_vars mito and negative, percent_top left unset; checks the logged call and the QC columns written to obs and var -->
        <test expect_num_outputs="2">
            <param name="adata" value="sparce_csr_matrix.h5ad"/>
            <conditional name="method">
                <param name="method" value="pp.calculate_qc_metrics"/>
                <param name="qc_vars" value="mito,negative"/>
            </conditional>
            <section name="advanced_common">
                <param name="show_log" value="true"/>
            </section>
            <output name="hidden_output">
                <assert_contents>
                    <has_text_matching expression="sc.pp.calculate_qc_metrics"/>
                    <has_text_matching expression="expr_type='counts'"/>
                    <has_text_matching expression="var_type='genes'"/>
                    <has_text_matching expression="qc_vars=\['mito', 'negative'\]"/>
                </assert_contents>
            </output>
            <output name="anndata_out" ftype="h5ad">
                <assert_contents>
                    <has_h5_keys keys="obs/n_genes_by_counts,obs/log1p_n_genes_by_counts,obs/total_counts,obs/log1p_total_counts,obs/pct_counts_in_top_50_genes,obs/pct_counts_in_top_100_genes,obs/pct_counts_in_top_200_genes,obs/pct_counts_in_top_500_genes,obs/total_counts_mito,obs/log1p_total_counts_mito,obs/pct_counts_mito,obs/total_counts_negative,obs/log1p_total_counts_negative,obs/pct_counts_negative"/>
                    <has_h5_keys keys="var/n_cells_by_counts,var/mean_counts,var/log1p_mean_counts,var/pct_dropout_by_counts,var/total_counts,var/log1p_total_counts"/>
                </assert_contents>
            </output>
        </test>

        <!-- test 2: pp.calculate_qc_metrics with an explicit percent_top list (50,100,200,300); checks the logged call and that the pct_counts_in_top_N columns follow the list -->
        <test expect_num_outputs="2">
            <param name="adata" value="sparce_csr_matrix.h5ad"/>
            <conditional name="method">
                <param name="method" value="pp.calculate_qc_metrics"/>
                <param name="qc_vars" value="mito,negative"/>
                <param name="percent_top" value="50,100,200,300"/>
            </conditional>
            <section name="advanced_common">
                <param name="show_log" value="true"/>
            </section>
            <output name="hidden_output">
                <assert_contents>
                    <has_text_matching expression="sc.pp.calculate_qc_metrics"/>
                    <has_text_matching expression="expr_type='counts'"/>
                    <has_text_matching expression="var_type='genes'"/>
                    <has_text_matching expression="qc_vars=\['mito', 'negative'\]"/>
                    <has_text_matching expression="percent_top=\[50, 100, 200, 300\]"/>
                </assert_contents>
            </output>
            <output name="anndata_out" ftype="h5ad">
                <assert_contents>
                    <has_h5_keys keys="obs/n_genes_by_counts,obs/log1p_n_genes_by_counts,obs/total_counts,obs/log1p_total_counts,obs/pct_counts_in_top_50_genes,obs/pct_counts_in_top_100_genes,obs/pct_counts_in_top_200_genes,obs/pct_counts_in_top_300_genes,obs/total_counts_mito,obs/log1p_total_counts_mito,obs/pct_counts_mito,obs/total_counts_negative,obs/log1p_total_counts_negative,obs/pct_counts_negative"/>
                    <has_h5_keys keys="var/mito,var/negative,var/n_cells_by_counts,var/mean_counts,var/log1p_mean_counts,var/pct_dropout_by_counts,var/total_counts,var/log1p_total_counts"/>
                </assert_contents>
            </output>
        </test>

        <!-- test 3: pp.neighbors with no options overridden; checks the values logged for the call (n_neighbors=15, knn=True, random_state=0, umap, euclidean) and the graph entries in uns and obsp -->
        <test expect_num_outputs="2">
            <param name="adata" value="pp.recipe_weinreb17.paul15_subsample.h5ad"/>
            <conditional name="method">
                <param name="method" value="pp.neighbors"/>
            </conditional>
            <section name="advanced_common">
                <param name="show_log" value="true"/>
            </section>
            <output name="hidden_output">
                <assert_contents>
                    <has_text_matching expression="sc.pp.neighbors"/>
                    <has_text_matching expression="n_neighbors=15"/>
                    <has_text_matching expression="knn=True"/>
                    <has_text_matching expression="random_state=0"/>
                    <has_text_matching expression="method='umap'"/>
                    <has_text_matching expression="metric='euclidean'"/>
                </assert_contents>
            </output>
            <output name="anndata_out" ftype="h5ad">
                <assert_contents>
                    <has_h5_keys keys="uns/neighbors"/>
                    <has_h5_keys keys="obsp/connectivities,obsp/distances"/>
                </assert_contents>
            </output>
        </test>

        <!-- test 4: pp.neighbors with the gauss method and the braycurtis metric; checks the logged call and the same uns/obsp graph entries as the default-settings test -->
        <test expect_num_outputs="2">
            <param name="adata" value="pp.recipe_weinreb17.paul15_subsample.h5ad"/>
            <conditional name="method">
                <param name="method" value="pp.neighbors"/>
                <param name="pp_neighbors_method" value="gauss"/>
                <param name="metric" value="braycurtis"/>
            </conditional>
            <section name="advanced_common">
                <param name="show_log" value="true"/>
            </section>
            <output name="hidden_output">
                <assert_contents>
                    <has_text_matching expression="sc.pp.neighbors"/>
                    <has_text_matching expression="n_neighbors=15"/>
                    <has_text_matching expression="knn=True"/>
                    <has_text_matching expression="random_state=0"/>
                    <has_text_matching expression="method='gauss'"/>
                    <has_text_matching expression="metric='braycurtis'"/>
                </assert_contents>
            </output>
            <output name="anndata_out" ftype="h5ad">
                <assert_contents>
                    <!-- The neighbors entry in uns is written whichever connectivity method is chosen, so check it here too. -->
                    <has_h5_keys keys="uns/neighbors"/>
                    <has_h5_keys keys="obsp/connectivities,obsp/distances"/>
                </assert_contents>
            </output>
        </test>

        <!-- test 5: tl.score_genes for the gene list Gata2, Fog1 with ctrl_size, n_bins and random_state overridden; checks the logged call and that obs/score is written -->
        <test expect_num_outputs="2">
            <param name="adata" value="krumsiek11.h5ad"/>
            <conditional name="method">
                <param name="method" value="tl.score_genes"/>
                <param name="gene_list" value="Gata2, Fog1"/>
                <param name="ctrl_size" value="2"/>
                <param name="n_bins" value="2"/>
                <param name="random_state" value="2"/>
            </conditional>
            <section name="advanced_common">
                <param name="show_log" value="true"/>
            </section>
            <output name="hidden_output">
                <assert_contents>
                    <has_text_matching expression="sc.tl.score_genes"/>
                    <has_text_matching expression="gene_list=\['Gata2', 'Fog1'\]"/>
                    <has_text_matching expression="ctrl_size=2"/>
                    <has_text_matching expression="score_name='score'"/>
                    <has_text_matching expression="n_bins=2"/>
                    <has_text_matching expression="random_state=2"/>
                    <has_text_matching expression="use_raw=False"/>
                    <has_text_matching expression="copy=False"/>
                </assert_contents>
            </output>
            <output name="anndata_out" ftype="h5ad">
                <assert_contents>
                    <has_h5_keys keys="obs/score"/>
                </assert_contents>
            </output>
        </test>

        <!-- test 6: tl.score_genes_cell_cycle with the S and G2M gene lists both given as text; checks the logged call and the S_score, G2M_score and phase columns in obs -->
        <test expect_num_outputs="2">
            <param name="adata" value="krumsiek11.h5ad"/>
            <conditional name="method">
                <param name="method" value="tl.score_genes_cell_cycle"/>
                <conditional name="s_genes">
                    <param name="format" value="text"/>
                    <param name="text" value="Gata2, Fog1, EgrNab"/>
                </conditional>
                <conditional name="g2m_genes">
                    <param name="format" value="text"/>
                    <param name="text" value="Gata2, Fog1, EgrNab"/>
                </conditional>
                <param name="n_bins" value="2"/>
                <param name="random_state" value="1"/>
            </conditional>
            <section name="advanced_common">
                <param name="show_log" value="true"/>
            </section>
            <output name="hidden_output">
                <assert_contents>
                    <has_text_matching expression="sc.tl.score_genes_cell_cycle"/>
                    <has_text_matching expression="s_genes=\['Gata2', 'Fog1', 'EgrNab'\]"/>
                    <has_text_matching expression="g2m_genes=\['Gata2', 'Fog1', 'EgrNab'\]"/>
                    <has_text_matching expression="n_bins=2"/>
                    <has_text_matching expression="random_state=1"/>
                    <has_text_matching expression="use_raw=False"/>
                </assert_contents>
            </output>
            <output name="anndata_out" ftype="h5ad">
                <assert_contents>
                    <has_h5_keys keys="obs/S_score,obs/G2M_score,obs/phase"/>
                </assert_contents>
            </output>
        </test>

        <!-- test 7: tl.rank_genes_groups grouped by cell_type with the t-test_overestim_var method; checks the logged call and that uns/rank_genes_groups is written -->
        <test expect_num_outputs="2">
            <param name="adata" value="krumsiek11.h5ad"/>
            <conditional name="method">
                <param name="method" value="tl.rank_genes_groups"/>
                <param name="groupby" value="cell_type"/>
                <param name="n_genes" value="100"/>
                <conditional name="tl_rank_genes_groups_method">
                    <param name="method" value="t-test_overestim_var"/>
                </conditional>
            </conditional>
            <section name="advanced_common">
                <param name="show_log" value="true"/>
            </section>
            <output name="hidden_output">
                <assert_contents>
                    <has_text_matching expression="sc.tl.rank_genes_groups"/>
                    <has_text_matching expression="groupby='cell_type'"/>
                    <has_text_matching expression="use_raw=False"/>
                    <has_text_matching expression="reference='rest'"/>
                    <has_text_matching expression="n_genes=100"/>
                    <has_text_matching expression="method='t-test_overestim_var'"/>
                    <has_text_matching expression="corr_method='benjamini-hochberg'"/>
                </assert_contents>
            </output>
            <output name="anndata_out" ftype="h5ad">
                <assert_contents>
                    <has_h5_keys keys="uns/rank_genes_groups"/>
                </assert_contents>
            </output>
        </test>

        <!-- test 8: tl.rank_genes_groups with the logreg method and no solver options overridden; checks the logged solver settings and that uns/rank_genes_groups is written -->
        <test expect_num_outputs="2">
            <param name="adata" value="krumsiek11.h5ad"/>
            <conditional name="method">
                <param name="method" value="tl.rank_genes_groups"/>
                <param name="groupby" value="cell_type"/>
                <param name="n_genes" value="100"/>
                <conditional name="tl_rank_genes_groups_method">
                    <param name="method" value="logreg"/>
                </conditional>
            </conditional>
            <section name="advanced_common">
                <param name="show_log" value="true"/>
            </section>
            <output name="hidden_output">
                <assert_contents>
                    <!-- Dots are escaped so that each one matches a literal "." and not any character (e.g. C=1.0 must not match C=100). -->
                    <has_text_matching expression="sc\.tl\.rank_genes_groups"/>
                    <has_text_matching expression="groupby='cell_type'"/>
                    <has_text_matching expression="use_raw=False"/>
                    <has_text_matching expression="reference='rest'"/>
                    <has_text_matching expression="n_genes=100"/>
                    <has_text_matching expression="method='logreg'"/>
                    <has_text_matching expression="solver='lbfgs'"/>
                    <has_text_matching expression="penalty='l2'"/>
                    <has_text_matching expression="fit_intercept=True"/>
                    <has_text_matching expression="max_iter=100"/>
                    <has_text_matching expression="multi_class='auto'"/>
                    <has_text_matching expression="tol=0\.0001"/>
                    <has_text_matching expression="C=1\.0"/>
                </assert_contents>
            </output>
            <output name="anndata_out" ftype="h5ad">
                <assert_contents>
                    <has_h5_keys keys="uns/rank_genes_groups"/>
                </assert_contents>
            </output>
        </test>

        <!-- test 9: tl.rank_genes_groups with logreg, the liblinear solver and an l2 penalty; checks the logged solver settings and that uns/rank_genes_groups is written -->
        <test expect_num_outputs="2">
            <param name="adata" value="krumsiek11.h5ad"/>
            <conditional name="method">
                <param name="method" value="tl.rank_genes_groups"/>
                <param name="groupby" value="cell_type"/>
                <param name="n_genes" value="100"/>
                <conditional name="tl_rank_genes_groups_method">
                    <param name="method" value="logreg"/>
                    <conditional name="solver">
                        <param name="solver" value="liblinear"/>
                        <conditional name="penalty">
                            <param name="penalty" value="l2"/>
                            <!-- NOTE(review): random_state is set here but no assertion below checks that it reaches the logged call. -->
                            <param name="random_state" value="1"/>
                        </conditional>
                    </conditional>
                </conditional>
            </conditional>
            <section name="advanced_common">
                <param name="show_log" value="true"/>
            </section>
            <output name="hidden_output">
                <assert_contents>
                    <!-- Dots are escaped so that each one matches a literal "." and not any character (e.g. C=1.0 must not match C=100). -->
                    <has_text_matching expression="sc\.tl\.rank_genes_groups"/>
                    <has_text_matching expression="groupby='cell_type'"/>
                    <has_text_matching expression="use_raw=False"/>
                    <has_text_matching expression="reference='rest'"/>
                    <has_text_matching expression="n_genes=100"/>
                    <has_text_matching expression="method='logreg'"/>
                    <has_text_matching expression="solver='liblinear'"/>
                    <has_text_matching expression="penalty='l2'"/>
                    <has_text_matching expression="dual=False"/>
                    <has_text_matching expression="fit_intercept=True"/>
                    <has_text_matching expression="intercept_scaling=1\.0"/>
                    <has_text_matching expression="tol=0\.0001"/>
                    <has_text_matching expression="C=1\.0"/>
                </assert_contents>
            </output>
            <output name="anndata_out" ftype="h5ad">
                <assert_contents>
                    <has_h5_keys keys="uns/rank_genes_groups"/>
                </assert_contents>
            </output>
        </test>

        <!-- test 10: tl.marker_gene_overlap with three reference marker sets supplied through the reference_markers repeat; checks stdout and the logged call -->
        <test expect_num_outputs="2">
            <param name="adata" value="tl.rank_genes_groups.newton-cg.pbmc68k_reduced.h5ad"/>
            <conditional name="method">
                <param name="method" value="tl.marker_gene_overlap"/>
                <repeat name="reference_markers">
                    <param name="key" value="CD4 T cells"/>
                    <param name="values" value="IL7R"/>
                </repeat>
                <repeat name="reference_markers">
                    <param name="key" value="CD14+ Monocytes"/>
                    <param name="values" value="CD14,LYZ"/>
                </repeat>
                <repeat name="reference_markers">
                    <param name="key" value="B cells"/>
                    <param name="values" value="MS4A1"/>
                </repeat>
            </conditional>
            <section name="advanced_common">
                <param name="show_log" value="true"/>
            </section>
            <assert_stdout>
                <has_text_matching expression="marker_gene_overlap"/>
            </assert_stdout>
            <output name="hidden_output">
                <assert_contents>
                    <has_text_matching expression="sc.tl.marker_gene_overlap"/>
                </assert_contents>
            </output>
            <output name="anndata_out" ftype="h5ad">
                <assert_contents>
                    <!-- NOTE(review): judging by the input file name, uns/rank_genes_groups is presumably already present in the input, so this key check may not show that the overlap step changed anything; confirm. -->
                    <has_h5_keys keys="uns/rank_genes_groups"/>
                </assert_contents>
            </output>
        </test>

        <!-- test 11: pp.log1p with no options set; checks the logged call and that a log1p entry appears in uns -->
        <test expect_num_outputs="2">
            <param name="adata" value="krumsiek11.h5ad"/>
            <conditional name="method">
                <param name="method" value="pp.log1p"/>
            </conditional>
            <section name="advanced_common">
                <param name="show_log" value="true"/>
            </section>
            <output name="hidden_output">
                <assert_contents>
                    <has_text_matching expression="sc.pp.log1p"/>
                </assert_contents>
            </output>
            <output name="anndata_out" ftype="h5ad">
                <assert_contents>
                    <has_h5_keys keys="uns/log1p"/>
                </assert_contents>
            </output>
        </test>

        <!-- test 12: pp.scale with no options overridden; checks the logged call (zero_center=True) and the mean and std columns written to var -->
        <test expect_num_outputs="2">
            <param name="adata" value="krumsiek11.h5ad"/>
            <conditional name="method">
                <param name="method" value="pp.scale"/>
            </conditional>
            <section name="advanced_common">
                <param name="show_log" value="true"/>
            </section>
            <output name="hidden_output">
                <assert_contents>
                    <has_text_matching expression="sc.pp.scale"/>
                    <has_text_matching expression="zero_center=True"/>
                </assert_contents>
            </output>
            <output name="anndata_out" ftype="h5ad">
                <assert_contents>
                    <has_h5_keys keys="var/mean,var/std"/>
                </assert_contents>
            </output>
        </test>

        <!-- test 13: pp.scale with max_value set to 10; checks that the clip value reaches the logged call and that the mean and std columns are written to var -->
        <test expect_num_outputs="2">
            <param name="adata" value="krumsiek11.h5ad"/>
            <conditional name="method">
                <param name="method" value="pp.scale"/>
                <param name="max_value" value="10"/>
            </conditional>
            <section name="advanced_common">
                <param name="show_log" value="true"/>
            </section>
            <output name="hidden_output">
                <assert_contents>
                    <!-- Dots are escaped so that each one matches a literal "." and not any character. -->
                    <has_text_matching expression="sc\.pp\.scale"/>
                    <has_text_matching expression="zero_center=True"/>
                    <has_text_matching expression="max_value=10\.0"/>
                </assert_contents>
            </output>
            <output name="anndata_out" ftype="h5ad">
                <assert_contents>
                    <has_h5_keys keys="var/mean,var/std"/>
                </assert_contents>
            </output>
        </test>

        <!-- test 14: pp.sqrt on a random integer matrix; checks the logged call and the min/max/mean statistics logged before and after the transform -->
        <test expect_num_outputs="2">
            <param name="adata" value="random-randint.h5ad"/>
            <conditional name="method">
                <param name="method" value="pp.sqrt"/>
            </conditional>
            <section name="advanced_common">
                <param name="show_log" value="true"/>
            </section>
            <output name="hidden_output">
                <assert_contents>
                    <!-- Dots are escaped so that the decimal points are matched literally and not as regex wildcards. -->
                    <has_text_matching expression="sc\.pp\.sqrt"/>
                    <has_text_matching expression="stats before sqrt: min= 0\.0 max= 999\.0 mean= 499\.83777"/>
                    <has_text_matching expression="stats after sqrt: min= 0\.0 max= 31\.606962 mean= 21\.079018"/>
                </assert_contents>
            </output>
            <output name="anndata_out" ftype="h5ad">
                <assert_contents>
                    <has_h5_keys keys="obs/index"/>
                </assert_contents>
            </output>
        </test>
    </tests>
    <help><![CDATA[
Calculate quality control metrics, using `pp.calculate_qc_metrics`
===================================================================

Calculates a number of qc metrics for an AnnData object, largely based on calculateQCMetrics from scater.
Currently is most efficient on a sparse CSR or dense matrix.

It updates the observation level metrics with

- n_{var_type}_by_{expr_type} (e.g. "n_genes_by_counts", number of genes with positive counts in a cell)
- total_{expr_type} (e.g. "total_counts", total number of counts for a cell)
- pct_{expr_type}_in_top_{n}_{var_type} - for n in percent_top (e.g. "pct_counts_in_top_50_genes", cumulative percentage of counts for 50 most expressed genes in a cell)
- total_{expr_type}_{qc_var} - for qc_var in qc_vars (e.g. "total_counts_mito", total number of counts for variables in qc_vars)
- pct_{expr_type}_{qc_var} - for qc_var in qc_vars (e.g. "pct_counts_mito", proportion of total counts for a cell which are mitochondrial)

And also the variable level metrics:

- total_{expr_type} (e.g. "total_counts", sum of counts for a gene)
- mean_{expr_type} (e.g. "mean_counts", mean expression over all cells)
- n_cells_by_{expr_type} (e.g. "n_cells_by_counts", number of cells this expression is measured in)
- pct_dropout_by_{expr_type} (e.g. "pct_dropout_by_counts", percentage of cells this feature does not appear in)

More details on the `scanpy documentation
<https://scanpy.readthedocs.io/en/stable/api/scanpy.pp.calculate_qc_metrics.html>`__

Compute a neighborhood graph of observations, using `pp.neighbors`
==================================================================

The efficiency of the neighbor search relies heavily on UMAP (McInnes et al, 2018),
which also provides a method for estimating connectivities of data points -
the connectivity of the manifold (`method=='umap'`). If `method=='gauss'`,
connectivities are computed according to Coifman et al (2005), in the adaption of
Haghverdi et al (2016).

The returned AnnData object contains:

- Weighted adjacency matrix of the neighborhood graph of data points (connectivities). Weights should be interpreted as connectivities.
- Distances for each pair of neighbors (distances)

These data are stored in the pairwise observation annotation (obsp), alongside a `neighbors` entry in the unstructured annotation (uns), and can be accessed using the inspect tool for AnnData objects

More details on the `scanpy documentation
<https://scanpy.readthedocs.io/en/stable/api/generated/scanpy.pp.neighbors.html>`__

Score a set of genes, using `tl.score_genes`
============================================

The score is the average expression of a set of genes after subtracting the
average expression of a reference set of genes. The reference set is
randomly sampled from the `gene_pool` for each binned expression value.

This reproduces the approach in Seurat (Satija et al, 2015) and has been implemented
for Scanpy by Davide Cittaro.

More details on the `scanpy documentation
<https://scanpy.readthedocs.io/en/stable/api/scanpy.tl.score_genes.html>`__

Score cell cycle genes, using `tl.score_genes_cell_cycle`
=========================================================

Given two lists of genes associated to S phase and G2M phase, calculates
scores and assigns a cell cycle phase (G1, S or G2M). See
`score_genes` for more explanation.

More details on the `scanpy documentation
<https://scanpy.readthedocs.io/en/stable/api/scanpy.tl.score_genes_cell_cycle.html>`__

Rank genes for characterizing groups, using `tl.rank_genes_groups`
==================================================================

The returned AnnData object contains:

- Gene names, ordered according to scores
- Z-score underlying the computation of a p-value for each gene for each group, ordered according to scores
- Log2 fold change for each gene for each group, ordered according to scores. It is only provided if method is ‘t-test’ like. This is an approximation calculated from mean-log values.
- P-values
- Adjusted p-values

These data are stored in the unstructured annotation (uns) and can be accessed using the inspect tool for AnnData objects

More details on the `scanpy documentation
<https://scanpy.readthedocs.io/en/stable/api/scanpy.tl.rank_genes_groups.html>`__


Calculate an overlap score between data-derived marker genes and provided markers (`tl.marker_gene_overlap`)
============================================================================================================

Marker gene overlap scores can be quoted as overlap counts, overlap coefficients, or jaccard indices. The method returns a pandas dataframe which can be used to annotate clusters based on marker gene overlaps.

More details on the `scanpy documentation
<https://scanpy.readthedocs.io/en/stable/generated/scanpy.tl.marker_gene_overlap.html>`__

Logarithmize the data matrix (`pp.log1p`)
=========================================

More details on the `scanpy documentation
<https://scanpy.readthedocs.io/en/stable/api/scanpy.pp.log1p.html>`__

Scale data to unit variance and zero mean (`pp.scale`)
======================================================

More details on the `scanpy documentation
<https://scanpy.readthedocs.io/en/stable/api/scanpy.pp.scale.html>`__

Compute the square root of the data matrix (`pp.sqrt`)
======================================================

`X = sqrt(X)`
    ]]></help>
    <expand macro="citations"/>
</tool>
author	iuc
date	Sat, 14 Sep 2024 12:45:03 +0000
parents	ca086f24422f
children