view celltypist.xml @ 1:7518638a7b75 draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/main/tools/celltypist commit b7038e7d88c5b3661a6539b25b6faf95f3c7112a
author iuc
date Tue, 10 Mar 2026 21:40:01 +0000
parents 8722e08a96f4
children
line wrap: on
line source

<tool id="celltypist" name="CellTypist" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
    <description>Automated cell type annotation for scRNA-seq datasets</description>
    <!-- Tokens substituted into the tool's version and profile attributes and into the package requirement below. -->
    <macros>
        <token name="@TOOL_VERSION@">1.7.1</token>
        <token name="@VERSION_SUFFIX@">0</token>
        <token name="@PROFILE@">25.0</token>
    </macros>
    <!-- Cross-reference to the bio.tools registry entry for this tool. -->
    <xrefs>
        <xref type="bio.tools">celltypist</xref>
    </xrefs>
    <!-- Only the celltypist package is declared, pinned to @TOOL_VERSION@.
         NOTE(review): the generated script also imports scanpy; presumably it is installed as a dependency of the celltypist package. Confirm. -->
    <requirements>
        <requirement type="package" version="@TOOL_VERSION@">celltypist</requirement>
    </requirements>
    <!-- Attribution for the wrapper. -->
    <creator>
        <organization name="European Galaxy Team" url="https://galaxyproject.org/eu/people/"/>
    </creator>
    <!-- Prints the rendered script_file configfile to standard output, then runs it with python (the second step only runs if the first succeeds). -->
    <command><![CDATA[
cat '$script_file' &&
python '$script_file'
    ]]>
    </command>
    <!-- Cheetah template that is rendered into the Python script executed by the command block.
         Steps: load the query AnnData, obtain a model (cached, from history, or trained on the fly),
         annotate, write the annotated AnnData, and optionally save a dotplot. -->
    <configfiles>
        <configfile name="script_file"><![CDATA[
## Lines starting with a double hash are template comments and are not written to the script.
#if $dotplot.generate == "yes"
    #if $dotplot.prediction == "majority_voting" and not $majority_voting
## Fail before any data is loaded or a model is trained: without majority voting
## the annotation result carries no 'majority_voting' labels for the dotplot.
raise SystemExit("The dotplot prediction label 'majority_voting' requires majority voting to be enabled; enable it or choose 'predicted_labels' instead.")
    #end if
#end if
import scanpy as sc
import celltypist
from celltypist import models

## Query data to annotate.
adata = sc.read_h5ad('$adata')

## Model selection: cached data-table entry, model file from the history, or a model trained here.
#if $model_source.source == "cached"
model = models.Model.load(model='$model_source.cached_model.fields.path')
#else if $model_source.source == "history"
    #if $model_source.history_model_source.history_model_select == "select_model"
model = models.Model.load(model='$model_source.history_model_source.history_model')
    #else if $model_source.history_model_source.history_model_select == "train_model"
train_adata = sc.read_h5ad('$model_source.history_model_source.train_anndata')
model = celltypist.train(
    X=train_adata,
    labels='$model_source.history_model_source.labels',
    batch_number=$model_source.history_model_source.batch_number,
    batch_size=$model_source.history_model_source.batch_size,
    epochs=$model_source.history_model_source.epochs,
    feature_selection=$model_source.history_model_source.feature_selection,
    top_genes=$model_source.history_model_source.top_genes,
)
    #end if
#end if

## The two boolean keyword arguments are only emitted when the option is switched on.
predictions = celltypist.annotate(
    adata,
    model=model,
#if $majority_voting
    majority_voting=True,
#end if
#if $transpose_input
    transpose_input=$transpose_input,
#end if
    mode='$mode',
    p_thres=$p_thres,
    min_prop=$min_prop,
)
adata = predictions.to_adata()
adata.write_h5ad('$anndata_out', compression='gzip')

#if $dotplot.generate == "yes"
## 'save' is given only a file extension; the tool outputs pick the figure up from figures/*.<format>.
celltypist.dotplot(predictions, use_as_reference='$dotplot.reference', use_as_prediction='$dotplot.prediction', save='.$dotplot.format', show=None)
#end if
]]>
        </configfile>
    </configfiles>
    <!-- Tool form: query AnnData, model source (cached / history / train), annotation options, optional dotplot. -->
    <inputs>
        <param name="adata" type="data" format="h5ad" label="Input AnnData file" />
        <conditional name="model_source">
            <param type="select" label="Select model from" name="source">
                <option value="cached" selected="true">Cached</option>
                <option value="history">History</option>
            </param>
            <when value="cached">
                <!-- Options come from the celltypist_models data table; the script reads the selected entry's path field. -->
                <param type="select" name="cached_model" label="Choose CellTypist model">
                    <options from_data_table="celltypist_models">
                    </options>
                </param>
            </when>
            <when value="history">
                <conditional name="history_model_source">
                    <param type="select" label="Select a model or train a model from history" name="history_model_select">
                        <option value="select_model" selected="true">Select a compatible model</option>
                        <option value="train_model">Train a model on an existing AnnData and use it</option>
                    </param>
                    <when value="select_model">
                        <param type="data" format="binary" name="history_model" label="Select a compatible model from history" />
                    </when>
                    <when value="train_model">
                        <param type="data" format="h5ad" name="train_anndata" label="Select an AnnData file from history." />
                        <!-- The value is placed inside a quoted Python string in the script, hence the strict character set.
                             The regex is anchored so that invalid names are rejected instead of being silently altered by the sanitizer. -->
                        <param type="text" name="labels" optional="false" label="The column name in the .obs attribute of the training AnnData file that contains the cell type labels." >
                            <sanitizer invalid_char="">
                                <valid initial="string.letters,string.digits">
                                    <add value="_" />
                                </valid>
                            </sanitizer>
                            <validator type="regex" message="Only letters, digits and underscores are allowed">^[0-9a-zA-Z_]+$</validator>
                        </param>
                        <!-- NOTE(review): the script passes batch_number, batch_size and epochs to celltypist.train but never sets
                             use_SGD or mini_batch, so going by the help texts below these three values may have no effect. Confirm. -->
                        <param type="integer" name="batch_number" min="0" value="100" label="Batch number per epoch" help="The number of batches used for training in each epoch; only relevant when mini-batch SGD training is used (use_SGD = True and mini_batch = True)." />
                        <param type="integer" name="batch_size" min="1" value="1000" label="Cells per batch" help="The number of cells within each batch; only relevant when mini-batch SGD training is used (use_SGD = True and mini_batch = True)." />
                        <param type="integer" name="epochs" min="1" value="10" label="Epochs for mini-batch training" help="The number of epochs for the mini-batch training procedure; only relevant when mini-batch SGD training is used (use_SGD = True and mini_batch = True)." />
                        <param type="boolean" name="feature_selection" checked="false" truevalue="True" falsevalue="False" label="Enable two-pass feature selection" help="If true, performs two-pass training where the first round selects important genes using SGD learning; increases training time." />
                        <param type="integer" name="top_genes" min="1" value="300" label="Top genes per class" help="Number of top genes per class/cell-type based on absolute regression coefficients; the final feature set is the union across classes." />
                    </when>
                </conditional>
            </when>
        </conditional>
        <!-- truevalue/falsevalue are Python literals because the values are written straight into the script. -->
        <param name="majority_voting" type="boolean" checked="false" truevalue="True" falsevalue="False" label="Refine the predicted labels by running the majority voting classifier after over-clustering" />
        <param name="transpose_input" type="boolean" checked="false" truevalue="True" falsevalue="False" label="Transpose the input matrix if it is provided in the gene-by-cell format" help="Note: CellTypist requires the cell-by-gene format."/>
        <param name="mode" type="select" label="Annotation mode">
            <option value="best match">Choose the cell type with the largest score/probability as the final prediction</option>
            <option value="prob match">Enable a multi-label classification utilising a probability threshold</option>
        </param>
        <param name="p_thres" type="float" value="0.5" min="0" max="1" label="Probability threshold for the multi-label classification" help="Ignored if mode is best match." />
        <param name="min_prop" type="float" value="0" min="0" max="1" label="The minimum proportion of cells required to support naming of the subcluster by this cell type" help="Ignored if majority_voting is set to False"/>
        <conditional name="dotplot">
            <param name="generate" type="select" label="Generate a dotplot of the predicted cell types" >
                <option value="no" selected="true">No</option>
                <option value="yes">Yes</option>
            </param>
            <when value="no"/>
            <when value="yes">
                <!-- Same character restrictions as the training label column: the value ends up in a quoted Python string. -->
                <param name="reference" type="text" label="Reference column in AnnData.obs for dotplot" value="cell_type" help="The value can also be a clustering column, e.g. 'leiden'.">
                    <sanitizer invalid_char="">
                        <valid initial="string.letters,string.digits">
                            <add value="_" />
                        </valid>
                    </sanitizer>
                    <validator type="regex" message="Only letters, digits and underscores are allowed">^[0-9a-zA-Z_]+$</validator>
                </param>
                <param name="prediction" type="select" label="Prediction label in AnnData.obs for dotplot" help="'majority_voting' is only available when the majority voting option above is enabled; otherwise choose 'predicted_labels'.">
                    <option value="majority_voting" selected="true">majority_voting</option>
                    <option value="predicted_labels">predicted_labels</option>
                </param>
                <!-- The selected value doubles as the file extension of the saved figure and selects which output is created. -->
                <param name="format" type="select" label="Dotplot format">
                    <option value="png" selected="true">png</option>
                    <option value="pdf">pdf</option>
                    <option value="svg">svg</option>
                </param>
            </when>
        </conditional>
    </inputs>
    <outputs>
        <!-- Annotated AnnData written by the script; always produced. -->
        <data name="anndata_out" format="h5ad" label="${tool.name} on ${on_string}: AnnData with celltype annotations" />
        <!-- Dotplot outputs: exactly one is created, matching the selected format, and only when a dotplot was requested.
             The figure is collected from the figures/ directory in the job working directory. -->
        <data name="out_png" format="png" from_work_dir="figures/*.png" label="${tool.name} on ${on_string}: Dotplot PNG">
            <filter>dotplot['generate'] == 'yes' and dotplot['format'] == 'png'</filter>
        </data>
        <data name="out_pdf" format="pdf" from_work_dir="figures/*.pdf" label="${tool.name} on ${on_string}: Dotplot PDF">
            <filter>dotplot['generate'] == 'yes' and dotplot['format'] == 'pdf'</filter>
        </data>
        <data name="out_svg" format="svg" from_work_dir="figures/*.svg" label="${tool.name} on ${on_string}: Dotplot SVG">
            <filter>dotplot['generate'] == 'yes' and dotplot['format'] == 'svg'</filter>
        </data>
    </outputs>
    <tests>
        <!-- Test 1: cached model, majority voting on, best-match mode, PNG dotplot of the majority_voting labels. -->
        <test expect_num_outputs="2">
            <param name="adata" location="https://celltypist.cog.sanger.ac.uk/Notebook_demo_data/demo_500_cells.h5ad"/>
            <conditional name="model_source">
                <param name="source" value="cached" />
                <param name="cached_model" value="Immune_All_High_v1" />
            </conditional>
            <param name="majority_voting" value="True" />
            <param name="mode" value="best match" />
            <param name="p_thres" value="0.5" />
            <param name="min_prop" value="0.05" />
            <conditional name="dotplot">
                <param name="generate" value="yes"/>
                <param name="reference" value="cell_type"/>
                <param name="prediction" value="majority_voting"/>
                <param name="format" value="png"/>
            </conditional>
            <output name="anndata_out" ftype="h5ad">
                <assert_contents>
                    <has_h5_keys keys="obs/predicted_labels"/>
                    <has_h5_keys keys="obs/over_clustering"/>
                    <has_h5_keys keys="obs/majority_voting"/>
                    <has_h5_keys keys="obs/conf_score"/>
                </assert_contents>
            </output>
            <output name="out_png" ftype="png" value="majority_voting.png"/>
        </test>
        <!-- Test 2: model file supplied from the history, majority voting off, prob-match mode, no dotplot. -->
        <test expect_num_outputs="1">
            <param name="adata" location="https://celltypist.cog.sanger.ac.uk/Notebook_demo_data/demo_500_cells.h5ad"/>
            <conditional name="model_source">
                <param name="source" value="history" />
                    <conditional name="history_model_source">
                        <param name="history_model_select" value="select_model"/>
                        <param name="history_model" location="https://celltypist.cog.sanger.ac.uk/models/Pan_Immune_CellTypist/v2/Immune_All_Low.pkl" />
                    </conditional>
            </conditional>
            <param name="majority_voting" value="False" />
            <param name="mode" value="prob match" />
            <param name="p_thres" value="0.5" />
            <param name="min_prop" value="0.05" />
            <output name="anndata_out" ftype="h5ad">
                <assert_contents>
                    <has_h5_keys keys="obs/predicted_labels"/>
                    <has_h5_keys keys="obs/conf_score"/>
                </assert_contents>
            </output>
        </test>
        <!-- Test 3: model trained on the demo AnnData (labels from cell_type, top_genes 100), majority voting off, PDF dotplot of predicted_labels. -->
        <test expect_num_outputs="2">
            <param name="adata" location="https://celltypist.cog.sanger.ac.uk/Notebook_demo_data/demo_500_cells.h5ad"/>
            <conditional name="model_source">
                <param name="source" value="history" />
                <conditional name="history_model_source">
                    <param name="history_model_select" value="train_model"/>
                    <param name="train_anndata" location="https://celltypist.cog.sanger.ac.uk/Notebook_demo_data/demo_500_cells.h5ad" />
                    <param name="labels" value="cell_type" />
                    <param name="top_genes" value="100"/>
                </conditional>
            </conditional>
            <param name="majority_voting" value="False" />
            <param name="mode" value="prob match" />
            <param name="p_thres" value="0.5" />
            <param name="min_prop" value="0.05" />
            <conditional name="dotplot">
                <param name="generate" value="yes" />
                <param name="reference" value="cell_type"/>
                <param name="prediction" value="predicted_labels"/>
                <param name="format" value="pdf"/>
            </conditional>
            <output name="anndata_out" ftype="h5ad">
                <assert_contents>
                    <has_h5_keys keys="obs/predicted_labels"/>
                    <has_h5_keys keys="obs/conf_score"/>
                </assert_contents>
            </output>
            <output name="out_pdf" ftype="pdf" value="predicted_labels.pdf"/>
        </test>
    </tests>
    <help><![CDATA[
**What it does**

CellTypist_ is an automated cell type annotation tool for scRNA-seq datasets on the basis of logistic regression classifiers optimised by the stochastic gradient descent algorithm. CellTypist allows for cell prediction using either built-in (with a current focus on immune sub-populations) or custom models, in order to assist in the accurate classification of different cell types and subtypes.


.. _CellTypist: https://www.celltypist.org/

------

**Inputs**

An anndata file in h5ad format that usually contains clustering results from single-cell RNA-seq analysis.

------

**Outputs**

An anndata file in h5ad format with predicted cell type annotations added to the .obs attribute. For example::

             cell_type     predicted_labels  over_clustering  majority_voting     conf_score
    cell1    Plasma cells  Plasma cells      13               Follicular B cells  0.996313
    cell2    Plasma cells  Plasma cells      6                Plasma cells        0.999478
    cell3    Plasma cells  Plasma cells      12               Plasma cells        0.999957
    cell4    Plasma cells  Plasma cells      6                Plasma cells        0.996070
    cell5    Plasma cells  Plasma cells      6                Plasma cells        0.998888
    ...      ...           ...               ...              ...                 ...
    cell496  Macro_pDC     pDC               9                Macrophages         0.187152
    cell497  Macro_pDC     Macrophages       18               pDC                 0.849831
    cell498  Macro_pDC     Macrophages       9                Macrophages         0.809677
    cell499  Macro_pDC     Macrophages       9                Macrophages         0.937306
    cell500  Macro_pDC     pDC               9                Macrophages         0.612069

    ]]>    </help>
    <citations>
        <citation type="doi">10.1126/science.abl5197</citation>
    </citations>
</tool>