view scanpy-find-variable-genes.xml @ 21:79da59a0b180 draft

"planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/tertiary-analysis/scanpy commit 69b3fd29988bdcbf30d38572b75793ab74110acd-dirty"
author ebi-gxa
date Fri, 06 Aug 2021 15:22:46 +0000
parents f952b39f0794
children 82bd7bf93454
line wrap: on
line source

<?xml version="1.0" encoding="utf-8"?>
<tool id="scanpy_find_variable_genes" name="Scanpy FindVariableGenes" version="@TOOL_VERSION@+galaxy1" profile="@PROFILE@">
  <description>based on normalised dispersion of expression</description>
  <!-- Shared definitions for this tool. The macros expanded in this file
       (requirements, input/output object params, citations) and the
       placeholder tokens (TOOL_VERSION, PROFILE, HELP, ...) are not defined
       here, so they presumably come from this import; confirm in
       scanpy_macros2.xml. -->
  <macros>
    <import>scanpy_macros2.xml</import>
  </macros>
  <expand macro="requirements"/>
  <command detect_errors="exit_code"><![CDATA[
## Link the input dataset to a fixed local name (presumably the path that the
## input-options macro token hands to the script; confirm in the macros file).
ln -s '${input_obj_file}' input.h5 &&
PYTHONIOENCODING=utf-8 scanpy-find-variable-genes
    --flavor '${flavor}'
## Each limits flag takes a (min, max) pair, so it is only emitted when both
## ends are set. The values are compared against the empty string instead of
## being tested for truthiness: 0 is a valid lower bound and must not cause
## the flag to be dropped.
#if str($min_mean) != '' and str($max_mean) != ''
    --mean-limits ${min_mean} ${max_mean}
#end if
#if str($min_disp) != '' and str($max_disp) != ''
    --disp-limits ${min_disp} ${max_disp}
#end if
#if $n_top_gene
    --n-top-genes ${n_top_gene}
#end if
#if $span
    --span ${span}
#end if
    --n-bins '${n_bin}'
    ${filter}
## Free-text value: quoted so a key containing whitespace stays one argument.
#if $batch_key
    --batch-key '${batch_key}'
#end if
    @INPUT_OPTS@
    @OUTPUT_OPTS@
]]></command>

  <inputs>
    <!-- Input/output object parameters are supplied by the imported macros. -->
    <expand macro="input_object_params"/>
    <expand macro="output_object_params"/>
    <!-- Single-choice select: exactly one option may be preselected. -->
    <param name="flavor" argument="--flavor" type="select" label="Flavor of computing normalised dispersion">
      <option value="seurat" selected="true">Seurat</option>
      <option value="cell_ranger">Cell-ranger</option>
      <option value="seurat_v3">Seurat V3</option>
    </param>
    <!-- min_mean/max_mean and min_disp/max_disp are passed to the script as
         pairs (mean limits and dispersion limits respectively). -->
    <param name="min_mean" argument="--min-mean" type="float" min="0" value="0.0125" optional="true"
           label="Min value for normalised mean expression (in log1p scale), ignored if flavor='seurat_v3'"/>
    <param name="max_mean" argument="--max-mean" type="float" min="0" value="3" optional="true"
           label="Max value for normalised mean expression (in log1p scale), ignored if flavor='seurat_v3'"/>
    <param name="min_disp" argument="--min-disp" type="float" min="0" value="0.5" optional="true"
           label="Min value for dispersion of expression, ignored if flavor='seurat_v3'"/>
    <param name="max_disp" argument="--max-disp" type="float" min="0" value="50" optional="true"
           label="Max value for dispersion of expression, ignored if flavor='seurat_v3'"/>
    <param name="n_top_gene" argument="--n-top-genes" type="integer" value="2000" optional="true"
           label="Number of top variable genes to keep, mandatory if flavor='seurat_v3'"/>
    <param name="span" argument="--span" type="float" min="0" max="1" value="0.3" optional="true"
           label="The fraction of the data (cells) used when estimating the variance in the loess model fit if flavor='seurat_v3'"/>
    <param name="n_bin" argument="--n-bins" type="integer" value="20" label="Number of bins for binning the mean expression"/>
    <!-- Boolean rendered directly into the command line: the flag itself when
         checked, nothing otherwise. -->
    <param name="filter" argument="--subset" type="boolean" truevalue="--subset" falsevalue="" checked="false"
           label="Remove genes not marked as highly variable" help="When set, inplace subset to highly-variable genes, otherwise only flag highly-variable genes."/>
    <param name="batch_key" argument="--batch-key" type="text" label="Batch key" optional="true" help="If specified, highly-variable genes are selected within each batch separately and merged. This simple process avoids the selection of batch-specific genes and acts as a lightweight batch correction method. For all flavors, genes are first sorted by how many batches they are a HVG. For dispersion-based flavors ties are broken by normalized dispersion. If flavor = 'seurat_v3', ties are broken by the median (across batches) rank based on within-batch normalized variance."/>
  </inputs>

  <!-- The output dataset is declared by the output_data_obj macro from the
       imported macros file; the test in this file refers to it as
       "output_h5" (name presumably set inside the macro; confirm there). -->
  <outputs>
    <expand macro="output_data_obj" description="Variable genes"/>
  </outputs>

  <tests>
    <!-- Seurat flavor with explicit mean/dispersion limits; the result is
         compared to the reference file by size only (sim_size). -->
    <test>
      <!-- Input object and file formats -->
      <param name="input_obj_file" value="normalise_data.h5"/>
      <param name="input_format" value="anndata"/>
      <param name="output_format" value="anndata"/>
      <!-- Method settings -->
      <param name="flavor" value="seurat"/>
      <param name="n_bin" value="20"/>
      <!-- Mean-expression limits -->
      <param name="min_mean" value="0.0125"/>
      <param name="max_mean" value="3"/>
      <!-- Dispersion limits -->
      <param name="min_disp" value="0.5"/>
      <param name="max_disp" value="1e9"/>
      <output name="output_h5"
              file="find_variable_genes.h5"
              ftype="h5"
              compare="sim_size"/>
    </test>
  </tests>

  <help><![CDATA[
==============================================================
Mark highly variable genes (`scanpy.pp.highly_variable_genes`)
==============================================================

Depending on `flavor`, this reproduces the R-implementations of Seurat or Cell Ranger.

The normalized dispersion is obtained by scaling with the mean and standard
deviation of the dispersions for genes falling into a given bin for mean
expression of genes. This means that for each bin of mean expression, highly
variable genes are selected.

@HELP@

@VERSION_HISTORY@
]]></help>
  <!-- Method papers for the selectable flavors, in order: Seurat
       (Satija et al. 2015), Cell Ranger (Zheng et al. 2017) and Seurat v3
       (Stuart et al. 2019). They are added to the citations provided by the
       shared macro. -->
  <expand macro="citations">
    <citation type="doi">10.1038/nbt.3192</citation>
    <citation type="doi">10.1038/ncomms14049</citation>
    <citation type="doi">10.1016/j.cell.2019.05.031</citation>
  </expand>
</tool>