Mercurial > repos > iuc > scanpy_filter
diff filter.xml @ 0:6ea5a05a260a draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/scanpy/ commit 92f85afaed0097d1879317a9f513093fce5481d6
author | iuc |
---|---|
date | Mon, 04 Mar 2019 10:15:02 -0500 |
parents | |
children | 6a76b60e05f5 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filter.xml Mon Mar 04 10:15:02 2019 -0500 @@ -0,0 +1,549 @@ +<tool id="scanpy_filter" name="Filter with scanpy" version="@galaxy_version@"> + <description></description> + <macros> + <import>macros.xml</import> + </macros> + <expand macro="requirements"/> + <expand macro="version_command"/> + <command detect_errors="exit_code"><![CDATA[ +@CMD@ + ]]></command> + <configfiles> + <configfile name="script_file"><![CDATA[ +@CMD_imports@ +@CMD_read_inputs@ + +#if $method.method == 'pp.filter_cells' +res = sc.pp.filter_cells( + #if $modify_anndata.modify_anndata == 'true' + adata, + #else + adata.X, + #end if + #if $method.filter.filter == 'min_counts' + min_counts=$method.filter.min_counts, + #elif $method.filter.filter == 'max_counts' + max_counts=$method.filter.max_counts, + #elif $method.filter.filter == 'min_genes' + min_genes=$method.filter.min_genes, + #elif $method.filter.filter == 'max_genes' + max_genes=$method.filter.max_genes, + #end if + copy=False) + + #if $modify_anndata.modify_anndata == 'true' +df = adata.obs + #else +df = pd.DataFrame(data=dict(cell_subset=res[0], number_per_cell=res[1])) + #end if + + #if $method.filter.filter == 'min_counts' or $method.filter.filter == 'max_counts' +df.to_csv('$counts_per_cell', sep='\t') + #elif $method.filter.filter == 'min_genes' or $method.filter.filter == 'max_genes' +df.to_csv('$genes_per_cell', sep='\t') + #end if + +#elif $method.method == 'pp.filter_genes' +res = sc.pp.filter_genes( + #if $modify_anndata.modify_anndata == 'true' + adata, + #else + adata.X, + #end if + #if $method.filter.filter == 'min_counts' + min_counts=$method.filter.min_counts, + #elif $method.filter.filter == 'max_counts' + max_counts=$method.filter.max_counts, + #elif $method.filter.filter == 'min_cells' + min_cells=$method.filter.min_cells, + #elif $method.filter.filter == 'max_cells' + max_cells=$method.filter.max_cells, + #end if + copy=False) + + #if $modify_anndata.modify_anndata == 'true' +df = adata.var + #else +df = pd.DataFrame(data=dict(gene_subset=res[0], number_per_gene=res[1])) + #end if + + #if $method.filter.filter == 'min_counts' or $method.filter.filter == 'max_counts' +df.to_csv('$counts_per_gene', sep='\t') + #elif $method.filter.filter == 'min_cells' or $method.filter.filter == 'max_cells' +df.to_csv('$cells_per_gene', sep='\t') + #end if + +#elif $method.method == 'pp.filter_genes_dispersion' +res = sc.pp.filter_genes_dispersion( + #if $modify_anndata.modify_anndata == 'true' + adata, + #else + adata.X, + #end if + flavor='$method.flavor.flavor', + #if $method.flavor.flavor=='seurat' + min_mean=$method.flavor.min_mean, + max_mean=$method.flavor.max_mean, + min_disp=$method.flavor.min_disp, + #if $method.flavor.max_disp + max_disp=$method.flavor.max_disp, + #end if + #else + n_top_genes=$method.flavor.n_top_genes, + #end if + n_bins=$method.n_bins, + log=$method.log, + copy=False) + + #if $modify_anndata.modify_anndata == 'true' +adata.var.to_csv('$per_gene', sep='\t') + #else +pd.DataFrame(res).to_csv('$per_gene', sep='\t') + #end if + +#elif $method.method == 'pp.subsample' +sc.pp.subsample( + data=adata, + #if $method.type.type == 'fraction' + fraction=$method.type.fraction, + #else if $method.type.type == 'n_obs' + n_obs=$method.type.n_obs, + #end if + random_state=$method.random_state, + copy=False) + +#end if + +@CMD_anndata_write_modify_outputs@ +]]></configfile> + </configfiles> + <inputs> + <expand macro="inputs_anndata"/> + <conditional name="method"> + <param argument="method" type="select" label="Method used for filtering"> + <option value="pp.filter_cells">Filter cell outliers based on counts and numbers of genes expressed, using `pp.filter_cells`</option> + <option value="pp.filter_genes">Filter genes based on number of cells or counts, using `pp.filter_genes`</option> + <option value="pp.filter_genes_dispersion">Extract highly variable genes, using `pp.filter_genes_dispersion`</option> + <!--<option value="pp.highly_variable_genes">, using `tl.highly_variable_genes`</option>!--> + <option value="pp.subsample">Subsample to a fraction of the number of observations, using `pp.subsample`</option> + <!--<option value="queries.gene_coordinates">, using `queries.gene_coordinates`</option>!--> + <!--<option value="queries.mitochondrial_genes">, using `queries.mitochondrial_genes`</option>!--> + </param> + <when value="pp.filter_cells"> + <conditional name="filter"> + <param argument="filter" type="select" label="Filter"> + <option value="min_counts">Minimum number of counts</option> + <option value="max_counts">Maximum number of counts</option> + <option value="min_genes">Minimum number of genes expressed</option> + <option value="max_genes">Maximum number of genes expressed</option> + </param> + <when value="min_counts"> + <param argument="min_counts" type="integer" min="0" value="" label="Minimum number of counts required for a cell to pass filtering" help=""/> + </when> + <when value="max_counts"> + <param argument="max_counts" type="integer" min="0" value="" label="Maximum number of counts required for a cell to pass filtering" help=""/> + </when> + <when value="min_genes"> + <param argument="min_genes" type="integer" min="0" value="" label="Minimum number of genes expressed required for a cell to pass filtering" help=""/> + </when> + <when value="max_genes"> + <param argument="max_genes" type="integer" min="0" value="" label="Maximum number of genes expressed required for a cell to pass filtering" help=""/> + </when> + </conditional> + </when> + <when value="pp.filter_genes"> + <conditional name="filter"> + <param argument="filter" type="select" label="Filter"> + <option value="min_counts">Minimum number of counts</option> + <option value="max_counts">Maximum number of counts</option> + <option value="min_cells">Minimum number of cells expressed</option> + <option value="max_cells">Maximum number of cells expressed</option> + </param> + <when value="min_counts"> + <param argument="min_counts" type="integer" min="0" value="" label="Minimum number of counts required for a gene to pass filtering" help=""/> + </when> + <when value="max_counts"> + <param argument="max_counts" type="integer" min="0" value="" label="Maximum number of counts required for a gene to pass filtering" help=""/> + </when> + <when value="min_cells"> + <param argument="min_cells" type="integer" min="0" value="" label="Minimum number of cells expressed required for a gene to pass filtering" help=""/> + </when> + <when value="max_cells"> + <param argument="max_cells" type="integer" min="0" value="" label="Maximum number of cells expressed required for a gene to pass filtering" help=""/> + </when> + </conditional> + </when> + <when value="pp.filter_genes_dispersion"> + <conditional name='flavor'> + <param argument="flavor" type="select" label="Flavor for computing normalized dispersion" help=""> + <option value="seurat">seurat: expects non-logarithmized data</option> + <option value="cell_ranger">cell_ranger: usually called for logarithmized data</option> + </param> + <when value="seurat"> + <param argument="min_mean" type="float" value="0.0125" label="Minimal mean cutoff" help=""/> + <param argument="max_mean" type="float" value="3" label="Maximal mean cutoff" help=""/> + <param argument="min_disp" type="float" value="0.5" label="Minimal normalized dispersion cutoff" help=""/> + <param argument="max_disp" type="float" value="" optional="true" label="Maximal normalized dispersion cutoff" help=""/> + </when> + <when value="cell_ranger"> + <param argument="n_top_genes" type="integer" value="" label="Number of highly-variable genes to keep" help=""/> + </when> + </conditional> + <param argument="n_bins" type="integer" value="20" label="Number of bins for binning the mean gene expression" help="Normalization is done with respect to each bin. If just a single gene falls into a bin, the normalized dispersion is artificially set to 1"/> + <expand macro="param_log"/> + </when> + <when value="pp.subsample"> + <conditional name="type"> + <param name="type" type="select" label="Type of subsampling"> + <option value="fraction">By fraction</option> + <option value="n_obs">By number of observation</option> + </param> + <when value="fraction"> + <param argument="fraction" type="float" value="" label="Subsample to this `fraction` of the number of observations" help=""/> + </when> + <when value="n_obs"> + <param argument="n_obs" type="integer" min="0" value="" label="Subsample to this number of observations" help=""/> + </when> + </conditional> + <param argument="random_state" type="integer" value="0" label="Random seed to change subsampling" help=""/> + </when> + </conditional> + <expand macro="anndata_modify_output_input"/> + </inputs> + <outputs> + <expand macro="anndata_modify_outputs"/> + <!-- for pp.filter_cells --> + <data name="counts_per_cell" format="tabular" label="${tool.name} on ${on_string}: Counts per cells after filtering"> + <filter>method['method'] == 'pp.filter_cells' and (method['filter']['filter'] == 'min_counts' or method['filter']['filter'] == 'max_counts')</filter> + </data> + <data name="genes_per_cell" format="tabular" label="${tool.name} on ${on_string}: Number of genes per cell after filtering"> + <filter>method['method'] == 'pp.filter_cells' and (method['filter']['filter'] == 'min_genes' or method['filter']['filter'] == 'max_genes')</filter> + </data> + <!-- for pp.filter_genes --> + <data name="counts_per_gene" format="tabular" label="${tool.name} on ${on_string}: Counts per genes after filtering"> + <filter>method['method'] == 'pp.filter_genes' and (method['filter']['filter'] == 'min_counts' or method['filter']['filter'] == 'max_counts')</filter> + </data> + <data name="cells_per_gene" format="tabular" label="${tool.name} on ${on_string}: Number of cells per genes after filtering"> + <filter>method['method'] == 'pp.filter_genes' and (method['filter']['filter'] == 'min_cells' or method['filter']['filter'] == 'max_cells')</filter> + </data> + <!-- for pp.filter_genes_dispersion --> + <data name="per_gene" format="tabular" label="${tool.name} on ${on_string}: Means, dispersions and normalized dispersions per gene"> + <filter>method['method'] == 'pp.filter_genes_dispersion'</filter> + </data> + </outputs> + <tests> + <test expect_num_outputs="2"> + <conditional name="input"> + <param name="format" value="h5ad" /> + <param name="adata" value="krumsiek11.h5ad" /> + </conditional> + <conditional name="method"> + <param name="method" value="pp.filter_cells"/> + <conditional name="filter"> + <param name="filter" value="min_counts"/> + <param name="min_counts" value="3"/> + </conditional> + </conditional> + <conditional name="modify_anndata"> + <param name="modify_anndata" value="true"/> + <param name="anndata_output_format" value="h5ad" /> + </conditional> + <assert_stdout> + <has_text_matching expression="sc.pp.filter_cells"/> + <has_text_matching expression="min_counts=3"/> + </assert_stdout> + <output name="anndata_out_h5ad" file="pp.filter_cells.krumsiek11-min_counts.h5ad" ftype="h5" compare="sim_size"/> + <output name="counts_per_cell"> + <assert_contents> + <has_text_matching expression="cell_type\tn_counts" /> + <has_text_matching expression="46\tprogenitor\t3.028" /> + <has_text_matching expression="85\tEry\t3.7001" /> + <has_text_matching expression="150\tMk\t4.095" /> + <has_n_columns n="3" /> + </assert_contents> + </output> + </test> + <test expect_num_outputs="2"> + <conditional name="input"> + <param name="format" value="loom" /> + <param name="adata" value="krumsiek11.loom" /> + <param name="sparse" value="True"/> + <param name="cleanup" value="False"/> + <param name="x_name" value="spliced"/> + <param name="obs_names" value="CellID" /> + <param name="var_names" value="Gene"/> + </conditional> + <conditional name="method"> + <param name="method" value="pp.filter_cells"/> + <conditional name="filter"> + <param name="filter" value="min_counts"/> + <param name="min_counts" value="3"/> + </conditional> + </conditional> + <conditional name="modify_anndata"> + <param name="modify_anndata" value="true"/> + <param name="anndata_output_format" value="loom" /> + </conditional> + <assert_stdout> + <has_text_matching expression="sc.pp.filter_cells"/> + <has_text_matching expression="min_counts=3"/> + </assert_stdout> + <output name="anndata_out_loom" file="pp.filter_cells.krumsiek11-min_counts.loom" ftype="loom" compare="sim_size"/> + <output name="counts_per_cell"> + <assert_contents> + <has_text_matching expression="cell_type\tn_counts" /> + <has_text_matching expression="46\tprogenitor\t3.028" /> + <has_text_matching expression="85\tEry\t3.7001" /> + <has_text_matching expression="97\tMo\t3.925" /> + <has_text_matching expression="150\tMk\t4.095" /> + <has_n_columns n="3" /> + </assert_contents> + </output> + </test> + <test expect_num_outputs="1"> + <conditional name="input"> + <param name="format" value="h5ad" /> + <param name="adata" value="krumsiek11.h5ad"/> + </conditional> + <conditional name="method"> + <param name="method" value="pp.filter_cells"/> + <conditional name="filter"> + <param name="filter" value="max_genes"/> + <param name="max_genes" value="100"/> + </conditional> + </conditional> + <conditional name="modify_anndata"> + <param name="modify_anndata" value="false"/> + </conditional> + <assert_stdout> + <has_text_matching expression="sc.pp.filter_cells"/> + <has_text_matching expression="adata.X"/> + <has_text_matching expression="max_genes=100"/> + </assert_stdout> + <output name="genes_per_cell" file="pp.filter_cells.number_per_cell.krumsiek11-max_genes.tabular"/> + </test> + <test expect_num_outputs="2"> + <conditional name="input"> + <param name="format" value="h5ad" /> + <param name="adata" value="krumsiek11.h5ad" /> + </conditional> + <conditional name="method"> + <param name="method" value="pp.filter_genes"/> + <conditional name="filter"> + <param name="filter" value="min_counts"/> + <param name="min_counts" value="3"/> + </conditional> + </conditional> + <conditional name="modify_anndata"> + <param name="modify_anndata" value="true"/> + <param name="anndata_output_format" value="h5ad" /> + </conditional> + <assert_stdout> + <has_text_matching expression="sc.pp.filter_genes"/> + <has_text_matching expression="min_counts=3"/> + </assert_stdout> + <output name="anndata_out_h5ad" file="pp.filter_genes.krumsiek11-min_counts.h5ad" ftype="h5" compare="sim_size"/> + <output name="counts_per_gene" file="pp.filter_genes.number_per_gene.krumsiek11-min_counts.tabular"/> + </test> + <test expect_num_outputs="1"> + <conditional name="input"> + <param name="format" value="h5ad" /> + <param name="adata" value="pbmc68k_reduced.h5ad"/> + </conditional> + <conditional name="method"> + <param name="method" value="pp.filter_genes"/> + <conditional name="filter"> + <param name="filter" value="max_cells"/> + <param name="max_cells" value="500"/> + </conditional> + </conditional> + <conditional name="modify_anndata"> + <param name="modify_anndata" value="false"/> + </conditional> + <assert_stdout> + <has_text_matching expression="sc.pp.filter_genes"/> + <has_text_matching expression="adata.X"/> + <has_text_matching expression="max_cells=500"/> + </assert_stdout> + <output name="cells_per_gene" file="pp.filter_genes.number_per_gene.pbmc68k_reduced-max_cells.tabular"/> + </test> + <test expect_num_outputs="2"> + <conditional name="input"> + <param name="format" value="h5ad" /> + <param name="adata" value="krumsiek11.h5ad" /> + </conditional> + <conditional name="method"> + <param name="method" value="pp.filter_genes_dispersion"/> + <conditional name="flavor"> + <param name="flavor" value="seurat"/> + <param name="min_mean" value="0.0125"/> + <param name="max_mean" value="3"/> + <param name="min_disp" value="0.5"/> + </conditional> + <param name="n_bins" value="20" /> + <param name="log" value="true"/> + </conditional> + <conditional name="modify_anndata"> + <param name="modify_anndata" value="true"/> + <param name="anndata_output_format" value="h5ad" /> + </conditional> + <assert_stdout> + <has_text_matching expression="sc.pp.filter_genes_dispersion"/> + <has_text_matching expression="flavor='seurat'"/> + <has_text_matching expression="min_mean=0.0125"/> + <has_text_matching expression="max_mean=3.0"/> + <has_text_matching expression="min_disp=0.5"/> + <has_text_matching expression="n_bins=20"/> + <has_text_matching expression="log=True"/> + </assert_stdout> + <output name="anndata_out_h5ad" file="pp.filter_genes_dispersion.krumsiek11-seurat.h5ad" ftype="h5" compare="sim_size"/> + <output name="per_gene" file="pp.filter_genes_dispersion.per_gene.krumsiek11-seurat.tabular"/> + </test> + <test expect_num_outputs="1"> + <conditional name="input"> + <param name="format" value="h5ad" /> + <param name="adata" value="krumsiek11.h5ad" /> + </conditional> + <conditional name="method"> + <param name="method" value="pp.filter_genes_dispersion"/> + <conditional name="flavor"> + <param name="flavor" value="cell_ranger"/> + <param name="n_top_genes" value="2"/> + </conditional> + <param name="n_bins" value="20"/> + <param name="log" value="true"/> + </conditional> + <conditional name="modify_anndata"> + <param name="modify_anndata" value="false"/> + </conditional> + <assert_stdout> + <has_text_matching expression="sc.pp.filter_genes_dispersion"/> + <has_text_matching expression="flavor='cell_ranger'"/> + <has_text_matching expression="n_top_genes=2"/> + <has_text_matching expression="n_bins=20"/> + <has_text_matching expression="og=True"/> + </assert_stdout> + <output name="per_gene" file="pp.filter_genes_dispersion.per_gene.krumsiek11-cell_ranger.tabular"/> + </test> + <test expect_num_outputs="1"> + <conditional name="input"> + <param name="format" value="h5ad" /> + <param name="adata" value="krumsiek11.h5ad" /> + </conditional> + <conditional name="method"> + <param name="method" value="pp.subsample"/> + <conditional name="type"> + <param name="type" value="fraction" /> + <param name="fraction" value="0.5"/> + </conditional> + <param name="random_state" value="0"/> + </conditional> + <conditional name="modify_anndata"> + <param name="modify_anndata" value="true"/> + <param name="anndata_output_format" value="h5ad" /> + </conditional> + <assert_stdout> + <has_text_matching expression="sc.pp.subsample"/> + <has_text_matching expression="fraction=0.5"/> + <has_text_matching expression="random_state=0"/> + </assert_stdout> + <output name="anndata_out_h5ad" file="pp.subsample.krumsiek11_fraction.h5ad" ftype="h5" compare="sim_size"/> + </test> + <test expect_num_outputs="1"> + <conditional name="input"> + <param name="format" value="h5ad" /> + <param name="adata" value="krumsiek11.h5ad" /> + </conditional> + <conditional name="method"> + <param name="method" value="pp.subsample"/> + <conditional name="type"> + <param name="type" value="n_obs" /> + <param name="n_obs" value="10"/> + </conditional> + <param name="random_state" value="0"/> + </conditional> + <conditional name="modify_anndata"> + <param name="modify_anndata" value="true"/> + <param name="anndata_output_format" value="h5ad" /> + </conditional> + <assert_stdout> + <has_text_matching expression="sc.pp.subsample"/> + <has_text_matching expression="n_obs=10"/> + <has_text_matching expression="random_state=0"/> + </assert_stdout> + <output name="anndata_out_h5ad" file="pp.subsample.krumsiek11_n_obs.h5ad" ftype="h5" compare="sim_size"/> + </test> + </tests> + <help><![CDATA[ + +Filter cells outliers based on counts and numbers of genes expressed (`pp.filter_cells`) +======================================================================================== + +For instance, only keep cells with at least `min_counts` counts or +`min_genes` genes expressed. This is to filter measurement outliers, i.e., +"unreliable" observations. + +Only provide one of the optional parameters `min_counts`, `min_genes`, +`max_counts`, `max_genes` per call. + +More details on the `scanpy documentation +<https://scanpy.readthedocs.io/en/latest/api/scanpy.api.pp.filter_cells.html#scanpy.api.pp.filter_cells>`__ + +Return +------ + +number_per_cell : Number per cell (either `n_counts` or `n_genes` per cell) + + +Filter genes based on number of cells or counts (`pp.filter_genes`) +=================================================================== + +Keep genes that have at least `min_counts` counts or are expressed in at +least `min_cells` cells or have at most `max_counts` counts or are expressed +in at most `max_cells` cells. + +Only provide one of the optional parameters `min_counts`, `min_cells`, +`max_counts`, `max_cells` per call. + +More details on the `scanpy documentation +<https://scanpy.readthedocs.io/en/latest/api/scanpy.api.pp.filter_genes.html#scanpy.api.pp.filter_genes>`__ + +Return +------ + +number_per_gene : Number per genes (either `n_counts` or `n_genes` per cell) + + +Extract highly variable genes (`pp.filter_genes_dispersion`) +============================================================ + +If trying out parameters, pass the data matrix instead of AnnData. + +Depending on `flavor`, this reproduces the R-implementations of Seurat and Cell Ranger. + +The normalized dispersion is obtained by scaling with the mean and standard +deviation of the dispersions for genes falling into a given bin for mean +expression of genes. This means that for each bin of mean expression, highly +variable genes are selected. + +Use `flavor='cell_ranger'` with care and in the same way as in `pp.recipe_zheng17`. + +More details on the `scanpy documentation +<https://scanpy.readthedocs.io/en/latest/api/scanpy.api.pp.filter_genes_dispersion.html#scanpy.api.pp.filter_genes_dispersion>`__ + +Returns +------- +- The annotated matrix filtered, with the annotations +- A table with the means, dispersions, and normalized dispersions per gene, logarithmized when `log` is `True`. + + +Subsample to a fraction of the number of observations (`pp.subsample`) +====================================================================== + +More details on the `scanpy documentation +<https://scanpy.readthedocs.io/en/latest/api/scanpy.api.pp.subsample.html#scanpy.api.pp.subsample>`__ + + + ]]></help> + <expand macro="citations"/> +</tool>