Mercurial > repos > ebi-gxa > anndata_ops
diff anndata_operations.xml @ 26:825dfd66e3fb draft
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/tertiary-analysis/scanpy commit 6c9d530aa653101e9e21804393ec11f38cddf027-dirty
author | ebi-gxa |
---|---|
date | Thu, 16 Feb 2023 13:28:31 +0000 |
parents | 31e5e6d606ef |
children | 7ebc22f77d86 |
line wrap: on
line diff
--- a/anndata_operations.xml Thu Oct 28 09:55:27 2021 +0000 +++ b/anndata_operations.xml Thu Feb 16 13:28:31 2023 +0000 @@ -1,5 +1,5 @@ <?xml version="1.0" encoding="utf-8"?> -<tool id="anndata_ops" name="AnnData Operations" version="@TOOL_VERSION@+galaxy0" profile="@PROFILE@"> +<tool id="anndata_ops" name="AnnData Operations" version="@TOOL_VERSION@+galaxy9" profile="@PROFILE@"> <description>modifies metadata and flags genes</description> <macros> <import>scanpy_macros2.xml</import> @@ -45,11 +45,19 @@ ]]></command> <configfiles> <configfile name="operations"> +import gc import scanpy as sc import anndata from numpy import all import logging +def make_column_values_unique(df, field, new_field=None, suffix = '-duplicate-'): + if new_field is None: + new_field = f"{field}_u" + appendents = (suffix + df.groupby(field).cumcount().astype(str).replace('0','')).replace(suffix, '') + df[new_field] = df[field].astype(str) + appendents.astype(str) + return df + adata = sc.read('input.h5') #if $copy_adata_to_raw: @@ -60,14 +68,22 @@ qc_vars = list() #for $i, $s in enumerate($modifications) +#if $s.make_unique: +adata.obs = make_column_values_unique(adata.obs, field='${s.from_obs}', new_field='${s.to_obs}', suffix = "_d") +#else adata.obs['${s.to_obs}'] = adata.obs['${s.from_obs}'] +#end if #if not $s.keep_original: del adata.obs['${s.from_obs}'] #end if #end for #for $i, $s in enumerate($var_modifications) +#if $s.make_unique: +adata.var = make_column_values_unique(adata.var, field='${s.from_var}', new_field='${s.to_var}', suffix = "_d") +#else adata.var['${s.to_var}'] = adata.var['${s.from_var}'] +#end if #if not $s.keep_original: del adata.var['${s.from_var}'] #end if @@ -84,6 +100,21 @@ logging.warning('No genes starting with {} found, skip calculating expression of {} genes'.format('${flag.startswith}', '${flag.flag}')) #end for +#if $field_unique: +field_unique = '${field_unique}' +made_unique = 0 +if field_unique in adata.var_keys(): + adata.var = make_column_values_unique(adata.var, field_unique, suffix = "_d") + made_unique += 1 +if field_unique in adata.obs_keys(): + adata.obs = make_column_values_unique(adata.obs, field_unique, suffix = "_d") + made_unique += 1 + +if made_unique == 0: + logging.error("Specified field to be made unique is not in var or obs.") + sys.exit(1) +#end if + #if $copy_r.default and $copy_r.r_source: ad_s = sc.read('r_source.h5') if not all(adata.obs.index.isin(ad_s.obs.index)): @@ -92,6 +123,7 @@ else: adata.raw = ad_s[adata.obs.index] del ad_s +gc.collect() #end if #if $copy_x.default and len($copy_x.xlayers) > 0: @@ -107,6 +139,7 @@ logging.error("X source ${i} AnnData file is not compatible to be merged to main AnnData file, different cell names.") sys.exit(1) del ad_s +gc.collect() #end for #end if @@ -120,13 +153,13 @@ suffix='' if l_to_copy in adata.layers: suffix = "_${i}" - adata.layers[l_to_copy+suffix] = ad_s.layers[l_to_copy] #end for else: logging.error("Layer source ${i} AnnData file is not compatible to be merged to main AnnData file, different cell names.") sys.exit(1) del ad_s +gc.collect() #end for #end if @@ -149,6 +182,7 @@ logging.error("Observation source ${i} AnnData file is not compatible to be merged to main AnnData file, different cell names.") sys.exit(1) del ad_s +gc.collect() #end for #end if @@ -169,6 +203,7 @@ logging.error("Embedding source ${i} AnnData file is not compatible to be merged to main AnnData file, different cell names.") sys.exit(1) del ad_s +gc.collect() #end for #end if @@ -188,6 +223,7 @@ logging.error("Uns source ${i} AnnData file is not compatible to be merged to main AnnData file, different cell names.") sys.exit(1) del ad_s +gc.collect() #end for #end if @@ -224,8 +260,9 @@ <valid initial="string.printable"/> </sanitizer> </param> - <param name="to_obs" type="text" label="New name" help="New name in observations that you want to change"/> + <param name="to_obs" type="text" label="New name" help="New name in observations that you want to change to"/> <param name="keep_original" type="boolean" label="Keep original" help="If activated, it will also keep the original column" checked="false"/> + <param name="make_unique" type="boolean" label="Make values in the field unique" help="If activated, it will make the values in the column unique by appending '_dnum' on each repeated value." checked="false"/> </repeat> <repeat name="var_modifications" title="Change field names in AnnData var" min="0"> <param name="from_var" type="text" label="Original name" help="Name in var that you want to change"> @@ -235,6 +272,7 @@ </param> <param name="to_var" type="text" label="New name" help="New name in var that you want to change"/> <param name="keep_original" type="boolean" label="Keep original" help="If activated, it will also keep the original column" checked="false"/> + <param name="make_unique" type="boolean" label="Make values in the field unique" help="If activated, it will make the values in the column unique by appending '_dnum' on each repeated value." checked="false"/> </repeat> <param name="gene_symbols_field" value='index' type="text" label="Gene symbols field in AnnData" help="Field inside var.params where the gene symbols are, normally 'index' or 'gene_symbols'"/> <repeat name="gene_flags" title="Flag genes that start with these names"> @@ -242,6 +280,7 @@ <param name="flag" type="text" label="Var name" help="Name of the column in var.names where this boolean flag is stored, for example 'mito' for mitochondrial genes."/> </repeat> <param name="top_genes" label="Number of top genes" value='50' help="to calculate percentage of the flagged genes in that number of top genes. Used by sc.pp.calculate_qc_metrics (integer)." type="integer"/> + <param name="field_unique" type="text" optional="true" label="Field in var or obs to make unique" help="Field inside var or obs to be made unique by appending a suffix (useful for gene symbols in var). A new field will be added with the '_u' suffix. It happens after all the above operations."/> <conditional name="copy_r"> <param name="default" type="boolean" checked="false" label="Copy adata.X to adata.raw"/> <when value="true"> @@ -310,8 +349,6 @@ <tests> <test> <param name="input_obj_file" value="find_cluster.h5"/> - <param name="input_format" value="anndata"/> - <param name="color_by" value="louvain"/> <output name="output_h5ad" file="anndata_ops.h5" ftype="h5ad" compare="sim_size"/> </test> <test> @@ -325,8 +362,29 @@ </output> </test> <test> + <param name="input_obj_file" value="anndata_ops.h5"/> + <repeat name="var_modifications" > + <param name="from_var" value = "gene_symbols" /> + <param name="to_var" value = "gene_symbols_unique" /> + <param name="make_unique" value = "True" /> + </repeat> + <output name="output_h5ad" ftype="h5ad"> + <assert_contents> + <has_h5_keys keys="var/gene_symbols_unique" /> + </assert_contents> + </output> + </test> + <test> + <param name="input_obj_file" value="anndata_ops.h5"/> + <param name="field_unique" value = "gene_symbols" /> + <output name="output_h5ad" ftype="h5ad"> + <assert_contents> + <has_h5_keys keys="var/gene_symbols_u" /> + </assert_contents> + </output> + </test> + <test> <param name="input_obj_file" value="find_cluster.h5"/> - <param name="input_format" value="anndata"/> <conditional name="copy_r"> <param name="default" value="true"/> <param name="r_source" value="read_10x.h5"/> @@ -334,39 +392,37 @@ <output name="output_h5ad" file="anndata_ops_raw.h5" ftype="h5ad" compare="sim_size"> <assert_contents> <has_h5_keys keys="raw/X" /> - </assert_contents> + </assert_contents> </output> </test> <test> <param name="input_obj_file" value="normalise_data.h5"/> - <param name="input_format" value="anndata"/> <conditional name="copy_x"> <param name="default" value="true"/> <repeat name="xlayers"> - <param name="x_source" value='filter_genes.h5'/> - <param name="dest" value='filtered'/> - </repeat> + <param name="x_source" value='filter_genes.h5'/> + <param name="dest" value='filtered'/> + </repeat> </conditional> <output name="output_h5ad" file="anndata_ops_xlayer.h5" ftype="h5ad" compare="sim_size"> <assert_contents> <has_h5_keys keys="layers/filtered" /> - </assert_contents> + </assert_contents> </output> </test> <test> <param name="input_obj_file" value="find_cluster.h5"/> - <param name="input_format" value="anndata"/> <conditional name="copy_l"> <param name="default" value="true"/> <repeat name="layers"> - <param name="contains" value='filtered'/> - </repeat> - <param name="layer_sources" value='anndata_ops_xlayer.h5'/> + <param name="contains" value='filtered'/> + </repeat> + <param name="layer_sources" value='anndata_ops_xlayer.h5'/> </conditional> <output name="output_h5ad" file="anndata_ops_layer.h5" ftype="h5ad" compare="sim_size"> <assert_contents> <has_h5_keys keys="layers/filtered" /> - </assert_contents> + </assert_contents> </output> </test> </tests> @@ -378,11 +434,12 @@ Performs the following operations: -* Change observation/var fields, mostly for downstreaming processes convenience. Multiple fields can be changed as one. +* Change observation/var fields, mostly for downstreaming processes convenience. Multiple fields can be changed at once. * Flag genes that start with a certain text: useful for flagging mitochondrial, spikes or other groups of genes. * For the flags created, calculates qc metrics (pct_<flag>_counts). * Calculates `n_genes`, `n_counts` for cells and `n_cells`, `n_counts` for genes. * For top <N> genes specified, calculate qc metrics (pct_counts_in_top_<N>_genes). +* Make a specified column of var or obs unique (normally useful for gene symbols). * Copy from a set of compatible AnnData objects (same cells and genes): * Observations, such as clustering results. * Embeddings, such as tSNE or UMAPs. @@ -392,6 +449,7 @@ History ------- +1.8.1+galaxy10: Adds field to be made unique in obs or var. 1.6.0+galaxy0: Moves to Scanpy Scripts 0.3.0 (Scanpy 1.6.0), versioning switched to track Scanpy as other tools.