# HG changeset patch # User ebi-gxa # Date 1676554111 0 # Node ID 825dfd66e3fbf5453661fe1d92e9aa1f736bf193 # Parent a36d7a315be7dcd8303163e0eb22f68967886834 planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/tertiary-analysis/scanpy commit 6c9d530aa653101e9e21804393ec11f38cddf027-dirty diff -r a36d7a315be7 -r 825dfd66e3fb anndata_operations.xml --- a/anndata_operations.xml Thu Oct 28 09:55:27 2021 +0000 +++ b/anndata_operations.xml Thu Feb 16 13:28:31 2023 +0000 @@ -1,5 +1,5 @@ - + modifies metadata and flags genes scanpy_macros2.xml @@ -45,11 +45,19 @@ ]]> +import gc import scanpy as sc import anndata from numpy import all import logging +def make_column_values_unique(df, field, new_field=None, suffix = '-duplicate-'): + if new_field is None: + new_field = f"{field}_u" + appendents = (suffix + df.groupby(field).cumcount().astype(str).replace('0','')).replace(suffix, '') + df[new_field] = df[field].astype(str) + appendents.astype(str) + return df + adata = sc.read('input.h5') #if $copy_adata_to_raw: @@ -60,14 +68,22 @@ qc_vars = list() #for $i, $s in enumerate($modifications) +#if $s.make_unique: +adata.obs = make_column_values_unique(adata.obs, field='${s.from_obs}', new_field='${s.to_obs}', suffix = "_d") +#else adata.obs['${s.to_obs}'] = adata.obs['${s.from_obs}'] +#end if #if not $s.keep_original: del adata.obs['${s.from_obs}'] #end if #end for #for $i, $s in enumerate($var_modifications) +#if $s.make_unique: +adata.var = make_column_values_unique(adata.var, field='${s.from_var}', new_field='${s.to_var}', suffix = "_d") +#else adata.var['${s.to_var}'] = adata.var['${s.from_var}'] +#end if #if not $s.keep_original: del adata.var['${s.from_var}'] #end if @@ -84,6 +100,21 @@ logging.warning('No genes starting with {} found, skip calculating expression of {} genes'.format('${flag.startswith}', '${flag.flag}')) #end for +#if $field_unique: +field_unique = '${field_unique}' +made_unique = 0 +if field_unique in adata.var_keys(): + adata.var = make_column_values_unique(adata.var, field_unique, suffix = "_d") + made_unique += 1 +if field_unique in adata.obs_keys(): + adata.obs = make_column_values_unique(adata.obs, field_unique, suffix = "_d") + made_unique += 1 + +if made_unique == 0: + logging.error("Specified field to be made unique is not in var or obs.") + sys.exit(1) +#end if + #if $copy_r.default and $copy_r.r_source: ad_s = sc.read('r_source.h5') if not all(adata.obs.index.isin(ad_s.obs.index)): @@ -92,6 +123,7 @@ else: adata.raw = ad_s[adata.obs.index] del ad_s +gc.collect() #end if #if $copy_x.default and len($copy_x.xlayers) > 0: @@ -107,6 +139,7 @@ logging.error("X source ${i} AnnData file is not compatible to be merged to main AnnData file, different cell names.") sys.exit(1) del ad_s +gc.collect() #end for #end if @@ -120,13 +153,13 @@ suffix='' if l_to_copy in adata.layers: suffix = "_${i}" - adata.layers[l_to_copy+suffix] = ad_s.layers[l_to_copy] #end for else: logging.error("Layer source ${i} AnnData file is not compatible to be merged to main AnnData file, different cell names.") sys.exit(1) del ad_s +gc.collect() #end for #end if @@ -149,6 +182,7 @@ logging.error("Observation source ${i} AnnData file is not compatible to be merged to main AnnData file, different cell names.") sys.exit(1) del ad_s +gc.collect() #end for #end if @@ -169,6 +203,7 @@ logging.error("Embedding source ${i} AnnData file is not compatible to be merged to main AnnData file, different cell names.") sys.exit(1) del ad_s +gc.collect() #end for #end if @@ -188,6 +223,7 @@ logging.error("Uns source ${i} AnnData file is not compatible to be merged to main AnnData file, different cell names.") sys.exit(1) del ad_s +gc.collect() #end for #end if @@ -224,8 +260,9 @@ - + + @@ -235,6 +272,7 @@ + @@ -242,6 +280,7 @@ + @@ -310,8 +349,6 @@ - - @@ -325,8 +362,29 @@ + + + + + + + + + + + + + + + + + + + + + + - @@ -334,39 +392,37 @@ - + - - - - + + + - + - - - - + + + - + @@ -378,11 +434,12 @@ Performs the following operations: -* Change observation/var fields, mostly for downstreaming processes convenience. Multiple fields can be changed as one. +* Change observation/var fields, mostly for downstreaming processes convenience. Multiple fields can be changed at once. * Flag genes that start with a certain text: useful for flagging mitochondrial, spikes or other groups of genes. * For the flags created, calculates qc metrics (pct__counts). * Calculates `n_genes`, `n_counts` for cells and `n_cells`, `n_counts` for genes. * For top genes specified, calculate qc metrics (pct_counts_in_top__genes). +* Make a specified column of var or obs unique (normally useful for gene symbols). * Copy from a set of compatible AnnData objects (same cells and genes): * Observations, such as clustering results. * Embeddings, such as tSNE or UMAPs. @@ -392,6 +449,7 @@ History ------- +1.8.1+galaxy10: Adds field to be made unique in obs or var. 1.6.0+galaxy0: Moves to Scanpy Scripts 0.3.0 (Scanpy 1.6.0), versioning switched to track Scanpy as other tools. diff -r a36d7a315be7 -r 825dfd66e3fb scanpy_macros2.xml --- a/scanpy_macros2.xml Thu Oct 28 09:55:27 2021 +0000 +++ b/scanpy_macros2.xml Thu Feb 16 13:28:31 2023 +0000 @@ -1,9 +1,11 @@ - 1.8.1+3 + 1.8.1 More information can be found at https://scanpy.readthedocs.io 18.01 - + + - + + @@ -118,7 +122,7 @@ output_format == 'anndata_h5ad' - + output_format == 'anndata' @@ -127,13 +131,13 @@ output_format == 'anndata_h5ad' - + output_format == 'anndata' output_format == 'loom_legacy' - + output_format == 'loom'