anndata_ops: anndata_operations.xml comparison

comparison anndata_operations.xml @ 26:825dfd66e3fb draft

planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/tertiary-analysis/scanpy commit 6c9d530aa653101e9e21804393ec11f38cddf027-dirty

author	ebi-gxa
date	Thu, 16 Feb 2023 13:28:31 +0000
parents	31e5e6d606ef
children	7ebc22f77d86

comparison

equal deleted inserted replaced

-:a36d7a315be7
+:825dfd66e3fb
 <?xml version="1.0" encoding="utf-8"?>
-<tool id="anndata_ops" name="AnnData Operations" version="@TOOL_VERSION@+galaxy0" profile="@PROFILE@">
+<tool id="anndata_ops" name="AnnData Operations" version="@TOOL_VERSION@+galaxy9" profile="@PROFILE@">
 <description>modifies metadata and flags genes</description>
 <macros>
 <import>scanpy_macros2.xml</import>
 </macros>
 <expand macro="requirements"/>
 #end if
 python $operations
 ]]></command>
 <configfiles>
 <configfile name="operations">
+import gc
+import sys
 import scanpy as sc
 import anndata
 from numpy import all
 import logging
def make_column_values_unique(df, field, new_field=None, suffix = '-duplicate-'):
    """Add a column holding a de-duplicated, string copy of ``df[field]``.

    The first occurrence of each value is kept unchanged; the n-th repeat
    (n >= 1) gets ``suffix`` followed by n appended. With ``suffix='_d'`` the
    values ``A, A, A`` become ``A, A_d1, A_d2``.

    Repeats are counted on the string form of the values, i.e. on exactly
    the strings that are written to the new column, so values that only
    become equal once converted to text are still told apart. Missing values
    are kept as a group of their own (``dropna=False``, pandas 1.1 or newer):
    if they were dropped from the grouping, pandas would report their repeat
    count as NaN, the whole count column would turn into floats, and every
    row would receive a spurious ``<suffix>0.0``-style ending.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to extend; it is modified in place (used with ``adata.obs``
        and ``adata.var``).
    field : str
        Name of the existing column whose values should be made unique.
    new_field : str, optional
        Name of the column that receives the result. Defaults to
        ``"<field>_u"``.
    suffix : str
        Text inserted between the original value and the repeat number.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``new_field`` added or overwritten.
    """
    if new_field is None:
        new_field = f"{field}_u"
    values = df[field].astype(str)
    occurrence = values.groupby(values, dropna=False).cumcount()
    appendents = (suffix + occurrence.astype(str)).where(occurrence > 0, '')
    df[new_field] = values + appendents
    return df
 adata = sc.read('input.h5')
 #if $copy_adata_to_raw:
 adata.raw = adata
 #end if
 gene_name = '${gene_symbols_field}'
 qc_vars = list()
 #for $i, $s in enumerate($modifications)
+#if $s.make_unique:
+adata.obs = make_column_values_unique(adata.obs, field='${s.from_obs}', new_field='${s.to_obs}', suffix = "_d")
+#else
 adata.obs['${s.to_obs}'] = adata.obs['${s.from_obs}']
+#end if
 #if not $s.keep_original:
 del adata.obs['${s.from_obs}']
 #end if
 #end for
 #for $i, $s in enumerate($var_modifications)
+#if $s.make_unique:
+adata.var = make_column_values_unique(adata.var, field='${s.from_var}', new_field='${s.to_var}', suffix = "_d")
+#else
 adata.var['${s.to_var}'] = adata.var['${s.from_var}']
+#end if
 #if not $s.keep_original:
 del adata.var['${s.from_var}']
 #end if
 #end for
 qc_vars.append('${flag.flag}')
 else:
 logging.warning('No genes starting with {} found, skip calculating expression of {} genes'.format('${flag.startswith}', '${flag.flag}'))
 #end for
+#if $field_unique:
+field_unique = '${field_unique}'
+made_unique = 0
+if field_unique in adata.var_keys():
+adata.var = make_column_values_unique(adata.var, field_unique, suffix = "_d")
+made_unique += 1
+if field_unique in adata.obs_keys():
+adata.obs = make_column_values_unique(adata.obs, field_unique, suffix = "_d")
+made_unique += 1
+if made_unique == 0:
+logging.error("Specified field to be made unique is not in var or obs.")
+sys.exit(1)
+#end if
 #if $copy_r.default and $copy_r.r_source:
 ad_s = sc.read('r_source.h5')
 if not all(adata.obs.index.isin(ad_s.obs.index)):
 logging.error("Specified object for .raw must contain all .obs from main object.")
 sys.exit(1)
 else:
 adata.raw = ad_s[adata.obs.index]
 del ad_s
+gc.collect()
 #end if
 #if $copy_x.default and len($copy_x.xlayers) > 0:
 #for $i, $x_s in enumerate($copy_x.xlayers):
 ad_s = sc.read('x_source_${i}.h5')
 adata.layers["${xs.dest}"] = ad_s.X
 else:
 logging.error("X source ${i} AnnData file is not compatible to be merged to main AnnData file, different cell names.")
 sys.exit(1)
 del ad_s
+gc.collect()
 #end for
 #end if
 #if $copy_l.default and len($copy_l.layers) > 0:
 #for $i, $layer_s in enumerate($copy_l.layer_sources):
 layers_to_copy = (k for k in ad_s.layers.keys() if "${l_key.contains}" in k)
 for l_to_copy in layers_to_copy:
 suffix=''
 if l_to_copy in adata.layers:
 suffix = "_${i}"
 adata.layers[l_to_copy+suffix] = ad_s.layers[l_to_copy]
 #end for
 else:
 logging.error("Layer source ${i} AnnData file is not compatible to be merged to main AnnData file, different cell names.")
 sys.exit(1)
 del ad_s
+gc.collect()
 #end for
 #end if
 #if $copy_o.default and len($copy_o.obs_keys) > 0:
 #for $i, $obs_s in enumerate($copy_o.obs_sources):
 #end for
 else:
 logging.error("Observation source ${i} AnnData file is not compatible to be merged to main AnnData file, different cell names.")
 sys.exit(1)
 del ad_s
+gc.collect()
 #end for
 #end if
 #if $copy_e.default and len($copy_e.embedding_keys) > 0:
 #end for
 else:
 logging.error("Embedding source ${i} AnnData file is not compatible to be merged to main AnnData file, different cell names.")
 sys.exit(1)
 del ad_s
+gc.collect()
 #end for
 #end if
 #if $copy_u.default and len($copy_u.uns_keys) > 0:
 #for $i, $uns_s in enumerate($copy_u.uns_sources):
 #end for
 else:
 logging.error("Uns source ${i} AnnData file is not compatible to be merged to main AnnData file, different cell names.")
 sys.exit(1)
 del ad_s
+gc.collect()
 #end for
 #end if
 #if $sanitize_varm:
 if hasattr(adata, 'raw') and  hasattr(adata.raw, 'X') and hasattr(adata.raw, 'var'):
 <param name="from_obs" type="text" label="Original name" help="Name in observations that you want to change">
 <sanitizer>
 <valid initial="string.printable"/>
 </sanitizer>
 </param>
-<param name="to_obs" type="text" label="New name" help="New name in observations that you want to change"/>
+<param name="to_obs" type="text" label="New name" help="New name in observations that you want to change to"/>
 <param name="keep_original" type="boolean" label="Keep original" help="If activated, it will also keep the original column" checked="false"/>
+<param name="make_unique" type="boolean" label="Make values in the field unique" help="If activated, it will make the values in the column unique by appending '_dnum' on each repeated value." checked="false"/>
 </repeat>
 <repeat name="var_modifications" title="Change field names in AnnData var" min="0">
 <param name="from_var" type="text" label="Original name" help="Name in var that you want to change">
 <sanitizer>
 <valid initial="string.printable"/>
 </sanitizer>
 </param>
 <param name="to_var" type="text" label="New name" help="New name in var that you want to change to"/>
 <param name="keep_original" type="boolean" label="Keep original" help="If activated, it will also keep the original column" checked="false"/>
+<param name="make_unique" type="boolean" label="Make values in the field unique" help="If activated, it will make the values in the column unique by appending '_dnum' on each repeated value." checked="false"/>
 </repeat>
 <param name="gene_symbols_field" value='index' type="text" label="Gene symbols field in AnnData" help="Field inside var.params where the gene symbols are, normally 'index' or 'gene_symbols'"/>
 <repeat name="gene_flags" title="Flag genes that start with these names">
 <param name="startswith" type="text" label="Starts with" help="Text that you expect the genes to be flagged to start with, such as 'MT-' for mito genes"/>
 <param name="flag" type="text" label="Var name" help="Name of the column in var.names where this boolean flag is stored, for example 'mito' for mitochondrial genes."/>
 </repeat>
 <param name="top_genes" label="Number of top genes" value='50' help="to calculate percentage of the flagged genes in that number of top genes. Used by sc.pp.calculate_qc_metrics (integer)." type="integer"/>
+<param name="field_unique" type="text" optional="true" label="Field in var or obs to make unique" help="Field inside var or obs to be made unique by appending a suffix (useful for gene symbols in var). A new field will be added with the '_u' suffix. It happens after all the above operations."/>
 <conditional name="copy_r">
 <param name="default" type="boolean" checked="false" label="Copy adata.X to adata.raw"/>
 <when value="true">
 <param name="r_source" type="data" label="AnnData object with .X to copy to .raw" help="Copies adata (subset to matching obs) from this AnnData object into the main input as .raw. Make sure to use an AnnData object containing all .obs in the main input." format="h5,h5ad" />
 </when>
 </outputs>
 <tests>
 <test>
 <param name="input_obj_file" value="find_cluster.h5"/>
-<param name="input_format" value="anndata"/>
-<param name="color_by" value="louvain"/>
 <output name="output_h5ad" file="anndata_ops.h5" ftype="h5ad" compare="sim_size"/>
 </test>
 <test>
 <param name="input_obj_file" value="anndata_ops.h5"/>
 <param name="from_var" value = "gene_symbols" />
 <has_h5_keys keys="var/hello_all" />
 </assert_contents>
 </output>
 </test>
 <test>
+<param name="input_obj_file" value="anndata_ops.h5"/>
+<repeat name="var_modifications" >
+<param name="from_var" value = "gene_symbols" />
+<param name="to_var" value = "gene_symbols_unique" />
+<param name="make_unique" value = "True" />
+</repeat>
+<output name="output_h5ad" ftype="h5ad">
+<assert_contents>
+<has_h5_keys keys="var/gene_symbols_unique" />
+</assert_contents>
+</output>
+</test>
+<test>
+<param name="input_obj_file" value="anndata_ops.h5"/>
+<param name="field_unique" value = "gene_symbols" />
+<output name="output_h5ad" ftype="h5ad">
+<assert_contents>
+<has_h5_keys keys="var/gene_symbols_u" />
+</assert_contents>
+</output>
+</test>
+<test>
 <param name="input_obj_file" value="find_cluster.h5"/>
-<param name="input_format" value="anndata"/>
 <conditional name="copy_r">
 <param name="default" value="true"/>
 <param name="r_source" value="read_10x.h5"/>
 </conditional>
 <output name="output_h5ad" file="anndata_ops_raw.h5" ftype="h5ad" compare="sim_size">
 <assert_contents>
 <has_h5_keys keys="raw/X" />
-	</assert_contents>
+</assert_contents>
 </output>
 </test>
 <test>
 <param name="input_obj_file" value="normalise_data.h5"/>
-<param name="input_format" value="anndata"/>
 <conditional name="copy_x">
 <param name="default" value="true"/>
 <repeat name="xlayers">
-	  <param name="x_source" value='filter_genes.h5'/>
+<param name="x_source" value='filter_genes.h5'/>
-	  <param name="dest" value='filtered'/>
+<param name="dest" value='filtered'/>
-	</repeat>
+</repeat>
 </conditional>
 <output name="output_h5ad" file="anndata_ops_xlayer.h5" ftype="h5ad" compare="sim_size">
 <assert_contents>
 <has_h5_keys keys="layers/filtered" />
-	</assert_contents>
+</assert_contents>
 </output>
 </test>
 <test>
 <param name="input_obj_file" value="find_cluster.h5"/>
-<param name="input_format" value="anndata"/>
 <conditional name="copy_l">
 <param name="default" value="true"/>
 <repeat name="layers">
-	  <param name="contains" value='filtered'/>
+<param name="contains" value='filtered'/>
-	</repeat>
+</repeat>
-	<param name="layer_sources" value='anndata_ops_xlayer.h5'/>
+<param name="layer_sources" value='anndata_ops_xlayer.h5'/>
 </conditional>
 <output name="output_h5ad" file="anndata_ops_layer.h5" ftype="h5ad" compare="sim_size">
 <assert_contents>
 <has_h5_keys keys="layers/filtered" />
-	</assert_contents>
+</assert_contents>
 </output>
 </test>
 </tests>
 <help><![CDATA[
 Operations on AnnData objects
 =============================
 Performs the following operations:
-* Change observation/var fields, mostly for downstreaming processes convenience. Multiple fields can be changed as one.
+* Change observation/var fields, mostly for the convenience of downstream processes. Multiple fields can be changed at once.
 * Flag genes that start with a certain text: useful for flagging mitochondrial, spikes or other groups of genes.
 * For the flags created, calculates qc metrics (pct_<flag>_counts).
 * Calculates `n_genes`, `n_counts` for cells and `n_cells`, `n_counts` for genes.
 * For top <N> genes specified, calculate qc metrics (pct_counts_in_top_<N>_genes).
+* Make a specified column of var or obs unique (normally useful for gene symbols).
 * Copy from a set of compatible AnnData objects (same cells and genes):
 * Observations, such as clustering results.
 * Embeddings, such as tSNE or UMAPs.
 * Unstructured annotations, like gene markers.
 This functionality will probably be added in the future to a larger package.
 History
 -------
+1.8.1+galaxy9: Adds field to be made unique in obs or var.
 1.6.0+galaxy0: Moves to Scanpy Scripts 0.3.0 (Scanpy 1.6.0), versioning switched to track Scanpy as other tools.
 0.0.3+galaxy0: Adds ability to merge AnnData objects (Scanpy 1.4.3).
 ]]></help>

Mercurial > repos > ebi-gxa > anndata_ops

comparison anndata_operations.xml @ 26:825dfd66e3fb draft