diff anndata_operations.xml @ 26:825dfd66e3fb draft

planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/tertiary-analysis/scanpy commit 6c9d530aa653101e9e21804393ec11f38cddf027-dirty
author ebi-gxa
date Thu, 16 Feb 2023 13:28:31 +0000
parents 31e5e6d606ef
children 7ebc22f77d86
line wrap: on
line diff
--- a/anndata_operations.xml	Thu Oct 28 09:55:27 2021 +0000
+++ b/anndata_operations.xml	Thu Feb 16 13:28:31 2023 +0000
@@ -1,5 +1,5 @@
 <?xml version="1.0" encoding="utf-8"?>
-<tool id="anndata_ops" name="AnnData Operations" version="@TOOL_VERSION@+galaxy0" profile="@PROFILE@">
+<tool id="anndata_ops" name="AnnData Operations" version="@TOOL_VERSION@+galaxy9" profile="@PROFILE@">
   <description>modifies metadata and flags genes</description>
   <macros>
     <import>scanpy_macros2.xml</import>
@@ -45,11 +45,19 @@
 ]]></command>
   <configfiles>
     <configfile name="operations">
+import gc
 import scanpy as sc
 import anndata
 from numpy import all
 import logging
 
+def make_column_values_unique(df, field, new_field=None, suffix = '-duplicate-'):
+  """
+  Add a column to df holding the values of another column made unique.
+
+  The first occurrence of each value is kept as is; every later occurrence
+  gets the suffix followed by its repeat count (1, 2, ...) appended.
+  Values are compared by their string form, which is also what is written to
+  the new column, so missing values and values of mixed types cannot produce
+  clashing or mangled output.
+
+  Note: a generated name can still clash with a value that already exists in
+  the column (for instance 'a', 'a' and 'a_d1' with the suffix '_d').
+
+  :param df: pandas DataFrame to modify in place (for example adata.obs or adata.var).
+  :param field: name of the column whose values should be made unique.
+  :param new_field: name of the column receiving the result; defaults to the
+    field name followed by '_u'.
+  :param suffix: text inserted between a repeated value and its repeat count.
+  :return: the same DataFrame, with the new column added.
+  """
+  if new_field is None:
+    new_field = f"{field}_u"
+  values = df[field].astype(str)
+  repeats = values.groupby(values).cumcount()
+  appendents = (suffix + repeats.astype(str)).mask(repeats.eq(0), '')
+  df[new_field] = values + appendents
+  return df
+
 adata = sc.read('input.h5')
 
 #if $copy_adata_to_raw:
@@ -60,14 +68,22 @@
 qc_vars = list()
 
 #for $i, $s in enumerate($modifications)
+#if $s.make_unique:
+adata.obs = make_column_values_unique(adata.obs, field='${s.from_obs}', new_field='${s.to_obs}', suffix = "_d")
+#else
 adata.obs['${s.to_obs}'] = adata.obs['${s.from_obs}']
+#end if
 #if not $s.keep_original:
 del adata.obs['${s.from_obs}']
 #end if
 #end for
 
 #for $i, $s in enumerate($var_modifications)
+#if $s.make_unique:
+adata.var = make_column_values_unique(adata.var, field='${s.from_var}', new_field='${s.to_var}', suffix = "_d")
+#else
 adata.var['${s.to_var}'] = adata.var['${s.from_var}']
+#end if
 #if not $s.keep_original:
 del adata.var['${s.from_var}']
 #end if
@@ -84,6 +100,21 @@
     logging.warning('No genes starting with {} found, skip calculating expression of {} genes'.format('${flag.startswith}', '${flag.flag}'))
 #end for
 
+#if $field_unique:
+## Make the requested field unique wherever it exists (var, obs or both); the
+## result is stored in a new column named after the field plus '_u'.
+## sys is imported here because no import of it is visible among the
+## configfile imports, and sys.exit below would otherwise raise a NameError.
+import sys
+field_unique = '${field_unique}'
+made_unique = 0
+if field_unique in adata.var_keys():
+  adata.var = make_column_values_unique(adata.var, field_unique, suffix = "_d")
+  made_unique += 1
+if field_unique in adata.obs_keys():
+  adata.obs = make_column_values_unique(adata.obs, field_unique, suffix = "_d")
+  made_unique += 1
+
+if made_unique == 0:
+  logging.error("Specified field to be made unique is not in var or obs.")
+  sys.exit(1)
+#end if
+	    
 #if $copy_r.default and $copy_r.r_source:
 ad_s = sc.read('r_source.h5')
 if not all(adata.obs.index.isin(ad_s.obs.index)):
@@ -92,6 +123,7 @@
 else:
   adata.raw = ad_s[adata.obs.index]
 del ad_s
+gc.collect()
 #end if
 
 #if $copy_x.default and len($copy_x.xlayers) > 0:
@@ -107,6 +139,7 @@
   logging.error("X source ${i} AnnData file is not compatible to be merged to main AnnData file, different cell names.")
   sys.exit(1)
 del ad_s
+gc.collect()
 #end for
 #end if
 
@@ -120,13 +153,13 @@
     suffix=''
     if l_to_copy in adata.layers:
         suffix = "_${i}"
-
     adata.layers[l_to_copy+suffix] = ad_s.layers[l_to_copy]
   #end for
 else:
   logging.error("Layer source ${i} AnnData file is not compatible to be merged to main AnnData file, different cell names.")
   sys.exit(1)
 del ad_s
+gc.collect()
 #end for
 #end if
 
@@ -149,6 +182,7 @@
   logging.error("Observation source ${i} AnnData file is not compatible to be merged to main AnnData file, different cell names.")
   sys.exit(1)
 del ad_s
+gc.collect()
 #end for
 #end if
 
@@ -169,6 +203,7 @@
   logging.error("Embedding source ${i} AnnData file is not compatible to be merged to main AnnData file, different cell names.")
   sys.exit(1)
 del ad_s
+gc.collect()
 #end for
 #end if
 
@@ -188,6 +223,7 @@
   logging.error("Uns source ${i} AnnData file is not compatible to be merged to main AnnData file, different cell names.")
   sys.exit(1)
 del ad_s
+gc.collect()
 #end for
 #end if
 
@@ -224,8 +260,9 @@
           <valid initial="string.printable"/>
         </sanitizer>
       </param>
-      <param name="to_obs" type="text" label="New name" help="New name in observations that you want to change"/>
+      <param name="to_obs" type="text" label="New name" help="New name in observations that you want to change to"/>
       <param name="keep_original" type="boolean" label="Keep original" help="If activated, it will also keep the original column" checked="false"/>
+      <param name="make_unique" type="boolean" label="Make values in the field unique" help="If activated, it will make the values in the column unique by appending '_d' followed by a counter ('_d1', '_d2', ...) to each repeated value after its first occurrence." checked="false"/>
     </repeat>
     <repeat name="var_modifications" title="Change field names in AnnData var" min="0">
       <param name="from_var" type="text" label="Original name" help="Name in var that you want to change">
@@ -235,6 +272,7 @@
       </param>
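-      <param name="to_var" type="text" label="New name" help="New name in var that you want to change"/>
+      <param name="to_var" type="text" label="New name" help="New name in var that you want to change to"/>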
       <param name="keep_original" type="boolean" label="Keep original" help="If activated, it will also keep the original column" checked="false"/>
+      <param name="make_unique" type="boolean" label="Make values in the field unique" help="If activated, it will make the values in the column unique by appending '_d' followed by a counter ('_d1', '_d2', ...) to each repeated value after its first occurrence." checked="false"/>
     </repeat>
     <param name="gene_symbols_field" value='index' type="text" label="Gene symbols field in AnnData" help="Field inside var.params where the gene symbols are, normally 'index' or 'gene_symbols'"/>
     <repeat name="gene_flags" title="Flag genes that start with these names">
@@ -242,6 +280,7 @@
       <param name="flag" type="text" label="Var name" help="Name of the column in var.names where this boolean flag is stored, for example 'mito' for mitochondrial genes."/>
     </repeat>
     <param name="top_genes" label="Number of top genes" value='50' help="to calculate percentage of the flagged genes in that number of top genes. Used by sc.pp.calculate_qc_metrics (integer)." type="integer"/>
+    <param name="field_unique" type="text" optional="true" label="Field in var or obs to make unique" help="Name of a field in var or obs whose values should be made unique by appending '_d' and a counter to repeated values (useful for gene symbols in var). The result is stored in a new field named after the original with a '_u' suffix, and the original field is kept. This happens after all the operations above."/>
     <conditional name="copy_r">
       <param name="default" type="boolean" checked="false" label="Copy adata.X to adata.raw"/>
       <when value="true">
@@ -310,8 +349,6 @@
   <tests>
     <test>
       <param name="input_obj_file" value="find_cluster.h5"/>
-      <param name="input_format" value="anndata"/>
-      <param name="color_by" value="louvain"/>
       <output name="output_h5ad" file="anndata_ops.h5" ftype="h5ad" compare="sim_size"/>
     </test>
     <test>
@@ -325,8 +362,29 @@
       </output>
     </test>
     <test>
+      <param name="input_obj_file" value="anndata_ops.h5"/>
+      <repeat name="var_modifications" >
+        <param name="from_var" value = "gene_symbols" />
+        <param name="to_var" value = "gene_symbols_unique" />
+        <param name="make_unique" value = "True" />
+      </repeat>
+      <output name="output_h5ad" ftype="h5ad">
+        <assert_contents>
+          <has_h5_keys keys="var/gene_symbols_unique" />
+        </assert_contents>
+      </output>
+    </test>
+    <test>
+      <param name="input_obj_file" value="anndata_ops.h5"/>
+      <param name="field_unique" value = "gene_symbols" />
+      <output name="output_h5ad" ftype="h5ad">
+        <assert_contents>
+          <has_h5_keys keys="var/gene_symbols_u" />
+        </assert_contents>
+      </output>
+    </test>
+    <test>
       <param name="input_obj_file" value="find_cluster.h5"/>
-      <param name="input_format" value="anndata"/>
       <conditional name="copy_r">
         <param name="default" value="true"/>
         <param name="r_source" value="read_10x.h5"/>
@@ -334,39 +392,37 @@
       <output name="output_h5ad" file="anndata_ops_raw.h5" ftype="h5ad" compare="sim_size">
         <assert_contents>
           <has_h5_keys keys="raw/X" />
-	</assert_contents>
+        </assert_contents>
       </output>
     </test>
     <test>
       <param name="input_obj_file" value="normalise_data.h5"/>
-      <param name="input_format" value="anndata"/>
       <conditional name="copy_x">
         <param name="default" value="true"/>
         <repeat name="xlayers">
-	  <param name="x_source" value='filter_genes.h5'/>
-	  <param name="dest" value='filtered'/>
-	</repeat>
+          <param name="x_source" value='filter_genes.h5'/>
+          <param name="dest" value='filtered'/>
+        </repeat>
       </conditional>
       <output name="output_h5ad" file="anndata_ops_xlayer.h5" ftype="h5ad" compare="sim_size">
         <assert_contents>
           <has_h5_keys keys="layers/filtered" />
-	</assert_contents>
+        </assert_contents>
       </output>
     </test>
     <test>
       <param name="input_obj_file" value="find_cluster.h5"/>
-      <param name="input_format" value="anndata"/>
       <conditional name="copy_l">
         <param name="default" value="true"/>
         <repeat name="layers">
-	  <param name="contains" value='filtered'/>
-	</repeat>
-	<param name="layer_sources" value='anndata_ops_xlayer.h5'/>
+          <param name="contains" value='filtered'/>
+        </repeat>
+        <param name="layer_sources" value='anndata_ops_xlayer.h5'/>
       </conditional>
       <output name="output_h5ad" file="anndata_ops_layer.h5" ftype="h5ad" compare="sim_size">
         <assert_contents>
           <has_h5_keys keys="layers/filtered" />
-	</assert_contents>
+        </assert_contents>
       </output>
     </test>
   </tests>
@@ -378,11 +434,12 @@
 
 Performs the following operations:
 
-* Change observation/var fields, mostly for downstreaming processes convenience. Multiple fields can be changed as one.
+* Change observation/var fields, mostly for the convenience of downstream processes. Multiple fields can be changed at once.
 * Flag genes that start with a certain text: useful for flagging mitochondrial, spikes or other groups of genes.
 * For the flags created, calculates qc metrics (pct_<flag>_counts).
 * Calculates `n_genes`, `n_counts` for cells and `n_cells`, `n_counts` for genes.
 * For top <N> genes specified, calculate qc metrics (pct_counts_in_top_<N>_genes).
+* Make a specified column of var or obs unique (normally useful for gene symbols).
 * Copy from a set of compatible AnnData objects (same cells and genes):
   * Observations, such as clustering results.
   * Embeddings, such as tSNE or UMAPs.
@@ -392,6 +449,7 @@
 
 History
 -------
+1.8.1+galaxy10: Adds field to be made unique in obs or var.
 
 1.6.0+galaxy0: Moves to Scanpy Scripts 0.3.0 (Scanpy 1.6.0), versioning switched to track Scanpy as other tools.