Mercurial > repos > ebi-gxa > anndata_ops

--- a/anndata_operations.xml	Thu Feb 16 13:28:31 2023 +0000
+++ b/anndata_operations.xml	Fri Apr 14 13:12:01 2023 +0000
@@ -1,5 +1,5 @@
 <?xml version="1.0" encoding="utf-8"?>
-<tool id="anndata_ops" name="AnnData Operations" version="@TOOL_VERSION@+galaxy9" profile="@PROFILE@">
+<tool id="anndata_ops" name="AnnData Operations" version="@TOOL_VERSION@+galaxy91" profile="@PROFILE@">
   <description>modifies metadata and flags genes</description>
   <macros>
     <import>scanpy_macros2.xml</import>
@@ -41,6 +41,10 @@
   ln -s '${us}' uns_source_${i}.h5 &&
 #end for
 #end if
+
+## Expose the selected metadata dataset under the fixed name that the
+## configfile script reads. The path is quoted like the other links above.
+#if $add_cell_metadata.default:
+  ln -s '${add_cell_metadata.file}' cell_metadata.tsv &&
+#end if
 python $operations
 ]]></command>
   <configfiles>
@@ -57,8 +61,44 @@
   appendents = (suffix + df.groupby(field).cumcount().astype(str).replace('0','')).replace(suffix, '')
   df[new_field] = df[field].astype(str) + appendents.astype(str)
   return df
+
+adata = sc.read('input.h5')

-adata = sc.read('input.h5')
+#if $add_cell_metadata.default:
+import pandas as pd
+
+def add_cell_metadata(ad, metadata_file="cell_metadata.tsv", drop_duplicates=True):
+  """Left-join a tab-separated cell metadata table onto ad.obs.
+
+  The first column of the metadata file is used as its index and is matched
+  against the index of ad.obs. Cells without a metadata row are kept and get
+  missing values in the new columns.
+
+  Args:
+    ad: object exposing the observations table as a pandas DataFrame in .obs.
+    metadata_file: path to the tab-separated file with a header row.
+    drop_duplicates: if True, metadata columns whose name already exists in
+      ad.obs are discarded; if False they are kept with "_x" appended to
+      their name.
+
+  Returns:
+    A new DataFrame holding the rows of ad.obs plus the metadata columns.
+    Neither ad nor ad.obs is modified.
+  """
+  metadata_df = pd.read_csv(metadata_file, sep="\t", index_col=0)
+  # An identifier listed more than once would multiply the matching obs rows
+  # in the left join, so only its first row is used.
+  repeated = metadata_df.index.duplicated(keep="first")
+  if repeated.any():
+    print(f"Ignoring {repeated.sum()} metadata rows with repeated cell identifiers")
+    metadata_df = metadata_df.loc[~repeated]
+  # Columns already present in obs are never overwritten: the metadata copy
+  # is either dropped or renamed before merging.
+  for col in ad.obs.columns:
+    if col not in metadata_df.columns:
+      continue
+    if drop_duplicates:
+      print(f"Dropping metadata column {col}, already present in obs")
+      metadata_df = metadata_df.drop(col, axis=1)
+    else:
+      print(f"Renaming {col} to {col}_x")
+      metadata_df = metadata_df.rename(columns={col: col + "_x"})
+  merged_obs = ad.obs.merge(
+    metadata_df, left_index=True, right_index=True, how="left"
+  )
+  for o_col in metadata_df.columns:
+    # pandas appends "_y" to a right-hand column only when the same name also
+    # exists on the left, and in that case the unsuffixed name is absent.
+    if o_col in merged_obs.columns:
+      col = o_col
+    elif o_col + "_y" in merged_obs.columns:
+      col = o_col + "_y"
+    else:
+      continue
+    if merged_obs[col].dtype != object:
+      continue
+    prev_dtype = metadata_df[o_col].dtype
+    # Text becomes categorical. Any other dtype only turns into object when
+    # the join introduced missing values (e.g. booleans); casting those back
+    # would turn the gaps into real values, so they become categorical too.
+    if prev_dtype == str or prev_dtype == object or merged_obs[col].isna().any():
+      prev_dtype = "category"
+    print(f"Changing {col} from {merged_obs[col].dtype} to {prev_dtype}")
+    merged_obs[col] = merged_obs[col].astype(prev_dtype)
+  return merged_obs
+
+adata.obs = add_cell_metadata(adata)
+#end if

 #if $copy_adata_to_raw:
 adata.raw = adata
@@ -253,6 +293,13 @@
   <inputs>
     <param name="input_obj_file" argument="input-object-file" type="data" format="h5,h5ad" label="Input object in hdf5 AnnData format"/>
     <expand macro="output_object_params_no_loom"/>
+    <!-- Optional merge of an external table into the observations. The
+         boolean is tested by the command and configfile templates; when it
+         is true the selected dataset is linked as cell_metadata.tsv and
+         left-joined onto adata.obs. -->
+    <conditional name="add_cell_metadata">
+      <param name="default" type="boolean" checked="false" label="Merge additional cell metadata"/>
+      <when value="true">
+        <param name="file" type="data" label="Cell metadata with headers" help="A tabular file with headers, where the first column contains cell barcodes. Will be merged via a left join, so not all cells in the obs need to be in the metadata. Currently duplicated column headers will be ignored and the originals in the AnnData will be kept." format="tsv,tabular"/>
+      </when>
+      <when value="false"/>
+    </conditional>
     <param name="copy_adata_to_raw" type="boolean" label="Copy AnnData to .raw" help="If activated, it will do 'adata.raw = adata'" checked="false"/>
     <repeat name="modifications" title="Change field names in AnnData observations" min="0">
       <param name="from_obs" type="text" label="Original name" help="Name in observations that you want to change">
@@ -363,6 +410,18 @@
     </test>
     <test>
       <param name="input_obj_file" value="anndata_ops.h5"/>
+      <!-- Runs the tool with the cell metadata merge enabled and checks that
+           the written h5ad exposes obs/cell_type. That column is presumably
+           supplied by the metadata file; confirm against the test data. -->
+      <conditional name="add_cell_metadata">
+        <param name="default" value="true"/>
+        <param name="file" value="test_incomplete_metadata.tsv"/>
+      </conditional>
+      <output name="output_h5ad" ftype="h5ad">
+        <assert_contents>
+          <has_h5_keys keys="obs/cell_type"/>
+        </assert_contents>
+      </output>
+    </test>
+    <test>
+      <param name="input_obj_file" value="anndata_ops.h5"/>
       <repeat name="var_modifications" >
         <param name="from_var" value = "gene_symbols" />
         <param name="to_var" value = "gene_symbols_unique" />