diff decoupler_pseudobulk.py @ 7:2c5686d627c0 draft

planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/ commit 1efa285536ea940b459fd07f452a6eeb0cf0ffb9
author ebi-gxa
date Sun, 27 Oct 2024 20:39:33 +0000
parents 1d140c4a5875
children
line wrap: on
line diff
--- a/decoupler_pseudobulk.py	Fri Oct 25 15:12:19 2024 +0000
+++ b/decoupler_pseudobulk.py	Sun Oct 27 20:39:33 2024 +0000
@@ -462,7 +462,8 @@
     >>> import os
     >>> from io import StringIO
     >>> contrast_file = StringIO(f"contrast{os.linesep}condition1-\
-condition2{os.linesep}")
+condition2{os.linesep}\
+2*(condition1)-condition2{os.linesep}")
     >>> min_perc_cells_expression = 30.0
     >>> data = {
     ...     'obs': pd.DataFrame({'condition': ['condition1', 'condition1',
@@ -476,9 +477,11 @@
     ... ) # doctest:+ELLIPSIS
     Identifying genes to filter using ...
     >>> df.head() # doctest:+ELLIPSIS
-                    contrast gene
-    0  condition1-condition2    ...
-    1  condition1-condition2    ...
+                        contrast gene
+    0      condition1-condition2...
+    1      condition1-condition2...
+    2  2*(condition1)-condition2...
+    3  2*(condition1)-condition2...
     """
     import re
 
@@ -495,22 +498,38 @@
     genes_filter_for_contrast = dict()
     for contrast in contrasts.iloc[:, 0]:
         conditions = set(sides_regex.split(contrast))
+
+        selected_conditions = []
+        failed_conditions = []
+        for condition in conditions:
+            # remove any starting or trailing whitespaces from condition
+            condition = condition.strip()
+            if len(condition) == 0:
+                continue
+            # check if the condition is simply a number, then skip it
+            if condition.isnumeric():
+                continue
+            if condition not in adata.obs[obs_field].unique():
+                # add condition to failed_conditions
+                failed_conditions.append(condition)
+                continue
+            # append to selected_conditions
+            selected_conditions.append(condition)
+
+        if len(failed_conditions) > 0:
+            raise ValueError(
+                f"Condition(s) '{failed_conditions}' "
+                f"from contrast {contrast} "
+                f"is/are not present in the "
+                f"obs_field '{obs_field}' from the AnnData object."
+                f"Possible values are: "
+                f"{', '.join(adata.obs[obs_field].unique())}.")
         # we want to find the genes that are below the threshold
         # of % of cells expressed for ALL the conditions in the
         # contrast. It is enough for one of the conditions
         # of the contrast to have the genes expressed above
         # the threshold of % of cells to be of interest.
-        for condition in conditions:
-            # remove any starting or trailing whitespaces from condition
-            condition = condition.strip()
-            if condition not in adata.obs[obs_field].unique():
-                raise ValueError(
-                    f"Condition '{condition}' from contrast {contrast}"
-                    f" is not present in the "
-                    f"obs_field '{obs_field}' from the AnnData object."
-                    f"Possible values are: "
-                    f"{', '.join(adata.obs[obs_field].unique())}."
-                )
+        for condition in selected_conditions:
             # check the percentage of cells that express each gene
             # Filter the AnnData object based on the obs_field value
             adata_filtered = adata[adata.obs[obs_field] == condition]