Mercurial > repos > ebi-gxa > decoupler_pathway_inference
diff decoupler_pseudobulk.py @ 7:2c5686d627c0 draft
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/ commit 1efa285536ea940b459fd07f452a6eeb0cf0ffb9
author | ebi-gxa |
---|---|
date | Sun, 27 Oct 2024 20:39:33 +0000 |
parents | 1d140c4a5875 |
children |
line wrap: on
line diff
```diff
--- a/decoupler_pseudobulk.py Fri Oct 25 15:12:19 2024 +0000
+++ b/decoupler_pseudobulk.py Sun Oct 27 20:39:33 2024 +0000
@@ -462,7 +462,8 @@
     >>> import os
     >>> from io import StringIO
     >>> contrast_file = StringIO(f"contrast{os.linesep}condition1-\
-condition2{os.linesep}")
+condition2{os.linesep}\
+2*(condition1)-condition2{os.linesep}")
     >>> min_perc_cells_expression = 30.0
     >>> data = {
     ... 'obs': pd.DataFrame({'condition': ['condition1', 'condition1',
@@ -476,9 +477,11 @@
     ... ) # doctest:+ELLIPSIS
     Identifying genes to filter using ...
     >>> df.head() # doctest:+ELLIPSIS
-    contrast gene
-    0 condition1-condition2 ...
-    1 condition1-condition2 ...
+    contrast gene
+    0 condition1-condition2...
+    1 condition1-condition2...
+    2 2*(condition1)-condition2...
+    3 2*(condition1)-condition2...
     """
     import re
 
@@ -495,22 +498,38 @@
     genes_filter_for_contrast = dict()
     for contrast in contrasts.iloc[:, 0]:
         conditions = set(sides_regex.split(contrast))
+
+        selected_conditions = []
+        failed_conditions = []
+        for condition in conditions:
+            # remove any starting or trailing whitespaces from condition
+            condition = condition.strip()
+            if len(condition) == 0:
+                continue
+            # check if the condition is simply a number, then skip it
+            if condition.isnumeric():
+                continue
+            if condition not in adata.obs[obs_field].unique():
+                # add condition to failed_conditions
+                failed_conditions.append(condition)
+                continue
+            # append to selected_conditions
+            selected_conditions.append(condition)
+
+        if len(failed_conditions) > 0:
+            raise ValueError(
+                f"Condition(s) '{failed_conditions}' "
+                f"from contrast {contrast} "
+                f"is/are not present in the "
+                f"obs_field '{obs_field}' from the AnnData object."
+                f"Possible values are: "
+                f"{', '.join(adata.obs[obs_field].unique())}.")
         # we want to find the genes that are below the threshold
         # of % of cells expressed for ALL the conditions in the
         # contrast. It is enough for one of the conditions
         # of the contrast to have the genes expressed above
         # the threshold of % of cells to be of interest.
-        for condition in conditions:
-            # remove any starting or trailing whitespaces from condition
-            condition = condition.strip()
-            if condition not in adata.obs[obs_field].unique():
-                raise ValueError(
-                    f"Condition '{condition}' from contrast {contrast}"
-                    f" is not present in the "
-                    f"obs_field '{obs_field}' from the AnnData object."
-                    f"Possible values are: "
-                    f"{', '.join(adata.obs[obs_field].unique())}."
-                )
+        for condition in selected_conditions:
             # check the percentage of cells that express each gene
             # Filter the AnnData object based on the obs_field value
             adata_filtered = adata[adata.obs[obs_field] == condition]
```