comparison decoupler_pseudobulk.py @ 7:2c5686d627c0 draft

planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/ commit 1efa285536ea940b459fd07f452a6eeb0cf0ffb9
author ebi-gxa
date Sun, 27 Oct 2024 20:39:33 +0000
parents 1d140c4a5875
children
comparison
equal deleted inserted replaced
6:1d140c4a5875 7:2c5686d627c0
460 >>> import pandas as pd 460 >>> import pandas as pd
461 >>> import numpy as np 461 >>> import numpy as np
462 >>> import os 462 >>> import os
463 >>> from io import StringIO 463 >>> from io import StringIO
464 >>> contrast_file = StringIO(f"contrast{os.linesep}condition1-\ 464 >>> contrast_file = StringIO(f"contrast{os.linesep}condition1-\
465 condition2{os.linesep}") 465 condition2{os.linesep}\
466 2*(condition1)-condition2{os.linesep}")
466 >>> min_perc_cells_expression = 30.0 467 >>> min_perc_cells_expression = 30.0
467 >>> data = { 468 >>> data = {
468 ... 'obs': pd.DataFrame({'condition': ['condition1', 'condition1', 469 ... 'obs': pd.DataFrame({'condition': ['condition1', 'condition1',
469 ... 'condition2', 'condition2']}), 470 ... 'condition2', 'condition2']}),
470 ... 'X': np.array([[1, 0, 0, 0, 0], [0, 0, 2, 2, 0], 471 ... 'X': np.array([[1, 0, 0, 0, 0], [0, 0, 2, 2, 0],
474 >>> df = identify_genes_to_filter_per_contrast( 475 >>> df = identify_genes_to_filter_per_contrast(
475 ... contrast_file, min_perc_cells_expression, adata, 'condition' 476 ... contrast_file, min_perc_cells_expression, adata, 'condition'
476 ... ) # doctest:+ELLIPSIS 477 ... ) # doctest:+ELLIPSIS
477 Identifying genes to filter using ... 478 Identifying genes to filter using ...
478 >>> df.head() # doctest:+ELLIPSIS 479 >>> df.head() # doctest:+ELLIPSIS
479 contrast gene 480 contrast gene
480 0 condition1-condition2 ... 481 0 condition1-condition2...
481 1 condition1-condition2 ... 482 1 condition1-condition2...
483 2 2*(condition1)-condition2...
484 3 2*(condition1)-condition2...
482 """ 485 """
483 import re 486 import re
484 487
485 # Implement the logic to identify genes to filter per contrast 488 # Implement the logic to identify genes to filter per contrast
486 # This is a placeholder implementation 489 # This is a placeholder implementation
493 contrasts = pd.read_csv(contrast_file, sep="\t") 496 contrasts = pd.read_csv(contrast_file, sep="\t")
494 # Iterate over each line in the contrast file 497 # Iterate over each line in the contrast file
495 genes_filter_for_contrast = dict() 498 genes_filter_for_contrast = dict()
496 for contrast in contrasts.iloc[:, 0]: 499 for contrast in contrasts.iloc[:, 0]:
497 conditions = set(sides_regex.split(contrast)) 500 conditions = set(sides_regex.split(contrast))
501
502 selected_conditions = []
503 failed_conditions = []
504 for condition in conditions:
505 # remove any starting or trailing whitespaces from condition
506 condition = condition.strip()
507 if len(condition) == 0:
508 continue
509 # check if the condition is simply a number, then skip it
510 if condition.isnumeric():
511 continue
512 if condition not in adata.obs[obs_field].unique():
513 # add condition to failed_conditions
514 failed_conditions.append(condition)
515 continue
516 # append to selected_conditions
517 selected_conditions.append(condition)
518
519 if len(failed_conditions) > 0:
520 raise ValueError(
521 f"Condition(s) '{failed_conditions}' "
522 f"from contrast {contrast} "
523 f"is/are not present in the "
524 f"obs_field '{obs_field}' from the AnnData object."
525 f"Possible values are: "
526 f"{', '.join(adata.obs[obs_field].unique())}.")
498 # we want to find the genes that are below the threshold 527 # we want to find the genes that are below the threshold
499 # of % of cells expressed for ALL the conditions in the 528 # of % of cells expressed for ALL the conditions in the
500 # contrast. It is enough for one of the conditions 529 # contrast. It is enough for one of the conditions
501 # of the contrast to have the genes expressed above 530 # of the contrast to have the genes expressed above
502 # the threshold of % of cells to be of interest. 531 # the threshold of % of cells to be of interest.
503 for condition in conditions: 532 for condition in selected_conditions:
504 # remove any starting or trailing whitespaces from condition
505 condition = condition.strip()
506 if condition not in adata.obs[obs_field].unique():
507 raise ValueError(
508 f"Condition '{condition}' from contrast {contrast}"
509 f" is not present in the "
510 f"obs_field '{obs_field}' from the AnnData object."
511 f"Possible values are: "
512 f"{', '.join(adata.obs[obs_field].unique())}."
513 )
514 # check the percentage of cells that express each gene 533 # check the percentage of cells that express each gene
515 # Filter the AnnData object based on the obs_field value 534 # Filter the AnnData object based on the obs_field value
516 adata_filtered = adata[adata.obs[obs_field] == condition] 535 adata_filtered = adata[adata.obs[obs_field] == condition]
517 # Calculate the percentage of cells expressing each gene 536 # Calculate the percentage of cells expressing each gene
518 gene_expression = (adata_filtered.X > 0).mean(axis=0) * 100 537 gene_expression = (adata_filtered.X > 0).mean(axis=0) * 100