Mercurial > repos > ebi-gxa > decoupler_pathway_inference
comparison decoupler_pseudobulk.py @ 7:2c5686d627c0 draft
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/ commit 1efa285536ea940b459fd07f452a6eeb0cf0ffb9
author | ebi-gxa |
---|---|
date | Sun, 27 Oct 2024 20:39:33 +0000 |
parents | 1d140c4a5875 |
children |
comparison
legend: equal | deleted | inserted | replaced
6:1d140c4a5875 | 7:2c5686d627c0 |
---|---|
460 >>> import pandas as pd | 460 >>> import pandas as pd |
461 >>> import numpy as np | 461 >>> import numpy as np |
462 >>> import os | 462 >>> import os |
463 >>> from io import StringIO | 463 >>> from io import StringIO |
464 >>> contrast_file = StringIO(f"contrast{os.linesep}condition1-\ | 464 >>> contrast_file = StringIO(f"contrast{os.linesep}condition1-\ |
465 condition2{os.linesep}") | 465 condition2{os.linesep}\ |
466 2*(condition1)-condition2{os.linesep}") | |
466 >>> min_perc_cells_expression = 30.0 | 467 >>> min_perc_cells_expression = 30.0 |
467 >>> data = { | 468 >>> data = { |
468 ... 'obs': pd.DataFrame({'condition': ['condition1', 'condition1', | 469 ... 'obs': pd.DataFrame({'condition': ['condition1', 'condition1', |
469 ... 'condition2', 'condition2']}), | 470 ... 'condition2', 'condition2']}), |
470 ... 'X': np.array([[1, 0, 0, 0, 0], [0, 0, 2, 2, 0], | 471 ... 'X': np.array([[1, 0, 0, 0, 0], [0, 0, 2, 2, 0], |
474 >>> df = identify_genes_to_filter_per_contrast( | 475 >>> df = identify_genes_to_filter_per_contrast( |
475 ... contrast_file, min_perc_cells_expression, adata, 'condition' | 476 ... contrast_file, min_perc_cells_expression, adata, 'condition' |
476 ... ) # doctest:+ELLIPSIS | 477 ... ) # doctest:+ELLIPSIS |
477 Identifying genes to filter using ... | 478 Identifying genes to filter using ... |
478 >>> df.head() # doctest:+ELLIPSIS | 479 >>> df.head() # doctest:+ELLIPSIS |
479 contrast gene | 480 contrast gene |
480 0 condition1-condition2 ... | 481 0 condition1-condition2... |
481 1 condition1-condition2 ... | 482 1 condition1-condition2... |
483 2 2*(condition1)-condition2... | |
484 3 2*(condition1)-condition2... | |
482 """ | 485 """ |
483 import re | 486 import re |
484 | 487 |
485 # Implement the logic to identify genes to filter per contrast | 488 # Implement the logic to identify genes to filter per contrast |
486 # This is a placeholder implementation | 489 # This is a placeholder implementation |
493 contrasts = pd.read_csv(contrast_file, sep="\t") | 496 contrasts = pd.read_csv(contrast_file, sep="\t") |
494 # Iterate over each line in the contrast file | 497 # Iterate over each line in the contrast file |
495 genes_filter_for_contrast = dict() | 498 genes_filter_for_contrast = dict() |
496 for contrast in contrasts.iloc[:, 0]: | 499 for contrast in contrasts.iloc[:, 0]: |
497 conditions = set(sides_regex.split(contrast)) | 500 conditions = set(sides_regex.split(contrast)) |
501 | |
502 selected_conditions = [] | |
503 failed_conditions = [] | |
504 for condition in conditions: | |
505 # remove any starting or trailing whitespaces from condition | |
506 condition = condition.strip() | |
507 if len(condition) == 0: | |
508 continue | |
509 # check if the condition is simply a number, then skip it | |
510 if condition.isnumeric(): | |
511 continue | |
512 if condition not in adata.obs[obs_field].unique(): | |
513 # add condition to failed_conditions | |
514 failed_conditions.append(condition) | |
515 continue | |
516 # append to selected_conditions | |
517 selected_conditions.append(condition) | |
518 | |
519 if len(failed_conditions) > 0: | |
520 raise ValueError( | |
521 f"Condition(s) '{failed_conditions}' " | |
522 f"from contrast {contrast} " | |
523 f"is/are not present in the " | |
524 f"obs_field '{obs_field}' from the AnnData object." | |
525 f"Possible values are: " | |
526 f"{', '.join(adata.obs[obs_field].unique())}.") | |
498 # we want to find the genes that are below the threshold | 527 # we want to find the genes that are below the threshold |
499 # of % of cells expressed for ALL the conditions in the | 528 # of % of cells expressed for ALL the conditions in the |
500 # contrast. It is enough for one of the conditions | 529 # contrast. It is enough for one of the conditions |
501 # of the contrast to have the genes expressed above | 530 # of the contrast to have the genes expressed above |
502 # the threshold of % of cells to be of interest. | 531 # the threshold of % of cells to be of interest. |
503 for condition in conditions: | 532 for condition in selected_conditions: |
504 # remove any starting or trailing whitespaces from condition | |
505 condition = condition.strip() | |
506 if condition not in adata.obs[obs_field].unique(): | |
507 raise ValueError( | |
508 f"Condition '{condition}' from contrast {contrast}" | |
509 f" is not present in the " | |
510 f"obs_field '{obs_field}' from the AnnData object." | |
511 f"Possible values are: " | |
512 f"{', '.join(adata.obs[obs_field].unique())}." | |
513 ) | |
514 # check the percentage of cells that express each gene | 533 # check the percentage of cells that express each gene |
515 # Filter the AnnData object based on the obs_field value | 534 # Filter the AnnData object based on the obs_field value |
516 adata_filtered = adata[adata.obs[obs_field] == condition] | 535 adata_filtered = adata[adata.obs[obs_field] == condition] |
517 # Calculate the percentage of cells expressing each gene | 536 # Calculate the percentage of cells expressing each gene |
518 gene_expression = (adata_filtered.X > 0).mean(axis=0) * 100 | 537 gene_expression = (adata_filtered.X > 0).mean(axis=0) * 100 |