comparison decoupler_pseudobulk.py @ 5:87f1eaa410cc draft default tip

planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/ commit dea8a066ccf04e241457719bf5162f9d39fe6c48
author ebi-gxa
date Wed, 02 Oct 2024 08:27:06 +0000
parents 6c30272fb587
children
comparison
equal deleted inserted replaced
4:6c30272fb587 5:87f1eaa410cc
296 if args.adata_obs_fields_to_merge: 296 if args.adata_obs_fields_to_merge:
297 # first split potential groups by ":" and iterate over them 297 # first split potential groups by ":" and iterate over them
298 for group in args.adata_obs_fields_to_merge.split(":"): 298 for group in args.adata_obs_fields_to_merge.split(":"):
299 fields = group.split(",") 299 fields = group.split(",")
300 check_fields(fields, adata) 300 check_fields(fields, adata)
301 adata = merge_adata_obs_fields(fields, adata) 301 merge_adata_obs_fields(fields, adata)
302 302
303 check_fields([args.groupby, args.sample_key], adata) 303 check_fields([args.groupby, args.sample_key], adata)
304 304
305 factor_fields = None 305 factor_fields = None
306 if args.factor_fields: 306 if args.factor_fields:
320 min_counts=args.min_counts, 320 min_counts=args.min_counts,
321 ) 321 )
322 322
323 print("Created pseudo-bulk AnnData, checking if fields still make sense.") 323 print("Created pseudo-bulk AnnData, checking if fields still make sense.")
324 print( 324 print(
325 "If this fails this check, it might mean that you asked for factors " 325 "If this fails this check, it might mean that you asked for factors \
326 + "that are not compatible with you sample identifiers (ie. asked for " 326 that are not compatible with you sample identifiers (ie. asked for \
327 + "phase in the factors, but each sample contains more than one phase, " 327 phase in the factors, but each sample contains more than one phase,\
328 + "try joining fields)" 328 try joining fields)."
329 ) 329 )
330 if factor_fields: 330 if factor_fields:
331 check_fields( 331 check_fields(
332 factor_fields, 332 factor_fields,
333 pseudobulk_data, 333 pseudobulk_data,
372 output_dir=args.deseq2_output_path, 372 output_dir=args.deseq2_output_path,
373 factor_fields=factor_fields, 373 factor_fields=factor_fields,
374 min_counts_per_sample_marking=args.min_counts_per_sample_marking, 374 min_counts_per_sample_marking=args.min_counts_per_sample_marking,
375 ) 375 )
376 376
377 # if contrasts file is provided, produce a file with genes that should be
378 # filtered for each contrasts
379 if args.contrasts_file:
380 contrast_genes_df = identify_genes_to_filter_per_contrast(
381 contrast_file=args.contrasts_file,
382 min_perc_cells_expression=args.min_gene_exp_perc_per_cell,
383 adata=adata,
384 obs_field=args.groupby
385 )
386 contrast_genes_df.to_csv(
387 f"{args.save_path}/genes_to_filter_by_contrast.tsv",
388 sep="\t",
389 index=False,
390 )
391
377 392
378 def merge_adata_obs_fields(obs_fields_to_merge, adata): 393 def merge_adata_obs_fields(obs_fields_to_merge, adata):
379 """ 394 """
380 Merge adata.obs fields specified in args.adata_obs_fields_to_merge 395 Merge adata.obs fields specified in args.adata_obs_fields_to_merge
381 396
392 The merged AnnData object 407 The merged AnnData object
393 408
394 docstring tests: 409 docstring tests:
395 >>> import scanpy as sc 410 >>> import scanpy as sc
396 >>> ad = sc.datasets.pbmc68k_reduced() 411 >>> ad = sc.datasets.pbmc68k_reduced()
397 >>> ad = merge_adata_obs_fields(["bulk_labels","louvain"], ad) 412 >>> merge_adata_obs_fields(["bulk_labels","louvain"], ad)
398 >>> ad.obs.columns 413 >>> ad.obs.columns
399 Index(['bulk_labels', 'n_genes', 'percent_mito', 'n_counts', 'S_score', 414 Index(['bulk_labels', 'n_genes', 'percent_mito', 'n_counts', 'S_score',
400 'G2M_score', 'phase', 'louvain', 'bulk_labels_louvain'], 415 'G2M_score', 'phase', 'louvain', 'bulk_labels_louvain'],
401 dtype='object') 416 dtype='object')
402 """ 417 """
410 adata.obs[field_name] = adata.obs[field].astype(str) 425 adata.obs[field_name] = adata.obs[field].astype(str)
411 else: 426 else:
412 adata.obs[field_name] = ( 427 adata.obs[field_name] = (
413 adata.obs[field_name] + "_" + adata.obs[field].astype(str) 428 adata.obs[field_name] + "_" + adata.obs[field].astype(str)
414 ) 429 )
415 return adata 430
431
432 def identify_genes_to_filter_per_contrast(
433 contrast_file, min_perc_cells_expression, adata, obs_field
434 ):
435 """
436 Identify genes to filter per contrast based on expression percentage.
437 We need those genes to be under the threshold for all conditions
438 in a contrast to be identified for further filtering. If
439 one condition has the gene expressed above the threshold, the gene
440 becomes of interest (it can be highly up or down regulated).
441
442 Parameters
443 ----------
444 contrast_file : str
445 Path to the contrasts file.
446 min_perc_cells_expression : float
447 Minimum percentage of cells that should express a gene.
448 adata: adata
449 Original AnnData file
450 obs_field: str
451 Field in the AnnData observations where the contrasts are defined.
452
453 Returns
454 -------
455 None
456
457 Examples
458 --------
459 >>> import anndata
460 >>> import pandas as pd
461 >>> import numpy as np
462 >>> import os
463 >>> from io import StringIO
464 >>> contrast_file = StringIO(f"contrast{os.linesep}condition1-\
465 condition2{os.linesep}")
466 >>> min_perc_cells_expression = 30.0
467 >>> data = {
468 ... 'obs': pd.DataFrame({'condition': ['condition1', 'condition1',
469 ... 'condition2', 'condition2']}),
470 ... 'X': np.array([[1, 0, 0, 0, 0], [0, 0, 2, 2, 0],
471 ... [0, 0, 1, 1, 0], [0, 0, 0, 2, 0]]),
472 ... }
473 >>> adata = anndata.AnnData(X=data['X'], obs=data['obs'])
474 >>> df = identify_genes_to_filter_per_contrast(
475 ... contrast_file, min_perc_cells_expression, adata, 'condition'
476 ... ) # doctest:+ELLIPSIS
477 Identifying genes to filter using ...
478 >>> df.head() # doctest:+ELLIPSIS
479 contrast gene
480 0 condition1-condition2 ...
481 1 condition1-condition2 ...
482 """
483 import re
484
485 # Implement the logic to identify genes to filter per contrast
486 # This is a placeholder implementation
487 print(
488 f"Identifying genes to filter using {contrast_file} "
489 f"with min expression {min_perc_cells_expression}%"
490 )
491 sides_regex = re.compile(r"[\+\-\*\/\(\)\^]+")
492
493 contrasts = pd.read_csv(contrast_file, sep="\t")
494 # Iterate over each line in the contrast file
495 genes_filter_for_contrast = dict()
496 for contrast in contrasts.iloc[:, 0]:
497 conditions = set(sides_regex.split(contrast))
498 # we want to find the genes that are below the threshold
499 # of % of cells expressed for ALL the conditions in the
500 # contrast. It is enough for one of the conditions
501 # of the contrast to have the genes expressed above
502 # the threshold of % of cells to be of interest.
503 for condition in conditions:
504 # remove any starting or trailing whitespaces from condition
505 condition = condition.strip()
506 # check the percentage of cells that express each gene
507 # Filter the AnnData object based on the obs_field value
508 adata_filtered = adata[adata.obs[obs_field] == condition]
509 # Calculate the percentage of cells expressing each gene
510 gene_expression = (adata_filtered.X > 0).mean(axis=0) * 100
511 genes_to_filter = set(adata_filtered.var[
512 gene_expression.transpose() < min_perc_cells_expression
513 ].index.tolist())
514 # Update the genes_filter_for_contrast dictionary
515 if contrast in genes_filter_for_contrast.keys():
516 genes_filter_for_contrast[contrast].intersection_update(
517 genes_to_filter
518 )
519 else:
520 genes_filter_for_contrast[contrast] = genes_to_filter
521
522 # write the genes_filter_for_contrast to pandas dataframe of two columns:
523 # contrast and gene
524
525 # Initialize an empty list to store the expanded pairs
526 expanded_pairs = []
527
528 # Iterate over the dictionary
529 for contrast, genes in genes_filter_for_contrast.items():
530 for gene in genes:
531 expanded_pairs.append((contrast, gene))
532
533 # Create the DataFrame
534 contrast_genes_df = pd.DataFrame(
535 expanded_pairs, columns=["contrast", "gene"]
536 )
537
538 return contrast_genes_df
416 539
417 540
418 if __name__ == "__main__": 541 if __name__ == "__main__":
419 # Create argument parser 542 # Create argument parser
420 parser = argparse.ArgumentParser( 543 parser = argparse.ArgumentParser(
472 "--min_cells", 595 "--min_cells",
473 type=int, 596 type=int,
474 default=10, 597 default=10,
475 help="Minimum number of cells for pseudobulk analysis", 598 help="Minimum number of cells for pseudobulk analysis",
476 ) 599 )
600 # add argument for min percentage of cells that should express a gene
601 parser.add_argument(
602 "--min_gene_exp_perc_per_cell",
603 type=float,
604 default=50,
605 help="If all the conditions of one side of a contrast express a \
606 gene in less than this percentage of cells, then the genes \
607 will be added to a list of genes to ignore for that contrast.\
608 Requires the contrast file to be provided.",
609 )
610 parser.add_argument(
611 "--contrasts_file",
612 type=str,
613 required=False,
614 help="Contrasts file, a one column tsv with a header, each line \
615 represents a contrast as a combination of conditions at each \
616 side of a substraction.",
617 )
618
477 parser.add_argument( 619 parser.add_argument(
478 "--save_path", type=str, help="Path to save the plot (optional)" 620 "--save_path", type=str, help="Path to save the plot (optional)"
479 ) 621 )
480 parser.add_argument( 622 parser.add_argument(
481 "--min_counts", 623 "--min_counts",