Mercurial > repos > ebi-gxa > decoupler_pseudobulk
changeset 16:508a93e34599 draft
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/ commit 487508282bda9dbb68138d5c7091f46ef54fe52a
| author | ebi-gxa |
|---|---|
| date | Wed, 19 Feb 2025 16:55:58 +0000 |
| parents | 09c833d9b03b |
| children | 2557d7869e78 |
| files | decoupler_pseudobulk.py decoupler_pseudobulk.xml |
| diffstat | 2 files changed, 145 insertions(+), 1 deletions(-) [+] |
line wrap: on
line diff
import numpy as np


def create_pseudo_replicates(adata, sample_key, num_replicates, seed=None):
    """
    Create pseudo replicates for each sample in the sample_key groups.

    The cells of every sample are shuffled and split into
    ``num_replicates`` groups of near-equal size. Each cell's replicate
    label, ``"<sample>_rep<i>"`` with ``i`` starting at 1, is written to a
    new ``adata.obs`` column named ``f"{sample_key}_pseudo"``.

    Every cell of a sample receives a replicate label: when the number of
    cells is not a multiple of ``num_replicates`` the first replicates get
    one extra cell, and a sample with fewer cells than ``num_replicates``
    simply yields fewer replicates.

    Parameters
    ----------
    adata : anndata.AnnData
        The AnnData object. Only ``adata.obs`` is read and modified
        (in place).
    sample_key : str
        The column in adata.obs that defines the samples.
    num_replicates : int
        Number of pseudo replicates to create per sample (must be >= 1).
    seed : int, optional
        Seed for the shuffling. When given, a private
        ``numpy.random.RandomState`` is used so the global NumPy random
        state is left untouched. When ``None`` the global ``numpy.random``
        state is used.

    Returns
    -------
    anndata.AnnData
        The same AnnData object, with the pseudo replicate column added.

    Raises
    ------
    ValueError
        If ``num_replicates`` is smaller than 1.

    Examples
    --------
    >>> import anndata
    >>> import pandas as pd
    >>> import numpy as np
    >>> data = {
    ...     'obs': pd.DataFrame({'sample': ['A', 'A', 'B', 'B']}),
    ...     'X': np.array([[1, 0], [0, 1], [1, 1], [0, 0]])
    ... }
    >>> adata = anndata.AnnData(X=data['X'], obs=data['obs'])
    >>> adata = create_pseudo_replicates(adata, 'sample', 2, seed=0)
    >>> sorted(adata.obs['sample_pseudo'])
    ['A_rep1', 'A_rep2', 'B_rep1', 'B_rep2']
    """
    if num_replicates < 1:
        raise ValueError("num_replicates must be a positive integer")

    # A private generator avoids reseeding NumPy's global state as a side
    # effect; without a seed fall back to the global state.
    rng = np.random if seed is None else np.random.RandomState(seed)

    new_sample_key = f"{sample_key}_pseudo"
    samples = adata.obs[sample_key]
    # Work on row positions rather than index labels, so duplicated obs
    # index labels cannot cause a cell to be labelled more than once.
    pseudo_labels = samples.astype(str).to_numpy(dtype=object)

    for sample in samples.unique():
        # A missing value never compares equal, so such cells are not
        # selected here and keep their plain string label.
        is_sample = (samples == sample).to_numpy(dtype=bool, na_value=False)
        positions = np.flatnonzero(is_sample)
        rng.shuffle(positions)  # Shuffle to randomize the assignment
        # array_split spreads the remainder over the first chunks instead
        # of leaving leftover cells without a replicate.
        chunks = np.array_split(positions, num_replicates)
        for rep_number, chunk in enumerate(chunks, start=1):
            if chunk.size == 0:
                continue
            pseudo_labels[chunk] = [
                f"{label}_rep{rep_number}" for label in pseudo_labels[chunk]
            ]

    adata.obs[new_sample_key] = pseudo_labels
    return adata
factor_fields = args.factor_fields.split(",") check_fields(factor_fields, adata) + # Create pseudo replicates if specified + if args.num_pseudo_replicates: + adata = create_pseudo_replicates( + adata, args.sample_key, args.num_pseudo_replicates, seed=args.seed + ) + args.sample_key = f"{args.sample_key}_pseudo" + print(f"Using mode: {args.mode}") # Perform pseudobulk analysis pseudobulk_data = get_pseudobulk( @@ -664,6 +726,19 @@ help="Minimum total count threshold for filtering by expression", ) parser.add_argument( + "--num_pseudo_replicates", + type=int, + choices=range(3, 1000), + help="Number of pseudo replicates to create per sample (at least 3)", + required=False + ) + parser.add_argument( + "--seed", + type=int, + default=None, + help="Random seed for pseudo replicate sampling", + ) + parser.add_argument( "--anndata_output_path", type=str, help="Path to save the filtered AnnData object or pseudobulk data",
--- a/decoupler_pseudobulk.xml Fri Nov 29 11:34:16 2024 +0000 +++ b/decoupler_pseudobulk.xml Wed Feb 19 16:55:58 2025 +0000 @@ -1,4 +1,4 @@ -<tool id="decoupler_pseudobulk" name="Decoupler pseudo-bulk" version="1.4.0+galaxy8" profile="20.05"> +<tool id="decoupler_pseudobulk" name="Decoupler pseudo-bulk" version="1.4.0+galaxy9" profile="20.05"> <description>aggregates single cell RNA-seq data for running bulk RNA-seq methods</description> <requirements> <requirement type="package" version="1.4.0">decoupler</requirement> @@ -47,6 +47,12 @@ --contrasts_file '$filter_per_contrast.contrasts_file' --min_gene_exp_perc_per_cell '$filter_per_contrast.min_cells_perc_per_contrast_cond' #end if + #if $num_pseudo_replicates: + --num_pseudo_replicates $num_pseudo_replicates + #if $seed: + --seed '$seed' + #end if + #end if --deseq2_output_path deseq_output_dir --plot_samples_figsize $plot_samples_figsize --plot_filtering_figsize $plot_filtering_figsize @@ -89,6 +95,8 @@ <param type="boolean" name="filter_expr" label="Enable Filtering by Expression"/> <param type="text" name="plot_samples_figsize" label="Plot Samples Figsize" value="10 10" help="X and Y sizes in points separated by a space"/> <param type="text" name="plot_filtering_figsize" label="Plot Filtering Figsize" value="10 10" help="X and Y sizes in points separated by a space"/> + <param type="integer" name="num_pseudo_replicates" label="Number of Pseudo Replicates" optional="true" help="If set, create this number of pseudo replicates to create per sample (at least 3). If not set, there is an expectation that samples and groups are distributed in a way that (pseudo) replicates exists." 
min="3" max="1000"/> + <param type="integer" name="seed" label="Seed" optional="true" help="Seed for the random number generator used for sampling the pseudo replicates (only used if Number of Pseudo replicates set)."/> </inputs> <outputs> <data name="pbulk_anndata" format="h5ad" label="${tool.name} on ${on_string}: Pseudo-bulk AnnData"> @@ -229,6 +237,66 @@ </assert_contents> </output> </test> + <test expect_num_outputs="8"> + <param name="input_file" value="mito_counted_anndata.h5ad"/> + <param name="filter" value="yes"/> + <param name="contrasts_file" value="test_contrasts.txt" ftype="txt"/> + <param name="min_cells_perc_per_contrast_cond" value="25"/> + <param name="adata_obs_fields_to_merge" value="batch,sex:batch,genotype"/> + <param name="groupby" value="batch_sex"/> + <param name="sample_key" value="genotype"/> + <param name="factor_fields" value="genotype,batch_sex"/> + <param name="mode" value="sum"/> + <param name="min_cells" value="10"/> + <param name="produce_plots" value="true"/> + <param name="produce_anndata" value="true"/> + <param name="min_counts" value="10"/> + <param name="min_counts_per_sample" value="50"/> + <param name="min_total_counts" value="1000"/> + <param name="filter_expr" value="true"/> + <param name="plot_samples_figsize" value="10 10"/> + <param name="plot_filtering_figsize" value="10 10"/> + <param name="num_pseudo_replicates" value="3"/> + <param name="seed" value="42"/> + <output name="pbulk_anndata" ftype="h5ad"> + <assert_contents> + <has_h5_keys keys="obs/psbulk_n_cells"/> + </assert_contents> + </output> + <output name="count_matrix" ftype="tabular"> + <assert_contents> + <has_n_lines n="3620"/> + <has_n_columns n="22"/> + </assert_contents> + </output> + <output name="samples_metadata" ftype="tabular"> + <assert_contents> + <has_n_lines n="22"/> + <has_n_columns n="3"/> + </assert_contents> + </output> + <output name="genes_metadata" ftype="tabular"> + <assert_contents> + <has_n_lines n="3620"/> + <has_n_columns n="13"/> + 
</assert_contents> + </output> + <output name="plot_output" ftype="png"> + <assert_contents> + <has_size value="34626" delta="6000"/> + </assert_contents> + </output> + <output name="filter_by_expr_plot" ftype="png"> + <assert_contents> + <has_size value="21656" delta="2000"/> + </assert_contents> + </output> + <output name="genes_ignore_per_contrast" ftype="tabular"> + <assert_contents> + <has_n_lines n="35478"/> + </assert_contents> + </output> + </test> </tests> <help> <