comparison decoupler_aucell_score.py @ 1:e9b06a8fb73a draft

planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/ commit 11fb36a94b8262ef8e78f1c6dd46c4146eb59341
author ebi-gxa
date Mon, 15 Apr 2024 13:20:27 +0000
parents 77d680b36e23
children 82b7cd3e1bbd
comparison
equal deleted inserted replaced
0:77d680b36e23 1:e9b06a8fb73a
51 # Convert the list of dictionaries to a DataFrame 51 # Convert the list of dictionaries to a DataFrame
52 return pd.DataFrame(gene_sets) 52 return pd.DataFrame(gene_sets)
53 53
54 54
55 def score_genes_aucell( 55 def score_genes_aucell(
56 adata: anndata.AnnData, gene_list: list, score_name: str, use_raw=False 56 adata: anndata.AnnData, gene_list: list, score_name: str, use_raw=False, min_n_genes=5
57 ): 57 ):
58 """Score genes using Aucell. 58 """Score genes using Aucell.
59 59
60 Parameters 60 Parameters
61 ---------- 61 ----------
78 "gene_id": gene_list, 78 "gene_id": gene_list,
79 "geneset": score_name, 79 "geneset": score_name,
80 } 80 }
81 ) 81 )
82 # run decoupler's run_aucell 82 # run decoupler's run_aucell
83 dc.run_aucell( 83 # catch the value error
84 adata, net=geneset_df, source="geneset", target="gene_id", use_raw=use_raw 84 try:
85 ) 85 dc.run_aucell(
86 # copy .obsm['aucell_estimate'] matrix columns to adata.obs using the column names 86 adata, net=geneset_df, source="geneset", target="gene_id", use_raw=use_raw
87 adata.obs[score_name] = adata.obsm["aucell_estimate"][score_name] 87 )
88 # copy .obsm['aucell_estimate'] matrix columns to adata.obs using the column names
89 adata.obs[score_name] = adata.obsm["aucell_estimate"][score_name]
90 except ValueError as ve:
91 print(f"Gene list {score_name} failed, skipping: {str(ve)}")
88 92
89 93
90 def run_for_genelists( 94 def run_for_genelists(
91 adata, gene_lists, score_names, use_raw=False, gene_symbols_field="gene_symbols" 95 adata, gene_lists, score_names, use_raw=False, gene_symbols_field="gene_symbols", min_n_genes=5
92 ): 96 ):
93 if len(gene_lists) == len(score_names): 97 if len(gene_lists) == len(score_names):
94 for gene_list, score_names in zip(gene_lists, score_names): 98 for gene_list, score_names in zip(gene_lists, score_names):
95 genes = gene_list.split(",") 99 genes = gene_list.split(",")
96 ens_gene_ids = adata.var[adata.var[gene_symbols_field].isin(genes)].index 100 ens_gene_ids = adata.var[adata.var[gene_symbols_field].isin(genes)].index
97 score_genes_aucell( 101 score_genes_aucell(
98 adata, 102 adata,
99 ens_gene_ids, 103 ens_gene_ids,
100 f"AUCell_{score_names}", 104 f"AUCell_{score_names}",
101 use_raw, 105 use_raw,
106 min_n_genes
102 ) 107 )
103 else: 108 else:
104 raise ValueError( 109 raise ValueError(
105 "The number of gene lists (separated by :) and score names (separated by :) must be the same" 110 "The number of gene lists (separated by :) and score names (separated by :) must be the same"
106 ) 111 )
141 "--gene_symbols_field", 146 "--gene_symbols_field",
142 type=str, 147 type=str,
143 help="Name of the gene symbols field in the AnnData object", 148 help="Name of the gene symbols field in the AnnData object",
144 required=True, 149 required=True,
145 ) 150 )
151 # argument for min_n Minimum of targets per source. If less, sources are removed.
152 parser.add_argument(
153 "--min_n",
154 type=int,
155 required=False,
156 default=5,
157 help="Minimum of targets per source. If less, sources are removed.",
158 )
146 parser.add_argument("--use_raw", action="store_true", help="Use raw data") 159 parser.add_argument("--use_raw", action="store_true", help="Use raw data")
147 parser.add_argument( 160 parser.add_argument(
148 "--write_anndata", action="store_true", help="Write the modified AnnData object" 161 "--write_anndata", action="store_true", help="Write the modified AnnData object"
149 ) 162 )
150 163
156 169
157 if args.gmt_file is not None: 170 if args.gmt_file is not None:
158 # Load MSigDB file in GMT format 171 # Load MSigDB file in GMT format
159 msigdb = read_gmt(args.gmt_file) 172 msigdb = read_gmt(args.gmt_file)
160 173
161 gene_sets_to_score = args.gene_sets_to_score.split(",") if args.gene_sets_to_score else [] 174 gene_sets_to_score = (
175 args.gene_sets_to_score.split(",") if args.gene_sets_to_score else []
176 )
162 # Score genes by their ensembl ids using the score_genes_aucell function 177 # Score genes by their ensembl ids using the score_genes_aucell function
163 for _, row in msigdb.iterrows(): 178 for _, row in msigdb.iterrows():
164 gene_set_name = row["gene_set_name"] 179 gene_set_name = row["gene_set_name"]
165 if not gene_sets_to_score or gene_set_name in gene_sets_to_score: 180 if not gene_sets_to_score or gene_set_name in gene_sets_to_score:
166 genes = row["genes"].split(",") 181 genes = row["genes"].split(",")
167 # Convert gene symbols to ensembl ids by using the columns gene_symbols and index in adata.var specific to the gene set 182 # Convert gene symbols to ensembl ids by using the columns gene_symbols and index in adata.var specific to the gene set
168 ens_gene_ids = adata.var[ 183 ens_gene_ids = adata.var[
169 adata.var[args.gene_symbols_field].isin(genes) 184 adata.var[args.gene_symbols_field].isin(genes)
170 ].index 185 ].index
171 score_genes_aucell( 186 score_genes_aucell(
172 adata, ens_gene_ids, f"AUCell_{gene_set_name}", args.use_raw 187 adata, ens_gene_ids, f"AUCell_{gene_set_name}", args.use_raw, args.min_n
173 ) 188 )
174 elif args.gene_lists_to_score is not None and args.score_names is not None: 189 elif args.gene_lists_to_score is not None and args.score_names is not None:
175 gene_lists = args.gene_lists_to_score.split(":") 190 gene_lists = args.gene_lists_to_score.split(":")
176 score_names = args.score_names.split(",") 191 score_names = args.score_names.split(",")
177 run_for_genelists( 192 run_for_genelists(
178 adata, gene_lists, score_names, args.use_raw, args.gene_symbols_field 193 adata, gene_lists, score_names, args.use_raw, args.gene_symbols_field, args.min_n
179 ) 194 )
180 195
181 # Save the modified AnnData object or generate a file with cells as rows and the new score_names columns 196 # Save the modified AnnData object or generate a file with cells as rows and the new score_names columns
182 if args.write_anndata: 197 if args.write_anndata:
183 adata.write_h5ad(args.output_file) 198 adata.write_h5ad(args.output_file)