annotate COBRAxy/src/ras_generator.py @ 539:2fb97466e404 draft

Uploaded
author francesco_lapi
date Sat, 25 Oct 2025 14:55:13 +0000
parents
children fcdbc81feb45
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
539
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
1 """
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
2 Generate Reaction Activity Scores (RAS) from a gene expression dataset and GPR rules.
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
3
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
4 The script reads a tabular dataset (genes x samples) and a rules file (GPRs),
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
5 computes RAS per reaction for each sample/cell line, and writes a tabular output.
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
6 """
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
7 from __future__ import division
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
8 import sys
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
9 import argparse
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
10 import pandas as pd
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
11 import numpy as np
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
12 import utils.general_utils as utils
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
13 from typing import List, Dict
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
14 import ast
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
15
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
16 # Optional imports for AnnData mode (not used in ras_generator.py)
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
17 try:
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
18 from progressbar import ProgressBar, Bar, Percentage
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
19 from scanpy import AnnData
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
20 from cobra.flux_analysis.variability import find_essential_reactions, find_essential_genes
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
21 except ImportError:
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
22 # These are only needed for AnnData mode, not for ras_generator.py
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
23 pass
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
24
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
25 ERRORS = []
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
26 ########################## argparse ##########################################
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
27 ARGS :argparse.Namespace
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
def process_args(args:List[str] = None) -> argparse.Namespace:
    """
    Build the command-line parser for the RAS generator and parse ``args``.

    Args:
        args (list): List of command-line arguments. When None, argparse
            reads them from ``sys.argv[1:]``.

    Returns:
        Namespace: An object containing parsed arguments (model_upload,
        model_upload_name, none, tool_dir, out_log, input, ras_output).
    """
    parser = argparse.ArgumentParser(
        usage = '%(prog)s [options]',
        # The previous description belonged to a different tool; this one
        # mirrors the module docstring so --help describes this script.
        description = "generate Reaction Activity Scores (RAS) from a gene expression dataset and GPR rules.")

    parser.add_argument("-rl", "--model_upload", type = str,
        help = "path to input file containing the rules")

    parser.add_argument("-rn", "--model_upload_name", type = str, help = "custom rules name")
    # Galaxy converts files into .dat, this helps infer the original extension when needed.

    parser.add_argument(
        '-n', '--none',
        type = utils.Bool("none"), default = True,
        help = 'compute Nan values')

    parser.add_argument(
        '-td', '--tool_dir',
        type = str,
        required = True, help = 'your tool directory')

    parser.add_argument(
        '-ol', '--out_log',
        type = str,
        help = "Output log")

    parser.add_argument(
        '-in', '--input',
        type = str,
        help = 'input dataset')

    parser.add_argument(
        '-ra', '--ras_output',
        type = str,
        required = True, help = 'ras output')

    return parser.parse_args(args)
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
75
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
76 ############################ dataset input ####################################
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
def read_dataset(data :str, name :str) -> pd.DataFrame:
    """
    Read a tab-separated dataset and return it as a float pandas DataFrame.

    The first row is used as the header and the first column as the index;
    every remaining cell is converted to float.

    Args:
        data (str): Path to the tab-separated file containing the dataset.
        name (str): Name of the dataset, used in error messages.

    Returns:
        pandas.DataFrame: DataFrame containing the dataset, indexed by the
        file's first column.

    Raises:
        SystemExit: If the file is empty, cannot be parsed, holds a value
            that cannot be converted to float, or has fewer than two data
            columns, the execution is aborted.
    """
    abort_message = 'Execution aborted: wrong file format of ' + name + '\n'
    try:
        dataset = pd.read_csv(data, sep = '\t', header = 0, engine='python', index_col=0)
        dataset = dataset.astype(float)
    except (pd.errors.EmptyDataError, pd.errors.ParserError, ValueError):
        # EmptyDataError: nothing to parse; ParserError: malformed rows;
        # ValueError: a cell that astype(float) cannot convert. All of them
        # mean "wrong file format", so they share the same abort path.
        sys.exit(abort_message)
    # index_col=0 keeps the first column out of dataset.columns, so this
    # counts data columns only.
    # NOTE(review): this rejects a dataset with a single sample column — confirm that is intended.
    if len(dataset.columns) < 2:
        sys.exit(abort_message)
    return dataset
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
100
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
101
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
def load_custom_rules() -> Dict[str,str]:
    """
    Opens the custom rules file and maps each reaction ID to its GPR string.

    The file at ARGS.model_upload is first read as tab-separated. If that
    attempt raises for any reason, the file is read again as comma-separated.
    The ID and GPR column positions come from
    utils.findIdxByName(header_row, "GPR"). Rows too short to contain the
    GPR column are reported through utils.logWarning and skipped.

    Returns:
        Dict[str, str] : dict mapping reaction IDs to raw GPR rule strings.

    Raises:
        ValueError: If the file cannot be parsed with either delimiter, or
            if no rule could be extracted from it.
    """
    datFilePath = utils.FilePath.fromStrPath(ARGS.model_upload) # actual file, stored in Galaxy as a .dat

    def _rules_with_delimiter(delimiter :str) -> Dict[str,str]:
        """Parse the rules file with one delimiter into a fresh {reaction ID: GPR} dict."""
        rows = utils.readCsv(datFilePath, delimiter = delimiter, skipHeader=False)

        # Emptiness is tested first; the row-count test below would otherwise
        # also catch the empty case and make this branch unreachable.
        if not rows:
            raise ValueError("Model tabular file is empty.")

        if len(rows) <= 1:
            raise ValueError("Model tabular with only a header row is not supported.")

        id_idx, idx_gpr = utils.findIdxByName(rows[0], "GPR")

        rules = {}
        for line in rows[1:]:
            if len(line) <= idx_gpr:
                utils.logWarning(f"Skipping malformed line: {line}", ARGS.out_log)
                continue

            rules[line[id_idx]] = line[idx_gpr]
        return rules

    try:
        # First, try using a tab delimiter
        dict_rule = _rules_with_delimiter("\t")
    except Exception as e:
        # If parsing with tabs fails, try comma delimiter. Each attempt builds
        # its own dict, so a half-finished tab attempt cannot leak entries.
        try:
            dict_rule = _rules_with_delimiter(",")
        except Exception as e2:
            raise ValueError(f"Unable to parse rules file. Tried both tab and comma delimiters. Original errors: Tab: {e}, Comma: {e2}") from e2

    if not dict_rule:
        raise ValueError("No valid rules found in the uploaded file. Please check the file format.")
    return dict_rule
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
160
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
161
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
162 """
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
163 Class to compute the RAS values
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
164
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
165 """
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
166
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
167 class RAS_computation:
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
168
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
def __init__(self, adata=None, model=None, dataset=None, gene_rules=None, rules_total_string=None):
    """
    Set up the RAS computation from one of two input combinations.

    AnnData mode (original - for sampling_main.py):
        adata: AnnData object with gene expression (cells × genes)
        model: COBRApy model object with reactions and GPRs

    DataFrame mode (new - for ras_generator.py):
        dataset: pandas DataFrame with gene expression (genes × samples)
        gene_rules: dict mapping reaction IDs to GPR strings
        rules_total_string: list of all gene names in GPRs (for validation)

    When both combinations are supplied, the AnnData one is used.

    Raises:
        ValueError: If neither combination is fully provided.
    """
    self._logic_operators = ['and', 'or', '(', ')']
    self.val_nan = np.nan

    anndata_inputs_given = not (adata is None or model is None)
    dataframe_inputs_given = not (dataset is None or gene_rules is None)

    # AnnData + COBRApy model takes precedence over DataFrame + rules dict.
    if anndata_inputs_given:
        self._init_from_anndata(adata, model)
        return

    if dataframe_inputs_given:
        self._init_from_dataframe(dataset, gene_rules, rules_total_string)
        return

    raise ValueError(
        "Invalid initialization. Provide either:\n"
        "  - adata + model (for AnnData input), or\n"
        "  - dataset + gene_rules (for DataFrame input)"
    )
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
198
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
def _normalize_gene_name(self, gene_name):
    """Return gene_name with every '-' and ':' turned into '_'."""
    cleaned = gene_name
    for special_char in ("-", ":"):
        cleaned = cleaned.replace(special_char, "_")
    return cleaned
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
202
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
def _normalize_rule(self, rule):
    """
    Normalize a GPR rule into single-space-separated tokens.

    Parentheses become standalone tokens, the boolean operators are
    lowercased, and every other token is passed through
    self._normalize_gene_name.

    Operators are recognised per token, never by substring, so gene symbols
    that merely contain "OR" or "AND" (e.g. SORD, POR, MTOR, HAND1) keep
    their spelling and still match the dataset.

    Args:
        rule (str): GPR rule string.

    Returns:
        str: The normalized rule; an empty or blank rule yields "".
    """
    # Pad both sides of each parenthesis so "(A)OR(B)" splits cleanly;
    # split() collapses the extra whitespace.
    padded = rule.replace("(", " ( ").replace(")", " ) ")

    normalized_tokens = []
    for token in padded.split():
        lowered = token.lower()
        if lowered in ("and", "or"):
            # Whole-token match only; accepts AND/And/and alike.
            normalized_tokens.append(lowered)
        elif token in self._logic_operators:
            normalized_tokens.append(token)
        else:
            normalized_tokens.append(self._normalize_gene_name(token))
    return " ".join(normalized_tokens)
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
211
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
def _init_from_anndata(self, adata, model):
    """
    Set up rule and expression structures from an AnnData object and a COBRApy model.

    Populates dict_rule_reactions (normalized rule -> sorted reaction IDs),
    model, count_adata (a copy of adata), genes (normalized names found in
    both dataset and model), cell_ids and count_df_filtered (the transposed
    expression table restricted to those genes).

    Raises:
        ValueError: If no normalized gene name is shared by dataset and model.
    """
    # One row per reaction, then group reaction IDs by their normalized rule.
    rule_table = pd.DataFrame(index=[rxn.id for rxn in model.reactions])
    rule_table['rule'] = [self._normalize_rule(rxn.gene_reaction_rule) for rxn in model.reactions]
    grouped = rule_table.reset_index().groupby('rule').agg(lambda ids: sorted(list(ids)))
    # reset_index() names the former index column 'index'.
    self.dict_rule_reactions = grouped.to_dict()['index']

    self.model = model
    self.count_adata = adata.copy()

    # Compare gene names only after normalizing both sides the same way.
    normalize = self._normalize_gene_name
    genes_in_model = [normalize(gene.id) for gene in model.genes]
    genes_in_dataset = [normalize(gene) for gene in self.count_adata.var.index]
    self.genes = pd.Index(genes_in_dataset).intersection(genes_in_model)

    if len(self.genes) == 0:
        raise ValueError("ERROR: No genes from the count matrix match the metabolic model. Check that gene annotations are consistent between model and dataset.")

    self.cell_ids = list(self.count_adata.obs.index.values)

    # Transpose so rows are genes, relabel them with normalized names, then
    # keep only the shared genes.
    expression = self.count_adata.to_df().T
    expression.index = [normalize(gene) for gene in expression.index]
    self.count_df_filtered = expression.loc[self.genes]
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
240
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
def _init_from_dataframe(self, dataset, gene_rules, rules_total_string):
    """
    Set up rule and expression structures from a DataFrame and a rules dict.

    Populates dict_rule_reactions (normalized rule -> sorted reaction IDs),
    genes (normalized names found in both the dataset index and the rules),
    cell_ids (dataset columns) and count_df_filtered (dataset rows for those
    genes). model and count_adata are set to None in this mode.

    Args:
        dataset: DataFrame whose index holds gene names.
        gene_rules: dict mapping reaction IDs to GPR strings.
        rules_total_string: optional iterable of gene names used instead of
            the genes extracted from the rules themselves.

    Raises:
        ValueError: If no normalized gene name is shared by dataset and rules.
    """
    reaction_ids = list(gene_rules.keys())

    # One row per reaction, then group reaction IDs by their normalized rule.
    rule_table = pd.DataFrame(index=reaction_ids)
    normalized_rules = [self._normalize_rule(gene_rules[rid]) for rid in reaction_ids]
    rule_table['rule'] = normalized_rules
    grouped = rule_table.reset_index().groupby('rule').agg(lambda ids: sorted(list(ids)))
    # reset_index() names the former index column 'index'.
    self.dict_rule_reactions = grouped.to_dict()['index']

    # Neither a COBRApy model nor an AnnData object exists in this mode.
    self.model = None
    self.count_adata = None

    # Work on a copy so the caller's DataFrame keeps its original index.
    expression = dataset.copy()
    expression.index = [self._normalize_gene_name(gene) for gene in expression.index]

    if rules_total_string is None:
        # No explicit gene list: every non-operator token of every rule is a gene.
        candidate_genes = {
            token
            for rule in normalized_rules
            for token in rule.split()
            if token not in self._logic_operators
        }
    else:
        candidate_genes = [self._normalize_gene_name(gene) for gene in rules_total_string]
    self.genes = expression.index.intersection(candidate_genes)

    if len(self.genes) == 0:
        raise ValueError("ERROR: No genes from the count matrix match the metabolic model. Check that gene annotations are consistent between model and dataset.")

    self.cell_ids = list(expression.columns)
    self.count_df_filtered = expression.loc[self.genes]
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
281
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
282 def compute(self,
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
283 or_expression=np.sum, # type of operation to do in case of an or expression (sum, max, mean)
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
284 and_expression=np.min, # type of operation to do in case of an and expression(min, sum)
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
285 drop_na_rows=False, # if True remove the nan rows of the ras matrix
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
286 drop_duplicates=False, # if true, remove duplicates rows
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
287 ignore_nan=True, # if True, ignore NaN values in GPR evaluation (e.g., A or NaN -> A)
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
288 print_progressbar=True, # if True, print the progress bar
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
289 add_count_metadata=True, # if True add metadata of cells in the ras adata
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
290 add_met_metadata=True, # if True add metadata from the metabolic model (gpr and compartments of reactions)
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
291 add_essential_reactions=False,
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
292 add_essential_genes=False
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
293 ):
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
294
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
295 self.or_function = or_expression
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
296 self.and_function = and_expression
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
297
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
298 ras_df = np.full((len(self.dict_rule_reactions), len(self.cell_ids)), np.nan)
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
299 genes_not_mapped = set() # Track genes not in dataset
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
300
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
301 if print_progressbar:
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
302 pbar = ProgressBar(widgets=[Percentage(), Bar()], maxval=len(self.dict_rule_reactions)).start()
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
303
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
304 # Process each unique GPR rule
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
305 for ind, (rule, reaction_ids) in enumerate(self.dict_rule_reactions.items()):
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
306 if len(rule) == 0:
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
307 # Empty rule - keep as NaN
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
308 pass
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
309 else:
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
310 # Extract genes from rule
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
311 rule_genes = [token for token in rule.split() if token not in self._logic_operators]
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
312 rule_genes_unique = list(set(rule_genes))
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
313
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
314 # Which genes are in the dataset?
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
315 genes_present = [g for g in rule_genes_unique if g in self.genes]
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
316 genes_missing = [g for g in rule_genes_unique if g not in self.genes]
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
317
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
318 if genes_missing:
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
319 genes_not_mapped.update(genes_missing)
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
320
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
321 if len(genes_present) == 0:
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
322 # No genes in dataset - keep as NaN
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
323 pass
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
324 elif len(genes_missing) > 0 and not ignore_nan:
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
325 # Some genes missing and we don't ignore NaN - set to NaN
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
326 pass
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
327 else:
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
328 # Evaluate the GPR expression using AST
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
329 # For single gene, AST handles it fine: ast.parse("GENE_A") works
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
330 # more genes in the formula
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
331 check_only_and=("and" in rule and "or" not in rule) #only and
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
332 check_only_or=("or" in rule and "and" not in rule) #only or
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
333 if check_only_and or check_only_or:
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
334 #or/and sequence
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
335 matrix = self.count_df_filtered.loc[genes_present].values
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
336 #compute for all cells
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
337 if check_only_and:
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
338 ras_df[ind] = self.and_function(matrix, axis=0)
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
339 else:
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
340 ras_df[ind] = self.or_function(matrix, axis=0)
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
341 else:
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
342 # complex expression (e.g. A or (B and C))
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
343 data = self.count_df_filtered.loc[genes_present] # dataframe of genes in the GPRs
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
344 tree = ast.parse(rule, mode="eval").body
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
345 values_by_cell = [dict(zip(data.index, data[col].values)) for col in data.columns]
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
346 for j, values in enumerate(values_by_cell):
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
347 ras_df[ind, j] =self._evaluate_ast(tree, values, self.or_function, self.and_function, ignore_nan)
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
348
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
349 if print_progressbar:
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
350 pbar.update(ind + 1)
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
351
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
352 if print_progressbar:
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
353 pbar.finish()
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
354
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
355 # Store genes not mapped for later use
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
356 self.genes_not_mapped = sorted(genes_not_mapped)
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
357
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
358 # create the dataframe of ras (rules x samples)
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
359 ras_df = pd.DataFrame(data=ras_df, index=range(len(self.dict_rule_reactions)), columns=self.cell_ids)
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
360 ras_df['Reactions'] = [reaction_ids for rule, reaction_ids in self.dict_rule_reactions.items()]
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
361
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
362 reactions_common = pd.DataFrame()
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
363 reactions_common["Reactions"] = ras_df['Reactions']
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
364 reactions_common["proof2"] = ras_df['Reactions']
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
365 reactions_common = reactions_common.explode('Reactions')
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
366 reactions_common = reactions_common.set_index("Reactions")
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
367
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
368 ras_df = ras_df.explode("Reactions")
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
369 ras_df = ras_df.set_index("Reactions")
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
370
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
371 if drop_na_rows:
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
372 ras_df = ras_df.dropna(how="all")
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
373
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
374 if drop_duplicates:
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
375 ras_df = ras_df.drop_duplicates()
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
376
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
377 # If initialized from DataFrame (ras_generator mode), return DataFrame instead of AnnData
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
378 if self.count_adata is None:
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
379 return ras_df, self.genes_not_mapped
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
380
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
381 # Original AnnData mode: create AnnData structure for RAS
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
382 ras_adata = AnnData(ras_df.T)
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
383
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
384 #add metadata
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
385 if add_count_metadata:
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
386 ras_adata.var["common_gprs"] = reactions_common.loc[ras_df.index]
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
387 ras_adata.var["common_gprs"] = ras_adata.var["common_gprs"].apply(lambda x: ",".join(x))
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
388 for el in self.count_adata.obs.columns:
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
389 ras_adata.obs["countmatrix_"+el]=self.count_adata.obs[el]
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
390
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
391 if add_met_metadata:
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
392 if self.model is not None and len(self.model.compartments)>0:
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
393 ras_adata.var['compartments']=[list(self.model.reactions.get_by_id(reaction).compartments) for reaction in ras_adata.var.index]
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
394 ras_adata.var['compartments']=ras_adata.var["compartments"].apply(lambda x: ",".join(x))
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
395
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
396 if self.model is not None:
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
397 ras_adata.var['GPR rule'] = [self.model.reactions.get_by_id(reaction).gene_reaction_rule for reaction in ras_adata.var.index]
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
398
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
399 if add_essential_reactions:
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
400 if self.model is not None:
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
401 essential_reactions=find_essential_reactions(self.model)
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
402 essential_reactions=[el.id for el in essential_reactions]
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
403 ras_adata.var['essential reactions']=["yes" if el in essential_reactions else "no" for el in ras_adata.var.index]
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
404
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
405 if add_essential_genes:
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
406 if self.model is not None:
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
407 essential_genes=find_essential_genes(self.model)
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
408 essential_genes=[el.id for el in essential_genes]
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
409 ras_adata.var['essential genes']=[" ".join([gene for gene in genes.split() if gene in essential_genes]) for genes in ras_adata.var["GPR rule"]]
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
410
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
411 return ras_adata
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
412
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
413 def _evaluate_ast(self, node, values, or_function, and_function, ignore_nan):
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
414 """
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
415 Evaluate a boolean expression using AST (Abstract Syntax Tree).
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
416 Handles all GPR types: single gene, simple (A and B), nested (A or (B and C)).
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
417
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
418 Args:
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
419 node: AST node to evaluate
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
420 values: Dictionary mapping gene names to their expression values
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
421 or_function: Function to apply for OR operations
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
422 and_function: Function to apply for AND operations
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
423 ignore_nan: If True, ignore None/NaN values (e.g., A or None -> A)
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
424
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
425 Returns:
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
426 Evaluated expression result (float or np.nan)
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
427 """
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
428 if isinstance(node, ast.BoolOp):
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
429 # Boolean operation (and/or)
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
430 vals = [self._evaluate_ast(v, values, or_function, and_function, ignore_nan) for v in node.values]
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
431
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
432 if ignore_nan:
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
433 # Filter out None/NaN values
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
434 vals = [v for v in vals if v is not None and not (isinstance(v, float) and np.isnan(v))]
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
435
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
436 if not vals:
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
437 return np.nan
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
438
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
439 if isinstance(node.op, ast.Or):
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
440 return or_function(vals)
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
441 elif isinstance(node.op, ast.And):
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
442 return and_function(vals)
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
443
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
444 elif isinstance(node, ast.Name):
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
445 # Variable (gene name)
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
446 return values.get(node.id, None)
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
447 elif isinstance(node, ast.Constant):
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
448 # Constant (shouldn't happen in GPRs, but handle it)
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
449 return values.get(str(node.value), None)
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
450 else:
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
451 raise ValueError(f"Unexpected node type in GPR: {ast.dump(node)}")
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
452
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
453
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
454 # ============================================================================
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
455 # STANDALONE FUNCTION FOR RAS_GENERATOR COMPATIBILITY
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
456 # ============================================================================
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
457
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
def computeRAS(
        dataset,
        gene_rules,
        rules_total_string,
        or_function=np.sum,
        and_function=np.min,
        ignore_nan=True
        ):
    """
    Tabular entry point for RAS computation (ras_generator.py compatible).

    Builds a RAS_computation object from the given table and GPR rules and
    runs its compute() method with the progress bar and every optional
    metadata/annotation step switched off.

    Args:
        dataset: pandas DataFrame with gene expression (genes × samples)
        gene_rules: dict mapping reaction IDs to GPR strings
        rules_total_string: list of all gene names in GPRs
        or_function: function for OR operations (default: np.sum)
        and_function: function for AND operations (default: np.min)
        ignore_nan: if True, ignore NaN in GPR evaluation (default: True)

    Returns:
        Whatever compute() returns. When the object holds no AnnData
        (presumably the case when it is built from a DataFrame as here), that
        is the tuple (ras_df, genes_not_mapped):
            - ras_df: DataFrame with RAS values (reactions × samples)
            - genes_not_mapped: list of genes in GPRs not found in dataset
    """
    engine = RAS_computation(
        dataset=dataset,
        gene_rules=gene_rules,
        rules_total_string=rules_total_string
    )

    # Only the operator functions and the NaN policy come from the caller;
    # everything else is fixed for the tabular workflow.
    compute_options = {
        "or_expression": or_function,
        "and_expression": and_function,
        "ignore_nan": ignore_nan,
        "print_progressbar": False,
        "add_count_metadata": False,
        "add_met_metadata": False,
        "add_essential_reactions": False,
        "add_essential_genes": False,
    }
    return engine.compute(**compute_options)
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
506
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
def main(args:List[str] = None) -> None:
    """
    Initializes everything and sets the program in motion based on the frontend input arguments.

    Steps: parse the arguments, read the dataset (dropping the version suffix
    from gene names), load the GPR rules, reject datasets in which a gene used
    by a GPR is duplicated, compute the RAS, write them as a tab-separated
    file and log the genes that appear in the GPRs but not in the dataset.

    Args:
        args: argument list forwarded to process_args.

    Returns:
        None

    Raises:
        ValueError: if a gene that is duplicated in the dataset is used by one
            or more GPR rules.
    """
    # get args from frontend (related xml)
    # NOTE(review): ARGS stays a module-level global; presumably helpers such as
    # load_custom_rules read it - confirm before refactoring.
    global ARGS
    ARGS = process_args(args)

    # read dataset and remove versioning from gene names
    dataset = read_dataset(ARGS.input, "dataset")
    orig_gene_list = dataset.index.copy()
    # str(el) first, so that non-string labels (e.g. integer IDs) do not break .split
    dataset.index = [str(el).split(".")[0] for el in dataset.index]

    # load GPR rules
    rules = load_custom_rules()

    # collect every whitespace-separated token of the GPRs (parentheses removed)
    rules_text = "".join(
        rule.replace("(", "").replace(")", "") + " " for rule in rules.values()
    )
    rules_total_string = list(set(rules_text.split(" ")))

    duplicated_mask = dataset.index.duplicated(keep=False)
    if duplicated_mask.any():
        rule_tokens = set(rules_total_string)
        genes_duplicates = [str(gene) for gene in orig_gene_list[duplicated_mask]]
        # Duplicates are detected on the version-stripped names, and the GPRs are
        # matched against those same stripped names, so the membership test must
        # use them too; the original names are kept only for the message.
        genes_duplicates_in_model = [
            original
            for original, stripped in zip(genes_duplicates, dataset.index[duplicated_mask])
            if stripped in rule_tokens
        ]

        if genes_duplicates_in_model:  # metabolic genes have duplicated entries in the dataset
            list_str = ", ".join(genes_duplicates_in_model)
            raise ValueError(
                "ERROR: Duplicate entries in the gene dataset present in one or more GPR. The following metabolic genes are duplicated: " + list_str
            )

        list_str = ", ".join(genes_duplicates)
        utils.logWarning(
            "INFO: Duplicate entries in the gene dataset. The following genes are duplicated in the dataset but not mentioned in the GPRs: " + list_str,
            ARGS.out_log)

    # check if nan value must be ignored in the GPR:
    #   True  -> (A or nan --> A)
    #   False -> (A or nan --> nan)
    ignore_nan = bool(ARGS.none)

    # compute ras
    ras_df, genes_not_mapped = computeRAS(
        dataset,
        rules,
        rules_total_string,
        or_function=np.sum,  # type of operation to do in case of an or expression (max, sum, mean)
        and_function=np.min,
        ignore_nan=ignore_nan)

    # save to csv and replace nan with None
    ras_df.replace([np.nan, None], "None").to_csv(ARGS.ras_output, sep='\t')

    # report genes not present in the data
    if len(genes_not_mapped) > 0:
        genes_not_mapped_str = ", ".join(genes_not_mapped)
        utils.logWarning(
            "INFO: The following genes are mentioned in the GPR rules but don't appear in the dataset: " + genes_not_mapped_str,
            ARGS.out_log)

    print("Execution succeeded")
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
571
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
572 ###############################################################################
2fb97466e404 Uploaded
francesco_lapi
parents:
diff changeset
# Run the tool only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()