annotate COBRAxy/rps_generator.py @ 489:97eea560a10f draft

Uploaded
author francesco_lapi
date Mon, 29 Sep 2025 10:33:26 +0000
parents 187cee1a00e2
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
489
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
1 """
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
2 Compute Reaction Propensity Scores (RPS) from metabolite abundances and reaction stoichiometry.
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
3
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
4 Given a tabular dataset (metabolites x samples) and a reaction set, this script
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
5 maps metabolite names via synonyms, fills missing metabolites, and computes RPS
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
6 per reaction for each sample using a log-normalized formula.
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
7 """
4
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
8 import math
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
9 import argparse
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
10
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
11 import numpy as np
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
12 import pickle as pk
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
13 import pandas as pd
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
14
293
7b8d9de81a86 Uploaded
francesco_lapi
parents: 281
diff changeset
15 from typing import Optional, List, Dict
4
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
16
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
17 import utils.general_utils as utils
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
18 import utils.reaction_parsing as reactionUtils
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
19
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
20 ########################## argparse ##########################################
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
21 ARGS :argparse.Namespace
147
3fca9b568faf Uploaded
bimib
parents: 4
diff changeset
def process_args(args:List[str] = None) -> argparse.Namespace:
    """
    Build the command-line parser for the RPS tool and parse the arguments.

    Args:
        args: Argument strings to parse. When None, argparse reads the
            process command line instead.

    Returns:
        Namespace: The parsed options (model_upload, tool_dir, out_log,
        input, rps_output).
    """
    # Declarative option table: (flags, keyword arguments for add_argument),
    # listed in the order the options are registered on the parser.
    option_specs = (
        (("-rl", "--model_upload"),
         dict(type=str, help="path to input file containing the reactions")),
        (('-td', '--tool_dir'),
         dict(type=str, required=True, help='your tool directory')),
        (('-ol', '--out_log'),
         dict(help="Output log")),
        (('-id', '--input'),
         dict(type=str, required=True, help='input dataset')),
        (('-rp', '--rps_output'),
         dict(type=str, required=True, help='rps output')),
    )

    cli = argparse.ArgumentParser(
        usage='%(prog)s [options]',
        description='Process abundances and reactions to create RPS scores.',
    )
    for flags, settings in option_specs:
        cli.add_argument(*flags, **settings)

    return cli.parse_args(args)
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
57
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
58 ############################ dataset name #####################################
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
def name_dataset(name_data :str, count :int) -> str:
    """
    Return a unique display name for a dataset.

    Every dataset is called "Dataset" unless the user renamed it; in that
    default case the counter is appended (f"_{count}") so that the names of
    several datasets do not clash. Any other name is returned unchanged.

    Args:
        name_data: Name associated with the dataset (from frontend input params).
        count: Counter starting at 1 to make names unique when default.

    Returns:
        str : the name made unique
    """
    label = str(name_data)
    if label != 'Dataset':
        # User-provided names are kept as they are.
        return label
    return '_'.join((label, str(count)))
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
74
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
75 ############################ get_abund_data ####################################
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
def get_abund_data(dataset: pd.DataFrame, cell_line_index:int) -> Optional[pd.Series]:
    """
    Extract the abundance values of one cell line from the dataset, whose rows
    are metabolites and whose columns are cell lines.

    Args:
        dataset (pandas.DataFrame): The DataFrame containing abundance data for all cell lines and metabolites.
        cell_line_index (int): The position of the column of the cell line of interest.

    Returns:
        pd.Series or None: A series containing abundance values for the specified cell line.
                           The name of the series is the name of the cell line (column label).
                           Returns None, after printing an error message, if the index is
                           negative or not smaller than the number of columns.
    """
    # The index addresses a COLUMN (cell line), so it must be validated
    # against the number of columns, not the number of rows.
    if cell_line_index < 0 or cell_line_index >= len(dataset.columns):
        print(f"Error: cell line index '{cell_line_index}' is not valid.")
        return None

    # Purely positional selection: stays a Series even when column labels are
    # duplicated. The first row is skipped, exactly as before.
    # NOTE(review): presumably the first row does not hold abundance data - confirm with callers.
    return dataset.iloc[1:, cell_line_index]
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
98
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
99 ############################ clean_metabolite_name ####################################
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
def clean_metabolite_name(name :str) -> str:
    """
    Normalize a metabolite name so it can be matched against the synonyms
    dictionary: punctuation, brackets and spaces are dropped and the result is
    converted to lowercase.

    Args:
        name : the metabolite's name, as given in the dataset.

    Returns:
        str : a new string with the cleaned name.
    """
    # A translation table with only a "delete" set removes every listed
    # character in a single pass over the string.
    stripped = name.translate(str.maketrans("", "", ",;-_'([{ }])"))
    return stripped.lower()
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
112
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
113 ############################ get_metabolite_id ####################################
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
def get_metabolite_id(name :str, syn_dict :Dict[str, List[str]]) -> str:
    """
    Resolve a metabolite name to its identifier through the synonyms dictionary.

    The name is first normalized with clean_metabolite_name; the identifier of
    the first entry (in dictionary order) whose synonyms contain the
    normalized name is returned.

    Args:
        name : the metabolite's name, as given in the dataset.
        syn_dict : the dictionary of synonyms, using unique identifiers as keys and lists of clean synonyms as values.

    Returns:
        str : the unique identifier of that metabolite.
        An empty string is returned if a match isn't found.
    """
    cleaned = clean_metabolite_name(name)
    matches = (
        metab_id
        for metab_id, synonyms in syn_dict.items()
        if cleaned in synonyms
    )
    # The default of next() supplies the "no match" value.
    return next(matches, "")
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
132
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
133 ############################ check_missing_metab ####################################
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
def check_missing_metab(reactions: Dict[str, Dict[str, int]], dataset_by_rows: Dict[str, List[float]], cell_lines_amt :int) -> List[str]:
    """
    Find the metabolites used by the reactions that have no abundance data and
    give each of them a neutral abundance of 1 for every cell line.

    Parameters:
        reactions (dict): A dictionary representing reactions where keys are reaction names and values are dictionaries containing metabolite names as keys and
        stoichiometric coefficients as values.
        dataset_by_rows (dict): A dictionary representing abundances where keys are metabolite names and values are their corresponding abundances for all cell lines.
        cell_lines_amt : amount of cell lines, i.e. the length of the list of abundances created for each missing metabolite.

    Returns:
        list[str] : names of the metabolites that were missing in the original abundances dictionary, each listed once in order
        of first appearance, whose abundances were therefore set to 1.

    Side effects:
        dataset_by_rows: mutated to include missing metabolites with default abundances.
    """
    absent = []
    for stoichiometry in reactions.values():
        for metab in stoichiometry:
            if metab in dataset_by_rows:
                continue
            # Registering the default right away also prevents the same
            # metabolite from being reported again by a later reaction.
            dataset_by_rows[metab] = [1] * cell_lines_amt
            absent.append(metab)

    return absent
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
158
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
159 ############################ calculate_rps ####################################
293
7b8d9de81a86 Uploaded
francesco_lapi
parents: 281
diff changeset
def calculate_rps(reactions: Dict[str, Dict[str, int]], abundances: Dict[str, float], black_list: List[str], missing_list: List[str], substrateFreqTable: Dict[str, int]) -> Dict[str, float]:
    """
    Calculate the Reaction Propensity Scores (RPS) of one sample, based on the availability of reaction substrates.

    For every reaction the score is the sum, over its substrates, of
    log((abundance + eps) / frequency) * stoichiometric coefficient, i.e. the logarithm of the product of the
    frequency-normalized concentrations, each raised to its stoichiometric coefficient. NaN abundances count as 1.
    A reaction scores NaN when none of its substrates is "significant", i.e. when all of them are black-listed
    or missing from the dataset (this includes reactions without substrates).

    Parameters:
        reactions (dict): A dictionary representing reactions where keys are reaction names and values are dictionaries containing metabolite names as keys and stoichiometric coefficients as values.
        abundances (dict): A dictionary representing metabolite abundances where keys are metabolite names and values are their corresponding abundances.
        black_list (list): Metabolite names that do not count as significant substrates.
        missing_list (list): Metabolite names that were missing in the original abundances dictionary and thus their values were set to 1; they do not count as significant either.
        substrateFreqTable (dict): A dictionary where each metabolite name (key) is associated with how many times it shows up in the model's reactions (value).

    Returns:
        dict: A dictionary containing Reaction Propensity Scores (RPS) where keys are reaction names and values are the corresponding RPS scores.

    Raises:
        KeyError: if a substrate is absent from abundances or substrateFreqTable.
        ValueError: if a frequency-normalized abundance is not positive (math.log domain error).
    """
    # Loop invariants, hoisted: the machine epsilon that keeps log() away from
    # zero, and a set for O(1) "is this substrate non-significant" checks
    # (the lists were previously scanned once per substrate of every reaction).
    eps = np.finfo(float).eps
    non_significant = set(black_list).union(missing_list)

    rps_scores = {}

    for reaction_name, substrates in reactions.items():
        total_contribution = 0
        metab_significant = False
        for metabolite, stoichiometry in substrates.items():
            raw_abundance = abundances[metabolite]
            abundance = 1 if math.isnan(raw_abundance) else raw_abundance
            if metabolite not in non_significant:
                metab_significant = True

            # NOTE(review): black-listed and missing substrates still add their
            # term to the sum; they only influence whether the reaction gets a
            # score at all. Confirm this matches the intended formula.
            total_contribution += math.log((abundance + eps) / substrateFreqTable[metabolite]) * stoichiometry

        rps_scores[reaction_name] = total_contribution if metab_significant else math.nan

    return rps_scores
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
192
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
193 ############################ rps_for_cell_lines ####################################
293
7b8d9de81a86 Uploaded
francesco_lapi
parents: 281
diff changeset
def rps_for_cell_lines(dataset: List[List[str]], reactions: Dict[str, Dict[str, int]], black_list: List[str], syn_dict: Dict[str, List[str]], substrateFreqTable: Dict[str, int]) -> None:
    """
    Compute the Reaction Propensity Scores (RPS) of every cell line in the dataset and write them to the output file.

    The first dataset row is the header (its first cell is skipped, the others are the cell line names); every
    following row holds a metabolite name followed by one abundance per cell line. Rows whose metabolite cannot
    be resolved through the synonyms dictionary are ignored.

    Parameters:
        dataset : the dataset's data, by rows
        reactions (dict): A dictionary representing reactions where keys are reaction names and values are dictionaries containing metabolite names as keys and stoichiometric coefficients as values.
        black_list (list): A list containing metabolite names that should be excluded from the RPS calculation.
        syn_dict (dict): A dictionary where keys are general metabolite names and values are lists of possible synonyms.
        substrateFreqTable (dict): A dictionary where each metabolite name (key) is associated with how many times it shows up in the model's reactions (value).

    Returns:
        None

    Side effects:
        Writes a tab-separated table (reactions x cell lines, NaN written as 'None') to ARGS.rps_output.
    """
    sample_names = dataset[0][1:]

    # metabolite id -> abundances of that metabolite across all cell lines
    abundances_dict = {}
    for record in dataset[1:]:
        metab_id = get_metabolite_id(record[0], syn_dict)
        if not metab_id:
            continue
        # presumably utils.Float() builds a text -> float converter; verify in utils.general_utils
        abundances_dict[metab_id] = [*map(utils.Float(), record[1:])]

    # Metabolites required by the model but absent from the data get a neutral abundance.
    missing_list = check_missing_metab(reactions, abundances_dict, len(sample_names))

    rps_scores :Dict[str, Dict[str, float]] = {}
    for column, sample in enumerate(sample_names):
        # Slice the per-metabolite lists down to the current cell line.
        per_sample = {}
        for metab, values in abundances_dict.items():
            per_sample[metab] = values[column]

        rps_scores[sample] = calculate_rps(reactions, per_sample, black_list, missing_list, substrateFreqTable)

    # One column per cell line; rows are put in the model's reaction order.
    table = pd.DataFrame.from_dict(rps_scores).loc[list(reactions.keys()), :]
    table.index.name = 'Reactions'
    table.to_csv(ARGS.rps_output, sep='\t', na_rep='None', index=True)
5dd2ab4637aa Uploaded
francesco_lapi
parents: 280
diff changeset
231
4
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
232 ############################ main ####################################
147
3fca9b568faf Uploaded
bimib
parents: 4
diff changeset
def main(args:List[str] = None) -> None:
    """
    Initializes everything and sets the program in motion based on the frontend input arguments.

    Steps: parse the arguments, load the black list and the synonyms dictionary from the tool directory,
    read the abundance dataset, parse the uploaded reactions, strip the compartment suffix from the metabolite
    names, count in how many reactions each metabolite appears, make the model's own metabolite identifiers
    usable as synonyms, and finally compute and write the RPS table.

    Args:
        args: Command-line arguments; when None they are taken from the process command line.

    Returns:
        None
    """
    global ARGS
    ARGS = process_args(args)

    # Load support data (black list and synonyms).
    # NOTE: pickle executes code on load; these files must only ever come from the tool's own directory.
    with open(ARGS.tool_dir + '/local/pickle files/black_list.pickle', 'rb') as bl:
        black_list = pk.load(bl)

    with open(ARGS.tool_dir + '/local/pickle files/synonyms.pickle', 'rb') as sd:
        syn_dict = pk.load(sd)

    dataset = utils.readCsv(utils.FilePath.fromStrPath(ARGS.input), '\t', skipHeader=False)

    # Parse custom reactions from uploaded file
    reactions = reactionUtils.parse_custom_reactions(ARGS.model_upload)

    # Drop a trailing "_<one character>" suffix (presumably the compartment tag, e.g. "glc_c" -> "glc").
    # The length guard avoids an IndexError on one-character names and never produces an empty name.
    # NOTE(review): if a reaction lists the same metabolite in two compartments, the later coefficient
    # overwrites the earlier one - confirm this is acceptable.
    for substrates in reactions.values():
        for name in list(substrates):  # snapshot: the dict is modified while renaming
            if len(name) > 2 and name[-2] == '_':
                substrates[name[:-2]] = substrates.pop(name)

    # Number of reactions in which each metabolite takes part.
    substrateFreqTable = {}
    for substrates in reactions.values():
        for substrateName in substrates:
            substrateFreqTable[substrateName] = substrateFreqTable.get(substrateName, 0) + 1

    # Make sure every synonyms entry whose (cleaned) key is a model metabolite also lists that cleaned key
    # among its synonyms, so dataset rows named after the identifier itself are recognized.
    # syn_dict is updated in place, once per entry and without duplicates.
    model_metabolites = {clean_metabolite_name(metabName) for metabName in substrateFreqTable}
    for syn_key, syn_list in syn_dict.items():
        cleaned_key = clean_metabolite_name(syn_key)
        if cleaned_key in model_metabolites and cleaned_key not in syn_list:
            syn_list.append(cleaned_key)

    rps_for_cell_lines(dataset, reactions, black_list, syn_dict, substrateFreqTable)
    print('Execution succeeded')
4
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
281
41f35c2f0c7b Uploaded
luca_milaz
parents:
diff changeset
282 ##############################################################################
326
3dccdf56cb24 Uploaded
francesco_lapi
parents: 293
diff changeset
# Run the tool only when the file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()