comparison COBRAxy/rps_generator.py @ 293:7b8d9de81a86 draft

Uploaded
author francesco_lapi
date Thu, 15 May 2025 18:23:52 +0000
parents 5dd2ab4637aa
children
comparison
equal deleted inserted replaced
292:31bc171a6ba5 293:7b8d9de81a86
1 import re
2 import sys
3 import csv
4 import math 1 import math
5 import argparse 2 import argparse
6 3
7 import numpy as np 4 import numpy as np
8 import pickle as pk 5 import pickle as pk
9 import pandas as pd 6 import pandas as pd
10 7
11 from enum import Enum 8 from typing import Optional, List, Dict
12 from typing import Optional, List, Dict, Tuple
13 9
14 import utils.general_utils as utils 10 import utils.general_utils as utils
15 import utils.reaction_parsing as reactionUtils 11 import utils.reaction_parsing as reactionUtils
16 12
17 ########################## argparse ########################################## 13 ########################## argparse ##########################################
43 help = 'your tool directory') 39 help = 'your tool directory')
44 parser.add_argument('-ol', '--out_log', 40 parser.add_argument('-ol', '--out_log',
45 help = "Output log") 41 help = "Output log")
46 parser.add_argument('-id', '--input', 42 parser.add_argument('-id', '--input',
47 type = str, 43 type = str,
44 required = True,
48 help = 'input dataset') 45 help = 'input dataset')
49 parser.add_argument('-rp', '--rps_output', 46 parser.add_argument('-rp', '--rps_output',
50 type = str, 47 type = str,
51 required = True, 48 required = True,
52 help = 'rps output') 49 help = 'rps output')
68 """ 65 """
69 if str(name_data) == 'Dataset': 66 if str(name_data) == 'Dataset':
70 return str(name_data) + '_' + str(count) 67 return str(name_data) + '_' + str(count)
71 else: 68 else:
72 return str(name_data) 69 return str(name_data)
73
74 70
75 ############################ get_abund_data #################################### 71 ############################ get_abund_data ####################################
76 def get_abund_data(dataset: pd.DataFrame, cell_line_index:int) -> Optional[pd.Series]: 72 def get_abund_data(dataset: pd.DataFrame, cell_line_index:int) -> Optional[pd.Series]:
77 """ 73 """
78 Extracts abundance data and turns it into a series for a specific cell line from the dataset, which rows are 74 Extracts abundance data and turns it into a series for a specific cell line from the dataset, which rows are
94 cell_line_name = dataset.columns[cell_line_index] 90 cell_line_name = dataset.columns[cell_line_index]
95 abundances_series = dataset[cell_line_name][1:] 91 abundances_series = dataset[cell_line_name][1:]
96 92
97 return abundances_series 93 return abundances_series
98 94
99
100 ############################ clean_metabolite_name #################################### 95 ############################ clean_metabolite_name ####################################
101 def clean_metabolite_name(name :str) -> str: 96 def clean_metabolite_name(name :str) -> str:
102 """ 97 """
103 Removes some characters from a metabolite's name, provided as input, and makes it lowercase in order to simplify 98 Removes some characters from a metabolite's name, provided as input, and makes it lowercase in order to simplify
104 the search of a match in the dictionary of synonyms. 99 the search of a match in the dictionary of synonyms.
108 103
109 Returns: 104 Returns:
110 str : a new string with the cleaned name. 105 str : a new string with the cleaned name.
111 """ 106 """
112 return "".join(ch for ch in name if ch not in ",;-_'([{ }])").lower() 107 return "".join(ch for ch in name if ch not in ",;-_'([{ }])").lower()
113
114 108
115 ############################ get_metabolite_id #################################### 109 ############################ get_metabolite_id ####################################
116 def get_metabolite_id(name :str, syn_dict :Dict[str, List[str]]) -> str: 110 def get_metabolite_id(name :str, syn_dict :Dict[str, List[str]]) -> str:
117 """ 111 """
118 Looks through a dictionary of synonyms to find a match for a given metabolite's name. 112 Looks through a dictionary of synonyms to find a match for a given metabolite's name.
155 missing_list.append(metabolite) 149 missing_list.append(metabolite)
156 150
157 return missing_list 151 return missing_list
158 152
159 ############################ calculate_rps #################################### 153 ############################ calculate_rps ####################################
160 def calculate_rps(reactions: Dict[str, Dict[str, int]], abundances: Dict[str, float], black_list: List[str], missing_list: List[str]) -> Dict[str, float]: 154 def calculate_rps(reactions: Dict[str, Dict[str, int]], abundances: Dict[str, float], black_list: List[str], missing_list: List[str], substrateFreqTable: Dict[str, int]) -> Dict[str, float]:
161 """ 155 """
162 Calculate the Reaction Propensity scores (RPS) based on the availability of reaction substrates, for (ideally) each input model reaction and for each sample. 156 Calculate the Reaction Propensity scores (RPS) based on the availability of reaction substrates, for (ideally) each input model reaction and for each sample.
163 The score is computed as the product of the concentrations of the reacting substances, with each concentration raised to a power equal to its stoichiometric coefficient 157 The score is computed as the product of the concentrations of the reacting substances, with each concentration raised to a power equal to its stoichiometric coefficient
164 for each reaction using the provided coefficient and abundance values. 158 for each reaction using the provided coefficient and abundance values. The value is then normalized, based on how frequent the metabolite is in the selected model's reactions,
159 and log-transformed.
165 160
166 Parameters: 161 Parameters:
167 reactions (dict): A dictionary representing reactions where keys are reaction names and values are dictionaries containing metabolite names as keys and stoichiometric coefficients as values. 162 reactions (dict): A dictionary representing reactions where keys are reaction names and values are dictionaries containing metabolite names as keys and stoichiometric coefficients as values.
168 abundances (dict): A dictionary representing metabolite abundances where keys are metabolite names and values are their corresponding abundances. 163 abundances (dict): A dictionary representing metabolite abundances where keys are metabolite names and values are their corresponding abundances.
169 black_list (list): A list containing metabolite names that should be excluded from the RPS calculation. 164 black_list (list): A list containing metabolite names that should be excluded from the RPS calculation.
170 missing_list (list): A list containing metabolite names that were missing in the original abundances dictionary and thus their values were set to 1. 165 missing_list (list): A list containing metabolite names that were missing in the original abundances dictionary and thus their values were set to 1.
171 166 substrateFreqTable (dict): A dictionary where each metabolite name (key) is associated with how many times it shows up in the model's reactions (value).
167
172 Returns: 168 Returns:
173 dict: A dictionary containing Reaction Propensity Scores (RPS) where keys are reaction names and values are the corresponding RPS scores. 169 dict: A dictionary containing Reaction Propensity Scores (RPS) where keys are reaction names and values are the corresponding RPS scores.
174 """ 170 """
175 rps_scores = {} 171 rps_scores = {}
176 172
177 for reaction_name, substrates in reactions.items(): 173 for reaction_name, substrates in reactions.items():
178 total_contribution = 1 174 total_contribution = 1
179 metab_significant = False 175 metab_significant = False
180 for metabolite, stoichiometry in substrates.items(): 176 for metabolite, stoichiometry in substrates.items():
181 temp = 1 if math.isnan(abundances[metabolite]) else abundances[metabolite] 177 abundance = 1 if math.isnan(abundances[metabolite]) else abundances[metabolite]
182 if metabolite not in black_list and metabolite not in missing_list: 178 if metabolite not in black_list and metabolite not in missing_list:
183 metab_significant = True 179 metab_significant = True
184 total_contribution *= temp ** stoichiometry 180
181 total_contribution += math.log((abundance + np.finfo(float).eps) / substrateFreqTable[metabolite]) * stoichiometry
185 182
186 rps_scores[reaction_name] = total_contribution if metab_significant else math.nan 183 rps_scores[reaction_name] = total_contribution if metab_significant else math.nan
187 184
188 return rps_scores 185 return rps_scores
189 186
190
191 ############################ rps_for_cell_lines #################################### 187 ############################ rps_for_cell_lines ####################################
192 def rps_for_cell_lines(dataset: List[List[str]], reactions: Dict[str, Dict[str, int]], black_list: List[str], syn_dict: Dict[str, List[str]]) -> None: 188 def rps_for_cell_lines(dataset: List[List[str]], reactions: Dict[str, Dict[str, int]], black_list: List[str], syn_dict: Dict[str, List[str]], substrateFreqTable: Dict[str, int]) -> None:
193 """ 189 """
194 Calculate Reaction Propensity Scores (RPS) for each cell line represented in the dataframe and creates an output file. 190 Calculate Reaction Propensity Scores (RPS) for each cell line represented in the dataframe and creates an output file.
195 191
196 Parameters: 192 Parameters:
197 dataset : the dataset's data, by rows 193 dataset : the dataset's data, by rows
198 reactions (dict): A dictionary representing reactions where keys are reaction names and values are dictionaries containing metabolite names as keys and stoichiometric coefficients as values. 194 reactions (dict): A dictionary representing reactions where keys are reaction names and values are dictionaries containing metabolite names as keys and stoichiometric coefficients as values.
199 black_list (list): A list containing metabolite names that should be excluded from the RPS calculation. 195 black_list (list): A list containing metabolite names that should be excluded from the RPS calculation.
200 syn_dict (dict): A dictionary where keys are general metabolite names and values are lists of possible synonyms. 196 syn_dict (dict): A dictionary where keys are general metabolite names and values are lists of possible synonyms.
197 substrateFreqTable (dict): A dictionary where each metabolite name (key) is associated with how many times it shows up in the model's reactions (value).
201 198
202 Returns: 199 Returns:
203 None 200 None
204 """ 201 """
205 cell_lines = dataset[0][1:] 202 cell_lines = dataset[0][1:]
213 missing_list = check_missing_metab(reactions, abundances_dict, len((cell_lines))) 210 missing_list = check_missing_metab(reactions, abundances_dict, len((cell_lines)))
214 211
215 rps_scores :Dict[Dict[str, float]] = {} 212 rps_scores :Dict[Dict[str, float]] = {}
216 for pos, cell_line_name in enumerate(cell_lines): 213 for pos, cell_line_name in enumerate(cell_lines):
217 abundances = { metab : abundances[pos] for metab, abundances in abundances_dict.items() } 214 abundances = { metab : abundances[pos] for metab, abundances in abundances_dict.items() }
218 rps_scores[cell_line_name] = calculate_rps(reactions, abundances, black_list, missing_list) 215 rps_scores[cell_line_name] = calculate_rps(reactions, abundances, black_list, missing_list, substrateFreqTable)
219 216
220 df = pd.DataFrame.from_dict(rps_scores) 217 df = pd.DataFrame.from_dict(rps_scores)
221 218
222 df.index.name = 'Reactions' 219 df.index.name = 'Reactions'
223 df.to_csv(ARGS.rps_output, sep='\t', na_rep='None', index=True) 220 df.to_csv(ARGS.rps_output, sep='\t', na_rep='None', index=True)
224
225 221
226 ############################ main #################################### 222 ############################ main ####################################
227 def main(args:List[str] = None) -> None: 223 def main(args:List[str] = None) -> None:
228 """ 224 """
229 Initializes everything and sets the program in motion based on the fronted input arguments. 225 Initializes everything and sets the program in motion based on the fronted input arguments.
243 239
244 dataset = utils.readCsv(utils.FilePath.fromStrPath(ARGS.input), '\t', skipHeader = False) 240 dataset = utils.readCsv(utils.FilePath.fromStrPath(ARGS.input), '\t', skipHeader = False)
245 241
246 if ARGS.reaction_choice == 'default': 242 if ARGS.reaction_choice == 'default':
247 reactions = pk.load(open(ARGS.tool_dir + '/local/pickle files/reactions.pickle', 'rb')) 243 reactions = pk.load(open(ARGS.tool_dir + '/local/pickle files/reactions.pickle', 'rb'))
244 substrateFreqTable = pk.load(open(ARGS.tool_dir + '/local/pickle files/substrate_frequencies.pickle', 'rb'))
248 245
249 elif ARGS.reaction_choice == 'custom': 246 elif ARGS.reaction_choice == 'custom':
250 reactions = reactionUtils.parse_custom_reactions(ARGS.custom) 247 reactions = reactionUtils.parse_custom_reactions(ARGS.custom)
251 248 substrateFreqTable = {}
252 rps_for_cell_lines(dataset, reactions, black_list, syn_dict) 249 for _, substrates in reactions.items():
250 for substrateName, _ in substrates.items():
251 if substrateName not in substrateFreqTable: substrateFreqTable[substrateName] = 0
252 substrateFreqTable[substrateName] += 1
253
254 rps_for_cell_lines(dataset, reactions, black_list, syn_dict, substrateFreqTable)
253 print('Execution succeded') 255 print('Execution succeded')
254 256
255 ############################################################################## 257 ##############################################################################
256 if __name__ == "__main__": 258 if __name__ == "__main__": main()
257 main()