Mercurial > repos > bimib > cobraxy
diff COBRAxy/rps_generator.py @ 293:7b8d9de81a86 draft
Uploaded
author | francesco_lapi |
---|---|
date | Thu, 15 May 2025 18:23:52 +0000 |
parents | 5dd2ab4637aa |
children |
line wrap: on
line diff
--- a/COBRAxy/rps_generator.py Thu May 15 18:21:58 2025 +0000 +++ b/COBRAxy/rps_generator.py Thu May 15 18:23:52 2025 +0000 @@ -1,6 +1,3 @@ -import re -import sys -import csv import math import argparse @@ -8,8 +5,7 @@ import pickle as pk import pandas as pd -from enum import Enum -from typing import Optional, List, Dict, Tuple +from typing import Optional, List, Dict import utils.general_utils as utils import utils.reaction_parsing as reactionUtils @@ -45,6 +41,7 @@ help = "Output log") parser.add_argument('-id', '--input', type = str, + required = True, help = 'input dataset') parser.add_argument('-rp', '--rps_output', type = str, @@ -70,7 +67,6 @@ return str(name_data) + '_' + str(count) else: return str(name_data) - ############################ get_abund_data #################################### def get_abund_data(dataset: pd.DataFrame, cell_line_index:int) -> Optional[pd.Series]: @@ -96,7 +92,6 @@ return abundances_series - ############################ clean_metabolite_name #################################### def clean_metabolite_name(name :str) -> str: """ @@ -111,7 +106,6 @@ """ return "".join(ch for ch in name if ch not in ",;-_'([{ }])").lower() - ############################ get_metabolite_id #################################### def get_metabolite_id(name :str, syn_dict :Dict[str, List[str]]) -> str: """ @@ -157,39 +151,41 @@ return missing_list ############################ calculate_rps #################################### -def calculate_rps(reactions: Dict[str, Dict[str, int]], abundances: Dict[str, float], black_list: List[str], missing_list: List[str]) -> Dict[str, float]: +def calculate_rps(reactions: Dict[str, Dict[str, int]], abundances: Dict[str, float], black_list: List[str], missing_list: List[str], substrateFreqTable: Dict[str, int]) -> Dict[str, float]: """ Calculate the Reaction Propensity scores (RPS) based on the availability of reaction substrates, for (ideally) each input model reaction and for each sample. The score is computed as the product of the concentrations of the reacting substances, with each concentration raised to a power equal to its stoichiometric coefficient - for each reaction using the provided coefficient and abundance values. + for each reaction using the provided coefficient and abundance values. The value is then normalized, based on how frequent the metabolite is in the selected model's reactions, + and log-transformed. Parameters: reactions (dict): A dictionary representing reactions where keys are reaction names and values are dictionaries containing metabolite names as keys and stoichiometric coefficients as values. abundances (dict): A dictionary representing metabolite abundances where keys are metabolite names and values are their corresponding abundances. black_list (list): A list containing metabolite names that should be excluded from the RPS calculation. missing_list (list): A list containing metabolite names that were missing in the original abundances dictionary and thus their values were set to 1. - + substrateFreqTable (dict): A dictionary where each metabolite name (key) is associated with how many times it shows up in the model's reactions (value). + Returns: dict: A dictionary containing Reaction Propensity Scores (RPS) where keys are reaction names and values are the corresponding RPS scores. """ rps_scores = {} - + for reaction_name, substrates in reactions.items(): total_contribution = 1 - metab_significant = False + metab_significant = False for metabolite, stoichiometry in substrates.items(): - temp = 1 if math.isnan(abundances[metabolite]) else abundances[metabolite] + abundance = 1 if math.isnan(abundances[metabolite]) else abundances[metabolite] if metabolite not in black_list and metabolite not in missing_list: - metab_significant = True - total_contribution *= temp ** stoichiometry + metab_significant = True + + total_contribution += math.log((abundance + np.finfo(float).eps) / substrateFreqTable[metabolite]) * stoichiometry rps_scores[reaction_name] = total_contribution if metab_significant else math.nan return rps_scores - ############################ rps_for_cell_lines #################################### -def rps_for_cell_lines(dataset: List[List[str]], reactions: Dict[str, Dict[str, int]], black_list: List[str], syn_dict: Dict[str, List[str]]) -> None: +def rps_for_cell_lines(dataset: List[List[str]], reactions: Dict[str, Dict[str, int]], black_list: List[str], syn_dict: Dict[str, List[str]], substrateFreqTable: Dict[str, int]) -> None: """ Calculate Reaction Propensity Scores (RPS) for each cell line represented in the dataframe and creates an output file. @@ -198,6 +194,7 @@ reactions (dict): A dictionary representing reactions where keys are reaction names and values are dictionaries containing metabolite names as keys and stoichiometric coefficients as values. black_list (list): A list containing metabolite names that should be excluded from the RPS calculation. syn_dict (dict): A dictionary where keys are general metabolite names and values are lists of possible synonyms. + substrateFreqTable (dict): A dictionary where each metabolite name (key) is associated with how many times it shows up in the model's reactions (value). Returns: None @@ -215,14 +212,13 @@ rps_scores :Dict[Dict[str, float]] = {} for pos, cell_line_name in enumerate(cell_lines): abundances = { metab : abundances[pos] for metab, abundances in abundances_dict.items() } - rps_scores[cell_line_name] = calculate_rps(reactions, abundances, black_list, missing_list) + rps_scores[cell_line_name] = calculate_rps(reactions, abundances, black_list, missing_list, substrateFreqTable) df = pd.DataFrame.from_dict(rps_scores) - + df.index.name = 'Reactions' df.to_csv(ARGS.rps_output, sep='\t', na_rep='None', index=True) - ############################ main #################################### def main(args:List[str] = None) -> None: """ @@ -245,13 +241,18 @@ if ARGS.reaction_choice == 'default': reactions = pk.load(open(ARGS.tool_dir + '/local/pickle files/reactions.pickle', 'rb')) + substrateFreqTable = pk.load(open(ARGS.tool_dir + '/local/pickle files/substrate_frequencies.pickle', 'rb')) elif ARGS.reaction_choice == 'custom': reactions = reactionUtils.parse_custom_reactions(ARGS.custom) - - rps_for_cell_lines(dataset, reactions, black_list, syn_dict) + substrateFreqTable = {} + for _, substrates in reactions.items(): + for substrateName, _ in substrates.items(): + if substrateName not in substrateFreqTable: substrateFreqTable[substrateName] = 0 + substrateFreqTable[substrateName] += 1 + + rps_for_cell_lines(dataset, reactions, black_list, syn_dict, substrateFreqTable) print('Execution succeded') ############################################################################## -if __name__ == "__main__": - main() \ No newline at end of file +if __name__ == "__main__": main() \ No newline at end of file