| 456 | 1 """ | 
|  | 2 Utilities for generating and manipulating COBRA models and related metadata. | 
|  | 3 | 
|  | 4 This module includes helpers to: | 
|  | 5 - extract rules, reactions, bounds, objective coefficients, and compartments | 
|  | 6 - build a COBRA model from a tabular file | 
|  | 7 - set objective and medium from dataframes | 
|  | 8 - validate a model and convert gene identifiers | 
|  | 9 - translate model GPRs using mapping tables | 
|  | 10 """ | 
| 418 | 11 import os | 
|  | 12 import cobra | 
|  | 13 import pandas as pd | 
| 419 | 14 import re | 
| 426 | 15 import logging | 
| 419 | 16 from typing import Optional, Tuple, Union, List, Dict, Set | 
| 426 | 17 from collections import defaultdict | 
| 418 | 18 import utils.rule_parsing  as rulesUtils | 
| 419 | 19 import utils.reaction_parsing as reactionUtils | 
|  | 20 from cobra import Model as cobraModel, Reaction, Metabolite | 
| 490 | 21 import sys | 
|  | 22 | 
|  | 23 | 
|  | 24 ############################ check_methods #################################### | 
def gene_type(l :str, name :str) -> str:
    """
    Determine the type of gene ID.

    The format checks run in a fixed order (HGNC ID, Ensembl ID, symbol,
    Entrez ID); the first check that accepts the identifier decides the result.

    Args:
        l (str): The gene identifier to check.
        name (str): The name of the dataset, used in error messages.

    Returns:
        str: The type of gene ID ('hugo_id', 'ensembl_gene_id', 'symbol', or 'entrez_id').

    Raises:
        SystemExit: If no check accepts the identifier (raised through sys.exit).
    """
    if check_hgnc(l):
        return 'hugo_id'
    if check_ensembl(l):
        return 'ensembl_gene_id'
    if check_symbol(l):
        return 'symbol'
    if check_entrez(l):
        return 'entrez_id'

    # The trailing space after "Supported ID" keeps the two concatenated
    # fragments from running together as "IDtypes" in the message.
    sys.exit('Execution aborted:\n' +
             'gene ID type in ' + name + ' not supported. Supported ID ' +
             'types are: HUGO ID, Ensemble ID, HUGO symbol, Entrez ID\n')
|  | 51 | 
def check_hgnc(l :str) -> bool:
    """
    Tell whether a gene identifier looks like an HGNC ID ('HGNC:<digits>').

    Args:
        l (str): The gene identifier to check.

    Returns:
        bool: True when the identifier starts with 'HGNC:' (case-insensitive)
        and everything after the fifth character consists of digits.
    """
    # At least one character must follow the 5-character prefix.
    if len(l) <= 5:
        return False
    if not l.upper().startswith('HGNC:'):
        return False
    return l[5:].isdigit()
|  | 69 | 
def check_ensembl(l :str) -> bool:
    """
    Tell whether a gene identifier looks like an Ensembl ID.

    Args:
        l (str): The gene identifier to check.

    Returns:
        bool: True when the upper-cased identifier starts with 'ENS', False otherwise.
    """
    upper_id = l.upper()
    return upper_id.startswith('ENS')
|  | 81 | 
|  | 82 | 
def check_symbol(l :str) -> bool:
    """
    Tell whether a gene identifier looks like a gene symbol.

    Args:
        l (str): The gene identifier to check.

    Returns:
        bool: True when the first character is a letter and the remainder is a
        non-empty alphanumeric string, False otherwise.
    """
    if not l:
        return False
    head, tail = l[0], l[1:]
    # ''.isalnum() is False, so a single-character identifier is rejected.
    return head.isalpha() and tail.isalnum()
|  | 100 | 
def check_entrez(l :str) -> bool:
    """
    Tell whether a gene identifier looks like an Entrez ID.

    Args:
        l (str): The gene identifier to check.

    Returns:
        bool: True when the identifier is a non-empty string of digits, False otherwise.
    """
    return len(l) > 0 and l.isdigit()
| 418 | 115 | 
|  | 116 ################################- DATA GENERATION -################################ | 
# Alias used in annotations for strings that hold a reaction ID.
ReactionId = str
def generate_rules(model: cobraModel, *, asParsed = True) -> Union[Dict[ReactionId, rulesUtils.OpList], Dict[ReactionId, str]]:
    """
    Collect the GPR rule of every reaction in the model that has one.

    Reactions whose ``gene_reaction_rule`` is empty are left out of the result.

    Args:
        model: COBRA model to derive data from.
        asParsed: If True, each rule is passed through
            ``rulesUtils.parseRuleToNestedList``; otherwise the raw string is kept.

    Returns:
        Dict[ReactionId, rulesUtils.OpList]: Parsed rules by reaction ID (asParsed=True).
        Dict[ReactionId, str]: Raw rules by reaction ID (asParsed=False).
    """
    rules = {}
    for reaction in model.reactions:
        raw_rule = reaction.gene_reaction_rule
        if not raw_rule:
            continue
        if asParsed:
            rules[reaction.id] = rulesUtils.parseRuleToNestedList(raw_rule)
        else:
            rules[reaction.id] = raw_rule
    return rules
|  | 138 | 
def generate_reactions(model :cobraModel, *, asParsed = True) -> Dict[ReactionId, str]:
    """
    Collect the reaction formula of every reaction in the model.

    Reactions whose formula string is empty are left out.

    Args:
        model: COBRA model to derive data from.
        asParsed: If True, the collected formulas are handed to
            ``reactionUtils.create_reaction_dict`` and its result is returned;
            otherwise the raw formula strings are returned.

    Returns:
        Dict[ReactionId, str]: Reactions by reaction ID (parsed if requested).
    """
    formulas = {}
    for rxn in model.reactions:
        formula = rxn.reaction
        if formula:
            formulas[rxn.id] = formula

    if asParsed:
        return reactionUtils.create_reaction_dict(formulas)
    return formulas
|  | 160 | 
def get_medium(model:cobraModel) -> pd.DataFrame:
    """
    List the reactions that act as uptake-only entries of the model medium.

    A reaction is selected when none of its metabolites has a positive
    stoichiometric coefficient and its lower bound is negative.

    Args:
        model: COBRA model to inspect.

    Returns:
        pd.DataFrame with a single column 'reaction' holding the selected reaction IDs.
    """
    uptake_ids = [
        rxn.id
        for rxn in model.reactions
        if not any(rxn.get_coefficient(met.id) > 0 for met in rxn.metabolites)
        and rxn.lower_bound < 0
    ]

    medium_table = pd.DataFrame()
    medium_table["reaction"] = uptake_ids
    return medium_table
|  | 180 | 
def extract_objective_coefficients(model: cobraModel) -> pd.DataFrame:
    """
    Tabulate the objective coefficient of every reaction in the model.

    The coefficient is looked up in the objective expression by the reaction's
    forward variable; reactions absent from the objective get 0.0.

    Args:
        model: COBRA model

    Returns:
        pd.DataFrame with columns: ReactionID, ObjectiveCoefficient
    """
    # Coefficients of the (linear) objective expression, keyed by solver variable.
    objective_terms = model.objective.expression.as_coefficients_dict()

    rows = [
        {
            "ReactionID": rxn.id,
            "ObjectiveCoefficient": objective_terms.get(rxn.forward_variable, 0.0),
        }
        for rxn in model.reactions
    ]
    return pd.DataFrame(rows)
|  | 203 | 
def generate_bounds(model:cobraModel) -> pd.DataFrame:
    """
    Tabulate the lower and upper flux bounds of every reaction.

    Args:
        model: COBRA model to read the bounds from.

    Returns:
        pd.DataFrame indexed by reaction IDs with columns ['lower_bound', 'upper_bound'].
    """
    reaction_ids = [rxn.id for rxn in model.reactions]

    # The frame is created empty and then filled row by row.
    table = pd.DataFrame(index=reaction_ids, columns=["lower_bound", "upper_bound"])
    for rxn in model.reactions:
        table.loc[rxn.id] = [rxn.lower_bound, rxn.upper_bound]

    return table
|  | 221 | 
|  | 222 | 
|  | 223 | 
def generate_compartments(model: cobraModel) -> pd.DataFrame:
    """
    Build a DataFrame with the pathway annotations of each reaction.

    Despite the function name, the data is read from
    ``reaction.annotation['pathways']`` and written to columns named
    Pathway_1, Pathway_2, ... (one column per position).

    A pathway annotation may be a list or a single value; a single value is
    treated as a one-element list. Reactions without a 'pathways' annotation
    get a row whose pathway columns are all None.

    Args:
        model: the COBRA model to extract the annotation data from.

    Returns:
        pd.DataFrame: one row per reaction with a 'ReactionID' column followed
        by as many Pathway_<i> columns as the longest pathway list requires.
    """
    # First pass: normalize every annotation to a list.
    reaction_pathways = {}
    for reaction in model.reactions:
        pathways = reaction.annotation.get('pathways')
        if pathways is None:
            pathways = []
        elif not isinstance(pathways, list):
            pathways = [pathways]
        reaction_pathways[reaction.id] = pathways

    # The column count comes from the normalized lists, so scalar annotations
    # are counted too (otherwise a model with only scalars yields no columns).
    max_pathways = max((len(p) for p in reaction_pathways.values()), default=0)
    pathway_columns = [f"Pathway_{i+1}" for i in range(max_pathways)]

    # Second pass: one row per reaction, padding short lists with None.
    pathway_data = []
    for reaction_id, pathways in reaction_pathways.items():
        row = {"ReactionID": reaction_id}
        for i, col_name in enumerate(pathway_columns):
            row[col_name] = pathways[i] if i < len(pathways) else None
        pathway_data.append(row)

    return pd.DataFrame(pathway_data)
|  | 267 | 
|  | 268 | 
|  | 269 | 
def build_cobra_model_from_csv(csv_path: str, model_id: str = "new_model") -> cobraModel:
    """
    Build a COBRApy model from a tab-separated table of reactions.

    The table is read twice: a first pass collects metabolites and compartments
    from the 'Formula' column, a second pass creates the reactions (columns
    'ReactionID', 'Formula', 'lower_bound', 'upper_bound', 'GPR'). Afterwards the
    objective and the medium are set from the same table.

    Args:
        csv_path: Path to the tab-separated file.
        model_id: ID for the newly created model.

    Returns:
        cobra.Model: The constructed COBRApy model.

    Raises:
        ValueError: In the second pass, if a row has no reaction formula
            (the first pass merely skips such rows).
    """
    table = pd.read_csv(csv_path, sep='\t')
    model = cobraModel(model_id)

    known_metabolites = {}
    known_compartments = {}

    print(f"Building model from {len(table)} reactions...")

    # Pass 1: register every metabolite and compartment seen in the formulas.
    for _, row in table.iterrows():
        formula = str(row['Formula']).strip()
        if not formula or formula == 'nan':
            continue

        for met_id in extract_metabolites_from_reaction(formula):
            compartment = extract_compartment_from_metabolite(met_id)
            known_compartments.setdefault(compartment, compartment)

            if met_id in known_metabolites:
                continue
            known_metabolites[met_id] = Metabolite(
                id=met_id,
                compartment=compartment,
                name=met_id.replace(f"_{compartment}", "").replace("__", "_")
            )

    model.compartments = known_compartments
    model.add_metabolites(list(known_metabolites.values()))

    print(f"Added {len(known_metabolites)} metabolites and {len(known_compartments)} compartments")

    # Pass 2: create the reactions; rows whose formula fails to parse are skipped.
    added_count = 0
    skipped_count = 0

    for _, row in table.iterrows():
        reaction_id = str(row['ReactionID']).strip()
        formula = str(row['Formula']).strip()

        if not formula or formula == 'nan':
            raise ValueError(f"Missing reaction formula for {reaction_id}")

        reaction = Reaction(reaction_id)
        reaction.name = reaction_id

        # Missing bounds fall back to -1000 / 1000.
        lower = row['lower_bound']
        reaction.lower_bound = float(lower) if pd.notna(lower) else -1000.0
        upper = row['upper_bound']
        reaction.upper_bound = float(upper) if pd.notna(upper) else 1000.0

        gpr = row['GPR']
        if pd.notna(gpr):
            rule = str(gpr).strip()
            if rule:
                reaction.gene_reaction_rule = rule

        try:
            parse_reaction_formula(reaction, formula, known_metabolites)
        except Exception as e:
            print(f"Error parsing reaction {reaction_id}: {e}")
            skipped_count += 1
            continue

        model.add_reactions([reaction])
        added_count += 1

    print(f"Added {added_count} reactions, skipped {skipped_count} reactions")

    # Objective function and medium both come from columns of the same table.
    set_objective_from_csv(model, table, obj_col="ObjectiveCoefficient")
    set_medium_from_data(model, table)

    print(f"Model completed: {len(model.reactions)} reactions, {len(model.metabolites)} metabolites")

    return model
|  | 358 | 
|  | 359 | 
|  | 360 # Extracts all metabolite IDs in the formula (handles numeric prefixes + underscores) | 
| 499 | 361 #def extract_metabolites_from_reaction(reaction_formula: str) -> Set[str]: | 
|  | 362 #    """ | 
|  | 363 #    Extract metabolite IDs from a reaction formula. | 
|  | 364 #    Robust pattern: tokens ending with _<compartment> (e.g., _c, _m, _e), | 
|  | 365 #    allowing leading digits/underscores. | 
|  | 366 #    """ | 
|  | 367 #    metabolites = set() | 
|  | 368 #    # optional coefficient followed by a token ending with _<letters> | 
|  | 369 #    if reaction_formula[-1] == ']' and reaction_formula[-3] == '[': | 
|  | 370 #        pattern = r'(?:\d+(?:\.\d+)?\s+)?([A-Za-z0-9_]+[[A-Za-z0-9]]+)' | 
|  | 371 #    else: | 
|  | 372 #        pattern = r'(?:\d+(?:\.\d+)?\s+)?([A-Za-z0-9_]+_[A-Za-z0-9]+)' | 
|  | 373 #    matches = re.findall(pattern, reaction_formula) | 
|  | 374 #    metabolites.update(matches) | 
|  | 375 #    return metabolites | 
|  | 376 | 
|  | 377 | 
def extract_metabolites_from_reaction(reaction_formula: str) -> Set[str]:
    """
    Extract metabolite IDs from a reaction formula.

    Handles:
      - optional stoichiometric coefficients (integers or decimals), which must
        be separated from the metabolite ID by whitespace
      - compartment tags at the end of the metabolite, either [c] or _c

    Requiring whitespace after the coefficient keeps IDs that begin with digits
    (e.g. '10fthf_c', '2pg_c') intact instead of reading their leading digits as
    a coefficient; it also matches how ``parse_metabolites_side`` reads terms.

    Args:
        reaction_formula: The reaction formula string.

    Returns:
        The set of IDs including the compartment suffix exactly as written.
    """
    pattern = re.compile(
        r'(?:^|(?<=\s)|(?<=\+)|(?<=,)|(?<==)|(?<=:))'              # left boundary (start, space, +, comma, =, :)
        r'(?:\d+(?:\.\d+)?\s+)?'                                   # optional coefficient followed by whitespace
        r'([A-Za-z0-9_]+(?:\[[A-Za-z0-9]+\]|_[A-Za-z0-9]+))'       # metabolite + compartment
    )
    return {m.group(1) for m in pattern.finditer(reaction_formula)}
| 419 | 394 | 
|  | 395 | 
| 500 | 396 | 
def extract_compartment_from_metabolite(metabolite_id: str) -> str:
    """
    Extract the compartment from a metabolite ID.

    Two single-character suffix forms are recognized: '<id>_<c>' and
    '<id>[<c>]'. Any other ID, including one too short to carry a suffix,
    falls back to the default compartment 'c'.

    Args:
        metabolite_id: The metabolite ID, e.g. 'atp_c' or 'glc[e]'.

    Returns:
        The compartment tag, or 'c' when none is recognized.
    """
    # Length guards prevent IndexError on very short (or empty) IDs.
    if len(metabolite_id) >= 2 and metabolite_id[-2] == '_':
        return metabolite_id.split('_')[-1]
    if len(metabolite_id) >= 3 and metabolite_id[-1] == ']' and metabolite_id[-3] == '[':
        return metabolite_id[-2]
    return 'c'  # default cytoplasm
|  | 404 | 
|  | 405 | 
def parse_reaction_formula(reaction: Reaction, formula: str, metabolites_dict: Dict[str, Metabolite]):
    """
    Parse a reaction formula and set metabolites with their coefficients.

    The formula is split at the first recognized arrow ('<=>', '<--', '-->',
    '<-'). Terms on the left side are added with negative coefficients, terms
    on the right side with positive ones. A metabolite that appears on both
    sides receives the net coefficient. IDs that are not present in
    ``metabolites_dict`` are ignored.

    Args:
        reaction: The reaction object to populate (via ``add_metabolites``).
        formula: The reaction formula string.
        metabolites_dict: Known metabolites keyed by metabolite ID.

    Raises:
        ValueError: If no recognized arrow is found, or if splitting at the
            arrow does not produce exactly two sides.
    """
    # Order matters: '<--' contains '<-' and therefore has to be tested first.
    for arrow in ('<=>', '<--', '-->', '<-'):
        if arrow in formula:
            left, right = formula.split(arrow)
            break
    else:
        raise ValueError(f"Unrecognized reaction format: {formula}")

    reactants = parse_metabolites_side(left.strip())
    products = parse_metabolites_side(right.strip())

    metabolites_to_add = {}
    for sign, side in ((-1.0, reactants), (1.0, products)):
        for met_id, coeff in side.items():
            if met_id not in metabolites_dict:
                continue
            metabolite = metabolites_dict[met_id]
            # Accumulate so that a metabolite present on both sides is netted
            # instead of keeping only the product coefficient.
            metabolites_to_add[metabolite] = metabolites_to_add.get(metabolite, 0.0) + sign * coeff

    reaction.add_metabolites(metabolites_to_add)
|  | 438 | 
|  | 439 | 
def parse_metabolites_side(side_str: str) -> Dict[str, float]:
    """
    Parse one side of a reaction and extract metabolites with coefficients.

    The side is split on '+'. Each term is an optional numeric coefficient
    (separated from the ID by whitespace) followed by a metabolite ID ending in
    a compartment tag, written either as '_c' or as '[c]'. A missing
    coefficient counts as 1.0; a metabolite listed several times gets the sum
    of its coefficients. Terms that do not match are ignored.

    Args:
        side_str: The text of one side of a reaction formula.

    Returns:
        Mapping from metabolite ID (as written) to its coefficient.
    """
    metabolites = {}
    if not side_str or side_str.strip() == '':
        return metabolites

    # optional coefficient + id ending with _<compartment> or [<compartment>],
    # the same ID shapes that extract_metabolites_from_reaction accepts
    term_pattern = re.compile(
        r'(?:(\d+\.?\d*)\s+)?([A-Za-z0-9_]+(?:\[[A-Za-z0-9]+\]|_[A-Za-z0-9]+))'
    )

    for term in side_str.split('+'):
        term = term.strip()
        if not term:
            continue

        match = term_pattern.match(term)
        if match is None:
            continue

        coeff_str, met_id = match.groups()
        coeff = float(coeff_str) if coeff_str else 1.0
        metabolites[met_id] = metabolites.get(met_id, 0.0) + coeff

    return metabolites
|  | 460 | 
|  | 461 | 
|  | 462 | 
def set_objective_from_csv(model: cobra.Model, df: pd.DataFrame, obj_col: str = "ObjectiveCoefficient"):
    """
    Set the model objective from a column of per-reaction coefficients.

    Every row with a non-zero coefficient contributes its reaction to the
    objective; the objective may involve any reaction(s), not necessarily
    biomass. Missing coefficients count as 0. Rows naming a reaction that is
    not in the model are reported with a printed warning and skipped.

    Args:
        model: The model whose objective is set.
        df: Table with a 'ReactionID' column and the coefficient column.
        obj_col: Name of the coefficient column.

    Raises:
        ValueError: If no reaction ends up with a non-zero coefficient.
    """
    objective_terms = {}

    for _, row in df.iterrows():
        reaction_id = str(row['ReactionID']).strip()
        raw_coeff = row[obj_col]
        coeff = float(raw_coeff) if pd.notna(raw_coeff) else 0.0

        if coeff != 0:
            if reaction_id not in model.reactions:
                print(f"Warning: reaction {reaction_id} not found in model, skipping for objective.")
                continue
            objective_terms[model.reactions.get_by_id(reaction_id)] = coeff

    if len(objective_terms) == 0:
        raise ValueError("No reactions found with non-zero objective coefficient.")

    model.objective = objective_terms
    print(f"Objective set with {len(objective_terms)} reactions.")
|  | 484 | 
|  | 485 | 
| 419 | 486 | 
|  | 487 | 
def set_medium_from_data(model: cobraModel, df: pd.DataFrame):
    """
    Set the medium based on the 'InMedium' column in the dataframe.

    Rows flagged with InMedium == True are looked up in the model by their
    'ReactionID' (converted to string and stripped, as is done when the
    reactions are created). Only reactions with a negative lower bound are
    used; each enters the medium with the absolute value of that bound.
    ``model.medium`` is left untouched when no reaction qualifies.

    Args:
        model: The model whose medium is set.
        df: Table with 'InMedium' and 'ReactionID' columns.
    """
    medium_reactions = df[df['InMedium'] == True]['ReactionID'].tolist()

    # Built once so each membership test is O(1) instead of rescanning the model.
    model_reaction_ids = {r.id for r in model.reactions}

    medium_dict = {}
    for raw_id in medium_reactions:
        rxn_id = str(raw_id).strip()
        if rxn_id not in model_reaction_ids:
            continue
        reaction = model.reactions.get_by_id(rxn_id)
        if reaction.lower_bound < 0:  # uptake reactions only
            medium_dict[rxn_id] = abs(reaction.lower_bound)

    if medium_dict:
        model.medium = medium_dict
        print(f"Medium set with {len(medium_dict)} components")
| 419 | 502 | 
|  | 503 | 
def validate_model(model: cobraModel) -> Dict[str, object]:
    """
    Validate the model and return basic statistics.

    Counts reactions, metabolites, genes, compartments, medium entries,
    reversible reactions and reactions whose ID starts with 'EX_', then runs
    ``model.optimize()``. Any exception raised by the optimization is caught
    and reported in the 'status' entry instead of being propagated.

    Args:
        model: The model to inspect.

    Returns:
        Dict with the statistics plus 'growth_rate' (objective value, or None
        on failure) and 'status' (solver status, or an 'Error: ...' string).
    """
    validation = {
        'num_reactions': len(model.reactions),
        'num_metabolites': len(model.metabolites),
        'num_genes': len(model.genes),
        'num_compartments': len(model.compartments),
        'objective': str(model.objective),
        'medium_size': len(model.medium),
        'reversible_reactions': sum(1 for r in model.reactions if r.reversibility),
        'exchange_reactions': sum(1 for r in model.reactions if r.id.startswith('EX_')),
    }

    try:
        # Growth test
        solution = model.optimize()
        validation['growth_rate'] = solution.objective_value
        validation['status'] = solution.status
    except Exception as e:
        # Deliberate best effort: the statistics are still returned.
        validation['growth_rate'] = None
        validation['status'] = f"Error: {e}"

    return validation
|  | 527 | 
def convert_genes(model, annotation):
    """
    Rename genes using a selected annotation key in gene.notes; returns a model copy.

    Args:
        model: The model whose genes are renamed; it is copied, not modified.
        annotation: Key looked up in each gene's ``notes`` to get the new ID.

    Returns:
        The renamed copy of the model, or -1 (after printing a message) when
        the key cannot be looked up for some gene.
    """
    from cobra.manipulation import rename_genes
    model2 = model.copy()
    try:
        dict_genes = {gene.id: gene.notes[annotation] for gene in model2.genes}
    except (KeyError, TypeError):
        # Only lookup failures are handled here; unrelated errors and
        # interrupts are no longer swallowed.
        print("No annotation in gene dict!")
        return -1
    rename_genes(model2, dict_genes)

    return model2
|  | 540 | 
|  | 541 # ---------- Utility helpers ---------- | 
|  | 542 def _normalize_colname(col: str) -> str: | 
|  | 543     return col.strip().lower().replace(' ', '_') | 
|  | 544 | 
def _choose_columns(mapping_df: 'pd.DataFrame') -> Dict[str, str]:
    """
    Find the useful columns of a mapping table.

    Column names are compared after normalization (see ``_normalize_colname``).
    For each identifier category the first candidate name present in the table
    is selected.

    Returns:
        Dict {category: original column name}, e.g. {'ensg': ..., 'hgnc_id': ...}.
        Categories without a matching column are absent; the dict may be empty.
    """
    normalized_to_original = {}
    for original in mapping_df.columns:
        normalized_to_original[_normalize_colname(original)] = original

    # candidate names for each category, in order of preference
    candidates = {
        'ensg': ['ensg', 'ensembl_gene_id', 'ensembl'],
        'hgnc_id': ['hgnc_id', 'hgnc', 'hgnc:'],
        'hgnc_symbol': ['hgnc_symbol', 'hgnc symbol', 'symbol'],
        'entrez_id': ['entrez', 'entrez_id', 'entrezgene'],
        'gene_number': ['gene_number']
    }

    chosen = {}
    for category, names in candidates.items():
        hit = next((n for n in names if n in normalized_to_original), None)
        if hit is not None:
            chosen[category] = normalized_to_original[hit]
    return chosen
|  | 566 | 
def _validate_target_uniqueness(mapping_df: 'pd.DataFrame',
                                source_col: str,
                                target_col: str,
                                model_source_genes: Optional[Set[str]] = None,
                                logger: Optional[logging.Logger] = None) -> None:
    """
    Check that, within the filtered mapping_df, each target maps to at most one source.

    Nothing is raised: duplicates are reported with a warning that lists every
    affected target and its sources, and a clean table is reported with an
    info message.

    Args:
        mapping_df: Mapping table containing both columns.
        source_col: Name of the source-ID column.
        target_col: Name of the target-ID column.
        model_source_genes: If given, only rows whose normalized source ID is
            in this set are checked.
        logger: Logger to use; defaults to this module's logger.
    """
    if logger is None:
        logger = logging.getLogger(__name__)

    if mapping_df.empty:
        logger.warning("Mapping dataframe is empty for the requested source genes; skipping uniqueness validation.")
        return

    # normalize temporary columns for grouping (without altering the original df)
    tmp = mapping_df[[source_col, target_col]].copy()
    tmp['_src_norm'] = tmp[source_col].astype(str).map(_normalize_gene_id)
    tmp['_tgt_norm'] = tmp[target_col].astype(str).str.strip()

    # optionally filter to the set of model source genes
    if model_source_genes is not None:
        tmp = tmp[tmp['_src_norm'].isin(model_source_genes)]

    if tmp.empty:
        logger.warning("After filtering to model source genes, mapping table is empty — nothing to validate.")
        return

    # build reverse mapping: target -> set(sources)
    grouped = tmp.groupby('_tgt_norm')['_src_norm'].agg(lambda s: set(s.dropna()))
    # find targets with more than one source
    problematic = {t: sorted(s) for t, s in grouped.items() if len(s) > 1}

    if problematic:
        msg_lines = ["Mapping validation failed: some target IDs are associated with multiple source IDs."]
        for tgt, sources in problematic.items():
            msg_lines.append(f"  - target '{tgt}' <- sources: {', '.join(sources)}")
        logger.warning("\n".join(msg_lines))
        # Stop here so the success message below is not logged as well.
        return

    logger.info("Mapping validation passed: no target ID is associated with multiple source IDs (within filtered set).")
|  | 613 | 
|  | 614 | 
|  | 615 def _normalize_gene_id(g: str) -> str: | 
| 456 | 616     """Normalize a gene ID for use as a key (removes prefixes like 'HGNC:' and strips).""" | 
| 426 | 617     if g is None: | 
|  | 618         return "" | 
|  | 619     g = str(g).strip() | 
|  | 620     # remove common prefixes | 
|  | 621     g = re.sub(r'^(HGNC:)', '', g, flags=re.IGNORECASE) | 
|  | 622     g = re.sub(r'^(ENSG:)', '', g, flags=re.IGNORECASE) | 
|  | 623     return g | 
|  | 624 | 
| 493 | 625 def _is_or_only_expression(expr: str) -> bool: | 
|  | 626     """ | 
|  | 627     Check if a GPR expression contains only OR operators (no AND operators). | 
|  | 628 | 
|  | 629     Args: | 
|  | 630         expr: GPR expression string | 
|  | 631 | 
|  | 632     Returns: | 
|  | 633         bool: True if expression contains only OR (and parentheses) and has multiple genes, False otherwise | 
|  | 634     """ | 
|  | 635     if not expr or not expr.strip(): | 
|  | 636         return False | 
|  | 637 | 
|  | 638     # Normalize the expression | 
|  | 639     normalized = expr.replace(' AND ', ' and ').replace(' OR ', ' or ') | 
|  | 640 | 
|  | 641     # Check if it contains any AND operators | 
|  | 642     has_and = ' and ' in normalized.lower() | 
|  | 643 | 
|  | 644     # Check if it contains OR operators | 
|  | 645     has_or = ' or ' in normalized.lower() | 
|  | 646 | 
|  | 647     # Must have OR operators and no AND operators | 
|  | 648     return has_or and not has_and | 
|  | 649 | 
|  | 650 | 
|  | 651 def _flatten_or_only_gpr(expr: str) -> str: | 
|  | 652     """ | 
|  | 653     Flatten a GPR expression that contains only OR operators by: | 
|  | 654     1. Removing all parentheses | 
|  | 655     2. Extracting unique gene names | 
|  | 656     3. Joining them with ' or ' | 
|  | 657 | 
|  | 658     Args: | 
|  | 659         expr: GPR expression string with only OR operators | 
|  | 660 | 
|  | 661     Returns: | 
|  | 662         str: Flattened GPR expression | 
|  | 663     """ | 
|  | 664     if not expr or not expr.strip(): | 
|  | 665         return expr | 
|  | 666 | 
|  | 667     # Extract all gene tokens (exclude logical operators and parentheses) | 
|  | 668     gene_pattern = r'\b[A-Za-z0-9:_.-]+\b' | 
|  | 669     logical = {'and', 'or', 'AND', 'OR', '(', ')'} | 
|  | 670 | 
|  | 671     tokens = re.findall(gene_pattern, expr) | 
|  | 672     genes = [t for t in tokens if t not in logical] | 
|  | 673 | 
|  | 674     # Create set to remove duplicates, then convert back to list to maintain some order | 
|  | 675     unique_genes = list(dict.fromkeys(genes))  # Preserves insertion order | 
|  | 676 | 
|  | 677     if len(unique_genes) == 0: | 
|  | 678         return expr | 
|  | 679     elif len(unique_genes) == 1: | 
|  | 680         return unique_genes[0] | 
|  | 681     else: | 
|  | 682         return ' or '.join(unique_genes) | 
|  | 683 | 
|  | 684 | 
| 455 | 685 def _simplify_boolean_expression(expr: str) -> str: | 
|  | 686     """ | 
| 490 | 687     Simplify a boolean expression by removing duplicates while strictly preserving semantics. | 
|  | 688     This function handles simple duplicates within parentheses while being conservative about | 
|  | 689     complex expressions that could change semantics. | 
| 455 | 690     """ | 
|  | 691     if not expr or not expr.strip(): | 
|  | 692         return expr | 
|  | 693 | 
| 490 | 694     # Normalize operators and whitespace | 
| 455 | 695     expr = expr.replace(' AND ', ' and ').replace(' OR ', ' or ') | 
| 490 | 696     expr = ' '.join(expr.split())  # Normalize whitespace | 
| 455 | 697 | 
| 490 | 698     def simplify_parentheses_content(match_obj): | 
|  | 699         """Helper function to simplify content within parentheses.""" | 
|  | 700         content = match_obj.group(1)  # Content inside parentheses | 
| 455 | 701 | 
| 490 | 702         # Only simplify if it's a pure OR or pure AND chain | 
|  | 703         if ' or ' in content and ' and ' not in content: | 
|  | 704             # Pure OR chain - safe to deduplicate | 
|  | 705             parts = [p.strip() for p in content.split(' or ') if p.strip()] | 
|  | 706             unique_parts = [] | 
|  | 707             seen = set() | 
|  | 708             for part in parts: | 
|  | 709                 if part not in seen: | 
|  | 710                     unique_parts.append(part) | 
|  | 711                     seen.add(part) | 
| 455 | 712 | 
| 490 | 713             if len(unique_parts) == 1: | 
|  | 714                 return unique_parts[0]  # Remove unnecessary parentheses for single items | 
|  | 715             else: | 
|  | 716                 return '(' + ' or '.join(unique_parts) + ')' | 
|  | 717 | 
|  | 718         elif ' and ' in content and ' or ' not in content: | 
|  | 719             # Pure AND chain - safe to deduplicate | 
|  | 720             parts = [p.strip() for p in content.split(' and ') if p.strip()] | 
|  | 721             unique_parts = [] | 
|  | 722             seen = set() | 
|  | 723             for part in parts: | 
|  | 724                 if part not in seen: | 
|  | 725                     unique_parts.append(part) | 
|  | 726                     seen.add(part) | 
| 455 | 727 | 
| 490 | 728             if len(unique_parts) == 1: | 
|  | 729                 return unique_parts[0]  # Remove unnecessary parentheses for single items | 
|  | 730             else: | 
|  | 731                 return '(' + ' and '.join(unique_parts) + ')' | 
|  | 732         else: | 
|  | 733             # Mixed operators or single item - return with parentheses as-is | 
|  | 734             return '(' + content + ')' | 
|  | 735 | 
|  | 736     def remove_duplicates_simple(parts_str: str, separator: str) -> str: | 
|  | 737         """Remove duplicates from a simple chain of operations.""" | 
|  | 738         parts = [p.strip() for p in parts_str.split(separator) if p.strip()] | 
| 455 | 739 | 
| 490 | 740         # Remove duplicates while preserving order | 
|  | 741         unique_parts = [] | 
|  | 742         seen = set() | 
|  | 743         for part in parts: | 
|  | 744             if part not in seen: | 
|  | 745                 unique_parts.append(part) | 
|  | 746                 seen.add(part) | 
| 455 | 747 | 
| 490 | 748         if len(unique_parts) == 1: | 
|  | 749             return unique_parts[0] | 
| 455 | 750         else: | 
| 490 | 751             return f' {separator} '.join(unique_parts) | 
| 455 | 752 | 
|  | 753     try: | 
| 490 | 754         import re | 
|  | 755 | 
|  | 756         # First, simplify content within parentheses | 
|  | 757         # This handles cases like (A or A) -> A and (B and B) -> B | 
|  | 758         expr_simplified = re.sub(r'\(([^()]+)\)', simplify_parentheses_content, expr) | 
|  | 759 | 
|  | 760         # Check if the resulting expression has mixed operators | 
|  | 761         has_and = ' and ' in expr_simplified | 
|  | 762         has_or = ' or ' in expr_simplified | 
|  | 763 | 
|  | 764         # Only simplify top-level if it's pure AND or pure OR | 
|  | 765         if has_and and not has_or and '(' not in expr_simplified: | 
|  | 766             # Pure AND chain at top level - safe to deduplicate | 
|  | 767             return remove_duplicates_simple(expr_simplified, 'and') | 
|  | 768         elif has_or and not has_and and '(' not in expr_simplified: | 
|  | 769             # Pure OR chain at top level - safe to deduplicate | 
|  | 770             return remove_duplicates_simple(expr_simplified, 'or') | 
|  | 771         else: | 
|  | 772             # Mixed operators or has parentheses - return the simplified version (with parentheses content cleaned) | 
|  | 773             return expr_simplified | 
|  | 774 | 
| 455 | 775     except Exception: | 
| 490 | 776         # If anything goes wrong, return the original expression | 
| 455 | 777         return expr | 
|  | 778 | 
| 492 | 779 | 
def translate_model_genes(model: 'cobra.Model',
                         mapping_df: 'pd.DataFrame',
                         target_nomenclature: str,
                         source_nomenclature: str = 'hgnc_id',
                         allow_many_to_one: bool = False,
                         logger: Optional[logging.Logger] = None) -> Tuple['cobra.Model', Dict[str, str]]:
    """
    Rewrite the GPR rules of a model from one gene nomenclature to another.

    The input model is not modified: translation is applied to a copy. The
    mapping table is expected to expose recognisable identifier columns
    (e.g., ensg, hgnc_id, hgnc_symbol, entrez).

    Args:
        model: COBRA model to translate.
        mapping_df: DataFrame containing the mapping information.
        target_nomenclature: Desired target key (e.g., 'hgnc_symbol').
        source_nomenclature: Current source key in the model (default 'hgnc_id').
        allow_many_to_one: If False, target uniqueness is validated before
            translating; if True, that validation is skipped.
        logger: Optional logger. When omitted, logging.basicConfig is called at
            INFO level and a module logger is used.

    Returns:
        Tuple containing:
        - Translated copy of the COBRA model
        - Dictionary mapping reaction IDs to translation issue descriptions

    Raises:
        ValueError: If no usable column is detected in mapping_df, or the
            source/target column cannot be resolved.
    """
    if logger is None:
        logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
        logger = logging.getLogger(__name__)

    logger.info(f"Translating genes from '{source_nomenclature}' to '{target_nomenclature}'")

    # Detect which mapping_df columns carry gene identifiers
    chosen = _choose_columns(mapping_df)
    if not chosen:
        raise ValueError("Could not detect useful columns in mapping_df. Expected at least one of: ensg, hgnc_id, hgnc_symbol, entrez.")

    # Requested keys are compared case-insensitively and without surrounding blanks
    src_key = source_nomenclature.strip().lower()
    tgt_key = target_nomenclature.strip().lower()

    def find_column(key):
        """Resolve a requested key to a dataframe column: exact key first, then substring."""
        exact = [column for name, column in chosen.items() if name == key]
        if exact and exact[-1] is not None:
            return exact[-1]
        return next((column for name, column in chosen.items() if key in name), None)

    col_for_src = find_column(src_key)
    col_for_tgt = find_column(tgt_key)

    if col_for_src is None:
        raise ValueError(f"Source column for '{source_nomenclature}' not found in mapping dataframe.")
    if col_for_tgt is None:
        raise ValueError(f"Target column for '{target_nomenclature}' not found in mapping dataframe.")

    # Keep only mapping rows whose (normalized) source ID is actually used by the model
    model_source_genes = {_normalize_gene_id(gene.id) for gene in model.genes}
    logger.info(f"Filtering mapping to {len(model_source_genes)} source genes present in model (normalized).")

    norm_col = col_for_src + "_norm"
    tmp_map = mapping_df[[col_for_src, col_for_tgt]].dropna().copy()
    tmp_map[norm_col] = tmp_map[col_for_src].astype(str).map(_normalize_gene_id)
    filtered_map = tmp_map[tmp_map[norm_col].isin(model_source_genes)].copy()

    if filtered_map.empty:
        logger.warning("No mapping rows correspond to source genes present in the model after filtering. Proceeding with empty mapping (no translation will occur).")

    if not allow_many_to_one:
        _validate_target_uniqueness(filtered_map, col_for_src, col_for_tgt, model_source_genes=model_source_genes, logger=logger)

    # Build the source -> targets mapping
    gene_mapping = _create_gene_mapping(filtered_map, col_for_src, col_for_tgt, logger)

    # Work on a copy so the caller's model stays untouched
    model_copy = model.copy()

    # Counters and accumulators shared with the GPR translation helper
    stats = {'translated': 0, 'one_to_one': 0, 'one_to_many': 0, 'not_found': 0, 'simplified_gprs': 0, 'flattened_or_gprs': 0}
    unmapped = []
    multi = []

    # Reaction ID -> description of the translation issues found in its GPR
    reaction_translation_issues = {}

    original_genes = {g.id for g in model_copy.genes}
    logger.info(f"Original genes count: {len(original_genes)}")

    for rxn in model_copy.reactions:
        gpr = rxn.gene_reaction_rule
        if not (gpr and gpr.strip()):
            continue

        new_gpr, rxn_issues = _translate_gpr(gpr, gene_mapping, stats, unmapped, multi, logger, track_issues=True)
        if rxn_issues:
            reaction_translation_issues[rxn.id] = rxn_issues

        if new_gpr == gpr:
            # Nothing was translated: leave the rule exactly as it was
            continue

        # A rule with issues that only uses OR can be flattened to a unique gene list
        if rxn_issues and _is_or_only_expression(new_gpr):
            flattened_gpr = _flatten_or_only_gpr(new_gpr)
            if flattened_gpr != new_gpr:
                stats['flattened_or_gprs'] += 1
                logger.debug(f"Flattened OR-only GPR with issues for {rxn.id}: '{new_gpr}' -> '{flattened_gpr}'")
                new_gpr = flattened_gpr

        simplified_gpr = _simplify_boolean_expression(new_gpr)
        if simplified_gpr != new_gpr:
            stats['simplified_gprs'] += 1
            logger.debug(f"Simplified GPR for {rxn.id}: '{new_gpr}' -> '{simplified_gpr}'")
        rxn.gene_reaction_rule = simplified_gpr
        logger.debug(f"Reaction {rxn.id}: '{gpr}' -> '{simplified_gpr}'")

    # Bring model.genes in line with the rewritten rules
    _update_model_genes(model_copy, logger)

    _log_translation_statistics(stats, unmapped, multi, original_genes, model_copy.genes, logger)

    logger.info("Translation finished")
    return model_copy, reaction_translation_issues
| 426 | 913 | 
|  | 914 | 
|  | 915 # ---------- helper functions ---------- | 
def _create_gene_mapping(mapping_df, source_col: str, target_col: str, logger: logging.Logger) -> Dict[str, List[str]]:
    """
    Build the lookup used for translation: normalized source ID -> list of target IDs.

    Rows with a missing source or target are discarded. Source IDs are passed
    through _normalize_gene_id, target IDs are only stripped of surrounding
    whitespace, and duplicated (source, target) pairs are collapsed.

    Args:
        mapping_df: DataFrame holding at least source_col and target_col.
        source_col: Name of the column with source identifiers.
        target_col: Name of the column with target identifiers.
        logger: Logger receiving row counts and 1:1 / 1:many statistics.

    Returns:
        Plain dict mapping each source ID to its target IDs, in first-seen order.
    """
    pairs = mapping_df[[source_col, target_col]].dropna().copy()
    # Everything is compared as text from here on
    pairs[source_col] = pairs[source_col].astype(str).map(_normalize_gene_id)
    pairs[target_col] = pairs[target_col].astype(str).str.strip()
    pairs = pairs.drop_duplicates()

    logger.info(f"Creating mapping from {len(pairs)} rows")

    mapping: Dict[str, List[str]] = {}
    for source_id, target_id in zip(pairs[source_col], pairs[target_col]):
        targets = mapping.setdefault(source_id, [])
        if target_id not in targets:
            targets.append(target_id)

    # Summary of how ambiguous the mapping is
    single_target = sum(len(targets) == 1 for targets in mapping.values())
    multi_target = sum(len(targets) > 1 for targets in mapping.values())
    logger.info(f"Mapping: {len(mapping)} source keys, {single_target} 1:1, {multi_target} 1:many")
    return mapping
|  | 942 | 
|  | 943 | 
|  | 944 def _translate_gpr(gpr_string: str, | 
|  | 945                    gene_mapping: Dict[str, List[str]], | 
|  | 946                    stats: Dict[str, int], | 
|  | 947                    unmapped_genes: List[str], | 
|  | 948                    multi_mapping_genes: List[Tuple[str, List[str]]], | 
| 490 | 949                    logger: logging.Logger, | 
|  | 950                    track_issues: bool = False) -> Union[str, Tuple[str, str]]: | 
| 426 | 951     """ | 
|  | 952     Translate genes inside a GPR string using gene_mapping. | 
| 490 | 953     Returns new GPR string, and optionally translation issues if track_issues=True. | 
| 426 | 954     """ | 
|  | 955     # Generic token pattern: letters, digits, :, _, -, ., (captures HGNC:1234, ENSG000..., symbols) | 
|  | 956     token_pattern = r'\b[A-Za-z0-9:_.-]+\b' | 
|  | 957     tokens = re.findall(token_pattern, gpr_string) | 
|  | 958 | 
|  | 959     logical = {'and', 'or', 'AND', 'OR', '(', ')'} | 
|  | 960     tokens = [t for t in tokens if t not in logical] | 
|  | 961 | 
|  | 962     new_gpr = gpr_string | 
| 490 | 963     issues = [] | 
| 426 | 964 | 
|  | 965     for token in sorted(set(tokens), key=lambda x: -len(x)):  # longer tokens first to avoid partial replacement | 
|  | 966         norm = _normalize_gene_id(token) | 
|  | 967         if norm in gene_mapping: | 
|  | 968             targets = gene_mapping[norm] | 
|  | 969             stats['translated'] += 1 | 
|  | 970             if len(targets) == 1: | 
|  | 971                 stats['one_to_one'] += 1 | 
|  | 972                 replacement = targets[0] | 
|  | 973             else: | 
|  | 974                 stats['one_to_many'] += 1 | 
|  | 975                 multi_mapping_genes.append((token, targets)) | 
|  | 976                 replacement = "(" + " or ".join(targets) + ")" | 
| 490 | 977                 if track_issues: | 
|  | 978                     issues.append(f"{token} -> {' or '.join(targets)}") | 
| 426 | 979 | 
|  | 980             pattern = r'\b' + re.escape(token) + r'\b' | 
|  | 981             new_gpr = re.sub(pattern, replacement, new_gpr) | 
|  | 982         else: | 
|  | 983             stats['not_found'] += 1 | 
|  | 984             if token not in unmapped_genes: | 
|  | 985                 unmapped_genes.append(token) | 
|  | 986             logger.debug(f"Token not found in mapping (left as-is): {token}") | 
|  | 987 | 
| 490 | 988     # Check for many-to-one cases (multiple source genes mapping to same target) | 
|  | 989     if track_issues: | 
|  | 990         # Build reverse mapping to detect many-to-one cases from original tokens | 
|  | 991         original_to_target = {} | 
|  | 992 | 
|  | 993         for orig_token in tokens: | 
|  | 994             norm = _normalize_gene_id(orig_token) | 
|  | 995             if norm in gene_mapping: | 
|  | 996                 targets = gene_mapping[norm] | 
|  | 997                 for target in targets: | 
|  | 998                     if target not in original_to_target: | 
|  | 999                         original_to_target[target] = [] | 
|  | 1000                     if orig_token not in original_to_target[target]: | 
|  | 1001                         original_to_target[target].append(orig_token) | 
|  | 1002 | 
|  | 1003         # Identify many-to-one mappings in this specific GPR | 
|  | 1004         for target, original_genes in original_to_target.items(): | 
|  | 1005             if len(original_genes) > 1: | 
|  | 1006                 issues.append(f"{' or '.join(original_genes)} -> {target}") | 
|  | 1007 | 
|  | 1008     issue_text = "; ".join(issues) if issues else "" | 
|  | 1009 | 
|  | 1010     if track_issues: | 
|  | 1011         return new_gpr, issue_text | 
|  | 1012     else: | 
|  | 1013         return new_gpr | 
| 426 | 1014 | 
|  | 1015 | 
def _update_model_genes(model: 'cobra.Model', logger: logging.Logger):
    """
    Rebuild model.genes from gene_reaction_rule content.

    Genes that no reaction rule references are removed from model.genes and
    genes that appear in a rule but are missing from model.genes are added.
    Failures on individual genes are logged and do not abort the update.

    Args:
        model: COBRA model whose gene list is synchronised in place.
        logger: Logger receiving per-gene problems and the final summary.

    NOTE(review): tokens are passed through _normalize_gene_id before being
    compared with the model's gene IDs; if normalization alters an ID that is
    written un-normalized in a rule, that gene would be treated as obsolete -
    confirm against _normalize_gene_id.
    """
    gene_pattern = r'\b[A-Za-z0-9:_.-]+\b'
    logical = {'and', 'or', 'AND', 'OR', '(', ')'}
    genes_in_gpr: Set[str] = set()

    # Collect every gene ID referenced by at least one reaction rule
    for rxn in model.reactions:
        gpr = rxn.gene_reaction_rule
        if gpr and gpr.strip():
            # normalize IDs consistent with mapping normalization
            genes_in_gpr.update(
                _normalize_gene_id(tok)
                for tok in re.findall(gene_pattern, gpr)
                if tok not in logical
            )

    existing = {g.id for g in model.genes}

    # Remove genes that are no longer referenced (best effort, one gene at a time)
    removed = 0
    for gid in existing:
        if gid in genes_in_gpr:
            continue
        try:
            model.genes.remove(model.genes.get_by_id(gid))
        except Exception as exc:
            logger.debug(f"Could not remove gene object for {gid}: {exc}")
        else:
            removed += 1

    # Add genes that are referenced but missing. The container API is probed in
    # order: genes.add, then genes.append, then model.add_genes.
    added = 0
    for gid in genes_in_gpr:
        if gid in existing:
            continue
        new_gene = cobra.Gene(gid)
        attempts = (
            lambda gene: model.genes.add(gene),
            lambda gene: model.genes.append(gene),
            lambda gene: model.add_genes([gene]),
        )
        for attempt in attempts:
            try:
                attempt(new_gene)
            except Exception:
                continue
            # Count only genes that were really added
            added += 1
            break
        else:
            logger.warning(f"Could not add gene object for {gid}")

    logger.info(f"Model genes updated: removed {removed}, added {added}")
|  | 1069 | 
|  | 1070 | 
|  | 1071 def _log_translation_statistics(stats: Dict[str, int], | 
|  | 1072                                unmapped_genes: List[str], | 
|  | 1073                                multi_mapping_genes: List[Tuple[str, List[str]]], | 
|  | 1074                                original_genes: Set[str], | 
|  | 1075                                final_genes, | 
|  | 1076                                logger: logging.Logger): | 
|  | 1077     logger.info("=== TRANSLATION STATISTICS ===") | 
|  | 1078     logger.info(f"Translated: {stats.get('translated', 0)} (1:1 = {stats.get('one_to_one', 0)}, 1:many = {stats.get('one_to_many', 0)})") | 
|  | 1079     logger.info(f"Not found tokens: {stats.get('not_found', 0)}") | 
| 455 | 1080     logger.info(f"Simplified GPRs: {stats.get('simplified_gprs', 0)}") | 
| 493 | 1081     logger.info(f"Flattened OR-only GPRs with issues: {stats.get('flattened_or_gprs', 0)}") | 
| 426 | 1082 | 
|  | 1083     final_ids = {g.id for g in final_genes} | 
|  | 1084     logger.info(f"Genes in model: {len(original_genes)} -> {len(final_ids)}") | 
|  | 1085 | 
|  | 1086     if unmapped_genes: | 
|  | 1087         logger.warning(f"Unmapped tokens ({len(unmapped_genes)}): {', '.join(unmapped_genes[:20])}{(' ...' if len(unmapped_genes)>20 else '')}") | 
|  | 1088     if multi_mapping_genes: | 
|  | 1089         logger.info(f"Multi-mapping examples ({len(multi_mapping_genes)}):") | 
|  | 1090         for orig, targets in multi_mapping_genes[:10]: | 
| 490 | 1091             logger.info(f"  {orig} -> {', '.join(targets)}") | 
| 493 | 1092 | 
|  | 1093     # Log summary of flattened GPRs if any | 
|  | 1094     if stats.get('flattened_or_gprs', 0) > 0: | 
|  | 1095         logger.info(f"Flattened {stats['flattened_or_gprs']} OR-only GPRs that had translation issues (removed parentheses, created unique gene sets)") | 
| 490 | 1096 | 
|  | 1097 |