"""
Utilities for generating and manipulating COBRA models and related metadata.

This module includes helpers to:
- extract rules, reactions, bounds, objective coefficients, and compartments
- build a COBRA model from a tabular file
- set objective and medium from dataframes
- validate a model and convert gene identifiers
- translate model GPRs using mapping tables
"""
import os
import cobra
import pandas as pd
import re
import logging
from typing import Any, Optional, Tuple, Union, List, Dict, Set
from collections import defaultdict
import utils.rule_parsing as rulesUtils
import utils.reaction_parsing as reactionUtils
from cobra import Model as cobraModel, Reaction, Metabolite
import sys


############################ check_methods ####################################
def gene_type(l: str, name: str) -> str:
    """
    Determine the type of gene ID.

    Args:
        l (str): The gene identifier to check.
        name (str): The name of the dataset, used in error messages.

    Returns:
        str: The type of gene ID ('hugo_id', 'ensembl_gene_id', 'symbol', or 'entrez_id').

    Raises:
        SystemExit: If the gene ID type is not supported, execution is aborted.
    """
    if check_hgnc(l):
        return 'hugo_id'
    elif check_ensembl(l):
        return 'ensembl_gene_id'
    elif check_symbol(l):
        return 'symbol'
    elif check_entrez(l):
        return 'entrez_id'
    else:
        sys.exit('Execution aborted:\n' +
                 'gene ID type in ' + name + ' not supported. Supported ID ' +
                 'types are: HUGO ID, Ensembl ID, HUGO symbol, Entrez ID\n')

def check_hgnc(l: str) -> bool:
    """
    Check if a gene identifier follows the HGNC format.

    Args:
        l (str): The gene identifier to check.

    Returns:
        bool: True if the gene identifier follows the HGNC format, False otherwise.
    """
    if len(l) > 5:
        if (l.upper()).startswith('HGNC:'):
            return l[5:].isdigit()
        else:
            return False
    else:
        return False

def check_ensembl(l: str) -> bool:
    """
    Check if a gene identifier follows the Ensembl format.

    Args:
        l (str): The gene identifier to check.

    Returns:
        bool: True if the gene identifier follows the Ensembl format, False otherwise.
    """
    return l.upper().startswith('ENS')


def check_symbol(l: str) -> bool:
    """
    Check if a gene identifier follows the symbol format.

    Args:
        l (str): The gene identifier to check.

    Returns:
        bool: True if the gene identifier follows the symbol format, False otherwise.
    """
    if len(l) > 0:
        if l[0].isalpha() and l[1:].isalnum():
            return True
        else:
            return False
    else:
        return False

def check_entrez(l: str) -> bool:
    """
    Check if a gene identifier follows the Entrez ID format.

    Args:
        l (str): The gene identifier to check.

    Returns:
        bool: True if the gene identifier follows the Entrez ID format, False otherwise.
    """
    if len(l) > 0:
        return l.isdigit()
    else:
        return False

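# Illustrative usage sketch (not part of the original API): how the checkers
# above classify a few identifier formats. The sample IDs are assumptions
# chosen only for demonstration.
def _example_gene_id_checks() -> None:
    for gene_id in ["HGNC:5", "ENSG00000139618", "TP53", "7157"]:
        # gene_type() falls through check_hgnc / check_ensembl / check_symbol / check_entrez
        print(gene_id, "->", gene_type(gene_id, name="demo dataset"))
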
################################- DATA GENERATION -################################
ReactionId = str
def generate_rules(model: cobraModel, *, asParsed = True) -> Union[Dict[ReactionId, rulesUtils.OpList], Dict[ReactionId, str]]:
    """
    Generate a dictionary mapping reaction IDs to GPR rules from the model.

    Args:
        model: COBRA model to derive data from.
        asParsed: If True, parse rules into a nested list structure; otherwise keep raw strings.

    Returns:
        Dict[ReactionId, rulesUtils.OpList]: Parsed rules by reaction ID.
        Dict[ReactionId, str]: Raw rules by reaction ID.
    """
    _ruleGetter   =  lambda reaction : reaction.gene_reaction_rule
    ruleExtractor = (lambda reaction :
        rulesUtils.parseRuleToNestedList(_ruleGetter(reaction))) if asParsed else _ruleGetter

    return {
        reaction.id : ruleExtractor(reaction)
        for reaction in model.reactions
        if reaction.gene_reaction_rule }

def generate_reactions(model: cobraModel, *, asParsed = True) -> Dict[ReactionId, str]:
    """
    Generate a dictionary mapping reaction IDs to reaction formulas from the model.

    Args:
        model: COBRA model to derive data from.
        asParsed: If True, convert formulas into a parsed representation; otherwise keep raw strings.

    Returns:
        Dict[ReactionId, str]: Reactions by reaction ID (parsed if requested).
    """

    unparsedReactions = {
        reaction.id : reaction.reaction
        for reaction in model.reactions
        if reaction.reaction
    }

    if not asParsed: return unparsedReactions

    return reactionUtils.create_reaction_dict(unparsedReactions)

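# Illustrative usage sketch (toy reaction and GPR are assumptions): build a tiny
# in-memory model and read back its raw GPR strings and formulas. asParsed=False
# keeps the example independent of the parsers in utils.rule_parsing /
# utils.reaction_parsing.
def _example_generate_rules_and_reactions() -> None:
    toy = cobraModel("toy")
    r1 = Reaction("R1")
    r1.add_metabolites({Metabolite("a_c", compartment="c"): -1.0,
                        Metabolite("b_c", compartment="c"): 1.0})
    r1.gene_reaction_rule = "gene1 or gene2"
    toy.add_reactions([r1])

    print(generate_rules(toy, asParsed=False))      # {'R1': 'gene1 or gene2'}
    print(generate_reactions(toy, asParsed=False))  # {'R1': 'a_c --> b_c'}
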
def get_medium(model: cobraModel) -> pd.DataFrame:
    """
    Extract the uptake reactions representing the model medium.

    Returns a DataFrame with a single column 'reaction' listing exchange reactions
    with negative lower bound and no positive stoichiometric coefficients (uptake only).
    """
    trueMedium=[]
    for r in model.reactions:
        positiveCoeff=0
        for m in r.metabolites:
            if r.get_coefficient(m.id)>0:
                positiveCoeff=1
        if (positiveCoeff==0 and r.lower_bound<0):
            trueMedium.append(r.id)

    df_medium = pd.DataFrame()
    df_medium["reaction"] = trueMedium
    return df_medium

def extract_objective_coefficients(model: cobraModel) -> pd.DataFrame:
    """
    Extract objective coefficients for each reaction.

    Args:
        model: COBRA model

    Returns:
        pd.DataFrame with columns: ReactionID, ObjectiveCoefficient
    """
    coeffs = []
    # model.objective.expression is a linear expression
    objective_expr = model.objective.expression.as_coefficients_dict()

    for reaction in model.reactions:
        coeff = objective_expr.get(reaction.forward_variable, 0.0)
        coeffs.append({
            "ReactionID": reaction.id,
            "ObjectiveCoefficient": coeff
        })

    return pd.DataFrame(coeffs)

def generate_bounds(model: cobraModel) -> pd.DataFrame:
    """
    Build a DataFrame of lower/upper bounds for all reactions.

    Returns:
        pd.DataFrame indexed by reaction IDs with columns ['lower_bound', 'upper_bound'].
    """

    rxns = []
    for reaction in model.reactions:
        rxns.append(reaction.id)

    bounds = pd.DataFrame(columns = ["lower_bound", "upper_bound"], index=rxns)

    for reaction in model.reactions:
        bounds.loc[reaction.id] = [reaction.lower_bound, reaction.upper_bound]
    return bounds

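# Illustrative usage sketch (toy exchange reaction and objective are assumptions):
# one small model exercising get_medium, extract_objective_coefficients and
# generate_bounds together.
def _example_model_summaries() -> None:
    toy = cobraModel("toy")
    glc = Metabolite("glc__D_e", compartment="e")
    ex = Reaction("EX_glc__D_e")
    ex.add_metabolites({glc: -1.0})
    ex.lower_bound = -10.0
    growth = Reaction("R_growth")
    growth.add_metabolites({glc: -1.0})
    toy.add_reactions([ex, growth])
    toy.objective = "R_growth"

    print(get_medium(toy))                      # lists EX_glc__D_e as an uptake reaction
    print(extract_objective_coefficients(toy))  # R_growth carries coefficient 1.0
    print(generate_bounds(toy))                 # bounds table indexed by reaction ID
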


def generate_compartments(model: cobraModel) -> pd.DataFrame:
    """
    Generates a DataFrame of pathway annotations for each reaction (despite the
    name, this reads the 'pathways' entry of each reaction's annotation).
    Creates one column per pathway position (Pathway_1, Pathway_2, etc.).

    Args:
        model: the COBRA model to extract pathway data from.

    Returns:
        pd.DataFrame: DataFrame with ReactionID and pathway columns
    """
    pathway_data = []

    # First pass: determine the maximum number of pathways any reaction has
    max_pathways = 0
    reaction_pathways = {}

    for reaction in model.reactions:
        # Get the pathway annotation(s) of the reaction (may be a list or a single value)
        pathways = reaction.annotation.get('pathways', [])
        if isinstance(pathways, list):
            reaction_pathways[reaction.id] = pathways
        else:
            reaction_pathways[reaction.id] = [pathways]
        max_pathways = max(max_pathways, len(reaction_pathways[reaction.id]))

    # Create column names for pathways
    pathway_columns = [f"Pathway_{i+1}" for i in range(max_pathways)]

    # Second pass: create the data
    for reaction_id, pathways in reaction_pathways.items():
        row = {"ReactionID": reaction_id}

        # Fill pathway columns
        for i in range(max_pathways):
            col_name = pathway_columns[i]
            if i < len(pathways):
                row[col_name] = pathways[i]
            else:
                row[col_name] = None  # or "" if you prefer empty strings

        pathway_data.append(row)

    return pd.DataFrame(pathway_data)



def build_cobra_model_from_csv(csv_path: str, model_id: str = "new_model") -> cobraModel:
    """
    Build a COBRApy model from a tabular file with reaction data.

    Args:
        csv_path: Path to the tab-separated file.
        model_id: ID for the newly created model.

    Returns:
        cobra.Model: The constructed COBRApy model.
    """

    df = pd.read_csv(csv_path, sep='\t')

    model = cobraModel(model_id)

    metabolites_dict = {}
    compartments_dict = {}

    print(f"Building model from {len(df)} reactions...")

    for idx, row in df.iterrows():
        reaction_formula = str(row['Formula']).strip()
        if not reaction_formula or reaction_formula == 'nan':
            continue

        metabolites = extract_metabolites_from_reaction(reaction_formula)

        for met_id in metabolites:
            compartment = extract_compartment_from_metabolite(met_id)

            if compartment not in compartments_dict:
                compartments_dict[compartment] = compartment

            if met_id not in metabolites_dict:
                metabolites_dict[met_id] = Metabolite(
                    id=met_id,
                    compartment=compartment,
                    name=met_id.replace(f"_{compartment}", "").replace("__", "_")
                )

    model.compartments = compartments_dict

    model.add_metabolites(list(metabolites_dict.values()))

    print(f"Added {len(metabolites_dict)} metabolites and {len(compartments_dict)} compartments")

    reactions_added = 0
    reactions_skipped = 0

    for idx, row in df.iterrows():

        reaction_id = str(row['ReactionID']).strip()
        reaction_formula = str(row['Formula']).strip()

        if not reaction_formula or reaction_formula == 'nan':
            raise ValueError(f"Missing reaction formula for {reaction_id}")

        reaction = Reaction(reaction_id)
        reaction.name = reaction_id

        reaction.lower_bound = float(row['lower_bound']) if pd.notna(row['lower_bound']) else -1000.0
        reaction.upper_bound = float(row['upper_bound']) if pd.notna(row['upper_bound']) else 1000.0

        if pd.notna(row['GPR']) and str(row['GPR']).strip():
            reaction.gene_reaction_rule = str(row['GPR']).strip()

        try:
            parse_reaction_formula(reaction, reaction_formula, metabolites_dict)
        except Exception as e:
            print(f"Error parsing reaction {reaction_id}: {e}")
            reactions_skipped += 1
            continue

        model.add_reactions([reaction])
        reactions_added += 1


    print(f"Added {reactions_added} reactions, skipped {reactions_skipped} reactions")

    # set objective function
    set_objective_from_csv(model, df, obj_col="ObjectiveCoefficient")

    set_medium_from_data(model, df)

    print(f"Model completed: {len(model.reactions)} reactions, {len(model.metabolites)} metabolites")

    return model

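# Illustrative usage sketch (file name and reactions are assumptions): write a
# minimal tab-separated table with the columns this builder reads (ReactionID,
# Formula, lower_bound, upper_bound, GPR, ObjectiveCoefficient, InMedium) and
# load it back as a COBRA model.
def _example_build_model_from_tsv(path: str = "toy_model.tsv") -> cobraModel:
    toy_df = pd.DataFrame({
        "ReactionID": ["EX_glc__D_e", "R_demo"],
        "Formula": ["glc__D_e <=>", "glc__D_e --> pyr_c"],
        "lower_bound": [-10.0, 0.0],
        "upper_bound": [1000.0, 1000.0],
        "GPR": ["", "gene1 or gene2"],
        "ObjectiveCoefficient": [0.0, 1.0],
        "InMedium": [True, False],
    })
    toy_df.to_csv(path, sep="\t", index=False)
    return build_cobra_model_from_csv(path, model_id="toy")
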

# Extract all metabolite IDs from a formula (handles numeric prefixes + underscores)
def extract_metabolites_from_reaction(reaction_formula: str) -> Set[str]:
    """
    Extract metabolite IDs from a reaction formula.
    Robust pattern: tokens ending with _<compartment> (e.g., _c, _m, _e),
    allowing leading digits/underscores.
    """
    metabolites = set()
    # optional coefficient followed by a token ending with _<letters>
    pattern = r'(?:\d+(?:\.\d+)?\s+)?([A-Za-z0-9_]+_[a-z]+)'
    matches = re.findall(pattern, reaction_formula)
    metabolites.update(matches)
    return metabolites


def extract_compartment_from_metabolite(metabolite_id: str) -> str:
    """Extract the compartment from a metabolite ID."""
    if '_' in metabolite_id:
        return metabolite_id.split('_')[-1]
    return 'c'  # default cytoplasm


def parse_reaction_formula(reaction: Reaction, formula: str, metabolites_dict: Dict[str, Metabolite]):
    """Parse a reaction formula and set metabolites with their coefficients."""

    if '<=>' in formula:
        left, right = formula.split('<=>')
    elif '<--' in formula:
        # reverse arrow: products are written on the left, reactants on the right
        right, left = formula.split('<--')
    elif '-->' in formula:
        left, right = formula.split('-->')
    elif '<-' in formula:
        # reverse arrow: products are written on the left, reactants on the right
        right, left = formula.split('<-')
    else:
        raise ValueError(f"Unrecognized reaction format: {formula}")

    reactants = parse_metabolites_side(left.strip())
    products = parse_metabolites_side(right.strip())

    metabolites_to_add = {}

    for met_id, coeff in reactants.items():
        if met_id in metabolites_dict:
            metabolites_to_add[metabolites_dict[met_id]] = -coeff

    for met_id, coeff in products.items():
        if met_id in metabolites_dict:
            metabolites_to_add[metabolites_dict[met_id]] = coeff

    reaction.add_metabolites(metabolites_to_add)


def parse_metabolites_side(side_str: str) -> Dict[str, float]:
    """Parse one side of a reaction and extract metabolites with coefficients."""
    metabolites = {}
    if not side_str or side_str.strip() == '':
        return metabolites

    terms = side_str.split('+')
    for term in terms:
        term = term.strip()
        if not term:
            continue

        # optional coefficient + id ending with _<compartment>
        match = re.match(r'(?:(\d+\.?\d*)\s+)?([A-Za-z0-9_]+_[a-z]+)', term)
        if match:
            coeff_str, met_id = match.groups()
            coeff = float(coeff_str) if coeff_str else 1.0
            metabolites[met_id] = coeff

    return metabolites

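# Illustrative usage sketch (toy formula is an assumption): how the three parsing
# helpers above decompose a reaction string.
def _example_formula_parsing() -> None:
    formula = "2 atp_c + h2o_c --> adp_c + pi_c"
    mets = extract_metabolites_from_reaction(formula)
    print(mets)                                                       # {'atp_c', 'h2o_c', 'adp_c', 'pi_c'}
    print({m: extract_compartment_from_metabolite(m) for m in mets})  # all map to 'c'
    print(parse_metabolites_side("2 atp_c + h2o_c"))                  # {'atp_c': 2.0, 'h2o_c': 1.0}
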


def set_objective_from_csv(model: cobra.Model, df: pd.DataFrame, obj_col: str = "ObjectiveCoefficient"):
    """
    Sets the model's objective function based on a column of coefficients in the CSV.
    Can be any reaction(s), not necessarily biomass.
    """
    obj_dict = {}

    for idx, row in df.iterrows():
        reaction_id = str(row['ReactionID']).strip()
        coeff = float(row[obj_col]) if pd.notna(row[obj_col]) else 0.0
        if coeff != 0:
            if reaction_id in model.reactions:
                obj_dict[model.reactions.get_by_id(reaction_id)] = coeff
            else:
                print(f"Warning: reaction {reaction_id} not found in model, skipping for objective.")

    if not obj_dict:
        raise ValueError("No reactions found with non-zero objective coefficient.")

    model.objective = obj_dict
    print(f"Objective set with {len(obj_dict)} reactions.")




def set_medium_from_data(model: cobraModel, df: pd.DataFrame):
    """Set the medium based on the 'InMedium' column in the dataframe."""
    medium_reactions = df[df['InMedium'] == True]['ReactionID'].tolist()

    medium_dict = {}
    for rxn_id in medium_reactions:
        if rxn_id in [r.id for r in model.reactions]:
            reaction = model.reactions.get_by_id(rxn_id)
            if reaction.lower_bound < 0:  # only uptake reactions
                medium_dict[rxn_id] = abs(reaction.lower_bound)

    if medium_dict:
        model.medium = medium_dict
        print(f"Medium set with {len(medium_dict)} components")


def validate_model(model: cobraModel) -> Dict[str, Any]:
    """Validate the model and return basic statistics."""
    validation = {
        'num_reactions': len(model.reactions),
        'num_metabolites': len(model.metabolites),
        'num_genes': len(model.genes),
        'num_compartments': len(model.compartments),
        'objective': str(model.objective),
        'medium_size': len(model.medium),
        'reversible_reactions': len([r for r in model.reactions if r.reversibility]),
        'exchange_reactions': len([r for r in model.reactions if r.id.startswith('EX_')]),
    }

    try:
        # Growth test
        solution = model.optimize()
        validation['growth_rate'] = solution.objective_value
        validation['status'] = solution.status
    except Exception as e:
        validation['growth_rate'] = None
        validation['status'] = f"Error: {e}"

    return validation

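# Illustrative usage sketch (toy model is an assumption; assumes cobra's default
# LP solver is available for the optimize() call inside validate_model).
def _example_validate_model() -> None:
    toy = cobraModel("toy")
    ex = Reaction("EX_a_e")
    ex.add_metabolites({Metabolite("a_e", compartment="e"): -1.0})
    ex.lower_bound = -10.0
    toy.add_reactions([ex])
    toy.objective = "EX_a_e"
    for key, value in validate_model(toy).items():
        print(f"{key}: {value}")
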
def convert_genes(model, annotation):
    """Rename genes using a selected annotation key in gene.notes; returns a model copy."""
    from cobra.manipulation import rename_genes
    model2 = model.copy()
    try:
        dict_genes = {gene.id: gene.notes[annotation] for gene in model2.genes}
    except KeyError:
        print("No annotation in gene dict!")
        return -1
    rename_genes(model2, dict_genes)

    return model2

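# Illustrative usage sketch (the 'symbol' notes key and gene IDs are assumptions):
# convert_genes renames genes using a key stored in each gene's notes dict.
def _example_convert_genes() -> cobraModel:
    toy = cobraModel("toy")
    rxn = Reaction("R1")
    rxn.add_metabolites({Metabolite("a_c", compartment="c"): -1.0})
    rxn.gene_reaction_rule = "HGNC:1100"
    toy.add_reactions([rxn])
    toy.genes.get_by_id("HGNC:1100").notes["symbol"] = "BRCA1"
    return convert_genes(toy, "symbol")  # the returned copy uses 'BRCA1' as the gene ID
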
# ---------- Utility helpers ----------
def _normalize_colname(col: str) -> str:
    return col.strip().lower().replace(' ', '_')

def _choose_columns(mapping_df: 'pd.DataFrame') -> Dict[str, str]:
    """
    Find useful columns and return a dict {ensg: colname1, hgnc_id: colname2, ...}.
    Returns an empty dict if no suitable columns are found (the caller raises).
    """
    cols = { _normalize_colname(c): c for c in mapping_df.columns }
    chosen = {}
    # candidate names for each category
    candidates = {
        'ensg': ['ensg', 'ensembl_gene_id', 'ensembl'],
        'hgnc_id': ['hgnc_id', 'hgnc', 'hgnc:'],
        'hgnc_symbol': ['hgnc_symbol', 'hgnc symbol', 'symbol'],
        'entrez_id': ['entrez', 'entrez_id', 'entrezgene'],
        'gene_number': ['gene_number']
    }
    for key, names in candidates.items():
        for n in names:
            if n in cols:
                chosen[key] = cols[n]
                break
    return chosen

def _validate_target_uniqueness(mapping_df: 'pd.DataFrame',
                                source_col: str,
                                target_col: str,
                                model_source_genes: Optional[Set[str]] = None,
                                logger: Optional[logging.Logger] = None) -> None:
    """
    Check that, within the filtered mapping_df, each target maps to at most one source.
    Log examples if duplicates are found.
    """
    if logger is None:
        logger = logging.getLogger(__name__)

    if mapping_df.empty:
        logger.warning("Mapping dataframe is empty for the requested source genes; skipping uniqueness validation.")
        return

    # normalize temporary columns for grouping (without altering the original df)
    tmp = mapping_df[[source_col, target_col]].copy()
    tmp['_src_norm'] = tmp[source_col].astype(str).map(_normalize_gene_id)
    tmp['_tgt_norm'] = tmp[target_col].astype(str).str.strip()

    # optionally filter to the set of model source genes
    if model_source_genes is not None:
        tmp = tmp[tmp['_src_norm'].isin(model_source_genes)]

    if tmp.empty:
        logger.warning("After filtering to model source genes, mapping table is empty; nothing to validate.")
        return

    # build reverse mapping: target -> set(sources)
    grouped = tmp.groupby('_tgt_norm')['_src_norm'].agg(lambda s: set(s.dropna()))
    # find targets with more than one source
    problematic = {t: sorted(list(s)) for t, s in grouped.items() if len(s) > 1}

    if problematic:
        # prepare a warning message with examples (limited subset)
        sample_items = list(problematic.items())
        msg_lines = ["Mapping validation failed: some target IDs are associated with multiple source IDs."]
        for tgt, sources in sample_items:
            msg_lines.append(f"  - target '{tgt}' <- sources: {', '.join(sources)}")
        full_msg = "\n".join(msg_lines)
        logger.warning(full_msg)
        return

    # if everything is fine
    logger.info("Mapping validation passed: no target ID is associated with multiple source IDs (within filtered set).")


def _normalize_gene_id(g: str) -> str:
    """Normalize a gene ID for use as a key (removes prefixes like 'HGNC:' and strips)."""
    if g is None:
        return ""
    g = str(g).strip()
    # remove common prefixes
    g = re.sub(r'^(HGNC:)', '', g, flags=re.IGNORECASE)
    g = re.sub(r'^(ENSG:)', '', g, flags=re.IGNORECASE)
    return g

def _simplify_boolean_expression(expr: str) -> str:
    """
    Simplify a boolean expression by removing duplicates while strictly preserving semantics.
    This function handles simple duplicates within parentheses while being conservative about
    complex expressions that could change semantics.
    """
    if not expr or not expr.strip():
        return expr

    # Normalize operators and whitespace
    expr = expr.replace(' AND ', ' and ').replace(' OR ', ' or ')
    expr = ' '.join(expr.split())  # Normalize whitespace

    def simplify_parentheses_content(match_obj):
        """Helper function to simplify content within parentheses."""
        content = match_obj.group(1)  # Content inside parentheses

        # Only simplify if it's a pure OR or pure AND chain
        if ' or ' in content and ' and ' not in content:
            # Pure OR chain - safe to deduplicate
            parts = [p.strip() for p in content.split(' or ') if p.strip()]
            unique_parts = []
            seen = set()
            for part in parts:
                if part not in seen:
                    unique_parts.append(part)
                    seen.add(part)

            if len(unique_parts) == 1:
                return unique_parts[0]  # Remove unnecessary parentheses for single items
            else:
                return '(' + ' or '.join(unique_parts) + ')'

        elif ' and ' in content and ' or ' not in content:
            # Pure AND chain - safe to deduplicate
            parts = [p.strip() for p in content.split(' and ') if p.strip()]
            unique_parts = []
            seen = set()
            for part in parts:
                if part not in seen:
                    unique_parts.append(part)
                    seen.add(part)

            if len(unique_parts) == 1:
                return unique_parts[0]  # Remove unnecessary parentheses for single items
            else:
                return '(' + ' and '.join(unique_parts) + ')'
        else:
            # Mixed operators or single item - return with parentheses as-is
            return '(' + content + ')'

    def remove_duplicates_simple(parts_str: str, separator: str) -> str:
        """Remove duplicates from a simple chain of operations."""
        # split on the spaced operator so gene names containing 'and'/'or' are not broken up
        parts = [p.strip() for p in parts_str.split(f' {separator} ') if p.strip()]

        # Remove duplicates while preserving order
        unique_parts = []
        seen = set()
        for part in parts:
            if part not in seen:
                unique_parts.append(part)
                seen.add(part)

        if len(unique_parts) == 1:
            return unique_parts[0]
        else:
            return f' {separator} '.join(unique_parts)

    try:
        # First, simplify content within parentheses
        # This handles cases like (A or A) -> A and (B and B) -> B
        expr_simplified = re.sub(r'\(([^()]+)\)', simplify_parentheses_content, expr)

        # Check if the resulting expression has mixed operators
        has_and = ' and ' in expr_simplified
        has_or = ' or ' in expr_simplified

        # Only simplify top-level if it's pure AND or pure OR
        if has_and and not has_or and '(' not in expr_simplified:
            # Pure AND chain at top level - safe to deduplicate
            return remove_duplicates_simple(expr_simplified, 'and')
        elif has_or and not has_and and '(' not in expr_simplified:
            # Pure OR chain at top level - safe to deduplicate
            return remove_duplicates_simple(expr_simplified, 'or')
        else:
            # Mixed operators or has parentheses - return the simplified version (with parentheses content cleaned)
            return expr_simplified

    except Exception:
        # If anything goes wrong, return the original expression
        return expr

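# Illustrative usage sketch (toy GPR strings are assumptions): duplicate removal
# performed by _simplify_boolean_expression; mixed and/or expressions are left
# untouched.
def _example_simplify_gpr() -> None:
    print(_simplify_boolean_expression("(gene1 or gene1) and gene2"))  # gene1 and gene2
    print(_simplify_boolean_expression("gene1 or gene1 or gene2"))     # gene1 or gene2
    print(_simplify_boolean_expression("gene1 and (gene2 or gene3)"))  # unchanged
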

def translate_model_genes(model: 'cobra.Model',
                         mapping_df: 'pd.DataFrame',
                         target_nomenclature: str,
                         source_nomenclature: str = 'hgnc_id',
                         allow_many_to_one: bool = False,
                         logger: Optional[logging.Logger] = None) -> Tuple['cobra.Model', Dict[str, str]]:
    """
    Translate model genes from source_nomenclature to target_nomenclature using a mapping table.
    mapping_df should contain columns enabling mapping (e.g., ensg, hgnc_id, hgnc_symbol, entrez).

    Args:
        model: COBRA model to translate.
        mapping_df: DataFrame containing the mapping information.
        target_nomenclature: Desired target key (e.g., 'hgnc_symbol').
        source_nomenclature: Current source key in the model (default 'hgnc_id').
        allow_many_to_one: If True, allow many-to-one mappings and handle duplicates in GPRs.
        logger: Optional logger.

    Returns:
        Tuple containing:
        - Translated COBRA model
        - Dictionary mapping reaction IDs to translation issue descriptions
    """
    if logger is None:
        logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
        logger = logging.getLogger(__name__)

    logger.info(f"Translating genes from '{source_nomenclature}' to '{target_nomenclature}'")

    # normalize column names and choose relevant columns
    chosen = _choose_columns(mapping_df)
    if not chosen:
        raise ValueError("Could not detect useful columns in mapping_df. Expected at least one of: ensg, hgnc_id, hgnc_symbol, entrez.")

    # map source/target to actual dataframe column names (allow user-specified source/target keys)
    # normalize input args
    src_key = source_nomenclature.strip().lower()
    tgt_key = target_nomenclature.strip().lower()

    # try to find the actual column names for requested keys
    col_for_src = None
    col_for_tgt = None
    # first, try exact match
    for k, actual in chosen.items():
        if k == src_key:
            col_for_src = actual
        if k == tgt_key:
            col_for_tgt = actual

    # if not found, try mapping common names
    if col_for_src is None:
        possible_src_names = {k: v for k, v in chosen.items()}
        # try to match by contained substring
        for k, actual in possible_src_names.items():
            if src_key in k:
                col_for_src = actual
                break

    if col_for_tgt is None:
        for k, actual in chosen.items():
            if tgt_key in k:
                col_for_tgt = actual
                break

    if col_for_src is None:
        raise ValueError(f"Source column for '{source_nomenclature}' not found in mapping dataframe.")
    if col_for_tgt is None:
        raise ValueError(f"Target column for '{target_nomenclature}' not found in mapping dataframe.")

    model_source_genes = { _normalize_gene_id(g.id) for g in model.genes }
    logger.info(f"Filtering mapping to {len(model_source_genes)} source genes present in model (normalized).")

    tmp_map = mapping_df[[col_for_src, col_for_tgt]].dropna().copy()
    tmp_map[col_for_src + "_norm"] = tmp_map[col_for_src].astype(str).map(_normalize_gene_id)

    filtered_map = tmp_map[tmp_map[col_for_src + "_norm"].isin(model_source_genes)].copy()

    if filtered_map.empty:
        logger.warning("No mapping rows correspond to source genes present in the model after filtering. Proceeding with empty mapping (no translation will occur).")

    if not allow_many_to_one:
        _validate_target_uniqueness(filtered_map, col_for_src, col_for_tgt, model_source_genes=model_source_genes, logger=logger)

    # create the mapping
    gene_mapping = _create_gene_mapping(filtered_map, col_for_src, col_for_tgt, logger)

    # copy model
    model_copy = model.copy()

    # statistics
    stats = {'translated': 0, 'one_to_one': 0, 'one_to_many': 0, 'not_found': 0, 'simplified_gprs': 0}
    unmapped = []
    multi = []

    # Dictionary to store translation issues per reaction
    reaction_translation_issues = {}

    original_genes = {g.id for g in model_copy.genes}
    logger.info(f"Original genes count: {len(original_genes)}")

    # translate GPRs
    for rxn in model_copy.reactions:
        gpr = rxn.gene_reaction_rule
        if gpr and gpr.strip():
            new_gpr, rxn_issues = _translate_gpr(gpr, gene_mapping, stats, unmapped, multi, logger, track_issues=True)
            if rxn_issues:
                reaction_translation_issues[rxn.id] = rxn_issues

            if new_gpr != gpr:
                simplified_gpr = _simplify_boolean_expression(new_gpr)
                if simplified_gpr != new_gpr:
                    stats['simplified_gprs'] += 1
                    logger.debug(f"Simplified GPR for {rxn.id}: '{new_gpr}' -> '{simplified_gpr}'")
                rxn.gene_reaction_rule = simplified_gpr
                logger.debug(f"Reaction {rxn.id}: '{gpr}' -> '{simplified_gpr}'")

    # update model genes based on new GPRs
    _update_model_genes(model_copy, logger)

    # final logging
    _log_translation_statistics(stats, unmapped, multi, original_genes, model_copy.genes, logger)

    logger.info("Translation finished")
    return model_copy, reaction_translation_issues

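# Illustrative usage sketch (gene IDs, symbols and the mapping table are
# assumptions): translate the HGNC IDs in a toy model's GPRs to HGNC symbols.
def _example_translate_model_genes() -> None:
    toy = cobraModel("toy")
    rxn = Reaction("R1")
    rxn.add_metabolites({Metabolite("a_c", compartment="c"): -1.0})
    rxn.gene_reaction_rule = "HGNC:1100 or HGNC:1101"
    toy.add_reactions([rxn])

    mapping_df = pd.DataFrame({
        "hgnc_id": ["HGNC:1100", "HGNC:1101"],
        "hgnc_symbol": ["BRCA1", "BRCA2"],
    })

    translated, issues = translate_model_genes(toy, mapping_df,
                                               target_nomenclature="hgnc_symbol",
                                               source_nomenclature="hgnc_id")
    print(translated.reactions.get_by_id("R1").gene_reaction_rule)  # BRCA1 or BRCA2
    print(issues)  # empty here; populated for one-to-many / many-to-one mappings
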


# ---------- helper functions ----------
def _create_gene_mapping(mapping_df, source_col: str, target_col: str, logger: logging.Logger) -> Dict[str, List[str]]:
    """
    Build mapping dict: source_id -> list of target_ids
    Normalizes IDs (removes prefixes like 'HGNC:' etc).
    """
    df = mapping_df[[source_col, target_col]].dropna().copy()
    # normalize to string
    df[source_col] = df[source_col].astype(str).map(_normalize_gene_id)
    df[target_col] = df[target_col].astype(str).str.strip()

    df = df.drop_duplicates()

    logger.info(f"Creating mapping from {len(df)} rows")

    mapping = defaultdict(list)
    for _, row in df.iterrows():
        s = row[source_col]
        t = row[target_col]
        if t not in mapping[s]:
            mapping[s].append(t)

    # stats
    one_to_one = sum(1 for v in mapping.values() if len(v) == 1)
    one_to_many = sum(1 for v in mapping.values() if len(v) > 1)
    logger.info(f"Mapping: {len(mapping)} source keys, {one_to_one} 1:1, {one_to_many} 1:many")
    return dict(mapping)


def _translate_gpr(gpr_string: str,
                   gene_mapping: Dict[str, List[str]],
                   stats: Dict[str, int],
                   unmapped_genes: List[str],
                   multi_mapping_genes: List[Tuple[str, List[str]]],
                   logger: logging.Logger,
                   track_issues: bool = False) -> Union[str, Tuple[str, str]]:
    """
    Translate genes inside a GPR string using gene_mapping.
    Returns new GPR string, and optionally translation issues if track_issues=True.
    """
    # Generic token pattern: letters, digits, :, _, -, . (captures HGNC:1234, ENSG000..., symbols)
    token_pattern = r'\b[A-Za-z0-9:_.-]+\b'
    tokens = re.findall(token_pattern, gpr_string)

    logical = {'and', 'or', 'AND', 'OR', '(', ')'}
    tokens = [t for t in tokens if t not in logical]

    new_gpr = gpr_string
    issues = []

    for token in sorted(set(tokens), key=lambda x: -len(x)):  # longer tokens first to avoid partial replacement
        norm = _normalize_gene_id(token)
        if norm in gene_mapping:
            targets = gene_mapping[norm]
            stats['translated'] += 1
            if len(targets) == 1:
                stats['one_to_one'] += 1
                replacement = targets[0]
            else:
                stats['one_to_many'] += 1
                multi_mapping_genes.append((token, targets))
                replacement = "(" + " or ".join(targets) + ")"
                if track_issues:
                    issues.append(f"{token} -> {' or '.join(targets)}")

            pattern = r'\b' + re.escape(token) + r'\b'
            new_gpr = re.sub(pattern, replacement, new_gpr)
        else:
            stats['not_found'] += 1
            if token not in unmapped_genes:
                unmapped_genes.append(token)
            logger.debug(f"Token not found in mapping (left as-is): {token}")

    # Check for many-to-one cases (multiple source genes mapping to same target)
    if track_issues:
        # Build reverse mapping to detect many-to-one cases from original tokens
        original_to_target = {}

        for orig_token in tokens:
            norm = _normalize_gene_id(orig_token)
            if norm in gene_mapping:
                targets = gene_mapping[norm]
                for target in targets:
                    if target not in original_to_target:
                        original_to_target[target] = []
                    if orig_token not in original_to_target[target]:
                        original_to_target[target].append(orig_token)

        # Identify many-to-one mappings in this specific GPR
        for target, original_genes in original_to_target.items():
            if len(original_genes) > 1:
                issues.append(f"{' or '.join(original_genes)} -> {target}")

    issue_text = "; ".join(issues) if issues else ""

    if track_issues:
        return new_gpr, issue_text
    else:
        return new_gpr

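# Illustrative usage sketch (toy mapping and GPR are assumptions): calling the
# helper directly shows how the shared stats/unmapped/multi accumulators and the
# per-reaction issue text are filled.
def _example_translate_gpr() -> None:
    mapping = {"1100": ["BRCA1"], "1101": ["BRCA2", "BRCA2_alt"]}
    stats = {'translated': 0, 'one_to_one': 0, 'one_to_many': 0, 'not_found': 0}
    unmapped, multi = [], []
    new_gpr, issue_text = _translate_gpr("HGNC:1100 and HGNC:1101", mapping, stats,
                                         unmapped, multi, logging.getLogger(__name__),
                                         track_issues=True)
    print(new_gpr)     # BRCA1 and (BRCA2 or BRCA2_alt)
    print(issue_text)  # HGNC:1101 -> BRCA2 or BRCA2_alt
    print(stats)       # {'translated': 2, 'one_to_one': 1, 'one_to_many': 1, 'not_found': 0}
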

def _update_model_genes(model: 'cobra.Model', logger: logging.Logger):
    """
    Rebuild model.genes from gene_reaction_rule content.
    Removes genes not referenced and adds missing ones.
    """
    # collect genes in GPRs
    gene_pattern = r'\b[A-Za-z0-9:_.-]+\b'
    logical = {'and', 'or', 'AND', 'OR', '(', ')'}
    genes_in_gpr: Set[str] = set()

    for rxn in model.reactions:
        gpr = rxn.gene_reaction_rule
        if gpr and gpr.strip():
            toks = re.findall(gene_pattern, gpr)
            toks = [t for t in toks if t not in logical]
            # normalize IDs consistent with mapping normalization
            toks = [_normalize_gene_id(t) for t in toks]
            genes_in_gpr.update(toks)

    # existing gene ids
    existing = {g.id for g in model.genes}

    # remove obsolete genes
    to_remove = [gid for gid in existing if gid not in genes_in_gpr]
    removed = 0
    for gid in to_remove:
        try:
            gene_obj = model.genes.get_by_id(gid)
            model.genes.remove(gene_obj)
            removed += 1
        except Exception:
            # safe-ignore
            pass

    # add new genes
    added = 0
    for gid in genes_in_gpr:
        if gid not in existing:
            new_gene = cobra.Gene(gid)
            try:
                model.genes.add(new_gene)
            except Exception:
                # fallback: if model.genes doesn't support add, try append or model.add_genes
                try:
                    model.genes.append(new_gene)
                except Exception:
                    try:
                        model.add_genes([new_gene])
                    except Exception:
                        logger.warning(f"Could not add gene object for {gid}")
            added += 1

    logger.info(f"Model genes updated: removed {removed}, added {added}")


def _log_translation_statistics(stats: Dict[str, int],
                               unmapped_genes: List[str],
                               multi_mapping_genes: List[Tuple[str, List[str]]],
                               original_genes: Set[str],
                               final_genes,
                               logger: logging.Logger):
    logger.info("=== TRANSLATION STATISTICS ===")
    logger.info(f"Translated: {stats.get('translated', 0)} (1:1 = {stats.get('one_to_one', 0)}, 1:many = {stats.get('one_to_many', 0)})")
    logger.info(f"Not found tokens: {stats.get('not_found', 0)}")
    logger.info(f"Simplified GPRs: {stats.get('simplified_gprs', 0)}")

    final_ids = {g.id for g in final_genes}
    logger.info(f"Genes in model: {len(original_genes)} -> {len(final_ids)}")

    if unmapped_genes:
        logger.warning(f"Unmapped tokens ({len(unmapped_genes)}): {', '.join(unmapped_genes[:20])}{(' ...' if len(unmapped_genes)>20 else '')}")
    if multi_mapping_genes:
        logger.info(f"Multi-mapping examples ({len(multi_mapping_genes)}):")
        for orig, targets in multi_mapping_genes[:10]:
            logger.info(f"  {orig} -> {', '.join(targets)}")