comparison COBRAxy/utils/model_utils.py @ 493:a92d21f92956 draft

Uploaded
author francesco_lapi
date Tue, 30 Sep 2025 15:00:21 +0000
parents 4ed95023af20
children a2f7a6dd9d0b
comparison
equal deleted inserted replaced
492:4ed95023af20 493:a92d21f92956
364 Robust pattern: tokens ending with _<compartment> (e.g., _c, _m, _e), 364 Robust pattern: tokens ending with _<compartment> (e.g., _c, _m, _e),
365 allowing leading digits/underscores. 365 allowing leading digits/underscores.
366 """ 366 """
367 metabolites = set() 367 metabolites = set()
368 # optional coefficient followed by a token ending with _<letters> 368 # optional coefficient followed by a token ending with _<letters>
369 pattern = r'(?:\d+(?:\.\d+)?\s+)?([A-Za-z0-9_]+_[a-z]+)' 369 if reaction_formula[-1] == ']' and reaction_formula[-3] == '[':
370 pattern = r'(?:\d+(?:\.\d+)?\s+)?([A-Za-z0-9_]+[[A-Za-z0-9]]+)'
371 else:
372 pattern = r'(?:\d+(?:\.\d+)?\s+)?([A-Za-z0-9_]+_[A-Za-z0-9]+)'
370 matches = re.findall(pattern, reaction_formula) 373 matches = re.findall(pattern, reaction_formula)
371 metabolites.update(matches) 374 metabolites.update(matches)
372 return metabolites 375 return metabolites
373 376
374 377
375 def extract_compartment_from_metabolite(metabolite_id: str) -> str: 378 def extract_compartment_from_metabolite(metabolite_id: str) -> str:
376 """Extract the compartment from a metabolite ID.""" 379 """Extract the compartment from a metabolite ID."""
377 if '_' in metabolite_id: 380 if '_' in metabolite_id:
378 return metabolite_id.split('_')[-1] 381 return metabolite_id.split('_')[-1]
382 if metabolite_id[-1] == ']' and metabolite_id[-3] == '[':
383 return metabolite_id[-2]
379 return 'c' # default cytoplasm 384 return 'c' # default cytoplasm
380 385
381 386
382 def parse_reaction_formula(reaction: Reaction, formula: str, metabolites_dict: Dict[str, Metabolite]): 387 def parse_reaction_formula(reaction: Reaction, formula: str, metabolites_dict: Dict[str, Metabolite]):
383 """Parse a reaction formula and set metabolites with their coefficients.""" 388 """Parse a reaction formula and set metabolites with their coefficients."""
595 g = str(g).strip() 600 g = str(g).strip()
596 # remove common prefixes 601 # remove common prefixes
597 g = re.sub(r'^(HGNC:)', '', g, flags=re.IGNORECASE) 602 g = re.sub(r'^(HGNC:)', '', g, flags=re.IGNORECASE)
598 g = re.sub(r'^(ENSG:)', '', g, flags=re.IGNORECASE) 603 g = re.sub(r'^(ENSG:)', '', g, flags=re.IGNORECASE)
599 return g 604 return g
605
606 def _is_or_only_expression(expr: str) -> bool:
607 """
608 Check if a GPR expression contains only OR operators (no AND operators).
609
610 Args:
611 expr: GPR expression string
612
613 Returns:
614 bool: True if expression contains only OR (and parentheses) and has multiple genes, False otherwise
615 """
616 if not expr or not expr.strip():
617 return False
618
619 # Normalize the expression
620 normalized = expr.replace(' AND ', ' and ').replace(' OR ', ' or ')
621
622 # Check if it contains any AND operators
623 has_and = ' and ' in normalized.lower()
624
625 # Check if it contains OR operators
626 has_or = ' or ' in normalized.lower()
627
628 # Must have OR operators and no AND operators
629 return has_or and not has_and
630
631
632 def _flatten_or_only_gpr(expr: str) -> str:
633 """
634 Flatten a GPR expression that contains only OR operators by:
635 1. Removing all parentheses
636 2. Extracting unique gene names
637 3. Joining them with ' or '
638
639 Args:
640 expr: GPR expression string with only OR operators
641
642 Returns:
643 str: Flattened GPR expression
644 """
645 if not expr or not expr.strip():
646 return expr
647
648 # Extract all gene tokens (exclude logical operators and parentheses)
649 gene_pattern = r'\b[A-Za-z0-9:_.-]+\b'
650 logical = {'and', 'or', 'AND', 'OR', '(', ')'}
651
652 tokens = re.findall(gene_pattern, expr)
653 genes = [t for t in tokens if t not in logical]
654
655 # Create set to remove duplicates, then convert back to list to maintain some order
656 unique_genes = list(dict.fromkeys(genes)) # Preserves insertion order
657
658 if len(unique_genes) == 0:
659 return expr
660 elif len(unique_genes) == 1:
661 return unique_genes[0]
662 else:
663 return ' or '.join(unique_genes)
664
600 665
601 def _simplify_boolean_expression(expr: str) -> str: 666 def _simplify_boolean_expression(expr: str) -> str:
602 """ 667 """
603 Simplify a boolean expression by removing duplicates while strictly preserving semantics. 668 Simplify a boolean expression by removing duplicates while strictly preserving semantics.
604 This function handles simple duplicates within parentheses while being conservative about 669 This function handles simple duplicates within parentheses while being conservative about
781 846
782 # copy model 847 # copy model
783 model_copy = model.copy() 848 model_copy = model.copy()
784 849
785 # statistics 850 # statistics
786 stats = {'translated': 0, 'one_to_one': 0, 'one_to_many': 0, 'not_found': 0, 'simplified_gprs': 0} 851 stats = {'translated': 0, 'one_to_one': 0, 'one_to_many': 0, 'not_found': 0, 'simplified_gprs': 0, 'flattened_or_gprs': 0}
787 unmapped = [] 852 unmapped = []
788 multi = [] 853 multi = []
789 854
790 # Dictionary to store translation issues per reaction 855 # Dictionary to store translation issues per reaction
791 reaction_translation_issues = {} 856 reaction_translation_issues = {}
800 new_gpr, rxn_issues = _translate_gpr(gpr, gene_mapping, stats, unmapped, multi, logger, track_issues=True) 865 new_gpr, rxn_issues = _translate_gpr(gpr, gene_mapping, stats, unmapped, multi, logger, track_issues=True)
801 if rxn_issues: 866 if rxn_issues:
802 reaction_translation_issues[rxn.id] = rxn_issues 867 reaction_translation_issues[rxn.id] = rxn_issues
803 868
804 if new_gpr != gpr: 869 if new_gpr != gpr:
870 # Check if this GPR has translation issues and contains only OR operators
871 if rxn_issues and _is_or_only_expression(new_gpr):
872 # Flatten the GPR: remove parentheses and create set of unique genes
873 flattened_gpr = _flatten_or_only_gpr(new_gpr)
874 if flattened_gpr != new_gpr:
875 stats['flattened_or_gprs'] += 1
876 logger.debug(f"Flattened OR-only GPR with issues for {rxn.id}: '{new_gpr}' -> '{flattened_gpr}'")
877 new_gpr = flattened_gpr
878
805 simplified_gpr = _simplify_boolean_expression(new_gpr) 879 simplified_gpr = _simplify_boolean_expression(new_gpr)
806 if simplified_gpr != new_gpr: 880 if simplified_gpr != new_gpr:
807 stats['simplified_gprs'] += 1 881 stats['simplified_gprs'] += 1
808 logger.debug(f"Simplified GPR for {rxn.id}: '{new_gpr}' -> '{simplified_gpr}'") 882 logger.debug(f"Simplified GPR for {rxn.id}: '{new_gpr}' -> '{simplified_gpr}'")
809 rxn.gene_reaction_rule = simplified_gpr 883 rxn.gene_reaction_rule = simplified_gpr
983 logger: logging.Logger): 1057 logger: logging.Logger):
984 logger.info("=== TRANSLATION STATISTICS ===") 1058 logger.info("=== TRANSLATION STATISTICS ===")
985 logger.info(f"Translated: {stats.get('translated', 0)} (1:1 = {stats.get('one_to_one', 0)}, 1:many = {stats.get('one_to_many', 0)})") 1059 logger.info(f"Translated: {stats.get('translated', 0)} (1:1 = {stats.get('one_to_one', 0)}, 1:many = {stats.get('one_to_many', 0)})")
986 logger.info(f"Not found tokens: {stats.get('not_found', 0)}") 1060 logger.info(f"Not found tokens: {stats.get('not_found', 0)}")
987 logger.info(f"Simplified GPRs: {stats.get('simplified_gprs', 0)}") 1061 logger.info(f"Simplified GPRs: {stats.get('simplified_gprs', 0)}")
1062 logger.info(f"Flattened OR-only GPRs with issues: {stats.get('flattened_or_gprs', 0)}")
988 1063
989 final_ids = {g.id for g in final_genes} 1064 final_ids = {g.id for g in final_genes}
990 logger.info(f"Genes in model: {len(original_genes)} -> {len(final_ids)}") 1065 logger.info(f"Genes in model: {len(original_genes)} -> {len(final_ids)}")
991 1066
992 if unmapped_genes: 1067 if unmapped_genes:
993 logger.warning(f"Unmapped tokens ({len(unmapped_genes)}): {', '.join(unmapped_genes[:20])}{(' ...' if len(unmapped_genes)>20 else '')}") 1068 logger.warning(f"Unmapped tokens ({len(unmapped_genes)}): {', '.join(unmapped_genes[:20])}{(' ...' if len(unmapped_genes)>20 else '')}")
994 if multi_mapping_genes: 1069 if multi_mapping_genes:
995 logger.info(f"Multi-mapping examples ({len(multi_mapping_genes)}):") 1070 logger.info(f"Multi-mapping examples ({len(multi_mapping_genes)}):")
996 for orig, targets in multi_mapping_genes[:10]: 1071 for orig, targets in multi_mapping_genes[:10]:
997 logger.info(f" {orig} -> {', '.join(targets)}") 1072 logger.info(f" {orig} -> {', '.join(targets)}")
998 1073
999 1074 # Log summary of flattened GPRs if any
1075 if stats.get('flattened_or_gprs', 0) > 0:
1076 logger.info(f"Flattened {stats['flattened_or_gprs']} OR-only GPRs that had translation issues (removed parentheses, created unique gene sets)")
1077
1078