Mercurial > repos > bimib > cobraxy
comparison COBRAxy/utils/model_utils.py @ 493:a92d21f92956 draft
Uploaded
| author | francesco_lapi |
|---|---|
| date | Tue, 30 Sep 2025 15:00:21 +0000 |
| parents | 4ed95023af20 |
| children | a2f7a6dd9d0b |
comparison
equal
deleted
inserted
replaced
| 492:4ed95023af20 | 493:a92d21f92956 |
|---|---|
| 364 Robust pattern: tokens ending with _<compartment> (e.g., _c, _m, _e), | 364 Robust pattern: tokens ending with _<compartment> (e.g., _c, _m, _e), |
| 365 allowing leading digits/underscores. | 365 allowing leading digits/underscores. |
| 366 """ | 366 """ |
| 367 metabolites = set() | 367 metabolites = set() |
| 368 # optional coefficient followed by a token ending with _<letters> | 368 # optional coefficient followed by a token ending with _<letters> |
| 369 pattern = r'(?:\d+(?:\.\d+)?\s+)?([A-Za-z0-9_]+_[a-z]+)' | 369 if reaction_formula[-1] == ']' and reaction_formula[-3] == '[': |
| 370 pattern = r'(?:\d+(?:\.\d+)?\s+)?([A-Za-z0-9_]+[[A-Za-z0-9]]+)' | |
| 371 else: | |
| 372 pattern = r'(?:\d+(?:\.\d+)?\s+)?([A-Za-z0-9_]+_[A-Za-z0-9]+)' | |
| 370 matches = re.findall(pattern, reaction_formula) | 373 matches = re.findall(pattern, reaction_formula) |
| 371 metabolites.update(matches) | 374 metabolites.update(matches) |
| 372 return metabolites | 375 return metabolites |
| 373 | 376 |
| 374 | 377 |
| 375 def extract_compartment_from_metabolite(metabolite_id: str) -> str: | 378 def extract_compartment_from_metabolite(metabolite_id: str) -> str: |
| 376 """Extract the compartment from a metabolite ID.""" | 379 """Extract the compartment from a metabolite ID.""" |
| 377 if '_' in metabolite_id: | 380 if '_' in metabolite_id: |
| 378 return metabolite_id.split('_')[-1] | 381 return metabolite_id.split('_')[-1] |
| 382 if metabolite_id[-1] == ']' and metabolite_id[-3] == '[': | |
| 383 return metabolite_id[-2] | |
| 379 return 'c' # default cytoplasm | 384 return 'c' # default cytoplasm |
| 380 | 385 |
| 381 | 386 |
| 382 def parse_reaction_formula(reaction: Reaction, formula: str, metabolites_dict: Dict[str, Metabolite]): | 387 def parse_reaction_formula(reaction: Reaction, formula: str, metabolites_dict: Dict[str, Metabolite]): |
| 383 """Parse a reaction formula and set metabolites with their coefficients.""" | 388 """Parse a reaction formula and set metabolites with their coefficients.""" |
| 595 g = str(g).strip() | 600 g = str(g).strip() |
| 596 # remove common prefixes | 601 # remove common prefixes |
| 597 g = re.sub(r'^(HGNC:)', '', g, flags=re.IGNORECASE) | 602 g = re.sub(r'^(HGNC:)', '', g, flags=re.IGNORECASE) |
| 598 g = re.sub(r'^(ENSG:)', '', g, flags=re.IGNORECASE) | 603 g = re.sub(r'^(ENSG:)', '', g, flags=re.IGNORECASE) |
| 599 return g | 604 return g |
| 605 | |
| 606 def _is_or_only_expression(expr: str) -> bool: | |
| 607 """ | |
| 608 Check if a GPR expression contains only OR operators (no AND operators). | |
| 609 | |
| 610 Args: | |
| 611 expr: GPR expression string | |
| 612 | |
| 613 Returns: | |
| 614 bool: True if expression contains only OR (and parentheses) and has multiple genes, False otherwise | |
| 615 """ | |
| 616 if not expr or not expr.strip(): | |
| 617 return False | |
| 618 | |
| 619 # Normalize the expression | |
| 620 normalized = expr.replace(' AND ', ' and ').replace(' OR ', ' or ') | |
| 621 | |
| 622 # Check if it contains any AND operators | |
| 623 has_and = ' and ' in normalized.lower() | |
| 624 | |
| 625 # Check if it contains OR operators | |
| 626 has_or = ' or ' in normalized.lower() | |
| 627 | |
| 628 # Must have OR operators and no AND operators | |
| 629 return has_or and not has_and | |
| 630 | |
| 631 | |
| 632 def _flatten_or_only_gpr(expr: str) -> str: | |
| 633 """ | |
| 634 Flatten a GPR expression that contains only OR operators by: | |
| 635 1. Removing all parentheses | |
| 636 2. Extracting unique gene names | |
| 637 3. Joining them with ' or ' | |
| 638 | |
| 639 Args: | |
| 640 expr: GPR expression string with only OR operators | |
| 641 | |
| 642 Returns: | |
| 643 str: Flattened GPR expression | |
| 644 """ | |
| 645 if not expr or not expr.strip(): | |
| 646 return expr | |
| 647 | |
| 648 # Extract all gene tokens (exclude logical operators and parentheses) | |
| 649 gene_pattern = r'\b[A-Za-z0-9:_.-]+\b' | |
| 650 logical = {'and', 'or', 'AND', 'OR', '(', ')'} | |
| 651 | |
| 652 tokens = re.findall(gene_pattern, expr) | |
| 653 genes = [t for t in tokens if t not in logical] | |
| 654 | |
| 655 # Create set to remove duplicates, then convert back to list to maintain some order | |
| 656 unique_genes = list(dict.fromkeys(genes)) # Preserves insertion order | |
| 657 | |
| 658 if len(unique_genes) == 0: | |
| 659 return expr | |
| 660 elif len(unique_genes) == 1: | |
| 661 return unique_genes[0] | |
| 662 else: | |
| 663 return ' or '.join(unique_genes) | |
| 664 | |
| 600 | 665 |
| 601 def _simplify_boolean_expression(expr: str) -> str: | 666 def _simplify_boolean_expression(expr: str) -> str: |
| 602 """ | 667 """ |
| 603 Simplify a boolean expression by removing duplicates while strictly preserving semantics. | 668 Simplify a boolean expression by removing duplicates while strictly preserving semantics. |
| 604 This function handles simple duplicates within parentheses while being conservative about | 669 This function handles simple duplicates within parentheses while being conservative about |
| 781 | 846 |
| 782 # copy model | 847 # copy model |
| 783 model_copy = model.copy() | 848 model_copy = model.copy() |
| 784 | 849 |
| 785 # statistics | 850 # statistics |
| 786 stats = {'translated': 0, 'one_to_one': 0, 'one_to_many': 0, 'not_found': 0, 'simplified_gprs': 0} | 851 stats = {'translated': 0, 'one_to_one': 0, 'one_to_many': 0, 'not_found': 0, 'simplified_gprs': 0, 'flattened_or_gprs': 0} |
| 787 unmapped = [] | 852 unmapped = [] |
| 788 multi = [] | 853 multi = [] |
| 789 | 854 |
| 790 # Dictionary to store translation issues per reaction | 855 # Dictionary to store translation issues per reaction |
| 791 reaction_translation_issues = {} | 856 reaction_translation_issues = {} |
| 800 new_gpr, rxn_issues = _translate_gpr(gpr, gene_mapping, stats, unmapped, multi, logger, track_issues=True) | 865 new_gpr, rxn_issues = _translate_gpr(gpr, gene_mapping, stats, unmapped, multi, logger, track_issues=True) |
| 801 if rxn_issues: | 866 if rxn_issues: |
| 802 reaction_translation_issues[rxn.id] = rxn_issues | 867 reaction_translation_issues[rxn.id] = rxn_issues |
| 803 | 868 |
| 804 if new_gpr != gpr: | 869 if new_gpr != gpr: |
| 870 # Check if this GPR has translation issues and contains only OR operators | |
| 871 if rxn_issues and _is_or_only_expression(new_gpr): | |
| 872 # Flatten the GPR: remove parentheses and create set of unique genes | |
| 873 flattened_gpr = _flatten_or_only_gpr(new_gpr) | |
| 874 if flattened_gpr != new_gpr: | |
| 875 stats['flattened_or_gprs'] += 1 | |
| 876 logger.debug(f"Flattened OR-only GPR with issues for {rxn.id}: '{new_gpr}' -> '{flattened_gpr}'") | |
| 877 new_gpr = flattened_gpr | |
| 878 | |
| 805 simplified_gpr = _simplify_boolean_expression(new_gpr) | 879 simplified_gpr = _simplify_boolean_expression(new_gpr) |
| 806 if simplified_gpr != new_gpr: | 880 if simplified_gpr != new_gpr: |
| 807 stats['simplified_gprs'] += 1 | 881 stats['simplified_gprs'] += 1 |
| 808 logger.debug(f"Simplified GPR for {rxn.id}: '{new_gpr}' -> '{simplified_gpr}'") | 882 logger.debug(f"Simplified GPR for {rxn.id}: '{new_gpr}' -> '{simplified_gpr}'") |
| 809 rxn.gene_reaction_rule = simplified_gpr | 883 rxn.gene_reaction_rule = simplified_gpr |
| 983 logger: logging.Logger): | 1057 logger: logging.Logger): |
| 984 logger.info("=== TRANSLATION STATISTICS ===") | 1058 logger.info("=== TRANSLATION STATISTICS ===") |
| 985 logger.info(f"Translated: {stats.get('translated', 0)} (1:1 = {stats.get('one_to_one', 0)}, 1:many = {stats.get('one_to_many', 0)})") | 1059 logger.info(f"Translated: {stats.get('translated', 0)} (1:1 = {stats.get('one_to_one', 0)}, 1:many = {stats.get('one_to_many', 0)})") |
| 986 logger.info(f"Not found tokens: {stats.get('not_found', 0)}") | 1060 logger.info(f"Not found tokens: {stats.get('not_found', 0)}") |
| 987 logger.info(f"Simplified GPRs: {stats.get('simplified_gprs', 0)}") | 1061 logger.info(f"Simplified GPRs: {stats.get('simplified_gprs', 0)}") |
| 1062 logger.info(f"Flattened OR-only GPRs with issues: {stats.get('flattened_or_gprs', 0)}") | |
| 988 | 1063 |
| 989 final_ids = {g.id for g in final_genes} | 1064 final_ids = {g.id for g in final_genes} |
| 990 logger.info(f"Genes in model: {len(original_genes)} -> {len(final_ids)}") | 1065 logger.info(f"Genes in model: {len(original_genes)} -> {len(final_ids)}") |
| 991 | 1066 |
| 992 if unmapped_genes: | 1067 if unmapped_genes: |
| 993 logger.warning(f"Unmapped tokens ({len(unmapped_genes)}): {', '.join(unmapped_genes[:20])}{(' ...' if len(unmapped_genes)>20 else '')}") | 1068 logger.warning(f"Unmapped tokens ({len(unmapped_genes)}): {', '.join(unmapped_genes[:20])}{(' ...' if len(unmapped_genes)>20 else '')}") |
| 994 if multi_mapping_genes: | 1069 if multi_mapping_genes: |
| 995 logger.info(f"Multi-mapping examples ({len(multi_mapping_genes)}):") | 1070 logger.info(f"Multi-mapping examples ({len(multi_mapping_genes)}):") |
| 996 for orig, targets in multi_mapping_genes[:10]: | 1071 for orig, targets in multi_mapping_genes[:10]: |
| 997 logger.info(f" {orig} -> {', '.join(targets)}") | 1072 logger.info(f" {orig} -> {', '.join(targets)}") |
| 998 | 1073 |
| 999 | 1074 # Log summary of flattened GPRs if any |
| 1075 if stats.get('flattened_or_gprs', 0) > 0: | |
| 1076 logger.info(f"Flattened {stats['flattened_or_gprs']} OR-only GPRs that had translation issues (removed parentheses, created unique gene sets)") | |
| 1077 | |
| 1078 |
