Mercurial > repos > bimib > cobraxy
comparison COBRAxy/utils/model_utils.py @ 493:a92d21f92956 draft
Uploaded
author | francesco_lapi |
---|---|
date | Tue, 30 Sep 2025 15:00:21 +0000 |
parents | 4ed95023af20 |
children | a2f7a6dd9d0b |
comparison
equal
deleted
inserted
replaced
492:4ed95023af20 | 493:a92d21f92956 |
---|---|
364 Robust pattern: tokens ending with _<compartment> (e.g., _c, _m, _e), | 364 Robust pattern: tokens ending with _<compartment> (e.g., _c, _m, _e), |
365 allowing leading digits/underscores. | 365 allowing leading digits/underscores. |
366 """ | 366 """ |
367 metabolites = set() | 367 metabolites = set() |
368 # optional coefficient followed by a token ending with _<letters> | 368 # optional coefficient followed by a token ending with _<letters> |
369 pattern = r'(?:\d+(?:\.\d+)?\s+)?([A-Za-z0-9_]+_[a-z]+)' | 369 if reaction_formula[-1] == ']' and reaction_formula[-3] == '[': |
370 pattern = r'(?:\d+(?:\.\d+)?\s+)?([A-Za-z0-9_]+[[A-Za-z0-9]]+)' | |
371 else: | |
372 pattern = r'(?:\d+(?:\.\d+)?\s+)?([A-Za-z0-9_]+_[A-Za-z0-9]+)' | |
370 matches = re.findall(pattern, reaction_formula) | 373 matches = re.findall(pattern, reaction_formula) |
371 metabolites.update(matches) | 374 metabolites.update(matches) |
372 return metabolites | 375 return metabolites |
373 | 376 |
374 | 377 |
375 def extract_compartment_from_metabolite(metabolite_id: str) -> str: | 378 def extract_compartment_from_metabolite(metabolite_id: str) -> str: |
376 """Extract the compartment from a metabolite ID.""" | 379 """Extract the compartment from a metabolite ID.""" |
377 if '_' in metabolite_id: | 380 if '_' in metabolite_id: |
378 return metabolite_id.split('_')[-1] | 381 return metabolite_id.split('_')[-1] |
382 if metabolite_id[-1] == ']' and metabolite_id[-3] == '[': | |
383 return metabolite_id[-2] | |
379 return 'c' # default cytoplasm | 384 return 'c' # default cytoplasm |
380 | 385 |
381 | 386 |
382 def parse_reaction_formula(reaction: Reaction, formula: str, metabolites_dict: Dict[str, Metabolite]): | 387 def parse_reaction_formula(reaction: Reaction, formula: str, metabolites_dict: Dict[str, Metabolite]): |
383 """Parse a reaction formula and set metabolites with their coefficients.""" | 388 """Parse a reaction formula and set metabolites with their coefficients.""" |
595 g = str(g).strip() | 600 g = str(g).strip() |
596 # remove common prefixes | 601 # remove common prefixes |
597 g = re.sub(r'^(HGNC:)', '', g, flags=re.IGNORECASE) | 602 g = re.sub(r'^(HGNC:)', '', g, flags=re.IGNORECASE) |
598 g = re.sub(r'^(ENSG:)', '', g, flags=re.IGNORECASE) | 603 g = re.sub(r'^(ENSG:)', '', g, flags=re.IGNORECASE) |
599 return g | 604 return g |
605 | |
606 def _is_or_only_expression(expr: str) -> bool: | |
607 """ | |
608 Check if a GPR expression contains only OR operators (no AND operators). | |
609 | |
610 Args: | |
611 expr: GPR expression string | |
612 | |
613 Returns: | |
614 bool: True if expression contains only OR (and parentheses) and has multiple genes, False otherwise | |
615 """ | |
616 if not expr or not expr.strip(): | |
617 return False | |
618 | |
619 # Normalize the expression | |
620 normalized = expr.replace(' AND ', ' and ').replace(' OR ', ' or ') | |
621 | |
622 # Check if it contains any AND operators | |
623 has_and = ' and ' in normalized.lower() | |
624 | |
625 # Check if it contains OR operators | |
626 has_or = ' or ' in normalized.lower() | |
627 | |
628 # Must have OR operators and no AND operators | |
629 return has_or and not has_and | |
630 | |
631 | |
632 def _flatten_or_only_gpr(expr: str) -> str: | |
633 """ | |
634 Flatten a GPR expression that contains only OR operators by: | |
635 1. Removing all parentheses | |
636 2. Extracting unique gene names | |
637 3. Joining them with ' or ' | |
638 | |
639 Args: | |
640 expr: GPR expression string with only OR operators | |
641 | |
642 Returns: | |
643 str: Flattened GPR expression | |
644 """ | |
645 if not expr or not expr.strip(): | |
646 return expr | |
647 | |
648 # Extract all gene tokens (exclude logical operators and parentheses) | |
649 gene_pattern = r'\b[A-Za-z0-9:_.-]+\b' | |
650 logical = {'and', 'or', 'AND', 'OR', '(', ')'} | |
651 | |
652 tokens = re.findall(gene_pattern, expr) | |
653 genes = [t for t in tokens if t not in logical] | |
654 | |
655 # Create set to remove duplicates, then convert back to list to maintain some order | |
656 unique_genes = list(dict.fromkeys(genes)) # Preserves insertion order | |
657 | |
658 if len(unique_genes) == 0: | |
659 return expr | |
660 elif len(unique_genes) == 1: | |
661 return unique_genes[0] | |
662 else: | |
663 return ' or '.join(unique_genes) | |
664 | |
600 | 665 |
601 def _simplify_boolean_expression(expr: str) -> str: | 666 def _simplify_boolean_expression(expr: str) -> str: |
602 """ | 667 """ |
603 Simplify a boolean expression by removing duplicates while strictly preserving semantics. | 668 Simplify a boolean expression by removing duplicates while strictly preserving semantics. |
604 This function handles simple duplicates within parentheses while being conservative about | 669 This function handles simple duplicates within parentheses while being conservative about |
781 | 846 |
782 # copy model | 847 # copy model |
783 model_copy = model.copy() | 848 model_copy = model.copy() |
784 | 849 |
785 # statistics | 850 # statistics |
786 stats = {'translated': 0, 'one_to_one': 0, 'one_to_many': 0, 'not_found': 0, 'simplified_gprs': 0} | 851 stats = {'translated': 0, 'one_to_one': 0, 'one_to_many': 0, 'not_found': 0, 'simplified_gprs': 0, 'flattened_or_gprs': 0} |
787 unmapped = [] | 852 unmapped = [] |
788 multi = [] | 853 multi = [] |
789 | 854 |
790 # Dictionary to store translation issues per reaction | 855 # Dictionary to store translation issues per reaction |
791 reaction_translation_issues = {} | 856 reaction_translation_issues = {} |
800 new_gpr, rxn_issues = _translate_gpr(gpr, gene_mapping, stats, unmapped, multi, logger, track_issues=True) | 865 new_gpr, rxn_issues = _translate_gpr(gpr, gene_mapping, stats, unmapped, multi, logger, track_issues=True) |
801 if rxn_issues: | 866 if rxn_issues: |
802 reaction_translation_issues[rxn.id] = rxn_issues | 867 reaction_translation_issues[rxn.id] = rxn_issues |
803 | 868 |
804 if new_gpr != gpr: | 869 if new_gpr != gpr: |
870 # Check if this GPR has translation issues and contains only OR operators | |
871 if rxn_issues and _is_or_only_expression(new_gpr): | |
872 # Flatten the GPR: remove parentheses and create set of unique genes | |
873 flattened_gpr = _flatten_or_only_gpr(new_gpr) | |
874 if flattened_gpr != new_gpr: | |
875 stats['flattened_or_gprs'] += 1 | |
876 logger.debug(f"Flattened OR-only GPR with issues for {rxn.id}: '{new_gpr}' -> '{flattened_gpr}'") | |
877 new_gpr = flattened_gpr | |
878 | |
805 simplified_gpr = _simplify_boolean_expression(new_gpr) | 879 simplified_gpr = _simplify_boolean_expression(new_gpr) |
806 if simplified_gpr != new_gpr: | 880 if simplified_gpr != new_gpr: |
807 stats['simplified_gprs'] += 1 | 881 stats['simplified_gprs'] += 1 |
808 logger.debug(f"Simplified GPR for {rxn.id}: '{new_gpr}' -> '{simplified_gpr}'") | 882 logger.debug(f"Simplified GPR for {rxn.id}: '{new_gpr}' -> '{simplified_gpr}'") |
809 rxn.gene_reaction_rule = simplified_gpr | 883 rxn.gene_reaction_rule = simplified_gpr |
983 logger: logging.Logger): | 1057 logger: logging.Logger): |
984 logger.info("=== TRANSLATION STATISTICS ===") | 1058 logger.info("=== TRANSLATION STATISTICS ===") |
985 logger.info(f"Translated: {stats.get('translated', 0)} (1:1 = {stats.get('one_to_one', 0)}, 1:many = {stats.get('one_to_many', 0)})") | 1059 logger.info(f"Translated: {stats.get('translated', 0)} (1:1 = {stats.get('one_to_one', 0)}, 1:many = {stats.get('one_to_many', 0)})") |
986 logger.info(f"Not found tokens: {stats.get('not_found', 0)}") | 1060 logger.info(f"Not found tokens: {stats.get('not_found', 0)}") |
987 logger.info(f"Simplified GPRs: {stats.get('simplified_gprs', 0)}") | 1061 logger.info(f"Simplified GPRs: {stats.get('simplified_gprs', 0)}") |
1062 logger.info(f"Flattened OR-only GPRs with issues: {stats.get('flattened_or_gprs', 0)}") | |
988 | 1063 |
989 final_ids = {g.id for g in final_genes} | 1064 final_ids = {g.id for g in final_genes} |
990 logger.info(f"Genes in model: {len(original_genes)} -> {len(final_ids)}") | 1065 logger.info(f"Genes in model: {len(original_genes)} -> {len(final_ids)}") |
991 | 1066 |
992 if unmapped_genes: | 1067 if unmapped_genes: |
993 logger.warning(f"Unmapped tokens ({len(unmapped_genes)}): {', '.join(unmapped_genes[:20])}{(' ...' if len(unmapped_genes)>20 else '')}") | 1068 logger.warning(f"Unmapped tokens ({len(unmapped_genes)}): {', '.join(unmapped_genes[:20])}{(' ...' if len(unmapped_genes)>20 else '')}") |
994 if multi_mapping_genes: | 1069 if multi_mapping_genes: |
995 logger.info(f"Multi-mapping examples ({len(multi_mapping_genes)}):") | 1070 logger.info(f"Multi-mapping examples ({len(multi_mapping_genes)}):") |
996 for orig, targets in multi_mapping_genes[:10]: | 1071 for orig, targets in multi_mapping_genes[:10]: |
997 logger.info(f" {orig} -> {', '.join(targets)}") | 1072 logger.info(f" {orig} -> {', '.join(targets)}") |
998 | 1073 |
999 | 1074 # Log summary of flattened GPRs if any |
1075 if stats.get('flattened_or_gprs', 0) > 0: | |
1076 logger.info(f"Flattened {stats['flattened_or_gprs']} OR-only GPRs that had translation issues (removed parentheses, created unique gene sets)") | |
1077 | |
1078 |