diff COBRAxy/metabolic_model_setting.py @ 490:c6ea189ea7e9 draft

Uploaded
author francesco_lapi
date Mon, 29 Sep 2025 15:13:21 +0000
parents 5b625d91bc7f
children
line wrap: on
line diff
--- a/COBRAxy/metabolic_model_setting.py	Mon Sep 29 10:33:26 2025 +0000
+++ b/COBRAxy/metabolic_model_setting.py	Mon Sep 29 15:13:21 2025 +0000
@@ -16,6 +16,8 @@
 from typing import Optional, Tuple, List
 import utils.model_utils as modelUtils
 import logging
+from pathlib import Path
+
 
 ARGS : argparse.Namespace
 def process_args(args: List[str] = None) -> argparse.Namespace:
@@ -147,6 +149,24 @@
         df.to_csv(path, sep="\t", index=False)
     except Exception as e:
         raise utils.DataErr(path, f"failed writing tabular output: {e}")
+    
+def is_placeholder(gid) -> bool:
+    """Return True if the gene id is a placeholder: None, or (ignoring case and surrounding whitespace) 0, NA, NAN or empty."""
+    if gid is None:
+        return True
+    s = str(gid).strip().lower()  # normalize any input type to a trimmed, lowercase string
+    return s in {"0", "", "na", "nan"}  # lowercase for simple matching
+
+def sample_valid_gene_ids(genes, limit=10):
+    """Return a list of up to `limit` valid gene IDs (as strings), skipping placeholders (e.g., the first 0 in RECON)."""
+    out = []
+    for g in genes:
+        gid = getattr(g, "id", getattr(g, "gene_id", g))  # prefer .id, then .gene_id, else use the item itself
+        if not is_placeholder(gid):
+            out.append(str(gid))
+            if len(out) >= limit:  # NOTE(review): checked after the append, so limit <= 0 still returns one ID
+                break
+    return out
 
 
 ###############################- ENTRY POINT -################################
@@ -200,11 +220,14 @@
             if value is not None:
                 model.reactions.get_by_id(reaction).lower_bound = -float(value)
 
+    # Initialize translation_issues dictionary
+    translation_issues = {}
+    
     if (ARGS.name == "Recon" or ARGS.name == "ENGRO2") and ARGS.gene_format != "Default":
         logging.basicConfig(level=logging.INFO)
         logger = logging.getLogger(__name__)
 
-        model = modelUtils.translate_model_genes(
+        model, translation_issues = modelUtils.translate_model_genes(
             model=model,
             mapping_df= pd.read_csv(ARGS.tool_dir + "/local/mappings/genes_human.csv", dtype={'entrez_id': str}),
             target_nomenclature=ARGS.gene_format,
@@ -212,6 +235,80 @@
             logger=logger
         )
 
+    if ARGS.name == "Custom_model" and ARGS.gene_format != "Default":
+        logging.basicConfig(level=logging.INFO)
+        logger = logging.getLogger(__name__)
+
+        tmp_check = []
+        for g in model.genes[1:5]:  # check only the genes at indices 1-4 (index 0 is skipped)
+            tmp_check.append(modelUtils.gene_type(g.id, "Custom_model"))
+        
+        if len(set(tmp_check)) > 1:
+            raise utils.DataErr("Custom_model", "The custom model contains genes with mixed or unrecognized nomenclature. Please ensure all genes use the same recognized nomenclature before applying gene_format conversion.")
+        else:
+            source_nomenclature = tmp_check[0]
+
+        if source_nomenclature != ARGS.gene_format:
+            model, translation_issues = modelUtils.translate_model_genes(
+                model=model,
+                mapping_df= pd.read_csv(ARGS.tool_dir + "/local/mappings/genes_human.csv", dtype={'entrez_id': str}),
+                target_nomenclature=ARGS.gene_format,
+                source_nomenclature=source_nomenclature,
+                logger=logger
+            )
+
+
+
+
+    if ARGS.name == "Custom_model" and ARGS.gene_format != "Default":
+        logger = logging.getLogger(__name__)
+
+        # Take a small, clean sample of gene IDs (skipping placeholders like 0)
+        ids_sample = sample_valid_gene_ids(model.genes, limit=10)
+        if not ids_sample:
+            raise utils.DataErr(
+                "Custom_model",
+                "No valid gene IDs found (many may be placeholders like 0)."
+            )
+
+        # Detect source nomenclature on the sample
+        types = []
+        for gid in ids_sample:
+            try:
+                t = modelUtils.gene_type(gid, "Custom_model")
+            except Exception as e:
+                # Keep it simple: skip problematic IDs
+                logger.debug(f"gene_type failed for {gid}: {e}")
+                t = None
+            if t:
+                types.append(t)
+
+        if not types:
+            raise utils.DataErr(
+                "Custom_model",
+                "Could not detect a known gene nomenclature from the sample."
+            )
+
+        unique_types = set(types)
+        if len(unique_types) > 1:
+            raise utils.DataErr(
+                "Custom_model",
+                "Mixed or inconsistent gene nomenclatures detected. "
+                "Please unify them before converting."
+            )
+
+        source_nomenclature = types[0]
+
+        # Convert only if needed
+        if source_nomenclature != ARGS.gene_format:
+            model, translation_issues = modelUtils.translate_model_genes(
+                model=model,
+                mapping_df= pd.read_csv(ARGS.tool_dir + "/local/mappings/genes_human.csv", dtype={'entrez_id': str}),
+                target_nomenclature=ARGS.gene_format,
+                source_nomenclature=source_nomenclature,
+                logger=logger
+            )
+
     # generate data
     rules = modelUtils.generate_rules(model, asParsed = False)
     reactions = modelUtils.generate_reactions(model, asParsed = False)
@@ -225,6 +322,12 @@
     df_rules = pd.DataFrame(list(rules.items()), columns = ["ReactionID", "GPR"])
     df_reactions = pd.DataFrame(list(reactions.items()), columns = ["ReactionID", "Formula"])
 
+    # Create DataFrame for translation issues
+    df_translation_issues = pd.DataFrame([
+        {"ReactionID": rxn_id, "TranslationIssues": issues}
+        for rxn_id, issues in translation_issues.items()
+    ])
+    
     df_bounds = bounds.reset_index().rename(columns = {"index": "ReactionID"})
     df_medium = medium.rename(columns = {"reaction": "ReactionID"})
     df_medium["InMedium"] = True
@@ -235,6 +338,15 @@
     if ARGS.name == "ENGRO2": 
         merged = merged.merge(compartments, on = "ReactionID", how = "outer")
     merged = merged.merge(df_medium, on = "ReactionID", how = "left")
+    
+    # Add translation issues column
+    if not df_translation_issues.empty:
+        merged = merged.merge(df_translation_issues, on = "ReactionID", how = "left")
+        merged["TranslationIssues"] = merged["TranslationIssues"].fillna("")
+    else:
+        # Add empty TranslationIssues column if no issues found
+        #merged["TranslationIssues"] = ""
+        pass
 
     merged["InMedium"] = merged["InMedium"].fillna(False)