Mercurial > repos > bimib > cobraxy
diff COBRAxy/metabolic_model_setting.py @ 490:c6ea189ea7e9 draft
Uploaded
author | francesco_lapi |
---|---|
date | Mon, 29 Sep 2025 15:13:21 +0000 |
parents | 5b625d91bc7f |
children |
line wrap: on
line diff
# Unified diff for COBRAxy/metabolic_model_setting.py (changeset 490:c6ea189ea7e9).
# Lines before the first '---' header are commentary and are ignored by patch tools.
# NOTE(review): line breaks and indentation were restored from a collapsed rendering;
# the indentation of context lines and the position of blank context lines are
# best-effort reconstructions -- TODO confirm against the repository before applying.
# Adjustments relative to the rendered changeset (hunk line counts are unchanged):
#   * sample_valid_gene_ids checks `limit` before appending, so limit <= 0 returns [],
#     and its docstring says "Return a list" because the function returns a list.
#   * the comment on model.genes[1:5] now matches the slice (indices 1-4).
--- a/COBRAxy/metabolic_model_setting.py	Mon Sep 29 10:33:26 2025 +0000
+++ b/COBRAxy/metabolic_model_setting.py	Mon Sep 29 15:13:21 2025 +0000
@@ -16,6 +16,8 @@
 from typing import Optional, Tuple, List
 import utils.model_utils as modelUtils
 import logging
+from pathlib import Path
+
 
 ARGS : argparse.Namespace
 def process_args(args: List[str] = None) -> argparse.Namespace:
@@ -147,6 +149,24 @@
         df.to_csv(path, sep="\t", index=False)
     except Exception as e:
         raise utils.DataErr(path, f"failed writing tabular output: {e}")
+
+def is_placeholder(gid) -> bool:
+    """Return True if the gene id looks like a placeholder (e.g., 0/NA/NAN/empty)."""
+    if gid is None:
+        return True
+    s = str(gid).strip().lower()
+    return s in {"0", "", "na", "nan"}  # lowercase for simple matching
+
+def sample_valid_gene_ids(genes, limit=10):
+    """Return a list of up to `limit` valid gene IDs (as str), skipping placeholders (e.g., the first 0 in RECON)."""
+    out = []
+    for g in genes:
+        if len(out) >= limit:  # checked before appending so limit <= 0 returns []
+            break
+        gid = getattr(g, "id", getattr(g, "gene_id", g))
+        if not is_placeholder(gid):
+            out.append(str(gid))
+    return out
 
 ###############################- ENTRY POINT -################################
 
@@ -200,11 +220,14 @@
             if value is not None:
                 model.reactions.get_by_id(reaction).lower_bound = -float(value)
 
+    # Initialize translation_issues dictionary
+    translation_issues = {}
+
     if (ARGS.name == "Recon" or ARGS.name == "ENGRO2") and ARGS.gene_format != "Default":
         logging.basicConfig(level=logging.INFO)
         logger = logging.getLogger(__name__)
 
-        model = modelUtils.translate_model_genes(
+        model, translation_issues = modelUtils.translate_model_genes(
             model=model,
             mapping_df= pd.read_csv(ARGS.tool_dir + "/local/mappings/genes_human.csv", dtype={'entrez_id': str}),
             target_nomenclature=ARGS.gene_format,
@@ -212,6 +235,80 @@
             logger=logger
         )
 
+    if ARGS.name == "Custom_model" and ARGS.gene_format != "Default":
+        logging.basicConfig(level=logging.INFO)
+        logger = logging.getLogger(__name__)
+
+        tmp_check = []
+        for g in model.genes[1:5]:  # samples the four genes at indices 1-4 (index 0 is skipped)
+            tmp_check.append(modelUtils.gene_type(g.id, "Custom_model"))
+
+        if len(set(tmp_check)) > 1:
+            raise utils.DataErr("Custom_model", "The custom model contains genes with mixed or unrecognized nomenclature. Please ensure all genes use the same recognized nomenclature before applying gene_format conversion.")
+        else:
+            source_nomenclature = tmp_check[0]
+
+        if source_nomenclature != ARGS.gene_format:
+            model, translation_issues = modelUtils.translate_model_genes(
+                model=model,
+                mapping_df= pd.read_csv(ARGS.tool_dir + "/local/mappings/genes_human.csv", dtype={'entrez_id': str}),
+                target_nomenclature=ARGS.gene_format,
+                source_nomenclature=source_nomenclature,
+                logger=logger
+            )
+
+
+
+
+    if ARGS.name == "Custom_model" and ARGS.gene_format != "Default":
+        logger = logging.getLogger(__name__)
+
+        # Take a small, clean sample of gene IDs (skipping placeholders like 0)
+        ids_sample = sample_valid_gene_ids(model.genes, limit=10)
+        if not ids_sample:
+            raise utils.DataErr(
+                "Custom_model",
+                "No valid gene IDs found (many may be placeholders like 0)."
+            )
+
+        # Detect source nomenclature on the sample
+        types = []
+        for gid in ids_sample:
+            try:
+                t = modelUtils.gene_type(gid, "Custom_model")
+            except Exception as e:
+                # Keep it simple: skip problematic IDs
+                logger.debug(f"gene_type failed for {gid}: {e}")
+                t = None
+            if t:
+                types.append(t)
+
+        if not types:
+            raise utils.DataErr(
+                "Custom_model",
+                "Could not detect a known gene nomenclature from the sample."
+            )
+
+        unique_types = set(types)
+        if len(unique_types) > 1:
+            raise utils.DataErr(
+                "Custom_model",
+                "Mixed or inconsistent gene nomenclatures detected. "
+                "Please unify them before converting."
+            )
+
+        source_nomenclature = types[0]
+
+        # Convert only if needed
+        if source_nomenclature != ARGS.gene_format:
+            model, translation_issues = modelUtils.translate_model_genes(
+                model=model,
+                mapping_df= pd.read_csv(ARGS.tool_dir + "/local/mappings/genes_human.csv", dtype={'entrez_id': str}),
+                target_nomenclature=ARGS.gene_format,
+                source_nomenclature=source_nomenclature,
+                logger=logger
+            )
+
     # generate data
     rules = modelUtils.generate_rules(model, asParsed = False)
     reactions = modelUtils.generate_reactions(model, asParsed = False)
@@ -225,6 +322,12 @@
     df_rules = pd.DataFrame(list(rules.items()), columns = ["ReactionID", "GPR"])
     df_reactions = pd.DataFrame(list(reactions.items()), columns = ["ReactionID", "Formula"])
 
+    # Create DataFrame for translation issues
+    df_translation_issues = pd.DataFrame([
+        {"ReactionID": rxn_id, "TranslationIssues": issues}
+        for rxn_id, issues in translation_issues.items()
+    ])
+
     df_bounds = bounds.reset_index().rename(columns = {"index": "ReactionID"})
     df_medium = medium.rename(columns = {"reaction": "ReactionID"})
     df_medium["InMedium"] = True
@@ -235,6 +338,15 @@
     if ARGS.name == "ENGRO2":
         merged = merged.merge(compartments, on = "ReactionID", how = "outer")
     merged = merged.merge(df_medium, on = "ReactionID", how = "left")
+
+    # Add translation issues column
+    if not df_translation_issues.empty:
+        merged = merged.merge(df_translation_issues, on = "ReactionID", how = "left")
+        merged["TranslationIssues"] = merged["TranslationIssues"].fillna("")
+    else:
+        # Add empty TranslationIssues column if no issues found
+        #merged["TranslationIssues"] = ""
+        pass
 
     merged["InMedium"] = merged["InMedium"].fillna(False)
 