Mercurial > repos > bimib > cobraxy
diff COBRAxy/ras_generator_beta.py @ 456:a6e45049c1b9 draft default tip
Uploaded
author | francesco_lapi |
---|---|
date | Fri, 12 Sep 2025 17:28:45 +0000 |
parents | 4a385fdb9e58 |
children |
line wrap: on
line diff
--- a/COBRAxy/ras_generator_beta.py Fri Sep 12 15:05:54 2025 +0000 +++ b/COBRAxy/ras_generator_beta.py Fri Sep 12 17:28:45 2025 +0000 @@ -1,5 +1,10 @@ +""" +Generate Reaction Activity Scores (RAS) from a gene expression dataset and GPR rules. + +The script reads a tabular dataset (genes x samples) and a rules file (GPRs), +computes RAS per reaction for each sample/cell line, and writes a tabular output. +""" from __future__ import division -# galaxy complains this ^^^ needs to be at the very beginning of the file, for some reason. import sys import argparse import collections @@ -8,7 +13,6 @@ import utils.general_utils as utils import utils.rule_parsing as ruleUtils from typing import Union, Optional, List, Dict, Tuple, TypeVar -import os ERRORS = [] ########################## argparse ########################################## @@ -31,7 +35,7 @@ help = "path to input file containing the rules") parser.add_argument("-rn", "--model_upload_name", type = str, help = "custom rules name") - # ^ I need this because galaxy converts my files into .dat but I need to know what extension they were in + # Galaxy converts files into .dat, this helps infer the original extension when needed. parser.add_argument( '-n', '--none', @@ -49,7 +53,7 @@ help = "Output log") parser.add_argument( - '-in', '--input', #id รจ diventato in + '-in', '--input', type = str, help = 'input dataset') @@ -253,14 +257,14 @@ ############################ resolve ########################################## def replace_gene_value(l :str, d :str) -> Tuple[Union[int, float], list]: """ - Replace gene identifiers with corresponding values from a dictionary. + Replace gene identifiers in a parsed rule expression with values from a dict. Args: - l (str): String of gene identifier. - d (str): String corresponding to its value. + l: Parsed rule as a nested list structure (strings, lists, and operators). + d: Dict mapping gene IDs to numeric values. Returns: - tuple: A tuple containing two lists: the first list contains replaced values, and the second list contains any errors encountered during replacement. + tuple: (new_expression, not_found_genes) """ tmp = [] err = [] @@ -277,16 +281,16 @@ l = l[1:] return (tmp, err) -def replace_gene(l :str, d :str) -> Union[int, float]: +def replace_gene(l: str, d: Dict[str, Union[int, float]]) -> Union[int, float, None]: """ Replace a single gene identifier with its corresponding value from a dictionary. Args: l (str): Gene identifier to replace. - d (str): String corresponding to its value. + d (dict): Dict mapping gene IDs to numeric values. Returns: - float/int: Corresponding value from the dictionary if found, None otherwise. + float/int/None: Corresponding value from the dictionary if found, None otherwise. Raises: sys.exit: If the value associated with the gene identifier is not valid. @@ -508,9 +512,9 @@ Args: dataset (pd.DataFrame): Dataset containing gene values. rules (dict): The dict containing reaction ids as keys and rules as values. - - Side effects: - dataset : mut + + Note: + Modifies dataset in place by setting the first column as index. Returns: dict: A dictionary where each key corresponds to a cell line name and each value is a dictionary @@ -590,11 +594,11 @@ def save_as_tsv(rasScores: Dict[str, Dict[str, Ras]], reactions :List[str]) -> None: """ - Save computed ras scores to the given path, as a tsv file. + Save computed RAS scores to ARGS.ras_output as a TSV file. Args: rasScores : the computed ras scores. - path : the output tsv file's path. + reactions : the list of reaction IDs, used as the first column. Returns: None @@ -627,7 +631,7 @@ """ supportedGenesInEncoding = geneTranslator[encoding] if geneName in supportedGenesInEncoding: return supportedGenesInEncoding[geneName] - raise ValueError(f"Gene \"{geneName}\" non trovato, verifica di star utilizzando il modello corretto!") + raise ValueError(f"Gene '{geneName}' not found. Please verify you are using the correct model.") def load_custom_rules() -> Dict[str, ruleUtils.OpList]: """ @@ -637,14 +641,7 @@ Returns: Dict[str, ruleUtils.OpList] : dict mapping reaction IDs to rules. """ - datFilePath = utils.FilePath.fromStrPath(ARGS.model_upload) # actual file, stored in galaxy as a .dat - - #try: filenamePath = utils.FilePath.fromStrPath(ARGS.model_upload_name) # file's name in input, to determine its original ext - #except utils.PathErr as err: - # utils.logWarning(f"Cannot determine file extension from filename '{ARGS.model_upload_name}'. Assuming tabular format.", ARGS.out_log) - # filenamePath = None - - #if filenamePath.ext is utils.FileFormat.PICKLE: return utils.readPickle(datFilePath) + datFilePath = utils.FilePath.fromStrPath(ARGS.model_upload) # actual file, stored in Galaxy as a .dat dict_rule = {} @@ -658,7 +655,7 @@ id_idx, idx_gpr = utils.findIdxByName(rows[0], "GPR") - # Proviamo prima con delimitatore tab + # First, try using a tab delimiter for line in rows[1:]: if len(line) <= idx_gpr: utils.logWarning(f"Skipping malformed line: {line}", ARGS.out_log) @@ -670,7 +667,7 @@ dict_rule[line[id_idx]] = ruleUtils.parseRuleToNestedList(line[idx_gpr]) except Exception as e: - # Se fallisce con tab, proviamo con virgola + # If parsing with tabs fails, try comma delimiter try: rows = utils.readCsv(datFilePath, delimiter = ",", skipHeader=False) @@ -682,7 +679,7 @@ id_idx, idx_gpr = utils.findIdxByName(rows[0], "GPR") - # Proviamo prima con delimitatore tab + # Try again parsing row content with the GPR column using comma-separated values for line in rows[1:]: if len(line) <= idx_gpr: utils.logWarning(f"Skipping malformed line: {line}", ARGS.out_log) @@ -729,39 +726,7 @@ ARGS.out_log) - ############ - - # handle custom models - #model :utils.Model = ARGS.rules_selector - - #if model is utils.Model.Custom: - # rules = load_custom_rules() - # reactions = list(rules.keys()) - - # save_as_tsv(ras_for_cell_lines(dataset, rules), reactions) - # if ERRORS: utils.logWarning( - # f"The following genes are mentioned in the rules but don't appear in the dataset: {ERRORS}", - # ARGS.out_log) - - # return - - # This is the standard flow of the ras_generator program, for non-custom models. - #name = "RAS Dataset" - #type_gene = gene_type(dataset.iloc[0, 0], name) - - #rules = model.getRules(ARGS.tool_dir) - #genes = data_gene(dataset, type_gene, name, None) - #ids, rules = load_id_rules(rules.get(type_gene)) - - #resolve_rules, err = resolve(genes, rules, ids, ARGS.none, name) - #create_ras(resolve_rules, name, rules, ids, ARGS.ras_output) - - #if err: utils.logWarning( - # f"Warning: gene(s) {err} not found in class \"{name}\", " + - # "the expression level for this gene will be considered NaN", - # ARGS.out_log) - - print("Execution succeded") + print("Execution succeeded") ############################################################################### if __name__ == "__main__":