comparison COBRAxy/ras_generator_beta.py @ 456:a6e45049c1b9 draft default tip

Uploaded
author francesco_lapi
date Fri, 12 Sep 2025 17:28:45 +0000
parents 4a385fdb9e58
children
comparison
equal deleted inserted replaced
455:4e2bc80764b6 456:a6e45049c1b9
1 """
2 Generate Reaction Activity Scores (RAS) from a gene expression dataset and GPR rules.
3
4 The script reads a tabular dataset (genes x samples) and a rules file (GPRs),
5 computes RAS per reaction for each sample/cell line, and writes a tabular output.
6 """
1 from __future__ import division 7 from __future__ import division
2 # galaxy complains this ^^^ needs to be at the very beginning of the file, for some reason.
3 import sys 8 import sys
4 import argparse 9 import argparse
5 import collections 10 import collections
6 import pandas as pd 11 import pandas as pd
7 import pickle as pk 12 import pickle as pk
8 import utils.general_utils as utils 13 import utils.general_utils as utils
9 import utils.rule_parsing as ruleUtils 14 import utils.rule_parsing as ruleUtils
10 from typing import Union, Optional, List, Dict, Tuple, TypeVar 15 from typing import Union, Optional, List, Dict, Tuple, TypeVar
11 import os
12 16
13 ERRORS = [] 17 ERRORS = []
14 ########################## argparse ########################################## 18 ########################## argparse ##########################################
15 ARGS :argparse.Namespace 19 ARGS :argparse.Namespace
16 def process_args(args:List[str] = None) -> argparse.Namespace: 20 def process_args(args:List[str] = None) -> argparse.Namespace:
29 33
30 parser.add_argument("-rl", "--model_upload", type = str, 34 parser.add_argument("-rl", "--model_upload", type = str,
31 help = "path to input file containing the rules") 35 help = "path to input file containing the rules")
32 36
33 parser.add_argument("-rn", "--model_upload_name", type = str, help = "custom rules name") 37 parser.add_argument("-rn", "--model_upload_name", type = str, help = "custom rules name")
34 # ^ I need this because galaxy converts my files into .dat but I need to know what extension they were in 38 # Galaxy converts files into .dat, this helps infer the original extension when needed.
35 39
36 parser.add_argument( 40 parser.add_argument(
37 '-n', '--none', 41 '-n', '--none',
38 type = utils.Bool("none"), default = True, 42 type = utils.Bool("none"), default = True,
39 help = 'compute Nan values') 43 help = 'compute Nan values')
47 '-ol', '--out_log', 51 '-ol', '--out_log',
48 type = str, 52 type = str,
49 help = "Output log") 53 help = "Output log")
50 54
51 parser.add_argument( 55 parser.add_argument(
52 '-in', '--input', #id รจ diventato in 56 '-in', '--input',
53 type = str, 57 type = str,
54 help = 'input dataset') 58 help = 'input dataset')
55 59
56 parser.add_argument( 60 parser.add_argument(
57 '-ra', '--ras_output', 61 '-ra', '--ras_output',
251 return (gene.set_index(gene.columns[0])).to_dict() 255 return (gene.set_index(gene.columns[0])).to_dict()
252 256
253 ############################ resolve ########################################## 257 ############################ resolve ##########################################
254 def replace_gene_value(l :str, d :str) -> Tuple[Union[int, float], list]: 258 def replace_gene_value(l :str, d :str) -> Tuple[Union[int, float], list]:
255 """ 259 """
256 Replace gene identifiers with corresponding values from a dictionary. 260 Replace gene identifiers in a parsed rule expression with values from a dict.
257 261
258 Args: 262 Args:
259 l (str): String of gene identifier. 263 l: Parsed rule as a nested list structure (strings, lists, and operators).
260 d (str): String corresponding to its value. 264 d: Dict mapping gene IDs to numeric values.
261 265
262 Returns: 266 Returns:
263 tuple: A tuple containing two lists: the first list contains replaced values, and the second list contains any errors encountered during replacement. 267 tuple: (new_expression, not_found_genes)
264 """ 268 """
265 tmp = [] 269 tmp = []
266 err = [] 270 err = []
267 while l: 271 while l:
268 if isinstance(l[0], list): 272 if isinstance(l[0], list):
275 if value == None: 279 if value == None:
276 err.append(l[0]) 280 err.append(l[0])
277 l = l[1:] 281 l = l[1:]
278 return (tmp, err) 282 return (tmp, err)
279 283
280 def replace_gene(l :str, d :str) -> Union[int, float]: 284 def replace_gene(l: str, d: Dict[str, Union[int, float]]) -> Union[int, float, None]:
281 """ 285 """
282 Replace a single gene identifier with its corresponding value from a dictionary. 286 Replace a single gene identifier with its corresponding value from a dictionary.
283 287
284 Args: 288 Args:
285 l (str): Gene identifier to replace. 289 l (str): Gene identifier to replace.
286 d (str): String corresponding to its value. 290 d (dict): Dict mapping gene IDs to numeric values.
287 291
288 Returns: 292 Returns:
289 float/int: Corresponding value from the dictionary if found, None otherwise. 293 float/int/None: Corresponding value from the dictionary if found, None otherwise.
290 294
291 Raises: 295 Raises:
292 sys.exit: If the value associated with the gene identifier is not valid. 296 sys.exit: If the value associated with the gene identifier is not valid.
293 """ 297 """
294 if l =='and' or l == 'or': 298 if l =='and' or l == 'or':
506 Generates the RAS scores for each cell line found in the dataset. 510 Generates the RAS scores for each cell line found in the dataset.
507 511
508 Args: 512 Args:
509 dataset (pd.DataFrame): Dataset containing gene values. 513 dataset (pd.DataFrame): Dataset containing gene values.
510 rules (dict): The dict containing reaction ids as keys and rules as values. 514 rules (dict): The dict containing reaction ids as keys and rules as values.
511 515
512 Side effects: 516 Note:
513 dataset : mut 517 Modifies dataset in place by setting the first column as index.
514 518
515 Returns: 519 Returns:
516 dict: A dictionary where each key corresponds to a cell line name and each value is a dictionary 520 dict: A dictionary where each key corresponds to a cell line name and each value is a dictionary
517 where each key corresponds to a reaction ID and each value is its computed RAS score. 521 where each key corresponds to a reaction ID and each value is its computed RAS score.
518 """ 522 """
588 592
589 return ras_value 593 return ras_value
590 594
591 def save_as_tsv(rasScores: Dict[str, Dict[str, Ras]], reactions :List[str]) -> None: 595 def save_as_tsv(rasScores: Dict[str, Dict[str, Ras]], reactions :List[str]) -> None:
592 """ 596 """
593 Save computed ras scores to the given path, as a tsv file. 597 Save computed RAS scores to ARGS.ras_output as a TSV file.
594 598
595 Args: 599 Args:
596 rasScores : the computed ras scores. 600 rasScores : the computed ras scores.
597 path : the output tsv file's path. 601 reactions : the list of reaction IDs, used as the first column.
598 602
599 Returns: 603 Returns:
600 None 604 None
601 """ 605 """
602 for scores in rasScores.values(): # this is actually a lot faster than using the ootb dataframe metod, sadly 606 for scores in rasScores.values(): # this is actually a lot faster than using the ootb dataframe metod, sadly
625 Returns: 629 Returns:
626 str: the gene in HugoID encoding. 630 str: the gene in HugoID encoding.
627 """ 631 """
628 supportedGenesInEncoding = geneTranslator[encoding] 632 supportedGenesInEncoding = geneTranslator[encoding]
629 if geneName in supportedGenesInEncoding: return supportedGenesInEncoding[geneName] 633 if geneName in supportedGenesInEncoding: return supportedGenesInEncoding[geneName]
630 raise ValueError(f"Gene \"{geneName}\" non trovato, verifica di star utilizzando il modello corretto!") 634 raise ValueError(f"Gene '{geneName}' not found. Please verify you are using the correct model.")
631 635
632 def load_custom_rules() -> Dict[str, ruleUtils.OpList]: 636 def load_custom_rules() -> Dict[str, ruleUtils.OpList]:
633 """ 637 """
634 Opens custom rules file and extracts the rules. If the file is in .csv format an additional parsing step will be 638 Opens custom rules file and extracts the rules. If the file is in .csv format an additional parsing step will be
635 performed, significantly impacting the runtime. 639 performed, significantly impacting the runtime.
636 640
637 Returns: 641 Returns:
638 Dict[str, ruleUtils.OpList] : dict mapping reaction IDs to rules. 642 Dict[str, ruleUtils.OpList] : dict mapping reaction IDs to rules.
639 """ 643 """
640 datFilePath = utils.FilePath.fromStrPath(ARGS.model_upload) # actual file, stored in galaxy as a .dat 644 datFilePath = utils.FilePath.fromStrPath(ARGS.model_upload) # actual file, stored in Galaxy as a .dat
641
642 #try: filenamePath = utils.FilePath.fromStrPath(ARGS.model_upload_name) # file's name in input, to determine its original ext
643 #except utils.PathErr as err:
644 # utils.logWarning(f"Cannot determine file extension from filename '{ARGS.model_upload_name}'. Assuming tabular format.", ARGS.out_log)
645 # filenamePath = None
646
647 #if filenamePath.ext is utils.FileFormat.PICKLE: return utils.readPickle(datFilePath)
648 645
649 dict_rule = {} 646 dict_rule = {}
650 647
651 try: 648 try:
652 rows = utils.readCsv(datFilePath, delimiter = "\t", skipHeader=False) 649 rows = utils.readCsv(datFilePath, delimiter = "\t", skipHeader=False)
656 if not rows: 653 if not rows:
657 raise ValueError("Model tabular is file is empty.") 654 raise ValueError("Model tabular is file is empty.")
658 655
659 id_idx, idx_gpr = utils.findIdxByName(rows[0], "GPR") 656 id_idx, idx_gpr = utils.findIdxByName(rows[0], "GPR")
660 657
661 # Proviamo prima con delimitatore tab 658 # First, try using a tab delimiter
662 for line in rows[1:]: 659 for line in rows[1:]:
663 if len(line) <= idx_gpr: 660 if len(line) <= idx_gpr:
664 utils.logWarning(f"Skipping malformed line: {line}", ARGS.out_log) 661 utils.logWarning(f"Skipping malformed line: {line}", ARGS.out_log)
665 continue 662 continue
666 663
668 dict_rule[line[id_idx]] = ruleUtils.OpList([""]) 665 dict_rule[line[id_idx]] = ruleUtils.OpList([""])
669 else: 666 else:
670 dict_rule[line[id_idx]] = ruleUtils.parseRuleToNestedList(line[idx_gpr]) 667 dict_rule[line[id_idx]] = ruleUtils.parseRuleToNestedList(line[idx_gpr])
671 668
672 except Exception as e: 669 except Exception as e:
673 # Se fallisce con tab, proviamo con virgola 670 # If parsing with tabs fails, try comma delimiter
674 try: 671 try:
675 rows = utils.readCsv(datFilePath, delimiter = ",", skipHeader=False) 672 rows = utils.readCsv(datFilePath, delimiter = ",", skipHeader=False)
676 673
677 if len(rows) <= 1: 674 if len(rows) <= 1:
678 raise ValueError("Model tabular with 1 column is not supported.") 675 raise ValueError("Model tabular with 1 column is not supported.")
680 if not rows: 677 if not rows:
681 raise ValueError("Model tabular is file is empty.") 678 raise ValueError("Model tabular is file is empty.")
682 679
683 id_idx, idx_gpr = utils.findIdxByName(rows[0], "GPR") 680 id_idx, idx_gpr = utils.findIdxByName(rows[0], "GPR")
684 681
685 # Proviamo prima con delimitatore tab 682 # Try again parsing row content with the GPR column using comma-separated values
686 for line in rows[1:]: 683 for line in rows[1:]:
687 if len(line) <= idx_gpr: 684 if len(line) <= idx_gpr:
688 utils.logWarning(f"Skipping malformed line: {line}", ARGS.out_log) 685 utils.logWarning(f"Skipping malformed line: {line}", ARGS.out_log)
689 continue 686 continue
690 687
727 if ERRORS: utils.logWarning( 724 if ERRORS: utils.logWarning(
728 f"The following genes are mentioned in the rules but don't appear in the dataset: {ERRORS}", 725 f"The following genes are mentioned in the rules but don't appear in the dataset: {ERRORS}",
729 ARGS.out_log) 726 ARGS.out_log)
730 727
731 728
732 ############ 729 print("Execution succeeded")
733
734 # handle custom models
735 #model :utils.Model = ARGS.rules_selector
736
737 #if model is utils.Model.Custom:
738 # rules = load_custom_rules()
739 # reactions = list(rules.keys())
740
741 # save_as_tsv(ras_for_cell_lines(dataset, rules), reactions)
742 # if ERRORS: utils.logWarning(
743 # f"The following genes are mentioned in the rules but don't appear in the dataset: {ERRORS}",
744 # ARGS.out_log)
745
746 # return
747
748 # This is the standard flow of the ras_generator program, for non-custom models.
749 #name = "RAS Dataset"
750 #type_gene = gene_type(dataset.iloc[0, 0], name)
751
752 #rules = model.getRules(ARGS.tool_dir)
753 #genes = data_gene(dataset, type_gene, name, None)
754 #ids, rules = load_id_rules(rules.get(type_gene))
755
756 #resolve_rules, err = resolve(genes, rules, ids, ARGS.none, name)
757 #create_ras(resolve_rules, name, rules, ids, ARGS.ras_output)
758
759 #if err: utils.logWarning(
760 # f"Warning: gene(s) {err} not found in class \"{name}\", " +
761 # "the expression level for this gene will be considered NaN",
762 # ARGS.out_log)
763
764 print("Execution succeded")
765 730
766 ############################################################################### 731 ###############################################################################
767 if __name__ == "__main__": 732 if __name__ == "__main__":
768 main() 733 main()