Repository: bimib/cobraxy
comparison COBRAxy/ras_generator_beta.py @ 456:a6e45049c1b9 draft default tip
Uploaded
author:   francesco_lapi
date:     Fri, 12 Sep 2025 17:28:45 +0000
parents:  4a385fdb9e58
children: (none)
455:4e2bc80764b6 | 456:a6e45049c1b9 |
1 """ | |
2 Generate Reaction Activity Scores (RAS) from a gene expression dataset and GPR rules. | |
3 | |
4 The script reads a tabular dataset (genes x samples) and a rules file (GPRs), | |
5 computes RAS per reaction for each sample/cell line, and writes a tabular output. | |
6 """ | |
1 from __future__ import division | 7 from __future__ import division |
2 # galaxy complains this ^^^ needs to be at the very beginning of the file, for some reason. | |
3 import sys | 8 import sys |
4 import argparse | 9 import argparse |
5 import collections | 10 import collections |
6 import pandas as pd | 11 import pandas as pd |
7 import pickle as pk | 12 import pickle as pk |
8 import utils.general_utils as utils | 13 import utils.general_utils as utils |
9 import utils.rule_parsing as ruleUtils | 14 import utils.rule_parsing as ruleUtils |
10 from typing import Union, Optional, List, Dict, Tuple, TypeVar | 15 from typing import Union, Optional, List, Dict, Tuple, TypeVar |
11 import os | |
12 | 16 |
13 ERRORS = [] | 17 ERRORS = [] |
14 ########################## argparse ########################################## | 18 ########################## argparse ########################################## |
15 ARGS :argparse.Namespace | 19 ARGS :argparse.Namespace |
16 def process_args(args:List[str] = None) -> argparse.Namespace: | 20 def process_args(args:List[str] = None) -> argparse.Namespace: |
29 | 33 |
30 parser.add_argument("-rl", "--model_upload", type = str, | 34 parser.add_argument("-rl", "--model_upload", type = str, |
31 help = "path to input file containing the rules") | 35 help = "path to input file containing the rules") |
32 | 36 |
33 parser.add_argument("-rn", "--model_upload_name", type = str, help = "custom rules name") | 37 parser.add_argument("-rn", "--model_upload_name", type = str, help = "custom rules name") |
34 # ^ I need this because galaxy converts my files into .dat but I need to know what extension they were in | 38 # Galaxy converts files into .dat, this helps infer the original extension when needed. |
35 | 39 |
36 parser.add_argument( | 40 parser.add_argument( |
37 '-n', '--none', | 41 '-n', '--none', |
38 type = utils.Bool("none"), default = True, | 42 type = utils.Bool("none"), default = True, |
39 help = 'compute NaN values') | 43 help = 'compute NaN values') |
47 '-ol', '--out_log', | 51 '-ol', '--out_log', |
48 type = str, | 52 type = str, |
49 help = "Output log") | 53 help = "Output log") |
50 | 54 |
51 parser.add_argument( | 55 parser.add_argument( |
52 '-in', '--input', # the "id" flag was renamed to "in" | 56 '-in', '--input', |
53 type = str, | 57 type = str, |
54 help = 'input dataset') | 58 help = 'input dataset') |
55 | 59 |
56 parser.add_argument( | 60 parser.add_argument( |
57 '-ra', '--ras_output', | 61 '-ra', '--ras_output', |
251 return (gene.set_index(gene.columns[0])).to_dict() | 255 return (gene.set_index(gene.columns[0])).to_dict() |
252 | 256 |
253 ############################ resolve ########################################## | 257 ############################ resolve ########################################## |
254 def replace_gene_value(l :str, d :str) -> Tuple[Union[int, float], list]: | 258 def replace_gene_value(l :str, d :str) -> Tuple[Union[int, float], list]: |
255 """ | 259 """ |
256 Replace gene identifiers with corresponding values from a dictionary. | 260 Replace gene identifiers in a parsed rule expression with values from a dict. |
257 | 261 |
258 Args: | 262 Args: |
259 l (str): String of gene identifier. | 263 l: Parsed rule as a nested list structure (strings, lists, and operators). |
260 d (str): String corresponding to its value. | 264 d: Dict mapping gene IDs to numeric values. |
261 | 265 |
262 Returns: | 266 Returns: |
263 tuple: A tuple containing two lists: the first list contains replaced values, and the second list contains any errors encountered during replacement. | 267 tuple: (new_expression, not_found_genes) |
264 """ | 268 """ |
265 tmp = [] | 269 tmp = [] |
266 err = [] | 270 err = [] |
267 while l: | 271 while l: |
268 if isinstance(l[0], list): | 272 if isinstance(l[0], list): |
275 if value == None: | 279 if value == None: |
276 err.append(l[0]) | 280 err.append(l[0]) |
277 l = l[1:] | 281 l = l[1:] |
278 return (tmp, err) | 282 return (tmp, err) |
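An illustrative view of the data shapes this function handles; gene names and values are hypothetical, and the flat gene-to-value dict follows the docstring above:

    rule = ["GENE_A", "and", ["GENE_B", "or", "GENE_C"]]  # parsed GPR: nested list of IDs and operators
    values = {"GENE_A": 2.0, "GENE_B": 0.5}               # GENE_C is absent from the dataset
    new_expr, not_found = replace_gene_value(rule, values)
    # new_expr keeps the rule structure with numbers substituted for known genes,
    # while not_found collects the identifiers that could not be resolved (here "GENE_C").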
279 | 283 |
280 def replace_gene(l :str, d :str) -> Union[int, float]: | 284 def replace_gene(l: str, d: Dict[str, Union[int, float]]) -> Union[int, float, None]: |
281 """ | 285 """ |
282 Replace a single gene identifier with its corresponding value from a dictionary. | 286 Replace a single gene identifier with its corresponding value from a dictionary. |
283 | 287 |
284 Args: | 288 Args: |
285 l (str): Gene identifier to replace. | 289 l (str): Gene identifier to replace. |
286 d (str): String corresponding to its value. | 290 d (dict): Dict mapping gene IDs to numeric values. |
287 | 291 |
288 Returns: | 292 Returns: |
289 float/int: Corresponding value from the dictionary if found, None otherwise. | 293 float/int/None: Corresponding value from the dictionary if found, None otherwise. |
290 | 294 |
291 Raises: | 295 Raises: |
292 sys.exit: If the value associated with the gene identifier is not valid. | 296 sys.exit: If the value associated with the gene identifier is not valid. |
293 """ | 297 """ |
294 if l =='and' or l == 'or': | 298 if l =='and' or l == 'or': |
506 Generates the RAS scores for each cell line found in the dataset. | 510 Generates the RAS scores for each cell line found in the dataset. |
507 | 511 |
508 Args: | 512 Args: |
509 dataset (pd.DataFrame): Dataset containing gene values. | 513 dataset (pd.DataFrame): Dataset containing gene values. |
510 rules (dict): The dict containing reaction ids as keys and rules as values. | 514 rules (dict): The dict containing reaction ids as keys and rules as values. |
511 | 515 |
512 Side effects: | 516 Note: |
513 dataset : mut | 517 Modifies dataset in place by setting the first column as index. |
514 | 518 |
515 Returns: | 519 Returns: |
516 dict: A dictionary where each key corresponds to a cell line name and each value is a dictionary | 520 dict: A dictionary where each key corresponds to a cell line name and each value is a dictionary |
517 where each key corresponds to a reaction ID and each value is its computed RAS score. | 521 where each key corresponds to a reaction ID and each value is its computed RAS score. |
518 """ | 522 """ |
588 | 592 |
589 return ras_value | 593 return ras_value |
590 | 594 |
591 def save_as_tsv(rasScores: Dict[str, Dict[str, Ras]], reactions :List[str]) -> None: | 595 def save_as_tsv(rasScores: Dict[str, Dict[str, Ras]], reactions :List[str]) -> None: |
592 """ | 596 """ |
593 Save computed ras scores to the given path, as a tsv file. | 597 Save computed RAS scores to ARGS.ras_output as a TSV file. |
594 | 598 |
595 Args: | 599 Args: |
596 rasScores : the computed ras scores. | 600 rasScores : the computed ras scores. |
597 path : the output tsv file's path. | 601 reactions : the list of reaction IDs, used as the first column. |
598 | 602 |
599 Returns: | 603 Returns: |
600 None | 604 None |
601 """ | 605 """ |
602 for scores in rasScores.values(): # this is actually a lot faster than using the out-of-the-box dataframe method, sadly | 606 for scores in rasScores.values(): # this is actually a lot faster than using the out-of-the-box dataframe method, sadly |
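An illustrative sketch of the file written to ARGS.ras_output, with reactions as the first column and one column per cell line; names and values are hypothetical and the real file is tab-separated:

    Reactions    sample_1    sample_2
    R_PGI        12.3        9.8
    R_PFK        4.0         None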
625 Returns: | 629 Returns: |
626 str: the gene in HugoID encoding. | 630 str: the gene in HugoID encoding. |
627 """ | 631 """ |
628 supportedGenesInEncoding = geneTranslator[encoding] | 632 supportedGenesInEncoding = geneTranslator[encoding] |
629 if geneName in supportedGenesInEncoding: return supportedGenesInEncoding[geneName] | 633 if geneName in supportedGenesInEncoding: return supportedGenesInEncoding[geneName] |
630 raise ValueError(f"Gene \"{geneName}\" non trovato, verifica di star utilizzando il modello corretto!") | 634 raise ValueError(f"Gene '{geneName}' not found. Please verify you are using the correct model.") |
631 | 635 |
632 def load_custom_rules() -> Dict[str, ruleUtils.OpList]: | 636 def load_custom_rules() -> Dict[str, ruleUtils.OpList]: |
633 """ | 637 """ |
634 Opens custom rules file and extracts the rules. If the file is in .csv format an additional parsing step will be | 638 Opens custom rules file and extracts the rules. If the file is in .csv format an additional parsing step will be |
635 performed, significantly impacting the runtime. | 639 performed, significantly impacting the runtime. |
636 | 640 |
637 Returns: | 641 Returns: |
638 Dict[str, ruleUtils.OpList] : dict mapping reaction IDs to rules. | 642 Dict[str, ruleUtils.OpList] : dict mapping reaction IDs to rules. |
639 """ | 643 """ |
640 datFilePath = utils.FilePath.fromStrPath(ARGS.model_upload) # actual file, stored in galaxy as a .dat | 644 datFilePath = utils.FilePath.fromStrPath(ARGS.model_upload) # actual file, stored in Galaxy as a .dat |
641 | |
642 #try: filenamePath = utils.FilePath.fromStrPath(ARGS.model_upload_name) # file's name in input, to determine its original ext | |
643 #except utils.PathErr as err: | |
644 # utils.logWarning(f"Cannot determine file extension from filename '{ARGS.model_upload_name}'. Assuming tabular format.", ARGS.out_log) | |
645 # filenamePath = None | |
646 | |
647 #if filenamePath.ext is utils.FileFormat.PICKLE: return utils.readPickle(datFilePath) | |
648 | 645 |
649 dict_rule = {} | 646 dict_rule = {} |
650 | 647 |
651 try: | 648 try: |
652 rows = utils.readCsv(datFilePath, delimiter = "\t", skipHeader=False) | 649 rows = utils.readCsv(datFilePath, delimiter = "\t", skipHeader=False) |
656 if not rows: | 653 if not rows: |
657 raise ValueError("Model tabular file is empty.") | 654 raise ValueError("Model tabular file is empty.") |
658 | 655 |
659 id_idx, idx_gpr = utils.findIdxByName(rows[0], "GPR") | 656 id_idx, idx_gpr = utils.findIdxByName(rows[0], "GPR") |
660 | 657 |
661 # Proviamo prima con delimitatore tab | 658 # First, try using a tab delimiter |
662 for line in rows[1:]: | 659 for line in rows[1:]: |
663 if len(line) <= idx_gpr: | 660 if len(line) <= idx_gpr: |
664 utils.logWarning(f"Skipping malformed line: {line}", ARGS.out_log) | 661 utils.logWarning(f"Skipping malformed line: {line}", ARGS.out_log) |
665 continue | 662 continue |
666 | 663 |
668 dict_rule[line[id_idx]] = ruleUtils.OpList([""]) | 665 dict_rule[line[id_idx]] = ruleUtils.OpList([""]) |
669 else: | 666 else: |
670 dict_rule[line[id_idx]] = ruleUtils.parseRuleToNestedList(line[idx_gpr]) | 667 dict_rule[line[id_idx]] = ruleUtils.parseRuleToNestedList(line[idx_gpr]) |
671 | 668 |
672 except Exception as e: | 669 except Exception as e: |
673 # Se fallisce con tab, proviamo con virgola | 670 # If parsing with tabs fails, try comma delimiter |
674 try: | 671 try: |
675 rows = utils.readCsv(datFilePath, delimiter = ",", skipHeader=False) | 672 rows = utils.readCsv(datFilePath, delimiter = ",", skipHeader=False) |
676 | 673 |
677 if len(rows) <= 1: | 674 if len(rows) <= 1: |
678 raise ValueError("Model tabular with 1 column is not supported.") | 675 raise ValueError("Model tabular with 1 column is not supported.") |
680 if not rows: | 677 if not rows: |
681 raise ValueError("Model tabular file is empty.") | 678 raise ValueError("Model tabular file is empty.") |
682 | 679 |
683 id_idx, idx_gpr = utils.findIdxByName(rows[0], "GPR") | 680 id_idx, idx_gpr = utils.findIdxByName(rows[0], "GPR") |
684 | 681 |
685 # Proviamo prima con delimitatore tab | 682 # Try again parsing row content with the GPR column using comma-separated values |
686 for line in rows[1:]: | 683 for line in rows[1:]: |
687 if len(line) <= idx_gpr: | 684 if len(line) <= idx_gpr: |
688 utils.logWarning(f"Skipping malformed line: {line}", ARGS.out_log) | 685 utils.logWarning(f"Skipping malformed line: {line}", ARGS.out_log) |
689 continue | 686 continue |
690 | 687 |
727 if ERRORS: utils.logWarning( | 724 if ERRORS: utils.logWarning( |
728 f"The following genes are mentioned in the rules but don't appear in the dataset: {ERRORS}", | 725 f"The following genes are mentioned in the rules but don't appear in the dataset: {ERRORS}", |
729 ARGS.out_log) | 726 ARGS.out_log) |
730 | 727 |
731 | 728 |
732 ############ | 729 print("Execution succeeded") |
733 | |
734 # handle custom models | |
735 #model :utils.Model = ARGS.rules_selector | |
736 | |
737 #if model is utils.Model.Custom: | |
738 # rules = load_custom_rules() | |
739 # reactions = list(rules.keys()) | |
740 | |
741 # save_as_tsv(ras_for_cell_lines(dataset, rules), reactions) | |
742 # if ERRORS: utils.logWarning( | |
743 # f"The following genes are mentioned in the rules but don't appear in the dataset: {ERRORS}", | |
744 # ARGS.out_log) | |
745 | |
746 # return | |
747 | |
748 # This is the standard flow of the ras_generator program, for non-custom models. | |
749 #name = "RAS Dataset" | |
750 #type_gene = gene_type(dataset.iloc[0, 0], name) | |
751 | |
752 #rules = model.getRules(ARGS.tool_dir) | |
753 #genes = data_gene(dataset, type_gene, name, None) | |
754 #ids, rules = load_id_rules(rules.get(type_gene)) | |
755 | |
756 #resolve_rules, err = resolve(genes, rules, ids, ARGS.none, name) | |
757 #create_ras(resolve_rules, name, rules, ids, ARGS.ras_output) | |
758 | |
759 #if err: utils.logWarning( | |
760 # f"Warning: gene(s) {err} not found in class \"{name}\", " + | |
761 # "the expression level for this gene will be considered NaN", | |
762 # ARGS.out_log) | |
763 | |
764 print("Execution succeded") | |
765 | 730 |
766 ############################################################################### | 731 ############################################################################### |
767 if __name__ == "__main__": | 732 if __name__ == "__main__": |
768 main() | 733 main() |