93
+ − 1 from __future__ import division
+ − 2 # galaxy complains this ^^^ needs to be at the very beginning of the file, for some reason.
+ − 3 import sys
+ − 4 import argparse
+ − 5 import collections
+ − 6 import pandas as pd
+ − 7 import pickle as pk
+ − 8 import utils.general_utils as utils
+ − 9 import utils.rule_parsing as ruleUtils
+ − 10 from typing import Union, Optional, List, Dict, Tuple, TypeVar
+ − 11
# Names of genes that were looked up in a dataset but not found; get_gene_expr appends to it
# and main() reports its content at the end of the custom-rules flow.
ERRORS = []
########################## argparse ##########################################
# Parsed command-line arguments. Only declared here: main() assigns it through `global ARGS`.
ARGS :argparse.Namespace
147
def process_args(args:List[str] = None) -> argparse.Namespace:
    """
    Build the command-line parser of the tool and parse the given arguments.

    Args:
        args (list): Command-line arguments to parse. It is forwarded unchanged to
            ArgumentParser.parse_args, which falls back to sys.argv when given None.

    Returns:
        Namespace: An object containing parsed arguments.
    """
    parser = argparse.ArgumentParser(
        usage = '%(prog)s [options]',
        description = "process some value's genes to create a comparison's map.")

    # Each entry is (option flags, keyword arguments for add_argument); options are
    # registered in exactly this order.
    option_table = (
        (('-rs', '--rules_selector'), {
            'type': utils.Model,
            'default': utils.Model.HMRcore,
            'choices': list(utils.Model),
            'help': 'chose which type of dataset you want use'}),

        (("-rl", "--rule_list"), {
            'type': str,
            'help': "path to input file with custom rules, if provided"}),

        # Galaxy converts uploaded files into .dat, the original name is needed to know
        # which extension they had.
        (("-rn", "--rules_name"), {
            'type': str,
            'help': "custom rules name"}),

        (('-n', '--none'), {
            'type': utils.Bool("none"),
            'default': True,
            'help': 'compute Nan values'}),

        (('-td', '--tool_dir'), {
            'type': str,
            'required': True,
            'help': 'your tool directory'}),

        (('-ol', '--out_log'), {
            'type': str,
            'help': "Output log"}),

        # original note (translated from Italian): "id became in"
        (('-in', '--input'), {
            'type': str,
            'help': 'input dataset'}),

        (('-ra', '--ras_output'), {
            'type': str,
            'required': True,
            'help': 'ras output'}),
    )

    for flags, options in option_table:
        parser.add_argument(*flags, **options)

    return parser.parse_args(args)
93
+ − 67
+ − 68 ############################ dataset input ####################################
def read_dataset(data :str, name :str) -> pd.DataFrame:
    """
    Read a tab-separated dataset from a file and return it as a pandas DataFrame.

    The first line of the file is used as the header.

    Args:
        data (str): Path to the tab-separated file containing the dataset.
        name (str): Name of the dataset, used in error messages.

    Returns:
        pandas.DataFrame: DataFrame containing the dataset.

    Raises:
        SystemExit: If the file is empty, cannot be parsed as a table or has fewer than
            two columns, the execution is aborted.
    """
    wrong_format_msg = 'Execution aborted: wrong format of ' + name + '\n'
    try:
        dataset = pd.read_csv(data, sep = '\t', header = 0, engine = 'python')
    except (pd.errors.EmptyDataError, pd.errors.ParserError):
        # Malformed rows are a "wrong format" just like an empty file: abort with the
        # same message instead of leaking a parser traceback.
        sys.exit(wrong_format_msg)

    # At least the gene column plus one column of values is required.
    if len(dataset.columns) < 2:
        sys.exit(wrong_format_msg)
    return dataset
+ − 91
############################ load ids and rules ###############################
def load_id_rules(reactions :Dict[str, Dict[str, List[str]]]) -> Tuple[List[str], List[Dict[str, List[str]]]]:
    """
    Split a dictionary of reactions into its IDs and its rules.

    Args:
        reactions (dict): A dictionary where keys are IDs and values are rules.

    Returns:
        tuple: (ids, rules), two lists in the dictionary's iteration order, so that
        rules[i] is the rule stored under ids[i].
    """
    reaction_ids = list(reactions.keys())
    reaction_rules = list(reactions.values())
    return (reaction_ids, reaction_rules)
+ − 108
+ − 109 ############################ check_methods ####################################
def gene_type(l :str, name :str) -> str:
    """
    Determine the type of gene ID.

    The checks run in a fixed order (HGNC, Ensembl, symbol, Entrez) and the first one
    that accepts the identifier decides the result.

    Args:
        l (str): The gene identifier to check.
        name (str): The name of the dataset, used in error messages.

    Returns:
        str: The type of gene ID ('hugo_id', 'ensembl_gene_id', 'symbol', or 'entrez_id').

    Raises:
        SystemExit: If the gene ID type is not supported, the execution is aborted.
    """
    if check_hgnc(l):
        return 'hugo_id'
    if check_ensembl(l):
        return 'ensembl_gene_id'
    if check_symbol(l):
        return 'symbol'
    if check_entrez(l):
        return 'entrez_id'

    # The two halves of the sentence used to be joined without a separating space
    # ("Supported IDtypes are").
    sys.exit('Execution aborted:\n' +
             'gene ID type in ' + name + ' not supported. Supported ID ' +
             'types are: HUGO ID, Ensemble ID, HUGO symbol, Entrez ID\n')
+ − 136
def check_hgnc(l :str) -> bool:
    """
    Tell whether a gene identifier looks like an HGNC ID.

    An identifier is accepted when it is longer than 5 characters, starts with 'HGNC:'
    (case-insensitive) and everything after the fifth character is made of digits.

    Args:
        l (str): The gene identifier to check.

    Returns:
        bool: True if the gene identifier follows the HGNC format, False otherwise.
    """
    return len(l) > 5 and l.upper().startswith('HGNC:') and l[5:].isdigit()
+ − 154
def check_ensembl(l :str) -> bool:
    """
    Tell whether a gene identifier looks like an Ensembl ID.

    Only the prefix is inspected: any identifier whose upper-cased form starts with
    'ENS' is accepted.

    Args:
        l (str): The gene identifier to check.

    Returns:
        bool: True if the gene identifier follows the Ensembl format, False otherwise.
    """
    prefix = 'ENS'
    return l.upper()[:len(prefix)] == prefix
+ − 166
+ − 167
def check_symbol(l :str) -> bool:
    """
    Tell whether a gene identifier looks like a gene symbol.

    The identifier must start with a letter and be followed by at least one more
    character, all of them alphanumeric. A single letter is therefore rejected,
    because the empty remainder is not alphanumeric.

    Args:
        l (str): The gene identifier to check.

    Returns:
        bool: True if the gene identifier follows the symbol format, False otherwise.
    """
    if not l:
        return False
    first_char, remainder = l[0], l[1:]
    return first_char.isalpha() and remainder.isalnum()
+ − 185
def check_entrez(l :str) -> bool:
    """
    Tell whether a gene identifier looks like an Entrez ID, i.e. a non-empty string
    made only of digits.

    Args:
        l (str): The gene identifier to check.

    Returns:
        bool: True if the gene identifier follows the Entrez ID format, False otherwise.
    """
    # str.isdigit() is already False for the empty string, no separate length check needed.
    return l.isdigit()
+ − 200
+ − 201 ############################ gene #############################################
def _load_genes_in_rules(type_gene :str) -> Dict[str, str]:
    """
    Load, for the model selected in ARGS, the pickled table of the genes used by its rules
    and return the entry stored under the given gene ID type.
    """
    selector = ARGS.rules_selector
    # The selector is built by argparse with utils.Model, so it may be an enum member rather
    # than a plain string: match on either the value itself or its member name.
    model_name = next(
        (candidate for candidate in ('HMRcore', 'Recon', 'ENGRO2')
         if selector == candidate or getattr(selector, 'name', None) == candidate),
        None)
    if model_name is None:
        # Without this guard the lookup below would fail with an unbound variable.
        sys.exit('Execution aborted: no gene list available for model ' + str(selector) + '\n')

    # NOTE(review): pickle can execute arbitrary code while loading; this is only acceptable
    # because the file is read from the tool's own directory.
    with open(ARGS.tool_dir + '/local/pickle files/' + model_name + '_genes.p', 'rb') as pickle_file:
        genes_by_type = pk.load(pickle_file)

    # Trace output kept from the original implementation, now reporting the file actually loaded.
    print(f"{ARGS.tool_dir}'/local/pickle files/{model_name}_genes.p'")
    utils.logWarning(f"{ARGS.tool_dir}'/local/pickle files/{model_name}_genes.p'", ARGS.out_log)
    print(selector)
    return genes_by_type.get(type_gene)

def data_gene(gene: pd.DataFrame, type_gene: str, name: str, gene_custom: Optional[Dict[str, str]]) -> Dict[str, str]:
    """
    Process gene data to ensure correct formatting and handle duplicates.

    Gene IDs (first column) are stripped of surrounding whitespace and of everything after
    the first '.'; the DataFrame is modified in place. A duplicated gene ID aborts the
    execution only when the gene is marked 'ok' in the table of genes used by the rules.
    Duplicated column labels are only reported in the log.

    Args:
        gene (DataFrame): DataFrame containing gene data, gene IDs in the first column.
        type_gene (str): Type of gene data (e.g., 'hugo_id', 'ensembl_gene_id', 'symbol', 'entrez_id').
        name (str): Name of the dataset.
        gene_custom (dict or None): Custom table of the genes used by the rules; when None the
            table of the model selected on the command line is loaded instead.

    Returns:
        dict: The result of DataFrame.to_dict() on the data indexed by gene ID, that is one
        entry per remaining column, each mapping gene IDs to that column's values.

    Raises:
        SystemExit: If a gene used by the rules is duplicated in the dataset, or if no gene
            table exists for the selected model.
    """
    # Read the already parsed global ARGS (as the logging calls below do) rather than parsing
    # sys.argv again: main() may have been called with an explicit argument list.
    gene.iloc[:, 0] = [gene_id.strip().split('.')[0] for gene_id in gene.iloc[:, 0]]

    # Positional access keeps this a single column even when column labels are duplicated.
    gene_dup = [item for item, count in
        collections.Counter(gene.iloc[:, 0]).items() if count > 1]
    pat_dup = [item for item, count in
        collections.Counter(gene.columns).items() if count > 1]

    if gene_dup:
        gene_in_rule = _load_genes_in_rules(type_gene) if gene_custom is None else gene_custom
        duplicated_in_rules = [gene_id for gene_id in gene_dup if gene_in_rule.get(gene_id) == 'ok']
        if duplicated_in_rules:
            sys.exit('Execution aborted because gene ID '
                +str(duplicated_in_rules)+' in '+name+' is duplicated\n')

    if pat_dup: utils.logWarning(f"Warning: duplicated label\n{pat_dup} in {name}", ARGS.out_log)
    return (gene.set_index(gene.columns[0])).to_dict()
+ − 252
+ − 253 ############################ resolve ##########################################
def replace_gene_value(l :list, d :dict) -> Tuple[list, list]:
    """
    Replace gene identifiers with corresponding values from a dictionary.

    The expression is walked recursively: nested lists are processed with the same
    function and keep their nesting in the result, every other item is passed to
    replace_gene.

    Args:
        l (list): Expression to process, a (possibly nested) list of items.
        d (dict): Dictionary used by replace_gene to look up the value of each item.

    Returns:
        tuple: A tuple containing two lists: the first list contains replaced values, and
        the second list contains the items for which replace_gene returned None.
    """
    replaced = []
    not_found = []
    # Plain iteration instead of repeatedly slicing the head off the list, which copied
    # the remainder at every step.
    for item in l:
        if isinstance(item, list):
            nested_replaced, nested_not_found = replace_gene_value(item, d)
            replaced.append(nested_replaced)
            not_found.extend(nested_not_found)
        else:
            value = replace_gene(item, d)
            replaced.append(value)
            if value is None:
                not_found.append(item)
    return (replaced, not_found)
+ − 279
def replace_gene(l :str, d :dict) -> Optional[Union[int, float, str]]:
    """
    Replace a single gene identifier with its corresponding value from a dictionary.

    Args:
        l (str): Gene identifier to replace, or one of the operators 'and' / 'or'.
        d (dict): Dictionary mapping gene identifiers to their values.

    Returns:
        The operator itself when l is 'and' or 'or'; otherwise the value stored for l in d,
        or None when l is not in d.

    Raises:
        SystemExit: If the value associated with the gene identifier is neither None nor
            an int/float, the execution is aborted.
    """
    if l == 'and' or l == 'or':
        return l

    value = d.get(l, None)
    if not (value is None or isinstance(value, (int, float))):
        # Format through str(): the invalid value is not necessarily a string, and plain
        # concatenation would raise TypeError instead of showing this message.
        sys.exit('Execution aborted: ' + str(value) + ' value not valid\n')
    return value
+ − 301
T = TypeVar("T", bound = Optional[Union[int, float]])
def computes(val1 :T, op :str, val2 :T, cn :bool) -> T:
    """
    Combine two values with an operator to obtain a RAS value.

    With both values available, 'and' yields their minimum and any other operator their
    sum. With a missing (None) value, 'and' yields the available value only when cn is
    True and None otherwise, while any other operator always yields the available value.
    When both values are missing the result is None.

    Args:
        val1(Optional(Union[float, int])): First value.
        op (str): Operator ('and' or 'or').
        val2(Optional(Union[float, int])): Second value.
        cn (bool): Control boolean value, it lets 'and' ignore a missing operand.

    Returns:
        Optional(Union[float, int]): Result of the computation.
    """
    is_and = op == 'and'

    if val1 is not None and val2 is not None:
        return min(val1, val2) if is_and else val1 + val2

    # From here on at least one operand is missing.
    if is_and and cn is not True:
        return None
    return val1 if val1 is not None else val2
+ − 338
# ris should be Literal[None] but Literal is not supported in Python 3.7
def control(ris, l :List[Union[int, float, list]], cn :bool) -> Union[bool, int, float]: #Union[Literal[False], int, float]:
    """
    Validate an expression and evaluate it.

    A one-item expression evaluates to the item itself when it is a number or None, and
    to the evaluation of the item when it is a nested list. Expressions with more than
    two items are delegated to control_list. Anything else is malformed.

    Args:
        ris: Intermediate result.
        l (list): Expression to control.
        cn (bool): Control boolean value.

    Returns:
        Union[Literal[False], int, float]: Result of the control, False meaning that the
        expression is malformed.
    """
    size = len(l)
    if size > 2:
        return control_list(ris, l, cn)
    if size != 1:
        return False

    only_item = l[0]
    if isinstance(only_item, list):
        return control(None, only_item, cn)
    if only_item is None or isinstance(only_item, (float, int)):
        return only_item
    return False
+ − 363
def control_list(ris, l :List[Optional[Union[float, int, list]]], cn :bool) -> Optional[bool]: #Optional[Literal[False]]:
    """
    Control the format of a list of expressions and evaluate it from left to right.

    The list is consumed in chunks; each chunk is combined through computes and the
    running result is kept in ris. Nested lists are evaluated through control.

    Args:
        ris: Intermediate result.
        l (list): List of expressions to control.
        cn (bool): Control boolean value.

    Returns:
        Optional[Literal[False]]: False as soon as a malformed chunk is met, otherwise the
        value of ris once the whole list has been consumed.
    """
    while l:
        # a single leftover item cannot be combined with anything
        if len(l) == 1:
            return False
        # chunk "<value> <operator> <operand>": three items are consumed
        elif (isinstance(l[0], (float, int)) or
              l[0] == None) and l[1] in ['and', 'or']:
            if isinstance(l[2], (float, int)) or l[2] == None:
                ris = computes(l[0], l[1], l[2], cn)
            elif isinstance(l[2], list):
                # nested right operand: evaluate it first
                tmp = control(None, l[2], cn)
                if tmp is False:
                    return False
                else:
                    ris = computes(l[0], l[1], tmp, cn)
            else:
                return False
            l = l[3:]
        # chunk "<operator> <operand>": the running result is the left operand, two items are consumed
        elif l[0] in ['and', 'or']:
            if isinstance(l[1], (float, int)) or l[1] == None:
                ris = computes(ris, l[0], l[1], cn)
            elif isinstance(l[1], list):
                tmp = control(None,l[1], cn)
                if tmp is False:
                    return False
                else:
                    ris = computes(ris, l[0], tmp, cn)
            else:
                return False
            l = l[2:]
        # chunk "<nested list> <operator> <operand>": three items are consumed
        elif isinstance(l[0], list) and l[1] in ['and', 'or']:
            if isinstance(l[2], (float, int)) or l[2] == None:
                tmp = control(None, l[0], cn)
                if tmp is False:
                    return False
                else:
                    ris = computes(tmp, l[1], l[2], cn)
            elif isinstance(l[2], list):
                # both operands are nested: evaluate both before combining them
                tmp = control(None, l[0], cn)
                tmp2 = control(None, l[2], cn)
                if tmp is False or tmp2 is False:
                    return False
                else:
                    ris = computes(tmp, l[1], tmp2, cn)
            else:
                return False
            l = l[3:]
        else:
            return False
    return ris
+ − 424
ResolvedRules = Dict[str, List[Optional[Union[float, int]]]]
def resolve(genes: Dict[str, str], rules: List[str], ids: List[str], resolve_none: bool, name: str) -> Tuple[Optional[ResolvedRules], Optional[list]]:
    """
    Evaluate every rule against every entry of the gene data.

    For each key of genes the rules are evaluated in order: gene identifiers are replaced
    with the values stored under that key, then the expression is checked and computed.
    Empty rules and rules that cannot be computed score None.

    Args:
        genes (dict): Dictionary containing gene data; each value is the table used to
            replace gene identifiers for that key.
        rules (list): List of rules to resolve.
        ids (list): List of IDs corresponding to the rules (not used by the computation).
        resolve_none (bool): Flag indicating whether to resolve None values in the rules.
        name (str): Name of the dataset, used in the warning message.

    Returns:
        tuple: The resolved rules (one list of scores per key of genes) and the list of
        distinct gene IDs not found in the data, or (None, None) when not a single score
        could be computed.
    """
    scores_by_key = {}
    missing_genes = []
    any_score_computed = False

    for key, gene_values in genes.items():
        scores = []
        for rule in rules:
            # empty rule: nothing to compute
            if not rule:
                scores.append(None)
                continue

            replaced_rule, missing_in_rule = replace_gene_value(rule, gene_values)
            missing_genes.extend(missing_in_rule)

            ris = control(None, replaced_rule, resolve_none)
            if ris is False or ris == None:
                scores.append(None)
                continue

            scores.append(ris)
            any_score_computed = True

        scores_by_key[key] = scores

    if not any_score_computed:
        utils.logWarning(
            f"Warning: no computable score (due to missing gene values) for class {name}, the class has been disregarded",
            ARGS.out_log)

        return (None, None)

    return (scores_by_key, list(set(missing_genes)))
+ − 469 ############################ create_ras #######################################
def create_ras(resolve_rules: "Optional[ResolvedRules]", dataset_name: str, rules: List[str], ids: List[str], file: str) -> None:
    """
    Create a RAS (Reaction Activity Score) file from resolved rules.

    The file is tab-separated: a 'Reactions' column holding the given ids, followed by one
    column per key of resolve_rules. Missing scores are written as the text 'None'; the
    lists inside resolve_rules are modified in place to hold that text.

    Args:
        resolve_rules (dict or None): Dictionary containing resolved rules. When None only
            a warning is logged and no file is written.
        dataset_name (str): Name of the dataset, used in the warning message.
        rules (list): List of rules (not used to build the file).
        ids (list): Reaction IDs, one per score, used as the 'Reactions' column.
        file (str): Path to the output RAS file.

    Returns:
        None
    """
    if resolve_rules is None:
        utils.logWarning(f"Couldn't generate RAS for current dataset: {dataset_name}", ARGS.out_log)
        # Nothing to write: without this return the loop below fails on None.
        return

    for scores in resolve_rules.values():
        for i, score in enumerate(scores):
            if score is None:
                scores[i] = 'None'

    output_ras = pd.DataFrame.from_dict(resolve_rules)
    output_ras.insert(0, 'Reactions', ids)
    output_to_csv = output_ras.to_csv(sep = '\t', index = False)

    # The context manager closes the file even if the write fails.
    with open(file, "w") as text_file:
        text_file.write(output_to_csv)
+ − 500
+ − 501 ################################- NEW RAS COMPUTATION -################################
Expr = Optional[Union[int, float]]
Ras = Expr
def ras_for_cell_lines(dataset: pd.DataFrame, rules: Dict[str, ruleUtils.OpList]) -> Dict[str, Dict[str, Ras]]:
    """
    Generates the RAS scores for each cell line found in the dataset.

    Args:
        dataset (pd.DataFrame): Dataset containing gene values, gene names in the first column
            and one column per cell line after it.
        rules (dict): The dict containing reaction ids as keys and rules as values.

    Side effects:
        dataset : mut (its first column becomes the index, in place)

    Returns:
        dict: A dictionary where each key corresponds to a cell line name and each value is a dictionary
        where each key corresponds to a reaction ID and each value is its computed RAS score.
    """
    ras_values_by_cell_line = {}
    dataset.set_index(dataset.columns[0], inplace=True)
    # set_index already removed the gene column from the columns, so every remaining column
    # is a cell line: skipping one more here used to drop the first cell line from the output.
    for cell_line_name in dataset.columns:
        cell_line = dataset[cell_line_name].to_dict()
        ras_values_by_cell_line[cell_line_name] = get_ras_values(rules, cell_line)
    return ras_values_by_cell_line
+ − 526
def get_ras_values(value_rules: Dict[str, ruleUtils.OpList], dataset: Dict[str, Expr]) -> Dict[str, Ras]:
    """
    Computes the RAS (Reaction Activity Score) value of every rule in the given dict, for one cell line.

    Args:
        value_rules (dict): A dictionary where keys are reaction ids and values are OpLists.
        dataset : gene expression data of one cell line.

    Returns:
        dict: A dictionary where keys are reaction ids and values are the computed RAS values for each rule.
    """
    ras_by_reaction = {}
    for reaction_id, op_list in value_rules.items():
        ras_by_reaction[reaction_id] = ras_op_list(op_list, dataset)
    return ras_by_reaction
+ − 539
def get_gene_expr(dataset :Dict[str, Expr], name :str) -> Expr:
    """
    Looks up the expression of the given gene in a cell line dataset.

    A gene that is missing from the dataset, or stored with a None value, is recorded in
    the module-level ERRORS list.

    Args:
        dataset : gene expression data of one cell line.
        name : gene name.

    Returns:
        Expr : the gene's expression value, None when it is not available.
    """
    expression = dataset.get(name)
    if expression is not None:
        return expression

    ERRORS.append(name)
    return None
+ − 555
def ras_op_list(op_list: ruleUtils.OpList, dataset: Dict[str, Expr]) -> Ras:
    """
    Computes recursively the RAS (Reaction Activity Score) value for the given OpList.

    An OpList without operator is resolved as the expression of its first item. Otherwise
    every item is resolved (nested OpLists recursively, anything else as a gene) and the
    available values are folded together: summed for OR, reduced to their minimum for any
    other operator. Missing values are skipped, unless the operator is AND and ARGS.none
    is false, in which case the whole result is None.

    Args:
        op_list (OpList): The OpList representing a rule with gene values.
        dataset : gene expression data of one cell line.

    Returns:
        Ras: The computed RAS value for the given OpList.
    """
    op = op_list.op
    if not op: return get_gene_expr(dataset, op_list[0])

    # True when a single missing operand is enough to void the whole AND expression.
    none_voids_and = op is ruleUtils.RuleOp.AND and not ARGS.none
    if none_voids_and and None in op_list: return None

    ras_value :Ras = None
    for i in range(len(op_list)):
        operand = op_list[i]
        if isinstance(operand, ruleUtils.OpList):
            operand = ras_op_list(operand, dataset)
        else:
            operand = get_gene_expr(dataset, operand)

        if operand is None:
            if none_voids_and: return None
            continue

        if ras_value is None:
            ras_value = operand
        elif op is ruleUtils.RuleOp.OR:
            ras_value = ras_value + operand
        else:
            ras_value = min(ras_value, operand)

    return ras_value
+ − 590
def save_as_tsv(rasScores: Dict[str, Dict[str, Ras]], reactions :List[str]) -> None:
    """
    Save computed ras scores as a tsv file, at the path given by ARGS.ras_output.

    Missing scores are replaced in place, inside rasScores, with the text "None".

    Args:
        rasScores : the computed ras scores.
        reactions : the values of the 'Reactions' column, inserted as first column.

    Returns:
        None
    """
    # Replacing by hand was measured by the original author as a lot faster than the
    # out-of-the-box dataframe method.
    for scores in rasScores.values():
        missing_reactions = [reactId for reactId, score in scores.items() if score is None]
        for reactId in missing_reactions:
            scores[reactId] = "None"

    output_ras = pd.DataFrame.from_dict(rasScores)
    output_ras.insert(0, 'Reactions', reactions)
    output_ras.to_csv(ARGS.ras_output, sep = '\t', index = False)
+ − 609
+ − 610 ############################ MAIN #############################################
#TODO: not used but keep, it will be when the new translator dicts will be used.
def translateGene(geneName :str, encoding :str, geneTranslator :Dict[str, Dict[str, str]]) -> str:
    """
    Translate gene from any supported encoding to HugoID.

    Args:
        geneName (str): the name of the gene in its current encoding.
        encoding (str): the encoding.
        geneTranslator (Dict[str, Dict[str, str]]): the dict containing all supported gene names
        and encodings in the current model, mapping each to the corresponding HugoID encoding.

    Raises:
        ValueError: When the gene isn't supported in the model.
        KeyError: When the encoding itself is not a key of geneTranslator.

    Returns:
        str: the gene in HugoID encoding.
    """
    translationTable = geneTranslator[encoding]
    if geneName not in translationTable:
        raise ValueError(f"Gene \"{geneName}\" non trovato, verifica di star utilizzando il modello corretto!")

    return translationTable[geneName]
+ − 631
def load_custom_rules() -> Dict[str, ruleUtils.OpList]:
    """
    Opens custom rules file and extracts the rules. If the file is in .csv format an additional parsing step will be
    performed, significantly impacting the runtime.

    The file is read from ARGS.rule_list, while its original extension is taken from ARGS.rules_name.

    Raises:
        utils.PathErr: When the rules' name is not a valid file path.

    Returns:
        Dict[str, ruleUtils.OpList] : dict mapping reaction IDs to rules.
    """
    datFilePath = utils.FilePath.fromStrPath(ARGS.rule_list) # actual file, stored in galaxy as a .dat

    try: filenamePath = utils.FilePath.fromStrPath(ARGS.rules_name) # file's name in input, to determine its original ext
    except utils.PathErr as err:
        # filenamePath is never bound when the conversion fails, so the error is built from the
        # raw name instead. NOTE(review): assumes PathErr accepts the path string as its first
        # argument - confirm against utils.
        raise utils.PathErr(ARGS.rules_name, f"Please make sure your file's name is a valid file path, {err.msg}") from err

    if filenamePath.ext is utils.FileFormat.PICKLE: return utils.readPickle(datFilePath)

    # csv rules need to be parsed, those in a pickle format are taken to be pre-parsed.
    return { line[0] : ruleUtils.parseRuleToNestedList(line[1]) for line in utils.readCsv(datFilePath) }
+ − 650
147
def main(args:List[str] = None) -> None:
    """
    Initializes everything and sets the program in motion based on the frontend input arguments.

    Args:
        args (list): Command-line arguments, forwarded to process_args.

    Returns:
        None
    """
    # get args from frontend (related xml)
    global ARGS
    ARGS = process_args(args)
    print(ARGS.rules_selector)

    # read dataset, gene names are handled as text
    dataset = read_dataset(ARGS.input, "dataset")
    dataset.iloc[:, 0] = (dataset.iloc[:, 0]).astype(str)

    # remove versioning from gene names
    dataset.iloc[:, 0] = dataset.iloc[:, 0].str.split('.').str[0]

    model :utils.Model = ARGS.rules_selector

    # handle custom models
    if model is utils.Model.Custom:
        custom_rules = load_custom_rules()
        reaction_ids = list(custom_rules.keys())
        ras_scores = ras_for_cell_lines(dataset, custom_rules)
        save_as_tsv(ras_scores, reaction_ids)

        if ERRORS:
            missing_genes_msg = f"The following genes are mentioned in the rules but don't appear in the dataset: {ERRORS}"
            utils.logWarning(missing_genes_msg, ARGS.out_log)

        return

    # This is the standard flow of the ras_generator program, for non-custom models.
    name = "RAS Dataset"
    type_gene = gene_type(dataset.iloc[0, 0], name)

    rules_by_gene_type = model.getRules(ARGS.tool_dir)
    genes = data_gene(dataset, type_gene, name, None)
    ids, rules = load_id_rules(rules_by_gene_type.get(type_gene))

    resolve_rules, genes_not_found = resolve(genes, rules, ids, ARGS.none, name)
    create_ras(resolve_rules, name, rules, ids, ARGS.ras_output)

    if genes_not_found:
        not_found_msg = (
            f"Warning: gene(s) {genes_not_found} not found in class \"{name}\", " +
            "the expression level for this gene will be considered NaN")
        utils.logWarning(not_found_msg, ARGS.out_log)

    print("Execution succeded")
+ − 699
+ − 700 ###############################################################################
# Run the tool only when the file is executed as a script; main() is called without an
# argument list, so the arguments come from the command line.
if __name__ == "__main__":
    main()