Mercurial > repos > bimib > cobraxy
comparison COBRAxy/ras_generator.py @ 93:7e703e546998 draft
Uploaded
author | luca_milaz |
---|---|
date | Sun, 13 Oct 2024 11:41:34 +0000 |
parents | |
children | 3fca9b568faf |
comparison
equal
deleted
inserted
replaced
92:fdf713bb5772 | 93:7e703e546998 |
---|---|
1 from __future__ import division | |
2 # galaxy complains this ^^^ needs to be at the very beginning of the file, for some reason. | |
3 import sys | |
4 import argparse | |
5 import collections | |
6 import pandas as pd | |
7 import pickle as pk | |
8 import utils.general_utils as utils | |
9 import utils.rule_parsing as ruleUtils | |
10 from typing import Union, Optional, List, Dict, Tuple, TypeVar | |
11 | |
12 ERRORS = [] | |
13 ########################## argparse ########################################## | |
14 ARGS :argparse.Namespace | |
15 def process_args() -> argparse.Namespace: | |
16 """ | |
17 Processes command-line arguments. | |
18 | |
19 Args: | |
20 args (list): List of command-line arguments. | |
21 | |
22 Returns: | |
23 Namespace: An object containing parsed arguments. | |
24 """ | |
25 parser = argparse.ArgumentParser( | |
26 usage = '%(prog)s [options]', | |
27 description = "process some value's genes to create a comparison's map.") | |
28 | |
29 parser.add_argument( | |
30 '-rs', '--rules_selector', | |
31 type = utils.Model, default = utils.Model.HMRcore, choices = list(utils.Model), | |
32 help = 'chose which type of dataset you want use') | |
33 | |
34 parser.add_argument("-rl", "--rule_list", type = str, | |
35 help = "path to input file with custom rules, if provided") | |
36 | |
37 parser.add_argument("-rn", "--rules_name", type = str, help = "custom rules name") | |
38 # ^ I need this because galaxy converts my files into .dat but I need to know what extension they were in | |
39 | |
40 parser.add_argument( | |
41 '-n', '--none', | |
42 type = utils.Bool("none"), default = True, | |
43 help = 'compute Nan values') | |
44 | |
45 parser.add_argument( | |
46 '-td', '--tool_dir', | |
47 type = str, | |
48 required = True, help = 'your tool directory') | |
49 | |
50 parser.add_argument( | |
51 '-ol', '--out_log', | |
52 type = str, | |
53 help = "Output log") | |
54 | |
55 parser.add_argument( | |
56 '-in', '--input', #id รจ diventato in | |
57 type = str, | |
58 help = 'input dataset') | |
59 | |
60 parser.add_argument( | |
61 '-ra', '--ras_output', | |
62 type = str, | |
63 required = True, help = 'ras output') | |
64 | |
65 return parser.parse_args() | |
66 | |
67 ############################ dataset input #################################### | |
68 def read_dataset(data :str, name :str) -> pd.DataFrame: | |
69 """ | |
70 Read a dataset from a CSV file and return it as a pandas DataFrame. | |
71 | |
72 Args: | |
73 data (str): Path to the CSV file containing the dataset. | |
74 name (str): Name of the dataset, used in error messages. | |
75 | |
76 Returns: | |
77 pandas.DataFrame: DataFrame containing the dataset. | |
78 | |
79 Raises: | |
80 pd.errors.EmptyDataError: If the CSV file is empty. | |
81 sys.exit: If the CSV file has the wrong format, the execution is aborted. | |
82 """ | |
83 try: | |
84 dataset = pd.read_csv(data, sep = '\t', header = 0, engine='python') | |
85 except pd.errors.EmptyDataError: | |
86 sys.exit('Execution aborted: wrong format of ' + name + '\n') | |
87 if len(dataset.columns) < 2: | |
88 sys.exit('Execution aborted: wrong format of ' + name + '\n') | |
89 return dataset | |
90 | |
91 ############################ load id e rules ################################## | |
92 def load_id_rules(reactions :Dict[str, Dict[str, List[str]]]) -> Tuple[List[str], List[Dict[str, List[str]]]]: | |
93 """ | |
94 Load IDs and rules from a dictionary of reactions. | |
95 | |
96 Args: | |
97 reactions (dict): A dictionary where keys are IDs and values are rules. | |
98 | |
99 Returns: | |
100 tuple: A tuple containing two lists, the first list containing IDs and the second list containing rules. | |
101 """ | |
102 ids, rules = [], [] | |
103 for key, value in reactions.items(): | |
104 ids.append(key) | |
105 rules.append(value) | |
106 return (ids, rules) | |
107 | |
108 ############################ check_methods #################################### | |
109 def gene_type(l :str, name :str) -> str: | |
110 """ | |
111 Determine the type of gene ID. | |
112 | |
113 Args: | |
114 l (str): The gene identifier to check. | |
115 name (str): The name of the dataset, used in error messages. | |
116 | |
117 Returns: | |
118 str: The type of gene ID ('hugo_id', 'ensembl_gene_id', 'symbol', or 'entrez_id'). | |
119 | |
120 Raises: | |
121 sys.exit: If the gene ID type is not supported, the execution is aborted. | |
122 """ | |
123 if check_hgnc(l): | |
124 return 'hugo_id' | |
125 elif check_ensembl(l): | |
126 return 'ensembl_gene_id' | |
127 elif check_symbol(l): | |
128 return 'symbol' | |
129 elif check_entrez(l): | |
130 return 'entrez_id' | |
131 else: | |
132 sys.exit('Execution aborted:\n' + | |
133 'gene ID type in ' + name + ' not supported. Supported ID'+ | |
134 'types are: HUGO ID, Ensemble ID, HUGO symbol, Entrez ID\n') | |
135 | |
136 def check_hgnc(l :str) -> bool: | |
137 """ | |
138 Check if a gene identifier follows the HGNC format. | |
139 | |
140 Args: | |
141 l (str): The gene identifier to check. | |
142 | |
143 Returns: | |
144 bool: True if the gene identifier follows the HGNC format, False otherwise. | |
145 """ | |
146 if len(l) > 5: | |
147 if (l.upper()).startswith('HGNC:'): | |
148 return l[5:].isdigit() | |
149 else: | |
150 return False | |
151 else: | |
152 return False | |
153 | |
154 def check_ensembl(l :str) -> bool: | |
155 """ | |
156 Check if a gene identifier follows the Ensembl format. | |
157 | |
158 Args: | |
159 l (str): The gene identifier to check. | |
160 | |
161 Returns: | |
162 bool: True if the gene identifier follows the Ensembl format, False otherwise. | |
163 """ | |
164 return l.upper().startswith('ENS') | |
165 | |
166 | |
167 def check_symbol(l :str) -> bool: | |
168 """ | |
169 Check if a gene identifier follows the symbol format. | |
170 | |
171 Args: | |
172 l (str): The gene identifier to check. | |
173 | |
174 Returns: | |
175 bool: True if the gene identifier follows the symbol format, False otherwise. | |
176 """ | |
177 if len(l) > 0: | |
178 if l[0].isalpha() and l[1:].isalnum(): | |
179 return True | |
180 else: | |
181 return False | |
182 else: | |
183 return False | |
184 | |
185 def check_entrez(l :str) -> bool: | |
186 """ | |
187 Check if a gene identifier follows the Entrez ID format. | |
188 | |
189 Args: | |
190 l (str): The gene identifier to check. | |
191 | |
192 Returns: | |
193 bool: True if the gene identifier follows the Entrez ID format, False otherwise. | |
194 """ | |
195 if len(l) > 0: | |
196 return l.isdigit() | |
197 else: | |
198 return False | |
199 | |
200 ############################ gene ############################################# | |
201 def data_gene(gene: pd.DataFrame, type_gene: str, name: str, gene_custom: Optional[Dict[str, str]]) -> Dict[str, str]: | |
202 """ | |
203 Process gene data to ensure correct formatting and handle duplicates. | |
204 | |
205 Args: | |
206 gene (DataFrame): DataFrame containing gene data. | |
207 type_gene (str): Type of gene data (e.g., 'hugo_id', 'ensembl_gene_id', 'symbol', 'entrez_id'). | |
208 name (str): Name of the dataset. | |
209 gene_custom (dict or None): Custom gene data dictionary if provided. | |
210 | |
211 Returns: | |
212 dict: A dictionary containing gene data with gene IDs as keys and corresponding values. | |
213 """ | |
214 args = process_args() | |
215 for i in range(len(gene)): | |
216 tmp = gene.iloc[i, 0] | |
217 gene.iloc[i, 0] = tmp.strip().split('.')[0] | |
218 | |
219 gene_dup = [item for item, count in | |
220 collections.Counter(gene[gene.columns[0]]).items() if count > 1] | |
221 pat_dup = [item for item, count in | |
222 collections.Counter(list(gene.columns)).items() if count > 1] | |
223 | |
224 if gene_dup: | |
225 if gene_custom == None: | |
226 if args.rules_selector == 'HMRcore': | |
227 gene_in_rule = pk.load(open(args.tool_dir + '/local/pickle files/HMRcore_genes.p', 'rb')) | |
228 | |
229 elif args.rules_selector == 'Recon': | |
230 gene_in_rule = pk.load(open(args.tool_dir + '/local/pickle files/Recon_genes.p', 'rb')) | |
231 | |
232 elif args.rules_selector == 'ENGRO2': | |
233 gene_in_rule = pk.load(open(args.tool_dir + '/local/pickle files/ENGRO2_genes.p', 'rb')) | |
234 print(f"{args.tool_dir}'/local/pickle files/ENGRO2_genes.p'") | |
235 utils.logWarning(f"{args.tool_dir}'/local/pickle files/ENGRO2_genes.p'", ARGS.out_log) | |
236 print(args.rules_selector) | |
237 gene_in_rule = gene_in_rule.get(type_gene) | |
238 | |
239 else: | |
240 gene_in_rule = gene_custom | |
241 tmp = [] | |
242 for i in gene_dup: | |
243 if gene_in_rule.get(i) == 'ok': | |
244 tmp.append(i) | |
245 if tmp: | |
246 sys.exit('Execution aborted because gene ID ' | |
247 +str(tmp)+' in '+name+' is duplicated\n') | |
248 | |
249 if pat_dup: utils.logWarning(f"Warning: duplicated label\n{pat_dup} in {name}", ARGS.out_log) | |
250 return (gene.set_index(gene.columns[0])).to_dict() | |
251 | |
252 ############################ resolve ########################################## | |
253 def replace_gene_value(l :str, d :str) -> Tuple[Union[int, float], list]: | |
254 """ | |
255 Replace gene identifiers with corresponding values from a dictionary. | |
256 | |
257 Args: | |
258 l (str): String of gene identifier. | |
259 d (str): String corresponding to its value. | |
260 | |
261 Returns: | |
262 tuple: A tuple containing two lists: the first list contains replaced values, and the second list contains any errors encountered during replacement. | |
263 """ | |
264 tmp = [] | |
265 err = [] | |
266 while l: | |
267 if isinstance(l[0], list): | |
268 tmp_rules, tmp_err = replace_gene_value(l[0], d) | |
269 tmp.append(tmp_rules) | |
270 err.extend(tmp_err) | |
271 else: | |
272 value = replace_gene(l[0], d) | |
273 tmp.append(value) | |
274 if value == None: | |
275 err.append(l[0]) | |
276 l = l[1:] | |
277 return (tmp, err) | |
278 | |
279 def replace_gene(l :str, d :str) -> Union[int, float]: | |
280 """ | |
281 Replace a single gene identifier with its corresponding value from a dictionary. | |
282 | |
283 Args: | |
284 l (str): Gene identifier to replace. | |
285 d (str): String corresponding to its value. | |
286 | |
287 Returns: | |
288 float/int: Corresponding value from the dictionary if found, None otherwise. | |
289 | |
290 Raises: | |
291 sys.exit: If the value associated with the gene identifier is not valid. | |
292 """ | |
293 if l =='and' or l == 'or': | |
294 return l | |
295 else: | |
296 value = d.get(l, None) | |
297 if not(value == None or isinstance(value, (int, float))): | |
298 sys.exit('Execution aborted: ' + value + ' value not valid\n') | |
299 return value | |
300 | |
301 T = TypeVar("T", bound = Optional[Union[int, float]]) | |
302 def computes(val1 :T, op :str, val2 :T, cn :bool) -> T: | |
303 """ | |
304 Compute the RAS value between two value and an operator ('and' or 'or'). | |
305 | |
306 Args: | |
307 val1(Optional(Union[float, int])): First value. | |
308 op (str): Operator ('and' or 'or'). | |
309 val2(Optional(Union[float, int])): Second value. | |
310 cn (bool): Control boolean value. | |
311 | |
312 Returns: | |
313 Optional(Union[float, int]): Result of the computation. | |
314 """ | |
315 if val1 != None and val2 != None: | |
316 if op == 'and': | |
317 return min(val1, val2) | |
318 else: | |
319 return val1 + val2 | |
320 elif op == 'and': | |
321 if cn is True: | |
322 if val1 != None: | |
323 return val1 | |
324 elif val2 != None: | |
325 return val2 | |
326 else: | |
327 return None | |
328 else: | |
329 return None | |
330 else: | |
331 if val1 != None: | |
332 return val1 | |
333 elif val2 != None: | |
334 return val2 | |
335 else: | |
336 return None | |
337 | |
338 # ris should be Literal[None] but Literal is not supported in Python 3.7 | |
339 def control(ris, l :List[Union[int, float, list]], cn :bool) -> Union[bool, int, float]: #Union[Literal[False], int, float]: | |
340 """ | |
341 Control the format of the expression. | |
342 | |
343 Args: | |
344 ris: Intermediate result. | |
345 l (list): Expression to control. | |
346 cn (bool): Control boolean value. | |
347 | |
348 Returns: | |
349 Union[Literal[False], int, float]: Result of the control. | |
350 """ | |
351 if len(l) == 1: | |
352 if isinstance(l[0], (float, int)) or l[0] == None: | |
353 return l[0] | |
354 elif isinstance(l[0], list): | |
355 return control(None, l[0], cn) | |
356 else: | |
357 return False | |
358 elif len(l) > 2: | |
359 return control_list(ris, l, cn) | |
360 else: | |
361 return False | |
362 | |
363 def control_list(ris, l :List[Optional[Union[float, int, list]]], cn :bool) -> Optional[bool]: #Optional[Literal[False]]: | |
364 """ | |
365 Control the format of a list of expressions. | |
366 | |
367 Args: | |
368 ris: Intermediate result. | |
369 l (list): List of expressions to control. | |
370 cn (bool): Control boolean value. | |
371 | |
372 Returns: | |
373 Optional[Literal[False]]: Result of the control. | |
374 """ | |
375 while l: | |
376 if len(l) == 1: | |
377 return False | |
378 elif (isinstance(l[0], (float, int)) or | |
379 l[0] == None) and l[1] in ['and', 'or']: | |
380 if isinstance(l[2], (float, int)) or l[2] == None: | |
381 ris = computes(l[0], l[1], l[2], cn) | |
382 elif isinstance(l[2], list): | |
383 tmp = control(None, l[2], cn) | |
384 if tmp is False: | |
385 return False | |
386 else: | |
387 ris = computes(l[0], l[1], tmp, cn) | |
388 else: | |
389 return False | |
390 l = l[3:] | |
391 elif l[0] in ['and', 'or']: | |
392 if isinstance(l[1], (float, int)) or l[1] == None: | |
393 ris = computes(ris, l[0], l[1], cn) | |
394 elif isinstance(l[1], list): | |
395 tmp = control(None,l[1], cn) | |
396 if tmp is False: | |
397 return False | |
398 else: | |
399 ris = computes(ris, l[0], tmp, cn) | |
400 else: | |
401 return False | |
402 l = l[2:] | |
403 elif isinstance(l[0], list) and l[1] in ['and', 'or']: | |
404 if isinstance(l[2], (float, int)) or l[2] == None: | |
405 tmp = control(None, l[0], cn) | |
406 if tmp is False: | |
407 return False | |
408 else: | |
409 ris = computes(tmp, l[1], l[2], cn) | |
410 elif isinstance(l[2], list): | |
411 tmp = control(None, l[0], cn) | |
412 tmp2 = control(None, l[2], cn) | |
413 if tmp is False or tmp2 is False: | |
414 return False | |
415 else: | |
416 ris = computes(tmp, l[1], tmp2, cn) | |
417 else: | |
418 return False | |
419 l = l[3:] | |
420 else: | |
421 return False | |
422 return ris | |
423 | |
424 ResolvedRules = Dict[str, List[Optional[Union[float, int]]]] | |
425 def resolve(genes: Dict[str, str], rules: List[str], ids: List[str], resolve_none: bool, name: str) -> Tuple[Optional[ResolvedRules], Optional[list]]: | |
426 """ | |
427 Resolve rules using gene data to compute scores for each rule. | |
428 | |
429 Args: | |
430 genes (dict): Dictionary containing gene data with gene IDs as keys and corresponding values. | |
431 rules (list): List of rules to resolve. | |
432 ids (list): List of IDs corresponding to the rules. | |
433 resolve_none (bool): Flag indicating whether to resolve None values in the rules. | |
434 name (str): Name of the dataset. | |
435 | |
436 Returns: | |
437 tuple: A tuple containing resolved rules as a dictionary and a list of gene IDs not found in the data. | |
438 """ | |
439 resolve_rules = {} | |
440 not_found = [] | |
441 flag = False | |
442 for key, value in genes.items(): | |
443 tmp_resolve = [] | |
444 for i in range(len(rules)): | |
445 tmp = rules[i] | |
446 if tmp: | |
447 tmp, err = replace_gene_value(tmp, value) | |
448 if err: | |
449 not_found.extend(err) | |
450 ris = control(None, tmp, resolve_none) | |
451 if ris is False or ris == None: | |
452 tmp_resolve.append(None) | |
453 else: | |
454 tmp_resolve.append(ris) | |
455 flag = True | |
456 else: | |
457 tmp_resolve.append(None) | |
458 resolve_rules[key] = tmp_resolve | |
459 | |
460 if flag is False: | |
461 utils.logWarning( | |
462 f"Warning: no computable score (due to missing gene values) for class {name}, the class has been disregarded", | |
463 ARGS.out_log) | |
464 | |
465 return (None, None) | |
466 | |
467 return (resolve_rules, list(set(not_found))) | |
468 ############################ create_ras ####################################### | |
469 def create_ras(resolve_rules: Optional[ResolvedRules], dataset_name: str, rules: List[str], ids: List[str], file: str) -> None: | |
470 """ | |
471 Create a RAS (Reaction Activity Score) file from resolved rules. | |
472 | |
473 Args: | |
474 resolve_rules (dict): Dictionary containing resolved rules. | |
475 dataset_name (str): Name of the dataset. | |
476 rules (list): List of rules. | |
477 file (str): Path to the output RAS file. | |
478 | |
479 Returns: | |
480 None | |
481 """ | |
482 if resolve_rules is None: | |
483 utils.logWarning(f"Couldn't generate RAS for current dataset: {dataset_name}", ARGS.out_log) | |
484 | |
485 for geni in resolve_rules.values(): | |
486 for i, valori in enumerate(geni): | |
487 if valori == None: | |
488 geni[i] = 'None' | |
489 | |
490 output_ras = pd.DataFrame.from_dict(resolve_rules) | |
491 | |
492 output_ras.insert(0, 'Reactions', ids) | |
493 output_to_csv = pd.DataFrame.to_csv(output_ras, sep = '\t', index = False) | |
494 | |
495 text_file = open(file, "w") | |
496 | |
497 text_file.write(output_to_csv) | |
498 text_file.close() | |
499 | |
500 ################################- NEW RAS COMPUTATION -################################ | |
501 Expr = Optional[Union[int, float]] | |
502 Ras = Expr | |
503 def ras_for_cell_lines(dataset: pd.DataFrame, rules: Dict[str, ruleUtils.OpList]) -> Dict[str, Dict[str, Ras]]: | |
504 """ | |
505 Generates the RAS scores for each cell line found in the dataset. | |
506 | |
507 Args: | |
508 dataset (pd.DataFrame): Dataset containing gene values. | |
509 rules (dict): The dict containing reaction ids as keys and rules as values. | |
510 | |
511 Side effects: | |
512 dataset : mut | |
513 | |
514 Returns: | |
515 dict: A dictionary where each key corresponds to a cell line name and each value is a dictionary | |
516 where each key corresponds to a reaction ID and each value is its computed RAS score. | |
517 """ | |
518 ras_values_by_cell_line = {} | |
519 dataset.set_index(dataset.columns[0], inplace=True) | |
520 # Considera tutte le colonne tranne la prima in cui ci sono gli hugo quindi va scartata | |
521 for cell_line_name in dataset.columns[1:]: | |
522 cell_line = dataset[cell_line_name].to_dict() | |
523 ras_values_by_cell_line[cell_line_name]= get_ras_values(rules, cell_line) | |
524 return ras_values_by_cell_line | |
525 | |
526 def get_ras_values(value_rules: Dict[str, ruleUtils.OpList], dataset: Dict[str, Expr]) -> Dict[str, Ras]: | |
527 """ | |
528 Computes the RAS (Reaction Activity Score) values for each rule in the given dict. | |
529 | |
530 Args: | |
531 value_rules (dict): A dictionary where keys are reaction ids and values are OpLists. | |
532 dataset : gene expression data of one cell line. | |
533 | |
534 Returns: | |
535 dict: A dictionary where keys are reaction ids and values are the computed RAS values for each rule. | |
536 """ | |
537 return {key: ras_op_list(op_list, dataset) for key, op_list in value_rules.items()} | |
538 | |
539 def get_gene_expr(dataset :Dict[str, Expr], name :str) -> Expr: | |
540 """ | |
541 Extracts the gene expression of the given gene from a cell line dataset. | |
542 | |
543 Args: | |
544 dataset : gene expression data of one cell line. | |
545 name : gene name. | |
546 | |
547 Returns: | |
548 Expr : the gene's expression value. | |
549 """ | |
550 expr = dataset.get(name, None) | |
551 if expr is None: ERRORS.append(name) | |
552 | |
553 return expr | |
554 | |
555 def ras_op_list(op_list: ruleUtils.OpList, dataset: Dict[str, Expr]) -> Ras: | |
556 """ | |
557 Computes recursively the RAS (Reaction Activity Score) value for the given OpList, considering the specified flag to control None behavior. | |
558 | |
559 Args: | |
560 op_list (OpList): The OpList representing a rule with gene values. | |
561 dataset : gene expression data of one cell line. | |
562 | |
563 Returns: | |
564 Ras: The computed RAS value for the given OpList. | |
565 """ | |
566 op = op_list.op | |
567 ras_value :Ras = None | |
568 if not op: return get_gene_expr(dataset, op_list[0]) | |
569 if op is ruleUtils.RuleOp.AND and not ARGS.none and None in op_list: return None | |
570 | |
571 for i in range(len(op_list)): | |
572 item = op_list[i] | |
573 if isinstance(item, ruleUtils.OpList): | |
574 item = ras_op_list(item, dataset) | |
575 | |
576 else: | |
577 item = get_gene_expr(dataset, item) | |
578 | |
579 if item is None: | |
580 if op is ruleUtils.RuleOp.AND and not ARGS.none: return None | |
581 continue | |
582 | |
583 if ras_value is None: | |
584 ras_value = item | |
585 else: | |
586 ras_value = ras_value + item if op is ruleUtils.RuleOp.OR else min(ras_value, item) | |
587 | |
588 return ras_value | |
589 | |
590 def save_as_tsv(rasScores: Dict[str, Dict[str, Ras]], reactions :List[str]) -> None: | |
591 """ | |
592 Save computed ras scores to the given path, as a tsv file. | |
593 | |
594 Args: | |
595 rasScores : the computed ras scores. | |
596 path : the output tsv file's path. | |
597 | |
598 Returns: | |
599 None | |
600 """ | |
601 for scores in rasScores.values(): # this is actually a lot faster than using the ootb dataframe metod, sadly | |
602 for reactId, score in scores.items(): | |
603 if score is None: scores[reactId] = "None" | |
604 | |
605 output_ras = pd.DataFrame.from_dict(rasScores) | |
606 output_ras.insert(0, 'Reactions', reactions) | |
607 output_ras.to_csv(ARGS.ras_output, sep = '\t', index = False) | |
608 | |
609 ############################ MAIN ############################################# | |
610 #TODO: not used but keep, it will be when the new translator dicts will be used. | |
611 def translateGene(geneName :str, encoding :str, geneTranslator :Dict[str, Dict[str, str]]) -> str: | |
612 """ | |
613 Translate gene from any supported encoding to HugoID. | |
614 | |
615 Args: | |
616 geneName (str): the name of the gene in its current encoding. | |
617 encoding (str): the encoding. | |
618 geneTranslator (Dict[str, Dict[str, str]]): the dict containing all supported gene names | |
619 and encodings in the current model, mapping each to the corresponding HugoID encoding. | |
620 | |
621 Raises: | |
622 ValueError: When the gene isn't supported in the model. | |
623 | |
624 Returns: | |
625 str: the gene in HugoID encoding. | |
626 """ | |
627 supportedGenesInEncoding = geneTranslator[encoding] | |
628 if geneName in supportedGenesInEncoding: return supportedGenesInEncoding[geneName] | |
629 raise ValueError(f"Gene \"{geneName}\" non trovato, verifica di star utilizzando il modello corretto!") | |
630 | |
631 def load_custom_rules() -> Dict[str, ruleUtils.OpList]: | |
632 """ | |
633 Opens custom rules file and extracts the rules. If the file is in .csv format an additional parsing step will be | |
634 performed, significantly impacting the runtime. | |
635 | |
636 Returns: | |
637 Dict[str, ruleUtils.OpList] : dict mapping reaction IDs to rules. | |
638 """ | |
639 datFilePath = utils.FilePath.fromStrPath(ARGS.rule_list) # actual file, stored in galaxy as a .dat | |
640 | |
641 try: filenamePath = utils.FilePath.fromStrPath(ARGS.rules_name) # file's name in input, to determine its original ext | |
642 except utils.PathErr as err: | |
643 raise utils.PathErr(filenamePath, f"Please make sure your file's name is a valid file path, {err.msg}") | |
644 | |
645 if filenamePath.ext is utils.FileFormat.PICKLE: return utils.readPickle(datFilePath) | |
646 | |
647 # csv rules need to be parsed, those in a pickle format are taken to be pre-parsed. | |
648 return { line[0] : ruleUtils.parseRuleToNestedList(line[1]) for line in utils.readCsv(datFilePath) } | |
649 | |
650 def main() -> None: | |
651 """ | |
652 Initializes everything and sets the program in motion based on the fronted input arguments. | |
653 | |
654 Returns: | |
655 None | |
656 """ | |
657 # get args from frontend (related xml) | |
658 global ARGS | |
659 ARGS = process_args() | |
660 print(ARGS.rules_selector) | |
661 # read dataset | |
662 dataset = read_dataset(ARGS.input, "dataset") | |
663 dataset.iloc[:, 0] = (dataset.iloc[:, 0]).astype(str) | |
664 | |
665 # remove versioning from gene names | |
666 dataset.iloc[:, 0] = dataset.iloc[:, 0].str.split('.').str[0] | |
667 | |
668 # handle custom models | |
669 model :utils.Model = ARGS.rules_selector | |
670 if model is utils.Model.Custom: | |
671 rules = load_custom_rules() | |
672 reactions = list(rules.keys()) | |
673 | |
674 save_as_tsv(ras_for_cell_lines(dataset, rules), reactions) | |
675 if ERRORS: utils.logWarning( | |
676 f"The following genes are mentioned in the rules but don't appear in the dataset: {ERRORS}", | |
677 ARGS.out_log) | |
678 | |
679 return | |
680 | |
681 # This is the standard flow of the ras_generator program, for non-custom models. | |
682 name = "RAS Dataset" | |
683 type_gene = gene_type(dataset.iloc[0, 0], name) | |
684 | |
685 rules = model.getRules(ARGS.tool_dir) | |
686 genes = data_gene(dataset, type_gene, name, None) | |
687 ids, rules = load_id_rules(rules.get(type_gene)) | |
688 | |
689 resolve_rules, err = resolve(genes, rules, ids, ARGS.none, name) | |
690 create_ras(resolve_rules, name, rules, ids, ARGS.ras_output) | |
691 | |
692 if err: utils.logWarning( | |
693 f"Warning: gene(s) {err} not found in class \"{name}\", " + | |
694 "the expression level for this gene will be considered NaN", | |
695 ARGS.out_log) | |
696 | |
697 print("Execution succeded") | |
698 | |
699 ############################################################################### | |
700 if __name__ == "__main__": | |
701 main() |