| 
489
 | 
     1 """
 | 
| 
 | 
     2 Generate Reaction Activity Scores (RAS) from a gene expression dataset and GPR rules.
 | 
| 
 | 
     3 
 | 
| 
 | 
     4 The script reads a tabular dataset (genes x samples) and a rules file (GPRs),
 | 
| 
 | 
     5 computes RAS per reaction for each sample/cell line, and writes a tabular output.
 | 
| 
 | 
     6 """
 | 
| 
93
 | 
     7 from __future__ import division
 | 
| 
 | 
     8 import sys
 | 
| 
 | 
     9 import argparse
 | 
| 
 | 
    10 import pandas as pd
 | 
| 
505
 | 
    11 import numpy as np
 | 
| 
93
 | 
    12 import utils.general_utils as utils
 | 
| 
505
 | 
    13 from typing import  List, Dict
 | 
| 
 | 
    14 import ast
 | 
| 
93
 | 
    15 
 | 
| 
 | 
    16 ERRORS = []
 | 
| 
 | 
    17 ########################## argparse ##########################################
 | 
| 
 | 
    18 ARGS :argparse.Namespace
 | 
| 
147
 | 
    19 def process_args(args:List[str] = None) -> argparse.Namespace:
 | 
| 
93
 | 
    20     """
 | 
| 
 | 
    21     Processes command-line arguments.
 | 
| 
 | 
    22 
 | 
| 
 | 
    23     Args:
 | 
| 
 | 
    24         args (list): List of command-line arguments.
 | 
| 
 | 
    25 
 | 
| 
 | 
    26     Returns:
 | 
| 
 | 
    27         Namespace: An object containing parsed arguments.
 | 
| 
 | 
    28     """
 | 
| 
 | 
    29     parser = argparse.ArgumentParser(
 | 
| 
 | 
    30         usage = '%(prog)s [options]',
 | 
| 
 | 
    31         description = "process some value's genes to create a comparison's map.")
 | 
| 
 | 
    32     
 | 
| 
489
 | 
    33     parser.add_argument("-rl", "--model_upload", type = str,
 | 
| 
 | 
    34         help = "path to input file containing the rules")
 | 
| 
93
 | 
    35 
 | 
| 
489
 | 
    36     parser.add_argument("-rn", "--model_upload_name", type = str, help = "custom rules name")
 | 
| 
 | 
    37     # Galaxy converts files into .dat, this helps infer the original extension when needed.
 | 
| 
93
 | 
    38     
 | 
| 
 | 
    39     parser.add_argument(
 | 
| 
 | 
    40         '-n', '--none',
 | 
| 
 | 
    41         type = utils.Bool("none"), default = True,
 | 
| 
 | 
    42         help = 'compute Nan values')
 | 
| 
 | 
    43     
 | 
| 
 | 
    44     parser.add_argument(
 | 
| 
 | 
    45         '-td', '--tool_dir',
 | 
| 
 | 
    46         type = str,
 | 
| 
 | 
    47         required = True, help = 'your tool directory')
 | 
| 
 | 
    48     
 | 
| 
 | 
    49     parser.add_argument(
 | 
| 
 | 
    50         '-ol', '--out_log',
 | 
| 
 | 
    51         type = str,
 | 
| 
 | 
    52         help = "Output log")    
 | 
| 
 | 
    53     
 | 
| 
 | 
    54     parser.add_argument(
 | 
| 
489
 | 
    55         '-in', '--input',
 | 
| 
93
 | 
    56         type = str,
 | 
| 
 | 
    57         help = 'input dataset')
 | 
| 
 | 
    58     
 | 
| 
 | 
    59     parser.add_argument(
 | 
| 
 | 
    60         '-ra', '--ras_output',
 | 
| 
 | 
    61         type = str,
 | 
| 
 | 
    62         required = True, help = 'ras output')
 | 
| 
147
 | 
    63 
 | 
| 
93
 | 
    64     
 | 
| 
147
 | 
    65     return parser.parse_args(args)
 | 
| 
93
 | 
    66 
 | 
| 
 | 
    67 ############################ dataset input ####################################
 | 
| 
 | 
    68 def read_dataset(data :str, name :str) -> pd.DataFrame:
 | 
| 
 | 
    69     """
 | 
| 
 | 
    70     Read a dataset from a CSV file and return it as a pandas DataFrame.
 | 
| 
 | 
    71 
 | 
| 
 | 
    72     Args:
 | 
| 
 | 
    73         data (str): Path to the CSV file containing the dataset.
 | 
| 
 | 
    74         name (str): Name of the dataset, used in error messages.
 | 
| 
 | 
    75 
 | 
| 
 | 
    76     Returns:
 | 
| 
 | 
    77         pandas.DataFrame: DataFrame containing the dataset.
 | 
| 
 | 
    78 
 | 
| 
 | 
    79     Raises:
 | 
| 
 | 
    80         pd.errors.EmptyDataError: If the CSV file is empty.
 | 
| 
 | 
    81         sys.exit: If the CSV file has the wrong format, the execution is aborted.
 | 
| 
 | 
    82     """
 | 
| 
 | 
    83     try:
 | 
| 
505
 | 
    84         dataset = pd.read_csv(data, sep = '\t', header = 0, engine='python', index_col=0)
 | 
| 
 | 
    85         dataset = dataset.astype(float)
 | 
| 
93
 | 
    86     except pd.errors.EmptyDataError:
 | 
| 
505
 | 
    87         sys.exit('Execution aborted: wrong file format of ' + name + '\n')
 | 
| 
93
 | 
    88     if len(dataset.columns) < 2:
 | 
| 
505
 | 
    89         sys.exit('Execution aborted: wrong file format of ' + name + '\n')
 | 
| 
93
 | 
    90     return dataset
 | 
| 
 | 
    91 
 | 
| 
 | 
    92 
 | 
| 
505
 | 
    93 def load_custom_rules() -> Dict[str,str]:
 | 
| 
93
 | 
    94     """
 | 
| 
 | 
    95     Opens custom rules file and extracts the rules. If the file is in .csv format an additional parsing step will be
 | 
| 
 | 
    96     performed, significantly impacting the runtime.
 | 
| 
 | 
    97 
 | 
| 
 | 
    98     Returns:
 | 
| 
 | 
    99         Dict[str, ruleUtils.OpList] : dict mapping reaction IDs to rules.
 | 
| 
 | 
   100     """
 | 
| 
489
 | 
   101     datFilePath = utils.FilePath.fromStrPath(ARGS.model_upload)  # actual file, stored in Galaxy as a .dat
 | 
| 
 | 
   102 
 | 
| 
 | 
   103     dict_rule = {}
 | 
| 
 | 
   104 
 | 
| 
 | 
   105     try:
 | 
| 
 | 
   106         rows = utils.readCsv(datFilePath, delimiter = "\t", skipHeader=False)
 | 
| 
 | 
   107         if len(rows) <= 1:
 | 
| 
 | 
   108             raise ValueError("Model tabular with 1 column is not supported.")
 | 
| 
381
 | 
   109 
 | 
| 
489
 | 
   110         if not rows:
 | 
| 
 | 
   111             raise ValueError("Model tabular is file is empty.")
 | 
| 
 | 
   112         
 | 
| 
 | 
   113         id_idx, idx_gpr = utils.findIdxByName(rows[0], "GPR")
 | 
| 
 | 
   114         
 | 
| 
 | 
   115     # First, try using a tab delimiter
 | 
| 
 | 
   116         for line in rows[1:]:
 | 
| 
 | 
   117             if len(line) <= idx_gpr:
 | 
| 
 | 
   118                 utils.logWarning(f"Skipping malformed line: {line}", ARGS.out_log)
 | 
| 
 | 
   119                 continue
 | 
| 
 | 
   120             
 | 
| 
505
 | 
   121             dict_rule[line[id_idx]] = line[idx_gpr] 
 | 
| 
 | 
   122 
 | 
| 
489
 | 
   123     except Exception as e:
 | 
| 
 | 
   124         # If parsing with tabs fails, try comma delimiter
 | 
| 
 | 
   125         try:
 | 
| 
 | 
   126             rows = utils.readCsv(datFilePath, delimiter = ",", skipHeader=False)
 | 
| 
 | 
   127             
 | 
| 
 | 
   128             if len(rows) <= 1:
 | 
| 
 | 
   129                 raise ValueError("Model tabular with 1 column is not supported.")
 | 
| 
 | 
   130 
 | 
| 
 | 
   131             if not rows:
 | 
| 
 | 
   132                 raise ValueError("Model tabular is file is empty.")
 | 
| 
 | 
   133             
 | 
| 
 | 
   134             id_idx, idx_gpr = utils.findIdxByName(rows[0], "GPR")
 | 
| 
 | 
   135             
 | 
| 
 | 
   136             # Try again parsing row content with the GPR column using comma-separated values
 | 
| 
 | 
   137             for line in rows[1:]:
 | 
| 
 | 
   138                 if len(line) <= idx_gpr:
 | 
| 
 | 
   139                     utils.logWarning(f"Skipping malformed line: {line}", ARGS.out_log)
 | 
| 
 | 
   140                     continue
 | 
| 
 | 
   141                 
 | 
| 
505
 | 
   142                 dict_rule[line[id_idx]] =line[idx_gpr]
 | 
| 
489
 | 
   143                     
 | 
| 
 | 
   144         except Exception as e2:
 | 
| 
 | 
   145             raise ValueError(f"Unable to parse rules file. Tried both tab and comma delimiters. Original errors: Tab: {e}, Comma: {e2}")
 | 
| 
 | 
   146 
 | 
| 
 | 
   147     if not dict_rule:
 | 
| 
 | 
   148             raise ValueError("No valid rules found in the uploaded file. Please check the file format.")
 | 
| 
93
 | 
   149     # csv rules need to be parsed, those in a pickle format are taken to be pre-parsed.
 | 
| 
489
 | 
   150     return dict_rule
 | 
| 
 | 
   151 
 | 
| 
401
 | 
   152 
 | 
| 
505
 | 
   153 
 | 
| 
 | 
def computeRAS(
            dataset,gene_rules,rules_total_string,
            or_function=np.sum,    # type of operation to do in case of an or expression (max, sum, mean)
            and_function=np.min,   # type of operation to do in case of an and expression(min, sum)
            ignore_nan = True
            ):
    """
    Compute Reaction Activity Scores (RAS) for every reaction and sample.

    Args:
        dataset (pd.DataFrame): expression matrix, genes (index) x samples (columns).
        gene_rules (dict): mapping reaction ID -> GPR rule string.
        rules_total_string (list): every token appearing in any GPR; used to
            restrict the dataset to model-relevant genes.
        or_function: aggregation applied to 'or' operands (e.g. np.sum).
        and_function: aggregation applied to 'and' operands (e.g. np.min).
        ignore_nan (bool): if True, genes missing from the dataset are dropped
            from a rule; if False, any missing gene makes the whole rule NaN.

    Returns:
        tuple: (ras_df, genes_not_mapped) where ras_df is a DataFrame of RAS
        values (reactions x samples) and genes_not_mapped is a sorted list of
        GPR genes absent from the dataset.

    Raises:
        ValueError: if no gene of the model appears in the dataset.
    """

    logic_operators = ['and', 'or', '(', ')']
    reactions=list(gene_rules.keys())

    # Build the dictionary for the GPRs: normalize operator case and pad
    # parentheses with spaces so that str.split() tokenizes the rule cleanly.
    df_reactions = pd.DataFrame(index=reactions)
    gene_rules=[gene_rules[reaction_id].replace("OR","or").replace("AND","and").replace("(","( ").replace(")"," )") for reaction_id in reactions]
    df_reactions['rule'] = gene_rules
    df_reactions = df_reactions.reset_index()
    # Group identical rules together so each distinct rule is evaluated once
    # and its result is shared by all reactions carrying that rule.
    df_reactions = df_reactions.groupby('rule').agg(lambda x: sorted(list(x)))

    # dict: rule string -> sorted list of reaction IDs using that rule
    dict_rule_reactions = df_reactions.to_dict()['index']

    # build useful structures for RAS computation
    genes =dataset.index.intersection(rules_total_string)
    
    #check if there is one gene at least 
    if len(genes)==0:
        raise ValueError("ERROR: No genes from the count matrix match the metabolic model. Check that gene annotations are consistent between model and dataset.") 
    
    cell_ids = list(dataset.columns)
    count_df_filtered = dataset.loc[genes]
    # Gene IDs containing '-' or ':' are renamed with '_' so they match the
    # sanitized rule tokens produced below (ast cannot parse '-'/':' in names).
    count_df_filtered = count_df_filtered.rename(index=lambda x: x.replace("-", "_").replace(":", "_"))

    # one row per distinct rule, one column per sample; NaN until computed
    ras_df=np.full((len(dict_rule_reactions), len(cell_ids)), np.nan)

    # for loop on rules
    genes_not_mapped=[]
    ind = 0
    for rule, reaction_ids in dict_rule_reactions.items():
        if len(rule) != 0:
            # there is one gene at least in the formula
            # warning_rule is the special character to normalize to '_'.
            # NOTE(review): if a rule contains BOTH '-' and ':', only ':' is
            # normalized here — confirm rules never mix the two characters.
            warning_rule="_"
            if "-" in rule:
                warning_rule="-"
            if ":" in rule:
                warning_rule=":"
            rule_orig=rule.split().copy()  #original rule in list
            rule = rule.replace(warning_rule,"_")
             #modified rule
            rule_split = rule.split()
            rule_split_elements = list(filter(lambda x: x not in logic_operators, rule_split))  # remove of all logical operators
            rule_split_elements = list(set(rule_split_elements))                                # genes in formula
            
            # which genes are in the count matrix?                
            genes_in_count_matrix = [el for el in rule_split_elements if el in genes]
            # Collect missing genes, reporting them under their ORIGINAL
            # spelling (with '-'/':' restored) when possible.
            genes_notin_count_matrix = []
            for el in rule_split_elements:
                if el not in genes: #not present in original dataset
                    if el.replace("_",warning_rule) in rule_orig: 
                        genes_notin_count_matrix.append(el.replace("_",warning_rule))
                    else:
                        genes_notin_count_matrix.append(el)
            genes_not_mapped.extend(genes_notin_count_matrix)
            
            # add genes not present in the data
            if len(genes_in_count_matrix) > 0: #there is at least one gene in the count matrix                 
                    if len(rule_split) == 1:
                        #one gene --> one reaction
                        ras_df[ind] = count_df_filtered.loc[genes_in_count_matrix]
                    else:    
                        if len(genes_notin_count_matrix) > 0 and ignore_nan == False:
                                # strict mode: any missing gene poisons the whole rule
                                ras_df[ind] = np.nan
                        else:                   
                            # more genes in the formula
                            check_only_and=("and" in rule and "or" not in rule) #only and
                            check_only_or=("or" in rule and "and" not in rule)  #only or
                            if check_only_and or check_only_or:
                                #or/and sequence: a single vectorized aggregation
                                #over the gene rows suffices, no tree walk needed
                                matrix = count_df_filtered.loc[genes_in_count_matrix].values
                                #compute for all cells
                                if check_only_and: 
                                    ras_df[ind] = and_function(matrix, axis=0)
                                else:
                                    ras_df[ind] = or_function(matrix, axis=0)
                            else:
                                # complex expression (e.g. A or (B and C))
                                data = count_df_filtered.loc[genes_in_count_matrix]  # dataframe of genes in the GPRs
                                tree = ast.parse(rule, mode="eval").body
                                # one {gene: value} dict per sample, evaluated cell by cell
                                values_by_cell = [dict(zip(data.index, data[col].values)) for col in data.columns]
                                for j, values in enumerate(values_by_cell):
                                    ras_df[ind, j] = _evaluate_ast(tree, values, or_function, and_function)
    
        # ind advances for EVERY rule (also empty ones) to stay aligned with
        # the row order of dict_rule_reactions used below.
        ind +=1
    
    #create the dataframe of ras (rules x samples)
    ras_df= pd.DataFrame(data=ras_df,index=range(len(dict_rule_reactions)), columns=cell_ids)
    ras_df['Reactions'] = [reaction_ids for rule,reaction_ids in dict_rule_reactions.items()]
    
    #create the reaction dataframe for ras (reactions x samples)
    ras_df = ras_df.explode("Reactions").set_index("Reactions")

    #total genes not mapped from the gpr
    genes_not_mapped = sorted(set(genes_not_mapped))

    return ras_df,genes_not_mapped
 | 
| 
 | 
   257 
 | 
| 
 | 
   258 # function to evalute a complex boolean expression e.g. A or (B and C)
 | 
| 
505
 | 
   259 # function to evalute a complex boolean expression e.g. A or (B and C)
 | 
| 
 | 
   260 def _evaluate_ast( node, values,or_function,and_function):
 | 
| 
 | 
   261     if isinstance(node, ast.BoolOp):
 | 
| 
510
 | 
   262         
 | 
| 
505
 | 
   263         vals = [_evaluate_ast(v, values,or_function,and_function) for v in node.values]
 | 
| 
 | 
   264        
 | 
| 
 | 
   265         vals = [v for v in vals if v is not None]
 | 
| 
 | 
   266         if not vals:
 | 
| 
510
 | 
   267             return np.nan
 | 
| 
505
 | 
   268       
 | 
| 
 | 
   269         vals = [np.array(v) if isinstance(v, (list, np.ndarray)) else v for v in vals]
 | 
| 
 | 
   270 
 | 
| 
 | 
   271         if isinstance(node.op, ast.Or):
 | 
| 
 | 
   272             return or_function(vals)
 | 
| 
 | 
   273         elif isinstance(node.op, ast.And):
 | 
| 
 | 
   274             return and_function(vals)
 | 
| 
 | 
   275 
 | 
| 
 | 
   276     elif isinstance(node, ast.Name):
 | 
| 
 | 
   277         return values.get(node.id, None)
 | 
| 
 | 
   278     elif isinstance(node, ast.Constant):
 | 
| 
510
 | 
   279         key = str(node.value)     #convert in str       
 | 
| 
 | 
   280         return values.get(key, None)   
 | 
| 
505
 | 
   281     else:
 | 
| 
 | 
   282         raise ValueError(f"Error in boolean expression: {ast.dump(node)}")
 | 
| 
 | 
   283 
 | 
| 
147
 | 
   284 def main(args:List[str] = None) -> None:
 | 
| 
93
 | 
   285     """
 | 
| 
 | 
   286     Initializes everything and sets the program in motion based on the fronted input arguments.
 | 
| 
 | 
   287     
 | 
| 
 | 
   288     Returns:
 | 
| 
 | 
   289         None
 | 
| 
 | 
   290     """
 | 
| 
 | 
   291     # get args from frontend (related xml)
 | 
| 
 | 
   292     global ARGS
 | 
| 
147
 | 
   293     ARGS = process_args(args)
 | 
| 
309
 | 
   294 
 | 
| 
505
 | 
   295     # read dataset and remove versioning from gene names
 | 
| 
93
 | 
   296     dataset = read_dataset(ARGS.input, "dataset")
 | 
| 
510
 | 
   297     orig_gene_list=dataset.index.copy()
 | 
| 
 | 
   298     dataset.index =  [str(el.split(".")[0]) for el in dataset.index]  
 | 
| 
 | 
   299 
 | 
| 
505
 | 
   300     #load GPR rules
 | 
| 
489
 | 
   301     rules = load_custom_rules()
 | 
| 
505
 | 
   302     
 | 
| 
 | 
   303     #create a list of all the gpr
 | 
| 
 | 
   304     rules_total_string=""
 | 
| 
 | 
   305     for id,rule in rules.items():
 | 
| 
 | 
   306         rules_total_string+=rule.replace("(","").replace(")","") + " "
 | 
| 
 | 
   307     rules_total_string=list(set(rules_total_string.split(" ")))
 | 
| 
93
 | 
   308 
 | 
| 
512
 | 
   309     if any(dataset.index.duplicated(keep=False)):
 | 
| 
 | 
   310         genes_duplicates=orig_gene_list[dataset.index.duplicated(keep=False)]
 | 
| 
 | 
   311         genes_duplicates_in_model=[elem for elem in genes_duplicates if elem in rules_total_string]
 | 
| 
 | 
   312         if len(genes_duplicates_in_model)>0:#metabolic genes have duplicated entries in the dataset
 | 
| 
 | 
   313             list_str=", ".join(genes_duplicates_in_model)
 | 
| 
 | 
   314             raise ValueError(f"ERROR: Duplicate entries in the gene dataset present in one or more GPR. The following metabolic genes are duplicated: "+list_str)       
 | 
| 
 | 
   315 
 | 
| 
505
 | 
   316     #check if nan value must be ignored in the GPR 
 | 
| 
 | 
   317     if ARGS.none:
 | 
| 
 | 
   318     #    #e.g. (A or nan --> A)
 | 
| 
 | 
   319         ignore_nan = True
 | 
| 
 | 
   320     else:
 | 
| 
 | 
   321         #e.g. (A or nan --> nan)
 | 
| 
 | 
   322         ignore_nan = False
 | 
| 
 | 
   323     
 | 
| 
 | 
   324     #compure ras
 | 
| 
510
 | 
   325     ras_df,genes_not_mapped=computeRAS(dataset,rules,
 | 
| 
505
 | 
   326                     rules_total_string,
 | 
| 
 | 
   327                     or_function=np.sum,    # type of operation to do in case of an or expression (max, sum, mean)
 | 
| 
 | 
   328                     and_function=np.min,
 | 
| 
 | 
   329                     ignore_nan=ignore_nan)
 | 
| 
 | 
   330     
 | 
| 
 | 
   331     #save to csv and replace nan with None
 | 
| 
510
 | 
   332     ras_df.replace([np.nan,None],"None").to_csv(ARGS.ras_output, sep = '\t')
 | 
| 
381
 | 
   333 
 | 
| 
510
 | 
   334     #report genes not present in the data
 | 
| 
 | 
   335     if len(genes_not_mapped)>0: 
 | 
| 
 | 
   336         genes_not_mapped_str=", ".join(genes_not_mapped)
 | 
| 
 | 
   337         utils.logWarning(
 | 
| 
 | 
   338         f"The following genes are mentioned in the GPR rules but don't appear in the dataset: "+genes_not_mapped_str,
 | 
| 
 | 
   339         ARGS.out_log)  
 | 
| 
 | 
   340  
 | 
| 
489
 | 
   341     print("Execution succeeded")
 | 
| 
93
 | 
   342 
 | 
| 
 | 
###############################################################################
# Script entry point: run only when executed directly, not on import.
if __name__ == "__main__":
    main()