| 
489
 | 
     1 """
 | 
| 
 | 
     2 Generate Reaction Activity Scores (RAS) from a gene expression dataset and GPR rules.
 | 
| 
 | 
     3 
 | 
| 
 | 
     4 The script reads a tabular dataset (genes x samples) and a rules file (GPRs),
 | 
| 
 | 
     5 computes RAS per reaction for each sample/cell line, and writes a tabular output.
 | 
| 
 | 
     6 """
 | 
| 
93
 | 
     7 from __future__ import division
 | 
| 
 | 
     8 import sys
 | 
| 
 | 
     9 import argparse
 | 
| 
 | 
    10 import pandas as pd
 | 
| 
505
 | 
    11 import numpy as np
 | 
| 
93
 | 
    12 import utils.general_utils as utils
 | 
| 
505
 | 
    13 from typing import  List, Dict
 | 
| 
 | 
    14 import ast
 | 
| 
93
 | 
    15 
 | 
| 
 | 
ERRORS = []  # NOTE(review): never appended to anywhere in this file — confirm it is used elsewhere before removing
########################## argparse ##########################################
# Parsed command-line arguments; assigned once in main() and read as a module-global.
ARGS :argparse.Namespace
 | 
| 
147
 | 
    19 def process_args(args:List[str] = None) -> argparse.Namespace:
 | 
| 
93
 | 
    20     """
 | 
| 
 | 
    21     Processes command-line arguments.
 | 
| 
 | 
    22 
 | 
| 
 | 
    23     Args:
 | 
| 
 | 
    24         args (list): List of command-line arguments.
 | 
| 
 | 
    25 
 | 
| 
 | 
    26     Returns:
 | 
| 
 | 
    27         Namespace: An object containing parsed arguments.
 | 
| 
 | 
    28     """
 | 
| 
 | 
    29     parser = argparse.ArgumentParser(
 | 
| 
 | 
    30         usage = '%(prog)s [options]',
 | 
| 
 | 
    31         description = "process some value's genes to create a comparison's map.")
 | 
| 
 | 
    32     
 | 
| 
489
 | 
    33     parser.add_argument("-rl", "--model_upload", type = str,
 | 
| 
 | 
    34         help = "path to input file containing the rules")
 | 
| 
93
 | 
    35 
 | 
| 
489
 | 
    36     parser.add_argument("-rn", "--model_upload_name", type = str, help = "custom rules name")
 | 
| 
 | 
    37     # Galaxy converts files into .dat, this helps infer the original extension when needed.
 | 
| 
93
 | 
    38     
 | 
| 
 | 
    39     parser.add_argument(
 | 
| 
 | 
    40         '-n', '--none',
 | 
| 
 | 
    41         type = utils.Bool("none"), default = True,
 | 
| 
 | 
    42         help = 'compute Nan values')
 | 
| 
 | 
    43     
 | 
| 
 | 
    44     parser.add_argument(
 | 
| 
 | 
    45         '-td', '--tool_dir',
 | 
| 
 | 
    46         type = str,
 | 
| 
 | 
    47         required = True, help = 'your tool directory')
 | 
| 
 | 
    48     
 | 
| 
 | 
    49     parser.add_argument(
 | 
| 
 | 
    50         '-ol', '--out_log',
 | 
| 
 | 
    51         type = str,
 | 
| 
 | 
    52         help = "Output log")    
 | 
| 
 | 
    53     
 | 
| 
 | 
    54     parser.add_argument(
 | 
| 
489
 | 
    55         '-in', '--input',
 | 
| 
93
 | 
    56         type = str,
 | 
| 
 | 
    57         help = 'input dataset')
 | 
| 
 | 
    58     
 | 
| 
 | 
    59     parser.add_argument(
 | 
| 
 | 
    60         '-ra', '--ras_output',
 | 
| 
 | 
    61         type = str,
 | 
| 
 | 
    62         required = True, help = 'ras output')
 | 
| 
147
 | 
    63 
 | 
| 
93
 | 
    64     
 | 
| 
147
 | 
    65     return parser.parse_args(args)
 | 
| 
93
 | 
    66 
 | 
| 
 | 
    67 ############################ dataset input ####################################
 | 
| 
 | 
    68 def read_dataset(data :str, name :str) -> pd.DataFrame:
 | 
| 
 | 
    69     """
 | 
| 
 | 
    70     Read a dataset from a CSV file and return it as a pandas DataFrame.
 | 
| 
 | 
    71 
 | 
| 
 | 
    72     Args:
 | 
| 
 | 
    73         data (str): Path to the CSV file containing the dataset.
 | 
| 
 | 
    74         name (str): Name of the dataset, used in error messages.
 | 
| 
 | 
    75 
 | 
| 
 | 
    76     Returns:
 | 
| 
 | 
    77         pandas.DataFrame: DataFrame containing the dataset.
 | 
| 
 | 
    78 
 | 
| 
 | 
    79     Raises:
 | 
| 
 | 
    80         pd.errors.EmptyDataError: If the CSV file is empty.
 | 
| 
 | 
    81         sys.exit: If the CSV file has the wrong format, the execution is aborted.
 | 
| 
 | 
    82     """
 | 
| 
 | 
    83     try:
 | 
| 
505
 | 
    84         dataset = pd.read_csv(data, sep = '\t', header = 0, engine='python', index_col=0)
 | 
| 
 | 
    85         dataset = dataset.astype(float)
 | 
| 
93
 | 
    86     except pd.errors.EmptyDataError:
 | 
| 
505
 | 
    87         sys.exit('Execution aborted: wrong file format of ' + name + '\n')
 | 
| 
93
 | 
    88     if len(dataset.columns) < 2:
 | 
| 
505
 | 
    89         sys.exit('Execution aborted: wrong file format of ' + name + '\n')
 | 
| 
93
 | 
    90     return dataset
 | 
| 
 | 
    91 
 | 
| 
 | 
    92 
 | 
| 
505
 | 
    93 def load_custom_rules() -> Dict[str,str]:
 | 
| 
93
 | 
    94     """
 | 
| 
 | 
    95     Opens custom rules file and extracts the rules. If the file is in .csv format an additional parsing step will be
 | 
| 
 | 
    96     performed, significantly impacting the runtime.
 | 
| 
 | 
    97 
 | 
| 
 | 
    98     Returns:
 | 
| 
 | 
    99         Dict[str, ruleUtils.OpList] : dict mapping reaction IDs to rules.
 | 
| 
 | 
   100     """
 | 
| 
489
 | 
   101     datFilePath = utils.FilePath.fromStrPath(ARGS.model_upload)  # actual file, stored in Galaxy as a .dat
 | 
| 
 | 
   102 
 | 
| 
 | 
   103     dict_rule = {}
 | 
| 
 | 
   104 
 | 
| 
 | 
   105     try:
 | 
| 
 | 
   106         rows = utils.readCsv(datFilePath, delimiter = "\t", skipHeader=False)
 | 
| 
 | 
   107         if len(rows) <= 1:
 | 
| 
 | 
   108             raise ValueError("Model tabular with 1 column is not supported.")
 | 
| 
381
 | 
   109 
 | 
| 
489
 | 
   110         if not rows:
 | 
| 
 | 
   111             raise ValueError("Model tabular is file is empty.")
 | 
| 
 | 
   112         
 | 
| 
 | 
   113         id_idx, idx_gpr = utils.findIdxByName(rows[0], "GPR")
 | 
| 
 | 
   114         
 | 
| 
 | 
   115     # First, try using a tab delimiter
 | 
| 
 | 
   116         for line in rows[1:]:
 | 
| 
 | 
   117             if len(line) <= idx_gpr:
 | 
| 
 | 
   118                 utils.logWarning(f"Skipping malformed line: {line}", ARGS.out_log)
 | 
| 
 | 
   119                 continue
 | 
| 
 | 
   120             
 | 
| 
505
 | 
   121             dict_rule[line[id_idx]] = line[idx_gpr] 
 | 
| 
 | 
   122 
 | 
| 
489
 | 
   123     except Exception as e:
 | 
| 
 | 
   124         # If parsing with tabs fails, try comma delimiter
 | 
| 
 | 
   125         try:
 | 
| 
 | 
   126             rows = utils.readCsv(datFilePath, delimiter = ",", skipHeader=False)
 | 
| 
 | 
   127             
 | 
| 
 | 
   128             if len(rows) <= 1:
 | 
| 
 | 
   129                 raise ValueError("Model tabular with 1 column is not supported.")
 | 
| 
 | 
   130 
 | 
| 
 | 
   131             if not rows:
 | 
| 
 | 
   132                 raise ValueError("Model tabular is file is empty.")
 | 
| 
 | 
   133             
 | 
| 
 | 
   134             id_idx, idx_gpr = utils.findIdxByName(rows[0], "GPR")
 | 
| 
 | 
   135             
 | 
| 
 | 
   136             # Try again parsing row content with the GPR column using comma-separated values
 | 
| 
 | 
   137             for line in rows[1:]:
 | 
| 
 | 
   138                 if len(line) <= idx_gpr:
 | 
| 
 | 
   139                     utils.logWarning(f"Skipping malformed line: {line}", ARGS.out_log)
 | 
| 
 | 
   140                     continue
 | 
| 
 | 
   141                 
 | 
| 
505
 | 
   142                 dict_rule[line[id_idx]] =line[idx_gpr]
 | 
| 
489
 | 
   143                     
 | 
| 
 | 
   144         except Exception as e2:
 | 
| 
 | 
   145             raise ValueError(f"Unable to parse rules file. Tried both tab and comma delimiters. Original errors: Tab: {e}, Comma: {e2}")
 | 
| 
 | 
   146 
 | 
| 
 | 
   147     if not dict_rule:
 | 
| 
 | 
   148             raise ValueError("No valid rules found in the uploaded file. Please check the file format.")
 | 
| 
93
 | 
   149     # csv rules need to be parsed, those in a pickle format are taken to be pre-parsed.
 | 
| 
489
 | 
   150     return dict_rule
 | 
| 
 | 
   151 
 | 
| 
401
 | 
   152 
 | 
| 
505
 | 
   153 
 | 
| 
 | 
def computeRAS(
            dataset,gene_rules,rules_total_string,
            or_function=np.sum,    # type of operation to do in case of an or expression (max, sum, mean)
            and_function=np.min,   # type of operation to do in case of an and expression(min, sum)
            ignore_nan = True
            ):
    """
    Compute Reaction Activity Scores (RAS) for every reaction and sample.

    Args:
        dataset: pandas DataFrame of expression values, genes on the index,
            samples/cells on the columns.
        gene_rules: dict mapping reaction IDs to GPR rule strings.
        rules_total_string: collection of every token appearing in any rule
            (gene names plus logical tokens) — despite the name, a list.
        or_function: aggregation applied to OR-joined operands.
        and_function: aggregation applied to AND-joined operands.
        ignore_nan: if True, genes missing from the dataset are dropped from
            a rule; if False, any missing gene makes the whole rule NaN.

    Returns:
        pandas DataFrame (reactions x samples) of RAS values, or -1 when no
        gene of the dataset appears in any rule.
    """

    # NOTE(review): logic_operators is never used below — confirm before removing.
    logic_operators = ['and', 'or', '(', ')']
    reactions=list(gene_rules.keys())

    # Build the dictionary for the GPRs: normalize operator case and pad
    # parentheses with spaces so rule.split() tokenizes cleanly.
    df_reactions = pd.DataFrame(index=reactions)
    gene_rules=[gene_rules[reaction_id].replace("OR","or").replace("AND","and").replace("(","( ").replace(")"," )") for reaction_id in reactions]        
    df_reactions['rule'] = gene_rules
    df_reactions = df_reactions.reset_index()
    # Group reactions sharing the exact same rule so each rule is evaluated once.
    df_reactions = df_reactions.groupby('rule').agg(lambda x: sorted(list(x)))

    # rule string -> sorted list of reaction IDs using that rule
    dict_rule_reactions = df_reactions.to_dict()['index']

    # build useful structures for RAS computation
    genes =dataset.index.intersection(rules_total_string)
    
    #check if there is one gene at least 
    if len(genes)==0:
        print("ERROR: no gene of the count matrix is in the metabolic model!")
        print(" are you sure that the gene annotation is the same for the model and the count matrix?")
        return -1
    
    cell_ids = list(dataset.columns)
    count_df_filtered = dataset.loc[genes]
    # Rename "-" to "_" so gene names are valid Python identifiers for ast.parse.
    # NOTE(review): lookups below use un-renamed names from `genes`; for genes
    # containing "-" the .loc calls would miss the renamed index — confirm
    # whether such gene names actually occur upstream.
    count_df_filtered = count_df_filtered.rename(index=lambda x: x.replace("-", "_"))

    # Result matrix, one row per unique rule, pre-filled with NaN.
    ras_df=np.full((len(dict_rule_reactions), len(cell_ids)), np.nan)

    # for loop on rules
    ind = 0       
    for rule, reaction_ids in dict_rule_reactions.items():
        if len(rule) != 0:
            # there is one gene at least in the formula
            rule_split = rule.split()
            rule_split_elements = list(set(rule_split))                                # genes in formula
            
            # which genes are in the count matrix?                
            genes_in_count_matrix = [el for el in rule_split_elements if el in genes]
            genes_notin_count_matrix = [el for el in rule_split_elements if el not in genes]

            if len(genes_in_count_matrix) > 0: #there is at least one gene in the count matrix
                    if len(rule_split) == 1:
                        #one gene --> one reaction
                        ras_df[ind] = count_df_filtered.loc[genes_in_count_matrix]
                    else:    
                        # With ignore_nan=False a single missing gene invalidates
                        # the whole rule for every sample.
                        if len(genes_notin_count_matrix) > 0 and ignore_nan == False:
                                ras_df[ind] = np.nan
                        else:                   
                            # more genes in the formula
                            check_only_and=("and" in rule and "or" not in rule) #only and
                            check_only_or=("or" in rule and "and" not in rule)  #only or
                            if check_only_and or check_only_or:
                                #or/and sequence: flat rules vectorize over all
                                #cells at once along the gene axis
                                matrix = count_df_filtered.loc[genes_in_count_matrix].values
                                #compute for all cells
                                if check_only_and: 
                                    ras_df[ind] = and_function(matrix, axis=0)
                                else:
                                    ras_df[ind] = or_function(matrix, axis=0)
                            else:
                                # complex expression (e.g. A or (B and C)):
                                # parse once, then evaluate per cell via the AST walker
                                data = count_df_filtered.loc[genes_in_count_matrix]  # dataframe of genes in the GPRs

                                rule = rule.replace("-", "_")
                                tree = ast.parse(rule, mode="eval").body
                                values_by_cell = [dict(zip(data.index, data[col].values)) for col in data.columns]
                                for j, values in enumerate(values_by_cell):
                                    ras_df[ind, j] = _evaluate_ast(tree, values, or_function, and_function)
    
        ind +=1
    
    #create the dataframe of ras (rules x samples)
    ras_df= pd.DataFrame(data=ras_df,index=range(len(dict_rule_reactions)), columns=cell_ids)
    ras_df['Reactions'] = [reaction_ids for rule,reaction_ids in dict_rule_reactions.items()]
    
    #create the reaction dataframe for ras (reactions x samples):
    #explode duplicates each rule row once per reaction sharing that rule
    ras_df = ras_df.explode("Reactions").set_index("Reactions")

    return ras_df
 | 
| 
 | 
   240 
 | 
| 
 | 
   241 # function to evalute a complex boolean expression e.g. A or (B and C)
 | 
| 
 | 
   242 def _evaluate_ast( node, values,or_function,and_function):
 | 
| 
 | 
   243     if isinstance(node, ast.BoolOp):
 | 
| 
 | 
   244         # Ricorsione sugli argomenti
 | 
| 
 | 
   245         vals = [_evaluate_ast(v, values,or_function,and_function) for v in node.values]
 | 
| 
 | 
   246        
 | 
| 
 | 
   247         vals = [v for v in vals if v is not None]
 | 
| 
 | 
   248         if not vals:
 | 
| 
 | 
   249             return None
 | 
| 
 | 
   250       
 | 
| 
 | 
   251         vals = [np.array(v) if isinstance(v, (list, np.ndarray)) else v for v in vals]
 | 
| 
 | 
   252 
 | 
| 
 | 
   253         if isinstance(node.op, ast.Or):
 | 
| 
 | 
   254             return or_function(vals)
 | 
| 
 | 
   255         elif isinstance(node.op, ast.And):
 | 
| 
 | 
   256             return and_function(vals)
 | 
| 
 | 
   257 
 | 
| 
 | 
   258     elif isinstance(node, ast.Name):
 | 
| 
 | 
   259         return values.get(node.id, None)
 | 
| 
 | 
   260     elif isinstance(node, ast.Constant):
 | 
| 
 | 
   261         return node.value
 | 
| 
 | 
   262     else:
 | 
| 
 | 
   263         raise ValueError(f"Error in boolean expression: {ast.dump(node)}")
 | 
| 
 | 
   264 
 | 
| 
147
 | 
   265 def main(args:List[str] = None) -> None:
 | 
| 
93
 | 
   266     """
 | 
| 
 | 
   267     Initializes everything and sets the program in motion based on the fronted input arguments.
 | 
| 
 | 
   268     
 | 
| 
 | 
   269     Returns:
 | 
| 
 | 
   270         None
 | 
| 
 | 
   271     """
 | 
| 
 | 
   272     # get args from frontend (related xml)
 | 
| 
 | 
   273     global ARGS
 | 
| 
147
 | 
   274     ARGS = process_args(args)
 | 
| 
309
 | 
   275 
 | 
| 
505
 | 
   276     # read dataset and remove versioning from gene names
 | 
| 
93
 | 
   277     dataset = read_dataset(ARGS.input, "dataset")
 | 
| 
505
 | 
   278     dataset.index =  [str(el.split(".")[0]) for el in dataset.index]
 | 
| 
93
 | 
   279 
 | 
| 
505
 | 
   280     #load GPR rules
 | 
| 
489
 | 
   281     rules = load_custom_rules()
 | 
| 
505
 | 
   282     
 | 
| 
 | 
   283     #create a list of all the gpr
 | 
| 
 | 
   284     rules_total_string=""
 | 
| 
 | 
   285     for id,rule in rules.items():
 | 
| 
 | 
   286         rules_total_string+=rule.replace("(","").replace(")","") + " "
 | 
| 
 | 
   287     rules_total_string=list(set(rules_total_string.split(" ")))
 | 
| 
93
 | 
   288 
 | 
| 
505
 | 
   289     #check if nan value must be ignored in the GPR 
 | 
| 
 | 
   290     print(ARGS.none,"\n\n\n*************",ARGS.none==True)
 | 
| 
 | 
   291     if ARGS.none:
 | 
| 
 | 
   292     #    #e.g. (A or nan --> A)
 | 
| 
 | 
   293         ignore_nan = True
 | 
| 
 | 
   294     else:
 | 
| 
 | 
   295         #e.g. (A or nan --> nan)
 | 
| 
 | 
   296         ignore_nan = False
 | 
| 
 | 
   297     
 | 
| 
 | 
   298     #compure ras
 | 
| 
 | 
   299     ras_df=computeRAS(dataset,rules,
 | 
| 
 | 
   300                     rules_total_string,
 | 
| 
 | 
   301                     or_function=np.sum,    # type of operation to do in case of an or expression (max, sum, mean)
 | 
| 
 | 
   302                     and_function=np.min,
 | 
| 
 | 
   303                     ignore_nan=ignore_nan)
 | 
| 
 | 
   304     
 | 
| 
 | 
   305     #save to csv and replace nan with None
 | 
| 
 | 
   306     ras_df.replace(np.nan,"None").to_csv(ARGS.ras_output, sep = '\t')
 | 
| 
381
 | 
   307 
 | 
| 
505
 | 
   308     #print 
 | 
| 
489
 | 
   309     print("Execution succeeded")
 | 
| 
93
 | 
   310 
 | 
| 
 | 
###############################################################################
# Script entry point: run main() only when executed directly, not on import.
if __name__ == "__main__":
    main()