cobraxy: COBRAxy/ras_generator.py comparison

comparison COBRAxy/ras_generator.py @ 510:c17c6c9d112c draft

Uploaded

author	francesco_lapi
date	Tue, 07 Oct 2025 09:48:18 +0000
parents	5956dcf94277
children	f32d3c9089fc

comparison

equal deleted inserted replaced

-:5956dcf94277
+:c17c6c9d112c
 # build useful structures for RAS computation
 genes =dataset.index.intersection(rules_total_string)
 #check if there is one gene at least
 if len(genes)==0:
-print("ERROR: no gene of the count matrix is in the metabolic model!")
+raise ValueError("ERROR: No genes from the count matrix match the metabolic model. Check that gene annotations are consistent between model and dataset.")
-print(" are you sure that the gene annotation is the same for the model and the count matrix?")
-return -1
 cell_ids = list(dataset.columns)
 count_df_filtered = dataset.loc[genes]
-count_df_filtered = count_df_filtered.rename(index=lambda x: x.replace("-", "_"))
+count_df_filtered = count_df_filtered.rename(index=lambda x: x.replace("-", "_").replace(":", "_"))
 ras_df=np.full((len(dict_rule_reactions), len(cell_ids)), np.nan)
 # for loop on rules
+genes_not_mapped=[]
 ind = 0
 for rule, reaction_ids in dict_rule_reactions.items():
 if len(rule) != 0:
 # there is one gene at least in the formula
-rule = rule.replace("-","_")
+warning_rule="_"
+if "-" in rule:
+warning_rule="-"
+if ":" in rule:
+warning_rule=":"
+rule_orig=rule.split().copy()  #original rule in list
+rule = rule.replace(warning_rule,"_")
+#modified rule
 rule_split = rule.split()
-rule_split_elements = list(set(rule_split))                                # genes in formula
+rule_split_elements = list(filter(lambda x: x not in logic_operators, rule_split))  # remove of all logical operators
+rule_split_elements = list(set(rule_split_elements))                                # genes in formula
 # which genes are in the count matrix?
 genes_in_count_matrix = [el for el in rule_split_elements if el in genes]
-genes_notin_count_matrix = [el for el in rule_split_elements if el not in genes]
+genes_notin_count_matrix = []
+for el in rule_split_elements:
-if len(genes_in_count_matrix) > 0: #there is at least one gene in the count matrix
+if el not in genes: #not present in original dataset
+if el.replace("_",warning_rule) in rule_orig:
+genes_notin_count_matrix.append(el.replace("_",warning_rule))
+else:
+genes_notin_count_matrix.append(el)
+genes_not_mapped.extend(genes_notin_count_matrix)
+# add genes not present in the data
+if len(genes_in_count_matrix) > 0: #there is at least one gene in the count matrix
 if len(rule_split) == 1:
 #one gene --> one reaction
 ras_df[ind] = count_df_filtered.loc[genes_in_count_matrix]
 else:
 if len(genes_notin_count_matrix) > 0 and ignore_nan == False:
 ras_df['Reactions'] = [reaction_ids for rule,reaction_ids in dict_rule_reactions.items()]
 #create the reaction dataframe for ras (reactions x samples)
 ras_df = ras_df.explode("Reactions").set_index("Reactions")
-return ras_df
+#total genes not mapped from the gpr
+genes_not_mapped = sorted(set(genes_not_mapped))
+return ras_df,genes_not_mapped
+# function to evalute a complex boolean expression e.g. A or (B and C)
 # function to evalute a complex boolean expression e.g. A or (B and C)
 def _evaluate_ast( node, values,or_function,and_function):
 if isinstance(node, ast.BoolOp):
-# Ricorsione sugli argomenti
 vals = [_evaluate_ast(v, values,or_function,and_function) for v in node.values]
 vals = [v for v in vals if v is not None]
 if not vals:
-return None
+return np.nan
 vals = [np.array(v) if isinstance(v, (list, np.ndarray)) else v for v in vals]
 if isinstance(node.op, ast.Or):
 return or_function(vals)
 return and_function(vals)
 elif isinstance(node, ast.Name):
 return values.get(node.id, None)
 elif isinstance(node, ast.Constant):
-return node.value
+key = str(node.value)     #convert in str
+return values.get(key, None)
 else:
 raise ValueError(f"Error in boolean expression: {ast.dump(node)}")
 def main(args:List[str] = None) -> None:
 """
 global ARGS
 ARGS = process_args(args)
 # read dataset and remove versioning from gene names
 dataset = read_dataset(ARGS.input, "dataset")
-dataset.index =  [str(el.split(".")[0]) for el in dataset.index]
+orig_gene_list=dataset.index.copy()
+dataset.index =  [str(el.split(".")[0]) for el in dataset.index]
+if any(dataset.index.duplicated(keep=False)):
+list_str=", ".join(orig_gene_list[dataset.index.duplicated(keep=False)])
+raise ValueError(f"ERROR: Duplicate entries in the gene dataset. The following genes are duplicated: "+list_str)
 #load GPR rules
 rules = load_custom_rules()
 #create a list of all the gpr
 for id,rule in rules.items():
 rules_total_string+=rule.replace("(","").replace(")","") + " "
 rules_total_string=list(set(rules_total_string.split(" ")))
 #check if nan value must be ignored in the GPR
-print(ARGS.none,"\n\n\n*************",ARGS.none==True)
 if ARGS.none:
 #    #e.g. (A or nan --> A)
 ignore_nan = True
 else:
 #e.g. (A or nan --> nan)
 ignore_nan = False
 #compure ras
-ras_df=computeRAS(dataset,rules,
+ras_df,genes_not_mapped=computeRAS(dataset,rules,
 rules_total_string,
 or_function=np.sum,    # type of operation to do in case of an or expression (max, sum, mean)
 and_function=np.min,
 ignore_nan=ignore_nan)
 #save to csv and replace nan with None
-ras_df.replace(np.nan,"None").to_csv(ARGS.ras_output, sep = '\t')
+ras_df.replace([np.nan,None],"None").to_csv(ARGS.ras_output, sep = '\t')
-#print
+#report genes not present in the data
+if len(genes_not_mapped)>0:
+genes_not_mapped_str=", ".join(genes_not_mapped)
+utils.logWarning(
+f"The following genes are mentioned in the GPR rules but don't appear in the dataset: "+genes_not_mapped_str,
+ARGS.out_log)
 print("Execution succeeded")
 ###############################################################################
 if __name__ == "__main__":
 main()

Mercurial > repos > bimib > cobraxy

comparison COBRAxy/ras_generator.py @ 510:c17c6c9d112c draft