Mercurial > repos > bimib > cobraxy
comparison COBRAxy/flux_simulation.py @ 4:41f35c2f0c7b draft
Uploaded
| author | luca_milaz |
|---|---|
| date | Wed, 18 Sep 2024 10:59:10 +0000 |
| parents | |
| children | 74b383211ab5 |
comparison
equal
deleted
inserted
replaced
| 3:1f3ac6fd9867 | 4:41f35c2f0c7b |
|---|---|
| 1 import argparse | |
| 2 import utils.general_utils as utils | |
| 3 from typing import Optional, List | |
| 4 import os | |
| 5 import numpy as np | |
| 6 import pandas as pd | |
| 7 import cobra | |
| 8 import utils.CBS_backend as CBS_backend | |
| 9 from joblib import Parallel, delayed, cpu_count | |
| 10 from cobra.sampling import OptGPSampler | |
| 11 import sys | |
| 12 | |
| 13 ################################# process args ############################### | |
def process_args(args :List[str]) -> argparse.Namespace:
    """
    Processes command-line arguments.

    Args:
        args (list): Full argument vector in ``sys.argv`` form. The first
            element (the program name) is skipped, everything after it is parsed.

    Returns:
        Namespace: An object containing parsed arguments.
    """
    parser = argparse.ArgumentParser(usage = '%(prog)s [options]',
                                     description = 'process some value\'s')

    parser.add_argument('-ol', '--out_log',
                        help = "Output log")

    parser.add_argument('-td', '--tool_dir',
                        type = str,
                        required = True,
                        help = 'your tool directory')

    parser.add_argument('-in', '--input',
                        required = True,
                        type=str,
                        help = 'inputs bounds')

    parser.add_argument('-ni', '--names',
                        required = True,
                        type=str,
                        help = 'cell names')

    parser.add_argument(
        '-ms', '--model_selector',
        type = utils.Model, default = utils.Model.ENGRO2, choices = [utils.Model.ENGRO2, utils.Model.Custom],
        help = 'chose which type of model you want use')

    parser.add_argument("-mo", "--model", type = str)

    parser.add_argument("-mn", "--model_name", type = str, help = "custom mode name")

    parser.add_argument('-a', '--algorithm',
                        type = str,
                        choices = ['OPTGP', 'CBS'],
                        required = True,
                        help = 'choose sampling algorithm')

    parser.add_argument('-th', '--thinning',
                        type = int,
                        default= 100,
                        required=False,
                        help = 'choose thinning')

    parser.add_argument('-ns', '--n_samples',
                        type = int,
                        required = True,
                        help = 'choose how many samples')

    parser.add_argument('-sd', '--seed',
                        type = int,
                        required = True,
                        help = 'seed')

    parser.add_argument('-nb', '--n_batches',
                        type = int,
                        required = True,
                        help = 'choose how many batches')

    parser.add_argument('-ot', '--output_type',
                        type = str,
                        required = True,
                        help = 'output type')

    parser.add_argument('-ota', '--output_type_analysis',
                        type = str,
                        required = False,
                        help = 'output type analysis')

    # Parse the vector that was actually passed in instead of silently falling
    # back to the process-wide sys.argv. The caller hands over sys.argv, whose
    # first entry is the script name, hence the [1:].
    return parser.parse_args(args[1:])
| 93 | |
| 94 ########################### warning ########################################### | |
def warning(s :str) -> None:
    """
    Append a warning to the run's log file and echo it on standard output.

    The log file path is read from the module-level ARGS namespace (ARGS.out_log).

    Args:
        s (str): Text of the warning.

    Returns:
        None
    """
    with open(ARGS.out_log, 'a') as log_file:
        # Every entry is followed by an empty line in the log.
        log_file.write(s + "\n\n")

    print(s)
| 108 | |
| 109 | |
def write_to_file(dataset: pd.DataFrame, name: str, keep_index:bool=False)->None:
    """
    Save a DataFrame as a tab-separated ``<name>.csv`` file in the output folder.

    The index of ``dataset`` is renamed to 'Reactions' on the object that was
    passed in, before the file is written.

    Args:
        dataset (pd.DataFrame): Table to save.
        name (str): File name without extension; it is appended directly to
            ARGS.output_folder.
        keep_index (bool, optional): Whether the index is written as the first
            column. Default is False.

    Returns:
        None
    """
    dataset.index.name = 'Reactions'
    destination = ARGS.output_folder + name + ".csv"
    dataset.to_csv(destination, sep = '\t', index = keep_index)
| 113 | |
| 114 ############################ dataset input #################################### | |
def read_dataset(data :str, name :str) -> pd.DataFrame:
    """
    Load a tab-separated file into a pandas DataFrame.

    The first line is used as header and the first column as index.

    Args:
        data (str): Path to the tab-separated file containing the dataset.
        name (str): Name of the dataset, used in the abort message.

    Returns:
        pandas.DataFrame: DataFrame containing the dataset.

    Raises:
        SystemExit: If the file is empty or has fewer than two data columns
            besides the index column.
    """
    def abort() -> None:
        # Single exit point shared by both "wrong format" situations.
        sys.exit('Execution aborted: wrong format of ' + name + '\n')

    try:
        table = pd.read_csv(data, sep = '\t', header = 0, index_col=0, engine='python')
    except pd.errors.EmptyDataError:
        abort()

    if table.shape[1] < 2:
        abort()

    return table
| 137 | |
| 138 | |
| 139 | |
def OPTGP_sampler(model:cobra.Model, model_name:str, n_samples:int=1000, thinning:int=100, n_batches:int=1, seed:int=0)-> None:
    """
    Samples the model with cobra's OptGPSampler and saves the pooled samples to a file.

    Each batch is drawn by a freshly built sampler and written to a temporary
    per-batch CSV in ARGS.output_folder. The batch files are then read back,
    concatenated, written transposed through write_to_file, and deleted.

    Args:
        model (cobra.Model): The COBRA model to sample from.
        model_name (str): The name of the model, used in naming output files.
        n_samples (int, optional): Number of samples per batch. Default is 1000.
        thinning (int, optional): Thinning parameter for the sampler. Default is 100.
        n_batches (int, optional): Number of batches to run. Default is 1.
        seed (int, optional): Seed of the first batch; batch ``i`` uses ``seed + i``. Default is 0.

    Returns:
        None
    """
    batch_paths = [ARGS.output_folder + model_name + '_' + str(i) + '_OPTGP.csv'
                   for i in range(n_batches)]

    for batch_number, batch_path in enumerate(batch_paths):
        # thinning and seed are passed by keyword on purpose: the third
        # positional parameter of OptGPSampler is `processes`, so a positional
        # seed would set the worker count and leave the sampler unseeded.
        optgp = OptGPSampler(model, thinning=thinning, seed=seed + batch_number)
        optgp.sample(n_samples).to_csv(batch_path, index=False)

    # Leading empty frame keeps the n_batches == 0 case returning an empty
    # table instead of making pd.concat fail on an empty list.
    batches = [pd.DataFrame()] + [pd.read_csv(batch_path) for batch_path in batch_paths]
    samplesTotal = pd.concat(batches, ignore_index = True)

    write_to_file(samplesTotal.T, model_name, True)

    for batch_path in batch_paths:
        os.remove(batch_path)
| 171 | |
| 172 | |
def CBS_sampler(model:cobra.Model, model_name:str, n_samples:int=1000, n_batches:int=1, seed:int=0)-> None:
    """
    Samples the model through the CBS backend and saves the pooled samples to a file.

    Flux variability ranges (fraction_of_optimum=0, rounded to 6 decimals) are
    used to generate n_samples * n_batches random objective functions. Each
    batch consumes its own block of n_samples coefficient columns and is saved
    to a temporary per-batch CSV. The batch files are then read back,
    concatenated, written transposed through write_to_file, and deleted.

    Args:
        model (cobra.Model): The COBRA model to sample from.
        model_name (str): The name of the model, used in naming output files.
        n_samples (int, optional): Number of samples per batch. Default is 1000.
        n_batches (int, optional): Number of batches to run. Default is 1.
        seed (int, optional): Random seed handed to the coefficient generator. Default is 0.

    Returns:
        None
    """
    def batch_file(batch: int) -> str:
        # Temporary per-batch file inside the output folder.
        return ARGS.output_folder + model_name + '_'+ str(batch)+'_CBS.csv'

    fva_ranges = cobra.flux_analysis.flux_variability_analysis(model,fraction_of_optimum=0).round(6)
    coefficients = CBS_backend.randomObjectiveFunction(model, n_samples*n_batches, fva_ranges, seed=seed)

    for batch in range(n_batches):
        reaction_ids = [reaction.id for reaction in model.reactions]
        samples = pd.DataFrame(columns = reaction_ids, index = range(n_samples))
        batch_columns = slice(batch*n_samples, (batch+1)*n_samples)

        try:
            CBS_backend.randomObjectiveFunctionSampling(
                model, n_samples, coefficients.iloc[:, batch_columns], samples)
        except Exception as e:
            # Primary sampling routine failed: log it and retry through the
            # cobrapy-based routine with a fresh slice of the same columns.
            utils.logWarning(
                "Warning: GLPK solver has failed for " + model_name + ". Trying with COBRA interface. Error:" + str(e),
                ARGS.out_log)
            CBS_backend.randomObjectiveFunctionSampling_cobrapy(
                model, n_samples, coefficients.iloc[:, batch_columns], samples)

        # The path of the batch file is recorded in the log before saving.
        utils.logWarning(batch_file(batch), ARGS.out_log)
        samples.to_csv(batch_file(batch), index=False)

    pooled = [pd.DataFrame()]
    for batch in range(n_batches):
        pooled.append(pd.read_csv(batch_file(batch)))
    samplesTotal = pd.concat(pooled, ignore_index = True)

    write_to_file(samplesTotal.T, model_name, True)

    for batch in range(n_batches):
        os.remove(batch_file(batch))
| 215 | |
| 216 | |
def model_sampler(model_input_original:cobra.Model, bounds_path:str, cell_name:str)-> List[pd.DataFrame]:
    """
    Prepares the model with bounds from the dataset and performs sampling and analysis based on the selected algorithm.

    Works on a copy of the model, so the model passed in is left untouched.

    Args:
        model_input_original (cobra.Model): The original COBRA model.
        bounds_path (str): Path to the tab-separated file containing the bounds dataset
            (reaction ids as index, ``lower_bound`` and ``upper_bound`` columns).
        cell_name (str): Name of the cell, used to generate filenames for output;
            only the part before the first '.' is kept.

    Returns:
        List[pd.DataFrame]: Mean, median and quantiles frames (always, possibly empty),
            followed by the pFBA, FVA and sensitivity frames for each analysis
            listed in ARGS.output_type_analysis, in that order.
    """

    model_input = model_input_original.copy()
    bounds_df = read_dataset(bounds_path, "bounds dataset")
    for rxn_index, row in bounds_df.iterrows():
        # Assign both bounds in one step: setting them one after the other can
        # pass through an invalid intermediate state (new lower bound above the
        # old upper bound), which cobra rejects.
        model_input.reactions.get_by_id(rxn_index).bounds = (row.lower_bound, row.upper_bound)

    name = cell_name.split('.')[0]

    if ARGS.algorithm == 'OPTGP':
        OPTGP_sampler(model_input, name, ARGS.n_samples, ARGS.thinning, ARGS.n_batches, ARGS.seed)

    elif ARGS.algorithm == 'CBS':
        CBS_sampler(model_input, name, ARGS.n_samples, ARGS.n_batches, ARGS.seed)

    df_mean, df_median, df_quantiles = fluxes_statistics(name, ARGS.output_types)

    # The raw samples file is only kept when the user asked for it.
    if "fluxes" not in ARGS.output_types:
        os.remove(ARGS.output_folder + name + '.csv')

    returnList = [df_mean, df_median, df_quantiles]

    df_pFBA, df_FVA, df_sensitivity = fluxes_analysis(model_input, name, ARGS.output_type_analysis)

    # Only requested analyses are appended; main() relies on this exact order.
    if "pFBA" in ARGS.output_type_analysis:
        returnList.append(df_pFBA)
    if "FVA" in ARGS.output_type_analysis:
        returnList.append(df_FVA)
    if "sensitivity" in ARGS.output_type_analysis:
        returnList.append(df_sensitivity)

    return returnList
| 264 | |
def fluxes_statistics(model_name: str, output_types:List)-> List[pd.DataFrame]:
    """
    Summarises the sampled fluxes of one model as mean, median and/or quartiles.

    The samples are read from ``<ARGS.output_folder><model_name>.csv``
    (tab-separated), transposed and rounded to 8 decimals.

    Args:
        model_name (str): Name of the model; selects the samples file and labels the output row.
        output_types (List[str]): Requested statistics ("mean", "median", "quantiles");
            any other entry is ignored.

    Returns:
        Three single-row DataFrames (mean, median, quantiles) returned together as a
        tuple; a statistic that was not requested is an empty DataFrame. Quartile
        columns are named ``<reaction>_q1``, ``<reaction>_q2`` and ``<reaction>_q3``.
    """

    df_mean = pd.DataFrame()
    df_median = pd.DataFrame()
    df_quantiles = pd.DataFrame()

    samples = pd.read_csv(ARGS.output_folder + model_name + '.csv', sep = '\t', index_col = 0).T.round(8)

    def as_single_row(values: pd.Series) -> pd.DataFrame:
        # Turn a per-reaction Series into a one-row frame labelled with the model name.
        row = values.to_frame().T
        row.index = [model_name]
        return row

    quartile_suffixes = ((0.25, "_q1"), (0.5, "_q2"), (0.75, "_q3"))

    for requested in output_types:
        if requested == "mean":
            df_mean = as_single_row(samples.mean())
        elif requested == "median":
            df_median = as_single_row(samples.median())
        elif requested == "quantiles":
            values = []
            labels = []
            for rxn in samples.columns:
                quartiles = samples[rxn].quantile([0.25, 0.50, 0.75])
                for level, suffix in quartile_suffixes:
                    values.append(quartiles[level])
                    labels.append(rxn + suffix)
            df_quantiles = pd.DataFrame(columns=labels)
            df_quantiles.loc[0] = values
            df_quantiles.index = [model_name]

    return df_mean, df_median, df_quantiles
| 312 | |
def fluxes_analysis(model:cobra.Model, model_name:str, output_types:List)-> List[pd.DataFrame]:
    """
    Runs the requested analyses (pFBA, FVA, sensitivity) on one model.

    pFBA and sensitivity set the model objective to the reaction "Biomass".
    Sensitivity is the growth after each single-reaction deletion divided by
    the growth of the unmodified model.

    Args:
        model (cobra.Model): The COBRA model to analyze; its objective is modified
            by the pFBA and sensitivity analyses.
        model_name (str): Name of the model, used as row label of every result.
        output_types (List[str]): Requested analyses ("pFBA", "FVA", "sensitivity");
            any other entry is ignored.

    Returns:
        Three single-row DataFrames (pFBA, FVA, sensitivity) returned together as a
        tuple, each rounded to 6 decimals; an analysis that was not requested is an
        empty DataFrame. FVA columns are named ``<reaction>_min`` and ``<reaction>_max``.
    """

    df_pFBA = pd.DataFrame()
    df_FVA = pd.DataFrame()
    df_sensitivity = pd.DataFrame()

    for requested in output_types:
        if requested == "pFBA":
            model.objective = "Biomass"
            pfba_fluxes = cobra.flux_analysis.pfba(model).fluxes
            reaction_ids = [rxn._id for rxn in model.reactions]
            df_pFBA.loc[0, reaction_ids] = pfba_fluxes.tolist()
            df_pFBA.index = [model_name]
            df_pFBA = df_pFBA.astype(float).round(6)

        elif requested == "FVA":
            fva = cobra.flux_analysis.flux_variability_analysis(model, fraction_of_optimum=0, processes=1).round(8)
            labels = [rxn + suffix for rxn in fva.index.to_list() for suffix in ("_min", "_max")]
            df_FVA = pd.DataFrame(columns = labels)
            # One row holding the minimum and maximum flux of every reaction.
            for rxn_id in fva.index:
                df_FVA.loc[0, rxn_id + "_min"] = fva.loc[rxn_id, "minimum"]
                df_FVA.loc[0, rxn_id + "_max"] = fva.loc[rxn_id, "maximum"]
            df_FVA.index = [model_name]
            df_FVA = df_FVA.astype(float).round(6)

        elif requested == "sensitivity":
            model.objective = "Biomass"
            reference_growth = model.optimize().objective_value
            reactions = model.reactions
            deletions = cobra.flux_analysis.single_reaction_deletion(model)
            df_sensitivity = pd.DataFrame(columns = [rxn.id for rxn in reactions], index = [model_name])
            df_sensitivity.loc[model_name] = [
                deletions.knockout[rxn.id].growth.values[0]/reference_growth
                for rxn in reactions
            ]
            df_sensitivity = df_sensitivity.astype(float).round(6)

    return df_pFBA, df_FVA, df_sensitivity
| 364 | |
| 365 ############################# main ########################################### | |
def main() -> None:
    """
    Initializes everything and sets the program in motion based on the frontend input arguments.

    Parses the command line into the module-level ARGS namespace, loads the
    selected model, runs model_sampler in parallel for every (bounds file,
    cell name) pair, then merges the per-cell results and writes one file per
    requested statistic or analysis into 'flux_simulation/'.

    Returns:
        None
    """
    global ARGS

    output_folder = 'flux_simulation/'
    # exist_ok replaces the separate existence check and its check-then-create race.
    os.makedirs(output_folder, exist_ok=True)

    num_processors = cpu_count()

    ARGS = process_args(sys.argv)
    ARGS.output_folder = output_folder

    model_type :utils.Model = ARGS.model_selector
    if model_type is utils.Model.Custom:
        model = model_type.getCOBRAmodel(customPath = utils.FilePath.fromStrPath(ARGS.model), customExtension = utils.FilePath.fromStrPath(ARGS.model_name).ext)
    else:
        model = model_type.getCOBRAmodel(toolDir=ARGS.tool_dir)

    ARGS.bounds = ARGS.input.split(",")
    ARGS.bounds_name = ARGS.names.split(",")
    ARGS.output_types = ARGS.output_type.split(",")
    # --output_type_analysis is optional: without this guard a missing flag
    # leaves None here and .split raises AttributeError.
    ARGS.output_type_analysis = ARGS.output_type_analysis.split(",") if ARGS.output_type_analysis else []

    results = Parallel(n_jobs=num_processors)(delayed(model_sampler)(model, bounds_path, cell_name) for bounds_path, cell_name in zip(ARGS.bounds, ARGS.bounds_name))

    # Statistics always occupy the first three slots of every result list.
    for position, statistic in enumerate(("mean", "median", "quantiles")):
        if statistic in ARGS.output_types:
            merged = pd.concat([result[position] for result in results], ignore_index=False)
            merged = merged.fillna(0.0).sort_index()
            write_to_file(merged.T, statistic, True)

    # Analyses follow from slot 3 on, but only the requested ones are present,
    # so the slot advances only when an analysis was actually requested.
    index_result = 3
    for analysis in ("pFBA", "FVA", "sensitivity"):
        if analysis in ARGS.output_type_analysis:
            merged = pd.concat([result[index_result] for result in results], ignore_index=False)
            write_to_file(merged.sort_index().T, analysis, True)
            index_result += 1
| 434 | |
| 435 ############################################################################## | |
| 436 if __name__ == "__main__": | |
| 437 main() |
