diff pycaret_train.py @ 0:1f20fe57fdee draft

planemo upload for repository https://github.com/goeckslab/Galaxy-Pycaret commit d79b0f722b7d09505a526d1a4332f87e548a3df1
author goeckslab
date Wed, 11 Dec 2024 04:59:43 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pycaret_train.py	Wed Dec 11 04:59:43 2024 +0000
@@ -0,0 +1,117 @@
+import argparse
+import logging
+
+from pycaret_classification import ClassificationModelTrainer
+
+from pycaret_regression import RegressionModelTrainer
+
+logging.basicConfig(level=logging.DEBUG)
+LOG = logging.getLogger(__name__)
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--input_file", help="Path to the input file")
+    parser.add_argument("--target_col", help="Column number of the target")
+    parser.add_argument("--output_dir",
+                        help="Path to the output directory")
+    parser.add_argument("--model_type",
+                        choices=["classification", "regression"],
+                        help="Type of the model")
+    parser.add_argument("--train_size", type=float,
+                        default=None,
+                        help="Train size for PyCaret setup")
+    parser.add_argument("--normalize", action="store_true",
+                        default=None,
+                        help="Normalize data for PyCaret setup")
+    parser.add_argument("--feature_selection", action="store_true",
+                        default=None,
+                        help="Perform feature selection for PyCaret setup")
+    parser.add_argument("--cross_validation", action="store_true",
+                        default=None,
+                        help="Perform cross-validation for PyCaret setup")
+    parser.add_argument("--cross_validation_folds", type=int,
+                        default=None,
+                        help="Number of cross-validation folds \
+                          for PyCaret setup")
+    parser.add_argument("--remove_outliers", action="store_true",
+                        default=None,
+                        help="Remove outliers for PyCaret setup")
+    parser.add_argument("--remove_multicollinearity", action="store_true",
+                        default=None,
+                        help="Remove multicollinearity for PyCaret setup")
+    parser.add_argument("--polynomial_features", action="store_true",
+                        default=None,
+                        help="Generate polynomial features for PyCaret setup")
+    parser.add_argument("--feature_interaction", action="store_true",
+                        default=None,
+                        help="Generate feature interactions for PyCaret setup")
+    parser.add_argument("--feature_ratio", action="store_true",
+                        default=None,
+                        help="Generate feature ratios for PyCaret setup")
+    parser.add_argument("--fix_imbalance", action="store_true",
+                        default=None,
+                        help="Fix class imbalance for PyCaret setup")
+    parser.add_argument("--models", nargs='+',
+                        default=None,
+                        help="Selected models for training")
+    parser.add_argument("--random_seed", type=int,
+                        default=42,
+                        help="Random seed for PyCaret setup")
+    parser.add_argument("--test_file", type=str, default=None,
+                        help="Path to the test data file")
+
+    args = parser.parse_args()
+
+    model_kwargs = {
+        "train_size": args.train_size,
+        "normalize": args.normalize,
+        "feature_selection": args.feature_selection,
+        "cross_validation": args.cross_validation,
+        "cross_validation_folds": args.cross_validation_folds,
+        "remove_outliers": args.remove_outliers,
+        "remove_multicollinearity": args.remove_multicollinearity,
+        "polynomial_features": args.polynomial_features,
+        "feature_interaction": args.feature_interaction,
+        "feature_ratio": args.feature_ratio,
+        "fix_imbalance": args.fix_imbalance,
+    }
+    LOG.info(f"Model kwargs: {model_kwargs}")
+
+    # Remove None values from model_kwargs
+
+    LOG.info(f"Model kwargs 2: {model_kwargs}")
+    if args.models:
+        model_kwargs["models"] = args.models[0].split(",")
+
+    model_kwargs = {k: v for k, v in model_kwargs.items() if v is not None}
+
+    if args.model_type == "classification":
+        trainer = ClassificationModelTrainer(
+            args.input_file,
+            args.target_col,
+            args.output_dir,
+            args.model_type,
+            args.random_seed,
+            args.test_file,
+            **model_kwargs)
+    elif args.model_type == "regression":
+        if "fix_imbalance" in model_kwargs:
+            del model_kwargs["fix_imbalance"]
+        trainer = RegressionModelTrainer(
+            args.input_file,
+            args.target_col,
+            args.output_dir,
+            args.model_type,
+            args.random_seed,
+            args.test_file,
+            **model_kwargs)
+    else:
+        LOG.error("Invalid model type. Please choose \
+                  'classification' or 'regression'.")
+        return
+    trainer.run()
+
+
+if __name__ == "__main__":
+    main()