Mercurial > repos > ufz > dfpl_train
comparison dfpl_train.xml @ 0:e0bb949eac45 draft default tip
planemo upload for repository https://github.com/Helmholtz-UFZ/galaxy-tools/tree/main/tools/dfpl commit 66c6acfeff5441c36fba97787ddc5ee3d6a4a6ec
| author | ufz |
|---|---|
| date | Thu, 19 Dec 2024 12:51:21 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:e0bb949eac45 |
|---|---|
| 1 <tool id="dfpl_train" name="deepFPlearn train" version="@TOOL_VERSION@+galaxy0" profile="23.0"> | |
| 2 <description>model to predict association of molecular structures to biological targets</description> | |
| 3 <macros> | |
| 4 <import>macros.xml</import> | |
| 5 </macros> | |
| 6 <expand macro="requirements"/> | |
| 7 <command detect_errors="exit_code"><![CDATA[ | |
| 8 set -o pipefail; | |
| 9 cat '$inputs' | |
| 10 | python '$__tool_directory__/json_flatten.py' | |
| 11 | python '$__tool_directory__/json_train.py' | |
| 12 > config.json && | |
| 13 mkdir -p 'autoencoder' && | |
| 14 mkdir -p 'model' && | |
| 15 dfpl train --configFile config.json && | |
| 16 cp 'autoencoder/encoder_weights.h5' '$output_autoencoder_weights' && | |
| 17 cp 'model/${model_configuration.target}/model_weights.h5' '$output_model_weights' | |
| 18 ]]></command> | |
| 19 <configfiles> | |
| 20 <inputs name="inputs" data_style="paths"/> | |
| 21 </configfiles> | |
| 22 <inputs> | |
| 23 <section name="model_configuration" title="Model Configuration" expanded="true"> | |
| 24 <param label="Input File" argument="--inputFile" | |
| 25 type="data" format="csv" optional="false" | |
| 26 help="The file containing the data for training in comma separated CSV format. The first column should be smiles"/> | |
| 27 <param label="Target" name="target" | |
| 28 type="text" optional="false" | |
| 29 help="The target column in the input file that should be trained for"> | |
| 30 <validator type="empty_field" message="A column name must be specified"/> | |
| 31 </param> | |
| 32 <param label="Chemical Representation" argument="--type" | |
| 33 type="select" optional="true" | |
| 34 help="Type of the chemical representation"> | |
| 35 <option value="fp" selected="true">fp</option> | |
| 36 <option value="smiles">smiles</option> | |
| 37 </param> | |
| 38 <param label="Classification Threshold" argument="--threshold" | |
| 39 type="float" min="0" max="1" value="0.5" optional="true" | |
| 40 help="Threshold for binary classification"/> | |
| 41 <param label="Fingerprint Type" | |
| 42 argument="--fpType" | |
| 43 optional="true" | |
| 44 type="select" | |
| 45 help="The type of fingerprint to be generated/used in input file"> | |
| 46 <option value="topological" selected="true">topological</option> | |
| 47 <option value="MACCS">MACCS</option> | |
| 48 </param> | |
| 49 <param label="Fingerprint Size" argument="--fpSize" | |
| 50 type="integer" min="1" value="2048" optional="true" | |
| 51 help="Length of the fingerprint that should be generated"/> | |
| 52 <param label="Multi-Label Classification" argument="--enableMultiLabel" | |
| 53 type="boolean" | |
| 54 checked="false" | |
| 55 help="Train multi-label classification model"/> | |
| 56 </section> | |
| 57 <section name="training_configuration" title="Training Configuration" expanded="true"> | |
| 58 <param label="Split Type" argument="--split_type" type="select" optional="true" | |
| 59 help="Set how the data is split for the feedforward neural network"> | |
| 60 <option value="scaffold_balanced">Scaffold Balanced</option> | |
| 61 <option value="random" selected="true">Random</option> | |
| 62 <option value="molecular_weight">Molecular Weight</option> | |
| 63 </param> | |
| 64 <param label="Test Size" argument="--testSize" | |
| 65 type="float" min="0" max="1" value="0.2" optional="true" | |
| 66 help="Fraction of the dataset that should be used for testing"/> | |
| 67 <param label="kFolds Cross-Validation" argument="--kFolds" | |
| 68 type="integer" value="1" min="1" optional="true" | |
| 69 help="Number of folds for cross-validation"/> | |
| 70 <param label="Train FNN" argument="--trainFNN" | |
| 71 type="boolean" checked="true" | |
| 72 help="Train the feedforward neural network (FNN). Uncheck to skip the FNN training"/> | |
| 73 <param label="Sample Down" argument="--sampleDown" | |
| 74 type="boolean" | |
| 75 help="Down sampling of the 0-valued samples"/> | |
| 76 <param label="Sample Fraction Ones" argument="--sampleFractionOnes" | |
| 77 type="float" min="0" max="1" value="0.5" optional="true" | |
| 78 help="The desired fraction of 1s to 0s. Only works if --sampleDown is enabled"/> | |
| 79 <param label="Epochs" argument="--epochs" | |
| 80 type="integer" min="10" value="100" optional="true" | |
| 81 help="Number of epochs for the FNN training"/> | |
| 82 <param label="Loss Function" argument="--lossFunction" | |
| 83 type="select" optional="true" | |
| 84 help="Loss function for FNN training. mse - mean squared error, bce - binary cross entropy, focal - focal loss"> | |
| 85 <option value="mse">MSE</option> | |
| 86 <option value="bce" selected="true">BCE</option> | |
| 87 <option value="focal">Focal</option> | |
| 88 </param> | |
| 89 <param label="Optimizer" argument="--optimizer" | |
| 90 type="select" optional="true" | |
| 91 help="Optimizer of the FNN"> | |
| 92 <option value="Adam" selected="true">Adam</option> | |
| 93 <option value="SGD">SGD</option> | |
| 94 </param> | |
| 95 <param label="Batch Size" argument="--batchSize" | |
| 96 type="integer" min="1" value="128" optional="true" | |
| 97 help="Batch size in FNN training"/> | |
| 98 <param label="L2 Regularization" argument="--l2reg" | |
| 99 type="float" min="0" value="0.001" optional="true" | |
| 100 help="Value for l2 kernel regularizer"/> | |
| 101 <param label="Dropout" argument="--dropout" | |
| 102 type="float" min="0" max="1" value="0.2" optional="true" | |
| 103 help="The fraction of data that is dropped out in each dropout layer"/> | |
| 104 <param label="Learning Rate" argument="--learningRate" | |
| 105 type="float" min="0" value="2.2e-05" optional="true" | |
| 106 help="Learning rate size in FNN training"/> | |
| 107 <param label="Learning Rate Decay" argument="--learningRateDecay" | |
| 108 type="float" min="0" max="1" value="0.96" optional="true" | |
| 109 help="Learning rate decay in FNN training"/> | |
| 110 <param label="Activation Function" argument="--activationFunction" | |
| 111 type="select" optional="true" | |
| 112 help="The activation function of the FNN"> | |
| 113 <option value="relu" selected="true">ReLU</option> | |
| 114 <option value="selu">SELU</option> | |
| 115 </param> | |
| 116 </section> | |
| 117 <conditional name="autoencoder"> | |
| 118 <param label="Compress Fingerprints with Autoencoder" argument="--compressFeatures" | |
| 119 type="select" | |
| 120 help="Compress the fingerprints using an autoencoder. | |
| 121 Either uses an already trained autoencoder (requires a weights file) | |
| 122 or creates and trains a new autoencoder."> | |
| 123 <option value="true">Compress fingerprints</option> | |
| 124 <option value="false">Use raw fingerprints</option> | |
| 125 </param> | |
| 126 <when value="true"> | |
| 127 <conditional name="train-autoencoder"> | |
| 128 <param label="Load / Train Autoencoder" argument="--trainAC" | |
| 129 type="select" | |
| 130 help="Select if a new autoencoder should be trained | |
| 131 or if you want to provide the weights of a trained autoencoder yourself"> | |
| 132 <option value="true">Train new autoencoder</option> | |
| 133 <option value="false">Load autoencoder from file</option> | |
| 134 </param> | |
| 135 <when value="false"> | |
| 136 <param label="Encoder Weights File" argument="--ecWeightsFile" | |
| 137 type="data" format="h5" | |
| 138 help="The HDF5 (.h5) file containing the weights of a trained encoder"/> | |
| 139 </when> | |
| 140 <when value="true"> | |
| 141 <param label="Autoencoder Type" argument="--aeType" | |
| 142 type="select" optional="true" | |
| 143 help="Autoencoder type, variational or deterministic"> | |
| 144 <option value="variational">Variational</option> | |
| 145 <option value="deterministic" selected="true">Deterministic</option> | |
| 146 </param> | |
| 147 <param label="Epochs" argument="--aeEpochs" | |
| 148 type="integer" min="5" value="100" optional="true" | |
| 149 help="Number of epochs for autoencoder training"/> | |
| 150 <param label="Batch Size" argument="--aeBatchSize" | |
| 151 type="integer" min="1" value="512" optional="true" | |
| 152 help="Batch size in autoencoder training"/> | |
| 153 <param label="Learning Rate" argument="--aeLearningRate" | |
| 154 type="float" min="0" value="0.001" optional="true" | |
| 155 help="Learning rate for autoencoder training"/> | |
| 156 <param label="Learning Rate Decay" argument="--aeLearningRateDecay" | |
| 157 type="float" value="0.96" min="0" max="1" optional="true" | |
| 158 help="Learning rate decay for autoencoder training"/> | |
| 159 <param label="Split Type" argument="--aeSplitType" | |
| 160 type="select" optional="true" | |
| 161 help="Set how the data is split for the autoencoder"> | |
| 162 <option value="scaffold_balanced">Scaffold Balanced</option> | |
| 163 <option value="random" selected="true">Random</option> | |
| 164 <option value="molecular_weight">Molecular Weight</option> | |
| 165 </param> | |
| 166 <param label="FNN Type" argument="--fnnType" | |
| 167 type="select" optional="true" | |
| 168 help="The type of the feedforward neural network"> | |
| 169 <option value="FNN" selected="true">FNN</option> | |
| 170 <option value="SNN">SNN</option> | |
| 171 </param> | |
| 172 <param label="Encoded Fingerprint Size" argument="--encFPSize" | |
| 173 type="integer" min="1" value="256" optional="true" | |
| 174 help="Size of encoded fingerprint (z-layer of autoencoder)"/> | |
| 175 <param label="Activation Function" argument="--aeActivationFunction" | |
| 176 type="select" optional="true" | |
| 177 help="The activation function of the autoencoder"> | |
| 178 <option value="relu" selected="true">ReLU</option> | |
| 179 <option value="selu">SELU</option> | |
| 180 </param> | |
| 181 <param label="Visualize Latent Space" argument="--visualizeLatent" | |
| 182 type="boolean" checked="false" | |
| 183 help="UMAP the latent space for exploration"/> | |
| 184 </when> | |
| 185 </conditional> | |
| 186 </when> | |
| 187 <when value="false"/> | |
| 188 </conditional> | |
| 189 <section title="Logging" name="logging_configuration" expanded="false"> | |
| 190 <param label="Verbosity Level" argument="--verbose" | |
| 191 type="select" optional="true" | |
| 192 help="Verbosity level of output"> | |
| 193 <option value="0">0: No additional output</option> | |
| 194 <option value="1">1: Some additional output</option> | |
| 195 <option value="2">2: Full additional output</option> | |
| 196 </param> | |
| 197 <!-- TODO: Weights & Biases tracking is disabled. The argument values below omit the leading double hyphen of the long options because XML comments must not contain one; restore it when re-enabling. --> | |
| 198 <!-- <section name="tracking_configuration" title="Weights & Biases" expanded="true">--> | |
| 199 <!-- <param label="Target" argument="wabTarget" type="text" optional="true"--> | |
| 200 <!-- help="Which endpoint to use for tracking performance via Weights & Biases. Should match the column name"/>--> | |
| 201 <!-- <param label="Track FNN" argument="wabTracking"--> | |
| 202 <!-- type="boolean"--> | |
| 203 <!-- help="Track FNN performance via Weights & Biases"/>--> | |
| 204 <!-- <param label="Track Autoencoder" argument="aeWabTracking"--> | |
| 205 <!-- type="boolean"--> | |
| 206 <!-- help="Track autoencoder performance via Weights & Biases"/>--> | |
| 207 <!-- </section>--> | |
| 208 </section> | |
| 209 </inputs> | |
| 210 <outputs> | |
| 211 <!-- todo: filter -> let user decide if they want output svg/csv or nothing --> | |
| 212 <!-- <data name="loss_table" format="csv" from_work_dir="output/" label="${tool.name} on ${on_string}: csv">--> | |
| 213 <!-- </data>--> | |
| 214 <!-- <data name="loss_diagram" format="svg" from_work_dir="output/" label="${tool.name} on ${on_string}: svg">--> | |
| 215 <!-- </data>--> | |
| 216 <data name="output_model_weights" label="${tool.name} on ${on_string}: model weights" | |
| 217 format="h5"/> | |
| 218 <data name="output_autoencoder_weights" label="${tool.name} on ${on_string}: autoencoder weights" | |
| 219 format="h5"/> | |
| 220 </outputs> | |
| 221 <tests> | |
| 222 <test> | |
| 223 <section name="model_configuration"> | |
| 224 <param name="inputFile" value="S_dataset.csv"/> | |
| 225 <param name="target" value="Aromatase"/> | |
| 226 <param name="type" value="smiles"/> | |
| 227 <param name="fpType" value="topological"/> | |
| 228 <param name="fpSize" value="2048"/> | |
| 229 <param name="enableMultiLabel" value="false"/> | |
| 230 <param name="threshold" value="0.5"/> | |
| 231 </section> | |
| 232 <section name="training_configuration"> | |
| 233 <param name="split_type" value="random"/> | |
| 234 <param name="sampleFractionOnes" value="0"/> | |
| 235 <param name="sampleDown" value="false"/> | |
| 236 <param name="trainFNN" value="true"/> | |
| 237 <param name="kFolds" value="1"/> | |
| 238 <param name="testSize" value="0.2"/> | |
| 239 <param name="optimizer" value="Adam"/> | |
| 240 <param name="lossFunction" value="bce"/> | |
| 241 <param name="epochs" value="10"/> | |
| 242 <param name="batchSize" value="128"/> | |
| 243 <param name="activationFunction" value="selu"/> | |
| 244 <param name="dropout" value="0.0107"/> | |
| 245 <param name="learningRate" value="2.2e-06"/> | |
| 246 <param name="l2reg" value="0.001"/> | |
| 247 </section> | |
| 248 <conditional name="autoencoder"> | |
| 249 <param name="compressFeatures" value="true"/> | |
| 250 <conditional name="train-autoencoder"> | |
| 251 <param name="trainAC" value="true"/> | |
| 252 <param name="encFPSize" value="256"/> | |
| 253 <param name="aeSplitType" value="random"/> | |
| 254 <param name="aeEpochs" value="5"/> | |
| 255 <param name="aeBatchSize" value="351"/> | |
| 256 <param name="aeActivationFunction" value="relu"/> | |
| 257 <param name="aeLearningRate" value="0.001"/> | |
| 258 <param name="aeLearningRateDecay" value="0.0001"/> | |
| 259 <param name="aeType" value="deterministic"/> | |
| 260 <param name="fnnType" value="FNN"/> | |
| 261 </conditional> | |
| 262 </conditional> | |
| 263 <!-- <param name="aeWabTracking" value="false"/> | |
| 264 <param name="wabTracking" value="false"/> --> | |
| 265 <section name="logging_configuration"> | |
| 266 <param name="verbose" value="2"/> | |
| 267 </section> | |
| 268 <!-- todo: add tests for svg, csv --> | |
| 269 <output name="output_model_weights"> | |
| 270 <assert_contents> | |
| 271 <has_h5_keys keys="alpha_dropout_18,alpha_dropout_19,alpha_dropout_20,dense_30,dense_31,dense_32,dense_33,top_level_model_weights"/> | |
| 272 </assert_contents> | |
| 273 </output> | |
| 274 <assert_stdout> | |
| 275 <has_text text="Evaluating trained model"/> | |
| 276 </assert_stdout> | |
| 277 </test> | |
| 278 </tests> | |
| 279 <help><![CDATA[ | |
| 280 This tool is the train mode of `deepFPlearn <https://github.com/yigbt/deepFPlearn>`_. | |
| 281 It's equivalent to running ``dfpl train`` from the command line. | |
| 282 | |
| 283 The train mode is used to train models to predict the association of molecular structures to biological targets. | |
| 284 The encoding of the molecules is done based on molecular fingerprints. | |
| 285 | |
| 286 The training data contains three targets and you may train models for each with this tool. | |
| 287 | |
| 288 The tool will generate the following outputs: | |
| 289 | |
| 290 - the weights of the trained FNN as an HDF5 file (``.h5``) | |
| 291 | |
| 292 - the weights of the trained autoencoder as an HDF5 file (``.h5``) | |
| 293 | |
| 294 The following outputs are not yet provided by this tool: | |
| 295 | |
| 296 - the training histories as tabular data (``.csv``) | |
| 297 | |
| 298 - the training histories as a plot (``.svg``) | |
| 299 ]]></help> | |
| 300 <expand macro="citations"/> | |
| 301 </tool> |
