Mercurial > repos > recetox > spec2vec_training
comparison spec2vec_training.xml @ 0:e1e22ada831e draft
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/spec2vec commit 2e4bdc2fd94445aa5a8d1882a3d092cca727e4b6
| author | recetox |
|---|---|
| date | Thu, 05 Jan 2023 10:08:12 +0000 |
| parents | |
| children | 9d917de87cca |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:e1e22ada831e |
|---|---|
| 1 <tool id="spec2vec_training" name="Spec2Vec Model Training" version="@TOOL_VERSION@-@TOOL_DEV_VERSION@+galaxy0" python_template_version="3.5" profile="21.05"> | |
| 2 <description>Train a Spec2Vec model for mass spectra similarity scoring</description> | |
| 3 | |
| 4 <macros> | |
| 5 <import>macros.xml</import> | |
| 6 </macros> | |
| 7 <expand macro="creator"/> | |
| 8 | |
| 9 <requirements> | |
| 10 <container type="docker">recetox/spec2vec:@COMMIT_SHA@</container> | |
| 11 </requirements> | |
| 12 <!-- Creates '<weights>.npy' as a symlink to the weights dataset (presumably because the wrapper writes the weights with a .npy suffix; confirm), then runs the script rendered from the spec2vec_python_cli configfile. All paths are quoted. --> | |
| 13 <command detect_errors="exit_code"><![CDATA[ | |
| 14 ln -fs '${weights_filename}' '${weights_filename}.npy' && | |
| 15 sh '${spec2vec_python_cli}' | |
| 16 ]]></command> | |
| 17 <!-- Shell script executed by the command section: calls the training wrapper with the selected options. Select values are compared as strings because a select wrapper holding '0' is still truthy, so 'not $param' never matches. --> | |
| 18 <configfiles> | |
| 19 <configfile name="spec2vec_python_cli"> | |
| 20 python3 '${__tool_directory__}/spec2vec_training_wrapper.py' \ | |
| 21 --spectra_filename '$spectra_filename' \ | |
| 22 --spectra_fileformat '$spectra_filename.ext' \ | |
| 23 #if $output_parameters.model_checkpoints.save_checkpoints == 'TRUE' | |
| 24 --checkpoints '$output_parameters.model_checkpoints.checkpoints' \ | |
| 25 #else | |
| 26 --epochs $output_parameters.model_checkpoints.epochs \ | |
| 27 #end if | |
| 28 --vector_size $training_parameters.vector_size \ | |
| 29 --alpha $training_parameters.alpha \ | |
| 30 --min_alpha $training_parameters.min_alpha \ | |
| 31 --window $training_parameters.window \ | |
| 32 --min_count $training_parameters.min_count \ | |
| 33 --sample $training_parameters.sample \ | |
| 34 --seed $training_parameters.seed \ | |
| 35 --sg $training_parameters.sg_param.sg \ | |
| 36 #if str($training_parameters.sg_param.sg) == '0' | |
| 37 --cbow_mean $training_parameters.sg_param.cbow_mean \ | |
| 38 #end if | |
| 39 --hs $training_parameters.hs_param.hs \ | |
| 40 #if str($training_parameters.hs_param.hs) == '0' | |
| 41 --negative $training_parameters.hs_param.negative \ | |
| 42 --ns_exponent $training_parameters.hs_param.ns_exponent \ | |
| 43 #end if | |
| 44 --sorted_vocab $training_parameters.sorted_vocab \ | |
| 45 --batch_words $training_parameters.batch_words \ | |
| 46 --shrink_windows $training_parameters.shrink_windows \ | |
| 47 #if $training_parameters.trim_vocab.max_vocab_size_bool == 'TRUE' | |
| 48 --max_vocab_size $training_parameters.trim_vocab.max_vocab_size \ | |
| 49 #end if | |
| 50 --n_decimals $training_parameters.n_decimals \ | |
| 51 --n_workers \${GALAXY_SLOTS:-1} \ | |
| 52 #if $output_parameters.as_pickle | |
| 53 --model_filename_pickle '$model_filename_pickle' \ | |
| 54 #end if | |
| 55 --model_filename '$model_filename' \ | |
| 56 --weights_filename '$weights_filename' \ | |
| 57 </configfile> | |
| 58 </configfiles> | |
| 59 | |
| 60 <inputs> | |
| 61 <param name="spectra_filename" type="data" format="msp,mgf" label="Training spectra" | |
| 62 help="Spectra file to train a Spec2Vec model."/> | |
| 63 <!-- Output options: optional Pickle export, and either a fixed number of epochs (at least 1) or a comma-separated list of checkpoint epochs. --> | |
| 64 <section title="Output parameters" name="output_parameters" expanded="true"> | |
| 65 <param label="Save model as Pickle file" name="as_pickle" type="boolean" checked="false" truevalue="TRUE" falsevalue="FALSE" | |
| 66 help="Add a Pickle output besides default JSON."/> | |
| 67 <conditional name="model_checkpoints"> | |
| 68 <param label="Model checkpoints" name="save_checkpoints" type="select" display="radio" | |
| 69 help="Epochs after which to save a model."> | |
| 70 <option value="TRUE">Yes</option> | |
| 71 <option value="FALSE" selected="true">No</option> | |
| 72 </param> | |
| 73 <when value="TRUE"> | |
| 74 <param label="Number of training epochs with checkpoints" name="checkpoints" type="text" value="10,20,50" | |
| 75 help="Comma-separated epoch numbers after which to save a model. The highest number will be used as a total number of epochs for training."> | |
| 76 <validator type="empty_field"/> | |
| 77 <validator type="regex" | |
| 78 message="The input has to be a comma-separated sequence of integers without trailing commas. For example: 10,20,50">^[0-9]+(,[0-9]+)*$</validator> | |
| 79 </param> | |
| 80 </when> | |
| 81 <when value="FALSE"> | |
| 82 <param label="Number of training epochs" name="epochs" type="integer" value="10" | |
| 83 min="1" help="Number of epochs to train the model."/> | |
| 84 </when> | |
| 85 </conditional> | |
| 86 </section> | |
| 87 <!-- Training hyperparameters passed to the wrapper script; the nested conditionals expose CBOW and negative-sampling options only for the matching select value. --> | |
| 88 <section title="Training hyperparameters" name="training_parameters" expanded="true"> | |
| 89 <param label="Vector size" name="vector_size" type="integer" value="300" | |
| 90 min="1" help="Dimensionality of the feature vectors (i.e., into how many dimensions to encode each m/z and neutral loss peak)."/> | |
| 91 <param label="Alpha" name="alpha" type="float" value="0.025" | |
| 92 min="0" help="The initial learning rate."/> | |
| 93 <param label="Minimum Alpha" name="min_alpha" type="float" value="0.00025" | |
| 94 min="0" help="Learning rate will linearly drop to this value as training progresses."/> | |
| 95 <param label="Window" name="window" type="integer" value="500" | |
| 96 help="Maximum distance between the current and predicted peak within a spectrum."/> | |
| 97 <param label="Minimum peak count" name="min_count" type="integer" value="1" | |
| 98 min="0" help="Ignores all peaks with absolute frequency lower than this."/> | |
| 99 <param label="Sample" name="sample" type="float" value="0.001" | |
| 100 help="The threshold for configuring which higher-frequency peaks are randomly downsampled."/> | |
| 101 <param label="Seed" name="seed" type="integer" value="1" | |
| 102 help="Seed of random number generator for model reproducibility."/> | |
| 103 <conditional name="sg_param"> | |
| 104 <param label="Word-Embedding type" name="sg" type="select" | |
| 105 help="Embedding type: Skip-gram or Continuous Bag of Words"> | |
| 106 <option value="0">CBOW</option> | |
| 107 <option value="1">Skip-gram</option> | |
| 108 </param> | |
| 109 <when value="0"> | |
| 110 <param label="CBOW mean" name="cbow_mean" type="select" | |
| 111 help="Whether to use the sum of the context word vectors or their mean."> | |
| 112 <option value="0">Sum</option> | |
| 113 <option value="1" selected="true">Mean</option> | |
| 114 </param> | |
| 115 </when> | |
| 116 </conditional> | |
| 117 <conditional name="hs_param"> | |
| 118 <param label="Last Layer Activation" name="hs" type="select" | |
| 119 help="Activation function of the last layer of the neural network. Negative sampling is more computationally efficient."> | |
| 120 <option value="0">Negative Sampling</option> | |
| 121 <option value="1">Hierarchical Softmax</option> | |
| 122 </param> | |
| 123 <when value="0"> | |
| 124 <param label="Negative Samples" name="negative" type="integer" value="5" | |
| 125 min="1" help="Specify how many 'negative' examples should be drawn for each peak and neutral loss (usually between 5-20)."> | |
| 126 <validator type="in_range" min="1" message="The value must be larger than 0."/> | |
| 127 </param> | |
| 128 <param label="Negative Sample Exponent" name="ns_exponent" type="float" value="0.75" | |
| 129 help="The exponent used to shape the negative sampling distribution. A value of 1.0 samples exactly in proportion to the frequencies, | |
| 130 0.0 samples all peaks and neutral losses equally, while a negative value samples low-frequency peaks more often than high-frequency peaks."> | |
| 131 <validator type="in_range" min="-1.0" max="1.0" message="The value must be within -1.0 and 1.0 range."/> | |
| 132 </param> | |
| 133 </when> | |
| 134 </conditional> | |
| 135 <param label="Sort the vocabulary of spectra" name="sorted_vocab" type="boolean" checked="true" truevalue="TRUE" falsevalue="FALSE" | |
| 136 help="If true, sort the vocabulary by descending frequency before assigning peak and neutral loss indices."/> | |
| 137 <param label="Batch size" name="batch_words" type="integer" value="10000" | |
| 138 help="Target size (in peaks and neutral losses) for batches of examples passed to worker threads (and thus cython routines). | |
| 139 Larger batches will be passed if individual peak sequences are longer than 10000 words, but the standard cython code truncates to that maximum."/> | |
| 140 <param label="Shrink windows" name="shrink_windows" type="boolean" checked="true" truevalue="TRUE" falsevalue="FALSE" | |
| 141 help="EXPERIMENTAL. If true, the effective window size is uniformly sampled in range [1,Window] for each target peak during training."/> | |
| 142 <conditional name="trim_vocab"> | |
| 143 <param label="Limit unique peaks and neutral losses in the spectral vocabulary" name="max_vocab_size_bool" type="select" display="radio" | |
| 144 help="Limits the RAM during vocabulary building; if there are more unique peaks and neutral losses than this, then prune the infrequent ones. Disable for no limit (default)."> | |
| 145 <option value="FALSE">No limit</option> | |
| 146 <option value="TRUE">Limit</option> | |
| 147 </param> | |
| 148 <when value="TRUE"> | |
| 149 <param label="Maximum unique peaks and neutral losses" name="max_vocab_size" type="integer" value="100000" min="1"/> | |
| 150 </when> | |
| 151 </conditional> | |
| 152 <param label="Number of decimals to round m/z values" name="n_decimals" type="integer" value="2" | |
| 153 min="0" max="5" help="Rounds peak position to this number of decimals."/> | |
| 154 </section> | |
| 155 </inputs> | |
| 156 <!-- The model JSON and the weights are always declared; the Pickle file and the checkpoint collection are kept only when their filters evaluate to true. --> | |
| 157 <outputs> | |
| 158 <data name="model_filename" format="json" label="Spec2Vec model on ${on_string}"/> | |
| 159 <data name="weights_filename" format="binary" label="Spec2Vec weights on ${on_string}"/> | |
| 160 <data name="model_filename_pickle" format="binary" label="Spec2Vec pickle model on ${on_string}"> | |
| 161 <filter>output_parameters['as_pickle']</filter> | |
| 162 </data> | |
| 163 <collection name="model_checkpoints" label="Spec2Vec model checkpoints on ${on_string}" type="list"> | |
| 164 <filter>output_parameters['model_checkpoints']['save_checkpoints'] == 'TRUE'</filter> | |
| 165 <discover_datasets pattern="__name_and_ext__"/> | |
| 166 </collection> | |
| 167 </outputs> | |
| 168 | |
| 169 <tests> | |
| 170 <test expect_num_outputs="2"> <!-- Test 1: all defaults; only the model JSON and the weights are expected --> | |
| 171 <param name="spectra_filename" ftype="msp" value="RECETOX_Exposome_pesticides_HR_MS_normalized_20220323.msp"/> | |
| 172 <output name="model_filename" ftype="json" file="model.json"/> | |
| 173 <output name="weights_filename" ftype="binary"> | |
| 174 <assert_contents> | |
| 175 <has_size value="1708000" delta="1000"/> | |
| 176 <has_text text="'shape': (1423, 300)" n="1"/> | |
| 177 </assert_contents> | |
| 178 </output> | |
| 179 </test> | |
| 180 <test expect_num_outputs="3"> <!-- Test 2: as_pickle enabled; the Pickle model is expected as a third output --> | |
| 181 <param name="spectra_filename" ftype="msp" value="RECETOX_Exposome_pesticides_HR_MS_normalized_20220323.msp"/> | |
| 182 <param name="as_pickle" value="TRUE"/> | |
| 183 <output name="model_filename" ftype="json" file="model.json"/> | |
| 184 <output name="weights_filename" ftype="binary"> | |
| 185 <assert_contents> | |
| 186 <has_size value="1708000" delta="1000"/> | |
| 187 <has_text text="'shape': (1423, 300)" n="1"/> | |
| 188 </assert_contents> | |
| 189 </output> | |
| 190 <output name="model_filename_pickle" ftype="binary"> | |
| 191 <assert_contents> | |
| 192 <has_size value="3468000" delta="1000"/> | |
| 193 <has_text text="gensim.models.word2vec"/> | |
| 194 <has_text text="peak@" n="1423"/> | |
| 195 </assert_contents> | |
| 196 </output> | |
| 197 </test> | |
| 198 <test expect_num_outputs="3"> <!-- Test 3: checkpoints 1,5,8,10 requested; the collection is expected to hold 3 elements --> | |
| 199 <param name="spectra_filename" ftype="msp" value="RECETOX_Exposome_pesticides_HR_MS_normalized_20220323.msp"/> | |
| 200 <conditional name="model_checkpoints"> | |
| 201 <param name="save_checkpoints" value="TRUE"/> | |
| 202 <param name="checkpoints" value="1,5,8,10"/> | |
| 203 </conditional> | |
| 204 <output name="model_filename" ftype="json" file="model.json"/> | |
| 205 <output name="weights_filename" ftype="binary"> | |
| 206 <assert_contents> | |
| 207 <has_size value="1708000" delta="1000"/> | |
| 208 <has_text text="'shape': (1423, 300)" n="1"/> | |
| 209 </assert_contents> | |
| 210 </output> | |
| 211 <output_collection name="model_checkpoints" type="list" count="3"> | |
| 212 <element name="spec2vec_iter_1"> | |
| 213 <assert_contents> | |
| 214 <has_size value="3468000" delta="1000"/> | |
| 215 <has_text text="gensim.models.word2vec"/> | |
| 216 <has_text text="peak@" n="1423"/> | |
| 217 </assert_contents> | |
| 218 </element> | |
| 219 <element name="spec2vec_iter_5"> | |
| 220 <assert_contents> | |
| 221 <has_size value="3468000" delta="1000"/> | |
| 222 <has_text text="gensim.models.word2vec"/> | |
| 223 <has_text text="peak@" n="1423"/> | |
| 224 </assert_contents> | |
| 225 </element> | |
| 226 <element name="spec2vec_iter_8"> | |
| 227 <assert_contents> | |
| 228 <has_size value="3468000" delta="1000"/> | |
| 229 <has_text text="gensim.models.word2vec"/> | |
| 230 <has_text text="peak@" n="1423"/> | |
| 231 </assert_contents> | |
| 232 </element> | |
| 233 </output_collection> | |
| 234 </test> | |
| 235 <test expect_num_outputs="2"> <!-- Test 4: embeddings size in output corresponds to `vector_size` param; both filtered outputs stay disabled, so 2 outputs are expected --> | |
| 236 <param name="spectra_filename" value="RECETOX_Exposome_pesticides_HR_MS_normalized_20220323.msp" ftype="msp"/> | |
| 237 <param name="vector_size" value="100"/> | |
| 238 <output name="model_filename" file="model_vector_size_100.json" ftype="json"/> | |
| 239 <output name="weights_filename" ftype="binary"> | |
| 240 <assert_contents> | |
| 241 <has_size value="569000" delta="1000"/> | |
| 242 <has_text text="'shape': (1423, 100)" n="1"/> | |
| 243 </assert_contents> | |
| 244 </output> | |
| 245 </test> | |
| 246 </tests> | |
| 247 | |
| 248 <help><![CDATA[ | |
| 249 **Spec2vec** is a spectral similarity score inspired by a natural language processing algorithm – Word2Vec. | |
| 250 Where Word2Vec learns relationships between words in sentences, spec2vec does so for mass fragments and neutral losses in MS/MS spectra. | |
| 251 The spectral similarity score is based on spectral embeddings learnt from the fragmental relationships within a large set of spectral data. | |
| 252 ]]></help> | |
| 253 | |
| 254 <citations> | |
| 255 <citation type="doi">10.1371/journal.pcbi.1008724</citation> | |
| 256 </citations> | |
| 257 </tool> |
