Mercurial > repos > bgruening > create_tool_recommendation_model
comparison create_tool_recommendation_model.xml @ 2:76251d1ccdcc draft
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
author | bgruening |
---|---|
date | Fri, 11 Oct 2019 18:24:54 -0400 |
parents | 12764915e1c5 |
children | 5b3c08710e47 |
comparison
equal
deleted
inserted
replaced
1:12764915e1c5 | 2:76251d1ccdcc |
---|---|
1 <tool id="create_tool_recommendation_model" name="Create a model to recommend tools" version="0.0.1"> | 1 <tool id="create_tool_recommendation_model" name="Create a model to recommend tools" version="0.0.1"> |
2 <description>using deep learning</description> | 2 <description>using deep learning</description> |
3 <requirements> | 3 <requirements> |
4 <requirement type="package" version="3.6">python</requirement> | 4 <requirement type="package" version="3.6">python</requirement> |
5 <requirement type="package" version="1.14.0">tensorflow</requirement> | 5 <requirement type="package" version="1.13.1">tensorflow</requirement> |
6 <requirement type="package" version="2.2.4">keras</requirement> | 6 <requirement type="package" version="2.3.0">keras</requirement> |
7 <requirement type="package" version="0.20.1">scikit-learn</requirement> | 7 <requirement type="package" version="0.21.3">scikit-learn</requirement> |
8 <requirement type="package" version="2.9.0">h5py</requirement> | 8 <requirement type="package" version="2.9.0">h5py</requirement> |
9 <requirement type="package" version="1.0">csvkit</requirement> | 9 <requirement type="package" version="1.0.4">csvkit</requirement> |
10 <requirement type="package" version="0.1.2">hyperopt</requirement> | 10 <requirement type="package" version="0.1.2">hyperopt</requirement> |
11 </requirements> | 11 </requirements> |
12 <version_command>echo "@VERSION@"</version_command> | 12 <version_command>echo "@VERSION@"</version_command> |
13 <command detect_errors="aggressive"> | 13 <command detect_errors="aggressive"> |
14 <![CDATA[ | 14 <![CDATA[ |
36 </command> | 36 </command> |
37 <inputs> | 37 <inputs> |
38 <param name="input_tabular_workflows" type="data" format="tabular" label="Dataset containing workflows" help="Please provide Galaxy workflows as a tabular file."/> | 38 <param name="input_tabular_workflows" type="data" format="tabular" label="Dataset containing workflows" help="Please provide Galaxy workflows as a tabular file."/> |
39 <param name="input_tabular_tool_usage" type="data" format="tabular" label="Dataset containing usage frequencies of tools" help="Please provide tools usage frequencies as a tabular file."/> | 39 <param name="input_tabular_tool_usage" type="data" format="tabular" label="Dataset containing usage frequencies of tools" help="Please provide tools usage frequencies as a tabular file."/> |
40 <section name="data_parameters" title="Data parameters" expanded="False"> | 40 <section name="data_parameters" title="Data parameters" expanded="False"> |
41 | |
41 <param name="input_cutoff_date" type="text" value="2017-12-01" label="Cutoff date" help="Provide a date (in the past) in yyyy-mm-dd format. The earliest date from which usage of tools will be extracted. For example, 2017-12-01 specifies that the usage of tools from this date until the data extraction date is extracted. The usage of tools before this date is not considered."/> | 42 <param name="input_cutoff_date" type="text" value="2017-12-01" label="Cutoff date" help="Provide a date (in the past) in yyyy-mm-dd format. The earliest date from which usage of tools will be extracted. For example, 2017-12-01 specifies that the usage of tools from this date until the data extraction date is extracted. The usage of tools before this date is not considered."/> |
43 | |
42 <param name="input_maximum_path_length" type="integer" value="25" label="Maximum number of tools in a tool sequence" help="Provide an integer between 1 and 25. A workflow is divided into unique paths and this number specifies the maximum number of tools a path can have. Paths longer than this number are ignored and are not included in the deep learning training."/> | 44 <param name="input_maximum_path_length" type="integer" value="25" label="Maximum number of tools in a tool sequence" help="Provide an integer between 1 and 25. A workflow is divided into unique paths and this number specifies the maximum number of tools a path can have. Paths longer than this number are ignored and are not included in the deep learning training."/> |
45 | |
43 </section> | 46 </section> |
44 <section name="training_parameters" title="Training parameters" expanded="False"> | 47 <section name="training_parameters" title="Training parameters" expanded="False"> |
45 <param name="max_evals" type="integer" value="2" label="Maximum number of evaluations of different configurations of parameters" help="Provide an integer. Different combinations of parameters are sampled and optimized to find the best one. This number specifies the number of different configurations sampled and tested."/> | 48 <param name="max_evals" type="integer" value="50" label="Maximum number of evaluations of different configurations of parameters" help="Provide an integer. Different combinations of parameters are sampled and optimized to find the best one. This number specifies the number of different configurations sampled and tested."/> |
46 <param name="optimize_n_epochs" type="integer" value="2" label="Number of training iterations to optimize the neural network parameters" help="Provide an integer. This number specifies the number of training iterations done for each sampled configuration while optimising the parameters."/> | 49 |
47 <param name="n_epochs" type="integer" value="2" label="Number of training iterations" help="Provide an integer. This specifies the number of deep learning training iterations done after finding the best/optimised configuration of neural network parameters."/> | 50 <param name="optimize_n_epochs" type="integer" value="20" label="Number of training iterations to optimize the neural network parameters" help="Provide an integer. This number specifies the number of training iterations done for each sampled configuration while optimising the parameters."/> |
48 <param name="test_share" type="float" value="0.2" label="Share of the test data" help="Provide a real number between 0.0 and 1.0. This set of data is used to look through the prediction accuracy on unseen data after neural network training on an optimised configuration of parameters. It should be set to 0.0 while training for a model to be deployed to production. The minimum value can be 0.0 and maximum value should not be more than 0.5."/> | 51 |
52 <param name="n_epochs" type="integer" value="20" label="Number of training iterations" help="Provide an integer. This specifies the number of deep learning training iterations done after finding the best/optimised configuration of neural network parameters."/> | |
53 | |
54 <param name="test_share" type="float" value="0.0" label="Share of the test data" help="Provide a real number between 0.0 and 1.0. This set of data is used to look through the prediction accuracy on unseen data after neural network training on an optimised configuration of parameters. It should be set to 0.0 while training for a model to be deployed to production. The minimum value can be 0.0 and maximum value should not be more than 0.5."/> | |
55 | |
49 <param name="validation_share" type="float" value="0.2" label="Share of the validation data" help="Provide a real number between 0.0 and 1.0. This set of data is used to validate each step of learning while optimising the configurations of parameters. The minimum value can be 0.0 and maximum value should not be more than 0.5."/> | 56 <param name="validation_share" type="float" value="0.2" label="Share of the validation data" help="Provide a real number between 0.0 and 1.0. This set of data is used to validate each step of learning while optimising the configurations of parameters. The minimum value can be 0.0 and maximum value should not be more than 0.5."/> |
57 | |
50 </section> | 58 </section> |
51 <section name="nn_parameters" title="Neural network parameters" expanded="False"> | 59 <section name="nn_parameters" title="Neural network parameters" expanded="False"> |
52 <param name="batch_size" type="text" value="30,500" label="Training batch size" help="Provide a comma-separated range to sample the batch size from. The training of the neural network is done using batch learning in this work. The training data is divided into equal batches and for each epoch (a training iteration), all the batches of data are trained one after another. An example: 10,500." /> | 60 <param name="batch_size" type="text" value="1,512" label="Training batch size" help="Provide a comma-separated range to sample the batch size from. The training of the neural network is done using batch learning in this work. The training data is divided into equal batches and for each epoch (a training iteration), all the batches of data are trained one after another. An example: 1,512." /> |
53 <param name="units" type="text" value="30,500" label="Number of hidden recurrent units" help="Provide a comma-separated range to sample the number of hidden recurrent units. A higher value provides a stronger neural network model (may lead to overfitting in case of smaller data) and a smaller value leads to a weaker model (may lead to underfitting in case of larger data). An example: 30,500."/> | 61 |
54 <param name="embedding_size" type="text" value="30,500" label="Embedding vector size" help="Provide a comma-separated range to sample the embedding size for tools. A fixed-size vector is learned for each tool. This number specifies the fixed-size. An example: 30,500."/> | 62 <param name="units" type="text" value="1,512" label="Number of hidden recurrent units" help="Provide a comma-separated range to sample the number of hidden recurrent units. A higher value provides a stronger neural network model (may lead to overfitting in case of smaller data) and a smaller value leads to a weaker model (may lead to underfitting in case of larger data). An example: 1,512."/> |
63 | |
64 <param name="embedding_size" type="text" value="1,512" label="Embedding vector size" help="Provide a comma-separated range to sample the embedding size for tools. A fixed-size vector is learned for each tool. This number specifies the fixed-size. An example: 1,512."/> | |
65 | |
55 <param name="dropout" type="text" value="0.0,0.5" label="Dropout between neural network layers" help="Provide a comma-separated range to sample the amount of dropout to be used after neural network layers. The minimum value should be 0.0 and the maximum value should be 1.0. Dropout is used to prevent or minimize overfitting after each neural network layer. An example: 0.0,0.5"/> | 66 <param name="dropout" type="text" value="0.0,0.5" label="Dropout between neural network layers" help="Provide a comma-separated range to sample the amount of dropout to be used after neural network layers. The minimum value should be 0.0 and the maximum value should be 1.0. Dropout is used to prevent or minimize overfitting after each neural network layer. An example: 0.0,0.5"/> |
67 | |
56 <param name="spatial_dropout" type="text" value="0.0,0.5" label="Dropout for the embedding layer" help="Provide a comma-separated range to sample the amount of dropout to be used after embedding layer. The minimum value should be 0.0 and the maximum value should be 1.0. Dropout is used to prevent or minimize overfitting in the embedding layer. An example: 0.0,0.5"/> | 68 <param name="spatial_dropout" type="text" value="0.0,0.5" label="Dropout for the embedding layer" help="Provide a comma-separated range to sample the amount of dropout to be used after embedding layer. The minimum value should be 0.0 and the maximum value should be 1.0. Dropout is used to prevent or minimize overfitting in the embedding layer. An example: 0.0,0.5"/> |
69 | |
57 <param name="recurrent_dropout" type="text" value="0.0,0.5" label="Dropout for recurrent layers" help="Provide a comma-separated range to sample the amount of dropout to be used for the recurrent layers. The minimum value should be 0.0 and the maximum value should be 1.0. Dropout is used to prevent or minimize overfitting in the recurrent layers. An example: 0.0,0.5"/> | 70 <param name="recurrent_dropout" type="text" value="0.0,0.5" label="Dropout for recurrent layers" help="Provide a comma-separated range to sample the amount of dropout to be used for the recurrent layers. The minimum value should be 0.0 and the maximum value should be 1.0. Dropout is used to prevent or minimize overfitting in the recurrent layers. An example: 0.0,0.5"/> |
71 | |
58 <param name="learning_rate" type="text" value="0.0001,0.1" label="Learning rate" help="Provide a range of positive real numbers to sample the learning rate. Learning rate defines the speed of neural network learning. A higher value will ensure fast learning and smaller value will ensure slower learning. An example: 0.0001,0.1"/> | 72 <param name="learning_rate" type="text" value="0.0001,0.1" label="Learning rate" help="Provide a range of positive real numbers to sample the learning rate. Learning rate defines the speed of neural network learning. A higher value will ensure fast learning and smaller value will ensure slower learning. An example: 0.0001,0.1"/> |
59 <param name="activation_recurrent" type="text" value="elu" label="Name of the activation function for recurrent layers" help="It is a mathematical function that transforms the input of recurrent layers to the following neural network layer."/> | 73 <param name="activation_recurrent" type="text" value="elu" label="Name of the activation function for recurrent layers" help="It is a mathematical function that transforms the input of recurrent layers to the following neural network layer."/> |
74 | |
60 <param name="activation_output" type="text" value="sigmoid" label="Name of the activation function for output layer" help="It is a mathematical function that transforms the input of the last dense layer to the output of the neural network."/> | 75 <param name="activation_output" type="text" value="sigmoid" label="Name of the activation function for output layer" help="It is a mathematical function that transforms the input of the last dense layer to the output of the neural network."/> |
61 </section> | 76 </section> |
62 </inputs> | 77 </inputs> |
63 <outputs> | 78 <outputs> |
64 <data format="h5" name="outfile_model" label="Model to recommend tools in Galaxy"></data> | 79 <data format="h5" name="outfile_model" label="Model to recommend tools in Galaxy"></data> |
65 </outputs> | 80 </outputs> |
66 <tests> | 81 <tests> |
67 <test> | 82 <test> |
68 <param name="input_tabular_workflows" value="test_workflows" ftype="tabular"/> | 83 <param name="input_tabular_workflows" value="test_workflows" ftype="tabular"/> |
69 <param name="input_tabular_tool_usage" value="test_tool_usage" ftype="tabular"/> | 84 <param name="input_tabular_tool_usage" value="test_tool_usage" ftype="tabular"/> |
85 <param name="max_evals" value="1"/> | |
86 <param name="optimize_n_epochs" value="1"/> | |
87 <param name="n_epochs" value="1"/> | |
88 <param name="test_share" value="0.1"/> | |
70 <output name="outfile_model"> | 89 <output name="outfile_model"> |
71 <assert_contents> | 90 <assert_contents> |
72 <has_h5_keys keys="best_parameters,class_weights,compatible_tools,data_dictionary,model_config,weight_0,weight_1,weight_2,weight_3,weight_4,weight_5,weight_6,weight_7,weight_8"/> | 91 <has_h5_keys keys="best_parameters,class_weights,compatible_tools,data_dictionary,model_config,weight_0,weight_1,weight_2,weight_3,weight_4,weight_5,weight_6,weight_7,weight_8"/> |
73 </assert_contents> | 92 </assert_contents> |
74 </output> | 93 </output> |
94 </test> | |
95 <test> | |
96 <param name="input_tabular_workflows" value="test_workflows" ftype="tabular"/> | |
97 <param name="input_tabular_tool_usage" value="test_tool_usage" ftype="tabular"/> | |
98 <param name="max_evals" value="1"/> | |
99 <param name="optimize_n_epochs" value="1"/> | |
100 <param name="n_epochs" value="1"/> | |
101 <param name="test_share" value="0.0"/> | |
102 <output name="outfile_model"> | |
103 <assert_contents> | |
104 <has_h5_keys keys="best_parameters,class_weights,compatible_tools,data_dictionary,model_config,weight_0,weight_1,weight_2,weight_3,weight_4,weight_5,weight_6,weight_7,weight_8"/> | |
105 </assert_contents> | |
106 </output> | |
107 </test> | |
108 <test expect_failure="true"> | |
109 <param name="input_tabular_workflows" value="test_workflows" ftype="tabular"/> | |
110 <param name="input_tabular_tool_usage" value="test_tool_usage" ftype="tabular"/> | |
111 <param name="validation_share" value="0.0"/> | |
112 </test> | |
113 <test expect_failure="true"> | |
114 <param name="input_tabular_workflows" value="test_workflows" ftype="tabular"/> | |
115 <param name="input_tabular_tool_usage" value="test_tool_usage" ftype="tabular"/> | |
116 <param name="batch_size" value="1"/> | |
117 </test> | |
118 <test expect_failure="true"> | |
119 <param name="input_tabular_workflows" value="test_workflows" ftype="tabular"/> | |
120 <param name="input_tabular_tool_usage" value="test_tool_usage" ftype="tabular"/> | |
121 <param name="units" value="1"/> | |
122 </test> | |
123 <test expect_failure="true"> | |
124 <param name="input_tabular_workflows" value="test_workflows" ftype="tabular"/> | |
125 <param name="input_tabular_tool_usage" value="test_tool_usage" ftype="tabular"/> | |
126 <param name="embedding_size" value="1"/> | |
127 </test> | |
128 <test expect_failure="true"> | |
129 <param name="input_tabular_workflows" value="test_workflows" ftype="tabular"/> | |
130 <param name="input_tabular_tool_usage" value="test_tool_usage" ftype="tabular"/> | |
131 <param name="dropout" value="0.1"/> | |
132 </test> | |
133 <test expect_failure="true"> | |
134 <param name="input_tabular_workflows" value="test_workflows" ftype="tabular"/> | |
135 <param name="input_tabular_tool_usage" value="test_tool_usage" ftype="tabular"/> | |
136 <param name="spatial_dropout" value="0.1"/> | |
137 </test> | |
138 <test expect_failure="true"> | |
139 <param name="input_tabular_workflows" value="test_workflows" ftype="tabular"/> | |
140 <param name="input_tabular_tool_usage" value="test_tool_usage" ftype="tabular"/> | |
141 <param name="recurrent_dropout" value="0.1"/> | |
142 </test> | |
143 <test expect_failure="true"> | |
144 <param name="input_tabular_workflows" value="test_workflows" ftype="tabular"/> | |
145 <param name="input_tabular_tool_usage" value="test_tool_usage" ftype="tabular"/> | |
146 <param name="learning_rate" value="0.0001"/> | |
75 </test> | 147 </test> |
76 </tests> | 148 </tests> |
77 <help> | 149 <help> |
78 <![CDATA[ | 150 <![CDATA[ |
79 **What it does** | 151 **What it does** |
152 <citation type="bibtex"> | 224 <citation type="bibtex"> |
153 @ARTICLE{anuprulez_galaxytools, | 225 @ARTICLE{anuprulez_galaxytools, |
154 Author = {Anup Kumar and Björn Grüning}, | 226 Author = {Anup Kumar and Björn Grüning}, |
155 keywords = {bioinformatics, recommendation system, deep learning}, | 227 keywords = {bioinformatics, recommendation system, deep learning}, |
156 title = {{Tool recommendation system for Galaxy workflows}}, | 228 title = {{Tool recommendation system for Galaxy workflows}}, |
157 url = {https://github.com/anuprulez/galaxytools} | 229 url = {https://github.com/bgruening/galaxytools} |
158 } | 230 } |
159 </citation> | 231 </citation> |
160 </citations> | 232 </citations> |
161 </tool> | 233 </tool> |