Mercurial > repos > bgruening > sklearn_discriminant_classifier
comparison discriminant.xml @ 0:e0067d9baffc draft
planemo upload for repository https://github.com/bgruening/galaxytools/tools/sklearn commit 0e582cf1f3134c777cce3aa57d71b80ed95e6ba9
author | bgruening |
---|---|
date | Fri, 16 Feb 2018 09:19:24 -0500 |
parents | |
children | f46da2feb233 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:e0067d9baffc |
---|---|
1 <tool id="sklearn_discriminant_classifier" name="Discriminant Analysis" version="@VERSION@"> | |
2 <description></description> | |
3 <macros> | |
4 <import>main_macros.xml</import> <!-- presumably defines the macros expanded below and the @VERSION@ token; confirm in main_macros.xml --> | |
5 <!--macro name="priors"--> | |
6 </macros> | |
7 <expand macro="python_requirements"/> | |
8 <expand macro="macro_stdio"/> | |
9 <version_command>echo "@VERSION@"</version_command> | |
10 <command><![CDATA[ | |
11 python "$discriminant_script" '$inputs' | |
12 ]]> | |
13 </command> <!-- runs the discriminant_script configfile; the path of the JSON inputs file is its only argument --> | |
14 <configfiles> | |
15 <inputs name="inputs"/> | |
16 <configfile name="discriminant_script"> | |
17 <![CDATA[ | |
18 import sys | |
19 import json | |
20 import numpy as np | |
21 import sklearn.discriminant_analysis | |
22 import pandas | |
23 import pickle | |
24 ## The tool parameters arrive as a JSON file whose path is the first command line argument. | |
25 input_json_path = sys.argv[1] | |
26 with open(input_json_path, "r") as params_handle: | |
27     params = json.load(params_handle) | |
28 | |
29 #if $selected_tasks.selected_task == "load": | |
30 ## NOTE(review): pickle.load can execute arbitrary code; only load models from trusted sources. | |
31 with open("$infile_model", 'rb') as model_handle: | |
32     classifier_object = pickle.load(model_handle) | |
33 data = pandas.read_csv("$selected_tasks.infile_data", sep='\t', header=0, index_col=None, parse_dates=True, encoding=None) | |
34 prediction = classifier_object.predict(data) | |
35 prediction_df = pandas.DataFrame(prediction) | |
36 res = pandas.concat([data, prediction_df], axis=1) | |
37 res.to_csv(path_or_buf = "$outfile_predict", sep="\t", index=False) | |
38 | |
39 #else: | |
40 ## The last column holds the class labels; every column before it is a feature. | |
41 data_train = pandas.read_csv("$selected_tasks.infile_train", sep='\t', header=0, index_col=None, parse_dates=True, encoding=None) | |
42 data = data_train.iloc[:, :-1] | |
43 labels = np.array(data_train.iloc[:, -1]) | |
44 | |
45 options = params["selected_tasks"]["selected_algorithms"]["options"] | |
46 selected_algorithm = params["selected_tasks"]["selected_algorithms"]["selected_algorithm"] | |
47 | |
48 my_class = getattr(sklearn.discriminant_analysis, selected_algorithm) | |
49 classifier_object = my_class(**options) | |
50 classifier_object.fit(data,labels) | |
51 with open("$outfile_fit", 'wb') as fit_handle: | |
52     pickle.dump(classifier_object, fit_handle, pickle.HIGHEST_PROTOCOL) | |
53 | |
54 #end if | |
55 ]]> | |
56 </configfile> | |
57 </configfiles> | |
58 <inputs> | |
59 <expand macro="train_loadConditional" model="zip"> | |
60 <param name="selected_algorithm" type="select" label="Classifier type"> | |
61 <option value="LinearDiscriminantAnalysis" selected="true">Linear Discriminant Classifier</option> | |
62 <option value="QuadraticDiscriminantAnalysis">Quadratic Discriminant Classifier</option> | |
63 </param> <!-- option values are looked up by name on sklearn.discriminant_analysis, so they must match the class names exactly --> | |
64 <when value="LinearDiscriminantAnalysis"> | |
65 <section name="options" title="Advanced Options" expanded="False"> <!-- the contents of this section are passed to the classifier constructor as keyword arguments --> | |
66 <param argument="solver" type="select" optional="true" label="Solver" help=""> | |
67 <option value="svd" selected="true">Singular Value Decomposition</option> | |
68 <option value="lsqr">Least Squares Solution</option> | |
69 <option value="eigen">Eigenvalue Decomposition</option> | |
70 </param> | |
71 <!--param name="shrinkage"--> | |
72 <!--expand macro="priors"/--> | |
73 <param argument="n_components" type="integer" optional="true" value="" label="Number of components" | |
74 help="Number of components for dimensionality reduction. ( always less than n_classes - 1 )"/> | |
75 <expand macro="tol" default_value="0.0001" help_text="Rank estimation threshold used in SVD solver."/> | |
76 <param argument="store_covariance" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolfalse" checked="false" | |
77 label="Store covariance" help="Compute class covariance matrix."/> | |
78 </section> | |
79 </when> | |
80 <when value="QuadraticDiscriminantAnalysis"> | |
81 <section name="options" title="Advanced Options" expanded="False"> <!-- the contents of this section are passed to the classifier constructor as keyword arguments --> | |
82 <!--expand macro="priors"/--> | |
83 <param argument="reg_param" type="float" optional="true" value="0.0" label="Regularization coefficient" help="Covariance estimate regularizer."/> | |
84 <expand macro="tol" default_value="0.00001" help_text="Rank estimation threshold used in SVD solver."/> | |
85 <param argument="store_covariances" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolfalse" checked="false" | |
86 label="Store covariances" help="Compute class covariance matrixes."/> | |
87 </section> | |
88 </when> | |
89 </expand> | |
90 </inputs> | |
91 <expand macro="output"/> | |
92 <tests> <!-- trained models are compared by file size only (sim_size); predictions are compared against the expected tabular files --> | |
93 <test> <!-- train LDA with the SVD solver and covariance storage enabled --> | |
94 <param name="infile_train" value="train.tabular" ftype="tabular"/> | |
95 <param name="selected_task" value="train"/> | |
96 <param name="selected_algorithm" value="LinearDiscriminantAnalysis"/> | |
97 <param name="solver" value="svd" /> | |
98 <param name="store_covariance" value="True"/> | |
99 <output name="outfile_fit" file="lda_model01" compare="sim_size" delta="500"/> | |
100 </test> | |
101 <test> <!-- train LDA with the least squares solver --> | |
102 <param name="infile_train" value="train.tabular" ftype="tabular"/> | |
103 <param name="selected_task" value="train"/> | |
104 <param name="selected_algorithm" value="LinearDiscriminantAnalysis"/> | |
105 <param name="solver" value="lsqr"/> | |
106 <output name="outfile_fit" file="lda_model02" compare="sim_size" delta="500"/> | |
107 </test> | |
108 <test> <!-- train QDA with its default options --> | |
109 <param name="infile_train" value="train.tabular" ftype="tabular"/> | |
110 <param name="selected_task" value="train"/> | |
111 <param name="selected_algorithm" value="QuadraticDiscriminantAnalysis"/> | |
112 <output name="outfile_fit" file="qda_model01" compare="sim_size" delta="500"/> | |
113 </test> | |
114 <test> <!-- predict with the stored lda_model01 --> | |
115 <param name="infile_model" value="lda_model01" ftype="zip"/> | |
116 <param name="infile_data" value="test.tabular" ftype="tabular"/> | |
117 <param name="selected_task" value="load"/> | |
118 <output name="outfile_predict" file="lda_prediction_result01.tabular"/> | |
119 </test> | |
120 <test> <!-- predict with the stored lda_model02 --> | |
121 <param name="infile_model" value="lda_model02" ftype="zip"/> | |
122 <param name="infile_data" value="test.tabular" ftype="tabular"/> | |
123 <param name="selected_task" value="load"/> | |
124 <output name="outfile_predict" file="lda_prediction_result02.tabular"/> | |
125 </test> | |
126 <test> <!-- predict with the stored qda_model01 --> | |
127 <param name="infile_model" value="qda_model01" ftype="zip"/> | |
128 <param name="infile_data" value="test.tabular" ftype="tabular"/> | |
129 <param name="selected_task" value="load"/> | |
130 <output name="outfile_predict" file="qda_prediction_result01.tabular"/> | |
131 </test> | |
132 </tests> | |
133 <help><![CDATA[ | |
134 **What it does** | |
135 Linear and Quadratic Discriminant Analysis are two classic classifiers with a linear and a quadratic decision surface respectively. These classifiers are fast and easy to interpret. | |
136 | |
137 | |
138 **1 - Training input** | |
139 | |
140 When you choose to train a model, the discriminant analysis tool expects a tabular file with numeric values, the order of the columns being as follows: | |
141 | |
142 :: | |
143 | |
144 "feature_1" "feature_2" "..." "feature_n" "class_label" | |
145 | |
146 **Example for training data** | |
147 The following training dataset contains 3 feature columns and a column containing class labels: | |
148 | |
149 :: | |
150 | |
151 4.01163365529 -6.10797684314 8.29829894763 1 | |
152 10.0788438916 1.59539821454 10.0684278289 0 | |
153 -5.17607775503 -0.878286135332 6.92941850665 2 | |
154 4.00975406235 -7.11847496542 9.3802423585 1 | |
155 4.61204065139 -5.71217537352 9.12509610964 1 | |
156 | |
157 | |
158 **2 - Training output** | |
159 | |
160 Based on your choice, this tool fits a sklearn discriminant_analysis.LinearDiscriminantAnalysis or discriminant_analysis.QuadraticDiscriminantAnalysis on the training data and outputs the trained model in the form of a pickled object in a file. | |
161 | |
162 | |
163 **3 - Prediction input** | |
164 | |
165 When you choose to load a model and do prediction, the tool expects an already trained Discriminant Analysis estimator and a tabular dataset as input. The dataset is a tabular file with new samples which you want to classify. It just contains feature columns. | |
166 | |
167 **Example for prediction data** | |
168 | |
169 :: | |
170 | |
171 8.26530668997 2.96705005011 8.88881190248 | |
172 2.96366327113 -3.76295851562 11.7113372463 | |
173 8.13319631944 -0.223645298585 10.5820605308 | |
174 | |
175 .. class:: warningmark | |
176 | |
177 The number of feature columns must be the same in training and prediction datasets! | |
178 | |
179 | |
180 **4 - Prediction output** | |
181 The tool predicts the class labels for new samples and adds them as the last column to the prediction dataset. The new dataset then is output as a tabular file. The prediction output format should look like the training dataset. | |
182 | |
183 Discriminant Analysis is based on sklearn.discriminant_analysis library from Scikit-learn. | |
184 For more information please refer to `Scikit-learn site`_. | |
185 | |
186 .. _`Scikit-learn site`: http://scikit-learn.org/stable/modules/lda_qda.html | |
187 | |
188 ]]></help> | |
189 <expand macro="sklearn_citation"/> | |
190 </tool> |