Mercurial > repos > bgruening > sklearn_mlxtend_association_rules
comparison pca.py @ 0:af2624d5ab32 draft
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit ea12f973df4b97a2691d9e4ce6bf6fae59d57717"
author | bgruening |
---|---|
date | Sat, 01 May 2021 01:24:32 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:af2624d5ab32 |
---|---|
1 import argparse | |
2 | |
3 import numpy as np | |
4 from galaxy_ml.utils import read_columns | |
5 from sklearn.decomposition import IncrementalPCA, KernelPCA, PCA | |
6 | |
7 | |
def _build_parser():
    """Create the command-line parser for the PCA tool.

    Returns:
        argparse.ArgumentParser: parser exposing the input/output options and
        the hyper-parameters of the three supported PCA flavours.
    """
    parser = argparse.ArgumentParser(
        description="Principal component analysis (PCA) of a tabular file"
    )
    parser.add_argument("-i", "--infile", help="Input file")
    parser.add_argument(
        "--header", action="store_true", help="Include the header row or skip it"
    )
    parser.add_argument(
        "-c",
        "--columns",
        type=str.lower,
        # The default must be one of the accepted choices; the previous
        # default ("all") could not even be passed explicitly.
        default="all_columns",
        choices=[
            "by_index_number",
            "all_but_by_index_number",
            "by_header_name",
            "all_but_by_header_name",
            "all_columns",
        ],
        help="Choose to select all columns, or exclude/include some",
    )
    parser.add_argument(
        "-ci",
        "--column_indices",
        # Passed through unchanged (not lower-cased) so that header names
        # keep the exact case they have on the command line.
        type=str,
        help="Comma-separated column indices or header names to include/exclude; "
        "required unless all columns are selected",
    )
    parser.add_argument(
        "-n",
        "--number",
        nargs="?",
        type=int,
        default=None,
        help="Number of components to keep. If not set, all components are kept",
    )
    parser.add_argument("--whiten", action="store_true", help="Whiten the components")
    parser.add_argument(
        "-t",
        "--pca_type",
        type=str.lower,
        default="classical",
        choices=["classical", "incremental", "kernel"],
        help="Choose which flavour of PCA to use",
    )
    parser.add_argument(
        "-s",
        "--svd_solver",
        type=str.lower,
        default="auto",
        choices=["auto", "full", "arpack", "randomized"],
        help="Choose the type of svd solver.",
    )
    parser.add_argument(
        "-b",
        "--batch_size",
        nargs="?",
        type=int,
        default=None,
        help="The number of samples to use for each batch",
    )
    parser.add_argument(
        "-k",
        "--kernel",
        type=str.lower,
        default="linear",
        choices=["linear", "poly", "rbf", "sigmoid", "cosine", "precomputed"],
        help="Choose the type of kernel.",
    )
    parser.add_argument(
        "-g",
        "--gamma",
        nargs="?",
        type=float,
        default=None,
        help="Kernel coefficient for rbf, poly and sigmoid kernels. Ignored by other kernels",
    )
    parser.add_argument(
        "-tol",
        "--tolerance",
        type=float,
        default=0.0,
        help="Convergence tolerance for arpack. If 0, optimal value will be chosen by arpack",
    )
    parser.add_argument(
        "-mi",
        "--max_iter",
        nargs="?",
        type=int,
        default=None,
        help="Maximum number of iterations for arpack",
    )
    parser.add_argument(
        "-d",
        "--degree",
        type=int,
        default=3,
        help="Degree for poly kernels. Ignored by other kernels",
    )
    parser.add_argument(
        "-cf",
        "--coef0",
        type=float,
        default=1.0,
        help="Independent term in poly and sigmoid kernels",
    )
    parser.add_argument(
        "-e",
        "--eigen_solver",
        type=str.lower,
        default="auto",
        choices=["auto", "dense", "arpack"],
        help="Choose the type of eigen solver.",
    )
    parser.add_argument(
        "-o", "--outfile", help="Base name for output file (no extension)."
    )
    return parser


def _column_selector(parser, args):
    """Translate ``--columns``/``--column_indices`` into the ``c`` value for read_columns.

    Args:
        parser: the argparse parser, used to report usage errors.
        args: the parsed command-line namespace.

    Returns:
        A list of ints for the index-based modes, the raw comma-separated
        string for the header-name modes, or None when all columns are used.

    Exits via ``parser.error`` (status 2) when a selector is required but
    missing, or when an index-based selector contains a non-integer.
    """
    index_modes = ("by_index_number", "all_but_by_index_number")
    name_modes = ("by_header_name", "all_but_by_header_name")

    if args.columns not in index_modes and args.columns not in name_modes:
        return None

    if not args.column_indices:
        parser.error(
            "--column_indices is required when --columns is %r" % args.columns
        )

    if args.columns in name_modes:
        # Header names are handed over as the raw comma-separated string.
        return args.column_indices

    try:
        return [int(i) for i in args.column_indices.split(",")]
    except ValueError:
        parser.error(
            "--column_indices must be a comma-separated list of integers, got %r"
            % args.column_indices
        )


def _collect_pca_params(args):
    """Assemble the estimator keyword arguments for the selected PCA flavour.

    Only the options relevant to the chosen ``--pca_type`` (and, for kernel
    PCA, the chosen kernel and eigen solver) are included. Keys are inserted
    in a fixed order so the printed parameter dict is stable.
    """
    pca_params = {"n_components": args.number}

    if args.pca_type == "classical":
        pca_params.update({"svd_solver": args.svd_solver, "whiten": args.whiten})
        if args.svd_solver == "arpack":
            pca_params["tol"] = args.tolerance

    elif args.pca_type == "incremental":
        pca_params.update({"batch_size": args.batch_size, "whiten": args.whiten})

    elif args.pca_type == "kernel":
        pca_params.update(
            {
                "kernel": args.kernel,
                "eigen_solver": args.eigen_solver,
                "gamma": args.gamma,
            }
        )
        if args.kernel == "poly":
            pca_params.update({"degree": args.degree, "coef0": args.coef0})
        elif args.kernel == "sigmoid":
            pca_params["coef0"] = args.coef0
        if args.eigen_solver == "arpack":
            pca_params.update({"tol": args.tolerance, "max_iter": args.max_iter})

    return pca_params


def main(argv=None):
    """Run PCA on a tab-separated file and write the projected data.

    Args:
        argv: optional list of command-line arguments; when None (the
            default) ``sys.argv[1:]`` is parsed, as before.

    Side effects:
        Prints the estimator parameters to stdout and writes the transformed
        data to ``--outfile`` as tab-separated values with 4 decimals.

    Raises:
        SystemExit: with status 2 on invalid command-line arguments.
    """
    parser = _build_parser()
    args = parser.parse_args(argv)

    # Validate the column selection before touching the input file so that
    # bad arguments produce a usage message instead of a traceback.
    usecols = _column_selector(parser, args)

    header = "infer" if args.header else None

    # NOTE(review): read_columns comes from galaxy_ml and is not visible here;
    # the code below relies on it returning a 2-D array (it uses ``.T``).
    pca_input = read_columns(
        f=args.infile,
        c=usecols,
        c_option=args.columns,
        sep="\t",
        header=header,
        parse_dates=True,
        encoding=None,
        index_col=None,
    )

    pca_params = _collect_pca_params(args)

    if args.pca_type == "kernel" and args.kernel == "precomputed":
        # A precomputed kernel expects the kernel (Gram) matrix itself as
        # input, so the linear Gram matrix of the samples is supplied.
        pca_input = np.dot(pca_input, pca_input.T)

    estimators = {
        "classical": PCA,
        "incremental": IncrementalPCA,
        "kernel": KernelPCA,
    }

    print(pca_params)
    pca = estimators[args.pca_type](**pca_params)
    pca_output = pca.fit_transform(pca_input)
    np.savetxt(fname=args.outfile, X=pca_output, fmt="%.4f", delimiter="\t")
183 | |
184 | |
# Script entry point: run the tool only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()