comparison pca.py @ 0:af2624d5ab32 draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit ea12f973df4b97a2691d9e4ce6bf6fae59d57717"
author bgruening
date Sat, 01 May 2021 01:24:32 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:af2624d5ab32
1 import argparse
2
3 import numpy as np
4 from galaxy_ml.utils import read_columns
5 from sklearn.decomposition import IncrementalPCA, KernelPCA, PCA
6
7
def _build_parser():
    """Build the command-line parser for the PCA tool."""
    parser = argparse.ArgumentParser(
        description="Principal component analysis (classical, incremental or kernel)"
    )
    # Input and output are required: without them the run can only fail later,
    # inside the reader or the writer, with a far less helpful message.
    parser.add_argument("-i", "--infile", required=True, help="Input file")
    parser.add_argument(
        "--header", action="store_true", help="Include the header row or skip it"
    )
    parser.add_argument(
        "-c",
        "--columns",
        type=str.lower,
        # The default must be one of the declared choices; argparse does not
        # validate defaults, so a stray value would slip through unnoticed.
        default="all_columns",
        choices=[
            "by_index_number",
            "all_but_by_index_number",
            "by_header_name",
            "all_but_by_header_name",
            "all_columns",
        ],
        help="Choose to select all columns, or exclude/include some",
    )
    parser.add_argument(
        "-ci",
        "--column_indices",
        # Plain str, not str.lower: header names are case-sensitive and must
        # reach the reader exactly as the user typed them.
        type=str,
        help="Comma-separated column indices or header names, depending on --columns",
    )
    parser.add_argument(
        "-n",
        "--number",
        nargs="?",
        type=int,
        default=None,
        help="Number of components to keep. If not set, all components are kept",
    )
    parser.add_argument("--whiten", action="store_true", help="Whiten the components")
    parser.add_argument(
        "-t",
        "--pca_type",
        type=str.lower,
        default="classical",
        choices=["classical", "incremental", "kernel"],
        help="Choose which flavour of PCA to use",
    )
    parser.add_argument(
        "-s",
        "--svd_solver",
        type=str.lower,
        default="auto",
        choices=["auto", "full", "arpack", "randomized"],
        help="Choose the type of svd solver.",
    )
    parser.add_argument(
        "-b",
        "--batch_size",
        nargs="?",
        type=int,
        default=None,
        help="The number of samples to use for each batch",
    )
    parser.add_argument(
        "-k",
        "--kernel",
        type=str.lower,
        default="linear",
        choices=["linear", "poly", "rbf", "sigmoid", "cosine", "precomputed"],
        help="Choose the type of kernel.",
    )
    parser.add_argument(
        "-g",
        "--gamma",
        nargs="?",
        type=float,
        default=None,
        help="Kernel coefficient for rbf, poly and sigmoid kernels. Ignored by other kernels",
    )
    parser.add_argument(
        "-tol",
        "--tolerance",
        type=float,
        default=0.0,
        help="Convergence tolerance for arpack. If 0, optimal value will be chosen by arpack",
    )
    parser.add_argument(
        "-mi",
        "--max_iter",
        nargs="?",
        type=int,
        default=None,
        help="Maximum number of iterations for arpack",
    )
    parser.add_argument(
        "-d",
        "--degree",
        type=int,
        default=3,
        help="Degree for poly kernels. Ignored by other kernels",
    )
    parser.add_argument(
        "-cf",
        "--coef0",
        type=float,
        default=1.0,
        help="Independent term in poly and sigmoid kernels",
    )
    parser.add_argument(
        "-e",
        "--eigen_solver",
        type=str.lower,
        default="auto",
        choices=["auto", "dense", "arpack"],
        help="Choose the type of eigen solver.",
    )
    parser.add_argument(
        "-o",
        "--outfile",
        required=True,
        help="Base name for output file (no extension).",
    )
    return parser


def _resolve_columns(parser, args):
    """Turn --columns/--column_indices into the ``c`` argument for read_columns.

    Returns a list of ints for the index-based modes, the raw comma-separated
    string for the header-name modes, and None when all columns are used.
    Exits through ``parser.error`` (status 2) when the selection is missing
    or, for index modes, not a list of integers.
    """
    index_modes = ("by_index_number", "all_but_by_index_number")
    name_modes = ("by_header_name", "all_but_by_header_name")

    if args.columns not in index_modes + name_modes:
        return None

    if not args.column_indices:
        parser.error("--column_indices is required when --columns is '%s'" % args.columns)

    if args.columns in name_modes:
        # Passed through as one string, exactly as the original code did.
        return args.column_indices

    try:
        return [int(i) for i in args.column_indices.split(",")]
    except ValueError:
        parser.error(
            "--column_indices must be comma-separated integers, got '%s'"
            % args.column_indices
        )


def _pca_settings(args):
    """Return ``(estimator_class, params)`` for the requested PCA flavour.

    Only the parameters relevant to the chosen flavour, solver and kernel are
    included. Keys are inserted in the same order as before, so the printed
    parameter dict is unchanged.
    """
    params = {"n_components": args.number}

    if args.pca_type == "classical":
        params.update({"svd_solver": args.svd_solver, "whiten": args.whiten})
        if args.svd_solver == "arpack":
            params.update({"tol": args.tolerance})
        return PCA, params

    if args.pca_type == "incremental":
        params.update({"batch_size": args.batch_size, "whiten": args.whiten})
        return IncrementalPCA, params

    # argparse restricts --pca_type to three choices, so this is "kernel".
    params.update(
        {
            "kernel": args.kernel,
            "eigen_solver": args.eigen_solver,
            "gamma": args.gamma,
        }
    )
    if args.kernel == "poly":
        params.update({"degree": args.degree, "coef0": args.coef0})
    elif args.kernel == "sigmoid":
        params.update({"coef0": args.coef0})
    if args.eigen_solver == "arpack":
        params.update({"tol": args.tolerance, "max_iter": args.max_iter})
    return KernelPCA, params


def main(argv=None):
    """Run PCA on a tab-separated table and write the transformed data.

    Reads the selected columns of ``--infile``, fits the requested PCA
    flavour, prints the estimator parameters to stdout and saves the
    transformed matrix to ``--outfile`` as tab-separated values with four
    decimal places.

    Args:
        argv: Optional list of command-line arguments. ``None`` (the default)
            makes argparse read ``sys.argv[1:]``, so a bare ``main()`` call
            behaves as it always did.

    Raises:
        SystemExit: On invalid or incomplete command-line arguments
            (status 2, raised by argparse).
    """
    parser = _build_parser()
    args = parser.parse_args(argv)

    usecols = _resolve_columns(parser, args)
    header = "infer" if args.header else None

    pca_input = read_columns(
        f=args.infile,
        c=usecols,
        c_option=args.columns,
        sep="\t",
        header=header,
        parse_dates=True,
        encoding=None,
        index_col=None,
    )

    estimator_cls, pca_params = _pca_settings(args)

    if args.pca_type == "kernel" and args.kernel == "precomputed":
        # A precomputed kernel is fitted on a kernel matrix rather than raw
        # features; the linear Gram matrix X . X^T is used here.
        pca_input = np.dot(pca_input, pca_input.T)

    print(pca_params)
    pca = estimator_cls(**pca_params)
    pca_output = pca.fit_transform(pca_input)
    np.savetxt(fname=args.outfile, X=pca_output, fmt="%.4f", delimiter="\t")
183
184
# Run the command-line tool only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()