annotate kmersvm/scripts/kmersvm_train.py @ 0:7fe1103032f7 draft

Uploaded
author cafletezbrant
date Mon, 20 Aug 2012 18:07:22 -0400
parents
children fd740d515502
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
1 #!/usr/bin/env python
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
2 """
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
3 kmersvm_train.py; train a support vector machine using shogun toolbox
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
4 Copyright (C) 2011 Dongwon Lee
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
5
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
6 This program is free software: you can redistribute it and/or modify
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
7 it under the terms of the GNU General Public License as published by
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
8 the Free Software Foundation, either version 3 of the License, or
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
9 (at your option) any later version.
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
10
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
11 This program is distributed in the hope that it will be useful,
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
14 GNU General Public License for more details.
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
15
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
16 You should have received a copy of the GNU General Public License
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
17 along with this program. If not, see <http://www.gnu.org/licenses/>.
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
18
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
19
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
20 """
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
21
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
22
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
23
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
24 import sys
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
25 import optparse
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
26 import random
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
27 import numpy
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
28 from math import log, exp
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
29
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
30 from libkmersvm import *
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
31 try:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
32 from shogun.PreProc import SortWordString, SortUlongString
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
33 except ImportError:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
34 from shogun.Preprocessor import SortWordString, SortUlongString
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
35 from shogun.Kernel import CommWordStringKernel, CommUlongStringKernel, \
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
36 CombinedKernel
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
37
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
38 from shogun.Features import StringWordFeatures, StringUlongFeatures, \
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
39 StringCharFeatures, CombinedFeatures, DNA, Labels
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
40 from shogun.Classifier import MSG_INFO, MSG_ERROR
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
41 try:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
42 from shogun.Classifier import SVMLight
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
43 except ImportError:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
44 from shogun.Classifier import LibSVM
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
45
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
46 """
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
47 global variables
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
48 """
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
49 g_kmers = []
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
50 g_rcmap = []
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
51
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
52
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
53 def kmerid2kmer(kmerid, kmerlen):
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
54 """convert integer kmerid to kmer string
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
55
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
56 Arguments:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
57 kmerid -- integer, id of k-mer
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
58 kmerlen -- integer, length of k-mer
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
59
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
60 Return:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
61 kmer string
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
62 """
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
63
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
64 nts = "ACGT"
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
65 kmernts = []
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
66 kmerid2 = kmerid
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
67
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
68 for i in xrange(kmerlen):
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
69 ntid = kmerid2 % 4
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
70 kmernts.append(nts[ntid])
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
71 kmerid2 = int((kmerid2-ntid)/4)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
72
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
73 return ''.join(reversed(kmernts))
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
74
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
75
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
76 def kmer2kmerid(kmer, kmerlen):
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
77 """convert kmer string to integer kmerid
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
78
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
79 Arguments:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
80 kmerid -- integer, id of k-mer
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
81 kmerlen -- integer, length of k-mer
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
82
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
83 Return:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
84 id of k-mer
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
85 """
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
86
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
87 nt2id = {'A':0, 'C':1, 'G':2, 'T':3}
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
88
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
89 return reduce(lambda x, y: (4*x+y), [nt2id[x] for x in kmer])
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
90
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
91
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
92 def get_rcmap(kmerid, kmerlen):
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
93 """mapping kmerid to its reverse complement k-mer on-the-fly
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
94
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
95 Arguments:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
96 kmerid -- integer, id of k-mer
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
97 kmerlen -- integer, length of k-mer
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
98
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
99 Return:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
100 integer kmerid after mapping to its reverse complement
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
101 """
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
102
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
103 #1. get kmer from kmerid
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
104 #2. get reverse complement kmer
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
105 #3. get kmerid from revcomp kmer
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
106 rckmerid = kmer2kmerid(revcomp(kmerid2kmer(kmerid, kmerlen)), kmerlen)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
107
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
108 if rckmerid < kmerid:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
109 return rckmerid
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
110
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
111 return kmerid
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
112
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
113
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
114 def non_redundant_word_features(feats, kmerlen):
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
115 """convert the features from Shogun toolbox to non-redundant word features (handle reverse complements)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
116 Arguments:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
117 feats -- StringWordFeatures
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
118 kmerlen -- integer, length of k-mer
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
119
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
120 Return:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
121 StringWordFeatures after converting reverse complement k-mer ids
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
122 """
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
123
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
124 rcmap = g_rcmap
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
125
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
126 for i in xrange(feats.get_num_vectors()):
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
127 nf = [rcmap[int(kmerid)] for kmerid in feats.get_feature_vector(i)]
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
128
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
129 feats.set_feature_vector(numpy.array(nf, numpy.dtype('u2')), i)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
130
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
131 preproc = SortWordString()
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
132 preproc.init(feats)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
133 try:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
134 feats.add_preproc(preproc)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
135 feats.apply_preproc()
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
136 except AttributeError:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
137 feats.add_preprocessor(preproc)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
138 feats.apply_preprocessor()
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
139
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
140 return feats
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
141
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
142
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
143 def non_redundant_ulong_features(feats, kmerlen):
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
144 """convert the features from Shogun toolbox to non-redundant ulong features
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
145 Arguments:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
146 feats -- StringUlongFeatures
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
147 kmerlen -- integer, length of k-mer
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
148
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
149 Return:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
150 StringUlongFeatures after converting reverse complement k-mer ids
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
151 """
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
152
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
153 for i in xrange(feats.get_num_vectors()):
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
154 nf = [get_rcmap(int(kmerid), kmerlen) \
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
155 for kmerid in feats.get_feature_vector(i)]
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
156
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
157 feats.set_feature_vector(numpy.array(nf, numpy.dtype('u8')), i)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
158
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
159 preproc = SortUlongString()
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
160 preproc.init(feats)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
161 try:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
162 feats.add_preproc(preproc)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
163 feats.apply_preproc()
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
164 except AttributeError:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
165 feats.add_preprocessor(preproc)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
166 feats.apply_preprocessor()
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
167
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
168 return feats
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
169
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
170
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
171 def svm_learn(kernel, labels, options):
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
172 """train SVM using SVMLight or LibSVM
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
173
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
174 Arguments:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
175 kernel -- kernel object from Shogun toolbox
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
176 lebels -- list of labels
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
177 options -- object containing option data
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
178
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
179 Return:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
180 trained svm object
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
181 """
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
182
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
183 try:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
184 svm=SVMLight(options.svmC, kernel, Labels(numpy.array(labels, dtype=numpy.double)))
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
185 except NameError:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
186 svm=LibSVM(options.svmC, kernel, Labels(numpy.array(labels, dtype=numpy.double)))
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
187
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
188 if options.quiet == False:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
189 svm.io.set_loglevel(MSG_INFO)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
190 svm.io.set_target_to_stderr()
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
191
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
192 svm.set_epsilon(options.epsilon)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
193 svm.parallel.set_num_threads(1)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
194 if options.weight != 1.0:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
195 svm.set_C(options.svmC, options.svmC*options.weight)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
196 svm.train()
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
197
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
198 if options.quiet == False:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
199 svm.io.set_loglevel(MSG_ERROR)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
200
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
201 return svm
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
202
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
203
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
204 def _get_spectrum_features(seqs, kmerlen):
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
205 """generate spectrum features (internal)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
206
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
207 Arguments:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
208 seqs -- list of sequences
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
209 kmerlen -- integer, length of k-mer
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
210
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
211 Return:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
212 StringWord(Ulong)Features after treatment of redundant reverse complement k-mers
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
213 """
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
214
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
215 char_feats = StringCharFeatures(seqs, DNA)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
216
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
217 if kmerlen <= 8:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
218 string_features = StringWordFeatures
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
219 non_redundant_features = non_redundant_word_features
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
220 else:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
221 string_features = StringUlongFeatures
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
222 non_redundant_features = non_redundant_ulong_features
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
223
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
224 feats = string_features(DNA)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
225 feats.obtain_from_char(char_feats, kmerlen-1, kmerlen, 0, False)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
226 return non_redundant_features(feats, kmerlen)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
227
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
228
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
229 def get_spectrum_features(seqs, options):
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
230 """generate spectrum features (wrapper)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
231 """
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
232 return _get_spectrum_features(seqs, options.kmerlen)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
233
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
234
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
235 def get_weighted_spectrum_features(seqs, options):
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
236 """generate weighted spectrum features
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
237 """
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
238 global g_kmers
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
239 global g_rcmap
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
240
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
241 subfeats_list = []
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
242
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
243 for k in xrange(options.kmerlen, options.kmerlen2+1):
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
244 char_feats = StringCharFeatures(seqs, DNA)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
245 if k <= 8:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
246 g_kmers = generate_kmers(k)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
247 g_rcmap = generate_rcmap_table(k, g_kmers)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
248
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
249 subfeats = _get_spectrum_features(seqs, k)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
250 subfeats_list.append(subfeats)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
251
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
252 return subfeats_list
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
253
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
254
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
255 def get_spectrum_kernel(feats, options):
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
256 """build spectrum kernel with non-redundant k-mer list (removing reverse complement)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
257
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
258 Arguments:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
259 feats -- feature object
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
260 options -- object containing option data
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
261
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
262 Return:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
263 StringWord(Ulong)Features, CommWord(Ulong)StringKernel
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
264 """
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
265 if options.kmerlen <= 8:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
266 return CommWordStringKernel(feats, feats)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
267 else:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
268 return CommUlongStringKernel(feats, feats)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
269
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
270
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
271 def get_weighted_spectrum_kernel(subfeats_list, options):
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
272 """build weighted spectrum kernel with non-redundant k-mer list (removing reverse complement)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
273
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
274 Arguments:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
275 subfeats_list -- list of sub-feature objects
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
276 options -- object containing option data
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
277
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
278 Return:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
279 CombinedFeatures of StringWord(Ulong)Features, CombinedKernel of CommWord(Ulong)StringKernel
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
280 """
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
281 kmerlen = options.kmerlen
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
282 kmerlen2 = options.kmerlen2
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
283
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
284 subkernels = 0
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
285 kernel = CombinedKernel()
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
286 feats = CombinedFeatures()
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
287
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
288 for subfeats in subfeats_list:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
289 feats.append_feature_obj(subfeats)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
290
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
291 for k in xrange(kmerlen, kmerlen2+1):
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
292 if k <= 8:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
293 subkernel = CommWordStringKernel(10, False)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
294 else:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
295 subkernel = CommUlongStringKernel(10, False)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
296
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
297 kernel.append_kernel(subkernel)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
298 subkernels+=1
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
299
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
300 kernel.init(feats, feats)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
301
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
302 kernel.set_subkernel_weights(numpy.array([1/float(subkernels)]*subkernels, numpy.dtype('float64')))
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
303
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
304 return kernel
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
305
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
306
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
307 def init_spectrum_kernel(kern, feats_lhs, feats_rhs):
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
308 """initialize spectrum kernel (wrapper function)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
309 """
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
310 kern.init(feats_lhs, feats_rhs)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
311
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
312
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
313 def init_weighted_spectrum_kernel(kern, subfeats_list_lhs, subfeats_list_rhs):
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
314 """initialize weighted spectrum kernel (wrapper function)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
315 """
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
316 feats_lhs = CombinedFeatures()
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
317 feats_rhs = CombinedFeatures()
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
318
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
319 for subfeats in subfeats_list_lhs:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
320 feats_lhs.append_feature_obj(subfeats)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
321
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
322 for subfeats in subfeats_list_rhs:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
323 feats_rhs.append_feature_obj(subfeats)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
324
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
325 kern.init(feats_lhs, feats_rhs)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
326
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
327
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
328 def get_sksvm_weights(svm, feats, options):
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
329 """calculate the SVM weight vector of spectrum kernel
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
330 """
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
331 kmerlen = options.kmerlen
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
332 alphas = svm.get_alphas()
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
333 support_vector_ids = svm.get_support_vectors()
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
334
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
335 w = numpy.array([0]*(2**(2*kmerlen)), numpy.double)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
336
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
337 for i in xrange(len(alphas)):
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
338 x = [0]*(2**(2*kmerlen))
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
339 for kmerid in feats.get_feature_vector(int(support_vector_ids[i])):
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
340 x[int(kmerid)] += 1
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
341 x = numpy.array(x, numpy.double)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
342 w += (alphas[i]*x/numpy.sqrt(numpy.sum(x**2)))
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
343
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
344 return w
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
345
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
346
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
347 def get_wsksvm_weights(svm, subfeats_list, options):
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
348 """calculate the SVM weight vector of weighted spectrum kernel
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
349 """
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
350 kmerlen = options.kmerlen
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
351 kmerlen2 = options.kmerlen2
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
352 alphas = svm.get_alphas()
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
353 support_vector_ids = svm.get_support_vectors()
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
354 kmerlens = range(kmerlen, kmerlen2+1)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
355
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
356 weights = []
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
357 for idx in xrange(len(kmerlens)):
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
358 subfeats = subfeats_list[idx]
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
359
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
360 k = kmerlens[idx]
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
361 w = numpy.array([0]*(2**(2*k)), numpy.double)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
362
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
363 for i in xrange(len(alphas)):
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
364 x = [0]*(2**(2*k))
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
365 for kmerid in subfeats.get_feature_vector(int(support_vector_ids[i])):
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
366 x[int(kmerid)] += 1
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
367 x = numpy.array(x, numpy.double)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
368 w += (alphas[i]*x/numpy.sqrt(numpy.sum(x**2)))
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
369
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
370 w /= len(kmerlens)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
371 weights.append(w)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
372
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
373 return weights
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
374
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
375
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
376 def save_header(f, bias, A, B, options):
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
377 f.write("#parameters:\n")
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
378 f.write("#kernel=" + str(options.ktype) + "\n")
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
379 f.write("#kmerlen=" + str(options.kmerlen) + "\n")
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
380 if options.ktype == 2:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
381 f.write("#kmerlen2=" + str(options.kmerlen2) + "\n")
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
382 f.write("#bias=" + str(bias) + "\n")
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
383 f.write("#A=" + str(A) + "\n")
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
384 f.write("#B=" + str(B) + "\n")
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
385 f.write("#NOTE: k-mers with large negative weights are also important. They can be found at the bottom of the list.\n")
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
386 f.write("#k-mer\trevcomp\tSVM-weight\n")
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
387
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
388
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
389 def save_sksvm_weights(w, bias, A, B, options):
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
390 """save the SVM weight vector from spectrum kernel
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
391 """
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
392 output = options.outputname + "_weights.out"
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
393 kmerlen = options.kmerlen
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
394
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
395 f = open(output, 'w')
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
396 save_header(f, bias, A, B, options)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
397
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
398 global g_kmers
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
399 global g_rcmap
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
400
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
401 if options.sort:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
402 w_sorted = sorted(zip(range(len(w)), w), key=lambda x: x[1], reverse=True)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
403 else:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
404 w_sorted = zip(range(len(w)), w)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
405
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
406 if kmerlen <= 8:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
407 for i in map(lambda x: x[0], w_sorted):
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
408 if i == g_rcmap[i]:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
409 f.write('\t'.join( [g_kmers[i], revcomp(g_kmers[i]), str(w[i])] ) + '\n')
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
410 else:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
411 for i in map(lambda x: x[0], w_sorted):
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
412 if i == get_rcmap(i, kmerlen):
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
413 kmer = kmerid2kmer(i, kmerlen)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
414 f.write('\t'.join( [kmer, revcomp(kmer), str(w[i])] ) + '\n')
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
415
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
416 f.close()
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
417
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
418
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
419 def save_wsksvm_weights(w, bias, A, B, options):
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
420 """save the SVM weight vector from weighted spectrum kernel
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
421 """
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
422 output = options.outputname + "_weights.out"
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
423 kmerlen = options.kmerlen
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
424 kmerlen2 = options.kmerlen2
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
425
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
426 f = open(output, 'w')
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
427 save_header(f, bias, A, B, options)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
428
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
429 global g_kmers
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
430 global g_rcmap
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
431
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
432 kmerlens = range(kmerlen, kmerlen2+1)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
433 for idx in xrange(len(kmerlens)):
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
434 k = kmerlens[idx]
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
435 subw = w[idx]
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
436
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
437 if options.sort:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
438 subw_sorted = sorted(zip(range(len(subw)), subw), key=lambda x: x[1], reverse=True)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
439 else:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
440 subw_sorted = zip(range(len(subw)), subw)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
441
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
442 if k <= 8:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
443 g_kmers = generate_kmers(k)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
444 g_rcmap = generate_rcmap_table(k, g_kmers)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
445 for i in map(lambda x: x[0], subw_sorted):
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
446 if i == g_rcmap[i]:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
447 f.write('\t'.join( [g_kmers[i], revcomp(g_kmers[i]), str(subw[i])] ) + "\n")
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
448 else:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
449 for i in map(lambda x: x[0], subw_sorted):
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
450 if i == get_rcmap(i, k):
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
451 kmer = kmerid2kmer(i, k)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
452 f.write('\t'.join( [kmers, revcomp(kmers), str(subw[i])] ) + "\n")
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
453
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
454 f.close()
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
455
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
456
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
457 def save_predictions(output, preds, cvs):
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
458 """save prediction
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
459 """
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
460 f = open(output, 'w')
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
461 f.write('\t'.join(["#seq_id", "SVM score", "label", "NCV"]) + "\n")
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
462 for i in xrange(len(preds)):
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
463 f.write('\t'.join([preds[i][1], str(preds[i][2]), str(preds[i][3]), str(cvs[i])]) + "\n")
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
464 f.close()
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
465
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
466
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
467 def generate_cv_list(ncv, n1, n2):
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
468 """generate the N-fold cross validation list
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
469
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
470 Arguments:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
471 ncv -- integer, number of cross-validation
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
472 n1 -- integer, number of positives
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
473 n2 -- integer, number of negatives
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
474
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
475 Return:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
476 a list of N-fold cross validation
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
477 """
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
478
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
479 shuffled_idx_list1 = range(n1)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
480 shuffled_idx_list2 = range(n1,n1+n2)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
481
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
482 random.shuffle(shuffled_idx_list1)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
483 random.shuffle(shuffled_idx_list2)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
484
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
485 shuffled_idx_list = shuffled_idx_list1 + shuffled_idx_list2
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
486
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
487 idx = 0
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
488 icv = 0
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
489 cv = [0] * (n1+n2)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
490 while(idx < (n1+n2)):
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
491 cv[shuffled_idx_list[idx]] = icv
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
492
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
493 idx += 1
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
494 icv += 1
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
495 if icv == ncv:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
496 icv = 0
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
497
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
498 return cv
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
499
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
500
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
501 def split_cv_list(cvlist, icv, data):
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
502 """split data into training and test based on cross-validation list
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
503
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
504 Arguments:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
505 cvlist -- list, cross-validation list
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
506 icv -- integer, corss-validation set of interest
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
507 data -- list, data set to be splitted
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
508
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
509 Return:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
510 a list of training set and a list of test set
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
511 """
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
512
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
513 tr_data = []
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
514 te_data = []
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
515
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
516 for i in xrange(len(data)):
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
517 if cvlist[i] == icv:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
518 te_data.append(data[i])
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
519 else:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
520 tr_data.append(data[i])
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
521
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
522 return tr_data, te_data
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
523
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
524
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
525 def LMAI(svms, labels, prior0, prior1):
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
526 """fitting svms to sigmoid function (improved version introduced by Lin 2003)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
527
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
528 Arguments:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
529 svms -- list of svm scores
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
530 labels -- list of labels
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
531 prior0 -- prior of negative set
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
532 prior1 -- prior of positive set
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
533
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
534 Return:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
535 A, B parameter of 1/(1+exp(A*SVM+B))
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
536 """
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
537
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
538 #parameter settings
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
539 maxiter = 100
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
540 minstep = 1e-10
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
541 sigma = 1e-3
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
542
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
543 hiTarget = (prior1+1.0)/float(prior1+2.0)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
544 loTarget = 1/float(prior0+2.0)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
545
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
546 t = [0]*len(labels)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
547 for i in xrange(len(labels)):
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
548 if labels[i] == 1:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
549 t[i] = hiTarget
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
550 else:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
551 t[i] = loTarget
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
552
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
553 A = 0.0
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
554 B = log((prior0+1.0)/float(prior1+1.0))
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
555 fval = 0.0
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
556
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
557 for i in xrange(len(labels)):
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
558 fApB = svms[i]*A+B
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
559 if fApB >= 0:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
560 fval += (t[i]*fApB+log(1+exp(-fApB)))
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
561 else:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
562 fval += ((t[i]-1)*fApB+log(1+exp(fApB)))
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
563
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
564
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
565 for it in xrange(maxiter):
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
566 #print "iteration:", it
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
567 #Update Graidient and Hessian (use H'= H + sigma I)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
568 h11 = sigma
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
569 h22 = sigma
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
570 h21 = 0.0
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
571 g1 = 0.0
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
572 g2 = 0.0
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
573
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
574 for i in xrange(len(labels)):
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
575 fApB = svms[i]*A+B
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
576 if fApB >= 0:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
577 p = exp(-fApB) / float(1.0+exp(-fApB))
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
578 q = 1.0 / float(1.0 + exp(-fApB))
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
579 else:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
580 p = 1.0 / float(1.0 + exp(fApB))
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
581 q = exp(fApB) / float(1.0+exp(fApB))
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
582 d2 = p*q
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
583 h11 += (svms[i]*svms[i]*d2)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
584 h22 += d2
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
585 h21 += (svms[i]*d2)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
586 d1 = t[i]-p
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
587 g1 += (svms[i]*d1)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
588 g2 += d1
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
589
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
590 #Stopping criteria
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
591 if (abs(g1)<1e-5) and (abs(g2)<1e-5):
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
592 break
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
593
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
594 det = h11*h22-h21*h21
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
595 dA = -(h22*g1-h21*g2)/float(det)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
596 dB = -(-h21*g1+h11*g2)/float(det)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
597 gd = g1*dA+g2*dB
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
598 stepsize=1
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
599 while stepsize >= minstep:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
600 newA = A+stepsize*dA
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
601 newB = B+stepsize*dB
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
602 newf = 0.0
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
603
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
604 for i in xrange(len(labels)):
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
605 fApB = svms[i]*newA+newB
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
606 if fApB >= 0:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
607 newf += (t[i]*fApB + log(1+exp(-fApB)))
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
608 else:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
609 newf += ((t[i]-1)*fApB + log(1+exp(fApB)))
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
610
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
611 if newf < (fval+0.0001*stepsize*gd):
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
612 A=newA
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
613 B=newB
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
614 fval=newf
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
615 break
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
616 else:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
617 stepsize=stepsize/float(2.0)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
618
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
619 #Line search failes
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
620 if stepsize < minstep:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
621 #print "Line search fails"
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
622 break
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
623
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
624 #if it >= maxiter:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
625 # print "Reaching maximum iterations"
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
626
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
627 return A, B
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
628
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
629
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
630 def wsksvm_classify(seqs, svm, kern, feats, options):
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
631 feats_te = get_weighted_spectrum_features(seqs, options)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
632 init_weighted_spectrum_kernel(kern, feats, feats_te)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
633
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
634 return svm.apply().get_labels().tolist()
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
635
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
636
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
637 def score_seq(s, svmw, kmerlen):
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
638 """calculate SVM score of given sequence using single set of svm weights
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
639
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
640 Arguments:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
641 s -- string, DNA sequence
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
642 svmw -- numpy array, SVM weights
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
643 kmerlen -- integer, length of k-mer of SVM weight
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
644
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
645 Return:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
646 SVM score
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
647 """
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
648
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
649 global g_rcmap
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
650 kmer2kmerid_func = kmer2kmerid
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
651
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
652 x = [0]*(2**(2*kmerlen))
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
653 for j in xrange(len(s)-kmerlen+1):
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
654 x[ g_rcmap[kmer2kmerid_func(s[j:j+kmerlen], kmerlen)] ] += 1
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
655
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
656 x = numpy.array(x, numpy.double)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
657 score_norm = numpy.dot(svmw, x)/numpy.sqrt(numpy.sum(x**2))
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
658
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
659 return score_norm
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
660
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
661
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
662 def sksvm_classify(seqs, svm, kern, feats, options):
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
663 """classify the given sequences
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
664 """
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
665 if options.kmerlen <= 8:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
666 #this is much faster when the length of kmer is short, and SVs are many
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
667 svmw = get_sksvm_weights(svm, feats, options)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
668 return [score_seq(s, svmw, options.kmerlen)+svm.get_bias() for s in seqs]
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
669 else:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
670 feats_te = get_spectrum_features(seqs, options)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
671 init_spectrum_kernel(kern, feats, feats_te)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
672
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
673 return svm.apply().get_labels().tolist()
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
674
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
675
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
676 def main(argv = sys.argv):
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
677 usage = "Usage: %prog [options] POSITIVE_SEQ NEGATIVE_SEQ"
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
678 desc = "1. take two files(FASTA format) as input, 2. train an SVM and store the trained SVM weights"
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
679 parser = optparse.OptionParser(usage=usage, description=desc)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
680 parser.add_option("-t", dest="ktype", type="int", default=1, \
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
681 help="set the type of kernel, 1:Spectrum, 2:Weighted Spectrums (default=1.Spectrum)")
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
682
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
683 parser.add_option("-C", dest="svmC", type="float", default=1, \
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
684 help="set the regularization parameter svmC (default=1)")
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
685
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
686 parser.add_option("-e", dest="epsilon", type="float", default=0.00001, \
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
687 help="set the precision parameter epsilon (default=0.00001)")
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
688
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
689 parser.add_option("-w", dest="weight", type="float", default=0.0, \
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
690 help="set the weight for positive set (default=auto, 1+log(N/P))")
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
691
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
692 parser.add_option("-k", dest="kmerlen", type="int",default=6, \
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
693 help="set the (min) length of k-mer for (weighted) spectrum kernel (default = 6)")
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
694
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
695 parser.add_option("-K", dest="kmerlen2", type="int",default=8, \
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
696 help="set the max length of k-mer for weighted spectrum kernel (default = 8)")
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
697
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
698 parser.add_option("-n", dest="outputname", default="kmersvm_output", \
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
699 help="set the name of output files (default=kmersvm_output)")
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
700
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
701 parser.add_option("-v", dest="ncv", type="int", default=0, \
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
702 help="if set, it will perform N-fold cross-validation and generate a prediction file (default = 0)")
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
703
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
704 parser.add_option("-p", dest="posteriorp", default=False, action="store_true", \
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
705 help="estimate parameters for posterior probability with N-CV. this option requires -v option to be set (default=false)")
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
706
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
707 parser.add_option("-r", dest="rseed", type="int", default=1, \
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
708 help="set the random number seed for cross-validation (-p option) (default=1)")
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
709
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
710 parser.add_option("-q", dest="quiet", default=False, action="store_true", \
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
711 help="supress messages (default=false)")
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
712
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
713 parser.add_option("-s", dest="sort", default=False, action="store_true", \
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
714 help="sort the kmers by absolute values of SVM weights (default=false)")
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
715
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
716 ktype_str = ["", "Spectrum", "Weighted Spectrums"]
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
717
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
718 (options, args) = parser.parse_args()
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
719
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
720 if len(args) == 0:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
721 parser.print_help()
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
722 sys.exit(0)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
723
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
724 if len(args) != 2:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
725 parser.error("incorrect number of arguments")
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
726 parser.print_help()
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
727 sys.exit(0)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
728
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
729 if options.posteriorp and options.ncv == 0:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
730 parser.error("posterior probability estimation requires N-fold CV process (-v option should be set)")
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
731 parser.print_help()
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
732 sys.exit(0)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
733
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
734 random.seed(options.rseed)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
735
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
736 """
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
737 set global variable
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
738 """
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
739 if (options.ktype == 1) and (options.kmerlen <= 8):
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
740 global g_kmers
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
741 global g_rcmap
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
742
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
743 g_kmers = generate_kmers(options.kmerlen)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
744 g_rcmap = generate_rcmap_table(options.kmerlen, g_kmers)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
745
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
746 posf = args[0]
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
747 negf = args[1]
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
748
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
749 seqs_pos, sids_pos = read_fastafile(posf)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
750 seqs_neg, sids_neg = read_fastafile(negf)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
751 npos = len(seqs_pos)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
752 nneg = len(seqs_neg)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
753 seqs = seqs_pos + seqs_neg
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
754 sids = sids_pos + sids_neg
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
755
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
756 if options.weight == 0:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
757 options.weight = 1 + log(nneg/npos)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
758
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
759 if options.quiet == False:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
760 sys.stderr.write('SVM parameters:\n')
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
761 sys.stderr.write(' kernel-type: ' + str(options.ktype) + "." + ktype_str[options.ktype] + '\n')
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
762 sys.stderr.write(' svm-C: ' + str(options.svmC) + '\n')
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
763 sys.stderr.write(' epsilon: ' + str(options.epsilon) + '\n')
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
764 sys.stderr.write(' weight: ' + str(options.weight) + '\n')
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
765 sys.stderr.write('\n')
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
766
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
767 sys.stderr.write('Other options:\n')
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
768 sys.stderr.write(' kmerlen: ' + str(options.kmerlen) + '\n')
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
769 if options.ktype == 2:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
770 sys.stderr.write(' kmerlen2: ' + str(options.kmerlen2) + '\n')
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
771 sys.stderr.write(' outputname: ' + options.outputname + '\n')
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
772 sys.stderr.write(' posteriorp: ' + str(options.posteriorp) + '\n')
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
773 if options.ncv > 0:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
774 sys.stderr.write(' ncv: ' + str(options.ncv) + '\n')
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
775 sys.stderr.write(' rseed: ' + str(options.rseed) + '\n')
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
776 sys.stderr.write(' sorted-weight: ' + str(options.sort) + '\n')
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
777
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
778 sys.stderr.write('\n')
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
779
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
780 sys.stderr.write('Input args:\n')
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
781 sys.stderr.write(' positive sequence file: ' + posf + '\n')
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
782 sys.stderr.write(' negative sequence file: ' + negf + '\n')
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
783 sys.stderr.write('\n')
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
784
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
785 sys.stderr.write('numer of total positive seqs: ' + str(npos) + '\n')
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
786 sys.stderr.write('numer of total negative seqs: ' + str(nneg) + '\n')
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
787 sys.stderr.write('\n')
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
788
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
789 #generate labels
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
790 labels = [1]*npos + [-1]*nneg
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
791
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
792 if options.ktype == 1:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
793 get_features = get_spectrum_features
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
794 get_kernel = get_spectrum_kernel
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
795 get_weights = get_sksvm_weights
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
796 save_weights = save_sksvm_weights
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
797 svm_classify = sksvm_classify
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
798 elif options.ktype == 2:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
799 get_features = get_weighted_spectrum_features
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
800 get_kernel = get_weighted_spectrum_kernel
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
801 get_weights = get_wsksvm_weights
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
802 save_weights = save_wsksvm_weights
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
803 svm_classify = wsksvm_classify
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
804 else:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
805 sys.stderr.write('..unknown kernel..\n')
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
806 sys.exit(0)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
807
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
808 A = B = 0
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
809 if options.ncv > 0:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
810 if options.quiet == False:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
811 sys.stderr.write('..Cross-validation\n')
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
812
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
813 cvlist = generate_cv_list(options.ncv, npos, nneg)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
814 labels_cv = []
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
815 preds_cv = []
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
816 sids_cv = []
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
817 indices_cv = []
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
818 for icv in xrange(options.ncv):
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
819 #split data into training and test set
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
820 seqs_tr, seqs_te = split_cv_list(cvlist, icv, seqs)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
821 labs_tr, labs_te = split_cv_list(cvlist, icv, labels)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
822 sids_tr, sids_te = split_cv_list(cvlist, icv, sids)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
823 indices_tr, indices_te = split_cv_list(cvlist, icv, range(len(seqs)))
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
824
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
825 #train SVM
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
826 feats_tr = get_features(seqs_tr, options)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
827 kernel_tr = get_kernel(feats_tr, options)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
828 svm_cv = svm_learn(kernel_tr, labs_tr, options)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
829
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
830 preds_cv = preds_cv + svm_classify(seqs_te, svm_cv, kernel_tr, feats_tr, options)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
831
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
832 labels_cv = labels_cv + labs_te
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
833 sids_cv = sids_cv + sids_te
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
834 indices_cv = indices_cv + indices_te
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
835
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
836 output_cvpred = options.outputname + "_cvpred.out"
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
837 prediction_results = sorted(zip(indices_cv, sids_cv, preds_cv, labels_cv), key=lambda p: p[0])
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
838 save_predictions(output_cvpred, prediction_results, cvlist)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
839
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
840 if options.posteriorp:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
841 A, B = LMAI(preds_cv, labels_cv, labels_cv.count(-1), labels_cv.count(1))
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
842
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
843 if options.quiet == False:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
844 sys.stderr.write('Estimated Parameters:\n')
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
845 sys.stderr.write(' A: ' + str(A) + '\n')
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
846 sys.stderr.write(' B: ' + str(B) + '\n')
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
847
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
848 if options.quiet == False:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
849 sys.stderr.write('..SVM weights\n')
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
850
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
851 feats = get_features(seqs, options)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
852 kernel = get_kernel(feats, options)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
853 svm = svm_learn(kernel, labels, options)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
854 w = get_weights(svm, feats, options)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
855 b = svm.get_bias()
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
856
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
857 save_weights(w, b, A, B, options)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
858
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
859 if __name__=='__main__': main()