0
|
1 function fit=locfit(varargin)
|
|
2
|
|
3 % Smoothing noisy data using Local Regression and Likelihood.
|
|
4 %
|
|
5 % arguments still to add: dc maxit
|
|
6 %
|
|
7 % Usage: fit = locfit(x,y) % local regression fit of x and y.
|
|
8 % fit = locfit(x) % density estimation of x.
|
|
9 %
|
|
10 % Smoothing with locfit is a two-step procedure. The locfit()
|
|
11 % function evaluates the local regression smooth at a set of points
|
|
12 % (can be specified through an evaluation structure). Then, use
|
|
13 % the predict() function to interpolate this fit to other points.
|
|
14 %
|
|
15 % Additional arguments to locfit() are specified as 'name',value pairs, e.g.:
|
|
16 % locfit( x, 'alpha',[0.7,1.5] , 'family','rate' , 'ev','grid' , 'mg',100 );
|
|
17 %
|
|
18 %
|
|
19 % Data-related inputs:
|
|
20 %
|
|
21 % x is a vector or matrix of the independent (or predictor) variables.
|
|
22 % Rows of x represent subjects, columns represent variables.
|
|
23 % Generally, local regression would be used with 1-4 independent
|
|
24 % variables. In higher dimensions, the curse-of-dimensionality,
|
|
25 % as well as the difficulty of visualizing higher dimensional
|
|
26 % surfaces, may limit usefulness.
|
|
27 %
|
|
28 % y is the column vector of the dependent (or response) variable.
|
|
29 % For density families, 'y' is omitted.
|
|
30 % NOTE: x and y are the first two arguments. All other arguments require
|
|
31 % the 'name',value notation.
|
|
32 %
|
|
33 % 'weights' Prior weights for observations (reciprocal of variance, or
|
|
34 % sample size).
|
|
35 % 'cens' Censoring indicators for hazard rate or censored regression.
|
|
36 % The coding is '1' (or 'TRUE') for a censored observation, and
|
|
37 % '0' (or 'FALSE') for uncensored observations.
|
|
38 % 'base' Baseline parameter estimate. If a baseline is provided,
|
|
39 % the local regression model is fitted as
|
|
40 % Y_i = b_i + m(x_i) + epsilon_i,
|
|
41 % with Locfit estimating the m(x) term. For regression models,
|
|
42 % this effectively subtracts b_i from Y_i. The advantage of the
|
|
43 % 'base' formulation is that it extends to likelihood
|
|
44 % regression models.
|
|
45 % 'scale' A scale to apply to each variable. This is especially
|
|
46 % important for multivariate fitting, where variables may be
|
|
47 % measured in non-comparable units. It is also used to specify
|
|
48 % the frequency for variables with the 'a' (angular) style.
|
|
49 % 'sty' Character string (length d) of styles for each predictor variable.
|
|
50 % n denotes `normal'; a denotes angular (or periodic); l and r
|
|
51 % denotes one-sided left and right; c is conditionally parametric.
|
|
52 %
|
|
53 %
|
|
54 % Smoothing Parameters and Bandwidths:
|
|
55 % The bandwidth (or more accurately, half-width) of the smoothing window
|
|
56 % controls the amount of smoothing. Locfit allows specification of constant
|
|
57 % (fixed), nearest neighbor, certain locally adaptive variable bandwidths,
|
|
58 % and combinations of these. Also related to the smoothing parameter
|
|
59 % are the local polynmial degree and weight function.
|
|
60 %
|
|
61 % 'nn' 'Nearest neighbor' smoothing parameter. Specifying 'nn',0.5
|
|
62 % means that the width of each smoothing neighborhood is chosen
|
|
63 % to cover 50% of the data.
|
|
64 %
|
|
65 % 'h' A constant (or fixed) bandwidth parameter. For example, 'h',2
|
|
66 % means that the smoothing windows have constant half-width
|
|
67 % (or radius) 2. Note that h is applied after scaling.
|
|
68 %
|
|
69 % 'pen' penalty parameter for adaptive smoothing. Needs to be used
|
|
70 % with care.
|
|
71 %
|
|
72 % 'alpha' The old way of specifying smoothing parameters, as used in
|
|
73 % my book. alpha is equivalent to the vector [nn,h,pen].
|
|
74 % If multiple componenents are non-zero, the largest corresponding
|
|
75 % bandwidth is used. The default (if none of alpha,nn,h,pen
|
|
76 % are provided) is [0.7 0 0].
|
|
77 %
|
|
78 % 'deg' Degree of local polynomial. Default: 2 (local quadratic).
|
|
79 % Degrees 0 to 3 are supported by almost all parts of the
|
|
80 % Locfit code. Higher degrees may work in some cases.
|
|
81 %
|
|
82 % 'kern' Weight function, default = 'tcub'. Other choices are
|
|
83 % 'rect', 'trwt', 'tria', 'epan', 'bisq' and 'gauss'.
|
|
84 % Choices may be restricted when derivatives are
|
|
85 % required; e.g. for confidence bands and some bandwidth
|
|
86 % selectors.
|
|
87 %
|
|
88 % 'kt' Kernel type, 'sph' (default); 'prod'. In multivariate
|
|
89 % problems, 'prod' uses a simplified product model which
|
|
90 % speeds up computations.
|
|
91 %
|
|
92 % 'acri' Criterion for adaptive bandwidth selection.
|
|
93 %
|
|
94 %
|
|
95 % Derivative Estimation.
|
|
96 % Generally I recommend caution when using derivative estimation
|
|
97 % (and especially higher order derivative estimation) -- can you
|
|
98 % really estimate derivatives from noisy data? Any derivative
|
|
99 % estimate is inherently more dependent on an assumed smoothness
|
|
100 % (expressed through the bandwidth) than the data. Warnings aside...
|
|
101 %
|
|
102 % 'deriv' Derivative estimation. 'deriv',1 specifies the first derivative
|
|
103 % (or more correctly, an estimate of the local slope is returned.
|
|
104 % 'deriv',[1 1] specifies the second derivative. For bivariate fits
|
|
105 % 'deriv',2 specifies the first partial derivative wrt x2.
|
|
106 % 'deriv',[1 2] is mixed second-order derivative.
|
|
107 %
|
|
108 % Fitting family.
|
|
109 % 'family' is used to specify the local likelihood family.
|
|
110 % Regression-type families are 'gaussian', 'binomial',
|
|
111 % 'poisson', 'gamma' and 'geom'. If the family is preceded
|
|
112 % by a q (e.g. 'qgauss', or 'qpois') then quasi-likelihood is
|
|
113 % used; in particular, a dispersion estimate is computed.
|
|
114 % Preceding by an 'r' makes an attempt at robust (outlier-resistant)
|
|
115 % estimation. Combining q and r (e.g. 'family','qrpois') may
|
|
116 % work, if you're lucky.
|
|
117 % Density estimation-type families are 'dens', 'rate' and 'hazard'
|
|
118 % (hazard or failure rate). Note that `dens' scales the output
|
|
119 % to be a statistical density estimate (i.e. scaled to integrate
|
|
120 % to 1). 'rate' estimates the rate or intensity function (events
|
|
121 % per unit time, or events per unit area), which may be called
|
|
122 % density in some fields.
|
|
123 % The default family is 'qgauss' if a response (y argument) has been
|
|
124 % provided, and 'dens' if no response is given.
|
|
125 % 'link' Link function for local likelihood fitting. Depending on the
|
|
126 % family, choices may be 'ident', 'log', 'logit',
|
|
127 % 'inverse', 'sqrt' and 'arcsin'.
|
|
128 %
|
|
129 % Evaluation structures.
|
|
130 % By default, locfit chooses a set of points, depending on the data
|
|
131 % and smoothing parameters, to evaluate at. This is controlled by
|
|
132 % the evaluation structure.
|
|
133 % 'ev' Specify the evaluation structure. Default is 'tree'.
|
|
134 % Other choices include 'phull' (triangulation), 'grid' (a grid
|
|
135 % of points), 'data' (each data point), 'crossval' (data,
|
|
136 % but use leave-one-out cross validation), 'none' (no evaluation
|
|
137 % points, effectively producing the global parametric fit).
|
|
138 % Alternatively, a vector/matrix of evaluation points may be
|
|
139 % provided.
|
|
140 % (kd trees not currently supported in mlocfit)
|
|
141 % 'll' and 'ur' -- row vectors specifying the upper and lower limits
|
|
142 % for the bounding box used by the evaluation structure.
|
|
143 % They default to the data range.
|
|
144 % 'mg' For the 'grid' evaluation structure, 'mg' specifies the
|
|
145 % number of points on each margin. Default 10. Can be either a
|
|
146 % single number or vector.
|
|
147 % 'cut' Refinement parameter for adaptive partitions. Default 0.8;
|
|
148 % smaller values result in more refined partitions.
|
|
149 % 'maxk' Controls space assignment for evaluation structures. For the
|
|
150 % adaptive evaluation structures, it is impossible to be sure
|
|
151 % in advance how many vertices will be generated. If you get
|
|
152 % warnings about `Insufficient vertex space', Locfit's default
|
|
153 % assigment can be increased by increasing 'maxk'. The default
|
|
154 % is 'maxk','100'.
|
|
155 %
|
|
156 % 'xlim' For density estimation, Locfit allows the density to be
|
|
157 % supported on a bounded interval (or rectangle, in more than
|
|
158 % one dimension). The format should be [ll;ul] (ie, matrix with
|
|
159 % two rows, d columns) where ll is the lower left corner of
|
|
160 % the rectangle, and ur is the upper right corner.
|
|
161 % One-sided bounds, such as [0,infty), are not supported, but can be
|
|
162 % effectively specified by specifying a very large upper
|
|
163 % bound.
|
|
164 %
|
|
165 % 'module' either 'name' or {'name','/path/to/module',parameters}.
|
|
166 %
|
|
167 % Density Estimation
|
|
168 % 'renorm',1 will attempt to renormalize the local likelihood
|
|
169 % density estimate so that it integrates to 1. The llde
|
|
170 % (specified by 'family','dens') is scaled to estimate the
|
|
171 % density, but since the estimation is pointwise, there is
|
|
172 % no guarantee that the resulting density integrates exactly
|
|
173 % to 1. Renormalization attempts to achieve this.
|
|
174 %
|
|
175 % The output of locfit() is a Matlab structure:
|
|
176 %
|
|
177 % fit.data.x (n*d)
|
|
178 % fit.data.y (n*1)
|
|
179 % fit.data.weights (n*1 or 1*1)
|
|
180 % fit.data.censor (n*1 or 1*1)
|
|
181 % fit.data.baseline (n*1 or 1*1)
|
|
182 % fit.data.style (string length d)
|
|
183 % fit.data.scales (1*d)
|
|
184 % fit.data.xlim (2*d)
|
|
185 %
|
|
186 % fit.evaluation_structure.type (string)
|
|
187 % fit.evaluation_structure.module.name (string)
|
|
188 % fit.evaluation_structure.module.directory (string)
|
|
189 % fit.evaluation_structure.module.parameters (string)
|
|
190 % fit.evaluation_structure.lower_left (numeric 1*d)
|
|
191 % fit.evaluation_structure.upper_right (numeric 1*d)
|
|
192 % fit.evaluation_structure.grid (numeric 1*d)
|
|
193 % fit.evaluation_structure.cut (numeric 1*d)
|
|
194 % fit.evaluation_structure.maxk
|
|
195 % fit.evaluation_structure.derivative
|
|
196 %
|
|
197 % fit.smoothing_parameters.alpha = (nn h pen) vector
|
|
198 % fit.smoothing_parameters.adaptive_criterion (string)
|
|
199 % fit.smoothing_parameters.degree (numeric)
|
|
200 % fit.smoothing_parameters.family (string)
|
|
201 % fit.smoothing_parameters.link (string)
|
|
202 % fit.smoothing_parameters.kernel (string)
|
|
203 % fit.smoothing_parameters.kernel_type (string)
|
|
204 % fit.smoothing_parameters.deren
|
|
205 % fit.smoothing_parameters.deit
|
|
206 % fit.smoothing_parameters.demint
|
|
207 % fit.smoothing_parameters.debug
|
|
208 %
|
|
209 % fit.fit_points.evaluation_points (d*nv matrix)
|
|
210 % fit.fit_points.fitted_values (matrix, nv rows, many columns)
|
|
211 % fit.fit_points.evaluation_vectors.cell
|
|
212 % fit.fit_points.evaluation_vectors.splitvar
|
|
213 % fit.fit_points.evaluation_vectors.lo
|
|
214 % fit.fit_points.evaluation_vectors.hi
|
|
215 % fit.fit_points.fit_limits (d*2 matrix)
|
|
216 % fit.fit_points.family_link (numeric values)
|
|
217 % fit.fit_points.kappa (likelihood, degrees of freedom, etc)
|
|
218 %
|
|
219 % fit.parametric_component
|
|
220 %
|
|
221 %
|
|
222 % The OLD format:
|
|
223 %
|
|
224 % fit{1} = data.
|
|
225 % fit{2} = evaluation structure.
|
|
226 % fit{3} = smoothing parameter structure.
|
|
227 % fit{4}{1} = fit points matrix.
|
|
228 % fit{4}{2} = matrix of fitted values etc.
|
|
229 % Note that these are not back-transformed, and may have the
|
|
230 % parametric component removed.
|
|
231 % (exact content varies according to module).
|
|
232 % fit{4}{3} = various details of the evaluation points.
|
|
233 % fit{4}{4} = fit limits.
|
|
234 % fit{4}{5} = family,link.
|
|
235 % fit{5} = parametric component values.
|
|
236 %
|
|
237
|
|
238
|
|
239
|
|
240 % Minimal input validation
|
|
241 if nargin < 1
|
|
242 error( 'At least one input argument required' );
|
|
243 end
|
|
244
|
|
245 xdata = double(varargin{1});
|
|
246 d = size(xdata,2);
|
|
247 n = size(xdata,1);
|
|
248 if ((nargin>1) && (~ischar(varargin{2})))
|
|
249 ydata = double(varargin{2});
|
|
250 if (any(size(ydata) ~= [n 1])); error('y must be n*1 column vector'); end;
|
|
251 family = 'qgauss';
|
|
252 na = 3;
|
|
253 else
|
|
254 ydata = 0;
|
|
255 family = 'density';
|
|
256 na = 2;
|
|
257 end;
|
|
258 if mod(nargin-na,2)==0
|
|
259 error( 'All arguments other than x, y must be name,value pairs' );
|
|
260 end
|
|
261
|
|
262
|
|
263 wdata = ones(n,1);
|
|
264 cdata = 0;
|
|
265 base = 0;
|
|
266 style = 'n';
|
|
267 scale = 1;
|
|
268 xl = zeros(2,d);
|
|
269
|
|
270 alpha = [0 0 0];
|
|
271 deg = 2;
|
|
272 link = 'default';
|
|
273 acri = 'none';
|
|
274 kern = 'tcub';
|
|
275 kt = 'sph';
|
|
276 deren = 0;
|
|
277 deit = 'default';
|
|
278 demint= 20;
|
|
279 debug = 0;
|
|
280
|
|
281 ev = 'tree';
|
|
282 ll = zeros(1,d);
|
|
283 ur = zeros(1,d);
|
|
284 mg = 10;
|
|
285 maxk = 100;
|
|
286 deriv=0;
|
|
287 cut = 0.8;
|
|
288 mdl = struct('name','std', 'directory','', 'parameters',0 );
|
|
289
|
|
290 while na < length(varargin)
|
|
291 inc = 0;
|
|
292 if (varargin{na}=='y')
|
|
293 ydata = double(varargin{na+1});
|
|
294 family = 'qgauss';
|
|
295 inc = 2;
|
|
296 if (any(size(ydata) ~= [n 1])); error('y must be n*1 column vector'); end;
|
|
297 end
|
|
298 if (strcmp(varargin{na},'weights'))
|
|
299 wdata = double(varargin{na+1});
|
|
300 inc = 2;
|
|
301 if (any(size(wdata) ~= [n 1])); error('weights must be n*1 column vector'); end;
|
|
302 end
|
|
303 if (strcmp(varargin{na},'cens'))
|
|
304 cdata = double(varargin{na+1});
|
|
305 inc = 2;
|
|
306 if (any(size(cdata) ~= [n 1])); error('cens must be n*1 column vector'); end;
|
|
307 end
|
|
308 if (strcmp(varargin{na},'base')) % numeric vector, n*1 or 1*1.
|
|
309 base = double(varargin{na+1});
|
|
310 if (length(base)==1); base = base*ones(n,1); end;
|
|
311 inc = 2;
|
|
312 end
|
|
313 if (strcmp(varargin{na},'style')) % character string of length d.
|
|
314 style = varargin{na+1};
|
|
315 inc = 2;
|
|
316 end;
|
|
317 if (strcmp(varargin{na},'scale')) % row vector, length 1 or d.
|
|
318 scale = varargin{na+1};
|
|
319 if (scale==0)
|
|
320 scale = zeros(1,d);
|
|
321 for i=1:d
|
|
322 scale(i) = sqrt(var(xdata(:,i)));
|
|
323 end;
|
|
324 end;
|
|
325 inc = 2;
|
|
326 end;
|
|
327 if (strcmp(varargin{na},'xlim')) % 2*d numeric matrix.
|
|
328 xl = varargin{na+1};
|
|
329 inc = 2;
|
|
330 end
|
|
331 if (strcmp(varargin{na},'alpha')) % row vector of length 1, 2 or 3.
|
|
332 alpha = [varargin{na+1} 0 0 0];
|
|
333 alpha = alpha(1:3);
|
|
334 inc = 2;
|
|
335 end
|
|
336 if (strcmp(varargin{na},'nn')) % scalar
|
|
337 alpha(1) = varargin{na+1};
|
|
338 inc = 2;
|
|
339 end
|
|
340 if (strcmp(varargin{na},'h')) % scalar
|
|
341 alpha(2) = varargin{na+1};
|
|
342 inc = 2;
|
|
343 end;
|
|
344 if (strcmp(varargin{na},'pen')) % scalar
|
|
345 alpha(3) = varargin{na+1};
|
|
346 inc = 2;
|
|
347 end;
|
|
348 if (strcmp(varargin{na},'acri')) % string
|
|
349 acri = varargin{na+1};
|
|
350 inc = 2;
|
|
351 end
|
|
352 if (strcmp(varargin{na},'deg')) % positive integer.
|
|
353 deg = varargin{na+1};
|
|
354 inc = 2;
|
|
355 end;
|
|
356 if (strcmp(varargin{na},'family')) % character string.
|
|
357 family = varargin{na+1};
|
|
358 inc = 2;
|
|
359 end;
|
|
360 if (strcmp(varargin{na},'link')) % character string.
|
|
361 link = varargin{na+1};
|
|
362 inc = 2;
|
|
363 end;
|
|
364 if (strcmp(varargin{na},'kern')) % character string.
|
|
365 kern = varargin{na+1};
|
|
366 inc = 2;
|
|
367 end;
|
|
368 if (strcmp(varargin{na},'kt')) % character string.
|
|
369 kt = varargin{na+1};
|
|
370 inc = 2;
|
|
371 end;
|
|
372 if (strcmp(varargin{na},'ev')) % char. string, or matrix with d columns.
|
|
373 ev = varargin{na+1};
|
|
374 if (isnumeric(ev)); ev = ev'; end;
|
|
375 inc = 2;
|
|
376 end;
|
|
377 if (strcmp(varargin{na},'ll')) % row vector of length d.
|
|
378 ll = varargin{na+1};
|
|
379 inc = 2;
|
|
380 end;
|
|
381 if (strcmp(varargin{na},'ur')) % row vector of length d.
|
|
382 ur = varargin{na+1};
|
|
383 inc = 2;
|
|
384 end;
|
|
385 if (strcmp(varargin{na},'mg')) % row vector of length d.
|
|
386 mg = varargin{na+1};
|
|
387 inc = 2;
|
|
388 end;
|
|
389 if (strcmp(varargin{na},'cut')) % positive scalar.
|
|
390 cut = varargin{na+1};
|
|
391 inc = 2;
|
|
392 end;
|
|
393 if (strcmp(varargin{na},'module')) % string.
|
|
394 mdl = struct('name',varargin{na+1}, 'directory','', 'parameters',0 );
|
|
395 inc = 2;
|
|
396 end;
|
|
397 if (strcmp(varargin{na},'maxk')) % positive integer.
|
|
398 maxk = varargin{na+1};
|
|
399 inc = 2;
|
|
400 end;
|
|
401 if (strcmp(varargin{na},'deriv')) % numeric row vector, up to deg elements.
|
|
402 deriv = varargin{na+1};
|
|
403 inc = 2;
|
|
404 end;
|
|
405 if (strcmp(varargin{na},'renorm')) % density renormalization.
|
|
406 deren = varargin{na+1};
|
|
407 inc = 2;
|
|
408 end;
|
|
409 if (strcmp(varargin{na},'itype')) % density - integration type.
|
|
410 deit = varargin{na+1};
|
|
411 inc = 2;
|
|
412 end;
|
|
413 if (strcmp(varargin{na},'mint')) % density - # of integration points.
|
|
414 demint = varargin{na+1};
|
|
415 inc = 2;
|
|
416 end;
|
|
417 if (strcmp(varargin{na},'debug')) % debug level.
|
|
418 debug = varargin{na+1};
|
|
419 inc = 2;
|
|
420 end;
|
|
421 if (inc==0)
|
|
422 disp(varargin{na});
|
|
423 error('Unknown Input Argument.');
|
|
424 end;
|
|
425 na=na+inc;
|
|
426 end
|
|
427
|
|
428
|
|
429 fit.data.x = xdata;
|
|
430 fit.data.y = ydata;
|
|
431 fit.data.weights = wdata;
|
|
432 fit.data.censor = cdata;
|
|
433 fit.data.baseline = base;
|
|
434 fit.data.style = style;
|
|
435 fit.data.scales = scale;
|
|
436 fit.data.xlim = xl;
|
|
437
|
|
438 fit.evaluation_structure.type = ev;
|
|
439 fit.evaluation_structure.module = mdl;
|
|
440 fit.evaluation_structure.lower_left = ll;
|
|
441 fit.evaluation_structure.upper_right = ur;
|
|
442 fit.evaluation_structure.grid = mg;
|
|
443 fit.evaluation_structure.cut = cut;
|
|
444 fit.evaluation_structure.maxk = maxk;
|
|
445 fit.evaluation_structure.derivative = deriv;
|
|
446
|
|
447 if (alpha==0); alpha = [0.7 0 0]; end;
|
|
448
|
|
449 fit.smoothing_parameters.alpha = alpha;
|
|
450 fit.smoothing_parameters.adaptive_criterion = acri;
|
|
451 fit.smoothing_parameters.degree = deg;
|
|
452 fit.smoothing_parameters.family = family;
|
|
453 fit.smoothing_parameters.link = link;
|
|
454 fit.smoothing_parameters.kernel = kern;
|
|
455 fit.smoothing_parameters.kernel_type = kt;
|
|
456 fit.smoothing_parameters.deren = deren;
|
|
457 fit.smoothing_parameters.deit = deit;
|
|
458 fit.smoothing_parameters.demint = demint;
|
|
459 fit.smoothing_parameters.debug = debug;
|
|
460
|
|
461 [fpc pcomp] = mexlf(fit.data,fit.evaluation_structure,fit.smoothing_parameters);
|
|
462 fit.fit_points = fpc;
|
|
463 fit.parametric_component = pcomp;
|
|
464
|
|
465 return
|
|
466
|
|
467
|
|
468
|