comparison rDiff/src/locfit/m/locfit.m @ 0:0f80a5141704

version 0.3 uploaded
author vipints
date Thu, 14 Feb 2013 23:38:36 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:0f80a5141704
1 function fit=locfit(varargin)
2
3 % Smoothing noisy data using Local Regression and Likelihood.
4 %
5 % arguments still to add: dc maxit
6 %
7 % Usage: fit = locfit(x,y) % local regression fit of x and y.
8 % fit = locfit(x) % density estimation of x.
9 %
10 % Smoothing with locfit is a two-step procedure. The locfit()
11 % function evaluates the local regression smooth at a set of points
12 % (can be specified through an evaluation structure). Then, use
13 % the predict() function to interpolate this fit to other points.
14 %
15 % Additional arguments to locfit() are specified as 'name',value pairs, e.g.:
16 % locfit( x, 'alpha',[0.7,1.5] , 'family','rate' , 'ev','grid' , 'mg',100 );
17 %
18 %
19 % Data-related inputs:
20 %
21 % x is a vector or matrix of the independent (or predictor) variables.
22 % Rows of x represent subjects, columns represent variables.
23 % Generally, local regression would be used with 1-4 independent
24 % variables. In higher dimensions, the curse-of-dimensionality,
25 % as well as the difficulty of visualizing higher dimensional
26 % surfaces, may limit usefulness.
27 %
28 % y is the column vector of the dependent (or response) variable.
29 % For density families, 'y' is omitted.
30 % NOTE: x and y are the first two arguments. All other arguments require
31 % the 'name',value notation.
32 %
33 % 'weights' Prior weights for observations (reciprocal of variance, or
34 % sample size).
35 % 'cens' Censoring indicators for hazard rate or censored regression.
36 % The coding is '1' (or 'TRUE') for a censored observation, and
37 % '0' (or 'FALSE') for uncensored observations.
38 % 'base' Baseline parameter estimate. If a baseline is provided,
39 % the local regression model is fitted as
40 % Y_i = b_i + m(x_i) + epsilon_i,
41 % with Locfit estimating the m(x) term. For regression models,
42 % this effectively subtracts b_i from Y_i. The advantage of the
43 % 'base' formulation is that it extends to likelihood
44 % regression models.
45 % 'scale' A scale to apply to each variable. This is especially
46 % important for multivariate fitting, where variables may be
47 % measured in non-comparable units. It is also used to specify
48 % the frequency for variables with the 'a' (angular) style.
49 % 'sty' Character string (length d) of styles for each predictor variable.
50 % n denotes `normal'; a denotes angular (or periodic); l and r
51 % denotes one-sided left and right; c is conditionally parametric.
52 %
53 %
54 % Smoothing Parameters and Bandwidths:
55 % The bandwidth (or more accurately, half-width) of the smoothing window
56 % controls the amount of smoothing. Locfit allows specification of constant
57 % (fixed), nearest neighbor, certain locally adaptive variable bandwidths,
58 % and combinations of these. Also related to the smoothing parameter
59 % are the local polynmial degree and weight function.
60 %
61 % 'nn' 'Nearest neighbor' smoothing parameter. Specifying 'nn',0.5
62 % means that the width of each smoothing neighborhood is chosen
63 % to cover 50% of the data.
64 %
65 % 'h' A constant (or fixed) bandwidth parameter. For example, 'h',2
66 % means that the smoothing windows have constant half-width
67 % (or radius) 2. Note that h is applied after scaling.
68 %
69 % 'pen' penalty parameter for adaptive smoothing. Needs to be used
70 % with care.
71 %
72 % 'alpha' The old way of specifying smoothing parameters, as used in
73 % my book. alpha is equivalent to the vector [nn,h,pen].
74 % If multiple componenents are non-zero, the largest corresponding
75 % bandwidth is used. The default (if none of alpha,nn,h,pen
76 % are provided) is [0.7 0 0].
77 %
78 % 'deg' Degree of local polynomial. Default: 2 (local quadratic).
79 % Degrees 0 to 3 are supported by almost all parts of the
80 % Locfit code. Higher degrees may work in some cases.
81 %
82 % 'kern' Weight function, default = 'tcub'. Other choices are
83 % 'rect', 'trwt', 'tria', 'epan', 'bisq' and 'gauss'.
84 % Choices may be restricted when derivatives are
85 % required; e.g. for confidence bands and some bandwidth
86 % selectors.
87 %
88 % 'kt' Kernel type, 'sph' (default); 'prod'. In multivariate
89 % problems, 'prod' uses a simplified product model which
90 % speeds up computations.
91 %
92 % 'acri' Criterion for adaptive bandwidth selection.
93 %
94 %
95 % Derivative Estimation.
96 % Generally I recommend caution when using derivative estimation
97 % (and especially higher order derivative estimation) -- can you
98 % really estimate derivatives from noisy data? Any derivative
99 % estimate is inherently more dependent on an assumed smoothness
100 % (expressed through the bandwidth) than the data. Warnings aside...
101 %
102 % 'deriv' Derivative estimation. 'deriv',1 specifies the first derivative
103 % (or more correctly, an estimate of the local slope is returned.
104 % 'deriv',[1 1] specifies the second derivative. For bivariate fits
105 % 'deriv',2 specifies the first partial derivative wrt x2.
106 % 'deriv',[1 2] is mixed second-order derivative.
107 %
108 % Fitting family.
109 % 'family' is used to specify the local likelihood family.
110 % Regression-type families are 'gaussian', 'binomial',
111 % 'poisson', 'gamma' and 'geom'. If the family is preceded
112 % by a q (e.g. 'qgauss', or 'qpois') then quasi-likelihood is
113 % used; in particular, a dispersion estimate is computed.
114 % Preceding by an 'r' makes an attempt at robust (outlier-resistant)
115 % estimation. Combining q and r (e.g. 'family','qrpois') may
116 % work, if you're lucky.
117 % Density estimation-type families are 'dens', 'rate' and 'hazard'
118 % (hazard or failure rate). Note that `dens' scales the output
119 % to be a statistical density estimate (i.e. scaled to integrate
120 % to 1). 'rate' estimates the rate or intensity function (events
121 % per unit time, or events per unit area), which may be called
122 % density in some fields.
123 % The default family is 'qgauss' if a response (y argument) has been
124 % provided, and 'dens' if no response is given.
125 % 'link' Link function for local likelihood fitting. Depending on the
126 % family, choices may be 'ident', 'log', 'logit',
127 % 'inverse', 'sqrt' and 'arcsin'.
128 %
129 % Evaluation structures.
130 % By default, locfit chooses a set of points, depending on the data
131 % and smoothing parameters, to evaluate at. This is controlled by
132 % the evaluation structure.
133 % 'ev' Specify the evaluation structure. Default is 'tree'.
134 % Other choices include 'phull' (triangulation), 'grid' (a grid
135 % of points), 'data' (each data point), 'crossval' (data,
136 % but use leave-one-out cross validation), 'none' (no evaluation
137 % points, effectively producing the global parametric fit).
138 % Alternatively, a vector/matrix of evaluation points may be
139 % provided.
140 % (kd trees not currently supported in mlocfit)
141 % 'll' and 'ur' -- row vectors specifying the upper and lower limits
142 % for the bounding box used by the evaluation structure.
143 % They default to the data range.
144 % 'mg' For the 'grid' evaluation structure, 'mg' specifies the
145 % number of points on each margin. Default 10. Can be either a
146 % single number or vector.
147 % 'cut' Refinement parameter for adaptive partitions. Default 0.8;
148 % smaller values result in more refined partitions.
149 % 'maxk' Controls space assignment for evaluation structures. For the
150 % adaptive evaluation structures, it is impossible to be sure
151 % in advance how many vertices will be generated. If you get
152 % warnings about `Insufficient vertex space', Locfit's default
153 % assigment can be increased by increasing 'maxk'. The default
154 % is 'maxk','100'.
155 %
156 % 'xlim' For density estimation, Locfit allows the density to be
157 % supported on a bounded interval (or rectangle, in more than
158 % one dimension). The format should be [ll;ul] (ie, matrix with
159 % two rows, d columns) where ll is the lower left corner of
160 % the rectangle, and ur is the upper right corner.
161 % One-sided bounds, such as [0,infty), are not supported, but can be
162 % effectively specified by specifying a very large upper
163 % bound.
164 %
165 % 'module' either 'name' or {'name','/path/to/module',parameters}.
166 %
167 % Density Estimation
168 % 'renorm',1 will attempt to renormalize the local likelihood
169 % density estimate so that it integrates to 1. The llde
170 % (specified by 'family','dens') is scaled to estimate the
171 % density, but since the estimation is pointwise, there is
172 % no guarantee that the resulting density integrates exactly
173 % to 1. Renormalization attempts to achieve this.
174 %
175 % The output of locfit() is a Matlab structure:
176 %
177 % fit.data.x (n*d)
178 % fit.data.y (n*1)
179 % fit.data.weights (n*1 or 1*1)
180 % fit.data.censor (n*1 or 1*1)
181 % fit.data.baseline (n*1 or 1*1)
182 % fit.data.style (string length d)
183 % fit.data.scales (1*d)
184 % fit.data.xlim (2*d)
185 %
186 % fit.evaluation_structure.type (string)
187 % fit.evaluation_structure.module.name (string)
188 % fit.evaluation_structure.module.directory (string)
189 % fit.evaluation_structure.module.parameters (string)
190 % fit.evaluation_structure.lower_left (numeric 1*d)
191 % fit.evaluation_structure.upper_right (numeric 1*d)
192 % fit.evaluation_structure.grid (numeric 1*d)
193 % fit.evaluation_structure.cut (numeric 1*d)
194 % fit.evaluation_structure.maxk
195 % fit.evaluation_structure.derivative
196 %
197 % fit.smoothing_parameters.alpha = (nn h pen) vector
198 % fit.smoothing_parameters.adaptive_criterion (string)
199 % fit.smoothing_parameters.degree (numeric)
200 % fit.smoothing_parameters.family (string)
201 % fit.smoothing_parameters.link (string)
202 % fit.smoothing_parameters.kernel (string)
203 % fit.smoothing_parameters.kernel_type (string)
204 % fit.smoothing_parameters.deren
205 % fit.smoothing_parameters.deit
206 % fit.smoothing_parameters.demint
207 % fit.smoothing_parameters.debug
208 %
209 % fit.fit_points.evaluation_points (d*nv matrix)
210 % fit.fit_points.fitted_values (matrix, nv rows, many columns)
211 % fit.fit_points.evaluation_vectors.cell
212 % fit.fit_points.evaluation_vectors.splitvar
213 % fit.fit_points.evaluation_vectors.lo
214 % fit.fit_points.evaluation_vectors.hi
215 % fit.fit_points.fit_limits (d*2 matrix)
216 % fit.fit_points.family_link (numeric values)
217 % fit.fit_points.kappa (likelihood, degrees of freedom, etc)
218 %
219 % fit.parametric_component
220 %
221 %
222 % The OLD format:
223 %
224 % fit{1} = data.
225 % fit{2} = evaluation structure.
226 % fit{3} = smoothing parameter structure.
227 % fit{4}{1} = fit points matrix.
228 % fit{4}{2} = matrix of fitted values etc.
229 % Note that these are not back-transformed, and may have the
230 % parametric component removed.
231 % (exact content varies according to module).
232 % fit{4}{3} = various details of the evaluation points.
233 % fit{4}{4} = fit limits.
234 % fit{4}{5} = family,link.
235 % fit{5} = parametric component values.
236 %
237
238
239
240 % Minimal input validation
241 if nargin < 1
242 error( 'At least one input argument required' );
243 end
244
245 xdata = double(varargin{1});
246 d = size(xdata,2);
247 n = size(xdata,1);
248 if ((nargin>1) && (~ischar(varargin{2})))
249 ydata = double(varargin{2});
250 if (any(size(ydata) ~= [n 1])); error('y must be n*1 column vector'); end;
251 family = 'qgauss';
252 na = 3;
253 else
254 ydata = 0;
255 family = 'density';
256 na = 2;
257 end;
258 if mod(nargin-na,2)==0
259 error( 'All arguments other than x, y must be name,value pairs' );
260 end
261
262
263 wdata = ones(n,1);
264 cdata = 0;
265 base = 0;
266 style = 'n';
267 scale = 1;
268 xl = zeros(2,d);
269
270 alpha = [0 0 0];
271 deg = 2;
272 link = 'default';
273 acri = 'none';
274 kern = 'tcub';
275 kt = 'sph';
276 deren = 0;
277 deit = 'default';
278 demint= 20;
279 debug = 0;
280
281 ev = 'tree';
282 ll = zeros(1,d);
283 ur = zeros(1,d);
284 mg = 10;
285 maxk = 100;
286 deriv=0;
287 cut = 0.8;
288 mdl = struct('name','std', 'directory','', 'parameters',0 );
289
290 while na < length(varargin)
291 inc = 0;
292 if (varargin{na}=='y')
293 ydata = double(varargin{na+1});
294 family = 'qgauss';
295 inc = 2;
296 if (any(size(ydata) ~= [n 1])); error('y must be n*1 column vector'); end;
297 end
298 if (strcmp(varargin{na},'weights'))
299 wdata = double(varargin{na+1});
300 inc = 2;
301 if (any(size(wdata) ~= [n 1])); error('weights must be n*1 column vector'); end;
302 end
303 if (strcmp(varargin{na},'cens'))
304 cdata = double(varargin{na+1});
305 inc = 2;
306 if (any(size(cdata) ~= [n 1])); error('cens must be n*1 column vector'); end;
307 end
308 if (strcmp(varargin{na},'base')) % numeric vector, n*1 or 1*1.
309 base = double(varargin{na+1});
310 if (length(base)==1); base = base*ones(n,1); end;
311 inc = 2;
312 end
313 if (strcmp(varargin{na},'style')) % character string of length d.
314 style = varargin{na+1};
315 inc = 2;
316 end;
317 if (strcmp(varargin{na},'scale')) % row vector, length 1 or d.
318 scale = varargin{na+1};
319 if (scale==0)
320 scale = zeros(1,d);
321 for i=1:d
322 scale(i) = sqrt(var(xdata(:,i)));
323 end;
324 end;
325 inc = 2;
326 end;
327 if (strcmp(varargin{na},'xlim')) % 2*d numeric matrix.
328 xl = varargin{na+1};
329 inc = 2;
330 end
331 if (strcmp(varargin{na},'alpha')) % row vector of length 1, 2 or 3.
332 alpha = [varargin{na+1} 0 0 0];
333 alpha = alpha(1:3);
334 inc = 2;
335 end
336 if (strcmp(varargin{na},'nn')) % scalar
337 alpha(1) = varargin{na+1};
338 inc = 2;
339 end
340 if (strcmp(varargin{na},'h')) % scalar
341 alpha(2) = varargin{na+1};
342 inc = 2;
343 end;
344 if (strcmp(varargin{na},'pen')) % scalar
345 alpha(3) = varargin{na+1};
346 inc = 2;
347 end;
348 if (strcmp(varargin{na},'acri')) % string
349 acri = varargin{na+1};
350 inc = 2;
351 end
352 if (strcmp(varargin{na},'deg')) % positive integer.
353 deg = varargin{na+1};
354 inc = 2;
355 end;
356 if (strcmp(varargin{na},'family')) % character string.
357 family = varargin{na+1};
358 inc = 2;
359 end;
360 if (strcmp(varargin{na},'link')) % character string.
361 link = varargin{na+1};
362 inc = 2;
363 end;
364 if (strcmp(varargin{na},'kern')) % character string.
365 kern = varargin{na+1};
366 inc = 2;
367 end;
368 if (strcmp(varargin{na},'kt')) % character string.
369 kt = varargin{na+1};
370 inc = 2;
371 end;
372 if (strcmp(varargin{na},'ev')) % char. string, or matrix with d columns.
373 ev = varargin{na+1};
374 if (isnumeric(ev)); ev = ev'; end;
375 inc = 2;
376 end;
377 if (strcmp(varargin{na},'ll')) % row vector of length d.
378 ll = varargin{na+1};
379 inc = 2;
380 end;
381 if (strcmp(varargin{na},'ur')) % row vector of length d.
382 ur = varargin{na+1};
383 inc = 2;
384 end;
385 if (strcmp(varargin{na},'mg')) % row vector of length d.
386 mg = varargin{na+1};
387 inc = 2;
388 end;
389 if (strcmp(varargin{na},'cut')) % positive scalar.
390 cut = varargin{na+1};
391 inc = 2;
392 end;
393 if (strcmp(varargin{na},'module')) % string.
394 mdl = struct('name',varargin{na+1}, 'directory','', 'parameters',0 );
395 inc = 2;
396 end;
397 if (strcmp(varargin{na},'maxk')) % positive integer.
398 maxk = varargin{na+1};
399 inc = 2;
400 end;
401 if (strcmp(varargin{na},'deriv')) % numeric row vector, up to deg elements.
402 deriv = varargin{na+1};
403 inc = 2;
404 end;
405 if (strcmp(varargin{na},'renorm')) % density renormalization.
406 deren = varargin{na+1};
407 inc = 2;
408 end;
409 if (strcmp(varargin{na},'itype')) % density - integration type.
410 deit = varargin{na+1};
411 inc = 2;
412 end;
413 if (strcmp(varargin{na},'mint')) % density - # of integration points.
414 demint = varargin{na+1};
415 inc = 2;
416 end;
417 if (strcmp(varargin{na},'debug')) % debug level.
418 debug = varargin{na+1};
419 inc = 2;
420 end;
421 if (inc==0)
422 disp(varargin{na});
423 error('Unknown Input Argument.');
424 end;
425 na=na+inc;
426 end
427
428
429 fit.data.x = xdata;
430 fit.data.y = ydata;
431 fit.data.weights = wdata;
432 fit.data.censor = cdata;
433 fit.data.baseline = base;
434 fit.data.style = style;
435 fit.data.scales = scale;
436 fit.data.xlim = xl;
437
438 fit.evaluation_structure.type = ev;
439 fit.evaluation_structure.module = mdl;
440 fit.evaluation_structure.lower_left = ll;
441 fit.evaluation_structure.upper_right = ur;
442 fit.evaluation_structure.grid = mg;
443 fit.evaluation_structure.cut = cut;
444 fit.evaluation_structure.maxk = maxk;
445 fit.evaluation_structure.derivative = deriv;
446
447 if (alpha==0); alpha = [0.7 0 0]; end;
448
449 fit.smoothing_parameters.alpha = alpha;
450 fit.smoothing_parameters.adaptive_criterion = acri;
451 fit.smoothing_parameters.degree = deg;
452 fit.smoothing_parameters.family = family;
453 fit.smoothing_parameters.link = link;
454 fit.smoothing_parameters.kernel = kern;
455 fit.smoothing_parameters.kernel_type = kt;
456 fit.smoothing_parameters.deren = deren;
457 fit.smoothing_parameters.deit = deit;
458 fit.smoothing_parameters.demint = demint;
459 fit.smoothing_parameters.debug = debug;
460
461 [fpc pcomp] = mexlf(fit.data,fit.evaluation_structure,fit.smoothing_parameters);
462 fit.fit_points = fpc;
463 fit.parametric_component = pcomp;
464
465 return
466
467
468