package Statistics::LineFit;
use strict;
use Carp qw(carp);
BEGIN {
    use Exporter ();
    use vars qw($AUTHOR $VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
    $AUTHOR = 'Richard Anderson <cpan(AT)richardanderson(DOT)org>';
    @EXPORT = @EXPORT_OK = qw();
    %EXPORT_TAGS = ();
    @ISA = qw(Exporter);
    $VERSION = 0.06;
}

sub new {
    #
    # Purpose: Create a new Statistics::LineFit object
    #
    my ($caller, $validate, $hush) = @_;
    my $self = {
        doneRegress => 0,
        gotData     => 0,
        hush        => defined $hush ? $hush : 0,
        validate    => defined $validate ? $validate : 0,
    };
    bless $self, ref($caller) || $caller;
    return $self;
}

sub coefficients {
    #
    # Purpose: Return the intercept and slope from the least-squares line fit
    #
    my $self = shift;
    unless (defined $self->{intercept} and defined $self->{slope}) {
        $self->regress() or return;
    }
    return ($self->{intercept}, $self->{slope});
}

sub computeSums {
    #
    # Purpose: Compute sum of x, y, x**2, y**2 and x*y (private method)
    #
    my $self = shift;
    my ($sumX, $sumY, $sumXX, $sumYY, $sumXY) = (0, 0, 0, 0, 0);
    if (defined $self->{weight}) {
        for (my $i = 0; $i < $self->{numXY}; ++$i) {
            $sumX  += $self->{weight}[$i] * $self->{x}[$i];
            $sumY  += $self->{weight}[$i] * $self->{y}[$i];
            $sumXX += $self->{weight}[$i] * $self->{x}[$i] ** 2;
            $sumYY += $self->{weight}[$i] * $self->{y}[$i] ** 2;
            $sumXY += $self->{weight}[$i] * $self->{x}[$i] * $self->{y}[$i];
        }
    } else {
        for (my $i = 0; $i < $self->{numXY}; ++$i) {
            $sumX  += $self->{x}[$i];
            $sumY  += $self->{y}[$i];
            $sumXX += $self->{x}[$i] ** 2;
            $sumYY += $self->{y}[$i] ** 2;
            $sumXY += $self->{x}[$i] * $self->{y}[$i];
        }
    }
    return ($sumX, $sumY, $sumXX, $sumYY, $sumXY);
}

sub durbinWatson {
    #
    # Purpose: Return the Durbin-Watson statistic
    #
    my $self = shift;
    unless (defined $self->{durbinWatson}) {
        $self->regress() or return;
        my $sumErrDiff = 0;
        my $errorTMinus1 = $self->{y}[0] - ($self->{intercept}
            + $self->{slope} * $self->{x}[0]);
        for (my $i = 1; $i < $self->{numXY}; ++$i) {
            my $error = $self->{y}[$i] - ($self->{intercept}
                + $self->{slope} * $self->{x}[$i]);
            $sumErrDiff += ($error - $errorTMinus1) ** 2;
            $errorTMinus1 = $error;
        }
        $self->{durbinWatson} = $self->sumSqErrors() > 0 ?
            $sumErrDiff / $self->sumSqErrors() : 0;
    }
    return $self->{durbinWatson};
}

sub meanSqError {
    #
    # Purpose: Return the mean squared error
    #
    my $self = shift;
    unless (defined $self->{meanSqError}) {
        $self->regress() or return;
        $self->{meanSqError} = $self->sumSqErrors() / $self->{numXY};
    }
    return $self->{meanSqError};
}

sub predictedYs {
    #
    # Purpose: Return the predicted y values
    #
    my $self = shift;
    unless (defined $self->{predictedYs}) {
        $self->regress() or return;
        $self->{predictedYs} = [];
        for (my $i = 0; $i < $self->{numXY}; ++$i) {
            $self->{predictedYs}[$i] = $self->{intercept}
                + $self->{slope} * $self->{x}[$i];
        }
    }
    return @{$self->{predictedYs}};
}

sub regress {
    #
    # Purpose: Do weighted or unweighted least squares 2-D line fit (if needed)
    #
    # Description:
    # The equations below apply to both the weighted and unweighted fit: the
    # weights are normalized in setWeights(), so the sum of the weights is
    # equal to numXY.
    #
    my $self = shift;
    return $self->{regressOK} if $self->{doneRegress};
    unless ($self->{gotData}) {
        carp "No valid data input - can't do regression" unless $self->{hush};
        return 0;
    }
    my ($sumX, $sumY, $sumYY, $sumXY);
    ($sumX, $sumY, $self->{sumXX}, $sumYY, $sumXY) = $self->computeSums();
    $self->{sumSqDevX} = $self->{sumXX} - $sumX ** 2 / $self->{numXY};
    if ($self->{sumSqDevX} != 0) {
        $self->{sumSqDevY} = $sumYY - $sumY ** 2 / $self->{numXY};
        $self->{sumSqDevXY} = $sumXY - $sumX * $sumY / $self->{numXY};
        $self->{slope} = $self->{sumSqDevXY} / $self->{sumSqDevX};
        $self->{intercept} = ($sumY - $self->{slope} * $sumX) / $self->{numXY};
        $self->{regressOK} = 1;
    } else {
        carp "Can't fit line when x values are all equal" unless $self->{hush};
        $self->{sumXX} = $self->{sumSqDevX} = undef;
        $self->{regressOK} = 0;
    }
    $self->{doneRegress} = 1;
    return $self->{regressOK};
}

sub residuals {
    #
    # Purpose: Return the observed Y values minus the predicted Y values
    #
    my $self = shift;
    unless (defined $self->{residuals}) {
        $self->regress() or return;
        $self->{residuals} = [];
        for (my $i = 0; $i < $self->{numXY}; ++$i) {
            $self->{residuals}[$i] = $self->{y}[$i] - ($self->{intercept}
                + $self->{slope} * $self->{x}[$i]);
        }
    }
    return @{$self->{residuals}};
}

sub rSquared {
    #
    # Purpose: Return the square of the correlation coefficient (R squared)
    #
    my $self = shift;
    unless (defined $self->{rSquared}) {
        $self->regress() or return;
        my $denom = $self->{sumSqDevX} * $self->{sumSqDevY};
        $self->{rSquared} = $denom != 0 ? $self->{sumSqDevXY} ** 2 / $denom : 1;
    }
    return $self->{rSquared};
}

sub setData {
    #
    # Purpose: Initialize (x,y) values and optional weights
    #
    my ($self, $x, $y, $weights) = @_;
    $self->{doneRegress} = 0;
    $self->{x} = $self->{y} = $self->{numXY} = $self->{weight}
        = $self->{intercept} = $self->{slope} = $self->{rSquared}
        = $self->{sigma} = $self->{durbinWatson} = $self->{meanSqError}
        = $self->{sumSqErrors} = $self->{tStatInt} = $self->{tStatSlope}
        = $self->{predictedYs} = $self->{residuals} = $self->{sumXX}
        = $self->{sumSqDevX} = $self->{sumSqDevY} = $self->{sumSqDevXY}
        = undef;
    if (@$x < 2) {
        carp "Must input more than one data point!" unless $self->{hush};
        return 0;
    }
    $self->{numXY} = @$x;
    if (ref $x->[0]) {
        # The first argument is a 2-D array (an array of arrayrefs), so
        # the second argument holds the optional weights
        $self->setWeights($y) or return 0;
        $self->{x} = [ ];
        $self->{y} = [ ];
        foreach my $xy (@$x) {
            push @{$self->{x}}, $xy->[0];
            push @{$self->{y}}, $xy->[1];
        }
    } else {
        if (@$x != @$y) {
            carp "Length of x and y arrays must be equal!" unless $self->{hush};
            return 0;
        }
        $self->setWeights($weights) or return 0;
        $self->{x} = [ @$x ];
        $self->{y} = [ @$y ];
    }
    if ($self->{validate}) {
        unless ($self->validData()) {
            $self->{x} = $self->{y} = $self->{weight} = $self->{numXY} = undef;
            return 0;
        }
    }
    $self->{gotData} = 1;
    return 1;
}

sub setWeights {
    #
    # Purpose: Normalize and initialize line fit weighting factors
    #          (private method)
    #
    my ($self, $weights) = @_;
    return 1 unless defined $weights;
    if (@$weights != $self->{numXY}) {
        carp "Length of weight array must equal length of data array!"
            unless $self->{hush};
        return 0;
    }
    if ($self->{validate}) { $self->validWeights($weights) or return 0 }
    my $sumW = my $numNonZero = 0;
    foreach my $weight (@$weights) {
        if ($weight < 0) {
            carp "Weights must be non-negative numbers!" unless $self->{hush};
            return 0;
        }
        $sumW += $weight;
        if ($weight != 0) { ++$numNonZero }
    }
    if ($numNonZero < 2) {
        carp "At least two weights must be nonzero!" unless $self->{hush};
        return 0;
    }
    my $factor = @$weights / $sumW;
    # Normalize a copy of the weights so that the caller's array is not
    # modified (the POD promises the input values are copied first)
    $self->{weight} = [ map { $_ * $factor } @$weights ];
    return 1;
}

sub sigma {
    #
    # Purpose: Return the estimated homoscedastic standard deviation of the
    #          error term
    #
    my $self = shift;
    unless (defined $self->{sigma}) {
        $self->regress() or return;
        $self->{sigma} = $self->{numXY} > 2 ?
            sqrt($self->sumSqErrors() / ($self->{numXY} - 2)) : 0;
    }
    return $self->{sigma};
}

sub sumSqErrors {
    #
    # Purpose: Return the sum of the squared errors (private method)
    #
    my $self = shift;
    unless (defined $self->{sumSqErrors}) {
        $self->regress() or return;
        $self->{sumSqErrors} = $self->{sumSqDevY} - $self->{sumSqDevX}
            * $self->{slope} ** 2;
        if ($self->{sumSqErrors} < 0) { $self->{sumSqErrors} = 0 }
    }
    return $self->{sumSqErrors};
}

sub tStatistics {
    #
    # Purpose: Return the t statistics
    #
    my $self = shift;
    unless (defined $self->{tStatInt} and defined $self->{tStatSlope}) {
        $self->regress() or return;
        my $biasEstimateInt = $self->sigma() * sqrt($self->{sumXX}
            / ($self->{sumSqDevX} * $self->{numXY}));
        $self->{tStatInt} = $biasEstimateInt != 0 ?
            $self->{intercept} / $biasEstimateInt : 0;
        my $biasEstimateSlope = $self->sigma() / sqrt($self->{sumSqDevX});
        $self->{tStatSlope} = $biasEstimateSlope != 0 ?
            $self->{slope} / $biasEstimateSlope : 0;
    }
    return ($self->{tStatInt}, $self->{tStatSlope});
}

sub validData {
    #
    # Purpose: Verify that the input x-y data are numeric (private method)
    #
    my $self = shift;
    for (my $i = 0; $i < $self->{numXY}; ++$i) {
        if (not defined $self->{x}[$i]) {
            carp "Input x[$i] is not defined" unless $self->{hush};
            return 0;
        }
        if ($self->{x}[$i] !~
            /^([+-]?)(?=\d|\.\d)\d*(\.\d*)?([Ee]([+-]?\d+))?$/)
        {
            carp "Input x[$i] is not a number: $self->{x}[$i]"
                unless $self->{hush};
            return 0;
        }
        if (not defined $self->{y}[$i]) {
            carp "Input y[$i] is not defined" unless $self->{hush};
            return 0;
        }
        if ($self->{y}[$i] !~
            /^([+-]?)(?=\d|\.\d)\d*(\.\d*)?([Ee]([+-]?\d+))?$/)
        {
            carp "Input y[$i] is not a number: $self->{y}[$i]"
                unless $self->{hush};
            return 0;
        }
    }
    return 1;
}

sub validWeights {
    #
    # Purpose: Verify that the input weights are numeric (private method)
    #
    my ($self, $weights) = @_;
    for (my $i = 0; $i < @$weights; ++$i) {
        if (not defined $weights->[$i]) {
            carp "Input weights[$i] is not defined" unless $self->{hush};
            return 0;
        }
        if ($weights->[$i]
            !~ /^([+-]?)(?=\d|\.\d)\d*(\.\d*)?([Ee]([+-]?\d+))?$/)
        {
            carp "Input weights[$i] is not a number: $weights->[$i]"
                unless $self->{hush};
            return 0;
        }
    }
    return 1;
}

sub varianceOfEstimates {
    #
    # Purpose: Return the variances in the estimates of the intercept and slope
    #
    my $self = shift;
    unless (defined $self->{intercept} and defined $self->{slope}) {
        $self->regress() or return;
    }
    my @predictedYs = $self->predictedYs();
    my ($s, $sx, $sxx) = (0, 0, 0);
    if (defined $self->{weight}) {
        for (my $i = 0; $i < $self->{numXY}; ++$i) {
            # Use the squared residual at each point as the estimate of
            # the y variance at that point
            my $variance = ($predictedYs[$i] - $self->{y}[$i]) ** 2;
            next if 0 == $variance;
            $s += 1.0 / $variance;
            $sx += $self->{weight}[$i] * $self->{x}[$i] / $variance;
            $sxx += $self->{weight}[$i] * $self->{x}[$i] ** 2 / $variance;
        }
    } else {
        for (my $i = 0; $i < $self->{numXY}; ++$i) {
            my $variance = ($predictedYs[$i] - $self->{y}[$i]) ** 2;
            next if 0 == $variance;
            $s += 1.0 / $variance;
            $sx += $self->{x}[$i] / $variance;
            $sxx += $self->{x}[$i] ** 2 / $variance;
        }
    }
    my $denominator = ($s * $sxx - $sx ** 2);
    if (0 == $denominator) {
        return;
    } else {
        return ($sxx / $denominator, $s / $denominator);
    }
}

1;

__END__

=head1 NAME

Statistics::LineFit - Least squares line fit, weighted or unweighted

=head1 SYNOPSIS

  use Statistics::LineFit;
  $lineFit = Statistics::LineFit->new();
  $lineFit->setData(\@xValues, \@yValues) or die "Invalid data";
  ($intercept, $slope) = $lineFit->coefficients();
  defined $intercept or die "Can't fit line if x values are all equal";
  $rSquared = $lineFit->rSquared();
  $meanSquaredError = $lineFit->meanSqError();
  $durbinWatson = $lineFit->durbinWatson();
  $sigma = $lineFit->sigma();
  ($tStatIntercept, $tStatSlope) = $lineFit->tStatistics();
  @predictedYs = $lineFit->predictedYs();
  @residuals = $lineFit->residuals();
  ($varianceIntercept, $varianceSlope) = $lineFit->varianceOfEstimates();

=head1 DESCRIPTION

The Statistics::LineFit module does weighted or unweighted least-squares
line fitting to two-dimensional data (y = a + b * x). (This is also called
linear regression.) In addition to the slope and y-intercept, the module
can return the square of the correlation coefficient (R squared), the
Durbin-Watson statistic, the mean squared error, sigma, the t statistics,
the variances of the estimates of the slope and y-intercept,
the predicted y values and the residuals of the y values. (See the METHODS
section for a description of these statistics.)

The module accepts input data in separate x and y arrays or a single
2-D array (an array of arrayrefs). The optional weights are input in a
separate array. The module can optionally verify that the input data and
weights are valid numbers. If weights are input, the line fit minimizes
the weighted sum of the squared errors and the following statistics are
weighted: the correlation coefficient, the Durbin-Watson statistic, the
mean squared error, sigma and the t statistics.

The module is state-oriented and caches its results. Once you call the
setData() method, you can call the other methods in any order or call a
method several times without invoking redundant calculations. After calling
setData(), you can modify the input data or weights without affecting the
module's results.

The decision whether to use weighting should be based on a priori
knowledge of the data or on supplemental data. If the data is sparse or
contains non-random noise, weighting can degrade the solution. Weighting
is a good option if some points are suspect or less relevant (e.g., older
terms in a time series, points that are known to have more noise).

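As an illustration of the time-series case, here is a minimal sketch that
builds exponentially decaying weights and passes them to setData(). The
decay constant 0.9 and the data values are arbitrary choices for this
example, not module defaults:

  use Statistics::LineFit;
  my @x = (1 .. 10);
  my @y = (2.1, 4.0, 6.2, 7.9, 10.1, 12.0, 14.2, 15.9, 18.1, 20.0);

  # Weight recent points more heavily: weight[i] = 0.9 ** (n - 1 - i).
  # Only the relative sizes matter; setData() normalizes the weights.
  my @weights = map { 0.9 ** ($#x - $_) } 0 .. $#x;

  my $lineFit = Statistics::LineFit->new();
  $lineFit->setData(\@x, \@y, \@weights) or die "Invalid data";
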
=head1 ALGORITHM

The least-squares line is the line that minimizes the sum of the squares
of the y residuals:

  Minimize SUM((y[i] - (a + b * x[i])) ** 2)

Setting the partial derivatives with respect to a and b to zero yields a
solution that can be expressed in terms of the means, variances and
covariances of x and y:

  b = SUM((x[i] - meanX) * (y[i] - meanY)) / SUM((x[i] - meanX) ** 2)

  a = meanY - b * meanX

Note that a and b are undefined if all the x values are the same.

If you use weights, each term in the above sums is multiplied by the
value of the weight for that index. The program normalizes the weights
(after copying the input values) so that the sum of the weights equals
the number of points. This minimizes the differences between the weighted
and unweighted equations.

Statistics::LineFit uses equations that are mathematically equivalent to
the above equations and computationally more efficient. The module runs
in O(N) (linear time).

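The closed-form equations above can be checked directly against the
module. This sketch, using an arbitrary three-point dataset, computes b
and a from the means and compares them to coefficients():

  use Statistics::LineFit;
  my @x = (1, 2, 4);
  my @y = (3, 5, 10);
  my $n = @x;
  my ($meanX, $meanY) = (0, 0);
  $meanX += $_ / $n for @x;
  $meanY += $_ / $n for @y;
  my ($num, $den) = (0, 0);
  for my $i (0 .. $n - 1) {
      $num += ($x[$i] - $meanX) * ($y[$i] - $meanY);
      $den += ($x[$i] - $meanX) ** 2;
  }
  my $b = $num / $den;           # slope
  my $a = $meanY - $b * $meanX;  # intercept

  my $lineFit = Statistics::LineFit->new();
  $lineFit->setData(\@x, \@y);
  my ($intercept, $slope) = $lineFit->coefficients();
  # $slope should equal $b and $intercept should equal $a
  # (to within floating point roundoff)
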
=head1 LIMITATIONS

The regression fails if the input x values are all equal or the only unequal
x values have zero weights. This is an inherent limitation of fitting a line
of the form y = a + b * x. In this case, the module issues an error message
and methods that return statistical values will return undefined values.
You can also use the return value of the regress() method to check the
status of the regression.

As the sum of the squared deviations of the x values approaches zero,
the module's results become sensitive to the precision of floating point
operations on the host system.

If the x values are not all the same and the apparent "best fit" line is
vertical, the module will fit a horizontal line. For example, an input of
(1, 1), (1, 7), (2, 3), (2, 5) returns a slope of zero, an intercept of 4
and an R squared of zero. This is correct behavior because this line is the
best least-squares fit to the data for the given parameterization
(y = a + b * x). (A short sketch reproducing this case appears below.)

On a 32-bit system the results are accurate to about 11 significant digits,
depending on the input data. Many of the installation tests will fail
on a system with word lengths of 16 bits or fewer. (You might want to
upgrade your old 80286 IBM PC.)

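A minimal sketch reproducing the vertical-line example above:

  use Statistics::LineFit;
  my $lineFit = Statistics::LineFit->new();
  $lineFit->setData([1, 1, 2, 2], [1, 7, 3, 5]) or die "Invalid data";
  my ($intercept, $slope) = $lineFit->coefficients();
  # $slope is 0, $intercept is 4 and $lineFit->rSquared() is 0
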
=head1 EXAMPLES

=head2 Alternate calling sequence:

  use Statistics::LineFit;
  $lineFit = Statistics::LineFit->new();
  $lineFit->setData(\@x, \@y) or die "Invalid regression data\n";
  if (defined $lineFit->rSquared()
      and $lineFit->rSquared() > $threshold)
  {
      ($intercept, $slope) = $lineFit->coefficients();
      print "Slope: $slope  Y-intercept: $intercept\n";
  }

=head2 Multiple calls with same object, validate input, suppress error messages:

  use Statistics::LineFit;
  $lineFit = Statistics::LineFit->new(1, 1);
  while (1) {
      @xy = read2Dxy();  # User-supplied subroutine
      $lineFit->setData(\@xy);
      ($intercept, $slope) = $lineFit->coefficients();
      if (defined $intercept) {
          print "Slope: $slope  Y-intercept: $intercept\n";
      }
  }

=head1 METHODS

The module is state-oriented and caches its results. Once you call the
setData() method, you can call the other methods in any order or call
a method several times without invoking redundant calculations.

The regression fails if the x values are all the same. In this case,
the module issues an error message and methods that return statistical
values will return undefined values. You can also use the return value
of the regress() method to check the status of the regression.

=head2 new() - Create a new Statistics::LineFit object

  $lineFit = Statistics::LineFit->new();
  $lineFit = Statistics::LineFit->new($validate);
  $lineFit = Statistics::LineFit->new($validate, $hush);

  $validate = 1 -> Verify input data is numeric (slower execution)
            = 0 -> Don't verify input data (default, faster execution)
  $hush     = 1 -> Suppress error messages
            = 0 -> Enable error messages (default)

=head2 coefficients() - Return the y-intercept and slope

  ($intercept, $slope) = $lineFit->coefficients();

The returned list is undefined if the regression fails.

=head2 durbinWatson() - Return the Durbin-Watson statistic

  $durbinWatson = $lineFit->durbinWatson();

The Durbin-Watson test is a test for first-order autocorrelation in
the residuals of a time series regression. The Durbin-Watson statistic
has a range of 0 to 4; a value of 2 indicates there is no
autocorrelation.

The return value is undefined if the regression fails. If weights are
input, the return value is the weighted Durbin-Watson statistic.

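A usage sketch follows. The 1.5/2.5 cutoffs are illustrative
rule-of-thumb values, not part of this module; exact critical values
depend on the sample size and come from Durbin-Watson tables:

  my $dw = $lineFit->durbinWatson();
  defined $dw or die "Regression failed";
  if ($dw < 1.5) {
      print "Possible positive autocorrelation in the residuals\n";
  } elsif ($dw > 2.5) {
      print "Possible negative autocorrelation in the residuals\n";
  }
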
=head2 meanSqError() - Return the mean squared error

  $meanSquaredError = $lineFit->meanSqError();

The return value is undefined if the regression fails. If weights are
input, the return value is the weighted mean squared error.

=head2 predictedYs() - Return the predicted y values

  @predictedYs = $lineFit->predictedYs();

The returned list is undefined if the regression fails.

=head2 regress() - Do the least squares line fit (if not already done)

  $lineFit->regress() or die "Regression failed";

You don't need to call this method because it is invoked by the other
methods as needed. After you call setData(), you can call regress()
at any time to get the status of the regression for the current data.

=head2 residuals() - Return the input y values minus the predicted y values

  @residuals = $lineFit->residuals();

The returned list is undefined if the regression fails.

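For an unweighted fit, the mean of the squared residuals equals the
value returned by meanSqError(). This sketch (an identity check, not a
required calling pattern) recomputes the mean squared error by hand:

  my @residuals = $lineFit->residuals();
  my $mse = 0;
  $mse += $_ ** 2 / @residuals for @residuals;
  # $mse should match $lineFit->meanSqError() to within roundoff
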
=head2 rSquared() - Return the square of the correlation coefficient

  $rSquared = $lineFit->rSquared();

R squared, also called the square of the Pearson product-moment correlation
coefficient, is a measure of goodness-of-fit. It is the fraction of the
variation in Y that can be attributed to the variation in X. A perfect fit
will have an R squared of 1; fitting a line to the vertices of a
regular polygon will yield an R squared of zero. Graphical displays of data
with an R squared of less than about 0.1 do not show a visible linear trend.

The return value is undefined if the regression fails. If weights are
input, the return value is the weighted R squared.

=head2 setData() - Initialize (x,y) values and optional weights

  $lineFit->setData(\@x, \@y) or die "Invalid regression data";
  $lineFit->setData(\@x, \@y, \@weights) or die "Invalid regression data";
  $lineFit->setData(\@xy) or die "Invalid regression data";
  $lineFit->setData(\@xy, \@weights) or die "Invalid regression data";

@xy is an array of arrayrefs; x values are $xy[$i][0], y values are
$xy[$i][1]. (The module does not access any indices greater than $xy[$i][1],
so the arrayrefs can point to arrays that are longer than two elements.)
The method distinguishes the separate-array forms from the 2-D array forms
by checking whether the first element of the first argument is itself a
reference.

The optional weights array must be the same length as the data array(s).
The weights must be non-negative numbers; at least two of the weights
must be nonzero. Only the relative size of the weights is significant:
the program normalizes the weights (after copying the input values) so
that the sum of the weights equals the number of points. If you want to
do multiple line fits using the same weights, the weights must be passed
to each call to setData().

The method will return zero if the array lengths don't match, there are
fewer than two data points, any weights are negative or fewer than two of
the weights are nonzero. If the new() method was called with validate = 1,
the method will also verify that the data and weights are valid numbers.
Once you successfully call setData(), the next call to any method other than
new() or setData() invokes the regression. You can modify the input data
or weights after calling setData() without affecting the module's results.

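A short sketch of the 2-D array calling form with weights (the data
values are arbitrary):

  my @xy = ([1, 1.0], [2, 2.1], [3, 2.9], [4, 4.2]);
  my @weights = (1, 1, 1, 2);  # only relative sizes matter
  $lineFit->setData(\@xy, \@weights) or die "Invalid regression data";
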
=head2 sigma() - Return the standard error of the estimate

  $sigma = $lineFit->sigma();

Sigma is an estimate of the homoscedastic standard deviation of the
error. Sigma is also known as the standard error of the estimate.

The return value is undefined if the regression fails. If weights are
input, the return value is the weighted standard error.

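In terms of the quantities the module computes internally (see the
sigma() source), for more than two points:

  sigma = SQRT(sumSqErrors / (numXY - 2))

where sumSqErrors is the (weighted) sum of the squared y residuals and
numXY is the number of data points; sigma is defined as 0 when there
are only two points.
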
=head2 tStatistics() - Return the t statistics

  ($tStatIntercept, $tStatSlope) = $lineFit->tStatistics();

The t statistic, also called the t ratio or Wald statistic, is used to
accept or reject a hypothesis using a table of cutoff values computed from
the t distribution. The t statistic suggests that the estimated value is
(reasonable, too small, too large) when the t statistic is (close to zero,
large and positive, large and negative).

The returned list is undefined if the regression fails. If weights
are input, the returned values are the weighted t statistics.

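A usage sketch follows. The critical value 2.0 is an illustrative
large-sample approximation; a proper test would look up the t
distribution with numXY - 2 degrees of freedom:

  my ($tStatIntercept, $tStatSlope) = $lineFit->tStatistics();
  defined $tStatSlope or die "Regression failed";
  if (abs($tStatSlope) > 2.0) {
      print "The slope is probably significantly different from zero\n";
  }
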
=head2 varianceOfEstimates() - Return variances of estimates of intercept, slope

  ($varianceIntercept, $varianceSlope) = $lineFit->varianceOfEstimates();

Assuming the data are noisy or inaccurate, the intercept and slope returned
by the coefficients() method are only estimates of the true intercept and
slope. The varianceOfEstimates() method returns the variances of the
estimates of the intercept and slope, respectively. See Numerical Recipes
in C, section 15.2 (Fitting Data to a Straight Line), equation 15.2.9.

The returned list is undefined if the regression fails. If weights
are input, the returned values are the weighted variances.

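Since the returned values are variances, their square roots give the
standard errors of the estimates. A minimal sketch:

  my ($varianceIntercept, $varianceSlope) = $lineFit->varianceOfEstimates();
  if (defined $varianceIntercept) {
      my $seIntercept = sqrt($varianceIntercept);
      my $seSlope     = sqrt($varianceSlope);
      print "Intercept: standard error $seIntercept\n";
      print "Slope:     standard error $seSlope\n";
  }
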
=head1 SEE ALSO

 Mendenhall, W., and Sincich, T.L., 2003, A Second Course in Statistics:
   Regression Analysis, 6th ed., Prentice Hall.
 Press, W. H., Flannery, B. P., Teukolsky, S. A., Vetterling, W. T., 1992,
   Numerical Recipes in C : The Art of Scientific Computing, 2nd ed.,
   Cambridge University Press.
 The man page for perl(1).
 The CPAN modules Statistics::OLS, Statistics::GaussHelmert and
   Statistics::Regression.

Statistics::LineFit is simpler to use than Statistics::GaussHelmert or
Statistics::Regression. Statistics::LineFit was inspired by and borrows some
ideas from the venerable Statistics::OLS module.

The significant differences between Statistics::LineFit and
Statistics::OLS (version 0.07) are:

=over 4

=item B<Statistics::LineFit is more robust.>

Statistics::OLS returns incorrect results for certain input datasets.
Statistics::OLS does not deep copy its input arrays, which can lead
to subtle bugs. The Statistics::OLS installation test has only one
test and does not verify that the regression returns correct results.
In contrast, Statistics::LineFit has over 200 installation tests that use
various datasets/calling sequences to verify the accuracy of the
regression to within 1.0e-10.

=item B<Statistics::LineFit is faster.>

For a sequence of calls to new(), setData(\@x, \@y) and regress(),
Statistics::LineFit is faster than Statistics::OLS by factors of 2.0, 1.6
and 2.4 for array lengths of 5, 100 and 10000, respectively.

=item B<Statistics::LineFit can do weighted or unweighted regression.>

Statistics::OLS lacks this option.

=item B<Statistics::LineFit has a better interface.>

Once you call the Statistics::LineFit::setData() method, you can call the
other methods in any order and call methods multiple times without invoking
redundant calculations. Statistics::LineFit lets you enable or disable
data verification or error messages.

=item B<Statistics::LineFit has better code and documentation.>

The code in Statistics::LineFit is more readable, more object oriented and
more compliant with Perl coding standards than the code in Statistics::OLS.
The documentation for Statistics::LineFit is more detailed and complete.

=back

=head1 AUTHOR

Richard Anderson, cpan(AT)richardanderson(DOT)org,
http://www.richardanderson.org

=head1 LICENSE

This program is free software; you can redistribute it and/or modify it under
the same terms as Perl itself.

The full text of the license can be found in the LICENSE file included in
the distribution and available in the CPAN listing for Statistics::LineFit
(see www.cpan.org or search.cpan.org).

=head1 DISCLAIMER

To the maximum extent permitted by applicable law, the author of this
module disclaims all warranties, either express or implied, including
but not limited to implied warranties of merchantability and fitness for
a particular purpose, with regard to the software and the accompanying
documentation.

=cut