PGAP-1.2.1/Statistics/LineFit.pm
(Mercurial repository dereeper/pgap, changeset 0:83e62a1aeeeb draft, uploaded by dereeper, Thu, 24 Jun 2021 13:51:52 +0000)

package Statistics::LineFit;
use strict;
use Carp qw(carp);
BEGIN {
    use Exporter ();
    use vars qw ($AUTHOR $VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
    $AUTHOR = 'Richard Anderson <cpan(AT)richardanderson(DOT)org>';
    @EXPORT = @EXPORT_OK = qw();
    %EXPORT_TAGS = ();
    @ISA = qw(Exporter);
    $VERSION = 0.06;
}

sub new {
#
# Purpose: Create a new Statistics::LineFit object
#
    my ($caller, $validate, $hush) = @_;
    my $self = { doneRegress => 0,
                 gotData => 0,
                 hush => defined $hush ? $hush : 0,
                 validate => defined $validate ? $validate : 0,
               };
    bless $self, ref($caller) || $caller;
    return $self;
}

sub coefficients {
#
# Purpose: Return the slope and intercept from least squares line fit
#
    my $self = shift;
    unless (defined $self->{intercept} and defined $self->{slope}) {
        $self->regress() or return;
    }
    return ($self->{intercept}, $self->{slope});
}

sub computeSums {
#
# Purpose: Compute sum of x, y, x**2, y**2 and x*y (private method)
#
    my $self = shift;
    my ($sumX, $sumY, $sumXX, $sumYY, $sumXY) = (0, 0, 0, 0, 0);
    if (defined $self->{weight}) {
        for (my $i = 0; $i < $self->{numXY}; ++$i) {
            $sumX += $self->{weight}[$i] * $self->{x}[$i];
            $sumY += $self->{weight}[$i] * $self->{y}[$i];
            $sumXX += $self->{weight}[$i] * $self->{x}[$i] ** 2;
            $sumYY += $self->{weight}[$i] * $self->{y}[$i] ** 2;
            $sumXY += $self->{weight}[$i] * $self->{x}[$i]
                * $self->{y}[$i];
        }
    } else {
        for (my $i = 0; $i < $self->{numXY}; ++$i) {
            $sumX += $self->{x}[$i];
            $sumY += $self->{y}[$i];
            $sumXX += $self->{x}[$i] ** 2;
            $sumYY += $self->{y}[$i] ** 2;
            $sumXY += $self->{x}[$i] * $self->{y}[$i];
        }
    }
    return ($sumX, $sumY, $sumXX, $sumYY, $sumXY);
}

sub durbinWatson {
#
# Purpose: Return the Durbin-Watson statistic
#
    my $self = shift;
    unless (defined $self->{durbinWatson}) {
        $self->regress() or return;
        my $sumErrDiff = 0;
        my $errorTMinus1 = $self->{y}[0] - ($self->{intercept} + $self->{slope}
            * $self->{x}[0]);
        for (my $i = 1; $i < $self->{numXY}; ++$i) {
            my $error = $self->{y}[$i] - ($self->{intercept} + $self->{slope}
                * $self->{x}[$i]);
            $sumErrDiff += ($error - $errorTMinus1) ** 2;
            $errorTMinus1 = $error;
        }
        $self->{durbinWatson} = $self->sumSqErrors() > 0 ?
            $sumErrDiff / $self->sumSqErrors() : 0;
    }
    return $self->{durbinWatson};
}

sub meanSqError {
#
# Purpose: Return the mean squared error
#
    my $self = shift;
    unless (defined $self->{meanSqError}) {
        $self->regress() or return;
        $self->{meanSqError} = $self->sumSqErrors() / $self->{numXY};
    }
    return $self->{meanSqError};
}

sub predictedYs {
#
# Purpose: Return the predicted y values
#
    my $self = shift;
    unless (defined $self->{predictedYs}) {
        $self->regress() or return;
        $self->{predictedYs} = [];
        for (my $i = 0; $i < $self->{numXY}; ++$i) {
            $self->{predictedYs}[$i] = $self->{intercept}
                + $self->{slope} * $self->{x}[$i];
        }
    }
    return @{$self->{predictedYs}};
}

sub regress {
#
# Purpose: Do weighted or unweighted least squares 2-D line fit (if needed)
#
# Description:
# The equations below apply to both the weighted and unweighted fit: the
# weights are normalized in setWeights(), so the sum of the weights is
# equal to numXY.
#
    my $self = shift;
    return $self->{regressOK} if $self->{doneRegress};
    unless ($self->{gotData}) {
        carp "No valid data input - can't do regression" unless $self->{hush};
        return 0;
    }
    my ($sumX, $sumY, $sumYY, $sumXY);
    ($sumX, $sumY, $self->{sumXX}, $sumYY, $sumXY) = $self->computeSums();
    $self->{sumSqDevX} = $self->{sumXX} - $sumX ** 2 / $self->{numXY};
    if ($self->{sumSqDevX} != 0) {
        $self->{sumSqDevY} = $sumYY - $sumY ** 2 / $self->{numXY};
        $self->{sumSqDevXY} = $sumXY - $sumX * $sumY / $self->{numXY};
        $self->{slope} = $self->{sumSqDevXY} / $self->{sumSqDevX};
        $self->{intercept} = ($sumY - $self->{slope} * $sumX) / $self->{numXY};
        $self->{regressOK} = 1;
    } else {
        carp "Can't fit line when x values are all equal" unless $self->{hush};
        $self->{sumXX} = $self->{sumSqDevX} = undef;
        $self->{regressOK} = 0;
    }
    $self->{doneRegress} = 1;
    return $self->{regressOK};
}

sub residuals {
#
# Purpose: Return the observed y values minus the predicted y values
#
    my $self = shift;
    unless (defined $self->{residuals}) {
        $self->regress() or return;
        $self->{residuals} = [];
        for (my $i = 0; $i < $self->{numXY}; ++$i) {
            $self->{residuals}[$i] = $self->{y}[$i] - ($self->{intercept}
                + $self->{slope} * $self->{x}[$i]);
        }
    }
    return @{$self->{residuals}};
}

sub rSquared {
#
# Purpose: Return the square of the correlation coefficient
#
    my $self = shift;
    unless (defined $self->{rSquared}) {
        $self->regress() or return;
        my $denom = $self->{sumSqDevX} * $self->{sumSqDevY};
        $self->{rSquared} = $denom != 0 ? $self->{sumSqDevXY} ** 2 / $denom : 1;
    }
    return $self->{rSquared};
}

sub setData {
#
# Purpose: Initialize (x,y) values and optional weights
#
    my ($self, $x, $y, $weights) = @_;
    $self->{doneRegress} = 0;
    $self->{x} = $self->{y} = $self->{numXY} = $self->{weight}
        = $self->{intercept} = $self->{slope} = $self->{rSquared}
        = $self->{sigma} = $self->{durbinWatson} = $self->{meanSqError}
        = $self->{sumSqErrors} = $self->{tStatInt} = $self->{tStatSlope}
        = $self->{predictedYs} = $self->{residuals} = $self->{sumXX}
        = $self->{sumSqDevX} = $self->{sumSqDevY} = $self->{sumSqDevXY}
        = undef;
    if (@$x < 2) {
        carp "Must input more than one data point!" unless $self->{hush};
        return 0;
    }
    $self->{numXY} = @$x;
    if (ref $x->[0]) {
        $self->setWeights($y) or return 0;
        $self->{x} = [ ];
        $self->{y} = [ ];
        foreach my $xy (@$x) {
            push @{$self->{x}}, $xy->[0];
            push @{$self->{y}}, $xy->[1];
        }
    } else {
        if (@$x != @$y) {
            carp "Length of x and y arrays must be equal!" unless $self->{hush};
            return 0;
        }
        $self->setWeights($weights) or return 0;
        $self->{x} = [ @$x ];
        $self->{y} = [ @$y ];
    }
    if ($self->{validate}) {
        unless ($self->validData()) {
            $self->{x} = $self->{y} = $self->{weight} = $self->{numXY} = undef;
            return 0;
        }
    }
    $self->{gotData} = 1;
    return 1;
}

sub setWeights {
#
# Purpose: Normalize and initialize line fit weighting factors (private method)
#
    my ($self, $weights) = @_;
    return 1 unless defined $weights;
    if (@$weights != $self->{numXY}) {
        carp "Length of weight array must equal length of data array!"
            unless $self->{hush};
        return 0;
    }
    if ($self->{validate}) { $self->validWeights($weights) or return 0 }
    my $sumW = my $numNonZero = 0;
    foreach my $weight (@$weights) {
        if ($weight < 0) {
            carp "Weights must be non-negative numbers!" unless $self->{hush};
            return 0;
        }
        $sumW += $weight;
        if ($weight != 0) { ++$numNonZero }
    }
    if ($numNonZero < 2) {
        carp "At least two weights must be nonzero!" unless $self->{hush};
        return 0;
    }
    my $factor = @$weights / $sumW;
    foreach my $weight (@$weights) { $weight *= $factor }
    $self->{weight} = [ @$weights ];
    return 1;
}

sub sigma {
#
# Purpose: Return the estimated homoscedastic standard deviation of the
# error term
#
    my $self = shift;
    unless (defined $self->{sigma}) {
        $self->regress() or return;
        $self->{sigma} = $self->{numXY} > 2 ?
            sqrt($self->sumSqErrors() / ($self->{numXY} - 2)) : 0;
    }
    return $self->{sigma};
}

sub sumSqErrors {
#
# Purpose: Return the sum of the squared errors (private method)
#
    my $self = shift;
    unless (defined $self->{sumSqErrors}) {
        $self->regress() or return;
        $self->{sumSqErrors} = $self->{sumSqDevY} - $self->{sumSqDevX}
            * $self->{slope} ** 2;
        if ($self->{sumSqErrors} < 0) { $self->{sumSqErrors} = 0 }
    }
    return $self->{sumSqErrors};
}

sub tStatistics {
#
# Purpose: Return the T statistics
#
    my $self = shift;
    unless (defined $self->{tStatInt} and defined $self->{tStatSlope}) {
        $self->regress() or return;
        my $biasEstimateInt = $self->sigma() * sqrt($self->{sumXX}
            / ($self->{sumSqDevX} * $self->{numXY}));
        $self->{tStatInt} = $biasEstimateInt != 0 ?
            $self->{intercept} / $biasEstimateInt : 0;
        my $biasEstimateSlope = $self->sigma() / sqrt($self->{sumSqDevX});
        $self->{tStatSlope} = $biasEstimateSlope != 0 ?
            $self->{slope} / $biasEstimateSlope : 0;
    }
    return ($self->{tStatInt}, $self->{tStatSlope});
}

sub validData {
#
# Purpose: Verify that the input x-y data are numeric (private method)
#
    my $self = shift;
    for (my $i = 0; $i < $self->{numXY}; ++$i) {
        if (not defined $self->{x}[$i]) {
            carp "Input x[$i] is not defined" unless $self->{hush};
            return 0;
        }
        if ($self->{x}[$i] !~
            /^([+-]?)(?=\d|\.\d)\d*(\.\d*)?([Ee]([+-]?\d+))?$/)
        {
            carp "Input x[$i] is not a number: $self->{x}[$i]"
                unless $self->{hush};
            return 0;
        }
        if (not defined $self->{y}[$i]) {
            carp "Input y[$i] is not defined" unless $self->{hush};
            return 0;
        }
        if ($self->{y}[$i] !~
            /^([+-]?)(?=\d|\.\d)\d*(\.\d*)?([Ee]([+-]?\d+))?$/)
        {
            carp "Input y[$i] is not a number: $self->{y}[$i]"
                unless $self->{hush};
            return 0;
        }
    }
    return 1;
}

sub validWeights {
#
# Purpose: Verify that the input weights are numeric (private method)
#
    my ($self, $weights) = @_;
    for (my $i = 0; $i < @$weights; ++$i) {
        if (not defined $weights->[$i]) {
            carp "Input weights[$i] is not defined" unless $self->{hush};
            return 0;
        }
        if ($weights->[$i]
            !~ /^([+-]?)(?=\d|\.\d)\d*(\.\d*)?([Ee]([+-]?\d+))?$/)
        {
            carp "Input weights[$i] is not a number: $weights->[$i]"
                unless $self->{hush};
            return 0;
        }
    }
    return 1;
}

sub varianceOfEstimates {
#
# Purpose: Return the variances in the estimates of the intercept and slope
#
    my $self = shift;
    unless (defined $self->{intercept} and defined $self->{slope}) {
        $self->regress() or return;
    }
    my @predictedYs = $self->predictedYs();
    my ($s, $sx, $sxx) = (0, 0, 0);
    if (defined $self->{weight}) {
        for (my $i = 0; $i < $self->{numXY}; ++$i) {
            my $variance = ($predictedYs[$i] - $self->{y}[$i]) ** 2;
            next if 0 == $variance;
            $s += 1.0 / $variance;
            $sx += $self->{weight}[$i] * $self->{x}[$i] / $variance;
            $sxx += $self->{weight}[$i] * $self->{x}[$i] ** 2 / $variance;
        }
    } else {
        for (my $i = 0; $i < $self->{numXY}; ++$i) {
            my $variance = ($predictedYs[$i] - $self->{y}[$i]) ** 2;
            next if 0 == $variance;
            $s += 1.0 / $variance;
            $sx += $self->{x}[$i] / $variance;
            $sxx += $self->{x}[$i] ** 2 / $variance;
        }
    }
    my $denominator = ($s * $sxx - $sx ** 2);
    if (0 == $denominator) {
        return;
    } else {
        return ($sxx / $denominator, $s / $denominator);
    }
}

1;

__END__

=head1 NAME

Statistics::LineFit - Least squares line fit, weighted or unweighted

=head1 SYNOPSIS

 use Statistics::LineFit;
 $lineFit = Statistics::LineFit->new();
 $lineFit->setData (\@xValues, \@yValues) or die "Invalid data";
 ($intercept, $slope) = $lineFit->coefficients();
 defined $intercept or die "Can't fit line if x values are all equal";
 $rSquared = $lineFit->rSquared();
 $meanSquaredError = $lineFit->meanSqError();
 $durbinWatson = $lineFit->durbinWatson();
 $sigma = $lineFit->sigma();
 ($tStatIntercept, $tStatSlope) = $lineFit->tStatistics();
 @predictedYs = $lineFit->predictedYs();
 @residuals = $lineFit->residuals();
 ($varianceIntercept, $varianceSlope) = $lineFit->varianceOfEstimates();

=head1 DESCRIPTION

The Statistics::LineFit module does weighted or unweighted least-squares
line fitting to two-dimensional data (y = a + b * x). (This is also called
linear regression.) In addition to the slope and y-intercept, the module
can return the square of the correlation coefficient (R squared), the
Durbin-Watson statistic, the mean squared error, sigma, the t statistics,
the variance of the estimates of the slope and y-intercept, the predicted
y values and the residuals of the y values. (See the METHODS section for
a description of these statistics.)

The module accepts input data in separate x and y arrays or a single
2-D array (an array of arrayrefs). The optional weights are input in a
separate array. The module can optionally verify that the input data and
weights are valid numbers. If weights are input, the line fit minimizes
the weighted sum of the squared errors and the following statistics are
weighted: the correlation coefficient, the Durbin-Watson statistic, the
mean squared error, sigma and the t statistics.

The module is state-oriented and caches its results. Once you call the
setData() method, you can call the other methods in any order or call a
method several times without invoking redundant calculations. After calling
setData(), you can modify the input data or weights without affecting the
module's results.

The decision whether to use weighting can be based on a priori knowledge
of the data or on supplemental data. If the data are sparse or contain
non-random noise, weighting can degrade the solution. Weighting is a good
option if some points are suspect or less relevant (e.g., older terms in
a time series, points that are known to have more noise).

=head1 ALGORITHM

The least-squares line is the line that minimizes the sum of the squares
of the y residuals:

 Minimize SUM((y[i] - (a + b * x[i])) ** 2)

Setting the partial derivatives with respect to a and b to zero yields a
solution that can be expressed in terms of the means, variances and
covariances of x and y:

 b = SUM((x[i] - meanX) * (y[i] - meanY)) / SUM((x[i] - meanX) ** 2)

 a = meanY - b * meanX

Note that a and b are undefined if all the x values are the same.

If you use weights, each term in the above sums is multiplied by the
value of the weight for that index. The program normalizes the weights
(after copying the input values) so that the sum of the weights equals
the number of points. This minimizes the differences between the weighted
and unweighted equations.

Statistics::LineFit uses equations that are mathematically equivalent to
the above equations and computationally more efficient. The module runs
in O(N) (linear time).
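
As an illustration only (this is not the module's internal code, which
uses the equivalent sum-based forms), the closed-form equations above
translate directly into Perl for the unweighted case:

 # Hypothetical helper, for illustration; not part of Statistics::LineFit
 sub fitLineDirect {
     my ($x, $y) = @_;    # refs to arrays of equal length, >= 2 points
     my $n = @$x;
     my ($meanX, $meanY) = (0, 0);
     $meanX += $_ / $n foreach @$x;
     $meanY += $_ / $n foreach @$y;
     my ($sumDevXY, $sumDevXX) = (0, 0);
     for (my $i = 0; $i < $n; ++$i) {
         $sumDevXY += ($x->[$i] - $meanX) * ($y->[$i] - $meanY);
         $sumDevXX += ($x->[$i] - $meanX) ** 2;
     }
     return if $sumDevXX == 0;          # all x values equal: no unique fit
     my $b = $sumDevXY / $sumDevXX;     # slope
     my $a = $meanY - $b * $meanX;      # intercept
     return ($a, $b);
 }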

=head1 LIMITATIONS

The regression fails if the input x values are all equal or the only unequal
x values have zero weights. This is an inherent limit to fitting a line of
the form y = a + b * x. In this case, the module issues an error message
and methods that return statistical values will return undefined values.
You can also use the return value of the regress() method to check the
status of the regression.

As the sum of the squared deviations of the x values approaches zero,
the module's results become sensitive to the precision of floating point
operations on the host system.

If the x values are not all the same and the apparent "best fit" line is
vertical, the module will fit a horizontal line. For example, an input of
(1, 1), (1, 7), (2, 3), (2, 5) returns a slope of zero, an intercept of 4
and an R squared of zero. This is correct behavior because this line is the
best least-squares fit to the data for the given parameterization
(y = a + b * x).
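
This case can be reproduced directly with a few lines of code:

 use Statistics::LineFit;
 my $lineFit = Statistics::LineFit->new();
 $lineFit->setData([1, 1, 2, 2], [1, 7, 3, 5]) or die "Invalid data";
 my ($intercept, $slope) = $lineFit->coefficients();
 print "intercept=$intercept slope=$slope rSquared=",
     $lineFit->rSquared(), "\n";   # intercept=4 slope=0 rSquared=0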

On a 32-bit system the results are accurate to about 11 significant digits,
depending on the input data. Many of the installation tests will fail
on a system with word lengths of 16 bits or fewer. (You might want to
upgrade your old 80286 IBM PC.)

=head1 EXAMPLES

=head2 Alternate calling sequence:

 use Statistics::LineFit;
 $lineFit = Statistics::LineFit->new();
 $lineFit->setData(\@x, \@y) or die "Invalid regression data\n";
 if (defined $lineFit->rSquared()
     and $lineFit->rSquared() > $threshold)
 {
     ($intercept, $slope) = $lineFit->coefficients();
     print "Slope: $slope Y-intercept: $intercept\n";
 }

=head2 Multiple calls with same object, validate input, suppress error messages:

 use Statistics::LineFit;
 $lineFit = Statistics::LineFit->new(1, 1);
 while (1) {
     @xy = read2Dxy();    # User-supplied subroutine
     $lineFit->setData(\@xy);
     ($intercept, $slope) = $lineFit->coefficients();
     if (defined $intercept) {
         print "Slope: $slope Y-intercept: $intercept\n";
     }
 }

=head1 METHODS

The module is state-oriented and caches its results. Once you call the
setData() method, you can call the other methods in any order or call
a method several times without invoking redundant calculations.

The regression fails if the x values are all the same. In this case,
the module issues an error message and methods that return statistical
values will return undefined values. You can also use the return value
of the regress() method to check the status of the regression.

=head2 new() - create a new Statistics::LineFit object

 $lineFit = Statistics::LineFit->new();
 $lineFit = Statistics::LineFit->new($validate);
 $lineFit = Statistics::LineFit->new($validate, $hush);

 $validate = 1 -> Verify input data is numeric (slower execution)
             0 -> Don't verify input data (default, faster execution)
 $hush = 1 -> Suppress error messages
       = 0 -> Enable error messages (default)

=head2 coefficients() - Return the slope and y intercept

 ($intercept, $slope) = $lineFit->coefficients();

The returned list is undefined if the regression fails.

=head2 durbinWatson() - Return the Durbin-Watson statistic

 $durbinWatson = $lineFit->durbinWatson();

The Durbin-Watson test is a test for first-order autocorrelation in
the residuals of a time series regression. The Durbin-Watson statistic
has a range of 0 to 4; a value of 2 indicates there is no
autocorrelation.

The return value is undefined if the regression fails. If weights are
input, the return value is the weighted Durbin-Watson statistic.
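
In terms of the residuals e[i] = y[i] - (a + b * x[i]), the unweighted
statistic reduces to the following (the numerator sum starts at the
second point):

 D = SUM((e[i] - e[i-1]) ** 2) / SUM(e[i] ** 2)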

=head2 meanSqError() - Return the mean squared error

 $meanSquaredError = $lineFit->meanSqError();

The return value is undefined if the regression fails. If weights are
input, the return value is the weighted mean squared error.

=head2 predictedYs() - Return the predicted y values

 @predictedYs = $lineFit->predictedYs();

The returned list is undefined if the regression fails.

=head2 regress() - Do the least squares line fit (if not already done)

 $lineFit->regress() or die "Regression failed";

You don't need to call this method because it is invoked by the other
methods as needed. After you call setData(), you can call regress()
at any time to get the status of the regression for the current data.

=head2 residuals() - Return input y values minus predicted y values

 @residuals = $lineFit->residuals();

The returned list is undefined if the regression fails.

=head2 rSquared() - Return the square of the correlation coefficient

 $rSquared = $lineFit->rSquared();

R squared, also called the square of the Pearson product-moment correlation
coefficient, is a measure of goodness-of-fit. It is the fraction of the
variation in Y that can be attributed to the variation in X. A perfect fit
will have an R squared of 1; fitting a line to the vertices of a
regular polygon will yield an R squared of zero. Graphical displays of data
with an R squared of less than about 0.1 do not show a visible linear trend.

The return value is undefined if the regression fails. If weights are
input, the return value is the weighted R squared.
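
In the notation of the ALGORITHM section, the unweighted value this
method computes reduces to:

 R squared = SUM((x[i] - meanX) * (y[i] - meanY)) ** 2
             / (SUM((x[i] - meanX) ** 2) * SUM((y[i] - meanY) ** 2))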

=head2 setData() - Initialize (x,y) values and optional weights

 $lineFit->setData(\@x, \@y) or die "Invalid regression data";
 $lineFit->setData(\@x, \@y, \@weights) or die "Invalid regression data";
 $lineFit->setData(\@xy) or die "Invalid regression data";
 $lineFit->setData(\@xy, \@weights) or die "Invalid regression data";

@xy is an array of arrayrefs; x values are $xy[$i][0], y values are
$xy[$i][1]. (The module does not access any indices greater than $xy[$i][1],
so the arrayrefs can point to arrays that are longer than two elements.)
The method distinguishes the separate-array calling signatures from the
2-D array signatures by examining the first argument.

The optional weights array must be the same length as the data array(s).
The weights must be non-negative numbers; at least two of the weights
must be nonzero. Only the relative size of the weights is significant:
the program normalizes the weights (after copying the input values) so
that the sum of the weights equals the number of points. If you want to
do multiple line fits using the same weights, the weights must be passed
to each call to setData().

The method will return zero if the array lengths don't match, there are
fewer than two data points, any weights are negative or fewer than two of
the weights are nonzero. If the new() method was called with validate = 1,
the method will also verify that the data and weights are valid numbers.
Once you successfully call setData(), the next call to any method other than
new() or setData() invokes the regression. You can modify the input data
or weights after calling setData() without affecting the module's results.
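
For example, a fit using the 2-D array form with weights (the data and
weight values here are purely illustrative):

 my @xy = ([1, 1.9], [2, 4.1], [3, 6.1], [4, 7.8]);
 my @weights = (1, 1, 1, 0.5);    # trust the last point less
 $lineFit->setData(\@xy, \@weights) or die "Invalid regression data";
 my ($intercept, $slope) = $lineFit->coefficients();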

=head2 sigma() - Return the standard error of the estimate

 $sigma = $lineFit->sigma();

Sigma is an estimate of the homoscedastic standard deviation of the
error. Sigma is also known as the standard error of the estimate.

The return value is undefined if the regression fails. If weights are
input, the return value is the weighted standard error.
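
For N data points, the unweighted value this method computes reduces to
the following (for N <= 2 the method returns 0):

 sigma = sqrt(SUM((y[i] - (a + b * x[i])) ** 2) / (N - 2))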

=head2 tStatistics() - Return the t statistics

 ($tStatIntercept, $tStatSlope) = $lineFit->tStatistics();

The t statistic, also called the t ratio or Wald statistic, is used to
accept or reject a hypothesis using a table of cutoff values computed from
the t distribution. The t statistic suggests that the estimated value is
reasonable when the t statistic is close to zero, too small when the
t statistic is large and positive, and too large when the t statistic is
large and negative.

The returned list is undefined if the regression fails. If weights
are input, the returned values are the weighted t statistics.
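
In the notation of the ALGORITHM section, the unweighted t statistics
computed here are:

 tStatSlope     = b / (sigma / sqrt(SUM((x[i] - meanX) ** 2)))

 tStatIntercept = a / (sigma * sqrt(SUM(x[i] ** 2)
                  / (N * SUM((x[i] - meanX) ** 2))))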

=head2 varianceOfEstimates() - Return variances of estimates of intercept, slope

 ($varianceIntercept, $varianceSlope) = $lineFit->varianceOfEstimates();

Assuming the data are noisy or inaccurate, the intercept and slope returned
by the coefficients() method are only estimates of the true intercept and
slope. The varianceOfEstimates() method returns the variances of the
estimates of the intercept and slope, respectively. See Numerical Recipes
in C, section 15.2 (Fitting Data to a Straight Line), equation 15.2.9.

The returned list is undefined if the regression fails. If weights
are input, the returned values are the weighted variances.
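
The square roots of these variances are the standard errors of the
estimates. For example:

 my ($varInt, $varSlope) = $lineFit->varianceOfEstimates();
 if (defined $varInt) {
     my $seIntercept = sqrt($varInt);
     my $seSlope     = sqrt($varSlope);
     print "standard errors: intercept $seIntercept, slope $seSlope\n";
 }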

=head1 SEE ALSO

 Mendenhall, W., and Sincich, T.L., 2003, A Second Course in Statistics:
   Regression Analysis, 6th ed., Prentice Hall.
 Press, W. H., Flannery, B. P., Teukolsky, S. A., Vetterling, W. T., 1992,
   Numerical Recipes in C : The Art of Scientific Computing, 2nd ed.,
   Cambridge University Press.
 The man page for perl(1).
 The CPAN modules Statistics::OLS, Statistics::GaussHelmert and
   Statistics::Regression.

Statistics::LineFit is simpler to use than Statistics::GaussHelmert or
Statistics::Regression. Statistics::LineFit was inspired by and borrows some
ideas from the venerable Statistics::OLS module.

The significant differences between Statistics::LineFit and Statistics::OLS
(version 0.07) are:

=over 4

=item B<Statistics::LineFit is more robust.>

Statistics::OLS returns incorrect results for certain input datasets.
Statistics::OLS does not deep copy its input arrays, which can lead
to subtle bugs. The Statistics::OLS installation suite has only one
test and does not verify that the regression returns correct results.
In contrast, Statistics::LineFit has over 200 installation tests that use
various datasets/calling sequences to verify the accuracy of the
regression to within 1.0e-10.

=item B<Statistics::LineFit is faster.>

For a sequence of calls to new(), setData(\@x, \@y) and regress(),
Statistics::LineFit is faster than Statistics::OLS by factors of 2.0, 1.6
and 2.4 for array lengths of 5, 100 and 10000, respectively.

=item B<Statistics::LineFit can do weighted or unweighted regression.>

Statistics::OLS lacks this option.

=item B<Statistics::LineFit has a better interface.>

Once you call the Statistics::LineFit::setData() method, you can call the
other methods in any order and call methods multiple times without invoking
redundant calculations. Statistics::LineFit lets you enable or disable
data verification or error messages.

=item B<Statistics::LineFit has better code and documentation.>

The code in Statistics::LineFit is more readable, more object oriented and
more compliant with Perl coding standards than the code in Statistics::OLS.
The documentation for Statistics::LineFit is more detailed and complete.

=back

=head1 AUTHOR

Richard Anderson, cpan(AT)richardanderson(DOT)org,
http://www.richardanderson.org

=head1 LICENSE

This program is free software; you can redistribute it and/or modify it under
the same terms as Perl itself.

The full text of the license can be found in the LICENSE file included in
the distribution and available in the CPAN listing for Statistics::LineFit
(see www.cpan.org or search.cpan.org).

=head1 DISCLAIMER

To the maximum extent permitted by applicable law, the author of this
module disclaims all warranties, either express or implied, including
but not limited to implied warranties of merchantability and fitness for
a particular purpose, with regard to the software and the accompanying
documentation.

=cut