/usr/share/octave/packages/statistics-1.3.0/regress.m is in octave-statistics 1.3.0-4.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 | ## Copyright (C) 2005, 2006 William Poetra Yoga Hadisoeseno
## Copyright (C) 2011 Nir Krakauer
##
## This program is free software; you can redistribute it and/or modify it under
## the terms of the GNU General Public License as published by the Free Software
## Foundation; either version 3 of the License, or (at your option) any later
## version.
##
## This program is distributed in the hope that it will be useful, but WITHOUT
## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
## details.
##
## You should have received a copy of the GNU General Public License along with
## this program; if not, see <http://www.gnu.org/licenses/>.
## -*- texinfo -*-
## @deftypefn {Function File} {[@var{b}, @var{bint}, @var{r}, @var{rint}, @var{stats}] =} regress (@var{y}, @var{X}, [@var{alpha}])
## Multiple Linear Regression using Least Squares Fit of @var{y} on @var{X}
## with the model @code{y = X * beta + e}.
##
## Here,
##
## @itemize
## @item
## @code{y} is a column vector of observed values
## @item
## @code{X} is a matrix of regressors, with the first column filled with
## the constant value 1
## @item
## @code{beta} is a column vector of regression parameters
## @item
## @code{e} is a column vector of random errors
## @end itemize
##
## Arguments are
##
## @itemize
## @item
## @var{y} is the @code{y} in the model
## @item
## @var{X} is the @code{X} in the model
## @item
## @var{alpha} is the significance level used to calculate the confidence
## intervals @var{bint} and @var{rint} (see `Return values' below). If not
## specified, ALPHA defaults to 0.05
## @end itemize
##
## Return values are
##
## @itemize
## @item
## @var{b} is the @code{beta} in the model
## @item
## @var{bint} is the confidence interval for @var{b}
## @item
## @var{r} is a column vector of residuals
## @item
## @var{rint} is the confidence interval for @var{r}
## @item
## @var{stats} is a row vector containing:
##
## @itemize
## @item The R^2 statistic
## @item The F statistic
## @item The p value for the full model
## @item The estimated error variance
## @end itemize
## @end itemize
##
## @var{r} and @var{rint} can be passed to @code{rcoplot} to visualize
## the residual intervals and identify outliers.
##
## NaN values in @var{y} and @var{X} are removed before calculation begins.
##
## @end deftypefn
## References:
## - Matlab 7.0 documentation (pdf)
## - ¡¶´óѧÊýѧʵÑé¡· ½ªÆôÔ´ µÈ (textbook)
## - http://www.netnam.vn/unescocourse/statistics/12_5.htm
## - wsolve.m in octave-forge
## - http://www.stanford.edu/class/ee263/ls_ln_matlab.pdf
function [b, bint, r, rint, stats] = regress (y, X, alpha)
if (nargin < 2 || nargin > 3)
print_usage;
endif
if (! ismatrix (y))
error ("regress: y must be a numeric matrix");
endif
if (! ismatrix (X))
error ("regress: X must be a numeric matrix");
endif
if (columns (y) != 1)
error ("regress: y must be a column vector");
endif
if (rows (y) != rows (X))
error ("regress: y and X must contain the same number of rows");
endif
if (nargin < 3)
alpha = 0.05;
elseif (! isscalar (alpha))
error ("regress: alpha must be a scalar value")
endif
notnans = ! logical (sum (isnan ([y X]), 2));
y = y(notnans);
X = X(notnans,:);
[Xq Xr] = qr (X, 0);
pinv_X = Xr \ Xq';
b = pinv_X * y;
if (nargout > 1)
n = rows (X);
p = columns (X);
dof = n - p;
t_alpha_2 = tinv (alpha / 2, dof);
r = y - X * b; # added -- Nir
SSE = sum (r .^ 2);
v = SSE / dof;
# c = diag(inv (X' * X)) using (economy) QR decomposition
# which means that we only have to use Xr
c = diag (inv (Xr' * Xr));
db = t_alpha_2 * sqrt (v * c);
bint = [b + db, b - db];
endif
if (nargout > 3)
dof1 = n - p - 1;
h = sum(X.*pinv_X', 2); #added -- Nir (same as diag(X*pinv_X), without doing the matrix multiply)
# From Matlab's documentation on Multiple Linear Regression,
# sigmaihat2 = norm (r) ^ 2 / dof1 - r .^ 2 / (dof1 * (1 - h));
# dr = -tinv (1 - alpha / 2, dof) * sqrt (sigmaihat2 .* (1 - h));
# Substitute
# norm (r) ^ 2 == sum (r .^ 2) == SSE
# -tinv (1 - alpha / 2, dof) == tinv (alpha / 2, dof) == t_alpha_2
# We get
# sigmaihat2 = (SSE - r .^ 2 / (1 - h)) / dof1;
# dr = t_alpha_2 * sqrt (sigmaihat2 .* (1 - h));
# Combine, we get
# dr = t_alpha_2 * sqrt ((SSE * (1 - h) - (r .^ 2)) / dof1);
dr = t_alpha_2 * sqrt ((SSE * (1 - h) - (r .^ 2)) / dof1);
rint = [r + dr, r - dr];
endif
if (nargout > 4)
R2 = 1 - SSE / sum ((y - mean (y)) .^ 2);
# F = (R2 / (p - 1)) / ((1 - R2) / dof);
F = dof / (p - 1) / (1 / R2 - 1);
pval = 1 - fcdf (F, p - 1, dof);
stats = [R2 F pval v];
endif
endfunction
%!test
%! % Longley data from the NIST Statistical Reference Dataset
%! Z = [ 60323 83.0 234289 2356 1590 107608 1947
%! 61122 88.5 259426 2325 1456 108632 1948
%! 60171 88.2 258054 3682 1616 109773 1949
%! 61187 89.5 284599 3351 1650 110929 1950
%! 63221 96.2 328975 2099 3099 112075 1951
%! 63639 98.1 346999 1932 3594 113270 1952
%! 64989 99.0 365385 1870 3547 115094 1953
%! 63761 100.0 363112 3578 3350 116219 1954
%! 66019 101.2 397469 2904 3048 117388 1955
%! 67857 104.6 419180 2822 2857 118734 1956
%! 68169 108.4 442769 2936 2798 120445 1957
%! 66513 110.8 444546 4681 2637 121950 1958
%! 68655 112.6 482704 3813 2552 123366 1959
%! 69564 114.2 502601 3931 2514 125368 1960
%! 69331 115.7 518173 4806 2572 127852 1961
%! 70551 116.9 554894 4007 2827 130081 1962 ];
%! % Results certified by NIST using 500 digit arithmetic
%! % b and standard error in b
%! V = [ -3482258.63459582 890420.383607373
%! 15.0618722713733 84.9149257747669
%! -0.358191792925910E-01 0.334910077722432E-01
%! -2.02022980381683 0.488399681651699
%! -1.03322686717359 0.214274163161675
%! -0.511041056535807E-01 0.226073200069370
%! 1829.15146461355 455.478499142212 ];
%! Rsq = 0.995479004577296;
%! F = 330.285339234588;
%! y = Z(:,1); X = [ones(rows(Z),1), Z(:,2:end)];
%! alpha = 0.05;
%! [b, bint, r, rint, stats] = regress (y, X, alpha);
%! assert(b,V(:,1),3e-6);
%! assert(stats(1),Rsq,1e-12);
%! assert(stats(2),F,3e-8);
%! assert(((bint(:,1)-bint(:,2))/2)/tinv(alpha/2,9),V(:,2),-1.e-5);
|