This file is indexed.

/usr/share/octave/packages/statistics-1.3.0/regress.m is in octave-statistics 1.3.0-4.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
## Copyright (C) 2005, 2006 William Poetra Yoga Hadisoeseno
## Copyright (C) 2011 Nir Krakauer
##
## This program is free software; you can redistribute it and/or modify it under
## the terms of the GNU General Public License as published by the Free Software
## Foundation; either version 3 of the License, or (at your option) any later
## version.
##
## This program is distributed in the hope that it will be useful, but WITHOUT
## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
## details.
##
## You should have received a copy of the GNU General Public License along with
## this program; if not, see <http://www.gnu.org/licenses/>.

## -*- texinfo -*-
## @deftypefn {Function File} {[@var{b}, @var{bint}, @var{r}, @var{rint}, @var{stats}] =} regress (@var{y}, @var{X}, [@var{alpha}])
## Multiple Linear Regression using Least Squares Fit of @var{y} on @var{X}
## with the model @code{y = X * beta + e}.
##
## Here,
##
## @itemize
## @item
## @code{y} is a column vector of observed values
## @item
## @code{X} is a matrix of regressors, with the first column filled with
## the constant value 1
## @item
## @code{beta} is a column vector of regression parameters
## @item
## @code{e} is a column vector of random errors
## @end itemize
##
## Arguments are
##
## @itemize
## @item
## @var{y} is the @code{y} in the model
## @item
## @var{X} is the @code{X} in the model
## @item
## @var{alpha} is the significance level used to calculate the confidence
## intervals @var{bint} and @var{rint} (see `Return values' below). If not
## specified, ALPHA defaults to 0.05
## @end itemize
##
## Return values are
##
## @itemize
## @item
## @var{b} is the @code{beta} in the model
## @item
## @var{bint} is the confidence interval for @var{b}
## @item
## @var{r} is a column vector of residuals
## @item
## @var{rint} is the confidence interval for @var{r}
## @item
## @var{stats} is a row vector containing:
##
##   @itemize
##   @item The R^2 statistic
##   @item The F statistic
##   @item The p value for the full model
##   @item The estimated error variance
##   @end itemize
## @end itemize
##
## @var{r} and @var{rint} can be passed to @code{rcoplot} to visualize
## the residual intervals and identify outliers.
##
## NaN values in @var{y} and @var{X} are removed before calculation begins.
##
## @end deftypefn

## References:
## - Matlab 7.0 documentation (pdf)
## - ¡¶´óѧÊýѧʵÑé¡· ½ªÆôÔ´ µÈ (textbook)
## - http://www.netnam.vn/unescocourse/statistics/12_5.htm
## - wsolve.m in octave-forge
## - http://www.stanford.edu/class/ee263/ls_ln_matlab.pdf

function [b, bint, r, rint, stats] = regress (y, X, alpha)

  if (nargin < 2 || nargin > 3)
    print_usage;
  endif

  if (! ismatrix (y))
    error ("regress: y must be a numeric matrix");
  endif
  if (! ismatrix (X))
    error ("regress: X must be a numeric matrix");
  endif

  if (columns (y) != 1)
    error ("regress: y must be a column vector");
  endif

  if (rows (y) != rows (X))
    error ("regress: y and X must contain the same number of rows");
  endif

  if (nargin < 3)
    alpha = 0.05;
  elseif (! isscalar (alpha))
    error ("regress: alpha must be a scalar value")
  endif

  notnans = ! logical (sum (isnan ([y X]), 2));
  y = y(notnans);
  X = X(notnans,:);

  [Xq Xr] = qr (X, 0);
  pinv_X = Xr \ Xq';

  b = pinv_X * y;

  if (nargout > 1)

    n = rows (X);
    p = columns (X);
    dof = n - p;
    t_alpha_2 = tinv (alpha / 2, dof);

    r = y - X * b; # added -- Nir
    SSE = sum (r .^ 2);
    v = SSE / dof;

    # c = diag(inv (X' * X)) using (economy) QR decomposition
    # which means that we only have to use Xr
    c = diag (inv (Xr' * Xr));

    db = t_alpha_2 * sqrt (v * c);

    bint = [b + db, b - db];

  endif

  if (nargout > 3)

    dof1 = n - p - 1;
    h = sum(X.*pinv_X', 2); #added -- Nir (same as diag(X*pinv_X), without doing the matrix multiply)

    # From Matlab's documentation on Multiple Linear Regression,
    #   sigmaihat2 = norm (r) ^ 2 / dof1 - r .^ 2 / (dof1 * (1 - h));
    #   dr = -tinv (1 - alpha / 2, dof) * sqrt (sigmaihat2 .* (1 - h));
    # Substitute
    #   norm (r) ^ 2 == sum (r .^ 2) == SSE
    #   -tinv (1 - alpha / 2, dof) == tinv (alpha / 2, dof) == t_alpha_2
    # We get
    #   sigmaihat2 = (SSE - r .^ 2 / (1 - h)) / dof1;
    #   dr = t_alpha_2 * sqrt (sigmaihat2 .* (1 - h));
    # Combine, we get
    #   dr = t_alpha_2 * sqrt ((SSE * (1 - h) - (r .^ 2)) / dof1);

    dr = t_alpha_2 * sqrt ((SSE * (1 - h) - (r .^ 2)) / dof1);

    rint = [r + dr, r - dr];

  endif

  if (nargout > 4)

    R2 = 1 - SSE / sum ((y - mean (y)) .^ 2);
#    F = (R2 / (p - 1)) / ((1 - R2) / dof);
    F = dof / (p - 1) / (1 / R2 - 1);
    pval = 1 - fcdf (F, p - 1, dof);

    stats = [R2 F pval v];

  endif

endfunction


%!test
%! % Longley data from the NIST Statistical Reference Dataset
%! Z = [  60323    83.0   234289   2356     1590    107608  1947
%!        61122    88.5   259426   2325     1456    108632  1948
%!        60171    88.2   258054   3682     1616    109773  1949
%!        61187    89.5   284599   3351     1650    110929  1950
%!        63221    96.2   328975   2099     3099    112075  1951
%!        63639    98.1   346999   1932     3594    113270  1952
%!        64989    99.0   365385   1870     3547    115094  1953
%!        63761   100.0   363112   3578     3350    116219  1954
%!        66019   101.2   397469   2904     3048    117388  1955
%!        67857   104.6   419180   2822     2857    118734  1956
%!        68169   108.4   442769   2936     2798    120445  1957
%!        66513   110.8   444546   4681     2637    121950  1958
%!        68655   112.6   482704   3813     2552    123366  1959
%!        69564   114.2   502601   3931     2514    125368  1960
%!        69331   115.7   518173   4806     2572    127852  1961
%!        70551   116.9   554894   4007     2827    130081  1962 ];
%! % Results certified by NIST using 500 digit arithmetic
%! % b and standard error in b
%! V = [  -3482258.63459582         890420.383607373
%!         15.0618722713733         84.9149257747669
%!        -0.358191792925910E-01    0.334910077722432E-01
%!        -2.02022980381683         0.488399681651699
%!        -1.03322686717359         0.214274163161675
%!        -0.511041056535807E-01    0.226073200069370
%!         1829.15146461355         455.478499142212 ];
%! Rsq = 0.995479004577296;
%! F = 330.285339234588;
%! y = Z(:,1); X = [ones(rows(Z),1), Z(:,2:end)];
%! alpha = 0.05;
%! [b, bint, r, rint, stats] = regress (y, X, alpha);
%! assert(b,V(:,1),3e-6);
%! assert(stats(1),Rsq,1e-12);
%! assert(stats(2),F,3e-8);
%! assert(((bint(:,1)-bint(:,2))/2)/tinv(alpha/2,9),V(:,2),-1.e-5);