/usr/share/pyshared/statsmodels/sandbox/sysreg.py is in python-statsmodels 0.4.2-1.2.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 | from statsmodels.regression.linear_model import GLS
import numpy as np
import statsmodels.tools.tools as tools
from statsmodels.base.model import LikelihoodModelResults
from scipy import sparse
#http://www.irisa.fr/aladin/wg-statlin/WORKSHOPS/RENNES02/SLIDES/Foschi.pdf
__all__ = ['SUR', 'Sem2SLS']
#probably should have a SystemModel superclass
# TODO: does it make sense of SUR equations to have
# independent endogenous regressors? If so, then
# change docs to LHS = RHS
#TODO: make a dictionary that holds equation specific information
#rather than these cryptic lists? Slower to get a dict value?
#TODO: refine sigma definition
class SUR(object):
"""
Seemingly Unrelated Regression
Parameters
----------
sys : list
[endog1, exog1, endog2, exog2,...] It will be of length 2 x M,
where M is the number of equations endog = exog.
sigma : array-like
M x M array where sigma[i,j] is the covariance between equation i and j
dfk : None, 'dfk1', or 'dfk2'
Default is None. Correction for the degrees of freedom
should be specified for small samples. See the notes for more
information.
Attributes
----------
cholsigmainv : array
The transpose of the Cholesky decomposition of `pinv_wexog`
df_model : array
Model degrees of freedom of each equation. p_{m} - 1 where p is
the number of regressors for each equation m and one is subtracted
for the constant.
df_resid : array
Residual degrees of freedom of each equation. Number of observations
less the number of parameters.
endog : array
The LHS variables for each equation in the system.
It is a M x nobs array where M is the number of equations.
exog : array
The RHS variable for each equation in the system.
It is a nobs x sum(p_{m}) array. Which is just each
RHS array stacked next to each other in columns.
history : dict
Contains the history of fitting the model. Probably not of interest
if the model is fit with `igls` = False.
iterations : int
The number of iterations until convergence if the model is fit
iteratively.
nobs : float
The number of observations of the equations.
normalized_cov_params : array
sum(p_{m}) x sum(p_{m}) array
:math:`\\left[X^{T}\\left(\\Sigma^{-1}\\otimes\\boldsymbol{I}\\right)X\\right]^{-1}`
pinv_wexog : array
The pseudo-inverse of the `wexog`
sigma : array
M x M covariance matrix of the cross-equation disturbances. See notes.
sp_exog : CSR sparse matrix
Contains a block diagonal sparse matrix of the design so that
exog1 ... exogM are on the diagonal.
wendog : array
M * nobs x 1 array of the endogenous variables whitened by
`cholsigmainv` and stacked into a single column.
wexog : array
M*nobs x sum(p_{m}) array of the whitened exogenous variables.
Notes
-----
All individual equations are assumed to be well-behaved, homoeskedastic
iid errors. This is basically an extension of GLS, using sparse matrices.
.. math:: \\Sigma=\\left[\\begin{array}{cccc}
\\sigma_{11} & \\sigma_{12} & \\cdots & \\sigma_{1M}\\\\
\\sigma_{21} & \\sigma_{22} & \\cdots & \\sigma_{2M}\\\\
\\vdots & \\vdots & \\ddots & \\vdots\\\\
\\sigma_{M1} & \\sigma_{M2} & \\cdots & \\sigma_{MM}\\end{array}\\right]
References
----------
Zellner (1962), Greene (2003)
"""
#TODO: Does each equation need nobs to be the same?
def __init__(self, sys, sigma=None, dfk=None):
if len(sys) % 2 != 0:
raise ValueError("sys must be a list of pairs of endogenous and \
exogenous variables. Got length %s" % len(sys))
if dfk:
if not dfk.lower() in ['dfk1','dfk2']:
raise ValueError("dfk option %s not understood" % (dfk))
self._dfk = dfk
M = len(sys[1::2])
self._M = M
# exog = np.zeros((M,M), dtype=object)
# for i,eq in enumerate(sys[1::2]):
# exog[i,i] = np.asarray(eq) # not sure this exog is needed
# used to compute resids for now
exog = np.column_stack(np.asarray(sys[1::2][i]) for i in range(M))
# exog = np.vstack(np.asarray(sys[1::2][i]) for i in range(M))
self.exog = exog # 2d ndarray exog is better
# Endog, might just go ahead and reshape this?
endog = np.asarray(sys[::2])
self.endog = endog
self.nobs = float(self.endog[0].shape[0]) # assumes all the same length
# Degrees of Freedom
df_resid = []
df_model = []
[df_resid.append(self.nobs - tools.rank(_)) \
for _ in sys[1::2]]
[df_model.append(tools.rank(_) - 1) for _ in sys[1::2]]
self.df_resid = np.asarray(df_resid)
self.df_model = np.asarray(df_model)
# "Block-diagonal" sparse matrix of exog
sp_exog = sparse.lil_matrix((int(self.nobs*M),
int(np.sum(self.df_model+1)))) # linked lists to build
self._cols = np.cumsum(np.hstack((0, self.df_model+1)))
for i in range(M):
sp_exog[i*self.nobs:(i+1)*self.nobs,
self._cols[i]:self._cols[i+1]] = sys[1::2][i]
self.sp_exog = sp_exog.tocsr() # cast to compressed for efficiency
# Deal with sigma, check shape earlier if given
if np.any(sigma):
sigma = np.asarray(sigma) # check shape
elif sigma == None:
resids = []
for i in range(M):
resids.append(GLS(endog[i],exog[:,
self._cols[i]:self._cols[i+1]]).fit().resid)
resids = np.asarray(resids).reshape(M,-1)
sigma = self._compute_sigma(resids)
self.sigma = sigma
self.cholsigmainv = np.linalg.cholesky(np.linalg.pinv(\
self.sigma)).T
self.initialize()
def initialize(self):
self.wendog = self.whiten(self.endog)
self.wexog = self.whiten(self.sp_exog)
self.pinv_wexog = np.linalg.pinv(self.wexog)
self.normalized_cov_params = np.dot(self.pinv_wexog,
np.transpose(self.pinv_wexog))
self.history = {'params' : [np.inf]}
self.iterations = 0
def _update_history(self, params):
self.history['params'].append(params)
def _compute_sigma(self, resids):
"""
Computes the sigma matrix and update the cholesky decomposition.
"""
M = self._M
nobs = self.nobs
sig = np.dot(resids, resids.T) # faster way to do this?
if not self._dfk:
div = nobs
elif self._dfk.lower() == 'dfk1':
div = np.zeros(M**2)
for i in range(M):
for j in range(M):
div[i+j] = ((self.df_model[i]+1) *\
(self.df_model[j]+1))**(1/2)
div.reshape(M,M)
else: # 'dfk2' error checking is done earlier
div = np.zeros(M**2)
for i in range(M):
for j in range(M):
div[i+j] = nobs - np.max(self.df_model[i]+1,
self.df_model[j]+1)
div.reshape(M,M)
# doesn't handle (#,)
self.cholsigmainv = np.linalg.cholesky(np.linalg.pinv(sig/div)).T
return sig/div
def whiten(self, X):
"""
SUR whiten method.
Parameters
-----------
X : list of arrays
Data to be whitened.
Returns
-------
If X is the exogenous RHS of the system.
``np.dot(np.kron(cholsigmainv,np.eye(M)),np.diag(X))``
If X is the endogenous LHS of the system.
"""
nobs = self.nobs
if X is self.endog: # definitely not a robust check
return np.dot(np.kron(self.cholsigmainv,np.eye(nobs)),
X.reshape(-1,1))
elif X is self.sp_exog:
return (sparse.kron(self.cholsigmainv,
sparse.eye(nobs,nobs))*X).toarray()#*=dot until cast to array
def fit(self, igls=False, tol=1e-5, maxiter=100):
"""
igls : bool
Iterate until estimates converge if sigma is None instead of
two-step GLS, which is the default is sigma is None.
tol : float
maxiter : int
Notes
-----
This ia naive implementation that does not exploit the block
diagonal structure. It should work for ill-conditioned `sigma`
but this is untested.
"""
if not np.any(self.sigma):
self.sigma = self._compute_sigma(self.endog, self.exog)
M = self._M
beta = np.dot(self.pinv_wexog, self.wendog)
self._update_history(beta)
self.iterations += 1
if not igls:
sur_fit = SysResults(self, beta, self.normalized_cov_params)
return sur_fit
conv = self.history['params']
while igls and (np.any(np.abs(conv[-2] - conv[-1]) > tol)) and \
(self.iterations < maxiter):
fittedvalues = (self.sp_exog*beta).reshape(M,-1)
resids = self.endog - fittedvalues # don't attach results yet
self.sigma = self._compute_sigma(resids) # need to attach for compute?
self.wendog = self.whiten(self.endog)
self.wexog = self.whiten(self.sp_exog)
self.pinv_wexog = np.linalg.pinv(self.wexog)
self.normalized_cov_params = np.dot(self.pinv_wexog,
np.transpose(self.pinv_wexog))
beta = np.dot(self.pinv_wexog, self.wendog)
self._update_history(beta)
self.iterations += 1
sur_fit = SysResults(self, beta, self.normalized_cov_params)
return sur_fit
def predict(self, design):
pass
#TODO: Should just have a general 2SLS estimator to subclass
# for IV, FGLS, etc.
# Also should probably have SEM class and estimators as subclasses
class Sem2SLS(object):
"""
Two-Stage Least Squares for Simultaneous equations
Parameters
----------
sys : list
[endog1, exog1, endog2, exog2,...] It will be of length 2 x M,
where M is the number of equations endog = exog.
indep_endog : dict
A dictionary mapping the equation to the column numbers of the
the independent endogenous regressors in each equation.
It is assumed that the system is inputed as broken up into
LHS and RHS. For now, the values of the dict have to be sequences.
Note that the keys for the equations should be zero-indexed.
instruments : array
Array of the exogenous independent variables.
Notes
-----
This is unfinished, and the design should be refactored.
Estimation is done by brute force and there is no exploitation of
the structure of the system.
"""
def __init__(self, sys, indep_endog=None, instruments=None):
if len(sys) % 2 != 0:
raise ValueError("sys must be a list of pairs of endogenous and \
exogenous variables. Got length %s" % len(sys))
M = len(sys[1::2])
self._M = M
# The lists are probably a bad idea
self.endog = sys[::2] # these are just list containers
self.exog = sys[1::2]
self._K = [tools.rank(_) for _ in sys[1::2]]
# fullexog = np.column_stack((_ for _ in self.exog))
self.instruments = instruments
# Keep the Y_j's in a container to get IVs
instr_endog = {}
[instr_endog.setdefault(_,[]) for _ in indep_endog.keys()]
for eq_key in indep_endog:
for varcol in indep_endog[eq_key]:
instr_endog[eq_key].append(self.exog[eq_key][:,varcol])
# ^ copy needed?
# self._instr_endog = instr_endog
self._indep_endog = indep_endog
_col_map = np.cumsum(np.hstack((0,self._K))) # starting col no.s
# move this check to whiten since we're not going to build a full exog?
for eq_key in indep_endog:
try:
iter(indep_endog[eq_key])
except:
# eq_key = [eq_key]
raise TypeError("The values of the indep_exog dict must be\
iterable. Got type %s for converter %s" % (type(del_col)))
# for del_col in indep_endog[eq_key]:
# fullexog = np.delete(fullexog, _col_map[eq_key]+del_col, 1)
# _col_map[eq_key+1:] -= 1
# Josef's example for deleting reoccuring "rows"
# fullexog = np.unique(fullexog.T.view([('',fullexog.dtype)]*\
# fullexog.shape[0])).view(fullexog.dtype).reshape(\
# fullexog.shape[0],-1)
# From http://article.gmane.org/gmane.comp.python.numeric.general/32276/
# Or Jouni' suggetsion of taking a hash:
# http://www.mail-archive.com/numpy-discussion@scipy.org/msg04209.html
# not clear to me how this would work though, only if they are the *same*
# elements?
# self.fullexog = fullexog
self.wexog = self.whiten(instr_endog)
def whiten(self, Y):
"""
Runs the first stage of the 2SLS.
Returns the RHS variables that include the instruments.
"""
wexog = []
indep_endog = self._indep_endog # this has the col mapping
# fullexog = self.fullexog
instruments = self.instruments
for eq in range(self._M): # need to go through all equations regardless
instr_eq = Y.get(eq, None) # Y has the eq to ind endog array map
newRHS = self.exog[eq].copy()
if instr_eq:
for i,LHS in enumerate(instr_eq):
yhat = GLS(LHS, self.instruments).fit().fittedvalues
newRHS[:,indep_endog[eq][i]] = yhat
# this might fail if there is a one variable column (nobs,)
# in exog
wexog.append(newRHS)
return wexog
def fit(self):
"""
"""
delta = []
wexog = self.wexog
endog = self.endog
for j in range(self._M):
delta.append(GLS(endog[j], wexog[j]).fit().params)
return delta
class SysResults(LikelihoodModelResults):
"""
Not implemented yet.
"""
def __init__(self, model, params, normalized_cov_params=None, scale=1.):
super(SysResults, self).__init__(model, params,
normalized_cov_params, scale)
self._get_results()
def _get_results(self):
pass
|