This file is indexed.

/usr/share/pyshared/statsmodels/sandbox/sysreg.py is in python-statsmodels 0.4.2-1.2.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
from statsmodels.regression.linear_model import GLS
import numpy as np
import statsmodels.tools.tools as tools
from statsmodels.base.model import LikelihoodModelResults
from scipy import sparse

#http://www.irisa.fr/aladin/wg-statlin/WORKSHOPS/RENNES02/SLIDES/Foschi.pdf

__all__ = ['SUR', 'Sem2SLS']

#probably should have a SystemModel superclass
# TODO: does it make sense of SUR equations to have
# independent endogenous regressors?  If so, then
# change docs to LHS = RHS
#TODO: make a dictionary that holds equation specific information
#rather than these cryptic lists?  Slower to get a dict value?
#TODO: refine sigma definition
class SUR(object):
    """
    Seemingly Unrelated Regression

    Parameters
    ----------
    sys : list
        [endog1, exog1, endog2, exog2,...] It will be of length 2 x M,
        where M is the number of equations endog = exog.
    sigma : array-like
        M x M array where sigma[i,j] is the covariance between equation i and j
    dfk : None, 'dfk1', or 'dfk2'
        Default is None.  Correction for the degrees of freedom
        should be specified for small samples.  See the notes for more
        information.

    Attributes
    ----------
    cholsigmainv : array
        The transpose of the Cholesky decomposition of `pinv_wexog`
    df_model : array
        Model degrees of freedom of each equation. p_{m} - 1 where p is
        the number of regressors for each equation m and one is subtracted
        for the constant.
    df_resid : array
        Residual degrees of freedom of each equation. Number of observations
        less the number of parameters.
    endog : array
        The LHS variables for each equation in the system.
        It is a M x nobs array where M is the number of equations.
    exog : array
        The RHS variable for each equation in the system.
        It is a nobs x sum(p_{m}) array.  Which is just each
        RHS array stacked next to each other in columns.
    history : dict
        Contains the history of fitting the model. Probably not of interest
        if the model is fit with `igls` = False.
    iterations : int
        The number of iterations until convergence if the model is fit
        iteratively.
    nobs : float
        The number of observations of the equations.
    normalized_cov_params : array
        sum(p_{m}) x sum(p_{m}) array
        :math:`\\left[X^{T}\\left(\\Sigma^{-1}\\otimes\\boldsymbol{I}\\right)X\\right]^{-1}`
    pinv_wexog : array
        The pseudo-inverse of the `wexog`
    sigma : array
        M x M covariance matrix of the cross-equation disturbances. See notes.
    sp_exog : CSR sparse matrix
        Contains a block diagonal sparse matrix of the design so that
        exog1 ... exogM are on the diagonal.
    wendog : array
        M * nobs x 1 array of the endogenous variables whitened by
        `cholsigmainv` and stacked into a single column.
    wexog : array
        M*nobs x sum(p_{m}) array of the whitened exogenous variables.

    Notes
    -----
    All individual equations are assumed to be well-behaved, homoeskedastic
    iid errors.  This is basically an extension of GLS, using sparse matrices.

    .. math:: \\Sigma=\\left[\\begin{array}{cccc}
              \\sigma_{11} & \\sigma_{12} & \\cdots & \\sigma_{1M}\\\\
              \\sigma_{21} & \\sigma_{22} & \\cdots & \\sigma_{2M}\\\\
              \\vdots & \\vdots & \\ddots & \\vdots\\\\
              \\sigma_{M1} & \\sigma_{M2} & \\cdots & \\sigma_{MM}\\end{array}\\right]

    References
    ----------
    Zellner (1962), Greene (2003)
    """
#TODO: Does each equation need nobs to be the same?
    def __init__(self, sys, sigma=None, dfk=None):
        if len(sys) % 2 != 0:
            raise ValueError("sys must be a list of pairs of endogenous and \
exogenous variables.  Got length %s" % len(sys))
        if dfk:
            if not dfk.lower() in ['dfk1','dfk2']:
                raise ValueError("dfk option %s not understood" % (dfk))
        self._dfk = dfk
        M = len(sys[1::2])
        self._M = M
#        exog = np.zeros((M,M), dtype=object)
#        for i,eq in enumerate(sys[1::2]):
#            exog[i,i] = np.asarray(eq)  # not sure this exog is needed
                                        # used to compute resids for now
        exog = np.column_stack(np.asarray(sys[1::2][i]) for i in range(M))
#       exog = np.vstack(np.asarray(sys[1::2][i]) for i in range(M))
        self.exog = exog # 2d ndarray exog is better
# Endog, might just go ahead and reshape this?
        endog = np.asarray(sys[::2])
        self.endog = endog
        self.nobs = float(self.endog[0].shape[0]) # assumes all the same length

# Degrees of Freedom
        df_resid = []
        df_model = []
        [df_resid.append(self.nobs - tools.rank(_)) \
                for _ in sys[1::2]]
        [df_model.append(tools.rank(_) - 1) for _ in sys[1::2]]
        self.df_resid = np.asarray(df_resid)
        self.df_model = np.asarray(df_model)

# "Block-diagonal" sparse matrix of exog
        sp_exog = sparse.lil_matrix((int(self.nobs*M),
            int(np.sum(self.df_model+1)))) # linked lists to build
        self._cols = np.cumsum(np.hstack((0, self.df_model+1)))
        for i in range(M):
            sp_exog[i*self.nobs:(i+1)*self.nobs,
                    self._cols[i]:self._cols[i+1]] = sys[1::2][i]
        self.sp_exog = sp_exog.tocsr() # cast to compressed for efficiency
# Deal with sigma, check shape earlier if given
        if np.any(sigma):
            sigma = np.asarray(sigma) # check shape
        elif sigma == None:
            resids = []
            for i in range(M):
                resids.append(GLS(endog[i],exog[:,
                    self._cols[i]:self._cols[i+1]]).fit().resid)
            resids = np.asarray(resids).reshape(M,-1)
            sigma = self._compute_sigma(resids)
        self.sigma = sigma
        self.cholsigmainv = np.linalg.cholesky(np.linalg.pinv(\
                    self.sigma)).T
        self.initialize()

    def initialize(self):
        self.wendog = self.whiten(self.endog)
        self.wexog = self.whiten(self.sp_exog)
        self.pinv_wexog = np.linalg.pinv(self.wexog)
        self.normalized_cov_params = np.dot(self.pinv_wexog,
                np.transpose(self.pinv_wexog))
        self.history = {'params' : [np.inf]}
        self.iterations = 0

    def _update_history(self, params):
        self.history['params'].append(params)

    def _compute_sigma(self, resids):
        """
        Computes the sigma matrix and update the cholesky decomposition.
        """
        M = self._M
        nobs = self.nobs
        sig = np.dot(resids, resids.T)  # faster way to do this?
        if not self._dfk:
            div = nobs
        elif self._dfk.lower() == 'dfk1':
            div = np.zeros(M**2)
            for i in range(M):
                for j in range(M):
                    div[i+j] = ((self.df_model[i]+1) *\
                            (self.df_model[j]+1))**(1/2)
            div.reshape(M,M)
        else: # 'dfk2' error checking is done earlier
            div = np.zeros(M**2)
            for i in range(M):
                for j in range(M):
                    div[i+j] = nobs - np.max(self.df_model[i]+1,
                        self.df_model[j]+1)
            div.reshape(M,M)
# doesn't handle (#,)
        self.cholsigmainv = np.linalg.cholesky(np.linalg.pinv(sig/div)).T
        return sig/div

    def whiten(self, X):
        """
        SUR whiten method.

        Parameters
        -----------
        X : list of arrays
            Data to be whitened.

        Returns
        -------
        If X is the exogenous RHS of the system.
        ``np.dot(np.kron(cholsigmainv,np.eye(M)),np.diag(X))``

        If X is the endogenous LHS of the system.

        """
        nobs = self.nobs
        if X is self.endog: # definitely not a robust check
            return np.dot(np.kron(self.cholsigmainv,np.eye(nobs)),
                X.reshape(-1,1))
        elif X is self.sp_exog:
            return (sparse.kron(self.cholsigmainv,
                sparse.eye(nobs,nobs))*X).toarray()#*=dot until cast to array

    def fit(self, igls=False, tol=1e-5, maxiter=100):
        """
        igls : bool
            Iterate until estimates converge if sigma is None instead of
            two-step GLS, which is the default is sigma is None.

        tol : float

        maxiter : int

        Notes
        -----
        This ia naive implementation that does not exploit the block
        diagonal structure. It should work for ill-conditioned `sigma`
        but this is untested.
        """

        if not np.any(self.sigma):
            self.sigma = self._compute_sigma(self.endog, self.exog)
        M = self._M
        beta = np.dot(self.pinv_wexog, self.wendog)
        self._update_history(beta)
        self.iterations += 1
        if not igls:
            sur_fit = SysResults(self, beta, self.normalized_cov_params)
            return sur_fit

        conv = self.history['params']
        while igls and (np.any(np.abs(conv[-2] - conv[-1]) > tol)) and \
                (self.iterations < maxiter):
            fittedvalues = (self.sp_exog*beta).reshape(M,-1)
            resids = self.endog - fittedvalues # don't attach results yet
            self.sigma = self._compute_sigma(resids) # need to attach for compute?
            self.wendog = self.whiten(self.endog)
            self.wexog = self.whiten(self.sp_exog)
            self.pinv_wexog = np.linalg.pinv(self.wexog)
            self.normalized_cov_params = np.dot(self.pinv_wexog,
                    np.transpose(self.pinv_wexog))
            beta = np.dot(self.pinv_wexog, self.wendog)
            self._update_history(beta)
            self.iterations += 1
        sur_fit = SysResults(self, beta, self.normalized_cov_params)
        return sur_fit

    def predict(self, design):
        pass

#TODO: Should just have a general 2SLS estimator to subclass
# for IV, FGLS, etc.
# Also should probably have SEM class and estimators as subclasses
class Sem2SLS(object):
    """
    Two-Stage Least Squares for Simultaneous equations

    Parameters
    ----------
    sys : list
        [endog1, exog1, endog2, exog2,...] It will be of length 2 x M,
        where M is the number of equations endog = exog.
    indep_endog : dict
        A dictionary mapping the equation to the column numbers of the
        the independent endogenous regressors in each equation.
        It is assumed that the system is inputed as broken up into
        LHS and RHS. For now, the values of the dict have to be sequences.
        Note that the keys for the equations should be zero-indexed.
    instruments : array
        Array of the exogenous independent variables.

    Notes
    -----
    This is unfinished, and the design should be refactored.
    Estimation is done by brute force and there is no exploitation of
    the structure of the system.
    """
    def __init__(self, sys, indep_endog=None, instruments=None):
        if len(sys) % 2 != 0:
            raise ValueError("sys must be a list of pairs of endogenous and \
exogenous variables.  Got length %s" % len(sys))
        M = len(sys[1::2])
        self._M = M
# The lists are probably a bad idea
        self.endog = sys[::2]   # these are just list containers
        self.exog = sys[1::2]
        self._K = [tools.rank(_) for _ in sys[1::2]]
#        fullexog = np.column_stack((_ for _ in self.exog))

        self.instruments = instruments

        # Keep the Y_j's in a container to get IVs
        instr_endog = {}
        [instr_endog.setdefault(_,[]) for _ in indep_endog.keys()]

        for eq_key in indep_endog:
            for varcol in indep_endog[eq_key]:
                instr_endog[eq_key].append(self.exog[eq_key][:,varcol])
                # ^ copy needed?
#        self._instr_endog = instr_endog

        self._indep_endog = indep_endog
        _col_map = np.cumsum(np.hstack((0,self._K))) # starting col no.s
# move this check to whiten since we're not going to build a full exog?
        for eq_key in indep_endog:
            try:
                iter(indep_endog[eq_key])
            except:
#                eq_key = [eq_key]
                raise TypeError("The values of the indep_exog dict must be\
 iterable. Got type %s for converter %s" % (type(del_col)))
#            for del_col in indep_endog[eq_key]:
#                fullexog = np.delete(fullexog,  _col_map[eq_key]+del_col, 1)
#                _col_map[eq_key+1:] -= 1

# Josef's example for deleting reoccuring "rows"
#        fullexog = np.unique(fullexog.T.view([('',fullexog.dtype)]*\
#                fullexog.shape[0])).view(fullexog.dtype).reshape(\
#                fullexog.shape[0],-1)
# From http://article.gmane.org/gmane.comp.python.numeric.general/32276/
# Or Jouni' suggetsion of taking a hash:
# http://www.mail-archive.com/numpy-discussion@scipy.org/msg04209.html
# not clear to me how this would work though, only if they are the *same*
# elements?
#        self.fullexog = fullexog
        self.wexog = self.whiten(instr_endog)


    def whiten(self, Y):
        """
        Runs the first stage of the 2SLS.

        Returns the RHS variables that include the instruments.
        """
        wexog = []
        indep_endog = self._indep_endog # this has the col mapping
#        fullexog = self.fullexog
        instruments = self.instruments
        for eq in range(self._M): # need to go through all equations regardless
            instr_eq = Y.get(eq, None) # Y has the eq to ind endog array map
            newRHS = self.exog[eq].copy()
            if instr_eq:
                for i,LHS in enumerate(instr_eq):
                    yhat = GLS(LHS, self.instruments).fit().fittedvalues
                    newRHS[:,indep_endog[eq][i]] = yhat
                # this might fail if there is a one variable column (nobs,)
                # in exog
            wexog.append(newRHS)
        return wexog

    def fit(self):
        """
        """
        delta = []
        wexog = self.wexog
        endog = self.endog
        for j in range(self._M):
            delta.append(GLS(endog[j], wexog[j]).fit().params)
        return delta

class SysResults(LikelihoodModelResults):
    """
    Not implemented yet.
    """
    def __init__(self, model, params, normalized_cov_params=None, scale=1.):
        super(SysResults, self).__init__(model, params,
                normalized_cov_params, scale)
        self._get_results()

    def _get_results(self):
        pass