This file is indexed.

/usr/share/pyshared/statsmodels/tools/grouputils.py is in python-statsmodels 0.4.2-1.2.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
# -*- coding: utf-8 -*-
"""Tools for working with groups

This provides several functions to work with groups and a Group class that
keeps track of the different representations and has methods to work more
easily with groups.


Author: Josef Perktold,
Author: Nathaniel Smith, recipe for sparse_dummies on scipy user mailing list

Created on Tue Nov 29 15:44:53 2011 : sparse_dummies
Created on Wed Nov 30 14:28:24 2011 : combine_indices
changes: add Group class

Notes
~~~~~

This reverses the class I used before, where the class was for the data and
the group was auxiliary. Here, it is only the group, no data is kept.

sparse_dummies needs checking for corner cases, e.g.
what if a category level has zero elements? This can happen with subset
selection even if the original groups were defined as arange.

Not all methods and options have been tried out yet after refactoring

need more efficient loop if groups are sorted -> see GroupSorted.group_iter



"""

import numpy as np
from statsmodels.compatnp.np_compat import npc_unique

def combine_indices(groups, prefix='', sep='.', return_labels=False):
    '''use unique to get integer group indices for product, intersection

    Parameters
    ----------
    groups : array_like (1d or 2d) or tuple of 1d arrays
        Group labels, one row per observation. A tuple is combined with
        ``np.column_stack`` so that each element becomes one column. For
        2d input every distinct row is treated as one group level.
    prefix : str
        String prepended to each label, only used if ``return_labels``.
    sep : str
        Separator between the column values of a 2d level inside a label,
        only used if ``return_labels``.
    return_labels : bool
        If true, a list of string labels for the unique levels is returned
        as fourth element.

    Returns
    -------
    uni_inv : ndarray
        inverse index as returned by unique, i.e. for each observation the
        position of its level in ``uni``
    uni_idx : ndarray
        index into ``groups`` as returned by unique with ``return_index``
    uni : ndarray
        unique levels, 1d for 1d input, 2d (one row per level) for 2d input
    label : list of str
        only if ``return_labels`` is true

    '''
    if isinstance(groups, tuple):
        groups = np.column_stack(groups)
    else:
        groups = np.asarray(groups)

    dt = groups.dtype

    is2d = (groups.ndim == 2)  # need to store, groups is replaced by a view

    if is2d:
        ncols = groups.shape[1]
        # the structured view below needs each row in one contiguous chunk
        if not groups.flags.c_contiguous:
            groups = np.array(groups, order='C')

        # view each row as a single structured item so that unique compares
        # whole rows; flatten so that the inverse index comes back 1d
        groups_ = groups.view([('', dt)] * ncols).reshape(-1)
    else:
        groups_ = groups

    uni, uni_idx, uni_inv = npc_unique(groups_, return_index=True,
                                       return_inverse=True)

    if is2d:
        # undo the structured view: back to one row per unique level
        uni = uni.view(dt).reshape(-1, ncols)

    if not return_labels:
        return uni_inv, uni_idx, uni

    if uni.ndim > 1:
        template = prefix + sep.join(['%s'] * uni.shape[1])
        label = [template % tuple(row) for row in uni]
    else:
        # 1d levels are scalars, there is nothing to join
        label = [prefix + '%s' % (level,) for level in uni]
    return uni_inv, uni_idx, uni, label


#written for and used in try_covariance_grouploop.py
def group_sums(x, group, use_bincount=True):
    '''sum x by groups, with np.bincount or with a simple python loop

    Parameters
    ----------
    x : array_like
        data, observations in rows. 1d input is treated as a single column.
        More than 2 dimensions are only supported if ``use_bincount`` is
        false.
    group : array_like, integer
        group index for each observation. For the bincount version this is
        assumed to be consecutive integers. There is no dtype checking, so
        that np.bincount raises for invalid input.
    use_bincount : bool
        If true (default), np.bincount is used in a loop over the columns
        of x. If false, a python loop over the unique values of ``group``
        is used, mainly for comparison.

    Returns
    -------
    sums : ndarray
        With ``use_bincount`` the shape is (ncols, n_groups), i.e. one row
        per column of x. Without it the shape is (n_groups,) + x.shape[1:],
        i.e. one row per group, which is the transpose in the 2d case.

    Raises
    ------
    ValueError
        if x has more than 2 dimensions and ``use_bincount`` is true

    '''
    x = np.asarray(x)
    if x.ndim == 1:
        x = x[:, None]
    elif x.ndim > 2 and use_bincount:
        raise ValueError('not implemented yet')

    if use_bincount:
        return np.array([np.bincount(group, weights=x[:, col])
                         for col in range(x.shape[1])])

    # boolean masks below need an array, a list would compare as a whole
    group = np.asarray(group)
    uniques = np.unique(group)
    result = np.zeros([len(uniques)] + list(x.shape[1:]))
    for ii, cat in enumerate(uniques):
        result[ii] = x[group == cat].sum(0)
    return result


def group_sums_dummy(x, group_dummy):
    '''group sums of x computed from a group indicator (dummy) matrix

    The result is the product of the transpose of x with ``group_dummy``.

    Parameters
    ----------
    x : ndarray
        data, only its ``T`` attribute and the product are used
    group_dummy : ndarray or sparse matrix
        indicator matrix of the groups. Anything that is not exactly an
        ndarray is multiplied with ``*``, which is presumably the matrix
        product of a scipy sparse matrix.

    '''
    x_t = x.T
    is_plain_ndarray = type(group_dummy) is np.ndarray
    if not is_plain_ndarray:
        # not checked any further, assumed to be sparse
        return x_t * group_dummy
    return np.dot(x_t, group_dummy)

def dummy_sparse(groups):
    '''create a sparse indicator from a group array with integer labels

    Parameters
    ----------
    groups: ndarray, int, 1d (nobs,)
        an array of group indicators for each observation. Group levels are
        assumed to be defined as consecutive integers, i.e. range(n_groups)
        where n_groups is the number of group levels. The number of columns
        is not given explicitly but left to the sparse constructor; see the
        second example for a level without observations.

    Returns
    -------
    indi : sparse matrix in CSR format, int8, 2d (nobs, n_groups)
        an indicator matrix with one row per observation, that has 1 in the
        column of the group level for that observation

    Examples
    --------

    >>> g = np.array([0, 0, 2, 1, 1, 2, 0])
    >>> indi = dummy_sparse(g)
    >>> indi
    <7x3 sparse matrix of type '<type 'numpy.int8'>'
        with 7 stored elements in Compressed Sparse Row format>
    >>> indi.todense()
    matrix([[1, 0, 0],
            [1, 0, 0],
            [0, 0, 1],
            [0, 1, 0],
            [0, 1, 0],
            [0, 0, 1],
            [1, 0, 0]], dtype=int8)


    current behavior with missing groups
    >>> g = np.array([0, 0, 2, 0, 2, 0])
    >>> indi = dummy_sparse(g)
    >>> indi.todense()
    matrix([[1, 0, 0],
            [1, 0, 0],
            [0, 0, 1],
            [1, 0, 0],
            [0, 0, 1],
            [1, 0, 0]], dtype=int8)

    '''
    from scipy import sparse

    groups = np.asarray(groups)
    nobs = len(groups)

    # CSR layout with exactly one stored element per row: row i holds a
    # single 1 in column groups[i], so the row pointer is just 0, 1, ..., nobs
    indptr = np.arange(nobs + 1)
    data = np.ones(nobs, dtype=np.int8)
    indi = sparse.csr_matrix((data, groups, indptr))

    return indi


class Group(object):
    '''keep track of different representations of a group variable

    Only the group information is stored, no data.

    Parameters
    ----------
    group : array_like (1d or 2d) or tuple of 1d arrays
        group labels for each observation, passed on to combine_indices
    name : str
        name of the group variable, used as prefix ``name=`` in the labels

    Attributes
    ----------
    group : ndarray
        the group labels as array, a tuple is combined with column_stack
    group_int : ndarray
        integer index of the group level of each observation (first return
        of combine_indices)
    uni_idx : ndarray
        second return of combine_indices
    uni : ndarray
        unique group levels (third return of combine_indices)
    n_groups : int
        number of unique group levels
    separator, prefix : str
        used by ``labels``, can be overwritten before calling it

    '''

    def __init__(self, group, name=''):
        self.name = name

        # same conversion as in combine_indices, kept for ``interaction``
        if isinstance(group, tuple):
            self.group = np.column_stack(group)
        else:
            self.group = np.asarray(group)

        group_int, uni_idx, uni = combine_indices(self.group)

        #TODO: rename these to something easier to remember
        self.group_int, self.uni_idx, self.uni = group_int, uni_idx, uni

        self.n_groups = len(self.uni)

        #put this here so they can be overwritten before calling labels
        self.separator = '.'
        self.prefix = self.name
        if self.prefix:
            self.prefix = self.prefix + '='

    #cache decorator
    def counts(self):
        '''number of observations in each group, from np.bincount
        '''
        return np.bincount(self.group_int)

    #cache_decorator
    def labels(self):
        '''list of string labels for the unique group levels

        The columns of a 2d level are joined with ``self.separator``, each
        label starts with ``self.prefix``.
        '''
        #is this only needed for product of groups (intersection)?
        prefix = self.prefix
        uni = self.uni
        sep = self.separator

        if uni.ndim > 1:
            template = prefix + sep.join(['%s'] * len(uni[0]))
            label = [template % tuple(ii) for ii in uni]
        else:
            label = [prefix + '%s' % ii for ii in uni]
        return label

    def dummy(self, drop_idx=None, sparse=False, dtype=int):
        '''indicator (dummy variable) matrix of the groups

        Parameters
        ----------
        drop_idx : None, int, slice or array of ints
            index into ``uni`` of the levels for which no column is created.
            drop_idx is only available if sparse=False.
        sparse : bool
            if true, the result of ``dummy_sparse`` for the integer group
            index is returned, ``drop_idx`` and ``dtype`` are ignored
        dtype : dtype
            dtype of the dense indicator array

        Returns
        -------
        indi : ndarray or sparse matrix
            one row per observation, one column per (not dropped) level
        '''
        if sparse:
            return dummy_sparse(self.group_int)

        # compare the integer codes instead of the labels, this also works
        # if the levels are rows of a 2d group array
        keep = np.arange(len(self.uni))
        if drop_idx is not None:
            keep = np.delete(keep, drop_idx)

        codes = np.asarray(self.group_int).ravel()
        return (codes[:, None] == keep[None, :]).astype(dtype)

    def interaction(self, other):
        '''new group instance for the product (intersection) with other

        Parameters
        ----------
        other : Group instance or array_like
            second group variable with the same number of observations
        '''
        if isinstance(other, Group):
            other = other.group
        return self.__class__((self.group, other))

    def group_sums(self, x, use_bincount=True):
        '''sum of x by groups, see the function ``group_sums``
        '''
        return group_sums(x, self.group_int, use_bincount=use_bincount)

    def group_demean(self, x, use_bincount=True):
        '''subtract the group means from x

        Parameters
        ----------
        x : array_like
            data with observations in rows
        use_bincount : bool
            passed on to ``group_sums``

        Returns
        -------
        x_demeaned : ndarray
            x minus the mean of the group of each observation
        means_g : ndarray
            group means, one row (or element for 1d x) per group
        '''
        x = np.asarray(x)
        sums = self.group_sums(x, use_bincount=use_bincount)
        if use_bincount:
            # bincount version has one row per column of x
            sums = sums.T
        if x.ndim == 1:
            # group_sums adds a column to 1d data, remove it again
            sums = sums[:, 0]

        counts = self.counts().astype(float)
        # broadcast the counts against the trailing dimensions of the sums
        counts = counts.reshape((-1,) + (1,) * (sums.ndim - 1))
        means_g = sums / counts

        x_demeaned = x - means_g[self.group_int]  #check reverse_index?
        return x_demeaned, means_g


class GroupSorted(Group):
    '''Group for observations that are already sorted by group

    In addition to the attributes of ``Group``, ``groupidx`` holds a list of
    (low, upp) index pairs, one for each run of consecutive observations
    with the same group, so that ``slice(low, upp)`` selects that run.
    '''

    def __init__(self, group, name=''):
        # name the class explicitly, super(self.__class__, ...) would recurse
        # without end in a subclass
        super(GroupSorted, self).__init__(group, name=name)

        # positions where the integer group index changes start a new run
        codes = np.ravel(self.group_int)
        idx = (np.nonzero(np.diff(codes))[0] + 1).tolist()
        # list(...) so that the pairs can be iterated over more than once
        self.groupidx = list(zip([0] + idx, idx + [len(codes)]))

    def group_iter(self):
        '''iterate over the groups, yields a slice for each run
        '''
        for low, upp in self.groupidx:
            yield slice(low, upp)

    def lag_indices(self, lag):
        '''return the index array for lagged values

        For each run in ``groupidx`` the index ``upp - lag`` is computed and
        returned if it is not smaller than the start ``low`` of the run.

        Warning: if lag is larger than the number of observations for an
        individual, then no values for that individual are returned.

        TODO: for the unbalanced case, I should get the same truncation for
        the array with lag=0. From the return of lag_idx we wouldn't know
        which individual is missing.

        TODO: do I want the full equivalent of lagmat in tsa?
        maxlag or lag or lags.

        not tested yet

        '''
        bounds = np.asarray(self.groupidx)
        low, upp = bounds[:, 0], bounds[:, 1]
        lag_idx = upp - lag
        mask_ok = (low <= lag_idx)
        #still an observation that belongs to the same individual

        return lag_idx[mask_ok]


if __name__ == '__main__':
    # Example script, runs the functions and the Group class of this module
    # on small arrays and prints the results. All print calls have a single
    # argument so that they behave the same as the print statement.

    #---------- examples combine_indices
    from numpy.testing import assert_equal

    np.random.seed(985367)
    groups = np.random.randint(0,2,size=(10,2))
    uv, ux, u, label = combine_indices(groups, return_labels=True)
    uv, ux, u, label = combine_indices(groups, prefix='g1,g2=', sep=',',
                                       return_labels=True)

    group0 = np.array(['sector0', 'sector1'])[groups[:,0]]
    group1 = np.array(['region0', 'region1'])[groups[:,1]]
    uv, ux, u, label = combine_indices((group0, group1),
                                       prefix='sector,region=',
                                       sep=',',
                                       return_labels=True)
    uv, ux, u, label = combine_indices((group0, group1), prefix='', sep='.',
                                       return_labels=True)
    group_joint = np.array(label)[uv]
    # no explicit dtype, so that the expected array has the same string type
    # as the array that is created from the labels
    group_joint_expected = np.array(
                  ['sector1.region0', 'sector0.region1', 'sector0.region0',
                   'sector0.region1', 'sector1.region1', 'sector0.region0',
                   'sector1.region0', 'sector1.region0', 'sector0.region1',
                   'sector0.region0'])
    assert_equal(group_joint, group_joint_expected)

    '''
    >>> uv
    array([2, 1, 0, 0, 1, 0, 2, 0, 1, 0])
    >>> label
    ['sector0.region0', 'sector1.region0', 'sector1.region1']
    >>> np.array(label)[uv]
    array(['sector1.region1', 'sector1.region0', 'sector0.region0',
           'sector0.region0', 'sector1.region0', 'sector0.region0',
           'sector1.region1', 'sector0.region0', 'sector1.region0',
           'sector0.region0'],
          dtype='|S15')
    >>> np.column_stack((group0, group1))
    array([['sector1', 'region1'],
           ['sector1', 'region0'],
           ['sector0', 'region0'],
           ['sector0', 'region0'],
           ['sector1', 'region0'],
           ['sector0', 'region0'],
           ['sector1', 'region1'],
           ['sector0', 'region0'],
           ['sector1', 'region0'],
           ['sector0', 'region0']],
          dtype='|S7')
      '''

    #------------- examples sparse_dummies
    from scipy import sparse

    g = np.array([0, 0, 1, 2, 1, 1, 2, 0])
    u = range(3)
    indptr = np.arange(len(g)+1)
    data = np.ones(len(g), dtype=np.int8)
    a = sparse.csr_matrix((data, g, indptr))
    print(a.todense())
    print(np.all(a.todense() == (g[:,None] == np.arange(3)).astype(int)))

    x = np.arange(len(g)*3).reshape(len(g), 3, order='F')

    # the same group sums computed in four different ways
    print('group means')
    print(x.T * a)
    print(np.dot(x.T, g[:,None] == np.arange(3)))
    print(np.array([np.bincount(g, weights=x[:,col]) for col in range(3)]))
    for cat in u:
        print(x[g==cat].sum(0))
    for cat in u: x[g==cat].sum(0)

    cc = sparse.csr_matrix([[0, 1, 0, 1, 0, 0, 0, 0, 0],
                [1, 0, 1, 0, 1, 0, 0, 0, 0],
                [0, 1, 0, 0, 0, 1, 0, 0, 0],
                [1, 0, 0, 0, 1, 0, 1, 0, 0],
                [0, 1, 0, 1, 0, 1, 0, 1, 0],
                [0, 0, 1, 0, 1, 0, 0, 0, 1],
                [0, 0, 0, 1, 0, 0, 0, 1, 0],
                [0, 0, 0, 0, 1, 0, 1, 0, 1],
                [0, 0, 0, 0, 0, 1, 0, 1, 0]])

    #------------- groupsums
    print(group_sums(np.arange(len(g)*3*2).reshape(len(g),3,2), g,
                     use_bincount=False).T)
    print(group_sums(np.arange(len(g)*3*2).reshape(len(g),3,2)[:,:,0], g))
    print(group_sums(np.arange(len(g)*3*2).reshape(len(g),3,2)[:,:,1], g))

    #------------- examples class
    x = np.arange(len(g)*3).reshape(len(g), 3, order='F')
    mygroup = Group(g)
    print(mygroup.group_int)
    print(mygroup.group_sums(x))
    print(mygroup.labels())