This file is indexed.

/usr/share/pyshared/pebl/data.py is in python-pebl 1.0.2-2.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
"""Classes and functions for working with datasets."""

from __future__ import with_statement
import re
import copy
from itertools import groupby

import numpy as N

from pebl.util import *
from pebl import discretizer
from pebl import config

#
# Module parameters
#
# Path of the data file. fromconfig() reads it via config.get('data.filename')
# and only uses it when 'data.text' is empty.
# NOTE(review): config.fileexists() is presumably a validator that checks the
# path exists -- confirm against pebl.config.
_pfilename = config.StringParameter(
    'data.filename',
    'File to read data from.',
    config.fileexists(),
)

# Dataset text embedded directly in the configuration. fromconfig() prefers
# this over 'data.filename' whenever it is non-empty.
_ptext = config.StringParameter(
    'data.text',
    'The text of a dataset included in config file.',
    default=''
)

# Number of discretization bins. fromconfig() calls Dataset.discretize() only
# when this value is greater than 0.
_pdiscretize = config.IntParameter(
    'data.discretize',
    'Number of bins used to discretize data. Specify 0 to indicate that '+\
    'data should not be discretized.',
    default=0
)

#
# Exceptions
#
class ParsingError(Exception):
    """Raised when a datafile (or its string form) is ill-formed and cannot be parsed."""

class IncorrectArityError(Exception):
    """Error encountered when the datafile specifies an incorrect variable arity.

    If variable arity is specified, it must be at least the number of unique
    observation values for the variable.

    The ``errors`` attribute is a list of (variable, uniquevals) pairs, one per
    offending variable, where ``uniquevals`` is the number of distinct values
    observed for that variable.

    """

    def __init__(self, errors):
        # Forward to Exception so that e.args is populated as for any other
        # exception.
        Exception.__init__(self, errors)
        self.errors = errors

    def __str__(self):
        """Return a multi-line message with one line per offending variable."""
        lines = ["Incorrect arity specified for some variables."]
        for v, uniquevals in self.errors:
            lines.append(
                "Variable %s has arity of %d but %d unique values." %
                (v.name, v.arity, uniquevals)
            )
        # every line (including the last) is newline-terminated
        return "\n".join(lines) + "\n"

    # repr() and str() report the same human-readable message
    __repr__ = __str__

class ClassVariableError(Exception):
    """Error with a class variable.

    When raised without arguments, str() of the exception is the default
    explanation stored in ``msg``; otherwise the supplied arguments are shown
    as for any other exception.

    """
    msg = """Data for class variables must include only the labels specified in
    the variable annotation."""

    def __str__(self):
        # Without this, an argument-less raise would print an empty message
        # and the explanation in msg would never be seen.
        if self.args:
            return Exception.__str__(self)
        return self.msg


#
# Variables and Samples
#
class Annotation(object):
    """Base class for metadata attached to a sample or a variable."""

    def __init__(self, name, *args):
        # Extra positional arguments are accepted and ignored here so that
        # subclasses can give them a meaning.
        self.name = str(name)

    def __repr__(self):
        kind = type(self).__name__
        return "<%s: %s>" % (kind, self.name)

class Sample(Annotation):
    """Annotation describing one sample (one row of a dataset's observations)."""

class Variable(Annotation):
    """Annotation describing one variable (one column of a dataset's observations)."""

    # Default arity for a variable whose arity has not been set; subclasses
    # and Dataset._guess_arities() overwrite it on the instance.
    arity = -1

class ContinuousVariable(Variable):
    """Annotation for a variable whose values come from a continuous domain."""

    def __init__(self, name, param):
        # param exists only so all variable types share one constructor
        # signature; it is not used here.
        super(ContinuousVariable, self).__init__(name)

class DiscreteVariable(Variable):
    """Annotation for a variable whose values come from a discrete domain."""

    def __init__(self, name, param):
        # param is the arity; anything int() accepts is allowed.
        super(DiscreteVariable, self).__init__(name)
        self.arity = int(param)

class ClassVariable(DiscreteVariable):
    """Annotation for a discrete variable whose values are named labels."""

    def __init__(self, name, param):
        # param is a comma-separated string of labels. This does not chain to
        # DiscreteVariable.__init__, which would call int() on that string.
        self.name = str(name)
        labels = [piece.strip() for piece in param.split(',')]
        self.labels = labels
        # label -> position in the label list (the integer stored in the data)
        self.label2int = dict(zip(labels, range(len(labels))))
        self.arity = len(labels)

#
# Main class for dataset
#
class Dataset(object):
    """Observed data together with its masks and annotations.

    Holds a 2D observations array, same-shaped missing and interventions
    masks, and 1D arrays of variable and sample annotations.  See __init__
    for a description of each structure.

    """

    def __init__(self, observations, missing=None, interventions=None,
                 variables=None, samples=None, skip_stats=False):
        """Create a pebl Dataset instance.

        A Dataset consists of the following data structures which are all
        numpy.ndarray instances:

        * observations: a 2D matrix of observed values.
            - dimension 1 is over samples, dimension 2 is over variables.
            - observations[i,j] is the observed value for jth variable in the ith
              sample.

        * missing: a 2D binary mask for missing values
            - missing[i,j] = 1 IFF observation[i,j] is missing

        * interventions: a 2D binary mask for interventions
            - interventions[i,j] = 1 IFF the jth variable was intervened upon in
              the ith sample.

        * variables,samples: 1D array of variable or sample annotations

        This class provides a few public methods to manipulate datasets; one can
        also use numpy functions/methods directly.

        Required/Default values:

             * The only required argument is observations (a 2D numpy array).
             * If missing or interventions are not specified, they are assumed to
               be all zeros (no missing values and no interventions).
             * If variables are not specified, one annotation per column is
               created (named by column index) and its arity is guessed from
               the number of unique observed values (see _guess_arities).
             * If samples are not specified, one Sample per row is created,
               named by row index.
             * If skip_stats is True, _calc_stats() is not called by the
               constructor.

        Note:
            If you alter Dataset.interventions or Dataset.missing, you must
            call Dataset._calc_stats(). This is a terrible hack but it speeds
            up pebl when used with datasets without interventions or missing
            values (a common case).

        """

        self.observations = observations
        self.missing = missing
        self.interventions = interventions
        self.variables = variables
        self.samples = samples

        # With a numpy array X, we can't do 'if not X' to check the
        # truth value because it raises an exception. So, we must use the
        # non-pythonic 'if X is None'

        obs = observations
        if missing is None:
            self.missing = N.zeros(obs.shape, dtype=bool)
        if interventions is None:
            self.interventions = N.zeros(obs.shape, dtype=bool)
        if variables is None:
            self.variables = N.array([Variable(str(i)) for i in range(obs.shape[1])])
            self._guess_arities()
        if samples is None:
            self.samples = N.array([Sample(str(i)) for i in range(obs.shape[0])])

        if not skip_stats:
            self._calc_stats()

    #
    # public methods
    #
    def subset(self, variables=None, samples=None):
        """Returns a subset of the dataset (and metadata).

        Specify the variables and samples for creating a subset of the data.
        variables and samples should be a list of ids. If not specified, it is
        assumed to be all variables or samples.

        Some examples:

            - d.subset([3], [4])
            - d.subset([3,1,2])
            - d.subset(samples=[5,2,7,1])

        Note: order matters! d.subset([3,1,2]) != d.subset([1,2,3])

        """

        if variables is None:
            variables = list(range(self.variables.size))
        if samples is None:
            samples = list(range(self.samples.size))

        # if self does not have interventions or missing, the subset can't
        # either, so the (relatively costly) stats scan can be skipped.
        skip_stats = not (self.has_interventions or self.has_missing)

        # one open mesh selects the same rows/columns from all three arrays
        index = N.ix_(samples, variables)
        d = Dataset(
            self.observations[index],
            self.missing[index],
            self.interventions[index],
            self.variables[variables],
            self.samples[samples],
            skip_stats=skip_stats
        )

        if skip_stats:
            d._has_interventions = False
            d._has_missing = False

        return d

    def _subset_ni_fast(self, variables):
        """Return a _FastDataset restricted to the given variable ids.

        Only observations, samples and variables are set on the result; the
        constructor is bypassed. If the dataset has interventions, only the
        samples in which the first listed variable was not intervened upon
        are kept.

        """
        ds = _FastDataset.__new__(_FastDataset)

        if not self.has_interventions:
            ds.observations = self.observations[:,variables]
            ds.samples = self.samples
        else:
            samples = N.where(self.interventions[:,variables[0]] == False)[0]
            ds.observations = self.observations[samples][:,variables]
            ds.samples = self.samples[samples]

        ds.variables = self.variables[variables]
        return ds

    # TODO: test
    def subset_byname(self, variables=None, samples=None):
        """Returns a subset of the dataset (and metadata).

        Same as Dataset.subset() except that variables and samples can be
        specified by their names.

        Some examples:

            - d.subset_byname(variables=['shh', 'genex'])
            - d.subset_byname(samples=["control%d" % i for i in range(10)])

        Raises KeyError if a name is not found.

        """

        vardict = dict((v.name, i) for i,v in enumerate(self.variables))
        sampledict = dict((s.name, i) for i,s in enumerate(self.samples))

        # Compare against None rather than relying on truthiness, which
        # raises for numpy arrays. If a name is not found, we let the
        # KeyError be raised.
        if variables is not None:
            variables = [vardict[v] for v in variables]
        if samples is not None:
            samples = [sampledict[s] for s in samples]

        return self.subset(variables, samples)

    def discretize(self, includevars=None, excludevars=None, numbins=3):
        """Discretize (or bin) the data in-place.

        This method is just an alias for pebl.discretizer.maximum_entropy_discretize()
        See the module documentation for pebl.discretizer for more information.

        A copy of the observations as they were before the call is kept in
        self.original_observations. excludevars defaults to no exclusions.

        """
        # a fresh list per call instead of a shared mutable default argument
        if excludevars is None:
            excludevars = []

        self.original_observations = self.observations.copy()

        # The return value is not used: rebinding the local name 'self' (as
        # this method used to do) never affected the caller's object.
        discretizer.maximum_entropy_discretize(
           self,
           includevars, excludevars,
           numbins
        )

    def tofile(self, filename, *args, **kwargs):
        """Write the data and metadata to file in a tab-delimited format.

        Extra positional and keyword arguments are passed on to tostring().

        """

        with open(filename, 'w') as f:
            f.write(self.tostring(*args, **kwargs))

    def tostring(self, linesep='\n', variable_header=True, sample_header=True):
        """Return the data and metadata as a string in a tab-delimited format.

        If variable_header is True, include variable names and type.
        If sample_header is True, include sample names.
        Both are True by default.

        """

        def dataitem(row, col):
            # "X" marks a missing value; a trailing "!" marks an intervention
            val = "X" if self.missing[row,col] else str(self.observations[row,col])
            val += "!" if self.interventions[row,col] else ''
            return val

        def variable(v):
            name = v.name

            # ClassVariable must be tested before its base DiscreteVariable
            if isinstance(v, ClassVariable):
                return "%s,class(%s)" % (name, ','.join(v.labels))
            elif isinstance(v, DiscreteVariable):
                return "%s,discrete(%d)" % (name, v.arity)
            elif isinstance(v, ContinuousVariable):
                return "%s,continuous" % name
            else:
                return v.name

        # ---------------------------------------------------------------------

        # python strings are immutable, so string concatenation is expensive!
        # preferred way is to make list of lines, then use one join.
        lines = []

        # add variable annotations
        if variable_header:
            lines.append("\t".join([variable(v) for v in self.variables]))

        # format data
        nrows,ncols = self.shape
        d = [[dataitem(r,c) for c in range(ncols)] for r in range(nrows)]

        # add sample names if we have them (and there is at least one sample)
        if sample_header and len(self.samples) > 0 and \
           hasattr(self.samples[0], 'name'):
            d = [[s.name] + row for row,s in zip(d,self.samples)]

        # add data to lines
        lines.extend(["\t".join(row) for row in d])

        return linesep.join(lines)

    #
    # public properties
    #
    @property
    def shape(self):
        """The shape of the dataset as (number of samples, number of variables)."""
        return self.observations.shape

    @property
    def has_interventions(self):
        """Whether the dataset has any interventions."""
        # computed lazily for instances created without _calc_stats()
        if not hasattr(self, '_has_interventions'):
            self._has_interventions = self.interventions.any()
        return self._has_interventions

    @property
    def has_missing(self):
        """Whether the dataset has any missing values."""
        # computed lazily for instances created without _calc_stats()
        if not hasattr(self, '_has_missing'):
            self._has_missing = self.missing.any()
        return self._has_missing

    #
    # private methods/properties
    #
    def _calc_stats(self):
        """Recompute the cached has_interventions/has_missing flags."""
        self._has_interventions = self.interventions.any()
        self._has_missing = self.missing.any()

    def _guess_arities(self):
        """Guesses variable arity by counting the number of unique observations.

        Every variable is also turned into a DiscreteVariable.

        """

        for col,var in enumerate(self.variables):
            var.arity = N.unique(self.observations[:,col]).size
            var.__class__ = DiscreteVariable

    def check_arities(self):
        """Checks whether the specified arity >= number of unique observations.

        The check is only performed for discrete variables.

        If this check fails, the CPT and other data structures would fail.
        So, we should raise error while loading the data. Fail Early and Explicitly!

        Raises IncorrectArityError listing every offending variable.

        """

        errors = []
        for col,v in enumerate(self.variables):
            if isinstance(v, DiscreteVariable):
                uniquevals = N.unique(self.observations[:,col]).size
                if v.arity < uniquevals:
                    errors.append((v, uniquevals))

        if errors:
            raise IncorrectArityError(errors)


class _FastDataset(Dataset):
    """Lightweight Dataset variant produced by Dataset._subset_ni_fast.

    Dataset._subset_ni_fast builds a quick and dirty subset that skips many
    of the usual steps. It is a private method used by the evaluator module;
    do not use this class unless you know what you're doing.

    """


#
# Factory Functions
#
def fromfile(filename):
    """Parse file and return a Dataset instance.

    The whole file is read and handed to fromstring() with its default field
    separator.

    The data file is expected to conform to the following format

        - comment lines begin with '#' and are ignored.
        - The first non-comment line *must* specify variable annotations
          separated by tab characters.
        - data lines specify the data values separated by tab characters.
        - data lines *can* include sample names

    A data value specifies the observed numeric value, whether it's missing and
    whether it represents an intervention:

        - An 'x' or 'X' indicate that the value is missing
        - A '!' before or after the numeric value indicates an intervention

    Variable annotations specify the name of the variable and, *optionally*,
    the data type.

    Examples include:

        - Foo                     : just variable name
        - Foo,continuous          : Foo is a continuous variable
        - Foo,discrete(3)         : Foo is a discrete variable with arity of 3
        - Foo,class(normal,cancer): Foo is a class variable with arity of 2 and
                                    values of either normal or cancer.

    """

    # open() is the supported way to open files; the file() builtin used
    # here previously exists only in Python 2.
    with open(filename) as f:
        return fromstring(f.read())


def fromstring(stringrep, fieldsep='\t'):
    """Parse the string representation of a dataset and return a Dataset instance.

    See the documentation for fromfile() for information about file format.

    stringrep is the full text of the dataset; fieldsep is the separator
    between cells on a line (a tab by default).

    Raises:
        ParsingError: if there is no variable header, there are no data rows,
            a data cell is empty, a value cannot be converted to the type of
            its variable, or a variable annotation is malformed or names an
            unknown type.
        ClassVariableError: if a value of a class variable is not one of the
            variable's labels.
        IncorrectArityError: if a discrete variable has more unique values
            than its arity (raised by Dataset.check_arities()).

    """

    # parse a data item (examples: '5' '2.5', 'X', 'X!', '5!')
    def dataitem(item, v):
        item = item.strip()
        if not item:
            raise ParsingError("Empty data value for variable %s." % v.name)

        intervention = False
        missing = False

        # intervention?
        if item[0] == "!":
            intervention = True
            item = item[1:]
        elif item[-1] == "!":
            intervention = True
            item = item[:-1]

        if not item:
            # the cell was a lone '!'
            msg = "Intervention marker without a value for variable %s." % v.name
            raise ParsingError(msg)

        # missing value?
        if item in ('x', 'X'):
            missing = True
        elif isinstance(v, ClassVariable) and item in v.label2int:
            # a declared label is a real observation even if it happens to
            # begin or end with 'x' (e.g. 'matrix')
            missing = False
        elif item[0] in ('x', 'X') or item[-1] in ('x', 'X'):
            missing = True

        if missing:
            # placeholder value stored in observations for a missing cell
            item = "0" if not isinstance(v, ClassVariable) else v.labels[0]

        # convert to expected data type
        val = item
        if isinstance(v, ClassVariable):
            try:
                val = v.label2int[val]
            except KeyError:
                raise ClassVariableError()

        elif isinstance(v, DiscreteVariable):
            try:
                val = int(val)
            except ValueError:
                msg = "Invalid value for discrete variable %s: %s" % (v.name, val)
                raise ParsingError(msg)

        elif isinstance(v, ContinuousVariable):
            try:
                val = float(val)
            except ValueError:
                msg = "Invalid value for continuous variable %s: %s" % (v.name, val)
                raise ParsingError(msg)
        else:
            # if not specified, try parsing as float or int
            if '.' in val:
                try:
                    val = float(val)
                except ValueError:
                    msg = "Cannot convert value %s to a float." % val
                    raise ParsingError(msg)
            else:
                try:
                    val = int(val)
                except ValueError:
                    msg = "Cannot convert value %s to an int." % val
                    raise ParsingError(msg)

        return (val, missing, intervention)


    # type name, then an optional parenthesised parameter: 'discrete(3)'
    dtype_re = re.compile(r"([\w\d_-]+)[\(]*([\w\d\s,]*)[\)]*")

    # annotation type name -> annotation class (None: no type given)
    vartypes = {
        None: Variable,
        'continuous': ContinuousVariable,
        'discrete': DiscreteVariable,
        'class': ClassVariable
    }

    def variable(v):
        # MS Excel encloses cells with punctuations in double quotes
        # and many people use Excel to prepare data
        v = v.strip("\"")

        parts = v.split(",", 1)
        if len(parts) == 2:  # datatype specified?
            name,dtype = parts
            match = dtype_re.match(dtype)
            if not match:
                raise ParsingError("Error parsing variable header: %s" % v)
            dtype_name,dtype_param = match.groups()
            dtype_name = dtype_name.lower()
        else:
            name = parts[0]
            dtype_name, dtype_param = None,None

        if dtype_name not in vartypes:
            raise ParsingError("Unknown variable type in header: %s" % v)

        return vartypes[dtype_name](name, dtype_param)

    # -------------------------------------------------------------------------

    # split on all known line separators, ignoring blank and comment lines.
    # Blank lines are dropped *after* stripping so that whitespace-only lines
    # are ignored too.
    lines = (l.strip() for l in stringrep.splitlines())
    lines = [l for l in lines if l and not l.startswith('#')]
    if not lines:
        raise ParsingError("No variable annotations found in dataset.")

    # parse variable annotations (first non-comment line)
    variables = N.array([variable(v) for v in lines[0].split(fieldsep)])

    # split data into cells
    d = [row.split(fieldsep) for row in lines[1:]]
    if not d:
        raise ParsingError("No data lines found in dataset.")

    # does file contain sample names? (one more cell than there are variables)
    samples = None
    if len(d[0]) == len(variables) + 1:
        samples = N.array([Sample(row[0]) for row in d])
        d = [row[1:] for row in d]

    # parse data lines and separate into 3 numpy arrays
    #    d is a 3D array where the inner dimension is over
    #    (values, missing, interventions) transpose(2,0,1) makes the inner
    #    dimension the outer one
    d = N.array([[dataitem(c,v) for c,v in zip(row,variables)] for row in d])
    obs, missing, interventions = d.transpose(2,0,1)

    # keep integer observations as plain ints; any other dtype is left as is
    dtype = 'int' if obs.dtype.kind == 'i' else obs.dtype

    # x.astype() returns a casted *copy* of x
    # returning copies of observations, missing and interventions ensures that
    # they're contiguous in memory (should speedup future calculations)
    d = Dataset(
        obs.astype(dtype),
        missing.astype(bool),
        interventions.astype(bool),
        variables,
        samples,
    )
    d.check_arities()
    return d


def fromconfig():
    """Build a Dataset from the pebl configuration.

    The data comes from the 'data.text' parameter when it is non-empty and
    otherwise from the file named by 'data.filename'. If 'data.discretize'
    is greater than zero, the dataset is discretized into that many bins
    before it is returned.

    Raises Exception if neither the text nor the filename is set.

    """

    fname = config.get('data.filename')
    text = config.get('data.text')

    # inline text takes precedence over a file
    if text:
        dataset = fromstring(text)
    elif fname:
        dataset = fromfile(fname)
    else:
        raise Exception("Filename (nor text) for dataset not specified.")

    numbins = config.get('data.discretize')
    if numbins > 0:
        dataset.discretize(numbins=numbins)

    return dataset


def merge(datasets, axis=None):
    """Combine several datasets into one.

    datasets should be a list of Dataset objects.
    axis should be either 'variables' or 'samples' and selects the direction
    of the merge: with 'variables' the datasets are placed side by side
    (sample annotations are taken from the first dataset); with any other
    value they are stacked on top of each other (variable annotations are
    taken from the first dataset).

    """

    def gather(attr):
        # the named attribute of every dataset, in order
        return tuple(getattr(d, attr) for d in datasets)

    if axis == 'variables':
        stacker = N.hstack
        variables = N.hstack(gather('variables'))
        samples = datasets[0].samples
    else:
        stacker = N.vstack
        samples = N.hstack(gather('samples'))
        variables = datasets[0].variables

    missing = stacker(gather('missing'))
    interventions = stacker(gather('interventions'))
    observations = stacker(gather('observations'))

    return Dataset(observations, missing, interventions, variables, samples)