/usr/lib/python2.7/dist-packages/patsy/state.py is in python-patsy 0.4.1-2.
# This file is part of Patsy
# Copyright (C) 2011 Nathaniel Smith <njs@pobox.com>
# See file LICENSE.txt for license information.
# Stateful transform protocol:
#   def __init__(self):
#       pass
#   def memorize_chunk(self, input_data):
#       return None
#   def memorize_finish(self):
#       return None
#   def transform(self, input_data):
#       return output_data
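#
# For illustration only (a hypothetical sketch, not part of this module):
# a class following this protocol accumulates state incrementally across
# chunks, e.g. a running maximum used to rescale values:
#
#   class RunningMax(object):
#       def __init__(self):
#           self._max = None
#       def memorize_chunk(self, input_data):
#           chunk_max = np.max(input_data)
#           if self._max is None or chunk_max > self._max:
#               self._max = chunk_max
#       def memorize_finish(self):
#           pass
#       def transform(self, input_data):
#           # express each value as a fraction of the memorized maximum
#           return np.asarray(input_data) / self._max
#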
# BETTER WAY: always run the first row of data through the builder alone, and
# check that it gives the same output row as when running the whole block of
# data through at once. This gives us the same information, but it's robust
# against people writing their own centering functions.
# QUESTION: right now we refuse to even fit a model that contains a
# my_transform(x)-style function. Maybe we should allow it to be fit (with a
# warning), and only disallow making predictions with it? Need to revisit this
# question once it's clearer what exactly our public API will look like,
# because right now I'm not sure how to tell whether we are being called for
# fitting versus being called for prediction.
import numpy as np
from patsy.util import (atleast_2d_column_default,
                        asarray_or_pandas, pandas_friendly_reshape,
                        wide_dtype_for, safe_issubdtype,
                        no_pickling, assert_no_pickling)
from patsy.compat import wraps
# These are made available in the patsy.* namespace
__all__ = ["stateful_transform",
"center", "standardize", "scale",
]
def stateful_transform(class_):
    """Create a stateful transform callable object from a class that fulfills
    the :ref:`stateful transform protocol <stateful-transform-protocol>`.
    """
    @wraps(class_)
    def stateful_transform_wrapper(*args, **kwargs):
        transform = class_()
        transform.memorize_chunk(*args, **kwargs)
        transform.memorize_finish()
        return transform.transform(*args, **kwargs)
    stateful_transform_wrapper.__patsy_stateful_transform__ = class_
    return stateful_transform_wrapper
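
# For illustration: the wrapper memorizes and transforms in one shot, while
# model-building code can recover the underlying protocol class through the
# ``__patsy_stateful_transform__`` attribute, e.g. (with ``Center`` as
# defined below):
#     stateful_transform(Center).__patsy_stateful_transform__ is Center  # -> True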
# class NonIncrementalStatefulTransform(object):
#     def __init__(self):
#         self._data = []
#
#     def memorize_chunk(self, input_data, *args, **kwargs):
#         self._data.append(input_data)
#         self._args = args
#         self._kwargs = kwargs
#
#     def memorize_finish(self):
#         all_data = np.row_stack(self._data)
#         args = self._args
#         kwargs = self._kwargs
#         del self._data
#         del self._args
#         del self._kwargs
#         self.memorize_all(all_data, *args, **kwargs)
#
#     def memorize_all(self, input_data, *args, **kwargs):
#         raise NotImplementedError
#
#     def transform(self, input_data, *args, **kwargs):
#         raise NotImplementedError
#
# class QuantileEstimatingTransform(NonIncrementalStatefulTransform):
#     def memorize_all(self, input_data, *args, **kwargs):
class Center(object):
    """center(x)

    A stateful transform that centers input data, i.e., subtracts the mean.

    If input has multiple columns, centers each column separately.

    Equivalent to ``standardize(x, rescale=False)``
    """
    def __init__(self):
        self._sum = None
        self._count = 0

    def memorize_chunk(self, x):
        x = atleast_2d_column_default(x)
        self._count += x.shape[0]
        this_total = np.sum(x, 0, dtype=wide_dtype_for(x))
        # This is to handle potentially multi-column x's:
        if self._sum is None:
            self._sum = this_total
        else:
            self._sum += this_total

    def memorize_finish(self):
        pass
    def transform(self, x):
        x = asarray_or_pandas(x)
        # This doesn't copy data unless our input is a DataFrame that has
        # heterogeneous types. And in that case we're going to be munging
        # the types anyway, so copying isn't a big deal.
        x_arr = np.asarray(x)
        if safe_issubdtype(x_arr.dtype, np.integer):
            dt = float
        else:
            dt = x_arr.dtype
        mean_val = np.asarray(self._sum / self._count, dtype=dt)
        centered = atleast_2d_column_default(x, preserve_pandas=True) - mean_val
        return pandas_friendly_reshape(centered, x.shape)

    __getstate__ = no_pickling
center = stateful_transform(Center)
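
# Example (illustrative): in one-shot use the wrapper memorizes and then
# transforms the same data, so centering [1.0, 2.0, 3.0] subtracts their
# mean of 2.0:
#     center([1.0, 2.0, 3.0])   # -> array([-1., 0., 1.])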
# See:
# http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#On-line_algorithm
# or page 232 of Knuth vol. 3 (3rd ed.).
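# In brief (Welford's online update): with delta = x_n - mean_{n-1},
#     mean_n = mean_{n-1} + delta / n
#     M2_n = M2_{n-1} + delta * (x_n - mean_n)
# so that after all n observations, variance = M2_n / (n - ddof).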
class Standardize(object):
    """standardize(x, center=True, rescale=True, ddof=0)

    A stateful transform that standardizes input data, i.e. it subtracts the
    mean and divides by the sample standard deviation.

    Either centering or rescaling or both can be disabled by use of keyword
    arguments. The `ddof` argument controls the delta degrees of freedom when
    computing the standard deviation (cf. :func:`numpy.std`). The default of
    ``ddof=0`` produces the maximum likelihood estimate; use ``ddof=1`` if you
    prefer the square root of the unbiased estimate of the variance.

    If input has multiple columns, standardizes each column separately.

    .. note:: This function computes the mean and standard deviation using a
       memory-efficient online algorithm, making it suitable for use with
       large incrementally processed data-sets.
    """
    def __init__(self):
        self.current_n = 0
        self.current_mean = None
        self.current_M2 = None

    def memorize_chunk(self, x, center=True, rescale=True, ddof=0):
        x = atleast_2d_column_default(x)
        if self.current_mean is None:
            self.current_mean = np.zeros(x.shape[1], dtype=wide_dtype_for(x))
            self.current_M2 = np.zeros(x.shape[1], dtype=wide_dtype_for(x))
        # XX this can surely be vectorized but I am feeling lazy:
        for i in range(x.shape[0]):
            self.current_n += 1
            delta = x[i, :] - self.current_mean
            self.current_mean += delta / self.current_n
            self.current_M2 += delta * (x[i, :] - self.current_mean)

    def memorize_finish(self):
        pass

    def transform(self, x, center=True, rescale=True, ddof=0):
        # XX: this forces all inputs to double-precision real, even if the
        # input is single- or extended-precision or complex. But I got all
        # tangled up in knots trying to do that without breaking something
        # else (e.g. by requiring an extra copy).
        x = asarray_or_pandas(x, copy=True, dtype=float)
        x_2d = atleast_2d_column_default(x, preserve_pandas=True)
        if center:
            x_2d -= self.current_mean
        if rescale:
            x_2d /= np.sqrt(self.current_M2 / (self.current_n - ddof))
        return pandas_friendly_reshape(x_2d, x.shape)

    __getstate__ = no_pickling
standardize = stateful_transform(Standardize)
# R compatibility:
scale = standardize
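
# Example (illustrative): with the default ddof=0 the divisor is
# sqrt(M2 / n), so for [1.0, 2.0, 3.0] (mean 2.0, M2 = 2.0, n = 3) the
# scale factor is sqrt(2/3):
#     standardize([1.0, 2.0, 3.0])
#     # -> array([-1.22474487, 0., 1.22474487])  (approximately)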