/usr/share/pyshared/mvpa/clfs/gpr.py

# emacs: -*- mode: python; py-indent-offset: 4; indent-tabs-mode: nil -*-
# vi: set ft=python sts=4 ts=4 sw=4 et:
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
#   Copyright (c) 2008 Emanuele Olivetti <emanuele@relativita.com>
#   See COPYING file distributed along with the PyMVPA package for the
#   copyright and license terms.
#
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Gaussian Process Regression (GPR)."""

__docformat__ = 'restructuredtext'


import numpy as N

from mvpa.base import externals

from mvpa.misc.state import StateVariable
from mvpa.clfs.base import Classifier
from mvpa.misc.param import Parameter
from mvpa.clfs.kernel import KernelSquaredExponential, KernelLinear
from mvpa.measures.base import Sensitivity
from mvpa.misc.exceptions import InvalidHyperparameterError
from mvpa.datasets import Dataset

if externals.exists("scipy", raiseException=True):
    from scipy.linalg import cho_solve as SLcho_solve
    from scipy.linalg import cholesky as SLcholesky
    import scipy.linalg as SL
    # Some local binding for bits of speed up
    SLAError = SL.basic.LinAlgError

if __debug__:
    from mvpa.base import debug

# Some local bindings for bits of speed up
Nlog = N.log
Ndot = N.dot
Ndiag = N.diag
NLAcholesky = N.linalg.cholesky
NLAsolve = N.linalg.solve
NLAError = N.linalg.linalg.LinAlgError
eps64 = N.finfo(N.float64).eps

# Some precomputed items. log is relatively expensive
_halflog2pi = 0.5 * Nlog(2 * N.pi)


class GPR(Classifier):
    """Gaussian Process Regression (GPR).

    """

    predicted_variances = StateVariable(enabled=False,
        doc="Variance per each predicted value")

    log_marginal_likelihood = StateVariable(enabled=False,
        doc="Log Marginal Likelihood")

    log_marginal_likelihood_gradient = StateVariable(enabled=False,
        doc="Log Marginal Likelihood Gradient")

    _clf_internals = [ 'gpr', 'regression', 'retrainable' ]


    # NOTE XXX Parameters of the classifier. Values available as
    # clf.parameter or clf.params.parameter, or as
    # clf.params['parameter'] (as the full Parameter object)
    #
    # __doc__ and __repr__ for class is conviniently adjusted to
    # reflect values of those params

    # Kernel machines/classifiers should be refactored also to behave
    # the same and define kernel parameter appropriately... TODO, but SVMs
    # already kinda do it nicely ;-)

    sigma_noise = Parameter(0.001, allowedtype='float', min=1e-10,
        doc="the standard deviation of the gaussian noise.")

    # XXX For now I don't introduce kernel parameter since yet to unify
    # kernel machines
    #kernel = Parameter(None, allowedtype='Kernel',
    #    doc="Kernel object defining the covariance between instances. "
    #        "(Defaults to KernelSquaredExponential if None in arguments)")

    lm = Parameter(0.0, min=0.0, allowedtype='float',
                   doc="""The regularization term lambda.
                   Increase this when the kernel matrix is not positive, definite.""")


    def __init__(self, kernel=None, **kwargs):
        """Initialize a GPR regression analysis.

        :Parameters:
          kernel : Kernel
            a kernel object defining the covariance between instances.
            (Defaults to KernelSquaredExponential if None in arguments)
        """
        # init base class first
        Classifier.__init__(self, **kwargs)

        # It does not make sense to calculate a confusion matrix for a GPR
        # XXX it does ;) it will be a RegressionStatistics actually ;-)
        # So if someone desires -- let him have it
        # self.states.enable('training_confusion', False)

        # set kernel:
        if kernel is None:
            kernel = KernelSquaredExponential()
        self.__kernel = kernel

        # append proper clf_internal depending on the kernel
        # TODO: unify finally all kernel-based machines.
        #       make SMLR to use kernels
        if isinstance(kernel, KernelLinear):
            self._clf_internals += ['linear']
        else:
            self._clf_internals += ['non-linear']

        if externals.exists('openopt') \
               and not 'has_sensitivity' in self._clf_internals:
            self._clf_internals += ['has_sensitivity']

        # No need to initialize state variables. Unless they got set
        # they would raise an exception self.predicted_variances =
        # None self.log_marginal_likelihood = None
        self._init_internals()
        pass


    def _init_internals(self):
        """Reset some internal variables to None.

        To be used in constructor and untrain()
        """
        self._train_fv = None
        self._labels = None
        self._km_train_train = None
        self._train_labels = None
        self._alpha = None
        self._L = None
        self._LL = None
        self.__kernel.reset()
        pass


    def __repr__(self):
        """String summary of the object
        """
        return super(GPR, self).__repr__(
            prefixes=['kernel=%s' % self.__kernel])


    def compute_log_marginal_likelihood(self):
        """
        Compute log marginal likelihood using self.train_fv and self.labels.
        """
        if __debug__:
            debug("GPR", "Computing log_marginal_likelihood")
        self.log_marginal_likelihood = \
                                 -0.5*Ndot(self._train_labels, self._alpha) - \
                                  Nlog(self._L.diagonal()).sum() - \
                                  self._km_train_train.shape[0] * _halflog2pi
        return self.log_marginal_likelihood


    def compute_gradient_log_marginal_likelihood(self):
        """Compute gradient of the log marginal likelihood. This
        version use a more compact formula provided by Williams and
        Rasmussen book.
        """
        # XXX EO: check whether the precomputed self.alpha self.Kinv
        # are actually the ones corresponding to the hyperparameters
        # used to compute this gradient!
        # YYY EO: currently this is verified outside gpr.py but it is
        # not an efficient solution.
        # XXX EO: Do some memoizing since it could happen that some
        # hyperparameters are kept constant by user request, so we
        # don't need (somtimes) to recompute the corresponding
        # gradient again.

        # self.Kinv = N.linalg.inv(self._C)
        # Faster:
        Kinv = SLcho_solve(self._LL, N.eye(self._L.shape[0]))

        alphalphaT = N.dot(self._alpha[:,None], self._alpha[None,:])
        tmp = alphalphaT - Kinv
        # Pass tmp to __kernel and let it compute its gradient terms.
        # This scales up to huge number of hyperparameters:
        grad_LML_hypers = self.__kernel.compute_lml_gradient(
            tmp, self._train_fv)
        grad_K_sigma_n = 2.0*self.sigma_noise*N.eye(tmp.shape[0])
        # Add the term related to sigma_noise:
        # grad_LML_sigma_n = 0.5 * N.trace(N.dot(tmp,grad_K_sigma_n))
        # Faster formula: tr(AB) = (A*B.T).sum()
        grad_LML_sigma_n = 0.5 * (tmp * (grad_K_sigma_n).T).sum()
        lml_gradient = N.hstack([grad_LML_sigma_n, grad_LML_hypers])
        self.log_marginal_likelihood_gradient = lml_gradient
        return lml_gradient


    def compute_gradient_log_marginal_likelihood_logscale(self):
        """Compute gradient of the log marginal likelihood when
        hyperparameters are in logscale. This version use a more
        compact formula provided by Williams and Rasmussen book.
        """
        # Kinv = N.linalg.inv(self._C)
        # Faster:
        Kinv = SLcho_solve(self._LL, N.eye(self._L.shape[0]))
        alphalphaT = N.dot(self._alpha[:,None], self._alpha[None,:])
        tmp = alphalphaT - Kinv
        grad_LML_log_hypers = \
            self.__kernel.compute_lml_gradient_logscale(tmp, self._train_fv)
        grad_K_log_sigma_n = 2.0 * self.sigma_noise ** 2 * N.eye(Kinv.shape[0])
        # Add the term related to sigma_noise:
        # grad_LML_log_sigma_n = 0.5 * N.trace(N.dot(tmp, grad_K_log_sigma_n))
        # Faster formula: tr(AB) = (A * B.T).sum()
        grad_LML_log_sigma_n = 0.5 * (tmp * (grad_K_log_sigma_n).T).sum()
        lml_gradient = N.hstack([grad_LML_log_sigma_n, grad_LML_log_hypers])
        self.log_marginal_likelihood_gradient = lml_gradient
        return lml_gradient


    def getSensitivityAnalyzer(self, flavor='auto', **kwargs):
        """Returns a sensitivity analyzer for GPR.

        :Parameters:
          flavor : basestring
            What sensitivity to provide. Valid values are
            'linear', 'model_select', 'auto'.
            In case of 'auto' selects 'linear' for linear kernel
            and 'model_select' for the rest. 'linear' corresponds to
            GPRLinearWeights and 'model_select' to GRPWeights
        """
        # XXX The following two lines does not work since
        # self.__kernel is instance of kernel.KernelLinear and not
        # just KernelLinear. How to fix?
        # YYY yoh is not sure what is the problem... KernelLinear is actually
        #     kernel.KernelLinear so everything shoudl be ok
        if flavor == 'auto':
            flavor = ('model_select', 'linear')\
                     [int(isinstance(self.__kernel, KernelLinear))]
            if __debug__:
                debug("GPR", "Returning '%s' sensitivity analyzer" % flavor)

        # Return proper sensitivity
        if flavor == 'linear':
            return GPRLinearWeights(self, **kwargs)
        elif flavor == 'model_select':
            # sanity check
            if not ('has_sensitivity' in self._clf_internals):
                raise ValueError, \
                      "model_select flavor is not available probably " \
                      "due to not available 'openopt' module"
            return GPRWeights(self, **kwargs)
        else:
            raise ValueError, "Flavor %s is not recognized" % flavor


    def _train(self, data):
        """Train the classifier using `data` (`Dataset`).
        """

        # local bindings for faster lookup
        retrainable = self.params.retrainable
        if retrainable:
            newkernel = False
            newL = False
            _changedData = self._changedData

        self._train_fv = train_fv = data.samples
        self._train_labels = train_labels = data.labels

        if not retrainable or _changedData['traindata'] \
               or _changedData.get('kernel_params', False):
            if __debug__:
                debug("GPR", "Computing train train kernel matrix")
            self._km_train_train = km_train_train = self.__kernel.compute(train_fv)
            newkernel = True
            if retrainable:
                self._km_train_test = None # reset to facilitate recomputation
        else:
            if __debug__:
                debug("GPR", "Not recomputing kernel since retrainable and "
                      "nothing has changed")
            km_train_train = self._km_train_train # reuse

        if not retrainable or newkernel or _changedData['params']:
            if __debug__:
                debug("GPR", "Computing L. sigma_noise=%g" % self.sigma_noise)
            # XXX it seems that we do not need binding to object, but may be
            # commented out code would return?
            self._C = km_train_train + \
                  self.sigma_noise**2 * N.identity(km_train_train.shape[0], 'd')
            # The following decomposition could raise
            # N.linalg.linalg.LinAlgError because of numerical
            # reasons, due to the too rapid decay of 'self._C'
            # eigenvalues. In that case we try adding a small constant
            # to self._C, e.g. epsilon=1.0e-20. It should be a form of
            # Tikhonov regularization. This is equivalent to adding
            # little white gaussian noise to data.
            #
            # XXX EO: how to choose epsilon?
            #
            # Cholesky decomposition is provided by three different
            # NumPy/SciPy routines (fastest first):
            # 1) self._LL = scipy.linalg.cho_factor(self._C, lower=True)
            #    self._L = L = N.tril(self._LL[0])
            # 2) self._L = scipy.linalg.cholesky(self._C, lower=True)
            # 3) self._L = numpy.linalg.cholesky(self._C)
            # Even though 1 is the fastest we choose 2 since 1 does
            # not return a clean lower-triangular matrix (see docstring).

            # PBS: I just made it so the KernelMatrix is regularized
            # all the time.  I figured that if ever you were going to
            # use regularization, you would want to set it yourself
            # and use the same value for all folds of your data.
            try:
                # apply regularization
                epsilon = self.params.lm * N.eye(self._C.shape[0])
                self._L = SLcholesky(self._C + epsilon, lower=True)
                self._LL = (self._L, True)
            except SLAError:
                raise SLAError("Kernel matrix is not positive, definite.  " + \
                               "Try increasing the lm parameter.")
                pass
            newL = True
        else:
            if __debug__:
                debug("GPR", "Not computing L since kernel, data and params "
                      "stayed the same")
            L = self._L                 # reuse

        # XXX we leave _alpha being recomputed, although we could check
        #   if newL or _changedData['labels']
        #
        if __debug__:
            debug("GPR", "Computing alpha")
        # self._alpha = NLAsolve(L.transpose(),
        #                              NLAsolve(L, train_labels))
        # Faster:
        self._alpha = SLcho_solve(self._LL, train_labels)

        # compute only if the state is enabled
        if self.states.isEnabled('log_marginal_likelihood'):
            self.compute_log_marginal_likelihood()
            pass

        if retrainable:
            # we must assign it only if it is retrainable
            self.states.retrained = not newkernel or not newL

        if __debug__:
            debug("GPR", "Done training")

        pass


    def _predict(self, data):
        """
        Predict the output for the provided data.
        """
        retrainable = self.params.retrainable

        if not retrainable or self._changedData['testdata'] \
               or self._km_train_test is None:
            if __debug__:
                debug('GPR', "Computing train test kernel matrix")
            km_train_test = self.__kernel.compute(self._train_fv, data)
            if retrainable:
                self._km_train_test = km_train_test
                self.states.repredicted = False
        else:
            if __debug__:
                debug('GPR', "Not recomputing train test kernel matrix")
            km_train_test = self._km_train_test
            self.states.repredicted = True


        predictions = Ndot(km_train_test.transpose(), self._alpha)

        if self.states.isEnabled('predicted_variances'):
            # do computation only if state variable was enabled
            if not retrainable or self._km_test_test is None \
                   or self._changedData['testdata']:
                if __debug__:
                    debug('GPR', "Computing test test kernel matrix")
                km_test_test = self.__kernel.compute(data)
                if retrainable:
                    self._km_test_test = km_test_test
            else:
                if __debug__:
                    debug('GPR', "Not recomputing test test kernel matrix")
                km_test_test = self._km_test_test

            if __debug__:
                debug("GPR", "Computing predicted variances")
            L = self._L
            # v = NLAsolve(L, km_train_test)
            # Faster:
            piv = N.arange(L.shape[0])
            v = SL.lu_solve((L.T, piv), km_train_test, trans=1)
            # self.predicted_variances = \
            #     Ndiag(km_test_test - Ndot(v.T, v)) \
            #     + self.sigma_noise**2
            # Faster formula: N.diag(Ndot(v.T, v)) = (v**2).sum(0):
            self.predicted_variances = Ndiag(km_test_test) - (v ** 2).sum(0) \
                                       + self.sigma_noise ** 2
            pass

        if __debug__:
            debug("GPR", "Done predicting")
        return predictions


    def _setRetrainable(self, value, force=False):
        """Internal function : need to set _km_test_test
        """
        super(GPR, self)._setRetrainable(value, force)
        if force or (value and value != self.params.retrainable):
            self._km_test_test = None


    def untrain(self):
        super(GPR, self).untrain()
        # XXX might need to take special care for retrainable. later
        self._init_internals()
        pass


    def set_hyperparameters(self, hyperparameter):
        """
        Set hyperparameters' values.

        Note that 'hyperparameter' is a sequence so the order of its
        values is important. First value must be sigma_noise, then
        other kernel's hyperparameters values follow in the exact
        order the kernel expect them to be.
        """
        if hyperparameter[0] < self.params['sigma_noise'].min:
            raise InvalidHyperparameterError()
        self.sigma_noise = hyperparameter[0]
        if hyperparameter.size > 1:
            self.__kernel.set_hyperparameters(hyperparameter[1:])
            pass
        return

    kernel = property(fget=lambda self:self.__kernel)
    pass


class GPRLinearWeights(Sensitivity):
    """`SensitivityAnalyzer` that reports the weights GPR trained
    on a given `Dataset`.

    In case of KernelLinear compute explicitly the coefficients
    of the linear regression, together with their variances (if
    requested).

    Note that the intercept is not computed.
    """

    variances = StateVariable(enabled=False,
        doc="Variances of the weights (for KernelLinear)")

    _LEGAL_CLFS = [ GPR ]


    def _call(self, dataset):
        """Extract weights from GPR
        """

        clf = self.clf
        kernel = clf.kernel
        train_fv = clf._train_fv

        weights = Ndot(kernel.Sigma_p,
                        Ndot(train_fv.T, clf._alpha))

        if self.states.isEnabled('variances'):
            # super ugly formulas that can be quite surely improved:
            tmp = N.linalg.inv(clf._L)
            Kyinv = Ndot(tmp.T, tmp)
            # XXX in such lengthy matrix manipulations you might better off
            #     using N.matrix where * is a matrix product
            self.states.variances = Ndiag(
                kernel.Sigma_p -
                Ndot(kernel.Sigma_p,
                      Ndot(train_fv.T,
                            Ndot(Kyinv,
                                  Ndot(train_fv, kernel.Sigma_p)))))
        return weights


if externals.exists('openopt'):

    from mvpa.clfs.model_selector import ModelSelector

    class GPRWeights(Sensitivity):
        """`SensitivityAnalyzer` that reports the weights GPR trained
        on a given `Dataset`.
        """

        _LEGAL_CLFS = [ GPR ]

        def _call(self, dataset):
            """Extract weights from GPR
            """

            clf = self.clf
            # normalize data:
            clf._train_labels = (clf._train_labels - clf._train_labels.mean()) \
                                / clf._train_labels.std()
            # clf._train_fv = (clf._train_fv-clf._train_fv.mean(0)) \
            #                  /clf._train_fv.std(0)
            dataset = Dataset(samples=clf._train_fv, labels=clf._train_labels)
            clf.states.enable("log_marginal_likelihood")
            ms = ModelSelector(clf, dataset)
            # Note that some kernels does not have gradient yet!
            # XXX Make it initialize to clf's current hyperparameter values
            #     or may be add ability to specify starting points in the constructor
            sigma_noise_initial = 1.0e-5
            sigma_f_initial = 1.0
            length_scale_initial = N.ones(dataset.nfeatures)*1.0e4
            # length_scale_initial = N.random.rand(dataset.nfeatures)*1.0e4
            hyp_initial_guess = N.hstack([sigma_noise_initial,
                                          sigma_f_initial,
                                          length_scale_initial])
            fixedHypers = N.array([0]*hyp_initial_guess.size, dtype=bool)
            fixedHypers = None
            problem =  ms.max_log_marginal_likelihood(
                hyp_initial_guess=hyp_initial_guess,
                optimization_algorithm="scipy_lbfgsb",
                ftol=1.0e-3, fixedHypers=fixedHypers,
                use_gradient=True, logscale=True)
            if __debug__ and 'GPR_WEIGHTS' in debug.active:
                problem.iprint = 1
            lml = ms.solve()
            weights = 1.0/ms.hyperparameters_best[2:] # weight = 1/length_scale
            if __debug__:
                debug("GPR",
                      "%s, train: shape %s, labels %s, min:max %g:%g, "
                      "sigma_noise %g, sigma_f %g" %
                      (clf, clf._train_fv.shape, N.unique(clf._train_labels),
                       clf._train_fv.min(), clf._train_fv.max(),
                       ms.hyperparameters_best[0], ms.hyperparameters_best[1]))

            return weights
python-mvpa 0.4.8-3 / usr / share / pyshared / mvpa / clfs / gpr.py