merge with commit of dgplvm

frb-yousefi 2015-04-20 16:02:19 +01:00
commit 401374d068
152 changed files with 4272 additions and 1875 deletions

View file

@@ -1,24 +1,24 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from gp_regression import GPRegression
from gp_classification import GPClassification
from sparse_gp_regression import SparseGPRegression, SparseGPRegressionUncertainInput
from sparse_gp_classification import SparseGPClassification
from gplvm import GPLVM
from bcgplvm import BCGPLVM
from sparse_gplvm import SparseGPLVM
from warped_gp import WarpedGP
from bayesian_gplvm import BayesianGPLVM
from mrd import MRD
from gradient_checker import GradientChecker
from ss_gplvm import SSGPLVM
from gp_coregionalized_regression import GPCoregionalizedRegression
from sparse_gp_coregionalized_regression import SparseGPCoregionalizedRegression
from gp_heteroscedastic_regression import GPHeteroscedasticRegression
from ss_mrd import SSMRD
from gp_kronecker_gaussian_regression import GPKroneckerGaussianRegression
from gp_var_gauss import GPVariationalGaussianApproximation
from one_vs_all_classification import OneVsAllClassification
from one_vs_all_sparse_classification import OneVsAllSparseClassification
from dpgplvm import DPBayesianGPLVM
from .gp_regression import GPRegression
from .gp_classification import GPClassification
from .sparse_gp_regression import SparseGPRegression, SparseGPRegressionUncertainInput
from .sparse_gp_classification import SparseGPClassification
from .gplvm import GPLVM
from .bcgplvm import BCGPLVM
from .sparse_gplvm import SparseGPLVM
from .warped_gp import WarpedGP
from .bayesian_gplvm import BayesianGPLVM
from .mrd import MRD
from .gradient_checker import GradientChecker, HessianChecker, SkewChecker
from .ss_gplvm import SSGPLVM
from .gp_coregionalized_regression import GPCoregionalizedRegression
from .sparse_gp_coregionalized_regression import SparseGPCoregionalizedRegression
from .gp_heteroscedastic_regression import GPHeteroscedasticRegression
from .ss_mrd import SSMRD
from .gp_kronecker_gaussian_regression import GPKroneckerGaussianRegression
from .gp_var_gauss import GPVariationalGaussianApproximation
from .one_vs_all_classification import OneVsAllClassification
from .one_vs_all_sparse_classification import OneVsAllSparseClassification
from .dpgplvm import DPBayesianGPLVM
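A note on the import block above: the old form (from gp_regression import GPRegression) relies on Python 2's implicit relative imports, which Python 3 rejects; the explicit relative form (from .gp_regression import ...) works on both. A minimal, self-contained sketch of the same migration, using a throwaway package with hypothetical names:

import os, sys, tempfile

# Build a tiny throwaway package on disk to contrast the two import styles.
pkg_dir = os.path.join(tempfile.mkdtemp(), "demo_pkg")
os.makedirs(pkg_dir)
with open(os.path.join(pkg_dir, "regression.py"), "w") as fh:
    fh.write("class Regression(object):\n    pass\n")
with open(os.path.join(pkg_dir, "__init__.py"), "w") as fh:
    # 'from regression import Regression' works only on Python 2 (implicit
    # relative import); the explicit relative form below works on 2 and 3.
    fh.write("from .regression import Regression\n")

sys.path.insert(0, os.path.dirname(pkg_dir))
from demo_pkg import Regression
print(Regression)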

View file

@@ -24,7 +24,7 @@ class BayesianGPLVM(SparseGP_MPI):
def __init__(self, Y, input_dim, X=None, X_variance=None, init='PCA', num_inducing=10,
Z=None, kernel=None, inference_method=None, likelihood=None,
name='bayesian gplvm', mpi_comm=None, normalizer=None,
missing_data=False, stochastic=False, batchsize=1):
missing_data=False, stochastic=False, batchsize=1, Y_metadata=None):
self.logger = logging.getLogger(self.__class__.__name__)
if X is None:
@@ -69,6 +69,7 @@ class BayesianGPLVM(SparseGP_MPI):
name=name, inference_method=inference_method,
normalizer=normalizer, mpi_comm=mpi_comm,
variational_prior=self.variational_prior,
Y_metadata=Y_metadata
)
self.link_parameter(self.X, index=0)
@@ -83,7 +84,7 @@ class BayesianGPLVM(SparseGP_MPI):
def parameters_changed(self):
super(BayesianGPLVM,self).parameters_changed()
if isinstance(self.inference_method, VarDTC_minibatch):
return
return
kl_fctr = 1.
self._log_marginal_likelihood -= kl_fctr*self.variational_prior.KL_divergence(self.X)
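The only functional change in this file is the new Y_metadata keyword, which is now forwarded to the SparseGP_MPI constructor. A minimal usage sketch, assuming a GPy build that contains this commit and the standard Model.optimize interface; the contents of the metadata dict depend on the likelihood and are simply left as None here:

import numpy as np
import GPy

Y = np.random.randn(40, 5)                      # toy high-dimensional observations
m = GPy.models.BayesianGPLVM(Y, input_dim=2, num_inducing=10,
                             Y_metadata=None)   # per-output metadata for non-Gaussian likelihoods, if any
m.optimize(max_iters=10)
print(m.log_likelihood())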

View file

@@ -5,6 +5,8 @@ from ..core.model import Model
import itertools
import numpy
from ..core.parameterization import Param
np = numpy
from ..util.block_matrices import get_blocks, get_block_shapes, unblock, get_blocks_3d, get_block_shapes_3d
def get_shape(x):
if isinstance(x, numpy.ndarray):
@@ -111,3 +113,261 @@ class GradientChecker(Model):
#for name, shape in zip(self.names, self.shapes):
#_param_names.extend(map(lambda nameshape: ('_'.join(nameshape)).strip('_'), itertools.izip(itertools.repeat(name), itertools.imap(lambda t: '_'.join(map(str, t)), itertools.product(*map(lambda xi: range(xi), shape))))))
#return _param_names
class HessianChecker(GradientChecker):
def __init__(self, f, df, ddf, x0, names=None, *args, **kwargs):
"""
:param f: Function (only used for numerical hessian gradient)
:param df: Gradient of function to check
:param ddf: Analytical gradient function
:param x0:
Initial guess for inputs x (if it has a shape (a,b) this will be reflected in the parameter names).
Can be a list of arrays, if takes a list of arrays. This list will be passed
to f and df in the same order as given here.
If only one argument, make sure not to pass a list!!!
:type x0: [array-like] | array-like | float | int
:param names:
Names to print, when performing gradcheck. If a list was passed to x0
a list of names with the same length is expected.
:param args: Arguments passed as f(x, *args, **kwargs) and df(x, *args, **kwargs)
"""
super(HessianChecker, self).__init__(df, ddf, x0, names=names, *args, **kwargs)
self._f = f
self._df = df
self._ddf = ddf
def checkgrad(self, target_param=None, verbose=False, step=1e-6, tolerance=1e-3, block_indices=None, plot=False):
"""
Overwrite checkgrad method to check whole block instead of looping through
Shows diagnostics using matshow instead
:param verbose: If True, print a "full" checking of each parameter
:type verbose: bool
:param step: The size of the step around which to linearise the objective
:type step: float (default 1e-6)
:param tolerance: the tolerance allowed (see note)
:type tolerance: float (default 1e-3)
Note:-
The gradient is considered correct if the ratio of the analytical
and numerical gradients is within <tolerance> of unity.
"""
try:
import numdifftools as nd
except:
raise ImportError("Don't have numdifftools package installed, it is not a GPy dependency as of yet, it is only used for hessian tests")
if target_param:
raise NotImplementedError('Only basic functionality is provided with this gradchecker')
#Repeat for each parameter, not the nicest but shouldn't be many cases where there are many
#variables
current_index = 0
for name, shape in zip(self.names, self.shapes):
current_size = numpy.prod(shape)
x = self.optimizer_array.copy()
#x = self._get_params_transformed().copy()
x = x[current_index:current_index + current_size].reshape(shape)
# Check gradients
analytic_hess = self._ddf(x)
if analytic_hess.shape[1] == 1:
analytic_hess = numpy.diagflat(analytic_hess)
#From the docs:
#x0 : vector location
#at which to differentiate fun
#If x0 is an N x M array, then fun is assumed to be a function
#of N*M variables., thus we must have it flat, not (N,1), but just (N,)
#numeric_hess_partial = nd.Hessian(self._f, vectorized=False)
numeric_hess_partial = nd.Jacobian(self._df, vectorized=False)
#numeric_hess_partial = nd.Derivative(self._df, vectorized=True)
numeric_hess = numeric_hess_partial(x)
check_passed = self.checkgrad_block(analytic_hess, numeric_hess, verbose=verbose, step=step, tolerance=tolerance, block_indices=block_indices, plot=plot)
current_index += current_size
return check_passed
def checkgrad_block(self, analytic_hess, numeric_hess, verbose=False, step=1e-6, tolerance=1e-3, block_indices=None, plot=False):
"""
Checkgrad a block matrix
"""
if analytic_hess.dtype is np.dtype('object'):
#Make numeric hessian also into a block matrix
real_size = get_block_shapes(analytic_hess)
num_elements = np.sum(real_size)
if (num_elements, num_elements) == numeric_hess.shape:
#If the sizes are the same we assume they are the same
#(we have not fixed any values so the numeric is the whole hessian)
numeric_hess = get_blocks(numeric_hess, real_size)
else:
#Make a fake empty matrix and fill out the correct block
tmp_numeric_hess = get_blocks(np.zeros((num_elements, num_elements)), real_size)
tmp_numeric_hess[block_indices] = numeric_hess.copy()
numeric_hess = tmp_numeric_hess
if block_indices is not None:
#Extract the right block
analytic_hess = analytic_hess[block_indices]
numeric_hess = numeric_hess[block_indices]
else:
#Unblock them if they are in blocks and you aren't checking a single block (checking whole hessian)
if analytic_hess.dtype is np.dtype('object'):
analytic_hess = unblock(analytic_hess)
numeric_hess = unblock(numeric_hess)
ratio = numeric_hess / (numpy.where(analytic_hess==0, 1e-10, analytic_hess))
difference = numpy.abs(analytic_hess - numeric_hess)
check_passed = numpy.all((numpy.abs(1 - ratio)) < tolerance) or numpy.allclose(numeric_hess, analytic_hess, atol = tolerance)
if verbose:
if block_indices:
print "\nBlock {}".format(block_indices)
else:
print "\nAll blocks"
header = ['Checked', 'Max-Ratio', 'Min-Ratio', 'Min-Difference', 'Max-Difference']
header_string = map(lambda x: ' | '.join(header), [header])
separator = '-' * len(header_string[0])
print '\n'.join([header_string[0], separator])
min_r = '%.6f' % float(numpy.min(ratio))
max_r = '%.6f' % float(numpy.max(ratio))
max_d = '%.6f' % float(numpy.max(difference))
min_d = '%.6f' % float(numpy.min(difference))
cols = [max_r, min_r, min_d, max_d]
if check_passed:
checked = "\033[92m True \033[0m"
else:
checked = "\033[91m False \033[0m"
grad_string = "{} | {} | {} | {} | {} ".format(checked, cols[0], cols[1], cols[2], cols[3])
print grad_string
if plot:
import pylab as pb
fig, axes = pb.subplots(2, 2)
max_lim = numpy.max(numpy.vstack((analytic_hess, numeric_hess)))
min_lim = numpy.min(numpy.vstack((analytic_hess, numeric_hess)))
msa = axes[0,0].matshow(analytic_hess, vmin=min_lim, vmax=max_lim)
axes[0,0].set_title('Analytic hessian')
axes[0,0].xaxis.set_ticklabels([None])
axes[0,0].yaxis.set_ticklabels([None])
axes[0,0].xaxis.set_ticks([None])
axes[0,0].yaxis.set_ticks([None])
msn = axes[0,1].matshow(numeric_hess, vmin=min_lim, vmax=max_lim)
pb.colorbar(msn, ax=axes[0,1])
axes[0,1].set_title('Numeric hessian')
axes[0,1].xaxis.set_ticklabels([None])
axes[0,1].yaxis.set_ticklabels([None])
axes[0,1].xaxis.set_ticks([None])
axes[0,1].yaxis.set_ticks([None])
msr = axes[1,0].matshow(ratio)
pb.colorbar(msr, ax=axes[1,0])
axes[1,0].set_title('Ratio')
axes[1,0].xaxis.set_ticklabels([None])
axes[1,0].yaxis.set_ticklabels([None])
axes[1,0].xaxis.set_ticks([None])
axes[1,0].yaxis.set_ticks([None])
msd = axes[1,1].matshow(difference)
pb.colorbar(msd, ax=axes[1,1])
axes[1,1].set_title('difference')
axes[1,1].xaxis.set_ticklabels([None])
axes[1,1].yaxis.set_ticklabels([None])
axes[1,1].xaxis.set_ticks([None])
axes[1,1].yaxis.set_ticks([None])
if block_indices:
fig.suptitle("Block: {}".format(block_indices))
pb.show()
return check_passed
class SkewChecker(HessianChecker):
def __init__(self, df, ddf, dddf, x0, names=None, *args, **kwargs):
"""
:param df: gradient of function
:param ddf: Gradient of function to check (hessian)
:param dddf: Analytical gradient function (third derivative)
:param x0:
Initial guess for inputs x (if it has a shape (a,b) this will be reflected in the parameter names).
Can be a list of arrays, if takes a list of arrays. This list will be passed
to f and df in the same order as given here.
If only one argument, make sure not to pass a list!!!
:type x0: [array-like] | array-like | float | int
:param names:
Names to print, when performing gradcheck. If a list was passed to x0
a list of names with the same length is expected.
:param args: Arguments passed as f(x, *args, **kwargs) and df(x, *args, **kwargs)
"""
super(SkewChecker, self).__init__(df, ddf, dddf, x0, names=names, *args, **kwargs)
def checkgrad(self, target_param=None, verbose=False, step=1e-6, tolerance=1e-3, block_indices=None, plot=False, super_plot=False):
"""
Gradient checker that just checks each hessian individually
super_plot will plot the hessian wrt every parameter, plot will just do the first one
"""
try:
import numdifftools as nd
except:
raise ImportError("Don't have numdifftools package installed, it is not a GPy dependency as of yet, it is only used for hessian tests")
if target_param:
raise NotImplementedError('Only basic functionality is provided with this gradchecker')
#Repeat for each parameter, not the nicest but shouldn't be many cases where there are many
#variables
current_index = 0
for name, n_shape in zip(self.names, self.shapes):
current_size = numpy.prod(n_shape)
x = self.optimizer_array.copy()
#x = self._get_params_transformed().copy()
x = x[current_index:current_index + current_size].reshape(n_shape)
# Check gradients
#Actually the third derivative
analytic_hess = self._ddf(x)
#Can only calculate jacobian for one variable at a time
#From the docs:
#x0 : vector location
#at which to differentiate fun
#If x0 is an N x M array, then fun is assumed to be a function
#of N*M variables., thus we must have it flat, not (N,1), but just (N,)
#numeric_hess_partial = nd.Hessian(self._f, vectorized=False)
#Actually _df is already the hessian
numeric_hess_partial = nd.Jacobian(self._df, vectorized=True)
numeric_hess = numeric_hess_partial(x)
print "Done making numerical hessian"
if analytic_hess.dtype is np.dtype('object'):
#Blockify numeric_hess aswell
blocksizes, pagesizes = get_block_shapes_3d(analytic_hess)
#HACK
real_block_size = np.sum(blocksizes)
numeric_hess = numeric_hess.reshape(real_block_size, real_block_size, pagesizes)
#numeric_hess = get_blocks_3d(numeric_hess, blocksizes)#, pagesizes)
else:
numeric_hess = numeric_hess.reshape(*analytic_hess.shape)
#Check every block individually (for ease)
check_passed = [False]*numeric_hess.shape[2]
for block_ind in xrange(numeric_hess.shape[2]):
#Unless super_plot is set, just plot the first one
p = True if (plot and block_ind == numeric_hess.shape[2]-1) or super_plot else False
if verbose:
print "Checking derivative of hessian wrt parameter number {}".format(block_ind)
check_passed[block_ind] = self.checkgrad_block(analytic_hess[:,:,block_ind], numeric_hess[:,:,block_ind], verbose=verbose, step=step, tolerance=tolerance, block_indices=block_indices, plot=p)
current_index += current_size
return np.all(check_passed)
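HessianChecker and SkewChecker, exported from GPy.models by the __init__.py change above, extend GradientChecker so that whole Hessian blocks are compared against a numdifftools Jacobian of the supplied gradient instead of looping element by element. A minimal, hypothetical sketch on a quadratic whose Hessian is a constant matrix; it assumes numdifftools is installed:

import numpy as np
from GPy.models import HessianChecker

A = np.array([[3.0, 0.5],
              [0.5, 2.0]])                       # symmetric, so the Hessian is exactly A
f   = lambda x: 0.5 * x.dot(A).dot(x)            # objective (kept only for reference)
df  = lambda x: A.dot(x)                         # analytic gradient
ddf = lambda x: A                                # analytic Hessian (constant)

hc = HessianChecker(f, df, ddf, x0=np.random.randn(2))
print(hc.checkgrad())                            # compares A with the numeric Jacobian of df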

View file

@@ -74,6 +74,8 @@ class MRD(BayesianGPLVMMiniBatch):
self.logger.debug("creating observable arrays")
self.Ylist = [ObsAr(Y) for Y in Ylist]
#The next line is a fix for Python 3. It replicates the python 2 behaviour from the above comprehension
Y = Ylist[-1]
if Ynames is None:
self.logger.debug("creating Ynames")
@@ -82,7 +84,7 @@ class MRD(BayesianGPLVMMiniBatch):
assert len(self.names) == len(self.Ylist), "one name per dataset, or None if Ylist is a dict"
if inference_method is None:
self.inference_method = InferenceMethodList([VarDTC() for _ in xrange(len(self.Ylist))])
self.inference_method = InferenceMethodList([VarDTC() for _ in range(len(self.Ylist))])
else:
assert isinstance(inference_method, InferenceMethodList), "please provide one inference method per Y in the list and provide it as InferenceMethodList, inference_method given: {}".format(inference_method)
self.inference_method = inference_method
@@ -137,7 +139,7 @@ class MRD(BayesianGPLVMMiniBatch):
self.bgplvms = []
for i, n, k, l, Y, im, bs in itertools.izip(itertools.count(), Ynames, kernels, likelihoods, Ylist, self.inference_method, batchsize):
for i, n, k, l, Y, im, bs in zip(itertools.count(), Ynames, kernels, likelihoods, Ylist, self.inference_method, batchsize):
assert Y.shape[0] == self.num_data, "All datasets need to share the number of datapoints, and those have to correspond to one another"
md = np.isnan(Y).any()
spgp = BayesianGPLVMMiniBatch(Y, input_dim, X, X_variance,
@@ -164,7 +166,7 @@ class MRD(BayesianGPLVMMiniBatch):
self._log_marginal_likelihood = 0
self.Z.gradient[:] = 0.
self.X.gradient[:] = 0.
for b, i in itertools.izip(self.bgplvms, self.inference_method):
for b, i in zip(self.bgplvms, self.inference_method):
self._log_marginal_likelihood += b._log_marginal_likelihood
self.logger.info('working on im <{}>'.format(hex(id(i))))
@@ -195,7 +197,7 @@ class MRD(BayesianGPLVMMiniBatch):
elif init in "PCA_single":
X = np.zeros((Ylist[0].shape[0], self.input_dim))
fracs = []
for qs, Y in itertools.izip(np.array_split(np.arange(self.input_dim), len(Ylist)), Ylist):
for qs, Y in zip(np.array_split(np.arange(self.input_dim), len(Ylist)), Ylist):
x,frcs = initialize_latent('PCA', len(qs), Y)
X[:, qs] = x
fracs.append(frcs)
@@ -327,9 +329,9 @@ class MRD(BayesianGPLVMMiniBatch):
def __getstate__(self):
state = super(MRD, self).__getstate__()
if state.has_key('kern'):
if 'kern' in state:
del state['kern']
if state.has_key('likelihood'):
if 'likelihood' in state:
del state['likelihood']
return state
@@ -338,4 +340,4 @@ class MRD(BayesianGPLVMMiniBatch):
super(MRD, self).__setstate__(state)
self.kern = self.bgplvms[0].kern
self.likelihood = self.bgplvms[0].likelihood
self.parameters_changed()
self.parameters_changed()
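The MRD edits above are pure Python-2-to-3 compatibility changes (has_key, itertools.izip, xrange, plus the comprehension-variable fix noted in the comment). The replaced idioms and their portable equivalents, as a small runnable side-by-side:

state = {'kern': 1, 'likelihood': 2}
# state.has_key('kern')            # Python 2 only
print('kern' in state)             # works on Python 2 and 3

a, b = [1, 2, 3], ['x', 'y', 'z']
# itertools.izip(a, b)             # Python 2 only; izip was removed in Python 3
print(list(zip(a, b)))             # zip exists on both (lazy on Python 3)

# xrange(3)                        # Python 2 only
print([i * i for i in range(3)])   # range exists on both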

View file

@@ -30,7 +30,7 @@ class OneVsAllSparseClassification(object):
self.results = {}
for yj in labels:
print 'Class %s vs all' %yj
print('Class %s vs all' %yj)
Ynew = Y.copy()
Ynew[Y.flatten()!=yj] = 0
Ynew[Y.flatten()==yj] = 1

View file

@@ -1,6 +1,7 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from __future__ import print_function
import numpy as np
from ..core.parameterization.param import Param
from ..core.sparse_gp import SparseGP
@@ -43,14 +44,15 @@ class SparseGPMiniBatch(SparseGP):
def __init__(self, X, Y, Z, kernel, likelihood, inference_method=None,
name='sparse gp', Y_metadata=None, normalizer=False,
missing_data=False, stochastic=False, batchsize=1):
#pick a sensible inference method
# pick a sensible inference method
if inference_method is None:
if isinstance(likelihood, likelihoods.Gaussian):
inference_method = var_dtc.VarDTC(limit=1 if not self.missing_data else Y.shape[1])
inference_method = var_dtc.VarDTC(limit=1 if not missing_data else Y.shape[1])
else:
#inference_method = ??
raise NotImplementedError, "what to do what to do?"
print "defaulting to ", inference_method, "for latent function inference"
raise NotImplementedError("what to do what to do?")
print("defaulting to ", inference_method, "for latent function inference")
self.kl_factr = 1.
self.Z = Param('inducing inputs', Z)
@@ -80,13 +82,13 @@ class SparseGPMiniBatch(SparseGP):
overall = self.Y_normalized.shape[1]
m_f = lambda i: "Precomputing Y for missing data: {: >7.2%}".format(float(i+1)/overall)
message = m_f(-1)
print message,
for d in xrange(overall):
print(message, end=' ')
for d in range(overall):
self.Ylist.append(self.Y_normalized[self.ninan[:, d], d][:, None])
print ' '*(len(message)+1) + '\r',
print(' '*(len(message)+1) + '\r', end=' ')
message = m_f(d)
print message,
print ''
print(message, end=' ')
print('')
self.posterior = None
@@ -181,11 +183,11 @@ class SparseGPMiniBatch(SparseGP):
full_values[key][value_indices[key]] += current_values[key]
"""
for key in current_values.keys():
if value_indices is not None and value_indices.has_key(key):
if value_indices is not None and key in value_indices:
index = value_indices[key]
else:
index = slice(None)
if full_values.has_key(key):
if key in full_values:
full_values[key][index] += current_values[key]
else:
full_values[key] = current_values[key]
@@ -241,15 +243,15 @@ class SparseGPMiniBatch(SparseGP):
if not self.stochastics:
m_f = lambda i: "Inference with missing_data: {: >7.2%}".format(float(i+1)/self.output_dim)
message = m_f(-1)
print message,
print(message, end=' ')
for d in self.stochastics.d:
ninan = self.ninan[:, d]
if not self.stochastics:
print ' '*(len(message)) + '\r',
print(' '*(len(message)) + '\r', end=' ')
message = m_f(d)
print message,
print(message, end=' ')
posterior, log_marginal_likelihood, \
grad_dict, current_values, value_indices = self._inner_parameters_changed(
@@ -268,7 +270,7 @@ class SparseGPMiniBatch(SparseGP):
woodbury_vector[:, d:d+1] = posterior.woodbury_vector
self._log_marginal_likelihood += log_marginal_likelihood
if not self.stochastics:
print ''
print('')
if self.posterior is None:
self.posterior = Posterior(woodbury_inv=woodbury_inv, woodbury_vector=woodbury_vector,
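The print changes in this file depend on the "from __future__ import print_function" line added at the top of the module: the Python 2 statement form "print message," (trailing comma suppresses the newline) becomes the function form print(message, end=' '). A toy version of the same overwrite-in-place progress display, with illustrative names only:

from __future__ import print_function   # required on Python 2 for print(..., end=...)
import sys

total = 5
for d in range(total):
    message = "Precomputing: {: >7.2%}".format(float(d + 1) / total)
    print(message, end='\r')             # '\r' rewinds to the line start instead of adding a newline
    sys.stdout.flush()
print('')                                # final newline once the loop is done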

View file

@@ -39,7 +39,10 @@ class SSGPLVM(SparseGP_MPI):
X_variance = np.random.uniform(0,.1,X.shape)
if Gamma is None:
gamma = np.random.randn(X.shape[0], input_dim)
gamma = np.empty_like(X) # The posterior probabilities of the binary variable in the variational approximation
gamma[:] = 0.5 + 0.1 * np.random.randn(X.shape[0], input_dim)
gamma[gamma>1.-1e-9] = 1.-1e-9
gamma[gamma<1e-9] = 1e-9
else:
gamma = Gamma.copy()
@@ -71,7 +74,7 @@ class SSGPLVM(SparseGP_MPI):
self.link_parameter(self.X, index=0)
if self.group_spike:
[self.X.gamma[:,i].tie('tieGamma'+str(i)) for i in xrange(self.X.gamma.shape[1])] # Tie columns together
[self.X.gamma[:,i].tie('tieGamma'+str(i)) for i in range(self.X.gamma.shape[1])] # Tie columns together
def set_X_gradients(self, X, X_grad):
"""Set the gradients of the posterior distribution of X in its specific form."""

View file

@@ -19,10 +19,10 @@ class SSMRD(Model):
name='model_'+str(i)) for i,y in enumerate(Ylist)]
self.add_parameters(*(self.models))
[[[self.models[m].X.mean[i,j:j+1].tie('mean_'+str(i)+'_'+str(j)) for m in xrange(len(self.models))] for j in xrange(self.models[0].X.mean.shape[1])]
for i in xrange(self.models[0].X.mean.shape[0])]
[[[self.models[m].X.variance[i,j:j+1].tie('var_'+str(i)+'_'+str(j)) for m in xrange(len(self.models))] for j in xrange(self.models[0].X.variance.shape[1])]
for i in xrange(self.models[0].X.variance.shape[0])]
[[[self.models[m].X.mean[i,j:j+1].tie('mean_'+str(i)+'_'+str(j)) for m in range(len(self.models))] for j in range(self.models[0].X.mean.shape[1])]
for i in range(self.models[0].X.mean.shape[0])]
[[[self.models[m].X.variance[i,j:j+1].tie('var_'+str(i)+'_'+str(j)) for m in range(len(self.models))] for j in range(self.models[0].X.variance.shape[1])]
for i in range(self.models[0].X.variance.shape[0])]
self.updates = True
@@ -31,4 +31,4 @@ class SSMRD(Model):
self._log_marginal_likelihood = sum([m._log_marginal_likelihood for m in self.models])
def log_likelihood(self):
return self._log_marginal_likelihood
return self._log_marginal_likelihood

View file

@@ -1,7 +1,6 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from ..util.warping_functions import *
from ..core import GP
@@ -10,14 +9,16 @@ from GPy.util.warping_functions import TanhWarpingFunction_d
from GPy import kern
class WarpedGP(GP):
def __init__(self, X, Y, kernel=None, warping_function=None, warping_terms=3, normalize_X=False, normalize_Y=False):
def __init__(self, X, Y, kernel=None, warping_function=None, warping_terms=3):
if kernel is None:
kernel = kern.rbf(X.shape[1])
kernel = kern.RBF(X.shape[1])
if warping_function == None:
self.warping_function = TanhWarpingFunction_d(warping_terms)
self.warping_params = (np.random.randn(self.warping_function.n_terms * 3 + 1,) * 1)
else:
self.warping_function = warping_function
self.scale_data = False
if self.scale_data:
@@ -25,10 +26,10 @@ class WarpedGP(GP):
self.has_uncertain_inputs = False
self.Y_untransformed = Y.copy()
self.predict_in_warped_space = False
likelihood = likelihoods.Gaussian(self.transform_data(), normalize=normalize_Y)
likelihood = likelihoods.Gaussian()
GP.__init__(self, X, likelihood, kernel, normalize_X=normalize_X)
self._set_params(self._get_params())
GP.__init__(self, X, self.transform_data(), likelihood=likelihood, kernel=kernel)
self.link_parameter(self.warping_function)
def _scale_data(self, Y):
self._Ymax = Y.max()
@@ -38,62 +39,55 @@ class WarpedGP(GP):
def _unscale_data(self, Y):
return (Y + 0.5) * (self._Ymax - self._Ymin) + self._Ymin
def _set_params(self, x):
self.warping_params = x[:self.warping_function.num_parameters]
Y = self.transform_data()
self.likelihood.set_data(Y)
GP._set_params(self, x[self.warping_function.num_parameters:].copy())
def parameters_changed(self):
self.Y[:] = self.transform_data()
super(WarpedGP, self).parameters_changed()
def _get_params(self):
return np.hstack((self.warping_params.flatten().copy(), GP._get_params(self).copy()))
Kiy = self.posterior.woodbury_vector.flatten()
def _get_param_names(self):
warping_names = self.warping_function._get_param_names()
param_names = GP._get_param_names(self)
return warping_names + param_names
def transform_data(self):
Y = self.warping_function.f(self.Y_untransformed.copy(), self.warping_params).copy()
return Y
def log_likelihood(self):
ll = GP.log_likelihood(self)
jacobian = self.warping_function.fgrad_y(self.Y_untransformed, self.warping_params)
return ll + np.log(jacobian).sum()
def _log_likelihood_gradients(self):
ll_grads = GP._log_likelihood_gradients(self)
alpha = np.dot(self.Ki, self.likelihood.Y.flatten())
warping_grads = self.warping_function_gradients(alpha)
warping_grads = np.append(warping_grads[:, :-1].flatten(), warping_grads[0, -1])
return np.hstack((warping_grads.flatten(), ll_grads.flatten()))
def warping_function_gradients(self, Kiy):
grad_y = self.warping_function.fgrad_y(self.Y_untransformed, self.warping_params)
grad_y_psi, grad_psi = self.warping_function.fgrad_y_psi(self.Y_untransformed, self.warping_params,
grad_y = self.warping_function.fgrad_y(self.Y_untransformed)
grad_y_psi, grad_psi = self.warping_function.fgrad_y_psi(self.Y_untransformed,
return_covar_chain=True)
djac_dpsi = ((1.0 / grad_y[:, :, None, None]) * grad_y_psi).sum(axis=0).sum(axis=0)
dquad_dpsi = (Kiy[:, None, None, None] * grad_psi).sum(axis=0).sum(axis=0)
return -dquad_dpsi + djac_dpsi
warping_grads = -dquad_dpsi + djac_dpsi
self.warping_function.psi.gradient[:] = warping_grads[:, :-1]
self.warping_function.d.gradient[:] = warping_grads[0, -1]
def transform_data(self):
Y = self.warping_function.f(self.Y_untransformed.copy()).copy()
return Y
def log_likelihood(self):
ll = GP.log_likelihood(self)
jacobian = self.warping_function.fgrad_y(self.Y_untransformed)
return ll + np.log(jacobian).sum()
def plot_warping(self):
self.warping_function.plot(self.warping_params, self.Y_untransformed.min(), self.Y_untransformed.max())
self.warping_function.plot(self.Y_untransformed.min(), self.Y_untransformed.max())
def predict(self, Xnew, which_parts='all', full_cov=False, pred_init=None):
def predict(self, Xnew, which_parts='all', pred_init=None):
# normalize X values
Xnew = (Xnew.copy() - self._Xoffset) / self._Xscale
mu, var = GP._raw_predict(self, Xnew, full_cov=full_cov, which_parts=which_parts)
# Xnew = (Xnew.copy() - self._Xoffset) / self._Xscale
mu, var = GP._raw_predict(self, Xnew)
# now push through likelihood
mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov)
mean, var = self.likelihood.predictive_values(mu, var)
if self.predict_in_warped_space:
mean = self.warping_function.f_inv(mean, self.warping_params, y=pred_init)
var = self.warping_function.f_inv(var, self.warping_params)
mean = self.warping_function.f_inv(mean, y=pred_init)
var = self.warping_function.f_inv(var)
if self.scale_data:
mean = self._unscale_data(mean)
return mean, var, _025pm, _975pm
return mean, var
if __name__ == '__main__':
X = np.random.randn(100, 1)
Y = np.sin(X) + np.random.randn(100, 1)*0.05
m = WarpedGP(X, Y)
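The refactored WarpedGP keeps the same model as before: the observations y are pushed through a monotonic warping f, a standard GP is fitted to f(y), and log_likelihood adds the log-Jacobian correction sum_n log f'(y_n), which is the "ll + np.log(jacobian).sum()" term above. A self-contained numeric sketch of that correction with a hypothetical cubic warping (the class itself uses TanhWarpingFunction_d):

import numpy as np

np.random.seed(0)
X = np.random.randn(20, 1)
Y = np.sin(X) + 0.05 * np.random.randn(20, 1)

f     = lambda y: y + y ** 3                 # hypothetical monotonic warping
fgrad = lambda y: 1.0 + 3.0 * y ** 2         # its derivative, positive everywhere

K = np.exp(-0.5 * (X - X.T) ** 2) + 0.1 * np.eye(20)   # RBF Gram matrix plus a noise term
z = f(Y)
_, logdet = np.linalg.slogdet(K)
gp_ll = -0.5 * (float(z.T.dot(np.linalg.solve(K, z))) + logdet + 20 * np.log(2 * np.pi))
warped_ll = gp_ll + np.log(fgrad(Y)).sum()   # log p(y) = log N(f(y) | 0, K) + sum_n log f'(y_n)
print(warped_ll)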