From ae03b63afba906f7d824581183b20481f06f77d9 Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Wed, 29 Jan 2014 09:40:22 +0000
Subject: [PATCH 01/43] tidied up sparse_gp_regression

---
 GPy/models/__init__.py             |  2 +-
 GPy/models/sparse_gp_regression.py | 71 +++++++++++++++++++++++-------
 2 files changed, 56 insertions(+), 17 deletions(-)

diff --git a/GPy/models/__init__.py b/GPy/models/__init__.py
index 047658a4..3fcaffa8 100644
--- a/GPy/models/__init__.py
+++ b/GPy/models/__init__.py
@@ -3,7 +3,7 @@
 
 from gp_regression import GPRegression
 from gp_classification import GPClassification
-from sparse_gp_regression import SparseGPRegression
+from sparse_gp_regression import SparseGPRegression, SparseGPRegressionUncertainInput
 from svigp_regression import SVIGPRegression
 from sparse_gp_classification import SparseGPClassification
 from gplvm import GPLVM
diff --git a/GPy/models/sparse_gp_regression.py b/GPy/models/sparse_gp_regression.py
index a45ea5cd..88b0d435 100644
--- a/GPy/models/sparse_gp_regression.py
+++ b/GPy/models/sparse_gp_regression.py
@@ -16,44 +16,83 @@ class SparseGPRegression(SparseGP):
     :param X: input observations
     :param Y: observed values
     :param kernel: a GPy kernel, defaults to rbf+white
-    :param normalize_X:  whether to normalize the input data before computing (predictions will be in original scales)
-    :type normalize_X: False|True
-    :param normalize_Y:  whether to normalize the input data before computing (predictions will be in original scales)
-    :type normalize_Y: False|True
     :param Z: inducing inputs (optional, see note)
     :type Z: np.ndarray (num_inducing x input_dim) | None
+    :param num_inducing: number of inducing points (ignored if Z is passed, see note)
+    :type num_inducing: int
     :rtype: model object
-    :param X_variance: The uncertainty in the measurements of X (Gaussian variance)
-    :type X_variance: np.ndarray (num_data x input_dim) | None
 
+    .. Note:: If no Z array is passed, num_inducing (default 10) points are selected from the data. Other wise num_inducing is ignored
     .. Note:: Multiple independent outputs are allowed using columns of Y
 
     """
 
-    def __init__(self, X, Y, kernel=None, normalize_X=False, normalize_Y=False, Z=None, num_inducing=10, X_variance=None):
+    def __init__(self, X, Y, kernel=None, Z=None, num_inducing=10, X_variance=None):
+        num_data, input_dim = X.shape
+
         # kern defaults to rbf (plus white for stability)
         if kernel is None:
-            kernel = kern.rbf(X.shape[1]) # + kern.white(X.shape[1], 1e-3)
+            kernel = kern.rbf(input_dim)  + kern.white(input_dim, variance=1e-3)
 
         # Z defaults to a subset of the data
         if Z is None:
-            i = np.random.permutation(X.shape[0])[:num_inducing]
+            i = np.random.permutation(num_data)[:min(num_inducing, num_data)]
             Z = X[i].copy()
         else:
-            assert Z.shape[1] == X.shape[1]
+            assert Z.shape[1] == input_dim
 
-        # likelihood defaults to Gaussian
-        likelihood = likelihoods.Gaussian(Y, normalize=normalize_Y)
+        likelihood = likelihoods.Gaussian()
 
-        SparseGP.__init__(self, X, likelihood, kernel, Z=Z, normalize_X=normalize_X, X_variance=X_variance)
+        SparseGP.__init__(self, X, Y, Z, kernel, likelihood)
         self.ensure_default_constraints()
-        pass
 
     def _getstate(self):
         return SparseGP._getstate(self)
 
-
     def _setstate(self, state):
         return SparseGP._setstate(self, state)
 
-    pass
+
+
+class SparseGPRegressionUncertainInput(SparseGP):
+    """
+    Gaussian Process model for regression with Gaussian variance on the inputs (X_variance)
+
+    This is a thin wrapper around the SparseGP class, with a set of sensible defalts
+
+    """
+
+    def __init__(self, X, X_variance, Y, kernel=None, Z=None, num_inducing=10):
+        """
+        :param X: input observations
+        :type X: np.ndarray (num_data x input_dim)
+        :param X_variance: The uncertainty in the measurements of X (Gaussian variance, optional)
+        :type X_variance: np.ndarray (num_data x input_dim)
+        :param Y: observed values
+        :param kernel: a GPy kernel, defaults to rbf+white
+        :param Z: inducing inputs (optional, see note)
+        :type Z: np.ndarray (num_inducing x input_dim) | None
+        :param num_inducing: number of inducing points (ignored if Z is passed, see note)
+        :type num_inducing: int
+        :rtype: model object
+
+        .. Note:: If no Z array is passed, num_inducing (default 10) points are selected from the data. Other wise num_inducing is ignored
+        .. Note:: Multiple independent outputs are allowed using columns of Y
+        """
+        num_data, input_dim = X.shape
+
+        # kern defaults to rbf (plus white for stability)
+        if kernel is None:
+            kernel = kern.rbf(input_dim)  + kern.white(input_dim, variance=1e-3)
+
+        # Z defaults to a subset of the data
+        if Z is None:
+            i = np.random.permutation(num_data)[:min(num_inducing, num_data)]
+            Z = X[i].copy()
+        else:
+            assert Z.shape[1] == input_dim
+
+        likelihood = likelihoods.Gaussian()
+
+        SparseGP.__init__(self, X, Y, Z, kernel, likelihood, X_variance=X_variance)
+        self.ensure_default_constraints()

From 9a7d9fa5b7f3893e18f44bf45463eba7ed68a73b Mon Sep 17 00:00:00 2001
From: Zhenwen Dai <dai@compbio-OptiPlex-780.(none)>
Date: Wed, 29 Jan 2014 10:28:39 +0000
Subject: [PATCH 02/43] adapted laplace inference into the param framework

---
 .../latent_function_inference/laplace.py      | 21 +++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/GPy/inference/latent_function_inference/laplace.py b/GPy/inference/latent_function_inference/laplace.py
index 2b2128db..fa5bb3b8 100644
--- a/GPy/inference/latent_function_inference/laplace.py
+++ b/GPy/inference/latent_function_inference/laplace.py
@@ -53,6 +53,27 @@ class LaplaceInference(object):
         self.restart()
         likelihood.__init__(self)
 
+    def inference(self, kern, X, likelihood, Y, Y_metadata=None):
+        """
+        Returns a Posterior class containing essential quantities of the posterior
+        """
+
+        # Compute K
+        self.K = kern.K(X)
+        self.data = Y
+        self.N, self.D = Y.shape
+
+        #Find mode
+        self.f_hat = self.rasm_mode(self.K)
+
+        #Compute hessian and other variables at mode
+        self._compute_likelihood_variables()
+
+        #Compute fake variables replicating laplace approximation to posterior
+        self._compute_GP_variables()
+
+        return Posterior(mean=self.f_hat, cov=self.covariance_matrix, K=self.K)
+
     def restart(self):
         """
         Reset likelihood variables to their defaults

From a31ff3acc3daf5a7e79b6586dd51656b60428041 Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Wed, 29 Jan 2014 17:02:44 +0000
Subject: [PATCH 03/43] some hacking on sparse_gp inference

---
 GPy/core/gp.py                                |  6 +-
 GPy/core/sparse_gp.py                         | 14 ++--
 .../latent_function_inference/varDTC.py       | 70 +++++++++++--------
 3 files changed, 50 insertions(+), 40 deletions(-)

diff --git a/GPy/core/gp.py b/GPy/core/gp.py
index fbabf5f6..060b617a 100644
--- a/GPy/core/gp.py
+++ b/GPy/core/gp.py
@@ -48,9 +48,9 @@ class GP(Model):
         if inference_method is None:
             if isinstance(likelihood, likelihoods.Gaussian):
                 inference_method = exact_gaussian_inference.ExactGaussianInference()
-        else:
-            inference_method = expectation_propagation
-            print "defaulting to ", inference_method, "for latent function inference"
+            else:
+                inference_method = expectation_propagation
+                print "defaulting to ", inference_method, "for latent function inference"
         self.inference_method = inference_method
 
         self.add_parameter(self.kern)
diff --git a/GPy/core/sparse_gp.py b/GPy/core/sparse_gp.py
index e33703d8..3152d4b5 100644
--- a/GPy/core/sparse_gp.py
+++ b/GPy/core/sparse_gp.py
@@ -6,6 +6,7 @@ from ..util.linalg import mdot, tdot, symmetrify, backsub_both_sides, chol_inv,
 from gp import GP
 from parameterization.param import Param
 from ..inference.latent_function_inference import varDTC
+from .. import likelihoods
 
 class SparseGP(GP):
     """
@@ -35,24 +36,21 @@ class SparseGP(GP):
         #pick a sensible inference method
         if inference_method is None:
             if isinstance(likelihood, likelihoods.Gaussian):
-                inference_method = varDTC.Gaussian_inference()
+                inference_method = varDTC.VarDTC()
         else:
             #inference_method = ??
             raise NotImplementedError, "what to do what to do?"
             print "defaulting to ", inference_method, "for latent function inference"
 
-        GP.__init__(self, X, Y, likelihood, inference_method, kernel, name)
 
         self.Z = Z
         self.num_inducing = Z.shape[0]
 
-        if X_variance is None:
-            self.has_uncertain_inputs = False
-            self.X_variance = None
-        else:
+        if not (X_variance is None):
             assert X_variance.shape == X.shape
-            self.has_uncertain_inputs = True
-            self.X_variance = X_variance
+        self.X_variance = X_variance
+
+        GP.__init__(self, X, Y, kernel, likelihood, inference_method=inference_method, name=name)
 
         self.Z = Param('inducing inputs', self.Z)
         self.add_parameter(self.Z, gradient=self.dL_dZ, index=0)
diff --git a/GPy/inference/latent_function_inference/varDTC.py b/GPy/inference/latent_function_inference/varDTC.py
index 0cd709b2..7ceeff11 100644
--- a/GPy/inference/latent_function_inference/varDTC.py
+++ b/GPy/inference/latent_function_inference/varDTC.py
@@ -2,7 +2,7 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
 from posterior import Posterior
-from ...util.linalg import pdinv, dpotrs, tdot
+from ...util.linalg import jitchol, backsub_both_sides, tdot
 import numpy as np
 log_2_pi = np.log(2*np.pi)
 
@@ -17,7 +17,7 @@ class VarDTC(object):
 
     """
     def __init__(self):
-        self._YYTfactor_cache = caching.cache()
+        #self._YYTfactor_cache = caching.cache()
         self.const_jitter = 1e-6
 
     def get_YYTfactor(self, Y):
@@ -42,7 +42,13 @@ class VarDTC(object):
         num_data, output_dim = Y.shape
 
         #see whether we're using variational uncertain inputs
-        uncertain_inputs = (X_variance is None)
+        uncertain_inputs = not (X_variance is None)
+
+        #see whether we've got a different noise variance for each datum
+        beta = 1./np.squeeze(likelihood.variance)
+        het_noise = False
+        if beta.size <1:
+            het_noise = True
 
         # kernel computations, using BGPLVM notation
         Kmm = kern.K(Z)
@@ -59,7 +65,7 @@ class VarDTC(object):
 
         # The rather complex computations of A
         if uncertain_inputs:
-            if likelihood.is_heteroscedastic:
+            if het_noise:
                 psi2_beta = (psi2 * (likelihood.precision.flatten().reshape(num_data, 1, 1))).sum(0)
             else:
                 psi2_beta = psi2.sum(0) * likelihood.precision
@@ -70,10 +76,10 @@ class VarDTC(object):
             tmp = evecs * np.sqrt(clipped_evals)
             tmp = tmp.T
         else:
-            if likelihood.is_heteroscedastic:
-                tmp = psi1 * (np.sqrt(likelihood.precision.flatten().reshape(num_data, 1)))
+            if het_noise:
+                tmp = psi1 * (np.sqrt(beta.reshape(num_data, 1)))
             else:
-                tmp = psi1 * (np.sqrt(likelihood.precision))
+                tmp = psi1 * (np.sqrt(beta))
         tmp, _ = dtrtrs(Lm, np.asfortranarray(tmp.T), lower=1)
         A = tdot(tmp)
 
@@ -82,7 +88,7 @@ class VarDTC(object):
         LB = jitchol(B)
 
         # VVT_factor is a matrix such that tdot(VVT_factor) = VVT...this is for efficiency!
-        VVT_factor = self.get_VVTfactor(Y, likelihood.precision)
+        VVT_factor = self.get_VVTfactor(Y, beta)
         psi1Vf = np.dot(psi1.T, VVT_factor)
 
         # back substutue C into psi1Vf
@@ -92,12 +98,12 @@ class VarDTC(object):
         Cpsi1Vf, info3 = dtrtrs(Lm, tmp, lower=1, trans=1)
 
         #compute log marginal likelihood
-        if likelihood.is_heteroscedastic:
-            A = -0.5 * num_data * output_dim * np.log(2.*np.pi) + 0.5 * np.sum(np.log(likelihood.precision)) - 0.5 * np.sum(likelihood.V * likelihood.Y)
-            B = -0.5 * output_dim * (np.sum(likelihood.precision.flatten() * psi0) - np.trace(_A))
+        if het_noise:
+            A = -0.5 * num_data * output_dim * np.log(2.*np.pi) + 0.5 * np.sum(np.log(beta)) - 0.5 * np.sum(likelihood.V * likelihood.Y)
+            B = -0.5 * output_dim * (np.sum(beta * psi0) - np.trace(_A))
         else:
-            A = -0.5 * num_data * output_dim * (np.log(2.*np.pi) - np.log(likelihood.precision)) - 0.5 * likelihood.precision * likelihood.trYYT
-            B = -0.5 * output_dim * (np.sum(likelihood.precision * psi0) - np.trace(_A))
+            A = -0.5 * num_data * output_dim * (np.log(2.*np.pi) - np.log(beta)) - 0.5 * beta * likelihood.trYYT
+            B = -0.5 * output_dim * (np.sum(beta * psi0) - np.trace(_A))
         C = -output_dim * (np.sum(np.log(np.diag(LB)))) # + 0.5 * num_inducing * np.log(sf2))
         D = 0.5 * data_fit
         log_marginal = A + B + C + D
@@ -112,21 +118,20 @@ class VarDTC(object):
         tmp += output_dim * np.eye(num_inducing)
         dL_dKmm = backsub_both_sides(Lm, tmp)
 
-        # Compute dL_dpsi # FIXME: this is untested for the heterscedastic + uncertain inputs case
-        dL_dpsi0 = -0.5 * output_dim * (likelihood.precision * np.ones([num_data, 1])).flatten()
+        # Compute dL_dpsi 
+        dL_dpsi0 = -0.5 * output_dim * (beta * np.ones([num_data, 1])).flatten()
         dL_dpsi1 = np.dot(likelihood.VVT_factor, Cpsi1Vf.T)
         dL_dpsi2_beta = 0.5 * backsub_both_sides(Lm, output_dim * np.eye(num_inducing) - DBi_plus_BiPBi)
 
-        if likelihood.is_heteroscedastic:
-
-            if has_uncertain_inputs:
-                dL_dpsi2 = likelihood.precision.flatten()[:, None, None] * dL_dpsi2_beta[None, :, :]
+        if het_noise:
+            if uncertain_inputs:
+                dL_dpsi2 = beta[:, None, None] * dL_dpsi2_beta[None, :, :]
             else:
                 dL_dpsi1 += 2.*np.dot(dL_dpsi2_beta, (psi1 * likelihood.precision.reshape(num_data, 1)).T).T
                 dL_dpsi2 = None
         else:
             dL_dpsi2 = likelihood.precision * dL_dpsi2_beta
-            if has_uncertain_inputs:
+            if uncertain_inputs:
                 # repeat for each of the N psi_2 matrices
                 dL_dpsi2 = np.repeat(dL_dpsi2[None, :, :], num_data, axis=0)
             else:
@@ -139,18 +144,14 @@ class VarDTC(object):
         if likelihood.size == 0:
             # save computation here.
             partial_for_likelihood = None
-        elif likelihood.is_heteroscedastic:
-
-            if has_uncertain_inputs:
+        elif het_noise:
+            if uncertain_inputs:
                 raise NotImplementedError, "heteroscedatic derivates with uncertain inputs not implemented"
-
             else:
-
                 LBi = chol_inv(LB)
                 Lmi_psi1, nil = dtrtrs(Lm, np.asfortranarray(psi1.T), lower=1, trans=0)
                 _LBi_Lmi_psi1, _ = dtrtrs(LB, np.asfortranarray(Lmi_psi1), lower=1, trans=0)
 
-
                 partial_for_likelihood = -0.5 * likelihood.precision + 0.5 * likelihood.V**2
                 partial_for_likelihood += 0.5 * output_dim * (psi0 - np.sum(Lmi_psi1**2,0))[:,None] * likelihood.precision**2
 
@@ -165,15 +166,26 @@ class VarDTC(object):
             partial_for_likelihood += 0.5 * output_dim * (psi0.sum() * likelihood.precision ** 2 - np.trace(_A) * likelihood.precision)
             partial_for_likelihood += likelihood.precision * (0.5 * np.sum(_A * DBi_plus_BiPBi) - data_fit)
 
+        #put the gradients in the right places
+        likelihood.update_gradients(np.diag(partial_for_likelihood))
+
+        if uncertain_inputs:
+            kern.update_gradients_variational(??)
+            grad_dict = {'dL_dKmm': dL_dKmm, 'dL_dpsi0':dL_dpsi0, 'dL_dpsi1':dL_dpsi1, 'dL_dpsi2':dL_dpsi2}
+        else:
+            kern.update_gradients_sparse(??)
+            grad_dict = {'dL_dKmm': dL_dKmm, 'dL_dpsi0':dL_dpsi0, 'dL_dpsi1':dL_dpsi1, 'dL_dpsi2':dL_dpsi2}
 
         #get sufficient things for posterior prediction
         if VVT_factor.shape[1] == Y.shape[1]:
-            Cpsi1V = Cpsi1Vf
+            woodbury_vector = Cpsi1Vf # == Cpsi1V
+            woodbury_chol = ??
         else:
             raise NotImplementedError #TODO
 
         #construct a posterior object
-        post = Posterior(woodbury_chol=None, woodbury_vector=Cpsi1V, K=None, mean=None, cov=None, K_chol=None)
-        return
+        post = Posterior(woodbury_chol=woodbury_chol, woodbury_vector=woodbury_vector, K=Kmm, mean=None, cov=None, K_chol=Lm)
+
+        return Posterior, log_marginal, grad_dict
 
 

From e0dbfbe1487b059024c8e1169e660a33ac29dc57 Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Thu, 30 Jan 2014 12:03:02 +0000
Subject: [PATCH 04/43] sparse GP now checkgrads, optimises sensibly.
 Prediction still not working

---
 GPy/core/sparse_gp.py                         | 37 +++++++---
 .../latent_function_inference/posterior.py    | 11 ++-
 .../latent_function_inference/varDTC.py       | 71 ++++++++++---------
 GPy/kern/kern.py                              |  5 +-
 GPy/kern/parts/rbf.py                         |  2 +-
 GPy/kern/parts/white.py                       |  2 +-
 6 files changed, 81 insertions(+), 47 deletions(-)

diff --git a/GPy/core/sparse_gp.py b/GPy/core/sparse_gp.py
index 3152d4b5..ab1f3bf0 100644
--- a/GPy/core/sparse_gp.py
+++ b/GPy/core/sparse_gp.py
@@ -53,26 +53,45 @@ class SparseGP(GP):
         GP.__init__(self, X, Y, kernel, likelihood, inference_method=inference_method, name=name)
 
         self.Z = Param('inducing inputs', self.Z)
-        self.add_parameter(self.Z, gradient=self.dL_dZ, index=0)
-        self.add_parameter(self.kern, gradient=self.dL_dtheta)
-        self.add_parameter(self.likelihood, gradient=lambda:self.likelihood._gradients(partial=self.partial_for_likelihood))
+        self.add_parameter(self.Z, index=0)
+        self.add_parameter(self.kern)
+        self.add_parameter(self.likelihood)
 
     def parameters_changed(self):
         self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.X_variance, self.Z, self.likelihood, self.Y)
 
-                #The derivative of the bound wrt the inducing inputs Z
-        self.Z.gradient = self.kern.dK_dX(self.dL_dKmm, self.Z)
-        if self.has_uncertain_inputs:
+        #The derivative of the bound wrt the inducing inputs Z
+        self.Z.gradient = self.kern.gradients_X(self.grad_dict['dL_dKmm'], self.Z)
+        if self.X_variance is None:
+            self.Z.gradient += self.kern.gradients_X(self.grad_dict['dL_dKnm'].T, self.Z, self.X)
+        else:
             self.Z.gradient += self.kern.dpsi1_dZ(self.dL_dpsi1, self.Z, self.X, self.X_variance)
             self.Z.gradient += self.kern.dpsi2_dZ(self.dL_dpsi2, self.Z, self.X, self.X_variance)
-        else:
-            self.Z.gradient += self.kern.dK_dX(self.dL_dpsi1.T, self.Z, self.X)
 
     def _raw_predict(self, Xnew, X_variance_new=None, which_parts='all', full_cov=False):
         """
         Make a prediction for the latent function values
         """
-        #TODO!!!
+        if X_variance_new is None:
+            Kx = self.kern.K(self.Z, Xnew, which_parts=which_parts)
+            mu = np.dot(Kx.T, self.posterior.woodbury_vector)
+            if full_cov:
+                Kxx = self.kern.K(Xnew, which_parts=which_parts)
+                var = Kxx - mdot(Kx.T, self.posterior.woodbury_inv, Kx) # NOTE this won't work for plotting
+            else:
+                Kxx = self.kern.Kdiag(Xnew, which_parts=which_parts)
+                var = Kxx - np.sum(Kx * np.dot(self.posterior.woodbury_inv, Kx), 0)
+        else:
+            # assert which_parts=='all', "swithching out parts of variational kernels is not implemented"
+            Kx = self.kern.psi1(self.Z, Xnew, X_variance_new) # , which_parts=which_parts) TODO: which_parts
+            mu = np.dot(Kx, self.Cpsi1V)
+            if full_cov:
+                raise NotImplementedError, "TODO"
+            else:
+                Kxx = self.kern.psi0(self.Z, Xnew, X_variance_new)
+                psi2 = self.kern.psi2(self.Z, Xnew, X_variance_new)
+                var = Kxx - np.sum(np.sum(psi2 * Kmmi_LmiBLmi[None, :, :], 1), 1)
+        return mu, var[:,None]
 
 
     def _getstate(self):
diff --git a/GPy/inference/latent_function_inference/posterior.py b/GPy/inference/latent_function_inference/posterior.py
index 82de1419..3e5022aa 100644
--- a/GPy/inference/latent_function_inference/posterior.py
+++ b/GPy/inference/latent_function_inference/posterior.py
@@ -2,7 +2,7 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
 import numpy as np
-from ...util.linalg import pdinv, dpotrs, tdot, dtrtrs
+from ...util.linalg import pdinv, dpotrs, tdot, dtrtrs, dpotri, symmetrify
 
 class Posterior(object):
     """
@@ -62,6 +62,7 @@ class Posterior(object):
 
         #compute this lazily
         self._precision = None
+        self._woodbury_inv = None
 
     @property
     def mean(self):
@@ -91,6 +92,14 @@ class Posterior(object):
             _, _, self._woodbury_chol, _ = pdinv(Wi)
         return self._woodbury_chol
 
+    @property
+    def woodbury_inv(self):
+        if self._woodbury_inv is None:
+            self._woodbury_inv, _ = dpotri(self.woodbury_chol)
+            symmetrify(self._woodbury_inv)
+        return self._woodbury_inv
+
+
     @property
     def woodbury_vector(self):
         if self._woodbury_vector is None:
diff --git a/GPy/inference/latent_function_inference/varDTC.py b/GPy/inference/latent_function_inference/varDTC.py
index 7ceeff11..b5ba4c2d 100644
--- a/GPy/inference/latent_function_inference/varDTC.py
+++ b/GPy/inference/latent_function_inference/varDTC.py
@@ -2,7 +2,7 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
 from posterior import Posterior
-from ...util.linalg import jitchol, backsub_both_sides, tdot
+from ...util.linalg import jitchol, backsub_both_sides, tdot, dtrtrs
 import numpy as np
 log_2_pi = np.log(2*np.pi)
 
@@ -30,7 +30,7 @@ class VarDTC(object):
         if (N>D):
             return Y
         else:
-            #if Y in self.cache, return self.Cache[Y], else stor Y in cache and return L.
+            #if Y in self.cache, return self.Cache[Y], else store Y in cache and return L.
             raise NotImplementedError, 'TODO' #TODO
 
     def get_VVTfactor(self, Y, prec):
@@ -66,9 +66,9 @@ class VarDTC(object):
         # The rather complex computations of A
         if uncertain_inputs:
             if het_noise:
-                psi2_beta = (psi2 * (likelihood.precision.flatten().reshape(num_data, 1, 1))).sum(0)
+                psi2_beta = (psi2 * (beta.flatten().reshape(num_data, 1, 1))).sum(0)
             else:
-                psi2_beta = psi2.sum(0) * likelihood.precision
+                psi2_beta = psi2.sum(0) * beta
             evals, evecs = linalg.eigh(psi2_beta)
             clipped_evals = np.clip(evals, 0., 1e6) # TODO: make clipping configurable
             if not np.array_equal(evals, clipped_evals):
@@ -89,6 +89,7 @@ class VarDTC(object):
 
         # VVT_factor is a matrix such that tdot(VVT_factor) = VVT...this is for efficiency!
         VVT_factor = self.get_VVTfactor(Y, beta)
+        trYYT = np.sum(np.square(Y))
         psi1Vf = np.dot(psi1.T, VVT_factor)
 
         # back substutue C into psi1Vf
@@ -97,17 +98,6 @@ class VarDTC(object):
         tmp, info2 = dtrtrs(LB, _LBi_Lmi_psi1Vf, lower=1, trans=1)
         Cpsi1Vf, info3 = dtrtrs(Lm, tmp, lower=1, trans=1)
 
-        #compute log marginal likelihood
-        if het_noise:
-            A = -0.5 * num_data * output_dim * np.log(2.*np.pi) + 0.5 * np.sum(np.log(beta)) - 0.5 * np.sum(likelihood.V * likelihood.Y)
-            B = -0.5 * output_dim * (np.sum(beta * psi0) - np.trace(_A))
-        else:
-            A = -0.5 * num_data * output_dim * (np.log(2.*np.pi) - np.log(beta)) - 0.5 * beta * likelihood.trYYT
-            B = -0.5 * output_dim * (np.sum(beta * psi0) - np.trace(_A))
-        C = -output_dim * (np.sum(np.log(np.diag(LB)))) # + 0.5 * num_inducing * np.log(sf2))
-        D = 0.5 * data_fit
-        log_marginal = A + B + C + D
-
 
         # Compute dL_dKmm
         tmp = tdot(_LBi_Lmi_psi1Vf)
@@ -120,17 +110,17 @@ class VarDTC(object):
 
         # Compute dL_dpsi 
         dL_dpsi0 = -0.5 * output_dim * (beta * np.ones([num_data, 1])).flatten()
-        dL_dpsi1 = np.dot(likelihood.VVT_factor, Cpsi1Vf.T)
+        dL_dpsi1 = np.dot(VVT_factor, Cpsi1Vf.T)
         dL_dpsi2_beta = 0.5 * backsub_both_sides(Lm, output_dim * np.eye(num_inducing) - DBi_plus_BiPBi)
 
         if het_noise:
             if uncertain_inputs:
                 dL_dpsi2 = beta[:, None, None] * dL_dpsi2_beta[None, :, :]
             else:
-                dL_dpsi1 += 2.*np.dot(dL_dpsi2_beta, (psi1 * likelihood.precision.reshape(num_data, 1)).T).T
+                dL_dpsi1 += 2.*np.dot(dL_dpsi2_beta, (psi1 * beta.reshape(num_data, 1)).T).T
                 dL_dpsi2 = None
         else:
-            dL_dpsi2 = likelihood.precision * dL_dpsi2_beta
+            dL_dpsi2 = beta * dL_dpsi2_beta
             if uncertain_inputs:
                 # repeat for each of the N psi_2 matrices
                 dL_dpsi2 = np.repeat(dL_dpsi2[None, :, :], num_data, axis=0)
@@ -152,40 +142,55 @@ class VarDTC(object):
                 Lmi_psi1, nil = dtrtrs(Lm, np.asfortranarray(psi1.T), lower=1, trans=0)
                 _LBi_Lmi_psi1, _ = dtrtrs(LB, np.asfortranarray(Lmi_psi1), lower=1, trans=0)
 
-                partial_for_likelihood = -0.5 * likelihood.precision + 0.5 * likelihood.V**2
-                partial_for_likelihood += 0.5 * output_dim * (psi0 - np.sum(Lmi_psi1**2,0))[:,None] * likelihood.precision**2
+                partial_for_likelihood = -0.5 * beta + 0.5 * likelihood.V**2
+                partial_for_likelihood += 0.5 * output_dim * (psi0 - np.sum(Lmi_psi1**2,0))[:,None] * beta**2
 
-                partial_for_likelihood += 0.5*np.sum(mdot(LBi.T,LBi,Lmi_psi1)*Lmi_psi1,0)[:,None]*likelihood.precision**2
+                partial_for_likelihood += 0.5*np.sum(mdot(LBi.T,LBi,Lmi_psi1)*Lmi_psi1,0)[:,None]*beta**2
 
-                partial_for_likelihood += -np.dot(_LBi_Lmi_psi1Vf.T,_LBi_Lmi_psi1).T * likelihood.Y * likelihood.precision**2
-                partial_for_likelihood += 0.5*np.dot(_LBi_Lmi_psi1Vf.T,_LBi_Lmi_psi1).T**2 * likelihood.precision**2
+                partial_for_likelihood += -np.dot(_LBi_Lmi_psi1Vf.T,_LBi_Lmi_psi1).T * likelihood.Y * beta**2
+                partial_for_likelihood += 0.5*np.dot(_LBi_Lmi_psi1Vf.T,_LBi_Lmi_psi1).T**2 * beta**2
 
         else:
             # likelihood is not heteroscedatic
-            partial_for_likelihood = -0.5 * num_data * output_dim * likelihood.precision + 0.5 * likelihood.trYYT * likelihood.precision ** 2
-            partial_for_likelihood += 0.5 * output_dim * (psi0.sum() * likelihood.precision ** 2 - np.trace(_A) * likelihood.precision)
-            partial_for_likelihood += likelihood.precision * (0.5 * np.sum(_A * DBi_plus_BiPBi) - data_fit)
+            partial_for_likelihood = -0.5 * num_data * output_dim * beta + 0.5 * trYYT * beta ** 2
+            partial_for_likelihood += 0.5 * output_dim * (psi0.sum() * beta ** 2 - np.trace(A) * beta)
+            partial_for_likelihood += beta * (0.5 * np.sum(A * DBi_plus_BiPBi) - data_fit)
+
+        #compute log marginal likelihood
+        if het_noise:
+            lik_1 = -0.5 * num_data * output_dim * np.log(2.*np.pi) + 0.5 * np.sum(np.log(beta)) - 0.5 * np.sum(likelihood.V * likelihood.Y)
+            lik_2 = -0.5 * output_dim * (np.sum(beta * psi0) - np.trace(A))
+        else:
+            lik_1 = -0.5 * num_data * output_dim * (np.log(2.*np.pi) - np.log(beta)) - 0.5 * beta * trYYT
+            lik_2 = -0.5 * output_dim * (np.sum(beta * psi0) - np.trace(A))
+        lik_3 = -output_dim * (np.sum(np.log(np.diag(LB)))) # + 0.5 * num_inducing * np.log(sf2))
+        lik_4 = 0.5 * data_fit
+        log_marginal = lik_1 + lik_2 + lik_3 + lik_4
 
         #put the gradients in the right places
-        likelihood.update_gradients(np.diag(partial_for_likelihood))
+        likelihood.update_gradients(partial_for_likelihood)
 
         if uncertain_inputs:
-            kern.update_gradients_variational(??)
             grad_dict = {'dL_dKmm': dL_dKmm, 'dL_dpsi0':dL_dpsi0, 'dL_dpsi1':dL_dpsi1, 'dL_dpsi2':dL_dpsi2}
+            kern.update_gradients_variational(mu=X, S=X_variance, Z=Z, **grad_dict)
         else:
-            kern.update_gradients_sparse(??)
-            grad_dict = {'dL_dKmm': dL_dKmm, 'dL_dpsi0':dL_dpsi0, 'dL_dpsi1':dL_dpsi1, 'dL_dpsi2':dL_dpsi2}
+            grad_dict = {'dL_dKmm': dL_dKmm, 'dL_dKdiag':dL_dpsi0, 'dL_dKnm':dL_dpsi1}
+            kern.update_gradients_sparse(X=X, Z=Z, **grad_dict)
 
         #get sufficient things for posterior prediction
         if VVT_factor.shape[1] == Y.shape[1]:
             woodbury_vector = Cpsi1Vf # == Cpsi1V
-            woodbury_chol = ??
         else:
-            raise NotImplementedError #TODO
+            psi1V = np.dot(Y.T*beta, psi1).T
+            tmp, _ = dtrtrs(Lm, np.asfortranarray(psi1V), lower=1, trans=0)
+            tmp, _ = dpotrs(LB, tmp, lower=1)
+            woodbury_vector, _ = dtrtrs(Lm, tmp, lower=1, trans=1)
+        #TODO: totally wrong, fix.
+        woodbury_chol = np.eye(num_inducing)
 
         #construct a posterior object
         post = Posterior(woodbury_chol=woodbury_chol, woodbury_vector=woodbury_vector, K=Kmm, mean=None, cov=None, K_chol=Lm)
 
-        return Posterior, log_marginal, grad_dict
+        return post, log_marginal, grad_dict
 
 
diff --git a/GPy/kern/kern.py b/GPy/kern/kern.py
index de87ff14..68b26e3c 100644
--- a/GPy/kern/kern.py
+++ b/GPy/kern/kern.py
@@ -284,9 +284,10 @@ class kern(Parameterized):
         [p.update_gradients_full(dL_dK, X) for p in self._parameters_]
 
     def update_gradients_sparse(self, dL_dKmm, dL_dKnm, dL_dKdiag, X, Z):
-        raise NotImplementedError
+        [p.update_gradients_sparse(dL_dKmm, dL_dKnm, dL_dKdiag, X, Z) for p in self._parameters_]
+
     def update_gradients_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z):
-        raise NotImplementedError
+        [p.update_gradients_variational(dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z) for p in self._parameters_]
 
     def dK_dtheta(self, dL_dK, X, X2=None):
         """
diff --git a/GPy/kern/parts/rbf.py b/GPy/kern/parts/rbf.py
index 885fed96..89f6894c 100644
--- a/GPy/kern/parts/rbf.py
+++ b/GPy/kern/parts/rbf.py
@@ -117,7 +117,7 @@ class RBF(Kernpart):
             self.lengthscales.gradient = self._dL_dlengthscales_via_K(dL_dKnm, X, Z)
 
         else:
-            self.lengthscale.gradient = (self.variance / self.lengthscale) * np.sum(self._K_dvar * self._K_dist2 * dL_dKmm)
+            self.lengthscale.gradient = (self.variance / self.lengthscale) * np.sum(self._K_dvar * self._K_dist2 * dL_dKnm)
 
         #from Kmm
         self._K_computations(Z, None)
diff --git a/GPy/kern/parts/white.py b/GPy/kern/parts/white.py
index a59c9f98..c9677f28 100644
--- a/GPy/kern/parts/white.py
+++ b/GPy/kern/parts/white.py
@@ -32,7 +32,7 @@ class White(Kernpart):
         self.variance.gradient = np.trace(dL_dK)
 
     def update_gradients_sparse(self, dL_dKmm, dL_dKnm, dL_dKdiag, X, Z):
-        raise NotImplementedError
+        self.variance.gradient = np.trace(dL_dKmm) + np.sum(dL_dKdiag)
 
     def update_gradients_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z):
         raise NotImplementedError

From 9f40ab0f832c26fda5f1851b3f04e34102669cdf Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Fri, 31 Jan 2014 10:15:30 +0000
Subject: [PATCH 05/43] sparse GP now working nicely

---
 .../latent_function_inference/posterior.py          | 12 ++++++++----
 GPy/inference/latent_function_inference/varDTC.py   | 13 ++++++++-----
 GPy/plotting/matplot_dep/models_plots.py            | 11 ++++++++---
 3 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/GPy/inference/latent_function_inference/posterior.py b/GPy/inference/latent_function_inference/posterior.py
index 3e5022aa..b3a352e5 100644
--- a/GPy/inference/latent_function_inference/posterior.py
+++ b/GPy/inference/latent_function_inference/posterior.py
@@ -14,7 +14,7 @@ class Posterior(object):
     schemes and the model classes.
 
     """
-    def __init__(self, woodbury_chol=None, woodbury_vector=None, K=None, mean=None, cov=None, K_chol=None):
+    def __init__(self, woodbury_chol=None, woodbury_vector=None, K=None, mean=None, cov=None, K_chol=None, woodbury_inv=None):
         """
         woodbury_chol : a lower triangular matrix L that satisfies posterior_covariance = K - K L^{-T} L^{-1} K
         woodbury_vector : a matrix (or vector, as Nx1 matrix) M which satisfies posterior_mean = K M
@@ -45,7 +45,9 @@ class Posterior(object):
         #obligatory
         self._K = K
 
-        if ((woodbury_chol is not None) and (woodbury_vector is not None) and (K is not None)) or ((mean is not None) and (cov is not None) and (K is not None)):
+        if ((woodbury_chol is not None) and (woodbury_vector is not None))\
+                or ((woodbury_inv is not None) and (woodbury_vector is not None))\
+                or ((mean is not None) and (cov is not None)):
             pass # we have sufficient to compute the posterior
         else:
             raise ValueError, "insufficient information to compute the posterior"
@@ -56,13 +58,16 @@ class Posterior(object):
         self._woodbury_chol = woodbury_chol
         self._woodbury_vector = woodbury_vector
 
+        #option 2.
+        self._woodbury_inv = woodbury_inv
+        #and woodbury vector
+
         #option 2:
         self._mean = mean
         self._covariance = cov
 
         #compute this lazily
         self._precision = None
-        self._woodbury_inv = None
 
     @property
     def mean(self):
@@ -99,7 +104,6 @@ class Posterior(object):
             symmetrify(self._woodbury_inv)
         return self._woodbury_inv
 
-
     @property
     def woodbury_vector(self):
         if self._woodbury_vector is None:
diff --git a/GPy/inference/latent_function_inference/varDTC.py b/GPy/inference/latent_function_inference/varDTC.py
index b5ba4c2d..e6a96aa2 100644
--- a/GPy/inference/latent_function_inference/varDTC.py
+++ b/GPy/inference/latent_function_inference/varDTC.py
@@ -2,7 +2,7 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
 from posterior import Posterior
-from ...util.linalg import jitchol, backsub_both_sides, tdot, dtrtrs
+from ...util.linalg import jitchol, backsub_both_sides, tdot, dtrtrs, dpotri, symmetrify
 import numpy as np
 log_2_pi = np.log(2*np.pi)
 
@@ -163,7 +163,7 @@ class VarDTC(object):
         else:
             lik_1 = -0.5 * num_data * output_dim * (np.log(2.*np.pi) - np.log(beta)) - 0.5 * beta * trYYT
             lik_2 = -0.5 * output_dim * (np.sum(beta * psi0) - np.trace(A))
-        lik_3 = -output_dim * (np.sum(np.log(np.diag(LB)))) # + 0.5 * num_inducing * np.log(sf2))
+        lik_3 = -output_dim * (np.sum(np.log(np.diag(LB))))
         lik_4 = 0.5 * data_fit
         log_marginal = lik_1 + lik_2 + lik_3 + lik_4
 
@@ -178,6 +178,7 @@ class VarDTC(object):
             kern.update_gradients_sparse(X=X, Z=Z, **grad_dict)
 
         #get sufficient things for posterior prediction
+        #TODO: do we really want to do this in  the loop?
         if VVT_factor.shape[1] == Y.shape[1]:
             woodbury_vector = Cpsi1Vf # == Cpsi1V
         else:
@@ -185,11 +186,13 @@ class VarDTC(object):
             tmp, _ = dtrtrs(Lm, np.asfortranarray(psi1V), lower=1, trans=0)
             tmp, _ = dpotrs(LB, tmp, lower=1)
             woodbury_vector, _ = dtrtrs(Lm, tmp, lower=1, trans=1)
-        #TODO: totally wrong, fix.
-        woodbury_chol = np.eye(num_inducing)
+        Bi, _ = dpotri(LB, lower=0)
+        symmetrify(Bi)
+        woodbury_inv = backsub_both_sides(Lm, np.eye(num_inducing) - Bi)
+
 
         #construct a posterior object
-        post = Posterior(woodbury_chol=woodbury_chol, woodbury_vector=woodbury_vector, K=Kmm, mean=None, cov=None, K_chol=Lm)
+        post = Posterior(woodbury_inv=woodbury_inv, woodbury_vector=woodbury_vector, K=Kmm, mean=None, cov=None, K_chol=Lm)
 
         return post, log_marginal, grad_dict
 
diff --git a/GPy/plotting/matplot_dep/models_plots.py b/GPy/plotting/matplot_dep/models_plots.py
index a4e06441..b3178d0c 100644
--- a/GPy/plotting/matplot_dep/models_plots.py
+++ b/GPy/plotting/matplot_dep/models_plots.py
@@ -5,6 +5,7 @@ import pylab as pb
 import numpy as np
 import Tango
 from base_plots import gpplot, x_frame1D, x_frame2D
+from ...util.misc import param_to_array
 
 
 def plot_fit(model, plot_limits=None, which_data_rows='all',
@@ -95,8 +96,11 @@ def plot_fit(model, plot_limits=None, which_data_rows='all',
 
         #add inducing inputs (if a sparse model is used)
         if hasattr(model,"Z"):
-            Zu = model.Z[:,free_dims] * model._Xscale[:,free_dims] + model._Xoffset[:,free_dims]
-            ax.plot(Zu, np.zeros_like(Zu) + ax.get_ylim()[0], 'r|', mew=1.5, markersize=12)
+            #Zu = model.Z[:,free_dims] * model._Xscale[:,free_dims] + model._Xoffset[:,free_dims]
+            Zu = param_to_array(model.Z[:,free_dims])
+            z_height = ax.get_ylim()[0]
+            ax.plot(Zu, np.zeros_like(Zu) + z_height, 'r|', mew=1.5, markersize=12)
+
 
         #add error bars for uncertain (if input uncertainty is being modelled)
         if hasattr(model,"has_uncertain_inputs"):
@@ -144,7 +148,8 @@ def plot_fit(model, plot_limits=None, which_data_rows='all',
 
         #add inducing inputs (if a sparse model is used)
         if hasattr(model,"Z"):
-            Zu = model.Z[:,free_dims] * model._Xscale[:,free_dims] + model._Xoffset[:,free_dims]
+            #Zu = model.Z[:,free_dims] * model._Xscale[:,free_dims] + model._Xoffset[:,free_dims]
+            Zu = model.Z[:,free_dims]
             ax.plot(Zu[:,free_dims[0]], Zu[:,free_dims[1]], 'wo')
 
     else:

From 399adb1b008180112fa97ee20877a109bfad9f3c Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Fri, 31 Jan 2014 16:59:06 +0000
Subject: [PATCH 06/43] some documenting, and fiddling with the laplace approx

---
 .../exact_gaussian_inference.py               |   2 +-
 .../latent_function_inference/laplace.py      | 235 ++++++------------
 .../latent_function_inference/posterior.py    |   5 +-
 3 files changed, 86 insertions(+), 156 deletions(-)

diff --git a/GPy/inference/latent_function_inference/exact_gaussian_inference.py b/GPy/inference/latent_function_inference/exact_gaussian_inference.py
index 907e8485..2c3bfa6a 100644
--- a/GPy/inference/latent_function_inference/exact_gaussian_inference.py
+++ b/GPy/inference/latent_function_inference/exact_gaussian_inference.py
@@ -53,6 +53,6 @@ class ExactGaussianInference(object):
 
         likelihood.update_gradients(np.diag(dL_dK))
 
-        return Posterior(LW, alpha, K), log_marginal, {'dL_dK':dL_dK}
+        return Posterior(woodbury_chol=LW, woodbury_vector=alpha, K=K), log_marginal, {'dL_dK':dL_dK}
 
 
diff --git a/GPy/inference/latent_function_inference/laplace.py b/GPy/inference/latent_function_inference/laplace.py
index fa5bb3b8..e5165da6 100644
--- a/GPy/inference/latent_function_inference/laplace.py
+++ b/GPy/inference/latent_function_inference/laplace.py
@@ -15,49 +15,32 @@ import scipy as sp
 from likelihood import likelihood
 from ..util.linalg import mdot, jitchol, pddet, dpotrs
 from functools import partial as partial_func
+from posterior import Posterior
 import warnings
 
 class LaplaceInference(object):
     """Laplace approximation to a posterior"""
 
-    def __init__(self, data, noise_model, extra_data=None):
+    def __init__(self):
         """
         Laplace Approximation
 
         Find the moments \hat{f} and the hessian at this point
         (using Newton-Raphson) of the unnormalised posterior
 
-        Compute the GP variables (i.e. generate some Y^{squiggle} and
-        z^{squiggle} which makes a gaussian the same as the laplace
-        approximation to the posterior, but normalised
-
-        Arguments
-        ---------
-
-        :param data: array of data the likelihood function is approximating
-        :type data: NxD
-        :param noise_model: likelihood function - subclass of noise_model
-        :type noise_model: noise_model
-        :param extra_data: additional data used by some likelihood functions,
         """
-        self.data = data
-        self.noise_model = noise_model
-        self.extra_data = extra_data
-
         #Inital values
-        self.N, self.D = self.data.shape
-        self.is_heteroscedastic = True
-        self.Nparams = 0
         self.NORMAL_CONST = ((0.5 * self.N) * np.log(2 * np.pi))
 
-        self.restart()
-        likelihood.__init__(self)
 
     def inference(self, kern, X, likelihood, Y, Y_metadata=None):
         """
         Returns a Posterior class containing essential quantities of the posterior
         """
 
+        self.N, self.D = self.data.shape
+        self.restart()
+
         # Compute K
         self.K = kern.K(X)
         self.data = Y
@@ -69,10 +52,11 @@ class LaplaceInference(object):
         #Compute hessian and other variables at mode
         self._compute_likelihood_variables()
 
-        #Compute fake variables replicating laplace approximation to posterior
-        self._compute_GP_variables()
+        likelihood.gradient = self.likelihood_gradients()
+        dL_dK = self._Kgradients()
+        kern.update_gradients_full(dL_dK)
 
-        return Posterior(mean=self.f_hat, cov=self.covariance_matrix, K=self.K)
+        return Posterior(mean=self.f_hat, cov=self.Sigma, K=self.K), log_marginal_approx, {'dL_dK':dL_dK}
 
     def restart(self):
         """
@@ -88,37 +72,10 @@ class LaplaceInference(object):
         self.old_Ki_f = None
         self.bad_fhat = False
 
-    def predictive_values(self,mu,var,full_cov,**noise_args):
-        if full_cov:
-            raise NotImplementedError, "Cannot make correlated predictions with an EP likelihood"
-        return self.noise_model.predictive_values(mu,var,**noise_args)
-
-    def log_predictive_density(self, y_test, mu_star, var_star):
-        """
-        Calculation of the log predictive density
-
-        .. math:
-            p(y_{*}|D) = p(y_{*}|f_{*})p(f_{*}|\mu_{*}\\sigma^{2}_{*})
-
-        :param y_test: test observations (y_{*})
-        :type y_test: (Nx1) array
-        :param mu_star: predictive mean of gaussian p(f_{*}|mu_{*}, var_{*})
-        :type mu_star: (Nx1) array
-        :param var_star: predictive variance of gaussian p(f_{*}|mu_{*}, var_{*})
-        :type var_star: (Nx1) array
-        """
-        return self.noise_model.log_predictive_density(y_test, mu_star, var_star)
-
-    def _get_params(self):
-        return np.asarray(self.noise_model._get_params())
-
-    def _get_param_names(self):
-        return self.noise_model._get_param_names()
-
-    def _set_params(self, p):
-        return self.noise_model._set_params(p)
-
     def _shared_gradients_components(self):
+        """
+        A helper function to compute some common quantities
+        """
         d3lik_d3fhat = self.noise_model.d3logpdf_df3(self.f_hat, self.data, extra_data=self.extra_data)
         dL_dfhat = 0.5*(np.diag(self.Ki_W_i)[:, None]*d3lik_d3fhat).T #why isn't this -0.5?
         I_KW_i = np.eye(self.N) - np.dot(self.K, self.Wi_K_i)
@@ -132,41 +89,30 @@ class LaplaceInference(object):
         :rtype: Matrix (1 x num_kernel_params)
         """
         dL_dfhat, I_KW_i = self._shared_gradients_components()
-        dlp = self.noise_model.dlogpdf_df(self.f_hat, self.data, extra_data=self.extra_data)
+        dlp = likelihood.dlogpdf_df(self.f_hat, Y, extra_data=None) # TODO: how will extra data work?
 
         #Explicit
-        #expl_a = np.dot(self.Ki_f, self.Ki_f.T)
-        #expl_b = self.Wi_K_i
-        #expl = 0.5*expl_a - 0.5*expl_b
-        #dL_dthetaK_exp = dK_dthetaK(expl, X)
+        expl_a = np.dot(self.Ki_f, self.Ki_f.T)
+        expl_b = self.Wi_K_i
+        expl = 0.5*expl_a - 0.5*expl_b
+        dL_dthetaK_exp = dK_dthetaK(expl, X)
 
         #Implicit
         impl = mdot(dlp, dL_dfhat, I_KW_i)
 
-        #No longer required as we are computing these in the gp already
-        #otherwise we would take them away and add them back
-        #dL_dthetaK_imp = dK_dthetaK(impl, X)
-        #dL_dthetaK = dL_dthetaK_exp + dL_dthetaK_imp
-        #dL_dK = expl + impl
+        dL_dK = expl + impl
 
-        #No need to compute explicit as we are computing dZ_dK to account
-        #for the difference between the K gradients of a normal GP,
-        #and the K gradients including the implicit part
-        dL_dK = impl
         return dL_dK
 
-    def _gradients(self, partial):
+    def likelihood_gradients(self):
         """
         Gradients with respect to likelihood parameters (dL_dthetaL)
 
-        :param partial: Not needed by this likelihood
-        :type partial: lambda function
         :rtype: array of derivatives (1 x num_likelihood_params)
         """
         dL_dfhat, I_KW_i = self._shared_gradients_components()
         dlik_dthetaL, dlik_grad_dthetaL, dlik_hess_dthetaL = self.noise_model._laplace_gradients(self.f_hat, self.data, extra_data=self.extra_data)
 
-        #len(dlik_dthetaL)
         num_params = len(self._get_param_names())
         # make space for one derivative for each likelihood parameter
         dL_dthetaL = np.zeros(num_params)
@@ -184,88 +130,9 @@ class LaplaceInference(object):
 
         return dL_dthetaL
 
-    def _compute_GP_variables(self):
-        """
-        Generate data Y which would give the normal distribution identical
-        to the laplace approximation to the posterior, but normalised
-
-        GPy expects a likelihood to be gaussian, so need to caluclate
-        the data Y^{\tilde} that makes the posterior match that found
-        by a laplace approximation to a non-gaussian likelihood but with
-        a gaussian likelihood
-
-        Firstly,
-        The hessian of the unormalised posterior distribution is (K^{-1} + W)^{-1},
-        i.e. z*N(f|f^{\hat}, (K^{-1} + W)^{-1}) but this assumes a non-gaussian likelihood,
-        we wish to find the hessian \Sigma^{\tilde}
-        that has the same curvature but using our new simulated data Y^{\tilde}
-        i.e. we do N(Y^{\tilde}|f^{\hat}, \Sigma^{\tilde})N(f|0, K) = z*N(f|f^{\hat}, (K^{-1} + W)^{-1})
-        and we wish to find what Y^{\tilde} and \Sigma^{\tilde}
-        We find that Y^{\tilde} = W^{-1}(K^{-1} + W)f^{\hat} and \Sigma^{tilde} = W^{-1}
-
-        Secondly,
-        GPy optimizes the log marginal log p(y) = -0.5*ln|K+\Sigma^{\tilde}| - 0.5*Y^{\tilde}^{T}(K^{-1} + \Sigma^{tilde})^{-1}Y + lik.Z
-        So we can suck up any differences between that and our log marginal likelihood approximation
-        p^{\squiggle}(y) = -0.5*f^{\hat}K^{-1}f^{\hat} + log p(y|f^{\hat}) - 0.5*log |K||K^{-1} + W|
-        which we want to optimize instead, by equating them and rearranging, the difference is added onto
-        the log p(y) that GPy optimizes by default
-
-        Thirdly,
-        Since we have gradients that depend on how we move f^{\hat}, we have implicit components
-        aswell as the explicit dL_dK, we hold these differences in dZ_dK and add them to dL_dK in the
-        gp.py code
-        """
-        Wi = 1.0/self.W
-        self.Sigma_tilde = np.diagflat(Wi)
-
-        Y_tilde = Wi*self.Ki_f + self.f_hat
-
-        self.Wi_K_i = self.W12BiW12
-        ln_det_Wi_K = pddet(self.Sigma_tilde + self.K)
-        lik = self.noise_model.logpdf(self.f_hat, self.data, extra_data=self.extra_data)
-        y_Wi_K_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde)
-
-        Z_tilde = (+ lik
-                   - 0.5*self.ln_B_det
-                   + 0.5*ln_det_Wi_K
-                   - 0.5*self.f_Ki_f
-                   + 0.5*y_Wi_K_i_y
-                   + self.NORMAL_CONST
-                  )
-
-        #Convert to float as its (1, 1) and Z must be a scalar
-        self.Z = np.float64(Z_tilde)
-        self.Y = Y_tilde
-        self.YYT = np.dot(self.Y, self.Y.T)
-        self.covariance_matrix = self.Sigma_tilde
-        self.precision = 1.0 / np.diag(self.covariance_matrix)[:, None]
-
-        #Compute dZ_dK which is how the approximated distributions gradients differ from the dL_dK computed for other likelihoods
-        self.dZ_dK = self._Kgradients()
-        #+ 0.5*self.Wi_K_i - 0.5*np.dot(self.Ki_f, self.Ki_f.T) #since we are not adding the K gradients explicit part theres no need to compute this again
-
-    def fit_full(self, K):
-        """
-        The laplace approximation algorithm, find K and expand hessian
-        For nomenclature see Rasmussen & Williams 2006 - modified for numerical stability
-
-        :param K: Prior covariance matrix evaluated at locations X
-        :type K: NxN matrix
-        """
-        self.K = K.copy()
-
-        #Find mode
-        self.f_hat = self.rasm_mode(self.K)
-
-        #Compute hessian and other variables at mode
-        self._compute_likelihood_variables()
-
-        #Compute fake variables replicating laplace approximation to posterior
-        self._compute_GP_variables()
-
     def _compute_likelihood_variables(self):
         """
-        Compute the variables required to compute gaussian Y variables
+        At the mode, compute the hessian and effective covariance matrix.
         """
         #At this point get the hessian matrix (or vector as W is diagonal)
         self.W = -self.noise_model.d2logpdf_df2(self.f_hat, self.data, extra_data=self.extra_data)
@@ -422,3 +289,65 @@ class LaplaceInference(object):
 
         self.Ki_f = Ki_f
         return f
+
+    def _compute_GP_variables(self):
+        """
+        Generate data Y which would give the normal distribution identical
+        to the laplace approximation to the posterior, but normalised
+
+        GPy expects a likelihood to be gaussian, so need to calculate
+        the data Y^{\tilde} that makes the posterior match that found
+        by a laplace approximation to a non-gaussian likelihood but with
+        a gaussian likelihood
+
+        Firstly,
+        The hessian of the unnormalised posterior distribution is (K^{-1} + W)^{-1},
+        i.e. z*N(f|f^{\hat}, (K^{-1} + W)^{-1}) but this assumes a non-gaussian likelihood,
+        we wish to find the hessian \Sigma^{\tilde}
+        that has the same curvature but using our new simulated data Y^{\tilde}
+        i.e. we do N(Y^{\tilde}|f^{\hat}, \Sigma^{\tilde})N(f|0, K) = z*N(f|f^{\hat}, (K^{-1} + W)^{-1})
+        and we wish to find what Y^{\tilde} and \Sigma^{\tilde}
+        We find that Y^{\tilde} = W^{-1}(K^{-1} + W)f^{\hat} and \Sigma^{tilde} = W^{-1}
+
+        Secondly,
+        GPy optimizes the log marginal log p(y) = -0.5*ln|K+\Sigma^{\tilde}| - 0.5*Y^{\tilde}^{T}(K^{-1} + \Sigma^{tilde})^{-1}Y + lik.Z
+        So we can suck up any differences between that and our log marginal likelihood approximation
+        p^{\squiggle}(y) = -0.5*f^{\hat}K^{-1}f^{\hat} + log p(y|f^{\hat}) - 0.5*log |K||K^{-1} + W|
+        which we want to optimize instead, by equating them and rearranging, the difference is added onto
+        the log p(y) that GPy optimizes by default
+
+        Thirdly,
+        Since we have gradients that depend on how we move f^{\hat}, we have implicit components
+        as well as the explicit dL_dK, we hold these differences in dZ_dK and add them to dL_dK in the
+        gp.py code
+        """
+        Wi = 1.0/self.W
+        self.Sigma_tilde = np.diagflat(Wi)
+
+        Y_tilde = Wi*self.Ki_f + self.f_hat
+
+        self.Wi_K_i = self.W12BiW12
+        ln_det_Wi_K = pddet(self.Sigma_tilde + self.K)
+        lik = self.noise_model.logpdf(self.f_hat, self.data, extra_data=self.extra_data)
+        y_Wi_K_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde)
+
+        Z_tilde = (+ lik
+                   - 0.5*self.ln_B_det
+                   + 0.5*ln_det_Wi_K
+                   - 0.5*self.f_Ki_f
+                   + 0.5*y_Wi_K_i_y
+                   + self.NORMAL_CONST
+                  )
+
+        #Convert to float as its (1, 1) and Z must be a scalar
+        self.Z = np.float64(Z_tilde)
+        self.Y = Y_tilde
+        self.YYT = np.dot(self.Y, self.Y.T)
+        self.covariance_matrix = self.Sigma_tilde
+        self.precision = 1.0 / np.diag(self.covariance_matrix)[:, None]
+
+        #Compute dZ_dK which is how the approximated distributions gradients differ from the dL_dK computed for other likelihoods
+        self.dZ_dK = self._Kgradients()
+        #+ 0.5*self.Wi_K_i - 0.5*np.dot(self.Ki_f, self.Ki_f.T) #since we are not adding the K gradients explicit part theres no need to compute this again
+
+
diff --git a/GPy/inference/latent_function_inference/posterior.py b/GPy/inference/latent_function_inference/posterior.py
index b3a352e5..c0974dc5 100644
--- a/GPy/inference/latent_function_inference/posterior.py
+++ b/GPy/inference/latent_function_inference/posterior.py
@@ -6,12 +6,13 @@ from ...util.linalg import pdinv, dpotrs, tdot, dtrtrs, dpotri, symmetrify
 
 class Posterior(object):
     """
-    An object to represent a Gaussian posterior over latent function values.
+    An object to represent a Gaussian posterior over latent function values, p(f|D).
     This may be computed exactly for Gaussian likelihoods, or approximated for
     non-Gaussian likelihoods.
 
     The purpose of this class is to serve as an interface between the inference
-    schemes and the model classes.
+    schemes and the model classes.  The model class can make predictions for
+    the function at any new point x_* by integrating over this posterior.
 
     """
     def __init__(self, woodbury_chol=None, woodbury_vector=None, K=None, mean=None, cov=None, K_chol=None, woodbury_inv=None):

From 1b49f7ab30808effca88bcd85f5d6aa6e038a7e8 Mon Sep 17 00:00:00 2001
From: Max Zwiessele <ibinbei@gmail.com>
Date: Fri, 31 Jan 2014 17:01:50 +0000
Subject: [PATCH 07/43] not calling self.parameters_changed explicitly anymore
 -> not needed

---
 GPy/core/gp.py        | 2 +-
 GPy/core/sparse_gp.py | 3 ---
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/GPy/core/gp.py b/GPy/core/gp.py
index 060b617a..6d9ed75d 100644
--- a/GPy/core/gp.py
+++ b/GPy/core/gp.py
@@ -56,7 +56,7 @@ class GP(Model):
         self.add_parameter(self.kern)
         self.add_parameter(self.likelihood)
 
-        self.parameters_changed()
+        #self.parameters_changed()
 
     def parameters_changed(self):
         self.posterior, self._log_marginal_likelihood, grad_dict = self.inference_method.inference(self.kern, self.X, self.likelihood, self.Y)
diff --git a/GPy/core/sparse_gp.py b/GPy/core/sparse_gp.py
index ab1f3bf0..3a6a98cb 100644
--- a/GPy/core/sparse_gp.py
+++ b/GPy/core/sparse_gp.py
@@ -51,11 +51,8 @@ class SparseGP(GP):
         self.X_variance = X_variance
 
         GP.__init__(self, X, Y, kernel, likelihood, inference_method=inference_method, name=name)
-
         self.Z = Param('inducing inputs', self.Z)
         self.add_parameter(self.Z, index=0)
-        self.add_parameter(self.kern)
-        self.add_parameter(self.likelihood)
 
     def parameters_changed(self):
         self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.X_variance, self.Z, self.likelihood, self.Y)

From 70e7d72bf26ea9ec3ebd1f5ed2dd4a27341259a1 Mon Sep 17 00:00:00 2001
From: Zhenwen Dai <z.dai@shef.ac.uk>
Date: Mon, 3 Feb 2014 09:12:43 +0000
Subject: [PATCH 08/43] add spike-and-slab gplvm kernel [unfinished]

---
 GPy/kern/parts/ss_rbf.py | 352 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 352 insertions(+)
 create mode 100644 GPy/kern/parts/ss_rbf.py

diff --git a/GPy/kern/parts/ss_rbf.py b/GPy/kern/parts/ss_rbf.py
new file mode 100644
index 00000000..a234d428
--- /dev/null
+++ b/GPy/kern/parts/ss_rbf.py
@@ -0,0 +1,352 @@
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+
+import numpy as np
+from kernpart import Kernpart
+from ...util.linalg import tdot
+from ...util.misc import fast_array_equal, param_to_array
+from ...core.parameterization import Param
+
+class SS_RBF(Kernpart):
+    """
+    The RBF kernel for Spike-and-Slab GPLVM
+    Radial Basis Function kernel, aka squared-exponential, exponentiated quadratic or Gaussian kernel:
+
+    .. math::
+
+       k(r) = \sigma^2 \exp \\bigg(- \\frac{1}{2} r^2 \\bigg) \ \ \ \ \  \\text{ where  } r^2 = \sum_{i=1}^d \\frac{ (x_i-x^\prime_i)^2}{\ell_i^2}
+
+    where \ell_i is the lengthscale, \sigma^2 the variance and d the dimensionality of the input.
+
+    :param input_dim: the number of input dimensions
+    :type input_dim: int
+    :param variance: the variance of the kernel
+    :type variance: float
+    :param lengthscale: the vector of lengthscale of the kernel
+    :type lengthscale: array or list of the appropriate size (or float if there is only one lengthscale parameter)
+    :rtype: kernel object
+    """
+
+    def __init__(self, input_dim, variance=1., lengthscale=None, name='rbf'):
+        super(RBF, self).__init__(input_dim, name)
+        self.input_dim = input_dim
+
+        if lengthscale is not None:
+            lengthscale = np.asarray(lengthscale)
+            assert lengthscale.size == self.input_dim, "bad number of lengthscales"
+        else:
+            lengthscale = np.ones(self.input_dim)
+
+        self.variance = Param('variance', variance)
+        self.lengthscale = Param('lengthscale', lengthscale)
+        self.lengthscale.add_observer(self, self.update_lengthscale)
+        self.add_parameters(self.variance, self.lengthscale)
+        self.parameters_changed() # initializes cache
+
+    def on_input_change(self, X):
+        #self._K_computations(X, None)
+        pass
+
+    def update_lengthscale(self, l):
+        self.lengthscale2 = np.square(self.lengthscale)
+
+    def parameters_changed(self):
+        # reset cached results
+        self._X, self._X2 = np.empty(shape=(2, 1))
+        self._Z, self._mu, self._S = np.empty(shape=(3, 1)) # cached versions of Z,mu,S
+
+    def K(self, X, X2, target):
+        self._K_computations(X, X2)
+        target += self.variance * self._K_dvar
+
+    def Kdiag(self, X, target):
+        np.add(target, self.variance, target)
+
+    def psi0(self, Z, mu, S, target):
+        target += self.variance
+
+    def psi1(self, Z, mu, S, target):
+        self._psi_computations(Z, mu, S)
+        target += self._psi1
+
+    def psi2(self, Z, mu, S, target):
+        self._psi_computations(Z, mu, S)
+        target += self._psi2
+
+    def update_gradients_full(self, dL_dK, X):
+        self._K_computations(X, None)
+        self.variance.gradient = np.sum(self._K_dvar * dL_dK)
+        if self.ARD:
+            self.lengthscale.gradient = self._dL_dlengthscales_via_K(dL_dK, X, None)
+        else:
+            self.lengthscale.gradient = (self.variance / self.lengthscale) * np.sum(self._K_dvar * self._K_dist2 * dL_dK)
+
+    def update_gradients_sparse(self, dL_dKmm, dL_dKnm, dL_dKdiag, X, Z):
+        #contributions from Kdiag
+        self.variance.gradient = np.sum(dL_dKdiag)
+
+        #from Knm
+        self._K_computations(X, Z)
+        self.variance.gradient += np.sum(dL_dKnm * self._K_dvar)
+        if self.ARD:
+            self.lengthscales.gradient = self._dL_dlengthscales_via_K(dL_dKnm, X, Z)
+
+        else:
+            self.lengthscale.gradient = (self.variance / self.lengthscale) * np.sum(self._K_dvar * self._K_dist2 * dL_dKmm)
+
+        #from Kmm
+        self._K_computations(Z, None)
+        self.variance.gradient += np.sum(dL_dKmm * self._K_dvar)
+        if self.ARD:
+            self.lengthscales.gradient += self._dL_dlengthscales_via_K(dL_dKmm, Z, None)
+        else:
+            self.lengthscale.gradient += (self.variance / self.lengthscale) * np.sum(self._K_dvar * self._K_dist2 * dL_dKmm)
+
+    def update_gradients_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z):
+        self._psi_computations(Z, mu, S)
+
+        #contributions from psi0:
+        self.variance.gradient = np.sum(dL_dpsi0)
+
+        #from psi1
+        self.variance.gradient += np.sum(dL_dpsi1 * self._psi1 / self.variance)
+        d_length = self._psi1[:,:,None] * ((self._psi1_dist_sq - 1.)/(self.lengthscale*self._psi1_denom) +1./self.lengthscale)
+        dpsi1_dlength = d_length * dL_dpsi1[:, :, None]
+        if not self.ARD:
+            self.lengthscale.gradeint = dpsi1_dlength.sum()
+        else:
+            self.lengthscale.gradient = dpsi1_dlength.sum(0).sum(0)
+
+        #from psi2
+        d_var = 2.*self._psi2 / self.variance
+        d_length = 2.*self._psi2[:, :, :, None] * (self._psi2_Zdist_sq * self._psi2_denom + self._psi2_mudist_sq + S[:, None, None, :] / self.lengthscale2) / (self.lengthscale * self._psi2_denom)
+
+        self.variance.gradient += np.sum(dL_dpsi2 * d_var)
+        dpsi2_dlength = d_length * dL_dpsi2[:, :, :, None]
+        if not self.ARD:
+            self.lengthscale.gradient += dpsi2_dlength.sum()
+        else:
+            self.lengthscale.gradient += dpsi2_dlength.sum(0).sum(0).sum(0)
+
+        #from Kmm
+        self._K_computations(Z, None)
+        self.variance.gradient += np.sum(dL_dKmm * self._K_dvar)
+        if self.ARD:
+            self.lengthscales.gradient += self._dL_dlengthscales_via_K(dL_dKmm, Z, None)
+        else:
+            self.lengthscale.gradient += (self.variance / self.lengthscale) * np.sum(self._K_dvar * self._K_dist2 * dL_dK)
+
+    def gradients_X(self, dL_dK, X, X2, target):
+        #if self._X is None or X.base is not self._X.base or X2 is not None:
+        self._K_computations(X, X2)
+        if X2 is None:
+            _K_dist = 2*(X[:, None, :] - X[None, :, :])
+        else:
+            _K_dist = X[:, None, :] - X2[None, :, :] # don't cache this in _K_computations because it is high memory. If this function is being called, chances are we're not in the high memory arena.
+        dK_dX = (-self.variance / self.lengthscale2) * np.transpose(self._K_dvar[:, :, np.newaxis] * _K_dist, (1, 0, 2))
+        target += np.sum(dK_dX * dL_dK.T[:, :, None], 0)
+
+    def dKdiag_dX(self, dL_dKdiag, X, target):
+        pass
+
+    #---------------------------------------#
+    #             PSI statistics            #
+    #---------------------------------------#
+
+    def dpsi0_dmuS(self, dL_dpsi0, Z, mu, S, target_mu, target_S):
+        pass
+
+    def dpsi1_dZ(self, dL_dpsi1, Z, mu, S, target):
+        self._psi_computations(Z, mu, S)
+        denominator = (self.lengthscale2 * (self._psi1_denom))
+        dpsi1_dZ = -self._psi1[:, :, None] * ((self._psi1_dist / denominator))
+        target += np.sum(dL_dpsi1[:, :, None] * dpsi1_dZ, 0)
+
+    def dpsi1_dmuS(self, dL_dpsi1, Z, mu, S, target_mu, target_S):
+        self._psi_computations(Z, mu, S)
+        tmp = self._psi1[:, :, None] / self.lengthscale2 / self._psi1_denom
+        target_mu += np.sum(dL_dpsi1[:, :, None] * tmp * self._psi1_dist, 1)
+        target_S += np.sum(dL_dpsi1[:, :, None] * 0.5 * tmp * (self._psi1_dist_sq - 1), 1)
+
+    def dpsi2_dZ(self, dL_dpsi2, Z, mu, S, target):
+        self._psi_computations(Z, mu, S)
+        term1 = self._psi2_Zdist / self.lengthscale2 # num_inducing, num_inducing, input_dim
+        term2 = self._psi2_mudist / self._psi2_denom / self.lengthscale2 # N, num_inducing, num_inducing, input_dim
+        dZ = self._psi2[:, :, :, None] * (term1[None] + term2)
+        target += (dL_dpsi2[:, :, :, None] * dZ).sum(0).sum(0)
+
+    def dpsi2_dmuS(self, dL_dpsi2, Z, mu, S, target_mu, target_S):
+        """Think N,num_inducing,num_inducing,input_dim """
+        self._psi_computations(Z, mu, S)
+        tmp = self._psi2[:, :, :, None] / self.lengthscale2 / self._psi2_denom
+        target_mu += -2.*(dL_dpsi2[:, :, :, None] * tmp * self._psi2_mudist).sum(1).sum(1)
+        target_S += (dL_dpsi2[:, :, :, None] * tmp * (2.*self._psi2_mudist_sq - 1)).sum(1).sum(1)
+
+    #---------------------------------------#
+    #            Precomputations            #
+    #---------------------------------------#
+
+    def _K_computations(self, X, X2):
+        #params = self._get_params()
+        if not (fast_array_equal(X, self._X) and fast_array_equal(X2, self._X2)):# and fast_array_equal(self._params_save , params)):
+            #self._X = X.copy()
+            #self._params_save = params.copy()
+            if X2 is None:
+                self._X2 = None
+                X = X / self.lengthscale
+                Xsquare = np.sum(np.square(X), 1)
+                self._K_dist2 = -2.*tdot(X) + (Xsquare[:, None] + Xsquare[None, :])
+            else:
+                self._X2 = X2.copy()
+                X = X / self.lengthscale
+                X2 = X2 / self.lengthscale
+                self._K_dist2 = -2.*np.dot(X, X2.T) + (np.sum(np.square(X), 1)[:, None] + np.sum(np.square(X2), 1)[None, :])
+            self._K_dvar = np.exp(-0.5 * self._K_dist2)
+
+    def _dL_dlengthscales_via_K(self, dL_dK, X, X2):
+        """
+        A helper function for update_gradients_* methods
+
+        Computes the derivative of the objective L wrt the lengthscales via
+
+        dL_dl = sum_{i,j}(dL_dK_{ij} dK_dl)
+
+        assumes self._K_computations has just been called.
+
+        This is only valid if self.ARD=True
+        """
+        target = np.zeros(self.input_dim)
+        dvardLdK = self._K_dvar * dL_dK
+        var_len3 = self.variance / np.power(self.lengthscale, 3)
+        if X2 is None:
+            # save computation for the symmetrical case
+            dvardLdK = dvardLdK + dvardLdK.T
+            code = """
+            int q,i,j;
+            double tmp;
+            for(q=0; q<input_dim; q++){
+              tmp = 0;
+              for(i=0; i<num_data; i++){
+                for(j=0; j<i; j++){
+                  tmp += (X(i,q)-X(j,q))*(X(i,q)-X(j,q))*dvardLdK(i,j);
+                }
+              }
+              target(q) += var_len3(q)*tmp;
+            }
+            """
+            num_data, num_inducing, input_dim = X.shape[0], X.shape[0], self.input_dim
+            X, dvardLdK = param_to_array(X, dvardLdK)
+            weave.inline(code, arg_names=['num_data', 'num_inducing', 'input_dim', 'X', 'target', 'dvardLdK', 'var_len3'], type_converters=weave.converters.blitz, **self.weave_options)
+        else:
+            code = """
+            int q,i,j;
+            double tmp;
+            for(q=0; q<input_dim; q++){
+              tmp = 0;
+              for(i=0; i<num_data; i++){
+                for(j=0; j<num_inducing; j++){
+                  tmp += (X(i,q)-X2(j,q))*(X(i,q)-X2(j,q))*dvardLdK(i,j);
+                }
+              }
+              target(q) += var_len3(q)*tmp;
+            }
+            """
+            num_data, num_inducing, input_dim = X.shape[0], X2.shape[0], self.input_dim
+            X, X2, dvardLdK = param_to_array(X, X2, dvardLdK)
+            weave.inline(code, arg_names=['num_data', 'num_inducing', 'input_dim', 'X', 'X2', 'target', 'dvardLdK', 'var_len3'], type_converters=weave.converters.blitz, **self.weave_options)
+        return target
+
+
+
+    def _psi_computations(self, Z, mu, S):
+        # here are the "statistics" for psi1 and psi2
+        Z_changed = not fast_array_equal(Z, self._Z)
+        if Z_changed:
+            # Z has changed, compute Z specific stuff
+            self._psi2_Zhat = 0.5 * (Z[:, None, :] + Z[None, :, :]) # M,M,Q
+            self._psi2_Zdist = 0.5 * (Z[:, None, :] - Z[None, :, :]) # M,M,Q
+            self._psi2_Zdist_sq = np.square(self._psi2_Zdist / self.lengthscale) # M,M,Q
+
+        if Z_changed or not fast_array_equal(mu, self._mu) or not fast_array_equal(S, self._S):
+            # something's changed. recompute EVERYTHING
+
+            # psi1
+            self._psi1_denom = S[:, None, :] / self.lengthscale2 + 1.
+            self._psi1_dist = Z[None, :, :] - mu[:, None, :]
+            self._psi1_dist_sq = np.square(self._psi1_dist) / self.lengthscale2 / self._psi1_denom
+            self._psi1_exponent = -0.5 * np.sum(self._psi1_dist_sq + np.log(self._psi1_denom), -1)
+            self._psi1 = self.variance * np.exp(self._psi1_exponent)
+
+            # psi2
+            self._psi2_denom = 2.*S[:, None, None, :] / self.lengthscale2 + 1. # N,M,M,Q
+            self._psi2_mudist, self._psi2_mudist_sq, self._psi2_exponent, _ = self.weave_psi2(mu, self._psi2_Zhat)
+            # self._psi2_mudist = mu[:,None,None,:]-self._psi2_Zhat #N,M,M,Q
+            # self._psi2_mudist_sq = np.square(self._psi2_mudist)/(self.lengthscale2*self._psi2_denom)
+            # self._psi2_exponent = np.sum(-self._psi2_Zdist_sq -self._psi2_mudist_sq -0.5*np.log(self._psi2_denom),-1) #N,M,M,Q
+            self._psi2 = np.square(self.variance) * np.exp(self._psi2_exponent) # N,M,M,Q
+
+            # store matrices for caching
+            self._Z, self._mu, self._S = Z, mu, S
+
+    def weave_psi2(self, mu, Zhat):
+        N, input_dim = mu.shape
+        num_inducing = Zhat.shape[0]
+
+        mudist = np.empty((N, num_inducing, num_inducing, input_dim))
+        mudist_sq = np.empty((N, num_inducing, num_inducing, input_dim))
+        psi2_exponent = np.zeros((N, num_inducing, num_inducing))
+        psi2 = np.empty((N, num_inducing, num_inducing))
+
+        psi2_Zdist_sq = self._psi2_Zdist_sq
+        _psi2_denom = self._psi2_denom.squeeze().reshape(N, self.input_dim)
+        half_log_psi2_denom = 0.5 * np.log(self._psi2_denom).squeeze().reshape(N, self.input_dim)
+        variance_sq = float(np.square(self.variance))
+        if self.ARD:
+            lengthscale2 = self.lengthscale2
+        else:
+            lengthscale2 = np.ones(input_dim) * self.lengthscale2
+        code = """
+        double tmp;
+
+        #pragma omp parallel for private(tmp)
+        for (int n=0; n<N; n++){
+            for (int m=0; m<num_inducing; m++){
+               for (int mm=0; mm<(m+1); mm++){
+                   for (int q=0; q<input_dim; q++){
+                       //compute mudist
+                       tmp = mu(n,q) - Zhat(m,mm,q);
+                       mudist(n,m,mm,q) = tmp;
+                       mudist(n,mm,m,q) = tmp;
+
+                       //now mudist_sq
+                       tmp = tmp*tmp/lengthscale2(q)/_psi2_denom(n,q);
+                       mudist_sq(n,m,mm,q) = tmp;
+                       mudist_sq(n,mm,m,q) = tmp;
+
+                       //now psi2_exponent
+                       tmp = -psi2_Zdist_sq(m,mm,q) - tmp - half_log_psi2_denom(n,q);
+                       psi2_exponent(n,mm,m) += tmp;
+                       if (m !=mm){
+                           psi2_exponent(n,m,mm) += tmp;
+                       }
+                   //psi2 would be computed like this, but np is faster
+                   //tmp = variance_sq*exp(psi2_exponent(n,m,mm));
+                   //psi2(n,m,mm) = tmp;
+                   //psi2(n,mm,m) = tmp;
+                   }
+                }
+            }
+        }
+
+        """
+
+        support_code = """
+        #include <omp.h>
+        #include <math.h>
+        """
+        weave.inline(code, support_code=support_code, libraries=['gomp'],
+                     arg_names=['N', 'num_inducing', 'input_dim', 'mu', 'Zhat', 'mudist_sq', 'mudist', 'lengthscale2', '_psi2_denom', 'psi2_Zdist_sq', 'psi2_exponent', 'half_log_psi2_denom', 'psi2', 'variance_sq'],
+                     type_converters=weave.converters.blitz, **self.weave_options)
+
+        return mudist, mudist_sq, psi2_exponent, psi2

From b2328c4f47ce3cd58d02d489a4843dded35f821b Mon Sep 17 00:00:00 2001
From: Max Zwiessele <ibinbei@gmail.com>
Date: Wed, 5 Feb 2014 10:48:23 +0000
Subject: [PATCH 09/43] starting varDTC with uncertain inputs [not working]

---
 GPy/core/gp.py                                |  4 ++--
 .../latent_function_inference/varDTC.py       | 22 ++++++++++++-------
 GPy/kern/parts/rbf.py                         |  2 +-
 GPy/models/sparse_gp_regression.py            |  2 +-
 GPy/util/linalg.py                            | 11 ++++++++++
 5 files changed, 29 insertions(+), 12 deletions(-)

diff --git a/GPy/core/gp.py b/GPy/core/gp.py
index 6d9ed75d..b9239a03 100644
--- a/GPy/core/gp.py
+++ b/GPy/core/gp.py
@@ -185,7 +185,7 @@ class GP(Model):
         from ..plotting.matplot_dep import models_plots
         models_plots.plot_fit_f(self,*args,**kwargs)
 
-    def plot(self, *args):
+    def plot(self, *args, **kwargs):
         """
         Plot the posterior of the GP.
           - In one dimension, the function is plotted with a shaded region
@@ -204,7 +204,7 @@ class GP(Model):
         """
         assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
         from ..plotting.matplot_dep import models_plots
-        models_plots.plot_fit(self,*args)
+        models_plots.plot_fit(self,*args,**kwargs)
 
     def _getstate(self):
         """
diff --git a/GPy/inference/latent_function_inference/varDTC.py b/GPy/inference/latent_function_inference/varDTC.py
index b5ba4c2d..290e234e 100644
--- a/GPy/inference/latent_function_inference/varDTC.py
+++ b/GPy/inference/latent_function_inference/varDTC.py
@@ -4,6 +4,7 @@
 from posterior import Posterior
 from ...util.linalg import jitchol, backsub_both_sides, tdot, dtrtrs
 import numpy as np
+from GPy.util.linalg import dtrtri
 log_2_pi = np.log(2*np.pi)
 
 class VarDTC(object):
@@ -69,19 +70,24 @@ class VarDTC(object):
                 psi2_beta = (psi2 * (beta.flatten().reshape(num_data, 1, 1))).sum(0)
             else:
                 psi2_beta = psi2.sum(0) * beta
-            evals, evecs = linalg.eigh(psi2_beta)
-            clipped_evals = np.clip(evals, 0., 1e6) # TODO: make clipping configurable
-            if not np.array_equal(evals, clipped_evals):
-                pass # print evals
-            tmp = evecs * np.sqrt(clipped_evals)
-            tmp = tmp.T
+            if 0:
+                evals, evecs = linalg.eigh(psi2_beta)
+                clipped_evals = np.clip(evals, 0., 1e6) # TODO: make clipping configurable
+                if not np.array_equal(evals, clipped_evals):
+                    pass # print evals
+                tmp = evecs * np.sqrt(clipped_evals)
+                tmp = tmp.T
+            # no backsubstitution because of bound explosion on tr(A) if not...
+            LmInv, _ = dtrtri(Lm, lower=1)
+            A = LmInv.T.dot(psi2_beta.dot(LmInv))
+            print A.sum()
         else:
             if het_noise:
                 tmp = psi1 * (np.sqrt(beta.reshape(num_data, 1)))
             else:
                 tmp = psi1 * (np.sqrt(beta))
-        tmp, _ = dtrtrs(Lm, np.asfortranarray(tmp.T), lower=1)
-        A = tdot(tmp)
+            tmp, _ = dtrtrs(Lm, np.asfortranarray(tmp.T), lower=1)
+            A = tdot(tmp)
 
         # factor B
         B = np.eye(num_inducing) + A
diff --git a/GPy/kern/parts/rbf.py b/GPy/kern/parts/rbf.py
index 89f6894c..4247eb9c 100644
--- a/GPy/kern/parts/rbf.py
+++ b/GPy/kern/parts/rbf.py
@@ -159,7 +159,7 @@ class RBF(Kernpart):
         if self.ARD:
             self.lengthscales.gradient += self._dL_dlengthscales_via_K(dL_dKmm, Z, None)
         else:
-            self.lengthscale.gradient += (self.variance / self.lengthscale) * np.sum(self._K_dvar * self._K_dist2 * dL_dK)
+            self.lengthscale.gradient += (self.variance / self.lengthscale) * np.sum(self._K_dvar * self._K_dist2 * dL_dKmm)
 
     def gradients_X(self, dL_dK, X, X2, target):
         #if self._X is None or X.base is not self._X.base or X2 is not None:
diff --git a/GPy/models/sparse_gp_regression.py b/GPy/models/sparse_gp_regression.py
index 88b0d435..386380b7 100644
--- a/GPy/models/sparse_gp_regression.py
+++ b/GPy/models/sparse_gp_regression.py
@@ -43,7 +43,7 @@ class SparseGPRegression(SparseGP):
 
         likelihood = likelihoods.Gaussian()
 
-        SparseGP.__init__(self, X, Y, Z, kernel, likelihood)
+        SparseGP.__init__(self, X, Y, Z, kernel, likelihood, X_variance=X_variance)
         self.ensure_default_constraints()
 
     def _getstate(self):
diff --git a/GPy/util/linalg.py b/GPy/util/linalg.py
index b8c6a1df..44f3700d 100644
--- a/GPy/util/linalg.py
+++ b/GPy/util/linalg.py
@@ -41,6 +41,17 @@ else:
         _blas_available = False
         warnings.warn("warning: caught this exception:" + str(e))
 
+def dtrtri(L, lower=0):
+    """
+    Wrapper for lapack dtrtri function
+    Inverse of L
+
+    :param L: Triangular Matrix L
+    :param lower: is matrix lower (true) or upper (false)
+    :returns: Li, info
+    """
+    return lapack.dtrtri(L, lower=lower)
+
 def dtrtrs(A, B, lower=0, trans=0, unitdiag=0):
     """
     Wrapper for lapack dtrtrs function

From 2ff7e286ee990a83576e8a3922b5fb253025d7d2 Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Wed, 5 Feb 2014 10:56:48 +0000
Subject: [PATCH 10/43] more work on the Laplace approx

---
 .../latent_function_inference/__init__.py     |   1 +
 .../latent_function_inference/laplace.py      | 192 ++++++++----------
 2 files changed, 82 insertions(+), 111 deletions(-)

diff --git a/GPy/inference/latent_function_inference/__init__.py b/GPy/inference/latent_function_inference/__init__.py
index fe9dd819..5184f0b4 100644
--- a/GPy/inference/latent_function_inference/__init__.py
+++ b/GPy/inference/latent_function_inference/__init__.py
@@ -24,4 +24,5 @@ etc.
 """
 
 from exact_gaussian_inference import ExactGaussianInference
+from laplace import LaplaceInference
 expectation_propagation = 'foo' # TODO
diff --git a/GPy/inference/latent_function_inference/laplace.py b/GPy/inference/latent_function_inference/laplace.py
index e5165da6..a6d3c9b5 100644
--- a/GPy/inference/latent_function_inference/laplace.py
+++ b/GPy/inference/latent_function_inference/laplace.py
@@ -11,15 +11,13 @@
 #http://gaussianprocess.org/gpml/code.
 
 import numpy as np
-import scipy as sp
-from likelihood import likelihood
-from ..util.linalg import mdot, jitchol, pddet, dpotrs
+from ...util.linalg import mdot, jitchol, pddet, dpotrs
 from functools import partial as partial_func
 from posterior import Posterior
 import warnings
+from scipy import optimize
 
 class LaplaceInference(object):
-    """Laplace approximation to a posterior"""
 
     def __init__(self):
         """
@@ -29,8 +27,11 @@ class LaplaceInference(object):
         (using Newton-Raphson) of the unnormalised posterior
 
         """
-        #Inital values
-        self.NORMAL_CONST = ((0.5 * self.N) * np.log(2 * np.pi))
+        self.NORMAL_CONST = (0.5 * np.log(2 * np.pi))
+
+        self._mode_finding_tolerance = 1e-7
+        self._mode_finding_max_iter = 40
+        self.bad_fhat = True
 
 
     def inference(self, kern, X, likelihood, Y, Y_metadata=None):
@@ -38,16 +39,17 @@ class LaplaceInference(object):
         Returns a Posterior class containing essential quantities of the posterior
         """
 
-        self.N, self.D = self.data.shape
-        self.restart()
-
         # Compute K
-        self.K = kern.K(X)
-        self.data = Y
-        self.N, self.D = Y.shape
+        K = kern.K(X)
 
         #Find mode
-        self.f_hat = self.rasm_mode(self.K)
+        if self.bad_fhat:
+            Ki_f_init = np.random.randn(*Y.shape)/50
+        else:
+            Ki_f_init = self._previous_Ki_fhat
+        self.f_hat, self._previous_Ki_fhat = self.rasm_mode(K, Y, likelihood, Ki_f_init, Y_metadata=Y_metadata)
+
+        stop
 
         #Compute hessian and other variables at mode
         self._compute_likelihood_variables()
@@ -58,25 +60,11 @@ class LaplaceInference(object):
 
         return Posterior(mean=self.f_hat, cov=self.Sigma, K=self.K), log_marginal_approx, {'dL_dK':dL_dK}
 
-    def restart(self):
-        """
-        Reset likelihood variables to their defaults
-        """
-        #Initial values for the GP variables
-        self.Y = np.zeros((self.N, 1))
-        self.covariance_matrix = np.eye(self.N)
-        self.precision = np.ones(self.N)[:, None]
-        self.Z = 0
-        self.YYT = None
-
-        self.old_Ki_f = None
-        self.bad_fhat = False
-
     def _shared_gradients_components(self):
         """
         A helper function to compute some common quantities
         """
-        d3lik_d3fhat = self.noise_model.d3logpdf_df3(self.f_hat, self.data, extra_data=self.extra_data)
+        d3lik_d3fhat = likelihood.d3logpdf_df3(self.f_hat, self.data, extra_data=self.extra_data)
         dL_dfhat = 0.5*(np.diag(self.Ki_W_i)[:, None]*d3lik_d3fhat).T #why isn't this -0.5?
         I_KW_i = np.eye(self.N) - np.dot(self.K, self.Wi_K_i)
         return dL_dfhat, I_KW_i
@@ -111,7 +99,7 @@ class LaplaceInference(object):
         :rtype: array of derivatives (1 x num_likelihood_params)
         """
         dL_dfhat, I_KW_i = self._shared_gradients_components()
-        dlik_dthetaL, dlik_grad_dthetaL, dlik_hess_dthetaL = self.noise_model._laplace_gradients(self.f_hat, self.data, extra_data=self.extra_data)
+        dlik_dthetaL, dlik_grad_dthetaL, dlik_hess_dthetaL = likelihood._laplace_gradients(self.f_hat, self.data, extra_data=self.extra_data)
 
         num_params = len(self._get_param_names())
         # make space for one derivative for each likelihood parameter
@@ -135,19 +123,19 @@ class LaplaceInference(object):
         At the mode, compute the hessian and effective covaraince matrix.
         """
         #At this point get the hessian matrix (or vector as W is diagonal)
-        self.W = -self.noise_model.d2logpdf_df2(self.f_hat, self.data, extra_data=self.extra_data)
+        self.W = -likelihood.d2logpdf_df2(self.f_hat, self.data, extra_data=self.extra_data)
 
-        if not self.noise_model.log_concave:
+        if not likelihood.log_concave:
             #print "Under 1e-10: {}".format(np.sum(self.W < 1e-6))
             self.W[self.W < 1e-6] = 1e-6  # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
 
-        self.W12BiW12, self.ln_B_det = self._compute_B_statistics(self.K, self.W, np.eye(self.N))
+        self.W12BiW12, self.ln_B_det = self._compute_B_statistics(self.K, self.W, np.eye(self.N), likelihood.log_concave)
 
         self.Ki_f = self.Ki_f
         self.f_Ki_f = np.dot(self.f_hat.T, self.Ki_f)
         self.Ki_W_i = self.K - mdot(self.K, self.W12BiW12, self.K)
 
-    def _compute_B_statistics(self, K, W, a):
+    def _compute_B_statistics(self, K, W, a, log_concave):
         """
         Rasmussen suggests the use of a numerically stable positive definite matrix B
         Which has a positive diagonal element and can be easyily inverted
@@ -160,7 +148,7 @@ class LaplaceInference(object):
         :type a: Matrix NxN
         :returns: (W12BiW12, ln_B_det)
         """
-        if not self.noise_model.log_concave:
+        if not log_concave:
             #print "Under 1e-10: {}".format(np.sum(W < 1e-6))
             W[W < 1e-6] = 1e-6  # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
                                 # If the likelihood is non-log-concave. We wan't to say that there is a negative variance
@@ -170,14 +158,14 @@ class LaplaceInference(object):
 
         #W is diagonal so its sqrt is just the sqrt of the diagonal elements
         W_12 = np.sqrt(W)
-        B = np.eye(self.N) + W_12*K*W_12.T
+        B = np.eye(K.shape[0]) + W_12*K*W_12.T
         L = jitchol(B)
 
         W12BiW12a = W_12*dpotrs(L, np.asfortranarray(W_12*a), lower=1)[0]
-        ln_B_det = 2*np.sum(np.log(np.diag(L)))
+        ln_B_det = 2.*np.sum(np.log(np.diag(L)))
         return W12BiW12a, ln_B_det
 
-    def rasm_mode(self, K, MAX_ITER=40):
+    def rasm_mode(self, K, Y, likelihood, Ki_f_init, Y_metadata=None):
         """
         Rasmussen's numerically stable mode finding
         For nomenclature see Rasmussen & Williams 2006
@@ -185,110 +173,92 @@ class LaplaceInference(object):
 
         :param K: Covariance matrix evaluated at locations X
         :type K: NxD matrix
-        :param MAX_ITER: Maximum number of iterations of newton-raphson before forcing finish of optimisation
-        :type MAX_ITER: scalar
+        :param Y: The data
+        :type Y: np.ndarray
+        :param likelihood: the likelihood of the latent function value for the given data
+        :type likelihood: a GPy.likelihood object
+        :param Ki_f_init: the initial guess at the mode
+        :type Ki_f_init: np.ndarray
+        :param Y_metadata: information about the data, e.g. which likelihood to take from a multi-likelihood object
+        :type Y_metadata: np.ndarray | None
         :returns: f_hat, mode on which to make laplace approxmiation
-        :rtype: NxD matrix
+        :rtype: np.ndarray
         """
-        #old_Ki_f = np.zeros((self.N, 1))
 
-        #Start f's at zero originally of if we have gone off track, try restarting
-        if self.old_Ki_f is None or self.bad_fhat:
-            old_Ki_f = np.random.rand(self.N, 1)/50.0
-            #old_Ki_f = self.Y
-            f = np.dot(K, old_Ki_f)
-        else:
-            #Start at the old best point
-            old_Ki_f = self.old_Ki_f.copy()
-            f = self.f_hat.copy()
+        ##Start f's at zero originally or if we have gone off track, try restarting
+        #if self.old_Ki_f is None or self.bad_fhat:
+            #old_Ki_f = np.random.rand(self.N, 1)/50.0
+            ##old_Ki_f = self.Y
+            #f = np.dot(K, old_Ki_f)
+        #else:
+            ##Start at the old best point
+            #old_Ki_f = self.old_Ki_f.copy()
+            #f = self.f_hat.copy()
 
-        new_obj = -np.inf
-        old_obj = np.inf
+        Ki_f = Ki_f_init.copy()
+        f = np.dot(K, Ki_f)
 
+
+        #define the objective function (to be maximised)
         def obj(Ki_f, f):
-            return -0.5*np.dot(Ki_f.T, f) + self.noise_model.logpdf(f, self.data, extra_data=self.extra_data)
+            return -0.5*np.dot(Ki_f.T, f) + likelihood.logpdf(f, Y, extra_data=Y_metadata)
 
         difference = np.inf
-        epsilon = 1e-7
-        #step_size = 1
-        #rs = 0
         i = 0
-
-        while difference > epsilon and i < MAX_ITER:
-            W = -self.noise_model.d2logpdf_df2(f, self.data, extra_data=self.extra_data)
+        while difference > self._mode_finding_tolerance and i < self._mode_finding_max_iter:
+            W = -likelihood.d2logpdf_df2(f, Y, extra_data=Y_metadata)
 
             W_f = W*f
-            grad = self.noise_model.dlogpdf_df(f, self.data, extra_data=self.extra_data)
+            grad = likelihood.dlogpdf_df(f, Y, extra_data=Y_metadata)
 
             b = W_f + grad
-            W12BiW12Kb, _ = self._compute_B_statistics(K, W.copy(), np.dot(K, b))
+            W12BiW12Kb, _ = self._compute_B_statistics(K, W.copy(), np.dot(K, b), likelihood.log_concave)
 
             #Work out the DIRECTION that we want to move in, but don't choose the stepsize yet
             full_step_Ki_f = b - W12BiW12Kb
-            dKi_f = full_step_Ki_f - old_Ki_f
+            dKi_f = full_step_Ki_f - Ki_f
 
-            f_old = f.copy()
-            def inner_obj(step_size, old_Ki_f, dKi_f, K):
-                Ki_f = old_Ki_f + step_size*dKi_f
-                f = np.dot(K, Ki_f)
-                # This is nasty, need to set something within an optimization though
-                self.tmp_Ki_f = Ki_f.copy()
-                self.tmp_f = f.copy()
-                return -obj(Ki_f, f)
+            #define an objective for the line search
+            def inner_obj(step_size):
+                Ki_f_trial = Ki_f + step_size*dKi_f
+                f_trial = np.dot(K, Ki_f_trial)
+                print -obj(Ki_f_trial, f_trial),
+                return -obj(Ki_f_trial, f_trial)
 
-            i_o = partial_func(inner_obj, old_Ki_f=old_Ki_f, dKi_f=dKi_f, K=K)
+            #use scipy for the line search, the compute new values of f, Ki_f
+            step = optimize.brent(inner_obj, tol=1e-4, maxiter=12)
+            Ki_f_new = Ki_f + step*dKi_f
+            f_new = np.dot(K, Ki_f_new)
+
+            print ""
+            print obj(Ki_f, f), obj(Ki_f_new, f_new), step
+            print ""
+
+            #i_o = partial_func(inner_obj, old_Ki_f=old_Ki_f, dKi_f=dKi_f, K=K)
             #Find the stepsize that minimizes the objective function using a brent line search
             #The tolerance and maxiter matter for speed! Seems to be best to keep them low and make more full
             #steps than get this exact then make a step, if B was bigger it might be the other way around though
             #new_obj = sp.optimize.minimize_scalar(i_o, method='brent', tol=1e-4, options={'maxiter':5}).fun
-            new_obj = sp.optimize.brent(i_o, tol=1e-4, maxiter=10)
-            f = self.tmp_f.copy()
-            Ki_f = self.tmp_Ki_f.copy()
+            #new_obj = sp.optimize.brent(i_o, tol=1e-4, maxiter=10)
+            #f = self.tmp_f.copy()
+            #Ki_f = self.tmp_Ki_f.copy()
 
-            #Optimize without linesearch
-            #f_old = f.copy()
-            #update_passed = False
-            #while not update_passed:
-                #Ki_f = old_Ki_f + step_size*dKi_f
-                #f = np.dot(K, Ki_f)
-
-                #old_obj = new_obj
-                #new_obj = obj(Ki_f, f)
-                #difference = new_obj - old_obj
-                ##print "difference: ",difference
-                #if difference < 0:
-                    ##print "Objective function rose", np.float(difference)
-                    ##If the objective function isn't rising, restart optimization
-                    #step_size *= 0.8
-                    ##print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size)
-                    ##objective function isn't increasing, try reducing step size
-                    #f = f_old.copy() #it's actually faster not to go back to old location and just zigzag across the mode
-                    #old_obj = new_obj
-                    #rs += 1
-                #else:
-                    #update_passed = True
-
-            #old_Ki_f = self.Ki_f.copy()
-
-            #difference = abs(new_obj - old_obj)
-            #old_obj = new_obj.copy()
-            difference = np.abs(np.sum(f - f_old)) + np.abs(np.sum(Ki_f - old_Ki_f))
-            #difference = np.abs(np.sum(Ki_f - old_Ki_f))/np.float(self.N)
-            old_Ki_f = Ki_f.copy()
+            difference = np.abs(np.sum(f_new - f)) + np.abs(np.sum(Ki_f_new - Ki_f))
+            Ki_f = Ki_f_new
+            f = f_new
             i += 1
 
-        self.old_Ki_f = old_Ki_f.copy()
 
         #Warn of bad fits
-        if difference > epsilon:
+        if difference > self._mode_finding_tolerance:
+            if not self.bad_fhat:
+                warnings.warn("Not perfect f_hat fit difference: {}".format(difference))
             self.bad_fhat = True
-            warnings.warn("Not perfect f_hat fit difference: {}".format(difference))
         elif self.bad_fhat:
             self.bad_fhat = False
-            warnings.warn("f_hat now perfect again")
+            warnings.warn("f_hat now fine again")
 
-        self.Ki_f = Ki_f
-        return f
+        return f, Ki_f
 
     def _compute_GP_variables(self):
         """
@@ -328,7 +298,7 @@ class LaplaceInference(object):
 
         self.Wi_K_i = self.W12BiW12
         ln_det_Wi_K = pddet(self.Sigma_tilde + self.K)
-        lik = self.noise_model.logpdf(self.f_hat, self.data, extra_data=self.extra_data)
+        lik = likelihood.logpdf(self.f_hat, self.data, extra_data=self.extra_data)
         y_Wi_K_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde)
 
         Z_tilde = (+ lik

From f653bc430eb3db491052a5973621963110418750 Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Wed, 5 Feb 2014 16:23:35 +0000
Subject: [PATCH 11/43] an afternoon's work on the laplace approximation

---
 GPy/core/gp.py                                |   5 +-
 GPy/core/model.py                             |   1 -
 GPy/core/parameterization/parameterized.py    |   4 +-
 GPy/core/sparse_gp.py                         |   6 +-
 .../latent_function_inference/laplace.py      | 328 ++++++------------
 .../latent_function_inference/posterior.py    |   1 +
 6 files changed, 120 insertions(+), 225 deletions(-)

diff --git a/GPy/core/gp.py b/GPy/core/gp.py
index 060b617a..031ed16e 100644
--- a/GPy/core/gp.py
+++ b/GPy/core/gp.py
@@ -55,9 +55,9 @@ class GP(Model):
 
         self.add_parameter(self.kern)
         self.add_parameter(self.likelihood)
-
         self.parameters_changed()
 
+
     def parameters_changed(self):
         self.posterior, self._log_marginal_likelihood, grad_dict = self.inference_method.inference(self.kern, self.X, self.likelihood, self.Y)
         self._dL_dK = grad_dict['dL_dK']
@@ -65,9 +65,6 @@ class GP(Model):
     def log_likelihood(self):
         return self._log_marginal_likelihood
 
-    def dL_dtheta_K(self):
-        return self.kern.dK_dtheta(self.posterior.dL_dK, self.X)
-
     def _raw_predict(self, _Xnew, which_parts='all', full_cov=False, stop=False):
         """
         Internal helper function for making predictions, does not account
diff --git a/GPy/core/model.py b/GPy/core/model.py
index f4de0405..dc108641 100644
--- a/GPy/core/model.py
+++ b/GPy/core/model.py
@@ -325,7 +325,6 @@ class Model(Parameterized):
             if self._fail_count >= self._allowed_failures:
                 raise e
             self._fail_count += 1
-            import ipdb;ipdb.set_trace()
             obj_grads = np.clip(-self._transform_gradients(self._log_likelihood_gradients() + self._log_prior_gradients()), -1e100, 1e100)
         return obj_grads
 
diff --git a/GPy/core/parameterization/parameterized.py b/GPy/core/parameterization/parameterized.py
index 5a04fbfd..c2025202 100644
--- a/GPy/core/parameterization/parameterized.py
+++ b/GPy/core/parameterization/parameterized.py
@@ -74,6 +74,7 @@ class Parameterized(Constrainable, Pickleable, Observable):
         self.size = sum(p.size for p in self._parameters_)
         if not self._has_fixes():
             self._fixes_ = None
+        self._param_slices_ = []
         self._connect_parameters()
         self._added_names_ = set()
         del self._in_init_
@@ -213,7 +214,6 @@ class Parameterized(Constrainable, Pickleable, Observable):
             return
         i = 0
         sizes = [0]
-        self._param_slices_ = []
         for p in self._parameters_:
             p._direct_parent_ = self
             p._highest_parent_ = self
@@ -315,7 +315,7 @@ class Parameterized(Constrainable, Pickleable, Observable):
         return n
     def _get_params(self):
         # don't overwrite this anymore!
-        return numpy.hstack([x._get_params() for x in self._parameters_])
+        return numpy.hstack([x._get_params() for x in self._parameters_ if x.size>0])
     def _set_params(self, params, update=True):
         # don't overwrite this anymore!
         [p._set_params(params[s], update=update) for p,s in itertools.izip(self._parameters_,self._param_slices_)]
diff --git a/GPy/core/sparse_gp.py b/GPy/core/sparse_gp.py
index ab1f3bf0..089f2b38 100644
--- a/GPy/core/sparse_gp.py
+++ b/GPy/core/sparse_gp.py
@@ -42,8 +42,8 @@ class SparseGP(GP):
             raise NotImplementedError, "what to do what to do?"
             print "defaulting to ", inference_method, "for latent function inference"
 
+        self.Z = Param('inducing inputs', Z)
 
-        self.Z = Z
         self.num_inducing = Z.shape[0]
 
         if not (X_variance is None):
@@ -52,10 +52,8 @@ class SparseGP(GP):
 
         GP.__init__(self, X, Y, kernel, likelihood, inference_method=inference_method, name=name)
 
-        self.Z = Param('inducing inputs', self.Z)
         self.add_parameter(self.Z, index=0)
-        self.add_parameter(self.kern)
-        self.add_parameter(self.likelihood)
+
 
     def parameters_changed(self):
         self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.X_variance, self.Z, self.likelihood, self.Y)
diff --git a/GPy/inference/latent_function_inference/laplace.py b/GPy/inference/latent_function_inference/laplace.py
index a6d3c9b5..9a66d3b6 100644
--- a/GPy/inference/latent_function_inference/laplace.py
+++ b/GPy/inference/latent_function_inference/laplace.py
@@ -11,7 +11,8 @@
 #http://gaussianprocess.org/gpml/code.
 
 import numpy as np
-from ...util.linalg import mdot, jitchol, pddet, dpotrs
+from ...util.linalg import mdot, jitchol, pddet, dpotrs, dtrtrs
+from ...util.misc import param_to_array
 from functools import partial as partial_func
 from posterior import Posterior
 import warnings
@@ -27,7 +28,6 @@ class LaplaceInference(object):
         (using Newton-Raphson) of the unnormalised posterior
 
         """
-        self.NORMAL_CONST = (0.5 * np.log(2 * np.pi))
 
         self._mode_finding_tolerance = 1e-7
         self._mode_finding_max_iter = 40
@@ -39,58 +39,133 @@ class LaplaceInference(object):
         Returns a Posterior class containing essential quantities of the posterior
         """
 
+        #make Y a normal array!
+        Y = param_to_array(Y)
+
         # Compute K
         K = kern.K(X)
 
         #Find mode
         if self.bad_fhat:
-            Ki_f_init = np.random.randn(*Y.shape)/50
+            Ki_f_init = np.zeros_like(Y)
         else:
             Ki_f_init = self._previous_Ki_fhat
-        self.f_hat, self._previous_Ki_fhat = self.rasm_mode(K, Y, likelihood, Ki_f_init, Y_metadata=Y_metadata)
-
-        stop
+        f_hat, Ki_fhat = self.rasm_mode(K, Y, likelihood, Ki_f_init, Y_metadata=Y_metadata)
 
         #Compute hessian and other variables at mode
-        self._compute_likelihood_variables()
+        log_marginal, Ki_W_i, K_Wi_i, dL_dK, woodbury_vector = self.mode_computations(f_hat, Ki_fhat, K, Y, likelihood, Y_metadata)
 
-        likelihood.gradient = self.likelihood_gradients()
-        dL_dK = self._Kgradients()
-        kern.update_gradients_full(dL_dK)
+        #likelihood.gradient = self.likelihood_gradients()
+        kern.update_gradients_full(dL_dK, X)
 
-        return Posterior(mean=self.f_hat, cov=self.Sigma, K=self.K), log_marginal_approx, {'dL_dK':dL_dK}
+        self._previous_Ki_fhat = Ki_fhat.copy()
+        return Posterior(woodbury_vector=woodbury_vector, woodbury_inv = K_Wi_i, K=K), log_marginal, {'dL_dK':dL_dK}
 
-    def _shared_gradients_components(self):
+    def rasm_mode(self, K, Y, likelihood, Ki_f_init, Y_metadata=None):
         """
-        A helper function to compute some common quantities
-        """
-        d3lik_d3fhat = likelihood.d3logpdf_df3(self.f_hat, self.data, extra_data=self.extra_data)
-        dL_dfhat = 0.5*(np.diag(self.Ki_W_i)[:, None]*d3lik_d3fhat).T #why isn't this -0.5?
-        I_KW_i = np.eye(self.N) - np.dot(self.K, self.Wi_K_i)
-        return dL_dfhat, I_KW_i
+        Rasmussen's numerically stable mode finding
+        For nomenclature see Rasmussen & Williams 2006
+        Influenced by GPML (BSD) code, all errors are our own
 
-    def _Kgradients(self):
+        :param K: Covariance matrix evaluated at locations X
+        :type K: NxD matrix
+        :param Y: The data
+        :type Y: np.ndarray
+        :param likelihood: the likelihood of the latent function value for the given data
+        :type likelihood: a GPy.likelihood object
+        :param Ki_f_init: the initial guess at the mode
+        :type Ki_f_init: np.ndarray
+        :param Y_metadata: information about the data, e.g. which likelihood to take from a multi-likelihood object
+        :type Y_metadata: np.ndarray | None
+        :returns: f_hat, mode on which to make laplace approxmiation
+        :rtype: np.ndarray
         """
-        Gradients with respect to prior kernel parameters dL_dK to be chained
-        with dK_dthetaK to give dL_dthetaK
-        :returns: dL_dK matrix
-        :rtype: Matrix (1 x num_kernel_params)
-        """
-        dL_dfhat, I_KW_i = self._shared_gradients_components()
-        dlp = likelihood.dlogpdf_df(self.f_hat, Y, extra_data=None) # TODO: how will extra data work?
 
-        #Explicit
-        expl_a = np.dot(self.Ki_f, self.Ki_f.T)
-        expl_b = self.Wi_K_i
-        expl = 0.5*expl_a - 0.5*expl_b
-        dL_dthetaK_exp = dK_dthetaK(expl, X)
+        Ki_f = Ki_f_init.copy()
+        f = np.dot(K, Ki_f)
+
+
+        #define the objective function (to be maximised)
+        def obj(Ki_f, f):
+            return -0.5*np.dot(Ki_f.flatten(), f.flatten()) + likelihood.logpdf(f, Y, extra_data=Y_metadata)
+
+        difference = np.inf
+        iteration = 0
+        while difference > self._mode_finding_tolerance and iteration < self._mode_finding_max_iter:
+            W = -likelihood.d2logpdf_df2(f, Y, extra_data=Y_metadata)
+
+            W_f = W*f
+            grad = likelihood.dlogpdf_df(f, Y, extra_data=Y_metadata)
+
+            b = W_f + grad # R+W p46 line 6.
+            #W12BiW12Kb, B_logdet = self._compute_B_statistics(K, W.copy(), np.dot(K, b), likelihood.log_concave)
+            W12BiW12, _, _ = self._compute_B_statistics(K, W, likelihood.log_concave)
+            W12BiW12Kb = np.dot(W12BiW12, np.dot(K, b))
+
+            #Work out the DIRECTION that we want to move in, but don't choose the stepsize yet
+            full_step_Ki_f = b - W12BiW12Kb # full_step_Ki_f = a in R&W p46 line 6.
+            dKi_f = full_step_Ki_f - Ki_f
+
+            #define an objective for the line search (minimize this one)
+            def inner_obj(step_size):
+                Ki_f_trial = Ki_f + step_size*dKi_f
+                f_trial = np.dot(K, Ki_f_trial)
+                return -obj(Ki_f_trial, f_trial)
+
+            #use scipy for the line search, the compute new values of f, Ki_f
+            step = optimize.brent(inner_obj, tol=1e-4, maxiter=12)
+            Ki_f_new = Ki_f + step*dKi_f
+            f_new = np.dot(K, Ki_f_new)
+
+            difference = np.abs(np.sum(f_new - f)) + np.abs(np.sum(Ki_f_new - Ki_f))
+            Ki_f = Ki_f_new
+            f = f_new
+            iteration += 1
+
+        #Warn of bad fits
+        if difference > self._mode_finding_tolerance:
+            if not self.bad_fhat:
+                warnings.warn("Not perfect f_hat fit difference: {}".format(difference))
+            self.bad_fhat = True
+        elif self.bad_fhat:
+            self.bad_fhat = False
+            warnings.warn("f_hat now fine again")
+
+        return f, Ki_f
+
+
+    def mode_computations(self, f_hat, Ki_f, K, Y, likelihood, Y_metadata):
+        """
+        At the mode, compute the hessian and effective covariance matrix.
+
+        returns: logZ : approximation to the marginal likelihood
+        Cov : the approximation to the covariance matrix
+        """
+        #At this point get the hessian matrix (or vector as W is diagonal)
+        W = -likelihood.d2logpdf_df2(f_hat, Y, extra_data=Y_metadata)
+
+        K_Wi_i, L, LiW12 = self._compute_B_statistics(K, W, likelihood.log_concave)
+
+        #compute vital matrices
+        C = np.dot(LiW12, K)
+        Ki_W_i  = K - C.T.dot(C)
+
+        #compute the log marginal
+        log_marginal = -0.5*np.dot(Ki_f.flatten(), f_hat.flatten()) + likelihood.logpdf(f_hat, Y, extra_data=Y_metadata) - np.sum(np.log(np.diag(L)))
+
+        #compute dL_dK
+        explicit_part = 0.5*(np.dot(Ki_f, Ki_f.T) - K_Wi_i)
 
         #Implicit
-        impl = mdot(dlp, dL_dfhat, I_KW_i)
+        d3lik_d3fhat = likelihood.d3logpdf_df3(f_hat, Y, extra_data=Y_metadata)
+        dL_dfhat = 0.5*(np.diag(Ki_W_i)[:, None]*d3lik_d3fhat) #why isn't this -0.5? s2 in R&W p126 line 9.
+        woodbury_vector = likelihood.dlogpdf_df(f_hat, Y, extra_data=Y_metadata)
+        implicit_part = np.dot(woodbury_vector, dL_dfhat.T).dot(np.eye(Y.shape[0]) - np.dot(K, K_Wi_i))
 
-        dL_dK = expl + impl
+        dL_dK = explicit_part + implicit_part
+
+        return log_marginal, Ki_W_i, K_Wi_i, dL_dK, woodbury_vector
 
-        return dL_dK
 
     def likelihood_gradients(self):
         """
@@ -118,24 +193,7 @@ class LaplaceInference(object):
 
         return dL_dthetaL
 
-    def _compute_likelihood_variables(self):
-        """
-        At the mode, compute the hessian and effective covaraince matrix.
-        """
-        #At this point get the hessian matrix (or vector as W is diagonal)
-        self.W = -likelihood.d2logpdf_df2(self.f_hat, self.data, extra_data=self.extra_data)
-
-        if not likelihood.log_concave:
-            #print "Under 1e-10: {}".format(np.sum(self.W < 1e-6))
-            self.W[self.W < 1e-6] = 1e-6  # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
-
-        self.W12BiW12, self.ln_B_det = self._compute_B_statistics(self.K, self.W, np.eye(self.N), likelihood.log_concave)
-
-        self.Ki_f = self.Ki_f
-        self.f_Ki_f = np.dot(self.f_hat.T, self.Ki_f)
-        self.Ki_W_i = self.K - mdot(self.K, self.W12BiW12, self.K)
-
-    def _compute_B_statistics(self, K, W, a, log_concave):
+    def _compute_B_statistics(self, K, W, log_concave):
         """
         Rasmussen suggests the use of a numerically stable positive definite matrix B
         Which has a positive diagonal element and can be easyily inverted
@@ -144,9 +202,7 @@ class LaplaceInference(object):
         :type K: NxN matrix
         :param W: Negative hessian at a point (diagonal matrix)
         :type W: Vector of diagonal values of hessian (1xN)
-        :param a: Matrix to calculate W12BiW12a
-        :type a: Matrix NxN
-        :returns: (W12BiW12, ln_B_det)
+        :returns: (W12BiW12, L_B, Li_W12)
         """
         if not log_concave:
             #print "Under 1e-10: {}".format(np.sum(W < 1e-6))
@@ -155,169 +211,13 @@ class LaplaceInference(object):
                                 # To cause the posterior to become less certain than the prior and likelihood,
                                 # This is a property only held by non-log-concave likelihoods
 
-
         #W is diagonal so its sqrt is just the sqrt of the diagonal elements
         W_12 = np.sqrt(W)
         B = np.eye(K.shape[0]) + W_12*K*W_12.T
         L = jitchol(B)
 
-        W12BiW12a = W_12*dpotrs(L, np.asfortranarray(W_12*a), lower=1)[0]
-        ln_B_det = 2.*np.sum(np.log(np.diag(L)))
-        return W12BiW12a, ln_B_det
-
-    def rasm_mode(self, K, Y, likelihood, Ki_f_init, Y_metadata=None):
-        """
-        Rasmussen's numerically stable mode finding
-        For nomenclature see Rasmussen & Williams 2006
-        Influenced by GPML (BSD) code, all errors are our own
-
-        :param K: Covariance matrix evaluated at locations X
-        :type K: NxD matrix
-        :param Y: The data
-        :type Y: np.ndarray
-        :param likelihood: the likelihood of the latent function value for the given data
-        :type likelihood: a GPy.likelihood object
-        :param Ki_f_init: the initial guess at the mode
-        :type Ki_f_init: np.ndarray
-        :param Y_metadata: information about the data, e.g. which likelihood to take from a multi-likelihood object
-        :type Y_metadata: np.ndarray | None
-        :returns: f_hat, mode on which to make laplace approxmiation
-        :rtype: np.ndarray
-        """
-
-        ##Start f's at zero originally or if we have gone off track, try restarting
-        #if self.old_Ki_f is None or self.bad_fhat:
-            #old_Ki_f = np.random.rand(self.N, 1)/50.0
-            ##old_Ki_f = self.Y
-            #f = np.dot(K, old_Ki_f)
-        #else:
-            ##Start at the old best point
-            #old_Ki_f = self.old_Ki_f.copy()
-            #f = self.f_hat.copy()
-
-        Ki_f = Ki_f_init.copy()
-        f = np.dot(K, Ki_f)
-
-
-        #define the objective function (to be maximised)
-        def obj(Ki_f, f):
-            return -0.5*np.dot(Ki_f.T, f) + likelihood.logpdf(f, Y, extra_data=Y_metadata)
-
-        difference = np.inf
-        i = 0
-        while difference > self._mode_finding_tolerance and i < self._mode_finding_max_iter:
-            W = -likelihood.d2logpdf_df2(f, Y, extra_data=Y_metadata)
-
-            W_f = W*f
-            grad = likelihood.dlogpdf_df(f, Y, extra_data=Y_metadata)
-
-            b = W_f + grad
-            W12BiW12Kb, _ = self._compute_B_statistics(K, W.copy(), np.dot(K, b), likelihood.log_concave)
-
-            #Work out the DIRECTION that we want to move in, but don't choose the stepsize yet
-            full_step_Ki_f = b - W12BiW12Kb
-            dKi_f = full_step_Ki_f - Ki_f
-
-            #define an objective for the line search
-            def inner_obj(step_size):
-                Ki_f_trial = Ki_f + step_size*dKi_f
-                f_trial = np.dot(K, Ki_f_trial)
-                print -obj(Ki_f_trial, f_trial),
-                return -obj(Ki_f_trial, f_trial)
-
-            #use scipy for the line search, the compute new values of f, Ki_f
-            step = optimize.brent(inner_obj, tol=1e-4, maxiter=12)
-            Ki_f_new = Ki_f + step*dKi_f
-            f_new = np.dot(K, Ki_f_new)
-
-            print ""
-            print obj(Ki_f, f), obj(Ki_f_new, f_new), step
-            print ""
-
-            #i_o = partial_func(inner_obj, old_Ki_f=old_Ki_f, dKi_f=dKi_f, K=K)
-            #Find the stepsize that minimizes the objective function using a brent line search
-            #The tolerance and maxiter matter for speed! Seems to be best to keep them low and make more full
-            #steps than get this exact then make a step, if B was bigger it might be the other way around though
-            #new_obj = sp.optimize.minimize_scalar(i_o, method='brent', tol=1e-4, options={'maxiter':5}).fun
-            #new_obj = sp.optimize.brent(i_o, tol=1e-4, maxiter=10)
-            #f = self.tmp_f.copy()
-            #Ki_f = self.tmp_Ki_f.copy()
-
-            difference = np.abs(np.sum(f_new - f)) + np.abs(np.sum(Ki_f_new - Ki_f))
-            Ki_f = Ki_f_new
-            f = f_new
-            i += 1
-
-
-        #Warn of bad fits
-        if difference > self._mode_finding_tolerance:
-            if not self.bad_fhat:
-                warnings.warn("Not perfect f_hat fit difference: {}".format(difference))
-            self.bad_fhat = True
-        elif self.bad_fhat:
-            self.bad_fhat = False
-            warnings.warn("f_hat now fine again")
-
-        return f, Ki_f
-
-    def _compute_GP_variables(self):
-        """
-        Generate data Y which would give the normal distribution identical
-        to the laplace approximation to the posterior, but normalised
-
-        GPy expects a likelihood to be gaussian, so need to caluclate
-        the data Y^{\tilde} that makes the posterior match that found
-        by a laplace approximation to a non-gaussian likelihood but with
-        a gaussian likelihood
-
-        Firstly,
-        The hessian of the unormalised posterior distribution is (K^{-1} + W)^{-1},
-        i.e. z*N(f|f^{\hat}, (K^{-1} + W)^{-1}) but this assumes a non-gaussian likelihood,
-        we wish to find the hessian \Sigma^{\tilde}
-        that has the same curvature but using our new simulated data Y^{\tilde}
-        i.e. we do N(Y^{\tilde}|f^{\hat}, \Sigma^{\tilde})N(f|0, K) = z*N(f|f^{\hat}, (K^{-1} + W)^{-1})
-        and we wish to find what Y^{\tilde} and \Sigma^{\tilde}
-        We find that Y^{\tilde} = W^{-1}(K^{-1} + W)f^{\hat} and \Sigma^{tilde} = W^{-1}
-
-        Secondly,
-        GPy optimizes the log marginal log p(y) = -0.5*ln|K+\Sigma^{\tilde}| - 0.5*Y^{\tilde}^{T}(K^{-1} + \Sigma^{tilde})^{-1}Y + lik.Z
-        So we can suck up any differences between that and our log marginal likelihood approximation
-        p^{\squiggle}(y) = -0.5*f^{\hat}K^{-1}f^{\hat} + log p(y|f^{\hat}) - 0.5*log |K||K^{-1} + W|
-        which we want to optimize instead, by equating them and rearranging, the difference is added onto
-        the log p(y) that GPy optimizes by default
-
-        Thirdly,
-        Since we have gradients that depend on how we move f^{\hat}, we have implicit components
-        aswell as the explicit dL_dK, we hold these differences in dZ_dK and add them to dL_dK in the
-        gp.py code
-        """
-        Wi = 1.0/self.W
-        self.Sigma_tilde = np.diagflat(Wi)
-
-        Y_tilde = Wi*self.Ki_f + self.f_hat
-
-        self.Wi_K_i = self.W12BiW12
-        ln_det_Wi_K = pddet(self.Sigma_tilde + self.K)
-        lik = likelihood.logpdf(self.f_hat, self.data, extra_data=self.extra_data)
-        y_Wi_K_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde)
-
-        Z_tilde = (+ lik
-                   - 0.5*self.ln_B_det
-                   + 0.5*ln_det_Wi_K
-                   - 0.5*self.f_Ki_f
-                   + 0.5*y_Wi_K_i_y
-                   + self.NORMAL_CONST
-                  )
-
-        #Convert to float as its (1, 1) and Z must be a scalar
-        self.Z = np.float64(Z_tilde)
-        self.Y = Y_tilde
-        self.YYT = np.dot(self.Y, self.Y.T)
-        self.covariance_matrix = self.Sigma_tilde
-        self.precision = 1.0 / np.diag(self.covariance_matrix)[:, None]
-
-        #Compute dZ_dK which is how the approximated distributions gradients differ from the dL_dK computed for other likelihoods
-        self.dZ_dK = self._Kgradients()
-        #+ 0.5*self.Wi_K_i - 0.5*np.dot(self.Ki_f, self.Ki_f.T) #since we are not adding the K gradients explicit part theres no need to compute this again
+        LiW12, _ = dtrtrs(L, np.diag(W_12[:,0]), lower=1, trans=0)
+        K_Wi_i = np.dot(LiW12.T, LiW12) # R = W12BiW12, in R&W p 126, eq 5.25
 
+        return K_Wi_i, L, LiW12
 
diff --git a/GPy/inference/latent_function_inference/posterior.py b/GPy/inference/latent_function_inference/posterior.py
index c0974dc5..a0f4104c 100644
--- a/GPy/inference/latent_function_inference/posterior.py
+++ b/GPy/inference/latent_function_inference/posterior.py
@@ -48,6 +48,7 @@ class Posterior(object):
 
         if ((woodbury_chol is not None) and (woodbury_vector is not None))\
                 or ((woodbury_inv is not None) and (woodbury_vector is not None))\
+                or ((woodbury_inv is not None) and (mean is not None))\
                 or ((mean is not None) and (cov is not None)):
             pass # we have sufficient to compute the posterior
         else:

From 80629e00b694a809aed6ea8a03ce91ab5c9c9591 Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Wed, 5 Feb 2014 17:12:52 +0000
Subject: [PATCH 12/43] fiddling with plotting

---
 GPy/core/gp.py                                   |  5 +++--
 GPy/core/parameterization/parameterized.py       |  1 +
 .../latent_function_inference/posterior.py       | 16 ++++++++++------
 GPy/likelihoods/bernoulli.py                     |  7 ++++---
 GPy/plotting/matplot_dep/models_plots.py         |  4 ++--
 5 files changed, 20 insertions(+), 13 deletions(-)

diff --git a/GPy/core/gp.py b/GPy/core/gp.py
index 6959ba79..65295521 100644
--- a/GPy/core/gp.py
+++ b/GPy/core/gp.py
@@ -58,6 +58,7 @@ class GP(Model):
         self.parameters_changed()
 
     def parameters_changed(self):
+        print self.kern
         self.posterior, self._log_marginal_likelihood, grad_dict = self.inference_method.inference(self.kern, self.X, self.likelihood, self.Y)
         self._dL_dK = grad_dict['dL_dK']
 
@@ -75,8 +76,8 @@ class GP(Model):
 
         """
         Kx = self.kern.K(_Xnew, self.X, which_parts=which_parts).T
-        LiKx, _ = dtrtrs(self.posterior._woodbury_chol, np.asfortranarray(Kx), lower=1)
-        mu = np.dot(Kx.T, self.posterior._woodbury_vector)
+        LiKx, _ = dtrtrs(self.posterior.woodbury_chol, np.asfortranarray(Kx), lower=1)
+        mu = np.dot(Kx.T, self.posterior.woodbury_vector)
         if full_cov:
             Kxx = self.kern.K(_Xnew, which_parts=which_parts)
             var = Kxx - tdot(LiKx.T)
diff --git a/GPy/core/parameterization/parameterized.py b/GPy/core/parameterization/parameterized.py
index c2025202..7011f33d 100644
--- a/GPy/core/parameterization/parameterized.py
+++ b/GPy/core/parameterization/parameterized.py
@@ -214,6 +214,7 @@ class Parameterized(Constrainable, Pickleable, Observable):
             return
         i = 0
         sizes = [0]
+        self._param_slices_ = []
         for p in self._parameters_:
             p._direct_parent_ = self
             p._highest_parent_ = self
diff --git a/GPy/inference/latent_function_inference/posterior.py b/GPy/inference/latent_function_inference/posterior.py
index a0f4104c..c3aa9b36 100644
--- a/GPy/inference/latent_function_inference/posterior.py
+++ b/GPy/inference/latent_function_inference/posterior.py
@@ -74,7 +74,7 @@ class Posterior(object):
     @property
     def mean(self):
         if self._mean is None:
-            self._mean = np.dot(self._K, self._woodbury_vector)
+            self._mean = np.dot(self._K, self.woodbury_vector)
         return self._mean
 
     @property
@@ -93,10 +93,14 @@ class Posterior(object):
     @property
     def woodbury_chol(self):
         if self._woodbury_chol is None:
-            B = self._K - self._covariance
-            tmp, _ = dpotrs(self._K_chol, B)
-            Wi, _ = dpotrs(self._K_chol, tmp.T)
-            _, _, self._woodbury_chol, _ = pdinv(Wi)
+            #try computing woodbury chol from cov
+            if self._woodbury_inv is not None:
+                _, _, self._woodbury_chol, _ = pdinv(self._woodbury_inv)
+            elif self._covariance is not None:
+                B = self._K - self._covariance
+                tmp, _ = dpotrs(self.K_chol, B)
+                self._woodbury_inv, _ = dpotrs(self.K_chol, tmp.T)
+                _, _, self._woodbury_chol, _ = pdinv(self._woodbury_inv)
         return self._woodbury_chol
 
     @property
@@ -109,7 +113,7 @@ class Posterior(object):
     @property
     def woodbury_vector(self):
         if self._woodbury_vector is None:
-            self._woodbury_vector, _ = dpotrs(self._K_chol, self.mean)
+            self._woodbury_vector, _ = dpotrs(self.K_chol, self.mean)
         return self._woodbury_vector
 
     @property
diff --git a/GPy/likelihoods/bernoulli.py b/GPy/likelihoods/bernoulli.py
index 062528ca..00626cd3 100644
--- a/GPy/likelihoods/bernoulli.py
+++ b/GPy/likelihoods/bernoulli.py
@@ -78,7 +78,7 @@ class Bernoulli(Likelihood):
 
         return Z_hat, mu_hat, sigma2_hat
 
-    def _predictive_mean_analytical(self, mu, variance):
+    def predictive_mean(self, mu, variance):
 
         if isinstance(self.gp_link, link_functions.Probit):
             return stats.norm.cdf(mu/np.sqrt(1+variance))
@@ -89,12 +89,13 @@ class Bernoulli(Likelihood):
         else:
             raise NotImplementedError
 
-    def _predictive_variance_analytical(self, mu, variance, pred_mean):
+    def predictive_variance(self, mu, variance, pred_mean):
 
         if isinstance(self.gp_link, link_functions.Heaviside):
             return 0.
         else:
-            raise NotImplementedError
+            return np.nan
+            #raise NotImplementedError
 
     def pdf_link(self, link_f, y, extra_data=None):
         """
diff --git a/GPy/plotting/matplot_dep/models_plots.py b/GPy/plotting/matplot_dep/models_plots.py
index b3178d0c..0aa1d4a4 100644
--- a/GPy/plotting/matplot_dep/models_plots.py
+++ b/GPy/plotting/matplot_dep/models_plots.py
@@ -156,11 +156,11 @@ def plot_fit(model, plot_limits=None, which_data_rows='all',
         raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
 
 
-def plot_f_fit(model, *args, **kwargs):
+def plot_fit_f(model, *args, **kwargs):
     """
     Plot the GP's view of the world, where the data is normalized and before applying a likelihood.
 
     All args and kwargs are passed on to models_plots.plot.
     """
     kwargs['plot_raw'] = True
-    plot(model,*args, **kwargs)
+    plot_fit(model,*args, **kwargs)

From 75241ecf89523bc273144153820f014787c3e7d6 Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Wed, 5 Feb 2014 17:52:17 +0000
Subject: [PATCH 13/43] prediction working nicely for laplace

---
 GPy/core/gp.py                                     |  9 ++++++---
 GPy/inference/latent_function_inference/laplace.py | 11 +++++++++--
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/GPy/core/gp.py b/GPy/core/gp.py
index 65295521..57e9715a 100644
--- a/GPy/core/gp.py
+++ b/GPy/core/gp.py
@@ -76,14 +76,17 @@ class GP(Model):
 
         """
         Kx = self.kern.K(_Xnew, self.X, which_parts=which_parts).T
-        LiKx, _ = dtrtrs(self.posterior.woodbury_chol, np.asfortranarray(Kx), lower=1)
+        #LiKx, _ = dtrtrs(self.posterior.woodbury_chol, np.asfortranarray(Kx), lower=1)
+        WiKx = np.dot(self.posterior.woodbury_inv, Kx)
         mu = np.dot(Kx.T, self.posterior.woodbury_vector)
         if full_cov:
             Kxx = self.kern.K(_Xnew, which_parts=which_parts)
-            var = Kxx - tdot(LiKx.T)
+            #var = Kxx - tdot(LiKx.T)
+            var = np.dot(Kx.T, WiKx)
         else:
             Kxx = self.kern.Kdiag(_Xnew, which_parts=which_parts)
-            var = Kxx - np.sum(LiKx*LiKx, 0)
+            #var = Kxx - np.sum(LiKx*LiKx, 0)
+            var = Kxx - np.sum(WiKx*Kx, 0)
             var = var.reshape(-1, 1)
         return mu, var
 
diff --git a/GPy/inference/latent_function_inference/laplace.py b/GPy/inference/latent_function_inference/laplace.py
index 9a66d3b6..6e252406 100644
--- a/GPy/inference/latent_function_inference/laplace.py
+++ b/GPy/inference/latent_function_inference/laplace.py
@@ -11,7 +11,7 @@
 #http://gaussianprocess.org/gpml/code.
 
 import numpy as np
-from ...util.linalg import mdot, jitchol, pddet, dpotrs, dtrtrs
+from ...util.linalg import mdot, jitchol, pddet, dpotrs, dtrtrs, dpotri, symmetrify
 from ...util.misc import param_to_array
 from functools import partial as partial_func
 from posterior import Posterior
@@ -216,8 +216,15 @@ class LaplaceInference(object):
         B = np.eye(K.shape[0]) + W_12*K*W_12.T
         L = jitchol(B)
 
-        LiW12, _ = dtrtrs(L, np.diag(W_12[:,0]), lower=1, trans=0)
+        LiW12, _ = dtrtrs(L, np.diagflat(W_12), lower=1, trans=0)
         K_Wi_i = np.dot(LiW12.T, LiW12) # R = W12BiW12, in R&W p 126, eq 5.25
 
+        #here's a better way to compute the required matrix. 
+        # you could do the model finding witha backsub, instead of a dot...
+        #L2 = L/W_12
+        #K_Wi_i_2 , _= dpotri(L2)
+        #symmetrify(K_Wi_i_2)
+
+
         return K_Wi_i, L, LiW12
 

From 6346af8764a3d80a074689c94796a0c1baac936a Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Thu, 6 Feb 2014 09:28:27 +0000
Subject: [PATCH 14/43] assorted fixes

---
 GPy/core/gp.py                             | 1 -
 GPy/core/model.py                          | 6 ++----
 GPy/core/parameterization/array_core.py    | 3 ++-
 GPy/core/parameterization/parameterized.py | 7 ++++++-
 GPy/core/sparse_gp.py                      | 1 -
 5 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/GPy/core/gp.py b/GPy/core/gp.py
index 57e9715a..ab725897 100644
--- a/GPy/core/gp.py
+++ b/GPy/core/gp.py
@@ -58,7 +58,6 @@ class GP(Model):
         self.parameters_changed()
 
     def parameters_changed(self):
-        print self.kern
         self.posterior, self._log_marginal_likelihood, grad_dict = self.inference_method.inference(self.kern, self.X, self.likelihood, self.Y)
         self._dL_dK = grad_dict['dL_dK']
 
diff --git a/GPy/core/model.py b/GPy/core/model.py
index dc108641..a1b2abe4 100644
--- a/GPy/core/model.py
+++ b/GPy/core/model.py
@@ -170,15 +170,13 @@ class Model(Parameterized):
         # first take care of all parameters (from N(0,1))
         #x = self._get_params_transformed()
         x = np.random.randn(self.size_transformed)
-        self._set_params_transformed(x)
+        x = self._untransform_params(x)
         # now draw from prior where possible
-        x = self._get_params()
-        if self.priors is not None:
+        if self.priors is not None and len(self.priors):
             [np.put(x, i, p.rvs(1)) for i, p in enumerate(self.priors) if not p is None]
         self._set_params(x)
         #self._set_params_transformed(self._get_params_transformed()) # makes sure all of the tied parameters get the same init (since there's only one prior object...)
 
-
     def optimize_restarts(self, num_restarts=10, robust=False, verbose=True, parallel=False, num_processes=None, **kwargs):
         """
         Perform random restarts of the model, and set the model to the best
diff --git a/GPy/core/parameterization/array_core.py b/GPy/core/parameterization/array_core.py
index 4e3c515d..12c68bb6 100644
--- a/GPy/core/parameterization/array_core.py
+++ b/GPy/core/parameterization/array_core.py
@@ -27,6 +27,7 @@ class ObservableArray(ListArray, Observable):
     """
     __array_priority__ = 0 # Never give back Param
     def __new__(cls, input_array):
+        cls.__name__ = "ObservableArray\n     "
         obj = super(ObservableArray, cls).__new__(cls, input_array).view(cls)
         obj._observers_ = {}
         return obj
@@ -48,4 +49,4 @@ class ObservableArray(ListArray, Observable):
     def __getslice__(self, start, stop):
         return self.__getitem__(slice(start, stop))
     def __setslice__(self, start, stop, val):
-        return self.__setitem__(slice(start, stop), val)  
\ No newline at end of file
+        return self.__setitem__(slice(start, stop), val)  
diff --git a/GPy/core/parameterization/parameterized.py b/GPy/core/parameterization/parameterized.py
index 7011f33d..8f5c9bf0 100644
--- a/GPy/core/parameterization/parameterized.py
+++ b/GPy/core/parameterization/parameterized.py
@@ -316,7 +316,10 @@ class Parameterized(Constrainable, Pickleable, Observable):
         return n
     def _get_params(self):
         # don't overwrite this anymore!
+        if not self.size:
+            return np.empty(shape=(0,), dtype=np.float64)
         return numpy.hstack([x._get_params() for x in self._parameters_ if x.size>0])
+
     def _set_params(self, params, update=True):
         # don't overwrite this anymore!
         [p._set_params(params[s], update=update) for p,s in itertools.izip(self._parameters_,self._param_slices_)]
@@ -330,10 +333,12 @@ class Parameterized(Constrainable, Pickleable, Observable):
         return p
     def _set_params_transformed(self, p):
         # inverse apply transformations for parameters and set the resulting parameters
+        self._set_params(self._untransform_params(p))
+    def _untransform_params(self, p):
         p = p.copy()
         if self._has_fixes(): tmp = self._get_params(); tmp[self._fixes_] = p; p = tmp; del tmp
         [numpy.put(p, ind, c.f(p[ind])) for c,ind in self.constraints.iteritems() if c != __fixed__]
-        self._set_params(p)
+        return p
     def _name_changed(self, param, old_name):
         if hasattr(self, old_name) and old_name in self._added_names_:
             delattr(self, old_name)
diff --git a/GPy/core/sparse_gp.py b/GPy/core/sparse_gp.py
index 193d9adf..fda201ff 100644
--- a/GPy/core/sparse_gp.py
+++ b/GPy/core/sparse_gp.py
@@ -43,7 +43,6 @@ class SparseGP(GP):
             print "defaulting to ", inference_method, "for latent function inference"
 
         self.Z = Param('inducing inputs', Z)
-
         self.num_inducing = Z.shape[0]
 
         if not (X_variance is None):

From ef3bfa46764561a2846f529371bf4199abf32780 Mon Sep 17 00:00:00 2001
From: Max Zwiessele <ibinbei@gmail.com>
Date: Thu, 6 Feb 2014 12:00:43 +0000
Subject: [PATCH 15/43] highest parent fix

---
 GPy/core/parameterization/param.py         |  2 ++
 GPy/core/parameterization/parameterized.py | 17 +++++++++++------
 2 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/GPy/core/parameterization/param.py b/GPy/core/parameterization/param.py
index 64f0a36d..462210dc 100644
--- a/GPy/core/parameterization/param.py
+++ b/GPy/core/parameterization/param.py
@@ -153,6 +153,8 @@ class Param(ObservableArray, Constrainable):
     @property
     def _parameters_(self):
         return []
+    def _connect_highest_parent(self, highest_parent):
+        self._highest_parent_ = highest_parent
     def _collect_gradient(self, target):
         target[:] = self.gradient.flat
     #===========================================================================
diff --git a/GPy/core/parameterization/parameterized.py b/GPy/core/parameterization/parameterized.py
index 7011f33d..6feed470 100644
--- a/GPy/core/parameterization/parameterized.py
+++ b/GPy/core/parameterization/parameterized.py
@@ -217,13 +217,9 @@ class Parameterized(Constrainable, Pickleable, Observable):
         self._param_slices_ = []
         for p in self._parameters_:
             p._direct_parent_ = self
-            p._highest_parent_ = self
             p._parent_index_ = i
+            p._connect_highest_parent(self)
             i += 1
-            for pi in p.flattened_parameters:
-                pi._highest_parent_ = self
-            for pi in p._parameters_:
-                pi._highest_parent_ = self
             not_unique = []
             sizes.append(p.size+sizes[-1])
             self._param_slices_.append(slice(sizes[-2], sizes[-1]))
@@ -237,7 +233,16 @@ class Parameterized(Constrainable, Pickleable, Observable):
             elif not (pname in not_unique):
                 self.__dict__[pname] = p
                 self._added_names_.add(pname)
-            
+        
+    def _connect_highest_parent(self, highest_parent):
+        self._highest_parent_ = highest_parent
+        if not hasattr(self, "_parameters_") or len(self._parameters_) < 1:
+            # no parameters for this class
+            return
+        for p in self._parameters_:
+            p._highest_parent_ = highest_parent
+            p._connect_highest_parent(highest_parent)
+        
     #===========================================================================
     # Pickling operations
     #===========================================================================

From 54272d22bb88d8cc741105597409352fd4c9928a Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Thu, 6 Feb 2014 12:33:00 +0000
Subject: [PATCH 16/43] first crack at a caching object

---
 GPy/core/parameterization/array_core.py     |  4 ++
 GPy/core/parameterization/parameter_core.py |  3 +-
 GPy/kern/parts/prod.py                      | 41 ++++++++-------------
 GPy/util/__init__.py                        |  1 +
 4 files changed, 23 insertions(+), 26 deletions(-)

diff --git a/GPy/core/parameterization/array_core.py b/GPy/core/parameterization/array_core.py
index 12c68bb6..4c31f23b 100644
--- a/GPy/core/parameterization/array_core.py
+++ b/GPy/core/parameterization/array_core.py
@@ -50,3 +50,7 @@ class ObservableArray(ListArray, Observable):
         return self.__getitem__(slice(start, stop))
     def __setslice__(self, start, stop, val):
         return self.__setitem__(slice(start, stop), val)  
+    def __copy__(self, *args):
+        return ObservableArray(self.base.base.copy(*args))
+    def copy(self, *args):
+        return self.__copy__(*args)
diff --git a/GPy/core/parameterization/parameter_core.py b/GPy/core/parameterization/parameter_core.py
index d63c6ea8..a826b10c 100644
--- a/GPy/core/parameterization/parameter_core.py
+++ b/GPy/core/parameterization/parameter_core.py
@@ -14,11 +14,12 @@ class Observable(object):
     _observers_ = {}
     def add_observer(self, observer, callble):
         self._observers_[observer] = callble
-        callble(self)
+        #callble(self)
     def remove_observer(self, observer):
         del self._observers_[observer]
     def _notify_observers(self):
         [callble(self) for callble in self._observers_.itervalues()]
+
         
 class Pickleable(object):
     def _getstate(self):
diff --git a/GPy/kern/parts/prod.py b/GPy/kern/parts/prod.py
index 2569c51c..62eed2aa 100644
--- a/GPy/kern/parts/prod.py
+++ b/GPy/kern/parts/prod.py
@@ -19,36 +19,21 @@ class Prod(Kernpart):
     """
     def __init__(self,k1,k2,tensor=False):
         if tensor:
-            super(Prod, self).__init__(k1.input_dim + k2.input_dim, '['+k1.name + '**' + k2.name +']')
+            super(Prod, self).__init__(k1.input_dim + k2.input_dim, k1.name + '_xx_' + k2.name)
+            self.slice1 = slice(0,k1.input_dim)
+            self.slice2 = slice(k1.input_dim,k1.input_dim+k2.input_dim)
         else:
-            assert k1.input_dim == k2.input_dim, "Error: The input spaces of the kernels to sum don't have the same dimension."
-            super(Prod, self).__init__(k1.input_dim, '['+k1.name + '*' + k2.name +']')
-        #self.num_params = k1.num_params + k2.num_params
+            assert k1.input_dim == k2.input_dim, "Error: The input spaces of the kernels to multiply don't have the same dimension."
+            super(Prod, self).__init__(k1.input_dim, k1.name + '_x_' + k2.name)
+            self.slice1 = slice(0,self.input_dim)
+            self.slice2 = slice(0,self.input_dim)
         self.k1 = k1
         self.k2 = k2
-#         if tensor:
-#             self.slice1 = slice(0,self.k1.input_dim)
-#             self.slice2 = slice(self.k1.input_dim,self.k1.input_dim+self.k2.input_dim)
-#         else:
-#             self.slice1 = slice(0,self.input_dim)
-#             self.slice2 = slice(0,self.input_dim)
-
-        self._X, self._X2 = np.empty(shape=(2,1))
         self.add_parameters(self.k1, self.k2)
-#         self._set_params(np.hstack((k1._get_params(),k2._get_params())))
 
-#     def _get_params(self):
-#         """return the value of the parameters."""
-#         return np.hstack((self.k1._get_params(), self.k2._get_params()))
-# 
-#     def _set_params(self,x):
-#         """set the value of the parameters."""
-#         self.k1._set_params(x[:self.k1.num_params])
-#         self.k2._set_params(x[self.k1.num_params:])
-# 
-#     def _get_param_names(self):
-#         """return parameter names."""
-#         return [self.k1.name + '_' + param_name for param_name in self.k1._get_param_names()] + [self.k2.name + '_' + param_name for param_name in self.k2._get_param_names()]
+        #initialize cache
+        self._X, self._X2 = np.empty(shape=(2,1))
+        self._params = None
 
     def K(self,X,X2,target):
         self._K_computations(X,X2)
@@ -64,6 +49,11 @@ class Prod(Kernpart):
         self._K_computations(X, X2)
         return self._K2
 
+    def update_gradients_full(self, dL_dK, X):
+        self._K_computations(X, None)
+        self.k1.update_gradients_full(dL_dK*self._K2, X[:,self.slice1])
+        self.k2.update_gradients_full(dL_dK*self._K1, X[:,self.slice2])
+
     def dK_dtheta(self,dL_dK,X,X2,target):
         """Derivative of the covariance matrix with respect to the parameters."""
         self._K_computations(X,X2)
@@ -82,6 +72,7 @@ class Prod(Kernpart):
         self.k2.Kdiag(X[:,self.slice2],target2)
         target += target1 * target2
 
+
     def dKdiag_dtheta(self,dL_dKdiag,X,target):
         K1 = np.zeros(X.shape[0])
         K2 = np.zeros(X.shape[0])
diff --git a/GPy/util/__init__.py b/GPy/util/__init__.py
index c25b1349..5a335027 100644
--- a/GPy/util/__init__.py
+++ b/GPy/util/__init__.py
@@ -10,6 +10,7 @@ import datasets
 import mocap
 import decorators
 import classification
+import caching
 
 try:
     import sympy

From 1c9151a7d036624f14304334ff2703a5e0360f84 Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Thu, 6 Feb 2014 12:46:56 +0000
Subject: [PATCH 17/43] added caching framework

---
 GPy/util/caching.py | 41 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)
 create mode 100644 GPy/util/caching.py

diff --git a/GPy/util/caching.py b/GPy/util/caching.py
new file mode 100644
index 00000000..d8893021
--- /dev/null
+++ b/GPy/util/caching.py
@@ -0,0 +1,41 @@
+import numpy as np
+from ..core.parameterization.array_core import ObservableArray
+class Cacher(object):
+    def __init__(self, operation, limit=5):
+        self.limit = int(limit)
+        self.operation=operation
+        self.cached_inputs = []
+        self.cached_outputs = []
+        self.inputs_changed = []
+
+    def __call__(self, X):
+        assert isinstance(X, ObservableArray)
+        if X in self.cached_inputs:
+            i = self.cached_inputs.index(X)
+            if self.inputs_changed[i]:
+                self.cached_outputs[i] = self.operation(X)
+                self.inputs_changed[i] = False
+            return self.cached_outputs[i]
+        else:
+            if len(self.cached_inputs) == self.limit:
+                X_ = self.cached_inputs.pop(0)
+                X_.remove_observer(self)
+                self.inputs_changed.pop(0)
+                self.cached_outputs.pop(0)
+
+            self.cached_inputs.append(X)
+            self.cached_outputs.append(self.operation(X))
+            self.inputs_changed.append(False)
+            X.add_observer(self, self.on_cache_changed)
+            return self.cached_outputs[-1]
+
+    def on_cache_changed(self, X):
+        print id(X)
+        i = self.cached_inputs.index(X)
+        self.inputs_changed[i] = True
+
+                
+
+
+
+

From 0d52430ffb00489b1cf26b2b19c5e2e60f494ba1 Mon Sep 17 00:00:00 2001
From: Max Zwiessele <ibinbei@gmail.com>
Date: Thu, 6 Feb 2014 14:03:15 +0000
Subject: [PATCH 18/43] fixed product kernel copy error

---
 GPy/kern/kern.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/GPy/kern/kern.py b/GPy/kern/kern.py
index 68b26e3c..6a8bc745 100644
--- a/GPy/kern/kern.py
+++ b/GPy/kern/kern.py
@@ -180,8 +180,10 @@ class kern(Parameterized):
         :type tensor: bool
 
         """
-        K1 = self.copy()
-        K2 = other.copy()
+        K1 = self
+        K2 = other
+        #K1 = self.copy()
+        #K2 = other.copy()
 
         slices = []
         for sl1, sl2 in itertools.product(K1.input_slices, K2.input_slices):

From 1c7949a61c10bc83c19b6002f16715fe57d57093 Mon Sep 17 00:00:00 2001
From: Max Zwiessele <ibinbei@gmail.com>
Date: Thu, 6 Feb 2014 14:03:40 +0000
Subject: [PATCH 19/43] small changes to parameterization init

---
 GPy/core/parameterization/parameterized.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/GPy/core/parameterization/parameterized.py b/GPy/core/parameterization/parameterized.py
index dafb1018..7abaf4a3 100644
--- a/GPy/core/parameterization/parameterized.py
+++ b/GPy/core/parameterization/parameterized.py
@@ -69,8 +69,7 @@ class Parameterized(Constrainable, Pickleable, Observable):
         super(Parameterized, self).__init__(name=name)
         self._in_init_ = True
         self._constraints_ = None#ParameterIndexOperations()
-        if not hasattr(self, "_parameters_"):
-            self._parameters_ = []
+        self._parameters_ = []
         self.size = sum(p.size for p in self._parameters_)
         if not self._has_fixes():
             self._fixes_ = None
@@ -212,14 +211,12 @@ class Parameterized(Constrainable, Pickleable, Observable):
         if not hasattr(self, "_parameters_") or len(self._parameters_) < 1:
             # no parameters for this class
             return
-        i = 0
         sizes = [0]
         self._param_slices_ = []
-        for p in self._parameters_:
+        for i,p in enumerate(self._parameters_):
             p._direct_parent_ = self
             p._parent_index_ = i
             p._connect_highest_parent(self)
-            i += 1
             not_unique = []
             sizes.append(p.size+sizes[-1])
             self._param_slices_.append(slice(sizes[-2], sizes[-1]))
@@ -240,7 +237,6 @@ class Parameterized(Constrainable, Pickleable, Observable):
             # no parameters for this class
             return
         for p in self._parameters_:
-            p._highest_parent_ = highest_parent
             p._connect_highest_parent(highest_parent)
         
     #===========================================================================

From f5f2c56f7c1311093bb503cce96d6b42815d3403 Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Thu, 6 Feb 2014 14:36:24 +0000
Subject: [PATCH 20/43] spelling

---
 GPy/likelihoods/gaussian.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/GPy/likelihoods/gaussian.py b/GPy/likelihoods/gaussian.py
index c047e573..f77eafbe 100644
--- a/GPy/likelihoods/gaussian.py
+++ b/GPy/likelihoods/gaussian.py
@@ -3,7 +3,7 @@
 
 #TODO
 """
-A lot of this code assumes that the link functio nis the identity. 
+A lot of this code assumes that the link function is the identity. 
 
 I think laplace code is okay, but I'm quite sure that the EP moments will only work if the link is identity. 
 

From b12fb6a2a8161c17ebb675651a80f973f0655af2 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Thu, 6 Feb 2014 16:22:08 +0000
Subject: [PATCH 21/43] Fixed parameterized oddity where it was updating all
 constrained parameters as soon as any were constrained rather than after all
 are constrained

---
 GPy/core/parameterization/param.py          | 95 ++++++++++++---------
 GPy/core/parameterization/parameter_core.py | 48 +++++------
 GPy/core/parameterization/parameterized.py  | 13 +--
 3 files changed, 87 insertions(+), 69 deletions(-)

diff --git a/GPy/core/parameterization/param.py b/GPy/core/parameterization/param.py
index 462210dc..62108ab2 100644
--- a/GPy/core/parameterization/param.py
+++ b/GPy/core/parameterization/param.py
@@ -12,32 +12,32 @@ __index_name__ = "Index"
 __tie_name__ = "Tied to"
 __precision__ = numpy.get_printoptions()['precision'] # numpy printing precision used, sublassing numpy ndarray after all
 __print_threshold__ = 5
-######      
+######
 
 class Float(numpy.float64, Constrainable):
     def __init__(self, f, base):
         super(Float,self).__init__(f)
         self._base = base
-        
-        
+
+
 class Param(ObservableArray, Constrainable):
     """
     Parameter object for GPy models.
 
     :param name:        name of the parameter to be printed
     :param input_array: array which this parameter handles
-    
+
     You can add/remove constraints by calling constrain on the parameter itself, e.g:
-    
+
         - self[:,1].constrain_positive()
         - self[0].tie_to(other)
         - self.untie()
         - self[:3,:].unconstrain()
         - self[1].fix()
-        
+
     Fixing parameters will fix them to the value they are right now. If you change
     the fixed value, it will be fixed to the new value!
-    
+
     See :py:class:`GPy.core.parameterized.Parameterized` for more details on constraining etc.
 
     This ndarray can be stored in lists and checked if it is in.
@@ -46,7 +46,7 @@ class Param(ObservableArray, Constrainable):
     >>> x = np.random.normal(size=(10,3))
     >>> x in [[1], x, [3]]
     True
-    
+
     WARNING: This overrides the functionality of x==y!!!
     Use numpy.equal(x,y) for element-wise equality testing.
     """
@@ -68,7 +68,7 @@ class Param(ObservableArray, Constrainable):
 
     def __init__(self, name, input_array):
         super(Param, self).__init__(name=name)
-        
+
     def __array_finalize__(self, obj):
         # see InfoArray.__array_finalize__ for comments
         if obj is None: return
@@ -86,7 +86,7 @@ class Param(ObservableArray, Constrainable):
         self._original_ = getattr(obj, '_original_', None)
         self._name = getattr(obj, 'name', None)
         self.gradient = getattr(obj, 'gradient', None)
-        
+
     def __array_wrap__(self, out_arr, context=None):
         return out_arr.view(numpy.ndarray)
     #===========================================================================
@@ -94,7 +94,7 @@ class Param(ObservableArray, Constrainable):
     #===========================================================================
     def __reduce_ex__(self):
         func, args, state = super(Param, self).__reduce__()
-        return func, args, (state, 
+        return func, args, (state,
                             (self.name,
                              self._direct_parent_,
                              self._parent_index_,
@@ -132,13 +132,13 @@ class Param(ObservableArray, Constrainable):
         self.flat = param
         self._notify_tied_parameters()
         self._notify_observers()
-        
+
     def _get_params(self):
         return self.flat
 #     @property
 #     def name(self):
 #         """
-#         Name of this parameter. 
+#         Name of this parameter.
 #         This can be a callable without parameters. The callable will be called
 #         every time the name property is accessed.
 #         """
@@ -163,7 +163,7 @@ class Param(ObservableArray, Constrainable):
     def constrain_fixed(self, warning=True):
         """
         Constrain this paramter to be fixed to the current value it carries.
-        
+
         :param warning: print a warning for overwriting constraints.
         """
         self._highest_parent_._fix(self,warning)
@@ -179,18 +179,18 @@ class Param(ObservableArray, Constrainable):
     #===========================================================================
     def tie_to(self, param):
         """
-        :param param: the parameter object to tie this parameter to. 
+        :param param: the parameter object to tie this parameter to.
                       Can be ParamConcatenation (retrieved by regexp search)
-        
+
         Tie this parameter to the given parameter.
         Broadcasting is not allowed, but you can tie a whole dimension to
         one parameter:  self[:,0].tie_to(other), where other is a one-value
         parameter.
-        
+
         Note: For now only one parameter can have ties, so all of a parameter
               will be removed, when re-tieing!
         """
-        #Note: this method will tie to the parameter which is the last in 
+        #Note: this method will tie to the parameter which is the last in
         #      the chain of ties. Thus, if you tie to a tied parameter,
         #      this tie will be created to the parameter the param is tied
         #      to.
@@ -200,12 +200,12 @@ class Param(ObservableArray, Constrainable):
         if param.size != 1:
             raise NotImplementedError, "Broadcast tying is not implemented yet"
         try:
-            if self._original_: 
+            if self._original_:
                 self[:] = param
             else: # this happens when indexing created a copy of the array
                 self._direct_parent_._get_original(self)[self._current_slice_] = param
         except ValueError:
-            raise ValueError("Trying to tie {} with shape {} to {} with shape {}".format(self.name, self.shape, param.name, param.shape))            
+            raise ValueError("Trying to tie {} with shape {} to {} with shape {}".format(self.name, self.shape, param.name, param.shape))
         if param is self:
             raise RuntimeError, 'Cyclic tieing is not allowed'
 #         if len(param._tied_to_) > 0:
@@ -293,7 +293,7 @@ class Param(ObservableArray, Constrainable):
     def unset_prior(self, *priors):
         """
         :param priors: priors to remove from this parameter
-        
+
         Remove all priors from this parameter
         """
         self._highest_parent_._remove_prior(self, *priors)
@@ -324,7 +324,7 @@ class Param(ObservableArray, Constrainable):
             if numpy.all(si == Ellipsis):
                 continue
             if isinstance(si, slice):
-                a = si.indices(self._realshape_[i])[0] 
+                a = si.indices(self._realshape_[i])[0]
             elif isinstance(si, (list,numpy.ndarray,tuple)):
                 a = si[0]
             else: a = si
@@ -424,7 +424,7 @@ class Param(ObservableArray, Constrainable):
             slice_index = self._current_slice_
         if isinstance(slice_index, (tuple, list)):
             clean_curr_slice = [s for s in slice_index if numpy.any(s != Ellipsis)]
-            if (all(isinstance(n, (numpy.ndarray, list, tuple)) for n in clean_curr_slice) 
+            if (all(isinstance(n, (numpy.ndarray, list, tuple)) for n in clean_curr_slice)
                 and len(set(map(len,clean_curr_slice))) <= 1):
                 return numpy.fromiter(itertools.izip(*clean_curr_slice),
                     dtype=[('',int)]*self._realndim_,count=len(clean_curr_slice[0])).view((int, self._realndim_))
@@ -443,7 +443,7 @@ class Param(ObservableArray, Constrainable):
         if self._realsize_ < 2:
             return name
         ind = self._indices()
-        if ind.size > 4: indstr = ','.join(map(str,ind[:2])) + "..." + ','.join(map(str,ind[-2:])) 
+        if ind.size > 4: indstr = ','.join(map(str,ind[:2])) + "..." + ','.join(map(str,ind[-2:]))
         else: indstr = ','.join(map(str,ind))
         return name+'['+indstr+']'
     def __str__(self, constr_matrix=None, indices=None, ties=None, lc=None, lx=None, li=None, lt=None):
@@ -477,7 +477,7 @@ class ParamConcatenation(object):
         for p in params:
             for p in p.flattened_parameters:
                 if p not in self.params:
-                    self.params.append(p)           
+                    self.params.append(p)
         self._param_sizes = [p.size for p in self.params]
         startstops = numpy.cumsum([0] + self._param_sizes)
         self._param_slices_ = [slice(start, stop) for start,stop in zip(startstops, startstops[1:])]
@@ -485,15 +485,15 @@ class ParamConcatenation(object):
     # Get/set items, enable broadcasting
     #===========================================================================
     def __getitem__(self, s):
-        ind = numpy.zeros(sum(self._param_sizes), dtype=bool); ind[s] = True; 
+        ind = numpy.zeros(sum(self._param_sizes), dtype=bool); ind[s] = True;
         params = [p._get_params()[ind[ps]] for p,ps in zip(self.params, self._param_slices_) if numpy.any(p._get_params()[ind[ps]])]
         if len(params)==1: return params[0]
         return ParamConcatenation(params)
     def __setitem__(self, s, val, update=True):
-        ind = numpy.zeros(sum(self._param_sizes), dtype=bool); ind[s] = True; 
+        ind = numpy.zeros(sum(self._param_sizes), dtype=bool); ind[s] = True;
         vals = self._vals(); vals[s] = val; del val
-        [numpy.place(p, ind[ps], vals[ps]) and p._notify_tied_parameters() 
-         for p, ps in zip(self.params, self._param_slices_)]
+        [numpy.place(p, ind[ps], vals[ps]) and p._notify_tied_parameters()
+        for p, ps in zip(self.params, self._param_slices_)]
         if update:
             self.params[0]._highest_parent_.parameters_changed()
     def _vals(self):
@@ -501,38 +501,55 @@ class ParamConcatenation(object):
     #===========================================================================
     # parameter operations:
     #===========================================================================
+    def update_all_params(self):
+        self.params[0]._highest_parent_.parameters_changed()
+
     def constrain(self, constraint, warning=True):
-        [param.constrain(constraint) for param in self.params]
+        [param.constrain(constraint, update=False) for param in self.params]
+        self.update_all_params()
     constrain.__doc__ = Param.constrain.__doc__
+
     def constrain_positive(self, warning=True):
-        [param.constrain_positive(warning) for param in self.params]
+        [param.constrain_positive(warning, update=False) for param in self.params]
+        self.update_all_params()
     constrain_positive.__doc__ = Param.constrain_positive.__doc__
+
     def constrain_fixed(self, warning=True):
         [param.constrain_fixed(warning) for param in self.params]
     constrain_fixed.__doc__ = Param.constrain_fixed.__doc__
     fix = constrain_fixed
+
     def constrain_negative(self, warning=True):
-        [param.constrain_negative(warning) for param in self.params]
+        [param.constrain_negative(warning, update=False) for param in self.params]
+        self.update_all_params()
     constrain_negative.__doc__ = Param.constrain_negative.__doc__
+
     def constrain_bounded(self, lower, upper, warning=True):
-        [param.constrain_bounded(lower, upper, warning) for param in self.params]
+        [param.constrain_bounded(lower, upper, warning, update=False) for param in self.params]
+        self.update_all_params()
     constrain_bounded.__doc__ = Param.constrain_bounded.__doc__
+
     def unconstrain(self, *constraints):
         [param.unconstrain(*constraints) for param in self.params]
     unconstrain.__doc__ = Param.unconstrain.__doc__
+
     def unconstrain_negative(self):
         [param.unconstrain_negative() for param in self.params]
     unconstrain_negative.__doc__ = Param.unconstrain_negative.__doc__
+
     def unconstrain_positive(self):
         [param.unconstrain_positive() for param in self.params]
     unconstrain_positive.__doc__ = Param.unconstrain_positive.__doc__
+
     def unconstrain_fixed(self):
         [param.unconstrain_fixed() for param in self.params]
     unconstrain_fixed.__doc__ = Param.unconstrain_fixed.__doc__
     unfix = unconstrain_fixed
+
     def unconstrain_bounded(self, lower, upper):
         [param.unconstrain_bounded(lower, upper) for param in self.params]
     unconstrain_bounded.__doc__ = Param.unconstrain_bounded.__doc__
+
     def untie(self, *ties):
         [param.untie(*ties) for param in self.params]
     __lt__ = lambda self, val: self._vals()<val
@@ -557,9 +574,9 @@ class ParamConcatenation(object):
         return "\n{}\n".format(" -"+"- | -".join(['-'*l for l in [li,lx,lc,lt]])).join(strings)
     def __repr__(self):
         return "\n".join(map(repr,self.params))
-    
+
 if __name__ == '__main__':
-    
+
 
     from GPy.core.parameterized import Parameterized
     from GPy.core.parameter import Param
@@ -570,16 +587,16 @@ if __name__ == '__main__':
     p = Param("q_mean", X)
     p1 = Param("q_variance", numpy.random.rand(*p.shape))
     p2 = Param("Y", numpy.random.randn(p.shape[0],1))
-    
+
     p3 = Param("variance", numpy.random.rand())
     p4 = Param("lengthscale", numpy.random.rand(2))
-    
+
     m = Parameterized()
     rbf = Parameterized(name='rbf')
-    
+
     rbf.add_parameter(p3,p4)
     m.add_parameter(p,p1,rbf)
-    
+
     print "setting params"
     #print m.q_v[3:5,[1,4,5]]
     print "constraining variance"
diff --git a/GPy/core/parameterization/parameter_core.py b/GPy/core/parameterization/parameter_core.py
index a826b10c..81fb16d9 100644
--- a/GPy/core/parameterization/parameter_core.py
+++ b/GPy/core/parameterization/parameter_core.py
@@ -20,7 +20,7 @@ class Observable(object):
     def _notify_observers(self):
         [callble(self) for callble in self._observers_.itervalues()]
 
-        
+
 class Pickleable(object):
     def _getstate(self):
         """
@@ -36,9 +36,9 @@ class Pickleable(object):
         Set the state (memento pattern) of this class to the given state.
         Usually this is just the counterpart to _getstate, such that
         an object is a copy of another when calling
-    
+
             copy = <classname>.__new__(*args,**kw)._setstate(<to_be_copied>._getstate())
-            
+
         See python doc "pickling" (`__getstate__` and `__setstate__`) for details.
         """
         raise NotImplementedError, "To be able to use pickling you need to implement this method"
@@ -49,14 +49,14 @@ class Pickleable(object):
 
 class Parentable(object):
     def __init__(self, direct_parent=None, highest_parent=None, parent_index=None):
-        super(Parentable,self).__init__()        
+        super(Parentable,self).__init__()
         self._direct_parent_ = direct_parent
         self._parent_index_ = parent_index
         self._highest_parent_ = highest_parent
-        
+
     def has_parent(self):
         return self._direct_parent_ is not None and self._highest_parent_ is not None
-    
+
 class Nameable(Parentable):
     _name = None
     def __init__(self, name, direct_parent=None, highest_parent=None, parent_index=None):
@@ -69,10 +69,10 @@ class Nameable(Parentable):
     @name.setter
     def name(self, name):
         from_name = self.name
-        self._name = name        
+        self._name = name
         if self.has_parent():
             self._direct_parent_._name_changed(self, from_name)
-            
+
 class Constrainable(Nameable):
     def __init__(self, name):
         super(Constrainable,self).__init__(name)
@@ -84,7 +84,7 @@ class Constrainable(Nameable):
         :param transform: the :py:class:`GPy.core.transformations.Transformation`
                           to constrain the this parameter to.
         :param warning: print a warning if re-constraining parameters.
-        
+
         Constrain the parameter to the given
         :py:class:`GPy.core.transformations.Transformation`.
         """
@@ -97,37 +97,37 @@ class Constrainable(Nameable):
                 self._add_constrain(p, transform, warning)
             if update:
                 self.parameters_changed()
-                
-    def constrain_positive(self, warning=True):
+
+    def constrain_positive(self, warning=True, update=True):
         """
         :param warning: print a warning if re-constraining parameters.
-        
+
         Constrain this parameter to the default positive constraint.
         """
-        self.constrain(Logexp(), warning)
+        self.constrain(Logexp(), warning=warning, update=update)
 
-    def constrain_negative(self, warning=True):
+    def constrain_negative(self, warning=True, update=True):
         """
         :param warning: print a warning if re-constraining parameters.
-        
+
         Constrain this parameter to the default negative constraint.
         """
-        self.constrain(NegativeLogexp(), warning)
+        self.constrain(NegativeLogexp(), warning=warning, update=update)
 
-    def constrain_bounded(self, lower, upper, warning=True):
+    def constrain_bounded(self, lower, upper, warning=True, update=True):
         """
         :param lower, upper: the limits to bound this parameter to
         :param warning: print a warning if re-constraining parameters.
-        
+
         Constrain this parameter to lie within the given range.
         """
-        self.constrain(Logistic(lower, upper), warning)
+        self.constrain(Logistic(lower, upper), warning=warning, update=update)
 
     def unconstrain(self, *transforms):
         """
         :param transforms: The transformations to unconstrain from.
-        
-        remove all :py:class:`GPy.core.transformations.Transformation` 
+
+        remove all :py:class:`GPy.core.transformations.Transformation`
         transformats of this parameter object.
         """
         if self.has_parent():
@@ -138,20 +138,20 @@ class Constrainable(Nameable):
 
     def unconstrain_positive(self):
         """
-        Remove positive constraint of this parameter. 
+        Remove positive constraint of this parameter.
         """
         self.unconstrain(Logexp())
 
     def unconstrain_negative(self):
         """
-        Remove negative constraint of this parameter. 
+        Remove negative constraint of this parameter.
         """
         self.unconstrain(NegativeLogexp())
 
     def unconstrain_bounded(self, lower, upper):
         """
         :param lower, upper: the limits to unbound this parameter from
-        
+
         Remove (lower, upper) bounded constrain from this parameter/
         """
         self.unconstrain(Logistic(lower, upper))
diff --git a/GPy/core/parameterization/parameterized.py b/GPy/core/parameterization/parameterized.py
index dafb1018..4198e22c 100644
--- a/GPy/core/parameterization/parameterized.py
+++ b/GPy/core/parameterization/parameterized.py
@@ -233,7 +233,7 @@ class Parameterized(Constrainable, Pickleable, Observable):
             elif not (pname in not_unique):
                 self.__dict__[pname] = p
                 self._added_names_.add(pname)
-        
+
     def _connect_highest_parent(self, highest_parent):
         self._highest_parent_ = highest_parent
         if not hasattr(self, "_parameters_") or len(self._parameters_) < 1:
@@ -242,7 +242,7 @@ class Parameterized(Constrainable, Pickleable, Observable):
         for p in self._parameters_:
             p._highest_parent_ = highest_parent
             p._connect_highest_parent(highest_parent)
-        
+
     #===========================================================================
     # Pickling operations
     #===========================================================================
@@ -413,10 +413,10 @@ class Parameterized(Constrainable, Pickleable, Observable):
     #===========================================================================
     # Fixing parameters:
     #===========================================================================
-    def _fix(self, param, warning=True):
+    def _fix(self, param, warning=True, update=True):
         f = self._add_constrain(param, __fixed__, warning)
         self._set_fixed(f)
-    def _unfix(self, param):
+    def _unfix(self, param, update=True):
         if self._has_fixes():
             f = self._remove_constrain(param, __fixed__)
             self._set_unfixed(f)
@@ -438,7 +438,8 @@ class Parameterized(Constrainable, Pickleable, Observable):
         # if advanced indexing is activated it happens that the array is a copy
         # you can retrieve the original param through this method, by passing
         # the copy here
-        return self._parameters_[param._parent_index_]
+        #return self._parameters_[param._parent_index_]
+        return param._direct_parent_._parameters_[param._parent_index_]
     def hirarchy_name(self):
         if self.has_parent():
             return self._direct_parent_.hirarchy_name() + adjust_name_for_printing(self.name) + "."
@@ -452,7 +453,7 @@ class Parameterized(Constrainable, Pickleable, Observable):
         # if removing constraints before adding new is not wanted, just delete the above line!
         self.constraints.add(transform, rav_i)
         param = self._get_original(param)
-        param._set_params(transform.initialize(param._get_params()))
+        param._set_params(transform.initialize(param._get_params()), update=False)
         if warning and any(reconstrained):
             # if you want to print the whole params object, which was reconstrained use:
             # m = str(param[self._backtranslate_index(param, reconstrained)])

From 87dab55fe11648278c57ea20aa250e3af7d398c3 Mon Sep 17 00:00:00 2001
From: Max Zwiessele <ibinbei@gmail.com>
Date: Thu, 6 Feb 2014 16:22:45 +0000
Subject: [PATCH 22/43] adjusted periodic exponential to the new
 parameterization

---
 GPy/kern/parts/periodic_exponential.py | 32 +++++++++++++-------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/GPy/kern/parts/periodic_exponential.py b/GPy/kern/parts/periodic_exponential.py
index 201def6d..d8c193e0 100644
--- a/GPy/kern/parts/periodic_exponential.py
+++ b/GPy/kern/parts/periodic_exponential.py
@@ -6,6 +6,7 @@ from kernpart import Kernpart
 import numpy as np
 from GPy.util.linalg import mdot
 from GPy.util.decorators import silence_errors
+from GPy.core.parameterization.param import Param
 
 class PeriodicExponential(Kernpart):
     """
@@ -25,9 +26,9 @@ class PeriodicExponential(Kernpart):
 
     """
 
-    def __init__(self, input_dim=1, variance=1., lengthscale=None, period=2 * np.pi, n_freq=10, lower=0., upper=4 * np.pi):
+    def __init__(self, input_dim=1, variance=1., lengthscale=None, period=2 * np.pi, n_freq=10, lower=0., upper=4 * np.pi, name='periodic_exp'):
+        super(PeriodicExponential, self).__init__(input_dim, name)
         assert input_dim==1, "Periodic kernels are only defined for input_dim=1"
-        self.name = 'periodic_exp'
         self.input_dim = input_dim
         if lengthscale is not None:
             lengthscale = np.asarray(lengthscale)
@@ -38,7 +39,11 @@ class PeriodicExponential(Kernpart):
         self.num_params = 3
         self.n_freq = n_freq
         self.n_basis = 2*n_freq
-        self._set_params(np.hstack((variance,lengthscale,period)))
+        self.variance = Param('variance', variance)
+        self.lengthscale = Param('lengthscale', lengthscale)
+        self.period = Param('period', period)
+        self.parameters_changed()
+        #self._set_params(np.hstack((variance,lengthscale,period)))
 
     def _cos(self,alpha,omega,phase):
         def f(x):
@@ -61,30 +66,25 @@ class PeriodicExponential(Kernpart):
         Gint = np.dot(r1,r2.T)/2 * np.where(np.isnan(Gint1),Gint2,Gint1)
         return Gint
 
-    def _get_params(self):
-        """return the value of the parameters."""
-        return np.hstack((self.variance,self.lengthscale,self.period))
+    #def _get_params(self):
+    #    """return the value of the parameters."""
+    #    return np.hstack((self.variance,self.lengthscale,self.period))
 
-    def _set_params(self,x):
+    def parameters_changed(self):
         """set the value of the parameters."""
-        assert x.size==3
-        self.variance = x[0]
-        self.lengthscale = x[1]
-        self.period = x[2]
-
         self.a = [1./self.lengthscale, 1.]
         self.b = [1]
 
         self.basis_alpha = np.ones((self.n_basis,))
-        self.basis_omega = np.array(sum([[i*2*np.pi/self.period]*2 for i in  range(1,self.n_freq+1)],[]))
+        self.basis_omega = np.array(sum([[i*2*np.pi/self.period]*2 for i in  range(1,self.n_freq+1)],[]))[:,0]
         self.basis_phi =   np.array(sum([[-np.pi/2, 0.]  for i in range(1,self.n_freq+1)],[]))
 
         self.G = self.Gram_matrix()
         self.Gi = np.linalg.inv(self.G)
 
-    def _get_param_names(self):
-        """return parameter names."""
-        return ['variance','lengthscale','period']
+    #def _get_param_names(self):
+    #    """return parameter names."""
+    #    return ['variance','lengthscale','period']
 
     def Gram_matrix(self):
         La = np.column_stack((self.a[0]*np.ones((self.n_basis,1)),self.a[1]*self.basis_omega))

From 7d2295d8546d5a824cdc44e375820d75fff776d1 Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Thu, 6 Feb 2014 18:17:04 +0000
Subject: [PATCH 23/43] some messing with fitc

---
 .../latent_function_inference/fitc.py         | 93 ++++++++-----------
 1 file changed, 37 insertions(+), 56 deletions(-)

diff --git a/GPy/inference/latent_function_inference/fitc.py b/GPy/inference/latent_function_inference/fitc.py
index d5aa80bc..ac725294 100644
--- a/GPy/inference/latent_function_inference/fitc.py
+++ b/GPy/inference/latent_function_inference/fitc.py
@@ -1,73 +1,54 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
-import numpy as np
-from ...util.linalg import mdot, jitchol, chol_inv, tdot, dtrtrs
-from ...core import SparseGP
+class VarDTC(object):
+    def __init__(self):
+        self.const_jitter = 1e-6
 
-class FITC(SparseGP):
-    """
+    def inference(self, kern, X, X_variance, Z, likelihood, Y):
 
-    Sparse FITC approximation
+        num_inducing, _ = Z.shape
+        num_data, output_dim = Y.shape
 
-    :param X: inputs
-    :type X: np.ndarray (num_data x Q)
-    :param likelihood: a likelihood instance, containing the observed data
-    :type likelihood: GPy.likelihood.(Gaussian | EP)
-    :param kernel: the kernel (covariance function). See link kernels
-    :type kernel: a GPy.kern.kern instance
-    :param Z: inducing inputs (optional, see note)
-    :type Z: np.ndarray (M x Q) | None
-    :param normalize_(X|Y): whether to normalize the data before computing (predictions will be in original scales)
-    :type normalize_(X|Y): bool
+        #make sure we're not using variational uncertain inputs
+        assert = X_variance is None, "variational inducing inputs only for use with varDTC inference"
+        #Note: we can't do the variational thing after making the FITC conditional approximation because K~ appears in the log determinant.
 
-    """
+        #see whether we've got a different noise variance for each datum
+        sigma2 = np.squeeze(likelihood.variance)
+        het_noise = False
+        if sigma2.size <1:
+            het_noise = True
 
-    def __init__(self, X, likelihood, kernel, Z, normalize_X=False):
-        SparseGP.__init__(self, X, likelihood, kernel, Z, X_variance=None, normalize_X=False)
-        assert self.output_dim == 1, "FITC model is not defined for handling multiple outputs"
-
-    def update_likelihood_approximation(self, **kwargs):
-        """
-        Approximates a non-Gaussian likelihood using Expectation Propagation
-
-        For a Gaussian likelihood, no iteration is required:
-        this function does nothing
-        """
-        self.likelihood.restart()
-        self.likelihood.fit_FITC(self.Kmm,self.psi1,self.psi0, **kwargs)
-        self._set_params(self._get_params())
-
-    def _compute_kernel_matrices(self):
         # kernel computations, using BGPLVM notation
-        self.Kmm = self.kern.K(self.Z)
-        self.psi0 = self.kern.Kdiag(self.X)
-        self.psi1 = self.kern.K(self.Z, self.X)
-        self.psi2 = None
+        Kmm = kern.K(Z)
+        Kdiag = kern.Kdiag(X)
+        Kmn = kern.K(X, Z)
 
-    def _computations(self):
         #factor Kmm
-        self.Lm = jitchol(self.Kmm)
-        self.Lmi,info = dtrtrs(self.Lm,np.eye(self.num_inducing),lower=1)
-        Lmipsi1 = np.dot(self.Lmi,self.psi1)
-        self.Qnn = np.dot(Lmipsi1.T,Lmipsi1).copy()
-        self.Diag0 = self.psi0 - np.diag(self.Qnn)
-        self.beta_star = self.likelihood.precision/(1. + self.likelihood.precision*self.Diag0[:,None]) #NOTE: beta_star contains Diag0 and the precision
-        self.V_star = self.beta_star * self.likelihood.Y
+        Lm = jitchol(Kmm)
+        V = dtrtrs(Lm, Kmn, lower=1)
 
-        # The rather complex computations of self.A
-        tmp = self.psi1 * (np.sqrt(self.beta_star.flatten().reshape(1, self.num_data)))
-        tmp, _ = dtrtrs(self.Lm, np.asfortranarray(tmp), lower=1)
-        self.A = tdot(tmp)
+        #compute effective noise
+        g_sn2 = sigma2 + Kdiag - np.sum(V*V, 0)
 
-        # factor B
-        self.B = np.eye(self.num_inducing) + self.A
-        self.LB = jitchol(self.B)
-        self.LBi = chol_inv(self.LB)
-        self.psi1V = np.dot(self.psi1, self.V_star)
+        #compute and factor B
+        tmp = Kmn / np.sqrt(g_snd)
+        tmp, _ = dtrtrs(Lm, tmp, lower=1)
+        A = tdot(tmp)
+        B = np.eye(num_inducing) + A
+        Bi, Lb, LBi, log_det_B = pdinv(B)
 
-        Lmi_psi1V, info = dtrtrs(self.Lm, np.asfortranarray(self.psi1V), lower=1, trans=0)
-        self._LBi_Lmi_psi1V, _ = dtrtrs(self.LB, np.asfortranarray(Lmi_psi1V), lower=1, trans=0)
+        #compute posterior parameters
+        tmp, _ = dtrtrs()
+        woodbury_vec = 
+
+        
+
+
+        psi1V = np.dot(self.psi1, self.V_star)
+        Lmi_psi1V, _ = dtrtrs(Lm, self.psi1V, lower=1, trans=0)
+        LBi_Lmi_psi1V, _ = dtrtrs(LB, Lmi_psi1V, lower=1, trans=0)
 
         Kmmipsi1 = np.dot(self.Lmi.T,Lmipsi1)
         b_psi1_Ki = self.beta_star * Kmmipsi1.T

From c28f11f29175379c5ea8257bd508c9533b180f96 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Fri, 7 Feb 2014 15:16:05 +0000
Subject: [PATCH 24/43] Fixed parameter bugs

---
 GPy/core/parameterization/parameterized.py   |  4 +++-
 GPy/core/parameterization/transformations.py | 12 +++++++-----
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/GPy/core/parameterization/parameterized.py b/GPy/core/parameterization/parameterized.py
index d712b382..556fdbe0 100644
--- a/GPy/core/parameterization/parameterized.py
+++ b/GPy/core/parameterization/parameterized.py
@@ -449,7 +449,9 @@ class Parameterized(Constrainable, Pickleable, Observable):
         # if removing constraints before adding new is not wanted, just delete the above line!
         self.constraints.add(transform, rav_i)
         param = self._get_original(param)
-        param._set_params(transform.initialize(param._get_params()), update=False)
+        #FIXME: Max, is this the right thing to do to handle fixed?
+        if not (transform == __fixed__):
+            param._set_params(transform.initialize(param._get_params()), update=False)
         if warning and any(reconstrained):
             # if you want to print the whole params object, which was reconstrained use:
             # m = str(param[self._backtranslate_index(param, reconstrained)])
diff --git a/GPy/core/parameterization/transformations.py b/GPy/core/parameterization/transformations.py
index fd2c3ee5..c4cab1e9 100644
--- a/GPy/core/parameterization/transformations.py
+++ b/GPy/core/parameterization/transformations.py
@@ -4,9 +4,9 @@
 
 import numpy as np
 from domains import _POSITIVE,_NEGATIVE, _BOUNDED
-import sys 
+import sys
 import weakref
-_lim_val = -np.log(sys.float_info.epsilon) 
+_lim_val = -np.log(sys.float_info.epsilon)
 
 class Transformation(object):
     domain = None
@@ -94,7 +94,7 @@ class LogexpClipped(Logexp):
     def __str__(self):
         return '+ve_c'
 
-    
+
 class Exponent(Transformation):
     # TODO: can't allow this to go to zero, need to set a lower bound. Similar with negative Exponent below. See old MATLAB code.
     domain = _POSITIVE
@@ -162,9 +162,11 @@ class Logistic(Transformation):
     def initialize(self, f):
         if np.any(np.logical_or(f < self.lower, f > self.upper)):
             print "Warning: changing parameters to satisfy constraints"
-        return np.where(np.logical_or(f < self.lower, f > self.upper), self.f(f * 0.), f)
+        #return np.where(np.logical_or(f < self.lower, f > self.upper), self.f(f * 0.), f)
+        #FIXME: Max, zeros_like right?
+        return np.where(np.logical_or(f < self.lower, f > self.upper), self.f(np.zeros_like(f)), f)
     def __str__(self):
         return '{},{}'.format(self.lower, self.upper)
 
- 
+
 

From 186feb45a1efd801b7ca931d3ca519df4b51c21f Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Fri, 7 Feb 2014 15:16:52 +0000
Subject: [PATCH 25/43] Fixed likelihood tests for new parameters structure

---
 .../latent_function_inference/laplace.py      |  12 +-
 GPy/likelihoods/gaussian.py                   |  21 +-
 GPy/likelihoods/likelihood.py                 |  24 +-
 GPy/likelihoods/student_t.py                  |  42 ++--
 GPy/models/gradient_checker.py                |  59 ++---
 GPy/testing/likelihood_tests.py               | 217 ++++++++++--------
 6 files changed, 207 insertions(+), 168 deletions(-)

diff --git a/GPy/inference/latent_function_inference/laplace.py b/GPy/inference/latent_function_inference/laplace.py
index 6e252406..6fce94fc 100644
--- a/GPy/inference/latent_function_inference/laplace.py
+++ b/GPy/inference/latent_function_inference/laplace.py
@@ -33,7 +33,6 @@ class LaplaceInference(object):
         self._mode_finding_max_iter = 40
         self.bad_fhat = True
 
-
     def inference(self, kern, X, likelihood, Y, Y_metadata=None):
         """
         Returns a Posterior class containing essential quantities of the posterior
@@ -50,6 +49,7 @@ class LaplaceInference(object):
             Ki_f_init = np.zeros_like(Y)
         else:
             Ki_f_init = self._previous_Ki_fhat
+
         f_hat, Ki_fhat = self.rasm_mode(K, Y, likelihood, Ki_f_init, Y_metadata=Y_metadata)
 
         #Compute hessian and other variables at mode
@@ -57,6 +57,7 @@ class LaplaceInference(object):
 
         #likelihood.gradient = self.likelihood_gradients()
         kern.update_gradients_full(dL_dK, X)
+        likelihood.update_gradients(np.ones(10))
 
         self._previous_Ki_fhat = Ki_fhat.copy()
         return Posterior(woodbury_vector=woodbury_vector, woodbury_inv = K_Wi_i, K=K), log_marginal, {'dL_dK':dL_dK}
@@ -157,9 +158,12 @@ class LaplaceInference(object):
         explicit_part = 0.5*(np.dot(Ki_f, Ki_f.T) - K_Wi_i)
 
         #Implicit
-        d3lik_d3fhat = likelihood.d3logpdf_df3(f_hat, Y, extra_data=Y_metadata)
-        dL_dfhat = 0.5*(np.diag(Ki_W_i)[:, None]*d3lik_d3fhat) #why isn't this -0.5? s2 in R&W p126 line 9.
+        dW_df = likelihood.d3logpdf_df3(f_hat, Y, extra_data=Y_metadata) # d3lik_d3fhat
         woodbury_vector = likelihood.dlogpdf_df(f_hat, Y, extra_data=Y_metadata)
+        dL_dfhat = 0.5*(np.diag(Ki_W_i)[:, None]*dW_df) #why isn't this -0.5? s2 in R&W p126 line 9.
+        #implicit_part = np.dot(woodbury_vector, dL_dfhat.T).dot(np.eye(Y.shape[0]) - np.dot(K, K_Wi_i))
+        BiK, _ = dpotrs(L, K, lower=1)
+        #dL_dfhat = 0.5*np.diag(BiK)[:, None]*dW_df
         implicit_part = np.dot(woodbury_vector, dL_dfhat.T).dot(np.eye(Y.shape[0]) - np.dot(K, K_Wi_i))
 
         dL_dK = explicit_part + implicit_part
@@ -219,7 +223,7 @@ class LaplaceInference(object):
         LiW12, _ = dtrtrs(L, np.diagflat(W_12), lower=1, trans=0)
         K_Wi_i = np.dot(LiW12.T, LiW12) # R = W12BiW12, in R&W p 126, eq 5.25
 
-        #here's a better way to compute the required matrix. 
+        #here's a better way to compute the required matrix.
         # you could do the model finding witha backsub, instead of a dot...
         #L2 = L/W_12
         #K_Wi_i_2 , _= dpotri(L2)
diff --git a/GPy/likelihoods/gaussian.py b/GPy/likelihoods/gaussian.py
index c047e573..e6be2261 100644
--- a/GPy/likelihoods/gaussian.py
+++ b/GPy/likelihoods/gaussian.py
@@ -3,9 +3,9 @@
 
 #TODO
 """
-A lot of this code assumes that the link functio nis the identity. 
+A lot of this code assumes that the link function is the identity.
 
-I think laplace code is okay, but I'm quite sure that the EP moments will only work if the link is identity. 
+I think laplace code is okay, but I'm quite sure that the EP moments will only work if the link is identity.
 
 Furthermore, exact Guassian inference can only be done for the identity link, so we should be asserting so for all calls which relate to that.
 
@@ -130,7 +130,10 @@ class Gaussian(Likelihood):
         :rtype: float
         """
         assert np.asarray(link_f).shape == np.asarray(y).shape
-        return -0.5*(np.sum((y-link_f)**2/self.variance) + self.ln_det_K + self.N*np.log(2.*np.pi))
+        N = y.shape[0]
+        ln_det_cov = N*np.log(self.variance)
+
+        return -0.5*(np.sum((y-link_f)**2/self.variance) + ln_det_cov + N*np.log(2.*np.pi))
 
     def dlogpdf_dlink(self, link_f, y, extra_data=None):
         """
@@ -175,7 +178,8 @@ class Gaussian(Likelihood):
             (the distribution for y_i depends only on link(f_i) not on link(f_(j!=i))
         """
         assert np.asarray(link_f).shape == np.asarray(y).shape
-        hess = -(1.0/self.variance)*np.ones((self.N, 1))
+        N = y.shape[0]
+        hess = -(1.0/self.variance)*np.ones((N, 1))
         return hess
 
     def d3logpdf_dlink3(self, link_f, y, extra_data=None):
@@ -194,7 +198,8 @@ class Gaussian(Likelihood):
         :rtype: Nx1 array
         """
         assert np.asarray(link_f).shape == np.asarray(y).shape
-        d3logpdf_dlink3 = np.diagonal(0*self.I)[:, None]
+        N = y.shape[0]
+        d3logpdf_dlink3 = np.zeros((N,1))
         return d3logpdf_dlink3
 
     def dlogpdf_link_dvar(self, link_f, y, extra_data=None):
@@ -215,7 +220,8 @@ class Gaussian(Likelihood):
         assert np.asarray(link_f).shape == np.asarray(y).shape
         e = y - link_f
         s_4 = 1.0/(self.variance**2)
-        dlik_dsigma = -0.5*self.N/self.variance + 0.5*s_4*np.sum(np.square(e))
+        N = y.shape[0]
+        dlik_dsigma = -0.5*N/self.variance + 0.5*s_4*np.sum(np.square(e))
         return np.sum(dlik_dsigma) # Sure about this sum?
 
     def dlogpdf_dlink_dvar(self, link_f, y, extra_data=None):
@@ -255,7 +261,8 @@ class Gaussian(Likelihood):
         """
         assert np.asarray(link_f).shape == np.asarray(y).shape
         s_4 = 1.0/(self.variance**2)
-        d2logpdf_dlink2_dvar = np.diag(s_4*self.I)[:, None]
+        N = y.shape[0]
+        d2logpdf_dlink2_dvar = np.ones((N,1))*s_4
         return d2logpdf_dlink2_dvar
 
     def dlogpdf_link_dtheta(self, f, y, extra_data=None):
diff --git a/GPy/likelihoods/likelihood.py b/GPy/likelihoods/likelihood.py
index b0ecfc37..d9a7e109 100644
--- a/GPy/likelihoods/likelihood.py
+++ b/GPy/likelihoods/likelihood.py
@@ -13,12 +13,12 @@ from ..core.parameterization import Parameterized
 
 class Likelihood(Parameterized):
     """
-    Likelihood base class, used to defing p(y|f). 
+    Likelihood base class, used to define p(y|f).
 
     All instances use _inverse_ link functions, which can be swapped out. It is
     expected that inherriting classes define a default inverse link function
 
-    To use this class, inherrit and define missing functionality. 
+    To use this class, inherit and define missing functionality.
 
     Inherriting classes *must* implement:
        pdf_link : a bound method which turns the output of the link function into the pdf
@@ -27,7 +27,7 @@ class Likelihood(Parameterized):
     To enable use with EP, inherriting classes *must* define:
        TODO: a suitable derivative function for any parameters of the class
     It is also desirable to define:
-       moments_match_ep : a function to compute the EP moments If this isn't defined, the moments will be computed using 1D quadrature. 
+       moments_match_ep : a function to compute the EP moments. If this isn't defined, the moments will be computed using 1D quadrature.
 
     To enable use with Laplace approximation, inherriting classes *must* define:
        Some derivative functions *AS TODO*
@@ -36,7 +36,7 @@ class Likelihood(Parameterized):
 
     """
     def __init__(self, gp_link, name):
-        super(Likelihood, self).__init__(name) 
+        super(Likelihood, self).__init__(name)
         assert isinstance(gp_link,link_functions.GPTransformation), "gp_link is not a valid GPTransformation."
         self.gp_link = gp_link
         self.log_concave = False
@@ -44,6 +44,10 @@ class Likelihood(Parameterized):
     def _gradients(self,partial):
         return np.zeros(0)
 
+    def update_gradients(self, partial):
+        if self.size > 0:
+            raise NotImplementedError('Must be implemented for likelihoods with parameters to be optimized')
+
     def _preprocess_values(self,Y):
         """
         In case it is needed, this function assess the output values or makes any pertinent transformation on them.
@@ -303,7 +307,7 @@ class Likelihood(Parameterized):
         """
         TODO: Doc strings
         """
-        if len(self._get_param_names()) > 0:
+        if self.size > 0:
             link_f = self.gp_link.transf(f)
             return self.dlogpdf_link_dtheta(link_f, y, extra_data=extra_data)
         else:
@@ -314,7 +318,7 @@ class Likelihood(Parameterized):
         """
         TODO: Doc strings
         """
-        if len(self._get_param_names()) > 0:
+        if self.size > 0:
             link_f = self.gp_link.transf(f)
             dlink_df = self.gp_link.dtransf_df(f)
             dlogpdf_dlink_dtheta = self.dlogpdf_dlink_dtheta(link_f, y, extra_data=extra_data)
@@ -327,7 +331,7 @@ class Likelihood(Parameterized):
         """
         TODO: Doc strings
         """
-        if len(self._get_param_names()) > 0:
+        if self.size > 0:
             link_f = self.gp_link.transf(f)
             dlink_df = self.gp_link.dtransf_df(f)
             d2link_df2 = self.gp_link.d2transf_df2(f)
@@ -345,9 +349,9 @@ class Likelihood(Parameterized):
 
         #Parameters are stacked vertically. Must be listed in same order as 'get_param_names'
         # ensure we have gradients for every parameter we want to optimize
-        assert dlogpdf_dtheta.shape[1] == len(self._get_param_names())
-        assert dlogpdf_df_dtheta.shape[1] == len(self._get_param_names())
-        assert d2logpdf_df2_dtheta.shape[1] == len(self._get_param_names())
+        assert dlogpdf_dtheta.shape[1] == self.size
+        assert dlogpdf_df_dtheta.shape[1] == self.size
+        assert d2logpdf_df2_dtheta.shape[1] == self.size
 
         return dlogpdf_dtheta, dlogpdf_df_dtheta, d2logpdf_df2_dtheta
 
diff --git a/GPy/likelihoods/student_t.py b/GPy/likelihoods/student_t.py
index 587e1b23..b4e0dfc3 100644
--- a/GPy/likelihoods/student_t.py
+++ b/GPy/likelihoods/student_t.py
@@ -8,6 +8,7 @@ import link_functions
 from scipy import stats, integrate
 from scipy.special import gammaln, gamma
 from likelihood import Likelihood
+from ..core.parameterization import Param
 
 class StudentT(Likelihood):
     """
@@ -19,26 +20,25 @@ class StudentT(Likelihood):
         p(y_{i}|\\lambda(f_{i})) = \\frac{\\Gamma\\left(\\frac{v+1}{2}\\right)}{\\Gamma\\left(\\frac{v}{2}\\right)\\sqrt{v\\pi\\sigma^{2}}}\\left(1 + \\frac{1}{v}\\left(\\frac{(y_{i} - f_{i})^{2}}{\\sigma^{2}}\\right)\\right)^{\\frac{-v+1}{2}}
 
     """
-    def __init__(self,gp_link=None,analytical_mean=True,analytical_variance=True, deg_free=5, sigma2=2):
-        self.v = deg_free
-        self.sigma2 = sigma2
+    def __init__(self,gp_link=None, deg_free=5, sigma2=2):
+        if gp_link is None:
+            gp_link = link_functions.Identity()
+
+        super(StudentT, self).__init__(gp_link, name='Student_T')
+
+        self.sigma2 = Param('t_noise', float(sigma2))
+        self.v = Param('deg_free', float(deg_free))
+        self.add_parameter(self.sigma2)
+        self.add_parameter(self.v)
 
-        self._set_params(np.asarray(sigma2))
-        super(StudentT, self).__init__(gp_link,analytical_mean,analytical_variance)
         self.log_concave = False
 
-    def _get_params(self):
-        return np.asarray(self.sigma2)
+    def parameters_changed(self):
+        self.variance = (self.v / float(self.v - 2)) * self.sigma2
 
-    def _get_param_names(self):
-        return ["t_noise_std2"]
-
-    def _set_params(self, x):
-        self.sigma2 = float(x)
-
-    @property
-    def variance(self, extra_data=None):
-        return (self.v / float(self.v - 2)) * self.sigma2
+    def update_gradients(self, partial):
+        self.sigma2.gradient = np.ones(1) #FIXME: Not done yet
+        self.v.gradient = np.ones(1) #FIXME: Not done yet
 
     def pdf_link(self, link_f, y, extra_data=None):
         """
@@ -82,10 +82,14 @@ class StudentT(Likelihood):
         """
         assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
         e = y - link_f
+        #FIXME:
+        #Why does np.log(1 + (1/self.v)*((y-link_f)**2)/self.sigma2) suppress the divide by zero?!
+        #But np.log(1 + (1/float(self.v))*((y-link_f)**2)/self.sigma2) throws it correctly
+        #print - 0.5*(self.v + 1)*np.log(1 + (1/np.float(self.v))*((e**2)/self.sigma2))
         objective = (+ gammaln((self.v + 1) * 0.5)
-                     - gammaln(self.v * 0.5)
-                     - 0.5*np.log(self.sigma2 * self.v * np.pi)
-                     - 0.5*(self.v + 1)*np.log(1 + (1/np.float(self.v))*((e**2)/self.sigma2))
+                    - gammaln(self.v * 0.5)
+                    - 0.5*np.log(self.sigma2 * self.v * np.pi)
+                    - 0.5*(self.v + 1)*np.log(1 + (1/np.float(self.v))*((e**2)/self.sigma2))
                     )
         return np.sum(objective)
 
diff --git a/GPy/models/gradient_checker.py b/GPy/models/gradient_checker.py
index 775334ac..b7c78449 100644
--- a/GPy/models/gradient_checker.py
+++ b/GPy/models/gradient_checker.py
@@ -1,9 +1,10 @@
 # ## Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
-from GPy.core.model import Model
+from ..core.model import Model
 import itertools
 import numpy
+from ..core.parameterization import Param
 
 def get_shape(x):
     if isinstance(x, numpy.ndarray):
@@ -24,42 +25,42 @@ class GradientChecker(Model):
         """
         :param f: Function to check gradient for
         :param df: Gradient of function to check
-        :param x0: 
+        :param x0:
             Initial guess for inputs x (if it has a shape (a,b) this will be reflected in the parameter names).
-            Can be a list of arrays, if takes a list of arrays. This list will be passed 
+            Can be a list of arrays, if f takes a list of arrays. This list will be passed
             to f and df in the same order as given here.
             If only one argument, make sure not to pass a list!!!
-            
+
         :type x0: [array-like] | array-like | float | int
         :param names:
             Names to print, when performing gradcheck. If a list was passed to x0
             a list of names with the same length is expected.
         :param args: Arguments passed as f(x, *args, **kwargs) and df(x, *args, **kwargs)
-        
+
         Examples:
         ---------
             from GPy.models import GradientChecker
             N, M, Q = 10, 5, 3
-        
+
             Sinusoid:
-            
+
                 X = numpy.random.rand(N, Q)
                 grad = GradientChecker(numpy.sin,numpy.cos,X,'x')
                 grad.checkgrad(verbose=1)
-    
+
             Using GPy:
-            
+
                 X, Z = numpy.random.randn(N,Q), numpy.random.randn(M,Q)
                 kern = GPy.kern.linear(Q, ARD=True) + GPy.kern.rbf(Q, ARD=True)
-                grad = GradientChecker(kern.K, 
+                grad = GradientChecker(kern.K,
                                        lambda x: 2*kern.dK_dX(numpy.ones((1,1)), x),
                                        x0 = X.copy(),
-                                       names='X')  
+                                       names='X')
                 grad.checkgrad(verbose=1)
                 grad.randomize()
-                grad.checkgrad(verbose=1)      
+                grad.checkgrad(verbose=1)
         """
-        Model.__init__(self)
+        Model.__init__(self, 'GradientChecker')
         if isinstance(x0, (list, tuple)) and names is None:
             self.shapes = [get_shape(xi) for xi in x0]
             self.names = ['X{i}'.format(i=i) for i in range(len(x0))]
@@ -72,8 +73,10 @@ class GradientChecker(Model):
         else:
             self.names = names
             self.shapes = [get_shape(x0)]
+
         for name, xi in zip(self.names, at_least_one_element(x0)):
-            self.__setattr__(name, xi)
+            self.__setattr__(name, Param(name, xi))
+            self.add_parameter(self.__getattribute__(name))
 #         self._param_names = []
 #         for name, shape in zip(self.names, self.shapes):
 #             self._param_names.extend(map(lambda nameshape: ('_'.join(nameshape)).strip('_'), itertools.izip(itertools.repeat(name), itertools.imap(lambda t: '_'.join(map(str, t)), itertools.product(*map(lambda xi: range(xi), shape))))))
@@ -93,20 +96,18 @@ class GradientChecker(Model):
     def _log_likelihood_gradients(self):
         return numpy.atleast_1d(self.df(*self._get_x(), **self.kwargs)).flatten()
 
+    #def _get_params(self):
+        #return numpy.atleast_1d(numpy.hstack(map(lambda name: flatten_if_needed(self.__getattribute__(name)), self.names)))
 
-    def _get_params(self):
-        return numpy.atleast_1d(numpy.hstack(map(lambda name: flatten_if_needed(self.__getattribute__(name)), self.names)))
+    #def _set_params(self, x):
+        #current_index = 0
+        #for name, shape in zip(self.names, self.shapes):
+            #current_size = numpy.prod(shape)
+            #self.__setattr__(name, x[current_index:current_index + current_size].reshape(shape))
+            #current_index += current_size
 
-
-    def _set_params(self, x):
-        current_index = 0
-        for name, shape in zip(self.names, self.shapes):
-            current_size = numpy.prod(shape)
-            self.__setattr__(name, x[current_index:current_index + current_size].reshape(shape))
-            current_index += current_size
-
-    def _get_param_names(self):
-        _param_names = []
-        for name, shape in zip(self.names, self.shapes):
-            _param_names.extend(map(lambda nameshape: ('_'.join(nameshape)).strip('_'), itertools.izip(itertools.repeat(name), itertools.imap(lambda t: '_'.join(map(str, t)), itertools.product(*map(lambda xi: range(xi), shape))))))
-        return _param_names
+    #def _get_param_names(self):
+        #_param_names = []
+        #for name, shape in zip(self.names, self.shapes):
+            #_param_names.extend(map(lambda nameshape: ('_'.join(nameshape)).strip('_'), itertools.izip(itertools.repeat(name), itertools.imap(lambda t: '_'.join(map(str, t)), itertools.product(*map(lambda xi: range(xi), shape))))))
+        #return _param_names
diff --git a/GPy/testing/likelihood_tests.py b/GPy/testing/likelihood_tests.py
index d14c9a41..c418a096 100644
--- a/GPy/testing/likelihood_tests.py
+++ b/GPy/testing/likelihood_tests.py
@@ -4,7 +4,8 @@ import GPy
 from GPy.models import GradientChecker
 import functools
 import inspect
-from GPy.likelihoods.noise_models import gp_transformations
+from GPy.likelihoods import link_functions
+from ..core.parameterization import Param
 from functools import partial
 #np.random.seed(300)
 np.random.seed(7)
@@ -22,12 +23,14 @@ def dparam_partial(inst_func, *args):
           the f or Y that are being used in the function whilst we tweak the
           param
     """
-    def param_func(param, inst_func, args):
-        inst_func.im_self._set_params(param)
+    def param_func(param_val, param_name, inst_func, args):
+        #inst_func.im_self._set_params(param)
+        #inst_func.im_self.add_parameter(Param(param_name, param_val))
+        inst_func.im_self[param_name] = param_val
         return inst_func(*args)
     return functools.partial(param_func, inst_func=inst_func, args=args)
 
-def dparam_checkgrad(func, dfunc, params, args, constraints=None, randomize=False, verbose=False):
+def dparam_checkgrad(func, dfunc, params, params_names, args, constraints=None, randomize=False, verbose=False):
     """
     checkgrad expects a f: R^N -> R^1 and df: R^N -> R^N
     However if we are holding other parameters fixed and moving something else
@@ -43,22 +46,27 @@ def dparam_checkgrad(func, dfunc, params, args, constraints=None, randomize=Fals
     partial_f = dparam_partial(func, *args)
     partial_df = dparam_partial(dfunc, *args)
     gradchecking = True
-    for param in params:
-        fnum = np.atleast_1d(partial_f(param)).shape[0]
-        dfnum = np.atleast_1d(partial_df(param)).shape[0]
+    zipped_params = zip(params, params_names)
+    for param_val, param_name in zipped_params:
+        fnum = np.atleast_1d(partial_f(param_val, param_name)).shape[0]
+        dfnum = np.atleast_1d(partial_df(param_val, param_name)).shape[0]
         for fixed_val in range(dfnum):
             #dlik and dlik_dvar gives back 1 value for each
             f_ind = min(fnum, fixed_val+1) - 1
             print "fnum: {} dfnum: {} f_ind: {} fixed_val: {}".format(fnum, dfnum, f_ind, fixed_val)
             #Make grad checker with this param moving, note that set_params is NOT being called
             #The parameter is being set directly with __setattr__
-            grad = GradientChecker(lambda x: np.atleast_1d(partial_f(x))[f_ind],
-                                   lambda x : np.atleast_1d(partial_df(x))[fixed_val],
-                                   param, 'p')
+            grad = GradientChecker(lambda p_val: np.atleast_1d(partial_f(p_val, param_name))[f_ind],
+                                   lambda p_val: np.atleast_1d(partial_df(p_val, param_name))[fixed_val],
+                                   param_val, [param_name])
             #This is not general for more than one param...
             if constraints is not None:
-                for constraint in constraints:
-                    constraint('p', grad)
+                for constrain_param, constraint in constraints:
+                    if grad.grep_param_names(constrain_param):
+                        constraint(constrain_param, grad)
+                    else:
+                        print "parameter didn't exist"
+                    print constrain_param, " ", constraint
             if randomize:
                 grad.randomize()
             if verbose:
@@ -107,17 +115,20 @@ class TestNoiseModels(object):
         ####################################################
         # Constraint wrappers so we can just list them off #
         ####################################################
+        def constrain_fixed(regex, model, value):
+            model[regex].constrain_fixed(value)
+
         def constrain_negative(regex, model):
-            model.constrain_negative(regex)
+            model[regex].constrain_negative()
 
         def constrain_positive(regex, model):
-            model.constrain_positive(regex)
+            model[regex].constrain_positive()
 
         def constrain_bounded(regex, model, lower, upper):
             """
             Used like: partial(constrain_bounded, lower=0, upper=1)
             """
-            model.constrain_bounded(regex, lower, upper)
+            model[regex].constrain_bounded(lower, upper)
 
         """
         Dictionary where we nest models we would like to check
@@ -134,71 +145,72 @@ class TestNoiseModels(object):
                 }
         """
         noise_models = {"Student_t_default": {
-                            "model": GPy.likelihoods.student_t(deg_free=5, sigma2=self.var),
+                            "model": GPy.likelihoods.StudentT(deg_free=5, sigma2=self.var),
                             "grad_params": {
                                 "names": ["t_noise"],
                                 "vals": [self.var],
-                                "constraints": [constrain_positive]
+                                "constraints": [("t_noise", constrain_positive), ("deg_free", constrain_positive)]
+                                #"constraints": [("t_noise", constrain_positive), ("deg_free", partial(constrain_fixed, value=5))]
                                 },
                             "laplace": True
                             },
                         "Student_t_1_var": {
-                            "model": GPy.likelihoods.student_t(deg_free=5, sigma2=self.var),
+                            "model": GPy.likelihoods.StudentT(deg_free=5, sigma2=self.var),
                             "grad_params": {
                                 "names": ["t_noise"],
                                 "vals": [1.0],
-                                "constraints": [constrain_positive]
+                                "constraints": [("t_noise", constrain_positive), ("deg_free", constrain_positive)]
                                 },
                             "laplace": True
                             },
                         "Student_t_small_var": {
-                            "model": GPy.likelihoods.student_t(deg_free=5, sigma2=self.var),
+                            "model": GPy.likelihoods.StudentT(deg_free=5, sigma2=self.var),
                             "grad_params": {
                                 "names": ["t_noise"],
                                 "vals": [0.01],
-                                "constraints": [constrain_positive]
+                                "constraints": [("t_noise", constrain_positive), ("deg_free", constrain_positive)]
                                 },
                             "laplace": True
                             },
                         "Student_t_large_var": {
-                            "model": GPy.likelihoods.student_t(deg_free=5, sigma2=self.var),
+                            "model": GPy.likelihoods.StudentT(deg_free=5, sigma2=self.var),
                             "grad_params": {
                                 "names": ["t_noise"],
                                 "vals": [10.0],
-                                "constraints": [constrain_positive]
+                                "constraints": [("t_noise", constrain_positive), ("deg_free", constrain_positive)]
                                 },
                             "laplace": True
                             },
                         "Student_t_approx_gauss": {
-                            "model": GPy.likelihoods.student_t(deg_free=1000, sigma2=self.var),
+                            "model": GPy.likelihoods.StudentT(deg_free=1000, sigma2=self.var),
                             "grad_params": {
                                 "names": ["t_noise"],
                                 "vals": [self.var],
-                                "constraints": [constrain_positive]
+                                "constraints": [("t_noise", constrain_positive), ("deg_free", constrain_positive)]
                                 },
                             "laplace": True
                             },
                         "Student_t_log": {
-                            "model": GPy.likelihoods.student_t(gp_link=gp_transformations.Log(), deg_free=5, sigma2=self.var),
+                            "model": GPy.likelihoods.StudentT(gp_link=link_functions.Log(), deg_free=5, sigma2=self.var),
                             "grad_params": {
                                 "names": ["t_noise"],
                                 "vals": [self.var],
-                                "constraints": [constrain_positive]
+                                "constraints": [("t_noise", constrain_positive), ("deg_free", constrain_positive)]
                                 },
                             "laplace": True
                             },
                         "Gaussian_default": {
-                            "model": GPy.likelihoods.gaussian(variance=self.var, D=self.D, N=self.N),
+                            "model": GPy.likelihoods.Gaussian(variance=self.var),
                             "grad_params": {
-                                "names": ["noise_model_variance"],
+                                "names": ["variance"],
                                 "vals": [self.var],
-                                "constraints": [constrain_positive]
+                                "constraints": [("variance", constrain_positive)]
                                 },
                             "laplace": True,
                             "ep": True
                             },
                         #"Gaussian_log": {
-                            #"model": GPy.likelihoods.gaussian(gp_link=gp_transformations.Log(), variance=self.var, D=self.D, N=self.N),
+                            #"model": GPy.likelihoods.gaussian(gp_link=link_functions.Log(), variance=self.var, D=self.D, N=self.N),
                             #"grad_params": {
                                 #"names": ["noise_model_variance"],
                                 #"vals": [self.var],
@@ -207,7 +219,7 @@ class TestNoiseModels(object):
                             #"laplace": True
                             #},
                         #"Gaussian_probit": {
-                            #"model": GPy.likelihoods.gaussian(gp_link=gp_transformations.Probit(), variance=self.var, D=self.D, N=self.N),
+                            #"model": GPy.likelihoods.gaussian(gp_link=link_functions.Probit(), variance=self.var, D=self.D, N=self.N),
                             #"grad_params": {
                                 #"names": ["noise_model_variance"],
                                 #"vals": [self.var],
@@ -216,7 +228,7 @@ class TestNoiseModels(object):
                             #"laplace": True
                             #},
                         #"Gaussian_log_ex": {
-                            #"model": GPy.likelihoods.gaussian(gp_link=gp_transformations.Log_ex_1(), variance=self.var, D=self.D, N=self.N),
+                            #"model": GPy.likelihoods.gaussian(gp_link=link_functions.Log_ex_1(), variance=self.var, D=self.D, N=self.N),
                             #"grad_params": {
                                 #"names": ["noise_model_variance"],
                                 #"vals": [self.var],
@@ -225,31 +237,31 @@ class TestNoiseModels(object):
                             #"laplace": True
                             #},
                         "Bernoulli_default": {
-                            "model": GPy.likelihoods.bernoulli(),
+                            "model": GPy.likelihoods.Bernoulli(),
                             "link_f_constraints": [partial(constrain_bounded, lower=0, upper=1)],
                             "laplace": True,
                             "Y": self.binary_Y,
                             "ep": True
                             },
-                        "Exponential_default": {
-                            "model": GPy.likelihoods.exponential(),
-                            "link_f_constraints": [constrain_positive],
-                            "Y": self.positive_Y,
-                            "laplace": True,
-                        },
-                        "Poisson_default": {
-                            "model": GPy.likelihoods.poisson(),
-                            "link_f_constraints": [constrain_positive],
-                            "Y": self.integer_Y,
-                            "laplace": True,
-                            "ep": False #Should work though...
-                        },
-                        "Gamma_default": {
-                            "model": GPy.likelihoods.gamma(),
-                            "link_f_constraints": [constrain_positive],
-                            "Y": self.positive_Y,
-                            "laplace": True
-                        }
+                        #"Exponential_default": {
+                            #"model": GPy.likelihoods.exponential(),
+                            #"link_f_constraints": [constrain_positive],
+                            #"Y": self.positive_Y,
+                            #"laplace": True,
+                        #},
+                        #"Poisson_default": {
+                            #"model": GPy.likelihoods.poisson(),
+                            #"link_f_constraints": [constrain_positive],
+                            #"Y": self.integer_Y,
+                            #"laplace": True,
+                            #"ep": False #Should work though...
+                        #},
+                        #"Gamma_default": {
+                            #"model": GPy.likelihoods.gamma(),
+                            #"link_f_constraints": [constrain_positive],
+                            #"Y": self.positive_Y,
+                            #"laplace": True
+                        #}
                     }
 
         for name, attributes in noise_models.iteritems():
@@ -286,8 +298,8 @@ class TestNoiseModels(object):
             else:
                 ep = False
 
-            if len(param_vals) > 1:
-                raise NotImplementedError("Cannot support multiple params in likelihood yet!")
+            #if len(param_vals) > 1:
+                #raise NotImplementedError("Cannot support multiple params in likelihood yet!")
 
             #Required by all
             #Normal derivatives
@@ -302,13 +314,13 @@ class TestNoiseModels(object):
                 yield self.t_d3logpdf_df3, model, Y, f
                 yield self.t_d3logpdf_dlink3, model, Y, f, link_f_constraints
                 #Params
-                yield self.t_dlogpdf_dparams, model, Y, f, param_vals, param_constraints
-                yield self.t_dlogpdf_df_dparams, model, Y, f, param_vals, param_constraints
-                yield self.t_d2logpdf2_df2_dparams, model, Y, f, param_vals, param_constraints
+                yield self.t_dlogpdf_dparams, model, Y, f, param_vals, param_names, param_constraints
+                yield self.t_dlogpdf_df_dparams, model, Y, f, param_vals, param_names, param_constraints
+                yield self.t_d2logpdf2_df2_dparams, model, Y, f, param_vals, param_names, param_constraints
                 #Link params
-                yield self.t_dlogpdf_link_dparams, model, Y, f, param_vals, param_constraints
-                yield self.t_dlogpdf_dlink_dparams, model, Y, f, param_vals, param_constraints
-                yield self.t_d2logpdf2_dlink2_dparams, model, Y, f, param_vals, param_constraints
+                yield self.t_dlogpdf_link_dparams, model, Y, f, param_vals, param_names, param_constraints
+                yield self.t_dlogpdf_dlink_dparams, model, Y, f, param_vals, param_names, param_constraints
+                yield self.t_d2logpdf2_dlink2_dparams, model, Y, f, param_vals, param_names, param_constraints
 
                 #laplace likelihood gradcheck
                 yield self.t_laplace_fit_rbf_white, model, self.X, Y, f, self.step, param_vals, param_names, param_constraints
@@ -370,33 +382,33 @@ class TestNoiseModels(object):
     # df_dparams #
     ##############
     @with_setup(setUp, tearDown)
-    def t_dlogpdf_dparams(self, model, Y, f, params, param_constraints):
+    def t_dlogpdf_dparams(self, model, Y, f, params, params_names, param_constraints):
         print "\n{}".format(inspect.stack()[0][3])
         print model
         assert (
                 dparam_checkgrad(model.logpdf, model.dlogpdf_dtheta,
-                    params, args=(f, Y), constraints=param_constraints,
-                    randomize=True, verbose=True)
+                    params, params_names, args=(f, Y), constraints=param_constraints,
+                    randomize=False, verbose=True)
                 )
 
     @with_setup(setUp, tearDown)
-    def t_dlogpdf_df_dparams(self, model, Y, f, params, param_constraints):
+    def t_dlogpdf_df_dparams(self, model, Y, f, params, params_names, param_constraints):
         print "\n{}".format(inspect.stack()[0][3])
         print model
         assert (
                 dparam_checkgrad(model.dlogpdf_df, model.dlogpdf_df_dtheta,
-                    params, args=(f, Y), constraints=param_constraints,
-                    randomize=True, verbose=True)
+                    params, params_names, args=(f, Y), constraints=param_constraints,
+                    randomize=False, verbose=True)
                 )
 
     @with_setup(setUp, tearDown)
-    def t_d2logpdf2_df2_dparams(self, model, Y, f, params, param_constraints):
+    def t_d2logpdf2_df2_dparams(self, model, Y, f, params, params_names, param_constraints):
         print "\n{}".format(inspect.stack()[0][3])
         print model
         assert (
                 dparam_checkgrad(model.d2logpdf_df2, model.d2logpdf_df2_dtheta,
-                    params, args=(f, Y), constraints=param_constraints,
-                    randomize=True, verbose=True)
+                    params, params_names, args=(f, Y), constraints=param_constraints,
+                    randomize=False, verbose=True)
                 )
 
     ################
@@ -454,32 +466,32 @@ class TestNoiseModels(object):
     # dlink_dparams #
     #################
     @with_setup(setUp, tearDown)
-    def t_dlogpdf_link_dparams(self, model, Y, f, params, param_constraints):
+    def t_dlogpdf_link_dparams(self, model, Y, f, params, param_names, param_constraints):
         print "\n{}".format(inspect.stack()[0][3])
         print model
         assert (
                 dparam_checkgrad(model.logpdf_link, model.dlogpdf_link_dtheta,
-                    params, args=(f, Y), constraints=param_constraints,
+                    params, param_names, args=(f, Y), constraints=param_constraints,
                     randomize=False, verbose=True)
                 )
 
     @with_setup(setUp, tearDown)
-    def t_dlogpdf_dlink_dparams(self, model, Y, f, params, param_constraints):
+    def t_dlogpdf_dlink_dparams(self, model, Y, f, params, param_names, param_constraints):
         print "\n{}".format(inspect.stack()[0][3])
         print model
         assert (
                 dparam_checkgrad(model.dlogpdf_dlink, model.dlogpdf_dlink_dtheta,
-                    params, args=(f, Y), constraints=param_constraints,
+                    params, param_names, args=(f, Y), constraints=param_constraints,
                     randomize=False, verbose=True)
                 )
 
     @with_setup(setUp, tearDown)
-    def t_d2logpdf2_dlink2_dparams(self, model, Y, f, params, param_constraints):
+    def t_d2logpdf2_dlink2_dparams(self, model, Y, f, params, param_names, param_constraints):
         print "\n{}".format(inspect.stack()[0][3])
         print model
         assert (
                 dparam_checkgrad(model.d2logpdf_dlink2, model.d2logpdf_dlink2_dtheta,
-                    params, args=(f, Y), constraints=param_constraints,
+                    params, param_names, args=(f, Y), constraints=param_constraints,
                     randomize=False, verbose=True)
                 )
 
@@ -493,18 +505,23 @@ class TestNoiseModels(object):
         Y = Y/Y.max()
         white_var = 1e-6
         kernel = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
-        laplace_likelihood = GPy.likelihoods.Laplace(Y.copy(), model)
-        m = GPy.models.GPRegression(X.copy(), Y.copy(), kernel, likelihood=laplace_likelihood)
+        laplace_likelihood = GPy.inference.latent_function_inference.LaplaceInference()
+        m = GPy.core.GP(X.copy(), Y.copy(), kernel, likelihood=model, inference_method=laplace_likelihood)
         m.ensure_default_constraints()
-        m.constrain_fixed('white', white_var)
+        m['white'].constrain_fixed(white_var)
 
-        for param_num in range(len(param_names)):
-            name = param_names[param_num]
-            m[name] = param_vals[param_num]
-            constraints[param_num](name, m)
+        #Set constraints
+        for constrain_param, constraint in constraints:
+            constraint(constrain_param, m)
 
         print m
         m.randomize()
+
+        #Set params
+        for param_num in range(len(param_names)):
+            name = param_names[param_num]
+            m[name] = param_vals[param_num]
+
         #m.optimize(max_iters=8)
         print m
         m.checkgrad(verbose=1, step=step)
@@ -526,9 +543,9 @@ class TestNoiseModels(object):
         white_var = 1e-6
         kernel = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
         ep_likelihood = GPy.likelihoods.EP(Y.copy(), model)
-        m = GPy.models.GPRegression(X.copy(), Y.copy(), kernel, likelihood=ep_likelihood)
+        m = GPy.core.GP(X.copy(), Y.copy(), kernel, likelihood=ep_likelihood)
         m.ensure_default_constraints()
-        m.constrain_fixed('white', white_var)
+        m['white'].constrain_fixed(white_var)
 
         for param_num in range(len(param_names)):
             name = param_names[param_num]
@@ -559,8 +576,8 @@ class LaplaceTests(unittest.TestCase):
         self.var = 0.2
 
         self.var = np.random.rand(1)
-        self.stu_t = GPy.likelihoods.student_t(deg_free=5, sigma2=self.var)
-        self.gauss = GPy.likelihoods.gaussian(gp_transformations.Log(), variance=self.var, D=self.D, N=self.N)
+        self.stu_t = GPy.likelihoods.StudentT(deg_free=5, sigma2=self.var)
+        self.gauss = GPy.likelihoods.Gaussian(gp_link=link_functions.Log(), variance=self.var)
 
         #Make a bigger step as lower bound can be quite curved
         self.step = 1e-6
@@ -584,7 +601,7 @@ class LaplaceTests(unittest.TestCase):
         noise = np.random.randn(*self.X.shape)*self.real_std
         self.Y = np.sin(self.X*2*np.pi) + noise
         self.f = np.random.rand(self.N, 1)
-        self.gauss = GPy.likelihoods.gaussian(variance=self.var, D=self.D, N=self.N)
+        self.gauss = GPy.likelihoods.Gaussian(variance=self.var)
 
         dlogpdf_df = functools.partial(self.gauss.dlogpdf_df, y=self.Y)
         d2logpdf_df2 = functools.partial(self.gauss.d2logpdf_df2, y=self.Y)
@@ -607,21 +624,23 @@ class LaplaceTests(unittest.TestCase):
         kernel1 = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
         kernel2 = kernel1.copy()
 
-        m1 = GPy.models.GPRegression(X, Y.copy(), kernel=kernel1)
-        m1.constrain_fixed('white', 1e-6)
-        m1['noise'] = initial_var_guess
-        m1.constrain_bounded('noise', 1e-4, 10)
-        m1.constrain_bounded('rbf', 1e-4, 10)
+        gauss_distr1 = GPy.likelihoods.Gaussian(variance=initial_var_guess)
+        exact_inf = GPy.inference.latent_function_inference.ExactGaussianInference()
+        m1 = GPy.core.GP(X, Y.copy(), kernel=kernel1, likelihood=gauss_distr1, inference_method=exact_inf)
+        m1['white'].constrain_fixed(1e-6)
+        m1['variance'] = initial_var_guess
+        m1['variance'].constrain_bounded(1e-4, 10)
+        m1['rbf'].constrain_bounded(1e-4, 10)
         m1.ensure_default_constraints()
         m1.randomize()
 
-        gauss_distr = GPy.likelihoods.gaussian(variance=initial_var_guess, D=1, N=Y.shape[0])
-        laplace_likelihood = GPy.likelihoods.Laplace(Y.copy(), gauss_distr)
-        m2 = GPy.models.GPRegression(X, Y.copy(), kernel=kernel2, likelihood=laplace_likelihood)
+        gauss_distr2 = GPy.likelihoods.Gaussian(variance=initial_var_guess)
+        laplace_inf = GPy.inference.latent_function_inference.LaplaceInference()
+        m2 = GPy.core.GP(X, Y.copy(), kernel=kernel2, likelihood=gauss_distr2, inference_method=laplace_inf)
         m2.ensure_default_constraints()
-        m2.constrain_fixed('white', 1e-6)
-        m2.constrain_bounded('rbf', 1e-4, 10)
-        m2.constrain_bounded('noise', 1e-4, 10)
+        m2['white'].constrain_fixed(1e-6)
+        m2['rbf'].constrain_bounded(1e-4, 10)
+        m2['variance'].constrain_bounded(1e-4, 10)
         m2.randomize()
 
         if debug:

From 0f263d2ff29051a8e7279ceafd2e99057b3db0fb Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Mon, 10 Feb 2014 12:28:24 +0000
Subject: [PATCH 26/43] Have most of the likelihood testing working, laplace
 likelihood parameters need fixing, some of the signs are wrong I believe

---
 GPy/core/parameterization/parameterized.py    |  13 ++-
 .../latent_function_inference/laplace.py      | 109 +++++++++++-------
 GPy/likelihoods/likelihood.py                 |  15 ++-
 GPy/likelihoods/student_t.py                  |  10 +-
 GPy/testing/likelihood_tests.py               |  46 ++++----
 5 files changed, 122 insertions(+), 71 deletions(-)

diff --git a/GPy/core/parameterization/parameterized.py b/GPy/core/parameterization/parameterized.py
index 556fdbe0..fe51e34c 100644
--- a/GPy/core/parameterization/parameterized.py
+++ b/GPy/core/parameterization/parameterized.py
@@ -256,6 +256,16 @@ class Parameterized(Constrainable, Pickleable, Observable):
             cPickle.dump(self, f, protocol)
     def copy(self):
         """Returns a (deep) copy of the current model """
+        #dc = dict()
+        #for k, v in self.__dict__.iteritems():
+            #if k not in ['_highest_parent_', '_direct_parent_']:
+                #dc[k] = copy.deepcopy(v)
+
+        #dc = copy.deepcopy(self.__dict__)
+        #dc['_highest_parent_'] = None
+        #dc['_direct_parent_'] = None
+        #s = self.__class__.new()
+        #s.__dict__ = dc
         return copy.deepcopy(self)
     def __getstate__(self):
         if self._has_get_set_state():
@@ -419,6 +429,8 @@ class Parameterized(Constrainable, Pickleable, Observable):
     #===========================================================================
     # Convenience for fixed, tied checking of param:
     #===========================================================================
+    def fixed_indices(self):
+        return np.array([x.is_fixed for x in self._parameters_])
     def _is_fixed(self, param):
         # returns if the whole param is fixed
         if not self._has_fixes():
@@ -449,7 +461,6 @@ class Parameterized(Constrainable, Pickleable, Observable):
         # if removing constraints before adding new is not wanted, just delete the above line!
         self.constraints.add(transform, rav_i)
         param = self._get_original(param)
-        #FIXME: Max, is this the right thing to do to handle fixed?
         if not (transform == __fixed__):
             param._set_params(transform.initialize(param._get_params()), update=False)
         if warning and any(reconstrained):
diff --git a/GPy/inference/latent_function_inference/laplace.py b/GPy/inference/latent_function_inference/laplace.py
index 6fce94fc..2185aca1 100644
--- a/GPy/inference/latent_function_inference/laplace.py
+++ b/GPy/inference/latent_function_inference/laplace.py
@@ -32,6 +32,7 @@ class LaplaceInference(object):
         self._mode_finding_tolerance = 1e-7
         self._mode_finding_max_iter = 40
         self.bad_fhat = True
+        self._previous_Ki_fhat = None
 
     def inference(self, kern, X, likelihood, Y, Y_metadata=None):
         """
@@ -53,14 +54,13 @@ class LaplaceInference(object):
         f_hat, Ki_fhat = self.rasm_mode(K, Y, likelihood, Ki_f_init, Y_metadata=Y_metadata)
 
         #Compute hessian and other variables at mode
-        log_marginal, Ki_W_i, K_Wi_i, dL_dK, woodbury_vector = self.mode_computations(f_hat, Ki_fhat, K, Y, likelihood, Y_metadata)
+        log_marginal, woodbury_vector, woodbury_inv, dL_dK, dL_dthetaL = self.mode_computations(f_hat, Ki_fhat, K, Y, likelihood, kern, Y_metadata)
 
-        #likelihood.gradient = self.likelihood_gradients()
         kern.update_gradients_full(dL_dK, X)
-        likelihood.update_gradients(np.ones(10))
+        likelihood.update_gradients(dL_dthetaL)
 
         self._previous_Ki_fhat = Ki_fhat.copy()
-        return Posterior(woodbury_vector=woodbury_vector, woodbury_inv = K_Wi_i, K=K), log_marginal, {'dL_dK':dL_dK}
+        return Posterior(woodbury_vector=woodbury_vector, woodbury_inv=woodbury_inv, K=K), log_marginal, {'dL_dK':dL_dK}
 
     def rasm_mode(self, K, Y, likelihood, Ki_f_init, Y_metadata=None):
         """
@@ -134,13 +134,15 @@ class LaplaceInference(object):
 
         return f, Ki_f
 
-
-    def mode_computations(self, f_hat, Ki_f, K, Y, likelihood, Y_metadata):
+    def mode_computations(self, f_hat, Ki_f, K, Y, likelihood, kern, Y_metadata):
         """
         At the mode, compute the hessian and effective covariance matrix.
 
         returns: logZ : approximation to the marginal likelihood
-        Cov : the approximation to the covariance matrix
+                 woodbury_vector : variable required for calculating the approximation to the covariance matrix
+                 woodbury_inv : variable required for calculating the approximation to the covariance matrix
+                 dL_dK : array of derivatives with respect to the kernel matrix K
+                 dL_dthetaL : array of derivatives (1 x num_likelihood_params)
         """
         #At this point get the hessian matrix (or vector as W is diagonal)
         W = -likelihood.d2logpdf_df2(f_hat, Y, extra_data=Y_metadata)
@@ -154,48 +156,75 @@ class LaplaceInference(object):
         #compute the log marginal
         log_marginal = -0.5*np.dot(Ki_f.flatten(), f_hat.flatten()) + likelihood.logpdf(f_hat, Y, extra_data=Y_metadata) - np.sum(np.log(np.diag(L)))
 
-        #compute dL_dK
-        explicit_part = 0.5*(np.dot(Ki_f, Ki_f.T) - K_Wi_i)
-
-        #Implicit
+        #Compute vival matrices for derivatives
         dW_df = likelihood.d3logpdf_df3(f_hat, Y, extra_data=Y_metadata) # d3lik_d3fhat
         woodbury_vector = likelihood.dlogpdf_df(f_hat, Y, extra_data=Y_metadata)
         dL_dfhat = 0.5*(np.diag(Ki_W_i)[:, None]*dW_df) #why isn't this -0.5? s2 in R&W p126 line 9.
-        #implicit_part = np.dot(woodbury_vector, dL_dfhat.T).dot(np.eye(Y.shape[0]) - np.dot(K, K_Wi_i))
-        BiK, _ = dpotrs(L, K, lower=1)
+        #BiK, _ = dpotrs(L, K, lower=1)
         #dL_dfhat = 0.5*np.diag(BiK)[:, None]*dW_df
-        implicit_part = np.dot(woodbury_vector, dL_dfhat.T).dot(np.eye(Y.shape[0]) - np.dot(K, K_Wi_i))
+        I_KW_i = np.eye(Y.shape[0]) - np.dot(K, K_Wi_i)
 
-        dL_dK = explicit_part + implicit_part
-
-        return log_marginal, Ki_W_i, K_Wi_i, dL_dK, woodbury_vector
-
-
-    def likelihood_gradients(self):
-        """
-        Gradients with respect to likelihood parameters (dL_dthetaL)
-
-        :rtype: array of derivatives (1 x num_likelihood_params)
-        """
-        dL_dfhat, I_KW_i = self._shared_gradients_components()
-        dlik_dthetaL, dlik_grad_dthetaL, dlik_hess_dthetaL = likelihood._laplace_gradients(self.f_hat, self.data, extra_data=self.extra_data)
-
-        num_params = len(self._get_param_names())
-        # make space for one derivative for each likelihood parameter
-        dL_dthetaL = np.zeros(num_params)
-        for thetaL_i in range(num_params):
+        ####################
+        #compute dL_dK#
+        ####################
+        if kern.size > 0 and not kern.is_fixed:
             #Explicit
-            dL_dthetaL_exp = ( np.sum(dlik_dthetaL[:, thetaL_i])
-                             #- 0.5*np.trace(mdot(self.Ki_W_i, (self.K, np.diagflat(dlik_hess_dthetaL[thetaL_i]))))
-                             + np.dot(0.5*np.diag(self.Ki_W_i)[:,None].T, dlik_hess_dthetaL[:, thetaL_i])
-                             )
+            explicit_part = 0.5*(np.dot(Ki_f, Ki_f.T) - K_Wi_i)
 
             #Implicit
-            dfhat_dthetaL = mdot(I_KW_i, self.K, dlik_grad_dthetaL[:, thetaL_i])
-            dL_dthetaL_imp = np.dot(dL_dfhat, dfhat_dthetaL)
-            dL_dthetaL[thetaL_i] = dL_dthetaL_exp + dL_dthetaL_imp
+            implicit_part = np.dot(woodbury_vector, dL_dfhat.T).dot(I_KW_i)
 
-        return dL_dthetaL
+            dL_dK = explicit_part + implicit_part
+        else:
+            dL_dK = np.zeros(likelihood.size)
+
+        ####################
+        #compute dL_dthetaL#
+        ####################
+        if likelihood.size > 0 and not likelihood.is_fixed:
+            dlik_dthetaL, dlik_grad_dthetaL, dlik_hess_dthetaL = likelihood._laplace_gradients(f_hat, Y, extra_data=Y_metadata)
+
+            num_params = likelihood.size
+            # make space for one derivative for each likelihood parameter
+            dL_dthetaL = np.zeros(num_params)
+            for thetaL_i in range(num_params):
+                #Explicit
+                dL_dthetaL_exp = ( + np.sum(dlik_dthetaL[thetaL_i])
+                                + 0.5*np.sum(np.diag(Ki_W_i).flatten()*dlik_hess_dthetaL[:, thetaL_i].flatten())
+                                #- 0.5*np.trace(np.diag(Ki_W_i)[:,None]*dlik_hess_dthetaL[:, thetaL_i])
+                                #+ 0.5*np.trace(np.dot(I_KW_i, K)*dlik_hess_dthetaL[:, thetaL_i])
+                                )
+
+                #Implicit
+                dfhat_dthetaL = mdot(I_KW_i, K, dlik_grad_dthetaL[:, thetaL_i])
+                #dfhat_dthetaL = mdot(Wi_K_i, dlik_grad_dthetaL[:, thetaL_i])
+                dL_dthetaL_imp = np.dot(dL_dfhat.T, dfhat_dthetaL)
+                #import pylab as pb
+                #pb.figure(1)
+                #pb.matshow(Ki_W_i)
+                #pb.title('I_KW_i approx')
+                #pb.colorbar()
+                #pb.figure(2)
+                #pb.matshow(np.linalg.inv(np.dot(np.eye(Y.shape[0]) + np.sqrt(W).T*K*np.sqrt(W), K)))
+                #pb.title('I_KW_i')
+                #pb.colorbar()
+                #print likelihood
+                #pb.show()
+                #import ipdb; ipdb.set_trace()  # XXX BREAKPOINT
+                dL_dthetaL[thetaL_i] = dL_dthetaL_exp + dL_dthetaL_imp
+
+        else:
+            dL_dthetaL = np.zeros(likelihood.size)
+
+        return log_marginal, woodbury_vector, K_Wi_i, dL_dK, dL_dthetaL
+
+
+    #def likelihood_gradients(self, f_hat, K, Y, Ki_W_i, dL_dfhat, I_KW_i, likelihood, Y_metadata):
+        #"""
+        #Gradients with respect to likelihood parameters (dL_dthetaL)
+
+        #:rtype: array of derivatives (1 x num_likelihood_params)
+        #"""
 
     def _compute_B_statistics(self, K, W, log_concave):
         """
diff --git a/GPy/likelihoods/likelihood.py b/GPy/likelihoods/likelihood.py
index d9a7e109..701a5a2f 100644
--- a/GPy/likelihoods/likelihood.py
+++ b/GPy/likelihoods/likelihood.py
@@ -312,7 +312,7 @@ class Likelihood(Parameterized):
             return self.dlogpdf_link_dtheta(link_f, y, extra_data=extra_data)
         else:
             #Is no parameters so return an empty array for its derivatives
-            return np.empty([1, 0])
+            return np.zeros([1, 0])
 
     def dlogpdf_df_dtheta(self, f, y, extra_data=None):
         """
@@ -325,7 +325,7 @@ class Likelihood(Parameterized):
             return chain_1(dlogpdf_dlink_dtheta, dlink_df)
         else:
             #Is no parameters so return an empty array for its derivatives
-            return np.empty([f.shape[0], 0])
+            return np.zeros([f.shape[0], 0])
 
     def d2logpdf_df2_dtheta(self, f, y, extra_data=None):
         """
@@ -340,7 +340,7 @@ class Likelihood(Parameterized):
             return chain_2(d2logpdf_dlink2_dtheta, dlink_df, dlogpdf_dlink_dtheta, d2link_df2)
         else:
             #Is no parameters so return an empty array for its derivatives
-            return np.empty([f.shape[0], 0])
+            return np.zeros([f.shape[0], 0])
 
     def _laplace_gradients(self, f, y, extra_data=None):
         dlogpdf_dtheta = self.dlogpdf_dtheta(f, y, extra_data=extra_data)
@@ -349,9 +349,12 @@ class Likelihood(Parameterized):
 
         #Parameters are stacked vertically. Must be listed in same order as 'get_param_names'
         # ensure we have gradients for every parameter we want to optimize
-        assert dlogpdf_dtheta.shape[1] == self.size
-        assert dlogpdf_df_dtheta.shape[1] == self.size
-        assert d2logpdf_df2_dtheta.shape[1] == self.size
+        try:
+            assert len(dlogpdf_dtheta) == self.size #1 x num_param array
+            assert dlogpdf_df_dtheta.shape[1] == self.size #f x num_param matrix
+            assert d2logpdf_df2_dtheta.shape[1] == self.size #f x num_param matrix
+        except Exception as e:
+            import ipdb; ipdb.set_trace()  # XXX BREAKPOINT
 
         return dlogpdf_dtheta, dlogpdf_df_dtheta, d2logpdf_df2_dtheta
 
diff --git a/GPy/likelihoods/student_t.py b/GPy/likelihoods/student_t.py
index b4e0dfc3..6347897e 100644
--- a/GPy/likelihoods/student_t.py
+++ b/GPy/likelihoods/student_t.py
@@ -30,6 +30,7 @@ class StudentT(Likelihood):
         self.v = Param('deg_free', float(deg_free))
         self.add_parameter(self.sigma2)
         self.add_parameter(self.v)
+        self.v.constrain_fixed()
 
         self.log_concave = False
 
@@ -226,15 +227,18 @@ class StudentT(Likelihood):
 
     def dlogpdf_link_dtheta(self, f, y, extra_data=None):
         dlogpdf_dvar = self.dlogpdf_link_dvar(f, y, extra_data=extra_data)
-        return np.asarray([[dlogpdf_dvar]])
+        dlogpdf_dv = np.zeros_like(dlogpdf_dvar) #FIXME: Not done yet
+        return np.hstack((dlogpdf_dvar, dlogpdf_dv))
 
     def dlogpdf_dlink_dtheta(self, f, y, extra_data=None):
         dlogpdf_dlink_dvar = self.dlogpdf_dlink_dvar(f, y, extra_data=extra_data)
-        return dlogpdf_dlink_dvar
+        dlogpdf_dlink_dv = np.zeros_like(dlogpdf_dlink_dvar) #FIXME: Not done yet
+        return np.hstack((dlogpdf_dlink_dvar, dlogpdf_dlink_dv))
 
     def d2logpdf_dlink2_dtheta(self, f, y, extra_data=None):
         d2logpdf_dlink2_dvar = self.d2logpdf_dlink2_dvar(f, y, extra_data=extra_data)
-        return d2logpdf_dlink2_dvar
+        d2logpdf_dlink2_dv = np.zeros_like(d2logpdf_dlink2_dvar) #FIXME: Not done yet
+        return np.hstack((d2logpdf_dlink2_dvar, d2logpdf_dlink2_dv))
 
     def _predictive_variance_analytical(self, mu, sigma, predictive_mean=None):
         """
diff --git a/GPy/testing/likelihood_tests.py b/GPy/testing/likelihood_tests.py
index c418a096..d344e23d 100644
--- a/GPy/testing/likelihood_tests.py
+++ b/GPy/testing/likelihood_tests.py
@@ -8,7 +8,7 @@ from GPy.likelihoods import link_functions
 from ..core.parameterization import Param
 from functools import partial
 #np.random.seed(300)
-np.random.seed(7)
+#np.random.seed(7)
 
 def dparam_partial(inst_func, *args):
     """
@@ -41,25 +41,27 @@ def dparam_checkgrad(func, dfunc, params, params_names, args, constraints=None,
     The number of parameters and N is the number of data
     Need to take a slice out from f and a slice out of df
     """
-    #print "\n{} likelihood: {} vs {}".format(func.im_self.__class__.__name__,
-                                           #func.__name__, dfunc.__name__)
+    print "\n{} likelihood: {} vs {}".format(func.im_self.__class__.__name__,
+                                           func.__name__, dfunc.__name__)
     partial_f = dparam_partial(func, *args)
     partial_df = dparam_partial(dfunc, *args)
     gradchecking = True
     zipped_params = zip(params, params_names)
-    for param_val, param_name in zipped_params:
-        fnum = np.atleast_1d(partial_f(param_val, param_name)).shape[0]
-        dfnum = np.atleast_1d(partial_df(param_val, param_name)).shape[0]
+    for param_ind, (param_val, param_name) in enumerate(zipped_params):
+        #Check one parameter at a time, make sure it is 2d (as some gradients only return arrays) then strip out the parameter
+        fnum = np.atleast_2d(partial_f(param_val, param_name))[:, param_ind].shape[0]
+        dfnum = np.atleast_2d(partial_df(param_val, param_name))[:, param_ind].shape[0]
         for fixed_val in range(dfnum):
             #dlik and dlik_dvar gives back 1 value for each
             f_ind = min(fnum, fixed_val+1) - 1
             print "fnum: {} dfnum: {} f_ind: {} fixed_val: {}".format(fnum, dfnum, f_ind, fixed_val)
             #Make grad checker with this param moving, note that set_params is NOT being called
             #The parameter is being set directly with __setattr__
-            grad = GradientChecker(lambda p_val: np.atleast_1d(partial_f(p_val, param_name))[f_ind],
-                                   lambda p_val: np.atleast_1d(partial_df(p_val, param_name))[fixed_val],
+            #Check only the parameter and function value we wish to check at a time
+            grad = GradientChecker(lambda p_val: np.atleast_2d(partial_f(p_val, param_name))[f_ind, param_ind],
+                                   lambda p_val: np.atleast_2d(partial_df(p_val, param_name))[fixed_val, param_ind],
                                    param_val, [param_name])
-            #This is not general for more than one param...
+
             if constraints is not None:
                 for constrain_param, constraint in constraints:
                     if grad.grep_param_names(constrain_param):
@@ -115,8 +117,8 @@ class TestNoiseModels(object):
         ####################################################
         # Constraint wrappers so we can just list them off #
         ####################################################
-        def constrain_fixed(regex, model, value):
-            model[regex].constrain_fixed(value)
+        def constrain_fixed(regex, model):
+            model[regex].constrain_fixed()
 
         def constrain_negative(regex, model):
             model[regex].constrain_negative()
@@ -149,7 +151,7 @@ class TestNoiseModels(object):
                             "grad_params": {
                                 "names": ["t_noise"],
                                 "vals": [self.var],
-                                "constraints": [("t_noise", constrain_positive), ("deg_free", constrain_positive)]
+                                "constraints": [("t_noise", constrain_positive), ("deg_free", constrain_fixed)]
                                 #"constraints": [("t_noise", constrain_positive), ("deg_free", partial(constrain_fixed, value=5))]
                                 },
                             "laplace": True
@@ -159,7 +161,7 @@ class TestNoiseModels(object):
                             "grad_params": {
                                 "names": ["t_noise"],
                                 "vals": [1.0],
-                                "constraints": [("t_noise", constrain_positive), ("deg_free", constrain_positive)]
+                                "constraints": [("t_noise", constrain_positive), ("deg_free", constrain_fixed)]
                                 },
                             "laplace": True
                             },
@@ -168,7 +170,7 @@ class TestNoiseModels(object):
                             "grad_params": {
                                 "names": ["t_noise"],
                                 "vals": [0.01],
-                                "constraints": [("t_noise", constrain_positive), ("deg_free", constrain_positive)]
+                                "constraints": [("t_noise", constrain_positive), ("deg_free", constrain_fixed)]
                                 },
                             "laplace": True
                             },
@@ -177,7 +179,7 @@ class TestNoiseModels(object):
                             "grad_params": {
                                 "names": ["t_noise"],
                                 "vals": [10.0],
-                                "constraints": [("t_noise", constrain_positive), ("deg_free", constrain_positive)]
+                                "constraints": [("t_noise", constrain_positive), ("deg_free", constrain_fixed)]
                                 },
                             "laplace": True
                             },
@@ -186,7 +188,7 @@ class TestNoiseModels(object):
                             "grad_params": {
                                 "names": ["t_noise"],
                                 "vals": [self.var],
-                                "constraints": [("t_noise", constrain_positive), ("deg_free", constrain_positive)]
+                                "constraints": [("t_noise", constrain_positive), ("deg_free", constrain_fixed)]
                                 },
                             "laplace": True
                             },
@@ -195,7 +197,7 @@ class TestNoiseModels(object):
                             "grad_params": {
                                 "names": ["t_noise"],
                                 "vals": [self.var],
-                                "constraints": [("t_noise", constrain_positive), ("deg_free", constrain_positive)]
+                                "constraints": [("t_noise", constrain_positive), ("deg_free", constrain_fixed)]
                                 },
                             "laplace": True
                             },
@@ -542,8 +544,8 @@ class TestNoiseModels(object):
         Y = Y/Y.max()
         white_var = 1e-6
         kernel = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
-        ep_likelihood = GPy.likelihoods.EP(Y.copy(), model)
-        m = GPy.core.GP(X.copy(), Y.copy(), kernel, likelihood=ep_likelihood)
+        ep_inf = GPy.inference.latent_function_inference.EP()
+        m = GPy.core.GP(X.copy(), Y.copy(), kernel=kernel, likelihood=model, inference_method=ep_inf)
         m.ensure_default_constraints()
         m['white'].constrain_fixed(white_var)
 
@@ -622,7 +624,9 @@ class LaplaceTests(unittest.TestCase):
         #Yc = Y.copy()
         #Yc[75:80] += 1
         kernel1 = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
-        kernel2 = kernel1.copy()
+        #FIXME: Make sure you can copy kernels when params is fixed
+        #kernel2 = kernel1.copy()
+        kernel2 = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
 
         gauss_distr1 = GPy.likelihoods.Gaussian(variance=initial_var_guess)
         exact_inf = GPy.inference.latent_function_inference.ExactGaussianInference()
@@ -686,7 +690,7 @@ class LaplaceTests(unittest.TestCase):
 
 
         #Check Y's are the same
-        np.testing.assert_almost_equal(Y, m2.likelihood.Y, decimal=5)
+        np.testing.assert_almost_equal(m1.Y, m2.Y, decimal=5)
         #Check marginals are the same
         np.testing.assert_almost_equal(m1.log_likelihood(), m2.log_likelihood(), decimal=2)
         #Check marginals are the same with random

From fc44478ed23d51117f51458a15b366a18ce8a788 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Mon, 10 Feb 2014 12:29:09 +0000
Subject: [PATCH 27/43] Most of the likelihood testing is working; Laplace
 likelihood parameters still need fixing (some of the signs are wrong, I believe)

---
 GPy/inference/latent_function_inference/laplace.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/GPy/inference/latent_function_inference/laplace.py b/GPy/inference/latent_function_inference/laplace.py
index 2185aca1..26d3a538 100644
--- a/GPy/inference/latent_function_inference/laplace.py
+++ b/GPy/inference/latent_function_inference/laplace.py
@@ -159,7 +159,7 @@ class LaplaceInference(object):
         #Compute vival matrices for derivatives
         dW_df = likelihood.d3logpdf_df3(f_hat, Y, extra_data=Y_metadata) # d3lik_d3fhat
         woodbury_vector = likelihood.dlogpdf_df(f_hat, Y, extra_data=Y_metadata)
-        dL_dfhat = 0.5*(np.diag(Ki_W_i)[:, None]*dW_df) #why isn't this -0.5? s2 in R&W p126 line 9.
+        dL_dfhat = -0.5*(np.diag(Ki_W_i)[:, None]*dW_df) #why isn't this -0.5? s2 in R&W p126 line 9.
         #BiK, _ = dpotrs(L, K, lower=1)
         #dL_dfhat = 0.5*np.diag(BiK)[:, None]*dW_df
         I_KW_i = np.eye(Y.shape[0]) - np.dot(K, K_Wi_i)
@@ -172,7 +172,7 @@ class LaplaceInference(object):
             explicit_part = 0.5*(np.dot(Ki_f, Ki_f.T) - K_Wi_i)
 
             #Implicit
-            implicit_part = np.dot(woodbury_vector, dL_dfhat.T).dot(I_KW_i)
+            implicit_part = -np.dot(woodbury_vector, dL_dfhat.T).dot(I_KW_i)
 
             dL_dK = explicit_part + implicit_part
         else:

From d2a0e4a2658597e1fb682774c3f6cd39fd8ae3a9 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Mon, 10 Feb 2014 12:54:49 +0000
Subject: [PATCH 28/43] Stupid error: StudentT.update_gradients needed to
 actually USE the computed gradients. Also, s2 in Rasmussen & Williams may
 have an extra minus sign: dW_df == -d3logpdf_df3, not just d3logpdf_df3?

---
 .../latent_function_inference/laplace.py      | 23 ++++---------------
 GPy/likelihoods/student_t.py                  |  8 +++++--
 2 files changed, 11 insertions(+), 20 deletions(-)

diff --git a/GPy/inference/latent_function_inference/laplace.py b/GPy/inference/latent_function_inference/laplace.py
index 26d3a538..82313eab 100644
--- a/GPy/inference/latent_function_inference/laplace.py
+++ b/GPy/inference/latent_function_inference/laplace.py
@@ -157,7 +157,7 @@ class LaplaceInference(object):
         log_marginal = -0.5*np.dot(Ki_f.flatten(), f_hat.flatten()) + likelihood.logpdf(f_hat, Y, extra_data=Y_metadata) - np.sum(np.log(np.diag(L)))
 
         #Compute vival matrices for derivatives
-        dW_df = likelihood.d3logpdf_df3(f_hat, Y, extra_data=Y_metadata) # d3lik_d3fhat
+        dW_df = -likelihood.d3logpdf_df3(f_hat, Y, extra_data=Y_metadata) # -d3lik_d3fhat
         woodbury_vector = likelihood.dlogpdf_df(f_hat, Y, extra_data=Y_metadata)
         dL_dfhat = -0.5*(np.diag(Ki_W_i)[:, None]*dW_df) #why isn't this -0.5? s2 in R&W p126 line 9.
         #BiK, _ = dpotrs(L, K, lower=1)
@@ -172,7 +172,7 @@ class LaplaceInference(object):
             explicit_part = 0.5*(np.dot(Ki_f, Ki_f.T) - K_Wi_i)
 
             #Implicit
-            implicit_part = -np.dot(woodbury_vector, dL_dfhat.T).dot(I_KW_i)
+            implicit_part = np.dot(woodbury_vector, dL_dfhat.T).dot(I_KW_i)
 
             dL_dK = explicit_part + implicit_part
         else:
@@ -189,28 +189,15 @@ class LaplaceInference(object):
             dL_dthetaL = np.zeros(num_params)
             for thetaL_i in range(num_params):
                 #Explicit
-                dL_dthetaL_exp = ( + np.sum(dlik_dthetaL[thetaL_i])
+                dL_dthetaL_exp = ( np.sum(dlik_dthetaL[thetaL_i])
+                                # The + comes from the fact that dlik_hess_dthetaL == -dW_dthetaL
                                 + 0.5*np.sum(np.diag(Ki_W_i).flatten()*dlik_hess_dthetaL[:, thetaL_i].flatten())
-                                #- 0.5*np.trace(np.diag(Ki_W_i)[:,None]*dlik_hess_dthetaL[:, thetaL_i])
-                                #+ 0.5*np.trace(np.dot(I_KW_i, K)*dlik_hess_dthetaL[:, thetaL_i])
                                 )
 
                 #Implicit
                 dfhat_dthetaL = mdot(I_KW_i, K, dlik_grad_dthetaL[:, thetaL_i])
-                #dfhat_dthetaL = mdot(Wi_K_i, dlik_grad_dthetaL[:, thetaL_i])
+                #dfhat_dthetaL = mdot(Ki_W_i, dlik_grad_dthetaL[:, thetaL_i])
                 dL_dthetaL_imp = np.dot(dL_dfhat.T, dfhat_dthetaL)
-                #import pylab as pb
-                #pb.figure(1)
-                #pb.matshow(Ki_W_i)
-                #pb.title('I_KW_i approx')
-                #pb.colorbar()
-                #pb.figure(2)
-                #pb.matshow(np.linalg.inv(np.dot(np.eye(Y.shape[0]) + np.sqrt(W).T*K*np.sqrt(W), K)))
-                #pb.title('I_KW_i')
-                #pb.colorbar()
-                #print likelihood
-                #pb.show()
-                #import ipdb; ipdb.set_trace()  # XXX BREAKPOINT
                 dL_dthetaL[thetaL_i] = dL_dthetaL_exp + dL_dthetaL_imp
 
         else:
diff --git a/GPy/likelihoods/student_t.py b/GPy/likelihoods/student_t.py
index 6347897e..e815a399 100644
--- a/GPy/likelihoods/student_t.py
+++ b/GPy/likelihoods/student_t.py
@@ -38,8 +38,12 @@ class StudentT(Likelihood):
         self.variance = (self.v / float(self.v - 2)) * self.sigma2
 
     def update_gradients(self, partial):
-        self.sigma2.gradient = np.ones(1) #FIXME: Not done yet
-        self.v.gradient = np.ones(1) #FIXME: Not done yet
+        """
+        Pull out the gradients, be careful as the order must match the order
+        in which the parameters are added
+        """
+        self.sigma2.gradient = partial[0]
+        self.v.gradient = partial[1]
 
     def pdf_link(self, link_f, y, extra_data=None):
         """

From e0c68d5eb3857f887898be75d5fee5f90313e03f Mon Sep 17 00:00:00 2001
From: Max Zwiessele <ibinbei@gmail.com>
Date: Mon, 10 Feb 2014 15:12:49 +0000
Subject: [PATCH 29/43] _highest_parent_ now follows the tree, dK_dX >
 gradient_X, added update_grads_variational to linear, bgplvm for new
 framework

---
 GPy/core/model.py                             |  13 +-
 GPy/core/parameterization/array_core.py       |  37 +++--
 GPy/core/parameterization/index_operations.py |  64 --------
 GPy/core/parameterization/param.py            | 139 +++++++++---------
 GPy/core/parameterization/parameter_core.py   |  15 +-
 GPy/core/parameterization/parameterized.py    |  18 +--
 GPy/core/parameterization/variational.py      |   5 +-
 GPy/examples/dimensionality_reduction.py      |  34 ++---
 .../latent_function_inference/fitc.py         |   8 +-
 .../latent_function_inference/varDTC.py       |   2 +-
 GPy/kern/kern.py                              |  11 +-
 GPy/kern/parts/Brownian.py                    |   2 +-
 GPy/kern/parts/Matern32.py                    |   6 +-
 GPy/kern/parts/Matern52.py                    |   6 +-
 GPy/kern/parts/eq_ode1.py                     |   2 +-
 GPy/kern/parts/exponential.py                 |   6 +-
 GPy/kern/parts/fixed.py                       |   2 +-
 GPy/kern/parts/gibbs.py                       |   6 +-
 GPy/kern/parts/hetero.py                      |   2 +-
 GPy/kern/parts/hierarchical.py                |   4 +-
 GPy/kern/parts/independent_outputs.py         |   4 +-
 GPy/kern/parts/kernpart.py                    |   3 +-
 GPy/kern/parts/linear.py                      |  27 +++-
 GPy/kern/parts/mlp.py                         |   2 +-
 GPy/kern/parts/poly.py                        |   2 +-
 GPy/kern/parts/prod.py                        |  20 +--
 GPy/kern/parts/prod_orthogonal.py             |  10 +-
 GPy/kern/parts/rational_quadratic.py          |   2 +-
 GPy/kern/parts/rbf.py                         |  13 +-
 GPy/kern/parts/rbf_inv.py                     |   6 +-
 GPy/kern/parts/rbfcos.py                      |   2 +-
 GPy/kern/parts/ss_rbf.py                      |   4 +-
 GPy/kern/parts/symmetric.py                   |  10 +-
 GPy/kern/parts/sympykern.py                   |   2 +-
 GPy/mappings/kernel.py                        |   2 +-
 GPy/models/bayesian_gplvm.py                  |  55 ++++---
 GPy/models/bcgplvm.py                         |   2 +-
 GPy/models/gplvm.py                           |   2 +-
 GPy/models/sparse_gplvm.py                    |   2 +-
 GPy/plotting/matplot_dep/variational_plots.py |   3 +-
 GPy/util/caching.py                           |   5 +-
 41 files changed, 269 insertions(+), 291 deletions(-)

diff --git a/GPy/core/model.py b/GPy/core/model.py
index a1b2abe4..35403ba7 100644
--- a/GPy/core/model.py
+++ b/GPy/core/model.py
@@ -439,18 +439,19 @@ class Model(Parameterized):
                     print "No free parameters to check"
                     return
 
-
+            gradient = self.objective_function_gradients(x)
+            np.where(gradient==0, 1e-312, gradient)
+            
             for i in param_list:
                 xx = x.copy()
                 xx[i] += step
                 f1, g1 = self.objective_and_gradients(xx)
                 xx[i] -= 2.*step
                 f2, g2 = self.objective_and_gradients(xx)
-                gradient = self.objective_function_gradients(x)[i]
-
+                
                 numerical_gradient = (f1 - f2) / (2 * step)
-                ratio = (f1 - f2) / (2 * step * np.where(gradient==0, 1e-312, gradient))
-                difference = np.abs((f1 - f2) / 2 / step - gradient)
+                ratio = (f1 - f2) / (2 * step * gradient[i])
+                difference = np.abs((f1 - f2) / 2 / step - gradient[i])
 
                 if (np.abs(1. - ratio) < tolerance) or np.abs(difference) < tolerance:
                     formatted_name = "\033[92m {0} \033[0m".format(names[i])
@@ -458,7 +459,7 @@ class Model(Parameterized):
                     formatted_name = "\033[91m {0} \033[0m".format(names[i])
                 r = '%.6f' % float(ratio)
                 d = '%.6f' % float(difference)
-                g = '%.6f' % gradient
+                g = '%.6f' % gradient[i]
                 ng = '%.6f' % float(numerical_gradient)
                 grad_string = "{0:<{c0}}|{1:^{c1}}|{2:^{c2}}|{3:^{c3}}|{4:^{c4}}".format(formatted_name, r, d, g, ng, c0=cols[0] + 9, c1=cols[1], c2=cols[2], c3=cols[3], c4=cols[4])
                 print grad_string
diff --git a/GPy/core/parameterization/array_core.py b/GPy/core/parameterization/array_core.py
index 4c31f23b..c95f3ce3 100644
--- a/GPy/core/parameterization/array_core.py
+++ b/GPy/core/parameterization/array_core.py
@@ -15,8 +15,18 @@ class ListArray(np.ndarray):
     def __new__(cls, input_array):
         obj = np.asanyarray(input_array).view(cls)
         return obj
-    def __eq__(self, other):
-        return other is self
+    #def __eq__(self, other):
+    #    return other is self
+
+class ParamList(list):
+
+    def __contains__(self, other):
+        for el in self:
+            if el is other:
+                return True
+        return False
+    
+    pass
 
 class ObservableArray(ListArray, Observable):
     """
@@ -36,16 +46,19 @@ class ObservableArray(ListArray, Observable):
         if obj is None: return
         self._observers_ = getattr(obj, '_observers_', None)
     def __setitem__(self, s, val, update=True):
-        if self.ndim:
-            if not np.all(np.equal(self[s], val)):
-                super(ObservableArray, self).__setitem__(s, val)
-                if update:
-                    self._notify_observers()
-        else:
-            if not np.all(np.equal(self, val)):
-                super(ObservableArray, self).__setitem__(Ellipsis, val)
-                if update:
-                    self._notify_observers()
+        super(ObservableArray, self).__setitem__(s, val)
+        if update:
+            self._notify_observers()
+#         if self.ndim:
+#             if not np.all(np.equal(self[s], val)):
+#                 super(ObservableArray, self).__setitem__(s, val)
+#                 if update:
+#                     self._notify_observers()
+#         else:
+#             if not np.all(np.equal(self, val)):
+#                 super(ObservableArray, self).__setitem__(Ellipsis, val)
+#                 if update:
+#                     self._notify_observers()
     def __getslice__(self, start, stop):
         return self.__getitem__(slice(start, stop))
     def __setslice__(self, start, stop, val):
diff --git a/GPy/core/parameterization/index_operations.py b/GPy/core/parameterization/index_operations.py
index 99b5a4de..d52211c5 100644
--- a/GPy/core/parameterization/index_operations.py
+++ b/GPy/core/parameterization/index_operations.py
@@ -90,11 +90,6 @@ class ParameterIndexOperations(object):
         return self._properties.values()
 
     def properties_for(self, index):
-#         already_seen = dict()
-#         for ni in index:
-#             if ni not in already_seen:
-#                 already_seen[ni] = [prop for prop in self.iter_properties() if ni in self._properties[prop]] 
-#             yield already_seen[ni]
         return vectorize(lambda i: [prop for prop in self.iter_properties() if i in self._properties[prop]], otypes=[list])(index)
         
     def add(self, prop, indices):
@@ -111,70 +106,11 @@ class ParameterIndexOperations(object):
                 self._properties[prop] = diff
             else:
                 del self._properties[prop]
-            #[self._reverse[i].remove(prop) for i in removed if prop in self._reverse[i]] 
             return removed.astype(int)
-#         else:
-#             for a in self.properties(): 
-#                 if numpy.all(a==prop) and a._parent_index_ == prop._parent_index_:
-#                     ind = create_raveled_indices(indices, shape, offset)
-#                     diff = remove_indices(self[a], ind)
-#                     removed = numpy.intersect1d(self[a], ind, True)
-#                     if not index_empty(diff):
-#                         self._properties[a] = diff
-#                     else:
-#                         del self._properties[a]
-#                     [self._reverse[i].remove(a) for i in removed if a in self._reverse[i]] 
-#                     return removed.astype(int)
         return numpy.array([]).astype(int)
     def __getitem__(self, prop):
         return self._properties[prop]
        
-# class TieIndexOperations(object):
-#     def __init__(self, params):
-#         self.params = params
-#         self.tied_from = ParameterIndexOperations()
-#         self.tied_to = ParameterIndexOperations()
-#     def add(self, tied_from, tied_to):
-#         rav_from = self.params._raveled_index_for(tied_from)
-#         rav_to = self.params._raveled_index_for(tied_to)
-#         self.tied_from.add(tied_to, rav_from)
-#         self.tied_to.add(tied_to, rav_to)
-#         return rav_from, rav_to
-#     def remove(self, tied_from, tied_to):
-#         rav_from = self.params._raveled_index_for(tied_from)
-#         rav_to = self.params._raveled_index_for(tied_to)
-#         rem_from = self.tied_from.remove(tied_to, rav_from)
-#         rem_to = self.tied_to.remove(tied_to, rav_to)
-#         left_from = self.tied_from._properties.pop(tied_to)
-#         left_to = self.tied_to._properties.pop(tied_to)
-#         self.tied_from[numpy.delete(tied_to, rem_from)] = left_from
-#         self.tied_to[numpy.delete(tied_to, rem_to)] = left_to
-#         return rav_from, rav_to
-#     def from_to_for(self, index):
-#         return self.tied_from.properties_for(index), self.tied_to.properties_for(index)
-#     def iter_from_to_indices(self):
-#         for k, f in self.tied_from.iteritems():
-#             yield f, self.tied_to[k]
-#     def iter_to_indices(self):
-#         return self.tied_to.iterindices()
-#     def iter_from_indices(self):
-#         return self.tied_from.iterindices()
-#     def iter_from_items(self):
-#         for f, i in self.tied_from.iteritems():
-#             yield f, i
-#     def iter_properties(self):
-#         return self.tied_from.iter_properties()
-#     def properties(self):
-#         return self.tied_from.properties()
-#     def from_to_indices(self, param):
-#         return self.tied_from[param], self.tied_to[param]
-#     
-# # def create_raveled_indices(index, shape, offset=0):
-# #     if isinstance(index, (tuple, list)): i = [slice(None)] + list(index)
-# #     else: i = [slice(None), index]
-# #     ind = numpy.array(numpy.ravel_multi_index(numpy.indices(shape)[i], shape)).flat + numpy.int_(offset)
-# #     return ind
-
 def combine_indices(arr1, arr2):
     return numpy.union1d(arr1, arr2)
 
diff --git a/GPy/core/parameterization/param.py b/GPy/core/parameterization/param.py
index 462210dc..c0dd3fea 100644
--- a/GPy/core/parameterization/param.py
+++ b/GPy/core/parameterization/param.py
@@ -4,19 +4,19 @@
 import itertools
 import numpy
 from parameter_core import Constrainable, adjust_name_for_printing
-from array_core import ObservableArray
+from array_core import ObservableArray, ParamList
 
 ###### printing
 __constraints_name__ = "Constraint"
 __index_name__ = "Index"
 __tie_name__ = "Tied to"
-__precision__ = numpy.get_printoptions()['precision'] # numpy printing precision used, sublassing numpy ndarray after all
+__precision__ = numpy.get_printoptions()['precision']  # numpy printing precision used, sublassing numpy ndarray after all
 __print_threshold__ = 5
 ######      
 
 class Float(numpy.float64, Constrainable):
     def __init__(self, f, base):
-        super(Float,self).__init__(f)
+        super(Float, self).__init__(f)
         self._base = base
         
         
@@ -50,7 +50,7 @@ class Param(ObservableArray, Constrainable):
     WARNING: This overrides the functionality of x==y!!!
     Use numpy.equal(x,y) for element-wise equality testing.
     """
-    __array_priority__ = 0 # Never give back Param
+    __array_priority__ = 0  # Never give back Param
     _fixes_ = None
     def __new__(cls, name, input_array, *args, **kwargs):
         obj = numpy.atleast_1d(super(Param, cls).__new__(cls, input_array=input_array))
@@ -75,7 +75,6 @@ class Param(ObservableArray, Constrainable):
         super(Param, self).__array_finalize__(obj)
         self._direct_parent_ = getattr(obj, '_direct_parent_', None)
         self._parent_index_ = getattr(obj, '_parent_index_', None)
-        self._highest_parent_ = getattr(obj, '_highest_parent_', None)
         self._current_slice_ = getattr(obj, '_current_slice_', None)
         self._tied_to_me_ = getattr(obj, '_tied_to_me_', None)
         self._tied_to_ = getattr(obj, '_tied_to_', None)
@@ -94,11 +93,10 @@ class Param(ObservableArray, Constrainable):
     #===========================================================================
     def __reduce_ex__(self):
         func, args, state = super(Param, self).__reduce__()
-        return func, args, (state, 
+        return func, args, (state,
                             (self.name,
                              self._direct_parent_,
                              self._parent_index_,
-                             self._highest_parent_,
                              self._current_slice_,
                              self._realshape_,
                              self._realsize_,
@@ -119,7 +117,6 @@ class Param(ObservableArray, Constrainable):
         self._realsize_ = state.pop()
         self._realshape_ = state.pop()
         self._current_slice_ = state.pop()
-        self._highest_parent_ = state.pop()
         self._parent_index_ = state.pop()
         self._direct_parent_ = state.pop()
         self.name = state.pop()
@@ -153,8 +150,6 @@ class Param(ObservableArray, Constrainable):
     @property
     def _parameters_(self):
         return []
-    def _connect_highest_parent(self, highest_parent):
-        self._highest_parent_ = highest_parent
     def _collect_gradient(self, target):
         target[:] = self.gradient.flat
     #===========================================================================
@@ -166,7 +161,7 @@ class Param(ObservableArray, Constrainable):
         
         :param warning: print a warning for overwriting constraints.
         """
-        self._highest_parent_._fix(self,warning)
+        self._highest_parent_._fix(self, warning)
     fix = constrain_fixed
     def unconstrain_fixed(self):
         """
@@ -190,19 +185,19 @@ class Param(ObservableArray, Constrainable):
         Note: For now only one parameter can have ties, so all of a parameter
               will be removed, when re-tieing!
         """
-        #Note: this method will tie to the parameter which is the last in 
+        # Note: this method will tie to the parameter which is the last in 
         #      the chain of ties. Thus, if you tie to a tied parameter,
         #      this tie will be created to the parameter the param is tied
         #      to.
 
-        assert isinstance(param, Param), "Argument {1} not of type {0}".format(Param,param.__class__)
+        assert isinstance(param, Param), "Argument {1} not of type {0}".format(Param, param.__class__)
         param = numpy.atleast_1d(param)
         if param.size != 1:
             raise NotImplementedError, "Broadcast tying is not implemented yet"
         try:
             if self._original_: 
                 self[:] = param
-            else: # this happens when indexing created a copy of the array
+            else:  # this happens when indexing created a copy of the array
                 self._direct_parent_._get_original(self)[self._current_slice_] = param
         except ValueError:
             raise ValueError("Trying to tie {} with shape {} to {} with shape {}".format(self.name, self.shape, param.name, param.shape))            
@@ -248,7 +243,7 @@ class Param(ObservableArray, Constrainable):
                 t_rav_i = t._raveled_index()
                 tr_rav_i = tied_to_me._raveled_index()
                 new_index = list(set(t_rav_i) | set(tr_rav_i))
-                tmp = t._direct_parent_._get_original(t)[numpy.unravel_index(new_index,t._realshape_)]
+                tmp = t._direct_parent_._get_original(t)[numpy.unravel_index(new_index, t._realshape_)]
                 self._tied_to_me_[tmp] = self._tied_to_me_[t] | set(self._raveled_index())
                 del self._tied_to_me_[t]
                 return
@@ -261,7 +256,7 @@ class Param(ObservableArray, Constrainable):
                 import ipdb;ipdb.set_trace()
                 new_index = list(set(t_rav_i) - set(tr_rav_i))
                 if new_index:
-                    tmp = t._direct_parent_._get_original(t)[numpy.unravel_index(new_index,t._realshape_)]
+                    tmp = t._direct_parent_._get_original(t)[numpy.unravel_index(new_index, t._realshape_)]
                     self._tied_to_me_[tmp] = self._tied_to_me_[t]
                     del self._tied_to_me_[t]
                     if len(self._tied_to_me_[tmp]) == 0:
@@ -269,12 +264,12 @@ class Param(ObservableArray, Constrainable):
                 else:
                     del self._tied_to_me_[t]
     def _on_tied_parameter_changed(self, val, ind):
-        if not self._updated_: #not fast_array_equal(self, val[ind]):
+        if not self._updated_:  # not fast_array_equal(self, val[ind]):
             val = numpy.atleast_1d(val)
             self._updated_ = True
             if self._original_:
                 self.__setitem__(slice(None), val[ind], update=False)
-            else: # this happens when indexing created a copy of the array
+            else:  # this happens when indexing created a copy of the array
                 self._direct_parent_._get_original(self).__setitem__(self._current_slice_, val[ind], update=False)
             self._notify_tied_parameters()
             self._updated_ = False
@@ -303,11 +298,11 @@ class Param(ObservableArray, Constrainable):
     def __getitem__(self, s, *args, **kwargs):
         if not isinstance(s, tuple):
             s = (s,)
-        if not reduce(lambda a,b: a or numpy.any(b is Ellipsis), s, False) and len(s) <= self.ndim:
+        if not reduce(lambda a, b: a or numpy.any(b is Ellipsis), s, False) and len(s) <= self.ndim:
             s += (Ellipsis,)
         new_arr = super(Param, self).__getitem__(s, *args, **kwargs)
         try: new_arr._current_slice_ = s; new_arr._original_ = self.base is new_arr.base
-        except AttributeError: pass# returning 0d array or float, double etc
+        except AttributeError: pass  # returning 0d array or float, double etc
         return new_arr
     def __setitem__(self, s, val, update=True):
         super(Param, self).__setitem__(s, val, update=update)
@@ -325,11 +320,11 @@ class Param(ObservableArray, Constrainable):
                 continue
             if isinstance(si, slice):
                 a = si.indices(self._realshape_[i])[0] 
-            elif isinstance(si, (list,numpy.ndarray,tuple)):
+            elif isinstance(si, (list, numpy.ndarray, tuple)):
                 a = si[0]
             else: a = si
-            if a<0:
-                a = self._realshape_[i]+a
+            if a < 0:
+                a = self._realshape_[i] + a
             internal_offset += a * extended_realshape[i]
         return internal_offset
     def _raveled_index(self, slice_index=None):
@@ -337,8 +332,8 @@ class Param(ObservableArray, Constrainable):
         # of this object
         extended_realshape = numpy.cumprod((1,) + self._realshape_[:0:-1])[::-1]
         ind = self._indices(slice_index)
-        if ind.ndim < 2: ind=ind[:,None]
-        return numpy.asarray(numpy.apply_along_axis(lambda x: numpy.sum(extended_realshape*x), 1, ind), dtype=int)
+        if ind.ndim < 2: ind = ind[:, None]
+        return numpy.asarray(numpy.apply_along_axis(lambda x: numpy.sum(extended_realshape * x), 1, ind), dtype=int)
     def _expand_index(self, slice_index=None):
         # this calculates the full indexing arrays from the slicing objects given by get_item for _real..._ attributes
         # it basically translates slices to their respective index arrays and turns negative indices around
@@ -351,11 +346,11 @@ class Param(ObservableArray, Constrainable):
                 if isinstance(a, slice):
                     start, stop, step = a.indices(b)
                     return numpy.r_[start:stop:step]
-                elif isinstance(a, (list,numpy.ndarray,tuple)):
+                elif isinstance(a, (list, numpy.ndarray, tuple)):
                     a = numpy.asarray(a, dtype=int)
-                    a[a<0] = b + a[a<0]
-                elif a<0:
-                    a = b+a
+                    a[a < 0] = b + a[a < 0]
+                elif a < 0:
+                    a = b + a
                 return numpy.r_[a]
             return numpy.r_[:b]
         return itertools.imap(f, itertools.izip_longest(slice_index[:self._realndim_], self._realshape_, fillvalue=slice(self.size)))
@@ -379,7 +374,7 @@ class Param(ObservableArray, Constrainable):
     #===========================================================================
     @property
     def _description_str(self):
-        if self.size <= 1: return ["%f"%self]
+        if self.size <= 1: return ["%f" % self]
         else: return [str(self.shape)]
     def _parameter_names(self, add_name):
         return [self.name]
@@ -391,31 +386,31 @@ class Param(ObservableArray, Constrainable):
         return [self.shape]
     @property
     def _constraints_str(self):
-        return [' '.join(map(lambda c: str(c[0]) if c[1].size==self._realsize_ else "{"+str(c[0])+"}", self._highest_parent_._constraints_iter_items(self)))]
+        return [' '.join(map(lambda c: str(c[0]) if c[1].size == self._realsize_ else "{" + str(c[0]) + "}", self._highest_parent_._constraints_iter_items(self)))]
     @property
     def _ties_str(self):
         return [t._short() for t in self._tied_to_] or ['']
     @property
     def name_hirarchical(self):
         if self.has_parent():
-            return self._direct_parent_.hirarchy_name()+adjust_name_for_printing(self.name)
+            return self._direct_parent_.hirarchy_name() + adjust_name_for_printing(self.name)
         return adjust_name_for_printing(self.name)
     def __repr__(self, *args, **kwargs):
         name = "\033[1m{x:s}\033[0;0m:\n".format(
                             x=self.name_hirarchical)
-        return name + super(Param, self).__repr__(*args,**kwargs)
+        return name + super(Param, self).__repr__(*args, **kwargs)
     def _ties_for(self, rav_index):
-        #size = sum(p.size for p in self._tied_to_)
+        # size = sum(p.size for p in self._tied_to_)
         ties = numpy.empty(shape=(len(self._tied_to_), numpy.size(rav_index)), dtype=Param)
         for i, tied_to in enumerate(self._tied_to_):
             for t, ind in tied_to._tied_to_me_.iteritems():
                 if t._parent_index_ == self._parent_index_:
-                    matches = numpy.where(rav_index[:,None] == t._raveled_index()[None, :])
+                    matches = numpy.where(rav_index[:, None] == t._raveled_index()[None, :])
                     tt_rav_index = tied_to._raveled_index()
                     ind_rav_matches = numpy.where(tt_rav_index == numpy.array(list(ind)))[0]
                     if len(ind) != 1: ties[i, matches[0][ind_rav_matches]] = numpy.take(tt_rav_index, matches[1], mode='wrap')[ind_rav_matches]
                     else: ties[i, matches[0]] = numpy.take(tt_rav_index, matches[1], mode='wrap')
-        return map(lambda a: sum(a,[]), zip(*[[[tie.flatten()] if tx!=None else [] for tx in t] for t,tie in zip(ties,self._tied_to_)]))
+        return map(lambda a: sum(a, []), zip(*[[[tie.flatten()] if tx != None else [] for tx in t] for t, tie in zip(ties, self._tied_to_)]))
     def _constraints_for(self, rav_index):
         return self._highest_parent_._constraints_for(self, rav_index)
     def _indices(self, slice_index=None):
@@ -425,12 +420,12 @@ class Param(ObservableArray, Constrainable):
         if isinstance(slice_index, (tuple, list)):
             clean_curr_slice = [s for s in slice_index if numpy.any(s != Ellipsis)]
             if (all(isinstance(n, (numpy.ndarray, list, tuple)) for n in clean_curr_slice) 
-                and len(set(map(len,clean_curr_slice))) <= 1):
+                and len(set(map(len, clean_curr_slice))) <= 1):
                 return numpy.fromiter(itertools.izip(*clean_curr_slice),
-                    dtype=[('',int)]*self._realndim_,count=len(clean_curr_slice[0])).view((int, self._realndim_))
+                    dtype=[('', int)] * self._realndim_, count=len(clean_curr_slice[0])).view((int, self._realndim_))
         expanded_index = list(self._expand_index(slice_index))
         return numpy.fromiter(itertools.product(*expanded_index),
-                 dtype=[('',int)]*self._realndim_,count=reduce(lambda a,b: a*b.size,expanded_index,1)).view((int, self._realndim_))
+                 dtype=[('', int)] * self._realndim_, count=reduce(lambda a, b: a * b.size, expanded_index, 1)).view((int, self._realndim_))
     def _max_len_names(self, gen, header):
         return reduce(lambda a, b:max(a, len(b)), gen, len(header))
     def _max_len_values(self):
@@ -443,9 +438,9 @@ class Param(ObservableArray, Constrainable):
         if self._realsize_ < 2:
             return name
         ind = self._indices()
-        if ind.size > 4: indstr = ','.join(map(str,ind[:2])) + "..." + ','.join(map(str,ind[-2:])) 
-        else: indstr = ','.join(map(str,ind))
-        return name+'['+indstr+']'
+        if ind.size > 4: indstr = ','.join(map(str, ind[:2])) + "..." + ','.join(map(str, ind[-2:])) 
+        else: indstr = ','.join(map(str, ind))
+        return name + '[' + indstr + ']'
     def __str__(self, constr_matrix=None, indices=None, ties=None, lc=None, lx=None, li=None, lt=None):
         filter_ = self._current_slice_
         vals = self.flat
@@ -458,10 +453,10 @@ class Param(ObservableArray, Constrainable):
         if lx is None: lx = self._max_len_values()
         if li is None: li = self._max_len_index(indices)
         if lt is None: lt = self._max_len_names(ties, __tie_name__)
-        header = "  {i:^{2}s}  |  \033[1m{x:^{1}s}\033[0;0m  |  {c:^{0}s}  |  {t:^{3}s}".format(lc,lx,li,lt, x=self.name_hirarchical, c=__constraints_name__, i=__index_name__, t=__tie_name__) # nice header for printing
+        header = "  {i:^{2}s}  |  \033[1m{x:^{1}s}\033[0;0m  |  {c:^{0}s}  |  {t:^{3}s}".format(lc, lx, li, lt, x=self.name_hirarchical, c=__constraints_name__, i=__index_name__, t=__tie_name__)  # nice header for printing
         if not ties: ties = itertools.cycle([''])
-        return "\n".join([header]+["  {i!s:^{3}s}  |  {x: >{1}.{2}g}  |  {c:^{0}s}  |  {t:^{4}s}  ".format(lc,lx,__precision__,li,lt, x=x, c=" ".join(map(str,c)), t=(t or ''), i=i) for i,x,c,t in itertools.izip(indices,vals,constr_matrix,ties)]) # return all the constraints with right indices
-        #except: return super(Param, self).__str__()
+        return "\n".join([header] + ["  {i!s:^{3}s}  |  {x: >{1}.{2}g}  |  {c:^{0}s}  |  {t:^{4}s}  ".format(lc, lx, __precision__, li, lt, x=x, c=" ".join(map(str, c)), t=(t or ''), i=i) for i, x, c, t in itertools.izip(indices, vals, constr_matrix, ties)])  # return all the constraints with right indices
+        # except: return super(Param, self).__str__()
 
 class ParamConcatenation(object):
     def __init__(self, params):
@@ -472,22 +467,22 @@ class ParamConcatenation(object):
 
         See :py:class:`GPy.core.parameter.Param` for more details on constraining.
         """
-        #self.params = params
-        self.params = []
+        # self.params = params
+        self.params = ParamList([])
         for p in params:
             for p in p.flattened_parameters:
                 if p not in self.params:
                     self.params.append(p)           
         self._param_sizes = [p.size for p in self.params]
         startstops = numpy.cumsum([0] + self._param_sizes)
-        self._param_slices_ = [slice(start, stop) for start,stop in zip(startstops, startstops[1:])]
+        self._param_slices_ = [slice(start, stop) for start, stop in zip(startstops, startstops[1:])]
     #===========================================================================
     # Get/set items, enable broadcasting
     #===========================================================================
     def __getitem__(self, s):
         ind = numpy.zeros(sum(self._param_sizes), dtype=bool); ind[s] = True; 
-        params = [p._get_params()[ind[ps]] for p,ps in zip(self.params, self._param_slices_) if numpy.any(p._get_params()[ind[ps]])]
-        if len(params)==1: return params[0]
+        params = [p._get_params()[ind[ps]] for p, ps in zip(self.params, self._param_slices_) if numpy.any(p._get_params()[ind[ps]])]
+        if len(params) == 1: return params[0]
         return ParamConcatenation(params)
     def __setitem__(self, s, val, update=True):
         ind = numpy.zeros(sum(self._param_sizes), dtype=bool); ind[s] = True; 
@@ -535,12 +530,12 @@ class ParamConcatenation(object):
     unconstrain_bounded.__doc__ = Param.unconstrain_bounded.__doc__
     def untie(self, *ties):
         [param.untie(*ties) for param in self.params]
-    __lt__ = lambda self, val: self._vals()<val
-    __le__ = lambda self, val: self._vals()<=val
-    __eq__ = lambda self, val: self._vals()==val
-    __ne__ = lambda self, val: self._vals()!=val
-    __gt__ = lambda self, val: self._vals()>val
-    __ge__ = lambda self, val: self._vals()>=val
+    __lt__ = lambda self, val: self._vals() < val
+    __le__ = lambda self, val: self._vals() <= val
+    __eq__ = lambda self, val: self._vals() == val
+    __ne__ = lambda self, val: self._vals() != val
+    __gt__ = lambda self, val: self._vals() > val
+    __ge__ = lambda self, val: self._vals() >= val
     def __str__(self, *args, **kwargs):
         def f(p):
             ind = p._raveled_index()
@@ -552,11 +547,11 @@ class ParamConcatenation(object):
         lx = max([p._max_len_values() for p in params])
         li = max([p._max_len_index(i) for p, i in itertools.izip(params, indices)])
         lt = max([p._max_len_names(tm, __tie_name__) for p, tm in itertools.izip(params, ties_matrices)])
-        strings = [p.__str__(cm, i, tm, lc, lx, li, lt) for p, cm, i, tm in itertools.izip(params,constr_matrices,indices,ties_matrices)]
+        strings = [p.__str__(cm, i, tm, lc, lx, li, lt) for p, cm, i, tm in itertools.izip(params, constr_matrices, indices, ties_matrices)]
         return "\n".join(strings)
-        return "\n{}\n".format(" -"+"- | -".join(['-'*l for l in [li,lx,lc,lt]])).join(strings)
+        return "\n{}\n".format(" -" + "- | -".join(['-' * l for l in [li, lx, lc, lt]])).join(strings)
     def __repr__(self):
-        return "\n".join(map(repr,self.params))
+        return "\n".join(map(repr, self.params))
     
 if __name__ == '__main__':
     
@@ -564,12 +559,12 @@ if __name__ == '__main__':
     from GPy.core.parameterized import Parameterized
     from GPy.core.parameter import Param
 
-    #X = numpy.random.randn(2,3,1,5,2,4,3)
-    X = numpy.random.randn(3,2)
+    # X = numpy.random.randn(2,3,1,5,2,4,3)
+    X = numpy.random.randn(3, 2)
     print "random done"
     p = Param("q_mean", X)
     p1 = Param("q_variance", numpy.random.rand(*p.shape))
-    p2 = Param("Y", numpy.random.randn(p.shape[0],1))
+    p2 = Param("Y", numpy.random.randn(p.shape[0], 1))
     
     p3 = Param("variance", numpy.random.rand())
     p4 = Param("lengthscale", numpy.random.rand(2))
@@ -577,19 +572,19 @@ if __name__ == '__main__':
     m = Parameterized()
     rbf = Parameterized(name='rbf')
     
-    rbf.add_parameter(p3,p4)
-    m.add_parameter(p,p1,rbf)
+    rbf.add_parameter(p3, p4)
+    m.add_parameter(p, p1, rbf)
     
     print "setting params"
-    #print m.q_v[3:5,[1,4,5]]
+    # print m.q_v[3:5,[1,4,5]]
     print "constraining variance"
-    #m[".*variance"].constrain_positive()
-    #print "constraining rbf"
-    #m.rbf_l.constrain_positive()
-    #m.q_variance[1,[0,5,11,19,2]].tie_to(m.rbf_v)
-    #m.rbf_v.tie_to(m.rbf_l[0])
-    #m.rbf_l[0].tie_to(m.rbf_l[1])
-    #m.q_v.tie_to(m.rbf_v)
+    # m[".*variance"].constrain_positive()
+    # print "constraining rbf"
+    # m.rbf_l.constrain_positive()
+    # m.q_variance[1,[0,5,11,19,2]].tie_to(m.rbf_v)
+    # m.rbf_v.tie_to(m.rbf_l[0])
+    # m.rbf_l[0].tie_to(m.rbf_l[1])
+    # m.q_v.tie_to(m.rbf_v)
 #     m.rbf_l.tie_to(m.rbf_va)
     # pt = numpy.array(params._get_params_transformed())
     # ptr = numpy.random.randn(*pt.shape)
diff --git a/GPy/core/parameterization/parameter_core.py b/GPy/core/parameterization/parameter_core.py
index a826b10c..a06211fe 100644
--- a/GPy/core/parameterization/parameter_core.py
+++ b/GPy/core/parameterization/parameter_core.py
@@ -48,19 +48,24 @@ class Pickleable(object):
 #===============================================================================
 
 class Parentable(object):
-    def __init__(self, direct_parent=None, highest_parent=None, parent_index=None):
+    def __init__(self, direct_parent=None, parent_index=None):
         super(Parentable,self).__init__()        
         self._direct_parent_ = direct_parent
         self._parent_index_ = parent_index
-        self._highest_parent_ = highest_parent
         
     def has_parent(self):
-        return self._direct_parent_ is not None and self._highest_parent_ is not None
+        return self._direct_parent_ is not None
+    
+    @property
+    def _highest_parent_(self):
+        if self._direct_parent_ is None:
+            return self
+        return self._direct_parent_._highest_parent_
     
 class Nameable(Parentable):
     _name = None
-    def __init__(self, name, direct_parent=None, highest_parent=None, parent_index=None):
-        super(Nameable,self).__init__(direct_parent, highest_parent, parent_index)
+    def __init__(self, name, direct_parent=None, parent_index=None):
+        super(Nameable,self).__init__(direct_parent, parent_index)
         self._name = name or self.__class__.__name__
 
     @property
diff --git a/GPy/core/parameterization/parameterized.py b/GPy/core/parameterization/parameterized.py
index 7abaf4a3..2f00fbe8 100644
--- a/GPy/core/parameterization/parameterized.py
+++ b/GPy/core/parameterization/parameterized.py
@@ -11,6 +11,7 @@ from param import ParamConcatenation, Param
 from parameter_core import Constrainable, Pickleable, Observable, adjust_name_for_printing
 from index_operations import ParameterIndexOperations,\
     index_empty
+from array_core import ParamList
 
 #===============================================================================
 # Printing:
@@ -69,7 +70,7 @@ class Parameterized(Constrainable, Pickleable, Observable):
         super(Parameterized, self).__init__(name=name)
         self._in_init_ = True
         self._constraints_ = None#ParameterIndexOperations()
-        self._parameters_ = []
+        self._parameters_ = ParamList()
         self.size = sum(p.size for p in self._parameters_)
         if not self._has_fixes():
             self._fixes_ = None
@@ -188,10 +189,10 @@ class Parameterized(Constrainable, Pickleable, Observable):
             note: if it is a string object it will not (!) be regexp-matched
                   automatically.
         """
-        self._parameters_ = [p for p in self._parameters_
+        self._parameters_ = ParamList([p for p in self._parameters_
                         if not (p._parent_index_ in names_params_indices
                                 or p.name in names_params_indices
-                                or p in names_params_indices)]
+                                or p in names_params_indices)])
         self._connect_parameters()
 
     def parameters_changed(self):
@@ -216,7 +217,6 @@ class Parameterized(Constrainable, Pickleable, Observable):
         for i,p in enumerate(self._parameters_):
             p._direct_parent_ = self
             p._parent_index_ = i
-            p._connect_highest_parent(self)
             not_unique = []
             sizes.append(p.size+sizes[-1])
             self._param_slices_.append(slice(sizes[-2], sizes[-1]))
@@ -231,14 +231,6 @@ class Parameterized(Constrainable, Pickleable, Observable):
                 self.__dict__[pname] = p
                 self._added_names_.add(pname)
         
-    def _connect_highest_parent(self, highest_parent):
-        self._highest_parent_ = highest_parent
-        if not hasattr(self, "_parameters_") or len(self._parameters_) < 1:
-            # no parameters for this class
-            return
-        for p in self._parameters_:
-            p._connect_highest_parent(highest_parent)
-        
     #===========================================================================
     # Pickling operations
     #===========================================================================
@@ -372,6 +364,8 @@ class Parameterized(Constrainable, Pickleable, Observable):
         that is an int array, containing the indexes for the flattened
         param inside this parameterized logic.
         """
+        if isinstance(param, ParamConcatenation):
+            return numpy.hstack((self._raveled_index_for(p) for p in param.params))
         return param._raveled_index() + self._offset_for(param)
 
     def _raveled_index(self):
diff --git a/GPy/core/parameterization/variational.py b/GPy/core/parameterization/variational.py
index e9868b82..b73e25da 100644
--- a/GPy/core/parameterization/variational.py
+++ b/GPy/core/parameterization/variational.py
@@ -3,10 +3,8 @@ Created on 6 Nov 2013
 
 @author: maxz
 '''
-import numpy as np
 from parameterized import Parameterized
 from param import Param
-from ...util.misc import param_to_array
 
 class Normal(Parameterized):
     '''
@@ -26,6 +24,7 @@ class Normal(Parameterized):
 
         See  GPy.plotting.matplot_dep.variational_plots
         """
+        import sys
         assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
-        from ..plotting.matplot_dep import variational_plots
+        from ...plotting.matplot_dep import variational_plots
         return variational_plots.plot(self,*args)
diff --git a/GPy/examples/dimensionality_reduction.py b/GPy/examples/dimensionality_reduction.py
index 46fc6797..e2ba4912 100644
--- a/GPy/examples/dimensionality_reduction.py
+++ b/GPy/examples/dimensionality_reduction.py
@@ -15,54 +15,54 @@ def bgplvm_test_model(seed=default_seed, optimize=False, verbose=1, plot=False):
     num_inducing = 5
     if plot:
         output_dim = 1
-        input_dim = 2
+        input_dim = 3
     else:
-        input_dim = 2
+        input_dim = 1
         output_dim = 25
 
     # generate GPLVM-like data
     X = _np.random.rand(num_inputs, input_dim)
     lengthscales = _np.random.rand(input_dim)
     k = (GPy.kern.rbf(input_dim, .5, lengthscales, ARD=True)
-         + GPy.kern.white(input_dim, 0.01))
+         #+ GPy.kern.white(input_dim, 0.01)
+         )
     K = k.K(X)
     Y = _np.random.multivariate_normal(_np.zeros(num_inputs), K, output_dim).T
-    lik = Gaussian(Y, normalize=True)
 
-    k = GPy.kern.rbf_inv(input_dim, .5, _np.ones(input_dim) * 2., ARD=True) + GPy.kern.bias(input_dim) + GPy.kern.white(input_dim)
-    # k = GPy.kern.linear(input_dim) + GPy.kern.bias(input_dim) + GPy.kern.white(input_dim, 0.00001)
+    # k = GPy.kern.rbf_inv(input_dim, .5, _np.ones(input_dim) * 2., ARD=True) + GPy.kern.bias(input_dim) + GPy.kern.white(input_dim)
+    k = GPy.kern.linear(input_dim)# + GPy.kern.bias(input_dim) + GPy.kern.white(input_dim, 0.00001)
     # k = GPy.kern.rbf(input_dim, ARD = False)  + GPy.kern.white(input_dim, 0.00001)
     # k = GPy.kern.rbf(input_dim, .5, _np.ones(input_dim) * 2., ARD=True) + GPy.kern.rbf(input_dim, .3, _np.ones(input_dim) * .2, ARD=True)
     # k = GPy.kern.rbf(input_dim, .5, 2., ARD=0) + GPy.kern.rbf(input_dim, .3, .2, ARD=0)
     # k = GPy.kern.rbf(input_dim, .5, _np.ones(input_dim) * 2., ARD=True) + GPy.kern.linear(input_dim, _np.ones(input_dim) * .2, ARD=True)
 
-    m = GPy.models.BayesianGPLVM(lik, input_dim, kernel=k, num_inducing=num_inducing)
+    m = GPy.models.BayesianGPLVM(Y, input_dim, kernel=k, num_inducing=num_inducing)
     #===========================================================================
     # randomly obstruct data with percentage p
     p = .8
     Y_obstruct = Y.copy()
     Y_obstruct[_np.random.uniform(size=(Y.shape)) < p] = _np.nan
     #===========================================================================
-    m2 = GPy.models.BayesianGPLVMWithMissingData(Y_obstruct, input_dim, kernel=k, num_inducing=num_inducing)
+    #m2 = GPy.models.BayesianGPLVMWithMissingData(Y_obstruct, input_dim, kernel=k, num_inducing=num_inducing)
     m.lengthscales = lengthscales
 
     if plot:
         import matplotlib.pyplot as pb
         m.plot()
         pb.title('PCA initialisation')
-        m2.plot()
-        pb.title('PCA initialisation')
+        #m2.plot()
+        #pb.title('PCA initialisation')
 
     if optimize:
         m.optimize('scg', messages=verbose)
-        m2.optimize('scg', messages=verbose)
+        #m2.optimize('scg', messages=verbose)
         if plot:
             m.plot()
             pb.title('After optimisation')
-            m2.plot()
-            pb.title('After optimisation')
+            #m2.plot()
+            #pb.title('After optimisation')
 
-    return m, m2
+    return m
 
 def gplvm_oil_100(optimize=True, verbose=1, plot=True):
     import GPy
@@ -264,16 +264,16 @@ def bgplvm_simulation(optimize=True, verbose=1,
     D1, D2, D3, N, num_inducing, Q = 15, 5, 8, 30, 3, 10
     _, _, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim)
     Y = Ylist[0]
-    k = kern.linear(Q, ARD=True) + kern.bias(Q, _np.exp(-2)) + kern.white(Q, _np.exp(-2)) # + kern.bias(Q)
+    k = kern.linear(Q, ARD=True)# + kern.white(Q, _np.exp(-2)) # + kern.bias(Q)
     m = BayesianGPLVM(Y, Q, init="PCA", num_inducing=num_inducing, kernel=k)
-    m['noise'] = Y.var() / 100.
+    m.Gaussian_noise = Y.var() / 100.
 
     if optimize:
         print "Optimizing model:"
         m.optimize('scg', messages=verbose, max_iters=max_iters,
                    gtol=.05)
     if plot:
-        m.plot_X_1d("BGPLVM Latent Space 1D")
+        m.q.plot("BGPLVM Latent Space 1D")
         m.kern.plot_ARD('BGPLVM Simulation ARD Parameters')
     return m
 
diff --git a/GPy/inference/latent_function_inference/fitc.py b/GPy/inference/latent_function_inference/fitc.py
index d5aa80bc..e4c01252 100644
--- a/GPy/inference/latent_function_inference/fitc.py
+++ b/GPy/inference/latent_function_inference/fitc.py
@@ -118,8 +118,8 @@ class FITC(SparseGP):
             _dKmm = .5*(V_n**2 + alpha_n + gamma_n**2 - 2.*gamma_k) * K_pp_K #Diag_dD_dKmm
             self._dpsi1_dtheta += self.kern.dK_dtheta(_dpsi1,self.X[i:i+1,:],self.Z)
             self._dKmm_dtheta += self.kern.dK_dtheta(_dKmm,self.Z)
-            self._dKmm_dX += self.kern.dK_dX(_dKmm ,self.Z)
-            self._dpsi1_dX += self.kern.dK_dX(_dpsi1.T,self.Z,self.X[i:i+1,:])
+            self._dKmm_dX += self.kern.gradients_X(_dKmm ,self.Z)
+            self._dpsi1_dX += self.kern.gradients_X(_dpsi1.T,self.Z,self.X[i:i+1,:])
 
         # the partial derivative vector for the likelihood
         if self.likelihood.num_params == 0:
@@ -170,8 +170,8 @@ class FITC(SparseGP):
         return dL_dtheta
 
     def dL_dZ(self):
-        dL_dZ = self.kern.dK_dX(self._dL_dpsi1.T,self.Z,self.X)
-        dL_dZ += self.kern.dK_dX(self._dL_dKmm,X=self.Z)
+        dL_dZ = self.kern.gradients_X(self._dL_dpsi1.T,self.Z,self.X)
+        dL_dZ += self.kern.gradients_X(self._dL_dKmm,X=self.Z)
         dL_dZ += self._dpsi1_dX
         dL_dZ += self._dKmm_dX
         return dL_dZ
diff --git a/GPy/inference/latent_function_inference/varDTC.py b/GPy/inference/latent_function_inference/varDTC.py
index 6d46ad14..07ae17c5 100644
--- a/GPy/inference/latent_function_inference/varDTC.py
+++ b/GPy/inference/latent_function_inference/varDTC.py
@@ -80,7 +80,7 @@ class VarDTC(object):
             # no backsubstitution because of bound explosion on tr(A) if not...
             LmInv, _ = dtrtri(Lm, lower=1)
             A = LmInv.T.dot(psi2_beta.dot(LmInv))
-            print A.sum()
+            #print A.sum()
         else:
             if het_noise:
                 tmp = psi1 * (np.sqrt(beta.reshape(num_data, 1)))
diff --git a/GPy/kern/kern.py b/GPy/kern/kern.py
index 6a8bc745..c97807fb 100644
--- a/GPy/kern/kern.py
+++ b/GPy/kern/kern.py
@@ -160,6 +160,9 @@ class kern(Parameterized):
 
         return newkern
 
+    def __call__(self, X, X2=None):  # calling the kernel object is shorthand for self.K(X, X2)
+        return self.K(X, X2)
+
     def __mul__(self, other):
         """ Here we overload the '*' operator. See self.prod for more information"""
         return self.prod(other)
@@ -550,7 +553,7 @@ class Kern_check_dK_dX(Kern_check_model):
         Kern_check_model.__init__(self,kernel=kernel,dL_dK=dL_dK, X=X, X2=X2)
 
     def _log_likelihood_gradients(self):
-        return self.kernel.dK_dX(self.dL_dK, self.X, self.X2).flatten()
+        return self.kernel.gradients_X(self.dL_dK, self.X, self.X2).flatten()
 
     def _get_param_names(self):
         return ['X_'  +str(i) + ','+str(j) for j in range(self.X.shape[1]) for i in range(self.X.shape[0])]
@@ -657,7 +660,7 @@ def kern_test(kern, X=None, X2=None, output_ind=None, verbose=False):
     except NotImplementedError:
         result=True
         if verbose:
-            print("dK_dX not implemented for " + kern.name)
+            print("gradients_X not implemented for " + kern.name)
     if result and verbose:
         print("Check passed.")
     if not result:
@@ -673,7 +676,7 @@ def kern_test(kern, X=None, X2=None, output_ind=None, verbose=False):
     except NotImplementedError:
         result=True
         if verbose:
-            print("dK_dX not implemented for " + kern.name)
+            print("gradients_X not implemented for " + kern.name)
     if result and verbose:
         print("Check passed.")
     if not result:
@@ -689,7 +692,7 @@ def kern_test(kern, X=None, X2=None, output_ind=None, verbose=False):
     except NotImplementedError:
         result=True
         if verbose:
-            print("dK_dX not implemented for " + kern.name)
+            print("gradients_X not implemented for " + kern.name)
     if result and verbose:
         print("Check passed.")
     if not result:
diff --git a/GPy/kern/parts/Brownian.py b/GPy/kern/parts/Brownian.py
index bdfa0df5..17f65cbd 100644
--- a/GPy/kern/parts/Brownian.py
+++ b/GPy/kern/parts/Brownian.py
@@ -51,7 +51,7 @@ class Brownian(Kernpart):
     def dKdiag_dtheta(self,dL_dKdiag,X,target):
         target += np.dot(X.flatten(), dL_dKdiag)
 
-    def dK_dX(self,dL_dK,X,X2,target):
+    def gradients_X(self,dL_dK,X,X2,target):
         raise NotImplementedError, "TODO"
         #target += self.variance
         #target -= self.variance*theta(X-X2.T)
diff --git a/GPy/kern/parts/Matern32.py b/GPy/kern/parts/Matern32.py
index 40da79f0..a95f0bcf 100644
--- a/GPy/kern/parts/Matern32.py
+++ b/GPy/kern/parts/Matern32.py
@@ -96,7 +96,7 @@ class Matern32(Kernpart):
         """derivative of the diagonal of the covariance matrix with respect to the parameters."""
         target[0] += np.sum(dL_dKdiag)
 
-    def dK_dX(self, dL_dK, X, X2, target):
+    def gradients_X(self, dL_dK, X, X2, target):
         """derivative of the covariance matrix with respect to X."""
         if X2 is None:
             dist = np.sqrt(np.sum(np.square((X[:, None, :] - X[None, :, :]) / self.lengthscale), -1))[:, :, None]
@@ -105,8 +105,8 @@ class Matern32(Kernpart):
         else:
             dist = np.sqrt(np.sum(np.square((X[:, None, :] - X2[None, :, :]) / self.lengthscale), -1))[:, :, None]
             ddist_dX = (X[:, None, :] - X2[None, :, :]) / self.lengthscale ** 2 / np.where(dist != 0., dist, np.inf)
-        dK_dX = -np.transpose(3 * self.variance * dist * np.exp(-np.sqrt(3) * dist) * ddist_dX, (1, 0, 2))
-        target += np.sum(dK_dX * dL_dK.T[:, :, None], 0)
+        gradients_X = -np.transpose(3 * self.variance * dist * np.exp(-np.sqrt(3) * dist) * ddist_dX, (1, 0, 2))
+        target += np.sum(gradients_X * dL_dK.T[:, :, None], 0)
 
     def dKdiag_dX(self, dL_dKdiag, X, target):
         pass
diff --git a/GPy/kern/parts/Matern52.py b/GPy/kern/parts/Matern52.py
index 4bf4a1a8..1f87fefb 100644
--- a/GPy/kern/parts/Matern52.py
+++ b/GPy/kern/parts/Matern52.py
@@ -96,7 +96,7 @@ class Matern52(Kernpart):
         """derivative of the diagonal of the covariance matrix with respect to the parameters."""
         target[0] += np.sum(dL_dKdiag)
 
-    def dK_dX(self,dL_dK,X,X2,target):
+    def gradients_X(self,dL_dK,X,X2,target):
         """derivative of the covariance matrix with respect to X."""
         if X2 is None:
             dist = np.sqrt(np.sum(np.square((X[:,None,:]-X[None,:,:])/self.lengthscale),-1))[:,:,None]
@@ -104,8 +104,8 @@ class Matern52(Kernpart):
         else:
             dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscale),-1))[:,:,None]
             ddist_dX = (X[:,None,:]-X2[None,:,:])/self.lengthscale**2/np.where(dist!=0.,dist,np.inf)
-        dK_dX = -  np.transpose(self.variance*5./3*dist*(1+np.sqrt(5)*dist)*np.exp(-np.sqrt(5)*dist)*ddist_dX,(1,0,2))
-        target += np.sum(dK_dX*dL_dK.T[:,:,None],0)
+        gradients_X = -  np.transpose(self.variance*5./3*dist*(1+np.sqrt(5)*dist)*np.exp(-np.sqrt(5)*dist)*ddist_dX,(1,0,2))
+        target += np.sum(gradients_X*dL_dK.T[:,:,None],0)
 
     def dKdiag_dX(self,dL_dKdiag,X,target):
         pass
diff --git a/GPy/kern/parts/eq_ode1.py b/GPy/kern/parts/eq_ode1.py
index 70e3c49d..85bb6379 100644
--- a/GPy/kern/parts/eq_ode1.py
+++ b/GPy/kern/parts/eq_ode1.py
@@ -193,7 +193,7 @@ class Eq_ode1(Kernpart):
     def dKdiag_dtheta(self,dL_dKdiag,index,target):
         pass
 
-    def dK_dX(self,dL_dK,X,X2,target):
+    def gradients_X(self,dL_dK,X,X2,target):
         pass
 
     def _extract_t_indices(self, X, X2=None, dL_dK=None):
diff --git a/GPy/kern/parts/exponential.py b/GPy/kern/parts/exponential.py
index f568b66b..7cd92aff 100644
--- a/GPy/kern/parts/exponential.py
+++ b/GPy/kern/parts/exponential.py
@@ -95,13 +95,13 @@ class Exponential(Kernpart):
         # NB: derivative of diagonal elements wrt lengthscale is 0
         target[0] += np.sum(dL_dKdiag)
 
-    def dK_dX(self, dL_dK, X, X2, target):
+    def gradients_X(self, dL_dK, X, X2, target):
         """derivative of the covariance matrix with respect to X."""
         if X2 is None: X2 = X
         dist = np.sqrt(np.sum(np.square((X[:, None, :] - X2[None, :, :]) / self.lengthscale), -1))[:, :, None]
         ddist_dX = (X[:, None, :] - X2[None, :, :]) / self.lengthscale ** 2 / np.where(dist != 0., dist, np.inf)
-        dK_dX = -np.transpose(self.variance * np.exp(-dist) * ddist_dX, (1, 0, 2))
-        target += np.sum(dK_dX * dL_dK.T[:, :, None], 0)
+        gradients_X = -np.transpose(self.variance * np.exp(-dist) * ddist_dX, (1, 0, 2))
+        target += np.sum(gradients_X * dL_dK.T[:, :, None], 0)
 
     def dKdiag_dX(self, dL_dKdiag, X, target):
         pass
diff --git a/GPy/kern/parts/fixed.py b/GPy/kern/parts/fixed.py
index 67baea91..dd5bdb85 100644
--- a/GPy/kern/parts/fixed.py
+++ b/GPy/kern/parts/fixed.py
@@ -34,7 +34,7 @@ class Fixed(Kernpart):
     def dK_dtheta(self, partial, X, X2, target):
         target += (partial * self.fixed_K).sum()
 
-    def dK_dX(self, partial, X, X2, target):
+    def gradients_X(self, partial, X, X2, target):
         pass
 
     def dKdiag_dX(self, partial, X, target):
diff --git a/GPy/kern/parts/gibbs.py b/GPy/kern/parts/gibbs.py
index f47144e1..717703ce 100644
--- a/GPy/kern/parts/gibbs.py
+++ b/GPy/kern/parts/gibbs.py
@@ -97,7 +97,7 @@ class Gibbs(Kernpart):
 
         target+= np.hstack([(dL_dK*self._K_dvar).sum(), gmapping])
 
-    def dK_dX(self, dL_dK, X, X2, target):
+    def gradients_X(self, dL_dK, X, X2, target):
         """Derivative of the covariance matrix with respect to X."""
         # First account for gradients arising from presence of X in exponent.
         self._K_computations(X, X2)
@@ -105,8 +105,8 @@ class Gibbs(Kernpart):
             _K_dist = 2*(X[:, None, :] - X[None, :, :])
         else:
             _K_dist = X[:, None, :] - X2[None, :, :] # don't cache this in _K_co
-        dK_dX = (-2.*self.variance)*np.transpose((self._K_dvar/self._w2)[:, :, None]*_K_dist, (1, 0, 2))
-        target += np.sum(dK_dX*dL_dK.T[:, :, None], 0)
+        gradients_X = (-2.*self.variance)*np.transpose((self._K_dvar/self._w2)[:, :, None]*_K_dist, (1, 0, 2))
+        target += np.sum(gradients_X*dL_dK.T[:, :, None], 0)
         # Now account for gradients arising from presence of X in lengthscale.
         self._dK_computations(dL_dK)
         if X2 is None:
diff --git a/GPy/kern/parts/hetero.py b/GPy/kern/parts/hetero.py
index d3939563..f48dddb4 100644
--- a/GPy/kern/parts/hetero.py
+++ b/GPy/kern/parts/hetero.py
@@ -90,7 +90,7 @@ class Hetero(Kernpart):
         """Gradient of diagonal of covariance with respect to parameters."""
         target += 2.*self.mapping.df_dtheta(dL_dKdiag[:, None]*self.mapping.f(X), X)
 
-    def dK_dX(self, dL_dK, X, X2, target):
+    def gradients_X(self, dL_dK, X, X2, target):
         """Derivative of the covariance matrix with respect to X."""
         if X2==None or X2 is X:
             dL_dKdiag = dL_dK.flat[::dL_dK.shape[0]+1]
diff --git a/GPy/kern/parts/hierarchical.py b/GPy/kern/parts/hierarchical.py
index c629f6b9..43dddd2d 100644
--- a/GPy/kern/parts/hierarchical.py
+++ b/GPy/kern/parts/hierarchical.py
@@ -55,14 +55,14 @@ class Hierarchical(Kernpart):
         [[[[k.dK_dtheta(dL_dK[s,s2],X[s],X2[s2],target[p_start:p_stop]) for s in slices_i] for s2 in slices_j] for slices_i,slices_j in zip(slices_, slices2_)] for k, p_start, p_stop, slices_, slices2_ in zip(self.parts, self.param_starts, self.param_stops, slices, slices2)]
 
 
-    def dK_dX(self,dL_dK,X,X2,target):
+    def gradients_X(self,dL_dK,X,X2,target):
         raise NotImplementedError
         #X,slices = X[:,:-1],index_to_slices(X[:,-1])
         #if X2 is None:
             #X2,slices2 = X,slices
         #else:
             #X2,slices2 = X2[:,:-1],index_to_slices(X2[:,-1])
-        #[[[self.k.dK_dX(dL_dK[s,s2],X[s],X2[s2],target[s,:-1]) for s in slices_i] for s2 in slices_j] for slices_i,slices_j in zip(slices,slices2)]
+        #[[[self.k.gradients_X(dL_dK[s,s2],X[s],X2[s2],target[s,:-1]) for s in slices_i] for s2 in slices_j] for slices_i,slices_j in zip(slices,slices2)]
 #
     def dKdiag_dX(self,dL_dKdiag,X,target):
         raise NotImplementedError
diff --git a/GPy/kern/parts/independent_outputs.py b/GPy/kern/parts/independent_outputs.py
index f88b0ff5..8c0959c5 100644
--- a/GPy/kern/parts/independent_outputs.py
+++ b/GPy/kern/parts/independent_outputs.py
@@ -79,13 +79,13 @@ class IndependentOutputs(Kernpart):
         [[[self.k.dK_dtheta(dL_dK[s,s2],X[s],X2[s2],target) for s in slices_i] for s2 in slices_j] for slices_i,slices_j in zip(slices,slices2)]
 
 
-    def dK_dX(self,dL_dK,X,X2,target):
+    def gradients_X(self,dL_dK,X,X2,target):
         X,slices = X[:,:-1],index_to_slices(X[:,-1])
         if X2 is None:
             X2,slices2 = X,slices
         else:
             X2,slices2 = X2[:,:-1],index_to_slices(X2[:,-1])
-        [[[self.k.dK_dX(dL_dK[s,s2],X[s],X2[s2],target[s,:-1]) for s in slices_i] for s2 in slices_j] for slices_i,slices_j in zip(slices,slices2)]
+        [[[self.k.gradients_X(dL_dK[s,s2],X[s],X2[s2],target[s,:-1]) for s in slices_i] for s2 in slices_j] for slices_i,slices_j in zip(slices,slices2)]
 
     def dKdiag_dX(self,dL_dKdiag,X,target):
         X,slices = X[:,:-1],index_to_slices(X[:,-1])
diff --git a/GPy/kern/parts/kernpart.py b/GPy/kern/parts/kernpart.py
index 8314e7a7..2583d525 100644
--- a/GPy/kern/parts/kernpart.py
+++ b/GPy/kern/parts/kernpart.py
@@ -20,7 +20,6 @@ class Kernpart(Parameterized):
         # the number of optimisable parameters
         # the name of the covariance function.
         # link to parameterized objects
-        self._parameters_ = []
         #self._X = None
     
     def connect_input(self, X):
@@ -106,7 +105,7 @@ class Kernpart(Parameterized):
         raise NotImplementedError
     def dpsi2_dmuS(self,dL_dpsi2,Z,mu,S,target_mu,target_S):
         raise NotImplementedError
-    def dK_dX(self, dL_dK, X, X2, target):
+    def gradients_X(self, dL_dK, X, X2, target):
         raise NotImplementedError
     def dKdiag_dX(self, dL_dK, X, target):
         raise NotImplementedError
diff --git a/GPy/kern/parts/linear.py b/GPy/kern/parts/linear.py
index 62f1ac36..6ead4549 100644
--- a/GPy/kern/parts/linear.py
+++ b/GPy/kern/parts/linear.py
@@ -57,6 +57,27 @@ class Linear(Kernpart):
     def on_input_change(self, X):
         self._K_computations(X, None)
 
+    def update_gradients_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z):
+        self._psi_computations(Z, mu, S)
+        # Builds self.variances.gradient in four steps: psi0 (assigned), then psi1, psi2 and Kmm (added on top).
+        # psi0: plain assignment, so any previously stored gradient is discarded here
+        tmp = dL_dpsi0[:, None] * self.mu2_S
+        if self.ARD: self.variances.gradient = tmp.sum(0)
+        else: self.variances.gradient = tmp.sum()
+
+        # psi1: gradient is handed to dK_dtheta as `target` (presumably accumulated in place -- TODO confirm for the non-ARD scalar case)
+        self.dK_dtheta(dL_dpsi1, mu, Z, self.variances.gradient)
+
+        # psi2: added to the contributions gathered so far; ARD keeps the last axis, otherwise everything is summed
+        tmp = dL_dpsi2[:, :, :, None] * (self.ZAinner[:, :, None, :] * (2 * Z)[None, None, :, :])
+        if self.ARD: self.variances.gradient += tmp.sum(0).sum(0).sum(0)
+        else: self.variances.gradient += tmp.sum()
+
+        # Kmm: rerun _K_computations on Z alone (X2=None), then pass the gradient to dK_dtheta as `target` again
+        self._K_computations(Z, None)
+        self.dK_dtheta(dL_dKmm, Z, None, self.variances.gradient)
+        
+        
 #     def _get_params(self):
 #         return self.variances
 # 
@@ -107,7 +128,7 @@ class Linear(Kernpart):
         else:
             target += tmp.sum()
 
-    def dK_dX(self, dL_dK, X, X2, target):
+    def gradients_X(self, dL_dK, X, X2, target):
         if X2 is None:
             target += 2*(((X[None,:, :] * self.variances)) * dL_dK[:, :, None]).sum(1)
         else:
@@ -150,7 +171,7 @@ class Linear(Kernpart):
         target_mu += (dL_dpsi1[:, :, None] * (Z * self.variances)).sum(1)
 
     def dpsi1_dZ(self, dL_dpsi1, Z, mu, S, target):
-        self.dK_dX(dL_dpsi1.T, Z, mu, target)
+        self.gradients_X(dL_dpsi1.T, Z, mu, target)
 
     def psi2(self, Z, mu, S, target):
         self._psi_computations(Z, mu, S)
@@ -182,7 +203,7 @@ class Linear(Kernpart):
     def dpsi2_dmuS_new(self, dL_dpsi2, Z, mu, S, target_mu, target_S):
         tmp = np.zeros((mu.shape[0], Z.shape[0]))
         self.K(mu,Z,tmp)
-        self.dK_dX(2.*np.sum(dL_dpsi2*tmp[:,None,:],2),mu,Z,target_mu)
+        self.gradients_X(2.*np.sum(dL_dpsi2*tmp[:,None,:],2),mu,Z,target_mu)
 
         Zs = Z*self.variances
         Zs_sq = Zs[:,None,:]*Zs[None,:,:]
diff --git a/GPy/kern/parts/mlp.py b/GPy/kern/parts/mlp.py
index e68aaa72..2ba25802 100644
--- a/GPy/kern/parts/mlp.py
+++ b/GPy/kern/parts/mlp.py
@@ -107,7 +107,7 @@ class MLP(Kernpart):
             
         target[0] += np.sum(self._K_dvar*dL_dK)
 
-    def dK_dX(self, dL_dK, X, X2, target):
+    def gradients_X(self, dL_dK, X, X2, target):
         """Derivative of the covariance matrix with respect to X"""
         self._K_computations(X, X2)
         arg = self._K_asin_arg
diff --git a/GPy/kern/parts/poly.py b/GPy/kern/parts/poly.py
index 98c520f0..80abab60 100644
--- a/GPy/kern/parts/poly.py
+++ b/GPy/kern/parts/poly.py
@@ -99,7 +99,7 @@ class POLY(Kernpart):
         target[2] += base_cov_grad.sum()
 
 
-    def dK_dX(self, dL_dK, X, X2, target):
+    def gradients_X(self, dL_dK, X, X2, target):
         """Derivative of the covariance matrix with respect to X"""
         self._K_computations(X, X2)
         arg = self._K_poly_arg
diff --git a/GPy/kern/parts/prod.py b/GPy/kern/parts/prod.py
index 62eed2aa..07286a82 100644
--- a/GPy/kern/parts/prod.py
+++ b/GPy/kern/parts/prod.py
@@ -81,21 +81,21 @@ class Prod(Kernpart):
         self.k1.dKdiag_dtheta(dL_dKdiag*K2,X[:,self.slice1],target[:self.k1.num_params])
         self.k2.dKdiag_dtheta(dL_dKdiag*K1,X[:,self.slice2],target[self.k1.num_params:])
 
-    def dK_dX(self,dL_dK,X,X2,target):
+    def gradients_X(self,dL_dK,X,X2,target):
         """derivative of the covariance matrix with respect to X."""
         self._K_computations(X,X2)
         if X2 is None:
             if not isinstance(self.k1,Coregionalize) and not isinstance(self.k2,Coregionalize):
-                self.k1.dK_dX(dL_dK*self._K2, X[:,self.slice1], None, target[:,self.slice1])
-                self.k2.dK_dX(dL_dK*self._K1, X[:,self.slice2], None, target[:,self.slice2])
+                self.k1.gradients_X(dL_dK*self._K2, X[:,self.slice1], None, target[:,self.slice1])
+                self.k2.gradients_X(dL_dK*self._K1, X[:,self.slice2], None, target[:,self.slice2])
             else:#if isinstance(self.k1,Coregionalize) or isinstance(self.k2,Coregionalize):
-                #NOTE The indices column in the inputs makes the ki.dK_dX fail when passing None instead of X[:,self.slicei]
+                #NOTE The indices column in the inputs makes the ki.gradients_X fail when passing None instead of X[:,self.slicei]
                 X2 = X
-                self.k1.dK_dX(2.*dL_dK*self._K2, X[:,self.slice1], X2[:,self.slice1], target[:,self.slice1])
-                self.k2.dK_dX(2.*dL_dK*self._K1, X[:,self.slice2], X2[:,self.slice2], target[:,self.slice2])
+                self.k1.gradients_X(2.*dL_dK*self._K2, X[:,self.slice1], X2[:,self.slice1], target[:,self.slice1])
+                self.k2.gradients_X(2.*dL_dK*self._K1, X[:,self.slice2], X2[:,self.slice2], target[:,self.slice2])
         else:
-            self.k1.dK_dX(dL_dK*self._K2, X[:,self.slice1], X2[:,self.slice1], target[:,self.slice1])
-            self.k2.dK_dX(dL_dK*self._K1, X[:,self.slice2], X2[:,self.slice2], target[:,self.slice2])
+            self.k1.gradients_X(dL_dK*self._K2, X[:,self.slice1], X2[:,self.slice1], target[:,self.slice1])
+            self.k2.gradients_X(dL_dK*self._K1, X[:,self.slice2], X2[:,self.slice2], target[:,self.slice2])
 
     def dKdiag_dX(self, dL_dKdiag, X, target):
         K1 = np.zeros(X.shape[0])
@@ -103,8 +103,8 @@ class Prod(Kernpart):
         self.k1.Kdiag(X[:,self.slice1],K1)
         self.k2.Kdiag(X[:,self.slice2],K2)
 
-        self.k1.dK_dX(dL_dKdiag*K2, X[:,self.slice1], target[:,self.slice1])
-        self.k2.dK_dX(dL_dKdiag*K1, X[:,self.slice2], target[:,self.slice2])
+        self.k1.dKdiag_dX(dL_dKdiag*K2, X[:,self.slice1], target[:,self.slice1])  # diagonal case: 3-arg dKdiag_dX; gradients_X would take target as X2
+        self.k2.dKdiag_dX(dL_dKdiag*K1, X[:,self.slice2], target[:,self.slice2])  # product rule: each factor's diagonal gradient weighted by the other's Kdiag
 
     def _K_computations(self,X,X2):
         if not (np.array_equal(X,self._X) and np.array_equal(X2,self._X2) and np.array_equal(self._params , self._get_params())):
diff --git a/GPy/kern/parts/prod_orthogonal.py b/GPy/kern/parts/prod_orthogonal.py
index 237c9557..f8d1c3b2 100644
--- a/GPy/kern/parts/prod_orthogonal.py
+++ b/GPy/kern/parts/prod_orthogonal.py
@@ -67,11 +67,11 @@ class prod_orthogonal(Kernpart):
         self.k1.dKdiag_dtheta(dL_dKdiag*K2,X[:,:self.k1.input_dim],target[:self.k1.num_params])
         self.k2.dKdiag_dtheta(dL_dKdiag*K1,X[:,self.k1.input_dim:],target[self.k1.num_params:])
 
-    def dK_dX(self,dL_dK,X,X2,target):
+    def gradients_X(self,dL_dK,X,X2,target):
         """derivative of the covariance matrix with respect to X."""
         self._K_computations(X,X2)
-        self.k1.dK_dX(dL_dK*self._K2, X[:,:self.k1.input_dim], X2[:,:self.k1.input_dim], target)
-        self.k2.dK_dX(dL_dK*self._K1, X[:,self.k1.input_dim:], X2[:,self.k1.input_dim:], target)
+        self.k1.gradients_X(dL_dK*self._K2, X[:,:self.k1.input_dim], X2[:,:self.k1.input_dim], target)
+        self.k2.gradients_X(dL_dK*self._K1, X[:,self.k1.input_dim:], X2[:,self.k1.input_dim:], target)
 
     def dKdiag_dX(self, dL_dKdiag, X, target):
         K1 = np.zeros(X.shape[0])
@@ -79,8 +79,8 @@ class prod_orthogonal(Kernpart):
         self.k1.Kdiag(X[:,0:self.k1.input_dim],K1)
         self.k2.Kdiag(X[:,self.k1.input_dim:],K2)
 
-        self.k1.dK_dX(dL_dKdiag*K2, X[:,:self.k1.input_dim], target)
-        self.k2.dK_dX(dL_dKdiag*K1, X[:,self.k1.input_dim:], target)
+        self.k1.dKdiag_dX(dL_dKdiag*K2, X[:,:self.k1.input_dim], target)  # diagonal case: 3-arg dKdiag_dX; gradients_X would take target as X2
+        self.k2.dKdiag_dX(dL_dKdiag*K1, X[:,self.k1.input_dim:], target)  # product rule: each factor's diagonal gradient weighted by the other's Kdiag
 
     def _K_computations(self,X,X2):
         if not (np.array_equal(X,self._X) and np.array_equal(X2,self._X2) and np.array_equal(self._params , self._get_params())):
diff --git a/GPy/kern/parts/rational_quadratic.py b/GPy/kern/parts/rational_quadratic.py
index a75a5b11..bd623320 100644
--- a/GPy/kern/parts/rational_quadratic.py
+++ b/GPy/kern/parts/rational_quadratic.py
@@ -68,7 +68,7 @@ class RationalQuadratic(Kernpart):
         target[0] += np.sum(dL_dKdiag)
         # here self.lengthscale and self.power have no influence on Kdiag so target[1:] are unchanged
 
-    def dK_dX(self,dL_dK,X,X2,target):
+    def gradients_X(self,dL_dK,X,X2,target):
         """derivative of the covariance matrix with respect to X."""
         if X2 is None:
             dist2 = np.square((X-X.T)/self.lengthscale)
diff --git a/GPy/kern/parts/rbf.py b/GPy/kern/parts/rbf.py
index 4247eb9c..e7bc8624 100644
--- a/GPy/kern/parts/rbf.py
+++ b/GPy/kern/parts/rbf.py
@@ -51,8 +51,11 @@ class RBF(Kernpart):
                 lengthscale = np.ones(self.input_dim)
 
         self.variance = Param('variance', variance)
+        
         self.lengthscale = Param('lengthscale', lengthscale)
         self.lengthscale.add_observer(self, self.update_lengthscale)
+        self.update_lengthscale(self.lengthscale)
+        
         self.add_parameters(self.variance, self.lengthscale)
         self.parameters_changed() # initializes cache
 
@@ -114,7 +117,7 @@ class RBF(Kernpart):
         self._K_computations(X, Z)
         self.variance.gradient += np.sum(dL_dKnm * self._K_dvar)
         if self.ARD:
-            self.lengthscales.gradient = self._dL_dlengthscales_via_K(dL_dKnm, X, Z)
+            self.lengthscale.gradient = self._dL_dlengthscales_via_K(dL_dKnm, X, Z)
 
         else:
             self.lengthscale.gradient = (self.variance / self.lengthscale) * np.sum(self._K_dvar * self._K_dist2 * dL_dKnm)
@@ -123,7 +126,7 @@ class RBF(Kernpart):
         self._K_computations(Z, None)
         self.variance.gradient += np.sum(dL_dKmm * self._K_dvar)
         if self.ARD:
-            self.lengthscales.gradient += self._dL_dlengthscales_via_K(dL_dKmm, Z, None)
+            self.lengthscale.gradient += self._dL_dlengthscales_via_K(dL_dKmm, Z, None)
         else:
             self.lengthscale.gradient += (self.variance / self.lengthscale) * np.sum(self._K_dvar * self._K_dist2 * dL_dKmm)
 
@@ -157,7 +160,7 @@ class RBF(Kernpart):
         self._K_computations(Z, None)
         self.variance.gradient += np.sum(dL_dKmm * self._K_dvar)
         if self.ARD:
-            self.lengthscales.gradient += self._dL_dlengthscales_via_K(dL_dKmm, Z, None)
+            self.lengthscale.gradient += self._dL_dlengthscales_via_K(dL_dKmm, Z, None)
         else:
             self.lengthscale.gradient += (self.variance / self.lengthscale) * np.sum(self._K_dvar * self._K_dist2 * dL_dKmm)
 
@@ -168,8 +171,8 @@ class RBF(Kernpart):
             _K_dist = 2*(X[:, None, :] - X[None, :, :])
         else:
             _K_dist = X[:, None, :] - X2[None, :, :] # don't cache this in _K_computations because it is high memory. If this function is being called, chances are we're not in the high memory arena.
-        dK_dX = (-self.variance / self.lengthscale2) * np.transpose(self._K_dvar[:, :, np.newaxis] * _K_dist, (1, 0, 2))
-        target += np.sum(dK_dX * dL_dK.T[:, :, None], 0)
+        gradients_X = (-self.variance / self.lengthscale2) * np.transpose(self._K_dvar[:, :, np.newaxis] * _K_dist, (1, 0, 2))
+        target += np.sum(gradients_X * dL_dK.T[:, :, None], 0)
 
     def dKdiag_dX(self, dL_dKdiag, X, target):
         pass
diff --git a/GPy/kern/parts/rbf_inv.py b/GPy/kern/parts/rbf_inv.py
index c4461267..0c0168a6 100644
--- a/GPy/kern/parts/rbf_inv.py
+++ b/GPy/kern/parts/rbf_inv.py
@@ -142,14 +142,14 @@ class RBFInv(RBF):
         else:
             target[1] += (self.variance / self.lengthscale) * np.sum(self._K_dvar * self._K_dist2 * dL_dK) * (-self.lengthscale2)
 
-    def dK_dX(self, dL_dK, X, X2, target):
+    def gradients_X(self, dL_dK, X, X2, target):
         self._K_computations(X, X2)
         if X2 is None:            
             _K_dist = 2*(X[:, None, :] - X[None, :, :])
         else:
             _K_dist = X[:, None, :] - X2[None, :, :] # don't cache this in _K_computations because it is high memory. If this function is being called, chances are we're not in the high memory arena.
-        dK_dX = (-self.variance * self.inv_lengthscale2) * np.transpose(self._K_dvar[:, :, np.newaxis] * _K_dist, (1, 0, 2))
-        target += np.sum(dK_dX * dL_dK.T[:, :, None], 0)
+        gradients_X = (-self.variance * self.inv_lengthscale2) * np.transpose(self._K_dvar[:, :, np.newaxis] * _K_dist, (1, 0, 2))
+        target += np.sum(gradients_X * dL_dK.T[:, :, None], 0)
 
     def dKdiag_dX(self, dL_dKdiag, X, target):
         pass
diff --git a/GPy/kern/parts/rbfcos.py b/GPy/kern/parts/rbfcos.py
index b6411e0a..fc4a376a 100644
--- a/GPy/kern/parts/rbfcos.py
+++ b/GPy/kern/parts/rbfcos.py
@@ -88,7 +88,7 @@ class RBFCos(Kernpart):
     def dKdiag_dtheta(self,dL_dKdiag,X,target):
         target[0] += np.sum(dL_dKdiag)
 
-    def dK_dX(self,dL_dK,X,X2,target):
+    def gradients_X(self,dL_dK,X,X2,target):
         #TODO!!!
         raise NotImplementedError
 
diff --git a/GPy/kern/parts/ss_rbf.py b/GPy/kern/parts/ss_rbf.py
index a234d428..cab8fd11 100644
--- a/GPy/kern/parts/ss_rbf.py
+++ b/GPy/kern/parts/ss_rbf.py
@@ -144,8 +144,8 @@ class SS_RBF(Kernpart):
             _K_dist = 2*(X[:, None, :] - X[None, :, :])
         else:
             _K_dist = X[:, None, :] - X2[None, :, :] # don't cache this in _K_computations because it is high memory. If this function is being called, chances are we're not in the high memory arena.
-        dK_dX = (-self.variance / self.lengthscale2) * np.transpose(self._K_dvar[:, :, np.newaxis] * _K_dist, (1, 0, 2))
-        target += np.sum(dK_dX * dL_dK.T[:, :, None], 0)
+        gradients_X = (-self.variance / self.lengthscale2) * np.transpose(self._K_dvar[:, :, np.newaxis] * _K_dist, (1, 0, 2))
+        target += np.sum(gradients_X * dL_dK.T[:, :, None], 0)
 
     def dKdiag_dX(self, dL_dKdiag, X, target):
         pass
diff --git a/GPy/kern/parts/symmetric.py b/GPy/kern/parts/symmetric.py
index d47fbd9d..ef9a8dd5 100644
--- a/GPy/kern/parts/symmetric.py
+++ b/GPy/kern/parts/symmetric.py
@@ -54,7 +54,7 @@ class Symmetric(Kernpart):
         self.k.dK_dtheta(dL_dK,AX,AX2,target)
 
 
-    def dK_dX(self,dL_dK,X,X2,target):
+    def gradients_X(self,dL_dK,X,X2,target):
         """derivative of the covariance matrix with respect to X."""
         AX = np.dot(X,self.transform)
         if X2 is None:
@@ -62,10 +62,10 @@ class Symmetric(Kernpart):
             ZX2 = AX
         else:
             AX2 = np.dot(X2, self.transform)
-        self.k.dK_dX(dL_dK, X, X2, target)
-        self.k.dK_dX(dL_dK, AX, X2, target)
-        self.k.dK_dX(dL_dK, X, AX2, target)
-        self.k.dK_dX(dL_dK, AX ,AX2, target)
+        self.k.gradients_X(dL_dK, X, X2, target)
+        self.k.gradients_X(dL_dK, AX, X2, target)
+        self.k.gradients_X(dL_dK, X, AX2, target)
+        self.k.gradients_X(dL_dK, AX ,AX2, target)
 
     def Kdiag(self,X,target):
         """Compute the diagonal of the covariance matrix associated to X."""
diff --git a/GPy/kern/parts/sympykern.py b/GPy/kern/parts/sympykern.py
index ea603eab..46f975d2 100644
--- a/GPy/kern/parts/sympykern.py
+++ b/GPy/kern/parts/sympykern.py
@@ -357,7 +357,7 @@ class spkern(Kernpart):
     def dKdiag_dtheta(self,partial,X,target):
         self._weave_inline(self._dKdiag_dtheta_code, X, target, Z=None, partial=partial)
                
-    def dK_dX(self,partial,X,Z,target):
+    def gradients_X(self,partial,X,Z,target):
         if Z is None:
             self._weave_inline(self._dK_dX_code_X, X, target, Z, partial)
         else:
diff --git a/GPy/mappings/kernel.py b/GPy/mappings/kernel.py
index ccd1462a..94ce203f 100644
--- a/GPy/mappings/kernel.py
+++ b/GPy/mappings/kernel.py
@@ -57,4 +57,4 @@ class Kernel(Mapping):
         return np.hstack((self._df_dA.flatten(), self._df_dbias))
 
     def df_dX(self, dL_df, X):
-        return self.kern.dK_dX((dL_df[:, None, :]*self.A[None, :, :]).sum(2), X, self.X)
+        return self.kern.gradients_X((dL_df[:, None, :]*self.A[None, :, :]).sum(2), X, self.X)
diff --git a/GPy/models/bayesian_gplvm.py b/GPy/models/bayesian_gplvm.py
index 7a22b5ea..78851147 100644
--- a/GPy/models/bayesian_gplvm.py
+++ b/GPy/models/bayesian_gplvm.py
@@ -2,7 +2,6 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
 import numpy as np
-import itertools
 from gplvm import GPLVM
 from .. import kern
 from ..core import SparseGP
@@ -23,15 +22,10 @@ class BayesianGPLVM(SparseGP, GPLVM):
     :type init: 'PCA'|'random'
 
     """
-    def __init__(self, likelihood_or_Y, input_dim, X=None, X_variance=None, init='PCA', num_inducing=10,
-                 Z=None, kernel=None, name='bayesian gplvm', **kwargs):
-        if type(likelihood_or_Y) is np.ndarray:
-            likelihood = Gaussian(likelihood_or_Y)
-        else:
-            likelihood = likelihood_or_Y
-
+    def __init__(self, Y, input_dim, X=None, X_variance=None, init='PCA', num_inducing=10,
+                 Z=None, kernel=None, inference_method=None, likelihood=Gaussian(), name='bayesian gplvm', **kwargs):
         if X == None:
-            X = self.initialise_latent(init, input_dim, likelihood.Y)
+            X = self.initialise_latent(init, input_dim, Y)
         self.init = init
 
         if X_variance is None:
@@ -44,9 +38,9 @@ class BayesianGPLVM(SparseGP, GPLVM):
         if kernel is None:
             kernel = kern.rbf(input_dim) # + kern.white(input_dim)
 
-        SparseGP.__init__(self, X=X, likelihood=likelihood, kernel=kernel, Z=Z, X_variance=X_variance, name=name, **kwargs)
-        self.q = Normal(self.X, self.X_variance)
-        self.add_parameter(self.q, gradient=self._dbound_dmuS, index=0)
+        self.q = Normal(X, X_variance)
+        SparseGP.__init__(self, X, Y, Z, kernel, likelihood, inference_method, X_variance, name, **kwargs)
+        self.add_parameter(self.q, index=0)
         self.ensure_default_constraints()
 
     def _getstate(self):
@@ -94,9 +88,9 @@ class BayesianGPLVM(SparseGP, GPLVM):
         return dKL_dmu, dKL_dS
 
     def dL_dmuS(self):
-        dL_dmu_psi0, dL_dS_psi0 = self.kern.dpsi0_dmuS(self.dL_dpsi0, self.Z, self.X, self.X_variance)
-        dL_dmu_psi1, dL_dS_psi1 = self.kern.dpsi1_dmuS(self.dL_dpsi1, self.Z, self.X, self.X_variance)
-        dL_dmu_psi2, dL_dS_psi2 = self.kern.dpsi2_dmuS(self.dL_dpsi2, self.Z, self.X, self.X_variance)
+        dL_dmu_psi0, dL_dS_psi0 = self.kern.dpsi0_dmuS(self.grad_dict['dL_dpsi0'], self.Z, self.X, self.X_variance)
+        dL_dmu_psi1, dL_dS_psi1 = self.kern.dpsi1_dmuS(self.grad_dict['dL_dpsi1'], self.Z, self.X, self.X_variance)
+        dL_dmu_psi2, dL_dS_psi2 = self.kern.dpsi2_dmuS(self.grad_dict['dL_dpsi2'], self.Z, self.X, self.X_variance)
         dL_dmu = dL_dmu_psi0 + dL_dmu_psi1 + dL_dmu_psi2
         dL_dS = dL_dS_psi0 + dL_dS_psi1 + dL_dS_psi2
 
@@ -107,10 +101,25 @@ class BayesianGPLVM(SparseGP, GPLVM):
         var_S = np.sum(self.X_variance - np.log(self.X_variance))
         return 0.5 * (var_mean + var_S) - 0.5 * self.input_dim * self.num_data
 
-    def log_likelihood(self):
-        ll = SparseGP.log_likelihood(self)
-        kl = self.KL_divergence()
-        return ll - kl
+    def parameters_changed(self):
+        self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.X_variance, self.Z, self.likelihood, self.Y)
+        self._log_marginal_likelihood -= self.KL_divergence()
+
+        #The derivative of the bound wrt the inducing inputs Z
+        self.Z.gradient = self.kern.gradients_X(self.grad_dict['dL_dKmm'], self.Z)
+        self.Z.gradient += self.kern.dpsi1_dZ(self.grad_dict['dL_dpsi1'], self.Z, self.X, self.X_variance)
+        self.Z.gradient += self.kern.dpsi2_dZ(self.grad_dict['dL_dpsi2'], self.Z, self.X, self.X_variance)
+        
+        dL_dmu, dL_dS = self.dL_dmuS()
+        dKL_dmu, dKL_dS = self.dKL_dmuS()
+        self.q.means.gradient = dL_dmu - dKL_dmu
+        self.q.variances.gradient = dL_dS - dKL_dS
+    
+
+#     def log_likelihood(self):        
+#         ll = SparseGP.log_likelihood(self)
+#         kl = self.KL_divergence()
+#         return ll - kl
 
     def _dbound_dmuS(self):
         dKL_dmu, dKL_dS = self.dKL_dmuS()
@@ -181,18 +190,18 @@ class BayesianGPLVM(SparseGP, GPLVM):
         """
         dmu_dX = np.zeros_like(Xnew)
         for i in range(self.Z.shape[0]):
-            dmu_dX += self.kern.dK_dX(self.Cpsi1Vf[i:i + 1, :], Xnew, self.Z[i:i + 1, :])
+            dmu_dX += self.kern.gradients_X(self.Cpsi1Vf[i:i + 1, :], Xnew, self.Z[i:i + 1, :])
         return dmu_dX
 
     def dmu_dXnew(self, Xnew):
         """
         Individual gradient of prediction at Xnew w.r.t. each sample in Xnew
         """
-        dK_dX = np.zeros((Xnew.shape[0], self.num_inducing))
+        gradients_X = np.zeros((Xnew.shape[0], self.num_inducing))
         ones = np.ones((1, 1))
         for i in range(self.Z.shape[0]):
-            dK_dX[:, i] = self.kern.dK_dX(ones, Xnew, self.Z[i:i + 1, :]).sum(-1)
-        return np.dot(dK_dX, self.Cpsi1Vf)
+            gradients_X[:, i] = self.kern.gradients_X(ones, Xnew, self.Z[i:i + 1, :]).sum(-1)
+        return np.dot(gradients_X, self.Cpsi1Vf)
 
     def plot_steepest_gradient_map(self, *args, ** kwargs):
         """
diff --git a/GPy/models/bcgplvm.py b/GPy/models/bcgplvm.py
index 9f5866c3..f21a01f4 100644
--- a/GPy/models/bcgplvm.py
+++ b/GPy/models/bcgplvm.py
@@ -44,7 +44,7 @@ class BCGPLVM(GPLVM):
         GP._set_params(self, x[self.mapping.num_params:])
 
     def _log_likelihood_gradients(self):
-        dL_df = self.kern.dK_dX(self.dL_dK, self.X)
+        dL_df = self.kern.gradients_X(self.dL_dK, self.X)
         dL_dtheta = self.mapping.df_dtheta(dL_df, self.likelihood.Y)
         return np.hstack((dL_dtheta.flatten(), GP._log_likelihood_gradients(self)))
 
diff --git a/GPy/models/gplvm.py b/GPy/models/gplvm.py
index fc328ff2..06481b81 100644
--- a/GPy/models/gplvm.py
+++ b/GPy/models/gplvm.py
@@ -60,7 +60,7 @@ class GPLVM(GP):
     def jacobian(self,X):
         target = np.zeros((X.shape[0],X.shape[1],self.output_dim))
         for i in range(self.output_dim):
-            target[:,:,i]=self.kern.dK_dX(np.dot(self.Ki,self.likelihood.Y[:,i])[None, :],X,self.X)
+            target[:,:,i]=self.kern.gradients_X(np.dot(self.Ki,self.likelihood.Y[:,i])[None, :],X,self.X)
         return target
 
     def magnification(self,X):
diff --git a/GPy/models/sparse_gplvm.py b/GPy/models/sparse_gplvm.py
index e3024264..5c10d0b8 100644
--- a/GPy/models/sparse_gplvm.py
+++ b/GPy/models/sparse_gplvm.py
@@ -52,7 +52,7 @@ class SparseGPLVM(SparseGPRegression, GPLVM):
 
     def dL_dX(self):
         dL_dX = self.kern.dKdiag_dX(self.dL_dpsi0, self.X)
-        dL_dX += self.kern.dK_dX(self.dL_dpsi1, self.X, self.Z)
+        dL_dX += self.kern.gradients_X(self.dL_dpsi1, self.X, self.Z)
 
         return dL_dX
 
diff --git a/GPy/plotting/matplot_dep/variational_plots.py b/GPy/plotting/matplot_dep/variational_plots.py
index 9f791dd1..7c89a088 100644
--- a/GPy/plotting/matplot_dep/variational_plots.py
+++ b/GPy/plotting/matplot_dep/variational_plots.py
@@ -1,4 +1,5 @@
-import pylab as pb
+import pylab as pb, numpy as np
+from ...util.misc import param_to_array
 
 def plot(parameterized, fignum=None, ax=None, colors=None):
     """
diff --git a/GPy/util/caching.py b/GPy/util/caching.py
index d8893021..374f9600 100644
--- a/GPy/util/caching.py
+++ b/GPy/util/caching.py
@@ -1,10 +1,9 @@
-import numpy as np
-from ..core.parameterization.array_core import ObservableArray
+from ..core.parameterization.array_core import ObservableArray, ParamList
 class Cacher(object):
     def __init__(self, operation, limit=5):
         self.limit = int(limit)
         self.operation=operation
-        self.cached_inputs = []
+        self.cached_inputs = ParamList([])
         self.cached_outputs = []
         self.inputs_changed = []
 

From c6d466e72d9c4898dc079c8cf7329c65f8c1d4eb Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Mon, 10 Feb 2014 15:39:54 +0000
Subject: [PATCH 30/43] Moved fix parameter to constrainable

---
 GPy/core/parameterization/param.py          | 17 -----------------
 GPy/core/parameterization/parameter_core.py | 21 +++++++++++++++++++++
 2 files changed, 21 insertions(+), 17 deletions(-)

diff --git a/GPy/core/parameterization/param.py b/GPy/core/parameterization/param.py
index 62108ab2..848249f4 100644
--- a/GPy/core/parameterization/param.py
+++ b/GPy/core/parameterization/param.py
@@ -158,23 +158,6 @@ class Param(ObservableArray, Constrainable):
     def _collect_gradient(self, target):
         target[:] = self.gradient.flat
     #===========================================================================
-    # Fixing Parameters:
-    #===========================================================================
-    def constrain_fixed(self, warning=True):
-        """
-        Constrain this paramter to be fixed to the current value it carries.
-
-        :param warning: print a warning for overwriting constraints.
-        """
-        self._highest_parent_._fix(self,warning)
-    fix = constrain_fixed
-    def unconstrain_fixed(self):
-        """
-        This parameter will no longer be fixed.
-        """
-        self._highest_parent_._unfix(self)
-    unfix = unconstrain_fixed
-    #===========================================================================
     # Tying operations -> bugged, TODO
     #===========================================================================
     def tie_to(self, param):
diff --git a/GPy/core/parameterization/parameter_core.py b/GPy/core/parameterization/parameter_core.py
index 81fb16d9..d6733b44 100644
--- a/GPy/core/parameterization/parameter_core.py
+++ b/GPy/core/parameterization/parameter_core.py
@@ -76,6 +76,27 @@ class Nameable(Parentable):
 class Constrainable(Nameable):
     def __init__(self, name):
         super(Constrainable,self).__init__(name)
+    #===========================================================================
+    # Fixing Parameters:
+    #===========================================================================
+    def constrain_fixed(self, value=None, warning=True):
+        """
+        Constrain this parameter to be fixed to `value` if given, otherwise to the current value it carries.
+
+        :param warning: print a warning for overwriting constraints.
+        """
+        if value is not None:
+            self[:] = value
+        self._highest_parent_._fix(self,warning)
+    fix = constrain_fixed
+    def unconstrain_fixed(self):
+        """
+        This parameter will no longer be fixed.
+        """
+        self._highest_parent_._unfix(self)
+    unfix = unconstrain_fixed
+
+
     #===========================================================================
     # Constrain operations -> done
     #===========================================================================

From 6cbf810856b9a26d0922a962277bee8f0c0cd93d Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Mon, 10 Feb 2014 15:40:06 +0000
Subject: [PATCH 31/43] Laplace now appears to be grad checking again

---
 GPy/examples/non_gaussian.py                  | 34 +++++++++++--------
 GPy/examples/regression.py                    |  7 ++--
 .../latent_function_inference/laplace.py      | 13 +------
 GPy/likelihoods/poisson.py                    |  7 ++--
 GPy/likelihoods/student_t.py                  |  4 +--
 GPy/testing/likelihood_tests.py               | 15 ++++++--
 6 files changed, 43 insertions(+), 37 deletions(-)

diff --git a/GPy/examples/non_gaussian.py b/GPy/examples/non_gaussian.py
index bda80137..23122691 100644
--- a/GPy/examples/non_gaussian.py
+++ b/GPy/examples/non_gaussian.py
@@ -37,39 +37,43 @@ def student_t_approx(optimize=True, plot=True):
 
     # Kernel object
     kernel1 = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
-    kernel2 = kernel1.copy()
-    kernel3 = kernel1.copy()
-    kernel4 = kernel1.copy()
+    kernel2 = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
+    kernel3 = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
+    kernel4 = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
 
     #Gaussian GP model on clean data
     m1 = GPy.models.GPRegression(X, Y.copy(), kernel=kernel1)
     # optimize
     m1.ensure_default_constraints()
-    m1.constrain_fixed('white', 1e-5)
+    m1['white'] = 1e-5
+    m1['white'].constrain_fixed('white')
     m1.randomize()
 
     #Gaussian GP model on corrupt data
     m2 = GPy.models.GPRegression(X, Yc.copy(), kernel=kernel2)
     m2.ensure_default_constraints()
-    m2.constrain_fixed('white', 1e-5)
+    m1['white'] = 1e-5
+    m1['white'].constrain_fixed('white')
     m2.randomize()
 
     #Student t GP model on clean data
-    t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=edited_real_sd)
-    stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution)
-    m3 = GPy.models.GPRegression(X, Y.copy(), kernel3, likelihood=stu_t_likelihood)
+    t_distribution = GPy.likelihoods.StudentT(deg_free=deg_free, sigma2=edited_real_sd)
+    laplace_inf = GPy.inference.latent_function_inference.LaplaceInference()
+    m3 = GPy.core.GP(X, Y.copy(), kernel3, likelihood=t_distribution, inference_method=laplace_inf)
     m3.ensure_default_constraints()
-    m3.constrain_bounded('t_noise', 1e-6, 10.)
-    m3.constrain_fixed('white', 1e-5)
+    m3['t_noise'].constrain_bounded(1e-6, 10.)
+    m3['white'] = 1e-5
+    m3['white'].constrain_fixed()
     m3.randomize()
 
     #Student t GP model on corrupt data
-    t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=edited_real_sd)
-    corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution)
-    m4 = GPy.models.GPRegression(X, Yc.copy(), kernel4, likelihood=corrupt_stu_t_likelihood)
+    t_distribution = GPy.likelihoods.StudentT(deg_free=deg_free, sigma2=edited_real_sd)
+    laplace_inf = GPy.inference.latent_function_inference.LaplaceInference()
+    m4 = GPy.core.GP(X, Yc.copy(), kernel4, likelihood=t_distribution, inference_method=laplace_inf)
     m4.ensure_default_constraints()
-    m4.constrain_bounded('t_noise', 1e-6, 10.)
-    m4.constrain_fixed('white', 1e-5)
+    m4['t_noise'].constrain_bounded(1e-6, 10.)
+    m4['white'] = 1e-5
+    m4['white'].constrain_fixed()
     m4.randomize()
 
     if optimize:
diff --git a/GPy/examples/regression.py b/GPy/examples/regression.py
index 65a50f0e..4dea1342 100644
--- a/GPy/examples/regression.py
+++ b/GPy/examples/regression.py
@@ -281,11 +281,12 @@ def toy_poisson_rbf_1d_laplace(optimize=True, plot=True):
     f_true = np.random.multivariate_normal(np.zeros(x_len), GPy.kern.rbf(1).K(X))
     Y = np.array([np.random.poisson(np.exp(f)) for f in f_true])[:,None]
 
-    noise_model = GPy.likelihoods.poisson()
-    likelihood = GPy.likelihoods.Laplace(Y,noise_model)
+    kern = GPy.kern.rbf(1)
+    poisson_lik = GPy.likelihoods.Poisson()
+    laplace_inf = GPy.inference.latent_function_inference.LaplaceInference()
 
     # create simple GP Model
-    m = GPy.models.GPRegression(X, Y, likelihood=likelihood)
+    m = GPy.core.GP(X, Y, kernel=kern, likelihood=poisson_lik, inference_method=laplace_inf)
 
     if optimize:
         m.optimize(optimizer)
diff --git a/GPy/inference/latent_function_inference/laplace.py b/GPy/inference/latent_function_inference/laplace.py
index 82313eab..bc81a86a 100644
--- a/GPy/inference/latent_function_inference/laplace.py
+++ b/GPy/inference/latent_function_inference/laplace.py
@@ -11,9 +11,8 @@
 #http://gaussianprocess.org/gpml/code.
 
 import numpy as np
-from ...util.linalg import mdot, jitchol, pddet, dpotrs, dtrtrs, dpotri, symmetrify
+from ...util.linalg import mdot, jitchol, dpotrs, dtrtrs, dpotri, symmetrify
 from ...util.misc import param_to_array
-from functools import partial as partial_func
 from posterior import Posterior
 import warnings
 from scipy import optimize
@@ -85,7 +84,6 @@ class LaplaceInference(object):
         Ki_f = Ki_f_init.copy()
         f = np.dot(K, Ki_f)
 
-
         #define the objective function (to be maximised)
         def obj(Ki_f, f):
             return -0.5*np.dot(Ki_f.flatten(), f.flatten()) + likelihood.logpdf(f, Y, extra_data=Y_metadata)
@@ -205,14 +203,6 @@ class LaplaceInference(object):
 
         return log_marginal, woodbury_vector, K_Wi_i, dL_dK, dL_dthetaL
 
-
-    #def likelihood_gradients(self, f_hat, K, Y, Ki_W_i, dL_dfhat, I_KW_i, likelihood, Y_metadata):
-        #"""
-        #Gradients with respect to likelihood parameters (dL_dthetaL)
-
-        #:rtype: array of derivatives (1 x num_likelihood_params)
-        #"""
-
     def _compute_B_statistics(self, K, W, log_concave):
         """
         Rasmussen suggests the use of a numerically stable positive definite matrix B
@@ -245,6 +235,5 @@ class LaplaceInference(object):
         #K_Wi_i_2 , _= dpotri(L2)
         #symmetrify(K_Wi_i_2)
 
-
         return K_Wi_i, L, LiW12
 
diff --git a/GPy/likelihoods/poisson.py b/GPy/likelihoods/poisson.py
index 355516bb..ba6915b8 100644
--- a/GPy/likelihoods/poisson.py
+++ b/GPy/likelihoods/poisson.py
@@ -19,8 +19,11 @@ class Poisson(Likelihood):
     .. Note::
         Y is expected to take values in {0,1,2,...}
     """
-    def __init__(self,gp_link=None,analytical_mean=False,analytical_variance=False):
-        super(Poisson, self).__init__(gp_link,analytical_mean,analytical_variance)
+    def __init__(self, gp_link=None):
+        if gp_link is None:
+            gp_link = link_functions.Log_ex_1()
+
+        super(Poisson, self).__init__(gp_link, name='Poisson')
 
     def _preprocess_values(self,Y):
         return Y
diff --git a/GPy/likelihoods/student_t.py b/GPy/likelihoods/student_t.py
index e815a399..ac93f204 100644
--- a/GPy/likelihoods/student_t.py
+++ b/GPy/likelihoods/student_t.py
@@ -244,7 +244,7 @@ class StudentT(Likelihood):
         d2logpdf_dlink2_dv = np.zeros_like(d2logpdf_dlink2_dvar) #FIXME: Not done yet
         return np.hstack((d2logpdf_dlink2_dvar, d2logpdf_dlink2_dv))
 
-    def _predictive_variance_analytical(self, mu, sigma, predictive_mean=None):
+    def predictive_variance(self, mu, sigma, predictive_mean=None):
         """
         Compute predictive variance of student_t*normal p(y*|f*)p(f*)
 
@@ -264,7 +264,7 @@ class StudentT(Likelihood):
 
         return true_var
 
-    def _predictive_mean_analytical(self, mu, sigma):
+    def predictive_mean(self, mu, sigma):
         """
         Compute mean of the prediction
         """
diff --git a/GPy/testing/likelihood_tests.py b/GPy/testing/likelihood_tests.py
index d344e23d..7f48ac95 100644
--- a/GPy/testing/likelihood_tests.py
+++ b/GPy/testing/likelihood_tests.py
@@ -86,7 +86,7 @@ class TestNoiseModels(object):
     Generic model checker
     """
     def setUp(self):
-        self.N = 5
+        self.N = 15
         self.D = 3
         self.X = np.random.rand(self.N, self.D)*10
 
@@ -104,7 +104,7 @@ class TestNoiseModels(object):
         self.var = np.random.rand(1)
 
         #Make a bigger step as lower bound can be quite curved
-        self.step = 1e-3
+        self.step = 1e-4
 
     def tearDown(self):
         self.Y = None
@@ -165,11 +165,20 @@ class TestNoiseModels(object):
                                 },
                             "laplace": True
                             },
+                        "Student_t_small_deg_free": {
+                            "model": GPy.likelihoods.StudentT(deg_free=1.5, sigma2=self.var),
+                            "grad_params": {
+                                "names": ["t_noise"],
+                                "vals": [self.var],
+                                "constraints": [("t_noise", constrain_positive), ("deg_free", constrain_fixed)]
+                                },
+                            "laplace": True
+                            },
                         "Student_t_small_var": {
                             "model": GPy.likelihoods.StudentT(deg_free=5, sigma2=self.var),
                             "grad_params": {
                                 "names": ["t_noise"],
-                                "vals": [0.01],
+                                "vals": [0.0001],
                                 "constraints": [("t_noise", constrain_positive), ("deg_free", constrain_fixed)]
                                 },
                             "laplace": True

From a2b986dc5cd7f82ab18f666b51b6d276912ee30e Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Mon, 10 Feb 2014 15:49:50 +0000
Subject: [PATCH 32/43] Fixed a couple of small params bugs

---
 GPy/core/parameterization/param.py          | 2 +-
 GPy/core/parameterization/parameter_core.py | 9 +++------
 2 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/GPy/core/parameterization/param.py b/GPy/core/parameterization/param.py
index a8408bf4..950314dd 100644
--- a/GPy/core/parameterization/param.py
+++ b/GPy/core/parameterization/param.py
@@ -4,7 +4,7 @@
 import itertools
 import numpy
 from parameter_core import Constrainable, adjust_name_for_printing
-from array_core import ObservableArray
+from array_core import ObservableArray, ParamList
 
 ###### printing
 __constraints_name__ = "Constraint"
diff --git a/GPy/core/parameterization/parameter_core.py b/GPy/core/parameterization/parameter_core.py
index 8bb33d6f..0bb97388 100644
--- a/GPy/core/parameterization/parameter_core.py
+++ b/GPy/core/parameterization/parameter_core.py
@@ -49,20 +49,19 @@ class Pickleable(object):
 
 class Parentable(object):
     def __init__(self, direct_parent=None, parent_index=None):
-        super(Parentable,self).__init__()        
+        super(Parentable,self).__init__()
         self._direct_parent_ = direct_parent
         self._parent_index_ = parent_index
-        self._highest_parent_ = highest_parent
 
     def has_parent(self):
         return self._direct_parent_ is not None
-    
+
     @property
     def _highest_parent_(self):
         if self._direct_parent_ is None:
             return self
         return self._direct_parent_._highest_parent_
-    
+
 class Nameable(Parentable):
     _name = None
     def __init__(self, name, direct_parent=None, parent_index=None):
@@ -101,8 +100,6 @@ class Constrainable(Nameable):
         """
         self._highest_parent_._unfix(self)
     unfix = unconstrain_fixed
-
-
     #===========================================================================
     # Constrain operations -> done
     #===========================================================================

From 826d2f04ff1fa6fb4dff0fc7ddf05c32d0bbc0ab Mon Sep 17 00:00:00 2001
From: Max Zwiessele <ibinbei@gmail.com>
Date: Mon, 10 Feb 2014 16:01:55 +0000
Subject: [PATCH 33/43] checkgrad now global and callable from any parameter

---
 GPy/core/model.py                           |  4 ++--
 GPy/core/parameterization/param.py          | 11 ++++++++---
 GPy/core/parameterization/parameter_core.py | 15 +++++++++++++--
 GPy/core/parameterization/parameterized.py  | 11 ++++-------
 4 files changed, 27 insertions(+), 14 deletions(-)

diff --git a/GPy/core/model.py b/GPy/core/model.py
index 35403ba7..db811801 100644
--- a/GPy/core/model.py
+++ b/GPy/core/model.py
@@ -380,7 +380,7 @@ class Model(Parameterized):
         sgd.run()
         self.optimization_runs.append(sgd)
 
-    def checkgrad(self, target_param=None, verbose=False, step=1e-6, tolerance=1e-3):
+    def _checkgrad(self, target_param=None, verbose=False, step=1e-6, tolerance=1e-3):
         """
         Check the gradient of the ,odel by comparing to a numerical
         estimate.  If the verbose flag is passed, invividual
@@ -434,7 +434,7 @@ class Model(Parameterized):
             if target_param is None:
                 param_list = range(len(x))
             else:
-                param_list = self.grep_param_names(target_param, transformed=True, search=True)
+                param_list = self._raveled_index_for(target_param)
                 if not np.any(param_list):
                     print "No free parameters to check"
                     return
diff --git a/GPy/core/parameterization/param.py b/GPy/core/parameterization/param.py
index 80661de0..4fc3aca0 100644
--- a/GPy/core/parameterization/param.py
+++ b/GPy/core/parameterization/param.py
@@ -3,8 +3,8 @@
 
 import itertools
 import numpy
-from parameter_core import Constrainable, adjust_name_for_printing
-from array_core import ObservableArray
+from parameter_core import Constrainable, Gradcheckable, adjust_name_for_printing
+from array_core import ObservableArray, ParamList
 
 ###### printing
 __constraints_name__ = "Constraint"
@@ -20,7 +20,7 @@ class Float(numpy.float64, Constrainable):
         self._base = base
 
 
-class Param(ObservableArray, Constrainable):
+class Param(ObservableArray, Constrainable, Gradcheckable):
     """
     Parameter object for GPy models.
 
@@ -547,6 +547,11 @@ class ParamConcatenation(object):
 
     def untie(self, *ties):
         [param.untie(*ties) for param in self.params]
+
+    def checkgrad(self, verbose=0, step=1e-6, tolerance=1e-3):
+        return self.params[0]._highest_parent_._checkgrad(self, verbose, step, tolerance)
+    #checkgrad.__doc__ = Gradcheckable.checkgrad.__doc__
+    
     __lt__ = lambda self, val: self._vals() < val
     __le__ = lambda self, val: self._vals() <= val
     __eq__ = lambda self, val: self._vals() == val
diff --git a/GPy/core/parameterization/parameter_core.py b/GPy/core/parameterization/parameter_core.py
index 51d9a110..b22e14f7 100644
--- a/GPy/core/parameterization/parameter_core.py
+++ b/GPy/core/parameterization/parameter_core.py
@@ -52,7 +52,6 @@ class Parentable(object):
         super(Parentable,self).__init__()        
         self._direct_parent_ = direct_parent
         self._parent_index_ = parent_index
-        self._highest_parent_ = highest_parent
 
     def has_parent(self):
         return self._direct_parent_ is not None
@@ -77,7 +76,19 @@ class Nameable(Parentable):
         from_name = self.name
         self._name = name
         if self.has_parent():
-            self._direct_parent_._name_changed(self, from_name)
+            self._direct_parent_._name_changed(self, from_name)    
+
+class Gradcheckable(Parentable):
+    #===========================================================================
+    # Gradchecking
+    #===========================================================================
+    def checkgrad(self, verbose=0, step=1e-6, tolerance=1e-3):
+        if self.has_parent():
+            return self._highest_parent_._checkgrad(self, verbose=verbose, step=step, tolerance=tolerance)
+        return self._checkgrad(self[''], verbose=verbose, step=step, tolerance=tolerance)
+    def _checkgrad(self, param):
+        raise NotImplementedError, "Need log likelihood to check gradient against"
+
 
 class Constrainable(Nameable):
     def __init__(self, name):
diff --git a/GPy/core/parameterization/parameterized.py b/GPy/core/parameterization/parameterized.py
index 61097951..746163dc 100644
--- a/GPy/core/parameterization/parameterized.py
+++ b/GPy/core/parameterization/parameterized.py
@@ -8,7 +8,7 @@ import cPickle
 import itertools
 from re import compile, _pattern_type
 from param import ParamConcatenation, Param
-from parameter_core import Constrainable, Pickleable, Observable, adjust_name_for_printing
+from parameter_core import Constrainable, Pickleable, Observable, adjust_name_for_printing, Gradcheckable
 from index_operations import ParameterIndexOperations,\
     index_empty
 from array_core import ParamList
@@ -24,7 +24,7 @@ FIXED = False
 UNFIXED = True
 #===============================================================================
 
-class Parameterized(Constrainable, Pickleable, Observable):
+class Parameterized(Constrainable, Pickleable, Observable, Gradcheckable):
     """
     Parameterized class
 
@@ -230,7 +230,7 @@ class Parameterized(Constrainable, Pickleable, Observable):
             elif not (pname in not_unique):
                 self.__dict__[pname] = p
                 self._added_names_.add(pname)
-        
+            
     #===========================================================================
     # Pickling operations
     #===========================================================================
@@ -385,7 +385,7 @@ class Parameterized(Constrainable, Pickleable, Observable):
         """
         return numpy.r_[:self.size]
     #===========================================================================
-    # Handle ties:
+    # Fixing parameters:
     #===========================================================================
     def _set_fixed(self, param_or_index):
         if not self._has_fixes(): self._fixes_ = numpy.ones(self.size, dtype=bool)
@@ -410,9 +410,6 @@ class Parameterized(Constrainable, Pickleable, Observable):
         if self._has_fixes():
             return self._fixes_[self._raveled_index_for(param)]
         return numpy.ones(self.size, dtype=bool)[self._raveled_index_for(param)]
-    #===========================================================================
-    # Fixing parameters:
-    #===========================================================================
     def _fix(self, param, warning=True):
         f = self._add_constrain(param, __fixed__, warning)
         self._set_fixed(f)

From 9eef4ebded451fef0ff045732b23ca3e14324a30 Mon Sep 17 00:00:00 2001
From: Max Zwiessele <ibinbei@gmail.com>
Date: Mon, 10 Feb 2014 16:08:11 +0000
Subject: [PATCH 34/43] fixes added for gradchecking

---
 GPy/core/model.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/GPy/core/model.py b/GPy/core/model.py
index db811801..921d50b1 100644
--- a/GPy/core/model.py
+++ b/GPy/core/model.py
@@ -435,6 +435,8 @@ class Model(Parameterized):
                 param_list = range(len(x))
             else:
                 param_list = self._raveled_index_for(target_param)
+                if self._has_fixes():
+                    param_list = param_list[self._fixes_]
                 if not np.any(param_list):
                     print "No free parameters to check"
                     return

From 32491abaff209879d1c00d6d5c684c7d5f930ca3 Mon Sep 17 00:00:00 2001
From: Max Zwiessele <ibinbei@gmail.com>
Date: Mon, 10 Feb 2014 16:42:00 +0000
Subject: [PATCH 35/43] gradchecker now with fixed inputs

---
 GPy/core/model.py                          | 11 ++++++-----
 GPy/core/parameterization/parameterized.py |  5 ++++-
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/GPy/core/model.py b/GPy/core/model.py
index 921d50b1..90edcedf 100644
--- a/GPy/core/model.py
+++ b/GPy/core/model.py
@@ -416,7 +416,7 @@ class Model(Parameterized):
         else:
             # check the gradient of each parameter individually, and do some pretty printing
             try:
-                names = self._get_param_names_transformed()
+                names = self._get_param_names()
             except NotImplementedError:
                 names = ['Variable %i' % i for i in range(len(x))]
             # Prepare for pretty-printing
@@ -436,7 +436,8 @@ class Model(Parameterized):
             else:
                 param_list = self._raveled_index_for(target_param)
                 if self._has_fixes():
-                    param_list = param_list[self._fixes_]
+                    param_list = np.intersect1d(param_list, np.r_[:self.size][self._fixes_], True)
+
                 if not np.any(param_list):
                     print "No free parameters to check"
                     return
@@ -444,7 +445,7 @@ class Model(Parameterized):
             gradient = self.objective_function_gradients(x)
             np.where(gradient==0, 1e-312, gradient)
             
-            for i in param_list:
+            for i, ind in enumerate(param_list):
                 xx = x.copy()
                 xx[i] += step
                 f1, g1 = self.objective_and_gradients(xx)
@@ -456,9 +457,9 @@ class Model(Parameterized):
                 difference = np.abs((f1 - f2) / 2 / step - gradient[i])
 
                 if (np.abs(1. - ratio) < tolerance) or np.abs(difference) < tolerance:
-                    formatted_name = "\033[92m {0} \033[0m".format(names[i])
+                    formatted_name = "\033[92m {0} \033[0m".format(names[ind])
                 else:
-                    formatted_name = "\033[91m {0} \033[0m".format(names[i])
+                    formatted_name = "\033[91m {0} \033[0m".format(names[ind])
                 r = '%.6f' % float(ratio)
                 d = '%.6f' % float(difference)
                 g = '%.6f' % gradient[i]
diff --git a/GPy/core/parameterization/parameterized.py b/GPy/core/parameterization/parameterized.py
index 746163dc..7f2b6a8a 100644
--- a/GPy/core/parameterization/parameterized.py
+++ b/GPy/core/parameterization/parameterized.py
@@ -312,8 +312,11 @@ class Parameterized(Constrainable, Pickleable, Observable, Gradcheckable):
     #===========================================================================
     # Optimization handles:
     #===========================================================================
-    def _get_param_names_transformed(self):
+    def _get_param_names(self):
         n = numpy.array([p.name_hirarchical+'['+str(i)+']' for p in self.flattened_parameters for i in p._indices()])
+        return n
+    def _get_param_names_transformed(self):
+        n = self._get_param_names()
         if self._has_fixes():
             return n[self._fixes_]
         return n

From ed87e6bbd292e581eac66efc069b01af16527ef6 Mon Sep 17 00:00:00 2001
From: Max Zwiessele <ibinbei@gmail.com>
Date: Mon, 10 Feb 2014 17:00:50 +0000
Subject: [PATCH 36/43] final touches to gradchecker

---
 GPy/core/model.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/GPy/core/model.py b/GPy/core/model.py
index 90edcedf..f4e58c9d 100644
--- a/GPy/core/model.py
+++ b/GPy/core/model.py
@@ -447,14 +447,14 @@ class Model(Parameterized):
             
             for i, ind in enumerate(param_list):
                 xx = x.copy()
-                xx[i] += step
+                xx[ind] += step
                 f1, g1 = self.objective_and_gradients(xx)
-                xx[i] -= 2.*step
+                xx[ind] -= 2.*step
                 f2, g2 = self.objective_and_gradients(xx)
                 
                 numerical_gradient = (f1 - f2) / (2 * step)
-                ratio = (f1 - f2) / (2 * step * gradient[i])
-                difference = np.abs((f1 - f2) / 2 / step - gradient[i])
+                ratio = (f1 - f2) / (2 * step * gradient[ind])
+                difference = np.abs((f1 - f2) / 2 / step - gradient[ind])
 
                 if (np.abs(1. - ratio) < tolerance) or np.abs(difference) < tolerance:
                     formatted_name = "\033[92m {0} \033[0m".format(names[ind])
@@ -462,7 +462,7 @@ class Model(Parameterized):
                     formatted_name = "\033[91m {0} \033[0m".format(names[ind])
                 r = '%.6f' % float(ratio)
                 d = '%.6f' % float(difference)
-                g = '%.6f' % gradient[i]
+                g = '%.6f' % gradient[ind]
                 ng = '%.6f' % float(numerical_gradient)
                 grad_string = "{0:<{c0}}|{1:^{c1}}|{2:^{c2}}|{3:^{c3}}|{4:^{c4}}".format(formatted_name, r, d, g, ng, c0=cols[0] + 9, c1=cols[1], c2=cols[2], c3=cols[3], c4=cols[4])
                 print grad_string

From 9e1546524e13512ef082996aa223faa15b4ab4ad Mon Sep 17 00:00:00 2001
From: Max Zwiessele <ibinbei@gmail.com>
Date: Tue, 11 Feb 2014 12:17:07 +0000
Subject: [PATCH 37/43] linear kern variational updates

---
 GPy/kern/parts/linear.py | 84 +++++++++++++---------------------------
 1 file changed, 26 insertions(+), 58 deletions(-)

diff --git a/GPy/kern/parts/linear.py b/GPy/kern/parts/linear.py
index 6ead4549..1b805219 100644
--- a/GPy/kern/parts/linear.py
+++ b/GPy/kern/parts/linear.py
@@ -8,6 +8,7 @@ from kernpart import Kernpart
 from ...util.linalg import tdot
 from ...util.misc import fast_array_equal, param_to_array
 from ...core.parameterization import Param
+from ...core.parameterization.transformations import Logexp
 
 class Linear(Kernpart):
     """
@@ -43,8 +44,9 @@ class Linear(Kernpart):
             else:
                 variances = np.ones(self.input_dim)
         
-        self.variances = Param('variances', variances)
-        self.add_parameters(self.variances)
+        self.variances = Param('variances', variances, Logexp())
+        self.variances.gradient = np.zeros(self.variances.shape)
+        self.add_parameter(self.variances)
         self.variances.add_observer(self, self.update_variance)
 
         # initialize cache
@@ -57,42 +59,35 @@ class Linear(Kernpart):
     def on_input_change(self, X):
         self._K_computations(X, None)
 
+    def update_gradients_full(self, dL_dK, X):
+        #self.variances.gradient[:] = 0
+        self._param_grad_helper(dL_dK, X, self.variances.gradient)
+    
+    def update_gradients_sparse(self, dL_dKmm, dL_dKnm, dL_dKdiag, X, Z):
+        tmp = dL_dKdiag[:, None] * X ** 2
+        if self.ARD:
+            self.variances.gradient = tmp.sum(0)
+        else:
+            self.variances.gradient = tmp.sum()
+        self._param_grad_helper(dL_dKmm, Z, None, self.variances.gradient)
+        self._param_grad_helper(dL_dKnm, X, Z, self.variances.gradient)
+        
     def update_gradients_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z):
         self._psi_computations(Z, mu, S)
-        
         # psi0:
         tmp = dL_dpsi0[:, None] * self.mu2_S
-        if self.ARD: self.variances.gradient = tmp.sum(0)
-        else: self.variances.gradient = tmp.sum()
-
+        if self.ARD: self.variances.gradient[:] = tmp.sum(0)
+        else: self.variances.gradient[:] = tmp.sum()
         #psi1
-        self.dK_dtheta(dL_dpsi1, mu, Z, self.variances.gradient)
-        
-        #from psi2
+        self._param_grad_helper(dL_dpsi1, mu, Z, self.variances.gradient)
+        #psi2
         tmp = dL_dpsi2[:, :, :, None] * (self.ZAinner[:, :, None, :] * (2 * Z)[None, None, :, :])
         if self.ARD: self.variances.gradient += tmp.sum(0).sum(0).sum(0)
         else: self.variances.gradient += tmp.sum()
-
         #from Kmm
         self._K_computations(Z, None)
-        self.dK_dtheta(dL_dKmm, Z, None, self.variances.gradient)
+        self._param_grad_helper(dL_dKmm, Z, None, self.variances.gradient)
         
-        
-#     def _get_params(self):
-#         return self.variances
-# 
-#     def _set_params(self, x):
-#         assert x.size == (self.num_params)
-#         self.variances = x
-    #def parameters_changed(self):
-    #    self.variances2 = np.square(self.variances)
-# 
-#     def _get_param_names(self):
-#         if self.num_params == 1:
-#             return ['variance']
-#         else:
-#             return ['variance_%i' % i for i in range(self.variances.size)]
-
     def K(self, X, X2, target):
         if self.ARD:
             XX = X * np.sqrt(self.variances)
@@ -109,7 +104,7 @@ class Linear(Kernpart):
     def Kdiag(self, X, target):
         np.add(target, np.sum(self.variances * np.square(X), -1), target)
 
-    def dK_dtheta(self, dL_dK, X, X2, target):
+    def _param_grad_helper(self, dL_dK, X, X2, target):
         if self.ARD:
             if X2 is None:
                 [np.add(target[i:i + 1], np.sum(dL_dK * tdot(X[:, i:i + 1])), target[i:i + 1]) for i in range(self.input_dim)]
@@ -121,13 +116,6 @@ class Linear(Kernpart):
                 self._K_computations(X, X2)
             target += np.sum(self._dot_product * dL_dK)
 
-    def dKdiag_dtheta(self, dL_dKdiag, X, target):
-        tmp = dL_dKdiag[:, None] * X ** 2
-        if self.ARD:
-            target += tmp.sum(0)
-        else:
-            target += tmp.sum()
-
     def gradients_X(self, dL_dK, X, X2, target):
         if X2 is None:
             target += 2*(((X[None,:, :] * self.variances)) * dL_dK[:, :, None]).sum(1)
@@ -145,14 +133,6 @@ class Linear(Kernpart):
         self._psi_computations(Z, mu, S)
         target += np.sum(self.variances * self.mu2_S, 1)
 
-    def dpsi0_dtheta(self, dL_dpsi0, Z, mu, S, target):
-        self._psi_computations(Z, mu, S)
-        tmp = dL_dpsi0[:, None] * self.mu2_S
-        if self.ARD:
-            target += tmp.sum(0)
-        else:
-            target += tmp.sum()
-
     def dpsi0_dmuS(self, dL_dpsi0, Z, mu, S, target_mu, target_S):
         target_mu += dL_dpsi0[:, None] * (2.0 * mu * self.variances)
         target_S += dL_dpsi0[:, None] * self.variances
@@ -161,10 +141,6 @@ class Linear(Kernpart):
         """the variance, it does nothing"""
         self._psi1 = self.K(mu, Z, target)
 
-    def dpsi1_dtheta(self, dL_dpsi1, Z, mu, S, target):
-        """the variance, it does nothing"""
-        self.dK_dtheta(dL_dpsi1, mu, Z, target)
-
     def dpsi1_dmuS(self, dL_dpsi1, Z, mu, S, target_mu, target_S):
         """Do nothing for S, it does not affect psi1"""
         self._psi_computations(Z, mu, S)
@@ -185,21 +161,13 @@ class Linear(Kernpart):
     def dpsi2_dtheta_new(self, dL_dpsi2, Z, mu, S, target):
         tmp = np.zeros((mu.shape[0], Z.shape[0]))
         self.K(mu,Z,tmp)
-        self.dK_dtheta(2.*np.sum(dL_dpsi2*tmp[:,None,:],2),mu,Z,target)
+        self._param_grad_helper(2.*np.sum(dL_dpsi2*tmp[:,None,:],2),mu,Z,target)
         result= 2.*(dL_dpsi2[:,:,:,None]*S[:,None,None,:]*self.variances*Z[None,:,None,:]*Z[None,None,:,:]).sum(0).sum(0).sum(0)
         if self.ARD:
             target += result.sum(0).sum(0).sum(0)
         else:
             target += result.sum()
 
-    def dpsi2_dtheta(self, dL_dpsi2, Z, mu, S, target):
-        self._psi_computations(Z, mu, S)
-        tmp = dL_dpsi2[:, :, :, None] * (self.ZAinner[:, :, None, :] * (2 * Z)[None, None, :, :])
-        if self.ARD:
-            target += tmp.sum(0).sum(0).sum(0)
-        else:
-            target += tmp.sum()
-
     def dpsi2_dmuS_new(self, dL_dpsi2, Z, mu, S, target_mu, target_S):
         tmp = np.zeros((mu.shape[0], Z.shape[0]))
         self.K(mu,Z,tmp)
@@ -309,11 +277,11 @@ class Linear(Kernpart):
         if not (fast_array_equal(X, self._X) and fast_array_equal(X2, self._X2)):
             self._X = X.copy()
             if X2 is None:
-                self._dot_product = tdot(X)
+                self._dot_product = tdot(param_to_array(X))
                 self._X2 = None
             else:
                 self._X2 = X2.copy()
-                self._dot_product = np.dot(X, X2.T)
+                self._dot_product = np.dot(param_to_array(X), param_to_array(X2.T))  
 
     def _psi_computations(self, Z, mu, S):
         # here are the "statistics" for psi1 and psi2

From 6a1c700c03cc006379218e30031f137daeebda4d Mon Sep 17 00:00:00 2001
From: Max Zwiessele <ibinbei@gmail.com>
Date: Tue, 11 Feb 2014 12:20:16 +0000
Subject: [PATCH 38/43] psi stat and kernel tests new parameterization

---
 GPy/testing/kernel_tests.py            |  2 +-
 GPy/testing/psi_stat_gradient_tests.py | 85 ++++++++++++++------------
 2 files changed, 47 insertions(+), 40 deletions(-)

diff --git a/GPy/testing/kernel_tests.py b/GPy/testing/kernel_tests.py
index 0fceac60..40cd66dd 100644
--- a/GPy/testing/kernel_tests.py
+++ b/GPy/testing/kernel_tests.py
@@ -5,7 +5,7 @@ import unittest
 import numpy as np
 import GPy
 
-verbose = False
+verbose = True
 
 try:
     import sympy
diff --git a/GPy/testing/psi_stat_gradient_tests.py b/GPy/testing/psi_stat_gradient_tests.py
index e373aaa3..56586d3b 100644
--- a/GPy/testing/psi_stat_gradient_tests.py
+++ b/GPy/testing/psi_stat_gradient_tests.py
@@ -9,42 +9,44 @@ import numpy
 import GPy
 import itertools
 from GPy.core import Model
+from GPy.core.parameterization.param import Param
+from GPy.core.parameterization.transformations import Logexp
 
 class PsiStatModel(Model):
     def __init__(self, which, X, X_variance, Z, num_inducing, kernel):
+        super(PsiStatModel, self).__init__(name='psi stat test')
         self.which = which
-        self.X = X
-        self.X_variance = X_variance
-        self.Z = Z
+        self.X = Param("X", X)
+        self.X_variance = Param('X_variance', X_variance, Logexp())
+        self.Z = Param("Z", Z)
         self.N, self.input_dim = X.shape
         self.num_inducing, input_dim = Z.shape
         assert self.input_dim == input_dim, "shape missmatch: Z:{!s} X:{!s}".format(Z.shape, X.shape)
         self.kern = kernel
-        super(PsiStatModel, self).__init__()
         self.psi_ = self.kern.__getattribute__(self.which)(self.Z, self.X, self.X_variance)
-    def _get_param_names(self):
-        Xnames = ["{}_{}_{}".format(what, i, j) for what, i, j in itertools.product(['X', 'X_variance'], range(self.N), range(self.input_dim))]
-        Znames = ["Z_{}_{}".format(i, j) for i, j in itertools.product(range(self.num_inducing), range(self.input_dim))]
-        return Xnames + Znames + self.kern._get_param_names()
-    def _get_params(self):
-        return numpy.hstack([self.X.flatten(), self.X_variance.flatten(), self.Z.flatten(), self.kern._get_params()])
-    def _set_params(self, x, save_old=True, save_count=0):
-        start, end = 0, self.X.size
-        self.X = x[start:end].reshape(self.N, self.input_dim)
-        start, end = end, end + self.X_variance.size
-        self.X_variance = x[start: end].reshape(self.N, self.input_dim)
-        start, end = end, end + self.Z.size
-        self.Z = x[start: end].reshape(self.num_inducing, self.input_dim)
-        self.kern._set_params(x[end:])
+        self.add_parameters(self.X, self.X_variance, self.Z, self.kern)
+    
     def log_likelihood(self):
         return self.kern.__getattribute__(self.which)(self.Z, self.X, self.X_variance).sum()
-    def _log_likelihood_gradients(self):
+    
+    def parameters_changed(self):
         psimu, psiS = self.kern.__getattribute__("d" + self.which + "_dmuS")(numpy.ones_like(self.psi_), self.Z, self.X, self.X_variance)
+        self.X.gradient = psimu
+        self.X_variance.gradient = psiS
         #psimu, psiS = numpy.ones(self.N * self.input_dim), numpy.ones(self.N * self.input_dim)
-        psiZ = self.kern.__getattribute__("d" + self.which + "_dZ")(numpy.ones_like(self.psi_), self.Z, self.X, self.X_variance)
+        try: psiZ = self.kern.__getattribute__("d" + self.which + "_dZ")(numpy.ones_like(self.psi_), self.Z, self.X, self.X_variance)
+        except AttributeError: psiZ = numpy.zeros_like(self.Z)
+        self.Z.gradient = psiZ
         #psiZ = numpy.ones(self.num_inducing * self.input_dim)
-        thetagrad = self.kern.__getattribute__("d" + self.which + "_dtheta")(numpy.ones_like(self.psi_), self.Z, self.X, self.X_variance).flatten()
-        return numpy.hstack((psimu.flatten(), psiS.flatten(), psiZ.flatten(), thetagrad))
+        N,M = self.X.shape[0], self.Z.shape[0]
+        dL_dpsi0, dL_dpsi1, dL_dpsi2 = numpy.zeros([N]), numpy.zeros([N,M]), numpy.zeros([N,M,M])
+        if self.which == 'psi0': dL_dpsi0 += 1
+        if self.which == 'psi1': dL_dpsi1 += 1
+        if self.which == 'psi2': dL_dpsi2 += 1
+        self.kern.update_gradients_variational(numpy.zeros([1,1]), 
+                                               dL_dpsi0, 
+                                               dL_dpsi1, 
+                                               dL_dpsi2, self.X, self.X_variance, self.Z)
 
 class DPsiStatTest(unittest.TestCase):
     input_dim = 5
@@ -57,61 +59,66 @@ class DPsiStatTest(unittest.TestCase):
     Y = X.dot(numpy.random.randn(input_dim, input_dim))
 #     kernels = [GPy.kern.linear(input_dim, ARD=True, variances=numpy.random.rand(input_dim)), GPy.kern.rbf(input_dim, ARD=True), GPy.kern.bias(input_dim)]
 
-    kernels = [GPy.kern.linear(input_dim), GPy.kern.rbf(input_dim), GPy.kern.bias(input_dim),
-               GPy.kern.linear(input_dim) + GPy.kern.bias(input_dim),
-               GPy.kern.rbf(input_dim) + GPy.kern.bias(input_dim)]
+    kernels = [
+               GPy.kern.linear(input_dim), 
+               GPy.kern.rbf(input_dim), 
+               #GPy.kern.bias(input_dim),
+               #GPy.kern.linear(input_dim) + GPy.kern.bias(input_dim),
+               #GPy.kern.rbf(input_dim) + GPy.kern.bias(input_dim)
+               ]
 
     def testPsi0(self):
         for k in self.kernels:
             m = PsiStatModel('psi0', X=self.X, X_variance=self.X_var, Z=self.Z,\
                              num_inducing=self.num_inducing, kernel=k)
-            m.ensure_default_constraints()
+            #m.ensure_default_constraints(warning=0)
             m.randomize()
-            assert m.checkgrad(), "{} x psi0".format("+".join(map(lambda x: x.name, k.parts)))
+            import ipdb;ipdb.set_trace()
+            assert m.checkgrad(), "{} x psi0".format("+".join(map(lambda x: x.name, k._parameters_)))
         
     def testPsi1(self):
         for k in self.kernels:
             m = PsiStatModel('psi1', X=self.X, X_variance=self.X_var, Z=self.Z,
                      num_inducing=self.num_inducing, kernel=k)
-            m.ensure_default_constraints()
+            m.ensure_default_constraints(warning=0)
             m.randomize()
-            assert m.checkgrad(), "{} x psi1".format("+".join(map(lambda x: x.name, k.parts)))
+            assert m.checkgrad(), "{} x psi1".format("+".join(map(lambda x: x.name, k._parameters_)))
 
     def testPsi2_lin(self):
         k = self.kernels[0]
         m = PsiStatModel('psi2', X=self.X, X_variance=self.X_var, Z=self.Z,
                  num_inducing=self.num_inducing, kernel=k)
-        m.ensure_default_constraints()
+        m.ensure_default_constraints(warning=0)
         m.randomize()
-        assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k.parts)))
+        assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k._parameters_)))
     def testPsi2_lin_bia(self):
         k = self.kernels[3]
         m = PsiStatModel('psi2', X=self.X, X_variance=self.X_var, Z=self.Z,
                      num_inducing=self.num_inducing, kernel=k)
-        m.ensure_default_constraints()
+        m.ensure_default_constraints(warning=0)
         m.randomize()
-        assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k.parts)))
+        assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k._parameters_)))
     def testPsi2_rbf(self):
         k = self.kernels[1]
         m = PsiStatModel('psi2', X=self.X, X_variance=self.X_var, Z=self.Z,
                      num_inducing=self.num_inducing, kernel=k)
-        m.ensure_default_constraints()
+        m.ensure_default_constraints(warning=0)
         m.randomize()
-        assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k.parts)))
+        assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k._parameters_)))
     def testPsi2_rbf_bia(self):
         k = self.kernels[-1]
         m = PsiStatModel('psi2', X=self.X, X_variance=self.X_var, Z=self.Z,
                      num_inducing=self.num_inducing, kernel=k)
-        m.ensure_default_constraints()
+        m.ensure_default_constraints(warning=0)
         m.randomize()
-        assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k.parts)))
+        assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k._parameters_)))
     def testPsi2_bia(self):
         k = self.kernels[2]
         m = PsiStatModel('psi2', X=self.X, X_variance=self.X_var, Z=self.Z,
                      num_inducing=self.num_inducing, kernel=k)
-        m.ensure_default_constraints()
+        m.ensure_default_constraints(warning=0)
         m.randomize()
-        assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k.parts)))
+        assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k._parameters_)))
 
 
 if __name__ == "__main__":

From b9444277334252d904c67a101491aba0e29aefad Mon Sep 17 00:00:00 2001
From: Max Zwiessele <ibinbei@gmail.com>
Date: Tue, 11 Feb 2014 12:20:57 +0000
Subject: [PATCH 39/43] rename dK_dtheta > gradients_X

---
 GPy/core/model.py                             | 44 +++++++-------
 GPy/core/parameterization/array_core.py       | 20 ++++++-
 GPy/core/parameterization/param.py            | 27 ++++-----
 GPy/core/parameterization/parameter_core.py   |  3 +-
 GPy/core/parameterization/parameterized.py    |  2 +
 GPy/core/svigp.py                             |  4 +-
 .../latent_function_inference/fitc.py         |  8 +--
 GPy/kern/kern.py                              | 57 ++++++-------------
 GPy/kern/parts/Brownian.py                    |  2 +-
 GPy/kern/parts/Matern32.py                    |  2 +-
 GPy/kern/parts/Matern52.py                    |  2 +-
 GPy/kern/parts/ODE_1.py                       |  2 +-
 GPy/kern/parts/eq_ode1.py                     |  2 +-
 GPy/kern/parts/exponential.py                 |  2 +-
 GPy/kern/parts/finite_dimensional.py          |  2 +-
 GPy/kern/parts/fixed.py                       |  2 +-
 GPy/kern/parts/gibbs.py                       |  2 +-
 GPy/kern/parts/hetero.py                      |  2 +-
 GPy/kern/parts/hierarchical.py                |  4 +-
 GPy/kern/parts/independent_outputs.py         |  4 +-
 GPy/kern/parts/kernpart.py                    | 17 ++++--
 GPy/kern/parts/mlp.py                         |  2 +-
 GPy/kern/parts/periodic_Matern32.py           |  2 +-
 GPy/kern/parts/periodic_Matern52.py           |  2 +-
 GPy/kern/parts/periodic_exponential.py        |  2 +-
 GPy/kern/parts/poly.py                        |  2 +-
 GPy/kern/parts/prod.py                        | 10 ++--
 GPy/kern/parts/prod_orthogonal.py             | 10 ++--
 GPy/kern/parts/rational_quadratic.py          |  2 +-
 GPy/kern/parts/rbf.py                         |  5 +-
 GPy/kern/parts/rbf_inv.py                     |  2 +-
 GPy/kern/parts/rbfcos.py                      |  2 +-
 GPy/kern/parts/spline.py                      |  2 +-
 GPy/kern/parts/symmetric.py                   | 10 ++--
 GPy/kern/parts/sympykern.py                   |  2 +-
 35 files changed, 136 insertions(+), 129 deletions(-)

diff --git a/GPy/core/model.py b/GPy/core/model.py
index f4e58c9d..57d41602 100644
--- a/GPy/core/model.py
+++ b/GPy/core/model.py
@@ -260,17 +260,18 @@ class Model(Parameterized):
         """
         positive_strings = ['variance', 'lengthscale', 'precision', 'kappa', 'sensitivity']
         # param_names = self._get_param_names()
-        for s in positive_strings:
-            paramlist = self.grep_param_names(".*"+s)
-            for param in paramlist:
-                for p in param.flattened_parameters:
-                    rav_i = set(self._raveled_index_for(p))
-                    for constraint in self.constraints.iter_properties():
-                        rav_i -= set(self._constraint_indices(p, constraint))
-                    rav_i -= set(np.nonzero(self._fixes_for(p)!=UNFIXED)[0])
-                    ind = self._backtranslate_index(p, np.array(list(rav_i), dtype=int))
-                    if ind.size != 0:
-                        p[np.unravel_index(ind, p.shape)].constrain_positive()
+        
+#         for s in positive_strings:
+#             paramlist = self.grep_param_names(".*"+s)
+#             for param in paramlist:
+#                 for p in param.flattened_parameters:
+#                     rav_i = set(self._raveled_index_for(p))
+#                     for constraint in self.constraints.iter_properties():
+#                         rav_i -= set(self._constraint_indices(p, constraint))
+#                     rav_i -= set(np.nonzero(self._fixes_for(p)!=UNFIXED)[0])
+#                     ind = self._backtranslate_index(p, np.array(list(rav_i), dtype=int))
+#                     if ind.size != 0:
+#                         p[np.unravel_index(ind, p.shape)].constrain_positive(warning=warning)
 #             if paramlist:
 #                 self.__getitem__(None, paramlist).constrain_positive(warning=warning)
 #         currently_constrained = self.all_constrained_indices()
@@ -405,13 +406,13 @@ class Model(Parameterized):
             dx = step * np.sign(np.random.uniform(-1, 1, x.size))
 
             # evaulate around the point x
-            f1, g1 = self.objective_and_gradients(x + dx)
-            f2, g2 = self.objective_and_gradients(x - dx)
+            f1 = self.objective_function(x + dx)
+            f2 = self.objective_function(x - dx)
             gradient = self.objective_function_gradients(x)
 
             numerical_gradient = (f1 - f2) / (2 * dx)
             global_ratio = (f1 - f2) / (2 * np.dot(dx, np.where(gradient==0, 1e-32, gradient)))
-            
+
             return (np.abs(1. - global_ratio) < tolerance) or (np.abs(gradient - numerical_gradient).mean() < tolerance)
         else:
             # check the gradient of each parameter individually, and do some pretty printing
@@ -430,7 +431,6 @@ class Model(Parameterized):
             header_string = map(lambda x: '|'.join(x), [header_string])
             separator = '-' * len(header_string[0])
             print '\n'.join([header_string[0], separator])
-
             if target_param is None:
                 param_list = range(len(x))
             else:
@@ -438,19 +438,19 @@ class Model(Parameterized):
                 if self._has_fixes():
                     param_list = np.intersect1d(param_list, np.r_[:self.size][self._fixes_], True)
 
-                if not np.any(param_list):
+                if param_list.size == 0:
                     print "No free parameters to check"
                     return
 
             gradient = self.objective_function_gradients(x)
             np.where(gradient==0, 1e-312, gradient)
-            
+            ret = True
             for i, ind in enumerate(param_list):
                 xx = x.copy()
                 xx[ind] += step
-                f1, g1 = self.objective_and_gradients(xx)
+                f1 = self.objective_function(xx)
                 xx[ind] -= 2.*step
-                f2, g2 = self.objective_and_gradients(xx)
+                f2 = self.objective_function(xx)
                 
                 numerical_gradient = (f1 - f2) / (2 * step)
                 ratio = (f1 - f2) / (2 * step * gradient[ind])
@@ -458,15 +458,19 @@ class Model(Parameterized):
 
                 if (np.abs(1. - ratio) < tolerance) or np.abs(difference) < tolerance:
                     formatted_name = "\033[92m {0} \033[0m".format(names[ind])
+                    ret &= True
                 else:
                     formatted_name = "\033[91m {0} \033[0m".format(names[ind])
+                    ret &= False
+
                 r = '%.6f' % float(ratio)
                 d = '%.6f' % float(difference)
                 g = '%.6f' % gradient[ind]
                 ng = '%.6f' % float(numerical_gradient)
                 grad_string = "{0:<{c0}}|{1:^{c1}}|{2:^{c2}}|{3:^{c3}}|{4:^{c4}}".format(formatted_name, r, d, g, ng, c0=cols[0] + 9, c1=cols[1], c2=cols[2], c3=cols[3], c4=cols[4])
                 print grad_string
-
+            return ret
+        
     def input_sensitivity(self):
         """
         return an array describing the sesitivity of the model to each input
diff --git a/GPy/core/parameterization/array_core.py b/GPy/core/parameterization/array_core.py
index c95f3ce3..4b5b7700 100644
--- a/GPy/core/parameterization/array_core.py
+++ b/GPy/core/parameterization/array_core.py
@@ -27,6 +27,24 @@ class ParamList(list):
         return False
     
     pass
+class C(np.ndarray):
+    __array_priority__ = 1.
+    def __new__(cls, array):
+        obj = array.view(cls)
+        return obj
+    #def __array_finalize__(self, obj):
+    #    #print 'finalize'
+    #    return obj
+    def __array_prepare__(self, out_arr, context):
+        #print 'prepare'
+        while type(out_arr) is C:
+            out_arr = out_arr.base
+        return out_arr
+    def __array_wrap__(self, out_arr, context):
+        #print 'wrap', type(self), type(out_arr), context
+        while type(out_arr) is C:
+            out_arr = out_arr.base
+        return out_arr
 
 class ObservableArray(ListArray, Observable):
     """
@@ -35,7 +53,7 @@ class ObservableArray(ListArray, Observable):
     will be called every time this array changes. The callable
     takes exactly one argument, which is this array itself.
     """
-    __array_priority__ = 0 # Never give back Param
+    __array_priority__ = -1 # Never give back ObservableArray
     def __new__(cls, input_array):
         cls.__name__ = "ObservableArray\n     "
         obj = super(ObservableArray, cls).__new__(cls, input_array).view(cls)
diff --git a/GPy/core/parameterization/param.py b/GPy/core/parameterization/param.py
index 583c6425..b76b3858 100644
--- a/GPy/core/parameterization/param.py
+++ b/GPy/core/parameterization/param.py
@@ -24,8 +24,11 @@ class Param(ObservableArray, Constrainable, Gradcheckable):
     """
     Parameter object for GPy models.
 
-    :param name:        name of the parameter to be printed
-    :param input_array: array which this parameter handles
+    :param str name:           name of the parameter to be printed
+    :param input_array:        array which this parameter handles
+    :type input_array:         numpy.ndarray
+    :param default_constraint: The default constraint for this parameter
+    :type default_constraint:  transformation instance (e.g. ``Logexp()``) | None
 
     You can add/remove constraints by calling constrain on the parameter itself, e.g:
 
@@ -40,19 +43,10 @@ class Param(ObservableArray, Constrainable, Gradcheckable):
 
     See :py:class:`GPy.core.parameterized.Parameterized` for more details on constraining etc.
 
-    This ndarray can be stored in lists and checked if it is in.
-
-    >>> import numpy as np
-    >>> x = np.random.normal(size=(10,3))
-    >>> x in [[1], x, [3]]
-    True
-
-    WARNING: This overrides the functionality of x==y!!!
-    Use numpy.equal(x,y) for element-wise equality testing.
     """
-    __array_priority__ = 0 # Never give back Param
+    __array_priority__ = -1 # Never give back Param
     _fixes_ = None
-    def __new__(cls, name, input_array, *args, **kwargs):
+    def __new__(cls, name, input_array, default_constraint=None):
         obj = numpy.atleast_1d(super(Param, cls).__new__(cls, input_array=input_array))
         obj._current_slice_ = (slice(obj.shape[0]),)
         obj._realshape_ = obj.shape
@@ -66,8 +60,8 @@ class Param(ObservableArray, Constrainable, Gradcheckable):
         obj.gradient = None
         return obj
 
-    def __init__(self, name, input_array):
-        super(Param, self).__init__(name=name)
+    def __init__(self, name, input_array, default_constraint=None):
+        super(Param, self).__init__(name=name, default_constraint=default_constraint)
 
     def __array_finalize__(self, obj):
         # see InfoArray.__array_finalize__ for comments
@@ -75,6 +69,7 @@ class Param(ObservableArray, Constrainable, Gradcheckable):
         super(Param, self).__array_finalize__(obj)
         self._direct_parent_ = getattr(obj, '_direct_parent_', None)
         self._parent_index_ = getattr(obj, '_parent_index_', None)
+        self._default_constraint_ = getattr(obj, '_default_constraint_', None)
         self._current_slice_ = getattr(obj, '_current_slice_', None)
         self._tied_to_me_ = getattr(obj, '_tied_to_me_', None)
         self._tied_to_ = getattr(obj, '_tied_to_', None)
@@ -97,6 +92,7 @@ class Param(ObservableArray, Constrainable, Gradcheckable):
                             (self.name,
                              self._direct_parent_,
                              self._parent_index_,
+                             self._default_constraint_,
                              self._current_slice_,
                              self._realshape_,
                              self._realsize_,
@@ -117,6 +113,7 @@ class Param(ObservableArray, Constrainable, Gradcheckable):
         self._realsize_ = state.pop()
         self._realshape_ = state.pop()
         self._current_slice_ = state.pop()
+        self._default_constraint_ = state.pop()
         self._parent_index_ = state.pop()
         self._direct_parent_ = state.pop()
         self.name = state.pop()
diff --git a/GPy/core/parameterization/parameter_core.py b/GPy/core/parameterization/parameter_core.py
index 7505c796..1075d808 100644
--- a/GPy/core/parameterization/parameter_core.py
+++ b/GPy/core/parameterization/parameter_core.py
@@ -91,8 +91,9 @@ class Gradcheckable(Parentable):
 
 
 class Constrainable(Nameable):
-    def __init__(self, name):
+    def __init__(self, name, default_constraint=None):
         super(Constrainable,self).__init__(name)
+        self._default_constraint_ = default_constraint
     #===========================================================================
     # Fixing Parameters:
     #===========================================================================
diff --git a/GPy/core/parameterization/parameterized.py b/GPy/core/parameterization/parameterized.py
index 7f2b6a8a..678b119b 100644
--- a/GPy/core/parameterization/parameterized.py
+++ b/GPy/core/parameterization/parameterized.py
@@ -171,6 +171,8 @@ class Parameterized(Constrainable, Pickleable, Observable, Gradcheckable):
             for t, ind in param._constraints_.iteritems():
                 self.constraints.add(t, ind+self._offset_for(param))
             param._constraints_.clear()
+        if param._default_constraint_ is not None:
+            self._add_constrain(param, param._default_constraint_, False)
         if self._has_fixes() and np.all(self._fixes_): # ==UNFIXED
             self._fixes_= None
 
diff --git a/GPy/core/svigp.py b/GPy/core/svigp.py
index 9e4f3b12..a2c7acee 100644
--- a/GPy/core/svigp.py
+++ b/GPy/core/svigp.py
@@ -154,13 +154,13 @@ class SVIGP(GP):
             self.psi2 = None
 
     def dL_dtheta(self):
-        dL_dtheta = self.kern.dK_dtheta(self.dL_dKmm, self.Z)
+        dL_dtheta = self.kern._param_grad_helper(self.dL_dKmm, self.Z)
         if self.has_uncertain_inputs:
             dL_dtheta += self.kern.dpsi0_dtheta(self.dL_dpsi0, self.Z, self.X_batch, self.X_variance_batch)
             dL_dtheta += self.kern.dpsi1_dtheta(self.dL_dpsi1, self.Z, self.X_batch, self.X_variance_batch)
             dL_dtheta += self.kern.dpsi2_dtheta(self.dL_dpsi2, self.Z, self.X_batch, self.X_variance_batch)
         else:
-            dL_dtheta += self.kern.dK_dtheta(self.dL_dpsi1, self.X_batch, self.Z)
+            dL_dtheta += self.kern._param_grad_helper(self.dL_dpsi1, self.X_batch, self.Z)
             dL_dtheta += self.kern.dKdiag_dtheta(self.dL_dpsi0, self.X_batch)
         return dL_dtheta
 
diff --git a/GPy/inference/latent_function_inference/fitc.py b/GPy/inference/latent_function_inference/fitc.py
index e4c01252..59719d86 100644
--- a/GPy/inference/latent_function_inference/fitc.py
+++ b/GPy/inference/latent_function_inference/fitc.py
@@ -116,8 +116,8 @@ class FITC(SparseGP):
             K_pp_K = np.dot(Kmmipsi1[:,i:(i+1)],Kmmipsi1[:,i:(i+1)].T)
             _dpsi1 = (-V_n**2 - alpha_n + 2.*gamma_k - gamma_n**2) * Kmmipsi1.T[i:(i+1),:]
             _dKmm = .5*(V_n**2 + alpha_n + gamma_n**2 - 2.*gamma_k) * K_pp_K #Diag_dD_dKmm
-            self._dpsi1_dtheta += self.kern.dK_dtheta(_dpsi1,self.X[i:i+1,:],self.Z)
-            self._dKmm_dtheta += self.kern.dK_dtheta(_dKmm,self.Z)
+            self._dpsi1_dtheta += self.kern._param_grad_helper(_dpsi1,self.X[i:i+1,:],self.Z)
+            self._dKmm_dtheta += self.kern._param_grad_helper(_dKmm,self.Z)
             self._dKmm_dX += self.kern.gradients_X(_dKmm ,self.Z)
             self._dpsi1_dX += self.kern.gradients_X(_dpsi1.T,self.Z,self.X[i:i+1,:])
 
@@ -163,8 +163,8 @@ class FITC(SparseGP):
 
     def dL_dtheta(self):
         dL_dtheta = self.kern.dKdiag_dtheta(self._dL_dpsi0,self.X)
-        dL_dtheta += self.kern.dK_dtheta(self._dL_dpsi1,self.X,self.Z)
-        dL_dtheta += self.kern.dK_dtheta(self._dL_dKmm,X=self.Z)
+        dL_dtheta += self.kern._param_grad_helper(self._dL_dpsi1,self.X,self.Z)
+        dL_dtheta += self.kern._param_grad_helper(self._dL_dKmm,X=self.Z)
         dL_dtheta += self._dKmm_dtheta
         dL_dtheta += self._dpsi1_dtheta
         return dL_dtheta
diff --git a/GPy/kern/kern.py b/GPy/kern/kern.py
index c97807fb..98945c2d 100644
--- a/GPy/kern/kern.py
+++ b/GPy/kern/kern.py
@@ -8,6 +8,7 @@ from parts.prod import Prod as prod
 from parts.linear import Linear
 from parts.kernpart import Kernpart
 from ..core.parameterization import Parameterized
+from GPy.core.parameterization.param import Param
 
 class kern(Parameterized):
     def __init__(self, input_dim, parts=[], input_slices=None):
@@ -84,7 +85,7 @@ class kern(Parameterized):
 #         represents the gradient in the transformed space (i.e. that given by
 #         get_params_transformed())
 #
-#         :param g: the gradient vector for the current model, usually created by dK_dtheta
+#         :param g: the gradient vector for the current model, usually created by _param_grad_helper
 #         """
 #         x = self._get_params()
 #         [np.place(g, index, g[index] * constraint.gradfactor(x[index]))
@@ -294,7 +295,7 @@ class kern(Parameterized):
     def update_gradients_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z):
         [p.update_gradients_variational(dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z) for p in self._parameters_]
 
-    def dK_dtheta(self, dL_dK, X, X2=None):
+    def _param_grad_helper(self, dL_dK, X, X2=None):
         """
         Compute the gradient of the covariance function with respect to the parameters.
 
@@ -310,9 +311,9 @@ class kern(Parameterized):
         assert X.shape[1] == self.input_dim
         target = np.zeros(self.size)
         if X2 is None:
-            [p.dK_dtheta(dL_dK, X[:, i_s], None, target[ps]) for p, i_s, ps, in zip(self._parameters_, self.input_slices, self._param_slices_)]
+            [p._param_grad_helper(dL_dK, X[:, i_s], None, target[ps]) for p, i_s, ps, in zip(self._parameters_, self.input_slices, self._param_slices_)]
         else:
-            [p.dK_dtheta(dL_dK, X[:, i_s], X2[:, i_s], target[ps]) for p, i_s, ps, in zip(self._parameters_, self.input_slices, self._param_slices_)]
+            [p._param_grad_helper(dL_dK, X[:, i_s], X2[:, i_s], target[ps]) for p, i_s, ps, in zip(self._parameters_, self.input_slices, self._param_slices_)]
 
         return self._transform_gradients(target)
 
@@ -484,6 +485,7 @@ from GPy.core.model import Model
 class Kern_check_model(Model):
     """This is a dummy model class used as a base class for checking that the gradients of a given kernel are implemented correctly. It enables checkgradient() to be called independently on a kernel."""
     def __init__(self, kernel=None, dL_dK=None, X=None, X2=None):
+        Model.__init__(self, 'kernel_test_model')
         num_samples = 20
         num_samples2 = 10
         if kernel==None:
@@ -495,14 +497,12 @@ class Kern_check_model(Model):
                 dL_dK = np.ones((X.shape[0], X.shape[0]))
             else:
                 dL_dK = np.ones((X.shape[0], X2.shape[0]))
-
+        
         self.kernel=kernel
+        self.add_parameter(kernel)
         self.X = X
         self.X2 = X2
         self.dL_dK = dL_dK
-        #self.constrained_indices=[]
-        #self.constraints=[]
-        Model.__init__(self, 'kernel_test_model')
 
     def is_positive_definite(self):
         v = np.linalg.eig(self.kernel.K(self.X))[0]
@@ -511,15 +511,6 @@ class Kern_check_model(Model):
         else:
             return True
 
-    def _get_params(self):
-        return self.kernel._get_params()
-
-    def _get_param_names(self):
-        return self.kernel._get_param_names()
-
-    def _set_params(self, x):
-        self.kernel._set_params(x)
-
     def log_likelihood(self):
         return (self.dL_dK*self.kernel.K(self.X, self.X2)).sum()
 
@@ -532,7 +523,7 @@ class Kern_check_dK_dtheta(Kern_check_model):
         Kern_check_model.__init__(self,kernel=kernel,dL_dK=dL_dK, X=X, X2=X2)
 
     def _log_likelihood_gradients(self):
-        return self.kernel.dK_dtheta(self.dL_dK, self.X, self.X2)
+        return self.kernel._param_grad_helper(self.dL_dK, self.X, self.X2)
 
 class Kern_check_dKdiag_dtheta(Kern_check_model):
     """This class allows gradient checks of the gradient of the diagonal of a kernel with respect to the parameters."""
@@ -540,6 +531,8 @@ class Kern_check_dKdiag_dtheta(Kern_check_model):
         Kern_check_model.__init__(self,kernel=kernel,dL_dK=dL_dK, X=X, X2=None)
         if dL_dK==None:
             self.dL_dK = np.ones((self.X.shape[0]))
+    def parameters_changed(self):
+        self.kernel.update_gradients_full(self.dL_dK, self.X)        
 
     def log_likelihood(self):
         return (self.dL_dK*self.kernel.Kdiag(self.X)).sum()
@@ -551,41 +544,25 @@ class Kern_check_dK_dX(Kern_check_model):
     """This class allows gradient checks for the gradient of a kernel with respect to X. """
     def __init__(self, kernel=None, dL_dK=None, X=None, X2=None):
         Kern_check_model.__init__(self,kernel=kernel,dL_dK=dL_dK, X=X, X2=X2)
-
+        self.remove_parameter(kernel)
+        self.X = Param('X', self.X)
+        self.add_parameter(self.X)
     def _log_likelihood_gradients(self):
         return self.kernel.gradients_X(self.dL_dK, self.X, self.X2).flatten()
 
-    def _get_param_names(self):
-        return ['X_'  +str(i) + ','+str(j) for j in range(self.X.shape[1]) for i in range(self.X.shape[0])]
-
-    def _get_params(self):
-        return self.X.flatten()
-
-    def _set_params(self, x):
-        self.X=x.reshape(self.X.shape)
-
-class Kern_check_dKdiag_dX(Kern_check_model):
+class Kern_check_dKdiag_dX(Kern_check_dK_dX):
     """This class allows gradient checks for the gradient of a kernel diagonal with respect to X. """
     def __init__(self, kernel=None, dL_dK=None, X=None, X2=None):
-        Kern_check_model.__init__(self,kernel=kernel,dL_dK=dL_dK, X=X, X2=None)
+        Kern_check_dK_dX.__init__(self,kernel=kernel,dL_dK=dL_dK, X=X, X2=None)
         if dL_dK==None:
             self.dL_dK = np.ones((self.X.shape[0]))
-
+        
     def log_likelihood(self):
         return (self.dL_dK*self.kernel.Kdiag(self.X)).sum()
 
     def _log_likelihood_gradients(self):
         return self.kernel.dKdiag_dX(self.dL_dK, self.X).flatten()
 
-    def _get_param_names(self):
-        return ['X_'  +str(i) + ','+str(j) for j in range(self.X.shape[1]) for i in range(self.X.shape[0])]
-
-    def _get_params(self):
-        return self.X.flatten()
-
-    def _set_params(self, x):
-        self.X=x.reshape(self.X.shape)
-
 def kern_test(kern, X=None, X2=None, output_ind=None, verbose=False):
     """
     This function runs on kernels to check the correctness of their
diff --git a/GPy/kern/parts/Brownian.py b/GPy/kern/parts/Brownian.py
index 17f65cbd..488e9b7a 100644
--- a/GPy/kern/parts/Brownian.py
+++ b/GPy/kern/parts/Brownian.py
@@ -43,7 +43,7 @@ class Brownian(Kernpart):
     def Kdiag(self,X,target):
         target += self.variance*X.flatten()
 
-    def dK_dtheta(self,dL_dK,X,X2,target):
+    def _param_grad_helper(self,dL_dK,X,X2,target):
         if X2 is None:
             X2 = X
         target += np.sum(np.fmin(X,X2.T)*dL_dK)
diff --git a/GPy/kern/parts/Matern32.py b/GPy/kern/parts/Matern32.py
index a95f0bcf..08fa452c 100644
--- a/GPy/kern/parts/Matern32.py
+++ b/GPy/kern/parts/Matern32.py
@@ -74,7 +74,7 @@ class Matern32(Kernpart):
         """Compute the diagonal of the covariance matrix associated to X."""
         np.add(target, self.variance, target)
 
-    def dK_dtheta(self, dL_dK, X, X2, target):
+    def _param_grad_helper(self, dL_dK, X, X2, target):
         """derivative of the covariance matrix with respect to the parameters."""
         if X2 is None: X2 = X
         dist = np.sqrt(np.sum(np.square((X[:, None, :] - X2[None, :, :]) / self.lengthscale), -1))
diff --git a/GPy/kern/parts/Matern52.py b/GPy/kern/parts/Matern52.py
index 1f87fefb..7d36254c 100644
--- a/GPy/kern/parts/Matern52.py
+++ b/GPy/kern/parts/Matern52.py
@@ -74,7 +74,7 @@ class Matern52(Kernpart):
         """Compute the diagonal of the covariance matrix associated to X."""
         np.add(target,self.variance,target)
 
-    def dK_dtheta(self,dL_dK,X,X2,target):
+    def _param_grad_helper(self,dL_dK,X,X2,target):
         """derivative of the covariance matrix with respect to the parameters."""
         if X2 is None: X2 = X
         dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscale),-1))
diff --git a/GPy/kern/parts/ODE_1.py b/GPy/kern/parts/ODE_1.py
index 416278e3..15faf108 100644
--- a/GPy/kern/parts/ODE_1.py
+++ b/GPy/kern/parts/ODE_1.py
@@ -90,7 +90,7 @@ class ODE_1(Kernpart):
 
         np.add(self.varianceU*self.varianceY*(k1+k2+k3), target, target)
 
-    def dK_dtheta(self, dL_dK, X, X2, target):
+    def _param_grad_helper(self, dL_dK, X, X2, target):
         """derivative of the covariance matrix with respect to the parameters."""
         if X2 is None: X2 = X
         dist = np.abs(X - X2.T)
diff --git a/GPy/kern/parts/eq_ode1.py b/GPy/kern/parts/eq_ode1.py
index 85bb6379..bf0ca7e4 100644
--- a/GPy/kern/parts/eq_ode1.py
+++ b/GPy/kern/parts/eq_ode1.py
@@ -124,7 +124,7 @@ class Eq_ode1(Kernpart):
         #target += np.diag(self.B)[np.asarray(index,dtype=np.int).flatten()]
         pass
     
-    def dK_dtheta(self,dL_dK,X,X2,target):
+    def _param_grad_helper(self,dL_dK,X,X2,target):
         
         # First extract times and indices.
         self._extract_t_indices(X, X2, dL_dK=dL_dK)
diff --git a/GPy/kern/parts/exponential.py b/GPy/kern/parts/exponential.py
index 7cd92aff..372d4d9b 100644
--- a/GPy/kern/parts/exponential.py
+++ b/GPy/kern/parts/exponential.py
@@ -75,7 +75,7 @@ class Exponential(Kernpart):
         """Compute the diagonal of the covariance matrix associated to X."""
         np.add(target, self.variance, target)
 
-    def dK_dtheta(self, dL_dK, X, X2, target):
+    def _param_grad_helper(self, dL_dK, X, X2, target):
         """derivative of the covariance matrix with respect to the parameters."""
         if X2 is None: X2 = X
         dist = np.sqrt(np.sum(np.square((X[:, None, :] - X2[None, :, :]) / self.lengthscale), -1))
diff --git a/GPy/kern/parts/finite_dimensional.py b/GPy/kern/parts/finite_dimensional.py
index 6cc2325f..9e54fb34 100644
--- a/GPy/kern/parts/finite_dimensional.py
+++ b/GPy/kern/parts/finite_dimensional.py
@@ -50,7 +50,7 @@ class FiniteDimensional(Kernpart):
     def Kdiag(self,X,target):
         product = np.diag(self.K(X, X))
         np.add(target,product,target)
-    def dK_dtheta(self,X,X2,target):
+    def _param_grad_helper(self,X,X2,target):
         """Return shape is NxMx(Ntheta)"""
         if X2 is None: X2 = X
         FX = np.column_stack([f(X) for f in self.F])
diff --git a/GPy/kern/parts/fixed.py b/GPy/kern/parts/fixed.py
index dd5bdb85..680f0b14 100644
--- a/GPy/kern/parts/fixed.py
+++ b/GPy/kern/parts/fixed.py
@@ -31,7 +31,7 @@ class Fixed(Kernpart):
     def K(self, X, X2, target):
         target += self.variance * self.fixed_K
 
-    def dK_dtheta(self, partial, X, X2, target):
+    def _param_grad_helper(self, partial, X, X2, target):
         target += (partial * self.fixed_K).sum()
 
     def gradients_X(self, partial, X, X2, target):
diff --git a/GPy/kern/parts/gibbs.py b/GPy/kern/parts/gibbs.py
index 717703ce..68241245 100644
--- a/GPy/kern/parts/gibbs.py
+++ b/GPy/kern/parts/gibbs.py
@@ -85,7 +85,7 @@ class Gibbs(Kernpart):
         """Compute the diagonal of the covariance matrix for X."""
         np.add(target, self.variance, target)
 
-    def dK_dtheta(self, dL_dK, X, X2, target):
+    def _param_grad_helper(self, dL_dK, X, X2, target):
         """Derivative of the covariance with respect to the parameters."""
         self._K_computations(X, X2)
         self._dK_computations(dL_dK)
diff --git a/GPy/kern/parts/hetero.py b/GPy/kern/parts/hetero.py
index f48dddb4..507f6251 100644
--- a/GPy/kern/parts/hetero.py
+++ b/GPy/kern/parts/hetero.py
@@ -80,7 +80,7 @@ class Hetero(Kernpart):
         """Helper function for computing the diagonal elements of the covariance."""
         return self.mapping.f(X).flatten()**2
 
-    def dK_dtheta(self, dL_dK, X, X2, target):
+    def _param_grad_helper(self, dL_dK, X, X2, target):
         """Derivative of the covariance with respect to the parameters."""
         if (X2 is None) or (X2 is X):
             dL_dKdiag = dL_dK.flat[::dL_dK.shape[0]+1]
diff --git a/GPy/kern/parts/hierarchical.py b/GPy/kern/parts/hierarchical.py
index 43dddd2d..3ca6b444 100644
--- a/GPy/kern/parts/hierarchical.py
+++ b/GPy/kern/parts/hierarchical.py
@@ -50,9 +50,9 @@ class Hierarchical(Kernpart):
         #X,slices = X[:,:-1],index_to_slices(X[:,-1])
         #[[self.k.Kdiag(X[s],target[s]) for s in slices_i] for slices_i in slices]
 
-    def dK_dtheta(self,dL_dK,X,X2,target):
+    def _param_grad_helper(self,dL_dK,X,X2,target):
         X, X2, slices, slices2 = self._sort_slices(X,X2)
-        [[[[k.dK_dtheta(dL_dK[s,s2],X[s],X2[s2],target[p_start:p_stop]) for s in slices_i] for s2 in slices_j] for slices_i,slices_j in zip(slices_, slices2_)] for k, p_start, p_stop, slices_, slices2_ in zip(self.parts, self.param_starts, self.param_stops, slices, slices2)]
+        [[[[k._param_grad_helper(dL_dK[s,s2],X[s],X2[s2],target[p_start:p_stop]) for s in slices_i] for s2 in slices_j] for slices_i,slices_j in zip(slices_, slices2_)] for k, p_start, p_stop, slices_, slices2_ in zip(self.parts, self.param_starts, self.param_stops, slices, slices2)]
 
 
     def gradients_X(self,dL_dK,X,X2,target):
diff --git a/GPy/kern/parts/independent_outputs.py b/GPy/kern/parts/independent_outputs.py
index 8c0959c5..98f1203d 100644
--- a/GPy/kern/parts/independent_outputs.py
+++ b/GPy/kern/parts/independent_outputs.py
@@ -70,13 +70,13 @@ class IndependentOutputs(Kernpart):
         X,slices = X[:,:-1],index_to_slices(X[:,-1])
         [[self.k.Kdiag(X[s],target[s]) for s in slices_i] for slices_i in slices]
 
-    def dK_dtheta(self,dL_dK,X,X2,target):
+    def _param_grad_helper(self,dL_dK,X,X2,target):
         X,slices = X[:,:-1],index_to_slices(X[:,-1])
         if X2 is None:
             X2,slices2 = X,slices
         else:
             X2,slices2 = X2[:,:-1],index_to_slices(X2[:,-1])
-        [[[self.k.dK_dtheta(dL_dK[s,s2],X[s],X2[s2],target) for s in slices_i] for s2 in slices_j] for slices_i,slices_j in zip(slices,slices2)]
+        [[[self.k._param_grad_helper(dL_dK[s,s2],X[s],X2[s2],target) for s in slices_i] for s2 in slices_j] for slices_i,slices_j in zip(slices,slices2)]
 
 
     def gradients_X(self,dL_dK,X,X2,target):
diff --git a/GPy/kern/parts/kernpart.py b/GPy/kern/parts/kernpart.py
index 2583d525..06f1446b 100644
--- a/GPy/kern/parts/kernpart.py
+++ b/GPy/kern/parts/kernpart.py
@@ -75,14 +75,14 @@ class Kernpart(Parameterized):
         raise NotImplementedError
     def Kdiag(self,X,target):
         raise NotImplementedError
-    def dK_dtheta(self,dL_dK,X,X2,target):
+    def _param_grad_helper(self,dL_dK,X,X2,target):
         raise NotImplementedError
     def dKdiag_dtheta(self,dL_dKdiag,X,target):
-        # In the base case compute this by calling dK_dtheta. Need to
+        # In the base case compute this by calling _param_grad_helper. Need to
         # override for stationary covariances (for example) to save
         # time.
         for i in range(X.shape[0]):
-            self.dK_dtheta(dL_dKdiag[i], X[i, :][None, :], X2=None, target=target)
+            self._param_grad_helper(dL_dKdiag[i], X[i, :][None, :], X2=None, target=target)
     def psi0(self,Z,mu,S,target):
         raise NotImplementedError
     def dpsi0_dtheta(self,dL_dpsi0,Z,mu,S,target):
@@ -109,8 +109,15 @@ class Kernpart(Parameterized):
         raise NotImplementedError
     def dKdiag_dX(self, dL_dK, X, target):
         raise NotImplementedError
-
-
+    def update_gradients_full(self, dL_dK, X):
+        """Set the gradients of all parameters when doing full (N) inference."""
+        raise NotImplementedError
+    def update_gradients_sparse(self, dL_dKmm, dL_dKnm, dL_dKdiag, X, Z):
+        """Set the gradients of all parameters when doing sparse (M) inference."""
+        raise NotImplementedError
+    def update_gradients_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z):
+        """Set the gradients of all parameters when doing variational (M) inference with uncertain inputs."""
+        raise NotImplementedError
 
 class Kernpart_stationary(Kernpart):
     def __init__(self, input_dim, lengthscale=None, ARD=False):
diff --git a/GPy/kern/parts/mlp.py b/GPy/kern/parts/mlp.py
index 2ba25802..59979a62 100644
--- a/GPy/kern/parts/mlp.py
+++ b/GPy/kern/parts/mlp.py
@@ -77,7 +77,7 @@ class MLP(Kernpart):
         self._K_diag_computations(X)
         target+= self.variance*self._K_diag_dvar
 
-    def dK_dtheta(self, dL_dK, X, X2, target):
+    def _param_grad_helper(self, dL_dK, X, X2, target):
         """Derivative of the covariance with respect to the parameters."""
         self._K_computations(X, X2)
         denom3 = self._K_denom*self._K_denom*self._K_denom
diff --git a/GPy/kern/parts/periodic_Matern32.py b/GPy/kern/parts/periodic_Matern32.py
index 0de57f82..24ec45f9 100644
--- a/GPy/kern/parts/periodic_Matern32.py
+++ b/GPy/kern/parts/periodic_Matern32.py
@@ -112,7 +112,7 @@ class PeriodicMatern32(Kernpart):
         np.add(target,np.diag(mdot(FX,self.Gi,FX.T)),target)
 
     @silence_errors
-    def dK_dtheta(self,dL_dK,X,X2,target):
+    def _param_grad_helper(self,dL_dK,X,X2,target):
         """derivative of the covariance matrix with respect to the parameters (shape is num_data x num_inducing x num_params)"""
         if X2 is None: X2 = X
         FX  = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X)
diff --git a/GPy/kern/parts/periodic_Matern52.py b/GPy/kern/parts/periodic_Matern52.py
index 882084fd..1f9d90b3 100644
--- a/GPy/kern/parts/periodic_Matern52.py
+++ b/GPy/kern/parts/periodic_Matern52.py
@@ -114,7 +114,7 @@ class PeriodicMatern52(Kernpart):
         np.add(target,np.diag(mdot(FX,self.Gi,FX.T)),target)
 
     @silence_errors
-    def dK_dtheta(self,dL_dK,X,X2,target):
+    def _param_grad_helper(self,dL_dK,X,X2,target):
         """derivative of the covariance matrix with respect to the parameters (shape is num_data x num_inducing x num_params)"""
         if X2 is None: X2 = X
         FX  = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X)
diff --git a/GPy/kern/parts/periodic_exponential.py b/GPy/kern/parts/periodic_exponential.py
index d8c193e0..4562cd56 100644
--- a/GPy/kern/parts/periodic_exponential.py
+++ b/GPy/kern/parts/periodic_exponential.py
@@ -110,7 +110,7 @@ class PeriodicExponential(Kernpart):
         np.add(target,np.diag(mdot(FX,self.Gi,FX.T)),target)
 
     @silence_errors
-    def dK_dtheta(self,dL_dK,X,X2,target):
+    def _param_grad_helper(self,dL_dK,X,X2,target):
         """derivative of the covariance matrix with respect to the parameters (shape is N x num_inducing x num_params)"""
         if X2 is None: X2 = X
         FX  = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X)
diff --git a/GPy/kern/parts/poly.py b/GPy/kern/parts/poly.py
index 80abab60..0deb11f4 100644
--- a/GPy/kern/parts/poly.py
+++ b/GPy/kern/parts/poly.py
@@ -86,7 +86,7 @@ class POLY(Kernpart):
         self._K_diag_computations(X)
         target+= self.variance*self._K_diag_dvar
 
-    def dK_dtheta(self, dL_dK, X, X2, target):
+    def _param_grad_helper(self, dL_dK, X, X2, target):
         """Derivative of the covariance with respect to the parameters."""
         self._K_computations(X, X2)
         base = self.variance*self.degree*self._K_poly_arg**(self.degree-1)
diff --git a/GPy/kern/parts/prod.py b/GPy/kern/parts/prod.py
index 07286a82..364c91b3 100644
--- a/GPy/kern/parts/prod.py
+++ b/GPy/kern/parts/prod.py
@@ -54,15 +54,15 @@ class Prod(Kernpart):
         self.k1.update_gradients_full(dL_dK*self._K2, X[:,self.slice1])
         self.k2.update_gradients_full(dL_dK*self._K1, X[:,self.slice2])
 
-    def dK_dtheta(self,dL_dK,X,X2,target):
+    def _param_grad_helper(self,dL_dK,X,X2,target):
         """Derivative of the covariance matrix with respect to the parameters."""
         self._K_computations(X,X2)
         if X2 is None:
-            self.k1.dK_dtheta(dL_dK*self._K2, X[:,self.slice1], None, target[:self.k1.num_params])
-            self.k2.dK_dtheta(dL_dK*self._K1, X[:,self.slice2], None, target[self.k1.num_params:])
+            self.k1._param_grad_helper(dL_dK*self._K2, X[:,self.slice1], None, target[:self.k1.num_params])
+            self.k2._param_grad_helper(dL_dK*self._K1, X[:,self.slice2], None, target[self.k1.num_params:])
         else:
-            self.k1.dK_dtheta(dL_dK*self._K2, X[:,self.slice1], X2[:,self.slice1], target[:self.k1.num_params])
-            self.k2.dK_dtheta(dL_dK*self._K1, X[:,self.slice2], X2[:,self.slice2], target[self.k1.num_params:])
+            self.k1._param_grad_helper(dL_dK*self._K2, X[:,self.slice1], X2[:,self.slice1], target[:self.k1.num_params])
+            self.k2._param_grad_helper(dL_dK*self._K1, X[:,self.slice2], X2[:,self.slice2], target[self.k1.num_params:])
 
     def Kdiag(self,X,target):
         """Compute the diagonal of the covariance matrix associated to X."""
diff --git a/GPy/kern/parts/prod_orthogonal.py b/GPy/kern/parts/prod_orthogonal.py
index f8d1c3b2..e7dd1fdc 100644
--- a/GPy/kern/parts/prod_orthogonal.py
+++ b/GPy/kern/parts/prod_orthogonal.py
@@ -41,15 +41,15 @@ class prod_orthogonal(Kernpart):
         self._K_computations(X,X2)
         target += self._K1 * self._K2
 
-    def dK_dtheta(self,dL_dK,X,X2,target):
+    def _param_grad_helper(self,dL_dK,X,X2,target):
         """derivative of the covariance matrix with respect to the parameters."""
         self._K_computations(X,X2)
         if X2 is None:
-            self.k1.dK_dtheta(dL_dK*self._K2, X[:,:self.k1.input_dim], None, target[:self.k1.num_params])
-            self.k2.dK_dtheta(dL_dK*self._K1, X[:,self.k1.input_dim:], None, target[self.k1.num_params:])
+            self.k1._param_grad_helper(dL_dK*self._K2, X[:,:self.k1.input_dim], None, target[:self.k1.num_params])
+            self.k2._param_grad_helper(dL_dK*self._K1, X[:,self.k1.input_dim:], None, target[self.k1.num_params:])
         else:
-            self.k1.dK_dtheta(dL_dK*self._K2, X[:,:self.k1.input_dim], X2[:,:self.k1.input_dim], target[:self.k1.num_params])
-            self.k2.dK_dtheta(dL_dK*self._K1, X[:,self.k1.input_dim:], X2[:,self.k1.input_dim:], target[self.k1.num_params:])
+            self.k1._param_grad_helper(dL_dK*self._K2, X[:,:self.k1.input_dim], X2[:,:self.k1.input_dim], target[:self.k1.num_params])
+            self.k2._param_grad_helper(dL_dK*self._K1, X[:,self.k1.input_dim:], X2[:,self.k1.input_dim:], target[self.k1.num_params:])
 
     def Kdiag(self,X,target):
         """Compute the diagonal of the covariance matrix associated to X."""
diff --git a/GPy/kern/parts/rational_quadratic.py b/GPy/kern/parts/rational_quadratic.py
index bd623320..c36cee9f 100644
--- a/GPy/kern/parts/rational_quadratic.py
+++ b/GPy/kern/parts/rational_quadratic.py
@@ -52,7 +52,7 @@ class RationalQuadratic(Kernpart):
     def Kdiag(self,X,target):
         target += self.variance
 
-    def dK_dtheta(self,dL_dK,X,X2,target):
+    def _param_grad_helper(self,dL_dK,X,X2,target):
         if X2 is None: X2 = X
         dist2 = np.square((X-X2.T)/self.lengthscale)
 
diff --git a/GPy/kern/parts/rbf.py b/GPy/kern/parts/rbf.py
index e7bc8624..4b38bd0f 100644
--- a/GPy/kern/parts/rbf.py
+++ b/GPy/kern/parts/rbf.py
@@ -8,6 +8,7 @@ from kernpart import Kernpart
 from ...util.linalg import tdot
 from ...util.misc import fast_array_equal, param_to_array
 from ...core.parameterization import Param
+from ...core.parameterization.transformations import Logexp
 
 class RBF(Kernpart):
     """
@@ -50,7 +51,7 @@ class RBF(Kernpart):
             else:
                 lengthscale = np.ones(self.input_dim)
 
-        self.variance = Param('variance', variance)
+        self.variance = Param('variance', variance, Logexp())
         
         self.lengthscale = Param('lengthscale', lengthscale)
         self.lengthscale.add_observer(self, self.update_lengthscale)
@@ -141,7 +142,7 @@ class RBF(Kernpart):
         d_length = self._psi1[:,:,None] * ((self._psi1_dist_sq - 1.)/(self.lengthscale*self._psi1_denom) +1./self.lengthscale)
         dpsi1_dlength = d_length * dL_dpsi1[:, :, None]
         if not self.ARD:
-            self.lengthscale.gradeint = dpsi1_dlength.sum()
+            self.lengthscale.gradient = dpsi1_dlength.sum()
         else:
             self.lengthscale.gradient = dpsi1_dlength.sum(0).sum(0)
 
diff --git a/GPy/kern/parts/rbf_inv.py b/GPy/kern/parts/rbf_inv.py
index 0c0168a6..8405ae84 100644
--- a/GPy/kern/parts/rbf_inv.py
+++ b/GPy/kern/parts/rbf_inv.py
@@ -97,7 +97,7 @@ class RBFInv(RBF):
 #             return ['variance'] + ['inv_lengthscale%i' % i for i in range(self.inv_lengthscale.size)]
 
     # TODO: Rewrite computations so that lengthscale is not needed (but only inv. lengthscale)
-    def dK_dtheta(self, dL_dK, X, X2, target):
+    def _param_grad_helper(self, dL_dK, X, X2, target):
         self._K_computations(X, X2)
         target[0] += np.sum(self._K_dvar * dL_dK)
         if self.ARD:
diff --git a/GPy/kern/parts/rbfcos.py b/GPy/kern/parts/rbfcos.py
index fc4a376a..9a4b8ab2 100644
--- a/GPy/kern/parts/rbfcos.py
+++ b/GPy/kern/parts/rbfcos.py
@@ -73,7 +73,7 @@ class RBFCos(Kernpart):
     def Kdiag(self,X,target):
         np.add(target,self.variance,target)
 
-    def dK_dtheta(self,dL_dK,X,X2,target):
+    def _param_grad_helper(self,dL_dK,X,X2,target):
         self._K_computations(X,X2)
         target[0] += np.sum(dL_dK*self._dvar)
         if self.ARD:
diff --git a/GPy/kern/parts/spline.py b/GPy/kern/parts/spline.py
index c31258f3..3e57b8a9 100644
--- a/GPy/kern/parts/spline.py
+++ b/GPy/kern/parts/spline.py
@@ -50,7 +50,7 @@ class Spline(Kernpart):
     def Kdiag(self,X,target):
         target += self.variance*X.flatten()**3/3.
 
-    def dK_dtheta(self,X,X2,target):
+    def _param_grad_helper(self,X,X2,target):
         target += 0.5*(t*s**2) - s**3/6. + (s_t)**3*theta(s_t)/6.
 
     def dKdiag_dtheta(self,X,target):
diff --git a/GPy/kern/parts/symmetric.py b/GPy/kern/parts/symmetric.py
index ef9a8dd5..8eec2acc 100644
--- a/GPy/kern/parts/symmetric.py
+++ b/GPy/kern/parts/symmetric.py
@@ -40,7 +40,7 @@ class Symmetric(Kernpart):
         self.k.K(X,AX2,target)
         self.k.K(AX,AX2,target)
 
-    def dK_dtheta(self,dL_dK,X,X2,target):
+    def _param_grad_helper(self,dL_dK,X,X2,target):
         """derivative of the covariance matrix with respect to the parameters."""
         AX = np.dot(X,self.transform)
         if X2 is None:
@@ -48,10 +48,10 @@ class Symmetric(Kernpart):
             ZX2 = AX
         else:
             AX2 = np.dot(X2, self.transform)
-        self.k.dK_dtheta(dL_dK,X,X2,target)
-        self.k.dK_dtheta(dL_dK,AX,X2,target)
-        self.k.dK_dtheta(dL_dK,X,AX2,target)
-        self.k.dK_dtheta(dL_dK,AX,AX2,target)
+        self.k._param_grad_helper(dL_dK,X,X2,target)
+        self.k._param_grad_helper(dL_dK,AX,X2,target)
+        self.k._param_grad_helper(dL_dK,X,AX2,target)
+        self.k._param_grad_helper(dL_dK,AX,AX2,target)
 
 
     def gradients_X(self,dL_dK,X,X2,target):
diff --git a/GPy/kern/parts/sympykern.py b/GPy/kern/parts/sympykern.py
index 46f975d2..a09d4bfc 100644
--- a/GPy/kern/parts/sympykern.py
+++ b/GPy/kern/parts/sympykern.py
@@ -348,7 +348,7 @@ class spkern(Kernpart):
     def Kdiag(self,X,target):
         self._weave_inline(self._Kdiag_code, X, target)
 
-    def dK_dtheta(self,partial,X,Z,target):
+    def _param_grad_helper(self,partial,X,Z,target):
         if Z is None:
             self._weave_inline(self._dK_dtheta_code_X, X, target, Z, partial)
         else:

From 79aca59a37332d84e33abf515a82afbe4cdf6828 Mon Sep 17 00:00:00 2001
From: Max Zwiessele <ibinbei@gmail.com>
Date: Tue, 11 Feb 2014 14:44:15 +0000
Subject: [PATCH 40/43] parameter handling with default constraints

---
 GPy/core/model.py                             |  1 +
 GPy/core/parameterization/index_operations.py |  6 ++
 GPy/core/parameterization/parameterized.py    |  5 ++
 GPy/core/parameterization/variational.py      |  3 +-
 GPy/core/sparse_gp.py                         |  4 +-
 GPy/examples/regression.py                    |  6 +-
 .../latent_function_inference/varDTC.py       | 20 +++--
 GPy/kern/kern.py                              |  7 +-
 GPy/kern/parts/white.py                       |  3 +-
 GPy/likelihoods/gaussian.py                   |  4 +-
 GPy/models/bayesian_gplvm.py                  | 79 +++----------------
 GPy/models/sparse_gp_regression.py            |  2 +-
 12 files changed, 53 insertions(+), 87 deletions(-)

diff --git a/GPy/core/model.py b/GPy/core/model.py
index 57d41602..aacf20be 100644
--- a/GPy/core/model.py
+++ b/GPy/core/model.py
@@ -258,6 +258,7 @@ class Model(Parameterized):
         these terms are present in the name the parameter is
         constrained positive.
         """
+        raise DeprecationWarning, 'parameters now have default constraints'
         positive_strings = ['variance', 'lengthscale', 'precision', 'kappa', 'sensitivity']
         # param_names = self._get_param_names()
         
diff --git a/GPy/core/parameterization/index_operations.py b/GPy/core/parameterization/index_operations.py
index d52211c5..8abb31e9 100644
--- a/GPy/core/parameterization/index_operations.py
+++ b/GPy/core/parameterization/index_operations.py
@@ -77,6 +77,12 @@ class ParameterIndexOperations(object):
     def iter_properties(self):
         return self._properties.iterkeys()
     
+    def shift(self, start, size):
+        """Add `size`, in place, to every index >= `start` in each array yielded by iterindices()."""
+        for ind in self.iterindices():
+            toshift = ind>=start
+            ind[toshift] += size
+    
     def clear(self):
         self._properties.clear()
     
diff --git a/GPy/core/parameterization/parameterized.py b/GPy/core/parameterization/parameterized.py
index 678b119b..85a1e179 100644
--- a/GPy/core/parameterization/parameterized.py
+++ b/GPy/core/parameterization/parameterized.py
@@ -154,6 +154,8 @@ class Parameterized(Constrainable, Pickleable, Observable, Gradcheckable):
                 elif param._has_fixes(): self._fixes_ = np.r_[np.ones(self.size, dtype=bool), fixes_param]
 
             else:
+                start = sum(p.size for p in self._parameters_[:index])
+                self.constraints.shift(start, param.size)
                 self._parameters_.insert(index, param)
 
                 # make sure fixes and constraints are indexed right
@@ -165,10 +167,13 @@ class Parameterized(Constrainable, Pickleable, Observable, Gradcheckable):
                     self._fixes_ = np.ones(self.size+param.size, dtype=bool)
                     self._fixes_[ins:ins+param.size] = fixes_param
             self.size += param.size
+        else:
+            raise RuntimeError, """Parameter exists already added and no copy made"""
         self._connect_parameters()
         # make sure the constraints are pulled over:
         if hasattr(param, "_constraints_") and param._constraints_ is not None:
             for t, ind in param._constraints_.iteritems():
+                
                 self.constraints.add(t, ind+self._offset_for(param))
             param._constraints_.clear()
         if param._default_constraint_ is not None:
diff --git a/GPy/core/parameterization/variational.py b/GPy/core/parameterization/variational.py
index b73e25da..2e342f54 100644
--- a/GPy/core/parameterization/variational.py
+++ b/GPy/core/parameterization/variational.py
@@ -5,6 +5,7 @@ Created on 6 Nov 2013
 '''
 from parameterized import Parameterized
 from param import Param
+from transformations import Logexp
 
 class Normal(Parameterized):
     '''
@@ -15,7 +16,7 @@ class Normal(Parameterized):
     def __init__(self, means, variances, name='latent space'):
         Parameterized.__init__(self, name=name)
         self.means = Param("mean", means)
-        self.variances = Param('variance', variances)
+        self.variances = Param('variance', variances, Logexp())
         self.add_parameters(self.means, self.variances)
 
     def plot(self, *args):
diff --git a/GPy/core/sparse_gp.py b/GPy/core/sparse_gp.py
index fda201ff..130e56e2 100644
--- a/GPy/core/sparse_gp.py
+++ b/GPy/core/sparse_gp.py
@@ -61,8 +61,8 @@ class SparseGP(GP):
         if self.X_variance is None:
             self.Z.gradient += self.kern.gradients_X(self.grad_dict['dL_dKnm'].T, self.Z, self.X)
         else:
-            self.Z.gradient += self.kern.dpsi1_dZ(self.dL_dpsi1, self.Z, self.X, self.X_variance)
-            self.Z.gradient += self.kern.dpsi2_dZ(self.dL_dpsi2, self.Z, self.X, self.X_variance)
+            self.Z.gradient += self.kern.dpsi1_dZ(self.grad_dict['dL_dpsi1'], self.Z, self.X, self.X_variance)
+            self.Z.gradient += self.kern.dpsi2_dZ(self.grad_dict['dL_dpsi2'], self.Z, self.X, self.X_variance)
 
     def _raw_predict(self, Xnew, X_variance_new=None, which_parts='all', full_cov=False):
         """
diff --git a/GPy/examples/regression.py b/GPy/examples/regression.py
index 4dea1342..f8d3d5a9 100644
--- a/GPy/examples/regression.py
+++ b/GPy/examples/regression.py
@@ -473,9 +473,9 @@ def uncertain_inputs_sparse_regression(max_iters=200, optimize=True, plot=True):
     Z = np.random.uniform(-3., 3., (7, 1))
 
     k = GPy.kern.rbf(1)
-
+    import ipdb;ipdb.set_trace()
     # create simple GP Model - no input uncertainty on this one
-    m = GPy.models.SparseGPRegression(X, Y, kernel=k, Z=Z)
+    m = GPy.models.SparseGPRegression(X, Y, kernel=GPy.kern.rbf(1), Z=Z)
 
     if optimize:
         m.optimize('scg', messages=1, max_iters=max_iters)
@@ -486,7 +486,7 @@ def uncertain_inputs_sparse_regression(max_iters=200, optimize=True, plot=True):
     print m
 
     # the same Model with uncertainty
-    m = GPy.models.SparseGPRegression(X, Y, kernel=k, Z=Z, X_variance=S)
+    m = GPy.models.SparseGPRegression(X, Y, kernel=GPy.kern.rbf(1), Z=Z, X_variance=S)
     if optimize:
         m.optimize('scg', messages=1, max_iters=max_iters)
     if plot:
diff --git a/GPy/inference/latent_function_inference/varDTC.py b/GPy/inference/latent_function_inference/varDTC.py
index 07ae17c5..e156a273 100644
--- a/GPy/inference/latent_function_inference/varDTC.py
+++ b/GPy/inference/latent_function_inference/varDTC.py
@@ -4,7 +4,8 @@
 from posterior import Posterior
 from ...util.linalg import jitchol, backsub_both_sides, tdot, dtrtrs, dpotri, symmetrify
 import numpy as np
-from GPy.util.linalg import dtrtri
+from ...util.linalg import dtrtri
+from ...util.caching import Cacher
 log_2_pi = np.log(2*np.pi)
 
 class VarDTC(object):
@@ -20,8 +21,13 @@ class VarDTC(object):
     def __init__(self):
         #self._YYTfactor_cache = caching.cache()
         self.const_jitter = 1e-6
+        self.get_trYYT = Cacher(self._get_trYYT, 1)
+        self.get_YYTfactor = Cacher(self._get_YYTfactor, 1)
+    
+    def _get_trYYT(self, Y):
+        return np.sum(np.square(Y))
 
-    def get_YYTfactor(self, Y):
+    def _get_YYTfactor(self, Y):
         """
         find a matrix L which satisfies LLT = YYT. 
 
@@ -31,9 +37,8 @@ class VarDTC(object):
         if (N>D):
             return Y
         else:
-            #if Y in self.cache, return self.Cache[Y], else store Y in cache and return L.
-            raise NotImplementedError, 'TODO' #TODO
-
+            return jitchol(tdot(Y))
+            
     def get_VVTfactor(self, Y, prec):
         return Y * prec # TODO chache this, and make it effective
 
@@ -94,8 +99,9 @@ class VarDTC(object):
         LB = jitchol(B)
 
         # VVT_factor is a matrix such that tdot(VVT_factor) = VVT...this is for efficiency!
-        VVT_factor = self.get_VVTfactor(Y, beta)
-        trYYT = np.sum(np.square(Y))
+        self.YYTfactor = self.get_YYTfactor(Y)
+        VVT_factor = self.get_VVTfactor(self.YYTfactor, beta)
+        trYYT = self.get_trYYT(Y)
         psi1Vf = np.dot(psi1.T, VVT_factor)
 
         # back substutue C into psi1Vf
diff --git a/GPy/kern/kern.py b/GPy/kern/kern.py
index 98945c2d..53728d0d 100644
--- a/GPy/kern/kern.py
+++ b/GPy/kern/kern.py
@@ -153,10 +153,9 @@ class kern(Parameterized):
 #             newkern.fixed_indices = self.fixed_indices + [self.num_params + x for x in other.fixed_indices]
 #             newkern.fixed_values = self.fixed_values + other.fixed_values
 #             newkern.tied_indices = self.tied_indices + [self.num_params + x for x in other.tied_indices]
-        [newkern._add_constrain(param, transform, warning=False)
-         for param, transform in itertools.izip(
-                *itertools.chain(self.constraints.iteritems(),
-                                 other.constraints.iteritems()))]
+        
+        [newkern.constraints.add(transform, ind) for transform, ind in self.constraints.iteritems()]
+        [newkern.constraints.add(transform, ind+self.size) for transform, ind in other.constraints.iteritems()]
         newkern._fixes_ = ((self._fixes_ or 0) + (other._fixes_ or 0)) or None
 
         return newkern
diff --git a/GPy/kern/parts/white.py b/GPy/kern/parts/white.py
index c9677f28..b0e961c7 100644
--- a/GPy/kern/parts/white.py
+++ b/GPy/kern/parts/white.py
@@ -4,6 +4,7 @@
 from kernpart import Kernpart
 import numpy as np
 from ...core.parameterization import Param
+from ...core.parameterization.transformations import Logexp
 
 class White(Kernpart):
     """
@@ -17,7 +18,7 @@ class White(Kernpart):
     def __init__(self,input_dim,variance=1.):
         super(White, self).__init__(input_dim, 'white')
         self.input_dim = input_dim
-        self.variance = Param('variance', variance)
+        self.variance = Param('variance', variance, Logexp())
         self.add_parameters(self.variance)
         self._psi1 = 0 # TODO: more elegance here
 
diff --git a/GPy/likelihoods/gaussian.py b/GPy/likelihoods/gaussian.py
index e6be2261..1f5b9db4 100644
--- a/GPy/likelihoods/gaussian.py
+++ b/GPy/likelihoods/gaussian.py
@@ -1,6 +1,5 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
-
 #TODO
 """
 A lot of this code assumes that the link functio nis the identity.
@@ -18,6 +17,7 @@ from GPy.util.univariate_Gaussian import std_norm_pdf, std_norm_cdf
 import link_functions
 from likelihood import Likelihood
 from ..core.parameterization import Param
+from ..core.parameterization.transformations import Logexp
 
 class Gaussian(Likelihood):
     """
@@ -43,7 +43,7 @@ class Gaussian(Likelihood):
 
         super(Gaussian, self).__init__(gp_link, name=name)
 
-        self.variance = Param('variance', variance)
+        self.variance = Param('variance', variance, Logexp())
         self.add_parameter(self.variance)
 
         if isinstance(gp_link, link_functions.Identity):
diff --git a/GPy/models/bayesian_gplvm.py b/GPy/models/bayesian_gplvm.py
index 78851147..36f0c4b1 100644
--- a/GPy/models/bayesian_gplvm.py
+++ b/GPy/models/bayesian_gplvm.py
@@ -41,7 +41,7 @@ class BayesianGPLVM(SparseGP, GPLVM):
         self.q = Normal(X, X_variance)
         SparseGP.__init__(self, X, Y, Z, kernel, likelihood, inference_method, X_variance, name, **kwargs)
         self.add_parameter(self.q, index=0)
-        self.ensure_default_constraints()
+        #self.ensure_default_constraints()
 
     def _getstate(self):
         """
@@ -55,38 +55,6 @@ class BayesianGPLVM(SparseGP, GPLVM):
         self.init = state.pop()
         SparseGP._setstate(self, state)
 
-#     def _get_param_names(self):
-#         X_names = sum([['X_%i_%i' % (n, q) for q in range(self.input_dim)] for n in range(self.num_data)], [])
-#         S_names = sum([['X_variance_%i_%i' % (n, q) for q in range(self.input_dim)] for n in range(self.num_data)], [])
-#         return (X_names + S_names + SparseGP._get_param_names(self))
-
-    #def _get_print_names(self):
-    #    return SparseGP._get_print_names(self)
-
-#     def _get_params(self):
-#         """
-#         Horizontally stacks the parameters in order to present them to the optimizer.
-#         The resulting 1-input_dim array has this structure:
-#
-#         ===============================================================
-#         |       mu       |        S        |    Z    | theta |  beta  |
-#         ===============================================================
-#
-#         """
-#         x = np.hstack((self.X.flatten(), self.X_variance.flatten(), SparseGP._get_params(self)))
-#         return x
-
-#     def _set_params(self, x, save_old=True, save_count=0):
-#         N, input_dim = self.num_data, self.input_dim
-#         self.X = x[:self.X.size].reshape(N, input_dim).copy()
-#         self.X_variance = x[(N * input_dim):(2 * N * input_dim)].reshape(N, input_dim).copy()
-#         SparseGP._set_params(self, x[(2 * N * input_dim):])
-
-    def dKL_dmuS(self):
-        dKL_dS = (1. - (1. / (self.X_variance))) * 0.5
-        dKL_dmu = self.X
-        return dKL_dmu, dKL_dS
-
     def dL_dmuS(self):
         dL_dmu_psi0, dL_dS_psi0 = self.kern.dpsi0_dmuS(self.grad_dict['dL_dpsi0'], self.Z, self.X, self.X_variance)
         dL_dmu_psi1, dL_dS_psi1 = self.kern.dpsi1_dmuS(self.grad_dict['dL_dpsi1'], self.Z, self.X, self.X_variance)
@@ -102,45 +70,24 @@ class BayesianGPLVM(SparseGP, GPLVM):
         return 0.5 * (var_mean + var_S) - 0.5 * self.input_dim * self.num_data
 
     def parameters_changed(self):
-        self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.X_variance, self.Z, self.likelihood, self.Y)
-        self._log_marginal_likelihood -= self.KL_divergence()
+        super(BayesianGPLVM, self).parameters_changed()
+        #self._log_marginal_likelihood -= self.KL_divergence()
 
-        #The derivative of the bound wrt the inducing inputs Z
-        self.Z.gradient = self.kern.gradients_X(self.grad_dict['dL_dKmm'], self.Z)
-        self.Z.gradient += self.kern.dpsi1_dZ(self.grad_dict['dL_dpsi1'], self.Z, self.X, self.X_variance)
-        self.Z.gradient += self.kern.dpsi2_dZ(self.grad_dict['dL_dpsi2'], self.Z, self.X, self.X_variance)
-        
         dL_dmu, dL_dS = self.dL_dmuS()
-        dKL_dmu, dKL_dS = self.dKL_dmuS()
-        self.q.means.gradient = dL_dmu - dKL_dmu
-        self.q.variances.gradient = dL_dS - dKL_dS
+
+        # dL:
+        self.q.means.gradient  = dL_dmu
+        self.q.variances.gradient  = dL_dS  
+
+        # dKL:
+        #self.q.means.gradient -= self.X
+        #self.q.variances.gradient -= (1. - (1. / (self.X_variance))) * 0.5
     
-
-#     def log_likelihood(self):        
-#         ll = SparseGP.log_likelihood(self)
-#         kl = self.KL_divergence()
-#         return ll - kl
-
-    def _dbound_dmuS(self):
-        dKL_dmu, dKL_dS = self.dKL_dmuS()
-        dL_dmu, dL_dS = self.dL_dmuS()
-        d_dmu = (dL_dmu - dKL_dmu).flatten()
-        d_dS = (dL_dS - dKL_dS).flatten()
-        return np.hstack((d_dmu, d_dS))
-
-#     def _log_likelihood_gradients(self):
-#         dKL_dmu, dKL_dS = self.dKL_dmuS()
-#         dL_dmu, dL_dS = self.dL_dmuS()
-#         d_dmu = (dL_dmu - dKL_dmu).flatten()
-#         d_dS = (dL_dS - dKL_dS).flatten()
-#         self.dbound_dmuS = np.hstack((d_dmu, d_dS))
-#         self.dbound_dZtheta = SparseGP._log_likelihood_gradients(self)
-#         return np.hstack((self.dbound_dmuS.flatten(), self.dbound_dZtheta))
-
     def plot_latent(self, plot_inducing=True, *args, **kwargs):
         """
         See GPy.plotting.matplot_dep.dim_reduction_plots.plot_latent
         """
+        import sys
         assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
         from ..plotting.matplot_dep import dim_reduction_plots
 
@@ -210,7 +157,7 @@ class BayesianGPLVM(SparseGP, GPLVM):
         assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
         from ..plotting.matplot_dep import dim_reduction_plots
 
-        return dim_reduction_plots.plot_steepest_gradient_map(model,*args,**kwargs)
+        return dim_reduction_plots.plot_steepest_gradient_map(self,*args,**kwargs)
 
 def latent_cost_and_grad(mu_S, kern, Z, dL_dpsi0, dL_dpsi1, dL_dpsi2):
     """
diff --git a/GPy/models/sparse_gp_regression.py b/GPy/models/sparse_gp_regression.py
index 386380b7..c936164b 100644
--- a/GPy/models/sparse_gp_regression.py
+++ b/GPy/models/sparse_gp_regression.py
@@ -44,7 +44,7 @@ class SparseGPRegression(SparseGP):
         likelihood = likelihoods.Gaussian()
 
         SparseGP.__init__(self, X, Y, Z, kernel, likelihood, X_variance=X_variance)
-        self.ensure_default_constraints()
+        #self.ensure_default_constraints()
 
     def _getstate(self):
         return SparseGP._getstate(self)

From 4cfc13d5fc65041ea3791ad507541f43db096172 Mon Sep 17 00:00:00 2001
From: Max Zwiessele <ibinbei@gmail.com>
Date: Tue, 11 Feb 2014 15:23:49 +0000
Subject: [PATCH 41/43] kernel adding now takes over constraints

---
 GPy/core/sparse_gp.py                             | 6 +++++-
 GPy/inference/latent_function_inference/varDTC.py | 5 +++--
 GPy/testing/psi_stat_gradient_tests.py            | 8 --------
 GPy/util/misc.py                                  | 2 +-
 4 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/GPy/core/sparse_gp.py b/GPy/core/sparse_gp.py
index 130e56e2..1879145a 100644
--- a/GPy/core/sparse_gp.py
+++ b/GPy/core/sparse_gp.py
@@ -7,6 +7,7 @@ from gp import GP
 from parameterization.param import Param
 from ..inference.latent_function_inference import varDTC
 from .. import likelihoods
+from GPy.util.misc import param_to_array
 
 class SparseGP(GP):
     """
@@ -54,7 +55,10 @@ class SparseGP(GP):
         self.add_parameter(self.Z, index=0)
 
     def parameters_changed(self):
-        self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.X_variance, self.Z, self.likelihood, self.Y)
+        Xvar = self.X_variance
+        if self.X_variance is not None:
+            Xvar = param_to_array(self.X_variance)
+        self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, param_to_array(self.X), Xvar, param_to_array(self.Z), self.likelihood, self.Y)
 
         #The derivative of the bound wrt the inducing inputs Z
         self.Z.gradient = self.kern.gradients_X(self.grad_dict['dL_dKmm'], self.Z)
diff --git a/GPy/inference/latent_function_inference/varDTC.py b/GPy/inference/latent_function_inference/varDTC.py
index e156a273..237ab463 100644
--- a/GPy/inference/latent_function_inference/varDTC.py
+++ b/GPy/inference/latent_function_inference/varDTC.py
@@ -6,6 +6,7 @@ from ...util.linalg import jitchol, backsub_both_sides, tdot, dtrtrs, dpotri, sy
 import numpy as np
 from ...util.linalg import dtrtri
 from ...util.caching import Cacher
+from ...util.misc import param_to_array
 log_2_pi = np.log(2*np.pi)
 
 class VarDTC(object):
@@ -25,7 +26,7 @@ class VarDTC(object):
         self.get_YYTfactor = Cacher(self._get_YYTfactor, 1)
     
     def _get_trYYT(self, Y):
-        return np.sum(np.square(Y))
+        return param_to_array(np.sum(np.square(Y)))
 
     def _get_YYTfactor(self, Y):
         """
@@ -35,7 +36,7 @@ class VarDTC(object):
         """
         N, D = Y.shape
         if (N>D):
-            return Y
+            return param_to_array(Y)
         else:
             return jitchol(tdot(Y))
             
diff --git a/GPy/testing/psi_stat_gradient_tests.py b/GPy/testing/psi_stat_gradient_tests.py
index 56586d3b..32986c77 100644
--- a/GPy/testing/psi_stat_gradient_tests.py
+++ b/GPy/testing/psi_stat_gradient_tests.py
@@ -71,16 +71,13 @@ class DPsiStatTest(unittest.TestCase):
         for k in self.kernels:
             m = PsiStatModel('psi0', X=self.X, X_variance=self.X_var, Z=self.Z,\
                              num_inducing=self.num_inducing, kernel=k)
-            #m.ensure_default_constraints(warning=0)
             m.randomize()
-            import ipdb;ipdb.set_trace()
             assert m.checkgrad(), "{} x psi0".format("+".join(map(lambda x: x.name, k._parameters_)))
         
     def testPsi1(self):
         for k in self.kernels:
             m = PsiStatModel('psi1', X=self.X, X_variance=self.X_var, Z=self.Z,
                      num_inducing=self.num_inducing, kernel=k)
-            m.ensure_default_constraints(warning=0)
             m.randomize()
             assert m.checkgrad(), "{} x psi1".format("+".join(map(lambda x: x.name, k._parameters_)))
 
@@ -88,35 +85,30 @@ class DPsiStatTest(unittest.TestCase):
         k = self.kernels[0]
         m = PsiStatModel('psi2', X=self.X, X_variance=self.X_var, Z=self.Z,
                  num_inducing=self.num_inducing, kernel=k)
-        m.ensure_default_constraints(warning=0)
         m.randomize()
         assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k._parameters_)))
     def testPsi2_lin_bia(self):
         k = self.kernels[3]
         m = PsiStatModel('psi2', X=self.X, X_variance=self.X_var, Z=self.Z,
                      num_inducing=self.num_inducing, kernel=k)
-        m.ensure_default_constraints(warning=0)
         m.randomize()
         assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k._parameters_)))
     def testPsi2_rbf(self):
         k = self.kernels[1]
         m = PsiStatModel('psi2', X=self.X, X_variance=self.X_var, Z=self.Z,
                      num_inducing=self.num_inducing, kernel=k)
-        m.ensure_default_constraints(warning=0)
         m.randomize()
         assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k._parameters_)))
     def testPsi2_rbf_bia(self):
         k = self.kernels[-1]
         m = PsiStatModel('psi2', X=self.X, X_variance=self.X_var, Z=self.Z,
                      num_inducing=self.num_inducing, kernel=k)
-        m.ensure_default_constraints(warning=0)
         m.randomize()
         assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k._parameters_)))
     def testPsi2_bia(self):
         k = self.kernels[2]
         m = PsiStatModel('psi2', X=self.X, X_variance=self.X_var, Z=self.Z,
                      num_inducing=self.num_inducing, kernel=k)
-        m.ensure_default_constraints(warning=0)
         m.randomize()
         assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k._parameters_)))
 
diff --git a/GPy/util/misc.py b/GPy/util/misc.py
index f519a1df..dc327324 100644
--- a/GPy/util/misc.py
+++ b/GPy/util/misc.py
@@ -184,4 +184,4 @@ from :class:ndarray)"""
     assert len(param) > 0, "At least one parameter needed"
     if len(param) == 1:
         return param[0].view(np.ndarray)
-    return map(lambda x: x.view(np.ndarray), param)
+    return [x.view(np.ndarray) for x in param]

From 4d1feb9d9d9681e4741d9d0b8d9e174a5ea67ad0 Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Tue, 11 Feb 2014 16:27:08 +0000
Subject: [PATCH 42/43] first draft of DTC

---
 .../latent_function_inference/DTC.py          | 96 +++++++++++++++++++
 1 file changed, 96 insertions(+)
 create mode 100644 GPy/inference/latent_function_inference/DTC.py

diff --git a/GPy/inference/latent_function_inference/DTC.py b/GPy/inference/latent_function_inference/DTC.py
new file mode 100644
index 00000000..cc6e8606
--- /dev/null
+++ b/GPy/inference/latent_function_inference/DTC.py
@@ -0,0 +1,96 @@
+# Copyright (c) 2012, James Hensman
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+from posterior import Posterior
+from ...util.linalg import jitchol, tdot, dtrtrs, dpotri, pdinv
+import numpy as np
+log_2_pi = np.log(2*np.pi)
+
+class DTC(object):
+    """
+    An object for inference when the likelihood is Gaussian, but we want to do sparse inference.
+
+    The function self.inference returns a Posterior object, which summarizes
+    the posterior.
+
+    NB. It's not recommended to use this function! It's here for historical purposes.
+    """
+    def __init__(self):
+        self.const_jitter = 1e-6
+
+    def inference(self, kern, X, X_variance, Z, likelihood, Y):
+        """
+        Run DTC inference; also calls kern.update_gradients_sparse and likelihood.update_gradients.
+
+        :param X_variance: must be None (use varDTC for uncertain inputs)
+        :returns: tuple (Posterior, log marginal likelihood, dict of gradients)
+        """
+        assert X_variance is None, "cannot use X_variance with DTC. Try varDTC."
+        num_inducing, _ = Z.shape
+        num_data, output_dim = Y.shape
+
+        # only one shared noise precision is supported: several values mean hetero noise
+        beta = 1./np.squeeze(likelihood.variance)
+        if beta.size > 1:
+            raise NotImplementedError("no hetero noise with this implementatino of DTC")
+
+        Kmm = kern.K(Z)
+        Knn = kern.Kdiag(X)
+        Knm = kern.K(X, Z)
+        U = Knm
+        Uy = np.dot(U.T,Y)
+
+        #factor Kmm
+        Kmmi, L, Li, _ = pdinv(Kmm)
+
+        # Compute A
+        LiUT, _ = dtrtrs(L, U.T*np.sqrt(beta), lower=1)
+        A_I = tdot(LiUT)
+        A = A_I + np.eye(num_inducing)
+
+        # factor A
+        LA = jitchol(A)
+
+        # back substitute to get b, P, v
+        tmp, _ = dtrtrs(L, Uy, lower=1)
+        b, _ = dtrtrs(LA, tmp*beta, lower=1)
+        tmp, _ = dtrtrs(LA, b, lower=1, trans=1)
+        v, _ = dtrtrs(L, tmp, lower=1, trans=1)
+        tmp, _ = dtrtrs(LA, Li, lower=1, trans=0)
+        P = tdot(tmp.T)
+
+        #compute log marginal
+        log_marginal = -0.5*num_data*output_dim*np.log(2*np.pi) + \
+                       -np.sum(np.log(np.diag(LA)))*output_dim + \
+                       0.5*num_data*output_dim*np.log(beta) + \
+                       -0.5*beta*np.sum(np.square(Y)) + \
+                       0.5*np.sum(np.square(b))
+
+        # Compute dL_dKmm
+        tmp, _ = dtrtrs(L, A_I, lower=1, trans=1)
+        dL_dK, _ = dtrtrs(L, tmp.T, lower=1, trans=0)
+        tmp, _ = dtrtrs(LA, tmp.T, lower=1, trans=1)
+        dL_dK -= tdot(tmp.T)
+        dL_dK *= output_dim
+        dL_dK -= tdot(v)
+        dL_dK /=2.
+
+        # Compute dL_dU
+        vvT_P = tdot(v.reshape(-1,1)) + P
+        vY = np.dot(v.reshape(-1,1),Y.T)
+        dL_dU = vY + np.dot(vvT_P, U.T)
+        dL_dU *= beta
+
+        #compute dL_dR
+        Uv = np.dot(U, v)
+        dL_dR = 0.5*(np.sum(U*np.dot(U, P), 1) - beta * np.sum(np.square(Y), 1) - 2.*np.sum(Uv*Y, 1) + np.sum(np.square(Uv), 1))*beta**2
+
+        # dL_dU is built from U.T (one row per inducing point); Knm = kern.K(X, Z) has the transposed layout
+        grad_dict = {'dL_dKmm': dL_dK, 'dL_dKdiag':np.zeros_like(Knn), 'dL_dKnm':dL_dU.T}
+
+        #update gradients
+        kern.update_gradients_sparse(X=X, Z=Z, **grad_dict)
+        likelihood.update_gradients(dL_dR)
+        #construct a posterior object
+        post = Posterior(woodbury_inv=Kmmi-P, woodbury_vector=v, K=Kmm, mean=None, cov=None, K_chol=L)
+        return post, log_marginal, grad_dict

From 9d262936c6f5f6f877b76d8b24c85b6fd9b51597 Mon Sep 17 00:00:00 2001
From: Max Zwiessele <ibinbei@gmail.com>
Date: Tue, 11 Feb 2014 16:54:33 +0000
Subject: [PATCH 43/43] array core and bgplvm working > changes due to
 __i<op>__ will now be reported

---
 GPy/core/parameterization/array_core.py       | 182 +++++++++++++++---
 GPy/core/parameterization/variational.py      |   6 +-
 GPy/core/sparse_gp.py                         |   5 +-
 GPy/examples/dimensionality_reduction.py      |  11 +-
 .../latent_function_inference/varDTC.py       |   6 +-
 GPy/inference/optimization/scg.py             |   4 +-
 GPy/models/bayesian_gplvm.py                  |  10 +-
 GPy/models/sparse_gp_regression.py            |   2 +-
 GPy/plotting/matplot_dep/kernel_plots.py      |   1 +
 GPy/plotting/matplot_dep/variational_plots.py |   4 +-
 10 files changed, 177 insertions(+), 54 deletions(-)

diff --git a/GPy/core/parameterization/array_core.py b/GPy/core/parameterization/array_core.py
index 4b5b7700..1d300d80 100644
--- a/GPy/core/parameterization/array_core.py
+++ b/GPy/core/parameterization/array_core.py
@@ -12,6 +12,7 @@ class ListArray(np.ndarray):
     WARNING: This overrides the functionality of x==y!!!
     Use numpy.equal(x,y) for element-wise equality testing.
     """
+    
     def __new__(cls, input_array):
         obj = np.asanyarray(input_array).view(cls)
         return obj
@@ -27,24 +28,6 @@ class ParamList(list):
         return False
     
     pass
-class C(np.ndarray):
-    __array_priority__ = 1.
-    def __new__(cls, array):
-        obj = array.view(cls)
-        return obj
-    #def __array_finalize__(self, obj):
-    #    #print 'finalize'
-    #    return obj
-    def __array_prepare__(self, out_arr, context):
-        #print 'prepare'
-        while type(out_arr) is C:
-            out_arr = out_arr.base
-        return out_arr
-    def __array_wrap__(self, out_arr, context):
-        #print 'wrap', type(self), type(out_arr), context
-        while type(out_arr) is C:
-            out_arr = out_arr.base
-        return out_arr
 
 class ObservableArray(ListArray, Observable):
     """
@@ -63,25 +46,168 @@ class ObservableArray(ListArray, Observable):
         # see InfoArray.__array_finalize__ for comments
         if obj is None: return
         self._observers_ = getattr(obj, '_observers_', None)
+    
     def __setitem__(self, s, val, update=True):
         super(ObservableArray, self).__setitem__(s, val)
         if update:
             self._notify_observers()
-#         if self.ndim:
-#             if not np.all(np.equal(self[s], val)):
-#                 super(ObservableArray, self).__setitem__(s, val)
-#                 if update:
-#                     self._notify_observers()
-#         else:
-#             if not np.all(np.equal(self, val)):
-#                 super(ObservableArray, self).__setitem__(Ellipsis, val)
-#                 if update:
-#                     self._notify_observers()
     def __getslice__(self, start, stop):
         return self.__getitem__(slice(start, stop))
     def __setslice__(self, start, stop, val):
         return self.__setitem__(slice(start, stop), val)  
+
     def __copy__(self, *args):
         return ObservableArray(self.base.base.copy(*args))
     def copy(self, *args):
         return self.__copy__(*args)
+
+    def __ror__(self, *args, **kwargs):
+        """Reflected ``other | self``: builds a new array, ``self`` is not modified."""
+        # Nothing changed in place, so observers are deliberately not notified.
+        return np.ndarray.__ror__(self, *args, **kwargs)
+        
+
+    def __ilshift__(self, *args, **kwargs):
+        """In-place ``self <<= other``; observers are notified after the update."""
+        result = np.ndarray.__ilshift__(self, *args, **kwargs)
+        self._notify_observers()
+        return result
+
+    def __irshift__(self, *args, **kwargs):
+        """In-place ``self >>= other``; observers are notified after the update."""
+        result = np.ndarray.__irshift__(self, *args, **kwargs)
+        self._notify_observers()
+        return result
+
+    def __rrshift__(self, *args, **kwargs):
+        """Reflected ``other >> self``: builds a new array, ``self`` is not modified."""
+        # Nothing changed in place, so observers are deliberately not notified.
+        return np.ndarray.__rrshift__(self, *args, **kwargs)
+
+
+    def __ixor__(self, *args, **kwargs):
+        """In-place ``self ^= other``; observers are notified after the update."""
+        result = np.ndarray.__ixor__(self, *args, **kwargs)
+        self._notify_observers()
+        return result
+
+    def __rxor__(self, *args, **kwargs):
+        """Reflected ``other ^ self``: builds a new array, ``self`` is not modified."""
+        # Nothing changed in place, so observers are deliberately not notified.
+        return np.ndarray.__rxor__(self, *args, **kwargs)
+
+
+
+    def __rdivmod__(self, *args, **kwargs):
+        """Reflected ``divmod(other, self)``: builds new arrays, ``self`` is not modified."""
+        # Nothing changed in place, so observers are deliberately not notified.
+        return np.ndarray.__rdivmod__(self, *args, **kwargs)
+
+
+    def __radd__(self, *args, **kwargs):
+        """Reflected ``other + self``: builds a new array, ``self`` is not modified."""
+        # Nothing changed in place, so observers are deliberately not notified.
+        return np.ndarray.__radd__(self, *args, **kwargs)
+
+
+    def __rdiv__(self, *args, **kwargs):
+        """Reflected Python 2 ``other / self``: builds a new array, ``self`` is not modified."""
+        # Nothing changed in place, so observers are deliberately not notified.
+        return np.ndarray.__rdiv__(self, *args, **kwargs)
+
+
+    def __rtruediv__(self, *args, **kwargs):
+        """Reflected true division ``other / self``: builds a new array, ``self`` is not modified."""
+        # Nothing changed in place, so observers are deliberately not notified.
+        return np.ndarray.__rtruediv__(self, *args, **kwargs)
+
+
+    def __ipow__(self, *args, **kwargs):
+        """In-place ``self **= other``; observers are notified after the update."""
+        result = np.ndarray.__ipow__(self, *args, **kwargs)
+        self._notify_observers()
+        return result
+
+    def __rmul__(self, *args, **kwargs):
+        """Reflected ``other * self``: builds a new array, ``self`` is not modified."""
+        # Nothing changed in place, so observers are deliberately not notified.
+        return np.ndarray.__rmul__(self, *args, **kwargs)
+
+
+    def __rpow__(self, *args, **kwargs):
+        """Reflected ``other ** self``: builds a new array, ``self`` is not modified."""
+        # Nothing changed in place, so observers are deliberately not notified.
+        return np.ndarray.__rpow__(self, *args, **kwargs)
+
+
+    def __rsub__(self, *args, **kwargs):
+        """Reflected ``other - self``: builds a new array, ``self`` is not modified."""
+        # Nothing changed in place, so observers are deliberately not notified.
+        return np.ndarray.__rsub__(self, *args, **kwargs)
+
+
+    def __ifloordiv__(self, *args, **kwargs):
+        """In-place ``self //= other``; observers are notified after the update."""
+        result = np.ndarray.__ifloordiv__(self, *args, **kwargs)
+        self._notify_observers()
+        return result
+
+    def __isub__(self, *args, **kwargs):
+        """In-place ``self -= other``; observers are notified after the update."""
+        result = np.ndarray.__isub__(self, *args, **kwargs)
+        self._notify_observers()
+        return result
+
+    def __ior__(self, *args, **kwargs):
+        """In-place ``self |= other``; observers are notified after the update."""
+        result = np.ndarray.__ior__(self, *args, **kwargs)
+        self._notify_observers()
+        return result
+
+    def __itruediv__(self, *args, **kwargs):
+        """In-place true division ``self /= other``; observers are notified after the update."""
+        result = np.ndarray.__itruediv__(self, *args, **kwargs)
+        self._notify_observers()
+        return result
+
+    def __idiv__(self, *args, **kwargs):
+        """In-place Python 2 ``self /= other``; observers are notified after the update."""
+        result = np.ndarray.__idiv__(self, *args, **kwargs)
+        self._notify_observers()
+        return result
+
+    def __rfloordiv__(self, *args, **kwargs):
+        """Reflected ``other // self``: builds a new array, ``self`` is not modified."""
+        # Nothing changed in place, so observers are deliberately not notified.
+        return np.ndarray.__rfloordiv__(self, *args, **kwargs)
+
+
+    def __iand__(self, *args, **kwargs):
+        """In-place ``self &= other``; observers are notified after the update."""
+        result = np.ndarray.__iand__(self, *args, **kwargs)
+        self._notify_observers()
+        return result
+
+    def __imod__(self, *args, **kwargs):
+        """In-place ``self %= other``; observers are notified after the update."""
+        result = np.ndarray.__imod__(self, *args, **kwargs)
+        self._notify_observers()
+        return result
+
+    def __iadd__(self, *args, **kwargs):
+        """In-place ``self += other``; observers are notified after the update."""
+        result = np.ndarray.__iadd__(self, *args, **kwargs)
+        self._notify_observers()
+        return result
+
+    def __imul__(self, *args, **kwargs):
+        """In-place ``self *= other``; observers are notified after the update."""
+        result = np.ndarray.__imul__(self, *args, **kwargs)
+        self._notify_observers()
+        return result
+
+    def __rshift__(self, *args, **kwargs):
+        """Plain ``self >> other``: builds a new array, ``self`` is not modified."""
+        # Nothing changed in place, so observers are deliberately not notified.
+        return np.ndarray.__rshift__(self, *args, **kwargs)
+
diff --git a/GPy/core/parameterization/variational.py b/GPy/core/parameterization/variational.py
index 2e342f54..a7b26a80 100644
--- a/GPy/core/parameterization/variational.py
+++ b/GPy/core/parameterization/variational.py
@@ -15,9 +15,9 @@ class Normal(Parameterized):
     '''
     def __init__(self, means, variances, name='latent space'):
         Parameterized.__init__(self, name=name)
-        self.means = Param("mean", means)
-        self.variances = Param('variance', variances, Logexp())
-        self.add_parameters(self.means, self.variances)
+        self.mean = Param("mean", means)
+        self.variance = Param('variance', variances, Logexp())
+        self.add_parameters(self.mean, self.variance)
 
     def plot(self, *args):
         """
diff --git a/GPy/core/sparse_gp.py b/GPy/core/sparse_gp.py
index 1879145a..04062f30 100644
--- a/GPy/core/sparse_gp.py
+++ b/GPy/core/sparse_gp.py
@@ -55,10 +55,7 @@ class SparseGP(GP):
         self.add_parameter(self.Z, index=0)
 
     def parameters_changed(self):
-        Xvar = self.X_variance
-        if self.X_variance is not None:
-            Xvar = param_to_array(self.X_variance)
-        self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, param_to_array(self.X), Xvar, param_to_array(self.Z), self.likelihood, self.Y)
+        self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.X_variance, self.Z, self.likelihood, self.Y)
 
         #The derivative of the bound wrt the inducing inputs Z
         self.Z.gradient = self.kern.gradients_X(self.grad_dict['dL_dKmm'], self.Z)
diff --git a/GPy/examples/dimensionality_reduction.py b/GPy/examples/dimensionality_reduction.py
index e2ba4912..f612ecd7 100644
--- a/GPy/examples/dimensionality_reduction.py
+++ b/GPy/examples/dimensionality_reduction.py
@@ -3,7 +3,7 @@
 import numpy as _np
 default_seed = _np.random.seed(123344)
 
-def bgplvm_test_model(seed=default_seed, optimize=False, verbose=1, plot=False):
+def bgplvm_test_model(seed=default_seed, optimize=False, verbose=1, plot=False, output_dim=1e4):
     """
     model for testing purposes. Samples from a GP with rbf kernel and learns
     the samples with a new kernel. Normally not for optimization, just model cheking
@@ -18,7 +18,7 @@ def bgplvm_test_model(seed=default_seed, optimize=False, verbose=1, plot=False):
         input_dim = 3
     else:
         input_dim = 1
-        output_dim = 25
+        output_dim = output_dim
 
     # generate GPLVM-like data
     X = _np.random.rand(num_inputs, input_dim)
@@ -27,7 +27,7 @@ def bgplvm_test_model(seed=default_seed, optimize=False, verbose=1, plot=False):
          #+ GPy.kern.white(input_dim, 0.01)
          )
     K = k.K(X)
-    Y = _np.random.multivariate_normal(_np.zeros(num_inputs), K, output_dim).T
+    Y = _np.random.multivariate_normal(_np.zeros(num_inputs), K, (output_dim,)).T
 
     # k = GPy.kern.rbf_inv(input_dim, .5, _np.ones(input_dim) * 2., ARD=True) + GPy.kern.bias(input_dim) + GPy.kern.white(input_dim)
     k = GPy.kern.linear(input_dim)# + GPy.kern.bias(input_dim) + GPy.kern.white(input_dim, 0.00001)
@@ -266,11 +266,10 @@ def bgplvm_simulation(optimize=True, verbose=1,
     Y = Ylist[0]
     k = kern.linear(Q, ARD=True)# + kern.white(Q, _np.exp(-2)) # + kern.bias(Q)
     m = BayesianGPLVM(Y, Q, init="PCA", num_inducing=num_inducing, kernel=k)
-    m.Gaussian_noise = Y.var() / 100.
-
+    
     if optimize:
         print "Optimizing model:"
-        m.optimize('scg', messages=verbose, max_iters=max_iters,
+        m.optimize('bfgs', messages=verbose, max_iters=max_iters,
                    gtol=.05)
     if plot:
         m.q.plot("BGPLVM Latent Space 1D")
diff --git a/GPy/inference/latent_function_inference/varDTC.py b/GPy/inference/latent_function_inference/varDTC.py
index 237ab463..08329b5a 100644
--- a/GPy/inference/latent_function_inference/varDTC.py
+++ b/GPy/inference/latent_function_inference/varDTC.py
@@ -2,9 +2,8 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
 from posterior import Posterior
-from ...util.linalg import jitchol, backsub_both_sides, tdot, dtrtrs, dpotri, symmetrify
+from ...util.linalg import jitchol, backsub_both_sides, tdot, dtrtrs, dtrtri, dpotri, dpotrs, symmetrify
 import numpy as np
-from ...util.linalg import dtrtri
 from ...util.caching import Cacher
 from ...util.misc import param_to_array
 log_2_pi = np.log(2*np.pi)
@@ -85,7 +84,7 @@ class VarDTC(object):
                 tmp = tmp.T
             # no backsubstitution because of bound explosion on tr(A) if not...
             LmInv, _ = dtrtri(Lm, lower=1)
-            A = LmInv.T.dot(psi2_beta.dot(LmInv))
+            A = LmInv.dot(psi2_beta.dot(LmInv.T))
             #print A.sum()
         else:
             if het_noise:
@@ -97,6 +96,7 @@ class VarDTC(object):
 
         # factor B
         B = np.eye(num_inducing) + A
+        self.A = A
         LB = jitchol(B)
 
         # VVT_factor is a matrix such that tdot(VVT_factor) = VVT...this is for efficiency!
diff --git a/GPy/inference/optimization/scg.py b/GPy/inference/optimization/scg.py
index b4dee118..c99fa7d1 100644
--- a/GPy/inference/optimization/scg.py
+++ b/GPy/inference/optimization/scg.py
@@ -69,8 +69,8 @@ def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=np.inf, display=True,
     success = True # Force calculation of directional derivs.
     nsuccess = 0 # nsuccess counts number of successes.
     beta = 1.0 # Initial scale parameter.
-    betamin = 1.0e-60 # Lower bound on scale.
-    betamax = 1.0e50 # Upper bound on scale.
+    betamin = 1.0e-15 # Lower bound on scale.
+    betamax = 1.0e15 # Upper bound on scale.
     status = "Not converged"
 
     flog = [fold]
diff --git a/GPy/models/bayesian_gplvm.py b/GPy/models/bayesian_gplvm.py
index 36f0c4b1..62d9a5a9 100644
--- a/GPy/models/bayesian_gplvm.py
+++ b/GPy/models/bayesian_gplvm.py
@@ -71,17 +71,17 @@ class BayesianGPLVM(SparseGP, GPLVM):
 
     def parameters_changed(self):
         super(BayesianGPLVM, self).parameters_changed()
-        #self._log_marginal_likelihood -= self.KL_divergence()
+        self._log_marginal_likelihood -= self.KL_divergence()
 
         dL_dmu, dL_dS = self.dL_dmuS()
 
         # dL:
-        self.q.means.gradient  = dL_dmu
-        self.q.variances.gradient  = dL_dS  
+        self.q.mean.gradient  = dL_dmu
+        self.q.variance.gradient  = dL_dS  
 
         # dKL:
-        #self.q.means.gradient -= self.X
-        #self.q.variances.gradient -= (1. - (1. / (self.X_variance))) * 0.5
+        self.q.mean.gradient -= self.X
+        self.q.variance.gradient -= (1. - (1. / (self.X_variance))) * 0.5
     
     def plot_latent(self, plot_inducing=True, *args, **kwargs):
         """
diff --git a/GPy/models/sparse_gp_regression.py b/GPy/models/sparse_gp_regression.py
index c936164b..8740a1f5 100644
--- a/GPy/models/sparse_gp_regression.py
+++ b/GPy/models/sparse_gp_regression.py
@@ -32,7 +32,7 @@ class SparseGPRegression(SparseGP):
 
         # kern defaults to rbf (plus white for stability)
         if kernel is None:
-            kernel = kern.rbf(input_dim)  + kern.white(input_dim, variance=1e-3)
+            kernel = kern.rbf(input_dim)#  + kern.white(input_dim, variance=1e-3)
 
         # Z defaults to a subset of the data
         if Z is None:
diff --git a/GPy/plotting/matplot_dep/kernel_plots.py b/GPy/plotting/matplot_dep/kernel_plots.py
index 66644483..19c96bc0 100644
--- a/GPy/plotting/matplot_dep/kernel_plots.py
+++ b/GPy/plotting/matplot_dep/kernel_plots.py
@@ -7,6 +7,7 @@ import pylab as pb
 import Tango
 from matplotlib.textpath import TextPath
 from matplotlib.transforms import offset_copy
+from ...kern.parts.linear import Linear
 
 
 def plot_ARD(kernel, fignum=None, ax=None, title='', legend=False):
diff --git a/GPy/plotting/matplot_dep/variational_plots.py b/GPy/plotting/matplot_dep/variational_plots.py
index 7c89a088..72b857a6 100644
--- a/GPy/plotting/matplot_dep/variational_plots.py
+++ b/GPy/plotting/matplot_dep/variational_plots.py
@@ -14,14 +14,14 @@ def plot(parameterized, fignum=None, ax=None, colors=None):
 
     """
     if ax is None:
-        fig = pb.figure(num=fignum, figsize=(8, min(12, (2 * parameterized.means.shape[1]))))
+        fig = pb.figure(num=fignum, figsize=(8, min(12, (2 * parameterized.mean.shape[1]))))
     if colors is None:
         colors = pb.gca()._get_lines.color_cycle
         pb.clf()
     else:
         colors = iter(colors)
     plots = []
-    means, variances = param_to_array(parameterized.means, parameterized.variances)
+    means, variances = param_to_array(parameterized.mean, parameterized.variance)
     x = np.arange(means.shape[0])
     for i in range(means.shape[1]):
         if ax is None: