From d7316ee7d916ea38fb87b21876c71d849a09512a Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Fri, 27 Mar 2015 13:49:12 +0000
Subject: [PATCH 01/10] Relaxed inference test requirement

---
 GPy/testing/inference_tests.py | 42 +++++++++++++++++-----------------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/GPy/testing/inference_tests.py b/GPy/testing/inference_tests.py
index ac92c519..d5039049 100644
--- a/GPy/testing/inference_tests.py
+++ b/GPy/testing/inference_tests.py
@@ -11,39 +11,38 @@ import GPy
 
 
 class InferenceXTestCase(unittest.TestCase):
-    
+
     def genData(self):
         D1,D2,N = 12,12,50
-        np.random.seed(1234)
-    
+
         x = np.linspace(0, 4 * np.pi, N)[:, None]
         s1 = np.vectorize(lambda x: np.sin(x))
         s2 = np.vectorize(lambda x: np.cos(x)**2)
         s3 = np.vectorize(lambda x:-np.exp(-np.cos(2 * x)))
         sS = np.vectorize(lambda x: np.cos(x))
-    
+
         s1 = s1(x)
         s2 = s2(x)
         s3 = s3(x)
         sS = sS(x)
-    
+
         s1 -= s1.mean(); s1 /= s1.std(0)
         s2 -= s2.mean(); s2 /= s2.std(0)
         s3 -= s3.mean(); s3 /= s3.std(0)
         sS -= sS.mean(); sS /= sS.std(0)
-    
+
         S1 = np.hstack([s1, sS])
         S2 = np.hstack([s3, sS])
-    
+
         P1 = np.random.randn(S1.shape[1], D1)
         P2 = np.random.randn(S2.shape[1], D2)
-    
+
         Y1 = S1.dot(P1)
         Y2 = S2.dot(P2)
-    
+
         Y1 += .01 * np.random.randn(*Y1.shape)
         Y2 += .01 * np.random.randn(*Y2.shape)
-    
+
         Y1 -= Y1.mean(0)
         Y2 -= Y2.mean(0)
         Y1 /= Y1.std(0)
@@ -52,33 +51,34 @@ class InferenceXTestCase(unittest.TestCase):
         slist = [s1, s2, s3, sS]
         slist_names = ["s1", "s2", "s3", "sS"]
         Ylist = [Y1, Y2]
-        
+
         return Ylist
-    
+
     def test_inferenceX_BGPLVM(self):
         Ys = self.genData()
         m = GPy.models.BayesianGPLVM(Ys[0],5,kernel=GPy.kern.Linear(5,ARD=True))
-        
+
         x,mi = m.infer_newX(m.Y, optimize=False)
         self.assertTrue(mi.checkgrad())
-        
-        m.optimize(max_iters=10000)
-        x,mi = m.infer_newX(m.Y)
 
-        self.assertTrue(np.allclose(m.X.mean, mi.X.mean))
-        self.assertTrue(np.allclose(m.X.variance, mi.X.variance))
+        m.optimize(max_iters=10000)
+        x, mi = m.infer_newX(m.Y)
+
+        print m.X.mean - mi.X.mean
+        self.assertTrue(np.allclose(m.X.mean, mi.X.mean, rtol=1e-4, atol=1e-4))
+        self.assertTrue(np.allclose(m.X.variance, mi.X.variance, rtol=1e-4, atol=1e-4))
 
     def test_inferenceX_GPLVM(self):
         Ys = self.genData()
         m = GPy.models.GPLVM(Ys[0],3,kernel=GPy.kern.RBF(3,ARD=True))
-        
+
         x,mi = m.infer_newX(m.Y, optimize=False)
         self.assertTrue(mi.checkgrad())
-        
+
 #         m.optimize(max_iters=10000)
 #         x,mi = m.infer_newX(m.Y)
 #         self.assertTrue(np.allclose(m.X, x))
-        
+
 
 if __name__ == "__main__":
     unittest.main()

From 932b5468ae41ffa33a2f612073b3f25548e5d164 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Tue, 11 Feb 2014 12:14:11 +0000
Subject: [PATCH 02/10] Adding likelihoods and block matrices

---
 GPy/util/block_matrices.py | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/GPy/util/block_matrices.py b/GPy/util/block_matrices.py
index 95920868..cdbb1b0b 100644
--- a/GPy/util/block_matrices.py
+++ b/GPy/util/block_matrices.py
@@ -17,6 +17,23 @@ def get_blocks(A, blocksizes):
         count_i += i
     return B
 
+def get_block_shapes(B):
+    assert B.dtype is np.dtype('object'), "Must be a block matrix"
+    return [B[b,b].shape[0] for b in range(0, B.shape[0])]
+
+def unblock(B):
+    assert B.dtype is np.dtype('object'), "Must be a block matrix"
+    block_shapes = get_block_shapes(B)
+    num_elements = np.sum(block_shapes)
+    A = np.empty(shape=(num_elements, num_elements))
+    count_i = 0
+    for Bi, i in enumerate(block_shapes):
+        count_j = 0
+        for Bj, j in enumerate(block_shapes):
+            A[count_i:count_i + i, count_j:count_j + j] = B[Bi, Bj]
+            count_j += j
+        count_i += i
+    return A
 
 
 if __name__=='__main__':
@@ -24,3 +41,8 @@ if __name__=='__main__':
     B = get_blocks(A,[2,3])
     B[0,0] += 7
     print B
+
+    assert np.all(unblock(B) == A)
+
+    import ipdb; ipdb.set_trace()  # XXX BREAKPOINT
+

From 6a1de2bfc2dccd30c20a0bb30902a283eca1b6d1 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Wed, 12 Feb 2014 10:39:15 +0000
Subject: [PATCH 03/10] Added block matrix dot product

---
 GPy/util/block_matrices.py | 34 +++++++++++++++++++++++++++++++---
 1 file changed, 31 insertions(+), 3 deletions(-)

diff --git a/GPy/util/block_matrices.py b/GPy/util/block_matrices.py
index cdbb1b0b..464e3ba1 100644
--- a/GPy/util/block_matrices.py
+++ b/GPy/util/block_matrices.py
@@ -35,6 +35,37 @@ def unblock(B):
         count_i += i
     return A
 
+def block_dot(A, B):
+    """
+    Element-wise dot product on block matrices
+
+    +------+------+   +------+------+    +-------+-------+
+    |      |      |   |      |      |    |A11.B11|A12.B12|
+    | A11  | A12  |   | B11  | B12  |    |       |       |
+    +------+------+ o +------+------+ =  +-------+-------+
+    |      |      |   |      |      |    |A21.B21|A22.B22|
+    | A21  | A22  |   | B21  | B22  |    |       |       |
+    +------+------+   +------+------+    +-------+-------+
+
+    ..Note
+        If either (A or B) of the diagonal matrices are stored as vectors then a more
+        efficient dot product using numpy broadcasting will be used, i.e. A11*B11
+    """
+    #Must have same number of blocks and be a block matrix
+    assert A.dtype is np.dtype('object'), "Must be a block matrix"
+    assert B.dtype is np.dtype('object'), "Must be a block matrix"
+    Ashape = A.shape
+    Bshape = B.shape
+    assert Ashape == Bshape
+    def f(A,B):
+        if Ashape[0] == Ashape[1] or Bshape[0] == Bshape[1]:
+            #FIXME: Careful if one is transpose of other, would make a matrix
+            return A*B
+        else:
+            return np.dot(A,B)
+    dot = np.vectorize(f, otypes = [np.object])
+    return dot(A,B)
+
 
 if __name__=='__main__':
     A = np.zeros((5,5))
@@ -43,6 +74,3 @@ if __name__=='__main__':
     print B
 
     assert np.all(unblock(B) == A)
-
-    import ipdb; ipdb.set_trace()  # XXX BREAKPOINT
-

From 0ea3d336957372a9ee7e40b9db116c881e99279b Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Fri, 27 Mar 2015 14:17:03 +0000
Subject: [PATCH 04/10] Merging with private repo, mostly fixed

---
 .../latent_function_inference/__init__.py     |  12 +-
 .../latent_function_inference/laplace.py      | 205 +++++++-
 GPy/likelihoods/bernoulli.py                  |  38 ++
 GPy/likelihoods/gaussian.py                   |  19 +-
 GPy/likelihoods/likelihood.py                 | 255 ++++++++--
 GPy/likelihoods/student_t.py                  |   7 +-
 GPy/testing/likelihood_tests.py               | 472 +++++++++---------
 GPy/util/misc.py                              |  78 ++-
 8 files changed, 768 insertions(+), 318 deletions(-)

diff --git a/GPy/inference/latent_function_inference/__init__.py b/GPy/inference/latent_function_inference/__init__.py
index 67f57638..dc7789ba 100644
--- a/GPy/inference/latent_function_inference/__init__.py
+++ b/GPy/inference/latent_function_inference/__init__.py
@@ -50,19 +50,19 @@ class InferenceMethodList(LatentFunctionInference, list):
     def on_optimization_end(self):
         for inf in self:
             inf.on_optimization_end()
-    
+
     def __getstate__(self):
         state = []
         for inf in self:
             state.append(inf)
         return state
-    
+
     def __setstate__(self, state):
         for inf in state:
             self.append(inf)
 
 from exact_gaussian_inference import ExactGaussianInference
-from laplace import Laplace
+from laplace import Laplace, LaplaceBlock
 from GPy.inference.latent_function_inference.var_dtc import VarDTC
 from expectation_propagation import EP
 from expectation_propagation_dtc import EPDTC
@@ -78,9 +78,9 @@ from svgp import SVGP
 # class EMLikeLatentFunctionInference(LatentFunctionInference):
 #     def update_approximation(self):
 #         """
-#         This function gets called when the 
+#         This function gets called when the
 #         """
-#     
+#
 #     def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None):
 #         """
 #         Do inference on the latent functions given a covariance function `kern`,
@@ -88,7 +88,7 @@ from svgp import SVGP
 #         Additional metadata for the outputs `Y` can be given in `Y_metadata`.
 #         """
 #         raise NotImplementedError, "Abstract base class for full inference"
-# 
+#
 # class VariationalLatentFunctionInference(LatentFunctionInference):
 #     def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None):
 #         """
diff --git a/GPy/inference/latent_function_inference/laplace.py b/GPy/inference/latent_function_inference/laplace.py
index 05711b0b..4e25b4b1 100644
--- a/GPy/inference/latent_function_inference/laplace.py
+++ b/GPy/inference/latent_function_inference/laplace.py
@@ -43,28 +43,31 @@ class Laplace(LatentFunctionInference):
         """
         Returns a Posterior class containing essential quantities of the posterior
         """
-
         # Compute K
         K = kern.K(X)
 
         #Find mode
         if self.bad_fhat or self.first_run:
             Ki_f_init = np.zeros_like(Y)
-            first_run = False
+            self.first_run = False
         else:
             Ki_f_init = self._previous_Ki_fhat
 
+        Ki_f_init = np.zeros_like(Y)# FIXME: take this out
+
         f_hat, Ki_fhat = self.rasm_mode(K, Y, likelihood, Ki_f_init, Y_metadata=Y_metadata)
+
         self.f_hat = f_hat
-        self.Ki_fhat =  Ki_fhat
-        self.K = K.copy()
+        #self.Ki_fhat =  Ki_fhat
+        #self.K = K.copy()
+
         #Compute hessian and other variables at mode
         log_marginal, woodbury_inv, dL_dK, dL_dthetaL = self.mode_computations(f_hat, Ki_fhat, K, Y, likelihood, kern, Y_metadata)
 
         self._previous_Ki_fhat = Ki_fhat.copy()
         return Posterior(woodbury_vector=Ki_fhat, woodbury_inv=woodbury_inv, K=K), log_marginal, {'dL_dK':dL_dK, 'dL_dthetaL':dL_dthetaL}
 
-    def rasm_mode(self, K, Y, likelihood, Ki_f_init, Y_metadata=None):
+    def rasm_mode(self, K, Y, likelihood, Ki_f_init, Y_metadata=None, *args, **kwargs):
         """
         Rasmussen's numerically stable mode finding
         For nomenclature see Rasmussen & Williams 2006
@@ -89,7 +92,12 @@ class Laplace(LatentFunctionInference):
 
         #define the objective function (to be maximised)
         def obj(Ki_f, f):
-            return -0.5*np.dot(Ki_f.flatten(), f.flatten()) + np.sum(likelihood.logpdf(f, Y, Y_metadata=Y_metadata))
+            ll = -0.5*np.sum(np.dot(Ki_f.T, f)) + np.sum(likelihood.logpdf(f, Y, Y_metadata=Y_metadata))
+            if np.isnan(ll):
+                return -np.inf
+            else:
+                return ll
+
 
         difference = np.inf
         iteration = 0
@@ -104,7 +112,7 @@ class Laplace(LatentFunctionInference):
             W_f = W*f
 
             b = W_f + grad # R+W p46 line 6.
-            W12BiW12, _, _ = self._compute_B_statistics(K, W, likelihood.log_concave)
+            W12BiW12, _, _, _ = self._compute_B_statistics(K, W, likelihood.log_concave, *args, **kwargs)
             W12BiW12Kb = np.dot(W12BiW12, np.dot(K, b))
 
             #Work out the DIRECTION that we want to move in, but don't choose the stepsize yet
@@ -121,7 +129,9 @@ class Laplace(LatentFunctionInference):
             step = optimize.brent(inner_obj, tol=1e-4, maxiter=12)
             Ki_f_new = Ki_f + step*dKi_f
             f_new = np.dot(K, Ki_f_new)
-
+            #print "new {} vs old {}".format(obj(Ki_f_new, f_new), obj(Ki_f, f))
+            if obj(Ki_f_new, f_new) < obj(Ki_f, f):
+                raise ValueError("Shouldn't happen, brent optimization failing")
             difference = np.abs(np.sum(f_new - f)) + np.abs(np.sum(Ki_f_new - Ki_f))
             Ki_f = Ki_f_new
             f = f_new
@@ -152,14 +162,10 @@ class Laplace(LatentFunctionInference):
         if np.any(np.isnan(W)):
             raise ValueError('One or more element(s) of W is NaN')
 
-        K_Wi_i, L, LiW12 = self._compute_B_statistics(K, W, likelihood.log_concave)
-
-        #compute vital matrices
-        C = np.dot(LiW12, K)
-        Ki_W_i  = K - C.T.dot(C)
+        K_Wi_i, logdet_I_KW, I_KW_i, Ki_W_i = self._compute_B_statistics(K, W, likelihood.log_concave)
 
         #compute the log marginal
-        log_marginal = -0.5*np.dot(Ki_f.flatten(), f_hat.flatten()) + np.sum(likelihood.logpdf(f_hat, Y, Y_metadata=Y_metadata)) - np.sum(np.log(np.diag(L)))
+        log_marginal = -0.5*np.sum(np.dot(Ki_f.T, f_hat)) + np.sum(likelihood.logpdf(f_hat, Y, Y_metadata=Y_metadata)) - 0.5*logdet_I_KW
 
         # Compute matrices for derivatives
         dW_df = -likelihood.d3logpdf_df3(f_hat, Y, Y_metadata=Y_metadata) # -d3lik_d3fhat
@@ -196,23 +202,23 @@ class Laplace(LatentFunctionInference):
             dL_dthetaL = np.zeros(num_params)
             for thetaL_i in range(num_params):
                 #Explicit
-                dL_dthetaL_exp = ( np.sum(dlik_dthetaL[thetaL_i])
+                dL_dthetaL_exp = ( np.sum(dlik_dthetaL[thetaL_i,:, :])
                                 # The + comes from the fact that dlik_hess_dthetaL == -dW_dthetaL
-                                + 0.5*np.sum(np.diag(Ki_W_i).flatten()*dlik_hess_dthetaL[:, thetaL_i].flatten())
+                                  + 0.5*np.sum(np.diag(Ki_W_i)*np.squeeze(dlik_hess_dthetaL[thetaL_i, :, :]))
                                 )
 
                 #Implicit
-                dfhat_dthetaL = mdot(I_KW_i, K, dlik_grad_dthetaL[:, thetaL_i])
-                #dfhat_dthetaL = mdot(Ki_W_i, dlik_grad_dthetaL[:, thetaL_i])
+                dfhat_dthetaL = mdot(I_KW_i, K, dlik_grad_dthetaL[thetaL_i, :, :])
+                #dfhat_dthetaL = mdot(Ki_W_i, dlik_grad_dthetaL[thetaL_i, :, :])
                 dL_dthetaL_imp = np.dot(dL_dfhat.T, dfhat_dthetaL)
-                dL_dthetaL[thetaL_i] = dL_dthetaL_exp + dL_dthetaL_imp
+                dL_dthetaL[thetaL_i] = np.sum(dL_dthetaL_exp + dL_dthetaL_imp)
 
         else:
             dL_dthetaL = np.zeros(likelihood.size)
 
         return log_marginal, K_Wi_i, dL_dK, dL_dthetaL
 
-    def _compute_B_statistics(self, K, W, log_concave):
+    def _compute_B_statistics(self, K, W, log_concave, *args, **kwargs):
         """
         Rasmussen suggests the use of a numerically stable positive definite matrix B
         Which has a positive diagonal elements and can be easily inverted
@@ -225,7 +231,7 @@ class Laplace(LatentFunctionInference):
         """
         if not log_concave:
             #print "Under 1e-10: {}".format(np.sum(W < 1e-6))
-            W[W<1e-6] = 1e-6
+            W = np.clip(W, 1e-6, 1e+30)
             # NOTE: when setting a parameter inside parameters_changed it will allways come to closed update circles!!!
             #W.__setitem__(W < 1e-6, 1e-6, update=False)  # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
                                 # If the likelihood is non-log-concave. We wan't to say that there is a negative variance
@@ -247,5 +253,160 @@ class Laplace(LatentFunctionInference):
         #K_Wi_i_2 , _= dpotri(L2)
         #symmetrify(K_Wi_i_2)
 
-        return K_Wi_i, L, LiW12
+        #compute vital matrices
+        C = np.dot(LiW12, K)
+        Ki_W_i  = K - C.T.dot(C)
 
+        I_KW_i = np.eye(K.shape[0]) - np.dot(K, K_Wi_i)
+        logdet_I_KW = 2*np.sum(np.log(np.diag(L)))
+
+        return K_Wi_i, logdet_I_KW, I_KW_i, Ki_W_i
+
+class LaplaceBlock(Laplace):
+    def rasm_mode(self, K, Y, likelihood, Ki_f_init, Y_metadata=None, *args, **kwargs):
+        Ki_f = Ki_f_init.copy()
+        f = np.dot(K, Ki_f)
+
+        #define the objective function (to be maximised)
+        def obj(Ki_f, f):
+            ll = -0.5*np.dot(Ki_f.T, f) + np.sum(likelihood.logpdf_sum(f, Y, Y_metadata=Y_metadata))
+            if np.isnan(ll):
+                return -np.inf
+            else:
+                return ll
+
+        difference = np.inf
+        iteration = 0
+
+        I = np.eye(K.shape[0])
+        while difference > self._mode_finding_tolerance and iteration < self._mode_finding_max_iter:
+            W = -likelihood.d2logpdf_df2(f, Y, Y_metadata=Y_metadata)
+
+            W[np.diag_indices_from(W)] = np.clip(np.diag(W), 1e-6, 1e+30)
+
+            W_f = np.dot(W, f)
+            grad = likelihood.dlogpdf_df(f, Y, Y_metadata=Y_metadata)
+
+            b = W_f + grad # R+W p46 line 6.
+            K_Wi_i, _, _, _ = self._compute_B_statistics(K, W, likelihood.log_concave, *args, **kwargs)
+
+            #Work out the DIRECTION that we want to move in, but don't choose the stepsize yet
+            #a = (I - (K+Wi)i*K)*b
+            full_step_Ki_f = np.dot(I - np.dot(K_Wi_i, K), b)
+            dKi_f = full_step_Ki_f - Ki_f
+
+            #define an objective for the line search (minimize this one)
+            def inner_obj(step_size):
+                Ki_f_trial = Ki_f + step_size*dKi_f
+                f_trial = np.dot(K, Ki_f_trial)
+                return -obj(Ki_f_trial, f_trial)
+
+            #use scipy for the line search, the compute new values of f, Ki_f
+            step = optimize.brent(inner_obj, tol=1e-4, maxiter=12)
+
+            Ki_f_new = Ki_f + step*dKi_f
+            f_new = np.dot(K, Ki_f_new)
+
+            difference = np.abs(np.sum(f_new - f)) + np.abs(np.sum(Ki_f_new - Ki_f))
+            Ki_f = Ki_f_new
+            f = f_new
+            iteration += 1
+
+        #Warn of bad fits
+        if difference > self._mode_finding_tolerance:
+            if not self.bad_fhat:
+                warnings.warn("Not perfect f_hat fit difference: {}".format(difference))
+            self._previous_Ki_fhat = np.zeros_like(Y)
+            self.bad_fhat = True
+        elif self.bad_fhat:
+            self.bad_fhat = False
+            warnings.warn("f_hat now fine again")
+        if iteration > self._mode_finding_max_iter:
+            warnings.warn("didn't find the best")
+
+        return f, Ki_f
+
+    def mode_computations(self, f_hat, Ki_f, K, Y, likelihood, kern, Y_metadata):
+        #At this point get the hessian matrix (or vector as W is diagonal)
+        W = -likelihood.d2logpdf_df2(f_hat, Y, Y_metadata=Y_metadata)
+
+        W[np.diag_indices_from(W)] = np.clip(np.diag(W), 1e-6, 1e+30)
+
+        K_Wi_i, log_B_det, I_KW_i, Ki_W_i = self._compute_B_statistics(K, W, likelihood.log_concave)
+
+        #compute the log marginal
+        #FIXME: The determinant should be output_dim*0.5 I think, gradients may now no longer check
+        log_marginal = -0.5*np.dot(f_hat.T, Ki_f) + np.sum(likelihood.logpdf_sum(f_hat, Y, Y_metadata=Y_metadata)) - 0.5*log_B_det
+
+        #Compute vital matrices for derivatives
+        dW_df = -likelihood.d3logpdf_df3(f_hat, Y, Y_metadata=Y_metadata) # -d3lik_d3fhat
+
+        #dL_dfhat = np.zeros((f_hat.shape[0]))
+        #for i in range(f_hat.shape[0]):
+            #dL_dfhat[i] = -0.5*np.trace(np.dot(Ki_W_i, dW_df[:,:,i]))
+
+        dL_dfhat = -0.5*np.einsum('ij,ijk->k', Ki_W_i, dW_df)
+
+        woodbury_vector = likelihood.dlogpdf_df(f_hat, Y, Y_metadata=Y_metadata)
+
+        ####################
+        #compute dL_dK#
+        ####################
+        if kern.size > 0 and not kern.is_fixed:
+            #Explicit
+            explicit_part = 0.5*(np.dot(Ki_f, Ki_f.T) - K_Wi_i)
+
+            #Implicit
+            implicit_part = woodbury_vector.dot(dL_dfhat[None,:]).dot(I_KW_i)
+            #implicit_part = Ki_f.dot(dL_dfhat[None,:]).dot(I_KW_i)
+
+            dL_dK = explicit_part + implicit_part
+        else:
+            dL_dK = np.zeros_like(K)
+
+        ####################
+        #compute dL_dthetaL#
+        ####################
+        if likelihood.size > 0 and not likelihood.is_fixed:
+            raise NotImplementedError
+        else:
+            dL_dthetaL = np.zeros(likelihood.size)
+
+        #self.K_Wi_i = K_Wi_i
+        #self.Ki_W_i = Ki_W_i
+        #self.W = W
+        #self.K = K
+        #self.dL_dfhat = dL_dfhat
+        #self.explicit_part = explicit_part
+        #self.implicit_part = implicit_part
+        return log_marginal, K_Wi_i, dL_dK, dL_dthetaL
+
+    def _compute_B_statistics(self, K, W, log_concave, *args, **kwargs):
+        """
+        Rasmussen suggests the use of a numerically stable positive definite matrix B
+        Which has a positive diagonal element and can be easily inverted
+
+        :param K: Prior Covariance matrix evaluated at locations X
+        :type K: NxN matrix
+        :param W: Negative hessian at a point (used here as a full matrix, not a vector)
+        :type W: NxN matrix (presumably block structured; confirm against the likelihood)
+        :returns: (K_Wi_i, sign*logdet(B), inverse of B, Ki_W_i) with B = I + KW
+        """
+        #w = GPy.util.diag.view(W)
+        #W[:] = np.where(w<1e-6, 1e-6, w)
+
+        #B = I + KW
+        B = np.eye(K.shape[0]) + np.dot(K, W)
+        #Bi, L, Li, logdetB = pdinv(B)
+        Bi = np.linalg.inv(B)
+
+        #K_Wi_i = np.eye(K.shape[0]) - mdot(W, Bi, K)
+        K_Wi_i = np.dot(W, Bi)
+
+        #self.K_Wi_i_brute = np.linalg.inv(K + np.linalg.inv(W))
+        #self.B = B
+        #self.Bi = Bi
+        Ki_W_i = np.dot(Bi, K)
+
+        sign, logdetB = np.linalg.slogdet(B)
+        return K_Wi_i, sign*logdetB, Bi, Ki_W_i
diff --git a/GPy/likelihoods/bernoulli.py b/GPy/likelihoods/bernoulli.py
index 26de274b..6277c1dc 100644
--- a/GPy/likelihoods/bernoulli.py
+++ b/GPy/likelihoods/bernoulli.py
@@ -248,3 +248,41 @@ class Bernoulli(Likelihood):
 
     def exact_inference_gradients(self, dL_dKdiag,Y_metadata=None):
         pass
+
+    def variational_expectations(self, Y, m, v, gh_points=None):
+        """
+        Probit specific numerical stable integrations
+        """
+        #Move to be faster
+        if self.gp_link:
+            pass
+        Yshape = Y.shape
+        mshape = m.shape
+        vshape = v.shape
+        Y = Y.flatten()
+        m = m.flatten()
+        v = v.flatten()
+
+        assert Yshape == mshape
+        assert mshape == vshape
+
+        Ysign = np.where(Y==1,1,-1).flatten()
+        gh_x, gh_w = np.polynomial.hermite.hermgauss(20)
+
+        #Shapes a bit weird
+        X = gh_x[None,:]*np.sqrt(2.*v[:, None]) + (m*Ysign)[:,None]
+        p = stats.norm.cdf(X)
+        p = np.clip(p, 1e-9, 1.-1e-9) # for numerical stability
+        N = stats.norm.pdf(X)
+        F = np.log(p).dot(gh_w)
+        NoverP = N/p
+        dF_dm = (NoverP*Ysign[:,None]).dot(gh_w)
+        dF_dv = -0.5*(NoverP**2 + NoverP*X).dot(gh_w)
+        if np.any(np.isnan(dF_dv)) or np.any(np.isinf(dF_dv)):
+            stop
+        if np.any(np.isnan(dF_dm)) or np.any(np.isinf(dF_dm)):
+            stop
+        #FIXME: Might be wrong reshaping
+        return F.reshape(Yshape), dF_dm.reshape(mshape), dF_dv.reshape(vshape), None
+
+
diff --git a/GPy/likelihoods/gaussian.py b/GPy/likelihoods/gaussian.py
index 4e7de9e3..021ec269 100644
--- a/GPy/likelihoods/gaussian.py
+++ b/GPy/likelihoods/gaussian.py
@@ -34,7 +34,9 @@ class Gaussian(Likelihood):
         if gp_link is None:
             gp_link = link_functions.Identity()
 
-        assert isinstance(gp_link, link_functions.Identity), "the likelihood only implemented for the identity link"
+        if not isinstance(gp_link, link_functions.Identity):
+            print "Warning, Exact inference is not implemeted for non-identity link functions,\
+            if you are not already, ensure Laplace inference_method is used"
 
         super(Gaussian, self).__init__(gp_link, name=name)
 
@@ -263,16 +265,19 @@ class Gaussian(Likelihood):
         return d2logpdf_dlink2_dvar
 
     def dlogpdf_link_dtheta(self, f, y, Y_metadata=None):
-        dlogpdf_dvar = self.dlogpdf_link_dvar(f, y, Y_metadata=Y_metadata)
-        return dlogpdf_dvar
+        dlogpdf_dtheta = np.zeros((self.size, f.shape[0], f.shape[1]))
+        dlogpdf_dtheta[0,:,:] = self.dlogpdf_link_dvar(f, y, Y_metadata=Y_metadata)
+        return dlogpdf_dtheta
 
     def dlogpdf_dlink_dtheta(self, f, y, Y_metadata=None):
-        dlogpdf_dlink_dvar = self.dlogpdf_dlink_dvar(f, y, Y_metadata=Y_metadata)
-        return dlogpdf_dlink_dvar
+        dlogpdf_dlink_dtheta = np.zeros((self.size, f.shape[0], f.shape[1]))
+        dlogpdf_dlink_dtheta[0, :, :]= self.dlogpdf_dlink_dvar(f, y, Y_metadata=Y_metadata)
+        return dlogpdf_dlink_dtheta
 
     def d2logpdf_dlink2_dtheta(self, f, y, Y_metadata=None):
-        d2logpdf_dlink2_dvar = self.d2logpdf_dlink2_dvar(f, y, Y_metadata=Y_metadata)
-        return d2logpdf_dlink2_dvar
+        d2logpdf_dlink2_dtheta = np.zeros((self.size, f.shape[0], f.shape[1]))
+        d2logpdf_dlink2_dtheta[0, :, :] = self.d2logpdf_dlink2_dvar(f, y, Y_metadata=Y_metadata)
+        return d2logpdf_dlink2_dtheta
 
     def _mean(self, gp):
         """
diff --git a/GPy/likelihoods/likelihood.py b/GPy/likelihoods/likelihood.py
index b1e78b93..ee2f5368 100644
--- a/GPy/likelihoods/likelihood.py
+++ b/GPy/likelihoods/likelihood.py
@@ -5,7 +5,7 @@ import numpy as np
 from scipy import stats,special
 import scipy as sp
 import link_functions
-from ..util.misc import chain_1, chain_2, chain_3
+from ..util.misc import chain_1, chain_2, chain_3, blockify_dhess_dtheta, blockify_third, blockify_hessian
 from scipy.integrate import quad
 import warnings
 from ..core.parameterization import Parameterized
@@ -39,6 +39,7 @@ class Likelihood(Parameterized):
         assert isinstance(gp_link,link_functions.GPTransformation), "gp_link is not a valid GPTransformation."
         self.gp_link = gp_link
         self.log_concave = False
+        self.not_block_really = False
 
     def _gradients(self,partial):
         return np.zeros(0)
@@ -189,20 +190,27 @@ class Likelihood(Parameterized):
 
         """
         #conditional_mean: the edpected value of y given some f, under this likelihood
+        fmin = -np.inf
+        fmax = np.inf
         def int_mean(f,m,v):
-            p = np.exp(-(0.5/v)*np.square(f - m))
+            exponent = -(0.5/v)*np.square(f - m)
+            #If exponent is under -30 then exp(exponent) will be very small, so don't exp it!)
             #If p is zero then conditional_mean will overflow
+            assert v.all() > 0
+            p = safe_exp(exponent)
+
+            #If p is zero then conditional_variance will overflow
             if p < 1e-10:
                 return 0.
             else:
                 return self.conditional_mean(f)*p
-        scaled_mean = [quad(int_mean, -np.inf, np.inf,args=(mj,s2j))[0] for mj,s2j in zip(mu,variance)]
+        scaled_mean = [quad(int_mean, fmin, fmax,args=(mj,s2j))[0] for mj,s2j in zip(mu,variance)]
         mean = np.array(scaled_mean)[:,None] / np.sqrt(2*np.pi*(variance))
 
         return mean
 
     def _conditional_mean(self, f):
-        """Quadrature calculation of the conditional mean: E(Y_star|f)"""
+        """Quadrature calculation of the conditional mean: E(Y_star|f_star)"""
         raise NotImplementedError, "implement this function to make predictions"
 
     def predictive_variance(self, mu,variance, predictive_mean=None, Y_metadata=None):
@@ -210,7 +218,7 @@ class Likelihood(Parameterized):
         Approximation to the predictive variance: V(Y_star)
 
         The following variance decomposition is used:
-        V(Y_star) = E( V(Y_star|f_star) ) + V( E(Y_star|f_star) )
+        V(Y_star) = E( V(Y_star|f_star) ) + V( E(Y_star|f_star) )
 
         :param mu: mean of posterior
         :param sigma: standard deviation of posterior
@@ -220,15 +228,22 @@ class Likelihood(Parameterized):
         #sigma2 = sigma**2
         normalizer = np.sqrt(2*np.pi*variance)
 
+        fmin_v = -np.inf
+        fmin_m = np.inf
+        fmin = -np.inf
+        fmax = np.inf
+
+        from ..util.misc import safe_exp
         # E( V(Y_star|f_star) )
         def int_var(f,m,v):
-            p = np.exp(-(0.5/v)*np.square(f - m))
+            exponent = -(0.5/v)*np.square(f - m)
+            p = safe_exp(exponent)
             #If p is zero then conditional_variance will overflow
             if p < 1e-10:
                 return 0.
             else:
                 return self.conditional_variance(f)*p
-        scaled_exp_variance = [quad(int_var, -np.inf, np.inf,args=(mj,s2j))[0] for mj,s2j in zip(mu,variance)]
+        scaled_exp_variance = [quad(int_var, fmin_v, fmax,args=(mj,s2j))[0] for mj,s2j in zip(mu,variance)]
         exp_var = np.array(scaled_exp_variance)[:,None] / normalizer
 
         #V( E(Y_star|f_star) ) =  E( E(Y_star|f_star)**2 ) - E( E(Y_star|f_star) )**2
@@ -240,14 +255,15 @@ class Likelihood(Parameterized):
 
         #E( E(Y_star|f_star)**2 )
         def int_pred_mean_sq(f,m,v,predictive_mean_sq):
-            p = np.exp(-(0.5/v)*np.square(f - m))
+            exponent = -(0.5/v)*np.square(f - m)
+            p = np.exp(exponent)
             #If p is zero then conditional_mean**2 will overflow
             if p < 1e-10:
                 return 0.
             else:
                 return self.conditional_mean(f)**2*p
 
-        scaled_exp_exp2 = [quad(int_pred_mean_sq, -np.inf, np.inf,args=(mj,s2j,pm2j))[0] for mj,s2j,pm2j in zip(mu,variance,predictive_mean_sq)]
+        scaled_exp_exp2 = [quad(int_pred_mean_sq, fmin_m, fmax,args=(mj,s2j,pm2j))[0] for mj,s2j,pm2j in zip(mu,variance,predictive_mean_sq)]
         exp_exp2 = np.array(scaled_exp_exp2)[:,None] / normalizer
 
         var_exp = exp_exp2 - predictive_mean_sq
@@ -295,8 +311,18 @@ class Likelihood(Parameterized):
         :returns: likelihood evaluated for this point
         :rtype: float
         """
-        inv_link_f = self.gp_link.transf(f)
-        return self.pdf_link(inv_link_f, y, Y_metadata=Y_metadata)
+        if isinstance(self.gp_link, link_functions.Identity):
+            return self.pdf_link(f, y, Y_metadata=Y_metadata)
+        else:
+            inv_link_f = self.gp_link.transf(f)
+            return self.pdf_link(inv_link_f, y, Y_metadata=Y_metadata)
+
+    def logpdf_sum(self, f, y, Y_metadata=None):
+        """
+        Convenience function that can be overridden for functions where this could
+        be computed more efficiently (Theano?)
+        """
+        return np.sum(self.logpdf(f, y, Y_metadata=Y_metadata))
 
     def logpdf(self, f, y, Y_metadata=None):
         """
@@ -313,8 +339,11 @@ class Likelihood(Parameterized):
         :returns: log likelihood evaluated for this point
         :rtype: float
         """
-        inv_link_f = self.gp_link.transf(f)
-        return self.logpdf_link(inv_link_f, y, Y_metadata=Y_metadata)
+        if isinstance(self.gp_link, link_functions.Identity):
+            return self.logpdf_link(f, y, Y_metadata=Y_metadata)
+        else:
+            inv_link_f = self.gp_link.transf(f)
+            return self.logpdf_link(inv_link_f, y, Y_metadata=Y_metadata)
 
     def dlogpdf_df(self, f, y, Y_metadata=None):
         """
@@ -332,11 +361,15 @@ class Likelihood(Parameterized):
         :returns: derivative of log likelihood evaluated for this point
         :rtype: 1xN array
         """
-        inv_link_f = self.gp_link.transf(f)
-        dlogpdf_dlink = self.dlogpdf_dlink(inv_link_f, y, Y_metadata=Y_metadata)
-        dlink_df = self.gp_link.dtransf_df(f)
-        return chain_1(dlogpdf_dlink, dlink_df)
+        if isinstance(self.gp_link, link_functions.Identity):
+            return self.dlogpdf_dlink(f, y, Y_metadata=Y_metadata)
+        else:
+            inv_link_f = self.gp_link.transf(f)
+            dlogpdf_dlink = self.dlogpdf_dlink(inv_link_f, y, Y_metadata=Y_metadata)
+            dlink_df = self.gp_link.dtransf_df(f)
+            return chain_1(dlogpdf_dlink, dlink_df)
 
+    @blockify_hessian
     def d2logpdf_df2(self, f, y, Y_metadata=None):
         """
         Evaluates the link function link(f) then computes the second derivative of log likelihood using it
@@ -353,13 +386,18 @@ class Likelihood(Parameterized):
         :returns: second derivative of log likelihood evaluated for this point (diagonal only)
         :rtype: 1xN array
         """
-        inv_link_f = self.gp_link.transf(f)
-        d2logpdf_dlink2 = self.d2logpdf_dlink2(inv_link_f, y, Y_metadata=Y_metadata)
-        dlink_df = self.gp_link.dtransf_df(f)
-        dlogpdf_dlink = self.dlogpdf_dlink(inv_link_f, y, Y_metadata=Y_metadata)
-        d2link_df2 = self.gp_link.d2transf_df2(f)
-        return chain_2(d2logpdf_dlink2, dlink_df, dlogpdf_dlink, d2link_df2)
+        if isinstance(self.gp_link, link_functions.Identity):
+            d2logpdf_df2 = self.d2logpdf_dlink2(f, y, Y_metadata=Y_metadata)
+        else:
+            inv_link_f = self.gp_link.transf(f)
+            d2logpdf_dlink2 = self.d2logpdf_dlink2(inv_link_f, y, Y_metadata=Y_metadata)
+            dlink_df = self.gp_link.dtransf_df(f)
+            dlogpdf_dlink = self.dlogpdf_dlink(inv_link_f, y, Y_metadata=Y_metadata)
+            d2link_df2 = self.gp_link.d2transf_df2(f)
+            d2logpdf_df2 = chain_2(d2logpdf_dlink2, dlink_df, dlogpdf_dlink, d2link_df2)
+        return d2logpdf_df2
 
+    @blockify_third
     def d3logpdf_df3(self, f, y, Y_metadata=None):
         """
         Evaluates the link function link(f) then computes the third derivative of log likelihood using it
@@ -376,64 +414,96 @@ class Likelihood(Parameterized):
         :returns: third derivative of log likelihood evaluated for this point
         :rtype: float
         """
-        inv_link_f = self.gp_link.transf(f)
-        d3logpdf_dlink3 = self.d3logpdf_dlink3(inv_link_f, y, Y_metadata=Y_metadata)
-        dlink_df = self.gp_link.dtransf_df(f)
-        d2logpdf_dlink2 = self.d2logpdf_dlink2(inv_link_f, y, Y_metadata=Y_metadata)
-        d2link_df2 = self.gp_link.d2transf_df2(f)
-        dlogpdf_dlink = self.dlogpdf_dlink(inv_link_f, y, Y_metadata=Y_metadata)
-        d3link_df3 = self.gp_link.d3transf_df3(f)
-        return chain_3(d3logpdf_dlink3, dlink_df, d2logpdf_dlink2, d2link_df2, dlogpdf_dlink, d3link_df3)
+        if isinstance(self.gp_link, link_functions.Identity):
+            d3logpdf_df3 = self.d3logpdf_dlink3(f, y, Y_metadata=Y_metadata)
+        else:
+            inv_link_f = self.gp_link.transf(f)
+            d3logpdf_dlink3 = self.d3logpdf_dlink3(inv_link_f, y, Y_metadata=Y_metadata)
+            dlink_df = self.gp_link.dtransf_df(f)
+            d2logpdf_dlink2 = self.d2logpdf_dlink2(inv_link_f, y, Y_metadata=Y_metadata)
+            d2link_df2 = self.gp_link.d2transf_df2(f)
+            dlogpdf_dlink = self.dlogpdf_dlink(inv_link_f, y, Y_metadata=Y_metadata)
+            d3link_df3 = self.gp_link.d3transf_df3(f)
+            d3logpdf_df3 = chain_3(d3logpdf_dlink3, dlink_df, d2logpdf_dlink2, d2link_df2, dlogpdf_dlink, d3link_df3)
+        return d3logpdf_df3
+
 
     def dlogpdf_dtheta(self, f, y, Y_metadata=None):
         """
         TODO: Doc strings
         """
         if self.size > 0:
-            inv_link_f = self.gp_link.transf(f)
-            return self.dlogpdf_link_dtheta(inv_link_f, y, Y_metadata=Y_metadata)
+            if self.not_block_really:
+                raise NotImplementedError("Need to make a decorator for this!")
+            if isinstance(self.gp_link, link_functions.Identity):
+                return self.dlogpdf_link_dtheta(f, y, Y_metadata=Y_metadata)
+            else:
+                inv_link_f = self.gp_link.transf(f)
+                return self.dlogpdf_link_dtheta(inv_link_f, y, Y_metadata=Y_metadata)
         else:
             # There are no parameters so return an empty array for derivatives
-            return np.zeros([1, 0])
+            return np.zeros((0, f.shape[0], f.shape[1]))
 
     def dlogpdf_df_dtheta(self, f, y, Y_metadata=None):
         """
         TODO: Doc strings
         """
         if self.size > 0:
-            inv_link_f = self.gp_link.transf(f)
-            dlink_df = self.gp_link.dtransf_df(f)
-            dlogpdf_dlink_dtheta = self.dlogpdf_dlink_dtheta(inv_link_f, y, Y_metadata=Y_metadata)
-            return chain_1(dlogpdf_dlink_dtheta, dlink_df)
+            if self.not_block_really:
+                raise NotImplementedError("Need to make a decorator for this!")
+            if isinstance(self.gp_link, link_functions.Identity):
+                return self.dlogpdf_dlink_dtheta(f, y, Y_metadata=Y_metadata)
+            else:
+                inv_link_f = self.gp_link.transf(f)
+                dlink_df = self.gp_link.dtransf_df(f)
+                dlogpdf_dlink_dtheta = self.dlogpdf_dlink_dtheta(inv_link_f, y, Y_metadata=Y_metadata)
+
+                dlogpdf_df_dtheta = np.zeros((self.size, f.shape[0], f.shape[1]))
+                #Chain each parameter of the likelihood separately
+                for p in range(self.size):
+                    dlogpdf_df_dtheta[p, :, :] = chain_1(dlogpdf_dlink_dtheta[p,:,:], dlink_df)
+                return dlogpdf_df_dtheta
+                #return chain_1(dlogpdf_dlink_dtheta, dlink_df)
         else:
             # There are no parameters so return an empty array for derivatives
-            return np.zeros([f.shape[0], 0])
+            return np.zeros((0, f.shape[0], f.shape[1]))
 
     def d2logpdf_df2_dtheta(self, f, y, Y_metadata=None):
         """
         TODO: Doc strings
         """
         if self.size > 0:
-            inv_link_f = self.gp_link.transf(f)
-            dlink_df = self.gp_link.dtransf_df(f)
-            d2link_df2 = self.gp_link.d2transf_df2(f)
-            d2logpdf_dlink2_dtheta = self.d2logpdf_dlink2_dtheta(inv_link_f, y, Y_metadata=Y_metadata)
-            dlogpdf_dlink_dtheta = self.dlogpdf_dlink_dtheta(inv_link_f, y, Y_metadata=Y_metadata)
-            return chain_2(d2logpdf_dlink2_dtheta, dlink_df, dlogpdf_dlink_dtheta, d2link_df2)
+            if self.not_block_really:
+                raise NotImplementedError("Need to make a decorator for this!")
+            if isinstance(self.gp_link, link_functions.Identity):
+                return self.d2logpdf_dlink2_dtheta(f, y, Y_metadata=Y_metadata)
+            else:
+                inv_link_f = self.gp_link.transf(f)
+                dlink_df = self.gp_link.dtransf_df(f)
+                d2link_df2 = self.gp_link.d2transf_df2(f)
+                d2logpdf_dlink2_dtheta = self.d2logpdf_dlink2_dtheta(inv_link_f, y, Y_metadata=Y_metadata)
+                dlogpdf_dlink_dtheta = self.dlogpdf_dlink_dtheta(inv_link_f, y, Y_metadata=Y_metadata)
+
+                d2logpdf_df2_dtheta = np.zeros((self.size, f.shape[0], f.shape[1]))
+                #Chain each parameter of the likelihood separately
+                for p in range(self.size):
+                    d2logpdf_df2_dtheta[p, :, :] = chain_2(d2logpdf_dlink2_dtheta[p,:,:], dlink_df, dlogpdf_dlink_dtheta[p,:,:], d2link_df2)
+                return d2logpdf_df2_dtheta
+                #return chain_2(d2logpdf_dlink2_dtheta, dlink_df, dlogpdf_dlink_dtheta, d2link_df2)
         else:
             # There are no parameters so return an empty array for derivatives
-            return np.zeros([f.shape[0], 0])
+            return np.zeros((0, f.shape[0], f.shape[1]))
 
     def _laplace_gradients(self, f, y, Y_metadata=None):
-        dlogpdf_dtheta = self.dlogpdf_dtheta(f, y, Y_metadata=Y_metadata).sum(axis=0)
+        dlogpdf_dtheta = self.dlogpdf_dtheta(f, y, Y_metadata=Y_metadata)
         dlogpdf_df_dtheta = self.dlogpdf_df_dtheta(f, y, Y_metadata=Y_metadata)
         d2logpdf_df2_dtheta = self.d2logpdf_df2_dtheta(f, y, Y_metadata=Y_metadata)
 
         #Parameters are stacked vertically. Must be listed in same order as 'get_param_names'
         # ensure we have gradients for every parameter we want to optimize
-        assert len(dlogpdf_dtheta) == self.size #1 x num_param array
-        assert dlogpdf_df_dtheta.shape[1] == self.size #f x num_param matrix
-        assert d2logpdf_df2_dtheta.shape[1] == self.size #f x num_param matrix
+        assert dlogpdf_dtheta.shape[0] == self.size #num_param x f x d array
+        assert dlogpdf_df_dtheta.shape[0] == self.size #num_param x f x d matrix or just num_param x f
+        assert d2logpdf_df2_dtheta.shape[0] == self.size #num_param x f matrix or num_param x f x d matrix, num_param x f x f or num_param x f x f x d
 
         return dlogpdf_dtheta, dlogpdf_df_dtheta, d2logpdf_df2_dtheta
 
@@ -454,19 +524,98 @@ class Likelihood(Parameterized):
 
     def predictive_quantiles(self, mu, var, quantiles, Y_metadata=None):
         #compute the quantiles by sampling!!!
-        N_samp = 1000
+        N_samp = 50
         s = np.random.randn(mu.shape[0], N_samp)*np.sqrt(var) + mu
         #ss_f = s.flatten()
         #ss_y = self.samples(ss_f, Y_metadata)
+        #ss_y = self.samples(s, Y_metadata, samples=100)
         ss_y = self.samples(s, Y_metadata)
         #ss_y = ss_y.reshape(mu.shape[0], N_samp)
 
         return [np.percentile(ss_y ,q, axis=1)[:,None] for q in quantiles]
 
-    def samples(self, gp, Y_metadata=None):
+    def samples(self, gp, Y_metadata=None, samples=1):
         """
         Returns a set of samples of observations based on a given value of the latent variable.
 
         :param gp: latent variable
+        :param samples: number of samples to take for each f location
         """
-        raise NotImplementedError
+        raise NotImplementedError("""May be possible to use MCMC with user-tuning, see
+                                  MCMC_pdf_samples in likelihood.py and write samples function
+                                  using this, beware this is a simple implementation
+                                  of Metropolis and will not work well for all likelihoods""")
+
+    def MCMC_pdf_samples(self, fNew, num_samples=1000, starting_loc=None, stepsize=0.1, burn_in=1000, Y_metadata=None):
+        """
+        Simple implementation of Metropolis sampling algorithm
+
+        Will run a parallel chain for each input dimension (treats each f independently)
+        Thus assumes f*_1 is independent of f*_2 etc.
+
+        :param num_samples: Number of samples to take
+        :param fNew: f at which to sample around
+        :param starting_loc: Starting locations of the independent chains (usually will be conditional_mean of likelihood), often link_f
+        :param stepsize: Stepsize for the normal proposal distribution (will need modifying)
+        :param burn_in: number of samples to use for burn-in (will need modifying)
+        :param Y_metadata: Y_metadata for pdf
+        """
+        print "Warning, using MCMC for sampling y*, needs to be tuned!"
+        if starting_loc is None:
+            starting_loc = fNew
+        from functools import partial
+        logpdf = partial(self.logpdf, f=fNew, Y_metadata=Y_metadata)
+        pdf = lambda y_star: np.exp(logpdf(y=y_star[:, None]))
+        #The link function of f should be a good starting point
+        #(i.e. the point before you corrupt it with the likelihood)
+        par_chains = starting_loc.shape[0]
+        chain_values = np.zeros((par_chains, num_samples))
+        chain_values[:, 0][:,None] = starting_loc
+        #Use same stepsize for all par_chains
+        stepsize = np.ones(par_chains)*stepsize
+        accepted = np.zeros((par_chains, num_samples+burn_in))
+        accept_ratio = np.zeros(num_samples+burn_in)
+        #Whilst burning in, only need to keep the previous lot
+        burnin_cache = np.zeros(par_chains)
+        burnin_cache[:] = starting_loc.flatten()
+        burning_in = True
+        for i in xrange(burn_in+num_samples):
+            next_ind = i-burn_in
+            if burning_in:
+                old_y = burnin_cache
+            else:
+                old_y = chain_values[:,next_ind-1]
+
+            old_lik = pdf(old_y)
+            #Propose new y from Gaussian proposal
+            new_y = np.random.normal(loc=old_y, scale=stepsize)
+            new_lik = pdf(new_y)
+            #Accept using Metropolis (not hastings) acceptance
+            #Always accepts if new_lik > old_lik
+            accept_probability = np.minimum(1, new_lik/old_lik)
+            u = np.random.uniform(0,1,par_chains)
+            #print "Accept prob: ", accept_probability
+            accepts = u < accept_probability
+            if burning_in:
+                burnin_cache[accepts] = new_y[accepts]
+                burnin_cache[~accepts] = old_y[~accepts]
+                if i == burn_in:
+                    burning_in = False
+                    chain_values[:,0] = burnin_cache
+            else:
+                #If it was accepted then new_y becomes the latest sample
+                chain_values[accepts, next_ind] = new_y[accepts]
+                #Otherwise use old y as the sample
+                chain_values[~accepts, next_ind] = old_y[~accepts]
+
+            accepted[~accepts, i] = 0
+            accepted[accepts, i] = 1
+            accept_ratio[i] = np.sum(accepted[:,i])/float(par_chains)
+
+            #Show progress
+            if i % int((burn_in+num_samples)*0.1) == 0:
+                print "{}% of samples taken ({})".format((i/int((burn_in+num_samples)*0.1)*10), i)
+                print "Last run accept ratio: ", accept_ratio[i]
+
+        print "Average accept ratio: ", np.mean(accept_ratio)
+        return chain_values
diff --git a/GPy/likelihoods/student_t.py b/GPy/likelihoods/student_t.py
index dbd4d94f..f16a55e9 100644
--- a/GPy/likelihoods/student_t.py
+++ b/GPy/likelihoods/student_t.py
@@ -226,17 +226,18 @@ class StudentT(Likelihood):
     def dlogpdf_link_dtheta(self, f, y, Y_metadata=None):
         dlogpdf_dvar = self.dlogpdf_link_dvar(f, y, Y_metadata=Y_metadata)
         dlogpdf_dv = np.zeros_like(dlogpdf_dvar) #FIXME: Not done yet
-        return np.hstack((dlogpdf_dvar, dlogpdf_dv))
+        return np.array((dlogpdf_dvar, dlogpdf_dv))
 
     def dlogpdf_dlink_dtheta(self, f, y, Y_metadata=None):
         dlogpdf_dlink_dvar = self.dlogpdf_dlink_dvar(f, y, Y_metadata=Y_metadata)
         dlogpdf_dlink_dv = np.zeros_like(dlogpdf_dlink_dvar) #FIXME: Not done yet
-        return np.hstack((dlogpdf_dlink_dvar, dlogpdf_dlink_dv))
+        return np.array((dlogpdf_dlink_dvar, dlogpdf_dlink_dv))
 
     def d2logpdf_dlink2_dtheta(self, f, y, Y_metadata=None):
         d2logpdf_dlink2_dvar = self.d2logpdf_dlink2_dvar(f, y, Y_metadata=Y_metadata)
         d2logpdf_dlink2_dv = np.zeros_like(d2logpdf_dlink2_dvar) #FIXME: Not done yet
-        return np.hstack((d2logpdf_dlink2_dvar, d2logpdf_dlink2_dv))
+
+        return np.array((d2logpdf_dlink2_dvar, d2logpdf_dlink2_dv))
 
     def predictive_mean(self, mu, sigma, Y_metadata=None):
         # The comment here confuses mean and median.
diff --git a/GPy/testing/likelihood_tests.py b/GPy/testing/likelihood_tests.py
index 877d1aa0..7b6164c1 100644
--- a/GPy/testing/likelihood_tests.py
+++ b/GPy/testing/likelihood_tests.py
@@ -10,7 +10,7 @@ from GPy.likelihoods import link_functions
 from GPy.core.parameterization import Param
 from functools import partial
 #np.random.seed(300)
-#np.random.seed(7)
+#np.random.seed(4)
 
 #np.seterr(divide='raise')
 def dparam_partial(inst_func, *args):
@@ -52,8 +52,17 @@ def dparam_checkgrad(func, dfunc, params, params_names, args, constraints=None,
     zipped_params = zip(params, params_names)
     for param_ind, (param_val, param_name) in enumerate(zipped_params):
         #Check one parameter at a time, make sure it is 2d (as some gradients only return arrays) then strip out the parameter
-        fnum = np.atleast_2d(partial_f(param_val, param_name))[:, param_ind].shape[0]
-        dfnum = np.atleast_2d(partial_df(param_val, param_name))[:, param_ind].shape[0]
+        f_ = partial_f(param_val, param_name)
+        df_ = partial_df(param_val, param_name)
+        #Reshape it such that we always have a 3d matrix, i.e. we want it (?, N, D) regardless of whether ? is num_params or not
+        f_ = f_.reshape(-1, f_.shape[0], f_.shape[1])
+        df_ = df_.reshape(-1, f_.shape[0], f_.shape[1])
+
+        #Get the number of f and number of dimensions
+        fnum = f_.shape[-2]
+        fdim = f_.shape[-1]
+        dfnum = df_.shape[-2]
+
         for fixed_val in range(dfnum):
             #dlik and dlik_dvar gives back 1 value for each
             f_ind = min(fnum, fixed_val+1) - 1
@@ -61,9 +70,13 @@ def dparam_checkgrad(func, dfunc, params, params_names, args, constraints=None,
             #Make grad checker with this param moving, note that set_params is NOT being called
             #The parameter is being set directly with __setattr__
             #Check only the parameter and function value we wish to check at a time
-            grad = GradientChecker(lambda p_val: np.atleast_2d(partial_f(p_val, param_name))[f_ind, param_ind],
-                                   lambda p_val: np.atleast_2d(partial_df(p_val, param_name))[fixed_val, param_ind],
-                                   param_val, [param_name])
+            #func = lambda p_val, fnum, fdim, param_ind, f_ind, param_ind: partial_f(p_val, param_name).reshape(-1, fnum, fdim)[param_ind, f_ind, :]
+            #dfunc_dparam = lambda d_val, fnum, fdim, param_ind, fixed_val: partial_df(d_val, param_name).reshape(-1, fnum, fdim)[param_ind, fixed_val, :]
+
+            #First we reshape the output such that it is (num_params, N, D) then we pull out the relevant parameter-findex and checkgrad just this index at a time
+            func = lambda p_val: partial_f(p_val, param_name).reshape(-1, fnum, fdim)[param_ind, f_ind, :]
+            dfunc_dparam = lambda d_val: partial_df(d_val, param_name).reshape(-1, fnum, fdim)[param_ind, fixed_val, :]
+            grad = GradientChecker(func, dfunc_dparam, param_val, [param_name])
 
             if constraints is not None:
                 for constrain_param, constraint in constraints:
@@ -104,37 +117,9 @@ class TestNoiseModels(object):
 
         self.var = 0.2
 
-        self.var = np.random.rand(1)
-
         #Make a bigger step as lower bound can be quite curved
         self.step = 1e-4
 
-    def tearDown(self):
-        self.Y = None
-        self.f = None
-        self.X = None
-
-    def test_scale2_models(self):
-        self.setUp()
-
-        ####################################################
-        # Constraint wrappers so we can just list them off #
-        ####################################################
-        def constrain_fixed(regex, model):
-            model[regex].constrain_fixed()
-
-        def constrain_negative(regex, model):
-            model[regex].constrain_negative()
-
-        def constrain_positive(regex, model):
-            model[regex].constrain_positive()
-
-        def constrain_bounded(regex, model, lower, upper):
-            """
-            Used like: partial(constrain_bounded, lower=0, upper=1)
-            """
-            model[regex].constrain_bounded(lower, upper)
-
         """
         Dictionary where we nest models we would like to check
             Name: {
@@ -149,136 +134,170 @@ class TestNoiseModels(object):
                 "link_f_constraints": [constraint_wrappers, listed_here]
                 }
         """
-        noise_models = {"Student_t_default": {
-                            "model": GPy.likelihoods.StudentT(deg_free=5, sigma2=self.var),
-                            "grad_params": {
-                                "names": [".*t_scale2"],
-                                "vals": [self.var],
-                                "constraints": [(".*t_scale2", constrain_positive), (".*deg_free", constrain_fixed)]
-                                #"constraints": [("t_scale2", constrain_positive), ("deg_free", partial(constrain_fixed, value=5))]
-                                },
-                            "laplace": True
-                            },
-                        "Student_t_1_var": {
-                            "model": GPy.likelihoods.StudentT(deg_free=5, sigma2=self.var),
-                            "grad_params": {
-                                "names": [".*t_scale2"],
-                                "vals": [1.0],
-                                "constraints": [(".*t_scale2", constrain_positive), (".*deg_free", constrain_fixed)]
-                                },
-                            "laplace": True
-                            },
-                        "Student_t_small_deg_free": {
-                            "model": GPy.likelihoods.StudentT(deg_free=1.5, sigma2=self.var),
-                            "grad_params": {
-                                "names": [".*t_scale2"],
-                                "vals": [self.var],
-                                "constraints": [(".*t_scale2", constrain_positive), (".*deg_free", constrain_fixed)]
-                                },
-                            "laplace": True
-                            },
-                        "Student_t_small_var": {
-                            "model": GPy.likelihoods.StudentT(deg_free=5, sigma2=self.var),
-                            "grad_params": {
-                                "names": [".*t_scale2"],
-                                "vals": [0.001],
-                                "constraints": [(".*t_scale2", constrain_positive), (".*deg_free", constrain_fixed)]
-                                },
-                            "laplace": True
-                            },
-                        "Student_t_large_var": {
-                            "model": GPy.likelihoods.StudentT(deg_free=5, sigma2=self.var),
-                            "grad_params": {
-                                "names": [".*t_scale2"],
-                                "vals": [10.0],
-                                "constraints": [(".*t_scale2", constrain_positive), (".*deg_free", constrain_fixed)]
-                                },
-                            "laplace": True
-                            },
-                        "Student_t_approx_gauss": {
-                            "model": GPy.likelihoods.StudentT(deg_free=1000, sigma2=self.var),
-                            "grad_params": {
-                                "names": [".*t_scale2"],
-                                "vals": [self.var],
-                                "constraints": [(".*t_scale2", constrain_positive), (".*deg_free", constrain_fixed)]
-                                },
-                            "laplace": True
-                            },
-                        "Student_t_log": {
-                            "model": GPy.likelihoods.StudentT(gp_link=link_functions.Log(), deg_free=5, sigma2=self.var),
-                            "grad_params": {
-                                "names": [".*t_scale2"],
-                                "vals": [self.var],
-                                "constraints": [(".*t_scale2", constrain_positive), (".*deg_free", constrain_fixed)]
-                                },
-                            "laplace": True
-                            },
-                        "Gaussian_default": {
-                            "model": GPy.likelihoods.Gaussian(variance=self.var),
-                            "grad_params": {
-                                "names": [".*variance"],
-                                "vals": [self.var],
-                                "constraints": [(".*variance", constrain_positive)]
-                                },
-                            "laplace": True,
-                            "ep": False # FIXME: Should be True when we have it working again
-                            },
-                        #"Gaussian_log": {
-                            #"model": GPy.likelihoods.gaussian(gp_link=link_functions.Log(), variance=self.var, D=self.D, N=self.N),
-                            #"grad_params": {
-                                #"names": ["noise_model_variance"],
-                                #"vals": [self.var],
-                                #"constraints": [constrain_positive]
-                                #},
-                            #"laplace": True
-                            #},
-                        #"Gaussian_probit": {
-                            #"model": GPy.likelihoods.gaussian(gp_link=link_functions.Probit(), variance=self.var, D=self.D, N=self.N),
-                            #"grad_params": {
-                                #"names": ["noise_model_variance"],
-                                #"vals": [self.var],
-                                #"constraints": [constrain_positive]
-                                #},
-                            #"laplace": True
-                            #},
-                        #"Gaussian_log_ex": {
-                            #"model": GPy.likelihoods.gaussian(gp_link=link_functions.Log_ex_1(), variance=self.var, D=self.D, N=self.N),
-                            #"grad_params": {
-                                #"names": ["noise_model_variance"],
-                                #"vals": [self.var],
-                                #"constraints": [constrain_positive]
-                                #},
-                            #"laplace": True
-                            #},
-                        "Bernoulli_default": {
-                            "model": GPy.likelihoods.Bernoulli(),
-                            "link_f_constraints": [partial(constrain_bounded, lower=0, upper=1)],
-                            "laplace": True,
-                            "Y": self.binary_Y,
-                            "ep": False # FIXME: Should be True when we have it working again
-                            },
-                        "Exponential_default": {
-                            "model": GPy.likelihoods.Exponential(),
-                            "link_f_constraints": [constrain_positive],
-                            "Y": self.positive_Y,
-                            "laplace": True,
-                        },
-                        "Poisson_default": {
-                            "model": GPy.likelihoods.Poisson(),
-                            "link_f_constraints": [constrain_positive],
-                            "Y": self.integer_Y,
-                            "laplace": True,
-                            "ep": False #Should work though...
-                        }#,
-                        #GAMMA needs some work!"Gamma_default": {
-                            #"model": GPy.likelihoods.Gamma(),
-                            #"link_f_constraints": [constrain_positive],
-                            #"Y": self.positive_Y,
-                            #"laplace": True
-                        #}
-                    }
+        self.noise_models = {"Student_t_default": {
+            "model": GPy.likelihoods.StudentT(deg_free=5, sigma2=self.var),
+            "grad_params": {
+                "names": [".*t_scale2"],
+                "vals": [self.var],
+                "constraints": [(".*t_scale2", self.constrain_positive), (".*deg_free", self.constrain_fixed)]
+            },
+            "laplace": True
+            },
+            "Student_t_1_var": {
+                "model": GPy.likelihoods.StudentT(deg_free=5, sigma2=self.var),
+                "grad_params": {
+                    "names": [".*t_scale2"],
+                    "vals": [1.0],
+                    "constraints": [(".*t_scale2", self.constrain_positive), (".*deg_free", self.constrain_fixed)]
+                },
+                "laplace": True
+            },
+            "Student_t_small_deg_free": {
+                "model": GPy.likelihoods.StudentT(deg_free=1.5, sigma2=self.var),
+                "grad_params": {
+                    "names": [".*t_scale2"],
+                    "vals": [self.var],
+                    "constraints": [(".*t_scale2", self.constrain_positive), (".*deg_free", self.constrain_fixed)]
+                },
+                "laplace": True
+            },
+            "Student_t_small_var": {
+                "model": GPy.likelihoods.StudentT(deg_free=5, sigma2=self.var),
+                "grad_params": {
+                    "names": [".*t_scale2"],
+                    "vals": [0.001],
+                    "constraints": [(".*t_scale2", self.constrain_positive), (".*deg_free", self.constrain_fixed)]
+                },
+                "laplace": True
+            },
+            "Student_t_large_var": {
+                "model": GPy.likelihoods.StudentT(deg_free=5, sigma2=self.var),
+                "grad_params": {
+                    "names": [".*t_scale2"],
+                    "vals": [10.0],
+                    "constraints": [(".*t_scale2", self.constrain_positive), (".*deg_free", self.constrain_fixed)]
+                },
+                "laplace": True
+            },
+            "Student_t_approx_gauss": {
+                "model": GPy.likelihoods.StudentT(deg_free=1000, sigma2=self.var),
+                "grad_params": {
+                    "names": [".*t_scale2"],
+                    "vals": [self.var],
+                    "constraints": [(".*t_scale2", self.constrain_positive), (".*deg_free", self.constrain_fixed)]
+                },
+                "laplace": True
+            },
+            #"Student_t_log": {
+            #"model": GPy.likelihoods.StudentT(gp_link=link_functions.Log(), deg_free=5, sigma2=self.var),
+            #"grad_params": {
+            #"names": [".*t_noise"],
+            #"vals": [self.var],
+            #"constraints": [(".*t_noise", self.constrain_positive), (".*deg_free", self.constrain_fixed)]
+            #},
+            #"laplace": True
+            #},
+            "Gaussian_default": {
+                "model": GPy.likelihoods.Gaussian(variance=self.var),
+                "grad_params": {
+                    "names": [".*variance"],
+                    "vals": [self.var],
+                    "constraints": [(".*variance", self.constrain_positive)]
+                },
+                "laplace": True,
+                "ep": False # FIXME: Should be True when we have it working again
+            },
+            "Gaussian_log": {
+                "model": GPy.likelihoods.Gaussian(gp_link=link_functions.Log(), variance=self.var),
+                "grad_params": {
+                    "names": [".*variance"],
+                    "vals": [self.var],
+                    "constraints": [(".*variance", self.constrain_positive)]
+                },
+                "laplace": True
+            },
+            #"Gaussian_probit": {
+            #"model": GPy.likelihoods.gaussian(gp_link=link_functions.Probit(), variance=self.var, D=self.D, N=self.N),
+            #"grad_params": {
+            #"names": ["noise_model_variance"],
+            #"vals": [self.var],
+            #"constraints": [constrain_positive]
+            #},
+            #"laplace": True
+            #},
+            #"Gaussian_log_ex": {
+            #"model": GPy.likelihoods.gaussian(gp_link=link_functions.Log_ex_1(), variance=self.var, D=self.D, N=self.N),
+            #"grad_params": {
+            #"names": ["noise_model_variance"],
+            #"vals": [self.var],
+            #"constraints": [constrain_positive]
+            #},
+            #"laplace": True
+            #},
+            "Bernoulli_default": {
+                "model": GPy.likelihoods.Bernoulli(),
+                "link_f_constraints": [partial(self.constrain_bounded, lower=0, upper=1)],
+                "laplace": True,
+                "Y": self.binary_Y,
+                "ep": False # FIXME: Should be True when we have it working again
+            },
+            "Exponential_default": {
+                "model": GPy.likelihoods.Exponential(),
+                "link_f_constraints": [self.constrain_positive],
+                "Y": self.positive_Y,
+                "laplace": True,
+            },
+            "Poisson_default": {
+                "model": GPy.likelihoods.Poisson(),
+                "link_f_constraints": [self.constrain_positive],
+                "Y": self.integer_Y,
+                "laplace": True,
+                "ep": False #Should work though...
+            },
+            #,
+            #GAMMA needs some work!"Gamma_default": {
+            #"model": GPy.likelihoods.Gamma(),
+            #"link_f_constraints": [constrain_positive],
+            #"Y": self.positive_Y,
+            #"laplace": True
+            #}
+        }
 
-        for name, attributes in noise_models.iteritems():
+
+    ####################################################
+    # Constraint wrappers so we can just list them off #
+    ####################################################
+    def constrain_fixed(self, regex, model):
+        model[regex].constrain_fixed()
+
+    def constrain_negative(self, regex, model):
+        model[regex].constrain_negative()
+
+    def constrain_positive(self, regex, model):
+        model[regex].constrain_positive()
+
+    def constrain_fixed_below(self, regex, model, up_to):
+        model[regex][0:up_to].constrain_fixed()
+
+    def constrain_fixed_above(self, regex, model, above):
+        model[regex][above:].constrain_fixed()
+
+    def constrain_bounded(self, regex, model, lower, upper):
+        """
+        Used like: partial(constrain_bounded, lower=0, upper=1)
+        """
+        model[regex].constrain_bounded(lower, upper)
+
+
+    def tearDown(self):
+        self.Y = None
+        self.f = None
+        self.X = None
+
+    def test_scale2_models(self):
+        self.setUp()
+
+        for name, attributes in self.noise_models.iteritems():
             model = attributes["model"]
             if "grad_params" in attributes:
                 params = attributes["grad_params"]
@@ -290,7 +309,7 @@ class TestNoiseModels(object):
                 param_vals = []
                 param_names = []
                 constrain_positive = []
-                param_constraints = [] # ??? TODO: Saul to Fix.
+                param_constraints = []
             if "link_f_constraints" in attributes:
                 link_f_constraints = attributes["link_f_constraints"]
             else:
@@ -303,6 +322,10 @@ class TestNoiseModels(object):
                 f = attributes["f"].copy()
             else:
                 f = self.f.copy()
+            if "Y_metadata" in attributes:
+                Y_metadata = attributes["Y_metadata"].copy()
+            else:
+                Y_metadata = None
             if "laplace" in attributes:
                 laplace = attributes["laplace"]
             else:
@@ -317,30 +340,30 @@ class TestNoiseModels(object):
 
             #Required by all
             #Normal derivatives
-            yield self.t_logpdf, model, Y, f
-            yield self.t_dlogpdf_df, model, Y, f
-            yield self.t_d2logpdf_df2, model, Y, f
+            yield self.t_logpdf, model, Y, f, Y_metadata
+            yield self.t_dlogpdf_df, model, Y, f, Y_metadata
+            yield self.t_d2logpdf_df2, model, Y, f, Y_metadata
             #Link derivatives
-            yield self.t_dlogpdf_dlink, model, Y, f, link_f_constraints
-            yield self.t_d2logpdf_dlink2, model, Y, f, link_f_constraints
+            yield self.t_dlogpdf_dlink, model, Y, f, Y_metadata, link_f_constraints
+            yield self.t_d2logpdf_dlink2, model, Y, f, Y_metadata, link_f_constraints
             if laplace:
                 #Laplace only derivatives
-                yield self.t_d3logpdf_df3, model, Y, f
-                yield self.t_d3logpdf_dlink3, model, Y, f, link_f_constraints
+                yield self.t_d3logpdf_df3, model, Y, f, Y_metadata
+                yield self.t_d3logpdf_dlink3, model, Y, f, Y_metadata, link_f_constraints
                 #Params
-                yield self.t_dlogpdf_dparams, model, Y, f, param_vals, param_names, param_constraints
-                yield self.t_dlogpdf_df_dparams, model, Y, f, param_vals, param_names, param_constraints
-                yield self.t_d2logpdf2_df2_dparams, model, Y, f, param_vals, param_names, param_constraints
+                yield self.t_dlogpdf_dparams, model, Y, f, Y_metadata, param_vals, param_names, param_constraints
+                yield self.t_dlogpdf_df_dparams, model, Y, f, Y_metadata, param_vals, param_names, param_constraints
+                yield self.t_d2logpdf2_df2_dparams, model, Y, f, Y_metadata, param_vals, param_names, param_constraints
                 #Link params
-                yield self.t_dlogpdf_link_dparams, model, Y, f, param_vals, param_names, param_constraints
-                yield self.t_dlogpdf_dlink_dparams, model, Y, f, param_vals, param_names, param_constraints
-                yield self.t_d2logpdf2_dlink2_dparams, model, Y, f, param_vals, param_names, param_constraints
+                yield self.t_dlogpdf_link_dparams, model, Y, f, Y_metadata, param_vals, param_names, param_constraints
+                yield self.t_dlogpdf_dlink_dparams, model, Y, f, Y_metadata, param_vals, param_names, param_constraints
+                yield self.t_d2logpdf2_dlink2_dparams, model, Y, f, Y_metadata, param_vals, param_names, param_constraints
 
                 #laplace likelihood gradcheck
-                yield self.t_laplace_fit_rbf_white, model, self.X, Y, f, self.step, param_vals, param_names, param_constraints
+                yield self.t_laplace_fit_rbf_white, model, self.X, Y, f, Y_metadata, self.step, param_vals, param_names, param_constraints
             if ep:
                 #ep likelihood gradcheck
-                yield self.t_ep_fit_rbf_white, model, self.X, Y, f, self.step, param_vals, param_names, param_constraints
+                yield self.t_ep_fit_rbf_white, model, self.X, Y, f, Y_metadata, self.step, param_vals, param_names, param_constraints
 
 
         self.tearDown()
@@ -349,41 +372,41 @@ class TestNoiseModels(object):
     # dpdf_df's #
     #############
     @with_setup(setUp, tearDown)
-    def t_logpdf(self, model, Y, f):
+    def t_logpdf(self, model, Y, f, Y_metadata):
         print "\n{}".format(inspect.stack()[0][3])
         print model
         #print model._get_params()
         np.testing.assert_almost_equal(
-                model.pdf(f.copy(), Y.copy()).prod(),
-                               np.exp(model.logpdf(f.copy(), Y.copy()).sum())
+                model.pdf(f.copy(), Y.copy(), Y_metadata=Y_metadata).prod(),
+                               np.exp(model.logpdf(f.copy(), Y.copy(), Y_metadata=Y_metadata).sum())
                                )
 
     @with_setup(setUp, tearDown)
-    def t_dlogpdf_df(self, model, Y, f):
+    def t_dlogpdf_df(self, model, Y, f, Y_metadata):
         print "\n{}".format(inspect.stack()[0][3])
         self.description = "\n{}".format(inspect.stack()[0][3])
-        logpdf = functools.partial(np.sum(model.logpdf), y=Y)
-        dlogpdf_df = functools.partial(model.dlogpdf_df, y=Y)
+        logpdf = functools.partial(np.sum(model.logpdf), y=Y, Y_metadata=Y_metadata)
+        dlogpdf_df = functools.partial(model.dlogpdf_df, y=Y, Y_metadata=Y_metadata)
         grad = GradientChecker(logpdf, dlogpdf_df, f.copy(), 'g')
         grad.randomize()
         print model
         assert grad.checkgrad(verbose=1)
 
     @with_setup(setUp, tearDown)
-    def t_d2logpdf_df2(self, model, Y, f):
+    def t_d2logpdf_df2(self, model, Y, f, Y_metadata):
         print "\n{}".format(inspect.stack()[0][3])
-        dlogpdf_df = functools.partial(model.dlogpdf_df, y=Y)
-        d2logpdf_df2 = functools.partial(model.d2logpdf_df2, y=Y)
+        dlogpdf_df = functools.partial(model.dlogpdf_df, y=Y, Y_metadata=Y_metadata)
+        d2logpdf_df2 = functools.partial(model.d2logpdf_df2, y=Y, Y_metadata=Y_metadata)
         grad = GradientChecker(dlogpdf_df, d2logpdf_df2, f.copy(), 'g')
         grad.randomize()
         print model
         assert grad.checkgrad(verbose=1)
 
     @with_setup(setUp, tearDown)
-    def t_d3logpdf_df3(self, model, Y, f):
+    def t_d3logpdf_df3(self, model, Y, f, Y_metadata):
         print "\n{}".format(inspect.stack()[0][3])
-        d2logpdf_df2 = functools.partial(model.d2logpdf_df2, y=Y)
-        d3logpdf_df3 = functools.partial(model.d3logpdf_df3, y=Y)
+        d2logpdf_df2 = functools.partial(model.d2logpdf_df2, y=Y, Y_metadata=Y_metadata)
+        d3logpdf_df3 = functools.partial(model.d3logpdf_df3, y=Y, Y_metadata=Y_metadata)
         grad = GradientChecker(d2logpdf_df2, d3logpdf_df3, f.copy(), 'g')
         grad.randomize()
         print model
@@ -393,32 +416,32 @@ class TestNoiseModels(object):
     # df_dparams #
     ##############
     @with_setup(setUp, tearDown)
-    def t_dlogpdf_dparams(self, model, Y, f, params, params_names, param_constraints):
+    def t_dlogpdf_dparams(self, model, Y, f, Y_metadata, params, params_names, param_constraints):
         print "\n{}".format(inspect.stack()[0][3])
         print model
         assert (
                 dparam_checkgrad(model.logpdf, model.dlogpdf_dtheta,
-                    params, params_names, args=(f, Y), constraints=param_constraints,
+                    params, params_names, args=(f, Y, Y_metadata), constraints=param_constraints,
                     randomize=False, verbose=True)
                 )
 
     @with_setup(setUp, tearDown)
-    def t_dlogpdf_df_dparams(self, model, Y, f, params, params_names, param_constraints):
+    def t_dlogpdf_df_dparams(self, model, Y, f, Y_metadata, params, params_names, param_constraints):
         print "\n{}".format(inspect.stack()[0][3])
         print model
         assert (
                 dparam_checkgrad(model.dlogpdf_df, model.dlogpdf_df_dtheta,
-                    params, params_names, args=(f, Y), constraints=param_constraints,
+                    params, params_names, args=(f, Y, Y_metadata), constraints=param_constraints,
                     randomize=False, verbose=True)
                 )
 
     @with_setup(setUp, tearDown)
-    def t_d2logpdf2_df2_dparams(self, model, Y, f, params, params_names, param_constraints):
+    def t_d2logpdf2_df2_dparams(self, model, Y, f, Y_metadata, params, params_names, param_constraints):
         print "\n{}".format(inspect.stack()[0][3])
         print model
         assert (
                 dparam_checkgrad(model.d2logpdf_df2, model.d2logpdf_df2_dtheta,
-                    params, params_names, args=(f, Y), constraints=param_constraints,
+                    params, params_names, args=(f, Y, Y_metadata), constraints=param_constraints,
                     randomize=False, verbose=True)
                 )
 
@@ -426,10 +449,10 @@ class TestNoiseModels(object):
     # dpdf_dlink's #
     ################
     @with_setup(setUp, tearDown)
-    def t_dlogpdf_dlink(self, model, Y, f, link_f_constraints):
+    def t_dlogpdf_dlink(self, model, Y, f, Y_metadata, link_f_constraints):
         print "\n{}".format(inspect.stack()[0][3])
-        logpdf = functools.partial(model.logpdf_link, y=Y)
-        dlogpdf_dlink = functools.partial(model.dlogpdf_dlink, y=Y)
+        logpdf = functools.partial(model.logpdf_link, y=Y, Y_metadata=Y_metadata)
+        dlogpdf_dlink = functools.partial(model.dlogpdf_dlink, y=Y, Y_metadata=Y_metadata)
         grad = GradientChecker(logpdf, dlogpdf_dlink, f.copy(), 'g')
 
         #Apply constraints to link_f values
@@ -442,10 +465,10 @@ class TestNoiseModels(object):
         assert grad.checkgrad(verbose=1)
 
     @with_setup(setUp, tearDown)
-    def t_d2logpdf_dlink2(self, model, Y, f, link_f_constraints):
+    def t_d2logpdf_dlink2(self, model, Y, f, Y_metadata, link_f_constraints):
         print "\n{}".format(inspect.stack()[0][3])
-        dlogpdf_dlink = functools.partial(model.dlogpdf_dlink, y=Y)
-        d2logpdf_dlink2 = functools.partial(model.d2logpdf_dlink2, y=Y)
+        dlogpdf_dlink = functools.partial(model.dlogpdf_dlink, y=Y, Y_metadata=Y_metadata)
+        d2logpdf_dlink2 = functools.partial(model.d2logpdf_dlink2, y=Y, Y_metadata=Y_metadata)
         grad = GradientChecker(dlogpdf_dlink, d2logpdf_dlink2, f.copy(), 'g')
 
         #Apply constraints to link_f values
@@ -458,10 +481,10 @@ class TestNoiseModels(object):
         assert grad.checkgrad(verbose=1)
 
     @with_setup(setUp, tearDown)
-    def t_d3logpdf_dlink3(self, model, Y, f, link_f_constraints):
+    def t_d3logpdf_dlink3(self, model, Y, f, Y_metadata, link_f_constraints):
         print "\n{}".format(inspect.stack()[0][3])
-        d2logpdf_dlink2 = functools.partial(model.d2logpdf_dlink2, y=Y)
-        d3logpdf_dlink3 = functools.partial(model.d3logpdf_dlink3, y=Y)
+        d2logpdf_dlink2 = functools.partial(model.d2logpdf_dlink2, y=Y, Y_metadata=Y_metadata)
+        d3logpdf_dlink3 = functools.partial(model.d3logpdf_dlink3, y=Y, Y_metadata=Y_metadata)
         grad = GradientChecker(d2logpdf_dlink2, d3logpdf_dlink3, f.copy(), 'g')
 
         #Apply constraints to link_f values
@@ -477,32 +500,32 @@ class TestNoiseModels(object):
     # dlink_dparams #
     #################
     @with_setup(setUp, tearDown)
-    def t_dlogpdf_link_dparams(self, model, Y, f, params, param_names, param_constraints):
+    def t_dlogpdf_link_dparams(self, model, Y, f, Y_metadata, params, param_names, param_constraints):
         print "\n{}".format(inspect.stack()[0][3])
         print model
         assert (
                 dparam_checkgrad(model.logpdf_link, model.dlogpdf_link_dtheta,
-                    params, param_names, args=(f, Y), constraints=param_constraints,
+                    params, param_names, args=(f, Y, Y_metadata), constraints=param_constraints,
                     randomize=False, verbose=True)
                 )
 
     @with_setup(setUp, tearDown)
-    def t_dlogpdf_dlink_dparams(self, model, Y, f, params, param_names, param_constraints):
+    def t_dlogpdf_dlink_dparams(self, model, Y, f, Y_metadata, params, param_names, param_constraints):
         print "\n{}".format(inspect.stack()[0][3])
         print model
         assert (
                 dparam_checkgrad(model.dlogpdf_dlink, model.dlogpdf_dlink_dtheta,
-                    params, param_names, args=(f, Y), constraints=param_constraints,
+                    params, param_names, args=(f, Y, Y_metadata), constraints=param_constraints,
                     randomize=False, verbose=True)
                 )
 
     @with_setup(setUp, tearDown)
-    def t_d2logpdf2_dlink2_dparams(self, model, Y, f, params, param_names, param_constraints):
+    def t_d2logpdf2_dlink2_dparams(self, model, Y, f, Y_metadata, params, param_names, param_constraints):
         print "\n{}".format(inspect.stack()[0][3])
         print model
         assert (
                 dparam_checkgrad(model.d2logpdf_dlink2, model.d2logpdf_dlink2_dtheta,
-                    params, param_names, args=(f, Y), constraints=param_constraints,
+                    params, param_names, args=(f, Y, Y_metadata), constraints=param_constraints,
                     randomize=False, verbose=True)
                 )
 
@@ -510,14 +533,15 @@ class TestNoiseModels(object):
     # laplace test #
     ################
     @with_setup(setUp, tearDown)
-    def t_laplace_fit_rbf_white(self, model, X, Y, f, step, param_vals, param_names, constraints):
+    def t_laplace_fit_rbf_white(self, model, X, Y, f, Y_metadata, step, param_vals, param_names, constraints):
         print "\n{}".format(inspect.stack()[0][3])
         #Normalize
         Y = Y/Y.max()
-        white_var = 1e-6
+        white_var = 1e-5
         kernel = GPy.kern.RBF(X.shape[1]) + GPy.kern.White(X.shape[1])
         laplace_likelihood = GPy.inference.latent_function_inference.Laplace()
-        m = GPy.core.GP(X.copy(), Y.copy(), kernel, likelihood=model, inference_method=laplace_likelihood)
+
+        m = GPy.core.GP(X.copy(), Y.copy(), kernel, likelihood=model, Y_metadata=Y_metadata, inference_method=laplace_likelihood)
         m['.*white'].constrain_fixed(white_var)
 
         #Set constraints
@@ -526,6 +550,7 @@ class TestNoiseModels(object):
 
         print m
         m.randomize()
+        m.randomize()
 
         #Set params
         for param_num in range(len(param_names)):
@@ -545,14 +570,15 @@ class TestNoiseModels(object):
     # EP test #
     ###########
     @with_setup(setUp, tearDown)
-    def t_ep_fit_rbf_white(self, model, X, Y, f, step, param_vals, param_names, constraints):
+    def t_ep_fit_rbf_white(self, model, X, Y, f, Y_metadata, step, param_vals, param_names, constraints):
         print "\n{}".format(inspect.stack()[0][3])
         #Normalize
         Y = Y/Y.max()
         white_var = 1e-6
         kernel = GPy.kern.RBF(X.shape[1]) + GPy.kern.White(X.shape[1])
         ep_inf = GPy.inference.latent_function_inference.EP()
-        m = GPy.core.GP(X.copy(), Y.copy(), kernel=kernel, likelihood=model, inference_method=ep_inf)
+
+        m = GPy.core.GP(X.copy(), Y.copy(), kernel=kernel, likelihood=model, Y_metadata=Y_metadata, inference_method=ep_inf)
         m['.*white'].constrain_fixed(white_var)
 
         for param_num in range(len(param_names)):
@@ -571,8 +597,8 @@ class LaplaceTests(unittest.TestCase):
     """
 
     def setUp(self):
-        self.N = 5
-        self.D = 3
+        self.N = 15
+        self.D = 1
         self.X = np.random.rand(self.N, self.D)*10
 
         self.real_std = 0.1
@@ -636,20 +662,20 @@ class LaplaceTests(unittest.TestCase):
         exact_inf = GPy.inference.latent_function_inference.ExactGaussianInference()
         m1 = GPy.core.GP(X, Y.copy(), kernel=kernel1, likelihood=gauss_distr1, inference_method=exact_inf)
         m1['.*white'].constrain_fixed(1e-6)
-        m1['.*rbf.variance'] = initial_var_guess
-        m1['.*rbf.variance'].constrain_bounded(1e-4, 10)
+        m1['.*Gaussian_noise.variance'].constrain_bounded(1e-4, 10)
         m1.randomize()
 
         gauss_distr2 = GPy.likelihoods.Gaussian(variance=initial_var_guess)
         laplace_inf = GPy.inference.latent_function_inference.Laplace()
         m2 = GPy.core.GP(X, Y.copy(), kernel=kernel2, likelihood=gauss_distr2, inference_method=laplace_inf)
         m2['.*white'].constrain_fixed(1e-6)
-        m2['.*rbf.variance'].constrain_bounded(1e-4, 10)
+        m2['.*Gaussian_noise.variance'].constrain_bounded(1e-4, 10)
         m2.randomize()
 
         if debug:
             print m1
             print m2
+
         optimizer = 'scg'
         print "Gaussian"
         m1.optimize(optimizer, messages=debug, ipython_notebook=False)
@@ -687,8 +713,6 @@ class LaplaceTests(unittest.TestCase):
             pb.scatter(X, m1.likelihood.Y, c='g')
             pb.scatter(X, m2.likelihood.Y, c='r', marker='x')
 
-
-
         #Check Y's are the same
         np.testing.assert_almost_equal(m1.Y, m2.Y, decimal=5)
         #Check marginals are the same
diff --git a/GPy/util/misc.py b/GPy/util/misc.py
index bf37159d..99bd62b3 100644
--- a/GPy/util/misc.py
+++ b/GPy/util/misc.py
@@ -4,6 +4,16 @@
 import numpy as np
 from config import *
 
+_lim_val = np.finfo(np.float64).max
+# Upper bounds on x so that exp(x), x**2 and x**3 are intended to stay finite.
+_lim_val_exp = np.log(_lim_val)
+_lim_val_square = np.sqrt(_lim_val)
+_lim_val_cube = np.nextafter(np.power(_lim_val, 1. / 3), -np.inf)
+
+def safe_exp(f):
+    clip_f = np.clip(f, -np.inf, _lim_val_exp)
+    return np.exp(clip_f)
+
 def chain_1(df_dg, dg_dx):
     """
     Generic chaining function for first derivative
@@ -11,6 +21,11 @@ def chain_1(df_dg, dg_dx):
     .. math::
         \\frac{d(f . g)}{dx} = \\frac{df}{dg} \\frac{dg}{dx}
     """
+    if np.all(dg_dx==1.):
+        return df_dg
+    if len(df_dg) > 1 and df_dg.shape[-1] > 1:
+        # Only elementwise (column-vector) chaining is supported; reject matrices.
+        raise NotImplementedError('Not implemented for matricies yet')
     return df_dg * dg_dx
 
 def chain_2(d2f_dg2, dg_dx, df_dg, d2g_dx2):
@@ -20,7 +35,13 @@ def chain_2(d2f_dg2, dg_dx, df_dg, d2g_dx2):
     .. math::
         \\frac{d^{2}(f . g)}{dx^{2}} = \\frac{d^{2}f}{dg^{2}}(\\frac{dg}{dx})^{2} + \\frac{df}{dg}\\frac{d^{2}g}{dx^{2}}
     """
-    return d2f_dg2*(dg_dx**2) + df_dg*d2g_dx2
+    if np.all(dg_dx==1.) and np.all(d2g_dx2 == 0):
+        return d2f_dg2
+    if  len(d2f_dg2) > 1 and d2f_dg2.shape[-1] > 1:
+        raise NotImplementedError('Not implemented for matricies yet')
+    #dg_dx_2 = np.clip(dg_dx, 1e-12, _lim_val_square)**2
+    dg_dx_2 = dg_dx**2
+    return d2f_dg2*(dg_dx_2) + df_dg*d2g_dx2
 
 def chain_3(d3f_dg3, dg_dx, d2f_dg2, d2g_dx2, df_dg, d3g_dx3):
     """
@@ -29,11 +50,18 @@ def chain_3(d3f_dg3, dg_dx, d2f_dg2, d2g_dx2, df_dg, d3g_dx3):
     .. math::
         \\frac{d^{3}(f . g)}{dx^{3}} = \\frac{d^{3}f}{dg^{3}}(\\frac{dg}{dx})^{3} + 3\\frac{d^{2}f}{dg^{2}}\\frac{dg}{dx}\\frac{d^{2}g}{dx^{2}} + \\frac{df}{dg}\\frac{d^{3}g}{dx^{3}}
     """
-    return d3f_dg3*(dg_dx**3) + 3*d2f_dg2*dg_dx*d2g_dx2 + df_dg*d3g_dx3
+    if np.all(dg_dx==1.) and np.all(d2g_dx2==0) and np.all(d3g_dx3==0):
+        return d3f_dg3
+    if (  (len(d2f_dg2) > 1 and d2f_dg2.shape[-1] > 1)
+           or (len(d3f_dg3) > 1 and d3f_dg3.shape[-1] > 1)):
+        raise NotImplementedError('Not implemented for matricies yet')
+    #dg_dx_3 = np.clip(dg_dx, 1e-12, _lim_val_cube)**3
+    dg_dx_3 = dg_dx**3
+    return d3f_dg3*(dg_dx_3) + 3*d2f_dg2*dg_dx*d2g_dx2 + df_dg*d3g_dx3
 
 def opt_wrapper(m, **kwargs):
     """
-    This function just wraps the optimization procedure of a GPy
+    This function just wraps the optimization procedure of a GPy
     object so that optimize() pickleable (necessary for multiprocessing).
     """
     m.optimize(**kwargs)
@@ -96,3 +124,47 @@ from :class:ndarray)"""
     if len(param) == 1:
         return param[0].view(np.ndarray)
     return [x.view(np.ndarray) for x in param]
+
+def blockify_hessian(func):
+    def wrapper_func(self, *args, **kwargs):
+        # Invoke the wrapped function first
+        retval = func(self, *args, **kwargs)
+        # Now do something here with retval and/or action
+        if self.not_block_really and (retval.shape[0] != retval.shape[1]):
+            return np.diagflat(retval)
+        else:
+            return retval
+    return wrapper_func
+
+def blockify_third(func):
+    def wrapper_func(self, *args, **kwargs):
+        # Invoke the wrapped function first
+        retval = func(self, *args, **kwargs)
+        # Now do something here with retval and/or action
+        if self.not_block_really and (len(retval.shape) < 3):
+            num_data = retval.shape[0]
+            d3_block_cache = np.zeros((num_data, num_data, num_data))
+            diag_slice = range(num_data)
+            d3_block_cache[diag_slice, diag_slice, diag_slice] = np.squeeze(retval)
+            return d3_block_cache
+        else:
+            return retval
+    return wrapper_func
+
+def blockify_dhess_dtheta(func):
+    def wrapper_func(self, *args, **kwargs):
+        # Invoke the wrapped function first
+        retval = func(self, *args, **kwargs)
+        # Now do something here with retval and/or action
+        if self.not_block_really and (len(retval.shape) < 3):
+            num_data = retval.shape[0]
+            num_params = retval.shape[-1]
+            dhess_dtheta = np.zeros((num_data, num_data, num_params))
+            diag_slice = range(num_data)
+            for param_ind in range(num_params):
+                dhess_dtheta[diag_slice, diag_slice, param_ind] = np.squeeze(retval[:,param_ind])
+            return dhess_dtheta
+        else:
+            return retval
+    return wrapper_func
+

From 4d27fddd375cda05a63579706defa3af0877c4a2 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Fri, 27 Mar 2015 14:24:24 +0000
Subject: [PATCH 05/10] Small tidying up

---
 GPy/likelihoods/bernoulli.py  | 38 -----------------------------------
 GPy/likelihoods/likelihood.py |  2 +-
 2 files changed, 1 insertion(+), 39 deletions(-)

diff --git a/GPy/likelihoods/bernoulli.py b/GPy/likelihoods/bernoulli.py
index c398b3a4..f5690aa4 100644
--- a/GPy/likelihoods/bernoulli.py
+++ b/GPy/likelihoods/bernoulli.py
@@ -248,41 +248,3 @@ class Bernoulli(Likelihood):
 
     def exact_inference_gradients(self, dL_dKdiag,Y_metadata=None):
         pass
-
-    def variational_expectations(self, Y, m, v, gh_points=None):
-        """
-        Probit specific numerical stable integrations
-        """
-        #Move to be faster
-        if self.gp_link:
-            pass
-        Yshape = Y.shape
-        mshape = m.shape
-        vshape = v.shape
-        Y = Y.flatten()
-        m = m.flatten()
-        v = v.flatten()
-
-        assert Yshape == mshape
-        assert mshape == vshape
-
-        Ysign = np.where(Y==1,1,-1).flatten()
-        gh_x, gh_w = np.polynomial.hermite.hermgauss(20)
-
-        #Shapes a bit weird
-        X = gh_x[None,:]*np.sqrt(2.*v[:, None]) + (m*Ysign)[:,None]
-        p = stats.norm.cdf(X)
-        p = np.clip(p, 1e-9, 1.-1e-9) # for numerical stability
-        N = stats.norm.pdf(X)
-        F = np.log(p).dot(gh_w)
-        NoverP = N/p
-        dF_dm = (NoverP*Ysign[:,None]).dot(gh_w)
-        dF_dv = -0.5*(NoverP**2 + NoverP*X).dot(gh_w)
-        if np.any(np.isnan(dF_dv)) or np.any(np.isinf(dF_dv)):
-            stop
-        if np.any(np.isnan(dF_dm)) or np.any(np.isinf(dF_dm)):
-            stop
-        #FIXME: Might be wrong reshaping
-        return F.reshape(Yshape), dF_dm.reshape(mshape), dF_dv.reshape(vshape), None
-
-
diff --git a/GPy/likelihoods/likelihood.py b/GPy/likelihoods/likelihood.py
index a545d54e..022670a5 100644
--- a/GPy/likelihoods/likelihood.py
+++ b/GPy/likelihoods/likelihood.py
@@ -325,7 +325,7 @@ class Likelihood(Parameterized):
     def logpdf_sum(self, f, y, Y_metadata=None):
         """
         Convenience function that can overridden for functions where this could
-        be computed more efficiently (Theano?)
+        be computed more efficiently
         """
         return np.sum(self.logpdf(f, y, Y_metadata=Y_metadata))
 

From 1a253ff82a9b244866a0e20fe06444dda6c0bcd4 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Fri, 27 Mar 2015 15:14:52 +0000
Subject: [PATCH 06/10] Added safe_exp and tests

---
 GPy/likelihoods/likelihood.py |  2 +-
 GPy/testing/misc_tests.py     | 18 ++++++++++++++++++
 2 files changed, 19 insertions(+), 1 deletion(-)
 create mode 100644 GPy/testing/misc_tests.py

diff --git a/GPy/likelihoods/likelihood.py b/GPy/likelihoods/likelihood.py
index 022670a5..2e55ddb9 100644
--- a/GPy/likelihoods/likelihood.py
+++ b/GPy/likelihoods/likelihood.py
@@ -5,7 +5,7 @@ import numpy as np
 from scipy import stats,special
 import scipy as sp
 import link_functions
-from ..util.misc import chain_1, chain_2, chain_3, blockify_dhess_dtheta, blockify_third, blockify_hessian
+from ..util.misc import chain_1, chain_2, chain_3, blockify_dhess_dtheta, blockify_third, blockify_hessian, safe_exp
 from scipy.integrate import quad
 import warnings
 from ..core.parameterization import Parameterized
diff --git a/GPy/testing/misc_tests.py b/GPy/testing/misc_tests.py
new file mode 100644
index 00000000..e620fa7e
--- /dev/null
+++ b/GPy/testing/misc_tests.py
@@ -0,0 +1,18 @@
+import numpy as np
+import scipy as sp
+import GPy
+
+class MiscTests(np.testing.TestCase):
+    """
+    Testing some utilities of misc
+    """
+    def setUp(self):
+        self._lim_val = np.finfo(np.float64).max
+        self._lim_val_exp = np.log(self._lim_val)
+
+    def test_safe_exp_upper(self):
+        assert np.exp(self._lim_val_exp + 1) == np.inf
+        assert GPy.util.misc.safe_exp(self._lim_val_exp + 1) < np.inf
+
+    def test_safe_exp_lower(self):
+        assert GPy.util.misc.safe_exp(1e-10) < np.inf

From 582aa4f40618048abf597b25058f345684a98299 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Fri, 27 Mar 2015 15:30:40 +0000
Subject: [PATCH 07/10] More samples for predictive quantile

---
 GPy/likelihoods/likelihood.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/GPy/likelihoods/likelihood.py b/GPy/likelihoods/likelihood.py
index 2e55ddb9..1295245c 100644
--- a/GPy/likelihoods/likelihood.py
+++ b/GPy/likelihoods/likelihood.py
@@ -529,7 +529,7 @@ class Likelihood(Parameterized):
 
     def predictive_quantiles(self, mu, var, quantiles, Y_metadata=None):
         #compute the quantiles by sampling!!!
-        N_samp = 50
+        N_samp = 500
         s = np.random.randn(mu.shape[0], N_samp)*np.sqrt(var) + mu
         #ss_f = s.flatten()
         #ss_y = self.samples(ss_f, Y_metadata)

From 4f0894b6b703aa21b50cde74c4847d4e917f3dd8 Mon Sep 17 00:00:00 2001
From: Zhenwen Dai <z.dai@sheffield.ac.uk>
Date: Mon, 30 Mar 2015 15:25:59 +0100
Subject: [PATCH 08/10] change the name of kernel DiffGenomeKern to DEtime

---
 GPy/kern/__init__.py       | 3 ++-
 GPy/kern/_src/splitKern.py | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/GPy/kern/__init__.py b/GPy/kern/__init__.py
index 718be74f..0e1f8a0d 100644
--- a/GPy/kern/__init__.py
+++ b/GPy/kern/__init__.py
@@ -16,5 +16,6 @@ from _src.poly import Poly
 from _src.eq_ode2 import EQ_ODE2
 
 from _src.trunclinear import TruncLinear,TruncLinear_inf
-from _src.splitKern import SplitKern,DiffGenomeKern
+from _src.splitKern import SplitKern,DEtime
+from _src.splitKern import DEtime as DiffGenomeKern
 
diff --git a/GPy/kern/_src/splitKern.py b/GPy/kern/_src/splitKern.py
index 27e4f76b..3b2e5716 100644
--- a/GPy/kern/_src/splitKern.py
+++ b/GPy/kern/_src/splitKern.py
@@ -7,7 +7,7 @@ from kern import Kern,CombinationKernel
 from .independent_outputs import index_to_slices
 import itertools
 
-class DiffGenomeKern(Kern):
+class DEtime(Kern):
 
     def __init__(self, kernel, idx_p, Xp, index_dim=-1, name='DiffGenomeKern'):
         self.idx_p = idx_p

From edbb576bfcfd0755319961412d8f72a10c819ece Mon Sep 17 00:00:00 2001
From: Zhenwen Dai <z.dai@sheffield.ac.uk>
Date: Mon, 30 Mar 2015 21:49:02 +0100
Subject: [PATCH 09/10] fallback the implementation of spike and slab prior

---
 GPy/core/parameterization/variational.py      | 32 ++++---------------
 .../var_dtc_parallel.py                       |  6 ++--
 GPy/kern/_src/psi_comp/ssrbf_psi_comp.py      | 20 +++++++-----
 GPy/models/ss_gplvm.py                        |  5 ++-
 4 files changed, 27 insertions(+), 36 deletions(-)

diff --git a/GPy/core/parameterization/variational.py b/GPy/core/parameterization/variational.py
index 7cc5c99a..43e8d096 100644
--- a/GPy/core/parameterization/variational.py
+++ b/GPy/core/parameterization/variational.py
@@ -50,31 +50,29 @@ class SpikeAndSlabPrior(VariationalPrior):
     def KL_divergence(self, variational_posterior):
         mu = variational_posterior.mean
         S = variational_posterior.variance
-        gamma,gamma1 = variational_posterior.gamma_probabilities()
-        log_gamma,log_gamma1 = variational_posterior.gamma_log_prob()
+        gamma = variational_posterior.gamma.values
         if len(self.pi.shape)==2:
-            idx = np.unique(gamma._raveled_index()/gamma.shape[-1])
+            idx = np.unique(variational_posterior.gamma._raveled_index()/gamma.shape[-1])
             pi = self.pi[idx]
         else:
             pi = self.pi
             
         var_mean = np.square(mu)/self.variance
         var_S = (S/self.variance - np.log(S))
-        var_gamma = (gamma*(log_gamma-np.log(pi))).sum()+(gamma1*(log_gamma1-np.log(1-pi))).sum()
+        var_gamma = (gamma*np.log(gamma/pi)).sum()+((1-gamma)*np.log((1-gamma)/(1-pi))).sum()
         return var_gamma+ (gamma* (np.log(self.variance)-1. +var_mean + var_S)).sum()/2.
 
     def update_gradients_KL(self, variational_posterior):
         mu = variational_posterior.mean
         S = variational_posterior.variance
-        gamma,gamma1 = variational_posterior.gamma_probabilities()
-        log_gamma,log_gamma1 = variational_posterior.gamma_log_prob()
+        gamma = variational_posterior.gamma.values
         if len(self.pi.shape)==2:
-            idx = np.unique(gamma._raveled_index()/gamma.shape[-1])
+            idx = np.unique(variational_posterior.gamma._raveled_index()/gamma.shape[-1])
             pi = self.pi[idx]
         else:
             pi = self.pi
 
-        variational_posterior.binary_prob.gradient -= (np.log((1-pi)/pi)+log_gamma-log_gamma1+((np.square(mu)+S)/self.variance-np.log(S)+np.log(self.variance)-1.)/2.)*gamma*gamma1
+        variational_posterior.binary_prob.gradient -= np.log((1-pi)/pi*gamma/(1.-gamma))+((np.square(mu)+S)/self.variance-np.log(S)+np.log(self.variance)-1.)/2.
         mu.gradient -= gamma*mu/self.variance
         S.gradient -= (1./self.variance - 1./S) * gamma /2.
         if self.learnPi:
@@ -162,24 +160,8 @@ class SpikeAndSlabPosterior(VariationalPosterior):
         binary_prob : the probability of the distribution on the slab part.
         """
         super(SpikeAndSlabPosterior, self).__init__(means, variances, name)
-        self.gamma = Param("binary_prob",binary_prob)
+        self.gamma = Param("binary_prob",binary_prob,Logistic(0.,1.))
         self.link_parameter(self.gamma)
-        
-    @Cache_this(limit=5)
-    def gamma_probabilities(self):
-        prob = np.zeros_like(param_to_array(self.gamma))
-        prob[self.gamma>-710] = 1./(1.+np.exp(-self.gamma[self.gamma>-710]))
-        prob1 = -np.zeros_like(param_to_array(self.gamma))
-        prob1[self.gamma<710] = 1./(1.+np.exp(self.gamma[self.gamma<710]))
-        return prob, prob1
-    
-    @Cache_this(limit=5)
-    def gamma_log_prob(self):
-        loggamma = param_to_array(self.gamma).copy()
-        loggamma[loggamma>-40] = -np.log1p(np.exp(-loggamma[loggamma>-40]))
-        loggamma1 = -param_to_array(self.gamma).copy()
-        loggamma1[loggamma1>-40] = -np.log1p(np.exp(-loggamma1[loggamma1>-40]))
-        return loggamma,loggamma1
 
     def set_gradients(self, grad):
         self.mean.gradient, self.variance.gradient, self.gamma.gradient = grad
diff --git a/GPy/inference/latent_function_inference/var_dtc_parallel.py b/GPy/inference/latent_function_inference/var_dtc_parallel.py
index cac69872..2e633e16 100644
--- a/GPy/inference/latent_function_inference/var_dtc_parallel.py
+++ b/GPy/inference/latent_function_inference/var_dtc_parallel.py
@@ -169,11 +169,13 @@ class VarDTC_minibatch(LatentFunctionInference):
 
         Kmm = kern.K(Z).copy()
         diag.add(Kmm, self.const_jitter)
-        Lm = jitchol(Kmm, maxtries=100)
+        if not np.isfinite(Kmm).all():
+            print Kmm
+        Lm = jitchol(Kmm)
 
         LmInvPsi2LmInvT = backsub_both_sides(Lm,psi2_full,transpose='right')
         Lambda = np.eye(Kmm.shape[0])+LmInvPsi2LmInvT
-        LL = jitchol(Lambda, maxtries=100)
+        LL = jitchol(Lambda)
         logdet_L = 2.*np.sum(np.log(np.diag(LL)))
         b = dtrtrs(LL,dtrtrs(Lm,psi1Y_full.T)[0])[0]
         bbt = np.square(b).sum()
diff --git a/GPy/kern/_src/psi_comp/ssrbf_psi_comp.py b/GPy/kern/_src/psi_comp/ssrbf_psi_comp.py
index 18a4d751..f6a24c86 100644
--- a/GPy/kern/_src/psi_comp/ssrbf_psi_comp.py
+++ b/GPy/kern/_src/psi_comp/ssrbf_psi_comp.py
@@ -22,12 +22,14 @@ try:
         # _psi1                NxM
         mu = variational_posterior.mean
         S = variational_posterior.variance
+        gamma = variational_posterior.binary_prob
          
         N,M,Q = mu.shape[0],Z.shape[0],mu.shape[1]
         l2 = np.square(lengthscale)
         log_denom1 = np.log(S/l2+1)
         log_denom2 = np.log(2*S/l2+1)
-        log_gamma,log_gamma1 = variational_posterior.gamma_log_prob()
+        log_gamma = np.log(gamma)
+        log_gamma1 = np.log(1.-gamma)
         variance = float(variance)
         psi0 = np.empty(N)
         psi0[:] = variance
@@ -37,6 +39,7 @@ try:
         from ....util.misc import param_to_array
         S = param_to_array(S)
         mu = param_to_array(mu)
+        gamma = param_to_array(gamma)
         Z = param_to_array(Z)
          
         support_code = """
@@ -79,7 +82,7 @@ try:
             }
         }
         """
-        weave.inline(code, support_code=support_code, arg_names=['psi1','psi2n','N','M','Q','variance','l2','Z','mu','S','log_denom1','log_denom2','log_gamma','log_gamma1'], type_converters=weave.converters.blitz)
+        weave.inline(code, support_code=support_code, arg_names=['psi1','psi2n','N','M','Q','variance','l2','Z','mu','S','gamma','log_denom1','log_denom2','log_gamma','log_gamma1'], type_converters=weave.converters.blitz)
      
         psi2 = psi2n.sum(axis=0)
         return psi0,psi1,psi2,psi2n
@@ -94,12 +97,13 @@ try:
      
         mu = variational_posterior.mean
         S = variational_posterior.variance
+        gamma = variational_posterior.binary_prob
         N,M,Q = mu.shape[0],Z.shape[0],mu.shape[1]
         l2 = np.square(lengthscale)
         log_denom1 = np.log(S/l2+1)
         log_denom2 = np.log(2*S/l2+1)
-        log_gamma,log_gamma1 = variational_posterior.gamma_log_prob()
-        gamma, gamma1 = variational_posterior.gamma_probabilities()
+        log_gamma = np.log(gamma)
+        log_gamma1 = np.log(1.-gamma)
         variance = float(variance)
      
         dvar = np.zeros(1)
@@ -113,6 +117,7 @@ try:
         from ....util.misc import param_to_array
         S = param_to_array(S)
         mu = param_to_array(mu)
+        gamma = param_to_array(gamma)
         Z = param_to_array(Z)
          
         support_code = """
@@ -130,7 +135,6 @@ try:
                         double Zm1q = Z(m1,q);
                         double Zm2q = Z(m2,q);
                         double gnq = gamma(n,q);
-                        double g1nq = gamma1(n,q);
                         double mu_nq = mu(n,q);
                          
                         if(m2==0) {
@@ -156,7 +160,7 @@ try:
                              
                             dmu(n,q) += lpsi1*Zmu*d_exp1/(denom*exp_sum);
                             dS(n,q) += lpsi1*(Zmu2_denom-1.)*d_exp1/(denom*exp_sum)/2.;
-                            dgamma(n,q) += lpsi1*(d_exp1*g1nq-d_exp2*gnq)/exp_sum;
+                            dgamma(n,q) += lpsi1*(d_exp1/gnq-d_exp2/(1.-gnq))/exp_sum;
                             dl(q) += lpsi1*((Zmu2_denom+Snq/lq)/denom*d_exp1+Zm1q*Zm1q/(lq*lq)*d_exp2)/(2.*exp_sum);
                             dZ(m1,q) += lpsi1*(-Zmu/denom*d_exp1-Zm1q/lq*d_exp2)/exp_sum;
                         }
@@ -184,7 +188,7 @@ try:
                          
                         dmu(n,q) += -2.*lpsi2*muZhat/denom*d_exp1/exp_sum;
                         dS(n,q) += lpsi2*(2.*muZhat2_denom-1.)/denom*d_exp1/exp_sum;
-                        dgamma(n,q) += lpsi2*(d_exp1*g1nq-d_exp2*gnq)/exp_sum;
+                        dgamma(n,q) += lpsi2*(d_exp1/gnq-d_exp2/(1.-gnq))/exp_sum;
                         dl(q) += lpsi2*(((Snq/lq+muZhat2_denom)/denom+dZm1m2*dZm1m2/(4.*lq*lq))*d_exp1+Z2/(2.*lq*lq)*d_exp2)/exp_sum;
                         dZ(m1,q) += 2.*lpsi2*((muZhat/denom-dZm1m2/(2*lq))*d_exp1-Zm1q/lq*d_exp2)/exp_sum;                   
                     }
@@ -192,7 +196,7 @@ try:
             }
         }
         """
-        weave.inline(code, support_code=support_code, arg_names=['dL_dpsi1','dL_dpsi2','psi1','psi2n','N','M','Q','variance','l2','Z','mu','S','gamma','gamma1','log_denom1','log_denom2','log_gamma','log_gamma1','dvar','dl','dmu','dS','dgamma','dZ'], type_converters=weave.converters.blitz)
+        weave.inline(code, support_code=support_code, arg_names=['dL_dpsi1','dL_dpsi2','psi1','psi2n','N','M','Q','variance','l2','Z','mu','S','gamma','log_denom1','log_denom2','log_gamma','log_gamma1','dvar','dl','dmu','dS','dgamma','dZ'], type_converters=weave.converters.blitz)
      
         dl *= 2.*lengthscale
         if not ARD:
diff --git a/GPy/models/ss_gplvm.py b/GPy/models/ss_gplvm.py
index a61ad2a0..04006d84 100644
--- a/GPy/models/ss_gplvm.py
+++ b/GPy/models/ss_gplvm.py
@@ -39,7 +39,10 @@ class SSGPLVM(SparseGP_MPI):
             X_variance = np.random.uniform(0,.1,X.shape)
             
         if Gamma is None:
-            gamma = np.random.randn(X.shape[0], input_dim)
+            gamma = np.empty_like(X) # The posterior probabilities of the binary variable in the variational approximation
+            gamma[:] = 0.5 + 0.1 * np.random.randn(X.shape[0], input_dim)
+            gamma[gamma>1.-1e-9] = 1.-1e-9
+            gamma[gamma<1e-9] = 1e-9
         else:
             gamma = Gamma.copy()
                 

From 7fa0c19a88c102516904ad007164f0276a095309 Mon Sep 17 00:00:00 2001
From: Zhenwen Dai <z.dai@sheffield.ac.uk>
Date: Mon, 30 Mar 2015 22:24:48 +0100
Subject: [PATCH 10/10] optimize sslinear kernel

---
 GPy/kern/_src/psi_comp/sslinear_psi_comp.py | 44 +++++++++------------
 1 file changed, 19 insertions(+), 25 deletions(-)

diff --git a/GPy/kern/_src/psi_comp/sslinear_psi_comp.py b/GPy/kern/_src/psi_comp/sslinear_psi_comp.py
index 5f261785..d431cd61 100644
--- a/GPy/kern/_src/psi_comp/sslinear_psi_comp.py
+++ b/GPy/kern/_src/psi_comp/sslinear_psi_comp.py
@@ -37,11 +37,11 @@ def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, Z, variati
 
     # Compute for psi0 and psi1
     mu2S = np.square(mu)+S
-    dL_dvar += np.einsum('n,nq,nq->q',dL_dpsi0,gamma,mu2S) + np.einsum('nm,nq,mq,nq->q',dL_dpsi1,gamma,Z,mu)
-    dL_dgamma += np.einsum('n,q,nq->nq',dL_dpsi0,variance,mu2S) + np.einsum('nm,q,mq,nq->nq',dL_dpsi1,variance,Z,mu)
-    dL_dmu += np.einsum('n,nq,q,nq->nq',dL_dpsi0,gamma,2.*variance,mu) + np.einsum('nm,nq,q,mq->nq',dL_dpsi1,gamma,variance,Z)
-    dL_dS += np.einsum('n,nq,q->nq',dL_dpsi0,gamma,variance)
-    dL_dZ +=  np.einsum('nm,nq,q,nq->mq',dL_dpsi1,gamma, variance,mu)
+    dL_dvar += (dL_dpsi0[:,None]*gamma*mu2S).sum(axis=0) + (dL_dpsi1.T.dot(gamma*mu)*Z).sum(axis=0)
+    dL_dgamma += dL_dpsi0[:,None]*variance*mu2S+ dL_dpsi1.dot(Z)*mu*variance
+    dL_dmu += dL_dpsi0[:,None]*2.*variance*gamma*mu + dL_dpsi1.dot(Z)*gamma*variance
+    dL_dS += dL_dpsi0[:,None]*variance*gamma
+    dL_dZ += dL_dpsi1.T.dot(gamma*mu)*variance
     
     return dL_dvar, dL_dZ, dL_dmu, dL_dS, dL_dgamma
 
@@ -64,29 +64,23 @@ def _psi2computations(dL_dpsi2, variance, Z, mu, S, gamma):
     gamma2 = np.square(gamma)
     variance2 = np.square(variance)
     mu2S = mu2+S # NxQ
-    gvm = np.einsum('nq,nq,q->nq',gamma,mu,variance)
-    common_sum = np.einsum('nq,mq->nm',gvm,Z)
-#     common_sum = np.einsum('nq,q,mq,nq->nm',gamma,variance,Z,mu) # NxM
-    Z_expect = np.einsum('mo,mq,oq->q',dL_dpsi2,Z,Z)
+    gvm = gamma*mu*variance
+    common_sum = gvm.dot(Z.T)
+    Z_expect = (np.dot(dL_dpsi2,Z)*Z).sum(axis=0)
+    Z_expect_var2 = Z_expect*variance2
     dL_dpsi2T = dL_dpsi2+dL_dpsi2.T
-    tmp = np.einsum('mo,oq->mq',dL_dpsi2T,Z)
-    common_expect = np.einsum('mq,nm->nq',tmp,common_sum)
-#     common_expect = np.einsum('mo,mq,no->nq',dL_dpsi2+dL_dpsi2.T,Z,common_sum)
-    Z2_expect = np.einsum('om,nm->no',dL_dpsi2T,common_sum)
-    Z1_expect = np.einsum('om,mq->oq',dL_dpsi2T,Z)
+    common_expect = common_sum.dot(dL_dpsi2T).dot(Z)
+    Z2_expect = common_sum.dot(dL_dpsi2T)
+    Z1_expect = dL_dpsi2T.dot(Z)
     
-    dL_dvar = np.einsum('nq,q,q->q',2.*(gamma*mu2S-gamma2*mu2),variance,Z_expect)+\
-        np.einsum('nq,nq,nq->q',common_expect,gamma,mu)
+    dL_dvar = variance*Z_expect*2.*(gamma*mu2S-gamma2*mu2).sum(axis=0)+(common_expect*gamma*mu).sum(axis=0)
         
-    dL_dgamma = np.einsum('q,q,nq->nq',Z_expect,variance2,(mu2S-2.*gamma*mu2))+\
-        np.einsum('nq,q,nq->nq',common_expect,variance,mu)
+    dL_dgamma = Z_expect_var2*(mu2S-2.*gamma*mu2)+common_expect*mu*variance
+                
+    dL_dmu = Z_expect_var2*mu*2.*(gamma-gamma2) + common_expect*gamma*variance
+
+    dL_dS = gamma*Z_expect_var2
     
-    dL_dmu = np.einsum('q,q,nq,nq->nq',Z_expect,variance2,mu,2.*(gamma-gamma2))+\
-            np.einsum('nq,nq,q->nq',common_expect,gamma,variance)
-                    
-    dL_dS = np.einsum('q,nq,q->nq',Z_expect,gamma,variance2)
-    
-#     dL_dZ = 2.*(np.einsum('om,nq,q,mq,nq->oq',dL_dpsi2,gamma,variance2,Z,(mu2S-gamma*mu2))+np.einsum('om,nq,q,nq,nm->oq',dL_dpsi2,gamma,variance,mu,common_sum))
-    dL_dZ = Z1_expect*np.einsum('nq,q,nq->q',gamma,variance2,(mu2S-gamma*mu2))+np.einsum('nq,q,nq,nm->mq',gamma,variance,mu,Z2_expect)
+    dL_dZ = (gamma*(mu2S-gamma*mu2)).sum(axis=0)*variance2*Z1_expect+ Z2_expect.T.dot(gamma*mu)*variance
 
     return dL_dvar, dL_dgamma, dL_dmu, dL_dS, dL_dZ