From 393b9e94ba1847c6d81ad8ebdbb6cfb4528b8c82 Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Tue, 10 Feb 2015 16:15:37 +0000
Subject: [PATCH 01/13] added more stable expectations for Bernoulli

---
 GPy/likelihoods/bernoulli.py | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/GPy/likelihoods/bernoulli.py b/GPy/likelihoods/bernoulli.py
index ff2ab30a..26de274b 100644
--- a/GPy/likelihoods/bernoulli.py
+++ b/GPy/likelihoods/bernoulli.py
@@ -77,6 +77,32 @@ class Bernoulli(Likelihood):
 
         return Z_hat, mu_hat, sigma2_hat
 
+    def variational_expectations(self, Y, m, v, gh_points=None):
+        if isinstance(self.gp_link, link_functions.Probit):
+
+            if gh_points is None:
+                gh_x, gh_w = np.polynomial.hermite.hermgauss(20)
+            else:
+                gh_x, gh_w = gh_points
+
+            from scipy import stats
+
+            shape = m.shape
+            m,v,Y = m.flatten(), v.flatten(), Y.flatten()
+            Ysign = np.where(Y==1,1,-1)
+            X = gh_x[None,:]*np.sqrt(2.*v[:,None]) + (m*Ysign)[:,None]
+            p = stats.norm.cdf(X)
+            p = np.clip(p, 1e-9, 1.-1e-9) # for numerical stability
+            N = stats.norm.pdf(X)
+            F = np.log(p).dot(gh_w)
+            NoverP = N/p
+            dF_dm = (NoverP*Ysign[:,None]).dot(gh_w)
+            dF_dv = -0.5*(NoverP**2 + NoverP*X).dot(gh_w)
+            return F.reshape(*shape), dF_dm.reshape(*shape), dF_dv.reshape(*shape), None
+        else:
+            raise NotImplementedError
+
+
     def predictive_mean(self, mu, variance, Y_metadata=None):
 
         if isinstance(self.gp_link, link_functions.Probit):

From 482bd1472c7c3bbd3f08cf964c841d922a3e1421 Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Wed, 11 Feb 2015 11:49:45 +0000
Subject: [PATCH 02/13] reconfigured svgp inference a little

---
 GPy/core/svgp.py                              |  6 +--
 .../latent_function_inference/svgp.py         |  8 +---
 GPy/kern/_src/prod.py                         | 40 +++++++++++++------
 3 files changed, 32 insertions(+), 22 deletions(-)

diff --git a/GPy/core/svgp.py b/GPy/core/svgp.py
index 603a64a5..42044b1b 100644
--- a/GPy/core/svgp.py
+++ b/GPy/core/svgp.py
@@ -36,12 +36,12 @@ class SVGP(SparseGP):
         KL_scale = 1.0
 
         import climin.util
-        #Make a climin slicer to make drawing minibatches much quicker
+        #Make a climin slicer to make drawing minibatches much quicker. Annoyingly, this doesn;t pickle.
         self.slicer = climin.util.draw_mini_slices(self.X_all.shape[0], self.batchsize)
         X_batch, Y_batch = self.new_batch()
 
         #create the SVI inference method
-        inf_method = svgp_inf(KL_scale=KL_scale, batch_scale=batch_scale)
+        inf_method = svgp_inf()
 
         SparseGP.__init__(self, X_batch, Y_batch, Z, kernel, likelihood, inference_method=inf_method,
                  name=name, Y_metadata=Y_metadata, normalizer=False)
@@ -53,7 +53,7 @@ class SVGP(SparseGP):
         self.link_parameter(self.m)
 
     def parameters_changed(self):
-        self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.q_u_mean, self.q_u_chol, self.kern, self.X, self.Z, self.likelihood, self.Y, self.Y_metadata)
+        self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.q_u_mean, self.q_u_chol, self.kern, self.X, self.Z, self.likelihood, self.Y, self.Y_metadata, KL_scale=1.0, batch_scale=float(self.X_all.shape[0])/float(self.X.shape[0]))
 
         #update the kernel gradients
         self.kern.update_gradients_full(self.grad_dict['dL_dKmm'], self.Z)
diff --git a/GPy/inference/latent_function_inference/svgp.py b/GPy/inference/latent_function_inference/svgp.py
index ba36b74b..52db242c 100644
--- a/GPy/inference/latent_function_inference/svgp.py
+++ b/GPy/inference/latent_function_inference/svgp.py
@@ -5,11 +5,8 @@ import numpy as np
 from posterior import Posterior
 
 class SVGP(LatentFunctionInference):
-    def __init__(self, KL_scale=1., batch_scale=1.):
-        self.KL_scale = KL_scale
-        self.batch_scale = batch_scale
 
-    def inference(self, q_u_mean, q_u_chol, kern, X, Z, likelihood, Y, Y_metadata=None):
+    def inference(self, q_u_mean, q_u_chol, kern, X, Z, likelihood, Y, Y_metadata=None, KL_scale=1.0, batch_scale=1.0):
         num_inducing = Z.shape[0]
         num_data, num_outputs = Y.shape
 
@@ -44,9 +41,6 @@ class SVGP(LatentFunctionInference):
         dKL_dS = 0.5*(Kmmi[:,:,None] - Si)
         dKL_dKmm = 0.5*num_outputs*Kmmi - 0.5*Kmmi.dot(S.sum(-1)).dot(Kmmi) - 0.5*Kmmim.dot(Kmmim.T)
 
-        KL_scale = self.KL_scale
-        batch_scale = self.batch_scale
-        KL, dKL_dKmm, dKL_dS, dKL_dm = KL_scale*KL, KL_scale*dKL_dKmm, KL_scale*dKL_dS, KL_scale*dKL_dm
 
         #quadrature for the likelihood
         F, dF_dmu, dF_dv, dF_dthetaL = likelihood.variational_expectations(Y, mu, v)
diff --git a/GPy/kern/_src/prod.py b/GPy/kern/_src/prod.py
index dd9a5fe4..bff6d841 100644
--- a/GPy/kern/_src/prod.py
+++ b/GPy/kern/_src/prod.py
@@ -42,25 +42,41 @@ class Prod(CombinationKernel):
         return reduce(np.multiply, (p.Kdiag(X) for p in which_parts))
 
     def update_gradients_full(self, dL_dK, X, X2=None):
-        k = self.K(X,X2)*dL_dK
-        for p in self.parts:
-            p.update_gradients_full(k/p.K(X,X2),X,X2)
+        if len(self.parts)==2:
+            self.parts[0].update_gradients_full(dL_dK*self.parts[1].K(X,X2), X, X2)
+            self.parts[1].update_gradients_full(dL_dK*self.parts[0].K(X,X2), X, X2)
+        else:
+            k = self.K(X,X2)*dL_dK
+            for p in self.parts:
+                p.update_gradients_full(k/p.K(X,X2),X,X2)
 
     def update_gradients_diag(self, dL_dKdiag, X):
-        k = self.Kdiag(X)*dL_dKdiag
-        for p in self.parts:
-            p.update_gradients_diag(k/p.Kdiag(X),X)
+        if len(self.parts)==2:
+            self.parts[0].update_gradients_diag(dL_dKdiag*self.parts[1].Kdiag(X), X)
+            self.parts[1].update_gradients_diag(dL_dKdiag*self.parts[0].Kdiag(X), X)
+        else:
+            k = self.Kdiag(X)*dL_dKdiag
+            for p in self.parts:
+                p.update_gradients_diag(k/p.Kdiag(X),X)
 
     def gradients_X(self, dL_dK, X, X2=None):
         target = np.zeros(X.shape)
-        k = self.K(X,X2)*dL_dK
-        for p in self.parts:
-            target += p.gradients_X(k/p.K(X,X2),X,X2)
+        if len(self.parts)==2:
+            target += self.parts[0].gradients_X(dL_dK*self.parts[1].K(X, X2), X, X2)
+            target += self.parts[1].gradients_X(dL_dK*self.parts[0].K(X, X2), X, X2)
+        else:
+            k = self.K(X,X2)*dL_dK
+            for p in self.parts:
+                target += p.gradients_X(k/p.K(X,X2),X,X2)
         return target
 
     def gradients_X_diag(self, dL_dKdiag, X):
         target = np.zeros(X.shape)
-        k = self.Kdiag(X)*dL_dKdiag
-        for p in self.parts:
-            target += p.gradients_X_diag(k/p.Kdiag(X),X)
+        if len(self.parts)==2:
+            target += self.parts[0].gradients_X_diag(dL_dKdiag*self.parts[1].Kdiag(X), X)
+            target += self.parts[1].gradients_X_diag(dL_dKdiag*self.parts[0].Kdiag(X), X)
+        else:
+            k = self.Kdiag(X)*dL_dKdiag
+            for p in self.parts:
+                target += p.gradients_X_diag(k/p.Kdiag(X),X)
         return target

From 9f7ae611edea307ed185f45c835f08e582f41951 Mon Sep 17 00:00:00 2001
From: Zhenwen Dai <z.dai@sheffield.ac.uk>
Date: Tue, 17 Feb 2015 10:49:06 +0000
Subject: [PATCH 03/13] force set_XY to update the model

---
 GPy/core/gp.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/GPy/core/gp.py b/GPy/core/gp.py
index 25066381..3252ac08 100644
--- a/GPy/core/gp.py
+++ b/GPy/core/gp.py
@@ -124,6 +124,7 @@ class GP(Model):
             else:
                 self.X = ObsAr(X)
         self.update_model(True)
+        self._trigger_params_changed()
 
     def set_X(self,X):
         """

From 6435b83c7af412972214245065af3fd115d2cd6d Mon Sep 17 00:00:00 2001
From: mzwiessele <ibinbei@gmail.com>
Date: Sat, 21 Feb 2015 14:43:10 +0100
Subject: [PATCH 04/13] [sparse gp] prediction with uncertain inputs

---
 GPy/core/sparse_gp.py             | 52 +++++++++++++++++++++++--------
 GPy/models/sparse_gp_minibatch.py | 41 ++----------------------
 2 files changed, 42 insertions(+), 51 deletions(-)

diff --git a/GPy/core/sparse_gp.py b/GPy/core/sparse_gp.py
index 51dbd5db..9004c9c7 100644
--- a/GPy/core/sparse_gp.py
+++ b/GPy/core/sparse_gp.py
@@ -6,7 +6,8 @@ from gp import GP
 from parameterization.param import Param
 from ..inference.latent_function_inference import var_dtc
 from .. import likelihoods
-from parameterization.variational import VariationalPosterior
+from parameterization.variational import VariationalPosterior, NormalPosterior
+from ..util.linalg import mdot
 
 import logging
 from GPy.inference.latent_function_inference.posterior import Posterior
@@ -102,7 +103,15 @@ class SparseGP(GP):
 
     def _raw_predict(self, Xnew, full_cov=False, kern=None):
         """
-        Make a prediction for the latent function values
+        Make a prediction for the latent function values. 
+    
+        For certain inputs we give back a full_cov of shape NxN,
+        if there is missing data, each dimension has its own full_cov of shape NxNxD, and if full_cov is of, 
+        we take only the diagonal elements across N.
+        
+        For uncertain inputs, the SparseGP bound produces a full covariance structure across D, so for full_cov we 
+        return a NxDxD matrix and in the not full_cov case, we return the diagonal elements across D (NxD).
+        This is for both with and without missing data.
         """
 
         if kern is None: kern = self.kern
@@ -121,15 +130,32 @@ class SparseGP(GP):
                 Kxx = kern.Kdiag(Xnew)
                 var = (Kxx - np.sum(np.dot(np.atleast_3d(self.posterior.woodbury_inv).T, Kx) * Kx[None,:,:], 1)).T
         else:
-            Kx = kern.psi1(self.Z, Xnew).T
-            mu = np.dot(Kx.T, self.posterior.woodbury_vector)
-            if full_cov:
-                Kxx = kern.K(Xnew.mean)
-                if self.posterior.woodbury_inv.ndim == 2:
-                    var = Kxx - np.dot(Kx.T, np.dot(self.posterior.woodbury_inv, Kx))
-                elif self.posterior.woodbury_inv.ndim == 3:
-                    var = Kxx[:,:,None] - np.tensordot(np.dot(np.atleast_3d(self.posterior.woodbury_inv).T, Kx).T, Kx, [1,0]).swapaxes(1,2)
-            else:
-                Kxx = kern.psi0(self.Z, Xnew)
-                var = (Kxx - np.sum(np.dot(np.atleast_3d(self.posterior.woodbury_inv).T, Kx) * Kx[None,:,:], 1)).T
+            psi0_star = self.kern.psi0(self.Z, Xnew)
+            psi1_star = self.kern.psi1(self.Z, Xnew)
+            #psi2_star = self.kern.psi2(self.Z, Xnew) # Only possible if we get NxMxM psi2 out of the code.
+            la = self.posterior.woodbury_vector
+            mu = np.dot(psi1_star, la) # TODO: dimensions?
+            
+            if full_cov: 
+                var = np.empty((Xnew.shape[0], la.shape[1], la.shape[1]))
+                di = np.diag_indices(la.shape[1])
+            else: 
+                var = np.empty((Xnew.shape[0], la.shape[1]))
+                
+            for i in range(Xnew.shape[0]):
+                _mu, _var = Xnew.mean.values[[i]], Xnew.variance.values[[i]]
+                psi2_star = self.kern.psi2(self.Z, NormalPosterior(_mu, _var))
+                tmp = (psi2_star[:, :] - psi1_star[[i]].T.dot(psi1_star[[i]]))
+
+                var_ = mdot(la.T, tmp, la)
+                p0 = psi0_star[i]
+                t = self.posterior.woodbury_inv
+                t2 = np.trace(t.T.dot(psi2_star), axis1=1, axis2=2)
+                
+                if full_cov:
+                    var_[di] += p0
+                    var_[di] += -t2
+                    var[i] = var_
+                else:
+                    var[i] = np.diag(var_)+p0-t2
         return mu, var
diff --git a/GPy/models/sparse_gp_minibatch.py b/GPy/models/sparse_gp_minibatch.py
index 8925d4d7..e827bb70 100644
--- a/GPy/models/sparse_gp_minibatch.py
+++ b/GPy/models/sparse_gp_minibatch.py
@@ -3,6 +3,7 @@
 
 import numpy as np
 from ..core.parameterization.param import Param
+from ..core.sparse_gp import SparseGP
 from ..core.gp import GP
 from ..inference.latent_function_inference import var_dtc
 from .. import likelihoods
@@ -16,14 +17,9 @@ from GPy.inference.optimization.stochastics import SparseGPStochastics,\
     #SparseGPMissing
 logger = logging.getLogger("sparse gp")
 
-class SparseGPMiniBatch(GP):
+class SparseGPMiniBatch(SparseGP):
     """
-    A general purpose Sparse GP model
-'''
-Created on 3 Nov 2014
-
-@author: maxz
-'''
+    A general purpose Sparse GP model, allowing missing data and stochastics across dimensions.
 
     This model allows (approximate) inference using variational DTC or FITC
     (Gaussian likelihoods) as well as non-conjugate sparse methods based on
@@ -315,34 +311,3 @@ Created on 3 Nov 2014
         else:
             self.posterior, self._log_marginal_likelihood, self.grad_dict, self.full_values, _ = self._inner_parameters_changed(self.kern, self.X, self.Z, self.likelihood, self.Y_normalized, self.Y_metadata)
             self._outer_values_update(self.full_values)
-
-    def _raw_predict(self, Xnew, full_cov=False, kern=None):
-        """
-        Make a prediction for the latent function values
-        """
-
-        if kern is None: kern = self.kern
-
-        if not isinstance(Xnew, VariationalPosterior):
-            Kx = kern.K(self.Z, Xnew)
-            mu = np.dot(Kx.T, self.posterior.woodbury_vector)
-            if full_cov:
-                Kxx = kern.K(Xnew)
-                if self.posterior.woodbury_inv.ndim == 2:
-                    var = Kxx - np.dot(Kx.T, np.dot(self.posterior.woodbury_inv, Kx))
-                elif self.posterior.woodbury_inv.ndim == 3:
-                    var = Kxx[:,:,None] - np.tensordot(np.dot(np.atleast_3d(self.posterior.woodbury_inv).T, Kx).T, Kx, [1,0]).swapaxes(1,2)
-                var = var
-            else:
-                Kxx = kern.Kdiag(Xnew)
-                var = (Kxx - np.sum(np.dot(np.atleast_3d(self.posterior.woodbury_inv).T, Kx) * Kx[None,:,:], 1)).T
-        else:
-            Kx = kern.psi1(self.Z, Xnew)
-            mu = np.dot(Kx, self.posterior.woodbury_vector)
-            if full_cov:
-                raise NotImplementedError, "TODO"
-            else:
-                Kxx = kern.psi0(self.Z, Xnew)
-                psi2 = kern.psi2(self.Z, Xnew)
-                var = Kxx - np.sum(np.sum(psi2 * Kmmi_LmiBLmi[None, :, :], 1), 1)
-        return mu, var

From b49228d1d0cc84b8abb4c13dd720409072e2f369 Mon Sep 17 00:00:00 2001
From: javiergonzalezh <j.h.gonzalez@sheffield.ac.uk>
Date: Mon, 23 Feb 2015 15:02:50 +0000
Subject: [PATCH 05/13] minor error in corregionalization corrected

---
 GPy/kern/_src/coregionalize.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/GPy/kern/_src/coregionalize.py b/GPy/kern/_src/coregionalize.py
index cbfe1ccb..291402ec 100644
--- a/GPy/kern/_src/coregionalize.py
+++ b/GPy/kern/_src/coregionalize.py
@@ -154,9 +154,9 @@ class Coregionalize(Kern):
     def _gradient_reduce_numpy(self, dL_dK, index, index2):
         index, index2 = index[:,0], index2[:,0]
         dL_dK_small = np.zeros_like(self.B)
-        for i in range(k.output_dim):
+        for i in range(self.output_dim):
             tmp1 = dL_dK[index==i]
-            for j in range(k.output_dim):
+            for j in range(self.output_dim):
                 dL_dK_small[j,i] = tmp1[:,index2==j].sum()
         return dL_dK_small
 

From 11cc5d16f40b2ae7001b643aa94d65579f3e41ab Mon Sep 17 00:00:00 2001
From: Zhenwen Dai <z.dai@sheffield.ac.uk>
Date: Mon, 23 Feb 2015 16:19:14 +0000
Subject: [PATCH 06/13] add save param_array

---
 GPy/core/parameterization/parameter_core.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/GPy/core/parameterization/parameter_core.py b/GPy/core/parameterization/parameter_core.py
index 9a903079..bee160b2 100644
--- a/GPy/core/parameterization/parameter_core.py
+++ b/GPy/core/parameterization/parameter_core.py
@@ -1042,6 +1042,9 @@ class Parameterizable(OptimizationHandlable):
                     p = param_to_array(p)
                     d = f.create_dataset(n,p.shape,dtype=p.dtype)
                     d[:] = p
+                if hasattr(self, 'param_array'):
+                    d = f.create_dataset('param_array',self.param_array.shape, dtype=self.param_array.dtype)
+                    d[:] = self.param_array
                 f.close()
             except:
                 raise 'Fails to write the parameters into a HDF5 file!'

From 85ae035e32bee114b7230cc18665fd9ef864cccd Mon Sep 17 00:00:00 2001
From: mzwiessele <ibinbei@gmail.com>
Date: Tue, 24 Feb 2015 11:55:22 +0000
Subject: [PATCH 07/13] [var_dtc] constant jitter 1e-10

---
 GPy/inference/latent_function_inference/var_dtc.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/GPy/inference/latent_function_inference/var_dtc.py b/GPy/inference/latent_function_inference/var_dtc.py
index d61e7f0f..eaa02009 100644
--- a/GPy/inference/latent_function_inference/var_dtc.py
+++ b/GPy/inference/latent_function_inference/var_dtc.py
@@ -21,7 +21,7 @@ class VarDTC(LatentFunctionInference):
     For efficiency, we sometimes work with the cholesky of Y*Y.T. To save repeatedly recomputing this, we cache it.
 
     """
-    const_jitter = 1e-6
+    const_jitter = 1e-10
     def __init__(self, limit=1):
         #self._YYTfactor_cache = caching.cache()
         from ...util.caching import Cacher

From 72c6b6698376ba5b3772647309d88401caba45be Mon Sep 17 00:00:00 2001
From: mzwiessele <ibinbei@gmail.com>
Date: Tue, 24 Feb 2015 11:56:06 +0000
Subject: [PATCH 08/13] [updateable] update field in observable

---
 GPy/core/parameterization/updateable.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/GPy/core/parameterization/updateable.py b/GPy/core/parameterization/updateable.py
index 278ba8cd..379e92e1 100644
--- a/GPy/core/parameterization/updateable.py
+++ b/GPy/core/parameterization/updateable.py
@@ -11,7 +11,6 @@ class Updateable(Observable):
     A model can be updated or not.
     Make sure updates can be switched on and off.
     """
-    _updates = True
     def __init__(self, *args, **kwargs):
         super(Updateable, self).__init__(*args, **kwargs)
 

From 924899069e583819d634198627eca68a30d44c4a Mon Sep 17 00:00:00 2001
From: mzwiessele <ibinbei@gmail.com>
Date: Tue, 24 Feb 2015 11:56:36 +0000
Subject: [PATCH 09/13] [optimization] experimental auto detect of ipython
 notebook

---
 GPy/core/model.py | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/GPy/core/model.py b/GPy/core/model.py
index c63a29e5..35b046fd 100644
--- a/GPy/core/model.py
+++ b/GPy/core/model.py
@@ -255,7 +255,16 @@ class Model(Parameterized):
         else:
             optimizer = optimization.get_optimizer(optimizer)
             opt = optimizer(start, model=self, max_iters=max_iters, **kwargs)
-
+            
+        try:
+            from IPython.display import display
+            from IPython.html import widgets
+            display(widgets.TextWidget())
+            ipython_notebook = True
+        except:
+            # Not in Ipython notebook
+            ipython_notebook = False
+            
         with VerboseOptimization(self, opt, maxiters=max_iters, verbose=messages, ipython_notebook=ipython_notebook) as vo:
             opt.run(f_fp=self._objective_grads, f=self._objective, fp=self._grads)
             vo.finish(opt)
@@ -402,7 +411,7 @@ class Model(Parameterized):
         model_details = [['<b>Model</b>', self.name + '<br>'],
                          ['<b>Log-likelihood</b>', '{}<br>'.format(float(self.log_likelihood()))],
                          ["<b>Number of Parameters</b>", '{}<br>'.format(self.size)],
-                         ["<b>Updates</b>", '{}<br>'.format(self._updates)],
+                         ["<b>Updates</b>", '{}<br>'.format(self._update_on)],
                          ]
         from operator import itemgetter
         to_print = ["""<style type="text/css">
@@ -419,7 +428,7 @@ class Model(Parameterized):
         model_details = [['Name', self.name],
                          ['Log-likelihood', '{}'.format(float(self.log_likelihood()))],
                          ["Number of Parameters", '{}'.format(self.size)],
-                         ["Updates", '{}'.format(self._updates)],
+                         ["Updates", '{}'.format(self._update_on)],
                          ]
         from operator import itemgetter
         max_len = reduce(lambda a, b: max(len(b[0]), a), model_details, 0)

From fc4b38fe59b11120ae0928efc598cc99f3bac215 Mon Sep 17 00:00:00 2001
From: mzwiessele <ibinbei@gmail.com>
Date: Thu, 26 Feb 2015 13:51:47 +0000
Subject: [PATCH 10/13] [sparse gp] prediction without missing data and
 uncertain inputs was bugged

---
 GPy/core/sparse_gp.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/GPy/core/sparse_gp.py b/GPy/core/sparse_gp.py
index 9004c9c7..005ef2ac 100644
--- a/GPy/core/sparse_gp.py
+++ b/GPy/core/sparse_gp.py
@@ -149,7 +149,7 @@ class SparseGP(GP):
 
                 var_ = mdot(la.T, tmp, la)
                 p0 = psi0_star[i]
-                t = self.posterior.woodbury_inv
+                t = np.atleast_3d(self.posterior.woodbury_inv)
                 t2 = np.trace(t.T.dot(psi2_star), axis1=1, axis2=2)
                 
                 if full_cov:

From f1e77604039917cfe5b29b8fa09a2328651e6dfb Mon Sep 17 00:00:00 2001
From: mzwiessele <ibinbei@gmail.com>
Date: Thu, 26 Feb 2015 13:52:20 +0000
Subject: [PATCH 11/13] [verbose optimization] added automatic detection of
 ipython notebook support, this is experimental

---
 GPy/core/model.py                | 13 ++-----------
 GPy/core/verbose_optimization.py | 13 +++++++++----
 2 files changed, 11 insertions(+), 15 deletions(-)

diff --git a/GPy/core/model.py b/GPy/core/model.py
index 35b046fd..c5d318e7 100644
--- a/GPy/core/model.py
+++ b/GPy/core/model.py
@@ -213,7 +213,7 @@ class Model(Parameterized):
             self.obj_grads = np.clip(self._transform_gradients(self.objective_function_gradients()), -1e10, 1e10)
         return obj_f, self.obj_grads
 
-    def optimize(self, optimizer=None, start=None, messages=False, max_iters=1000, ipython_notebook=False, **kwargs):
+    def optimize(self, optimizer=None, start=None, messages=False, max_iters=1000, ipython_notebook=True, **kwargs):
         """
         Optimize the model using self.log_likelihood and self.log_likelihood_gradient, as well as self.priors.
 
@@ -255,16 +255,7 @@ class Model(Parameterized):
         else:
             optimizer = optimization.get_optimizer(optimizer)
             opt = optimizer(start, model=self, max_iters=max_iters, **kwargs)
-            
-        try:
-            from IPython.display import display
-            from IPython.html import widgets
-            display(widgets.TextWidget())
-            ipython_notebook = True
-        except:
-            # Not in Ipython notebook
-            ipython_notebook = False
-            
+                        
         with VerboseOptimization(self, opt, maxiters=max_iters, verbose=messages, ipython_notebook=ipython_notebook) as vo:
             opt.run(f_fp=self._objective_grads, f=self._objective, fp=self._grads)
             vo.finish(opt)
diff --git a/GPy/core/verbose_optimization.py b/GPy/core/verbose_optimization.py
index 78b6127e..1a87b3da 100644
--- a/GPy/core/verbose_optimization.py
+++ b/GPy/core/verbose_optimization.py
@@ -11,9 +11,8 @@ def exponents(fnow, current_grad):
     return np.sign(exps) * np.log10(exps).astype(int)
 
 class VerboseOptimization(object):
-    def __init__(self, model, opt, maxiters, verbose=True, current_iteration=0, ipython_notebook=False):
+    def __init__(self, model, opt, maxiters, verbose=False, current_iteration=0, ipython_notebook=True):
         self.verbose = verbose
-        self.ipython_notebook = ipython_notebook
         if self.verbose:
             self.model = model
             self.iteration = current_iteration
@@ -26,13 +25,18 @@ class VerboseOptimization(object):
 
             self.update()
 
-            if self.ipython_notebook:
+            try:
                 from IPython.display import display
                 from IPython.html.widgets import FloatProgressWidget, HTMLWidget, ContainerWidget
                 self.text = HTMLWidget()
                 self.progress = FloatProgressWidget()
                 self.model_show = HTMLWidget()
+                self.ipython_notebook = ipython_notebook
+            except:
+                # Not in Ipython notebook
+                self.ipython_notebook = False
 
+            if self.ipython_notebook:
                 self.text.set_css('width', '100%')
                 #self.progress.set_css('width', '100%')
 
@@ -140,6 +144,7 @@ class VerboseOptimization(object):
             self.print_out()
 
             if not self.ipython_notebook:
-                print
+                print ''
                 print 'Optimization finished in {0:.5g} Seconds'.format(self.stop-self.start)
+                print 'Optimization status: {0:.5g}'.format(self.status)
                 print

From ed6da422eb27153d5a70b3633f45ff090af05298 Mon Sep 17 00:00:00 2001
From: mzwiessele <ibinbei@gmail.com>
Date: Thu, 26 Feb 2015 14:24:07 +0000
Subject: [PATCH 12/13] [pickling errors] due to too little constant jitter,
 the gradient checks in pickle tests did not pass

---
 GPy/inference/latent_function_inference/var_dtc.py          | 2 +-
 GPy/inference/latent_function_inference/var_dtc_parallel.py | 2 +-
 GPy/testing/pickle_tests.py                                 | 2 --
 3 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/GPy/inference/latent_function_inference/var_dtc.py b/GPy/inference/latent_function_inference/var_dtc.py
index eaa02009..9c4d51bb 100644
--- a/GPy/inference/latent_function_inference/var_dtc.py
+++ b/GPy/inference/latent_function_inference/var_dtc.py
@@ -21,7 +21,7 @@ class VarDTC(LatentFunctionInference):
     For efficiency, we sometimes work with the cholesky of Y*Y.T. To save repeatedly recomputing this, we cache it.
 
     """
-    const_jitter = 1e-10
+    const_jitter = 1e-8
     def __init__(self, limit=1):
         #self._YYTfactor_cache = caching.cache()
         from ...util.caching import Cacher
diff --git a/GPy/inference/latent_function_inference/var_dtc_parallel.py b/GPy/inference/latent_function_inference/var_dtc_parallel.py
index 2816d578..cac69872 100644
--- a/GPy/inference/latent_function_inference/var_dtc_parallel.py
+++ b/GPy/inference/latent_function_inference/var_dtc_parallel.py
@@ -24,7 +24,7 @@ class VarDTC_minibatch(LatentFunctionInference):
     For efficiency, we sometimes work with the cholesky of Y*Y.T. To save repeatedly recomputing this, we cache it.
 
     """
-    const_jitter = 1e-6
+    const_jitter = 1e-8
     def __init__(self, batchsize=None, limit=1, mpi_comm=None):
 
         self.batchsize = batchsize
diff --git a/GPy/testing/pickle_tests.py b/GPy/testing/pickle_tests.py
index d6d6f923..c79e9914 100644
--- a/GPy/testing/pickle_tests.py
+++ b/GPy/testing/pickle_tests.py
@@ -138,8 +138,6 @@ class Test(ListDictTestCase):
         self.assertIsNot(par.gradient_full, pcopy.gradient_full)
         self.assertTrue(pcopy.checkgrad())
         self.assert_(np.any(pcopy.gradient!=0.0))
-        pcopy.optimize('bfgs')
-        par.optimize('bfgs')
         np.testing.assert_allclose(pcopy.param_array, par.param_array, atol=1e-6)
         par.randomize()
         with tempfile.TemporaryFile('w+b') as f:

From 46e67c03cf7cead6a6a68776026c2ea7dfaf94de Mon Sep 17 00:00:00 2001
From: frb-yousefi <fariba.yousefi@gmail.com>
Date: Fri, 27 Feb 2015 13:23:08 +0000
Subject: [PATCH 13/13] DGPLVM

---
 GPy/core/parameterization/priors.py | 236 +++++++++++++++++++++++++++-
 1 file changed, 231 insertions(+), 5 deletions(-)

diff --git a/GPy/core/parameterization/priors.py b/GPy/core/parameterization/priors.py
index 906a5774..8c0c4b44 100644
--- a/GPy/core/parameterization/priors.py
+++ b/GPy/core/parameterization/priors.py
@@ -549,7 +549,8 @@ class DGPLVM(Prior):
         M_i = np.zeros((self.classnum, self.dim))
         for i in cls:
             # Mean of each class
-            M_i[i] = np.mean(cls[i], axis=0)
+	    class_i = cls[i]
+            M_i[i] = np.mean(class_i, axis=0)
         return M_i
 
     # Adding data points as tuple to the dictionary so that we can access indices
@@ -661,7 +662,8 @@ class DGPLVM(Prior):
         Sw = self.compute_Sw(cls, M_i)
         # Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))
         #Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
-        Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
+        #Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
+	Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.1)[0]
         return (-1 / self.sigma2) * np.trace(Sb_inv_N.dot(Sw))
 
     # This function calculates derivative of the log of prior function
@@ -680,8 +682,9 @@ class DGPLVM(Prior):
 
         # Calculating inverse of Sb and its transpose and minus
         # Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))
-        # Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
-        Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
+        #Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
+        #Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
+	Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.1)[0]
         Sb_inv_N_trans = np.transpose(Sb_inv_N)
         Sb_inv_N_trans_minus = -1 * Sb_inv_N_trans
         Sw_trans = np.transpose(Sw)
@@ -706,7 +709,230 @@ class DGPLVM(Prior):
         return np.random.rand(n)  # A WRONG implementation
 
     def __str__(self):
-        return 'DGPLVM_prior'
+        return 'DGPLVM_prior_Raq'
+
+
+
+class DGPLVM_T(Prior):
+    """
+    Implementation of the Discriminative Gaussian Process Latent Variable model paper, by Raquel.
+
+    :param sigma2: constant
+
+    .. Note:: DGPLVM for Classification paper implementation
+
+    """
+    domain = _REAL
+    # _instances = []
+    # def __new__(cls, mu, sigma): # Singleton:
+    #     if cls._instances:
+    #         cls._instances[:] = [instance for instance in cls._instances if instance()]
+    #         for instance in cls._instances:
+    #             if instance().mu == mu and instance().sigma == sigma:
+    #                 return instance()
+    #     o = super(Prior, cls).__new__(cls, mu, sigma)
+    #     cls._instances.append(weakref.ref(o))
+    #     return cls._instances[-1]()
+
+    def __init__(self, sigma2, lbl, x_shape, vec):
+        self.sigma2 = sigma2
+        # self.x = x
+        self.lbl = lbl
+        self.classnum = lbl.shape[1]
+        self.datanum = lbl.shape[0]
+        self.x_shape = x_shape
+        self.dim = x_shape[1]
+	self.vec = vec
+
+
+    def get_class_label(self, y):
+        for idx, v in enumerate(y):
+            if v == 1:
+                return idx
+        return -1
+
+    # This function assigns each data point to its own class
+    # and returns the dictionary which contains the class name and parameters.
+    def compute_cls(self, x):
+        cls = {}
+        # Appending each data point to its proper class
+        for j in xrange(self.datanum):
+            class_label = self.get_class_label(self.lbl[j])
+            if class_label not in cls:
+                cls[class_label] = []
+            cls[class_label].append(x[j])
+        return cls
+
+    # This function computes mean of each class. The mean is calculated through each dimension
+    def compute_Mi(self, cls, vec):
+        M_i = np.zeros((self.classnum, self.dim))
+        for i in cls:
+            # Mean of each class
+	    class_i = np.multiply(cls[i],vec)
+            M_i[i] = np.mean(class_i, axis=0)
+        return M_i
+
+    # Adding data points as tuple to the dictionary so that we can access indices
+    def compute_indices(self, x):
+        data_idx = {}
+        for j in xrange(self.datanum):
+            class_label = self.get_class_label(self.lbl[j])
+            if class_label not in data_idx:
+                data_idx[class_label] = []
+            t = (j, x[j])
+            data_idx[class_label].append(t)
+        return data_idx
+
+    # Adding indices to the list so we can access whole the indices
+    def compute_listIndices(self, data_idx):
+        lst_idx = []
+        lst_idx_all = []
+        for i in data_idx:
+            if len(lst_idx) == 0:
+                pass
+                #Do nothing, because it is the first time list is created so is empty
+            else:
+                lst_idx = []
+            # Here we put indices of each class in to the list called lst_idx_all
+            for m in xrange(len(data_idx[i])):
+                lst_idx.append(data_idx[i][m][0])
+            lst_idx_all.append(lst_idx)
+        return lst_idx_all
+
+    # This function calculates between classes variances
+    def compute_Sb(self, cls, M_i, M_0):
+        Sb = np.zeros((self.dim, self.dim))
+        for i in cls:
+            B = (M_i[i] - M_0).reshape(self.dim, 1)
+            B_trans = B.transpose()
+            Sb += (float(len(cls[i])) / self.datanum) * B.dot(B_trans)
+        return Sb
+
+    # This function calculates within classes variances
+    def compute_Sw(self, cls, M_i):
+        Sw = np.zeros((self.dim, self.dim))
+        for i in cls:
+            N_i = float(len(cls[i]))
+            W_WT = np.zeros((self.dim, self.dim))
+            for xk in cls[i]:
+                W = (xk - M_i[i])
+                W_WT += np.outer(W, W)
+            Sw += (N_i / self.datanum) * ((1. / N_i) * W_WT)
+        return Sw
+
+    # Calculating beta and Bi for Sb
+    def compute_sig_beta_Bi(self, data_idx, M_i, M_0, lst_idx_all):
+        # import pdb
+        # pdb.set_trace()
+        B_i = np.zeros((self.classnum, self.dim))
+        Sig_beta_B_i_all = np.zeros((self.datanum, self.dim))
+        for i in data_idx:
+            # pdb.set_trace()
+            # Calculating Bi
+            B_i[i] = (M_i[i] - M_0).reshape(1, self.dim)
+        for k in xrange(self.datanum):
+            for i in data_idx:
+                N_i = float(len(data_idx[i]))
+                if k in lst_idx_all[i]:
+                    beta = (float(1) / N_i) - (float(1) / self.datanum)
+                    Sig_beta_B_i_all[k] += float(N_i) / self.datanum * (beta * B_i[i])
+                else:
+                    beta = -(float(1) / self.datanum)
+                    Sig_beta_B_i_all[k] += float(N_i) / self.datanum * (beta * B_i[i])
+        Sig_beta_B_i_all = Sig_beta_B_i_all.transpose()
+        return Sig_beta_B_i_all
+
+
+    # Calculating W_j s separately so we can access all the W_j s anytime
+    def compute_wj(self, data_idx, M_i):
+        W_i = np.zeros((self.datanum, self.dim))
+        for i in data_idx:
+            N_i = float(len(data_idx[i]))
+            for tpl in data_idx[i]:
+                xj = tpl[1]
+                j = tpl[0]
+                W_i[j] = (xj - M_i[i])
+        return W_i
+
+    # Calculating alpha and Wj for Sw
+    def compute_sig_alpha_W(self, data_idx, lst_idx_all, W_i):
+        Sig_alpha_W_i = np.zeros((self.datanum, self.dim))
+        for i in data_idx:
+            N_i = float(len(data_idx[i]))
+            for tpl in data_idx[i]:
+                k = tpl[0]
+                for j in lst_idx_all[i]:
+                    if k == j:
+                        alpha = 1 - (float(1) / N_i)
+                        Sig_alpha_W_i[k] += (alpha * W_i[j])
+                    else:
+                        alpha = 0 - (float(1) / N_i)
+                        Sig_alpha_W_i[k] += (alpha * W_i[j])
+        Sig_alpha_W_i = (1. / self.datanum) * np.transpose(Sig_alpha_W_i)
+        return Sig_alpha_W_i
+
+    # This function calculates log of our prior
+    def lnpdf(self, x):
+        x = x.reshape(self.x_shape)
+        cls = self.compute_cls(x)
+        M_0 = np.mean(x, axis=0)
+        M_i = self.compute_Mi(cls, self.vec)
+        Sb = self.compute_Sb(cls, M_i, M_0)
+        Sw = self.compute_Sw(cls, M_i)
+        # Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))
+        #Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
+	#print 'SB_inv: ', Sb_inv_N
+        #Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
+	Sb_inv_N = pdinv(Sb+np.eye(Sb.shape[0])*0.1)[0]
+        return (-1 / self.sigma2) * np.trace(Sb_inv_N.dot(Sw))
+
+    # This function calculates derivative of the log of prior function
+    def lnpdf_grad(self, x):
+        x = x.reshape(self.x_shape)
+        cls = self.compute_cls(x)
+        M_0 = np.mean(x, axis=0)
+        M_i = self.compute_Mi(cls, self.vec)
+        Sb = self.compute_Sb(cls, M_i, M_0)
+        Sw = self.compute_Sw(cls, M_i)
+        data_idx = self.compute_indices(x)
+        lst_idx_all = self.compute_listIndices(data_idx)
+        Sig_beta_B_i_all = self.compute_sig_beta_Bi(data_idx, M_i, M_0, lst_idx_all)
+        W_i = self.compute_wj(data_idx, M_i)
+        Sig_alpha_W_i = self.compute_sig_alpha_W(data_idx, lst_idx_all, W_i)
+
+        # Calculating inverse of Sb and its transpose and minus
+        # Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))
+        #Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
+	#print 'SB_inv: ',Sb_inv_N
+        #Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
+	Sb_inv_N = pdinv(Sb+np.eye(Sb.shape[0])*0.1)[0]
+        Sb_inv_N_trans = np.transpose(Sb_inv_N)
+        Sb_inv_N_trans_minus = -1 * Sb_inv_N_trans
+        Sw_trans = np.transpose(Sw)
+
+        # Calculating DJ/DXk
+        DJ_Dxk = 2 * (
+            Sb_inv_N_trans_minus.dot(Sw_trans).dot(Sb_inv_N_trans).dot(Sig_beta_B_i_all) + Sb_inv_N_trans.dot(
+                Sig_alpha_W_i))
+        # Calculating derivative of the log of the prior
+        DPx_Dx = ((-1 / self.sigma2) * DJ_Dxk)
+        return DPx_Dx.T
+
+    # def frb(self, x):
+    #     from functools import partial
+    #     from GPy.models import GradientChecker
+    #     f = partial(self.lnpdf)
+    #     df = partial(self.lnpdf_grad)
+    #     grad = GradientChecker(f, df, x, 'X')
+    #     grad.checkgrad(verbose=1)
+
+    def rvs(self, n):
+        return np.random.rand(n)  # A WRONG implementation
+
+    def __str__(self):
+        return 'DGPLVM_prior_Raq_TTT'
+
+
 
 class HalfT(Prior):
     """