Merge remote-tracking branch 'upstream/devel' into devel

Conflicts:
	GPy/kern/__init__.py
tjhgit committed 2015-03-08 09:42:33 +01:00
commit e115778d74
36 changed files with 2032 additions and 196 deletions

View file

@ -124,6 +124,7 @@ class GP(Model):
else:
self.X = ObsAr(X)
self.update_model(True)
self._trigger_params_changed()
def set_X(self,X):
"""

View file

@ -154,7 +154,7 @@ class Model(Parameterized):
"""
return -(self._log_likelihood_gradients() + self._log_prior_gradients())
def _objective_grads(self, x):
def _grads(self, x):
"""
Gets the gradients from the likelihood and the priors.
@ -200,7 +200,7 @@ class Model(Parameterized):
return np.inf
return obj
def _objective_and_grads(self, x):
def _objective_grads(self, x):
try:
self.optimizer_array = x
obj_f, self.obj_grads = self.objective_function(), self._transform_gradients(self.objective_function_gradients())
@ -213,7 +213,7 @@ class Model(Parameterized):
self.obj_grads = np.clip(self._transform_gradients(self.objective_function_gradients()), -1e10, 1e10)
return obj_f, self.obj_grads
def optimize(self, optimizer=None, start=None, messages=False, max_iters=1000, ipython_notebook=False, **kwargs):
def optimize(self, optimizer=None, start=None, messages=False, max_iters=1000, ipython_notebook=True, **kwargs):
"""
Optimize the model using self.log_likelihood and self.log_likelihood_gradient, as well as self.priors.
@ -255,9 +255,10 @@ class Model(Parameterized):
else:
optimizer = optimization.get_optimizer(optimizer)
opt = optimizer(start, model=self, max_iters=max_iters, **kwargs)
with VerboseOptimization(self, maxiters=max_iters, verbose=messages, ipython_notebook=ipython_notebook):
opt.run(f_fp=self._objective_and_grads, f=self._objective, fp=self._objective_grads)
with VerboseOptimization(self, opt, maxiters=max_iters, verbose=messages, ipython_notebook=ipython_notebook) as vo:
opt.run(f_fp=self._objective_grads, f=self._objective, fp=self._grads)
vo.finish(opt)
self.optimization_runs.append(opt)
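For orientation, a minimal usage sketch (not part of this diff; the data and hyperparameters are made up) of the optimize signature above, which now routes progress through the reworked VerboseOptimization:

import numpy as np
import GPy

X = np.random.uniform(-3., 3., (20, 1))
Y = np.sin(X) + np.random.randn(20, 1) * 0.05
m = GPy.models.GPRegression(X, Y)
# messages=True drives the VerboseOptimization display; ipython_notebook
# now defaults to True and falls back to text output outside a notebook.
m.optimize('bfgs', messages=True, max_iters=500)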
@ -314,7 +315,7 @@ class Model(Parameterized):
# evaluate around the point x
f1 = self._objective(x + dx)
f2 = self._objective(x - dx)
gradient = self._objective_grads(x)
gradient = self._grads(x)
dx = dx[transformed_index]
gradient = gradient[transformed_index]
@ -360,7 +361,7 @@ class Model(Parameterized):
print "No free parameters to check"
return
gradient = self._objective_grads(x).copy()
gradient = self._grads(x).copy()
gradient = np.where(gradient == 0, 1e-312, gradient)
ret = True
for nind, xind in itertools.izip(param_index, transformed_index):
@ -401,12 +402,14 @@ class Model(Parameterized):
model_details = [['<b>Model</b>', self.name + '<br>'],
['<b>Log-likelihood</b>', '{}<br>'.format(float(self.log_likelihood()))],
["<b>Number of Parameters</b>", '{}<br>'.format(self.size)],
["<b>Updates</b>", '{}<br>'.format(self._updates)],
["<b>Updates</b>", '{}<br>'.format(self._update_on)],
]
from operator import itemgetter
to_print = ["""<style type="text/css">
.pd{
font-family:"Courier New", Courier, monospace !important;
font-family: "Courier New", Courier, monospace !important;
width: 100%;
padding: 3px;
}
</style>\n"""] + ["<p class=pd>"] + ["{}: {}".format(name, detail) for name, detail in model_details] + ["</p>"]
to_print.append(super(Model, self)._repr_html_())
@ -416,7 +419,7 @@ class Model(Parameterized):
model_details = [['Name', self.name],
['Log-likelihood', '{}'.format(float(self.log_likelihood()))],
["Number of Parameters", '{}'.format(self.size)],
["Updates", '{}'.format(self._updates)],
["Updates", '{}'.format(self._update_on)],
]
from operator import itemgetter
max_len = reduce(lambda a, b: max(len(b[0]), a), model_details, 0)

View file

@ -14,6 +14,10 @@ class Observable(object):
super(Observable, self).__init__()
from lists_and_dicts import ObserverList
self.observers = ObserverList()
self._update_on = True
def set_updates(self, on=True):
self._update_on = on
def add_observer(self, observer, callble, priority=0):
"""
@ -51,15 +55,16 @@ class Observable(object):
:param min_priority: only notify observers with priority > min_priority
if min_priority is None, notify all observers in order
"""
if which is None:
which = self
if min_priority is None:
[callble(self, which=which) for _, _, callble in self.observers]
else:
for p, _, callble in self.observers:
if p <= min_priority:
break
callble(self, which=which)
if self._update_on:
if which is None:
which = self
if min_priority is None:
[callble(self, which=which) for _, _, callble in self.observers]
else:
for p, _, callble in self.observers:
if p <= min_priority:
break
callble(self, which=which)
def change_priority(self, observer, callble, priority):
self.remove_observer(observer, callble)
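A minimal sketch (not from this diff; the model and parameter regexes are illustrative) of the new _update_on switch from a user's perspective:

import numpy as np
import GPy

X = np.random.uniform(-3., 3., (20, 1))
Y = np.sin(X) + np.random.randn(20, 1) * 0.05
m = GPy.models.GPRegression(X, Y)

m.update_model(False)    # _update_on off: notify_observers becomes a no-op
m['.*lengthscale'] = 2.  # no immediate recomputation
m['.*variance'].fix()    # nor here
m.update_model(True)     # one recomputation, triggered now

This is the same pattern the new test_model_updates test below exercises.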

View file

@ -84,6 +84,7 @@ class Param(Parameterizable, ObsAr):
self._original_ = getattr(obj, '_original_', None)
self._name = getattr(obj, '_name', None)
self._gradient_array_ = getattr(obj, '_gradient_array_', None)
self._update_on = getattr(obj, '_update_on', None)
self.constraints = getattr(obj, 'constraints', None)
self.priors = getattr(obj, 'priors', None)
@ -273,7 +274,7 @@ class Param(Parameterizable, ObsAr):
header = header_format.format(x=self.hierarchy_name(), c=__constraints_name__, i=__index_name__, t=__tie_name__, p=__priors_name__) # nice header for printing
if not ties: ties = itertools.cycle([''])
return "\n".join(["""<style type="text/css">
.tg {padding:2px 3px;word-break:normal;border-collapse:collapse;border-spacing:0;border-color:#DCDCDC;margin:0px auto;}
.tg {padding:2px 3px;word-break:normal;border-collapse:collapse;border-spacing:0;border-color:#DCDCDC;margin:0px auto;width:100%;}
.tg td{font-family:"Courier New", Courier, monospace !important;font-weight:bold;color:#444;background-color:#F7FDFA;border-style:solid;border-width:1px;overflow:hidden;word-break:normal;border-color:#DCDCDC;}
.tg th{font-family:"Courier New", Courier, monospace !important;font-weight:normal;color:#fff;background-color:#26ADE4;border-style:solid;border-width:1px;overflow:hidden;word-break:normal;border-color:#DCDCDC;}
.tg .tg-left{font-family:"Courier New", Courier, monospace !important;font-weight:normal;text-align:left;}
@ -360,7 +361,7 @@ class ParamConcatenation(object):
#===========================================================================
def update_all_params(self):
for par in self.parents:
par.notify_observers()
par.trigger_update(trigger_parent=False)
def constrain(self, constraint, warning=True):
[param.constrain(constraint, trigger_parent=False) for param in self.params]

View file

@ -471,7 +471,7 @@ class Indexable(Nameable, Updateable):
self.param_array[...] = transform.initialize(self.param_array)
reconstrained = self.unconstrain()
added = self._add_to_index_operations(self.constraints, reconstrained, transform, warning)
self.notify_observers(self, None if trigger_parent else -np.inf)
self.trigger_update(trigger_parent)
return added
def unconstrain(self, *transforms):
@ -1042,6 +1042,9 @@ class Parameterizable(OptimizationHandlable):
p = param_to_array(p)
d = f.create_dataset(n,p.shape,dtype=p.dtype)
d[:] = p
if hasattr(self, 'param_array'):
d = f.create_dataset('param_array',self.param_array.shape, dtype=self.param_array.dtype)
d[:] = self.param_array
f.close()
except:
raise IOError('Failed to write the parameters into the HDF5 file!')

View file

@ -156,7 +156,7 @@ class Parameterized(Parameterizable):
p._parent_index_ += 1
self.parameters.insert(index, param)
param.add_observer(self, self._pass_through_notify_observers, -1000)
param.add_observer(self, self._pass_through_notify_observers, -np.inf)
parent = self
while parent is not None:
@ -296,7 +296,7 @@ class Parameterized(Parameterizable):
self.param_array[name] = value
except:
raise ValueError, "Setting by slice or index only allowed with array-like"
self._trigger_params_changed()
self.trigger_update()
else:
try: param = self.__getitem__(name, paramlist)
except: raise
@ -393,7 +393,7 @@ class Parameterized(Parameterizable):
</tr>""".format(name=name)
to_print.insert(0, header)
style = """<style type="text/css">
.tg {font-family:"Courier New", Courier, monospace !important;padding:2px 3px;word-break:normal;border-collapse:collapse;border-spacing:0;border-color:#DCDCDC;margin:0px auto;}
.tg {font-family:"Courier New", Courier, monospace !important;padding:2px 3px;word-break:normal;border-collapse:collapse;border-spacing:0;border-color:#DCDCDC;margin:0px auto;width:100%;}
.tg td{font-family:"Courier New", Courier, monospace !important;font-weight:bold;color:#444;background-color:#F7FDFA;border-style:solid;border-width:1px;overflow:hidden;word-break:normal;border-color:#DCDCDC;}
.tg th{font-family:"Courier New", Courier, monospace !important;font-weight:normal;color:#fff;background-color:#26ADE4;border-style:solid;border-width:1px;overflow:hidden;word-break:normal;border-color:#DCDCDC;}
.tg .tg-left{font-family:"Courier New", Courier, monospace !important;font-weight:normal;text-align:left;}

View file

@ -549,7 +549,8 @@ class DGPLVM(Prior):
M_i = np.zeros((self.classnum, self.dim))
for i in cls:
# Mean of each class
M_i[i] = np.mean(cls[i], axis=0)
class_i = cls[i]
M_i[i] = np.mean(class_i, axis=0)
return M_i
# Adding data points as tuple to the dictionary so that we can access indices
@ -661,7 +662,8 @@ class DGPLVM(Prior):
Sw = self.compute_Sw(cls, M_i)
# Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))
#Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
#Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.1)[0]
return (-1 / self.sigma2) * np.trace(Sb_inv_N.dot(Sw))
# This function calculates derivative of the log of prior function
@ -680,8 +682,9 @@ class DGPLVM(Prior):
# Calculating inverse of Sb and its transpose and minus
# Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))
# Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
#Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
#Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.1)[0]
Sb_inv_N_trans = np.transpose(Sb_inv_N)
Sb_inv_N_trans_minus = -1 * Sb_inv_N_trans
Sw_trans = np.transpose(Sw)
@ -706,7 +709,230 @@ class DGPLVM(Prior):
return np.random.rand(n) # A WRONG implementation
def __str__(self):
return 'DGPLVM_prior'
return 'DGPLVM_prior_Raq'
class DGPLVM_T(Prior):
"""
Implementation of the Discriminative Gaussian Process Latent Variable model paper, by Raquel.
:param sigma2: constant
.. Note:: DGPLVM for Classification paper implementation
"""
domain = _REAL
# _instances = []
# def __new__(cls, mu, sigma): # Singleton:
# if cls._instances:
# cls._instances[:] = [instance for instance in cls._instances if instance()]
# for instance in cls._instances:
# if instance().mu == mu and instance().sigma == sigma:
# return instance()
# o = super(Prior, cls).__new__(cls, mu, sigma)
# cls._instances.append(weakref.ref(o))
# return cls._instances[-1]()
def __init__(self, sigma2, lbl, x_shape, vec):
self.sigma2 = sigma2
# self.x = x
self.lbl = lbl
self.classnum = lbl.shape[1]
self.datanum = lbl.shape[0]
self.x_shape = x_shape
self.dim = x_shape[1]
self.vec = vec
def get_class_label(self, y):
for idx, v in enumerate(y):
if v == 1:
return idx
return -1
# This function assigns each data point to its own class
# and returns the dictionary which contains the class name and parameters.
def compute_cls(self, x):
cls = {}
# Appending each data point to its proper class
for j in xrange(self.datanum):
class_label = self.get_class_label(self.lbl[j])
if class_label not in cls:
cls[class_label] = []
cls[class_label].append(x[j])
return cls
# This function computes the mean of each class. The mean is calculated along each dimension
def compute_Mi(self, cls, vec):
M_i = np.zeros((self.classnum, self.dim))
for i in cls:
# Mean of each class
class_i = np.multiply(cls[i],vec)
M_i[i] = np.mean(class_i, axis=0)
return M_i
# Adding data points as tuple to the dictionary so that we can access indices
def compute_indices(self, x):
data_idx = {}
for j in xrange(self.datanum):
class_label = self.get_class_label(self.lbl[j])
if class_label not in data_idx:
data_idx[class_label] = []
t = (j, x[j])
data_idx[class_label].append(t)
return data_idx
# Adding indices to the list so we can access all the indices
def compute_listIndices(self, data_idx):
lst_idx = []
lst_idx_all = []
for i in data_idx:
if len(lst_idx) == 0:
pass
#Do nothing: this is the first time the list is created, so it is empty
else:
lst_idx = []
# Here we put the indices of each class into the list called lst_idx_all
for m in xrange(len(data_idx[i])):
lst_idx.append(data_idx[i][m][0])
lst_idx_all.append(lst_idx)
return lst_idx_all
# This function calculates the between-class variance
def compute_Sb(self, cls, M_i, M_0):
Sb = np.zeros((self.dim, self.dim))
for i in cls:
B = (M_i[i] - M_0).reshape(self.dim, 1)
B_trans = B.transpose()
Sb += (float(len(cls[i])) / self.datanum) * B.dot(B_trans)
return Sb
# This function calculates the within-class variance
def compute_Sw(self, cls, M_i):
Sw = np.zeros((self.dim, self.dim))
for i in cls:
N_i = float(len(cls[i]))
W_WT = np.zeros((self.dim, self.dim))
for xk in cls[i]:
W = (xk - M_i[i])
W_WT += np.outer(W, W)
Sw += (N_i / self.datanum) * ((1. / N_i) * W_WT)
return Sw
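Sb and Sw above are the usual LDA-style between-class and within-class scatter matrices; a standalone numpy sketch of the same computation on made-up data:

import numpy as np

X = np.random.randn(12, 3)
labels = np.array([0]*5 + [1]*7)
N, M_0 = float(len(X)), X.mean(0)
Sb = np.zeros((3, 3)); Sw = np.zeros((3, 3))
for i in (0, 1):
    Xi = X[labels == i]
    B = (Xi.mean(0) - M_0)[:, None]
    Sb += (len(Xi) / N) * B.dot(B.T)   # weighted between-class scatter
    D = Xi - Xi.mean(0)
    Sw += (1. / N) * D.T.dot(D)        # (N_i/N) * (1/N_i) * sum of outer products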
# Calculating beta and Bi for Sb
def compute_sig_beta_Bi(self, data_idx, M_i, M_0, lst_idx_all):
# import pdb
# pdb.set_trace()
B_i = np.zeros((self.classnum, self.dim))
Sig_beta_B_i_all = np.zeros((self.datanum, self.dim))
for i in data_idx:
# pdb.set_trace()
# Calculating Bi
B_i[i] = (M_i[i] - M_0).reshape(1, self.dim)
for k in xrange(self.datanum):
for i in data_idx:
N_i = float(len(data_idx[i]))
if k in lst_idx_all[i]:
beta = (float(1) / N_i) - (float(1) / self.datanum)
Sig_beta_B_i_all[k] += float(N_i) / self.datanum * (beta * B_i[i])
else:
beta = -(float(1) / self.datanum)
Sig_beta_B_i_all[k] += float(N_i) / self.datanum * (beta * B_i[i])
Sig_beta_B_i_all = Sig_beta_B_i_all.transpose()
return Sig_beta_B_i_all
# Calculating W_j s separately so we can access all the W_j s anytime
def compute_wj(self, data_idx, M_i):
W_i = np.zeros((self.datanum, self.dim))
for i in data_idx:
N_i = float(len(data_idx[i]))
for tpl in data_idx[i]:
xj = tpl[1]
j = tpl[0]
W_i[j] = (xj - M_i[i])
return W_i
# Calculating alpha and Wj for Sw
def compute_sig_alpha_W(self, data_idx, lst_idx_all, W_i):
Sig_alpha_W_i = np.zeros((self.datanum, self.dim))
for i in data_idx:
N_i = float(len(data_idx[i]))
for tpl in data_idx[i]:
k = tpl[0]
for j in lst_idx_all[i]:
if k == j:
alpha = 1 - (float(1) / N_i)
Sig_alpha_W_i[k] += (alpha * W_i[j])
else:
alpha = 0 - (float(1) / N_i)
Sig_alpha_W_i[k] += (alpha * W_i[j])
Sig_alpha_W_i = (1. / self.datanum) * np.transpose(Sig_alpha_W_i)
return Sig_alpha_W_i
# This function calculates log of our prior
def lnpdf(self, x):
x = x.reshape(self.x_shape)
cls = self.compute_cls(x)
M_0 = np.mean(x, axis=0)
M_i = self.compute_Mi(cls, self.vec)
Sb = self.compute_Sb(cls, M_i, M_0)
Sw = self.compute_Sw(cls, M_i)
# Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))
#Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
#print 'SB_inv: ', Sb_inv_N
#Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
Sb_inv_N = pdinv(Sb+np.eye(Sb.shape[0])*0.1)[0]
return (-1 / self.sigma2) * np.trace(Sb_inv_N.dot(Sw))
# This function calculates derivative of the log of prior function
def lnpdf_grad(self, x):
x = x.reshape(self.x_shape)
cls = self.compute_cls(x)
M_0 = np.mean(x, axis=0)
M_i = self.compute_Mi(cls, self.vec)
Sb = self.compute_Sb(cls, M_i, M_0)
Sw = self.compute_Sw(cls, M_i)
data_idx = self.compute_indices(x)
lst_idx_all = self.compute_listIndices(data_idx)
Sig_beta_B_i_all = self.compute_sig_beta_Bi(data_idx, M_i, M_0, lst_idx_all)
W_i = self.compute_wj(data_idx, M_i)
Sig_alpha_W_i = self.compute_sig_alpha_W(data_idx, lst_idx_all, W_i)
# Calculating inverse of Sb and its transpose and minus
# Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))
#Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
#print 'SB_inv: ',Sb_inv_N
#Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
Sb_inv_N = pdinv(Sb+np.eye(Sb.shape[0])*0.1)[0]
Sb_inv_N_trans = np.transpose(Sb_inv_N)
Sb_inv_N_trans_minus = -1 * Sb_inv_N_trans
Sw_trans = np.transpose(Sw)
# Calculating DJ/DXk
DJ_Dxk = 2 * (
Sb_inv_N_trans_minus.dot(Sw_trans).dot(Sb_inv_N_trans).dot(Sig_beta_B_i_all) + Sb_inv_N_trans.dot(
Sig_alpha_W_i))
# Calculating derivative of the log of the prior
DPx_Dx = ((-1 / self.sigma2) * DJ_Dxk)
return DPx_Dx.T
# def frb(self, x):
# from functools import partial
# from GPy.models import GradientChecker
# f = partial(self.lnpdf)
# df = partial(self.lnpdf_grad)
# grad = GradientChecker(f, df, x, 'X')
# grad.checkgrad(verbose=1)
def rvs(self, n):
return np.random.rand(n) # A WRONG implementation
def __str__(self):
return 'DGPLVM_prior_Raq_TTT'
class HalfT(Prior):
"""

View file

@ -79,8 +79,10 @@ class Logexp(Transformation):
class NormalTheta(Transformation):
"Do not use, not officially supported!"
_instances = []
def __new__(cls, mu_indices, var_indices):
def __new__(cls, mu_indices=None, var_indices=None):
"Do not use, not officially supported!"
if cls._instances:
cls._instances[:] = [instance for instance in cls._instances if instance()]
for instance in cls._instances:
@ -143,9 +145,10 @@ class NormalTheta(Transformation):
self.var_indices = state[1]
class NormalNaturalAntti(NormalTheta):
"Do not use, not officially supported!"
_instances = []
_logexp = Logexp()
def __new__(cls, mu_indices, var_indices):
def __new__(cls, mu_indices=None, var_indices=None):
"Do not use, not officially supported!"
if cls._instances:
cls._instances[:] = [instance for instance in cls._instances if instance()]
for instance in cls._instances:
@ -182,8 +185,10 @@ class NormalNaturalAntti(NormalTheta):
return "natantti"
class NormalEta(Transformation):
"Do not use, not officially supported!"
_instances = []
def __new__(cls, mu_indices, var_indices):
def __new__(cls, mu_indices=None, var_indices=None):
"Do not use, not officially supported!"
if cls._instances:
cls._instances[:] = [instance for instance in cls._instances if instance()]
for instance in cls._instances:
@ -223,8 +228,10 @@ class NormalEta(Transformation):
return "eta"
class NormalNaturalThroughTheta(NormalTheta):
"Do not use, not officially supported!"
_instances = []
def __new__(cls, mu_indices, var_indices):
def __new__(cls, mu_indices=None, var_indices=None):
"Do not use, not officially supported!"
if cls._instances:
cls._instances[:] = [instance for instance in cls._instances if instance()]
for instance in cls._instances:
@ -272,8 +279,10 @@ class NormalNaturalThroughTheta(NormalTheta):
class NormalNaturalWhooot(NormalTheta):
"Do not use, not officially supported!"
_instances = []
def __new__(cls, mu_indices, var_indices):
def __new__(cls, mu_indices=None, var_indices=None):
"Do not use, not officially supported!"
if cls._instances:
cls._instances[:] = [instance for instance in cls._instances if instance()]
for instance in cls._instances:
@ -307,8 +316,10 @@ class NormalNaturalWhooot(NormalTheta):
return "natgrad"
class NormalNaturalThroughEta(NormalEta):
"Do not use, not officially supported!"
_instances = []
def __new__(cls, mu_indices, var_indices):
def __new__(cls, mu_indices=None, var_indices=None):
"Do not use, not officially supported!"
if cls._instances:
cls._instances[:] = [instance for instance in cls._instances if instance()]
for instance in cls._instances:

View file

@ -11,7 +11,6 @@ class Updateable(Observable):
A model can be updated or not.
Make sure updates can be switched on and off.
"""
_updates = True
def __init__(self, *args, **kwargs):
super(Updateable, self).__init__(*args, **kwargs)
@ -27,18 +26,18 @@ class Updateable(Observable):
None: get the current update state
"""
if updates is None:
p = getattr(self, '_highest_parent_', None)
if p is not None:
self._updates = p._updates
return self._updates
return self._update_on
assert isinstance(updates, bool), "updates are either on (True) or off (False)"
p = getattr(self, '_highest_parent_', None)
if p is not None:
p._updates = updates
self._updates = updates
def turn_updates(s):
s._update_on = updates
p.traverse(turn_updates)
self.trigger_update()
def toggle_update(self):
print "deprecated: toggle_update was renamed to update_toggle for easier access"
self.update_toggle()
def update_toggle(self):
self.update_model(not self.update_model())
def trigger_update(self, trigger_parent=True):

View file

@ -6,7 +6,8 @@ from gp import GP
from parameterization.param import Param
from ..inference.latent_function_inference import var_dtc
from .. import likelihoods
from parameterization.variational import VariationalPosterior
from parameterization.variational import VariationalPosterior, NormalPosterior
from ..util.linalg import mdot
import logging
from GPy.inference.latent_function_inference.posterior import Posterior
@ -102,7 +103,15 @@ class SparseGP(GP):
def _raw_predict(self, Xnew, full_cov=False, kern=None):
"""
Make a prediction for the latent function values
Make a prediction for the latent function values.
For certain inputs we return a full covariance of shape NxN;
if there is missing data, each dimension has its own full covariance of shape NxNxD; and if full_cov is off,
we take only the diagonal elements across N.
For uncertain inputs, the SparseGP bound produces a full covariance structure across D, so for full_cov we
return an NxDxD matrix, and in the not-full_cov case we return the diagonal elements across D (NxD).
This holds both with and without missing data.
"""
if kern is None: kern = self.kern
@ -121,15 +130,32 @@ class SparseGP(GP):
Kxx = kern.Kdiag(Xnew)
var = (Kxx - np.sum(np.dot(np.atleast_3d(self.posterior.woodbury_inv).T, Kx) * Kx[None,:,:], 1)).T
else:
Kx = kern.psi1(self.Z, Xnew).T
mu = np.dot(Kx.T, self.posterior.woodbury_vector)
if full_cov:
Kxx = kern.K(Xnew.mean)
if self.posterior.woodbury_inv.ndim == 2:
var = Kxx - np.dot(Kx.T, np.dot(self.posterior.woodbury_inv, Kx))
elif self.posterior.woodbury_inv.ndim == 3:
var = Kxx[:,:,None] - np.tensordot(np.dot(np.atleast_3d(self.posterior.woodbury_inv).T, Kx).T, Kx, [1,0]).swapaxes(1,2)
else:
Kxx = kern.psi0(self.Z, Xnew)
var = (Kxx - np.sum(np.dot(np.atleast_3d(self.posterior.woodbury_inv).T, Kx) * Kx[None,:,:], 1)).T
psi0_star = self.kern.psi0(self.Z, Xnew)
psi1_star = self.kern.psi1(self.Z, Xnew)
#psi2_star = self.kern.psi2(self.Z, Xnew) # Only possible if we get NxMxM psi2 out of the code.
la = self.posterior.woodbury_vector
mu = np.dot(psi1_star, la) # TODO: dimensions?
if full_cov:
var = np.empty((Xnew.shape[0], la.shape[1], la.shape[1]))
di = np.diag_indices(la.shape[1])
else:
var = np.empty((Xnew.shape[0], la.shape[1]))
for i in range(Xnew.shape[0]):
_mu, _var = Xnew.mean.values[[i]], Xnew.variance.values[[i]]
psi2_star = self.kern.psi2(self.Z, NormalPosterior(_mu, _var))
tmp = (psi2_star[:, :] - psi1_star[[i]].T.dot(psi1_star[[i]]))
var_ = mdot(la.T, tmp, la)
p0 = psi0_star[i]
t = np.atleast_3d(self.posterior.woodbury_inv)
t2 = np.trace(t.T.dot(psi2_star), axis1=1, axis2=2)
if full_cov:
var_[di] += p0
var_[di] += -t2
var[i] = var_
else:
var[i] = np.diag(var_)+p0-t2
return mu, var
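A sketch of exercising the uncertain-input branch above (hypothetical data; assumes a BayesianGPLVM-style model, whose minibatch variants now inherit this _raw_predict since their own copy is removed below). NormalPosterior is a VariationalPosterior, so it takes the else branch:

import numpy as np
import GPy
from GPy.core.parameterization.variational import NormalPosterior

Y = np.random.randn(40, 5)
m = GPy.models.BayesianGPLVM(Y, 2, num_inducing=10)
Xnew = NormalPosterior(np.random.randn(3, 2), 0.1 * np.ones((3, 2)))
mu, var = m._raw_predict(Xnew)                  # var: (3, 5), diagonal across D
mu, var = m._raw_predict(Xnew, full_cov=True)   # var: (3, 5, 5)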

View file

@ -25,23 +25,18 @@ class SVGP(SparseGP):
Hensman, Matthews and Ghahramani, Scalable Variational GP Classification, ArXiv 1411.2005
"""
if batchsize is None:
batchsize = X.shape[0]
self.X_all, self.Y_all = X, Y
# how to rescale the batch likelihood in case of minibatches
self.batchsize = batchsize
batch_scale = float(self.X_all.shape[0])/float(self.batchsize)
#KL_scale = 1./np.float64(self.mpi_comm.size)
KL_scale = 1.0
import climin.util
#Make a climin slicer to make drawing minibatches much quicker
self.slicer = climin.util.draw_mini_slices(self.X_all.shape[0], self.batchsize)
X_batch, Y_batch = self.new_batch()
self.X_all, self.Y_all = X, Y
if batchsize is None:
X_batch, Y_batch = X, Y
else:
import climin.util
#Make a climin slicer to make drawing minibatches much quicker
self.slicer = climin.util.draw_mini_slices(self.X_all.shape[0], self.batchsize)
X_batch, Y_batch = self.new_batch()
#create the SVI inference method
inf_method = svgp_inf(KL_scale=KL_scale, batch_scale=batch_scale)
inf_method = svgp_inf()
SparseGP.__init__(self, X_batch, Y_batch, Z, kernel, likelihood, inference_method=inf_method,
name=name, Y_metadata=Y_metadata, normalizer=False)
@ -53,7 +48,7 @@ class SVGP(SparseGP):
self.link_parameter(self.m)
def parameters_changed(self):
self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.q_u_mean, self.q_u_chol, self.kern, self.X, self.Z, self.likelihood, self.Y, self.Y_metadata)
self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.q_u_mean, self.q_u_chol, self.kern, self.X, self.Z, self.likelihood, self.Y, self.Y_metadata, KL_scale=1.0, batch_scale=float(self.X_all.shape[0])/float(self.X.shape[0]))
#update the kernel gradients
self.kern.update_gradients_full(self.grad_dict['dL_dKmm'], self.Z)
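Computing batch_scale = N/|batch| at inference time keeps the minibatch likelihood term an unbiased estimate of the full-data term; a quick standalone illustration with made-up numbers:

import numpy as np

F = np.random.randn(1000)                    # per-point likelihood terms
batch = np.random.choice(1000, 100, replace=False)
estimate = (1000. / 100.) * F[batch].sum()   # batch_scale * minibatch sum
print(F.sum(), estimate)                     # equal in expectation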

View file

@ -4,48 +4,90 @@
import numpy as np
import sys
import time
def exponents(fnow, current_grad):
exps = [np.abs(np.float(fnow)), current_grad]
return np.sign(exps) * np.log10(exps).astype(int)
class VerboseOptimization(object):
def __init__(self, model, maxiters, verbose=True, current_iteration=0, ipython_notebook=False):
def __init__(self, model, opt, maxiters, verbose=False, current_iteration=0, ipython_notebook=True):
self.verbose = verbose
if self.verbose:
self.model = model
self.iteration = current_iteration
self.ipython_notebook = ipython_notebook
self.p_iter = self.iteration
self.maxiters = maxiters
self.len_maxiters = len(str(maxiters))
self.opt_name = opt.opt_name
self.model.add_observer(self, self.print_status)
self.status = 'running'
self.update()
if self.ipython_notebook:
try:
from IPython.display import display
from IPython.html.widgets import IntProgressWidget, HTMLWidget
from IPython.html.widgets import FloatProgressWidget, HTMLWidget, ContainerWidget
self.text = HTMLWidget()
display(self.text)
self.progress = IntProgressWidget()
display(self.progress)
self.progress = FloatProgressWidget()
self.model_show = HTMLWidget()
self.ipython_notebook = ipython_notebook
except:
# Not in an IPython notebook
self.ipython_notebook = False
if self.ipython_notebook:
self.text.set_css('width', '100%')
#self.progress.set_css('width', '100%')
left_col = ContainerWidget(children = [self.progress, self.text])
right_col = ContainerWidget(children = [self.model_show])
hor_align = ContainerWidget(children = [left_col, right_col])
display(hor_align)
left_col.set_css({
'padding': '2px',
'width': "100%",
})
right_col.set_css({
'padding': '2px',
})
hor_align.set_css({
'width': "100%",
})
hor_align.remove_class('vbox')
hor_align.add_class('hbox')
left_col.add_class("box-flex1")
right_col.add_class('box-flex0')
#self.text.add_class('box-flex2')
#self.progress.add_class('box-flex1')
else:
self.exps = exponents(self.fnow, self.current_gradient)
print ' {0:{mi}s} {1:11s} {2:11s}'.format("I", "F", "|g|", mi=self.len_maxiters)
print 'Running {} Code:'.format(self.opt_name)
print ' {3:7s} {0:{mi}s} {1:11s} {2:11s}'.format("i", "f", "|g|", "secs", mi=self.len_maxiters)
def __enter__(self):
self.start = time.time()
return self
def print_out(self):
if self.ipython_notebook:
names_vals = [['Iteration', "{:>0{l}}".format(self.iteration, l=self.len_maxiters)],
['f', "{: > 05.3E}".format(self.fnow)],
['||Gradient||', "{: >+05.3E}".format(float(self.current_gradient))],
names_vals = [['optimizer', "{:s}".format(self.opt_name)],
['runtime [s]', "{:> g}".format(time.time()-self.start)],
['evaluation', "{:>0{l}}".format(self.iteration, l=self.len_maxiters)],
['objective', "{: > 12.3E}".format(self.fnow)],
['||gradient||', "{: >+12.3E}".format(float(self.current_gradient))],
['status', "{:s}".format(self.status)],
]
#message = "Lik:{:5.3E} Grad:{:5.3E} Lik:{:5.3E} Len:{!s}".format(float(m.log_likelihood()), np.einsum('i,i->', grads, grads), float(m.likelihood.variance), " ".join(["{:3.2E}".format(l) for l in m.kern.lengthscale.values]))
html_begin = """<style type="text/css">
.tg-opt {font-family:"Courier New", Courier, monospace !important;padding:2px 3px;word-break:normal;border-collapse:collapse;border-spacing:0;border-color:#DCDCDC;margin:0px auto;}
.tg-opt {font-family:"Courier New", Courier, monospace !important;padding:2px 3px;word-break:normal;border-collapse:collapse;border-spacing:0;border-color:#DCDCDC;margin:0px auto;width:100%;}
.tg-opt td{font-family:"Courier New", Courier, monospace !important;font-weight:bold;color:#444;background-color:#F7FDFA;border-style:solid;border-width:1px;overflow:hidden;word-break:normal;border-color:#DCDCDC;}
.tg-opt th{font-family:"Courier New", Courier, monospace !important;font-weight:normal;color:#fff;background-color:#26ADE4;border-style:solid;border-width:1px;overflow:hidden;word-break:normal;border-color:#DCDCDC;}
.tg-opt .tg-left{font-family:"Courier New", Courier, monospace !important;font-weight:normal;text-align:left;}
@ -61,6 +103,7 @@ class VerboseOptimization(object):
html_body += "</tr>"
self.text.value = html_begin + html_body + html_end
self.progress.value = 100*(self.iteration+1)/self.maxiters
self.model_show.value = self.model._repr_html_()
else:
n_exps = exponents(self.fnow, self.current_gradient)
if self.iteration - self.p_iter >= 20 * np.random.rand():
@ -72,7 +115,7 @@ class VerboseOptimization(object):
if b:
self.exps = n_exps
print '\r',
print '{0:>0{mi}g} {1:> 12e} {2:> 12e}'.format(self.iteration, float(self.fnow), float(self.current_gradient), mi=self.len_maxiters), # print 'Iteration:', iteration, ' Objective:', fnow, ' Scale:', beta, '\r',
print '{3:> 7.2g} {0:>0{mi}g} {1:> 12e} {2:> 12e}'.format(self.iteration, float(self.fnow), float(self.current_gradient), time.time()-self.start, mi=self.len_maxiters), # print 'Iteration:', iteration, ' Objective:', fnow, ' Scale:', beta, '\r',
sys.stdout.flush()
def print_status(self, me, which=None):
@ -91,6 +134,17 @@ class VerboseOptimization(object):
else:
self.current_gradient = np.nan
def finish(self, opt):
self.status = opt.status
def __exit__(self, type, value, traceback):
if self.verbose:
self.model.remove_observer(self)
self.stop = time.time()
self.model.remove_observer(self)
self.print_out()
if not self.ipython_notebook:
print ''
print 'Optimization finished in {0:.5g} Seconds'.format(self.stop-self.start)
print 'Optimization status: {0}'.format(self.status)
print

View file

@ -1,2 +1,3 @@
import latent_function_inference
import optimization
import optimization
import mcmc

View file

@ -5,11 +5,8 @@ import numpy as np
from posterior import Posterior
class SVGP(LatentFunctionInference):
def __init__(self, KL_scale=1., batch_scale=1.):
self.KL_scale = KL_scale
self.batch_scale = batch_scale
def inference(self, q_u_mean, q_u_chol, kern, X, Z, likelihood, Y, Y_metadata=None):
def inference(self, q_u_mean, q_u_chol, kern, X, Z, likelihood, Y, Y_metadata=None, KL_scale=1.0, batch_scale=1.0):
num_inducing = Z.shape[0]
num_data, num_outputs = Y.shape
@ -44,12 +41,9 @@ class SVGP(LatentFunctionInference):
dKL_dS = 0.5*(Kmmi[:,:,None] - Si)
dKL_dKmm = 0.5*num_outputs*Kmmi - 0.5*Kmmi.dot(S.sum(-1)).dot(Kmmi) - 0.5*Kmmim.dot(Kmmim.T)
KL_scale = self.KL_scale
batch_scale = self.batch_scale
KL, dKL_dKmm, dKL_dS, dKL_dm = KL_scale*KL, KL_scale*dKL_dKmm, KL_scale*dKL_dS, KL_scale*dKL_dm
#quadrature for the likelihood
F, dF_dmu, dF_dv, dF_dthetaL = likelihood.variational_expectations(Y, mu, v)
F, dF_dmu, dF_dv, dF_dthetaL = likelihood.variational_expectations(Y, mu, v, Y_metadata=Y_metadata)
#rescale the F term if working on a batch
F, dF_dmu, dF_dv = F*batch_scale, dF_dmu*batch_scale, dF_dv*batch_scale

View file

@ -21,7 +21,7 @@ class VarDTC(LatentFunctionInference):
For efficiency, we sometimes work with the cholesky of Y*Y.T. To save repeatedly recomputing this, we cache it.
"""
const_jitter = 1e-6
const_jitter = 1e-8
def __init__(self, limit=1):
#self._YYTfactor_cache = caching.cache()
from ...util.caching import Cacher

View file

@ -24,7 +24,7 @@ class VarDTC_minibatch(LatentFunctionInference):
For efficiency, we sometimes work with the cholesky of Y*Y.T. To save repeatedly recomputing this, we cache it.
"""
const_jitter = 1e-6
const_jitter = 1e-8
def __init__(self, batchsize=None, limit=1, mpi_comm=None):
self.batchsize = batchsize

View file

@ -1,4 +1,4 @@
# ## Copyright (c) 2014, Zhenwen Dai
# ## Copyright (c) 2014 Mu Niu, Zhenwen Dai and GPy Authors
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np

View file

@ -37,7 +37,7 @@ class Optimizer():
self.x_opt = None
self.funct_eval = None
self.status = None
self.max_f_eval = int(max_f_eval)
self.max_f_eval = int(max_iters)
self.max_iters = int(max_iters)
self.bfgs_factor = bfgs_factor
self.trace = None

View file

@ -61,6 +61,7 @@ def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=np.inf, display=True,
function_eval = 1
fnow = fold
gradnew = gradf(x, *optargs) # Initial gradient.
function_eval += 1
#if any(np.isnan(gradnew)):
# raise UnexpectedInfOrNan, "Gradient contribution resulted in a NaN value"
current_grad = np.dot(gradnew, gradnew)
@ -96,6 +97,7 @@ def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=np.inf, display=True,
sigma = sigma0 / np.sqrt(kappa)
xplus = x + sigma * d
gplus = gradf(xplus, *optargs)
function_eval += 1
theta = np.dot(d, (gplus - gradnew)) / sigma
# Increase effective curvature and evaluate step size alpha.
@ -111,10 +113,10 @@ def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=np.inf, display=True,
fnew = f(xnew, *optargs)
function_eval += 1
# if function_eval >= max_f_eval:
# status = "maximum number of function evaluations exceeded"
# break
# return x, flog, function_eval, status
if function_eval >= max_f_eval:
status = "maximum number of function evaluations exceeded"
break
return x, flog, function_eval, status
Delta = 2.*(fnew - fold) / (alpha * mu)
if Delta >= 0.:
@ -156,6 +158,7 @@ def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=np.inf, display=True,
# Update variables for new position
gradold = gradnew
gradnew = gradf(x, *optargs)
function_eval += 1
current_grad = np.dot(gradnew, gradnew)
fold = fnew
# If the gradient is zero then we are done.

View file

@ -1,7 +1,7 @@
from _src.kern import Kern
from _src.rbf import RBF
from _src.linear import Linear, LinearFull
from _src.static import Bias, White
from _src.static import Bias, White, Fixed
from _src.brownian import Brownian
from _src.stationary import Exponential, OU, Matern32, Matern52, ExpQuad, RatQuad, Cosine
from _src.mlp import MLP
@ -13,7 +13,11 @@ from _src.ODE_UYC import ODE_UYC
from _src.ODE_st import ODE_st
from _src.ODE_t import ODE_t
from _src.poly import Poly
from _src.spline import Spline
from _src.eq_ode2 import EQ_ODE2
from _src.trunclinear import TruncLinear,TruncLinear_inf
from _src.splitKern import SplitKern,DiffGenomeKern

View file

@ -127,7 +127,7 @@ class Coregionalize(Kern):
config.set('weave', 'working', 'False')
dL_dK_small = self._gradient_reduce_weave(dL_dK, index, index2)
else:
dL_dK_small = self._gradient_reduce_weave(dL_dK, index, index2)
dL_dK_small = self._gradient_reduce_numpy(dL_dK, index, index2)
@ -154,9 +154,9 @@ class Coregionalize(Kern):
def _gradient_reduce_numpy(self, dL_dK, index, index2):
index, index2 = index[:,0], index2[:,0]
dL_dK_small = np.zeros_like(self.B)
for i in range(k.output_dim):
for i in range(self.output_dim):
tmp1 = dL_dK[index==i]
for j in range(k.output_dim):
for j in range(self.output_dim):
dL_dK_small[j,i] = tmp1[:,index2==j].sum()
return dL_dK_small

GPy/kern/_src/eq_ode2.py (new file, 1331 lines)

File diff suppressed because it is too large

View file

@ -42,25 +42,41 @@ class Prod(CombinationKernel):
return reduce(np.multiply, (p.Kdiag(X) for p in which_parts))
def update_gradients_full(self, dL_dK, X, X2=None):
k = self.K(X,X2)*dL_dK
for p in self.parts:
p.update_gradients_full(k/p.K(X,X2),X,X2)
if len(self.parts)==2:
self.parts[0].update_gradients_full(dL_dK*self.parts[1].K(X,X2), X, X2)
self.parts[1].update_gradients_full(dL_dK*self.parts[0].K(X,X2), X, X2)
else:
k = self.K(X,X2)*dL_dK
for p in self.parts:
p.update_gradients_full(k/p.K(X,X2),X,X2)
def update_gradients_diag(self, dL_dKdiag, X):
k = self.Kdiag(X)*dL_dKdiag
for p in self.parts:
p.update_gradients_diag(k/p.Kdiag(X),X)
if len(self.parts)==2:
self.parts[0].update_gradients_diag(dL_dKdiag*self.parts[1].Kdiag(X), X)
self.parts[1].update_gradients_diag(dL_dKdiag*self.parts[0].Kdiag(X), X)
else:
k = self.Kdiag(X)*dL_dKdiag
for p in self.parts:
p.update_gradients_diag(k/p.Kdiag(X),X)
def gradients_X(self, dL_dK, X, X2=None):
target = np.zeros(X.shape)
k = self.K(X,X2)*dL_dK
for p in self.parts:
target += p.gradients_X(k/p.K(X,X2),X,X2)
if len(self.parts)==2:
target += self.parts[0].gradients_X(dL_dK*self.parts[1].K(X, X2), X, X2)
target += self.parts[1].gradients_X(dL_dK*self.parts[0].K(X, X2), X, X2)
else:
k = self.K(X,X2)*dL_dK
for p in self.parts:
target += p.gradients_X(k/p.K(X,X2),X,X2)
return target
def gradients_X_diag(self, dL_dKdiag, X):
target = np.zeros(X.shape)
k = self.Kdiag(X)*dL_dKdiag
for p in self.parts:
target += p.gradients_X_diag(k/p.Kdiag(X),X)
if len(self.parts)==2:
target += self.parts[0].gradients_X_diag(dL_dKdiag*self.parts[1].Kdiag(X), X)
target += self.parts[1].gradients_X_diag(dL_dKdiag*self.parts[0].Kdiag(X), X)
else:
k = self.Kdiag(X)*dL_dKdiag
for p in self.parts:
target += p.gradients_X_diag(k/p.Kdiag(X),X)
return target
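The two-part special case multiplies dL_dK by the other factor's covariance directly, avoiding the k/p.K(X,X2) division of the general path, which produces inf/nan whenever a factor's covariance contains zeros. A quick sketch (illustrative data) of a case the old path could not handle:

import numpy as np
import GPy

X = np.random.randn(10, 1)
k = GPy.kern.RBF(1) * GPy.kern.White(1)   # White is zero off the diagonal
dL_dK = np.random.randn(10, 10)
k.update_gradients_full(dL_dK, X)         # finite: no division by zero entries
print(k.gradient)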

View file

@ -1,4 +1,4 @@
# Copyright (c) 2014, James Hensman
# Copyright (c) 2015, Thomas Hornung
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
@ -7,7 +7,12 @@ from ...core.parameterization import Param
from ...core.parameterization.transformations import Logexp
class Spline(Kern):
"""
Spline kernel
Linear spline kernel. You need to specify two parameters: the variance and c.
The variance is defined in powers of 10, so specifying -2 means 10^-2.
The parameter c sets the stiffness of the spline fit; a very stiff spline
is equivalent to linear regression.
See https://www.youtube.com/watch?v=50Vgw11qn0o starting at 1:17:28.
Reference: Wahba, 1990.
"""
def __init__(self, input_dim, variance=1., c=1., active_dims=None, name='spline'):
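A hypothetical usage sketch of the new kernel (data made up; the constructor signature is the one above):

import numpy as np
import GPy

X = np.linspace(0., 10., 50)[:, None]
Y = np.sin(X) + np.random.randn(50, 1) * 0.1
k = GPy.kern.Spline(input_dim=1, variance=1., c=1.)
m = GPy.models.GPRegression(X, Y, kernel=k)
m.optimize()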

View file

@ -6,3 +6,4 @@ from poisson import Poisson
from student_t import StudentT
from likelihood import Likelihood
from mixed_noise import MixedNoise
from binomial import Binomial

View file

@ -77,6 +77,32 @@ class Bernoulli(Likelihood):
return Z_hat, mu_hat, sigma2_hat
def variational_expectations(self, Y, m, v, gh_points=None, Y_metadata=None):
if isinstance(self.gp_link, link_functions.Probit):
if gh_points is None:
gh_x, gh_w = np.polynomial.hermite.hermgauss(20)
else:
gh_x, gh_w = gh_points
from scipy import stats
shape = m.shape
m,v,Y = m.flatten(), v.flatten(), Y.flatten()
Ysign = np.where(Y==1,1,-1)
X = gh_x[None,:]*np.sqrt(2.*v[:,None]) + (m*Ysign)[:,None]
p = stats.norm.cdf(X)
p = np.clip(p, 1e-9, 1.-1e-9) # for numerical stability
N = stats.norm.pdf(X)
F = np.log(p).dot(gh_w)
NoverP = N/p
dF_dm = (NoverP*Ysign[:,None]).dot(gh_w)
dF_dv = -0.5*(NoverP**2 + NoverP*X).dot(gh_w)
return F.reshape(*shape), dF_dm.reshape(*shape), dF_dv.reshape(*shape), None
else:
raise NotImplementedError
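The probit branch uses the standard Gauss-Hermite change of variables f = sqrt(2v)*x + m; a standalone sketch of that identity (note hermgauss weights sum to sqrt(pi), hence the normalization here):

import numpy as np
from scipy import stats

# E_{f ~ N(m, v)}[g(f)] ~= sum_k (w_k / sqrt(pi)) * g(sqrt(2*v)*x_k + m)
gh_x, gh_w = np.polynomial.hermite.hermgauss(20)
m, v = 0.3, 0.5
g = lambda f: np.log(np.clip(stats.norm.cdf(f), 1e-9, 1. - 1e-9))
F = g(np.sqrt(2. * v) * gh_x + m).dot(gh_w) / np.sqrt(np.pi)
print(F)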
def predictive_mean(self, mu, variance, Y_metadata=None):
if isinstance(self.gp_link, link_functions.Probit):

GPy/likelihoods/binomial.py (new file, 125 lines)
View file

@ -0,0 +1,125 @@
# Copyright (c) 2012-2014 The GPy authors (see AUTHORS.txt)
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from ..util.univariate_Gaussian import std_norm_pdf, std_norm_cdf
import link_functions
from likelihood import Likelihood
from scipy import special
class Binomial(Likelihood):
"""
Binomial likelihood
.. math::
p(y_{i}|\\lambda(f_{i})) = {N_{i} \\choose y_{i}} \\lambda(f_{i})^{y_{i}}(1-\\lambda(f_{i}))^{N_{i}-y_{i}}
.. Note::
Y_i takes integer values in {0, ..., N_i}; the number of trials N_i is supplied in Y_metadata['trials'].
The link function should have the domain [0, 1], e.g. probit (default) or Heaviside
.. See also::
likelihood.py, for the parent class
"""
def __init__(self, gp_link=None):
if gp_link is None:
gp_link = link_functions.Probit()
super(Binomial, self).__init__(gp_link, 'Binomial')
def conditional_mean(self, gp, Y_metadata):
return self.gp_link.transf(gp)*Y_metadata['trials']
def pdf_link(self, inv_link_f, y, Y_metadata):
"""
Likelihood function given inverse link of f.
.. math::
p(y_{i}|\\lambda(f_{i})) = {N_{i} \\choose y_{i}} \\lambda(f_{i})^{y_{i}}(1-\\lambda(f_{i}))^{N_{i}-y_{i}}
:param inv_link_f: latent variables inverse link of f.
:type inv_link_f: Nx1 array
:param y: data
:type y: Nx1 array
:param Y_metadata: Y_metadata must contain 'trials'
:returns: likelihood evaluated for this point
:rtype: float
.. Note:
Each y_i must be in {0, 1, ..., N_i}
"""
return np.exp(self.logpdf_link(inv_link_f, y, Y_metadata))
def logpdf_link(self, inv_link_f, y, Y_metadata=None):
"""
Log Likelihood function given inverse link of f.
.. math::
\\ln p(y_{i}|\\lambda(f_{i})) = \\ln {N_{i} \\choose y_{i}} + y_{i}\\log\\lambda(f_{i}) + (N_{i}-y_{i})\\log(1-\\lambda(f_{i}))
:param inv_link_f: latent variables inverse link of f.
:type inv_link_f: Nx1 array
:param y: data
:type y: Nx1 array
:param Y_metadata: Y_metadata must contain 'trials'
:returns: log likelihood evaluated at points inverse link of f.
:rtype: float
"""
N = Y_metadata['trials']
nchoosey = special.gammaln(N+1) - special.gammaln(y+1) - special.gammaln(N-y+1)
return nchoosey + y*np.log(inv_link_f) + (N-y)*np.log(1.-inv_link_f)
def dlogpdf_dlink(self, inv_link_f, y, Y_metadata=None):
"""
Gradient of the log pdf at y, given the inverse link of f, w.r.t the inverse link of f.
:param inv_link_f: latent variables inverse link of f.
:type inv_link_f: Nx1 array
:param y: data
:type y: Nx1 array
:param Y_metadata: Y_metadata must contain 'trials'
:returns: gradient of log likelihood evaluated at points inverse link of f.
:rtype: Nx1 array
"""
N = Y_metadata['trials']
return y/inv_link_f - (N-y)/(1-inv_link_f)
def d2logpdf_dlink2(self, inv_link_f, y, Y_metadata=None):
"""
Hessian at y, given inv_link_f, w.r.t inv_link_f. The hessian will be 0 unless i == j,
i.e. the second derivative of the logpdf at y, given the inverse link of f_i and f_j, w.r.t the inverse link of f_i and f_j.
.. math::
\\frac{d^{2}\\ln p(y_{i}|\\lambda(f_{i}))}{d\\lambda(f_{i})^{2}} = \\frac{-y_{i}}{\\lambda(f_{i})^{2}} - \\frac{N_{i}-y_{i}}{(1-\\lambda(f_{i}))^{2}}
:param inv_link_f: latent variables inverse link of f.
:type inv_link_f: Nx1 array
:param y: data
:type y: Nx1 array
:param Y_metadata: Y_metadata not used in binomial
:returns: Diagonal of the log hessian matrix (second derivative of the log likelihood, evaluated at points inverse link of f)
:rtype: Nx1 array
.. Note::
Will return the diagonal of the hessian, since everywhere else it is 0, as the likelihood factorizes over cases
(the distribution for y_i depends only on the inverse link of f_i, not on the inverse link of f_(j!=i))
"""
N = Y_metadata['trials']
return -y/np.square(inv_link_f) - (N-y)/np.square(1-inv_link_f)
def samples(self, gp, Y_metadata=None):
"""
Returns a set of samples of observations based on a given value of the latent variable.
:param gp: latent variable
"""
orig_shape = gp.shape
gp = gp.flatten()
N = Y_metadata['trials']
Ysim = np.random.binomial(N, self.gp_link.transf(gp))
return Ysim.reshape(orig_shape)
def exact_inference_gradients(self, dL_dKdiag,Y_metadata=None):
pass
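A quick standalone check (made-up numbers) that logpdf_link above agrees with scipy's binomial log-pmf:

import numpy as np
from scipy import stats, special

N, y, p = 20., 7., 0.3
nchoosey = special.gammaln(N+1) - special.gammaln(y+1) - special.gammaln(N-y+1)
manual = nchoosey + y*np.log(p) + (N-y)*np.log(1.-p)
print(manual, stats.binom.logpmf(7, 20, 0.3))   # should match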

View file

@ -131,7 +131,7 @@ class Likelihood(Parameterized):
return z, mean, variance
def variational_expectations(self, Y, m, v, gh_points=None):
def variational_expectations(self, Y, m, v, gh_points=None, Y_metadata=None):
"""
Use Gauss-Hermite Quadrature to compute
@ -158,9 +158,9 @@ class Likelihood(Parameterized):
#evaluate the likelihood for the grid. The first axis indexes the data (and mu, var) and the second indexes the grid.
# broadcast needs to be handled carefully.
logp = self.logpdf(X,Y[:,None])
dlogp_dx = self.dlogpdf_df(X, Y[:,None])
d2logp_dx2 = self.d2logpdf_df2(X, Y[:,None])
logp = self.logpdf(X,Y[:,None], Y_metadata=Y_metadata)
dlogp_dx = self.dlogpdf_df(X, Y[:,None], Y_metadata=Y_metadata)
d2logp_dx2 = self.d2logpdf_df2(X, Y[:,None], Y_metadata=Y_metadata)
#clipping for numerical stability
#logp = np.clip(logp,-1e9,1e9)
@ -178,7 +178,7 @@ class Likelihood(Parameterized):
stop
dF_dtheta = None # Not yet implemented
return F.reshape(*shape), dF_dm.reshape(*shape), dF_dv.reshape(*shape), None
return F.reshape(*shape), dF_dm.reshape(*shape), dF_dv.reshape(*shape), dF_dtheta
def predictive_mean(self, mu, variance, Y_metadata=None):
"""

View file

@ -64,8 +64,7 @@ class Poisson(Likelihood):
:rtype: float
"""
assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
return np.sum(-link_f + y*np.log(link_f) - special.gammaln(y+1))
return -link_f + y*np.log(link_f) - special.gammaln(y+1)
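Dropping the np.sum makes logpdf_link return one value per data point, which the quadrature code in Likelihood.variational_expectations needs for broadcasting; a quick element-wise check against scipy (made-up values):

import numpy as np
from scipy import stats, special

link_f = np.array([0.5, 2.0, 3.5])
y = np.array([1., 2., 4.])
manual = -link_f + y*np.log(link_f) - special.gammaln(y+1)
print(manual)
print(stats.poisson.logpmf(y, link_f))   # agrees element-wise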
def dlogpdf_dlink(self, link_f, y, Y_metadata=None):
"""
@ -83,7 +82,6 @@ class Poisson(Likelihood):
:rtype: Nx1 array
"""
assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
return y/link_f - 1
def d2logpdf_dlink2(self, link_f, y, Y_metadata=None):
@ -107,12 +105,7 @@ class Poisson(Likelihood):
Will return the diagonal of the hessian, since everywhere else it is 0, as the likelihood factorizes over cases
(the distribution for y_i depends only on link(f_i), not on link(f_(j!=i)))
"""
assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
hess = -y/(link_f**2)
return hess
#d2_df = self.gp_link.d2transf_df2(gp)
#transf = self.gp_link.transf(gp)
#return obs * ((self.gp_link.dtransf_df(gp)/transf)**2 - d2_df/transf) + d2_df
return -y/(link_f**2)
def d3logpdf_dlink3(self, link_f, y, Y_metadata=None):
"""

View file

@ -83,8 +83,8 @@ class BayesianGPLVMMiniBatch(SparseGPMiniBatch):
"""Get the gradients of the posterior distribution of X in its specific form."""
return X.mean.gradient, X.variance.gradient
def _inner_parameters_changed(self, kern, X, Z, likelihood, Y, Y_metadata, Lm=None, dL_dKmm=None, subset_indices=None):
posterior, log_marginal_likelihood, grad_dict, current_values, value_indices = super(BayesianGPLVMMiniBatch, self)._inner_parameters_changed(kern, X, Z, likelihood, Y, Y_metadata, Lm=Lm, dL_dKmm=dL_dKmm, subset_indices=subset_indices)
def _inner_parameters_changed(self, kern, X, Z, likelihood, Y, Y_metadata, Lm=None, dL_dKmm=None, subset_indices=None, **kw):
posterior, log_marginal_likelihood, grad_dict, current_values, value_indices = super(BayesianGPLVMMiniBatch, self)._inner_parameters_changed(kern, X, Z, likelihood, Y, Y_metadata, Lm=Lm, dL_dKmm=dL_dKmm, subset_indices=subset_indices, **kw)
if self.has_uncertain_inputs():
current_values['meangrad'], current_values['vargrad'] = self.kern.gradients_qX_expectations(

View file

@ -3,6 +3,7 @@
import numpy as np
from ..core.parameterization.param import Param
from ..core.sparse_gp import SparseGP
from ..core.gp import GP
from ..inference.latent_function_inference import var_dtc
from .. import likelihoods
@ -16,14 +17,9 @@ from GPy.inference.optimization.stochastics import SparseGPStochastics,\
#SparseGPMissing
logger = logging.getLogger("sparse gp")
class SparseGPMiniBatch(GP):
class SparseGPMiniBatch(SparseGP):
"""
A general purpose Sparse GP model
'''
Created on 3 Nov 2014
@author: maxz
'''
A general purpose Sparse GP model, allowing missing data and stochastics across dimensions.
This model allows (approximate) inference using variational DTC or FITC
(Gaussian likelihoods) as well as non-conjugate sparse methods based on
@ -97,7 +93,7 @@ Created on 3 Nov 2014
def has_uncertain_inputs(self):
return isinstance(self.X, VariationalPosterior)
def _inner_parameters_changed(self, kern, X, Z, likelihood, Y, Y_metadata, Lm=None, dL_dKmm=None, subset_indices=None):
def _inner_parameters_changed(self, kern, X, Z, likelihood, Y, Y_metadata, Lm=None, dL_dKmm=None, subset_indices=None, **kwargs):
"""
This is the standard part, which usually belongs in parameters_changed.
@ -117,7 +113,7 @@ Created on 3 Nov 2014
algorithm.
"""
try:
posterior, log_marginal_likelihood, grad_dict = self.inference_method.inference(kern, X, Z, likelihood, Y, Y_metadata, Lm=Lm, dL_dKmm=None)
posterior, log_marginal_likelihood, grad_dict = self.inference_method.inference(kern, X, Z, likelihood, Y, Y_metadata, Lm=Lm, dL_dKmm=None, **kwargs)
except:
posterior, log_marginal_likelihood, grad_dict = self.inference_method.inference(kern, X, Z, likelihood, Y, Y_metadata)
current_values = {}
@ -315,34 +311,3 @@ Created on 3 Nov 2014
else:
self.posterior, self._log_marginal_likelihood, self.grad_dict, self.full_values, _ = self._inner_parameters_changed(self.kern, self.X, self.Z, self.likelihood, self.Y_normalized, self.Y_metadata)
self._outer_values_update(self.full_values)
def _raw_predict(self, Xnew, full_cov=False, kern=None):
"""
Make a prediction for the latent function values
"""
if kern is None: kern = self.kern
if not isinstance(Xnew, VariationalPosterior):
Kx = kern.K(self.Z, Xnew)
mu = np.dot(Kx.T, self.posterior.woodbury_vector)
if full_cov:
Kxx = kern.K(Xnew)
if self.posterior.woodbury_inv.ndim == 2:
var = Kxx - np.dot(Kx.T, np.dot(self.posterior.woodbury_inv, Kx))
elif self.posterior.woodbury_inv.ndim == 3:
var = Kxx[:,:,None] - np.tensordot(np.dot(np.atleast_3d(self.posterior.woodbury_inv).T, Kx).T, Kx, [1,0]).swapaxes(1,2)
var = var
else:
Kxx = kern.Kdiag(Xnew)
var = (Kxx - np.sum(np.dot(np.atleast_3d(self.posterior.woodbury_inv).T, Kx) * Kx[None,:,:], 1)).T
else:
Kx = kern.psi1(self.Z, Xnew)
mu = np.dot(Kx, self.posterior.woodbury_vector)
if full_cov:
raise NotImplementedError, "TODO"
else:
Kxx = kern.psi0(self.Z, Xnew)
psi2 = kern.psi2(self.Z, Xnew)
var = Kxx - np.sum(np.sum(psi2 * Kmmi_LmiBLmi[None, :, :], 1), 1)
return mu, var

View file

@ -172,7 +172,7 @@ def plot_fit(model, plot_limits=None, which_data_rows='all',
if hasattr(model,"Z"):
#Zu = model.Z[:,free_dims] * model._Xscale[:,free_dims] + model._Xoffset[:,free_dims]
Zu = Z[:,free_dims]
plots['inducing_inputs'] = ax.plot(Zu[:,free_dims[0]], Zu[:,free_dims[1]], 'wo')
plots['inducing_inputs'] = ax.plot(Zu[:,0], Zu[:,1], 'wo')
else:
raise NotImplementedError, "Cannot define a frame with more than two input dimensions"

View file

@ -0,0 +1,37 @@
import numpy as np
import scipy as sp
from ..util.linalg import jitchol
class LinalgTests(np.testing.TestCase):
def setUp(self):
#Create PD matrix
A = np.random.randn(20,100)
self.A = A.dot(A.T)
#compute Eigdecomp
vals, vectors = np.linalg.eig(self.A)
#Set the smallest eigenvalue negative, large enough to require 5 rounds of jitter
vals[vals.argmin()] = 0
default_jitter = 1e-6*np.mean(vals)
vals[vals.argmin()] = -default_jitter*(10**3.5)
self.A_corrupt = (vectors * vals).dot(vectors.T)
def test_jitchol_success(self):
"""
Expect 5 rounds of jitter to be added and for the recovered matrix to be
identical to the corrupted matrix apart from the jitter added to the diagonal
"""
L = jitchol(self.A_corrupt, maxtries=5)
A_new = L.dot(L.T)
diff = A_new - self.A_corrupt
np.testing.assert_allclose(diff, np.eye(A_new.shape[0])*np.diag(diff).mean(), atol=1e-13)
def test_jitchol_failure(self):
"""
Expect a LinAlgError to be raised, since 5 rounds of jitter
would be needed to enforce positive definiteness but only 4 are allowed
"""
with self.assertRaises(sp.linalg.LinAlgError):
jitchol(self.A_corrupt, maxtries=4)

View file

@ -178,6 +178,24 @@ class MiscTests(unittest.TestCase):
m.optimize()
print m
def test_model_updates(self):
Y1 = np.random.normal(0, 1, (40, 13))
Y2 = np.random.normal(0, 1, (40, 6))
m = GPy.models.MRD([Y1, Y2], 5)
self.count = 0
m.add_observer(self, self._count_updates, -2000)
m.update_model(False)
m['.*Gaussian'] = .001
self.assertEquals(self.count, 0)
m['.*Gaussian'].constrain_bounded(0,.01)
self.assertEquals(self.count, 0)
m.Z.fix()
self.assertEquals(self.count, 0)
m.update_model(True)
self.assertEquals(self.count, 1)
def _count_updates(self, me, which):
self.count+=1
def test_model_optimize(self):
X = np.random.uniform(-3., 3., (20, 1))
Y = np.sin(X) + np.random.randn(20, 1) * 0.05

View file

@ -138,8 +138,6 @@ class Test(ListDictTestCase):
self.assertIsNot(par.gradient_full, pcopy.gradient_full)
self.assertTrue(pcopy.checkgrad())
self.assert_(np.any(pcopy.gradient!=0.0))
pcopy.optimize('bfgs')
par.optimize('bfgs')
np.testing.assert_allclose(pcopy.param_array, par.param_array, atol=1e-6)
par.randomize()
with tempfile.TemporaryFile('w+b') as f:

View file

@ -82,6 +82,7 @@ def force_F_ordered(A):
# return jitchol(A+np.eye(A.shape[0])*jitter, maxtries-1)
def jitchol(A, maxtries=5):
A = np.ascontiguousarray(A)
L, info = lapack.dpotrf(A, lower=1)
@ -92,25 +93,19 @@ def jitchol(A, maxtries=5):
if np.any(diagA <= 0.):
raise linalg.LinAlgError, "not pd: non-positive diagonal elements"
jitter = diagA.mean() * 1e-6
while maxtries > 0 and np.isfinite(jitter):
num_tries = 1
while num_tries <= maxtries and np.isfinite(jitter):
try:
L = linalg.cholesky(A + np.eye(A.shape[0]) * jitter, lower=True)
logging.warning('Added {} rounds of jitter, jitter of {:.10e}\n'.format(num_tries, jitter))
return L
except:
jitter *= 10
finally:
maxtries -= 1
raise linalg.LinAlgError, "not positive definite, even with jitter."
num_tries += 1
import traceback
try: raise
except:
logging.warning('\n'.join(['Added jitter of {:.10e}'.format(jitter),
' in '+traceback.format_list(traceback.extract_stack(limit=2)[-2:-1])[0][2:]]))
import ipdb;ipdb.set_trace()
return L
logging.warning('\n'.join(['Added {} rounds of jitter, jitter of {:.10e}'.format(num_tries-1, jitter),
' in '+traceback.format_list(traceback.extract_stack(limit=2)[-2:-1])[0][2:]]))
raise linalg.LinAlgError, "not positive definite, even with jitter."
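A standalone sketch (mirroring the loop above) of the jitter schedule: it starts at mean(diag(A)) * 1e-6 and grows tenfold per retry, so maxtries=5 tops out at mean(diag(A)) * 1e-2:

import numpy as np

def jitter_schedule(A, maxtries=5):
    jitter = np.diag(A).mean() * 1e-6
    return [jitter * 10**k for k in range(maxtries)]

print(jitter_schedule(np.eye(3)))   # [1e-06, 1e-05, 0.0001, 0.001, 0.01]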
# def dtrtri(L, lower=1):
# """