Merge branch 'devel' into pickle

2026-05-10 12:32:40 +02:00 · 2013-06-25 17:44:52 +01:00 · 2013-06-25 17:44:52 +01:00 · 1e06ca2d40
commit 1e06ca2d40
parent e869fcaf65 70189a387b
66 changed files with 768 additions and 193 deletions
--- a/.gitignore
+++ b/.gitignore
@ -9,7 +9,6 @@
 dist
 build
 eggs
 parts
 bin
 var
 sdist
--- a/GPy/core/init.py
+++ b/GPy/core/init.py
@ -4,6 +4,7 @@
 from model import *
 from parameterised import *
 import priors
-from GPy.core.gp import GP
+from gp import GP
-from GPy.core.sparse_gp import SparseGP
+from sparse_gp import SparseGP
 from fitc import FITC
 from svigp import SVIGP
--- a/GPy/core/gp_base.py
+++ b/GPy/core/gp_base.py
@ -29,6 +29,7 @@ class GPBase(Model):
            self._Xscale = np.ones((1, self.input_dim))
        super(GPBase, self).__init__()
        #Model.__init__(self)
        # All leaf nodes should call self._set_params(self._get_params()) at
        # the end
--- a/GPy/core/sparse_gp.py
+++ b/GPy/core/sparse_gp.py
@ -81,6 +81,7 @@ class SparseGP(GPBase):
    def _computations(self):
        # factor Kmm
        self.Lm = jitchol(self.Kmm)
@ -107,17 +108,18 @@ class SparseGP(GPBase):
        self.B = np.eye(self.num_inducing) + self.A
        self.LB = jitchol(self.B)
-        # TODO: make a switch for either first compute psi1V, or VV.T
+        #VVT_factor is a matrix such that tdot(VVT_factor) = VVT...this is for efficiency!
-        self.psi1V = np.dot(self.psi1.T, self.likelihood.V)
+        self.psi1Vf = np.dot(self.psi1.T, self.likelihood.VVT_factor)
-        # back substutue C into psi1V
+        # back substutue C into psi1Vf
-        tmp, info1 = dtrtrs(self.Lm, np.asfortranarray(self.psi1V), lower=1, trans=0)
+        tmp, info1 = dtrtrs(self.Lm, np.asfortranarray(self.psi1Vf), lower=1, trans=0)
-        self._LBi_Lmi_psi1V, _ = dtrtrs(self.LB, np.asfortranarray(tmp), lower=1, trans=0)
+        self._LBi_Lmi_psi1Vf, _ = dtrtrs(self.LB, np.asfortranarray(tmp), lower=1, trans=0)
        tmp, info2 = dpotrs(self.LB, tmp, lower=1)
-        self.Cpsi1V, info3 = dtrtrs(self.Lm, tmp, lower=1, trans=1)
+        self.Cpsi1Vf, info3 = dtrtrs(self.Lm, tmp, lower=1, trans=1)
        # Compute dL_dKmm
-        tmp = tdot(self._LBi_Lmi_psi1V)
+        tmp = tdot(self._LBi_Lmi_psi1Vf)
        self.data_fit = np.trace(tmp)
        self.DBi_plus_BiPBi = backsub_both_sides(self.LB, self.output_dim * np.eye(self.num_inducing) + tmp)
        tmp = -0.5 * self.DBi_plus_BiPBi
        tmp += -0.5 * self.B * self.output_dim
@ -126,7 +128,7 @@ class SparseGP(GPBase):
        # Compute dL_dpsi # FIXME: this is untested for the heterscedastic + uncertain inputs case
        self.dL_dpsi0 = -0.5 * self.output_dim * (self.likelihood.precision * np.ones([self.num_data, 1])).flatten()
-        self.dL_dpsi1 = np.dot(self.Cpsi1V, self.likelihood.V.T).T
+        self.dL_dpsi1 = np.dot(self.likelihood.VVT_factor, self.Cpsi1Vf.T)
        dL_dpsi2_beta = 0.5 * backsub_both_sides(self.Lm, self.output_dim * np.eye(self.num_inducing) - self.DBi_plus_BiPBi)
        if self.likelihood.is_heteroscedastic:
@ -156,18 +158,18 @@ class SparseGP(GPBase):
            # likelihood is not heterscedatic
            self.partial_for_likelihood = -0.5 * self.num_data * self.output_dim * self.likelihood.precision + 0.5 * self.likelihood.trYYT * self.likelihood.precision ** 2
            self.partial_for_likelihood += 0.5 * self.output_dim * (self.psi0.sum() * self.likelihood.precision ** 2 - np.trace(self.A) * self.likelihood.precision)
-            self.partial_for_likelihood += self.likelihood.precision * (0.5 * np.sum(self.A * self.DBi_plus_BiPBi) - np.sum(np.square(self._LBi_Lmi_psi1V)))
+            self.partial_for_likelihood += self.likelihood.precision * (0.5 * np.sum(self.A * self.DBi_plus_BiPBi) - self.data_fit)
    def log_likelihood(self):
        """ Compute the (lower bound on the) log marginal likelihood """
        if self.likelihood.is_heteroscedastic:
-            A = -0.5 * self.num_data * self.output_dim * np.log(2.*np.pi) + 0.5 * np.sum(np.log(self.likelihood.precision)) - 0.5 * np.sum(self.likelihood.V * self.likelihood.Y)
+            A = -0.5 * self.num_data * self.output_dim * np.log(2.*np.pi) + 0.5 * np.sum(np.log(self.likelihood.precision)) - 0.5 * np.sum(self.likelihood.V*self.likelihood.Y)
            B = -0.5 * self.output_dim * (np.sum(self.likelihood.precision.flatten() * self.psi0) - np.trace(self.A))
        else:
            A = -0.5 * self.num_data * self.output_dim * (np.log(2.*np.pi) - np.log(self.likelihood.precision)) - 0.5 * self.likelihood.precision * self.likelihood.trYYT
            B = -0.5 * self.output_dim * (np.sum(self.likelihood.precision * self.psi0) - np.trace(self.A))
        C = -self.output_dim * (np.sum(np.log(np.diag(self.LB)))) # + 0.5 * self.num_inducing * np.log(sf2))
-        D = 0.5 * np.sum(np.square(self._LBi_Lmi_psi1V))
+        D = 0.5 * self.data_fit
        return A + B + C + D + self.likelihood.Z
    def _set_params(self, p):
@ -176,6 +178,7 @@ class SparseGP(GPBase):
        self.likelihood._set_params(p[self.Z.size + self.kern.num_params:])
        self._compute_kernel_matrices()
        self._computations()
        self.Cpsi1V = None
    def _get_params(self):
        return np.hstack([self.Z.flatten(), self.kern._get_params_transformed(), self.likelihood._get_params()])
@ -242,6 +245,14 @@ class SparseGP(GPBase):
        symmetrify(Bi)
        Kmmi_LmiBLmi = backsub_both_sides(self.Lm, np.eye(self.num_inducing) - Bi)
        if self.Cpsi1V is None:
            psi1V = np.dot(self.psi1.T,self.likelihood.V)
            tmp, _ = dtrtrs(self.Lm, np.asfortranarray(psi1V), lower=1, trans=0)
            tmp, _ = dpotrs(self.LB, tmp, lower=1)
            self.Cpsi1V, _ = dtrtrs(self.Lm, tmp, lower=1, trans=1)
        if X_variance_new is None:
            Kx = self.kern.K(self.Z, Xnew, which_parts=which_parts)
            mu = np.dot(Kx.T, self.Cpsi1V)
--- a/GPy/core/svigp.py
+++ b/GPy/core/svigp.py
@ -0,0 +1,466 @@
 # Copyright (c) 2012, James Hensman and Nicolo' Fusi
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 import numpy as np
 import pylab as pb
 from .. import kern
 from ..util.linalg import pdinv, mdot, tdot, dpotrs, dtrtrs, jitchol, backsub_both_sides
 from ..likelihoods import EP
 from gp_base import GPBase
 from model import Model
 import time
 import sys
 class SVIGP(GPBase):
    """
    Stochastic Variational inference in a Gaussian Process
    :param X: inputs
    :type X: np.ndarray (N x Q)
    :param Y: observed data
    :type Y: np.ndarray of observations (N x D)
    :param batchsize: the size of a h
    Additional kwargs are used as for a sparse GP. They include
    :param q_u: canonical parameters of the distribution squasehd into a 1D array
    :type q_u: np.ndarray
    :param M : Number of inducing points (optional, default 10. Ignored if Z is not None)
    :type M: int
    :param kernel : the kernel/covariance function. See link kernels
    :type kernel: a GPy kernel
    :param Z: inducing inputs (optional, see note)
    :type Z: np.ndarray (M x Q) | None
    :param X_uncertainty: The uncertainty in the measurements of X (Gaussian variance)
    :type X_uncertainty: np.ndarray (N x Q) | None
    :param Zslices: slices for the inducing inputs (see slicing TODO: link)
    :param M : Number of inducing points (optional, default 10. Ignored if Z is not None)
    :type M: int
    :param beta: noise precision. TODO> ignore beta if doing EP
    :type beta: float
    :param normalize_(X|Y) : whether to normalize the data before computing (predictions will be in original scales)
    :type normalize_(X|Y): bool
    """
    def __init__(self, X, likelihood, kernel, Z, q_u=None, batchsize=10, X_variance=None):
        GPBase.__init__(self, X, likelihood, kernel, normalize_X=False)
        self.batchsize=batchsize
        self.Y = self.likelihood.Y.copy()
        self.Z = Z
        self.num_inducing = Z.shape[0]
        self.batchcounter = 0
        self.epochs = 0
        self.iterations = 0
        self.vb_steplength = 0.05
        self.param_steplength = 1e-5
        self.momentum = 0.9
        if X_variance is None:
            self.has_uncertain_inputs = False
        else:
            self.has_uncertain_inputs = True
            self.X_variance = X_variance
        if q_u is None:
             q_u = np.hstack((np.random.randn(self.num_inducing*self.output_dim),-.5*np.eye(self.num_inducing).flatten()))
        self.set_vb_param(q_u)
        self._permutation = np.random.permutation(self.num_data)
        self.load_batch()
        self._param_trace = []
        self._ll_trace = []
        self._grad_trace = []
        #set the adaptive steplength parameters
        self.hbar_t = 0.0
        self.tau_t = 100.0
        self.gbar_t = 0.0
        self.gbar_t1 = 0.0
        self.gbar_t2 = 0.0
        self.hbar_tp = 0.0
        self.tau_tp = 10000.0
        self.gbar_tp = 0.0
        self.adapt_param_steplength = True
        self.adapt_vb_steplength = True
        self._param_steplength_trace = []
        self._vb_steplength_trace = []
    def _compute_kernel_matrices(self):
        # kernel computations, using BGPLVM notation
        self.Kmm = self.kern.K(self.Z)
        if self.has_uncertain_inputs:
            self.psi0 = self.kern.psi0(self.Z, self.X_batch, self.X_variance_batch)
            self.psi1 = self.kern.psi1(self.Z, self.X_batch, self.X_variance_batch)
            self.psi2 = self.kern.psi2(self.Z, self.X_batch, self.X_variance_batch)
        else:
            self.psi0 = self.kern.Kdiag(self.X_batch)
            self.psi1 = self.kern.K(self.X_batch, self.Z)
            self.psi2 = None
    def dL_dtheta(self):
        dL_dtheta = self.kern.dK_dtheta(self.dL_dKmm, self.Z)
        if self.has_uncertain_inputs:
            dL_dtheta += self.kern.dpsi0_dtheta(self.dL_dpsi0, self.Z, self.X_batch, self.X_variance_batch)
            dL_dtheta += self.kern.dpsi1_dtheta(self.dL_dpsi1, self.Z, self.X_batch, self.X_variance_batch)
            dL_dtheta += self.kern.dpsi2_dtheta(self.dL_dpsi2, self.Z, self.X_batch, self.X_variance_batch)
        else:
            dL_dtheta += self.kern.dK_dtheta(self.dL_dpsi1, self.X_batch, self.Z)
            dL_dtheta += self.kern.dKdiag_dtheta(self.dL_dpsi0, self.X_batch)
        return dL_dtheta
    def _set_params(self, p, computations=True):
        self.kern._set_params_transformed(p[:self.kern.num_params])
        self.likelihood._set_params(p[self.kern.num_params:])
        if computations:
            self._compute_kernel_matrices()
            self._computations()
    def _get_params(self):
        return np.hstack((self.kern._get_params_transformed() , self.likelihood._get_params()))
    def _get_param_names(self):
        return self.kern._get_param_names_transformed() + self.likelihood._get_param_names()
    def load_batch(self):
        """
        load a batch of data (set self.X_batch and self.likelihood.Y from self.X, self.Y)
        """
        #if we've seen all the data, start again with them in a new random order
        if self.batchcounter+self.batchsize > self.num_data:
            self.batchcounter = 0
            self.epochs += 1
            self._permutation = np.random.permutation(self.num_data)
        this_perm = self._permutation[self.batchcounter:self.batchcounter+self.batchsize]
        self.X_batch = self.X[this_perm]
        self.likelihood.set_data(self.Y[this_perm])
        if self.has_uncertain_inputs:
            self.X_variance_batch = self.X_variance[this_perm]
        self.batchcounter += self.batchsize
        self.data_prop = float(self.batchsize)/self.num_data
        self._compute_kernel_matrices()
        self._computations()
    def _computations(self,do_Kmm=True, do_Kmm_grad=True):
        """
        All of the computations needed. Some are optional, see kwargs.
        """
        if do_Kmm:
            self.Lm = jitchol(self.Kmm)
        # The rather complex computations of self.A
        if self.has_uncertain_inputs:
            if self.likelihood.is_heteroscedastic:
                psi2_beta = (self.psi2 * (self.likelihood.precision.flatten().reshape(self.batchsize, 1, 1))).sum(0)
            else:
                psi2_beta = self.psi2.sum(0) * self.likelihood.precision
            evals, evecs = linalg.eigh(psi2_beta)
            clipped_evals = np.clip(evals, 0., 1e6) # TODO: make clipping configurable
            tmp = evecs * np.sqrt(clipped_evals)
        else:
            if self.likelihood.is_heteroscedastic:
                tmp = self.psi1.T * (np.sqrt(self.likelihood.precision.flatten().reshape(1, self.batchsize)))
            else:
                tmp = self.psi1.T * (np.sqrt(self.likelihood.precision))
        tmp, _ = dtrtrs(self.Lm, np.asfortranarray(tmp), lower=1)
        self.A = tdot(tmp)
        self.V = self.likelihood.precision*self.likelihood.Y
        self.VmT = np.dot(self.V,self.q_u_expectation[0].T)
        self.psi1V = np.dot(self.psi1.T, self.V)
        self.B = np.eye(self.num_inducing)*self.data_prop + self.A
        self.Lambda = backsub_both_sides(self.Lm, self.B.T)
        self.LQL = backsub_both_sides(self.Lm,self.q_u_expectation[1].T,transpose='right')
        self.trace_K = self.psi0.sum() - np.trace(self.A)/self.likelihood.precision
        self.Kmmi_m, _ = dpotrs(self.Lm, self.q_u_expectation[0], lower=1)
        self.projected_mean = np.dot(self.psi1,self.Kmmi_m)
        # Compute dL_dpsi
        self.dL_dpsi0 = - 0.5 * self.output_dim * self.likelihood.precision * np.ones(self.batchsize)
        self.dL_dpsi1, _ = dpotrs(self.Lm,np.asfortranarray(self.VmT.T),lower=1)
        self.dL_dpsi1 = self.dL_dpsi1.T
        dL_dpsi2 = -0.5 * self.likelihood.precision * backsub_both_sides(self.Lm, self.LQL - self.output_dim * np.eye(self.num_inducing))
        if self.has_uncertain_inputs:
            self.dL_dpsi2 = np.repeat(dL_dpsi2[None,:,:],self.batchsize,axis=0)
        else:
            self.dL_dpsi1 += 2.*np.dot(dL_dpsi2,self.psi1.T).T
            self.dL_dpsi2 = None
        # Compute dL_dKmm
        if do_Kmm_grad:
            tmp = np.dot(self.LQL,self.A) - backsub_both_sides(self.Lm,np.dot(self.q_u_expectation[0],self.psi1V.T),transpose='right')
            tmp += tmp.T
            tmp += -self.output_dim*self.B
            tmp += self.data_prop*self.LQL
            self.dL_dKmm = 0.5*backsub_both_sides(self.Lm,tmp)
        #Compute the gradient of the log likelihood wrt noise variance
        self.partial_for_likelihood =  -0.5*(self.batchsize*self.output_dim - np.sum(self.A*self.LQL))*self.likelihood.precision
        self.partial_for_likelihood +=  (0.5*self.output_dim*self.trace_K + 0.5 * self.likelihood.trYYT - np.sum(self.likelihood.Y*self.projected_mean))*self.likelihood.precision**2
    def log_likelihood(self):
        """
        As for uncollapsed sparse GP, but account for the proportion of data we're looking at right now.
        NB. self.batchsize is the size of the batch, not the size of X_all
        """
        assert not self.likelihood.is_heteroscedastic
        A = -0.5*self.batchsize*self.output_dim*(np.log(2.*np.pi) - np.log(self.likelihood.precision))
        B = -0.5*self.likelihood.precision*self.output_dim*self.trace_K
        Kmm_logdet = 2.*np.sum(np.log(np.diag(self.Lm)))
        C = -0.5*self.output_dim*self.data_prop*(Kmm_logdet-self.q_u_logdet - self.num_inducing)
        C += -0.5*np.sum(self.LQL * self.B)
        D = -0.5*self.likelihood.precision*self.likelihood.trYYT
        E = np.sum(self.V*self.projected_mean)
        return (A+B+C+D+E)/self.data_prop
    def _log_likelihood_gradients(self):
        return np.hstack((self.dL_dtheta(), self.likelihood._gradients(partial=self.partial_for_likelihood)))/self.data_prop
    def vb_grad_natgrad(self):
        """
        Compute the gradients of the lower bound wrt the canonical and
        Expectation parameters of u.
        Note that the natural gradient in either is given by the gradient in the other (See Hensman et al 2012 Fast Variational inference in the conjugate exponential Family)
        """
        # Gradient for eta
        dL_dmmT_S = -0.5*self.Lambda/self.data_prop + 0.5*self.q_u_prec
        Kmmipsi1V,_ = dpotrs(self.Lm,self.psi1V,lower=1)
        dL_dm = (Kmmipsi1V - np.dot(self.Lambda,self.q_u_mean))/self.data_prop
        # Gradients for theta
        S = self.q_u_cov
        Si = self.q_u_prec
        m = self.q_u_mean
        dL_dSi = -mdot(S,dL_dmmT_S, S)
        dL_dmhSi = -2*dL_dSi
        dL_dSim = np.dot(dL_dSi,m) + np.dot(Si, dL_dm)
        return np.hstack((dL_dm.flatten(),dL_dmmT_S.flatten())) , np.hstack((dL_dSim.flatten(), dL_dmhSi.flatten()))
    def optimize(self, iterations, print_interval=10, callback=lambda:None, callback_interval=5):
        param_step = 0.
        #Iterate!
        for i in range(iterations):
            #store the current configuration for plotting later
            self._param_trace.append(self._get_params())
            self._ll_trace.append(self.log_likelihood() + self.log_prior())
            #load a batch
            self.load_batch()
            #compute the (stochastic) gradient
            natgrads = self.vb_grad_natgrad()
            grads = self._transform_gradients(self._log_likelihood_gradients() + self._log_prior_gradients())
            self._grad_trace.append(grads)
            #compute the steps in all parameters
            vb_step = self.vb_steplength*natgrads[0]
            if (self.epochs>=1):#only move the parameters after the first epoch
                param_step = self.momentum*param_step + self.param_steplength*grads
            else:
                param_step = 0.
            self.set_vb_param(self.get_vb_param() + vb_step)
            #Note: don't recompute everything here, wait until the next iteration when we have a new batch
            self._set_params(self._untransform_params(self._get_params_transformed() + param_step), computations=False)
            #print messages if desired
            if i and (not i%print_interval):
                print i, np.mean(self._ll_trace[-print_interval:]) #, self.log_likelihood()
                print np.round(np.mean(self._grad_trace[-print_interval:],0),3)
                sys.stdout.flush()
            #callback
            if i and not i%callback_interval:
                callback()
                time.sleep(0.1)
            if self.epochs > 10:
                self._adapt_steplength()
            self.iterations += 1
    def _adapt_steplength(self):
        if self.adapt_vb_steplength:
            # self._adaptive_vb_steplength()
            self._adaptive_vb_steplength_KL()
        self._vb_steplength_trace.append(self.vb_steplength)
        assert self.vb_steplength > 0
        if self.adapt_param_steplength:
            # self._adaptive_param_steplength()
            # self._adaptive_param_steplength_log()
            self._adaptive_param_steplength_from_vb()
        self._param_steplength_trace.append(self.param_steplength)
    def _adaptive_param_steplength(self):
        decr_factor = 0.1
        g_tp = self._transform_gradients(self._log_likelihood_gradients())
        self.gbar_tp = (1-1/self.tau_tp)*self.gbar_tp + 1/self.tau_tp * g_tp
        self.hbar_tp = (1-1/self.tau_tp)*self.hbar_tp + 1/self.tau_tp * np.dot(g_tp.T, g_tp)
        new_param_steplength = np.dot(self.gbar_tp.T, self.gbar_tp) / self.hbar_tp
        #- hack
        new_param_steplength *= decr_factor
        self.param_steplength = (self.param_steplength + new_param_steplength)/2
        #-
        self.tau_tp = self.tau_tp*(1-self.param_steplength) + 1
    def _adaptive_param_steplength_log(self):
        stp = np.logspace(np.log(0.0001), np.log(1e-6), base=np.e, num=18000)
        self.param_steplength = stp[self.iterations]
    def _adaptive_param_steplength_log2(self):
        self.param_steplength = (self.iterations + 0.001)**-0.5
    def _adaptive_param_steplength_from_vb(self):
        self.param_steplength = self.vb_steplength * 0.01
    def _adaptive_vb_steplength(self):
        decr_factor = 0.1
        g_t = self.vb_grad_natgrad()[0]
        self.gbar_t = (1-1/self.tau_t)*self.gbar_t + 1/self.tau_t * g_t
        self.hbar_t = (1-1/self.tau_t)*self.hbar_t + 1/self.tau_t * np.dot(g_t.T, g_t)
        new_vb_steplength = np.dot(self.gbar_t.T, self.gbar_t) / self.hbar_t
        #- hack
        new_vb_steplength *= decr_factor
        self.vb_steplength = (self.vb_steplength + new_vb_steplength)/2
        #-
        self.tau_t = self.tau_t*(1-self.vb_steplength) + 1
    def _adaptive_vb_steplength_KL(self):
        decr_factor = 1 #0.1
        natgrad = self.vb_grad_natgrad()
        g_t1 = natgrad[0]
        g_t2 = natgrad[1]
        self.gbar_t1 = (1-1/self.tau_t)*self.gbar_t1 + 1/self.tau_t * g_t1
        self.gbar_t2 = (1-1/self.tau_t)*self.gbar_t2 + 1/self.tau_t * g_t2
        self.hbar_t = (1-1/self.tau_t)*self.hbar_t + 1/self.tau_t * np.dot(g_t1.T, g_t2)
        self.vb_steplength = np.dot(self.gbar_t1.T, self.gbar_t2) / self.hbar_t
        self.vb_steplength *= decr_factor
        self.tau_t = self.tau_t*(1-self.vb_steplength) + 1
    def _raw_predict(self, X_new, X_variance_new=None, which_parts='all',full_cov=False):
        """Internal helper function for making predictions, does not account for normalization"""
        #TODO: make this more efficient!
        self.Kmmi, self.Lm, self.Lmi, self.Kmm_logdet = pdinv(self.Kmm)
        tmp = self.Kmmi- mdot(self.Kmmi,self.q_u_cov,self.Kmmi)
        if X_variance_new is None:
            Kx = self.kern.K(X_new,self.Z)
            mu = np.dot(Kx,self.Kmmi_m)
            if full_cov:
                Kxx = self.kern.K(X_new)
                var = Kxx - mdot(Kx,tmp,Kx.T)
            else:
                Kxx = self.kern.Kdiag(X_new)
                var = (Kxx - np.sum(Kx*np.dot(Kx,tmp),1))[:,None]
            return mu, var
        else:
            assert X_variance_new.shape == X_new.shape
            Kx = self.kern.psi1(self.Z,X_new, X_variance_new)
            mu = np.dot(Kx,self.Kmmi_m)
            Kxx = self.kern.psi0(self.Z,X_new,X_variance_new)
            psi2 = self.kern.psi2(self.Z,X_new,X_variance_new)
            diag_var = Kxx - np.sum(np.sum(psi2*tmp[None,:,:],1),1)
            if full_cov:
                raise NotImplementedError
            else:
                return mu, diag_var[:,None]
    def predict(self, Xnew, X_variance_new=None, which_parts='all', full_cov=False):
        # normalize X values
        Xnew = (Xnew.copy() - self._Xoffset) / self._Xscale
        if X_variance_new is not None:
            X_variance_new = X_variance_new / self._Xscale ** 2
        # here's the actual prediction by the GP model
        mu, var = self._raw_predict(Xnew, X_variance_new, full_cov=full_cov, which_parts=which_parts)
        # now push through likelihood
        mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov)
        return mean, var, _025pm, _975pm
    def set_vb_param(self,vb_param):
        """set the distribution q(u) from the canonical parameters"""
        self.q_u_canonical_flat = vb_param.copy()
        self.q_u_canonical = self.q_u_canonical_flat[:self.num_inducing*self.output_dim].reshape(self.num_inducing,self.output_dim),self.q_u_canonical_flat[self.num_inducing*self.output_dim:].reshape(self.num_inducing,self.num_inducing)
        self.q_u_prec = -2.*self.q_u_canonical[1]
        self.q_u_cov, q_u_Li, q_u_L, tmp = pdinv(self.q_u_prec)
        self.q_u_Li = q_u_Li
        self.q_u_logdet = -tmp
        self.q_u_mean, _ = dpotrs(q_u_Li, np.asfortranarray(self.q_u_canonical[0]),lower=1)
        self.q_u_expectation = (self.q_u_mean, np.dot(self.q_u_mean,self.q_u_mean.T)+self.q_u_cov*self.output_dim)
    def get_vb_param(self):
        """
        Return the canonical parameters of the distribution q(u)
        """
        return self.q_u_canonical_flat
    def plot(self, ax=None, fignum=None, Z_height=None, **kwargs):
        if ax is None:
            fig = pb.figure(num=fignum)
            ax = fig.add_subplot(111)
        #horrible hack here:
        data = self.likelihood.data.copy()
        self.likelihood.data = self.Y
        GPBase.plot(self, ax=ax, **kwargs)
        self.likelihood.data = data
        Zu = self.Z * self._Xscale + self._Xoffset
        if self.input_dim==1:
            ax.plot(self.X_batch, self.likelihood.data, 'gx',mew=2)
            if Z_height is None:
                Z_height = ax.get_ylim()[0]
            ax.plot(Zu, np.zeros_like(Zu) + Z_height, 'r|', mew=1.5, markersize=12)
        if self.input_dim==2:
            ax.scatter(self.X_all[:,0], self.X_all[:,1], 20., self.Y[:,0], linewidth=0, cmap=pb.cm.jet)
            ax.plot(Zu[:,0], Zu[:,1], 'w^')
    def plot_traces(self):
        pb.figure()
        t = np.array(self._param_trace)
        pb.subplot(2,1,1)
        for l,ti in zip(self._get_param_names(),t.T):
            if not l[:3]=='iip':
                pb.plot(ti,label=l)
        pb.legend(loc=0)
        pb.subplot(2,1,2)
        pb.plot(np.asarray(self._ll_trace),label='stochastic likelihood')
        pb.legend(loc=0)
--- a/GPy/examples/init.py
+++ b/GPy/examples/init.py
@ -5,3 +5,4 @@ import classification
 import regression
 import dimensionality_reduction
 import tutorials
 import stochastic
--- a/GPy/examples/classification.py
+++ b/GPy/examples/classification.py
@ -25,7 +25,6 @@ def crescent_data(seed=default_seed): # FIXME
    Y[Y.flatten()==-1] = 0
    m = GPy.models.GPClassification(data['X'], Y)
    m.ensure_default_constraints()
    #m.update_likelihood_approximation()
    #m.optimize()
    m.pseudo_EM()
@ -75,7 +74,6 @@ def toy_linear_1d_classification(seed=default_seed):
    # Model definition
    m = GPy.models.GPClassification(data['X'], Y)
    m.ensure_default_constraints()
    # Optimize
    #m.update_likelihood_approximation()
@ -106,7 +104,6 @@ def sparse_toy_linear_1d_classification(num_inducing=10,seed=default_seed):
    m = GPy.models.SparseGPClassification(data['X'], Y,num_inducing=num_inducing)
    m['.*len']= 4.
    m.ensure_default_constraints()
    # Optimize
    #m.update_likelihood_approximation()
    # Parameters optimization:
@ -137,7 +134,6 @@ def sparse_crescent_data(num_inducing=10, seed=default_seed):
    Y[Y.flatten()==-1]=0
    m = GPy.models.SparseGPClassification(data['X'], Y,num_inducing=num_inducing)
    m.ensure_default_constraints()
    m['.*len'] = 10.
    #m.update_likelihood_approximation()
    #m.optimize()
@ -163,7 +159,6 @@ def FITC_crescent_data(num_inducing=10, seed=default_seed):
    m = GPy.models.FITCClassification(data['X'], Y,num_inducing=num_inducing)
    m.constrain_bounded('.*len',1.,1e3)
    m.ensure_default_constraints()
    m['.*len'] = 3.
    #m.update_likelihood_approximation()
    #m.optimize()
--- a/GPy/examples/dimensionality_reduction.py
+++ b/GPy/examples/dimensionality_reduction.py
@ -37,7 +37,6 @@ def BGPLVM(seed=default_seed):
    # m.optimize(messages = 1)
    # m.plot()
    # pb.title('After optimisation')
    m.ensure_default_constraints()
    m.randomize()
    m.checkgrad(verbose=1)
@ -53,7 +52,6 @@ def GPLVM_oil_100(optimize=True):
    m.data_labels = data['Y'].argmax(axis=1)
    # optimize
    m.ensure_default_constraints()
    if optimize:
        m.optimize('scg', messages=1)
@ -108,7 +106,6 @@ def swiss_roll(optimize=True, N=1000, num_inducing=15, Q=4, sigma=.2, plot=False
    m.data_colors = c
    m.data_t = t
    m.ensure_default_constraints()
    m['rbf_lengthscale'] = 1. # X.var(0).max() / X.var(0)
    m['noise_variance'] = Y.var() / 100.
    m['bias_variance'] = 0.05
@ -134,7 +131,6 @@ def BGPLVM_oil(optimize=True, N=200, Q=10, num_inducing=15, max_f_eval=50, plot=
    m['.*lengt'] = 1. # m.X.var(0).max() / m.X.var(0)
    m['noise'] = Yn.var() / 100.
    m.ensure_default_constraints()
    # optimize
    if optimize:
@ -159,7 +155,6 @@ def oil_100():
    m = GPy.models.GPLVM(data['X'], 2)
    # optimize
    m.ensure_default_constraints()
    m.optimize(messages=1, max_iters=2)
    # plot
@ -239,7 +234,6 @@ def bgplvm_simulation_matlab_compare():
 #                        X=mu,
 #                        X_variance=S,
                       _debug=False)
    m.ensure_default_constraints()
    m.auto_scale_factor = True
    m['noise'] = Y.var() / 100.
    m['linear_variance'] = .01
@ -261,8 +255,8 @@ def bgplvm_simulation(optimize='scg',
    k = kern.linear(Q, ARD=True) + kern.bias(Q, np.exp(-2)) + kern.white(Q, np.exp(-2)) # + kern.bias(Q)
    m = BayesianGPLVM(Y, Q, init="PCA", num_inducing=num_inducing, kernel=k, _debug=True)
    # m.constrain('variance|noise', logexp_clipped())
    m.ensure_default_constraints()
    m['noise'] = Y.var() / 100.
    m['linear_variance'] = .01
@ -291,7 +285,6 @@ def mrd_simulation(optimize=True, plot=True, plot_sim=True, **kw):
    for i, Y in enumerate(Ylist):
        m['{}_noise'.format(i + 1)] = Y.var() / 100.
    m.ensure_default_constraints()
    # DEBUG
    # np.seterr("raise")
@ -319,7 +312,6 @@ def brendan_faces():
    # optimize
    m.constrain('rbf|noise|white', GPy.core.transformations.logexp_clipped())
    m.ensure_default_constraints()
    m.optimize('scg', messages=1, max_f_eval=10000)
    ax = m.plot_latent(which_indices=(0, 1))
@ -327,28 +319,55 @@ def brendan_faces():
    data_show = GPy.util.visualize.image_show(y[None, :], dimensions=(20, 28), transpose=True, invert=False, scale=False)
    lvm_visualizer = GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
    raw_input('Press enter to finish')
    lvm_visualizer.close()
    return m
 def stick_play(range=None, frame_rate=15):
    data = GPy.util.datasets.stick()
    # optimize
    if range==None:
        Y = data['Y'].copy()
    else:
        Y = data['Y'][range[0]:range[1], :].copy()
    y = Y[0, :]
    data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect'])
    GPy.util.visualize.data_play(Y, data_show, frame_rate)
    return Y
 def stick():
    data = GPy.util.datasets.stick()
    m = GPy.models.GPLVM(data['Y'], 2)
    # optimize
-    m.ensure_default_constraints()
+    m = GPy.models.GPLVM(data['Y'], 2)
    m.optimize(messages=1, max_f_eval=10000)
    m._set_params(m._get_params())
-
+    plt.clf
    ax = m.plot_latent()
    y = m.likelihood.Y[0, :]
    data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect'])
    lvm_visualizer = GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
    raw_input('Press enter to finish')
    lvm_visualizer.close()
    return m
 def stick_bgplvm(model=None):
    data = GPy.util.datasets.stick()
    Q = 6
    kernel = GPy.kern.rbf(Q, ARD=True) + GPy.kern.bias(Q, np.exp(-2)) + GPy.kern.white(Q, np.exp(-2))
    m = BayesianGPLVM(data['Y'], Q, init="PCA", num_inducing=20,kernel=kernel)
    # optimize
    m.ensure_default_constraints()
    m.optimize(messages=1, max_f_eval=3000,xtol=1e-300,ftol=1e-300)
    m._set_params(m._get_params())
    plt.clf, (latent_axes, sense_axes) = plt.subplots(1, 2)
    plt.sca(latent_axes)
    m.plot_latent()
    y = m.likelihood.Y[0, :].copy()
    data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect'])
    lvm_visualizer = GPy.util.visualize.lvm_dimselect(m.X[0, :].copy(), m, data_show, latent_axes=latent_axes, sense_axes=sense_axes)
    raw_input('Press enter to finish')
    return m
 def cmu_mocap(subject='35', motion=['01'], in_place=True):
    data = GPy.util.datasets.cmu_mocap(subject, motion)
@ -359,7 +378,6 @@ def cmu_mocap(subject='35', motion=['01'], in_place=True):
    m = GPy.models.GPLVM(data['Y'], 2, normalize_Y=True)
    # optimize
    m.ensure_default_constraints()
    m.optimize(messages=1, max_f_eval=10000)
    ax = m.plot_latent()
@ -391,7 +409,6 @@ def cmu_mocap(subject='35', motion=['01'], in_place=True):
 #     m.set('iip', Z)
 #     m.set('bias', 1e-4)
 #     # optimize
 #     # m.ensure_default_constraints()
 #
 #     import pdb; pdb.set_trace()
 #     m.optimize('tnc', messages=1)
--- a/GPy/examples/regression.py
+++ b/GPy/examples/regression.py
@ -18,7 +18,6 @@ def toy_rbf_1d(optimizer='tnc', max_nb_eval_optim=100):
    m = GPy.models.GPRegression(data['X'],data['Y'])
    # optimize
    m.ensure_default_constraints()
    m.optimize(optimizer, max_f_eval=max_nb_eval_optim)
    # plot
    m.plot()
@ -36,7 +35,6 @@ def rogers_girolami_olympics(optim_iters=100):
    m['rbf_lengthscale'] = 10
    # optimize
    m.ensure_default_constraints()
    m.optimize(max_f_eval=optim_iters)
    # plot
@ -52,7 +50,6 @@ def toy_rbf_1d_50(optim_iters=100):
    m = GPy.models.GPRegression(data['X'],data['Y'])
    # optimize
    m.ensure_default_constraints()
    m.optimize(max_f_eval=optim_iters)
    # plot
@ -68,7 +65,6 @@ def silhouette(optim_iters=100):
    m = GPy.models.GPRegression(data['X'],data['Y'])
    # optimize
    m.ensure_default_constraints()
    m.optimize(messages=True,max_f_eval=optim_iters)
    print(m)
@ -92,7 +88,6 @@ def coregionalisation_toy2(optim_iters=100):
    m = GPy.models.GPRegression(X,Y,kernel=k)
    m.constrain_fixed('.*rbf_var',1.)
    #m.constrain_positive('.*kappa')
    m.ensure_default_constraints()
    m.optimize('sim',messages=1,max_f_eval=optim_iters)
    pb.figure()
@ -124,7 +119,6 @@ def coregionalisation_toy(optim_iters=100):
    m = GPy.models.GPRegression(X,Y,kernel=k)
    m.constrain_fixed('.*rbf_var',1.)
    #m.constrain_positive('kappa')
    m.ensure_default_constraints()
    m.optimize(max_f_eval=optim_iters)
    pb.figure()
@ -162,7 +156,6 @@ def coregionalisation_sparse(optim_iters=100):
    m.constrain_fixed('.*rbf_var',1.)
    m.constrain_fixed('iip')
    m.constrain_bounded('noise_variance',1e-3,1e-1)
    m.ensure_default_constraints()
    m.optimize_restarts(5, robust=True, messages=1, max_f_eval=optim_iters)
    #plotting:
@ -189,11 +182,9 @@ def multiple_optima(gene_number=937,resolution=80, model_restarts=10, seed=10000
    log_SNRs = np.linspace(-3., 4., resolution)
    data = GPy.util.datasets.della_gatta_TRP63_gene_expression(gene_number)
    # Sub sample the data to ensure multiple optima
    #data['Y'] = data['Y'][0::2, :]
    #data['X'] = data['X'][0::2, :]
    # Remove the mean (no bias kernel to ensure signal/noise is in RBF/white)
    data['Y'] = data['Y'] - np.mean(data['Y'])
    lls = GPy.examples.regression._contour_data(data, length_scales, log_SNRs, GPy.kern.rbf)
@ -220,7 +211,6 @@ def multiple_optima(gene_number=937,resolution=80, model_restarts=10, seed=10000
        optim_point_y[0] = np.log10(m['rbf_variance']) - np.log10(m['noise_variance']);
        # optimize
        m.ensure_default_constraints()
        m.optimize('scg', xtol=1e-6, ftol=1e-6, max_f_eval=optim_iters)
        optim_point_x[1] = m['rbf_lengthscale']
@ -273,7 +263,6 @@ def sparse_GP_regression_1D(N = 400, num_inducing = 5, optim_iters=100):
    # create simple GP Model
    m = GPy.models.SparseGPRegression(X, Y, kernel, num_inducing=num_inducing)
    m.ensure_default_constraints()
    m.checkgrad(verbose=1)
    m.optimize('tnc', messages = 1, max_f_eval=optim_iters)
@ -294,7 +283,6 @@ def sparse_GP_regression_2D(N = 400, num_inducing = 50, optim_iters=100):
    m = GPy.models.SparseGPRegression(X,Y,kernel, num_inducing = num_inducing)
    # contrain all parameters to be positive (but not inducing inputs)
    m.ensure_default_constraints()
    m.set('.*len',2.)
    m.checkgrad()
@ -320,7 +308,6 @@ def uncertain_inputs_sparse_regression(optim_iters=100):
    # create simple GP Model - no input uncertainty on this one
    m = GPy.models.SparseGPRegression(X, Y, kernel=k, Z=Z)
    m.ensure_default_constraints()
    m.optimize('scg', messages=1, max_f_eval=optim_iters)
    m.plot(ax=axes[0])
    axes[0].set_title('no input uncertainty')
@ -328,7 +315,6 @@ def uncertain_inputs_sparse_regression(optim_iters=100):
    #the same Model with uncertainty
    m = GPy.models.SparseGPRegression(X, Y, kernel=k, Z=Z, X_variance=S)
    m.ensure_default_constraints()
    m.optimize('scg', messages=1, max_f_eval=optim_iters)
    m.plot(ax=axes[1])
    axes[1].set_title('with input uncertainty')
--- a/GPy/examples/stochastic.py
+++ b/GPy/examples/stochastic.py
@ -0,0 +1,40 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 import pylab as pb
 import numpy as np
 import GPy
 def toy_1d():
    N = 2000
    M = 20
    #create data
    X = np.linspace(0,32,N)[:,None]
    Z = np.linspace(0,32,M)[:,None]
    Y = np.sin(X) + np.cos(0.3*X) + np.random.randn(*X.shape)/np.sqrt(50.)
    m = GPy.models.SVIGPRegression(X,Y, batchsize=10, Z=Z)
    m.constrain_bounded('noise_variance',1e-3,1e-1)
    m.param_steplength = 1e-4
    fig = pb.figure()
    ax = fig.add_subplot(111)
    def cb():
        ax.cla()
        m.plot(ax=ax,Z_height=-3)
        ax.set_ylim(-3,3)
        fig.canvas.draw()
    m.optimize(500, callback=cb, callback_interval=1)
    m.plot_traces()
    return m
--- a/GPy/examples/tutorials.py
+++ b/GPy/examples/tutorials.py
@ -24,7 +24,6 @@ def tuto_GP_regression():
    print m
    m.plot()
    m.ensure_default_constraints() 
    m.constrain_positive('')
    m.unconstrain('')               # may be used to remove the previous constrains
@ -135,7 +134,6 @@ def tuto_kernel_overview():
    pb.ylabel("+   ",rotation='horizontal',fontsize='30')
    m.plot(ax=axs, which_parts=[False,False,False,True])
    m.ensure_default_constraints()
    return(m)
@ -144,6 +142,5 @@ def model_interaction():
    Y = np.sin(X) + np.random.randn(*X.shape)*0.01 + 5.
    k = GPy.kern.rbf(1) + GPy.kern.bias(1)
    m = GPy.models.GPRegression(X, Y, kernel=k)
    m.ensure_default_constraints()
    return m
--- a/GPy/kern/init.py
+++ b/GPy/kern/init.py
@ -1,8 +1,7 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
-
+from constructors import *
 from constructors import rbf, Matern32, Matern52, exponential, linear, white, bias, finite_dimensional, spline, Brownian, periodic_exponential, periodic_Matern32, periodic_Matern52, prod, symmetric, Coregionalise, rational_quadratic, Fixed, rbfcos, IndependentOutputs
 try:
    from constructors import rbf_sympy, sympykern # these depend on sympy
 except:
--- a/GPy/kern/constructors.py
+++ b/GPy/kern/constructors.py
@ -1,33 +1,9 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 import numpy as np
 from kern import kern
-
+import parts
 from rbf import rbf as rbfpart
 from white import white as whitepart
 from linear import linear as linearpart
 from exponential import exponential as exponentialpart
 from Matern32 import Matern32 as Matern32part
 from Matern52 import Matern52 as Matern52part
 from bias import bias as biaspart
 from fixed import Fixed as fixedpart
 from finite_dimensional import finite_dimensional as finite_dimensionalpart
 from spline import spline as splinepart
 from Brownian import Brownian as Brownianpart
 from periodic_exponential import periodic_exponential as periodic_exponentialpart
 from periodic_Matern32 import periodic_Matern32 as periodic_Matern32part
 from periodic_Matern52 import periodic_Matern52 as periodic_Matern52part
 from prod import prod as prodpart
 from symmetric import symmetric as symmetric_part
 from coregionalise import Coregionalise as coregionalise_part
 from rational_quadratic import rational_quadratic as rational_quadraticpart
 from rbfcos import rbfcos as rbfcospart
 from independent_outputs import IndependentOutputs as independent_output_part
 #TODO these s=constructors are not as clean as we'd like. Tidy the code up
 #using meta-classes to make the objects construct properly wthout them.
 def rbf(input_dim,variance=1., lengthscale=None,ARD=False):
    """
@ -42,7 +18,7 @@ def rbf(input_dim,variance=1., lengthscale=None,ARD=False):
    :param ARD: Auto Relevance Determination (one lengthscale per dimension)
    :type ARD: Boolean
    """
-    part = rbfpart(input_dim,variance,lengthscale,ARD)
+    part = parts.rbf.RBF(input_dim,variance,lengthscale,ARD)
    return kern(input_dim, [part])
 def linear(input_dim,variances=None,ARD=False):
@ -55,7 +31,7 @@ def linear(input_dim,variances=None,ARD=False):
     variances (np.ndarray)
     ARD (boolean)
    """
-    part = linearpart(input_dim,variances,ARD)
+    part = parts.linear.Linear(input_dim,variances,ARD)
    return kern(input_dim, [part])
 def white(input_dim,variance=1.):
@ -67,7 +43,7 @@ def white(input_dim,variance=1.):
    input_dimD (int), obligatory
     variance (float)
    """
-    part = whitepart(input_dim,variance)
+    part = parts.white.White(input_dim,variance)
    return kern(input_dim, [part])
 def exponential(input_dim,variance=1., lengthscale=None, ARD=False):
@ -83,7 +59,7 @@ def exponential(input_dim,variance=1., lengthscale=None, ARD=False):
    :param ARD: Auto Relevance Determination (one lengthscale per dimension)
    :type ARD: Boolean
    """
-    part = exponentialpart(input_dim,variance, lengthscale, ARD)
+    part = parts.exponential.Exponential(input_dim,variance, lengthscale, ARD)
    return kern(input_dim, [part])
 def Matern32(input_dim,variance=1., lengthscale=None, ARD=False):
@ -99,7 +75,7 @@ def Matern32(input_dim,variance=1., lengthscale=None, ARD=False):
    :param ARD: Auto Relevance Determination (one lengthscale per dimension)
    :type ARD: Boolean
    """
-    part = Matern32part(input_dim,variance, lengthscale, ARD)
+    part = parts.Matern32.Matern32(input_dim,variance, lengthscale, ARD)
    return kern(input_dim, [part])
 def Matern52(input_dim, variance=1., lengthscale=None, ARD=False):
@ -115,7 +91,7 @@ def Matern52(input_dim, variance=1., lengthscale=None, ARD=False):
    :param ARD: Auto Relevance Determination (one lengthscale per dimension)
    :type ARD: Boolean
    """
-    part = Matern52part(input_dim, variance, lengthscale, ARD)
+    part = parts.Matern52.Matern52(input_dim, variance, lengthscale, ARD)
    return kern(input_dim, [part])
 def bias(input_dim, variance=1.):
@ -127,7 +103,7 @@ def bias(input_dim, variance=1.):
     input_dim (int), obligatory
     variance (float)
    """
-    part = biaspart(input_dim, variance)
+    part = parts.bias.Bias(input_dim, variance)
    return kern(input_dim, [part])
 def finite_dimensional(input_dim, F, G, variances=1., weights=None):
@ -138,7 +114,7 @@ def finite_dimensional(input_dim, F, G, variances=1., weights=None):
    G: np.array with shape (n,n) - the Gram matrix associated to F
    variances : np.ndarray with shape (n,)
    """
-    part = finite_dimensionalpart(input_dim, F, G, variances, weights)
+    part = parts.finite_dimensional.FiniteDimensional(input_dim, F, G, variances, weights)
    return kern(input_dim, [part])
 def spline(input_dim, variance=1.):
@ -150,7 +126,7 @@ def spline(input_dim, variance=1.):
    :param variance: the variance of the kernel
    :type variance: float
    """
-    part = splinepart(input_dim, variance)
+    part = parts.spline.Spline(input_dim, variance)
    return kern(input_dim, [part])
 def Brownian(input_dim, variance=1.):
@ -162,7 +138,7 @@ def Brownian(input_dim, variance=1.):
    :param variance: the variance of the kernel
    :type variance: float
    """
-    part = Brownianpart(input_dim, variance)
+    part = parts.Brownian.Brownian(input_dim, variance)
    return kern(input_dim, [part])
 try:
@ -215,7 +191,7 @@ def periodic_exponential(input_dim=1, variance=1., lengthscale=None, period=2 *
    :param n_freq: the number of frequencies considered for the periodic subspace
    :type n_freq: int
    """
-    part = periodic_exponentialpart(input_dim, variance, lengthscale, period, n_freq, lower, upper)
+    part = parts.periodic_exponential.PeriodicExponential(input_dim, variance, lengthscale, period, n_freq, lower, upper)
    return kern(input_dim, [part])
 def periodic_Matern32(input_dim, variance=1., lengthscale=None, period=2 * np.pi, n_freq=10, lower=0., upper=4 * np.pi):
@ -233,7 +209,7 @@ def periodic_Matern32(input_dim, variance=1., lengthscale=None, period=2 * np.pi
     :param n_freq: the number of frequencies considered for the periodic subspace
     :type n_freq: int
    """
-    part = periodic_Matern32part(input_dim, variance, lengthscale, period, n_freq, lower, upper)
+    part = parts.periodic_Matern32.PeriodicMatern32(input_dim, variance, lengthscale, period, n_freq, lower, upper)
    return kern(input_dim, [part])
 def periodic_Matern52(input_dim, variance=1., lengthscale=None, period=2 * np.pi, n_freq=10, lower=0., upper=4 * np.pi):
@ -251,7 +227,7 @@ def periodic_Matern52(input_dim, variance=1., lengthscale=None, period=2 * np.pi
     :param n_freq: the number of frequencies considered for the periodic subspace
     :type n_freq: int
    """
-    part = periodic_Matern52part(input_dim, variance, lengthscale, period, n_freq, lower, upper)
+    part = parts.periodic_Matern52part(input_dim, variance, lengthscale, period, n_freq, lower, upper)
    return kern(input_dim, [part])
 def prod(k1,k2,tensor=False):
@ -262,7 +238,7 @@ def prod(k1,k2,tensor=False):
    :type k1, k2: kernpart
    :rtype: kernel object
    """
-    part = prodpart(k1,k2,tensor)
+    part = parts.prodpart(k1,k2,tensor)
    return kern(part.input_dim, [part])
 def symmetric(k):
@ -270,11 +246,11 @@ def symmetric(k):
    Construct a symmetrical kernel from an existing kernel
    """
    k_ = k.copy()
-    k_.parts = [symmetric_part(p) for p in k.parts]
+    k_.parts = [symmetric.Symmetric(p) for p in k.parts]
    return k_
-def Coregionalise(Nout,R=1, W=None, kappa=None):
+def coregionalise(Nout,R=1, W=None, kappa=None):
-    p = coregionalise_part(Nout,R,W,kappa)
+    p = parts.coregionalise.Coregionalise(Nout,R,W,kappa)
    return kern(1,[p])
@ -291,10 +267,10 @@ def rational_quadratic(input_dim, variance=1., lengthscale=1., power=1.):
    :rtype: kern object
    """
-    part = rational_quadraticpart(input_dim, variance, lengthscale, power)
+    part = parts.rational_quadratic.RationalQuadratic(input_dim, variance, lengthscale, power)
    return kern(input_dim, [part])
-def Fixed(input_dim, K, variance=1.):
+def fixed(input_dim, K, variance=1.):
    """
     Construct a Fixed effect kernel.
@ -304,23 +280,21 @@ def Fixed(input_dim, K, variance=1.):
     K (np.array), obligatory
     variance (float)
    """
-    part = fixedpart(input_dim, K, variance)
+    part = parts.fixed.Fixed(input_dim, K, variance)
    return kern(input_dim, [part])
 def rbfcos(input_dim, variance=1., frequencies=None, bandwidths=None, ARD=False):
    """
    construct a rbfcos kernel
    """
-    part = rbfcospart(input_dim, variance, frequencies, bandwidths, ARD)
+    part = parts.rbfcos.RBFCos(input_dim, variance, frequencies, bandwidths, ARD)
    return kern(input_dim, [part])
-def IndependentOutputs(k):
+def independent_outputs(k):
    """
    Construct a kernel with independent outputs from an existing kernel
    """
    for sl in k.input_slices:
        assert (sl.start is None) and (sl.stop is None), "cannot adjust input slices! (TODO)"
-    parts = [independent_output_part(p) for p in k.parts]
+    parts = [independent_outputs.IndependentOutputs(p) for p in k.parts]
    return kern(k.input_dim+1,parts)
--- a/GPy/kern/kern.py
+++ b/GPy/kern/kern.py
@ -1,13 +1,12 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 import numpy as np
 import pylab as pb
 from ..core.parameterised import Parameterised
-from kernpart import Kernpart
+from parts.kernpart import Kernpart
 import itertools
-from prod import prod
+from parts.prod import Prod as prod
 class kern(Parameterised):
    def __init__(self, input_dim, parts=[], input_slices=None):
--- a/GPy/kern/parts/Brownian.py
+++ b/GPy/kern/parts/Brownian.py
@ -21,7 +21,7 @@ class Brownian(Kernpart):
    def __init__(self,input_dim,variance=1.):
        self.input_dim = input_dim
        assert self.input_dim==1, "Brownian motion in 1D only"
-        self.num_params = 1.
+        self.num_params = 1
        self.name = 'Brownian'
        self._set_params(np.array([variance]).flatten())
--- a/GPy/kern/parts/Matern32.py
+++ b/GPy/kern/parts/Matern32.py
--- a/GPy/kern/parts/Matern52.py
+++ b/GPy/kern/parts/Matern52.py
--- a/GPy/kern/parts/init.py
+++ b/GPy/kern/parts/init.py
@ -0,0 +1,21 @@
 import bias
 import Brownian
 import coregionalise
 import exponential
 import finite_dimensional
 import fixed
 import independent_outputs
 import linear
 import Matern32
 import Matern52
 import periodic_exponential
 import periodic_Matern32
 import periodic_Matern52
 import prod_orthogonal
 import prod
 import rational_quadratic
 import rbfcos
 import rbf
 import spline
 import symmetric
 import white
--- a/GPy/kern/parts/bias.py
+++ b/GPy/kern/parts/bias.py
@ -6,7 +6,7 @@ from kernpart import Kernpart
 import numpy as np
 import hashlib
-class bias(Kernpart):
+class Bias(Kernpart):
    def __init__(self,input_dim,variance=1.):
        """
        :param input_dim: the number of input dimensions
--- a/GPy/kern/parts/coregionalise.py
+++ b/GPy/kern/parts/coregionalise.py
--- a/GPy/kern/parts/exponential.py
+++ b/GPy/kern/parts/exponential.py
@ -6,7 +6,7 @@ from kernpart import Kernpart
 import numpy as np
 from scipy import integrate
-class exponential(Kernpart):
+class Exponential(Kernpart):
    """
    Exponential kernel (aka Ornstein-Uhlenbeck or Matern 1/2)
--- a/GPy/kern/parts/finite_dimensional.py
+++ b/GPy/kern/parts/finite_dimensional.py
@ -4,9 +4,9 @@
 from kernpart import Kernpart
 import numpy as np
-from ..util.linalg import pdinv,mdot
+from ...util.linalg import pdinv,mdot
-class finite_dimensional(Kernpart):
+class FiniteDimensional(Kernpart):
    def __init__(self, input_dim, F, G, variance=1., weights=None):
        """
        Argumnents
--- a/GPy/kern/parts/fixed.py
+++ b/GPy/kern/parts/fixed.py
@ -15,7 +15,7 @@ class Fixed(Kernpart):
        self.input_dim = input_dim
        self.fixed_K = K
        self.num_params = 1
-        self.name = 'Fixed'
+        self.name = 'fixed'
        self._set_params(np.array([variance]).flatten())
    def _get_params(self):
--- a/GPy/kern/parts/independent_outputs.py
+++ b/GPy/kern/parts/independent_outputs.py
--- a/GPy/kern/parts/kernpart.py
+++ b/GPy/kern/parts/kernpart.py
--- a/GPy/kern/parts/linear.py
+++ b/GPy/kern/parts/linear.py
@ -4,10 +4,10 @@
 from kernpart import Kernpart
 import numpy as np
-from ..util.linalg import tdot
+from ...util.linalg import tdot
 from scipy import weave
-class linear(Kernpart):
+class Linear(Kernpart):
    """
    Linear kernel
--- a/GPy/kern/parts/periodic_Matern32.py
+++ b/GPy/kern/parts/periodic_Matern32.py
@ -7,7 +7,7 @@ import numpy as np
 from GPy.util.linalg import mdot
 from GPy.util.decorators import silence_errors
-class periodic_Matern32(Kernpart):
+class PeriodicMatern32(Kernpart):
    """
    Kernel of the periodic subspace (up to a given frequency) of a Matern 3/2 RKHS. Only defined for input_dim=1.
--- a/GPy/kern/parts/periodic_Matern52.py
+++ b/GPy/kern/parts/periodic_Matern52.py
@ -7,7 +7,7 @@ import numpy as np
 from GPy.util.linalg import mdot
 from GPy.util.decorators import silence_errors
-class periodic_Matern52(Kernpart):
+class PeriodicMatern52(Kernpart):
    """
    Kernel of the periodic subspace (up to a given frequency) of a Matern 5/2 RKHS. Only defined for input_dim=1.
--- a/GPy/kern/parts/periodic_exponential.py
+++ b/GPy/kern/parts/periodic_exponential.py
@ -7,7 +7,7 @@ import numpy as np
 from GPy.util.linalg import mdot
 from GPy.util.decorators import silence_errors
-class periodic_exponential(Kernpart):
+class PeriodicExponential(Kernpart):
    """
    Kernel of the periodic subspace (up to a given frequency) of a exponential (Matern 1/2) RKHS. Only defined for input_dim=1.
--- a/GPy/kern/parts/prod.py
+++ b/GPy/kern/parts/prod.py
@ -5,7 +5,7 @@ from kernpart import Kernpart
 import numpy as np
 import hashlib
-class prod(Kernpart):
+class Prod(Kernpart):
    """
    Computes the product of 2 kernels
--- a/GPy/kern/parts/prod_orthogonal.py
+++ b/GPy/kern/parts/prod_orthogonal.py
--- a/GPy/kern/parts/rational_quadratic.py
+++ b/GPy/kern/parts/rational_quadratic.py
@ -5,7 +5,7 @@
 from kernpart import Kernpart
 import numpy as np
-class rational_quadratic(Kernpart):
+class RationalQuadratic(Kernpart):
    """
    rational quadratic kernel
--- a/GPy/kern/parts/rbf.py
+++ b/GPy/kern/parts/rbf.py
@ -6,9 +6,9 @@ from kernpart import Kernpart
 import numpy as np
 import hashlib
 from scipy import weave
-from ..util.linalg import tdot
+from ...util.linalg import tdot
-class rbf(Kernpart):
+class RBF(Kernpart):
    """
    Radial Basis Function kernel, aka squared-exponential, exponentiated quadratic or Gaussian kernel:
@ -241,9 +241,9 @@ class rbf(Kernpart):
        # here are the "statistics" for psi1 and psi2
        if not np.array_equal(Z, self._Z):
            #Z has changed, compute Z specific stuff
-            self._psi2_Zhat = 0.5*(Z[:,None,:] +Z[None,:,:]) # num_inducing,num_inducing,input_dim
+            self._psi2_Zhat = 0.5*(Z[:,None,:] +Z[None,:,:]) # M,M,Q
-            self._psi2_Zdist = 0.5*(Z[:,None,:]-Z[None,:,:]) # num_inducing,num_inducing,input_dim
+            self._psi2_Zdist = 0.5*(Z[:,None,:]-Z[None,:,:]) # M,M,Q
-            self._psi2_Zdist_sq = np.square(self._psi2_Zdist/self.lengthscale) # num_inducing,num_inducing,input_dim
+            self._psi2_Zdist_sq = np.square(self._psi2_Zdist/self.lengthscale) # M,M,Q
            self._Z = Z
        if not (np.array_equal(Z, self._Z) and np.array_equal(mu, self._mu) and np.array_equal(S, self._S)):
@ -257,12 +257,12 @@ class rbf(Kernpart):
            self._psi1 = self.variance*np.exp(self._psi1_exponent)
            #psi2
-            self._psi2_denom = 2.*S[:,None,None,:]/self.lengthscale2+1. # N,num_inducing,num_inducing,input_dim
+            self._psi2_denom = 2.*S[:,None,None,:]/self.lengthscale2+1. # N,M,M,Q
            self._psi2_mudist, self._psi2_mudist_sq, self._psi2_exponent, _ = self.weave_psi2(mu,self._psi2_Zhat)
-            #self._psi2_mudist = mu[:,None,None,:]-self._psi2_Zhat #N,num_inducing,num_inducing,input_dim
+            #self._psi2_mudist = mu[:,None,None,:]-self._psi2_Zhat #N,M,M,Q
            #self._psi2_mudist_sq = np.square(self._psi2_mudist)/(self.lengthscale2*self._psi2_denom)
-            #self._psi2_exponent = np.sum(-self._psi2_Zdist_sq -self._psi2_mudist_sq -0.5*np.log(self._psi2_denom),-1) #N,num_inducing,num_inducing
+            #self._psi2_exponent = np.sum(-self._psi2_Zdist_sq -self._psi2_mudist_sq -0.5*np.log(self._psi2_denom),-1) #N,M,M,Q
-            self._psi2 = np.square(self.variance)*np.exp(self._psi2_exponent) # N,num_inducing,num_inducing
+            self._psi2 = np.square(self.variance)*np.exp(self._psi2_exponent) # N,M,M,Q
            #store matrices for caching
            self._Z, self._mu, self._S = Z, mu,S
--- a/GPy/kern/parts/rbfcos.py
+++ b/GPy/kern/parts/rbfcos.py
@ -6,7 +6,7 @@
 from kernpart import Kernpart
 import numpy as np
-class rbfcos(Kernpart):
+class RBFCos(Kernpart):
    def __init__(self,input_dim,variance=1.,frequencies=None,bandwidths=None,ARD=False):
        self.input_dim = input_dim
        self.name = 'rbfcos'
--- a/GPy/kern/parts/spline.py
+++ b/GPy/kern/parts/spline.py
@ -9,7 +9,7 @@ def theta(x):
    """Heaviside step function"""
    return np.where(x>=0.,1.,0.)
-class spline(Kernpart):
+class Spline(Kernpart):
    """
    Spline kernel
--- a/GPy/kern/parts/symmetric.py
+++ b/GPy/kern/parts/symmetric.py
@ -4,7 +4,7 @@
 from kernpart import Kernpart
 import numpy as np
-class symmetric(Kernpart):
+class Symmetric(Kernpart):
    """
    Symmetrical kernels
--- a/GPy/kern/parts/sympy_helpers.cpp
+++ b/GPy/kern/parts/sympy_helpers.cpp
--- a/GPy/kern/parts/sympy_helpers.h
+++ b/GPy/kern/parts/sympy_helpers.h
--- a/GPy/kern/parts/sympykern.py
+++ b/GPy/kern/parts/sympykern.py
--- a/GPy/kern/parts/white.py
+++ b/GPy/kern/parts/white.py
@ -1,10 +1,10 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 from kernpart import Kernpart
 import numpy as np
-class white(Kernpart):
+
 class White(Kernpart):
    """
    White noise kernel.
--- a/GPy/likelihoods/ep.py
+++ b/GPy/likelihoods/ep.py
@ -34,6 +34,8 @@ class EP(likelihood):
        self.Z = 0
        self.YYT = None
        self.V = self.precision * self.Y
        self.VVT_factor = self.V
        self.trYYT = 0.
    def restart(self):
        self.tau_tilde = np.zeros(self.N)
@ -44,6 +46,8 @@ class EP(likelihood):
        self.Z = 0
        self.YYT = None
        self.V = self.precision * self.Y
        self.VVT_factor = self.V
        self.trYYT = 0.
    def predictive_values(self,mu,var,full_cov):
        if full_cov:
@ -71,6 +75,8 @@ class EP(likelihood):
        self.covariance_matrix = np.diag(1./self.tau_tilde)
        self.precision = self.tau_tilde[:,None]
        self.V = self.precision * self.Y
        self.VVT_factor = self.V
        self.trYYT = np.trace(self.YYT)
    def fit_full(self,K):
        """
--- a/GPy/likelihoods/gaussian.py
+++ b/GPy/likelihoods/gaussian.py
@ -1,5 +1,7 @@
 import numpy as np
 from likelihood import likelihood
 from ..util.linalg import jitchol
 class Gaussian(likelihood):
    """
@ -40,9 +42,11 @@ class Gaussian(likelihood):
        if D > self.N:
            self.YYT = np.dot(self.Y, self.Y.T)
            self.trYYT = np.trace(self.YYT)
            self.YYT_factor = jitchol(self.YYT)
        else:
            self.YYT = None
            self.trYYT = np.sum(np.square(self.Y))
            self.YYT_factor = self.Y
    def _get_params(self):
        return np.asarray(self._variance)
@ -53,12 +57,13 @@ class Gaussian(likelihood):
    def _set_params(self, x):
        x = np.float64(x)
        if np.all(self._variance != x):
-            if x == 0.:
+            if x == 0.:#special case of zero noise
                self.precision = np.inf
                self.V = None
            else:
                self.precision = 1. / x
                self.V = (self.precision) * self.Y
                self.VVT_factor = self.precision * self.YYT_factor
            self.covariance_matrix = np.eye(self.N) * x
            self._variance = x
--- a/GPy/models/init.py
+++ b/GPy/models/init.py
@ -4,6 +4,7 @@
 from gp_regression import GPRegression
 from gp_classification import GPClassification
 from sparse_gp_regression import SparseGPRegression
 from svigp_regression import SVIGPRegression
 from sparse_gp_classification import SparseGPClassification
 from fitc_classification import FITCClassification
 from gplvm import GPLVM
--- a/GPy/models/bayesian_gplvm.py
+++ b/GPy/models/bayesian_gplvm.py
@ -46,7 +46,7 @@ class BayesianGPLVM(SparseGP, GPLVM):
            kernel = kern.rbf(input_dim) + kern.white(input_dim)
        SparseGP.__init__(self, X, likelihood, kernel, Z=Z, X_variance=X_variance, **kwargs)
-        self._set_params(self._get_params())
+        self.ensure_default_constraints()
    def __getstate__(self):
        """
--- a/GPy/models/fitc_classification.py
+++ b/GPy/models/fitc_classification.py
@ -44,4 +44,4 @@ class FITCClassification(FITC):
            assert Z.shape[1]==X.shape[1]
        FITC.__init__(self, X, likelihood, kernel, Z=Z, normalize_X=normalize_X)
-        self._set_params(self._get_params())
+        self.ensure_default_constraints()
--- a/GPy/models/gp_classification.py
+++ b/GPy/models/gp_classification.py
@ -38,4 +38,4 @@ class GPClassification(GP):
                raise Warning, 'likelihood.data and Y are different.'
        GP.__init__(self, X, likelihood, kernel, normalize_X=normalize_X)
-        self._set_params(self._get_params())
+        self.ensure_default_constraints()
--- a/GPy/models/gp_regression.py
+++ b/GPy/models/gp_regression.py
@ -32,4 +32,4 @@ class GPRegression(GP):
        likelihood = likelihoods.Gaussian(Y,normalize=normalize_Y)
        GP.__init__(self, X, likelihood, kernel, normalize_X=normalize_X)
-        self._set_params(self._get_params())
+        self.ensure_default_constraints()
--- a/GPy/models/gplvm.py
+++ b/GPy/models/gplvm.py
@ -33,7 +33,7 @@ class GPLVM(GP):
            kernel = kern.rbf(input_dim, ARD=input_dim > 1) + kern.bias(input_dim, np.exp(-2)) + kern.white(input_dim, np.exp(-2))
        likelihood = Gaussian(Y, normalize=normalize_Y)
        GP.__init__(self, X, likelihood, kernel, normalize_X=False)
-        self._set_params(self._get_params())
+        self.ensure_default_constraints()
    def initialise_latent(self, init, input_dim, Y):
        if init == 'PCA':
--- a/GPy/models/mrd.py
+++ b/GPy/models/mrd.py
@ -82,7 +82,7 @@ class MRD(Model):
        self.MQ = self.num_inducing * self.input_dim
        Model.__init__(self)
-        self._set_params(self._get_params())
+        self.ensure_default_constraints()
    def __getstate__(self):
        return [self.names,
--- a/GPy/models/sparse_gp_classification.py
+++ b/GPy/models/sparse_gp_classification.py
@ -44,4 +44,4 @@ class SparseGPClassification(SparseGP):
            assert Z.shape[1]==X.shape[1]
        SparseGP.__init__(self, X, likelihood, kernel, Z=Z, normalize_X=normalize_X)
-        self._set_params(self._get_params())
+        self.ensure_default_constraints()
--- a/GPy/models/sparse_gp_regression.py
+++ b/GPy/models/sparse_gp_regression.py
@ -42,4 +42,4 @@ class SparseGPRegression(SparseGP):
        likelihood = likelihoods.Gaussian(Y, normalize=normalize_Y)
        SparseGP.__init__(self, X, likelihood, kernel, Z=Z, normalize_X=normalize_X, X_variance=X_variance)
-        self._set_params(self._get_params())
+        self.ensure_default_constraints()
--- a/GPy/models/sparse_gplvm.py
+++ b/GPy/models/sparse_gplvm.py
@ -26,6 +26,7 @@ class SparseGPLVM(SparseGPRegression, GPLVM):
    def __init__(self, Y, input_dim, kernel=None, init='PCA', num_inducing=10):
        X = self.initialise_latent(init, input_dim, Y)
        SparseGPRegression.__init__(self, X, Y, kernel=kernel, num_inducing=num_inducing)
        self.ensure_default_constraints()
    def _get_param_names(self):
        return (sum([['X_%i_%i' % (n, q) for q in range(self.input_dim)] for n in range(self.num_data)], [])
--- a/GPy/models/svigp_regression.py
+++ b/GPy/models/svigp_regression.py
@ -0,0 +1,44 @@
 # Copyright (c) 2012, James Hensman
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 import numpy as np
 from ..core import SVIGP
 from .. import likelihoods
 from .. import kern
 class SVIGPRegression(SVIGP):
    """
    Gaussian Process model for regression
    This is a thin wrapper around the SVIGP class, with a set of sensible defalts
    :param X: input observations
    :param Y: observed values
    :param kernel: a GPy kernel, defaults to rbf+white
    :param normalize_X:  whether to normalize the input data before computing (predictions will be in original scales)
    :type normalize_X: False|True
    :param normalize_Y:  whether to normalize the input data before computing (predictions will be in original scales)
    :type normalize_Y: False|True
    :rtype: model object
    .. Note:: Multiple independent outputs are allowed using columns of Y
    """
    def __init__(self, X, Y, kernel=None, Z=None, num_inducing=10, q_u=None, batchsize=10):
        # kern defaults to rbf (plus white for stability)
        if kernel is None:
            kernel = kern.rbf(X.shape[1], variance=1., lengthscale=4.) + kern.white(X.shape[1], 1e-3)
        # Z defaults to a subset of the data
        if Z is None:
            i = np.random.permutation(X.shape[0])[:num_inducing]
            Z = X[i].copy()
        else:
            assert Z.shape[1] == X.shape[1]
        # likelihood defaults to Gaussian
        likelihood = likelihoods.Gaussian(Y, normalize=False)
        SVIGP.__init__(self, X, likelihood, kernel, Z, q_u=q_u, batchsize=batchsize)
        self.load_batch()
--- a/GPy/testing/bgplvm_tests.py
+++ b/GPy/testing/bgplvm_tests.py
@ -16,7 +16,6 @@ class BGPLVMTests(unittest.TestCase):
        Y -= Y.mean(axis=0)
        k = GPy.kern.bias(input_dim) + GPy.kern.white(input_dim, 0.00001)
        m = BayesianGPLVM(Y, input_dim, kernel=k, num_inducing=num_inducing)
        m.ensure_default_constraints()
        m.randomize()
        self.assertTrue(m.checkgrad())
@ -29,7 +28,6 @@ class BGPLVMTests(unittest.TestCase):
        Y -= Y.mean(axis=0)
        k = GPy.kern.linear(input_dim) + GPy.kern.white(input_dim, 0.00001)
        m = BayesianGPLVM(Y, input_dim, kernel=k, num_inducing=num_inducing)
        m.ensure_default_constraints()
        m.randomize()
        self.assertTrue(m.checkgrad())
@ -42,7 +40,6 @@ class BGPLVMTests(unittest.TestCase):
        Y -= Y.mean(axis=0)
        k = GPy.kern.rbf(input_dim) + GPy.kern.white(input_dim, 0.00001)
        m = BayesianGPLVM(Y, input_dim, kernel=k, num_inducing=num_inducing)
        m.ensure_default_constraints()
        m.randomize()
        self.assertTrue(m.checkgrad())
@ -55,7 +52,6 @@ class BGPLVMTests(unittest.TestCase):
        Y -= Y.mean(axis=0)
        k = GPy.kern.rbf(input_dim) + GPy.kern.bias(input_dim) + GPy.kern.white(input_dim, 0.00001)
        m = BayesianGPLVM(Y, input_dim, kernel=k, num_inducing=num_inducing)
        m.ensure_default_constraints()
        m.randomize()
        self.assertTrue(m.checkgrad())
@ -69,7 +65,6 @@ class BGPLVMTests(unittest.TestCase):
        Y -= Y.mean(axis=0)
        k = GPy.kern.linear(input_dim) + GPy.kern.bias(input_dim) + GPy.kern.white(input_dim, 0.00001)
        m = BayesianGPLVM(Y, input_dim, kernel=k, num_inducing=num_inducing)
        m.ensure_default_constraints()
        m.randomize()
        self.assertTrue(m.checkgrad())
--- a/GPy/testing/gplvm_tests.py
+++ b/GPy/testing/gplvm_tests.py
@ -14,7 +14,6 @@ class GPLVMTests(unittest.TestCase):
        Y = np.random.multivariate_normal(np.zeros(N),K,input_dim).T
        k = GPy.kern.bias(input_dim) + GPy.kern.white(input_dim, 0.00001)
        m = GPy.models.GPLVM(Y, input_dim, kernel = k)
        m.ensure_default_constraints()
        m.randomize()
        self.assertTrue(m.checkgrad())
@ -26,7 +25,6 @@ class GPLVMTests(unittest.TestCase):
        Y = np.random.multivariate_normal(np.zeros(N),K,input_dim).T
        k = GPy.kern.linear(input_dim) + GPy.kern.white(input_dim, 0.00001)
        m = GPy.models.GPLVM(Y, input_dim, kernel = k)
        m.ensure_default_constraints()
        m.randomize()
        self.assertTrue(m.checkgrad())
@ -38,7 +36,6 @@ class GPLVMTests(unittest.TestCase):
        Y = np.random.multivariate_normal(np.zeros(N),K,input_dim).T
        k = GPy.kern.rbf(input_dim) + GPy.kern.white(input_dim, 0.00001)
        m = GPy.models.GPLVM(Y, input_dim, kernel = k)
        m.ensure_default_constraints()
        m.randomize()
        self.assertTrue(m.checkgrad())
--- a/GPy/testing/kernel_tests.py
+++ b/GPy/testing/kernel_tests.py
@ -21,7 +21,7 @@ class KernelTests(unittest.TestCase):
        """
        X = np.random.rand(30, 4)
        K = np.dot(X, X.T)
-        kernel = GPy.kern.Fixed(4, K)
+        kernel = GPy.kern.fixed(4, K)
        Y = np.ones((30,1))
        m = GPy.models.GPRegression(X,Y,kernel=kernel)
        self.assertTrue(m.checkgrad())
@ -36,7 +36,7 @@ class KernelTests(unittest.TestCase):
        Y = np.vstack((Y1,Y2))
        k1 = GPy.kern.rbf(1) + GPy.kern.bias(1)
-        k2 = GPy.kern.Coregionalise(2,1)
+        k2 = GPy.kern.coregionalise(2,1)
        k = k1.prod(k2,tensor=True)
        m = GPy.models.GPRegression(X,Y,kernel=k)
        self.assertTrue(m.checkgrad())
--- a/GPy/testing/mrd_tests.py
+++ b/GPy/testing/mrd_tests.py
@ -24,7 +24,6 @@ class MRDTests(unittest.TestCase):
        likelihood_list = [GPy.likelihoods.Gaussian(Y) for Y in Ylist]
        m = GPy.models.MRD(likelihood_list, input_dim=input_dim, kernels=k, num_inducing=num_inducing)
        m.ensure_default_constraints()
        self.assertTrue(m.checkgrad())
--- a/GPy/testing/prior_tests.py
+++ b/GPy/testing/prior_tests.py
@ -14,7 +14,6 @@ class PriorTests(unittest.TestCase):
        y += 0.05*np.random.randn(len(X))
        X, y = X[:, None], y[:, None]
        m = GPy.models.GPRegression(X, y)
        m.ensure_default_constraints()
        lognormal = GPy.priors.LogGaussian(1, 2)
        m.set_prior('rbf', lognormal)
        m.randomize()
@ -28,7 +27,6 @@ class PriorTests(unittest.TestCase):
        y += 0.05*np.random.randn(len(X))
        X, y = X[:, None], y[:, None]
        m = GPy.models.GPRegression(X, y)
        m.ensure_default_constraints()
        Gamma = GPy.priors.Gamma(1, 1)
        m.set_prior('rbf', Gamma)
        m.randomize()
@ -42,7 +40,6 @@ class PriorTests(unittest.TestCase):
        y += 0.05*np.random.randn(len(X))
        X, y = X[:, None], y[:, None]
        m = GPy.models.GPRegression(X, y)
        m.ensure_default_constraints()
        gaussian = GPy.priors.Gaussian(1, 1)
        success = False
--- a/GPy/testing/psi_stat_gradient_tests.py
+++ b/GPy/testing/psi_stat_gradient_tests.py
@ -113,7 +113,6 @@ if __name__ == "__main__":
 #         Y -= Y.mean(axis=0)
 #         k = GPy.kern.linear(input_dim) + GPy.kern.bias(input_dim) + GPy.kern.white(input_dim, 0.00001)
 #         m = GPy.models.Bayesian_GPLVM(Y, input_dim, kernel=k, num_inducing=num_inducing)
 #         m.ensure_default_constraints()
 #         m.randomize()
 # #         self.assertTrue(m.checkgrad())
        numpy.random.seed(0)
@ -146,7 +145,6 @@ if __name__ == "__main__":
 #                          num_inducing=num_inducing, kernel=GPy.kern.rbf(input_dim))
        m3 = PsiStatModel('psi2', X=X, X_variance=X_var, Z=Z,
                         num_inducing=num_inducing, kernel=GPy.kern.linear(input_dim, ARD=True, variances=numpy.random.rand(input_dim)))
        m3.ensure_default_constraints()
        # + GPy.kern.bias(input_dim))
 #         m4 = PsiStatModel('psi2', X=X, X_variance=X_var, Z=Z,
 #                          num_inducing=num_inducing, kernel=GPy.kern.rbf(input_dim) + GPy.kern.bias(input_dim))
--- a/GPy/testing/sparse_gplvm_tests.py
+++ b/GPy/testing/sparse_gplvm_tests.py
@ -15,7 +15,6 @@ class sparse_GPLVMTests(unittest.TestCase):
        Y = np.random.multivariate_normal(np.zeros(N),K,input_dim).T
        k = GPy.kern.bias(input_dim) + GPy.kern.white(input_dim, 0.00001)
        m = SparseGPLVM(Y, input_dim, kernel=k, num_inducing=num_inducing)
        m.ensure_default_constraints()
        m.randomize()
        self.assertTrue(m.checkgrad())
@ -27,7 +26,6 @@ class sparse_GPLVMTests(unittest.TestCase):
        Y = np.random.multivariate_normal(np.zeros(N),K,input_dim).T
        k = GPy.kern.linear(input_dim) + GPy.kern.white(input_dim, 0.00001)
        m = SparseGPLVM(Y, input_dim, kernel=k, num_inducing=num_inducing)
        m.ensure_default_constraints()
        m.randomize()
        self.assertTrue(m.checkgrad())
@ -39,7 +37,6 @@ class sparse_GPLVMTests(unittest.TestCase):
        Y = np.random.multivariate_normal(np.zeros(N),K,input_dim).T
        k = GPy.kern.rbf(input_dim) + GPy.kern.white(input_dim, 0.00001)
        m = SparseGPLVM(Y, input_dim, kernel=k, num_inducing=num_inducing)
        m.ensure_default_constraints()
        m.randomize()
        self.assertTrue(m.checkgrad())
--- a/GPy/testing/unit_tests.py
+++ b/GPy/testing/unit_tests.py
@ -37,7 +37,6 @@ class GradientTests(unittest.TestCase):
        noise = GPy.kern.white(dimension)
        kern = kern + noise
        m = model_fit(X, Y, kernel=kern)
        m.ensure_default_constraints()
        m.randomize()
        # contrain all parameters to be positive
        self.assertTrue(m.checkgrad())
@ -150,7 +149,6 @@ class GradientTests(unittest.TestCase):
        K = k.K(X)
        Y = np.random.multivariate_normal(np.zeros(N), K, input_dim).T
        m = GPy.models.GPLVM(Y, input_dim, kernel=k)
        m.ensure_default_constraints()
        self.assertTrue(m.checkgrad())
    def test_GPLVM_rbf_linear_white_kern_2D(self):
@ -161,7 +159,6 @@ class GradientTests(unittest.TestCase):
        K = k.K(X)
        Y = np.random.multivariate_normal(np.zeros(N), K, input_dim).T
        m = GPy.models.GPLVM(Y, input_dim, init='PCA', kernel=k)
        m.ensure_default_constraints()
        self.assertTrue(m.checkgrad())
    def test_GP_EP_probit(self):
@ -195,7 +192,6 @@ class GradientTests(unittest.TestCase):
        k = GPy.kern.rbf(1) + GPy.kern.white(1)
        Y = np.hstack([np.ones(N/2),np.zeros(N/2)])[:,None]
        m = GPy.models.FITCClassification(X, Y=Y)
        m.ensure_default_constraints()
        m.update_likelihood_approximation()
        self.assertTrue(m.checkgrad())
--- a/GPy/util/classification.py
+++ b/GPy/util/classification.py
@ -2,29 +2,30 @@ import numpy as np
 def conf_matrix(p,labels,names=['1','0'],threshold=.5,show=True):
    """
-    Returns true and false positives in a binary classification problem
+    Returns error rate and true/false positives in a binary classification problem
-    - Column names: true class of the examples
+    - Actual classes are displayed by column.
-    - Row names: classification assigned by the model
+    - Predicted classes are displayed by row.
-    p: probabilities estimated for observation of belonging to class '1'
+    :param p: array of class '1' probabilities.
-    labels: observations' class
+    :param labels: array of actual classes.
-    names: classes' names
+    :param names: list of class names, defaults to ['1','0'].
-    threshold: probability value at which the model allocate an element to each class
+    :param threshold: probability value used to decide the class.
-    show: whether the matrix should be shown or not
+    :param show: whether the matrix should be shown or not
    :type show: False|True
    """
-    p = p.flatten()
+    assert p.size == labels.size, "Arrays p and labels have different dimensions."
-    labels = labels.flatten()
+    decision = np.ones((labels.size,1))
-    N = p.size
+    decision[p<threshold] = 0
-    C = np.ones(N)
+    diff = decision - labels
-    C[p<threshold] = 0
+    false_0 = diff[diff == -1].size
-    True_1 = float((labels - C)[labels-C==0].shape[0] )
+    false_1 = diff[diff == 1].size
-    False_1 = float((labels - C)[labels-C==-2].shape[0] )
+    true_1 = np.sum(decision[diff ==0])
-    True_0 = float((labels - C)[labels-C==-1].shape[0] )
+    true_0 = labels.size - true_1 - false_0 - false_1
-    False_0 = float((labels - C)[labels-C==1].shape[0] )
+    error = (false_1 + false_0)/np.float(labels.size)
    if show:
-        print (True_1 + True_0 + 0.)/N * 100,'% instances correctly classified'
+        print 100. - error * 100,'% instances correctly classified'
        print '%-10s|  %-10s|  %-10s| ' % ('',names[0],names[1])
        print '----------|------------|------------|'
-        print '%-10s|  %-10s|  %-10s| ' % (names[0],True_1,False_0)
+        print '%-10s|  %-10s|  %-10s| ' % (names[0],true_1,false_0)
-        print '%-10s|  %-10s|  %-10s| ' % (names[1],False_1,True_0)
+        print '%-10s|  %-10s|  %-10s| ' % (names[1],false_1,true_0)
-    return True_1, False_1, True_0, False_0
+    return error,true_1, false_1, true_0, false_0
--- a/GPy/util/plot.py
+++ b/GPy/util/plot.py
@ -70,6 +70,36 @@ def align_subplots(N,M,xlim=None, ylim=None):
        else:
            removeUpperTicks()
 def align_subplot_array(axes,xlim=None, ylim=None):
    """make all of the axes in the array hae the same limits, turn off unnecessary ticks
    use pb.subplots() to get an array of axes
    """
    #find sensible xlim,ylim
    if xlim is None:
        xlim = [np.inf,-np.inf]
        for ax in axes.flatten():
            xlim[0] = min(xlim[0],ax.get_xlim()[0])
            xlim[1] = max(xlim[1],ax.get_xlim()[1])
    if ylim is None:
        ylim = [np.inf,-np.inf]
        for ax in axes.flatten():
            ylim[0] = min(ylim[0],ax.get_ylim()[0])
            ylim[1] = max(ylim[1],ax.get_ylim()[1])
    N,M = axes.shape
    for i,ax in enumerate(axes.flatten()):
        ax.set_xlim(xlim)
        ax.set_ylim(ylim)
        if (i)%M:
            ax.set_yticks([])
        else:
            removeRightTicks(ax)
        if i<(M*(N-1)):
            ax.set_xticks([])
        else:
            removeUpperTicks(ax)
 def x_frame1D(X,plot_limits=None,resolution=None):
    """
    Internal helper function for making plots, returns a set of input values to plot as well as lower and upper limits
--- a/GPy/util/visualize.py
+++ b/GPy/util/visualize.py
@ -203,6 +203,7 @@ class lvm_dimselect(lvm):
            self.sense_axes = sense_axes
        self.labels = labels
        lvm.__init__(self,vals,Model,data_visualize,latent_axes,sense_axes,latent_index)
        self.show_sensitivities()
        print "use left and right mouse butons to select dimensions"
@ -506,5 +507,5 @@ def data_play(Y, visualizer, frame_rate=30):
    for y in Y:
-        visualizer.modify(y)
+        visualizer.modify(y[None, :])
        time.sleep(1./float(frame_rate))
--- a/MANIFEST.in
+++ b/MANIFEST.in
@ -1,4 +1,4 @@
 include *.txt
-recursive-include docs *.txt
+recursive-include doc *.txt
 include *.md
-recursive-include docs *.md
+recursive-include doc *.md
--- a/setup.py
+++ b/setup.py
@ -5,7 +5,7 @@ import os
 from setuptools import setup
 # Version number
-version = '0.4.5'
+version = '0.4.6'
 def read(fname):
    return open(os.path.join(os.path.dirname(__file__), fname)).read()