commit 2abba8cf14 — merged with master
Author: Nicolo Fusi, 2013-02-15 11:56:37 +00:00
59 changed files with 2845 additions and 1608 deletions

View file

@@ -1,11 +1,20 @@
 language: python
 python:
   - "2.7"
+# Set virtualenv with system-site-packages to true
+virtualenv:
+  system_site_packages: true
 # command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors
-install:
-  - sudo apt-get install python-scipy
-  - pip install sphinx
+before_install:
+  - sudo apt-get install -qq python-scipy python-pip
+  - sudo apt-get install -qq python-matplotlib
+install:
+  - pip install sphinx
+  - pip install nose
   - pip install . --use-mirrors
 # command to run tests, e.g. python setup.py test
 script:
-  - nosetests --with-xcoverage --with-xunit --cover-package=GPy --cover-erase GPy/testing
+  - nosetests GPy/testing

View file

@@ -7,5 +7,5 @@ import models
 import inference
 import util
 import examples
-#import examples TODO: discuss!
 from core import priors
+import likelihoods

View file

@@ -10,6 +10,7 @@ from parameterised import parameterised, truncate_pad
 import priors
 from ..util.linalg import jitchol
 from ..inference import optimization
+from .. import likelihoods

 class model(parameterised):
     def __init__(self):
@@ -82,7 +83,7 @@ class model(parameterised):
     def get(self, name, return_names=False):
         """
        Get a model parameter by name. The name is applied as a regular expression and all parameters that match that regular expression are returned.
         """
         matches = self.grep_param_names(name)
         if len(matches):
@@ -107,7 +108,7 @@ class model(parameterised):
     def get_gradient(self, name, return_names=False):
         """
        Get model gradient(s) by name. The name is applied as a regular expression and all parameters that match that regular expression are returned.
         """
         matches = self.grep_param_names(name)
         if len(matches):
@@ -303,54 +304,62 @@ class model(parameterised):
         return '\n'.join(s)

-    def checkgrad(self, verbose=False, include_priors=False, step=1e-6, tolerance=1e-3, return_ratio=False, *args):
+    def checkgrad(self, verbose=False, include_priors=False, step=1e-6, tolerance=1e-3):
         """
         Check the gradient of the model by comparing to a numerical estimate.
-        If the overall gradient fails, individual components are tested.
+        If the verbose flag is passed, individual components are tested (and printed).
+
+        :param verbose: If True, print a "full" check of each parameter
+        :type verbose: bool
+        :param step: The size of the step around which to linearise the objective
+        :type step: float (default 1e-6)
+        :param tolerance: the tolerance allowed (see note)
+        :type tolerance: float (default 1e-3)
+
+        Note:
+            The gradient is considered correct if the ratio of the analytical
+            and numerical gradients is within <tolerance> of unity.
         """
         x = self._get_params_transformed().copy()

-        # choose a random direction to step in:
-        dx = step*np.sign(np.random.uniform(-1, 1, x.size))
-
-        # evaluate around the point x
-        self._set_params_transformed(x+dx)
-        f1, g1 = self.log_likelihood() + self.log_prior(), self._log_likelihood_gradients_transformed()
-        self._set_params_transformed(x-dx)
-        f2, g2 = self.log_likelihood() + self.log_prior(), self._log_likelihood_gradients_transformed()
-        self._set_params_transformed(x)
-        gradient = self._log_likelihood_gradients_transformed()
-
-        numerical_gradient = (f1-f2)/(2*dx)
-        global_ratio = (f1-f2)/(2*np.dot(dx, gradient))
-        if verbose:
-            print "Gradient ratio = ", global_ratio, '\n'
-            sys.stdout.flush()
-        if (np.abs(1.-global_ratio) < tolerance) and not np.isnan(global_ratio):
-            if verbose:
-                print 'Gradcheck passed'
+        if not verbose:
+            # just check the global ratio
+            dx = step*np.sign(np.random.uniform(-1, 1, x.size))
+
+            # evaluate around the point x
+            self._set_params_transformed(x+dx)
+            f1, g1 = self.log_likelihood() + self.log_prior(), self._log_likelihood_gradients_transformed()
+            self._set_params_transformed(x-dx)
+            f2, g2 = self.log_likelihood() + self.log_prior(), self._log_likelihood_gradients_transformed()
+            self._set_params_transformed(x)
+            gradient = self._log_likelihood_gradients_transformed()
+
+            numerical_gradient = (f1-f2)/(2*dx)
+            global_ratio = (f1-f2)/(2*np.dot(dx, gradient))
+
+            if (np.abs(1.-global_ratio) < tolerance) and not np.isnan(global_ratio):
+                return True
+            else:
+                return False
         else:
-            if verbose:
-                print "Global check failed. Testing individual gradients\n"
+            # check the gradient of each parameter individually, and do some pretty printing
             try:
                 names = self._get_param_names_transformed()
             except NotImplementedError:
                 names = ['Variable %i' % i for i in range(len(x))]

             # Prepare for pretty-printing
             header = ['Name', 'Ratio', 'Difference', 'Analytical', 'Numerical']
             max_names = max([len(names[i]) for i in range(len(names))] + [len(header[0])])
             float_len = 10
             cols = [max_names]
             cols.extend([max(float_len, len(header[i])) for i in range(1, len(header))])
             cols = np.array(cols) + 5
             header_string = ["{h:^{col}}".format(h=header[i], col=cols[i]) for i in range(len(cols))]
             header_string = map(lambda x: '|'.join(x), [header_string])
             separator = '-'*len(header_string[0])
             print '\n'.join([header_string[0], separator])

             for i in range(len(x)):
                 xx = x.copy()
@@ -368,27 +377,52 @@ class model(parameterised):
                 ratio = (f1-f2)/(2*step*gradient)
                 difference = np.abs((f1-f2)/2/step - gradient)

-                if verbose:
-                    if (np.abs(ratio-1) < tolerance):
-                        formatted_name = "\033[92m {0} \033[0m".format(names[i])
-                    else:
-                        formatted_name = "\033[91m {0} \033[0m".format(names[i])
-                    r = '%.6f' % float(ratio)
-                    d = '%.6f' % float(difference)
-                    g = '%.6f' % gradient
-                    ng = '%.6f' % float(numerical_gradient)
-                    grad_string = "{0:^{c0}}|{1:^{c1}}|{2:^{c2}}|{3:^{c3}}|{4:^{c4}}".format(formatted_name, r, d, g, ng, c0=cols[0]+9, c1=cols[1], c2=cols[2], c3=cols[3], c4=cols[4])
-                    print grad_string
-                if verbose:
-                    print ''
-
-            if return_ratio:
-                return global_ratio
-            else:
-                return False
-        if return_ratio:
-            return global_ratio
-        else:
-            return True
+                if (np.abs(ratio-1) < tolerance):
+                    formatted_name = "\033[92m {0} \033[0m".format(names[i])
+                else:
+                    formatted_name = "\033[91m {0} \033[0m".format(names[i])
+                r = '%.6f' % float(ratio)
+                d = '%.6f' % float(difference)
+                g = '%.6f' % gradient
+                ng = '%.6f' % float(numerical_gradient)
+                grad_string = "{0:^{c0}}|{1:^{c1}}|{2:^{c2}}|{3:^{c3}}|{4:^{c4}}".format(formatted_name, r, d, g, ng, c0=cols[0]+9, c1=cols[1], c2=cols[2], c3=cols[3], c4=cols[4])
+                print grad_string
+
+    def EPEM(self, epsilon=.1, **kwargs):
+        """
+        TODO: Should this not be in the GP class?
+        Expectation maximization for Expectation Propagation.
+
+        kwargs are passed to the optimize function. They can be:
+
+        :epsilon: convergence criterion
+        :max_f_eval: maximum number of function evaluations
+        :messages: whether to display during optimisation
+        :param optimizer: which optimizer to use (defaults to self.preferred_optimizer)
+        :type optimizer: string TODO: valid strings?
+        """
+        assert isinstance(self.likelihood, likelihoods.EP), "EM is not available for Gaussian likelihoods"
+        log_change = epsilon + 1.
+        self.log_likelihood_record = []
+        self.gp_params_record = []
+        self.ep_params_record = []
+        iteration = 0
+        last_value = -np.exp(1000)
+        while log_change > epsilon or not iteration:
+            print 'EM iteration %s' % iteration
+            self.update_likelihood_approximation()
+            self.optimize(**kwargs)
+            new_value = self.log_likelihood()
+            log_change = new_value - last_value
+            if log_change > epsilon:
+                self.log_likelihood_record.append(new_value)
+                self.gp_params_record.append(self._get_params())
+                #self.ep_params_record.append((self.beta, self.Y, self.Z_ep))
+                last_value = new_value
+            else:
+                convergence = False
+                #self.beta, self.Y, self.Z_ep = self.ep_params_record[-1]
+                self._set_params(self.gp_params_record[-1])
+                print "Log-likelihood decrement: %s \nLast iteration discarded." % log_change
+            iteration += 1
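
For reference, the ratio test that checkgrad implements can be exercised in isolation. A minimal standalone sketch (not GPy code; ratio_check, f and g are names made up for this example):

    import numpy as np

    def ratio_check(f, grad_f, x, step=1e-6, tolerance=1e-3):
        # step in a random direction, as checkgrad does
        dx = step*np.sign(np.random.uniform(-1, 1, x.size))
        numerical = (f(x + dx) - f(x - dx))/2.    # central finite difference
        analytical = np.dot(dx, grad_f(x))        # dx . analytical gradient
        ratio = numerical/analytical
        # the gradient passes if the ratio is within <tolerance> of unity
        return (np.abs(1. - ratio) < tolerance) and not np.isnan(ratio)

    f = lambda x: np.sum(x**2)    # toy objective
    g = lambda x: 2*x             # its exact gradient
    print ratio_check(f, g, np.random.randn(5))   # -> True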

View file

@@ -102,6 +102,11 @@
         else:
             return expr

+    def Nparam_transformed(self):
+        ties = 0
+        for ar in self.tied_indices:
+            ties += ar.size - 1
+        return self.Nparam - len(self.constrained_fixed_indices) - ties
+
     def constrain_positive(self, which):
         """
@@ -149,8 +154,6 @@ class parameterised(object):
     def constrain_negative(self, which):
         """
         Set negative constraints.

View file

@@ -16,8 +16,13 @@ k = GPy.kern.rbf(Q) + GPy.kern.white(Q, 0.00001)
 K = k.K(X)
 Y = np.random.multivariate_normal(np.zeros(N),K,D).T

+<<<<<<< HEAD
 k = GPy.kern.rbf(Q) + GPy.kern.rbf(Q) + GPy.kern.white(Q)
 # k = GPy.kern.linear(Q, ARD = True) + GPy.kern.white(Q, 0.00001)
+=======
+# k = GPy.kern.rbf(Q) + GPy.kern.bias(Q) + GPy.kern.white(Q, 0.00001)
+k = GPy.kern.rbf(Q, ARD = False) + GPy.kern.white(Q, 0.00001)
+>>>>>>> master
 m = GPy.models.Bayesian_GPLVM(Y, Q, kernel = k, M=M)
 m.constrain_positive('(rbf|bias|noise|white|S)')
 # m.constrain_fixed('S', 1)

View file

@@ -3,16 +3,15 @@
 """
-Simple Gaussian Processes classification
+Gaussian Processes classification
 """
 import pylab as pb
 import numpy as np
 import GPy

 default_seed=10000

-######################################
-## 2 dimensional example
-def crescent_data(model_type='Full', inducing=10, seed=default_seed):
+def crescent_data(model_type='Full', inducing=10, seed=default_seed): #FIXME
     """Run a Gaussian process classification on the crescent data. The demonstration calls the basic GP classification model and uses EP to approximate the likelihood.

     :param model_type: type of model to fit ['Full', 'FITC', 'DTC'].
@@ -21,20 +20,28 @@ def crescent_data(model_type='Full', inducing=10, seed=default_seed):
     :param inducing : number of inducing variables (only used for 'FITC' or 'DTC').
     :type inducing: int
     """
     data = GPy.util.datasets.crescent_data(seed=seed)

-    likelihood = GPy.inference.likelihoods.probit(data['Y'])
+    # Kernel object
+    kernel = GPy.kern.rbf(data['X'].shape[1])
+
+    # Likelihood object
+    distribution = GPy.likelihoods.likelihood_functions.probit()
+    likelihood = GPy.likelihoods.EP(data['Y'], distribution)

     if model_type == 'Full':
-        m = GPy.models.GP_EP(data['X'], likelihood)
+        m = GPy.models.GP(data['X'], likelihood, kernel)
     else:
         # create sparse GP EP model
         m = GPy.models.sparse_GP_EP(data['X'], likelihood=likelihood, inducing=inducing, ep_proxy=model_type)

-    m.approximate_likelihood()
+    m.update_likelihood_approximation()
     print(m)

     # optimize
-    m.em()
+    m.optimize()
     print(m)

     # plot
@@ -42,54 +49,67 @@ def crescent_data(model_type='Full', inducing=10, seed=default_seed):
     return m

 def oil():
-    """Run a Gaussian process classification on the oil data. The demonstration calls the basic GP classification model and uses EP to approximate the likelihood."""
+    """
+    Run a Gaussian process classification on the oil data. The demonstration calls the basic GP classification model and uses EP to approximate the likelihood.
+    """
     data = GPy.util.datasets.oil()

-    likelihood = GPy.inference.likelihoods.probit(data['Y'][:, 0:1])
-
-    # create simple GP model
-    m = GPy.models.GP_EP(data['X'], likelihood)
-
-    # constrain all parameters to be positive
+    # Kernel object
+    kernel = GPy.kern.rbf(12)
+
+    # Likelihood object
+    distribution = GPy.likelihoods.likelihood_functions.probit()
+    likelihood = GPy.likelihoods.EP(data['Y'][:, 0:1], distribution)
+
+    # Create GP model
+    m = GPy.models.GP(data['X'], likelihood=likelihood, kernel=kernel)
+
+    # Constrain all parameters to be positive
     m.constrain_positive('')
     m.tie_param('lengthscale')
-    m.approximate_likelihood()
+    m.update_likelihood_approximation()

-    # optimize
+    # Optimize
     m.optimize()
-
-    # plot
-    #m.plot()
     print(m)
     return m

-def toy_linear_1d_classification(model_type='Full', inducing=4, seed=default_seed):
-    """Simple 1D classification example.
-
-    :param model_type: type of model to fit ['Full', 'FITC', 'DTC'].
+def toy_linear_1d_classification(seed=default_seed):
+    """
+    Simple 1D classification example
+
     :param seed : seed value for data generation (default is 4).
     :type seed: int
-    :param inducing : number of inducing variables (only used for 'FITC' or 'DTC').
-    :type inducing: int
     """
     data = GPy.util.datasets.toy_linear_1d_classification(seed=seed)

-    likelihood = GPy.inference.likelihoods.probit(data['Y'][:, 0:1])
-    assert model_type in ('Full', 'DTC', 'FITC')
-
-    # create simple GP model
-    if model_type == 'Full':
-        m = GPy.models.simple_GP_EP(data['X'], likelihood)
-    else:
-        # create sparse GP EP model
-        m = GPy.models.sparse_GP_EP(data['X'], likelihood=likelihood, inducing=inducing, ep_proxy=model_type)
-
-    m.constrain_positive('var')
-    m.constrain_positive('len')
-    m.tie_param('lengthscale')
-    m.approximate_likelihood()
-
-    # Optimize and plot
-    m.em(plot_all=False)  # EM algorithm
+    Y = data['Y'][:, 0:1]
+    Y[Y == -1] = 0
+
+    # Kernel object
+    kernel = GPy.kern.rbf(1)
+
+    # Likelihood object
+    distribution = GPy.likelihoods.likelihood_functions.probit()
+    likelihood = GPy.likelihoods.EP(Y, distribution)
+
+    # Model definition
+    m = GPy.models.GP(data['X'], likelihood=likelihood, kernel=kernel)
+
+    # Optimize
+    """
+    EPEM runs a loop that consists of two steps:
+    1) EP likelihood approximation:
+        m.update_likelihood_approximation()
+    2) Parameter optimization:
+        m.optimize()
+    """
+    m.EPEM()
+
+    # Plot
+    pb.subplot(211)
+    m.plot_f()
+    pb.subplot(212)
     m.plot()
     print(m)
     return m
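
As the comment block in toy_linear_1d_classification spells out, EPEM alternates two calls. A minimal sketch of that loop (illustrative only; it assumes a model m with an EP likelihood and a convergence threshold epsilon, and ignores the record-keeping the real method does):

    import numpy as np

    last = -np.inf
    change = np.inf
    while change > epsilon:
        m.update_likelihood_approximation()   # 1) EP likelihood approximation
        m.optimize()                          # 2) parameter optimization
        change = m.log_likelihood() - last
        last = m.log_likelihood()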

GPy/examples/poisson.py (new file, 47 lines)
View file

@@ -0,0 +1,47 @@
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+"""
+Gaussian Processes + Expectation Propagation - Poisson Likelihood
+"""
+
+import pylab as pb
+import numpy as np
+import GPy
+
+default_seed = 10000
+
+def toy_1d(seed=default_seed):
+    """
+    Simple 1D example with a Poisson likelihood
+
+    :param seed: seed value for data generation (default is 4).
+    :type seed: int
+    """
+    X = np.arange(0, 100, 5)[:, None]
+    F = np.round(np.sin(X/18.) + .1*X) + np.arange(5, 25)[:, None]
+    E = np.random.randint(-5, 5, 20)[:, None]
+    Y = F + E
+
+    kernel = GPy.kern.rbf(1)
+    distribution = GPy.likelihoods.likelihood_functions.Poisson()
+    likelihood = GPy.likelihoods.EP(Y, distribution)
+
+    m = GPy.models.GP(X, likelihood, kernel)
+    m.ensure_default_constraints()
+
+    # Approximate likelihood
+    m.update_likelihood_approximation()
+
+    # Optimize and plot
+    m.optimize()
+    #m.EPEM FIXME
+    print m
+
+    # Plot
+    pb.subplot(211)
+    m.plot_f()  # GP plot
+    pb.subplot(212)
+    m.plot()    # Output plot
+    return m
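
For a non-Gaussian likelihood like the Poisson above, the EP updates need the moments of cavity-times-likelihood, which have no closed form; the likelihoods module later in this diff computes them with golden search plus Simpson's rule. A rough standalone equivalent using plain trapezoidal quadrature (poisson_moments is a name made up for this sketch; the rate link is exp(f), i.e. the zero-location, unit-scale case):

    import numpy as np
    from scipy import stats

    def poisson_moments(y, mu, sigma):
        # cavity N(f|mu,sigma^2) times Poisson(y|exp(f)), integrated on a grid
        f = np.linspace(mu - 8*sigma, mu + 8*sigma, 20001)
        w = stats.norm.pdf(f, loc=mu, scale=sigma)*stats.poisson.pmf(y, np.exp(f))
        Z = np.trapz(w, f)                 # zeroth moment
        mu_hat = np.trapz(f*w, f)/Z        # first moment
        m2 = np.trapz(f**2*w, f)/Z         # second moment
        return Z, mu_hat, m2 - mu_hat**2   # Z_hat, mu_hat, sigma2_hat

    print poisson_moments(3, 1., .5)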

View file

@@ -0,0 +1,60 @@
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+"""
+Sparse Gaussian Processes regression with an RBF kernel
+"""
+
+import pylab as pb
+import numpy as np
+import GPy
+
+np.random.seed(2)
+pb.ion()
+N = 500
+M = 5
+pb.close('all')
+
+######################################
+## 1 dimensional example
+
+# sample inputs and outputs
+X = np.random.uniform(-3., 3., (N, 1))
+#Y = np.sin(X) + np.random.randn(N, 1)*0.05
+F = np.sin(X) + np.random.randn(N, 1)*0.05
+Y = np.ones([F.shape[0], 1])
+Y[F < 0] = -1
+likelihood = GPy.inference.likelihoods.probit(Y)
+
+# construct kernel
+rbf = GPy.kern.rbf(1)
+noise = GPy.kern.white(1)
+kernel = rbf + noise
+
+# create simple GP model
+#m = GPy.models.sparse_GP(X, Y=None, kernel=kernel, M=M, likelihood=likelihood)
+# constrain all parameters to be positive
+#m.constrain_fixed('prec', 100.)
+m = GPy.models.sparse_GP(X, Y, kernel, M=M)
+m.ensure_default_constraints()
+#if not isinstance(m.likelihood, GPy.inference.likelihoods.gaussian):
+#    m.approximate_likelihood()
+print m.checkgrad()
+m.optimize('tnc', messages=1)
+m.plot(samples=3)
+print m
+
+n = GPy.models.sparse_GP(X, Y=None, kernel=kernel, M=M, likelihood=likelihood)
+n.ensure_default_constraints()
+if not isinstance(n.likelihood, GPy.inference.likelihoods.gaussian):
+    n.approximate_likelihood()
+print n.checkgrad()
+pb.figure()
+n.plot()
+
+"""
+m = GPy.models.sparse_GP_regression(X, Y, kernel, M=M)
+m.ensure_default_constraints()
+print m.checkgrad()
+"""

View file

@ -1,240 +0,0 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
import random
from scipy import stats, linalg
from .likelihoods import likelihood
from ..core import model
from ..util.linalg import pdinv,mdot,jitchol
from ..util.plot import gpplot
from .. import kern
class EP_base:
"""
Expectation Propagation.
This is just the base class for expectation propagation. We'll extend it for full and sparse EP.
"""
def __init__(self,likelihood,epsilon=1e-3,powerep=[1.,1.]):
self.likelihood = likelihood
self.epsilon = epsilon
self.eta, self.delta = powerep
self.jitter = 1e-12
#Initial values - Likelihood approximation parameters:
#p(y|f) = t(f|tau_tilde,v_tilde)
self.restart_EP()
def restart_EP(self):
"""
Set the EP approximation to initial state
"""
self.tau_tilde = np.zeros(self.N)
self.v_tilde = np.zeros(self.N)
self.mu = np.zeros(self.N)
class Full(EP_base):
"""
:param likelihood: Output's likelihood (e.g. probit)
:type likelihood: GPy.inference.likelihood instance
:param K: prior covariance matrix
:type K: np.ndarray (N x N)
:param likelihood: Output's likelihood (e.g. probit)
:type likelihood: GPy.inference.likelihood instance
:param epsilon: Convergence criterion, maximum squared difference allowed between mean updates to stop iterations (float)
:param powerep: Power-EP parameters (eta,delta) - 2x1 numpy array (floats)
"""
def __init__(self,K,likelihood,*args,**kwargs):
assert K.shape[0] == K.shape[1]
self.K = K
self.N = self.K.shape[0]
EP_base.__init__(self,likelihood,*args,**kwargs)
def fit_EP(self,messages=False):
"""
The expectation-propagation algorithm.
For nomenclature see Rasmussen & Williams 2006 (pag. 52-60)
"""
#Prior distribution parameters: p(f|X) = N(f|0,K)
#self.K = self.kernel.K(self.X,self.X)
#Initial values - Posterior distribution parameters: q(f|X,Y) = N(f|mu,Sigma)
self.mu=np.zeros(self.N)
self.Sigma=self.K.copy()
"""
Initial values - Cavity distribution parameters:
q_(f|mu_,sigma2_) = Product{q_i(f|mu_i,sigma2_i)}
sigma_ = 1./tau_
mu_ = v_/tau_
"""
self.tau_ = np.empty(self.N,dtype=np.float64)
self.v_ = np.empty(self.N,dtype=np.float64)
#Initial values - Marginal moments
z = np.empty(self.N,dtype=np.float64)
self.Z_hat = np.empty(self.N,dtype=np.float64)
phi = np.empty(self.N,dtype=np.float64)
mu_hat = np.empty(self.N,dtype=np.float64)
sigma2_hat = np.empty(self.N,dtype=np.float64)
#Approximation
epsilon_np1 = self.epsilon + 1.
epsilon_np2 = self.epsilon + 1.
self.iterations = 0
self.np1 = [self.tau_tilde.copy()]
self.np2 = [self.v_tilde.copy()]
while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon:
update_order = np.random.permutation(self.N)
for i in update_order:
#Cavity distribution parameters
self.tau_[i] = 1./self.Sigma[i,i] - self.eta*self.tau_tilde[i]
self.v_[i] = self.mu[i]/self.Sigma[i,i] - self.eta*self.v_tilde[i]
#Marginal moments
self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.likelihood.moments_match(i,self.tau_[i],self.v_[i])
#Site parameters update
Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./self.Sigma[i,i])
Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - self.mu[i]/self.Sigma[i,i])
self.tau_tilde[i] = self.tau_tilde[i] + Delta_tau
self.v_tilde[i] = self.v_tilde[i] + Delta_v
#Posterior distribution parameters update
si=self.Sigma[:,i].reshape(self.N,1)
self.Sigma = self.Sigma - Delta_tau/(1.+ Delta_tau*self.Sigma[i,i])*np.dot(si,si.T)
self.mu = np.dot(self.Sigma,self.v_tilde)
self.iterations += 1
#Sigma recomptutation with Cholesky decompositon
Sroot_tilde_K = np.sqrt(self.tau_tilde)[:,None]*(self.K)
B = np.eye(self.N) + np.sqrt(self.tau_tilde)[None,:]*Sroot_tilde_K
L = jitchol(B)
V,info = linalg.flapack.dtrtrs(L,Sroot_tilde_K,lower=1)
self.Sigma = self.K - np.dot(V.T,V)
self.mu = np.dot(self.Sigma,self.v_tilde)
epsilon_np1 = np.mean(self.tau_tilde-self.np1[-1]**2)
epsilon_np2 = np.mean(self.v_tilde-self.np2[-1]**2)
self.np1.append(self.tau_tilde.copy())
self.np2.append(self.v_tilde.copy())
if messages:
print "EP iteration %i, epsiolon %d"%(self.iterations,epsilon_np1)
class FITC(EP_base):
"""
:param likelihood: Output's likelihood (e.g. probit)
:type likelihood: GPy.inference.likelihood instance
:param Knn_diag: The diagonal elements of Knn is a 1D vector
:param Kmn: The 'cross' variance between inducing inputs and data
:param Kmm: the covariance matrix of the inducing inputs
:param likelihood: Output's likelihood (e.g. probit)
:type likelihood: GPy.inference.likelihood instance
:param epsilon: Convergence criterion, maximum squared difference allowed between mean updates to stop iterations (float)
:param powerep: Power-EP parameters (eta,delta) - 2x1 numpy array (floats)
"""
def __init__(self,likelihood,Knn_diag,Kmn,Kmm,*args,**kwargs):
self.Knn_diag = Knn_diag
self.Kmn = Kmn
self.Kmm = Kmm
self.M = self.Kmn.shape[0]
self.N = self.Kmn.shape[1]
assert self.M <= self.N, 'The number of inducing inputs must be smaller than the number of observations'
assert len(Knn_diag) == self.N, 'Knn_diagonal has size different from N'
EP_base.__init__(self,likelihood,*args,**kwargs)
def fit_EP(self):
"""
The expectation-propagation algorithm with sparse pseudo-input.
For nomenclature see Naish-Guzman and Holden, 2008.
"""
"""
Prior approximation parameters:
q(f|X) = int_{df}{N(f|KfuKuu_invu,diag(Kff-Qff)*N(u|0,Kuu)} = N(f|0,Sigma0)
Sigma0 = diag(Knn-Qnn) + Qnn, Qnn = Knm*Kmmi*Kmn
"""
self.Kmmi, self.Kmm_hld = pdinv(self.Kmm)
self.P0 = self.Kmn.T
self.KmnKnm = np.dot(self.P0.T, self.P0)
self.KmmiKmn = np.dot(self.Kmmi,self.P0.T)
self.Qnn_diag = np.sum(self.P0.T*self.KmmiKmn,-2)
self.Diag0 = self.Knn_diag - self.Qnn_diag
self.R0 = jitchol(self.Kmmi).T
"""
Posterior approximation: q(f|y) = N(f| mu, Sigma)
Sigma = Diag + P*R.T*R*P.T + K
mu = w + P*gamma
"""
self.w = np.zeros(self.N)
self.gamma = np.zeros(self.M)
self.mu = np.zeros(self.N)
self.P = self.P0.copy()
self.R = self.R0.copy()
self.Diag = self.Diag0.copy()
self.Sigma_diag = self.Knn_diag
"""
Initial values - Cavity distribution parameters:
q_(g|mu_,sigma2_) = Product{q_i(g|mu_i,sigma2_i)}
sigma_ = 1./tau_
mu_ = v_/tau_
"""
self.tau_ = np.empty(self.N,dtype=np.float64)
self.v_ = np.empty(self.N,dtype=np.float64)
#Initial values - Marginal moments
z = np.empty(self.N,dtype=np.float64)
self.Z_hat = np.empty(self.N,dtype=np.float64)
phi = np.empty(self.N,dtype=np.float64)
mu_hat = np.empty(self.N,dtype=np.float64)
sigma2_hat = np.empty(self.N,dtype=np.float64)
#Approximation
epsilon_np1 = 1
epsilon_np2 = 1
self.iterations = 0
self.np1 = [self.tau_tilde.copy()]
self.np2 = [self.v_tilde.copy()]
while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon:
update_order = np.arange(self.N)
random.shuffle(update_order)
for i in update_order:
#Cavity distribution parameters
self.tau_[i] = 1./self.Sigma_diag[i] - self.eta*self.tau_tilde[i]
self.v_[i] = self.mu[i]/self.Sigma_diag[i] - self.eta*self.v_tilde[i]
#Marginal moments
self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.likelihood.moments_match(i,self.tau_[i],self.v_[i])
#Site parameters update
Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./self.Sigma_diag[i])
Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - self.mu[i]/self.Sigma_diag[i])
self.tau_tilde[i] = self.tau_tilde[i] + Delta_tau
self.v_tilde[i] = self.v_tilde[i] + Delta_v
#Posterior distribution parameters update
dtd1 = Delta_tau*self.Diag[i] + 1.
dii = self.Diag[i]
self.Diag[i] = dii - (Delta_tau * dii**2.)/dtd1
pi_ = self.P[i,:].reshape(1,self.M)
self.P[i,:] = pi_ - (Delta_tau*dii)/dtd1 * pi_
Rp_i = np.dot(self.R,pi_.T)
RTR = np.dot(self.R.T,np.dot(np.eye(self.M) - Delta_tau/(1.+Delta_tau*self.Sigma_diag[i]) * np.dot(Rp_i,Rp_i.T),self.R))
self.R = jitchol(RTR).T
self.w[i] = self.w[i] + (Delta_v - Delta_tau*self.w[i])*dii/dtd1
self.gamma = self.gamma + (Delta_v - Delta_tau*self.mu[i])*np.dot(RTR,self.P[i,:].T)
self.RPT = np.dot(self.R,self.P.T)
self.Sigma_diag = self.Diag + np.sum(self.RPT.T*self.RPT.T,-1)
self.mu = self.w + np.dot(self.P,self.gamma)
self.iterations += 1
#Sigma recomptutation with Cholesky decompositon
self.Diag = self.Diag0/(1.+ self.Diag0 * self.tau_tilde)
self.P = (self.Diag / self.Diag0)[:,None] * self.P0
self.RPT0 = np.dot(self.R0,self.P0.T)
L = jitchol(np.eye(self.M) + np.dot(self.RPT0,(1./self.Diag0 - self.Diag/(self.Diag0**2))[:,None]*self.RPT0.T))
self.R,info = linalg.flapack.dtrtrs(L,self.R0,lower=1)
self.RPT = np.dot(self.R,self.P.T)
self.Sigma_diag = self.Diag + np.sum(self.RPT.T*self.RPT.T,-1)
self.w = self.Diag * self.v_tilde
self.gamma = np.dot(self.R.T, np.dot(self.RPT,self.v_tilde))
self.mu = self.w + np.dot(self.P,self.gamma)
epsilon_np1 = sum((self.tau_tilde-self.np1[-1])**2)/self.N
epsilon_np2 = sum((self.v_tilde-self.np2[-1])**2)/self.N
self.np1.append(self.tau_tilde.copy())
self.np2.append(self.v_tilde.copy())
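
The inner loop above avoids recomputing Sigma from scratch at every site update: changing one site precision by Delta_tau is a rank-one correction, as in the Sherman-Morrison identity. A small standalone numpy check of that identity (all names local to this sketch, not GPy code):

    import numpy as np

    N = 4
    A = np.random.randn(N, N)
    K = np.dot(A, A.T) + np.eye(N)    # a valid prior covariance
    tau = np.random.rand(N)           # current site precisions
    Sigma = np.linalg.inv(np.linalg.inv(K) + np.diag(tau))

    i, Delta_tau = 2, 0.7             # update a single site
    si = Sigma[:, i:i+1]
    Sigma_rank1 = Sigma - Delta_tau/(1. + Delta_tau*Sigma[i, i])*np.dot(si, si.T)

    tau[i] += Delta_tau               # recompute from scratch to compare
    Sigma_full = np.linalg.inv(np.linalg.inv(K) + np.diag(tau))
    print np.allclose(Sigma_rank1, Sigma_full)   # -> True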

View file

@ -1,219 +0,0 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from scipy import stats
import scipy as sp
import pylab as pb
from ..util.plot import gpplot
class likelihood:
def __init__(self,Y,location=0,scale=1):
"""
Likelihood class for doing Expectation propagation
:param Y: observed output (Nx1 numpy.darray)
..Note:: Y values allowed depend on the likelihood used
"""
self.Y = Y
self.N = self.Y.shape[0]
self.location = location
self.scale = scale
def plot1Da(self,X_new,Mean_new,Var_new,X_u,Mean_u,Var_u):
"""
Plot the predictive distribution of the GP model for 1-dimensional inputs
:param X_new: The points at which to make a prediction
:param Mean_new: mean values at X_new
:param Var_new: variance values at X_new
:param X_u: input (inducing) points used to train the model
:param Mean_u: mean values at X_u
:param Var_new: variance values at X_u
"""
assert X_new.shape[1] == 1, 'Number of dimensions must be 1'
gpplot(X_new,Mean_new,Var_new)
pb.errorbar(X_u,Mean_u,2*np.sqrt(Var_u),fmt='r+')
pb.plot(X_u,Mean_u,'ro')
def plot2D(self,X,X_new,F_new,U=None):
"""
Predictive distribution of the fitted GP model for 2-dimensional inputs
:param X_new: The points at which to make a prediction
:param Mean_new: mean values at X_new
:param Var_new: variance values at X_new
:param X_u: input points used to train the model
:param Mean_u: mean values at X_u
:param Var_new: variance values at X_u
"""
N,D = X_new.shape
assert D == 2, 'Number of dimensions must be 2'
n = np.sqrt(N)
x1min = X_new[:,0].min()
x1max = X_new[:,0].max()
x2min = X_new[:,1].min()
x2max = X_new[:,1].max()
pb.imshow(F_new.reshape(n,n),extent=(x1min,x1max,x2max,x2min),vmin=0,vmax=1)
pb.colorbar()
C1 = np.arange(self.N)[self.Y.flatten()==1]
C2 = np.arange(self.N)[self.Y.flatten()==-1]
[pb.plot(X[i,0],X[i,1],'ro') for i in C1]
[pb.plot(X[i,0],X[i,1],'bo') for i in C2]
pb.xlim(x1min,x1max)
pb.ylim(x2min,x2max)
if U is not None:
[pb.plot(a,b,'wo') for a,b in U]
class probit(likelihood):
"""
Probit likelihood
Y is expected to take values in {-1,1}
-----
$$
L(x) = \\Phi (Y_i*f_i)
$$
"""
def moments_match(self,i,tau_i,v_i):
"""
Moments match of the marginal approximation in EP algorithm
:param i: number of observation (int)
:param tau_i: precision of the cavity distribution (float)
:param v_i: mean/variance of the cavity distribution (float)
"""
z = self.Y[i]*v_i/np.sqrt(tau_i**2 + tau_i)
Z_hat = stats.norm.cdf(z)
phi = stats.norm.pdf(z)
mu_hat = v_i/tau_i + self.Y[i]*phi/(Z_hat*np.sqrt(tau_i**2 + tau_i))
sigma2_hat = 1./tau_i - (phi/((tau_i**2+tau_i)*Z_hat))*(z+phi/Z_hat)
return Z_hat, mu_hat, sigma2_hat
def plot1Db(self,X,X_new,F_new,U=None):
assert X.shape[1] == 1, 'Number of dimensions must be 1'
gpplot(X_new,F_new,np.zeros(X_new.shape[0]))
pb.plot(X,(self.Y+1)/2,'kx',mew=1.5)
pb.ylim(-0.2,1.2)
if U is not None:
pb.plot(U,U*0+.5,'r|',mew=1.5,markersize=12)
def predictive_mean(self,mu,variance):
return stats.norm.cdf(mu/np.sqrt(1+variance))
def _log_likelihood_gradients():
raise NotImplementedError
class poisson(likelihood):
"""
Poisson likelihood
Y is expected to take values in {0,1,2,...}
-----
$$
L(x) = \exp(\lambda) * \lambda**Y_i / Y_i!
$$
"""
def moments_match(self,i,tau_i,v_i):
"""
Moments match of the marginal approximation in EP algorithm
:param i: number of observation (int)
:param tau_i: precision of the cavity distribution (float)
:param v_i: mean/variance of the cavity distribution (float)
"""
mu = v_i/tau_i
sigma = np.sqrt(1./tau_i)
def poisson_norm(f):
"""
Product of the likelihood and the cavity distribution
"""
pdf_norm_f = stats.norm.pdf(f,loc=mu,scale=sigma)
rate = np.exp( (f*self.scale)+self.location)
poisson = stats.poisson.pmf(float(self.Y[i]),rate)
return pdf_norm_f*poisson
def log_pnm(f):
"""
Log of poisson_norm
"""
return -(-.5*(f-mu)**2/sigma**2 - np.exp( (f*self.scale)+self.location) + ( (f*self.scale)+self.location)*self.Y[i])
"""
Golden Search and Simpson's Rule
--------------------------------
Simpson's Rule is used to calculate the moments mumerically, it needs a grid of points as input.
Golden Search is used to find the mode in the poisson_norm distribution and define around it the grid for Simpson's Rule
"""
#TODO golden search & simpson's rule can be defined in the general likelihood class, rather than in each specific case.
#Golden search
golden_A = -1 if self.Y[i] == 0 else np.array([np.log(self.Y[i]),mu]).min() #Lower limit
golden_B = np.array([np.log(self.Y[i]),mu]).max() #Upper limit
golden_A = (golden_A - self.location)/self.scale
golden_B = (golden_B - self.location)/self.scale
opt = sp.optimize.golden(log_pnm,brack=(golden_A,golden_B)) #Better to work with log_pnm than with poisson_norm
# Simpson's approximation
width = 3./np.log(max(self.Y[i],2))
A = opt - width #Lower limit
B = opt + width #Upper limit
K = 10*int(np.log(max(self.Y[i],150))) #Number of points in the grid, we DON'T want K to be the same number for every case
h = (B-A)/K # length of the intervals
grid_x = np.hstack([np.linspace(opt-width,opt,K/2+1)[1:-1], np.linspace(opt,opt+width,K/2+1)]) # grid of points (X axis)
x = np.hstack([A,B,grid_x[range(1,K,2)],grid_x[range(2,K-1,2)]]) # grid_x rearranged, just to make Simpson's algorithm easier
zeroth = np.hstack([poisson_norm(A),poisson_norm(B),[4*poisson_norm(f) for f in grid_x[range(1,K,2)]],[2*poisson_norm(f) for f in grid_x[range(2,K-1,2)]]]) # grid of points (Y axis) rearranged like x
first = zeroth*x
second = first*x
Z_hat = sum(zeroth)*h/3 # Zero-th moment
mu_hat = sum(first)*h/(3*Z_hat) # First moment
m2 = sum(second)*h/(3*Z_hat) # Second moment
sigma2_hat = m2 - mu_hat**2 # Second central moment
return float(Z_hat), float(mu_hat), float(sigma2_hat)
def plot1Db(self,X,X_new,F_new,F2_new=None,U=None):
pb.subplot(212)
#gpplot(X_new,F_new,np.sqrt(F2_new))
pb.plot(X_new,F_new)#,np.sqrt(F2_new)) #FIXME
pb.plot(X,self.Y,'kx',mew=1.5)
if U is not None:
pb.plot(U,np.ones(U.shape[0])*self.Y.min()*.8,'r|',mew=1.5,markersize=12)
def predictive_mean(self,mu,variance):
return np.exp(mu*self.scale + self.location)
def predictive_variance(self,mu,variance):
return mu
def _log_likelihood_gradients():
raise NotImplementedError
class gaussian(likelihood):
"""
Gaussian likelihood
Y is expected to take values in (-inf,inf)
"""
def moments_match(self,i,tau_i,v_i):
"""
Moments match of the marginal approximation in EP algorithm
:param i: number of observation (int)
:param tau_i: precision of the cavity distribution (float)
:param v_i: mean/variance of the cavity distribution (float)
"""
mu = v_i/tau_i
sigma = np.sqrt(1./tau_i)
s = 1. if self.Y[i] == 0 else 1./self.Y[i]
sigma2_hat = 1./(1./sigma**2 + 1./s**2)
mu_hat = sigma2_hat*(mu/sigma**2 + self.Y[i]/s**2)
Z_hat = 1./np.sqrt(2*np.pi) * 1./np.sqrt(sigma**2+s**2) * np.exp(-.5*(mu-self.Y[i])**2/(sigma**2 + s**2))
return Z_hat, mu_hat, sigma2_hat
def plot1Db(self,X,X_new,F_new,U=None):
assert X.shape[1] == 1, 'Number of dimensions must be 1'
gpplot(X_new,F_new,np.zeros(X_new.shape[0]))
pb.plot(X,self.Y,'kx',mew=1.5)
if U is not None:
pb.plot(U,np.ones(U.shape[0])*self.Y.min()*.8,'r|',mew=1.5,markersize=12)
def predictive_mean(self,mu,Sigma):
return mu
def _log_likelihood_gradients():
raise NotImplementedError
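
The probit moments_match above is the closed-form result of integrating the cavity Gaussian against Phi(y*f). A standalone sketch checking those formulas against brute-force quadrature (probit_moments is a name made up for this sketch; not GPy code):

    import numpy as np
    from scipy import stats

    def probit_moments(y, tau, v):
        # the closed forms used by probit.moments_match above
        z = y*v/np.sqrt(tau**2 + tau)
        Z_hat = stats.norm.cdf(z)
        phi = stats.norm.pdf(z)
        mu_hat = v/tau + y*phi/(Z_hat*np.sqrt(tau**2 + tau))
        sigma2_hat = 1./tau - (phi/((tau**2 + tau)*Z_hat))*(z + phi/Z_hat)
        return Z_hat, mu_hat, sigma2_hat

    y, tau, v = 1, 2.0, 0.6   # cavity: mu = v/tau, sigma2 = 1/tau
    f = np.linspace(-10, 10, 200001)
    w = stats.norm.pdf(f, loc=v/tau, scale=np.sqrt(1./tau))*stats.norm.cdf(y*f)
    Z = np.trapz(w, f)
    mu = np.trapz(f*w, f)/Z
    s2 = np.trapz(f**2*w, f)/Z - mu**2
    print probit_moments(y, tau, v)   # matches (Z, mu, s2) below
    print Z, mu, s2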

View file

@@ -14,14 +14,14 @@ class Matern32(kernpart):
     .. math::

-       k(r) = \sigma^2 (1 + \sqrt{3} r) \exp(- \sqrt{3} r) \qquad \qquad \text{ where } r = \sqrt{\sum_{i=1}^D \frac{(x_i-y_i)^2}{\ell_i^2}}
+       k(r) = \sigma^2 (1 + \sqrt{3} r) \exp(- \sqrt{3} r) \ \ \ \ \ \text{ where } r = \sqrt{\sum_{i=1}^D \frac{(x_i-y_i)^2}{\ell_i^2}}

     :param D: the number of input dimensions
     :type D: int
     :param variance: the variance :math:`\sigma^2`
     :type variance: float
     :param lengthscale: the vector of lengthscales :math:`\ell_i`
-    :type lengthscale: np.ndarray of size (1,) or (D,) depending on ARD
+    :type lengthscale: array or list of the appropriate size (or float if there is only one lengthscale parameter)
     :param ARD: Auto Relevance Determination. If equal to "False", the kernel is isotropic (ie. one single lengthscale parameter \ell), otherwise there is one lengthscale parameter per dimension.
     :type ARD: Boolean
     :rtype: kernel object
@@ -35,17 +35,19 @@ class Matern32(kernpart):
             self.Nparam = 2
             self.name = 'Mat32'
             if lengthscale is not None:
-                assert lengthscale.shape == (1,)
+                lengthscale = np.asarray(lengthscale)
+                assert lengthscale.size == 1, "Only one lengthscale needed for non-ARD kernel"
             else:
                 lengthscale = np.ones(1)
         else:
             self.Nparam = self.D + 1
-            self.name = 'Mat32_ARD'
+            self.name = 'Mat32'
             if lengthscale is not None:
-                assert lengthscale.shape == (self.D,)
+                lengthscale = np.asarray(lengthscale)
+                assert lengthscale.size == self.D, "bad number of lengthscales"
             else:
                 lengthscale = np.ones(self.D)
-        self._set_params(np.hstack((variance,lengthscale)))
+        self._set_params(np.hstack((variance,lengthscale.flatten())))

     def _get_params(self):
         """return the value of the parameters."""
@@ -116,9 +118,9 @@ class Matern32(kernpart):
     :param F1: vector of derivatives of F
     :type F1: np.array
     :param F2: vector of second derivatives of F
     :type F2: np.array
     :param lower,upper: boundaries of the input domain
     :type lower,upper: floats
     """
     assert self.D == 1
     def L(x,i):
@@ -133,4 +135,3 @@ class Matern32(kernpart):
     #print "OLD \n", np.dot(F1lower,F1lower.T), "\n \n"
     #return(G)
     return(self.lengthscale**3/(12.*np.sqrt(3)*self.variance) * G + 1./self.variance*np.dot(Flower,Flower.T) + self.lengthscale**2/(3.*self.variance)*np.dot(F1lower,F1lower.T))

View file

@@ -13,14 +13,14 @@ class Matern52(kernpart):
     .. math::

-       k(r) = \sigma^2 (1 + \sqrt{5} r + \frac{5}{3} r^2) \exp(- \sqrt{5} r) \qquad \qquad \text{ where } r = \sqrt{\sum_{i=1}^D \frac{(x_i-y_i)^2}{\ell_i^2}}
+       k(r) = \sigma^2 (1 + \sqrt{5} r + \frac{5}{3} r^2) \exp(- \sqrt{5} r) \ \ \ \ \ \text{ where } r = \sqrt{\sum_{i=1}^D \frac{(x_i-y_i)^2}{\ell_i^2}}

     :param D: the number of input dimensions
     :type D: int
     :param variance: the variance :math:`\sigma^2`
     :type variance: float
     :param lengthscale: the vector of lengthscales :math:`\ell_i`
-    :type lengthscale: np.ndarray of size (1,) or (D,) depending on ARD
+    :type lengthscale: array or list of the appropriate size (or float if there is only one lengthscale parameter)
     :param ARD: Auto Relevance Determination. If equal to "False", the kernel is isotropic (ie. one single lengthscale parameter \ell), otherwise there is one lengthscale parameter per dimension.
     :type ARD: Boolean
     :rtype: kernel object
@@ -33,17 +33,19 @@ class Matern52(kernpart):
             self.Nparam = 2
             self.name = 'Mat52'
             if lengthscale is not None:
-                assert lengthscale.shape == (1,)
+                lengthscale = np.asarray(lengthscale)
+                assert lengthscale.size == 1, "Only one lengthscale needed for non-ARD kernel"
             else:
                 lengthscale = np.ones(1)
         else:
             self.Nparam = self.D + 1
-            self.name = 'Mat52_ARD'
+            self.name = 'Mat52'
             if lengthscale is not None:
-                assert lengthscale.shape == (self.D,)
+                lengthscale = np.asarray(lengthscale)
+                assert lengthscale.size == self.D, "bad number of lengthscales"
             else:
                 lengthscale = np.ones(self.D)
-        self._set_params(np.hstack((variance,lengthscale)))
+        self._set_params(np.hstack((variance,lengthscale.flatten())))

     def _get_params(self):
         """return the value of the parameters."""

View file

@@ -13,14 +13,14 @@ class exponential(kernpart):
     .. math::

-       k(r) = \sigma^2 \exp(- r) \qquad \qquad \text{ where } r = \sqrt{\sum_{i=1}^D \frac{(x_i-y_i)^2}{\ell_i^2}}
+       k(r) = \sigma^2 \exp(- r) \ \ \ \ \ \text{ where } r = \sqrt{\sum_{i=1}^D \frac{(x_i-y_i)^2}{\ell_i^2}}

     :param D: the number of input dimensions
     :type D: int
     :param variance: the variance :math:`\sigma^2`
     :type variance: float
     :param lengthscale: the vector of lengthscales :math:`\ell_i`
-    :type lengthscale: np.ndarray of size (1,) or (D,) depending on ARD
+    :type lengthscale: array or list of the appropriate size (or float if there is only one lengthscale parameter)
     :param ARD: Auto Relevance Determination. If equal to "False", the kernel is isotropic (ie. one single lengthscale parameter \ell), otherwise there is one lengthscale parameter per dimension.
     :type ARD: Boolean
     :rtype: kernel object
@@ -33,17 +33,19 @@ class exponential(kernpart):
             self.Nparam = 2
             self.name = 'exp'
             if lengthscale is not None:
-                assert lengthscale.shape == (1,)
+                lengthscale = np.asarray(lengthscale)
+                assert lengthscale.size == 1, "Only one lengthscale needed for non-ARD kernel"
             else:
                 lengthscale = np.ones(1)
         else:
             self.Nparam = self.D + 1
-            self.name = 'exp_ARD'
+            self.name = 'exp'
            if lengthscale is not None:
-                assert lengthscale.shape == (self.D,)
+                lengthscale = np.asarray(lengthscale)
+                assert lengthscale.size == self.D, "bad number of lengthscales"
             else:
                 lengthscale = np.ones(self.D)
-        self._set_params(np.hstack((variance,lengthscale)))
+        self._set_params(np.hstack((variance,lengthscale.flatten())))

     def _get_params(self):
         """return the value of the parameters."""
@@ -87,7 +89,7 @@ class exponential(kernpart):
         dl = self.variance*dvar*dist2M.sum(-1)*invdist
         target[1] += np.sum(dl*partial)

     def dKdiag_dtheta(self,partial,X,target):
         """derivative of the diagonal of the covariance matrix with respect to the parameters."""
         #NB: derivative of diagonal elements wrt lengthscale is 0
         target[0] += np.sum(partial)
@@ -110,9 +112,9 @@ class exponential(kernpart):
     :param F: vector of functions
     :type F: np.array
     :param F1: vector of derivatives of F
     :type F1: np.array
     :param lower,upper: boundaries of the input domain
     :type lower,upper: floats
     """
     assert self.D == 1
     def L(x,i):
@@ -124,8 +126,3 @@ class exponential(kernpart):
             G[i,j] = G[j,i] = integrate.quad(lambda x : L(x,i)*L(x,j),lower,upper)[0]
     Flower = np.array([f(lower) for f in F])[:,None]
     return(self.lengthscale/2./self.variance * G + 1./self.variance * np.dot(Flower,Flower.T))

View file

@@ -6,7 +6,7 @@ import numpy as np
 from ..core.parameterised import parameterised
 from kernpart import kernpart
 import itertools
 from product_orthogonal import product_orthogonal

 class kern(parameterised):
     def __init__(self,D,parts=[], input_slices=None):
@@ -155,7 +155,7 @@ class kern(parameterised):
     D = K1.D + K2.D
-    newkernparts = [product_orthogonal(k1,k2).parts[0] for k1, k2 in itertools.product(K1.parts,K2.parts)]
+    newkernparts = [product_orthogonal(k1,k2) for k1, k2 in itertools.product(K1.parts,K2.parts)]
     slices = []
     for sl1, sl2 in itertools.product(K1.input_slices,K2.input_slices):
@@ -235,6 +235,8 @@ class kern(parameterised):
             X2 = X
         target = np.zeros(self.Nparam)
         [p.dK_dtheta(partial[s1,s2],X[s1,i_s],X2[s2,i_s],target[ps]) for p,i_s,ps,s1,s2 in zip(self.parts, self.input_slices, self.param_slices, slices1, slices2)]
+        #TODO: transform the gradients here!
         return target

     def dK_dX(self,partial,X,X2=None,slices1=None,slices2=None):
@@ -324,6 +326,7 @@ class kern(parameterised):
         [p.psi2(Z[s2,i_s],mu[s1,i_s],S[s1,i_s],target[s1,s2,s2]) for p,i_s,s1,s2 in zip(self.parts,self.input_slices,slices1,slices2)]
         # "crossterms". Here we are recomputing psi1 for white (we don't need to), but it's
         # not really expensive, since it's just a matrix of zeroes.
         # psi1_matrices = [np.zeros((mu.shape[0], Z.shape[0])) for p in self.parts]

View file

@@ -15,34 +15,33 @@ class linear(kernpart):
     :param D: the number of input dimensions
     :type D: int
     :param variances: the vector of variances :math:`\sigma^2_i`
-    :type variances: np.ndarray of size (1,) or (D,) depending on ARD
-    :param ARD: Auto Relevance Determination. If equal to "False", the kernel is isotropic (ie. one single variance parameter \sigma^2), otherwise there is one variance parameter per dimension.
+    :type variances: array or list of the appropriate size (or float if there is only one variance parameter)
+    :param ARD: Auto Relevance Determination. If equal to "False", the kernel has only one variance parameter \sigma^2, otherwise there is one variance parameter per dimension.
     :type ARD: Boolean
     :rtype: kernel object
     """

-    def __init__(self,D,variances=None,ARD=True):
+    def __init__(self,D,variances=None,ARD=False):
         self.D = D
         self.ARD = ARD
         if ARD == False:
             self.Nparam = 1
             self.name = 'linear'
             if variances is not None:
-                if isinstance(variances, float):
-                    variances = np.array([variances])
-                assert variances.shape == (1,)
+                variances = np.asarray(variances)
+                assert variances.size == 1, "Only one variance needed for non-ARD kernel"
             else:
                 variances = np.ones(1)
             self._Xcache, self._X2cache = np.empty(shape=(2,))
         else:
             self.Nparam = self.D
-            self.name = 'linear_ARD'
+            self.name = 'linear'
             if variances is not None:
-                assert variances.shape == (self.D,)
+                variances = np.asarray(variances)
+                assert variances.size == self.D, "bad number of variances"
             else:
                 variances = np.ones(self.D)
-        self._set_params(variances)
+        self._set_params(variances.flatten())
         #initialize cache
         self._Z, self._mu, self._S = np.empty(shape=(3,1))
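
The same constructor change recurs across the Matern, exponential and linear kernparts above: strict shape asserts are replaced by np.asarray normalisation, so a float, a list or an array is accepted. Roughly, the new behaviour (a sketch, not the actual class code; normalise_param is a made-up name):

    import numpy as np

    def normalise_param(value, expected_size, default=1.):
        # accept float, list or array, as the updated kernels now do
        if value is None:
            return default*np.ones(expected_size)
        value = np.asarray(value)
        assert value.size == expected_size, "bad number of parameters"
        return value.flatten()

    print normalise_param(1.5, 1)             # -> [ 1.5]
    print normalise_param([1., 2., 3.], 3)    # -> [ 1.  2.  3.]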

View file

@@ -12,7 +12,7 @@ class rbf(kernpart):
     .. math::

-       k(r) = \sigma^2 \exp(- \frac{1}{2} r^2) \qquad \qquad \text{ where } r^2 = \sum_{i=1}^d \frac{(x_i-x^\prime_i)^2}{\ell_i^2}
+       k(r) = \sigma^2 \exp(- \frac{1}{2} r^2) \ \ \ \ \ \text{ where } r^2 = \sum_{i=1}^d \frac{(x_i-x^\prime_i)^2}{\ell_i^2}

     where \ell_i is the lengthscale, \sigma^2 the variance and d the dimensionality of the input.
@@ -21,7 +21,7 @@ class rbf(kernpart):
     :param variance: the variance of the kernel
     :type variance: float
     :param lengthscale: the vector of lengthscales of the kernel
-    :type lengthscale: np.ndarray of size (1,) or (D,) depending on ARD
+    :type lengthscale: array or list of the appropriate size (or float if there is only one lengthscale parameter)
     :param ARD: Auto Relevance Determination. If equal to "False", the kernel is isotropic (ie. one single lengthscale parameter \ell), otherwise there is one lengthscale parameter per dimension.
     :type ARD: Boolean
     :rtype: kernel object
@@ -75,6 +75,7 @@ class rbf(kernpart):
     def K(self,X,X2,target):
         if X2 is None:
             X2 = X
+
         self._K_computations(X,X2)
         np.add(self.variance*self._K_dvar, target,target)

GPy/likelihoods/EP.py (new file, 311 lines)
View file

@ -0,0 +1,311 @@
import numpy as np
from scipy import stats, linalg
from ..util.linalg import pdinv,mdot,jitchol
from likelihood import likelihood
class EP(likelihood):
def __init__(self,data,likelihood_function,epsilon=1e-3,power_ep=[1.,1.]):
"""
Expectation Propagation
Arguments
---------
epsilon : Convergence criterion, maximum squared difference allowed between mean updates to stop iterations (float)
likelihood_function : a likelihood function (see likelihood_functions.py)
"""
self.likelihood_function = likelihood_function
self.epsilon = epsilon
self.eta, self.delta = power_ep
self.data = data
self.N = self.data.size
self.is_heteroscedastic = True
self.Nparams = 0
#Initial values - Likelihood approximation parameters:
#p(y|f) = t(f|tau_tilde,v_tilde)
self.tau_tilde = np.zeros(self.N)
self.v_tilde = np.zeros(self.N)
#initial values for the GP variables
self.Y = np.zeros((self.N,1))
self.covariance_matrix = np.eye(self.N)
self.precision = np.ones(self.N)
self.Z = 0
self.YYT = None
def predictive_values(self,mu,var):
return self.likelihood_function.predictive_values(mu,var)
def _get_params(self):
return np.zeros(0)
def _get_param_names(self):
return []
def _set_params(self,p):
pass # TODO: the EP likelihood might want to take some parameters...
def _gradients(self,partial):
return np.zeros(0) # TODO: the EP likelihood might want to take some parameters...
def _compute_GP_variables(self):
#Variables to be called from GP
mu_tilde = self.v_tilde/self.tau_tilde #When calling EP, this variable is used instead of Y in the GP model
sigma_sum = 1./self.tau_ + 1./self.tau_tilde
mu_diff_2 = (self.v_/self.tau_ - mu_tilde)**2
self.Z = np.sum(np.log(self.Z_hat)) + 0.5*np.sum(np.log(sigma_sum)) + 0.5*np.sum(mu_diff_2/sigma_sum) #Normalization constant, aka Z_ep
self.Y = mu_tilde[:,None]
self.YYT = np.dot(self.Y,self.Y.T)
self.precision = self.tau_tilde
self.covariance_matrix = np.diag(1./self.precision)
def fit_full(self,K):
"""
The expectation-propagation algorithm.
For nomenclature see Rasmussen & Williams 2006.
"""
#Prior distribution parameters: p(f|X) = N(f|0,K)
self.tau_tilde = np.zeros(self.N)
self.v_tilde = np.zeros(self.N)
#Initial values - Posterior distribution parameters: q(f|X,Y) = N(f|mu,Sigma)
mu = np.zeros(self.N)
Sigma = K.copy()
"""
Initial values - Cavity distribution parameters:
q_(f|mu_,sigma2_) = Product{q_i(f|mu_i,sigma2_i)}
sigma_ = 1./tau_
mu_ = v_/tau_
"""
self.tau_ = np.empty(self.N,dtype=float)
self.v_ = np.empty(self.N,dtype=float)
#Initial values - Marginal moments
z = np.empty(self.N,dtype=float)
self.Z_hat = np.empty(self.N,dtype=float)
phi = np.empty(self.N,dtype=float)
mu_hat = np.empty(self.N,dtype=float)
sigma2_hat = np.empty(self.N,dtype=float)
#Approximation
epsilon_np1 = self.epsilon + 1.
epsilon_np2 = self.epsilon + 1.
self.iterations = 0
self.np1 = [self.tau_tilde.copy()]
self.np2 = [self.v_tilde.copy()]
while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon:
update_order = np.random.permutation(self.N)
for i in update_order:
#Cavity distribution parameters
self.tau_[i] = 1./Sigma[i,i] - self.eta*self.tau_tilde[i]
self.v_[i] = mu[i]/Sigma[i,i] - self.eta*self.v_tilde[i]
#Marginal moments
self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.likelihood_function.moments_match(self.data[i],self.tau_[i],self.v_[i])
#Site parameters update
Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma[i,i])
Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma[i,i])
self.tau_tilde[i] = self.tau_tilde[i] + Delta_tau
self.v_tilde[i] = self.v_tilde[i] + Delta_v
#Posterior distribution parameters update
si=Sigma[:,i].reshape(self.N,1)
Sigma = Sigma - Delta_tau/(1.+ Delta_tau*Sigma[i,i])*np.dot(si,si.T)
mu = np.dot(Sigma,self.v_tilde)
self.iterations += 1
#Sigma recomptutation with Cholesky decompositon
Sroot_tilde_K = np.sqrt(self.tau_tilde)[:,None]*K
B = np.eye(self.N) + np.sqrt(self.tau_tilde)[None,:]*Sroot_tilde_K
L = jitchol(B)
V,info = linalg.flapack.dtrtrs(L,Sroot_tilde_K,lower=1)
Sigma = K - np.dot(V.T,V)
mu = np.dot(Sigma,self.v_tilde)
epsilon_np1 = sum((self.tau_tilde-self.np1[-1])**2)/self.N
epsilon_np2 = sum((self.v_tilde-self.np2[-1])**2)/self.N
self.np1.append(self.tau_tilde.copy())
self.np2.append(self.v_tilde.copy())
return self._compute_GP_variables()
def fit_DTC(self, Knn_diag, Kmn, Kmm):
"""
The expectation-propagation algorithm with sparse pseudo-input.
For nomenclature see ... 2013.
"""
#TODO: this doesn;t work with uncertain inputs!
"""
Prior approximation parameters:
q(f|X) = int_{df}{N(f|KfuKuu_invu,diag(Kff-Qff)*N(u|0,Kuu)} = N(f|0,Sigma0)
Sigma0 = Qnn = Knm*Kmmi*Kmn
"""
Kmmi, Lm, Lmi, Kmm_logdet = pdinv(Kmm)
KmnKnm = np.dot(Kmn, Kmn.T)
KmmiKmn = np.dot(Kmmi,Kmn)
Qnn_diag = np.sum(Kmn*KmmiKmn,-2)
LLT0 = Kmm.copy()
"""
Posterior approximation: q(f|y) = N(f| mu, Sigma)
Sigma = Diag + P*R.T*R*P.T + K
mu = w + P*gamma
"""
mu = np.zeros(self.N)
LLT = Kmm.copy()
Sigma_diag = Qnn_diag.copy()
"""
Initial values - Cavity distribution parameters:
q_(g|mu_,sigma2_) = Product{q_i(g|mu_i,sigma2_i)}
sigma_ = 1./tau_
mu_ = v_/tau_
"""
tau_ = np.empty(self.N,dtype=float)
v_ = np.empty(self.N,dtype=float)
#Initial values - Marginal moments
z = np.empty(self.N,dtype=float)
Z_hat = np.empty(self.N,dtype=float)
phi = np.empty(self.N,dtype=float)
mu_hat = np.empty(self.N,dtype=float)
sigma2_hat = np.empty(self.N,dtype=float)
#Approximation
epsilon_np1 = 1
epsilon_np2 = 1
self.iterations = 0
np1 = [tau_tilde.copy()]
np2 = [v_tilde.copy()]
while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon:
update_order = np.random.permutation(self.N)
for i in update_order:
#Cavity distribution parameters
tau_[i] = 1./Sigma_diag[i] - self.eta*tau_tilde[i]
v_[i] = mu[i]/Sigma_diag[i] - self.eta*v_tilde[i]
#Marginal moments
Z_hat[i], mu_hat[i], sigma2_hat[i] = self.likelihood_function.moments_match(self.data[i],tau_[i],v_[i])
#Site parameters update
Delta_tau = delta/self.eta*(1./sigma2_hat[i] - 1./Sigma_diag[i])
Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma_diag[i])
tau_tilde[i] = tau_tilde[i] + Delta_tau
v_tilde[i] = v_tilde[i] + Delta_v
#Posterior distribution parameters update
LLT = LLT + np.outer(Kmn[:,i],Kmn[:,i])*Delta_tau
L = jitchol(LLT)
V,info = linalg.flapack.dtrtrs(L,Kmn,lower=1)
Sigma_diag = np.sum(V*V,-2)
si = np.sum(V.T*V[:,i],-1)
mu = mu + (Delta_v-Delta_tau*mu[i])*si
self.iterations += 1
#Sigma recomputation with Cholesky decompositon
LLT = LLT0 + np.dot(Kmn*tau_tilde[None,:],Kmn.T)
L = jitchol(LLT)
V,info = linalg.flapack.dtrtrs(L,Kmn,lower=1)
V2,info = linalg.flapack.dtrtrs(L.T,V,lower=0)
Sigma_diag = np.sum(V*V,-2)
Knmv_tilde = np.dot(Kmn,v_tilde)
mu = np.dot(V2.T,Knmv_tilde)
epsilon_np1 = sum((tau_tilde-np1[-1])**2)/self.N
epsilon_np2 = sum((v_tilde-np2[-1])**2)/self.N
np1.append(tau_tilde.copy())
np2.append(v_tilde.copy())
#store the site parameters and update the GP
self.tau_tilde = tau_tilde
self.v_tilde = v_tilde
return self._compute_GP_variables()
def fit_FITC(self, Knn_diag, Kmn, Kmm):
"""
The expectation-propagation algorithm with sparse pseudo-input.
For nomenclature see Naish-Guzman and Holden, 2008.
"""
"""
Prior approximation parameters:
q(f|X) = int N(f|Kfu*Kuu_inv*u, diag(Kff-Qff)) * N(u|0,Kuu) du = N(f|0,Sigma0)
Sigma0 = diag(Knn-Qnn) + Qnn, Qnn = Knm*Kmmi*Kmn
"""
Kmmi, self.Lm, self.Lmi, Kmm_logdet = pdinv(Kmm)
P0 = Kmn.T
KmnKnm = np.dot(P0.T, P0)
KmmiKmn = np.dot(Kmmi,P0.T)
Qnn_diag = np.sum(P0.T*KmmiKmn,-2)
Diag0 = Knn_diag - Qnn_diag
R0 = jitchol(Kmmi).T
"""
Posterior approximation: q(f|y) = N(f| mu, Sigma)
Sigma = Diag + P*R.T*R*P.T
mu = w + P*gamma
"""
self.w = np.zeros(self.N)
self.gamma = np.zeros(self.M)
mu = np.zeros(self.N)
P = P0.copy()
R = R0.copy()
Diag = Diag0.copy()
Sigma_diag = Knn_diag
"""
Initial values - Cavity distribution parameters:
q_(g|mu_,sigma2_) = Product{q_i(g|mu_i,sigma2_i)}
sigma_ = 1./tau_
mu_ = v_/tau_
"""
self.tau_ = np.empty(self.N,dtype=float)
self.v_ = np.empty(self.N,dtype=float)
#Initial values - Marginal moments
z = np.empty(self.N,dtype=float)
self.Z_hat = np.empty(self.N,dtype=float)
phi = np.empty(self.N,dtype=float)
mu_hat = np.empty(self.N,dtype=float)
sigma2_hat = np.empty(self.N,dtype=float)
#Initial values - Site parameters
self.tau_tilde = np.zeros(self.N)
self.v_tilde = np.zeros(self.N)
#Approximation
epsilon_np1 = 1
epsilon_np2 = 1
self.iterations = 0
self.np1 = [self.tau_tilde.copy()]
self.np2 = [self.v_tilde.copy()]
while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon:
update_order = np.random.permutation(self.N)
for i in update_order:
#Cavity distribution parameters
self.tau_[i] = 1./Sigma_diag[i] - self.eta*self.tau_tilde[i]
self.v_[i] = mu[i]/Sigma_diag[i] - self.eta*self.v_tilde[i]
#Marginal moments
self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.likelihood_function.moments_match(self.data[i],self.tau_[i],self.v_[i])
#Site parameters update
Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma_diag[i])
Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma_diag[i])
self.tau_tilde[i] = self.tau_tilde[i] + Delta_tau
self.v_tilde[i] = self.v_tilde[i] + Delta_v
#Posterior distribution parameters update
dtd1 = Delta_tau*Diag[i] + 1.
dii = Diag[i]
Diag[i] = dii - (Delta_tau * dii**2.)/dtd1
pi_ = P[i,:].reshape(1,self.M)
P[i,:] = pi_ - (Delta_tau*dii)/dtd1 * pi_
Rp_i = np.dot(R,pi_.T)
RTR = np.dot(R.T,np.dot(np.eye(self.M) - Delta_tau/(1.+Delta_tau*Sigma_diag[i]) * np.dot(Rp_i,Rp_i.T),R))
R = jitchol(RTR).T
self.w[i] = self.w[i] + (Delta_v - Delta_tau*self.w[i])*dii/dtd1
self.gamma = self.gamma + (Delta_v - Delta_tau*mu[i])*np.dot(RTR,P[i,:].T)
RPT = np.dot(R,P.T)
Sigma_diag = Diag + np.sum(RPT.T*RPT.T,-1)
mu = self.w + np.dot(P,self.gamma)
self.iterations += 1
#Sigma recomputation with Cholesky decomposition
Diag = Diag0/(1.+ Diag0 * self.tau_tilde)
P = (Diag / Diag0)[:,None] * P0
RPT0 = np.dot(R0,P0.T)
L = jitchol(np.eye(self.M) + np.dot(RPT0,(1./Diag0 - Diag/(Diag0**2))[:,None]*RPT0.T))
R,info = linalg.flapack.dtrtrs(L,R0,lower=1)
RPT = np.dot(R,P.T)
Sigma_diag = Diag + np.sum(RPT.T*RPT.T,-1)
self.w = Diag * self.v_tilde
self.gamma = np.dot(R.T, np.dot(RPT,self.v_tilde))
mu = self.w + np.dot(P,self.gamma)
epsilon_np1 = sum((self.tau_tilde-self.np1[-1])**2)/self.N
epsilon_np2 = sum((self.v_tilde-self.np2[-1])**2)/self.N
self.np1.append(self.tau_tilde.copy())
self.np2.append(self.v_tilde.copy())
return self._compute_GP_variables()
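#For reference, the low-rank posterior parametrisation maintained above
#(Naish-Guzman and Holden, 2008) is
#  Sigma = Diag + P*R.T*R*P.T,  mu = w + P*gamma
#with w = Diag*v_tilde, gamma = R.T*R*P.T*v_tilde and
#Diag = Diag0/(1 + Diag0*tau_tilde); only diag(Sigma) is ever formed.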

View file

@ -0,0 +1,56 @@
import numpy as np
from likelihood import likelihood
class Gaussian(likelihood):
def __init__(self,data,variance=1.,normalize=False):
self.is_heteroscedastic = False
self.Nparams = 1
self.data = data
self.N, self.D = data.shape
self.Z = 0. # a correction factor which accounts for the approximation made
#normalisation
if normalize:
self._mean = data.mean(0)[None,:]
self._std = data.std(0)[None,:]
self.Y = (self.data - self._mean)/self._std
else:
self._mean = np.zeros((1,self.D))
self._std = np.ones((1,self.D))
self.Y = self.data
#TODO: make this work efficiently (only compute YYT if D>>N)
self.YYT = np.dot(self.Y,self.Y.T)
self.trYYT = np.trace(self.YYT)
self._set_params(np.asarray(variance))
def _get_params(self):
return np.asarray(self._variance)
def _get_param_names(self):
return ["noise variance"]
def _set_params(self,x):
self._variance = float(x)
self.covariance_matrix = np.eye(self.N)*self._variance
self.precision = 1./self._variance
def predictive_values(self,mu,var):
"""
Un-normalise the prediction, add the likelihood variance and return the mean with approximate 95% intervals (+/- 2 standard deviations)
"""
mean = mu*self._std + self._mean
true_var = (var + self._variance)*self._std**2
_5pc = mean - 2.*np.sqrt(true_var)
_95pc = mean + 2.*np.sqrt(true_var)
return mean, _5pc, _95pc
def fit_full(self, K):
"""
No approximations needed: the Gaussian likelihood is conjugate, so the covariance argument is unused
"""
pass
def _gradients(self,partial):
return np.sum(partial)
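#A quick illustrative construction of the class above (hypothetical values,
#not part of this commit):
import numpy as np
Y = np.random.randn(10, 2)
lik = Gaussian(Y, variance=0.5, normalize=True)
print lik.Y.mean(0), lik.precision #near-zero column means; precision = 1/variance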

View file

@ -0,0 +1,4 @@
from EP import EP
from Gaussian import Gaussian
# TODO: from Laplace import Laplace
import likelihood_functions as functions

View file

@ -0,0 +1,35 @@
import numpy as np
class likelihood:
"""
The atom for a likelihood class
This object interfaces the GP and the data. The most basic likelihood
(Gaussian) inherits directly from this, as does the EP algorithm
Some things must be defined for this to work properly:
self.Y : the effective Gaussian target of the GP
self.N, self.D : Y.shape
self.covariance_matrix : the effective (noise) covariance of the GP targets
self.Z : a factor which gets added to the likelihood (0 for a Gaussian, Z_EP for EP)
self.is_heteroscedastic : enables significant computational savings in GP
self.precision : a scalar or vector representation of the effective target precision
self.YYT : (optional) = np.dot(self.Y, self.Y.T) enables computational savings for D>N
"""
def __init__(self,data):
raise ValueError, "this class is not to be instantiated"
def _get_params(self):
raise NotImplementedError
def _get_param_names(self):
raise NotImplementedError
def _set_params(self,x):
raise NotImplementedError
def fit(self):
raise NotImplementedError
def _gradients(self,partial):
raise NotImplementedError
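#A minimal sketch of the interface documented above: a hypothetical
#fixed-noise likelihood with no tunable parameters (illustration only).
class fixed_noise_Gaussian(likelihood):
    def __init__(self, Y, variance=1.):
        self.Y = Y
        self.N, self.D = Y.shape
        self.covariance_matrix = np.eye(self.N)*variance
        self.precision = 1./variance
        self.Z = 0. #no correction term needed for a Gaussian
        self.is_heteroscedastic = False
        self.YYT = None #only worth computing when D > N
    def _get_params(self):
        return np.zeros(0) #nothing to optimise
    def _get_param_names(self):
        return []
    def _set_params(self, x):
        pass
    def _gradients(self, partial):
        return np.zeros(0)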

View file

@ -0,0 +1,134 @@
# Copyright (c) 2012, 2013 Ricardo Andrade
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from scipy import stats
import scipy as sp
import scipy.optimize #make sp.optimize.golden available below
import pylab as pb
from ..util.plot import gpplot
class likelihood_function:
"""
Likelihood class for doing Expectation propagation
:param Y: observed output (Nx1 numpy.ndarray)
.. Note:: Y values allowed depend on the likelihood_function used
"""
def __init__(self,location=0,scale=1):
self.location = location
self.scale = scale
class probit(likelihood_function):
"""
Probit likelihood
Y is expected to take values in {-1,1}
-----
$$
L(x) = \\Phi (Y_i*f_i)
$$
"""
def moments_match(self,data_i,tau_i,v_i):
"""
Moment matching of the marginal (tilted) distribution in the EP algorithm
:param data_i: ith observation
:param tau_i: precision of the cavity distribution (float)
:param v_i: precision-scaled mean of the cavity distribution, v_i = tau_i*mu_i (float)
"""
# TODO: some version of assert np.sum(np.abs(Y)-1) == 0, "Output values must be either -1 or 1"
if data_i == 0: data_i = -1 #NOTE Binary classification works better classes {-1,1}, 1D-plotting works better with classes {0,1}.
z = data_i*v_i/np.sqrt(tau_i**2 + tau_i)
Z_hat = stats.norm.cdf(z)
phi = stats.norm.pdf(z)
mu_hat = v_i/tau_i + data_i*phi/(Z_hat*np.sqrt(tau_i**2 + tau_i))
sigma2_hat = 1./tau_i - (phi/((tau_i**2+tau_i)*Z_hat))*(z+phi/Z_hat)
return Z_hat, mu_hat, sigma2_hat
def predictive_values(self,mu,var):
"""
Compute the mean and confidence interval (percentiles 2.5 and 97.5) of the prediction
"""
mu = mu.flatten()
var = var.flatten()
mean = stats.norm.cdf(mu/np.sqrt(1+var))
p_025 = np.zeros(mu.shape)
p_975 = np.ones(mu.shape)
return mean, p_025, p_975
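#A quick numerical sanity check of the closed-form probit moments above,
#integrating the tilted distribution directly (values are hypothetical):
from scipy.integrate import quad
tau_i, v_i, y = 2.0, 0.6, 1 #hypothetical cavity parameters and label
mu, sigma = v_i/tau_i, np.sqrt(1./tau_i)
tilted = lambda f: stats.norm.cdf(y*f)*stats.norm.pdf(f, mu, sigma)
Z, _ = quad(tilted, -10, 10)
m1, _ = quad(lambda f: f*tilted(f), -10, 10)
m2, _ = quad(lambda f: f*f*tilted(f), -10, 10)
print Z, m1/Z, m2/Z - (m1/Z)**2 #should match Z_hat, mu_hat, sigma2_hat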
class Poisson(likelihood_function):
"""
Poisson likelihood
Y is expected to take values in {0,1,2,...}
-----
$$
L(x) = \exp(-\lambda) * \lambda**Y_i / Y_i!
$$
"""
def moments_match(self,data_i,tau_i,v_i):
"""
Moment matching of the marginal (tilted) distribution in the EP algorithm
:param data_i: ith observation
:param tau_i: precision of the cavity distribution (float)
:param v_i: precision-scaled mean of the cavity distribution, v_i = tau_i*mu_i (float)
"""
mu = v_i/tau_i
sigma = np.sqrt(1./tau_i)
def poisson_norm(f):
"""
Product of the likelihood and the cavity distribution
"""
pdf_norm_f = stats.norm.pdf(f,loc=mu,scale=sigma)
rate = np.exp( (f*self.scale)+self.location)
poisson = stats.poisson.pmf(float(data_i),rate)
return pdf_norm_f*poisson
def log_pnm(f):
"""
Log of poisson_norm
"""
return -(-.5*(f-mu)**2/sigma**2 - np.exp( (f*self.scale)+self.location) + ( (f*self.scale)+self.location)*data_i)
"""
Golden Search and Simpson's Rule
--------------------------------
Simpson's Rule is used to calculate the moments numerically; it needs a grid of points as input.
Golden Search is used to find the mode of the poisson_norm distribution, around which the grid for Simpson's Rule is defined.
"""
#TODO golden search & simpson's rule can be defined in the general likelihood class, rather than in each specific case.
#Golden search
golden_A = -1 if data_i == 0 else np.array([np.log(data_i),mu]).min() #Lower limit
golden_B = np.array([np.log(data_i),mu]).max() #Upper limit
golden_A = (golden_A - self.location)/self.scale
golden_B = (golden_B - self.location)/self.scale
opt = sp.optimize.golden(log_pnm,brack=(golden_A,golden_B)) #Better to work with log_pnm than with poisson_norm
# Simpson's approximation
width = 3./np.log(max(data_i,2))
A = opt - width #Lower limit
B = opt + width #Upper limit
K = 10*int(np.log(max(data_i,150))) #Number of points in the grid, we DON'T want K to be the same number for every case
h = (B-A)/K # length of the intervals
grid_x = np.hstack([np.linspace(opt-width,opt,K/2+1)[1:-1], np.linspace(opt,opt+width,K/2+1)]) # grid of points (X axis)
x = np.hstack([A,B,grid_x[range(1,K,2)],grid_x[range(2,K-1,2)]]) # grid_x rearranged, just to make Simpson's algorithm easier
zeroth = np.hstack([poisson_norm(A),poisson_norm(B),[4*poisson_norm(f) for f in grid_x[range(1,K,2)]],[2*poisson_norm(f) for f in grid_x[range(2,K-1,2)]]]) # grid of points (Y axis) rearranged like x
first = zeroth*x
second = first*x
Z_hat = sum(zeroth)*h/3 # Zero-th moment
mu_hat = sum(first)*h/(3*Z_hat) # First moment
m2 = sum(second)*h/(3*Z_hat) # Second moment
sigma2_hat = m2 - mu_hat**2 # Second central moment
return float(Z_hat), float(mu_hat), float(sigma2_hat)
def predictive_values(self,mu,var):
"""
Compute the mean and confidence interval (percentiles 2.5 and 97.5) of the prediction
"""
mean = np.exp(mu*self.scale + self.location)
p_025 = stats.poisson.ppf(.025, mean)
p_975 = stats.poisson.ppf(.975, mean)
return mean,p_025,p_975
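#For reference, a self-contained composite Simpson's rule (illustrative
#sketch; the implementation above uses a rearranged grid around the mode):
def simpson(f, a, b, K=10):
    """Composite Simpson's rule with K (even) subintervals."""
    x = np.linspace(a, b, K+1)
    y = np.array([f(xi) for xi in x])
    h = (b - a)/K
    return h/3.*(y[0] + y[-1] + 4.*y[1:-1:2].sum() + 2.*y[2:-1:2].sum())
print simpson(np.exp, 0., 1.) #close to e - 1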

View file

@ -5,10 +5,12 @@ import numpy as np
import pylab as pb import pylab as pb
import sys, pdb import sys, pdb
from GPLVM import GPLVM from GPLVM import GPLVM
from sparse_GP_regression import sparse_GP_regression from sparse_GP import sparse_GP
from GPy.util.linalg import pdinv from GPy.util.linalg import pdinv
from ..likelihoods import Gaussian
from .. import kern
class Bayesian_GPLVM(sparse_GP_regression, GPLVM): class Bayesian_GPLVM(sparse_GP, GPLVM):
""" """
Bayesian Gaussian Process Latent Variable Model Bayesian Gaussian Process Latent Variable Model
@ -20,18 +22,24 @@ class Bayesian_GPLVM(sparse_GP_regression, GPLVM):
:type init: 'PCA'|'random' :type init: 'PCA'|'random'
""" """
def __init__(self, Y, Q, X = None, S = None, init='PCA', **kwargs): def __init__(self, Y, Q, init='PCA', M=10, Z=None, kernel=None, **kwargs):
if X == None: X = self.initialise_latent(init, Q, Y)
X = self.initialise_latent(init, Q, Y)
if S == None:
S = np.ones_like(X) * 1e-2
sparse_GP_regression.__init__(self, X, Y, X_uncertainty = S, **kwargs) if Z is None:
Z = np.random.permutation(X.copy())[:M]
else:
assert Z.shape[1]==X.shape[1]
if kernel is None:
kernel = kern.rbf(Q) + kern.white(Q)
S = np.ones_like(X) * 1e-2#
sparse_GP.__init__(self, X, Gaussian(Y), kernel, Z=Z, X_uncertainty=S, **kwargs)
def _get_param_names(self): def _get_param_names(self):
X_names = sum([['X_%i_%i'%(n,q) for n in range(self.N)] for q in range(self.Q)],[]) X_names = sum([['X_%i_%i'%(n,q) for n in range(self.N)] for q in range(self.Q)],[])
S_names = sum([['S_%i_%i'%(n,q) for n in range(self.N)] for q in range(self.Q)],[]) S_names = sum([['S_%i_%i'%(n,q) for n in range(self.N)] for q in range(self.Q)],[])
return (X_names + S_names + sparse_GP_regression._get_param_names(self)) return (X_names + S_names + sparse_GP._get_param_names(self))
def _get_params(self): def _get_params(self):
""" """
@ -39,17 +47,17 @@ class Bayesian_GPLVM(sparse_GP_regression, GPLVM):
The resulting 1-D array has this structure: The resulting 1-D array has this structure:
=============================================================== ===============================================================
| mu | S | Z | beta | theta | | mu | S | Z | theta | beta |
=============================================================== ===============================================================
""" """
return np.hstack((self.X.flatten(), self.X_uncertainty.flatten(), sparse_GP_regression._get_params(self))) return np.hstack((self.X.flatten(), self.X_uncertainty.flatten(), sparse_GP._get_params(self)))
def _set_params(self,x): def _set_params(self,x):
N, Q = self.N, self.Q N, Q = self.N, self.Q
self.X = x[:self.X.size].reshape(N,Q).copy() self.X = x[:self.X.size].reshape(N,Q).copy()
self.X_uncertainty = x[(N*Q):(2*N*Q)].reshape(N,Q).copy() self.X_uncertainty = x[(N*Q):(2*N*Q)].reshape(N,Q).copy()
sparse_GP_regression._set_params(self, x[(2*N*Q):]) sparse_GP._set_params(self, x[(2*N*Q):])
def dL_dmuS(self): def dL_dmuS(self):
dL_dmu_psi0, dL_dS_psi0 = self.kern.dpsi1_dmuS(self.dL_dpsi1,self.Z,self.X,self.X_uncertainty) dL_dmu_psi0, dL_dS_psi0 = self.kern.dpsi1_dmuS(self.dL_dpsi1,self.Z,self.X,self.X_uncertainty)
@ -58,17 +66,8 @@ class Bayesian_GPLVM(sparse_GP_regression, GPLVM):
dL_dmu = dL_dmu_psi0 + dL_dmu_psi1 + dL_dmu_psi2 dL_dmu = dL_dmu_psi0 + dL_dmu_psi1 + dL_dmu_psi2
dL_dS = dL_dS_psi0 + dL_dS_psi1 + dL_dS_psi2 dL_dS = dL_dS_psi0 + dL_dS_psi1 + dL_dS_psi2
dKL_dS = (1. - (1./self.X_uncertainty))*0.5 return np.hstack((dL_dmu.flatten(), dL_dS.flatten()))
dKL_dmu = self.X
return np.hstack(((dL_dmu - dKL_dmu).flatten(), (dL_dS - dKL_dS).flatten()))
def KL_divergence(self):
var_mean = np.square(self.X).sum()
var_S = np.sum(self.X_uncertainty - np.log(self.X_uncertainty))
return 0.5*(var_mean + var_S) - 0.5*self.Q*self.N
def log_likelihood(self):
return sparse_GP_regression.log_likelihood(self) - self.KL_divergence()
def _log_likelihood_gradients(self): def _log_likelihood_gradients(self):
return np.hstack((self.dL_dmuS().flatten(), sparse_GP_regression._log_likelihood_gradients(self))) return np.hstack((self.dL_dmuS().flatten(), sparse_GP._log_likelihood_gradients(self)))

GPy/models/GP.py (new file, 274 lines)
View file

@ -0,0 +1,274 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
import pylab as pb
from .. import kern
from ..core import model
from ..util.linalg import pdinv,mdot
from ..util.plot import gpplot,x_frame1D,x_frame2D, Tango
from ..likelihoods import EP
class GP(model):
"""
Gaussian Process model for regression and EP
:param X: input observations
:param kernel: a GPy kernel, defaults to rbf+white
:param likelihood: a GPy likelihood
:param normalize_X: whether to normalize the input data before computing (predictions will be in original scales)
:type normalize_X: False|True
:param normalize_Y: whether to normalize the output data before computing (predictions will be in original scales)
:type normalize_Y: False|True
:param Xslices: how the X,Y data co-vary in the kernel (i.e. which "outputs" they correspond to). See (link:slicing)
:rtype: model object
:param epsilon_ep: convergence criterion for the Expectation Propagation algorithm, defaults to 0.1
:param powerep: power-EP parameters [$\eta$,$\delta$], defaults to [1.,1.]
:type powerep: list
.. Note:: Multiple independent outputs are allowed using columns of Y
"""
#FIXME normalize vs normalise
def __init__(self, X, likelihood, kernel, normalize_X=False, Xslices=None):
# parse arguments
self.Xslices = Xslices
self.X = X
assert len(self.X.shape)==2
self.N, self.Q = self.X.shape
assert isinstance(kernel, kern.kern)
self.kern = kernel
#here's some simple normalisation for the inputs
if normalize_X:
self._Xmean = X.mean(0)[None,:]
self._Xstd = X.std(0)[None,:]
self.X = (X.copy() - self._Xmean) / self._Xstd
if hasattr(self,'Z'):
self.Z = (self.Z - self._Xmean) / self._Xstd
else:
self._Xmean = np.zeros((1,self.X.shape[1]))
self._Xstd = np.ones((1,self.X.shape[1]))
self.likelihood = likelihood
#assert self.X.shape[0] == self.likelihood.Y.shape[0]
#self.N, self.D = self.likelihood.Y.shape
assert self.X.shape[0] == self.likelihood.data.shape[0]
self.N, self.D = self.likelihood.data.shape
model.__init__(self)
def _set_params(self,p):
self.kern._set_params_transformed(p[:self.kern.Nparam])
#self.likelihood._set_params(p[self.kern.Nparam:]) # test by Nicolas
self.likelihood._set_params(p[self.kern.Nparam_transformed():]) # test by Nicolas
self.K = self.kern.K(self.X,slices1=self.Xslices)
self.K += self.likelihood.covariance_matrix
self.Ki, self.L, self.Li, self.K_logdet = pdinv(self.K)
#the gradient of the likelihood wrt the covariance matrix
if self.likelihood.YYT is None:
alpha = np.dot(self.Ki,self.likelihood.Y)
self.dL_dK = 0.5*(np.dot(alpha,alpha.T)-self.D*self.Ki)
else:
tmp = mdot(self.Ki, self.likelihood.YYT, self.Ki)
self.dL_dK = 0.5*(tmp - self.D*self.Ki)
def _get_params(self):
return np.hstack((self.kern._get_params_transformed(), self.likelihood._get_params()))
def _get_param_names(self):
return self.kern._get_param_names_transformed() + self.likelihood._get_param_names()
def update_likelihood_approximation(self):
"""
Approximates a non-Gaussian likelihood using Expectation Propagation.
For a Gaussian (or direct: TODO) likelihood no iteration is required;
this reduces to a single refresh of the GP
"""
self.likelihood.fit_full(self.kern.K(self.X))
self._set_params(self._get_params()) # update the GP
def _model_fit_term(self):
"""
Computes the model fit using YYT if it's available
"""
if self.likelihood.YYT is None:
return -0.5*np.sum(np.square(np.dot(self.Li,self.likelihood.Y)))
else:
return -0.5*np.sum(np.multiply(self.Ki, self.likelihood.YYT))
def log_likelihood(self):
"""
The log marginal likelihood of the GP.
For an EP model, can be written as the log likelihood of a regression
model for a new variable Y* = v_tilde/tau_tilde, with a covariance
matrix K* = K + diag(1./tau_tilde) plus a normalization term.
"""
return -0.5*self.D*self.K_logdet + self._model_fit_term() + self.likelihood.Z
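#In symbols: with Y_tilde = v_tilde/tau_tilde and K* = K + diag(1/tau_tilde),
#the value returned above is (up to constants)
#  log p(y) ~ -0.5*D*log|K*| - 0.5*trace(K*^-1 * Y_tilde*Y_tilde.T) + Z
#where Z is the likelihood's normalisation correction (zero for a Gaussian).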
def _log_likelihood_gradients(self):
"""
The gradient of all parameters.
For the kernel parameters, use the chain rule via dL_dK
For the likelihood parameters, pass in alpha = K^-1 y
"""
return np.hstack((self.kern.dK_dtheta(partial=self.dL_dK,X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK))))
def _raw_predict(self,_Xnew,slices=None, full_cov=False):
"""
Internal helper function for making predictions, does not account
for normalisation or likelihood
"""
Kx = self.kern.K(self.X,_Xnew, slices1=self.Xslices,slices2=slices)
mu = np.dot(np.dot(Kx.T,self.Ki),self.likelihood.Y)
KiKx = np.dot(self.Ki,Kx)
if full_cov:
Kxx = self.kern.K(_Xnew, slices1=slices,slices2=slices)
var = Kxx - np.dot(KiKx.T,Kx) #NOTE this won't work for plotting
else:
Kxx = self.kern.Kdiag(_Xnew, slices=slices)
var = Kxx - np.sum(np.multiply(KiKx,Kx),0)
var = var[:,None]
return mu, var
def predict(self,Xnew, slices=None, full_cov=False):
"""
Predict the function(s) at the new point(s) Xnew.
Arguments
---------
:param Xnew: The points at which to make a prediction
:type Xnew: np.ndarray, Nnew x self.Q
:param slices: specifies which outputs kernel(s) the Xnew correspond to (see below)
:type slices: (None, list of slice objects, list of ints)
:param full_cov: whether to return the full covariance matrix, or just the diagonal
:type full_cov: bool
:rtype: posterior mean, a Numpy array, Nnew x self.D
:rtype: posterior variance, a Numpy array, Nnew x 1 if full_cov=False, Nnew x Nnew otherwise
:rtype: lower and upper boundaries of the 95% confidence intervals, Numpy arrays, Nnew x self.D
.. Note:: "slices" specifies how the the points X_new co-vary wich the training points.
- If None, the new points covary throigh every kernel part (default)
- If a list of slices, the i^th slice specifies which data are affected by the i^th kernel part
- If a list of booleans, specifying which kernel parts are active
If full_cov and self.D > 1, the return shape of var is Nnew x Nnew x self.D. If self.D == 1, the return shape is Nnew x Nnew.
This is to allow for different normalisations of the output dimensions.
"""
#normalise X values
Xnew = (Xnew.copy() - self._Xmean) / self._Xstd
mu, var = self._raw_predict(Xnew, slices, full_cov)
#now push through likelihood TODO
mean, _025pm, _975pm = self.likelihood.predictive_values(mu, var)
return mean, var, _025pm, _975pm
def plot_f(self, samples=0, plot_limits=None, which_data='all', which_functions='all', resolution=None, full_cov=False):
"""
Plot the GP's view of the world, where the data is normalised and the likelihood is Gaussian
:param samples: the number of a posteriori samples to plot
:param which_data: which if the training data to plot (default all)
:type which_data: 'all' or a slice object to slice self.X, self.Y
:param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaults to data limits
:param which_functions: which of the kernel functions to plot (additively)
:type which_functions: list of bools
:param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
Plot the posterior of the GP.
- In one dimension, the function is plotted with a shaded region identifying two standard deviations.
- In two dimensions, a contour-plot shows the mean predicted function
- In higher dimensions, we've not implemented this yet !TODO!
Can plot only part of the data and part of the posterior functions using which_data and which_functions
Plot the data's view of the world, with non-normalised values and GP predictions passed through the likelihood
"""
if which_functions=='all':
which_functions = [True]*self.kern.Nparts
if which_data=='all':
which_data = slice(None)
if self.X.shape[1] == 1:
Xnew, xmin, xmax = x_frame1D(self.X, plot_limits=plot_limits)
if samples == 0:
m,v = self._raw_predict(Xnew, slices=which_functions)
gpplot(Xnew,m,m-2*np.sqrt(v),m+2*np.sqrt(v))
pb.plot(self.X[which_data],self.likelihood.Y[which_data],'kx',mew=1.5)
else:
m,v = self._raw_predict(Xnew, slices=which_functions,full_cov=True)
Ysim = np.random.multivariate_normal(m.flatten(),v,samples)
gpplot(Xnew,m,m-2*np.sqrt(np.diag(v)[:,None]),m+2*np.sqrt(np.diag(v))[:,None])
for i in range(samples):
pb.plot(Xnew,Ysim[i,:],Tango.coloursHex['darkBlue'],linewidth=0.25)
pb.plot(self.X[which_data],self.likelihood.Y[which_data],'kx',mew=1.5)
pb.xlim(xmin,xmax)
ymin,ymax = min(np.append(self.likelihood.Y,m-2*np.sqrt(np.diag(v)[:,None]))), max(np.append(self.likelihood.Y,m+2*np.sqrt(np.diag(v)[:,None])))
ymin, ymax = ymin - 0.1*(ymax - ymin), ymax + 0.1*(ymax - ymin)
pb.ylim(ymin,ymax)
if hasattr(self,'Z'):
pb.plot(self.Z,self.Z*0+pb.ylim()[0],'r|',mew=1.5,markersize=12)
elif self.X.shape[1] == 2:
resolution = resolution or 50
Xnew, xmin, xmax, xx, yy = x_frame2D(self.X, plot_limits,resolution)
m,v = self._raw_predict(Xnew, slices=which_functions)
m = m.reshape(resolution,resolution).T
pb.contour(xx,yy,m,vmin=m.min(),vmax=m.max(),cmap=pb.cm.jet)
pb.scatter(self.X[:,0],self.X[:,1],40,self.likelihood.Y,linewidth=0,cmap=pb.cm.jet,vmin=m.min(), vmax=m.max())
pb.xlim(xmin[0],xmax[0])
pb.ylim(xmin[1],xmax[1])
else:
raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
def plot(self,samples=0,plot_limits=None,which_data='all',which_functions='all',resolution=None,full_cov=False):
# TODO include samples
if which_functions=='all':
which_functions = [True]*self.kern.Nparts
if which_data=='all':
which_data = slice(None)
if self.X.shape[1] == 1:
Xu = self.X * self._Xstd + self._Xmean #NOTE self.X are the normalized values now
Xnew, xmin, xmax = x_frame1D(Xu, plot_limits=plot_limits)
m, var, lower, upper = self.predict(Xnew, slices=which_functions)
gpplot(Xnew,m, lower, upper)
pb.plot(Xu[which_data],self.likelihood.data[which_data],'kx',mew=1.5)
ymin,ymax = min(np.append(self.likelihood.data,lower)), max(np.append(self.likelihood.data,upper))
ymin, ymax = ymin - 0.1*(ymax - ymin), ymax + 0.1*(ymax - ymin)
pb.xlim(xmin,xmax)
pb.ylim(ymin,ymax)
if hasattr(self,'Z'):
Zu = self.Z*self._Xstd + self._Xmean
pb.plot(Zu,Zu*0+pb.ylim()[0],'r|',mew=1.5,markersize=12)
elif self.X.shape[1]==2: #FIXME
resolution = resolution or 50
Xnew, xx, yy, xmin, xmax = x_frame2D(self.X, plot_limits,resolution)
x, y = np.linspace(xmin[0],xmax[0],resolution), np.linspace(xmin[1],xmax[1],resolution)
m, var, lower, upper = self.predict(Xnew, slices=which_functions)
m = m.reshape(resolution,resolution).T
pb.contour(x,y,m,vmin=m.min(),vmax=m.max(),cmap=pb.cm.jet)
Yf = self.likelihood.Y.flatten()
pb.scatter(self.X[:,0], self.X[:,1], 40, Yf, cmap=pb.cm.jet,vmin=m.min(),vmax=m.max(), linewidth=0.)
pb.xlim(xmin[0],xmax[0])
pb.ylim(xmin[1],xmax[1])
else:
raise NotImplementedError, "Cannot define a frame with more than two input dimensions"

View file

@ -8,9 +8,10 @@ import sys, pdb
from .. import kern from .. import kern
from ..core import model from ..core import model
from ..util.linalg import pdinv, PCA from ..util.linalg import pdinv, PCA
from GP_regression import GP_regression from GP import GP
from ..likelihoods import Gaussian
class GPLVM(GP_regression): class GPLVM(GP):
""" """
Gaussian Process Latent Variable Model Gaussian Process Latent Variable Model
@ -22,10 +23,13 @@ class GPLVM(GP_regression):
:type init: 'PCA'|'random' :type init: 'PCA'|'random'
""" """
def __init__(self, Y, Q, init='PCA', X = None, **kwargs): def __init__(self, Y, Q, init='PCA', X = None, kernel=None, **kwargs):
if X is None: if X is None:
X = self.initialise_latent(init, Q, Y) X = self.initialise_latent(init, Q, Y)
GP_regression.__init__(self, X, Y, **kwargs) if kernel is None:
kernel = kern.rbf(Q) + kern.bias(Q)
likelihood = Gaussian(Y)
GP.__init__(self, X, likelihood, kernel, **kwargs)
def initialise_latent(self, init, Q, Y): def initialise_latent(self, init, Q, Y):
if init == 'PCA': if init == 'PCA':
@ -34,23 +38,19 @@ class GPLVM(GP_regression):
return np.random.randn(Y.shape[0], Q) return np.random.randn(Y.shape[0], Q)
def _get_param_names(self): def _get_param_names(self):
return (sum([['X_%i_%i'%(n,q) for n in range(self.N)] for q in range(self.Q)],[]) return sum([['X_%i_%i'%(n,q) for n in range(self.N)] for q in range(self.Q)],[]) + GP._get_param_names(self)
+ self.kern._get_param_names_transformed())
def _get_params(self): def _get_params(self):
return np.hstack((self.X.flatten(), self.kern._get_params_transformed())) return np.hstack((self.X.flatten(), GP._get_params(self)))
def _set_params(self,x): def _set_params(self,x):
self.X = x[:self.X.size].reshape(self.N,self.Q).copy() self.X = x[:self.X.size].reshape(self.N,self.Q).copy()
GP_regression._set_params(self, x[self.X.size:]) GP._set_params(self, x[self.X.size:])
def _log_likelihood_gradients(self): def _log_likelihood_gradients(self):
dL_dK = self.dL_dK() dL_dX = 2.*self.kern.dK_dX(self.dL_dK,self.X)
dL_dtheta = self.kern.dK_dtheta(dL_dK,self.X) return np.hstack((dL_dX.flatten(),GP._log_likelihood_gradients(self)))
dL_dX = 2*self.kern.dK_dX(dL_dK,self.X)
return np.hstack((dL_dX.flatten(),dL_dtheta))
def plot(self): def plot(self):
assert self.Y.shape[1]==2 assert self.Y.shape[1]==2

View file

@ -1,160 +0,0 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
import pylab as pb
from scipy import stats, linalg
from .. import kern
from ..inference.Expectation_Propagation import Full
from ..inference.likelihoods import likelihood,probit#,poisson,gaussian
from ..core import model
from ..util.linalg import pdinv,jitchol
from ..util.plot import gpplot
class GP_EP(model):
def __init__(self,X,likelihood,kernel=None,epsilon_ep=1e-3,epsilon_em=.1,powerep=[1.,1.]):
"""
Simple Gaussian Process with Non-Gaussian likelihood
Arguments
---------
:param X: input observations (NxD numpy.darray)
:param likelihood: a GPy likelihood (likelihood class)
:param kernel: a GPy kernel (kern class)
:param epsilon_ep: convergence criterion for the Expectation Propagation algorithm, defaults to 0.1 (float)
:param powerep: power-EP parameters [$\eta$,$\delta$], defaults to [1.,1.] (list)
:rtype: GPy model class.
"""
if kernel is None:
kernel = kern.rbf(X.shape[1]) + kern.bias(X.shape[1]) + kern.white(X.shape[1])
assert isinstance(kernel,kern.kern), 'kernel is not a kern instance'
self.likelihood = likelihood
self.Y = self.likelihood.Y
self.kernel = kernel
self.X = X
self.N, self.D = self.X.shape
self.eta,self.delta = powerep
self.epsilon_ep = epsilon_ep
self.jitter = 1e-12
self.K = self.kernel.K(self.X)
model.__init__(self)
def _set_params(self,p):
self.kernel._set_params_transformed(p)
def _get_params(self):
return self.kernel._get_params_transformed()
def _get_param_names(self):
return self.kernel._get_param_names_transformed()
def approximate_likelihood(self):
self.ep_approx = Full(self.K,self.likelihood,epsilon=self.epsilon_ep,powerep=[self.eta,self.delta])
self.ep_approx.fit_EP()
def posterior_param(self):
self.K = self.kernel.K(self.X)
self.Sroot_tilde_K = np.sqrt(self.ep_approx.tau_tilde)[:,None]*self.K
B = np.eye(self.N) + np.sqrt(self.ep_approx.tau_tilde)[None,:]*self.Sroot_tilde_K
#self.L = np.linalg.cholesky(B)
self.L = jitchol(B)
V,info = linalg.flapack.dtrtrs(self.L,self.Sroot_tilde_K,lower=1)
self.Sigma = self.K - np.dot(V.T,V)
self.mu = np.dot(self.Sigma,self.ep_approx.v_tilde)
def log_likelihood(self):
"""
Returns
-------
The EP approximation to the log-marginal likelihood
"""
self.posterior_param()
mu_ = self.ep_approx.v_/self.ep_approx.tau_
L1 =.5*sum(np.log(1+self.ep_approx.tau_tilde*1./self.ep_approx.tau_))-sum(np.log(np.diag(self.L)))
L2A =.5*np.sum((self.Sigma-np.diag(1./(self.ep_approx.tau_+self.ep_approx.tau_tilde))) * np.dot(self.ep_approx.v_tilde[:,None],self.ep_approx.v_tilde[None,:]))
L2B = .5*np.dot(mu_*(self.ep_approx.tau_/(self.ep_approx.tau_tilde+self.ep_approx.tau_)),self.ep_approx.tau_tilde*mu_ - 2*self.ep_approx.v_tilde)
L3 = sum(np.log(self.ep_approx.Z_hat))
return L1 + L2A + L2B + L3
def _log_likelihood_gradients(self):
dK_dp = self.kernel.dK_dtheta(self.X)
self.dK_dp = dK_dp
aux1,info_1 = linalg.flapack.dtrtrs(self.L,np.dot(self.Sroot_tilde_K,self.ep_approx.v_tilde),lower=1)
b = self.ep_approx.v_tilde - np.sqrt(self.ep_approx.tau_tilde)*linalg.flapack.dtrtrs(self.L.T,aux1)[0]
U,info_u = linalg.flapack.dtrtrs(self.L,np.diag(np.sqrt(self.ep_approx.tau_tilde)),lower=1)
dL_dK = 0.5*(np.outer(b,b)-np.dot(U.T,U))
self.dL_dK = dL_dK
return np.array([np.sum(dK_dpi*dL_dK) for dK_dpi in dK_dp.T])
def predict(self,X):
#TODO: check output dimensions
self.posterior_param()
K_x = self.kernel.K(self.X,X)
Kxx = self.kernel.K(X)
aux1,info = linalg.flapack.dtrtrs(self.L,np.dot(self.Sroot_tilde_K,self.ep_approx.v_tilde),lower=1)
aux2,info = linalg.flapack.dtrtrs(self.L.T, aux1,lower=0)
zeta = np.sqrt(self.ep_approx.tau_tilde)*aux2
f = np.dot(K_x.T,self.ep_approx.v_tilde-zeta)
v,info = linalg.flapack.dtrtrs(self.L,np.sqrt(self.ep_approx.tau_tilde)[:,None]*K_x,lower=1)
variance = Kxx - np.dot(v.T,v)
vdiag = np.diag(variance)
y=self.likelihood.predictive_mean(f,vdiag)
return f,vdiag,y
def plot(self):
"""
Plot the fitted model: training function values, inducing points used, mean estimate and confidence intervals.
"""
if self.X.shape[1]==1:
pb.figure()
xmin,xmax = self.X.min(),self.X.max()
xmin, xmax = xmin-0.2*(xmax-xmin), xmax+0.2*(xmax-xmin)
Xnew = np.linspace(xmin,xmax,100)[:,None]
mu_f, var_f, mu_phi = self.predict(Xnew)
pb.subplot(211)
self.likelihood.plot1Da(X_new=Xnew,Mean_new=mu_f,Var_new=var_f,X_u=self.X,Mean_u=self.mu,Var_u=np.diag(self.Sigma))
pb.subplot(212)
self.likelihood.plot1Db(self.X,Xnew,mu_phi)
elif self.X.shape[1]==2:
pb.figure()
x1min,x1max = self.X[:,0].min(0),self.X[:,0].max(0)
x2min,x2max = self.X[:,1].min(0),self.X[:,1].max(0)
x1min, x1max = x1min-0.2*(x1max-x1min), x1max+0.2*(x1max-x1min)
x2min, x2max = x2min-0.2*(x2max-x2min), x2max+0.2*(x2max-x2min)
axis1 = np.linspace(x1min,x1max,50)
axis2 = np.linspace(x2min,x2max,50)
XX1, XX2 = [e.flatten() for e in np.meshgrid(axis1,axis2)]
Xnew = np.c_[XX1.flatten(),XX2.flatten()]
f,v,p = self.predict(Xnew)
self.likelihood.plot2D(self.X,Xnew,p)
else:
raise NotImplementedError, "Cannot plot GPs with more than two input dimensions"
def em(self,max_f_eval=1e4,epsilon=.1,plot_all=False): #TODO check this makes sense
"""
Fits sparse_EP and optimizes the hyperparameters iteratively until convergence is achieved.
"""
self.epsilon_em = epsilon
log_likelihood_change = self.epsilon_em + 1.
self.parameters_path = [self.kernel._get_params()]
self.approximate_likelihood()
self.site_approximations_path = [[self.ep_approx.tau_tilde,self.ep_approx.v_tilde]]
self.log_likelihood_path = [self.log_likelihood()]
iteration = 0
while log_likelihood_change > self.epsilon_em:
print 'EM iteration', iteration
self.optimize(max_f_eval = max_f_eval)
log_likelihood_new = self.log_likelihood()
log_likelihood_change = log_likelihood_new - self.log_likelihood_path[-1]
if log_likelihood_change < 0:
print 'log_likelihood decrement'
self.kernel._set_params_transformed(self.parameters_path[-1])
self.kernM._set_params_transformed(self.parameters_path[-1])
else:
self.approximate_likelihood()
self.log_likelihood_path.append(self.log_likelihood())
self.parameters_path.append(self.kernel._get_params())
self.site_approximations_path.append([self.ep_approx.tau_tilde,self.ep_approx.v_tilde])
iteration += 1

View file

@ -1,18 +1,18 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt). # Copyright (c) 2012, James Hensman
# Licensed under the BSD 3-clause license (see LICENSE.txt) # Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np import numpy as np
import pylab as pb from GP import GP
from .. import likelihoods
from .. import kern from .. import kern
from ..core import model
from ..util.linalg import pdinv,mdot
from ..util.plot import gpplot, Tango
class GP_regression(model): class GP_regression(GP):
""" """
Gaussian Process model for regression Gaussian Process model for regression
This is a thin wrapper around the GP class, with a set of sensible defalts
:param X: input observations :param X: input observations
:param Y: observed values :param Y: observed values
:param kernel: a GPy kernel, defaults to rbf+white :param kernel: a GPy kernel, defaults to rbf+white
@ -29,199 +29,8 @@ class GP_regression(model):
def __init__(self,X,Y,kernel=None,normalize_X=False,normalize_Y=False, Xslices=None): def __init__(self,X,Y,kernel=None,normalize_X=False,normalize_Y=False, Xslices=None):
if kernel is None: if kernel is None:
kernel = kern.rbf(X.shape[1]) + kern.bias(X.shape[1]) + kern.white(X.shape[1]) kernel = kern.rbf(X.shape[1])
# parse arguments likelihood = likelihoods.Gaussian(Y,normalize=normalize_Y)
self.Xslices = Xslices
assert isinstance(kernel, kern.kern)
self.kern = kernel
self.X = X
self.Y = Y
assert len(self.X.shape)==2
assert len(self.Y.shape)==2
assert self.X.shape[0] == self.Y.shape[0]
self.N, self.D = self.Y.shape
self.N, self.Q = self.X.shape
#here's some simple normalisation GP.__init__(self, X, likelihood, kernel, normalize_X=normalize_X, Xslices=Xslices)
if normalize_X:
self._Xmean = X.mean(0)[None,:]
self._Xstd = X.std(0)[None,:]
self.X = (X.copy() - self._Xmean) / self._Xstd
if hasattr(self,'Z'):
self.Z = (self.Z - self._Xmean) / self._Xstd
else:
self._Xmean = np.zeros((1,self.X.shape[1]))
self._Xstd = np.ones((1,self.X.shape[1]))
if normalize_Y:
self._Ymean = Y.mean(0)[None,:]
self._Ystd = Y.std(0)[None,:]
self.Y = (Y.copy()- self._Ymean) / self._Ystd
else:
self._Ymean = np.zeros((1,self.Y.shape[1]))
self._Ystd = np.ones((1,self.Y.shape[1]))
if self.D > self.N:
# then it's more efficient to store YYT
self.YYT = np.dot(self.Y, self.Y.T)
else:
self.YYT = None
model.__init__(self)
def _set_params(self,p):
self.kern._set_params_transformed(p)
self.K = self.kern.K(self.X,slices1=self.Xslices)
self.Ki, self.L, self.Li, self.K_logdet = pdinv(self.K)
def _get_params(self):
return self.kern._get_params_transformed()
def _get_param_names(self):
return self.kern._get_param_names_transformed()
def _model_fit_term(self):
"""
Computes the model fit using YYT if it's available
"""
if self.YYT is None:
return -0.5*np.sum(np.square(np.dot(self.Li,self.Y)))
else:
return -0.5*np.sum(np.multiply(self.Ki, self.YYT))
def log_likelihood(self):
complexity_term = -0.5*self.N*self.D*np.log(2.*np.pi) - 0.5*self.D*self.K_logdet
return complexity_term + self._model_fit_term()
def dL_dK(self):
if self.YYT is None:
alpha = np.dot(self.Ki,self.Y)
dL_dK = 0.5*(np.dot(alpha,alpha.T)-self.D*self.Ki)
else:
dL_dK = 0.5*(mdot(self.Ki, self.YYT, self.Ki) - self.D*self.Ki)
return dL_dK
def _log_likelihood_gradients(self):
return self.kern.dK_dtheta(partial=self.dL_dK(),X=self.X)
def predict(self,Xnew, slices=None, full_cov=False):
"""
Predict the function(s) at the new point(s) Xnew.
Arguments
---------
:param Xnew: The points at which to make a prediction
:type Xnew: np.ndarray, Nnew x self.Q
:param slices: specifies which outputs kernel(s) the Xnew correspond to (see below)
:type slices: (None, list of slice objects, list of ints)
:param full_cov: whether to return the full covariance matrix, or just the diagonal
:type full_cov: bool
:rtype: posterior mean, a Numpy array, Nnew x self.D
:rtype: posterior variance, a Numpy array, Nnew x Nnew x (self.D)
.. Note:: "slices" specifies how the the points X_new co-vary wich the training points.
- If None, the new points covary throigh every kernel part (default)
- If a list of slices, the i^th slice specifies which data are affected by the i^th kernel part
- If a list of booleans, specifying which kernel parts are active
If full_cov and self.D > 1, the return shape of var is Nnew x Nnew x self.D. If self.D == 1, the return shape is Nnew x Nnew.
This is to allow for different normalisations of the output dimensions.
"""
#normalise X values
Xnew = (Xnew.copy() - self._Xmean) / self._Xstd
mu, var = self._raw_predict(Xnew, slices, full_cov)
#un-normalise
mu = mu*self._Ystd + self._Ymean
if full_cov:
if self.D==1:
var *= np.square(self._Ystd)
else:
var = var[:,:,None] * np.square(self._Ystd)
else:
if self.D==1:
var *= np.square(np.squeeze(self._Ystd))
else:
var = var[:,None] * np.square(self._Ystd)
return mu,var
def _raw_predict(self,_Xnew,slices, full_cov=False):
"""Internal helper function for making predictions, does not account for normalisation"""
Kx = self.kern.K(self.X,_Xnew, slices1=self.Xslices,slices2=slices)
mu = np.dot(np.dot(Kx.T,self.Ki),self.Y)
KiKx = np.dot(self.Ki,Kx)
if full_cov:
Kxx = self.kern.K(_Xnew, slices1=slices,slices2=slices)
var = Kxx - np.dot(KiKx.T,Kx)
else:
Kxx = self.kern.Kdiag(_Xnew, slices=slices)
var = Kxx - np.sum(np.multiply(KiKx,Kx),0)
return mu, var
def plot(self,samples=0,plot_limits=None,which_data='all',which_functions='all',resolution=None):
"""
:param samples: the number of a posteriori samples to plot
:param which_data: which if the training data to plot (default all)
:type which_data: 'all' or a slice object to slice self.X, self.Y
:param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaults to data limits
:param which_functions: which of the kernel functions to plot (additively)
:type which_functions: list of bools
:param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
Plot the posterior of the GP.
- In one dimension, the function is plotted with a shaded region identifying two standard deviations.
- In two dimensions, a contour-plot shows the mean predicted function
- In higher dimensions, we've not implemented this yet !TODO!
Can plot only part of the data and part of the posterior functions using which_data and which_functions
"""
if which_functions=='all':
which_functions = [True]*self.kern.Nparts
if which_data=='all':
which_data = slice(None)
X = self.X[which_data,:]
Y = self.Y[which_data,:]
Xorig = X*self._Xstd + self._Xmean
Yorig = Y*self._Ystd + self._Ymean
if plot_limits is None:
xmin,xmax = Xorig.min(0),Xorig.max(0)
xmin, xmax = xmin-0.2*(xmax-xmin), xmax+0.2*(xmax-xmin)
elif len(plot_limits)==2:
xmin, xmax = plot_limits
else:
raise ValueError, "Bad limits for plotting"
if self.X.shape[1]==1:
Xnew = np.linspace(xmin,xmax,resolution or 200)[:,None]
m,v = self.predict(Xnew,slices=which_functions)
gpplot(Xnew,m,v)
if samples:
s = np.random.multivariate_normal(m.flatten(),v,samples)
pb.plot(Xnew.flatten(),s.T, alpha = 0.4, c='#3465a4', linewidth = 0.8)
pb.plot(Xorig,Yorig,'kx',mew=1.5)
pb.xlim(xmin,xmax)
elif self.X.shape[1]==2:
resolution = 50 or resolution
xx,yy = np.mgrid[xmin[0]:xmax[0]:1j*resolution,xmin[1]:xmax[1]:1j*resolution]
Xtest = np.vstack((xx.flatten(),yy.flatten())).T
zz,vv = self.predict(Xtest,slices=which_functions)
zz = zz.reshape(resolution,resolution)
pb.contour(xx,yy,zz,vmin=zz.min(),vmax=zz.max(),cmap=pb.cm.jet)
pb.scatter(Xorig[:,0],Xorig[:,1],40,Yorig,linewidth=0,cmap=pb.cm.jet,vmin=zz.min(),vmax=zz.max())
pb.xlim(xmin[0],xmax[0])
pb.ylim(xmin[1],xmax[1])
else:
raise NotImplementedError, "Cannot plot GPs with more than two input dimensions"

View file

@ -2,12 +2,12 @@
# Licensed under the BSD 3-clause license (see LICENSE.txt) # Licensed under the BSD 3-clause license (see LICENSE.txt)
from GP import GP
from GP_regression import GP_regression from GP_regression import GP_regression
from sparse_GP import sparse_GP
from sparse_GP_regression import sparse_GP_regression from sparse_GP_regression import sparse_GP_regression
from GPLVM import GPLVM from GPLVM import GPLVM
from warped_GP import warpedGP from warped_GP import warpedGP
from GP_EP import GP_EP
from generalized_FITC import generalized_FITC
from sparse_GPLVM import sparse_GPLVM from sparse_GPLVM import sparse_GPLVM
from uncollapsed_sparse_GP import uncollapsed_sparse_GP #from uncollapsed_sparse_GP import uncollapsed_sparse_GP
from BGPLVM import Bayesian_GPLVM from BGPLVM import Bayesian_GPLVM

View file

@ -1,241 +0,0 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
import pylab as pb
from scipy import stats, linalg
from .. import kern
from ..core import model
from ..util.linalg import pdinv,mdot
from ..util.plot import gpplot
from ..inference.Expectation_Propagation import FITC
from ..inference.likelihoods import likelihood,probit
class generalized_FITC(model):
def __init__(self,X,likelihood,kernel=None,inducing=10,epsilon_ep=1e-3,powerep=[1.,1.]):
"""
Naish-Guzman, A. and Holden, S. (2008) implementation of EP with FITC.
:param X: input observations
:param likelihood: Output's likelihood (likelihood class)
:param kernel: a GPy kernel
:param inducing: Either an array specifying the inducing points location or a scalar defining their number.
:param epsilon_ep: EP convergence criterion, maximum squared difference allowed between mean updates to stop iterations (float)
:param powerep: Power-EP parameters (eta,delta) - 2x1 numpy array (floats)
"""
assert isinstance(kernel,kern.kern)
self.likelihood = likelihood
self.Y = self.likelihood.Y
self.kernel = kernel
self.X = X
self.N, self.D = self.X.shape
assert self.Y.shape[0] == self.N
if type(inducing) == int:
self.M = inducing
self.Z = (np.random.random_sample(self.D*self.M)*(self.X.max()-self.X.min())+self.X.min()).reshape(self.M,-1)
elif type(inducing) == np.ndarray:
self.Z = inducing
self.M = self.Z.shape[0]
self.eta,self.delta = powerep
self.epsilon_ep = epsilon_ep
self.jitter = 1e-12
model.__init__(self)
def _set_params(self,p):
self.kernel._set_params_transformed(p[0:-self.Z.size])
self.Z = p[-self.Z.size:].reshape(self.M,self.D)
def _get_params(self):
return np.hstack([self.kernel._get_params_transformed(),self.Z.flatten()])
def _get_param_names(self):
return self.kernel._get_param_names_transformed()+['iip_%i'%i for i in range(self.Z.size)]
def approximate_likelihood(self):
self.Kmm = self.kernel.K(self.Z)
self.Knm = self.kernel.K(self.X,self.Z)
self.Knn_diag = self.kernel.Kdiag(self.X)
self.ep_approx = FITC(self.Kmm,self.likelihood,self.Knm.T,self.Knn_diag,epsilon=self.epsilon_ep,powerep=[self.eta,self.delta])
self.ep_approx.fit_EP()
def posterior_param(self):
self.Knn_diag = self.kernel.Kdiag(self.X)
self.Kmm = self.kernel.K(self.Z)
self.Kmmi, self.Lmm, self.Lmmi, self.Kmm_logdet = pdinv(self.Kmm)
self.Knm = self.kernel.K(self.X,self.Z)
self.KmmiKmn = np.dot(self.Kmmi,self.Knm.T)
self.Qnn = np.dot(self.Knm,self.KmmiKmn)
self.Diag0 = self.Knn_diag - np.diag(self.Qnn)
self.R0 = np.linalg.cholesky(self.Kmmi).T
self.Taut = self.ep_approx.tau_tilde/(1.+ self.ep_approx.tau_tilde*self.Diag0)
self.KmnTaut = self.Knm.T*self.Taut[None,:]
self.KmnTautKnm = np.dot(self.KmnTaut, self.Knm)
self.Woodbury_inv, self.Wood_L, self.Wood_Li, self.Woodbury_logdet = pdinv(self.Kmm + self.KmnTautKnm)
self.Qnn_diag = self.Knn_diag - np.diag(self.Qnn) + 1./self.ep_approx.tau_tilde
self.Qi = -np.dot(self.KmnTaut.T, np.dot(self.Woodbury_inv,self.KmnTaut)) + np.diag(self.Taut)
self.hld = 0.5*np.sum(np.log(self.Diag0 + 1./self.ep_approx.tau_tilde)) - 0.5*self.Kmm_logdet + 0.5*self.Woodbury_logdet
self.Diag = self.Diag0/(1.+ self.Diag0 * self.ep_approx.tau_tilde)
self.P = (self.Diag / self.Diag0)[:,None] * self.Knm
self.RPT0 = np.dot(self.R0,self.Knm.T)
self.L = np.linalg.cholesky(np.eye(self.M) + np.dot(self.RPT0,(1./self.Diag0 - self.Diag/(self.Diag0**2))[:,None]*self.RPT0.T))
self.R,info = linalg.flapack.dtrtrs(self.L,self.R0,lower=1)
self.RPT = np.dot(self.R,self.P.T)
self.Sigma = np.diag(self.Diag) + np.dot(self.RPT.T,self.RPT)
self.w = self.Diag * self.ep_approx.v_tilde
self.gamma = np.dot(self.R.T, np.dot(self.RPT,self.ep_approx.v_tilde))
self.mu = self.w + np.dot(self.P,self.gamma)
self.mu_tilde = (self.ep_approx.v_tilde/self.ep_approx.tau_tilde)[:,None]
def log_likelihood(self):
self.posterior_param()
self.YYT = np.dot(self.mu_tilde,self.mu_tilde.T)
A = -self.hld
B = -.5*np.sum(self.Qi*self.YYT)
C = sum(np.log(self.ep_approx.Z_hat))
D = .5*np.sum(np.log(1./self.ep_approx.tau_tilde + 1./self.ep_approx.tau_))
E = .5*np.sum((self.ep_approx.v_/self.ep_approx.tau_ - self.mu_tilde.flatten())**2/(1./self.ep_approx.tau_ + 1./self.ep_approx.tau_tilde))
return A + B + C + D + E
def _log_likelihood_gradients(self):
dKmm_dtheta = self.kernel.dK_dtheta(self.Z)
dKnn_dtheta = self.kernel.dK_dtheta(self.X)
dKmn_dtheta = self.kernel.dK_dtheta(self.Z,self.X)
dKmm_dZ = -self.kernel.dK_dX(self.Z)
dKnm_dZ = -self.kernel.dK_dX(self.X,self.Z)
tmp = [np.dot(dKmn_dtheta_i,self.KmmiKmn) for dKmn_dtheta_i in dKmn_dtheta.T]
dQnn_dtheta = [tmp_i + tmp_i.T - np.dot(np.dot(self.KmmiKmn.T,dKmm_dtheta_i),self.KmmiKmn) for tmp_i,dKmm_dtheta_i in zip(tmp,dKmm_dtheta.T)]
dDiag0_dtheta = [np.diag(dKnn_dtheta_i) - np.diag(dQnn_dtheta_i) for dKnn_dtheta_i,dQnn_dtheta_i in zip(dKnn_dtheta.T,dQnn_dtheta)]
dQ_dtheta = [np.diag(dDiag0_dtheta_i) + dQnn_dtheta_i for dDiag0_dtheta_i,dQnn_dtheta_i in zip(dDiag0_dtheta,dQnn_dtheta)]
dW_dtheta = [dKmm_dtheta_i + 2*np.dot(self.KmnTaut,dKmn_dtheta_i) - np.dot(self.KmnTaut*dDiag0_dtheta_i,self.KmnTaut.T) for dKmm_dtheta_i,dDiag0_dtheta_i,dKmn_dtheta_i in zip(dKmm_dtheta.T,dDiag0_dtheta,dKmn_dtheta.T)]
QiY = np.dot(self.Qi, self.mu_tilde)
QiYYQi = np.outer(QiY,QiY)
WiKmnTaut = np.dot(self.Woodbury_inv,self.KmnTaut)
K_Y = np.dot(self.KmmiKmn,QiY)
# gradient - theta
Atheta = [-0.5*np.dot(self.Taut,dDiag0_dtheta_i) + 0.5*np.sum(self.Kmmi*dKmm_dtheta_i) - 0.5*np.sum(self.Woodbury_inv*dW_dtheta_i) for dDiag0_dtheta_i,dKmm_dtheta_i,dW_dtheta_i in zip(dDiag0_dtheta,dKmm_dtheta.T,dW_dtheta)]
Btheta = np.array([0.5*np.sum(QiYYQi*dQ_dtheta_i) for dQ_dtheta_i in dQ_dtheta])
dL_dtheta = Atheta + Btheta
# gradient - Z
# Az
dQnn_dZ_diag_a2 = (np.array([d[:,:,None]*self.KmmiKmn[:,:,None] for d in dKnm_dZ.transpose(2,0,1)]).reshape(self.D,self.M,self.N)).transpose(1,2,0)
dQnn_dZ_diag_b2 = (np.array([(self.KmmiKmn*np.sum(d[:,:,None]*self.KmmiKmn,-2))[:,:,None] for d in dKmm_dZ.transpose(2,0,1)]).reshape(self.D,self.M,self.N)).transpose(1,2,0)
dQnn_dZ_diag = dQnn_dZ_diag_a2 - dQnn_dZ_diag_b2
d_hld_Diag1_dZ = -np.sum(np.dot(self.KmmiKmn*self.Taut,self.KmmiKmn.T)[:,:,None]*dKmm_dZ,-2) + np.sum((self.KmmiKmn*self.Taut)[:,:,None]*dKnm_dZ,-2)
d_hld_Kmm_dZ = np.sum(self.Kmmi[:,:,None]*dKmm_dZ,-2)
d_hld_W_dZ1 = np.sum(WiKmnTaut[:,:,None]*dKnm_dZ,-2)
d_hld_W_dZ3 = np.sum(self.Woodbury_inv[:,:,None]*dKmm_dZ,-2)
d_hld_W_dZ2 = np.array([np.sum(np.sum(WiKmnTaut.T*d[:,:,None]*self.KmnTaut.T,-2),-1) for d in dQnn_dZ_diag.transpose(2,0,1)]).T
Az = d_hld_Diag1_dZ + d_hld_Kmm_dZ - d_hld_W_dZ1 - d_hld_W_dZ2 - d_hld_W_dZ3
# Bz
Bz2 = np.sum(np.dot(K_Y,QiY.T)[:,:,None]*dKnm_dZ,-2)
Bz3 = - np.sum(np.dot(K_Y,K_Y.T)[:,:,None]*dKmm_dZ,-2)
Bz1 = -np.array([np.sum((QiY**2)*d[:,:,None],-2) for d in dQnn_dZ_diag.transpose(2,0,1)]).reshape(self.D,self.M).T
Bz = Bz1 + Bz2 + Bz3
dL_dZ = (Az + Bz).flatten()
return np.hstack([dL_dtheta, dL_dZ])
def predict(self,X):
"""
Make a prediction for the vsGP model
Arguments
---------
X : Input prediction data - Nx1 numpy array (floats)
"""
#TODO: check output dimensions
K_x = self.kernel.K(self.Z,X)
Kxx = self.kernel.K(X)
#K_x = self.kernM.cross.K(X)
# q(u|f) = N(u| R0i*mu_u*f, R0i*C*R0i.T)
# Ci = I + (RPT0)Di(RPT0).T
# C = I - [RPT0] * (D+[RPT0].T*[RPT0])^-1*[RPT0].T
# = I - [RPT0] * (D + self.Qnn)^-1 * [RPT0].T
# = I - [RPT0] * (U*U.T)^-1 * [RPT0].T
# = I - V.T * V
U = np.linalg.cholesky(np.diag(self.Diag0) + self.Qnn)
V,info = linalg.flapack.dtrtrs(U,self.RPT0.T,lower=1)
C = np.eye(self.M) - np.dot(V.T,V)
mu_u = np.dot(C,self.RPT0)*(1./self.Diag0[None,:])
#self.C = C
#self.RPT0 = np.dot(self.R0,self.Knm.T) P0.T
#self.mu_u = mu_u
#self.U = U
# q(u|y) = N(u| R0i*mu_H,R0i*Sigma_H*R0i.T)
mu_H = np.dot(mu_u,self.mu)
self.mu_H = mu_H
Sigma_H = C + np.dot(mu_u,np.dot(self.Sigma,mu_u.T))
# q(f_star|y) = N(f_star|mu_star,sigma2_star)
KR0T = np.dot(K_x.T,self.R0.T)
mu_star = np.dot(KR0T,mu_H)
sigma2_star = Kxx + np.dot(KR0T,np.dot(Sigma_H - np.eye(self.M),KR0T.T))
vdiag = np.diag(sigma2_star)
# q(y_star|y) = non-gaussian posterior probability of class membership
p = self.likelihood.predictive_mean(mu_star,vdiag)
return mu_star,vdiag,p
def plot(self):
"""
Plot the fitted model: training function values, inducing points used, mean estimate and confidence intervals.
"""
if self.X.shape[1]==1:
pb.figure()
xmin,xmax = np.r_[self.X,self.Z].min(),np.r_[self.X,self.Z].max()
xmin, xmax = xmin-0.2*(xmax-xmin), xmax+0.2*(xmax-xmin)
Xnew = np.linspace(xmin,xmax,100)[:,None]
mu_f, var_f, mu_phi = self.predict(Xnew)
self.mu_inducing,self.var_diag_inducing,self.phi_inducing = self.predict(self.Z)
pb.subplot(211)
self.likelihood.plot1Da(X_new=Xnew,Mean_new=mu_f,Var_new=var_f,X_u=self.Z,Mean_u=self.mu_inducing,Var_u=self.var_diag_inducing)
pb.subplot(212)
self.likelihood.plot1Db(self.X,Xnew,mu_phi,self.Z)
elif self.X.shape[1]==2:
pb.figure()
x1min,x1max = self.X[:,0].min(0),self.X[:,0].max(0)
x2min,x2max = self.X[:,1].min(0),self.X[:,1].max(0)
x1min, x1max = x1min-0.2*(x1max-x1min), x1max+0.2*(x1max-x1min)
x2min, x2max = x2min-0.2*(x2max-x2min), x2max+0.2*(x2max-x2min)
axis1 = np.linspace(x1min,x1max,50)
axis2 = np.linspace(x2min,x2max,50)
XX1, XX2 = [e.flatten() for e in np.meshgrid(axis1,axis2)]
Xnew = np.c_[XX1.flatten(),XX2.flatten()]
f,v,p = self.predict(Xnew)
self.likelihood.plot2D(self.X,Xnew,p,self.Z)
else:
raise NotImplementedError, "Cannot plot GPs with more than two input dimensions"
def em(self,max_f_eval=1e4,epsilon=.1,plot_all=False): #TODO check this makes sense
"""
Fits sparse_EP and optimizes the hyperparameters iteratively until convergence is achieved.
"""
self.epsilon_em = epsilon
log_likelihood_change = self.epsilon_em + 1.
self.parameters_path = [self.kernel._get_params()]
self.approximate_likelihood()
self.site_approximations_path = [[self.ep_approx.tau_tilde,self.ep_approx.v_tilde]]
self.inducing_inputs_path = [self.Z]
self.log_likelihood_path = [self.log_likelihood()]
iteration = 0
while log_likelihood_change > self.epsilon_em:
print 'EM iteration', iteration
self.optimize(max_f_eval = max_f_eval)
log_likelihood_new = self.log_likelihood()
log_likelihood_change = log_likelihood_new - self.log_likelihood_path[-1]
if log_likelihood_change < 0:
print 'log_likelihood decrement'
self.kernel._set_params_transformed(self.parameters_path[-1])
self.kernM = self.kernel.copy()
self.kernM.expand_X(self.inducing_inputs_path[-1])
self.__init__(self.kernel,self.likelihood,kernM=self.kernM,powerep=[self.eta,self.delta],epsilon_ep = self.epsilon_ep, epsilon_em = self.epsilon_em)
else:
self.approximate_likelihood()
self.log_likelihood_path.append(self.log_likelihood())
self.parameters_path.append(self.kernel._get_params())
self.site_approximations_path.append([self.ep_approx.tau_tilde,self.ep_approx.v_tilde])
self.inducing_inputs_path.append(self.Z)
iteration += 1

GPy/models/sparse_GP.py (new file, 217 lines)
View file

@ -0,0 +1,217 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
import pylab as pb
from ..util.linalg import mdot, jitchol, chol_inv, pdinv
from ..util.plot import gpplot
from .. import kern
from GP import GP
#Still TODO:
# make use of slices properly (kernel can now do this)
# enable heteroscedastic noise (kernel will need to compute psi2 as a (NxMxM) array)
class sparse_GP(GP):
"""
Variational sparse GP model
:param X: inputs
:type X: np.ndarray (N x Q)
:param likelihood: a likelihood instance, containing the observed data
:type likelihood: GPy.likelihood.(Gaussian | EP)
:param kernel : the kernel/covariance function. See link kernels
:type kernel: a GPy kernel
:param X_uncertainty: The uncertainty in the measurements of X (Gaussian variance)
:type X_uncertainty: np.ndarray (N x Q) | None
:param Z: inducing inputs (optional, see note)
:type Z: np.ndarray (M x Q) | None
:param Zslices: slices for the inducing inputs (see slicing TODO: link)
:param M : Number of inducing points (optional, default 10. Ignored if Z is not None)
:type M: int
:param normalize_(X|Y) : whether to normalize the data before computing (predictions will be in original scales)
:type normalize_(X|Y): bool
"""
def __init__(self, X, likelihood, kernel, Z, X_uncertainty=None, Xslices=None,Zslices=None, normalize_X=False):
self.scale_factor = 1.0# a scaling factor to help keep the algorithm stable
self.Z = Z
self.Zslices = Zslices
self.Xslices = Xslices
self.M = Z.shape[0]
self.likelihood = likelihood
if X_uncertainty is None:
self.has_uncertain_inputs=False
else:
assert X_uncertainty.shape==X.shape
self.has_uncertain_inputs=True
self.X_uncertainty = X_uncertainty
GP.__init__(self, X, likelihood, kernel=kernel, normalize_X=normalize_X, Xslices=Xslices)
#normalise X uncertainty also
if self.has_uncertain_inputs:
self.X_uncertainty /= np.square(self._Xstd)
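#A hypothetical construction of this class, mirroring how Bayesian_GPLVM
#builds it elsewhere in this commit (data values invented):
#  import numpy as np
#  from GPy import kern, likelihoods
#  from GPy.models import sparse_GP
#  X = np.random.rand(50, 1)
#  Y = np.sin(6.*X) + 0.1*np.random.randn(50, 1)
#  Z = np.random.permutation(X.copy())[:10] #10 inducing inputs drawn from the data
#  m = sparse_GP(X, likelihoods.Gaussian(Y), kern.rbf(1) + kern.white(1), Z=Z)
#  m.optimize()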
def _computations(self):
# TODO find routine to multiply triangular matrices
#TODO: slices for psi statistics (easy enough)
sf = self.scale_factor
sf2 = sf**2
# kernel computations, using BGPLVM notation
self.Kmm = self.kern.K(self.Z)
if self.has_uncertain_inputs:
self.psi0 = self.kern.psi0(self.Z,self.X, self.X_uncertainty)
self.psi1 = self.kern.psi1(self.Z,self.X, self.X_uncertainty).T
self.psi2 = self.kern.psi2(self.Z,self.X, self.X_uncertainty)
if self.likelihood.is_heteroscedastic:
self.psi2_beta_scaled = (self.psi2*(self.likelihood.precision.reshape(self.N,1,1)/sf2)).sum(0)
#TODO: what if the likelihood is heteroscedastic and there are multiple independent outputs?
else:
self.psi2_beta_scaled = (self.psi2*(self.likelihood.precision/sf2)).sum(0)
else:
self.psi0 = self.kern.Kdiag(self.X,slices=self.Xslices)
self.psi1 = self.kern.K(self.Z,self.X)
if self.likelihood.is_heteroscedastic:
tmp = self.psi1*(np.sqrt(self.likelihood.precision.reshape(self.N,1))/sf)
else:
tmp = self.psi1*(np.sqrt(self.likelihood.precision)/sf)
self.psi2_beta_scaled = np.dot(tmp,tmp.T)
self.psi2 = self.psi1.T[:,:,None]*self.psi1.T[:,None,:] # TODO: remove me for efficiency and stability
self.Kmmi, self.Lm, self.Lmi, self.Kmm_logdet = pdinv(self.Kmm)
self.V = (self.likelihood.precision/self.scale_factor)*self.likelihood.Y
self.A = mdot(self.Lmi, self.psi2_beta_scaled, self.Lmi.T)
self.B = np.eye(self.M)/sf2 + self.A
self.Bi, self.LB, self.LBi, self.B_logdet = pdinv(self.B)
self.psi1V = np.dot(self.psi1, self.V)
self.psi1VVpsi1 = np.dot(self.psi1V, self.psi1V.T)
self.C = mdot(self.Lmi.T, self.Bi, self.Lmi)
self.E = mdot(self.C, self.psi1VVpsi1/sf2, self.C.T)
# Compute dL_dpsi # FIXME: this is untested for the het. case
self.dL_dpsi0 = - 0.5 * self.D * self.likelihood.precision * np.ones(self.N)
self.dL_dpsi1 = mdot(self.V, self.psi1V.T,self.C).T
if self.likelihood.is_heteroscedastic:
self.dL_dpsi2 = 0.5 * self.likelihood.precision[:,None,None] * self.D * self.Kmmi[None,:,:] # dB
self.dL_dpsi2 += - 0.5 * self.likelihood.precision[:,None,None]/sf2 * self.D * self.C[None,:,:] # dC
self.dL_dpsi2 += - 0.5 * self.likelihood.precision[:,None,None]* self.E[None,:,:] # dD
else:
self.dL_dpsi2 = 0.5 * self.likelihood.precision * self.D * self.Kmmi[None,:,:] # dB
self.dL_dpsi2 += - 0.5 * self.likelihood.precision/sf2 * self.D * self.C[None,:,:] # dC
self.dL_dpsi2 += - 0.5 * self.likelihood.precision * self.E[None,:,:] # dD
# Compute dL_dKmm
self.dL_dKmm = -0.5 * self.D * mdot(self.Lmi.T, self.A, self.Lmi)*sf2 # dB
self.dL_dKmm += -0.5 * self.D * (- self.C/sf2 - 2.*mdot(self.C, self.psi2_beta_scaled, self.Kmmi) + self.Kmmi) # dC
self.dL_dKmm += np.dot(np.dot(self.E*sf2, self.psi2_beta_scaled) - np.dot(self.C, self.psi1VVpsi1), self.Kmmi) + 0.5*self.E # dD
#the partial derivative vector for the likelihood
if self.likelihood.Nparams ==0:
#save computation here.
self.partial_for_likelihood = None
elif self.likelihood.is_heteroscedastic:
            raise NotImplementedError, "heteroscedastic derivatives not implemented"
#self.partial_for_likelihood = - 0.5 * self.D*self.likelihood.precision + 0.5 * (self.likelihood.Y**2).sum(1)*self.likelihood.precision**2 #dA
#self.partial_for_likelihood += 0.5 * self.D * (self.psi0*self.likelihood.precision**2 - (self.psi2*self.Kmmi[None,:,:]*self.likelihood.precision[:,None,None]**2).sum(1).sum(1)/sf2) #dB
#self.partial_for_likelihood += 0.5 * self.D * np.sum(self.Bi*self.A)*self.likelihood.precision #dC
#self.partial_for_likelihood += -np.diag(np.dot((self.C - 0.5 * mdot(self.C,self.psi2_beta_scaled,self.C) ) , self.psi1VVpsi1 ))*self.likelihood.precision #dD
else:
            #likelihood is not heteroscedastic
beta = self.likelihood.precision
dbeta = 0.5 * self.N*self.D/beta - 0.5 * np.sum(np.square(self.likelihood.Y))
dbeta += - 0.5 * self.D * (self.psi0.sum() - np.trace(self.A)/beta*sf2)
dbeta += - 0.5 * self.D * np.sum(self.Bi*self.A)/beta
dbeta += np.sum((self.C - 0.5 * mdot(self.C,self.psi2_beta_scaled,self.C) ) * self.psi1VVpsi1 )/beta
self.partial_for_likelihood = -dbeta*self.likelihood.precision**2
def _set_params(self, p):
self.Z = p[:self.M*self.Q].reshape(self.M, self.Q)
self.kern._set_params(p[self.Z.size:self.Z.size+self.kern.Nparam])
self.likelihood._set_params(p[self.Z.size+self.kern.Nparam:])
self._computations()
def _get_params(self):
return np.hstack([self.Z.flatten(),GP._get_params(self)])
def _get_param_names(self):
return sum([['iip_%i_%i'%(i,j) for i in range(self.Z.shape[0])] for j in range(self.Z.shape[1])],[]) + GP._get_param_names(self)
def log_likelihood(self):
""" Compute the (lower bound on the) log marginal likelihood """
sf2 = self.scale_factor**2
if self.likelihood.is_heteroscedastic:
A = -0.5*self.N*self.D*np.log(2.*np.pi) +0.5*np.sum(np.log(self.likelihood.precision)) -0.5*np.sum(self.V*self.likelihood.Y)
else:
A = -0.5*self.N*self.D*(np.log(2.*np.pi) - np.log(self.likelihood.precision)) -0.5*self.likelihood.precision*self.likelihood.trYYT
B = -0.5*self.D*(np.sum(self.likelihood.precision*self.psi0) - np.trace(self.A)*sf2)
C = -0.5*self.D * (self.B_logdet + self.M*np.log(sf2))
D = +0.5*np.sum(self.psi1VVpsi1 * self.C)
return A+B+C+D
def _log_likelihood_gradients(self):
return np.hstack((self.dL_dZ().flatten(), self.dL_dtheta(), self.likelihood._gradients(partial=self.partial_for_likelihood)))
def dL_dtheta(self):
"""
Compute and return the derivative of the log marginal likelihood wrt the parameters of the kernel
"""
dL_dtheta = self.kern.dK_dtheta(self.dL_dKmm,self.Z)
if self.has_uncertain_inputs:
dL_dtheta += self.kern.dpsi0_dtheta(self.dL_dpsi0, self.Z,self.X,self.X_uncertainty)
dL_dtheta += self.kern.dpsi1_dtheta(self.dL_dpsi1.T,self.Z,self.X, self.X_uncertainty)
dL_dtheta += self.kern.dpsi2_dtheta(self.dL_dpsi2,self.dL_dpsi1.T, self.Z,self.X, self.X_uncertainty)
else:
#re-cast computations in psi2 back to psi1:
dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2.sum(0),self.psi1)
dL_dtheta += self.kern.dK_dtheta(dL_dpsi1,self.Z,self.X)
dL_dtheta += self.kern.dKdiag_dtheta(self.dL_dpsi0, self.X)
return dL_dtheta
def dL_dZ(self):
"""
The derivative of the bound wrt the inducing inputs Z
"""
        dL_dZ = 2.*self.kern.dK_dX(self.dL_dKmm,self.Z)#factor of two because of vertical and horizontal 'stripes' in dKmm_dZ
if self.has_uncertain_inputs:
dL_dZ += self.kern.dpsi1_dZ(self.dL_dpsi1,self.Z,self.X, self.X_uncertainty)
dL_dZ += 2.*self.kern.dpsi2_dZ(self.dL_dpsi2,self.Z,self.X, self.X_uncertainty) # 'stripes'
else:
#re-cast computations in psi2 back to psi1:
dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2.sum(0),self.psi1)
dL_dZ += self.kern.dK_dX(dL_dpsi1,self.Z,self.X)
return dL_dZ
def _raw_predict(self, Xnew, slices, full_cov=False):
"""Internal helper function for making predictions, does not account for normalisation"""
Kx = self.kern.K(self.Z, Xnew)
mu = mdot(Kx.T, self.C/self.scale_factor, self.psi1V)
if full_cov:
Kxx = self.kern.K(Xnew)
var = Kxx - mdot(Kx.T, (self.Kmmi - self.C/self.scale_factor**2), Kx) #NOTE this won't work for plotting
else:
Kxx = self.kern.Kdiag(Xnew)
var = Kxx - np.sum(Kx*np.dot(self.Kmmi - self.C/self.scale_factor**2, Kx),0)
return mu,var[:,None]
def plot(self, *args, **kwargs):
"""
Plot the fitted model: just call the GP plot function and then add inducing inputs
"""
GP.plot(self,*args,**kwargs)
if self.Q==1:
if self.has_uncertain_inputs:
pb.errorbar(self.X[:,0], pb.ylim()[0]+np.zeros(self.N), xerr=2*np.sqrt(self.X_uncertainty.flatten()))
if self.Q==2:
pb.plot(self.Z[:,0],self.Z[:,1],'wo')
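
The class above is driven by constructing a likelihood and kernel and handing them over together with the inducing inputs. A minimal usage sketch follows (hypothetical data; it assumes the sparse_GP class is exposed as GPy.models.sparse_GP, and reuses the Gaussian likelihood and rbf/white kernels that appear elsewhere in this commit):

    import numpy as np
    import GPy

    X = np.random.rand(50, 1)
    Y = np.sin(6 * X) + 0.05 * np.random.randn(50, 1)
    Z = np.random.permutation(X.copy())[:10]    # M = 10 inducing inputs

    kernel = GPy.kern.rbf(1) + GPy.kern.white(1)
    likelihood = GPy.likelihoods.Gaussian(Y)
    m = GPy.models.sparse_GP(X, likelihood, kernel, Z)
    m.ensure_default_constraints()
    print m.log_likelihood()                    # the variational lower bound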

View file

@ -1,205 +1,46 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt). # Copyright (c) 2012, James Hensman
# Licensed under the BSD 3-clause license (see LICENSE.txt) # Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np import numpy as np
import pylab as pb from sparse_GP import sparse_GP
from ..util.linalg import mdot, jitchol, chol_inv, pdinv from .. import likelihoods
from ..util.plot import gpplot
from .. import kern from .. import kern
from ..inference.likelihoods import likelihood from ..inference.likelihoods import likelihood
from GP_regression import GP_regression from GP_regression import GP_regression
#Still TODO: class sparse_GP_regression(sparse_GP):
# make use of slices properly (kernel can now do this)
# enable heteroscedastic noise (kernel will need to compute psi2 as a (NxMxM) array)
class sparse_GP_regression(GP_regression):
""" """
Variational sparse GP model (Regression) Gaussian Process model for regression
This is a thin wrapper around the GP class, with a set of sensible defaults
:param X: input observations
:param Y: observed values
:param kernel: a GPy kernel, defaults to rbf+white
:param normalize_X: whether to normalize the input data before computing (predictions will be in original scales)
:type normalize_X: False|True
:param normalize_Y: whether to normalize the input data before computing (predictions will be in original scales)
:type normalize_Y: False|True
:param Xslices: how the X,Y data co-vary in the kernel (i.e. which "outputs" they correspond to). See (link:slicing)
:rtype: model object
.. Note:: Multiple independent outputs are allowed using columns of Y
:param X: inputs
:type X: np.ndarray (N x Q)
:param Y: observed data
:type Y: np.ndarray of observations (N x D)
:param kernel : the kernel/covariance function. See link kernels
:type kernel: a GPy kernel
:param Z: inducing inputs (optional, see note)
:type Z: np.ndarray (M x Q) | None
:param X_uncertainty: The uncertainty in the measurements of X (Gaussian variance)
:type X_uncertainty: np.ndarray (N x Q) | None
:param Zslices: slices for the inducing inputs (see slicing TODO: link)
:param M : Number of inducing points (optional, default 10. Ignored if Z is not None)
:type M: int
:param beta: noise precision. TODO> ignore beta if doing EP
:type beta: float
:param normalize_(X|Y) : whether to normalize the data before computing (predictions will be in original scales)
:type normalize_(X|Y): bool
""" """
def __init__(self,X,Y,kernel=None, X_uncertainty=None, beta=100., Z=None,Zslices=None,M=10,normalize_X=False,normalize_Y=False): def __init__(self,X,Y,kernel=None,normalize_X=False,normalize_Y=False, Xslices=None,Z=None, M=10):
self.scale_factor = 100.0 #kern defaults to rbf
self.beta = beta if kernel is None:
kernel = kern.rbf(X.shape[1]) + kern.white(X.shape[1],1e-3)
#Z defaults to a subset of the data
if Z is None: if Z is None:
self.Z = np.random.permutation(X.copy())[:M] Z = np.random.permutation(X.copy())[:M]
self.M = M
else: else:
assert Z.shape[1]==X.shape[1] assert Z.shape[1]==X.shape[1]
self.Z = Z
self.M = Z.shape[0]
if X_uncertainty is None:
self.has_uncertain_inputs=False
else:
assert X_uncertainty.shape==X.shape
self.has_uncertain_inputs=True
self.X_uncertainty = X_uncertainty
GP_regression.__init__(self, X, Y, kernel=kernel, normalize_X=normalize_X, normalize_Y=normalize_Y) #likelihood defaults to Gaussian
self.trYYT = np.sum(np.square(self.Y)) likelihood = likelihoods.Gaussian(Y,normalize=normalize_Y)
#normalise X uncertainty also sparse_GP.__init__(self, X, likelihood, kernel, Z, normalize_X=normalize_X, Xslices=Xslices)
if self.has_uncertain_inputs:
self.X_uncertainty /= np.square(self._Xstd)
def _computations(self):
# TODO find routine to multiply triangular matrices
#TODO: slices for psi statistics (easy enough)
# kernel computations, using BGPLVM notation
self.Kmm = self.kern.K(self.Z)
if self.has_uncertain_inputs:
self.psi0 = self.kern.psi0(self.Z,self.X, self.X_uncertainty).sum()
self.psi1 = self.kern.psi1(self.Z,self.X, self.X_uncertainty).T
self.psi2 = self.kern.psi2(self.Z,self.X, self.X_uncertainty)
self.psi2_beta_scaled = (self.psi2*(self.beta/self.scale_factor**2)).sum(0)
else:
self.psi0 = self.kern.Kdiag(self.X,slices=self.Xslices).sum()
self.psi1 = self.kern.K(self.Z,self.X)
#self.psi2 = np.dot(self.psi1,self.psi1.T)
#self.psi2 = self.psi1.T[:,:,None]*self.psi1.T[:,None,:]
tmp = self.psi1/(self.scale_factor/np.sqrt(self.beta))
self.psi2_beta_scaled = np.dot(tmp,tmp.T)
sf = self.scale_factor
sf2 = sf**2
self.Kmmi, self.Lm, self.Lmi, self.Kmm_logdet = pdinv(self.Kmm)#+np.eye(self.M)*1e-3)
self.V = (self.beta/self.scale_factor)*self.Y
self.A = mdot(self.Lmi, self.psi2_beta_scaled, self.Lmi.T)
self.B = np.eye(self.M)/sf2 + self.A
self.Bi, self.LB, self.LBi, self.B_logdet = pdinv(self.B)
self.psi1V = np.dot(self.psi1, self.V)
self.psi1VVpsi1 = np.dot(self.psi1V, self.psi1V.T)
self.C = mdot(self.Lmi.T, self.Bi, self.Lmi)
self.E = mdot(self.C, self.psi1VVpsi1/sf2, self.C.T)
# Compute dL_dpsi
self.dL_dpsi0 = - 0.5 * self.D * self.beta * np.ones(self.N)
self.dL_dpsi1 = mdot(self.V, self.psi1V.T,self.C).T
self.dL_dpsi2 = 0.5 * self.beta * self.D * self.Kmmi[None,:,:] # dB
self.dL_dpsi2 += - 0.5 * self.beta/sf2 * self.D * self.C[None,:,:] # dC
self.dL_dpsi2 += - 0.5 * self.beta * self.E[None,:,:] # dD
# Compute dL_dKmm
self.dL_dKmm = -0.5 * self.D * mdot(self.Lmi.T, self.A, self.Lmi)*sf2 # dB
self.dL_dKmm += -0.5 * self.D * (- self.C/sf2 - 2.*mdot(self.C, self.psi2_beta_scaled, self.Kmmi) + self.Kmmi) # dC
self.dL_dKmm += np.dot(np.dot(self.E*sf2, self.psi2_beta_scaled) - np.dot(self.C, self.psi1VVpsi1), self.Kmmi) + 0.5*self.E # dD
def _set_params(self, p):
self.Z = p[:self.M*self.Q].reshape(self.M, self.Q)
self.beta = p[self.M*self.Q]
self.kern._set_params(p[self.Z.size + 1:])
self._computations()
def _get_params(self):
return np.hstack([self.Z.flatten(),self.beta,self.kern._get_params_transformed()])
def _get_param_names(self):
return sum([['iip_%i_%i'%(i,j) for i in range(self.Z.shape[0])] for j in range(self.Z.shape[1])],[]) + ['noise_precision']+self.kern._get_param_names_transformed()
def log_likelihood(self):
""" Compute the (lower bound on the) log marginal likelihood """
sf2 = self.scale_factor**2
A = -0.5*self.N*self.D*(np.log(2.*np.pi) - np.log(self.beta)) -0.5*self.beta*self.trYYT
B = -0.5*self.D*(self.beta*self.psi0-np.trace(self.A)*sf2)
C = -0.5*self.D * (self.B_logdet + self.M*np.log(sf2))
D = +0.5*np.sum(self.psi1VVpsi1 * self.C)
return A+B+C+D
def _log_likelihood_gradients(self):
return np.hstack([self.dL_dZ().flatten(), self.dL_dbeta(), self.dL_dtheta()])
def dL_dbeta(self):
"""
Compute the gradient of the log likelihood wrt beta.
"""
        #TODO: support heteroscedastic noise
sf2 = self.scale_factor**2
dA_dbeta = 0.5 * self.N*self.D/self.beta - 0.5 * self.trYYT
dB_dbeta = - 0.5 * self.D * (self.psi0 - np.trace(self.A)/self.beta*sf2)
dC_dbeta = - 0.5 * self.D * np.sum(self.Bi*self.A)/self.beta
dD_dbeta = np.sum((self.C - 0.5 * mdot(self.C,self.psi2_beta_scaled,self.C) ) * self.psi1VVpsi1 )/self.beta
return np.squeeze(dA_dbeta + dB_dbeta + dC_dbeta + dD_dbeta)
def dL_dtheta(self):
"""
Compute and return the derivative of the log marginal likelihood wrt the parameters of the kernel
"""
dL_dtheta = self.kern.dK_dtheta(self.dL_dKmm,self.Z)
if self.has_uncertain_inputs:
dL_dtheta += self.kern.dpsi0_dtheta(self.dL_dpsi0, self.Z,self.X,self.X_uncertainty)
dL_dtheta += self.kern.dpsi1_dtheta(self.dL_dpsi1.T,self.Z,self.X, self.X_uncertainty)
dL_dtheta += self.kern.dpsi2_dtheta(self.dL_dpsi2,self.dL_dpsi1.T, self.Z,self.X, self.X_uncertainty) # for multiple_beta, dL_dpsi2 will be a different shape
else:
#re-cast computations in psi2 back to psi1:
dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2.sum(0),self.psi1)
dL_dtheta += self.kern.dK_dtheta(dL_dpsi1,self.Z,self.X)
dL_dtheta += self.kern.dKdiag_dtheta(self.dL_dpsi0, self.X)
return dL_dtheta
def dL_dZ(self):
"""
The derivative of the bound wrt the inducing inputs Z
"""
        dL_dZ = 2.*self.kern.dK_dX(self.dL_dKmm,self.Z)#factor of two because of vertical and horizontal 'stripes' in dKmm_dZ
if self.has_uncertain_inputs:
dL_dZ += self.kern.dpsi1_dZ(self.dL_dpsi1,self.Z,self.X, self.X_uncertainty)
dL_dZ += 2.*self.kern.dpsi2_dZ(self.dL_dpsi2,self.Z,self.X, self.X_uncertainty) # 'stripes'
else:
#re-cast computations in psi2 back to psi1:
dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2.sum(0),self.psi1)
dL_dZ += self.kern.dK_dX(dL_dpsi1,self.Z,self.X)
return dL_dZ
def _raw_predict(self, Xnew, slices, full_cov=False):
"""Internal helper function for making predictions, does not account for normalisation"""
Kx = self.kern.K(self.Z, Xnew)
mu = mdot(Kx.T, self.C/self.scale_factor, self.psi1V)
if full_cov:
Kxx = self.kern.K(Xnew)
var = Kxx - mdot(Kx.T, (self.Kmmi - self.C/self.scale_factor**2), Kx) + np.eye(Xnew.shape[0])/self.beta # TODO: This beta doesn't belong here in the EP case.
else:
Kxx = self.kern.Kdiag(Xnew)
var = Kxx - np.sum(Kx*np.dot(self.Kmmi - self.C/self.scale_factor**2, Kx),0) + 1./self.beta # TODO: This beta doesn't belong here in the EP case.
return mu,var
def plot(self, *args, **kwargs):
"""
Plot the fitted model: just call the GP_regression plot function and then add inducing inputs
"""
GP_regression.plot(self,*args,**kwargs)
if self.Q==1:
pb.plot(self.Z,self.Z*0+pb.ylim()[0],'k|',mew=1.5,markersize=12)
if self.has_uncertain_inputs:
pb.errorbar(self.X[:,0], pb.ylim()[0]+np.zeros(self.N), xerr=2*np.sqrt(self.X_uncertainty.flatten()))
if self.Q==2:
pb.plot(self.Z[:,0],self.Z[:,1],'wo')
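
For reference, the four terms assembled in log_likelihood in the sparse models above transcribe to the following (a reading of the code, not a derivation; beta is the noise precision, sf the scale factor, psi0 and psi1 the kernel statistics, and A_m, B_m, C_m the matrices stored as self.A, self.B, self.C):

    A = -\tfrac{ND}{2}\left(\log 2\pi - \log\beta\right) - \tfrac{\beta}{2}\,\mathrm{tr}\left(YY^\top\right)
    B = -\tfrac{D}{2}\left(\beta\,\psi_0 - \mathrm{tr}(A_m)\,sf^2\right)
    C = -\tfrac{D}{2}\left(\log\lvert B_m\rvert + M\log sf^2\right)
    D = +\tfrac{1}{2}\,\mathrm{tr}\left(C_m\,(\psi_1 V)(\psi_1 V)^\top\right)

with the bound returned as A + B + C + D.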

View file

@ -6,7 +6,7 @@ import pylab as pb
from ..util.linalg import mdot, jitchol, chol_inv, pdinv from ..util.linalg import mdot, jitchol, chol_inv, pdinv
from ..util.plot import gpplot from ..util.plot import gpplot
from .. import kern from .. import kern
from ..inference.likelihoods import likelihood from ..likelihoods import likelihood
from sparse_GP_regression import sparse_GP_regression from sparse_GP_regression import sparse_GP_regression
class uncollapsed_sparse_GP(sparse_GP_regression): class uncollapsed_sparse_GP(sparse_GP_regression):
@ -136,8 +136,8 @@ class uncollapsed_sparse_GP(sparse_GP_regression):
#dL_dm = np.dot(self.Kmmi,self.psi1V) - np.dot(self.Lambda,self.q_u_mean) #dL_dm = np.dot(self.Kmmi,self.psi1V) - np.dot(self.Lambda,self.q_u_mean)
dL_dm = np.dot(self.Kmmi,self.psi1V) - self.q_u_canonical[0] dL_dm = np.dot(self.Kmmi,self.psi1V) - self.q_u_canonical[0]
#dL_dSim = #dL_dSim =
#dL_dmhSi = #dL_dmhSi =
return np.hstack((dL_dm.flatten(),dL_dmmT_S.flatten())) # natgrad only, grad TODO return np.hstack((dL_dm.flatten(),dL_dmmT_S.flatten())) # natgrad only, grad TODO
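
The changed line stores q(u) by its canonical parameters, so q_u_canonical[0] plays the role of \Lambda m (\Lambda presumably the precision of q(u), as the replaced expression suggests); the gradient of the bound with respect to the mean then reads

    \frac{\partial L}{\partial m} = K_{mm}^{-1}\,\Psi_1 V - \Lambda m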

View file

@ -154,17 +154,16 @@ class GradientTests(unittest.TestCase):
m.constrain_positive('(linear|bias|white)') m.constrain_positive('(linear|bias|white)')
self.assertTrue(m.checkgrad()) self.assertTrue(m.checkgrad())
def test_GP_EP(self): def test_GP_EP_probit(self):
return # Disabled TODO
N = 20 N = 20
X = np.hstack([np.random.rand(N/2)+1,np.random.rand(N/2)-1])[:,None] X = np.hstack([np.random.normal(5,2,N/2),np.random.normal(10,2,N/2)])[:,None]
k = GPy.kern.rbf(1) + GPy.kern.white(1) Y = np.hstack([np.ones(N/2),np.repeat(-1,N/2)])[:,None]
Y = np.hstack([np.ones(N/2),-np.ones(N/2)])[:,None] kernel = GPy.kern.rbf(1)
likelihood = GPy.inference.likelihoods.probit(Y) distribution = GPy.likelihoods.likelihood_functions.probit()
m = GPy.models.GP_EP(X,likelihood,k) likelihood = GPy.likelihoods.EP(Y, distribution)
m.constrain_positive('(var|len)') m = GPy.models.GP(X, likelihood, kernel)
m.approximate_likelihood() m.ensure_default_constraints()
self.assertTrue(m.checkgrad()) self.assertTrue(m.EPEM)
@unittest.skip("FITC will be broken for a while") @unittest.skip("FITC will be broken for a while")
def test_generalized_FITC(self): def test_generalized_FITC(self):
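
The rewritten probit test above exercises the new likelihoods package end to end; the same route in isolation looks like this (a sketch mirroring the test with made-up data; EPEM is the attribute the assertion touches):

    import numpy as np
    import GPy

    N = 20
    X = np.hstack([np.random.normal(5, 2, N/2), np.random.normal(10, 2, N/2)])[:, None]
    Y = np.hstack([np.ones(N/2), -np.ones(N/2)])[:, None]

    kernel = GPy.kern.rbf(1)
    distribution = GPy.likelihoods.likelihood_functions.probit()
    likelihood = GPy.likelihoods.EP(Y, distribution)
    m = GPy.models.GP(X, likelihood, kernel)
    m.ensure_default_constraints()
    # the test then asserts m.EPEM, the alternating EP / optimisation driver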

View file

@ -3,7 +3,6 @@
import matplotlib as mpl import matplotlib as mpl
import pylab as pb import pylab as pb
import sys import sys
#sys.path.append('/home/james/mlprojects/sitran_cluster/') #sys.path.append('/home/james/mlprojects/sitran_cluster/')
@ -15,12 +14,12 @@ def removeRightTicks(ax=None):
ax = ax or pb.gca() ax = ax or pb.gca()
for i, line in enumerate(ax.get_yticklines()): for i, line in enumerate(ax.get_yticklines()):
if i%2 == 1: # odd indices if i%2 == 1: # odd indices
line.set_visible(False) line.set_visible(False)
def removeUpperTicks(ax=None): def removeUpperTicks(ax=None):
ax = ax or pb.gca() ax = ax or pb.gca()
for i, line in enumerate(ax.get_xticklines()): for i, line in enumerate(ax.get_xticklines()):
if i%2 == 1: # odd indices if i%2 == 1: # odd indices
line.set_visible(False) line.set_visible(False)
def fewerXticks(ax=None,divideby=2): def fewerXticks(ax=None,divideby=2):
ax = ax or pb.gca() ax = ax or pb.gca()
ax.set_xticks(ax.get_xticks()[::divideby]) ax.set_xticks(ax.get_xticks()[::divideby])
@ -126,8 +125,6 @@ cdict_RB = {'red' :((0.,coloursRGB['mediumRed'][0]/256.,coloursRGB['mediumRed'][
'blue':((0.,coloursRGB['mediumRed'][2]/256.,coloursRGB['mediumRed'][2]/256.), 'blue':((0.,coloursRGB['mediumRed'][2]/256.,coloursRGB['mediumRed'][2]/256.),
(.5,coloursRGB['mediumPurple'][2]/256.,coloursRGB['mediumPurple'][2]/256.), (.5,coloursRGB['mediumPurple'][2]/256.,coloursRGB['mediumPurple'][2]/256.),
(1.,coloursRGB['mediumBlue'][2]/256.,coloursRGB['mediumBlue'][2]/256.))} (1.,coloursRGB['mediumBlue'][2]/256.,coloursRGB['mediumBlue'][2]/256.))}
cmap_RB = mpl.colors.LinearSegmentedColormap('TangoRedBlue',cdict_RB,256)
cdict_BGR = {'red' :((0.,coloursRGB['mediumBlue'][0]/256.,coloursRGB['mediumBlue'][0]/256.), cdict_BGR = {'red' :((0.,coloursRGB['mediumBlue'][0]/256.,coloursRGB['mediumBlue'][0]/256.),
(.5,coloursRGB['mediumGreen'][0]/256.,coloursRGB['mediumGreen'][0]/256.), (.5,coloursRGB['mediumGreen'][0]/256.,coloursRGB['mediumGreen'][0]/256.),
@ -138,7 +135,7 @@ cdict_BGR = {'red' :((0.,coloursRGB['mediumBlue'][0]/256.,coloursRGB['mediumBlue
'blue':((0.,coloursRGB['mediumBlue'][2]/256.,coloursRGB['mediumBlue'][2]/256.), 'blue':((0.,coloursRGB['mediumBlue'][2]/256.,coloursRGB['mediumBlue'][2]/256.),
(.5,coloursRGB['mediumGreen'][2]/256.,coloursRGB['mediumGreen'][2]/256.), (.5,coloursRGB['mediumGreen'][2]/256.,coloursRGB['mediumGreen'][2]/256.),
(1.,coloursRGB['mediumRed'][2]/256.,coloursRGB['mediumRed'][2]/256.))} (1.,coloursRGB['mediumRed'][2]/256.,coloursRGB['mediumRed'][2]/256.))}
cmap_BGR = mpl.colors.LinearSegmentedColormap('TangoRedBlue',cdict_BGR,256)
cdict_Alu = {'red' :((0./5,coloursRGB['Aluminium1'][0]/256.,coloursRGB['Aluminium1'][0]/256.), cdict_Alu = {'red' :((0./5,coloursRGB['Aluminium1'][0]/256.,coloursRGB['Aluminium1'][0]/256.),
(1./5,coloursRGB['Aluminium2'][0]/256.,coloursRGB['Aluminium2'][0]/256.), (1./5,coloursRGB['Aluminium2'][0]/256.,coloursRGB['Aluminium2'][0]/256.),
@ -158,13 +155,12 @@ cdict_Alu = {'red' :((0./5,coloursRGB['Aluminium1'][0]/256.,coloursRGB['Aluminiu
(3./5,coloursRGB['Aluminium4'][2]/256.,coloursRGB['Aluminium4'][2]/256.), (3./5,coloursRGB['Aluminium4'][2]/256.,coloursRGB['Aluminium4'][2]/256.),
(4./5,coloursRGB['Aluminium5'][2]/256.,coloursRGB['Aluminium5'][2]/256.), (4./5,coloursRGB['Aluminium5'][2]/256.,coloursRGB['Aluminium5'][2]/256.),
(5./5,coloursRGB['Aluminium6'][2]/256.,coloursRGB['Aluminium6'][2]/256.))} (5./5,coloursRGB['Aluminium6'][2]/256.,coloursRGB['Aluminium6'][2]/256.))}
cmap_Alu = mpl.colors.LinearSegmentedColormap('TangoAluminium',cdict_Alu,256) # cmap_Alu = mpl.colors.LinearSegmentedColormap('TangoAluminium',cdict_Alu,256)
# cmap_BGR = mpl.colors.LinearSegmentedColormap('TangoRedBlue',cdict_BGR,256)
# cmap_RB = mpl.colors.LinearSegmentedColormap('TangoRedBlue',cdict_RB,256)
if __name__=='__main__': if __name__=='__main__':
import pylab as pb import pylab as pb
pb.figure() pb.figure()
pb.pcolor(pb.rand(10,10),cmap=cmap_RB) pb.pcolor(pb.rand(10,10),cmap=cmap_RB)
pb.colorbar() pb.colorbar()
pb.show() pb.show()

View file

@ -6,30 +6,26 @@ import Tango
import pylab as pb import pylab as pb
import numpy as np import numpy as np
def gpplot(x,mu,var,edgecol=Tango.coloursHex['darkBlue'],fillcol=Tango.coloursHex['lightBlue'],axes=None,**kwargs): def gpplot(x,mu,lower,upper,edgecol=Tango.coloursHex['darkBlue'],fillcol=Tango.coloursHex['lightBlue'],axes=None,**kwargs):
if axes is None: if axes is None:
axes = pb.gca() axes = pb.gca()
mu = mu.flatten() mu = mu.flatten()
x = x.flatten() x = x.flatten()
lower = lower.flatten()
upper = upper.flatten()
#here's the mean #here's the mean
axes.plot(x,mu,color=edgecol,linewidth=2) axes.plot(x,mu,color=edgecol,linewidth=2)
#ensure variance is a vector #here's the box
if len(var.shape)>1:
err = 2*np.sqrt(np.diag(var))
else:
err = 2*np.sqrt(var)
#here's the 2*std box
kwargs['linewidth']=0.5 kwargs['linewidth']=0.5
if not 'alpha' in kwargs.keys(): if not 'alpha' in kwargs.keys():
kwargs['alpha'] = 0.3 kwargs['alpha'] = 0.3
axes.fill(np.hstack((x,x[::-1])),np.hstack((mu+err,mu[::-1]-err[::-1])),color=fillcol,**kwargs) axes.fill(np.hstack((x,x[::-1])),np.hstack((upper,lower[::-1])),color=fillcol,**kwargs)
#this is the edge: #this is the edge:
axes.plot(x,mu+err,color=edgecol,linewidth=0.2) axes.plot(x,upper,color=edgecol,linewidth=0.2)
axes.plot(x,mu-err,color=edgecol,linewidth=0.2) axes.plot(x,lower,color=edgecol,linewidth=0.2)
def removeRightTicks(ax=None): def removeRightTicks(ax=None):
ax = ax or pb.gca() ax = ax or pb.gca()
@ -74,4 +70,36 @@ def align_subplots(N,M,xlim=None, ylim=None):
else: else:
removeUpperTicks() removeUpperTicks()
def x_frame1D(X,plot_limits=None,resolution=None):
"""
Internal helper function for making plots, returns a set of input values to plot as well as lower and upper limits
"""
assert X.shape[1] ==1, "x_frame1D is defined for one-dimensional inputs"
if plot_limits is None:
xmin,xmax = X.min(0),X.max(0)
xmin, xmax = xmin-0.2*(xmax-xmin), xmax+0.2*(xmax-xmin)
elif len(plot_limits)==2:
xmin, xmax = plot_limits
else:
raise ValueError, "Bad limits for plotting"
Xnew = np.linspace(xmin,xmax,resolution or 200)[:,None]
return Xnew, xmin, xmax
def x_frame2D(X,plot_limits=None,resolution=None):
"""
Internal helper function for making plots, returns a set of input values to plot as well as lower and upper limits
"""
assert X.shape[1] ==2, "x_frame2D is defined for two-dimensional inputs"
if plot_limits is None:
xmin,xmax = X.min(0),X.max(0)
xmin, xmax = xmin-0.2*(xmax-xmin), xmax+0.2*(xmax-xmin)
elif len(plot_limits)==2:
xmin, xmax = plot_limits
else:
raise ValueError, "Bad limits for plotting"
resolution = resolution or 50
xx,yy = np.mgrid[xmin[0]:xmax[0]:1j*resolution,xmin[1]:xmax[1]:1j*resolution]
Xnew = np.vstack((xx.flatten(),yy.flatten())).T
return Xnew, xx, yy, xmin, xmax
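
With the signature change to gpplot above, callers now compute the credible band themselves and pass explicit lower and upper curves; combined with the new x_frame1D helper, a plotting path looks roughly like this (that m.predict returns a mean and variance is an assumption about the model API):

    import numpy as np
    from GPy.util.plot import gpplot, x_frame1D

    Xnew, xmin, xmax = x_frame1D(X)    # 200-point grid, padded 20% beyond the data
    mu, var = m.predict(Xnew)          # assumed: posterior mean and variance at Xnew
    err = 2 * np.sqrt(var.flatten())
    gpplot(Xnew, mu, mu.flatten() - err, mu.flatten() + err)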

View file

@ -1,7 +1,7 @@
GPy GPy
=== ===
A Gaussian processes framework in python. A Gaussian processes framework in python
* [Online documentation](https://gpy.readthedocs.org/en/latest/) * [Online documentation](https://gpy.readthedocs.org/en/latest/)
* [Unit tests (Travis-CI)](https://travis-ci.org/SheffieldML/GPy) * [Unit tests (Travis-CI)](https://travis-ci.org/SheffieldML/GPy)

BIN
doc/Figures/kern-def.png Normal file

Binary file not shown. (new image, 38 KiB)

[Eight further binary image files changed; contents not shown: three modified (30 KiB → 32 KiB, 45 KiB → 51 KiB, 78 KiB → 77 KiB) and five added (63 KiB, 129 KiB, 38 KiB, 55 KiB, 84 KiB).]

View file

@ -33,6 +33,14 @@ examples Package
:undoc-members: :undoc-members:
:show-inheritance: :show-inheritance:
:mod:`poisson` Module
---------------------
.. automodule:: GPy.examples.poisson
:members:
:undoc-members:
:show-inheritance:
:mod:`regression` Module :mod:`regression` Module
------------------------ ------------------------
@ -57,6 +65,14 @@ examples Package
:undoc-members: :undoc-members:
:show-inheritance: :show-inheritance:
:mod:`sparse_ep_fix` Module
---------------------------
.. automodule:: GPy.examples.sparse_ep_fix
:members:
:undoc-members:
:show-inheritance:
:mod:`uncertain_input_GP_regression_demo` Module :mod:`uncertain_input_GP_regression_demo` Module
------------------------------------------------ ------------------------------------------------

View file

@ -1,22 +1,6 @@
inference Package inference Package
================= =================
:mod:`Expectation_Propagation` Module
-------------------------------------
.. automodule:: GPy.inference.Expectation_Propagation
:members:
:undoc-members:
:show-inheritance:
:mod:`likelihoods` Module
-------------------------
.. automodule:: GPy.inference.likelihoods
:members:
:undoc-members:
:show-inheritance:
:mod:`optimization` Module :mod:`optimization` Module
-------------------------- --------------------------

doc/GPy.likelihoods.rst Normal file (43 lines)
View file

@ -0,0 +1,43 @@
likelihoods Package
===================
:mod:`likelihoods` Package
--------------------------
.. automodule:: GPy.likelihoods
:members:
:undoc-members:
:show-inheritance:
:mod:`EP` Module
----------------
.. automodule:: GPy.likelihoods.EP
:members:
:undoc-members:
:show-inheritance:
:mod:`Gaussian` Module
----------------------
.. automodule:: GPy.likelihoods.Gaussian
:members:
:undoc-members:
:show-inheritance:
:mod:`likelihood` Module
------------------------
.. automodule:: GPy.likelihoods.likelihood
:members:
:undoc-members:
:show-inheritance:
:mod:`likelihood_functions` Module
----------------------------------
.. automodule:: GPy.likelihoods.likelihood_functions
:members:
:undoc-members:
:show-inheritance:

View file

@ -17,18 +17,18 @@ models Package
:undoc-members: :undoc-members:
:show-inheritance: :show-inheritance:
:mod:`GPLVM` Module :mod:`GP` Module
------------------- ----------------
.. automodule:: GPy.models.GPLVM .. automodule:: GPy.models.GP
:members: :members:
:undoc-members: :undoc-members:
:show-inheritance: :show-inheritance:
:mod:`GP_EP` Module :mod:`GPLVM` Module
------------------- -------------------
.. automodule:: GPy.models.GP_EP .. automodule:: GPy.models.GPLVM
:members: :members:
:undoc-members: :undoc-members:
:show-inheritance: :show-inheritance:
@ -41,10 +41,10 @@ models Package
:undoc-members: :undoc-members:
:show-inheritance: :show-inheritance:
:mod:`generalized_FITC` Module :mod:`sparse_GP` Module
------------------------------ -----------------------
.. automodule:: GPy.models.generalized_FITC .. automodule:: GPy.models.sparse_GP
:members: :members:
:undoc-members: :undoc-members:
:show-inheritance: :show-inheritance:

View file

@ -18,6 +18,7 @@ Subpackages
GPy.examples GPy.examples
GPy.inference GPy.inference
GPy.kern GPy.kern
GPy.likelihoods
GPy.models GPy.models
GPy.util GPy.util

View file

@ -41,6 +41,7 @@ help:
clean: clean:
-rm -rf $(BUILDDIR)/* -rm -rf $(BUILDDIR)/*
html: html:
$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
@echo @echo

View file

@ -11,12 +11,50 @@
# All configuration values have a default; values that are commented out # All configuration values have a default; values that are commented out
# serve to show the default. # serve to show the default.
import sys, os import sys
import os
print "python exec:", sys.executable
print "sys.path:", sys.path
try:
import numpy
print "numpy: %s, %s" % (numpy.__version__, numpy.__file__)
except ImportError:
print "no numpy"
try:
import matplotlib
print "matplotlib: %s, %s" % (matplotlib.__version__, matplotlib.__file__)
except ImportError:
print "no matplotlib"
try:
    import IPython
    print "IPython: %s, %s" % (IPython.__version__, IPython.__file__)
except ImportError:
print "no ipython"
try:
import sphinx
print "sphinx: %s, %s" % (sphinx.__version__, sphinx.__file__)
except ImportError:
print "no sphinx"
print "sys.path:", sys.path
# If extensions (or modules to document with autodoc) are in another directory, # If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the # add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here. # documentation root, use os.path.abspath to make it absolute, like shown here.
#sys.path.insert(0, os.path.abspath('.')) #sys.path.insert(0, os.path.abspath('../GPy'))
#print "sys.path.after:", sys.path
# If your extensions are in another directory, add it here. If the directory
# is relative to the documentation root, use os.path.abspath to make it
# absolute, like shown here.
sys.path.append(os.path.abspath('sphinxext'))
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#sys.path.insert(0, os.path.abspath('./sphinxext'))
# -- General configuration ----------------------------------------------------- # -- General configuration -----------------------------------------------------
@ -25,15 +63,77 @@ import sys, os
# Add any Sphinx extension module names here, as strings. They can be extensions # Add any Sphinx extension module names here, as strings. They can be extensions
# coming with Sphinx (named 'sphinx.ext.*') or your custom ones. # coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
extensions = ['sphinx.ext.autodoc', 'sphinx.ext.viewcode'] print "Importing extensions"
extensions = ['sphinx.ext.autodoc',
#'sphinx.ext.doctest'
'sphinx.ext.viewcode',
'sphinx.ext.pngmath',
'ipython_directive',
'ipython_console_highlighting'
#'matplotlib.sphinxext.plot_directive'
]
plot_formats = [('png', 80), ('pdf', 50)]
print "finished importing"
##############################################################################
##
## Mock out imports with C dependencies because ReadTheDocs can't build them.
#############################################################################
class Mock(object):
def __init__(self, *args, **kwargs):
pass
def __call__(self, *args, **kwargs):
return Mock()
@classmethod
def __getattr__(cls, name):
if name in ('__file__', '__path__'):
return '/dev/null'
elif name[0] == name[0].upper():
mockType = type(name, (), {})
mockType.__module__ = __name__
return mockType
else:
return Mock()
#import mock
print "Mocking"
MOCK_MODULES = ['pylab', 'sympy', 'sympy.utilities', 'sympy.utilities.codegen', 'sympy.core.cache', 'sympy.core', 'sympy.parsing', 'sympy.parsing.sympy_parser', 'matplotlib']
#'matplotlib', 'matplotlib.color', 'matplotlib.pyplot', 'pylab' ]
for mod_name in MOCK_MODULES:
sys.modules[mod_name] = Mock()
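# With the loop above in place, each mocked import resolves to a Mock instance,
# so autodoc can import GPy on ReadTheDocs without the C-backed dependencies.
# Illustration only:
#   import pylab        # succeeds; pylab is a Mock()
#   pylab.plot(1, 2)    # returns another Mock() instead of drawing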
# ----------------------- READTHEDOCS ------------------ # ----------------------- READTHEDOCS ------------------
on_rtd = os.environ.get('READTHEDOCS', None) == 'True' on_rtd = os.environ.get('READTHEDOCS', None) == 'True'
on_rtd = True
if on_rtd: if on_rtd:
sys.path.append("../GPy") sys.path.append(os.path.abspath('../GPy'))
os.system("pwd")
os.system("sphinx-apidoc -f -o . ../GPy") import subprocess
proc = subprocess.Popen("pwd", stdout=subprocess.PIPE, shell=True)
(out, err) = proc.communicate()
print "program output:", out
proc = subprocess.Popen("ls ../", stdout=subprocess.PIPE, shell=True)
(out, err) = proc.communicate()
print "program output:", out
proc = subprocess.Popen("sphinx-apidoc -f -o . ../GPy", stdout=subprocess.PIPE, shell=True)
(out, err) = proc.communicate()
print "program output:", out
#proc = subprocess.Popen("whereis numpy", stdout=subprocess.PIPE, shell=True)
#(out, err) = proc.communicate()
#print "program output:", out
#proc = subprocess.Popen("whereis matplotlib", stdout=subprocess.PIPE, shell=True)
#(out, err) = proc.communicate()
#print "program output:", out
print "Compiled files"
# Add any paths that contain templates here, relative to this directory. # Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates'] templates_path = ['_templates']
@ -181,21 +281,21 @@ htmlhelp_basename = 'GPydoc'
# -- Options for LaTeX output -------------------------------------------------- # -- Options for LaTeX output --------------------------------------------------
latex_elements = { latex_elements = {
# The paper size ('letterpaper' or 'a4paper'). # The paper size ('letterpaper' or 'a4paper').
#'papersize': 'letterpaper', #'papersize': 'letterpaper',
# The font size ('10pt', '11pt' or '12pt'). # The font size ('10pt', '11pt' or '12pt').
#'pointsize': '10pt', #'pointsize': '10pt',
# Additional stuff for the LaTeX preamble. # Additional stuff for the LaTeX preamble.
#'preamble': '', #'preamble': '',
} }
# Grouping the document tree into LaTeX files. List of tuples # Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title, author, documentclass [howto/manual]). # (source start file, target name, title, author, documentclass [howto/manual]).
latex_documents = [ latex_documents = [
('index', 'GPy.tex', u'GPy Documentation', ('index', 'GPy.tex', u'GPy Documentation',
u'Author', 'manual'), u'Author', 'manual'),
] ]
# The name of an image file (relative to this directory) to place at the top of # The name of an image file (relative to this directory) to place at the top of
@ -238,9 +338,9 @@ man_pages = [
# (source start file, target name, title, author, # (source start file, target name, title, author,
# dir menu entry, description, category) # dir menu entry, description, category)
texinfo_documents = [ texinfo_documents = [
('index', 'GPy', u'GPy Documentation', ('index', 'GPy', u'GPy Documentation',
u'Author', 'GPy', 'One line description of project.', u'Author', 'GPy', 'One line description of project.',
'Miscellaneous'), 'Miscellaneous'),
] ]
# Documents to append as an appendix to all manuals. # Documents to append as an appendix to all manuals.
@ -294,3 +394,5 @@ epub_copyright = u'2013, Author'
# Allow duplicate toc entries. # Allow duplicate toc entries.
#epub_tocdup = True #epub_tocdup = True
autodoc_member_order = "source"

doc/doc-requirements.txt Normal file (3 lines)
View file

@ -0,0 +1,3 @@
ipython
numpy
scipy

View file

@ -8,8 +8,8 @@ Welcome to GPy's documentation!
For a quick start, you can have a look at one of the tutorials: For a quick start, you can have a look at one of the tutorials:
* `Basic Gaussian process regression <tuto_GP_regression.html>`_ * `Basic Gaussian process regression <tuto_GP_regression.html>`_
* `A kernel overview <tuto_kernel_overview.html>`_
* Advanced GP regression (Forthcoming) * Advanced GP regression (Forthcoming)
* Kernel manipulation (Forthcoming)
* Writing kernels (Forthcoming) * Writing kernels (Forthcoming)
You may also be interested by some examples in the GPy/examples folder. You may also be interested by some examples in the GPy/examples folder.
@ -28,4 +28,3 @@ Indices and tables
* :ref:`genindex` * :ref:`genindex`
* :ref:`modindex` * :ref:`modindex`
* :ref:`search` * :ref:`search`

View file

@ -0,0 +1,115 @@
"""reST directive for syntax-highlighting ipython interactive sessions.
XXX - See what improvements can be made based on the new (as of Sept 2009)
'pycon' lexer for the python console. At the very least it will give better
highlighted tracebacks.
"""
#-----------------------------------------------------------------------------
# Needed modules
# Standard library
import re
# Third party
from pygments.lexer import Lexer, do_insertions
from pygments.lexers.agile import (PythonConsoleLexer, PythonLexer,
PythonTracebackLexer)
from pygments.token import Comment, Generic
from sphinx import highlighting
#-----------------------------------------------------------------------------
# Global constants
line_re = re.compile('.*?\n')
#-----------------------------------------------------------------------------
# Code begins - classes and functions
class IPythonConsoleLexer(Lexer):
"""
For IPython console output or doctests, such as:
.. sourcecode:: ipython
In [1]: a = 'foo'
In [2]: a
Out[2]: 'foo'
In [3]: print a
foo
In [4]: 1 / 0
Notes:
- Tracebacks are not currently supported.
- It assumes the default IPython prompts, not customized ones.
"""
name = 'IPython console session'
aliases = ['ipython']
mimetypes = ['text/x-ipython-console']
input_prompt = re.compile("(In \[[0-9]+\]: )|( \.\.\.+:)")
output_prompt = re.compile("(Out\[[0-9]+\]: )|( \.\.\.+:)")
continue_prompt = re.compile(" \.\.\.+:")
tb_start = re.compile("\-+")
def get_tokens_unprocessed(self, text):
pylexer = PythonLexer(**self.options)
tblexer = PythonTracebackLexer(**self.options)
curcode = ''
insertions = []
for match in line_re.finditer(text):
line = match.group()
input_prompt = self.input_prompt.match(line)
continue_prompt = self.continue_prompt.match(line.rstrip())
output_prompt = self.output_prompt.match(line)
if line.startswith("#"):
insertions.append((len(curcode),
[(0, Comment, line)]))
elif input_prompt is not None:
insertions.append((len(curcode),
[(0, Generic.Prompt, input_prompt.group())]))
curcode += line[input_prompt.end():]
elif continue_prompt is not None:
insertions.append((len(curcode),
[(0, Generic.Prompt, continue_prompt.group())]))
curcode += line[continue_prompt.end():]
elif output_prompt is not None:
# Use the 'error' token for output. We should probably make
                # our own token, but error is typically in a bright color like
# red, so it works fine for our output prompts.
insertions.append((len(curcode),
[(0, Generic.Error, output_prompt.group())]))
curcode += line[output_prompt.end():]
else:
if curcode:
for item in do_insertions(insertions,
pylexer.get_tokens_unprocessed(curcode)):
yield item
curcode = ''
insertions = []
yield match.start(), Generic.Output, line
if curcode:
for item in do_insertions(insertions,
pylexer.get_tokens_unprocessed(curcode)):
yield item
def setup(app):
"""Setup as a sphinx extension."""
# This is only a lexer, so adding it below to pygments appears sufficient.
# But if somebody knows that the right API usage should be to do that via
# sphinx, by all means fix it here. At least having this setup.py
# suppresses the sphinx warning we'd get without it.
pass
#-----------------------------------------------------------------------------
# Register the extension as a valid pygments lexer
highlighting.lexers['ipython'] = IPythonConsoleLexer()
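
Registered this way, the lexer is available to any `sourcecode:: ipython` block Sphinx highlights. A standalone smoke test with plain pygments would look like this (illustrative only; highlight and TerminalFormatter are standard pygments API):

    from pygments import highlight
    from pygments.formatters import TerminalFormatter

    session = "In [1]: a = 'foo'\nOut[1]: 'foo'\n"
    print highlight(session, IPythonConsoleLexer(), TerminalFormatter())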

View file

@ -0,0 +1,835 @@
# -*- coding: utf-8 -*-
"""Sphinx directive to support embedded IPython code.
This directive allows pasting of entire interactive IPython sessions, prompts
and all, and their code will actually get re-executed at doc build time, with
all prompts renumbered sequentially. It also allows you to input code as a pure
python input by giving the argument python to the directive. The output looks
like an interactive ipython section.
To enable this directive, simply list it in your Sphinx ``conf.py`` file
(making sure the directory where you placed it is visible to sphinx, as is
needed for all Sphinx directives).
By default this directive assumes that your prompts are unchanged IPython ones,
but this can be customized. The configurable options that can be placed in
conf.py are
ipython_savefig_dir:
The directory in which to save the figures. This is relative to the
Sphinx source directory. The default is `html_static_path`.
ipython_rgxin:
The compiled regular expression to denote the start of IPython input
lines. The default is re.compile('In \[(\d+)\]:\s?(.*)\s*'). You
shouldn't need to change this.
ipython_rgxout:
The compiled regular expression to denote the start of IPython output
lines. The default is re.compile('Out\[(\d+)\]:\s?(.*)\s*'). You
shouldn't need to change this.
ipython_promptin:
The string to represent the IPython input prompt in the generated ReST.
The default is 'In [%d]:'. This expects that the line numbers are used
in the prompt.
ipython_promptout:
The string to represent the IPython prompt in the generated ReST. The
default is 'Out [%d]:'. This expects that the line numbers are used
in the prompt.
ToDo
----
- Turn the ad-hoc test() function into a real test suite.
- Break up ipython-specific functionality from matplotlib stuff into better
separated code.
Authors
-------
- John D Hunter: original author.
- Fernando Perez: refactoring, documentation, cleanups, port to 0.11.
- Václav Šmilauer <eudoxos-AT-arcig.cz>: Prompt generalizations.
- Skipper Seabold, refactoring, cleanups, pure python addition
"""
#-----------------------------------------------------------------------------
# Imports
#-----------------------------------------------------------------------------
# Stdlib
import cStringIO
import os
import re
import sys
import tempfile
import ast
# To keep compatibility with various python versions
try:
from hashlib import md5
except ImportError:
from md5 import md5
# Third-party
try:
import matplotlib
matplotlib.use('Agg')
except ImportError:
print "Couldn't find matplotlib"
import sphinx
from docutils.parsers.rst import directives
from docutils import nodes
from sphinx.util.compat import Directive
# Our own
from IPython import Config, InteractiveShell
from IPython.core.profiledir import ProfileDir
from IPython.utils import io
#-----------------------------------------------------------------------------
# Globals
#-----------------------------------------------------------------------------
# for tokenizing blocks
COMMENT, INPUT, OUTPUT = range(3)
#-----------------------------------------------------------------------------
# Functions and class declarations
#-----------------------------------------------------------------------------
def block_parser(part, rgxin, rgxout, fmtin, fmtout):
"""
part is a string of ipython text, comprised of at most one
    input, one output, comments, and blank lines. The block parser
parses the text into a list of::
blocks = [ (TOKEN0, data0), (TOKEN1, data1), ...]
where TOKEN is one of [COMMENT | INPUT | OUTPUT ] and
data is, depending on the type of token::
COMMENT : the comment string
INPUT: the (DECORATOR, INPUT_LINE, REST) where
DECORATOR: the input decorator (or None)
INPUT_LINE: the input as string (possibly multi-line)
REST : any stdout generated by the input line (not OUTPUT)
OUTPUT: the output string, possibly multi-line
"""
block = []
lines = part.split('\n')
N = len(lines)
i = 0
decorator = None
while 1:
if i==N:
# nothing left to parse -- the last line
break
line = lines[i]
i += 1
line_stripped = line.strip()
if line_stripped.startswith('#'):
block.append((COMMENT, line))
continue
if line_stripped.startswith('@'):
# we're assuming at most one decorator -- may need to
# rethink
decorator = line_stripped
continue
# does this look like an input line?
matchin = rgxin.match(line)
if matchin:
lineno, inputline = int(matchin.group(1)), matchin.group(2)
# the ....: continuation string
continuation = ' %s:'%''.join(['.']*(len(str(lineno))+2))
Nc = len(continuation)
# input lines can continue on for more than one line, if
# we have a '\' line continuation char or a function call
# echo line 'print'. The input line can only be
# terminated by the end of the block or an output line, so
# we parse out the rest of the input line if it is
# multiline as well as any echo text
rest = []
while i<N:
# look ahead; if the next line is blank, or a comment, or
# an output line, we're done
nextline = lines[i]
matchout = rgxout.match(nextline)
#print "nextline=%s, continuation=%s, starts=%s"%(nextline, continuation, nextline.startswith(continuation))
if matchout or nextline.startswith('#'):
break
elif nextline.startswith(continuation):
inputline += '\n' + nextline[Nc:]
else:
rest.append(nextline)
i+= 1
block.append((INPUT, (decorator, inputline, '\n'.join(rest))))
continue
# if it looks like an output line grab all the text to the end
# of the block
matchout = rgxout.match(line)
if matchout:
lineno, output = int(matchout.group(1)), matchout.group(2)
if i<N-1:
output = '\n'.join([output] + lines[i:])
block.append((OUTPUT, output))
break
return block
class EmbeddedSphinxShell(object):
"""An embedded IPython instance to run inside Sphinx"""
def __init__(self):
self.cout = cStringIO.StringIO()
# Create config object for IPython
config = Config()
config.Global.display_banner = False
config.Global.exec_lines = ['import numpy as np',
'from pylab import *'
]
config.InteractiveShell.autocall = False
config.InteractiveShell.autoindent = False
config.InteractiveShell.colors = 'NoColor'
# create a profile so instance history isn't saved
tmp_profile_dir = tempfile.mkdtemp(prefix='profile_')
profname = 'auto_profile_sphinx_build'
pdir = os.path.join(tmp_profile_dir,profname)
profile = ProfileDir.create_profile_dir(pdir)
# Create and initialize ipython, but don't start its mainloop
IP = InteractiveShell.instance(config=config, profile_dir=profile)
# io.stdout redirect must be done *after* instantiating InteractiveShell
io.stdout = self.cout
io.stderr = self.cout
# For debugging, so we can see normal output, use this:
#from IPython.utils.io import Tee
#io.stdout = Tee(self.cout, channel='stdout') # dbg
#io.stderr = Tee(self.cout, channel='stderr') # dbg
# Store a few parts of IPython we'll need.
self.IP = IP
self.user_ns = self.IP.user_ns
self.user_global_ns = self.IP.user_global_ns
self.input = ''
self.output = ''
self.is_verbatim = False
self.is_doctest = False
self.is_suppress = False
# on the first call to the savefig decorator, we'll import
# pyplot as plt so we can make a call to the plt.gcf().savefig
self._pyplot_imported = False
def clear_cout(self):
self.cout.seek(0)
self.cout.truncate(0)
def process_input_line(self, line, store_history=True):
"""process the input, capturing stdout"""
#print "input='%s'"%self.input
stdout = sys.stdout
splitter = self.IP.input_splitter
try:
sys.stdout = self.cout
splitter.push(line)
more = splitter.push_accepts_more()
if not more:
source_raw = splitter.source_raw_reset()[1]
self.IP.run_cell(source_raw, store_history=store_history)
finally:
sys.stdout = stdout
def process_image(self, decorator):
"""
# build out an image directive like
# .. image:: somefile.png
# :width 4in
#
# from an input like
# savefig somefile.png width=4in
"""
savefig_dir = self.savefig_dir
source_dir = self.source_dir
saveargs = decorator.split(' ')
filename = saveargs[1]
# insert relative path to image file in source
outfile = os.path.relpath(os.path.join(savefig_dir,filename),
source_dir)
imagerows = ['.. image:: %s'%outfile]
for kwarg in saveargs[2:]:
arg, val = kwarg.split('=')
arg = arg.strip()
val = val.strip()
imagerows.append(' :%s: %s'%(arg, val))
image_file = os.path.basename(outfile) # only return file name
image_directive = '\n'.join(imagerows)
return image_file, image_directive
# Callbacks for each type of token
def process_input(self, data, input_prompt, lineno):
"""Process data block for INPUT token."""
decorator, input, rest = data
image_file = None
image_directive = None
#print 'INPUT:', data # dbg
is_verbatim = decorator=='@verbatim' or self.is_verbatim
is_doctest = decorator=='@doctest' or self.is_doctest
is_suppress = decorator=='@suppress' or self.is_suppress
is_savefig = decorator is not None and \
decorator.startswith('@savefig')
input_lines = input.split('\n')
if len(input_lines) > 1:
if input_lines[-1] != "":
input_lines.append('') # make sure there's a blank line
# so splitter buffer gets reset
continuation = ' %s:'%''.join(['.']*(len(str(lineno))+2))
Nc = len(continuation)
if is_savefig:
image_file, image_directive = self.process_image(decorator)
ret = []
is_semicolon = False
for i, line in enumerate(input_lines):
if line.endswith(';'):
is_semicolon = True
if i==0:
# process the first input line
if is_verbatim:
self.process_input_line('')
self.IP.execution_count += 1 # increment it anyway
else:
# only submit the line in non-verbatim mode
self.process_input_line(line, store_history=True)
formatted_line = '%s %s'%(input_prompt, line)
else:
# process a continuation line
if not is_verbatim:
self.process_input_line(line, store_history=True)
formatted_line = '%s %s'%(continuation, line)
if not is_suppress:
ret.append(formatted_line)
if not is_suppress and len(rest.strip()) and is_verbatim:
# the "rest" is the standard output of the
# input, which needs to be added in
# verbatim mode
ret.append(rest)
self.cout.seek(0)
output = self.cout.read()
if not is_suppress and not is_semicolon:
ret.append(output)
elif is_semicolon: # get spacing right
ret.append('')
self.cout.truncate(0)
return (ret, input_lines, output, is_doctest, image_file,
image_directive)
#print 'OUTPUT', output # dbg
def process_output(self, data, output_prompt,
input_lines, output, is_doctest, image_file):
"""Process data block for OUTPUT token."""
if is_doctest:
submitted = data.strip()
found = output
if found is not None:
found = found.strip()
# XXX - fperez: in 0.11, 'output' never comes with the prompt
# in it, just the actual output text. So I think all this code
# can be nuked...
# the above comment does not appear to be accurate... (minrk)
ind = found.find(output_prompt)
if ind<0:
e='output prompt="%s" does not match out line=%s' % \
(output_prompt, found)
raise RuntimeError(e)
found = found[len(output_prompt):].strip()
if found!=submitted:
e = ('doctest failure for input_lines="%s" with '
'found_output="%s" and submitted output="%s"' %
(input_lines, found, submitted) )
raise RuntimeError(e)
#print 'doctest PASSED for input_lines="%s" with found_output="%s" and submitted output="%s"'%(input_lines, found, submitted)
def process_comment(self, data):
"""Process data fPblock for COMMENT token."""
if not self.is_suppress:
return [data]
def save_image(self, image_file):
"""
Saves the image file to disk.
"""
self.ensure_pyplot()
command = 'plt.gcf().savefig("%s")'%image_file
#print 'SAVEFIG', command # dbg
self.process_input_line('bookmark ipy_thisdir', store_history=False)
self.process_input_line('cd -b ipy_savedir', store_history=False)
self.process_input_line(command, store_history=False)
self.process_input_line('cd -b ipy_thisdir', store_history=False)
self.process_input_line('bookmark -d ipy_thisdir', store_history=False)
self.clear_cout()
def process_block(self, block):
"""
process block from the block_parser and return a list of processed lines
"""
ret = []
output = None
input_lines = None
lineno = self.IP.execution_count
input_prompt = self.promptin%lineno
output_prompt = self.promptout%lineno
image_file = None
image_directive = None
for token, data in block:
if token==COMMENT:
out_data = self.process_comment(data)
elif token==INPUT:
(out_data, input_lines, output, is_doctest, image_file,
image_directive) = \
self.process_input(data, input_prompt, lineno)
elif token==OUTPUT:
out_data = \
self.process_output(data, output_prompt,
input_lines, output, is_doctest,
image_file)
if out_data:
ret.extend(out_data)
# save the image files
if image_file is not None:
self.save_image(image_file)
return ret, image_directive
def ensure_pyplot(self):
if self._pyplot_imported:
return
self.process_input_line('import matplotlib.pyplot as plt',
store_history=False)
def process_pure_python(self, content):
"""
        content is a list of strings. It is the unedited directive content.
        This runs it line by line in the InteractiveShell, prepending
        prompts as needed and capturing stderr and stdout, then returns
        the content as a list as if it were ipython code.
"""
output = []
savefig = False # keep up with this to clear figure
multiline = False # to handle line continuation
multiline_start = None
fmtin = self.promptin
ct = 0
for lineno, line in enumerate(content):
line_stripped = line.strip()
if not len(line):
output.append(line)
continue
# handle decorators
if line_stripped.startswith('@'):
output.extend([line])
if 'savefig' in line:
savefig = True # and need to clear figure
continue
# handle comments
if line_stripped.startswith('#'):
output.extend([line])
continue
# deal with lines checking for multiline
continuation = u' %s:'% ''.join(['.']*(len(str(ct))+2))
if not multiline:
modified = u"%s %s" % (fmtin % ct, line_stripped)
output.append(modified)
ct += 1
try:
ast.parse(line_stripped)
output.append(u'')
except Exception: # on a multiline
multiline = True
multiline_start = lineno
else: # still on a multiline
modified = u'%s %s' % (continuation, line)
output.append(modified)
try:
mod = ast.parse(
'\n'.join(content[multiline_start:lineno+1]))
if isinstance(mod.body[0], ast.FunctionDef):
# check to see if we have the whole function
for element in mod.body[0].body:
if isinstance(element, ast.Return):
multiline = False
else:
output.append(u'')
multiline = False
except Exception:
pass
if savefig: # clear figure if plotted
self.ensure_pyplot()
self.process_input_line('plt.clf()', store_history=False)
self.clear_cout()
savefig = False
return output
class IpythonDirective(Directive):
has_content = True
required_arguments = 0
optional_arguments = 4 # python, suppress, verbatim, doctest
    final_argument_whitespace = True
option_spec = { 'python': directives.unchanged,
'suppress' : directives.flag,
'verbatim' : directives.flag,
'doctest' : directives.flag,
}
shell = EmbeddedSphinxShell()
def get_config_options(self):
# contains sphinx configuration variables
config = self.state.document.settings.env.config
# get config variables to set figure output directory
confdir = self.state.document.settings.env.app.confdir
savefig_dir = config.ipython_savefig_dir
source_dir = os.path.dirname(self.state.document.current_source)
if savefig_dir is None:
savefig_dir = config.html_static_path
if isinstance(savefig_dir, list):
savefig_dir = savefig_dir[0] # safe to assume only one path?
savefig_dir = os.path.join(confdir, savefig_dir)
# get regex and prompt stuff
rgxin = config.ipython_rgxin
rgxout = config.ipython_rgxout
promptin = config.ipython_promptin
promptout = config.ipython_promptout
return savefig_dir, source_dir, rgxin, rgxout, promptin, promptout
def setup(self):
# reset the execution count if we haven't processed this doc
#NOTE: this may be borked if there are multiple seen_doc tmp files
#check time stamp?
seen_docs = [i for i in os.listdir(tempfile.tempdir)
if i.startswith('seen_doc')]
if seen_docs:
fname = os.path.join(tempfile.tempdir, seen_docs[0])
docs = open(fname).read().split('\n')
if not self.state.document.current_source in docs:
self.shell.IP.history_manager.reset()
self.shell.IP.execution_count = 1
else: # haven't processed any docs yet
docs = []
# get config values
(savefig_dir, source_dir, rgxin,
rgxout, promptin, promptout) = self.get_config_options()
# and attach to shell so we don't have to pass them around
self.shell.rgxin = rgxin
self.shell.rgxout = rgxout
self.shell.promptin = promptin
self.shell.promptout = promptout
self.shell.savefig_dir = savefig_dir
self.shell.source_dir = source_dir
# setup bookmark for saving figures directory
self.shell.process_input_line('bookmark ipy_savedir %s'%savefig_dir,
store_history=False)
self.shell.clear_cout()
# write the filename to a tempfile because it's been "seen" now
if not self.state.document.current_source in docs:
fd, fname = tempfile.mkstemp(prefix="seen_doc", text=True)
fout = open(fname, 'a')
fout.write(self.state.document.current_source+'\n')
fout.close()
return rgxin, rgxout, promptin, promptout
def teardown(self):
# delete last bookmark
self.shell.process_input_line('bookmark -d ipy_savedir',
store_history=False)
self.shell.clear_cout()
def run(self):
debug = False
#TODO, any reason block_parser can't be a method of embeddable shell
# then we wouldn't have to carry these around
rgxin, rgxout, promptin, promptout = self.setup()
options = self.options
self.shell.is_suppress = 'suppress' in options
self.shell.is_doctest = 'doctest' in options
self.shell.is_verbatim = 'verbatim' in options
# handle pure python code
if 'python' in self.arguments:
content = self.content
self.content = self.shell.process_pure_python(content)
parts = '\n'.join(self.content).split('\n\n')
lines = ['.. code-block:: ipython','']
figures = []
for part in parts:
block = block_parser(part, rgxin, rgxout, promptin, promptout)
if len(block):
rows, figure = self.shell.process_block(block)
for row in rows:
lines.extend([' %s'%line for line in row.split('\n')])
if figure is not None:
figures.append(figure)
#text = '\n'.join(lines)
#figs = '\n'.join(figures)
for figure in figures:
lines.append('')
lines.extend(figure.split('\n'))
lines.append('')
#print lines
if len(lines)>2:
if debug:
print '\n'.join(lines)
else: #NOTE: this raises some errors, what's it for?
#print 'INSERTING %d lines'%len(lines)
self.state_machine.insert_input(
lines, self.state_machine.input_lines.source(0))
text = '\n'.join(lines)
txtnode = nodes.literal_block(text, text)
txtnode['language'] = 'ipython'
#imgnode = nodes.image(figs)
# cleanup
self.teardown()
return []#, imgnode]
# Enable as a proper Sphinx directive
def setup(app):
setup.app = app
app.add_directive('ipython', IpythonDirective)
app.add_config_value('ipython_savefig_dir', None, True)
app.add_config_value('ipython_rgxin',
re.compile('In \[(\d+)\]:\s?(.*)\s*'), True)
app.add_config_value('ipython_rgxout',
re.compile('Out\[(\d+)\]:\s?(.*)\s*'), True)
app.add_config_value('ipython_promptin', 'In [%d]:', True)
app.add_config_value('ipython_promptout', 'Out[%d]:', True)
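# A minimal conf.py sketch for enabling the directive (the extension name
# below assumes this module is importable as `ipython_directive`; adjust
# it to wherever the file actually lives):
#
#     extensions = ['ipython_directive']
#     ipython_savefig_dir = '_static'   # optional overrides
#     ipython_promptin = 'In [%d]:'
#     ipython_promptout = 'Out[%d]:'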
# Simple smoke test, needs to be converted to a proper automatic test.
def test():
examples = [
r"""
In [9]: pwd
Out[9]: '/home/jdhunter/py4science/book'
In [10]: cd bookdata/
/home/jdhunter/py4science/book/bookdata
In [2]: from pylab import *
In [2]: ion()
In [3]: im = imread('stinkbug.png')
@savefig mystinkbug.png width=4in
In [4]: imshow(im)
Out[4]: <matplotlib.image.AxesImage object at 0x39ea850>
""",
r"""
In [1]: x = 'hello world'
# string methods can be
# used to alter the string
@doctest
In [2]: x.upper()
Out[2]: 'HELLO WORLD'
@verbatim
In [3]: x.st<TAB>
x.startswith x.strip
""",
r"""
In [130]: url = 'http://ichart.finance.yahoo.com/table.csv?s=CROX\
.....: &d=9&e=22&f=2009&g=d&a=1&b=8&c=2006&ignore=.csv'
In [131]: print url.split('&')
['http://ichart.finance.yahoo.com/table.csv?s=CROX', 'd=9', 'e=22', 'f=2009', 'g=d', 'a=1', 'b=8', 'c=2006', 'ignore=.csv']
In [60]: import urllib
""",
r"""\
In [133]: import numpy.random
@suppress
In [134]: numpy.random.seed(2358)
@doctest
In [135]: numpy.random.rand(10,2)
Out[135]:
array([[ 0.64524308, 0.59943846],
[ 0.47102322, 0.8715456 ],
[ 0.29370834, 0.74776844],
[ 0.99539577, 0.1313423 ],
[ 0.16250302, 0.21103583],
[ 0.81626524, 0.1312433 ],
[ 0.67338089, 0.72302393],
[ 0.7566368 , 0.07033696],
[ 0.22591016, 0.77731835],
[ 0.0072729 , 0.34273127]])
""",
r"""
In [106]: print x
jdh
In [109]: for i in range(10):
.....: print i
.....:
.....:
0
1
2
3
4
5
6
7
8
9
""",
r"""
In [144]: from pylab import *
In [145]: ion()
# use a semicolon to suppress the output
@savefig test_hist.png width=4in
In [151]: hist(np.random.randn(10000), 100);
@savefig test_plot.png width=4in
In [151]: plot(np.random.randn(10000), 'o');
""",
r"""
# use a semicolon to suppress the output
In [151]: plt.clf()
@savefig plot_simple.png width=4in
In [151]: plot([1,2,3])
@savefig hist_simple.png width=4in
In [151]: hist(np.random.randn(10000), 100);
""",
r"""
# update the current fig
In [151]: ylabel('number')
In [152]: title('normal distribution')
@savefig hist_with_text.png
In [153]: grid(True)
""",
]
# skip local-file depending first example:
examples = examples[1:]
#IpythonDirective.DEBUG = True # dbg
#options = dict(suppress=True) # dbg
options = dict()
for example in examples:
content = example.split('\n')
        IpythonDirective('debug', arguments=None, options=options,
content=content, lineno=0,
content_offset=None, block_text=None,
state=None, state_machine=None,
)
# Run test suite as a script
if __name__=='__main__':
if not os.path.isdir('_static'):
os.mkdir('_static')
test()
print 'All OK? Check figures in _static/'
View file
@ -1,4 +1,3 @@
 *************************************
 Gaussian process regression tutorial
 *************************************
@ -12,7 +11,7 @@ We first import the libraries we will need: ::
     import numpy as np
     import GPy

-1 dimensional model
+1-dimensional model
 ===================

 For this toy example, we assume we have the following inputs and outputs::
@ -22,13 +21,11 @@ For this toy example, we assume we have the following inputs and outputs::
 Note that the observations Y include some noise.

-The first step is to define the covariance kernel we want to use for the model. We choose here a kernel based on a Gaussian kernel (i.e. rbf or squared exponential) plus some white noise::
+The first step is to define the covariance kernel we want to use for the model. We choose here a kernel based on a Gaussian kernel (i.e. rbf or squared exponential)::

-    Gaussian = GPy.kern.rbf(D=1)
-    noise = GPy.kern.white(D=1)
-    kernel = Gaussian + noise
+    kernel = GPy.kern.rbf(D=1, variance=1., lengthscale=1.)

-The parameter ``D`` stands for the dimension of the input space. Note that many other kernels are implemented, such as:
+The parameter ``D`` stands for the dimension of the input space. The parameters ``variance`` and ``lengthscale`` are optional. Note that many other kernels are implemented, such as:

 * linear (``GPy.kern.linear``)
 * exponential kernel (``GPy.kern.exponential``)
@ -41,19 +38,19 @@ The inputs required for building the model are the observations and the kernel::
     m = GPy.models.GP_regression(X,Y,kernel)

-The functions ``print`` and ``plot`` give an insight into the model we have just built. The code::
+By default, some observation noise is added to the model. The functions ``print`` and ``plot`` give an insight into the model we have just built. The code::

     print m
     m.plot()

 gives the following output: ::

-    Marginal log-likelihood: -2.281e+01
+    Marginal log-likelihood: -4.479e+00
     Name            | Value  | Constraints | Ties | Prior
     -----------------------------------------------------------------
     rbf_variance    | 1.0000 |             |      |
     rbf_lengthscale | 1.0000 |             |      |
-    white_variance  | 1.0000 |             |      |
+    noise variance  | 1.0000 |             |      |

 .. figure:: Figures/tuto_GP_regression_m1.png
     :align: center
@ -75,24 +72,24 @@ but it is also possible to set a range or to constrain one parameter to be fixed
     m.unconstrain('')  # Required to remove the previous constraints
     m.constrain_positive('rbf_variance')
     m.constrain_bounded('lengthscale',1.,10.)
-    m.constrain_fixed('white',0.0025)
+    m.constrain_fixed('noise',0.0025)

 Once the constraints have been imposed, the model can be optimized::

     m.optimize()

-If we want to perform some restarts to try to improve the result of the optimization, we can use the optimize_restarts function::
+If we want to perform some restarts to try to improve the result of the optimization, we can use the ``optimize_restarts`` function::

     m.optimize_restarts(Nrestarts = 10)

 Once again, we can use ``print(m)`` and ``m.plot()`` to look at the resulting model::

-    Marginal log-likelihood: 2.001e+01
+    Marginal log-likelihood: 3.603e+01
     Name            | Value  | Constraints | Ties | Prior
     -----------------------------------------------------------------
-    rbf_variance    | 0.8033 | (+ve)       |      |
-    rbf_lengthscale | 1.8033 | (1.0, 10.0) |      |
-    white_variance  | 0.0025 | Fixed       |      |
+    rbf_variance    | 0.8151 | (+ve)       |      |
+    rbf_lengthscale | 1.8037 | (1.0, 10.0) |      |
+    noise variance  | 0.0025 | Fixed       |      |

 .. figure:: Figures/tuto_GP_regression_m2.png
     :align: center
@ -101,7 +98,7 @@ Once again, we can use ``print(m)`` and ``m.plot()`` to look at the resulting mo
     GP regression model after optimization of the parameters.

-2 dimensional example
+2-dimensional example
 =====================

 Here is a 2 dimensional example::
@ -131,15 +128,16 @@ Here is a 2 dimensional example::
     m.plot()
     print(m)

-The flag ``ARD=True`` in the definition of the Matern kernel specifies that we want one lengthscale parameter per dimension (i.e. the GP is not isotropic). The output of the last 2 lines is::
+The flag ``ARD=True`` in the definition of the Matern kernel specifies that we want one lengthscale parameter per dimension (i.e. the GP is not isotropic). The output of the last two lines is::

-    Marginal log-likelihood: 2.893e+01
+    Marginal log-likelihood: 6.682e+01
     Name                    | Value  | Constraints | Ties | Prior
     ----------------------------------------------------------------
-    Mat52_ARD_variance      | 0.4094 | (+ve)       |      |
-    Mat52_ARD_lengthscale_0 | 2.1060 | (+ve)       |      |
-    Mat52_ARD_lengthscale_1 | 2.0546 | (+ve)       |      |
-    white_variance          | 0.0012 | (+ve)       |      |
+    Mat52_variance          | 0.3860 | (+ve)       |      |
+    Mat52_lengthscale_0     | 2.0578 | (+ve)       |      |
+    Mat52_lengthscale_1     | 1.8542 | (+ve)       |      |
+    white_variance          | 0.0023 | (+ve)       |      |
+    noise variance          | 0.0000 | (+ve)       |      |

 .. figure:: Figures/tuto_GP_regression_m3.png
     :align: center
View file
@ -0,0 +1,177 @@
****************************
Tutorial: A kernel overview
****************************
First we import the libraries we will need ::
import pylab as pb
import numpy as np
import GPy
pb.ion()
For most kernels, the dimension is the only mandatory parameter needed to define a kernel object. However, it is also possible to specify the values of the parameters. For example, the following three commands are all valid for defining a squared exponential kernel (i.e. rbf or Gaussian) ::
ker1 = GPy.kern.rbf(1) # Equivalent to ker1 = GPy.kern.rbf(D=1, variance=1., lengthscale=1.)
ker2 = GPy.kern.rbf(D=1, variance = 1.5, lengthscale=2.)
ker3 = GPy.kern.rbf(1, .5, .5)
Both a ``plot`` and a ``print`` function are implemented to represent kernel objects ::
print ker1
ker1.plot()
ker2.plot()
ker3.plot()
.. figure:: Figures/tuto_kern_overview_basicdef.png
:align: center
:height: 350px
Implemented kernels
===================
Many kernels are already implemented in GPy. Here is a summary of most of them:
.. figure:: Figures/tuto_kern_overview_allkern.png
:align: center
:height: 800px
On the other hand, it is possible to use the ``sympy`` package to build new kernels; this will be the subject of another tutorial.
Operations to combine kernels
=============================
In ``GPy``, kernel objects can be combined with the usual ``+`` and ``*`` operators. ::
k1 = GPy.kern.rbf(1,variance=1., lengthscale=2)
k2 = GPy.kern.Matern32(1,variance=1., lengthscale=2)
ker_add = k1 + k2
print ker_add
ker_prod = k1 * k2
print ker_prod
Note that by default, the operator ``+`` adds kernels defined on the same input space whereas ``*`` assumes that the kernels are defined on different input spaces. Here for example ``ker_add.D`` will return ``1`` whereas ``ker_prod.D`` will return ``2``.
In order to add kernels defined on different input spaces, the required command is::
ker_add_orth = k1.add_orthogonal(k2)
.. figure:: Figures/tuto_kern_overview_add_orth.png
:align: center
:height: 350px
Output of ``ker_add_orth.plot(plot_limits=[[-10,-10],[10,10]])``.
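As a quick check of this bookkeeping (a sketch, reusing ``k1``, ``k2``, ``ker_add``, ``ker_prod`` and ``ker_add_orth`` from above; the value for ``ker_add_orth.D`` is our assumption, since the orthogonal sum also concatenates the input spaces) ::

    print ker_add.D       # 1: both summands act on the same 1d input
    print ker_prod.D      # 2: the product assumes separate input spaces
    print ker_add_orth.D  # presumably 2: inputs are concatenated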
Example: Building an ANOVA kernel
=================================
In two dimensions ANOVA kernels have the following form:
.. math::
k_{ANOVA}(x,y) = \prod_{i=1}^2 (1 + k_i(x_i,y_i)) = 1 + k_1(x_1,y_1) + k_2(x_2,y_2) + k_1(x_1,y_1) \times k_2(x_2,y_2).
Let us assume that we want to define an ANOVA kernel with a Matern 5/2 kernel for :math:`k_i`. As seen previously, we can define this kernel as follows ::
k_cst = GPy.kern.bias(1,variance=1.)
k_mat = GPy.kern.Matern52(1,variance=1., lengthscale=3)
Kanova = (k_cst + k_mat) * (k_cst + k_mat)
print Kanova
Printing the resulting kernel outputs the following ::
Name | Value | Constraints | Ties
---------------------------------------------------------------------------
bias<times>bias_variance | 1.0000 | |
bias<times>Mat52_variance | 1.0000 | |
bias<times>Mat52_Mat52_lengthscale | 3.0000 | | (1)
Mat52<times>bias_variance | 1.0000 | |
Mat52<times>bias_Mat52_lengthscale | 3.0000 | | (0)
Mat52<times>Mat52_variance | 1.0000 | |
Mat52<times>Mat52_Mat52_lengthscale | 3.0000 | | (0)
Mat52<times>Mat52_Mat52_lengthscale | 3.0000 | | (1)
Note the ties between the lengthscales of ``Kanova``, which keep the number of lengthscales equal to 2. On the other hand, there are four variance terms in the new parameterization: one for each term on the right-hand side of the above equation. We can illustrate the use of this kernel on a toy example::
# sample inputs and outputs
X = np.random.uniform(-3.,3.,(40,2))
Y = 0.5*X[:,:1] + 0.5*X[:,1:] + 2*np.sin(X[:,:1]) * np.sin(X[:,1:])
# Create GP regression model
m = GPy.models.GP_regression(X,Y,Kanova)
m.plot()
.. figure:: Figures/tuto_kern_overview_mANOVA.png
:align: center
:height: 350px
As :math:`k_{ANOVA}` corresponds to the sum of 4 kernels, the best predictor can be split into a sum of 4 functions
.. math::
bp(x) & = k(x)^t K^{-1} Y \\
& = (1 + k_1(x_1) + k_2(x_2) + k_1(x_1)k_2(x_2))^t K^{-1} Y \\
& = 1^t K^{-1} Y + k_1(x_1)^t K^{-1} Y + k_2(x_2)^t K^{-1} Y + (k_1(x_1)k_2(x_2))^t K^{-1} Y
The submodels can be represented with the option ``which_functions`` of ``plot``: ::
pb.figure(figsize=(20,5))
pb.subplots_adjust(wspace=0.5)
pb.subplot(1,5,1)
m.plot()
pb.subplot(1,5,2)
pb.ylabel("= ",rotation='horizontal',fontsize='30')
pb.subplot(1,5,3)
m.plot(which_functions=[False,True,False,False])
pb.ylabel("cst +",rotation='horizontal',fontsize='30')
pb.subplot(1,5,4)
m.plot(which_functions=[False,False,True,False])
pb.ylabel("+ ",rotation='horizontal',fontsize='30')
pb.subplot(1,5,5)
pb.ylabel("+ ",rotation='horizontal',fontsize='30')
m.plot(which_functions=[False,False,False,True])
.. figure:: Figures/tuto_kern_overview_mANOVAdec.png
:align: center
:height: 200px
.. import pylab as pb
import numpy as np
import GPy
pb.ion()
ker1 = GPy.kern.rbf(D=1) # Equivalent to ker1 = GPy.kern.rbf(D=1, variance=1., lengthscale=1.)
ker2 = GPy.kern.rbf(D=1, variance = .75, lengthscale=3.)
ker3 = GPy.kern.rbf(1, .5, .25)
ker1.plot()
ker2.plot()
ker3.plot()
#pb.savefig("Figures/tuto_kern_overview_basicdef.png")
kernels = [GPy.kern.rbf(1), GPy.kern.exponential(1), GPy.kern.Matern32(1), GPy.kern.Matern52(1), GPy.kern.Brownian(1), GPy.kern.bias(1), GPy.kern.linear(1), GPy.kern.spline(1), GPy.kern.periodic_exponential(1), GPy.kern.periodic_Matern32(1), GPy.kern.periodic_Matern52(1), GPy.kern.white(1)]
kernel_names = ["GPy.kern.rbf", "GPy.kern.exponential", "GPy.kern.Matern32", "GPy.kern.Matern52", "GPy.kern.Brownian", "GPy.kern.bias", "GPy.kern.linear", "GPy.kern.spline", "GPy.kern.periodic_exponential", "GPy.kern.periodic_Matern32", "GPy.kern.periodic_Matern52", "GPy.kern.white"]
pb.figure(figsize=(16,12))
pb.subplots_adjust(wspace=.5, hspace=.5)
for i, kern in enumerate(kernels):
pb.subplot(3,4,i+1)
kern.plot(x=7.5,plot_limits=[0.00001,15.])
pb.title(kernel_names[i]+ '\n')
#pb.axes([.1,.1,.8,.7])
#pb.figtext(.5,.9,'Foo Bar', fontsize=18, ha='center')
#pb.figtext(.5,.85,'Lorem ipsum dolor sit amet, consectetur adipiscing elit',fontsize=10,ha='center')
# actual plot for the noise
i = 11
X = np.linspace(0.,15.,201)
WN = 0*X
WN[100] = 1.
pb.subplot(3,4,i+1)
pb.plot(X,WN,'b')
View file
@ -1,64 +0,0 @@
import numpy as np
import pylab as pb
pb.ion()
import sys
import GPy
pb.close('all')
N = 200
M = 15
resolution=5
X = np.linspace(0,12,N)[:,None]
Z = np.linspace(0,12,M)[:,None] # inducing points (fixed for now)
Y = np.sin(X) + np.random.randn(*X.shape)/np.sqrt(50.)
#k = GPy.kern.rbf(1)
k = GPy.kern.Matern32(1) + GPy.kern.white(1)
models = [GPy.models.sparse_GP_regression(X,Y,Z=Z,kernel=k)
,GPy.models.sparse_GP_regression(X,Y,Z=Z,kernel=k)
,GPy.models.sparse_GP_regression(X,Y,Z=Z,kernel=k)
,GPy.models.sparse_GP_regression(X,Y,Z=Z,kernel=k)]
models[0].scale_factor = 1.
models[1].scale_factor = 10.
models[2].scale_factor = 100.
models[3].scale_factor = 1000.
#GPy.models.sgp_debugB(X,Y,Z=Z,kernel=k),
#GPy.models.sgp_debugC(X,Y,Z=Z,kernel=k)]#,
#GPy.models.sgp_debugE(X,Y,Z=Z,kernel=k)]
[m.constrain_fixed('white',0.1) for m in models]
#xx,yy = np.mgrid[1.5:4:0+resolution*1j,-2:2:0+resolution*1j]
xx,yy = np.mgrid[3:16:0+resolution*1j,-2:1:0+resolution*1j]
lls = []
cgs = []
grads = []
count = 0
for l,v in zip(xx.flatten(),yy.flatten()):
count += 1
print count, 'of', resolution**2
sys.stdout.flush()
[m.set('lengthscale',l) for m in models]
[m.set('_variance',10.**v) for m in models]
lls.append([m.log_likelihood() for m in models])
grads.append([m.log_likelihood_gradients() for m in models])
cgs.append([m.checkgrad(verbose=0,return_ratio=True) for m in models])
lls = np.array(zip(*lls)).reshape(-1,resolution,resolution)
cgs = np.array(zip(*cgs)).reshape(-1,resolution,resolution)
for ll,cg in zip(lls,cgs):
pb.figure()
pb.contourf(xx,yy,ll,100,cmap=pb.cm.gray)
pb.colorbar()
try:
pb.contour(xx,yy,np.exp(ll),colors='k')
except:
pass
pb.scatter(xx.flatten(),yy.flatten(),20,np.log(np.abs(cg.flatten())),cmap=pb.cm.jet,linewidth=0)
pb.colorbar()
View file
@ -3,7 +3,7 @@
 import os
 from numpy.distutils.core import Extension, setup
-from sphinx.setup_command import BuildDoc
+#from sphinx.setup_command import BuildDoc

 # Version number
 version = '0.1.3'
@ -19,7 +19,7 @@ setup(name = 'GPy',
       license = "BSD 3-clause",
       keywords = "machine-learning gaussian-processes kernels",
       url = "http://ml.sheffield.ac.uk/GPy/",
-      packages = ['GPy', 'GPy.core', 'GPy.kern', 'GPy.util', 'GPy.models', 'GPy.inference', 'GPy.examples'],
+      packages = ['GPy', 'GPy.core', 'GPy.kern', 'GPy.util', 'GPy.models', 'GPy.inference', 'GPy.examples', 'GPy.likelihoods'],
       package_dir={'GPy': 'GPy'},
       package_data = {'GPy': ['GPy/examples']},
       py_modules = ['GPy.__init__'],
@ -27,8 +27,11 @@ setup(name = 'GPy',
       #ext_modules = [Extension(name = 'GPy.kern.lfmUpsilonf2py',
       #                         sources = ['GPy/kern/src/lfmUpsilonf2py.f90'])],
       install_requires=['sympy', 'numpy>=1.6', 'scipy>=0.9','matplotlib>=1.1'],
-      setup_requires=['sphinx'],
-      cmdclass = {'build_sphinx': BuildDoc},
+      extras_require = {
+          'docs':['Sphinx', 'ipython'],
+      },
+      #setup_requires=['sphinx'],
+      #cmdclass = {'build_sphinx': BuildDoc},
       classifiers=[
           "Development Status :: 1 - Alpha",
           "Topic :: Machine Learning",