merged master

Nicolo Fusi 2013-01-31 09:57:40 +00:00
commit 49d2e4e4f6
62 changed files with 2469 additions and 1230 deletions

.travis.yml (new file, 11 lines)

@@ -0,0 +1,11 @@
+language: python
+python:
+- "2.7"
+# command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors
+install:
+- sudo apt-get install python-scipy
+- pip install sphinx
+- pip install . --use-mirrors
+# command to run tests, e.g. python setup.py test
+script:
+- nosetests --with-xcoverage --with-xunit --cover-package=GPy --cover-erase GPy/testing
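
Note: the same test run should be reproducible locally, assuming nose and the nosexcover plugin are installed, with: nosetests --with-xcoverage --with-xunit --cover-package=GPy --cover-erase GPy/testing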

@@ -6,5 +6,6 @@ import kern
 import models
 import inference
 import util
+import examples
 #import examples TODO: discuss!
 from core import priors

@@ -14,18 +14,18 @@ from ..inference import optimization

 class model(parameterised):
     def __init__(self):
         parameterised.__init__(self)
-        self.priors = [None for i in range(self.get_param().size)]
+        self.priors = [None for i in range(self._get_params().size)]
         self.optimization_runs = []
         self.sampling_runs = []
-        self.set_param(self.get_param())
+        self._set_params(self._get_params())
         self.preferred_optimizer = 'tnc'

-    def get_param(self):
+    def _get_params(self):
         raise NotImplementedError, "this needs to be implemented to utilise the model class"

-    def set_param(self,x):
+    def _set_params(self,x):
         raise NotImplementedError, "this needs to be implemented to utilise the model class"

     def log_likelihood(self):
         raise NotImplementedError, "this needs to be implemented to utilise the model class"

-    def log_likelihood_gradients(self):
+    def _log_likelihood_gradients(self):
         raise NotImplementedError, "this needs to be implemented to utilise the model class"

     def set_prior(self,which,what):
@@ -67,7 +67,7 @@ class model(parameterised):
             unconst = np.setdiff1d(which, self.constrained_positive_indices)
             if len(unconst):
                 print "Warning: constraining parameters to be positive:"
-                print '\n'.join([n for i,n in enumerate(self.get_param_names()) if i in unconst])
+                print '\n'.join([n for i,n in enumerate(self._get_param_names()) if i in unconst])
                 print '\n'
                 self.constrain_positive(unconst)
         elif isinstance(what,priors.Gaussian):
@@ -80,48 +80,65 @@ class model(parameterised):
         for w in which:
             self.priors[w] = what

-    def get(self,name):
+    def get(self,name, return_names=False):
         """
-        get a model parameter by name
+        Get a model parameter by name. The name is applied as a regular expression and all parameters that match that regular expression are returned.
         """
         matches = self.grep_param_names(name)
         if len(matches):
-            return self.get_param()[matches]
+            if return_names:
+                return self._get_params()[matches], np.asarray(self._get_param_names())[matches].tolist()
+            else:
+                return self._get_params()[matches]
         else:
             raise AttributeError, "no parameter matches %s"%name

     def set(self,name,val):
         """
-        Set a model parameter by name
+        Set model parameter(s) by name. The name is provided as a regular expression. All parameters matching that regular expression are set to the given value.
         """
         matches = self.grep_param_names(name)
         if len(matches):
-            x = self.get_param()
+            x = self._get_params()
             x[matches] = val
-            self.set_param(x)
+            self._set_params(x)
+        else:
+            raise AttributeError, "no parameter matches %s"%name
+
+    def get_gradient(self,name, return_names=False):
+        """
+        Get model gradient(s) by name. The name is applied as a regular expression and all parameters that match that regular expression are returned.
+        """
+        matches = self.grep_param_names(name)
+        if len(matches):
+            if return_names:
+                return self._log_likelihood_gradients()[matches], np.asarray(self._get_param_names())[matches].tolist()
+            else:
+                return self._log_likelihood_gradients()[matches]
         else:
             raise AttributeError, "no parameter matches %s"%name

     def log_prior(self):
         """evaluate the prior"""
-        return np.sum([p.lnpdf(x) for p, x in zip(self.priors,self.get_param()) if p is not None])
+        return np.sum([p.lnpdf(x) for p, x in zip(self.priors,self._get_params()) if p is not None])

-    def log_prior_gradients(self):
+    def _log_prior_gradients(self):
         """evaluate the gradients of the priors"""
-        x = self.get_param()
+        x = self._get_params()
         ret = np.zeros(x.size)
         [np.put(ret,i,p.lnpdf_grad(xx)) for i,(p,xx) in enumerate(zip(self.priors,x)) if not p is None]
         return ret

-    def extract_gradients(self):
+    def _log_likelihood_gradients_transformed(self):
         """
         Use self.log_likelihood_gradients and self.prior_gradients to get the gradients of the model.
         Adjust the gradient for constraints and ties, return.
         """
-        g = self.log_likelihood_gradients() + self.log_prior_gradients()
+        g = self._log_likelihood_gradients() + self._log_prior_gradients()
-        x = self.get_param()
+        x = self._get_params()
         g[self.constrained_positive_indices] = g[self.constrained_positive_indices]*x[self.constrained_positive_indices]
         g[self.constrained_negative_indices] = g[self.constrained_negative_indices]*x[self.constrained_negative_indices]
         [np.put(g,i,g[i]*(x[i]-l)*(h-x[i])/(h-l)) for i,l,h in zip(self.constrained_bounded_indices, self.constrained_bounded_lowers, self.constrained_bounded_uppers)]
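
A quick usage sketch of the regex-based accessors introduced in this hunk (a sketch only; m is any model subclass, and the parameter names below are hypothetical):

    # suppose m has parameters 'rbf_variance', 'rbf_lengthscale', 'white_variance'
    vals, names = m.get('rbf', return_names=True)  # values and names of all parameters matching 'rbf'
    m.set('lengthscale', 2.0)                      # set every parameter whose name matches 'lengthscale'
    grads = m.get_gradient('variance')             # log-likelihood gradients of the matching parameters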
@@ -138,14 +155,14 @@ class model(parameterised):
         Make this draw from the prior if one exists, else draw from N(0,1)
         """
         #first take care of all parameters (from N(0,1))
-        x = self.extract_param()
+        x = self._get_params_transformed()
         x = np.random.randn(x.size)
-        self.expand_param(x)
+        self._set_params_transformed(x)
         #now draw from prior where possible
-        x = self.get_param()
+        x = self._get_params()
         [np.put(x,i,p.rvs(1)) for i,p in enumerate(self.priors) if not p is None]
-        self.set_param(x)
+        self._set_params(x)
-        self.expand_param(self.extract_param())#makes sure all of the tied parameters get the same init (since there's only one prior object...)
+        self._set_params_transformed(self._get_params_transformed())#makes sure all of the tied parameters get the same init (since there's only one prior object...)

     def optimize_restarts(self, Nrestarts=10, robust=False, verbose=True, **kwargs):

@@ -165,7 +182,7 @@ class model(parameterised):
         :verbose: whether to show informations about the current restart
         """
-        initial_parameters = self.extract_param()
+        initial_parameters = self._get_params_transformed()
         for i in range(Nrestarts):
             try:
                 self.randomize()
@@ -182,9 +199,9 @@ class model(parameterised):
                 raise e
         if len(self.optimization_runs):
             i = np.argmin([o.f_opt for o in self.optimization_runs])
-            self.expand_param(self.optimization_runs[i].x_opt)
+            self._set_params_transformed(self.optimization_runs[i].x_opt)
         else:
-            self.expand_param(initial_parameters)
+            self._set_params_transformed(initial_parameters)

     def ensure_default_constraints(self,warn=False):
         """

@@ -194,7 +211,7 @@ class model(parameterised):
         for s in positive_strings:
             for i in self.grep_param_names(s):
                 if not (i in self.all_constrained_indices()):
-                    name = self.get_param_names()[i]
+                    name = self._get_param_names()[i]
                     self.constrain_positive(name)
                     if warn:
                         print "Warning! constraining %s postive"%name
@@ -214,24 +231,24 @@ class model(parameterised):
             optimizer = self.preferred_optimizer

         def f(x):
-            self.expand_param(x)
+            self._set_params_transformed(x)
             return -self.log_likelihood()-self.log_prior()

         def fp(x):
-            self.expand_param(x)
+            self._set_params_transformed(x)
-            return -self.extract_gradients()
+            return -self._log_likelihood_gradients_transformed()

         def f_fp(x):
-            self.expand_param(x)
+            self._set_params_transformed(x)
-            return -self.log_likelihood()-self.log_prior(),-self.extract_gradients()
+            return -self.log_likelihood()-self.log_prior(),-self._log_likelihood_gradients_transformed()

         if start == None:
-            start = self.extract_param()
+            start = self._get_params_transformed()

         optimizer = optimization.get_optimizer(optimizer)
         opt = optimizer(start, model = self, **kwargs)
         opt.run(f_fp=f_fp, f=f, fp=fp)
         self.optimization_runs.append(opt)
-        self.expand_param(opt.x_opt)
+        self._set_params_transformed(opt.x_opt)

     def optimize_SGD(self, momentum = 0.1, learning_rate = 0.01, iterations = 20, **kwargs):
         # assert self.Y.shape[1] > 1, "SGD only works with D > 1"
@@ -248,13 +265,13 @@ class model(parameterised):
         else:
             print "numerically calculating hessian. please be patient!"
-            x = self.get_param()
+            x = self._get_params()
             def f(x):
-                self.set_param(x)
+                self._set_params(x)
                 return self.log_likelihood()
             h = ndt.Hessian(f)
             A = -h(x)
-            self.set_param(x)
+            self._set_params(x)
         # check for almost zero components on the diagonal which screw up the cholesky
         aa = np.nonzero((np.diag(A)<1e-6) & (np.diag(A)>0.))[0]
         A[aa,aa] = 0.

@@ -268,7 +285,7 @@ class model(parameterised):
             hld = np.sum(np.log(np.diag(jitchol(A)[0])))
         except:
             return np.nan
-        return 0.5*self.get_param().size*np.log(2*np.pi) + self.log_likelihood() - hld
+        return 0.5*self._get_params().size*np.log(2*np.pi) + self.log_likelihood() - hld

     def __str__(self):
         s = parameterised.__str__(self).split('\n')
@@ -292,18 +309,18 @@ class model(parameterised):
         If the overall gradient fails, invividual components are tested.
         """

-        x = self.extract_param().copy()
+        x = self._get_params_transformed().copy()

         #choose a random direction to step in:
         dx = step*np.sign(np.random.uniform(-1,1,x.size))

         #evaulate around the point x
-        self.expand_param(x+dx)
+        self._set_params_transformed(x+dx)
-        f1,g1 = self.log_likelihood() + self.log_prior(), self.extract_gradients()
+        f1,g1 = self.log_likelihood() + self.log_prior(), self._log_likelihood_gradients_transformed()
-        self.expand_param(x-dx)
+        self._set_params_transformed(x-dx)
-        f2,g2 = self.log_likelihood() + self.log_prior(), self.extract_gradients()
+        f2,g2 = self.log_likelihood() + self.log_prior(), self._log_likelihood_gradients_transformed()
-        self.expand_param(x)
+        self._set_params_transformed(x)
-        gradient = self.extract_gradients()
+        gradient = self._log_likelihood_gradients_transformed()

         numerical_gradient = (f1-f2)/(2*dx)
         global_ratio = (f1-f2)/(2*np.dot(dx,gradient))

@@ -319,7 +336,7 @@ class model(parameterised):
             print "Global check failed. Testing individual gradients\n"
         try:
-            names = self.extract_param_names()
+            names = self._get_param_names_transformed()
         except NotImplementedError:
             names = ['Variable %i'%i for i in range(len(x))]

@@ -338,13 +355,13 @@ class model(parameterised):
         for i in range(len(x)):
             xx = x.copy()
             xx[i] += step
-            self.expand_param(xx)
+            self._set_params_transformed(xx)
-            f1,g1 = self.log_likelihood() + self.log_prior(), self.extract_gradients()[i]
+            f1,g1 = self.log_likelihood() + self.log_prior(), self._log_likelihood_gradients_transformed()[i]
             xx[i] -= 2.*step
-            self.expand_param(xx)
+            self._set_params_transformed(xx)
-            f2,g2 = self.log_likelihood() + self.log_prior(), self.extract_gradients()[i]
+            f2,g2 = self.log_likelihood() + self.log_prior(), self._log_likelihood_gradients_transformed()[i]
-            self.expand_param(x)
+            self._set_params_transformed(x)
-            gradient = self.extract_gradients()[i]
+            gradient = self._log_likelihood_gradients_transformed()[i]
             numerical_gradient = (f1-f2)/(2*step)
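
The check above compares each analytic gradient against a symmetric finite difference and reports their ratio. A standalone sketch of the same idea (not GPy code; f is any scalar function of a parameter vector):

    import numpy as np

    def numerical_grad(f, x, i, step=1e-6):
        # central difference (f(x+h) - f(x-h)) / 2h along coordinate i,
        # mirroring the f1, f2 computation in checkgrad above
        xx = x.copy(); xx[i] += step
        f1 = f(xx)
        xx[i] -= 2.*step
        f2 = f(xx)
        return (f1 - f2) / (2.*step)

    # a gradient passes when analytic_grad / numerical_grad is close to 1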

@@ -66,7 +66,7 @@ class parameterised(object):
         if hasattr(self,'prior'):
             pass
-        self.expand_param(self.extract_param())# sets tied parameters to single value
+        self._set_params_transformed(self._get_params_transformed())# sets tied parameters to single value

     def untie_everything(self):
         """Unties all parameters by setting tied_indices to an empty list."""
@@ -87,7 +87,7 @@ class parameterised(object):
         Returns
         -------
-        the indices of self.get_param_names which match the regular expression.
+        the indices of self._get_param_names which match the regular expression.

         Notes
         -----

@@ -96,9 +96,9 @@ class parameterised(object):
         if type(expr) is str:
             expr = re.compile(expr)
-            return np.nonzero([expr.search(name) for name in self.get_param_names()])[0]
+            return np.nonzero([expr.search(name) for name in self._get_param_names()])[0]
         elif type(expr) is re._pattern_type:
-            return np.nonzero([expr.search(name) for name in self.get_param_names()])[0]
+            return np.nonzero([expr.search(name) for name in self._get_param_names()])[0]
         else:
             return expr
@@ -115,11 +115,11 @@ class parameterised(object):
         assert not np.any(matches[:,None]==self.all_constrained_indices()), "Some indices are already constrained"
         self.constrained_positive_indices = np.hstack((self.constrained_positive_indices, matches))
         #check to ensure constraint is in place
-        x = self.get_param()
+        x = self._get_params()
         for i,xx in enumerate(x):
             if (xx<0) & (i in matches):
                 x[i] = -xx
-        self.set_param(x)
+        self._set_params(x)

     def unconstrain(self,which):

@@ -163,11 +163,11 @@ class parameterised(object):
         assert not np.any(matches[:,None]==self.all_constrained_indices()), "Some indices are already constrained"
         self.constrained_negative_indices = np.hstack((self.constrained_negative_indices, matches))
         #check to ensure constraint is in place
-        x = self.get_param()
+        x = self._get_params()
         for i,xx in enumerate(x):
             if (xx>0.) and (i in matches):
                 x[i] = -xx
-        self.set_param(x)
+        self._set_params(x)
@@ -187,11 +187,11 @@ class parameterised(object):
         self.constrained_bounded_uppers.append(upper)
         self.constrained_bounded_lowers.append(lower)
         #check to ensure constraint is in place
-        x = self.get_param()
+        x = self._get_params()
         for i,xx in enumerate(x):
             if ((xx<=lower)|(xx>=upper)) & (i in matches):
                 x[i] = sigmoid(xx)*(upper-lower) + lower
-        self.set_param(x)
+        self._set_params(x)

     def constrain_fixed(self, which, value = None):
@@ -213,14 +213,14 @@ class parameterised(object):
         if value != None:
             self.constrained_fixed_values.append(value)
         else:
-            self.constrained_fixed_values.append(self.get_param()[self.constrained_fixed_indices[-1]])
+            self.constrained_fixed_values.append(self._get_params()[self.constrained_fixed_indices[-1]])
         #self.constrained_fixed_values.append(value)
-        self.expand_param(self.extract_param())
+        self._set_params_transformed(self._get_params_transformed())

-    def extract_param(self):
+    def _get_params_transformed(self):
-        """use self.get_param to get the 'true' parameters of the model, which are then tied, constrained and fixed"""
+        """use self._get_params to get the 'true' parameters of the model, which are then tied, constrained and fixed"""
-        x = self.get_param()
+        x = self._get_params()
         x[self.constrained_positive_indices] = np.log(x[self.constrained_positive_indices])
         x[self.constrained_negative_indices] = np.log(-x[self.constrained_negative_indices])
         [np.put(x,i,np.log(np.clip(x[i]-l,1e-10,np.inf)/np.clip(h-x[i],1e-10,np.inf))) for i,l,h in zip(self.constrained_bounded_indices, self.constrained_bounded_lowers, self.constrained_bounded_uppers)]
@@ -232,8 +232,8 @@ class parameterised(object):
         return x

-    def expand_param(self,x):
+    def _set_params_transformed(self,x):
-        """ takes the vector x, which is then modified (by untying, reparameterising or inserting fixed values), and then call self.set_param"""
+        """ takes the vector x, which is then modified (by untying, reparameterising or inserting fixed values), and then call self._set_params"""
         #work out how many places are fixed, and where they are. tricky logic!
         Nfix_places = 0.

@@ -257,14 +257,14 @@ class parameterised(object):
         xx[self.constrained_positive_indices] = np.exp(xx[self.constrained_positive_indices])
         xx[self.constrained_negative_indices] = -np.exp(xx[self.constrained_negative_indices])
         [np.put(xx,i,low+sigmoid(xx[i])*(high-low)) for i,low,high in zip(self.constrained_bounded_indices, self.constrained_bounded_lowers, self.constrained_bounded_uppers)]
-        self.set_param(xx)
+        self._set_params(xx)

-    def extract_param_names(self):
+    def _get_param_names_transformed(self):
         """
         Returns the parameter names as propagated after constraining,
-        tying or fixing, i.e. a list of the same length as extract_param()
+        tying or fixing, i.e. a list of the same length as _get_params_transformed()
         """
-        n = self.get_param_names()
+        n = self._get_param_names()

         #remove/concatenate the tied parameter names
         if len(self.tied_indices):
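
The two *_transformed methods are inverse reparameterisations: positive parameters are optimised as log(p), negative ones as log(-p), and bounded ones through a scaled sigmoid. A standalone sketch of the bounded case (illustrative values, not GPy code):

    import numpy as np
    sigmoid = lambda z: 1./(1. + np.exp(-z))

    p, l, h = 0.3, 0.0, 1.0                       # parameter with bounds l < p < h
    t = np.log((p - l)/(h - p))                   # to the unconstrained space (_get_params_transformed)
    assert np.isclose(l + sigmoid(t)*(h - l), p)  # and back (_set_params_transformed)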
@@ -294,13 +294,13 @@ class parameterised(object):
         """
         Return a string describing the parameter names and their ties and constraints
         """
-        names = self.get_param_names()
+        names = self._get_param_names()
         N = len(names)
         if not N:
             return "This object has no free parameters."
         header = ['Name','Value','Constraints','Ties']
-        values = self.get_param() #map(str,self.get_param())
+        values = self._get_params() #map(str,self._get_params())
         #sort out the constraints
         constraints = ['']*len(names)
         for i in self.constrained_positive_indices:

@@ -10,6 +10,7 @@ from ..util.linalg import pdinv

 class prior:
     def pdf(self,x):
         return np.exp(self.lnpdf(x))
+
     def plot(self):
         rvs = self.rvs(1000)
         pb.hist(rvs,100,normed=True)
@@ -17,45 +18,79 @@ class prior:
         xx = np.linspace(xmin,xmax,1000)
         pb.plot(xx,self.pdf(xx),'r',linewidth=2)

 class Gaussian(prior):
     """
-    Implementation of the univariate Gaussian probability function, coupled with random variables, since scipy.stats sucks.
+    Implementation of the univariate Gaussian probability function, coupled with random variables.
-    Using Bishop 2006 notation"""
+
+    :param mu: mean
+    :param sigma: standard deviation
+
+    .. Note:: Bishop 2006 notation is used throughout the code
+    """
     def __init__(self,mu,sigma):
         self.mu = float(mu)
         self.sigma = float(sigma)
         self.sigma2 = np.square(self.sigma)
         self.constant = -0.5*np.log(2*np.pi*self.sigma2)

     def __str__(self):
         return "N("+str(np.round(self.mu))+', '+str(np.round(self.sigma2))+')'

     def lnpdf(self,x):
         return self.constant - 0.5*np.square(x-self.mu)/self.sigma2

     def lnpdf_grad(self,x):
         return -(x-self.mu)/self.sigma2

     def rvs(self,n):
         return np.random.randn(n)*self.sigma + self.mu

 class log_Gaussian(prior):
     """
+    Implementation of the univariate *log*-Gaussian probability function, coupled with random variables.
+
+    :param mu: mean
+    :param sigma: standard deviation
+
+    .. Note:: Bishop 2006 notation is used throughout the code
     """
     def __init__(self,mu,sigma):
         self.mu = float(mu)
         self.sigma = float(sigma)
         self.sigma2 = np.square(self.sigma)
         self.constant = -0.5*np.log(2*np.pi*self.sigma2)

     def __str__(self):
         return "lnN("+str(np.round(self.mu))+', '+str(np.round(self.sigma2))+')'

     def lnpdf(self,x):
         return self.constant - 0.5*np.square(np.log(x)-self.mu)/self.sigma2 -np.log(x)

     def lnpdf_grad(self,x):
         return -((np.log(x)-self.mu)/self.sigma2+1.)/x

     def rvs(self,n):
         return np.exp(np.random.randn(n)*self.sigma + self.mu)

 class multivariate_Gaussian:
     """
-    Implementation of the multivariate Gaussian probability function, coupled with random variables, since scipy.stats sucks.
+    Implementation of the multivariate Gaussian probability function, coupled with random variables.
-    Using Bishop 2006 notation"""
+
+    :param mu: mean (N-dimensional array)
+    :param var: covariance matrix (NxN)
+
+    .. Note:: Bishop 2006 notation is used throughout the code
+    """
     def __init__(self,mu,var):
         self.mu = np.array(mu).flatten()
         self.var = np.array(var)
@@ -67,17 +102,22 @@ class multivariate_Gaussian:
         self.constant = -0.5*self.D*np.log(2*np.pi) - self.hld

     def summary(self):
-        pass #TODO
+        raise NotImplementedError

     def pdf(self,x):
         return np.exp(self.lnpdf(x))

     def lnpdf(self,x):
         d = x-self.mu
         return self.constant - 0.5*np.sum(d*np.dot(d,self.inv),1)

     def lnpdf_grad(self,x):
         d = x-self.mu
         return -np.dot(self.inv,d)

     def rvs(self,n):
         return np.random.multivariate_normal(self.mu, self.var,n)

     def plot(self):
         if self.D==2:
             rvs = self.rvs(200)
@@ -91,7 +131,15 @@ class multivariate_Gaussian:

 def gamma_from_EV(E,V):
-    """create an instance of a gamma prior by specifying the Expected value(s) and Variance(s) of the distribution"""
+    """
+    Creates an instance of a gamma prior by specifying the Expected value(s)
+    and Variance(s) of the distribution.
+
+    :param E: expected value
+    :param V: variance
+    """
     a = np.square(E)/V
     b = E/V
     return gamma(a,b)
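
The conversion follows from the Gamma moments under the rate parameterisation, E = a/b and V = a/b^2, which invert to b = E/V and a = E^2/V. A quick numerical check (illustrative values):

    import numpy as np

    E, V = 2.0, 0.5
    a, b = np.square(E)/V, E/V      # a = 8.0, b = 4.0, as computed above
    assert np.isclose(a/b, E)       # mean a/b recovers E
    assert np.isclose(a/b**2, V)    # variance a/b**2 recovers V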
@@ -99,15 +147,23 @@ def gamma_from_EV(E,V):

 class gamma(prior):
     """
-    Implementation of the Gamma probability function, coupled with random variables, since scipy.stats sucks.
+    Implementation of the Gamma probability function, coupled with random variables.
-    Using Bishop 2006 notation
+
+    :param a: shape parameter
+    :param b: rate parameter (warning: it's the *inverse* of the scale)
+
+    .. Note:: Bishop 2006 notation is used throughout the code
     """
     def __init__(self,a,b):
         self.a = float(a)
         self.b = float(b)
         self.constant = -gammaln(self.a) + a*np.log(b)

     def __str__(self):
         return "Ga("+str(np.round(self.a))+', '+str(np.round(self.b))+')'

     def summary(self):
         ret = {"E[x]": self.a/self.b,\
             "E[ln x]": digamma(self.a) - np.log(self.b),\

@@ -118,10 +174,12 @@ class gamma(prior):
         else:
             ret['mode'] = np.nan
         return ret

     def lnpdf(self,x):
         return self.constant + (self.a-1)*np.log(x) - self.b*x

     def lnpdf_grad(self,x):
         return (self.a-1.)/x - self.b

     def rvs(self,n):
         return np.random.gamma(scale=1./self.b,shape=self.a,size=n)
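
All priors expose the same small interface (lnpdf, lnpdf_grad, rvs); note that rvs above passes scale=1./b because np.random.gamma is parameterised by scale rather than rate. A usage sketch, relying on the "from core import priors" re-export in GPy/__init__.py above:

    import numpy as np
    import GPy

    p = GPy.priors.Gaussian(mu=0., sigma=1.)
    draws = p.rvs(1000)                       # 1000 samples from N(0, 1)
    logp = p.lnpdf(0.5)                       # -0.5*np.log(2*np.pi) - 0.125
    g = GPy.priors.gamma_from_EV(2.0, 0.5)    # gamma prior with mean 2 and variance 0.5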

@@ -0,0 +1,33 @@
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+import numpy as np
+import pylab as pb
+import GPy
+
+np.random.seed(123344)
+
+N = 10
+M = 5
+Q = 3
+D = 4
+#generate GPLVM-like data
+X = np.random.rand(N, Q)
+k = GPy.kern.rbf(Q) + GPy.kern.white(Q, 0.00001)
+K = k.K(X)
+Y = np.random.multivariate_normal(np.zeros(N),K,D).T
+
+# k = GPy.kern.rbf(Q) + GPy.kern.bias(Q) + GPy.kern.white(Q, 0.00001)
+k = GPy.kern.linear(Q, ARD = False) + GPy.kern.white(Q, 0.00001)
+
+m = GPy.models.Bayesian_GPLVM(Y, Q, kernel = k, M=M)
+m.constrain_positive('(rbf|bias|noise|white|S)')
+# m.constrain_fixed('S', 1)
+
+# pb.figure()
+# m.plot()
+# pb.title('PCA initialisation')
+# pb.figure()
+# m.optimize(messages = 1)
+# m.plot()
+# pb.title('After optimisation')
+
+m.randomize()
+m.checkgrad(verbose = 1)

@@ -1,28 +0,0 @@
-# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-import numpy as np
-import pylab as pb
-import GPy
-
-np.random.seed(1)
-
-print "GPLVM with RBF kernel"
-N = 100
-Q = 1
-D = 2
-X = np.random.rand(N, Q)
-k = GPy.kern.rbf(Q, 1.0, 2.0) + GPy.kern.white(Q, 0.00001)
-K = k.K(X)
-Y = np.random.multivariate_normal(np.zeros(N),K,D).T
-
-m = GPy.models.GPLVM(Y, Q)
-m.constrain_positive('(rbf|bias|white)')
-
-pb.figure()
-m.plot()
-pb.title('PCA initialisation')
-pb.figure()
-m.optimize(messages = 1)
-m.plot()
-pb.title('After optimisation')

@@ -1,51 +0,0 @@
-# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-"""
-Simple Gaussian Processes regression with an RBF kernel
-"""
-import pylab as pb
-import numpy as np
-import GPy
-
-pb.ion()
-pb.close('all')
-
-######################################
-## 1 dimensional example
-
-# sample inputs and outputs
-X = np.random.uniform(-3.,3.,(20,1))
-Y = np.sin(X)+np.random.randn(20,1)*0.05
-
-# create simple GP model
-m = GPy.models.GP_regression(X,Y)
-
-# contrain all parameters to be positive
-m.constrain_positive('')
-
-# optimize and plot
-m.optimize('tnc', max_f_eval = 1000)
-m.plot()
-print(m)
-
-######################################
-## 2 dimensional example
-
-# sample inputs and outputs
-X = np.random.uniform(-3.,3.,(40,2))
-Y = np.sin(X[:,0:1]) * np.sin(X[:,1:2])+np.random.randn(40,1)*0.05
-
-# create simple GP model
-m = GPy.models.GP_regression(X,Y)
-
-# contrain all parameters to be positive
-m.constrain_positive('')
-
-# optimize and plot
-pb.figure()
-m.optimize('tnc', max_f_eval = 1000)
-m.plot()
-print(m)

@@ -1,33 +0,0 @@
-# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-"""
-Simple one-dimensional Gaussian Processes with assorted kernel functions
-"""
-import pylab as pb
-import numpy as np
-import GPy
-
-# sample inputs and outputs
-D = 1
-X = np.random.randn(10,D)*2
-X = np.linspace(-1.5,1.5,5)[:,None]
-X = np.append(X,[[5]],0)
-Y = np.sin(np.pi*X/2) #+np.random.randn(X.shape[0],1)*0.05
-
-models = [GPy.models.GP_regression(X,Y, k) for k in (GPy.kern.rbf(D), GPy.kern.Matern52(D), GPy.kern.Matern32(D), GPy.kern.exponential(D), GPy.kern.linear(D) + GPy.kern.white(D), GPy.kern.bias(D) + GPy.kern.white(D))]
-
-pb.figure(figsize=(12,8))
-for i,m in enumerate(models):
-    m.constrain_positive('')
-    m.optimize()
-    pb.subplot(3,2,i+1)
-    m.plot()
-    #pb.title(m.kern.parts[0].name)
-
-GPy.util.plot.align_subplots(3,2,(-3,6),(-2.5,2.5))
-pb.show()

GPy/examples/__init__.py (new file, 8 lines)

@@ -0,0 +1,8 @@
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+# Please don't delete this without explaining to Neil the right way of doing this. I want to be able to run:
+# GPy.examples.regression.toy_rbf_1D() from ipython having imported GPy, and this seems to be the way to do it!
+import classification
+import regression
+import unsupervised

@@ -8,8 +8,6 @@ Simple Gaussian Processes classification
 import pylab as pb
 import numpy as np
 import GPy
-pb.ion()
-pb.close('all')

 default_seed=10000

 ######################################

@@ -27,7 +25,7 @@ def crescent_data(model_type='Full', inducing=10, seed=default_seed):
     likelihood = GPy.inference.likelihoods.probit(data['Y'])
     if model_type=='Full':
-        m = GPy.models.simple_GP_EP(data['X'],likelihood)
+        m = GPy.models.GP_EP(data['X'],likelihood)
     else:
         # create sparse GP EP model
         m = GPy.models.sparse_GP_EP(data['X'],likelihood=likelihood,inducing=inducing,ep_proxy=model_type)

@@ -49,7 +47,7 @@ def oil():
     likelihood = GPy.inference.likelihoods.probit(data['Y'][:, 0:1])

     # create simple GP model
-    m = GPy.models.simple_GP_EP(data['X'],likelihood)
+    m = GPy.models.GP_EP(data['X'],likelihood)

     # contrain all parameters to be positive
     m.constrain_positive('')

@@ -8,8 +8,6 @@ Gaussian Processes regression examples
 import pylab as pb
 import numpy as np
 import GPy
-pb.ion()
-pb.close('all')

 def toy_rbf_1d():

@@ -19,10 +17,8 @@ def toy_rbf_1d():
     # create simple GP model
     m = GPy.models.GP_regression(data['X'],data['Y'])

-    # contrain all parameters to be positive
-    m.constrain_positive('')
     # optimize
+    m.ensure_default_constraints()
     m.optimize()

     # plot

@@ -37,10 +33,8 @@ def rogers_girolami_olympics():
     # create simple GP model
     m = GPy.models.GP_regression(data['X'],data['Y'])

-    # contrain all parameters to be positive
-    m.constrain_positive('')
     # optimize
+    m.ensure_default_constraints()
     m.optimize()

     # plot

@@ -48,6 +42,10 @@ def rogers_girolami_olympics():
     print(m)
     return m

+def della_gatta_TRP63_gene_expression(number=942):
+    """Run a standard Gaussian process regression on the della Gatta et al TRP63 Gene Expression data set for a given gene number."""
+
 def toy_rbf_1d_50():
     """Run a simple demonstration of a standard Gaussian process fitting it to data sampled from an RBF covariance."""
     data = GPy.util.datasets.toy_rbf_1d_50()

@@ -55,10 +53,8 @@ def toy_rbf_1d_50():
     # create simple GP model
     m = GPy.models.GP_regression(data['X'],data['Y'])

-    # contrain all parameters to be positive
-    m.constrain_positive('')
     # optimize
+    m.ensure_default_constraints()
     m.optimize()

     # plot

@@ -73,11 +69,95 @@ def silhouette():
     # create simple GP model
     m = GPy.models.GP_regression(data['X'],data['Y'])

-    # contrain all parameters to be positive
-    m.constrain_positive('')
     # optimize
+    m.ensure_default_constraints()
     m.optimize()

     print(m)
     return m
+
+def multiple_optima(gene_number=937,resolution=80, model_restarts=10, seed=10000):
+    """Show an example of a multimodal error surface for Gaussian process regression. Gene 939 has bimodal behaviour where the noisy mode is higher."""
+
+    # Contour over a range of length scales and signal/noise ratios.
+    length_scales = np.linspace(0.1, 60., resolution)
+    log_SNRs = np.linspace(-3., 4., resolution)
+
+    data = GPy.util.datasets.della_gatta_TRP63_gene_expression(gene_number)
+    # Sub sample the data to ensure multiple optima
+    #data['Y'] = data['Y'][0::2, :]
+    #data['X'] = data['X'][0::2, :]
+
+    # Remove the mean (no bias kernel to ensure signal/noise is in RBF/white)
+    data['Y'] = data['Y'] - np.mean(data['Y'])
+
+    lls = GPy.examples.regression.contour_data(data, length_scales, log_SNRs, GPy.kern.rbf)
+    pb.contour(length_scales, log_SNRs, np.exp(lls), 20)
+    ax = pb.gca()
+    pb.xlabel('length scale')
+    pb.ylabel('log_10 SNR')
+
+    xlim = ax.get_xlim()
+    ylim = ax.get_ylim()
+
+    # Now run a few optimizations
+    models = []
+    optim_point_x = np.empty(2)
+    optim_point_y = np.empty(2)
+    np.random.seed(seed=seed)
+    for i in range(0, model_restarts):
+        kern = GPy.kern.rbf(1, variance=np.random.exponential(1.), lengthscale=np.random.exponential(50.)) + GPy.kern.white(1,variance=np.random.exponential(1.))
+
+        m = GPy.models.GP_regression(data['X'],data['Y'], kernel=kern)
+        optim_point_x[0] = m.get('rbf_lengthscale')
+        optim_point_y[0] = np.log10(m.get('rbf_variance')) - np.log10(m.get('white_variance'));
+
+        # optimize
+        m.ensure_default_constraints()
+        m.optimize(xtol=1e-6,ftol=1e-6)
+
+        optim_point_x[1] = m.get('rbf_lengthscale')
+        optim_point_y[1] = np.log10(m.get('rbf_variance')) - np.log10(m.get('white_variance'));
+
+        pb.arrow(optim_point_x[0], optim_point_y[0], optim_point_x[1]-optim_point_x[0], optim_point_y[1]-optim_point_y[0], label=str(i), head_length=1, head_width=0.5, fc='k', ec='k')
+        models.append(m)
+
+    ax.set_xlim(xlim)
+    ax.set_ylim(ylim)
+    return (models, lls)
+
+def contour_data(data, length_scales, log_SNRs, signal_kernel_call=GPy.kern.rbf):
+    """Evaluate the GP objective function for a given data set for a range of signal to noise ratios and a range of lengthscales.
+
+    :data_set: A data set from the utils.datasets directory.
+    :length_scales: a list of length scales to explore for the contour plot.
+    :log_SNRs: a list of base 10 logarithm signal to noise ratios to explore for the contour plot.
+    :signal_kernel: a kernel to use for the 'signal' portion of the data."""
+
+    lls = []
+    total_var = np.var(data['Y'])
+    for log_SNR in log_SNRs:
+        SNR = 10**log_SNR
+        length_scale_lls = []
+
+        for length_scale in length_scales:
+            noise_var = 1.
+            signal_var = SNR
+            noise_var = noise_var/(noise_var + signal_var)*total_var
+            signal_var = signal_var/(noise_var + signal_var)*total_var
+
+            signal_kernel = signal_kernel_call(1, variance=signal_var, lengthscale=length_scale)
+            noise_kernel = GPy.kern.white(1, variance=noise_var)
+            kernel = signal_kernel + noise_kernel
+            K = kernel.K(data['X'])
+            total_var = (np.dot(np.dot(data['Y'].T,GPy.util.linalg.pdinv(K)[0]), data['Y'])/data['Y'].shape[0])[0,0]
+            noise_var *= total_var
+            signal_var *= total_var
+
+            kernel = signal_kernel_call(1, variance=signal_var, lengthscale=length_scale) + GPy.kern.white(1, variance=noise_var)
+            model = GPy.models.GP_regression(data['X'], data['Y'], kernel=kernel)
+            model.constrain_positive('')
+            length_scale_lls.append(model.log_likelihood())
+
+        lls.append(length_scale_lls)
+
+    return np.array(lls)
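
With the GPy.examples package registered in __init__.py above, the new demo can be run straight from an interactive session (a sketch; smaller values keep the runtime down):

    import GPy
    models, lls = GPy.examples.regression.multiple_optima(resolution=40, model_restarts=5)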

@@ -0,0 +1,25 @@
+"""
+Unsupervised learning with Gaussian Processes.
+"""
+import pylab as pb
+import numpy as np
+import GPy
+
+######################################
+## Oil data subsampled to 100 points.
+def oil_100():
+    data = GPy.util.datasets.oil_100()
+
+    # create simple GP model
+    m = GPy.models.GPLVM(data['X'], 2)
+
+    # optimize
+    m.ensure_default_constraints()
+    m.optimize()
+
+    # plot
+    print(m)
+    return m

@@ -1,45 +0,0 @@
-# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-import numpy as np
-import scipy as sp
-import pdb, sys, pickle
-import matplotlib.pylab as plt
-import GPy
-
-np.random.seed(1)
-N = 100
-
-# sample inputs and outputs
-X = np.random.uniform(-np.pi,np.pi,(N,1))
-Y = np.sin(X)+np.random.randn(N,1)*0.05
-# Y += np.abs(Y.min()) + 0.5
-Z = np.exp(Y)# Y**(1/3.0)
-
-# rescaling targets?
-Zmax = Z.max()
-Zmin = Z.min()
-Z = (Z-Zmin)/(Zmax-Zmin) - 0.5
-
-m = GPy.models.warpedGP(X, Z, warping_terms = 2)
-m.constrain_positive('(tanh_a|tanh_b|rbf|white|bias)')
-m.randomize()
-
-plt.figure()
-plt.xlabel('predicted f(Z)')
-plt.ylabel('actual f(Z)')
-plt.plot(m.Y, Y, 'o', alpha = 0.5, label = 'before training')
-m.optimize(messages = True)
-plt.plot(m.Y, Y, 'o', alpha = 0.5, label = 'after training')
-plt.legend(loc = 0)
-m.plot_warping()
-
-plt.figure()
-plt.title('warped GP fit')
-m.plot()
-
-m1 = GPy.models.GP_regression(X, Z)
-m1.constrain_positive('(rbf|white|bias)')
-m1.randomize()
-m1.optimize(messages = True)
-
-plt.figure()
-plt.title('GP fit')
-m1.plot()

@@ -9,7 +9,7 @@ import pylab as pb
 from ..util.plot import gpplot

 class likelihood:
-    def __init__(self,Y):
+    def __init__(self,Y,location=0,scale=1):
         """
         Likelihood class for doing Expectation propagation

@@ -18,6 +18,8 @@ class likelihood:
         """
         self.Y = Y
         self.N = self.Y.shape[0]
+        self.location = location
+        self.scale = scale

     def plot1Da(self,X_new,Mean_new,Var_new,X_u,Mean_u,Var_u):
         """

@@ -99,6 +101,119 @@ class probit(likelihood):
     def predictive_mean(self,mu,variance):
         return stats.norm.cdf(mu/np.sqrt(1+variance))

-    def log_likelihood_gradients():
+    def _log_likelihood_gradients():
         raise NotImplementedError
+
+class poisson(likelihood):
+    """
+    Poisson likelihood
+
+    Y is expected to take values in {0,1,2,...}
+    -----
+    $$
+    L(x) = \exp(-\lambda) * \lambda**Y_i / Y_i!
+    $$
+    """
+    def moments_match(self,i,tau_i,v_i):
+        """
+        Moments match of the marginal approximation in EP algorithm
+
+        :param i: number of observation (int)
+        :param tau_i: precision of the cavity distribution (float)
+        :param v_i: mean/variance of the cavity distribution (float)
+        """
+        mu = v_i/tau_i
+        sigma = np.sqrt(1./tau_i)
+
+        def poisson_norm(f):
+            """
+            Product of the likelihood and the cavity distribution
+            """
+            pdf_norm_f = stats.norm.pdf(f,loc=mu,scale=sigma)
+            rate = np.exp( (f*self.scale)+self.location)
+            poisson = stats.poisson.pmf(float(self.Y[i]),rate)
+            return pdf_norm_f*poisson
+
+        def log_pnm(f):
+            """
+            Log of poisson_norm
+            """
+            return -(-.5*(f-mu)**2/sigma**2 - np.exp( (f*self.scale)+self.location) + ( (f*self.scale)+self.location)*self.Y[i])
+
+        """
+        Golden Search and Simpson's Rule
+        --------------------------------
+        Simpson's Rule is used to calculate the moments numerically, it needs a grid of points as input.
+        Golden Search is used to find the mode in the poisson_norm distribution and define around it the grid for Simpson's Rule
+        """
+        #TODO golden search & simpson's rule can be defined in the general likelihood class, rather than in each specific case.
+        #Golden search
+        golden_A = -1 if self.Y[i] == 0 else np.array([np.log(self.Y[i]),mu]).min() #Lower limit
+        golden_B = np.array([np.log(self.Y[i]),mu]).max() #Upper limit
+        golden_A = (golden_A - self.location)/self.scale
+        golden_B = (golden_B - self.location)/self.scale
+        opt = sp.optimize.golden(log_pnm,brack=(golden_A,golden_B)) #Better to work with log_pnm than with poisson_norm
+
+        # Simpson's approximation
+        width = 3./np.log(max(self.Y[i],2))
+        A = opt - width #Lower limit
+        B = opt + width #Upper limit
+        K = 10*int(np.log(max(self.Y[i],150))) #Number of points in the grid, we DON'T want K to be the same number for every case
+        h = (B-A)/K # length of the intervals
+        grid_x = np.hstack([np.linspace(opt-width,opt,K/2+1)[1:-1], np.linspace(opt,opt+width,K/2+1)]) # grid of points (X axis)
+        x = np.hstack([A,B,grid_x[range(1,K,2)],grid_x[range(2,K-1,2)]]) # grid_x rearranged, just to make Simpson's algorithm easier
+        zeroth = np.hstack([poisson_norm(A),poisson_norm(B),[4*poisson_norm(f) for f in grid_x[range(1,K,2)]],[2*poisson_norm(f) for f in grid_x[range(2,K-1,2)]]]) # grid of points (Y axis) rearranged like x
+        first = zeroth*x
+        second = first*x
+        Z_hat = sum(zeroth)*h/3 # Zero-th moment
+        mu_hat = sum(first)*h/(3*Z_hat) # First moment
+        m2 = sum(second)*h/(3*Z_hat) # Second moment
+        sigma2_hat = m2 - mu_hat**2 # Second central moment
+        return float(Z_hat), float(mu_hat), float(sigma2_hat)
+
+    def plot1Db(self,X,X_new,F_new,F2_new=None,U=None):
+        pb.subplot(212)
+        #gpplot(X_new,F_new,np.sqrt(F2_new))
+        pb.plot(X_new,F_new)#,np.sqrt(F2_new)) #FIXME
+        pb.plot(X,self.Y,'kx',mew=1.5)
+        if U is not None:
+            pb.plot(U,np.ones(U.shape[0])*self.Y.min()*.8,'r|',mew=1.5,markersize=12)
+
+    def predictive_mean(self,mu,variance):
+        return np.exp(mu*self.scale + self.location)
+
+    def predictive_variance(self,mu,variance):
+        return mu
+
+    def _log_likelihood_gradients():
+        raise NotImplementedError
+
+class gaussian(likelihood):
+    """
+    Gaussian likelihood
+
+    Y is expected to take values in (-inf,inf)
+    """
+    def moments_match(self,i,tau_i,v_i):
+        """
+        Moments match of the marginal approximation in EP algorithm
+
+        :param i: number of observation (int)
+        :param tau_i: precision of the cavity distribution (float)
+        :param v_i: mean/variance of the cavity distribution (float)
+        """
+        mu = v_i/tau_i
+        sigma = np.sqrt(1./tau_i)
+        s = 1. if self.Y[i] == 0 else 1./self.Y[i]
+        sigma2_hat = 1./(1./sigma**2 + 1./s**2)
+        mu_hat = sigma2_hat*(mu/sigma**2 + self.Y[i]/s**2)
+        Z_hat = 1./np.sqrt(2*np.pi) * 1./np.sqrt(sigma**2+s**2) * np.exp(-.5*(mu-self.Y[i])**2/(sigma**2 + s**2))
+        return Z_hat, mu_hat, sigma2_hat
+
+    def plot1Db(self,X,X_new,F_new,U=None):
+        assert X.shape[1] == 1, 'Number of dimensions must be 1'
+        gpplot(X_new,F_new,np.zeros(X_new.shape[0]))
+        pb.plot(X,self.Y,'kx',mew=1.5)
+        if U is not None:
+            pb.plot(U,np.ones(U.shape[0])*self.Y.min()*.8,'r|',mew=1.5,markersize=12)
+
+    def predictive_mean(self,mu,Sigma):
+        return mu
+
+    def _log_likelihood_gradients():
+        raise NotImplementedError
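
For the gaussian likelihood the moments are closed form because the product of two Gaussians is an unnormalised Gaussian; the poisson case has no such form, hence the golden search plus Simpson's rule above. A standalone check of the closed-form expressions against brute-force quadrature (illustrative numbers, with s = 1):

    import numpy as np
    from scipy import stats

    mu, sigma, y, s = 0.3, 0.8, 1.2, 1.0
    # closed form, as in gaussian.moments_match
    sigma2_hat = 1./(1./sigma**2 + 1./s**2)
    mu_hat = sigma2_hat*(mu/sigma**2 + y/s**2)
    Z_hat = stats.norm.pdf(y, loc=mu, scale=np.sqrt(sigma**2 + s**2))

    # brute-force quadrature of N(f|mu,sigma^2) * N(y|f,s^2)
    f = np.linspace(-10., 10., 100001)
    w = stats.norm.pdf(f, loc=mu, scale=sigma)*stats.norm.pdf(y, loc=f, scale=s)
    Z = np.trapz(w, f)
    assert np.isclose(Z, Z_hat)
    assert np.isclose(np.trapz(w*f, f)/Z, mu_hat)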

@@ -17,7 +17,7 @@ class Metropolis_Hastings:
     def __init__(self,model,cov=None):
         """Metropolis Hastings, with tunings according to Gelman et al. """
         self.model = model
-        current = self.model.extract_param()
+        current = self.model._get_params_transformed()
         self.D = current.size
         self.chains = []
         if cov is None:

@@ -32,19 +32,19 @@ class Metropolis_Hastings:
         if start is None:
             self.model.randomize()
         else:
-            self.model.expand_param(start)
+            self.model._set_params_transformed(start)

     def sample(self, Ntotal, Nburn, Nthin, tune=True, tune_throughout=False, tune_interval=400):
-        current = self.model.extract_param()
+        current = self.model._get_params_transformed()
         fcurrent = self.model.log_likelihood() + self.model.log_prior()
         accepted = np.zeros(Ntotal,dtype=np.bool)
         for it in range(Ntotal):
             print "sample %d of %d\r"%(it,Ntotal),
             sys.stdout.flush()
             prop = np.random.multivariate_normal(current, self.cov*self.scale*self.scale)
-            self.model.expand_param(prop)
+            self.model._set_params_transformed(prop)
             fprop = self.model.log_likelihood() + self.model.log_prior()

             if fprop>fcurrent:#sample accepted, going 'uphill'

@@ -73,12 +73,12 @@ class Metropolis_Hastings:
     def predict(self,function,args):
         """Make a prediction for the function, to which we will pass the additional arguments"""
-        param = self.model.get_param()
+        param = self.model._get_params()
         fs = []
         for p in self.chain:
-            self.model.set_param(p)
+            self.model._set_params(p)
             fs.append(function(*args))
-        self.model.set_param(param)# reset model to starting state
+        self.model._set_params(param)# reset model to starting state
         return fs
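
A usage sketch for the sampler (hypothetical setup; m is any model exposing the _get_params_transformed interface above, and the import path into GPy's inference code is assumed):

    mcmc = Metropolis_Hastings(m)                     # cov=None uses the default proposal covariance
    mcmc.sample(Ntotal=10000, Nburn=1000, Nthin=10)   # tuned per Gelman et al. while sampling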

@@ -23,16 +23,16 @@ class Brownian(kernpart):
         assert self.D==1, "Brownian motion in 1D only"
         self.Nparam = 1.
         self.name = 'Brownian'
-        self.set_param(np.array([variance]).flatten())
+        self._set_params(np.array([variance]).flatten())

-    def get_param(self):
+    def _get_params(self):
         return self.variance

-    def set_param(self,x):
+    def _set_params(self,x):
         assert x.shape==(1,)
         self.variance = x

-    def get_param_names(self):
+    def _get_param_names(self):
         return ['variance']

     def K(self,X,X2,target):

@@ -20,43 +20,54 @@ class Matern32(kernpart):
     :type D: int
     :param variance: the variance :math:`\sigma^2`
     :type variance: float
-    :param lengthscale: the lengthscales :math:`\ell_i`
+    :param lengthscale: the vector of lengthscales :math:`\ell_i`
-    :type lengthscale: np.ndarray of size (D,)
+    :type lengthscale: np.ndarray of size (1,) or (D,) depending on ARD
+    :param ARD: Auto Relevance Determination. If equal to "False", the kernel is isotropic (ie. one single lengthscale parameter \ell), otherwise there is one lengthscale parameter per dimension.
+    :type ARD: Boolean
     :rtype: kernel object
     """

-    def __init__(self,D,variance=1.,lengthscales=None):
+    def __init__(self,D,variance=1.,lengthscale=None,ARD=False):
         self.D = D
-        if lengthscales is not None:
-            assert lengthscales.shape==(self.D,)
-        else:
-            lengthscales = np.ones(self.D)
-        self.Nparam = self.D + 1
-        self.name = 'Mat32'
-        self.set_param(np.hstack((variance,lengthscales)))
+        self.ARD = ARD
+        if ARD == False:
+            self.Nparam = 2
+            self.name = 'Mat32'
+            if lengthscale is not None:
+                assert lengthscale.shape == (1,)
+            else:
+                lengthscale = np.ones(1)
+        else:
+            self.Nparam = self.D + 1
+            self.name = 'Mat32_ARD'
+            if lengthscale is not None:
+                assert lengthscale.shape == (self.D,)
+            else:
+                lengthscale = np.ones(self.D)
+        self._set_params(np.hstack((variance,lengthscale)))

-    def get_param(self):
+    def _get_params(self):
         """return the value of the parameters."""
-        return np.hstack((self.variance,self.lengthscales))
+        return np.hstack((self.variance,self.lengthscale))

-    def set_param(self,x):
+    def _set_params(self,x):
         """set the value of the parameters."""
-        assert x.size==(self.D+1)
+        assert x.size == self.Nparam
         self.variance = x[0]
-        self.lengthscales = x[1:]
+        self.lengthscale = x[1:]

-    def get_param_names(self):
+    def _get_param_names(self):
         """return parameter names."""
-        if self.D==1:
+        if self.Nparam == 2:
             return ['variance','lengthscale']
         else:
-            return ['variance']+['lengthscale_%i'%i for i in range(self.lengthscales.size)]
+            return ['variance']+['lengthscale_%i'%i for i in range(self.lengthscale.size)]

     def K(self,X,X2,target):
         """Compute the covariance matrix between X and X2."""
         if X2 is None: X2 = X
-        dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscales),-1))
+        dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscale),-1))
         np.add(self.variance*(1+np.sqrt(3.)*dist)*np.exp(-np.sqrt(3.)*dist), target,target)

     def Kdiag(self,X,target):

@@ -66,13 +77,20 @@ class Matern32(kernpart):
     def dK_dtheta(self,partial,X,X2,target):
         """derivative of the covariance matrix with respect to the parameters."""
         if X2 is None: X2 = X
-        dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscales),-1))
+        dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscale),-1))
         dvar = (1+np.sqrt(3.)*dist)*np.exp(-np.sqrt(3.)*dist)
         invdist = 1./np.where(dist!=0.,dist,np.inf)
-        dist2M = np.square(X[:,None,:]-X2[None,:,:])/self.lengthscales**3
+        dist2M = np.square(X[:,None,:]-X2[None,:,:])/self.lengthscale**3
-        dl = (self.variance* 3 * dist * np.exp(-np.sqrt(3.)*dist))[:,:,np.newaxis] * dist2M*invdist[:,:,np.newaxis]
+        #dl = (self.variance* 3 * dist * np.exp(-np.sqrt(3.)*dist))[:,:,np.newaxis] * dist2M*invdist[:,:,np.newaxis]
         target[0] += np.sum(dvar*partial)
-        target[1:] += (dl*partial[:,:,None]).sum(0).sum(0)
+        if self.ARD == True:
+            dl = (self.variance* 3 * dist * np.exp(-np.sqrt(3.)*dist))[:,:,np.newaxis] * dist2M*invdist[:,:,np.newaxis]
+            #dl = self.variance*dvar[:,:,None]*dist2M*invdist[:,:,None]
+            target[1:] += (dl*partial[:,:,None]).sum(0).sum(0)
+        else:
+            dl = (self.variance* 3 * dist * np.exp(-np.sqrt(3.)*dist)) * dist2M.sum(-1)*invdist
+            #dl = self.variance*dvar*dist2M.sum(-1)*invdist
+            target[1] += np.sum(dl*partial)

     def dKdiag_dtheta(self,partial,X,target):
         """derivative of the diagonal of the covariance matrix with respect to the parameters."""

@@ -81,8 +99,8 @@ class Matern32(kernpart):
     def dK_dX(self,partial,X,X2,target):
         """derivative of the covariance matrix with respect to X."""
         if X2 is None: X2 = X
-        dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscales),-1))[:,:,None]
+        dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscale),-1))[:,:,None]
-        ddist_dX = (X[:,None,:]-X2[None,:,:])/self.lengthscales**2/np.where(dist!=0.,dist,np.inf)
+        ddist_dX = (X[:,None,:]-X2[None,:,:])/self.lengthscale**2/np.where(dist!=0.,dist,np.inf)
         dK_dX = - np.transpose(3*self.variance*dist*np.exp(-np.sqrt(3)*dist)*ddist_dX,(1,0,2))
         target += np.sum(dK_dX*partial.T[:,:,None],0)

@@ -104,7 +122,7 @@ class Matern32(kernpart):
         """
         assert self.D == 1
         def L(x,i):
-            return(3./self.lengthscales**2*F[i](x) + 2*np.sqrt(3)/self.lengthscales*F1[i](x) + F2[i](x))
+            return(3./self.lengthscale**2*F[i](x) + 2*np.sqrt(3)/self.lengthscale*F1[i](x) + F2[i](x))
         n = F.shape[0]
         G = np.zeros((n,n))
         for i in range(n):

@@ -114,5 +132,5 @@ class Matern32(kernpart):
         F1lower = np.array([f(lower) for f in F1])[:,None]
         #print "OLD \n", np.dot(F1lower,F1lower.T), "\n \n"
         #return(G)
-        return(self.lengthscales**3/(12.*np.sqrt(3)*self.variance) * G + 1./self.variance*np.dot(Flower,Flower.T) + self.lengthscales**2/(3.*self.variance)*np.dot(F1lower,F1lower.T))
+        return(self.lengthscale**3/(12.*np.sqrt(3)*self.variance) * G + 1./self.variance*np.dot(Flower,Flower.T) + self.lengthscale**2/(3.*self.variance)*np.dot(F1lower,F1lower.T))
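
A usage sketch of the new ARD switch (assuming the GPy.kern.Matern32 constructor wrapper forwards these arguments, as the examples elsewhere in this commit suggest; with ARD=True the kernel carries one lengthscale per input dimension):

    import numpy as np
    import GPy

    k_iso = GPy.kern.Matern32(3)             # isotropic: parameters ['variance', 'lengthscale']
    k_ard = GPy.kern.Matern32(3, ARD=True)   # one lengthscale per input dimension
    X = np.random.randn(10, 3)
    K = k_ard.K(X)                           # 10 x 10 covariance matrix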

View file

@ -19,42 +19,53 @@ class Matern52(kernpart):
:type D: int :type D: int
:param variance: the variance :math:`\sigma^2` :param variance: the variance :math:`\sigma^2`
:type variance: float :type variance: float
:param lengthscale: the lengthscales :math:`\ell_i` :param lengthscale: the vector of lengthscale :math:`\ell_i`
:type lengthscale: np.ndarray of size (D,) :type lengthscale: np.ndarray of size (1,) or (D,) depending on ARD
:param ARD: Automatic Relevance Determination. If False, the kernel is isotropic (i.e. a single lengthscale parameter \ell); otherwise there is one lengthscale parameter per dimension.
:type ARD: Boolean
:rtype: kernel object :rtype: kernel object
""" """
def __init__(self,D,variance=1.,lengthscales=None): def __init__(self,D,variance=1.,lengthscale=None,ARD=False):
self.D = D self.D = D
if lengthscales is not None: self.ARD = ARD
assert lengthscales.shape==(self.D,) if ARD == False:
self.Nparam = 2
self.name = 'Mat52'
if lengthscale is not None:
assert lengthscale.shape == (1,)
else:
lengthscale = np.ones(1)
else: else:
lengthscales = np.ones(self.D)
self.Nparam = self.D + 1 self.Nparam = self.D + 1
self.name = 'Mat52' self.name = 'Mat52_ARD'
self.set_param(np.hstack((variance,lengthscales))) if lengthscale is not None:
assert lengthscale.shape == (self.D,)
else:
lengthscale = np.ones(self.D)
self._set_params(np.hstack((variance,lengthscale)))
def get_param(self): def _get_params(self):
"""return the value of the parameters.""" """return the value of the parameters."""
return np.hstack((self.variance,self.lengthscales)) return np.hstack((self.variance,self.lengthscale))
def set_param(self,x): def _set_params(self,x):
"""set the value of the parameters.""" """set the value of the parameters."""
assert x.size==(self.D+1) assert x.size == self.Nparam
self.variance = x[0] self.variance = x[0]
self.lengthscales = x[1:] self.lengthscale = x[1:]
def get_param_names(self): def _get_param_names(self):
"""return parameter names.""" """return parameter names."""
if self.D==1: if self.Nparam == 2:
return ['variance','lengthscale'] return ['variance','lengthscale']
else: else:
return ['variance']+['lengthscale_%i'%i for i in range(self.lengthscales.size)] return ['variance']+['lengthscale_%i'%i for i in range(self.lengthscale.size)]
def K(self,X,X2,target): def K(self,X,X2,target):
"""Compute the covariance matrix between X and X2.""" """Compute the covariance matrix between X and X2."""
if X2 is None: X2 = X if X2 is None: X2 = X
dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscales),-1)) dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscale),-1))
np.add(self.variance*(1+np.sqrt(5.)*dist+5./3*dist**2)*np.exp(-np.sqrt(5.)*dist), target,target) np.add(self.variance*(1+np.sqrt(5.)*dist+5./3*dist**2)*np.exp(-np.sqrt(5.)*dist), target,target)
def Kdiag(self,X,target): def Kdiag(self,X,target):
@@ -64,13 +75,20 @@ class Matern52(kernpart):
def dK_dtheta(self,partial,X,X2,target): def dK_dtheta(self,partial,X,X2,target):
"""derivative of the covariance matrix with respect to the parameters.""" """derivative of the covariance matrix with respect to the parameters."""
if X2 is None: X2 = X if X2 is None: X2 = X
dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscales),-1)) dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscale),-1))
invdist = 1./np.where(dist!=0.,dist,np.inf) invdist = 1./np.where(dist!=0.,dist,np.inf)
dist2M = np.square(X[:,None,:]-X2[None,:,:])/self.lengthscales**3 dist2M = np.square(X[:,None,:]-X2[None,:,:])/self.lengthscale**3
dvar = (1+np.sqrt(5.)*dist+5./3*dist**2)*np.exp(-np.sqrt(5.)*dist) dvar = (1+np.sqrt(5.)*dist+5./3*dist**2)*np.exp(-np.sqrt(5.)*dist)
dl = (self.variance * 5./3 * dist * (1 + np.sqrt(5.)*dist ) * np.exp(-np.sqrt(5.)*dist))[:,:,np.newaxis] * dist2M*invdist[:,:,np.newaxis] dl = (self.variance * 5./3 * dist * (1 + np.sqrt(5.)*dist ) * np.exp(-np.sqrt(5.)*dist))[:,:,np.newaxis] * dist2M*invdist[:,:,np.newaxis]
target[0] += np.sum(dvar*partial) target[0] += np.sum(dvar*partial)
if self.ARD:
dl = (self.variance * 5./3 * dist * (1 + np.sqrt(5.)*dist ) * np.exp(-np.sqrt(5.)*dist))[:,:,np.newaxis] * dist2M*invdist[:,:,np.newaxis]
#dl = (self.variance* 3 * dist * np.exp(-np.sqrt(3.)*dist))[:,:,np.newaxis] * dist2M*invdist[:,:,np.newaxis]
target[1:] += (dl*partial[:,:,None]).sum(0).sum(0) target[1:] += (dl*partial[:,:,None]).sum(0).sum(0)
else:
dl = (self.variance * 5./3 * dist * (1 + np.sqrt(5.)*dist ) * np.exp(-np.sqrt(5.)*dist)) * dist2M.sum(-1)*invdist
#dl = (self.variance* 3 * dist * np.exp(-np.sqrt(3.)*dist)) * dist2M.sum(-1)*invdist
target[1] += np.sum(dl*partial)
def dKdiag_dtheta(self,X,target): def dKdiag_dtheta(self,X,target):
"""derivative of the diagonal of the covariance matrix with respect to the parameters.""" """derivative of the diagonal of the covariance matrix with respect to the parameters."""
@@ -79,8 +97,8 @@ class Matern52(kernpart):
def dK_dX(self,partial,X,X2,target): def dK_dX(self,partial,X,X2,target):
"""derivative of the covariance matrix with respect to X.""" """derivative of the covariance matrix with respect to X."""
if X2 is None: X2 = X if X2 is None: X2 = X
dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscales),-1))[:,:,None] dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscale),-1))[:,:,None]
ddist_dX = (X[:,None,:]-X2[None,:,:])/self.lengthscales**2/np.where(dist!=0.,dist,np.inf) ddist_dX = (X[:,None,:]-X2[None,:,:])/self.lengthscale**2/np.where(dist!=0.,dist,np.inf)
dK_dX = - np.transpose(self.variance*5./3*dist*(1+np.sqrt(5)*dist)*np.exp(-np.sqrt(5)*dist)*ddist_dX,(1,0,2)) dK_dX = - np.transpose(self.variance*5./3*dist*(1+np.sqrt(5)*dist)*np.exp(-np.sqrt(5)*dist)*ddist_dX,(1,0,2))
target += np.sum(dK_dX*partial.T[:,:,None],0) target += np.sum(dK_dX*partial.T[:,:,None],0)
@@ -104,18 +122,18 @@ class Matern52(kernpart):
""" """
assert self.D == 1 assert self.D == 1
def L(x,i): def L(x,i):
return(5*np.sqrt(5)/self.lengthscales**3*F[i](x) + 15./self.lengthscales**2*F1[i](x)+ 3*np.sqrt(5)/self.lengthscales*F2[i](x) + F3[i](x)) return(5*np.sqrt(5)/self.lengthscale**3*F[i](x) + 15./self.lengthscale**2*F1[i](x)+ 3*np.sqrt(5)/self.lengthscale*F2[i](x) + F3[i](x))
n = F.shape[0] n = F.shape[0]
G = np.zeros((n,n)) G = np.zeros((n,n))
for i in range(n): for i in range(n):
for j in range(i,n): for j in range(i,n):
G[i,j] = G[j,i] = integrate.quad(lambda x : L(x,i)*L(x,j),lower,upper)[0] G[i,j] = G[j,i] = integrate.quad(lambda x : L(x,i)*L(x,j),lower,upper)[0]
G_coef = 3.*self.lengthscales**5/(400*np.sqrt(5)) G_coef = 3.*self.lengthscale**5/(400*np.sqrt(5))
Flower = np.array([f(lower) for f in F])[:,None] Flower = np.array([f(lower) for f in F])[:,None]
F1lower = np.array([f(lower) for f in F1])[:,None] F1lower = np.array([f(lower) for f in F1])[:,None]
F2lower = np.array([f(lower) for f in F2])[:,None] F2lower = np.array([f(lower) for f in F2])[:,None]
orig = 9./8*np.dot(Flower,Flower.T) + 9.*self.lengthscales**4/200*np.dot(F2lower,F2lower.T) orig = 9./8*np.dot(Flower,Flower.T) + 9.*self.lengthscale**4/200*np.dot(F2lower,F2lower.T)
orig2 = 3./5*self.lengthscales**2 * ( np.dot(F1lower,F1lower.T) + 1./8*np.dot(Flower,F2lower.T) + 1./8*np.dot(F2lower,Flower.T)) orig2 = 3./5*self.lengthscale**2 * ( np.dot(F1lower,F1lower.T) + 1./8*np.dot(Flower,F2lower.T) + 1./8*np.dot(F2lower,Flower.T))
return(1./self.variance* (G_coef*G + orig + orig2)) return(1./self.variance* (G_coef*G + orig + orig2))
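Analogously to the Matern 3/2 case, the operator L here is a perfect cube: with :math:`a = \sqrt{5}/\ell`, :math:`(a + d/dx)^3 f = a^3 f + 3a^2 f' + 3a f'' + f''' = 5\sqrt{5}/\ell^3\,f + 15/\ell^2\,f' + 3\sqrt{5}/\ell\,f'' + f'''`, matching the four coefficients used in L above.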
@@ -2,5 +2,5 @@
# Licensed under the BSD 3-clause license (see LICENSE.txt) # Licensed under the BSD 3-clause license (see LICENSE.txt)
from constructors import rbf, Matern32, Matern52, exponential, linear, white, bias, finite_dimensional, rbf_ARD, spline, Brownian, linear_ARD, rbf_sympy, sympykern from constructors import rbf, Matern32, Matern52, exponential, linear, white, bias, finite_dimensional, spline, Brownian, rbf_sympy, sympykern, periodic_exponential, periodic_Matern32, periodic_Matern52
from kern import kern from kern import kern
@@ -17,16 +17,16 @@ class bias(kernpart):
self.D = D self.D = D
self.Nparam = 1 self.Nparam = 1
self.name = 'bias' self.name = 'bias'
self.set_param(np.array([variance]).flatten()) self._set_params(np.array([variance]).flatten())
def get_param(self): def _get_params(self):
return self.variance return self.variance
def set_param(self,x): def _set_params(self,x):
assert x.shape==(1,) assert x.shape==(1,)
self.variance = x self.variance = x
def get_param_names(self): def _get_param_names(self):
return ['variance'] return ['variance']
def K(self,X,X2,target): def K(self,X,X2,target):
@@ -6,10 +6,8 @@ import numpy as np
from kern import kern from kern import kern
from rbf import rbf as rbfpart from rbf import rbf as rbfpart
from rbf_ARD import rbf_ARD as rbf_ARD_part
from white import white as whitepart from white import white as whitepart
from linear import linear as linearpart from linear import linear as linearpart
from linear_ARD import linear_ARD as linear_ARD_part
from exponential import exponential as exponentialpart from exponential import exponential as exponentialpart
from Matern32 import Matern32 as Matern32part from Matern32 import Matern32 as Matern32part
from Matern52 import Matern52 as Matern52part from Matern52 import Matern52 as Matern52part
@@ -17,12 +15,15 @@ from bias import bias as biaspart
from finite_dimensional import finite_dimensional as finite_dimensionalpart from finite_dimensional import finite_dimensional as finite_dimensionalpart
from spline import spline as splinepart from spline import spline as splinepart
from Brownian import Brownian as Brownianpart from Brownian import Brownian as Brownianpart
from periodic_exponential import periodic_exponential as periodic_exponentialpart
from periodic_Matern32 import periodic_Matern32 as periodic_Matern32part
from periodic_Matern52 import periodic_Matern52 as periodic_Matern52part
#TODO these constructors are not as clean as we'd like. Tidy the code up #TODO these constructors are not as clean as we'd like. Tidy the code up
#using meta-classes to make the objects construct properly without them. #using meta-classes to make the objects construct properly without them.
def rbf(D,variance=1., lengthscale=1.): def rbf(D,variance=1., lengthscale=None,ARD=False):
""" """
Construct an RBF kernel Construct an RBF kernel
@@ -32,46 +33,23 @@ def rbf(D,variance=1., lengthscale=1.):
:type variance: float :type variance: float
:param lengthscale: the lengthscale of the kernel :param lengthscale: the lengthscale of the kernel
:type lengthscale: float :type lengthscale: float
:param ARD: Automatic Relevance Determination (one lengthscale per dimension)
:type ARD: Boolean
""" """
part = rbfpart(D,variance,lengthscale) part = rbfpart(D,variance,lengthscale,ARD)
return kern(D, [part]) return kern(D, [part])
def rbf_ARD(D,variance=1., lengthscales=None): def linear(D,variances=None,ARD=True):
"""
Construct an RBF kernel with Automatic Relevance Determination (ARD)
:param D: dimensionality of the kernel, obligatory
:type D: int
:param variance: the variance of the kernel
:type variance: float
:param lengthscales: the lengthscales of the kernel
:type lengthscales: None|np.ndarray
"""
part = rbf_ARD_part(D,variance,lengthscales)
return kern(D, [part])
def linear(D,lengthscales=None):
""" """
Construct a linear kernel. Construct a linear kernel.
Arguments Arguments
--------- ---------
D (int), obligatory D (int), obligatory
lengthscales (np.ndarray) variances (np.ndarray)
ARD (boolean)
""" """
part = linearpart(D,lengthscales) part = linearpart(D,variances,ARD)
return kern(D, [part])
def linear_ARD(D,lengthscales=None):
"""
Construct a linear ARD kernel.
Arguments
---------
D (int), obligatory
lengthscales (np.ndarray)
"""
part = linear_ARD_part(D,lengthscales)
return kern(D, [part]) return kern(D, [part])
def white(D,variance=1.): def white(D,variance=1.):
@@ -86,43 +64,52 @@ def white(D,variance=1.):
part = whitepart(D,variance) part = whitepart(D,variance)
return kern(D, [part]) return kern(D, [part])
def exponential(D,variance=1., lengthscales=None): def exponential(D,variance=1., lengthscale=None, ARD=False):
""" """
Construct a exponential kernel. Construct an exponential kernel
Arguments :param D: dimensionality of the kernel, obligatory
--------- :type D: int
D (int), obligatory :param variance: the variance of the kernel
variance (float) :type variance: float
lengthscales (np.ndarray) :param lengthscale: the lengthscale of the kernel
:type lengthscale: float
:param ARD: Automatic Relevance Determination (one lengthscale per dimension)
:type ARD: Boolean
""" """
part = exponentialpart(D,variance, lengthscales) part = exponentialpart(D,variance, lengthscale, ARD)
return kern(D, [part]) return kern(D, [part])
def Matern32(D,variance=1., lengthscales=None): def Matern32(D,variance=1., lengthscale=None, ARD=False):
""" """
Construct a Matern 3/2 kernel. Construct a Matern 3/2 kernel.
Arguments :param D: dimensionality of the kernel, obligatory
--------- :type D: int
D (int), obligatory :param variance: the variance of the kernel
variance (float) :type variance: float
lengthscales (np.ndarray) :param lengthscale: the lengthscale of the kernel
:type lengthscale: float
:param ARD: Automatic Relevance Determination (one lengthscale per dimension)
:type ARD: Boolean
""" """
part = Matern32part(D,variance, lengthscales) part = Matern32part(D,variance, lengthscale, ARD)
return kern(D, [part]) return kern(D, [part])
def Matern52(D,variance=1., lengthscales=None): def Matern52(D,variance=1., lengthscale=None, ARD=False):
""" """
Construct a Matern 5/2 kernel. Construct a Matern 5/2 kernel.
Arguments :param D: dimensionality of the kernel, obligatory
--------- :type D: int
D (int), obligatory :param variance: the variance of the kernel
variance (float) :type variance: float
lengthscales (np.ndarray) :param lengthscale: the lengthscale of the kernel
:type lengthscale: float
:param ARD: Automatic Relevance Determination (one lengthscale per dimension)
:type ARD: Boolean
""" """
part = Matern52part(D,variance, lengthscales) part = Matern52part(D,variance, lengthscale, ARD)
return kern(D, [part]) return kern(D, [part])
def bias(D,variance=1.): def bias(D,variance=1.):
@@ -200,3 +187,57 @@ def sympykern(D,k):
A kernel from a symbolic sympy representation A kernel from a symbolic sympy representation
""" """
return kern(D,[spkern(D,k)]) return kern(D,[spkern(D,k)])
def periodic_exponential(D=1,variance=1., lengthscale=None, period=2*np.pi,n_freq=10,lower=0.,upper=4*np.pi):
"""
Construct a periodic exponential kernel
:param D: dimensionality, only defined for D=1
:type D: int
:param variance: the variance of the kernel
:type variance: float
:param lengthscale: the lengthscale of the kernel
:type lengthscale: float
:param period: the period
:type period: float
:param n_freq: the number of frequencies considered for the periodic subspace
:type n_freq: int
"""
part = periodic_exponentialpart(D,variance, lengthscale, period, n_freq, lower, upper)
return kern(D, [part])
def periodic_Matern32(D,variance=1., lengthscale=None, period=2*np.pi,n_freq=10,lower=0.,upper=4*np.pi):
"""
Construct a periodic Matern 3/2 kernel.
:param D: dimensionality, only defined for D=1
:type D: int
:param variance: the variance of the kernel
:type variance: float
:param lengthscale: the lengthscale of the kernel
:type lengthscale: float
:param period: the period
:type period: float
:param n_freq: the number of frequencies considered for the periodic subspace
:type n_freq: int
"""
part = periodic_Matern32part(D,variance, lengthscale, period, n_freq, lower, upper)
return kern(D, [part])
def periodic_Matern52(D,variance=1., lengthscale=None, period=2*np.pi,n_freq=10,lower=0.,upper=4*np.pi):
"""
Construct a periodic Matern 5/2 kernel.
:param D: dimensionality, only defined for D=1
:type D: int
:param variance: the variance of the kernel
:type variance: float
:param lengthscale: the lengthscale of the kernel
:type lengthscale: float
:param period: the period
:type period: float
:param n_freq: the number of frequencies considered for the periodic subspace
:type n_freq: int
"""
part = periodic_Matern52part(D,variance, lengthscale, period, n_freq, lower, upper)
return kern(D, [part])
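A minimal usage sketch of the refactored constructors (a sketch assuming this version of GPy is importable; the data and printed names are illustrative, not output from a run):

import numpy as np
import GPy

k_iso = GPy.kern.Matern52(3)            # one shared lengthscale
k_ard = GPy.kern.Matern52(3, ARD=True)  # one lengthscale per input dimension
k_per = GPy.kern.periodic_exponential(1, period=np.pi, n_freq=10)

print k_iso._get_param_names()  # e.g. ['Mat52_variance', 'Mat52_lengthscale']
print k_ard._get_param_names()  # e.g. ['Mat52_ARD_variance', 'Mat52_ARD_lengthscale_0', ...]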
@@ -19,42 +19,53 @@ class exponential(kernpart):
:type D: int :type D: int
:param variance: the variance :math:`\sigma^2` :param variance: the variance :math:`\sigma^2`
:type variance: float :type variance: float
:param lengthscale: the lengthscales :math:`\ell_i` :param lengthscale: the vector of lengthscales :math:`\ell_i`
:type lengthscale: np.ndarray of size (D,) :type lengthscale: np.ndarray of size (1,) or (D,) depending on ARD
:param ARD: Automatic Relevance Determination. If False, the kernel is isotropic (i.e. a single lengthscale parameter \ell); otherwise there is one lengthscale parameter per dimension.
:type ARD: Boolean
:rtype: kernel object :rtype: kernel object
""" """
def __init__(self,D,variance=1.,lengthscales=None): def __init__(self,D,variance=1.,lengthscale=None,ARD=False):
self.D = D self.D = D
if lengthscales is not None: self.ARD = ARD
assert lengthscales.shape==(self.D,) if ARD == False:
else: self.Nparam = 2
lengthscales = np.ones(self.D)
self.Nparam = self.D + 1
self.name = 'exp' self.name = 'exp'
self.set_param(np.hstack((variance,lengthscales))) if lengthscale is not None:
assert lengthscale.shape == (1,)
else:
lengthscale = np.ones(1)
else:
self.Nparam = self.D + 1
self.name = 'exp_ARD'
if lengthscale is not None:
assert lengthscale.shape == (self.D,)
else:
lengthscale = np.ones(self.D)
self._set_params(np.hstack((variance,lengthscale)))
def get_param(self): def _get_params(self):
"""return the value of the parameters.""" """return the value of the parameters."""
return np.hstack((self.variance,self.lengthscales)) return np.hstack((self.variance,self.lengthscale))
def set_param(self,x): def _set_params(self,x):
"""set the value of the parameters.""" """set the value of the parameters."""
assert x.size==(self.D+1) assert x.size == self.Nparam
self.variance = x[0] self.variance = x[0]
self.lengthscales = x[1:] self.lengthscale = x[1:]
def get_param_names(self): def _get_param_names(self):
"""return parameter names.""" """return parameter names."""
if self.D==1: if self.Nparam == 2:
return ['variance','lengthscale'] return ['variance','lengthscale']
else: else:
return ['variance']+['lengthscale_%i'%i for i in range(self.lengthscales.size)] return ['variance']+['lengthscale_%i'%i for i in range(self.lengthscale.size)]
def K(self,X,X2,target): def K(self,X,X2,target):
"""Compute the covariance matrix between X and X2.""" """Compute the covariance matrix between X and X2."""
if X2 is None: X2 = X if X2 is None: X2 = X
dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscales),-1)) dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscale),-1))
np.add(self.variance*np.exp(-dist), target,target) np.add(self.variance*np.exp(-dist), target,target)
def Kdiag(self,X,target): def Kdiag(self,X,target):
@@ -64,13 +75,17 @@ class exponential(kernpart):
def dK_dtheta(self,partial,X,X2,target): def dK_dtheta(self,partial,X,X2,target):
"""derivative of the covariance matrix with respect to the parameters.""" """derivative of the covariance matrix with respect to the parameters."""
if X2 is None: X2 = X if X2 is None: X2 = X
dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscales),-1)) dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscale),-1))
invdist = 1./np.where(dist!=0.,dist,np.inf) invdist = 1./np.where(dist!=0.,dist,np.inf)
dist2M = np.square(X[:,None,:]-X2[None,:,:])/self.lengthscales**3 dist2M = np.square(X[:,None,:]-X2[None,:,:])/self.lengthscale**3
dvar = np.exp(-dist) dvar = np.exp(-dist)
dl = self.variance*dvar[:,:,None]*dist2M*invdist[:,:,None]
target[0] += np.sum(dvar*partial) target[0] += np.sum(dvar*partial)
if self.ARD:
dl = self.variance*dvar[:,:,None]*dist2M*invdist[:,:,None]
target[1:] += (dl*partial[:,:,None]).sum(0).sum(0) target[1:] += (dl*partial[:,:,None]).sum(0).sum(0)
else:
dl = self.variance*dvar*dist2M.sum(-1)*invdist
target[1] += np.sum(dl*partial)
def dKdiag_dtheta(self,partial,X,target): def dKdiag_dtheta(self,partial,X,target):
"""derivative of the diagonal of the covariance matrix with respect to the parameters.""" """derivative of the diagonal of the covariance matrix with respect to the parameters."""
@@ -80,8 +95,8 @@ class exponential(kernpart):
def dK_dX(self,partial,X,X2,target): def dK_dX(self,partial,X,X2,target):
"""derivative of the covariance matrix with respect to X.""" """derivative of the covariance matrix with respect to X."""
if X2 is None: X2 = X if X2 is None: X2 = X
dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscales),-1))[:,:,None] dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscale),-1))[:,:,None]
ddist_dX = (X[:,None,:]-X2[None,:,:])/self.lengthscales**2/np.where(dist!=0.,dist,np.inf) ddist_dX = (X[:,None,:]-X2[None,:,:])/self.lengthscale**2/np.where(dist!=0.,dist,np.inf)
dK_dX = - np.transpose(self.variance*np.exp(-dist)*ddist_dX,(1,0,2)) dK_dX = - np.transpose(self.variance*np.exp(-dist)*ddist_dX,(1,0,2))
target += np.sum(dK_dX*partial.T[:,:,None],0) target += np.sum(dK_dX*partial.T[:,:,None],0)
@@ -101,14 +116,14 @@ class exponential(kernpart):
""" """
assert self.D == 1 assert self.D == 1
def L(x,i): def L(x,i):
return(1./self.lengthscales*F[i](x) + F1[i](x)) return(1./self.lengthscale*F[i](x) + F1[i](x))
n = F.shape[0] n = F.shape[0]
G = np.zeros((n,n)) G = np.zeros((n,n))
for i in range(n): for i in range(n):
for j in range(i,n): for j in range(i,n):
G[i,j] = G[j,i] = integrate.quad(lambda x : L(x,i)*L(x,j),lower,upper)[0] G[i,j] = G[j,i] = integrate.quad(lambda x : L(x,i)*L(x,j),lower,upper)[0]
Flower = np.array([f(lower) for f in F])[:,None] Flower = np.array([f(lower) for f in F])[:,None]
return(self.lengthscales/2./self.variance * G + 1./self.variance * np.dot(Flower,Flower.T)) return(self.lengthscale/2./self.variance * G + 1./self.variance * np.dot(Flower,Flower.T))
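In one dimension the covariance computed in K above reduces to the familiar :math:`\sigma^2\exp(-|x-x'|/\ell)`; a standalone numpy check of that reduction (hypothetical values, no GPy required):

import numpy as np
variance, lengthscale = 1.5, 0.7
X = np.random.randn(6, 1)
X2 = np.random.randn(4, 1)
# the general-D scaled distance used by K
dist = np.sqrt(np.sum(np.square((X[:, None, :] - X2[None, :, :]) / lengthscale), -1))
K = variance * np.exp(-dist)
# the 1-D closed form
assert np.allclose(K, variance * np.exp(-np.abs(X - X2.T) / lengthscale))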
@@ -27,15 +27,15 @@ class finite_dimensional(kernpart):
weights = np.ones(self.n) weights = np.ones(self.n)
self.Nparam = self.n + 1 self.Nparam = self.n + 1
self.name = 'finite_dim' self.name = 'finite_dim'
self.set_param(np.hstack((variance,weights))) self._set_params(np.hstack((variance,weights)))
def get_param(self): def _get_params(self):
return np.hstack((self.variance,self.weights)) return np.hstack((self.variance,self.weights))
def set_param(self,x): def _set_params(self,x):
assert x.size == (self.Nparam) assert x.size == (self.Nparam)
self.variance = x[0] self.variance = x[0]
self.weights = x[1:] self.weights = x[1:]
def get_param_names(self): def _get_param_names(self):
if self.n==1: if self.n==1:
return ['variance','weight'] return ['variance','weight']
else: else:
@@ -133,20 +133,20 @@ class kern(parameterised):
newkern.tied_indices = self.tied_indices + [self.Nparam + x for x in other.tied_indices] newkern.tied_indices = self.tied_indices + [self.Nparam + x for x in other.tied_indices]
return newkern return newkern
def get_param(self): def _get_params(self):
return np.hstack([p.get_param() for p in self.parts]) return np.hstack([p._get_params() for p in self.parts])
def set_param(self,x): def _set_params(self,x):
[p.set_param(x[s]) for p, s in zip(self.parts, self.param_slices)] [p._set_params(x[s]) for p, s in zip(self.parts, self.param_slices)]
def get_param_names(self): def _get_param_names(self):
#this is a bit nasty: we want to distinguish between parts with the same name by appending a count #this is a bit nasty: we want to distinguish between parts with the same name by appending a count
part_names = np.array([k.name for k in self.parts],dtype=np.str) part_names = np.array([k.name for k in self.parts],dtype=np.str)
counts = [np.sum(part_names==ni) for i, ni in enumerate(part_names)] counts = [np.sum(part_names==ni) for i, ni in enumerate(part_names)]
cum_counts = [np.sum(part_names[i:]==ni) for i, ni in enumerate(part_names)] cum_counts = [np.sum(part_names[i:]==ni) for i, ni in enumerate(part_names)]
names = [name+'_'+str(cum_count) if count>1 else name for name,count,cum_count in zip(part_names,counts,cum_counts)] names = [name+'_'+str(cum_count) if count>1 else name for name,count,cum_count in zip(part_names,counts,cum_counts)]
return sum([[name+'_'+n for n in k.get_param_names()] for name,k in zip(names,self.parts)],[]) return sum([[name+'_'+n for n in k._get_param_names()] for name,k in zip(names,self.parts)],[])
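A standalone sketch of the disambiguation logic above, showing the names produced when two parts share a name (note that the count suffix is computed from the tail of the list, so it runs backwards):

import numpy as np
part_names = np.array(['rbf', 'rbf', 'white'], dtype=np.str)
counts = [np.sum(part_names == ni) for ni in part_names]
cum_counts = [np.sum(part_names[i:] == ni) for i, ni in enumerate(part_names)]
names = [n + '_' + str(c) if count > 1 else n for n, count, c in zip(part_names, counts, cum_counts)]
# names == ['rbf_2', 'rbf_1', 'white']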
def K(self,X,X2=None,slices1=None,slices2=None): def K(self,X,X2=None,slices1=None,slices2=None):
assert X.shape[1]==self.D assert X.shape[1]==self.D
@@ -284,6 +284,8 @@ class kern(parameterised):
# 1. get all the psi1 statistics # 1. get all the psi1 statistics
psi1_matrices = [np.zeros((mu.shape[0], Z.shape[0])) for p in self.parts] psi1_matrices = [np.zeros((mu.shape[0], Z.shape[0])) for p in self.parts]
[p.psi1(Z[s2],mu[s1],S[s1],psi1_target[s1,s2]) for p,s1,s2,psi1_target in zip(self.parts,slices1,slices2, psi1_matrices)] [p.psi1(Z[s2],mu[s1],S[s1],psi1_target[s1,s2]) for p,s1,s2,psi1_target in zip(self.parts,slices1,slices2, psi1_matrices)]
partial1 = np.zeros_like(partial1)
# 2. get all the dpsi1/dtheta gradients # 2. get all the dpsi1/dtheta gradients
psi1_gradients = [np.zeros(self.Nparam) for p in self.parts] psi1_gradients = [np.zeros(self.Nparam) for p in self.parts]
[p.dpsi1_dtheta(partial1[s2,s1],Z[s2,i_s],mu[s1,i_s],S[s1,i_s],psi1g_target[ps]) for p,ps,s1,s2,i_s,psi1g_target in zip(self.parts, self.param_slices,slices1,slices2,self.input_slices,psi1_gradients)] [p.dpsi1_dtheta(partial1[s2,s1],Z[s2,i_s],mu[s1,i_s],S[s1,i_s],psi1g_target[ps]) for p,ps,s1,s2,i_s,psi1g_target in zip(self.parts, self.param_slices,slices1,slices2,self.input_slices,psi1_gradients)]
@@ -292,7 +294,7 @@ class kern(parameterised):
for a,b in itertools.combinations(range(len(psi1_matrices)), 2): for a,b in itertools.combinations(range(len(psi1_matrices)), 2):
gne = (psi1_gradients[a][None]*psi1_matrices[b].sum(0)[:,None]).sum(0) gne = (psi1_gradients[a][None]*psi1_matrices[b].sum(0)[:,None]).sum(0)
target += 0#(gne[None] + gne[:, None]).sum(0) target += (gne[None] + gne[:, None]).sum(0)
return target return target
def dpsi2_dZ(self,partial,Z,mu,S,slices1=None,slices2=None): def dpsi2_dZ(self,partial,Z,mu,S,slices1=None,slices2=None):
@@ -16,11 +16,11 @@ class kernpart(object):
self.Nparam = 1 self.Nparam = 1
self.name = 'unnamed' self.name = 'unnamed'
def get_param(self): def _get_params(self):
raise NotImplementedError raise NotImplementedError
def set_param(self,x): def _set_params(self,x):
raise NotImplementedError raise NotImplementedError
def get_param_names(self): def _get_param_names(self):
raise NotImplementedError raise NotImplementedError
def K(self,X,X2,target): def K(self,X,X2,target):
raise NotImplementedError raise NotImplementedError
@@ -1,63 +1,148 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt). # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt) # Licensed under the BSD 3-clause license (see LICENSE.txt)
from kernpart import kernpart from kernpart import kernpart
import numpy as np import numpy as np
class linear(kernpart): class linear(kernpart):
""" """
Linear kernel Linear kernel
.. math::
k(x,y) = \sum_{i=1}^D \sigma^2_i x_i y_i
:param D: the number of input dimensions :param D: the number of input dimensions
:type D: int :type D: int
:param variance: variance :param variances: the vector of variances :math:`\sigma^2_i`
:type variance: None|float :type variances: np.ndarray of size (1,) or (D,) depending on ARD
:param ARD: Automatic Relevance Determination. If False, the kernel is isotropic (i.e. a single variance parameter \sigma^2); otherwise there is one variance parameter per dimension.
:type ARD: Boolean
:rtype: kernel object
""" """
def __init__(self, D, variance=None): def __init__(self,D,variances=None,ARD=True):
self.D = D self.D = D
if variance is None: self.ARD = ARD
variance = 1.0 if ARD == False:
self.Nparam = 1 self.Nparam = 1
self.name = 'linear' self.name = 'linear'
self.set_param(variance) if variances is not None:
assert variances.shape == (1,)
else:
variances = np.ones(1)
self._Xcache, self._X2cache = np.empty(shape=(2,)) self._Xcache, self._X2cache = np.empty(shape=(2,))
else:
self.Nparam = self.D
self.name = 'linear_ARD'
if variances is not None:
assert variances.shape == (self.D,)
else:
variances = np.ones(self.D)
self._set_params(variances)
def get_param(self): def _get_params(self):
return self.variance return self.variances
def set_param(self,x): def _set_params(self,x):
self.variance = x assert x.size==(self.Nparam)
self.variances = x
self.variances2 = np.square(self.variances)
def get_param_names(self): def _get_param_names(self):
if self.Nparam == 1:
return ['variance'] return ['variance']
else:
return ['variance_%i'%i for i in range(self.variances.size)]
def K(self,X,X2,target): def K(self,X,X2,target):
if self.ARD:
XX = X*np.sqrt(self.variances)
XX2 = X2*np.sqrt(self.variances)
target += np.dot(XX, XX2.T)
else:
self._K_computations(X, X2) self._K_computations(X, X2)
target += self.variance * self._dot_product target += self.variances * self._dot_product
def Kdiag(self,X,target): def Kdiag(self,X,target):
np.add(target,np.sum(self.variance*np.square(X),-1),target) np.add(target,np.sum(self.variances*np.square(X),-1),target)
def dK_dtheta(self,partial,X,X2,target): def dK_dtheta(self,partial,X,X2,target):
""" if self.ARD:
Computes the derivatives wrt theta product = X[:,None,:]*X2[None,:,:]
Return shape is NxMx(Ntheta) target += (partial[:,:,None]*product).sum(0).sum(0)
""" else:
self._K_computations(X, X2) self._K_computations(X, X2)
product = self._dot_product target += np.sum(self._dot_product*partial)
# product = np.dot(X, X2.T)
target += np.sum(product*partial)
def dK_dX(self,partial,X,X2,target): def dK_dX(self,partial,X,X2,target):
target += self.variance * np.sum(partial[:,None,:]*X2.T[None,:,:],-1) target += (((X2[:, None, :] * self.variances)) * partial[:,:, None]).sum(0)
def dKdiag_dtheta(self,partial,X,target): #---------------------------------------#
target += np.sum(partial*np.square(X).sum(1)) # PSI statistics #
#---------------------------------------#
def psi0(self,Z,mu,S,target):
expected = np.square(mu) + S
target += np.sum(self.variances*expected)
def dpsi0_dtheta(self,partial,Z,mu,S,target):
expected = np.square(mu) + S
target += (partial[:, None] * (-2.*np.sum(expected,0))).sum()
def dpsi0_dmuS(self,partial, Z,mu,S,target_mu,target_S):
target_mu += partial[:, None] * (2*mu*self.variances)
target_S += partial[:, None] * self.variances
def dpsi0_dZ(self,Z,mu,S,target):
pass
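A quick Monte Carlo sanity check of psi0 above, which for a single Gaussian input is :math:`\psi_0 = E[k(x,x)] = \sum_i \sigma_i^2(\mu_i^2 + S_i)` (standalone numpy, hypothetical values):

import numpy as np
np.random.seed(0)
variances = np.array([0.5, 2.0])
mu = np.array([[0.3, -1.2]])
S = np.array([[0.4, 0.9]])
psi0 = np.sum(variances * (np.square(mu) + S))
xs = mu + np.sqrt(S) * np.random.randn(200000, 2)   # samples of x ~ N(mu, diag(S))
mc = np.mean(np.sum(variances * np.square(xs), -1)) # Monte Carlo estimate of E[k(x,x)]
assert abs(mc - psi0) < 0.05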
def psi1(self,Z,mu,S,target):
"""the variance, it does nothing"""
self.K(mu,Z,target)
def dpsi1_dtheta(self,partial,Z,mu,S,target):
"""the variance, it does nothing"""
self.dK_dtheta(partial,mu,Z,target)
def dpsi1_dmuS(self,partial,Z,mu,S,target_mu,target_S):
"""Do nothing for S, it does not affect psi1"""
target_mu += (partial.T[:,:, None]*(Z*self.variances)).sum(1)
def dpsi1_dZ(self,partial,Z,mu,S,target):
self.dK_dX(partial.T,Z,mu,target)
def psi2(self,Z,mu,S,target):
"""
returns N,M,M matrix
"""
mu2_S = np.square(mu)+S# N,Q,
ZZ = Z[:,None,:]*Z[None,:,:] # M,M,Q
psi2 = ZZ*np.square(self.variances)*mu2_S[:, None, None, :]
target += psi2.sum(-1)
def dpsi2_dtheta(self,partial,Z,mu,S,target):
mu2_S = np.square(mu)+S# N,Q,
ZZ = Z[:,None,:]*Z[None,:,:] # M,M,Q
target += (partial[:,:,:,None]*(2.*ZZ*mu2_S[:,None,None,:]*self.variances)).sum()
def dpsi2_dmuS(self,partial,Z,mu,S,target_mu,target_S):
"""Think N,M,M,Q """
mu2_S = np.sum(np.square(mu)+S,0)# Q,
ZZ = Z[:,None,:]*Z[None,:,:] # M,M,Q
tmp = ZZ*np.square(self.variances) # M,M,Q
target_mu += (partial[:,:,:,None]*tmp*2.*mu[:,None,None,:]).sum(1).sum(1)
target_S += (partial[:,:,:,None]*tmp).sum(1).sum(1)
def dpsi2_dZ(self,partial,Z,mu,S,target):
mu2_S = np.sum(np.square(mu)+S,0)# Q,
target += (partial[:,:,:,None]* (Z * mu2_S * np.square(self.variances))).sum(0).sum(0)
#---------------------------------------#
# Precomputations #
#---------------------------------------#
def _K_computations(self,X,X2): def _K_computations(self,X,X2):
# (Nicolo) changed the logic here. If X2 is None, we want to cache
# (X,X). In practice X2 should always be passed.
if X2 is None: if X2 is None:
X2 = X X2 = X
if not (np.all(X==self._Xcache) and np.all(X2==self._X2cache)): if not (np.all(X==self._Xcache) and np.all(X2==self._X2cache)):
@@ -67,58 +152,3 @@ class linear(kernpart):
else: else:
# print "Cache hit!" # print "Cache hit!"
pass # TODO: insert debug message here (logging framework) pass # TODO: insert debug message here (logging framework)
# def psi0(self,Z,mu,S,target):
# expected = np.square(mu) + S
# np.add(target,np.sum(self.variance*expected),target)
# def dpsi0_dtheta(self,Z,mu,S,target):
# expected = np.square(mu) + S
# return -2.*np.sum(expected,0)
# def dpsi0_dmuS(self,Z,mu,S,target_mu,target_S):
# np.add(target_mu,2*mu*self.variances,target_mu)
# np.add(target_S,self.variances,target_S)
# def dpsi0_dZ(self,Z,mu,S,target):
# pass
# def psi1(self,Z,mu,S,target):
# """the variance, it does nothing"""
# self.K(mu,Z,target)
# def dpsi1_dtheta(self,Z,mu,S,target):
# """the variance, it does nothing"""
# self.dK_dtheta(mu,Z,target)
# def dpsi1_dmuS(self,Z,mu,S,target_mu,target_S):
# """Do nothing for S, it does not affect psi1"""
# np.add(target_mu,Z/self.variances2,target_mu)
# def dpsi1_dZ(self,Z,mu,S,target):
# self.dK_dX(mu,Z,target)
# def psi2(self,Z,mu,S,target):
# """Think N,M,M,Q """
# mu2_S = np.square(mu)+S # N,Q,
# ZZ = Z[:,None,:]*Z[None,:,:] # M,M,Q
# psi2 = ZZ*np.square(self.variances)*mu2_S
# np.add(target, psi2.sum(-1),target) # M,M
# def dpsi2_dtheta(self,Z,mu,S,target):
# mu2_S = np.square(mu)+S # N,Q,
# ZZ = Z[:,None,:]*Z[None,:,:] # M,M,Q
# target += 2.*ZZ*mu2_S*self.variances
# def dpsi2_dmuS(self,Z,mu,S,target_mu,target_S):
# """Think N,M,M,Q """
# mu2_S = np.sum(np.square(mu)+S,0) # Q,
# ZZ = Z[:,None,:]*Z[None,:,:] # M,M,Q
# tmp = ZZ*np.square(self.variances) # M,M,Q
# np.add(target_mu, tmp*2.*mu[:,None,None,:],target_mu) # N,M,M,Q
# np.add(target_S, tmp, target_S) # N,M,M,Q
# def dpsi2_dZ(self,Z,mu,S,target):
# mu2_S = np.sum(np.square(mu)+S,0) # Q,
# target += Z[:,None,:]*np.square(self.variances)*mu2_S
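The ARD branch of K above computes :math:`k(x,y) = \sum_i \sigma^2_i x_i y_i` by rescaling each input by :math:`\sqrt{\sigma^2_i}`; a standalone check that the two forms agree (hypothetical values):

import numpy as np
np.random.seed(1)
variances = np.array([0.5, 2.0])
X = np.random.randn(5, 2)
X2 = np.random.randn(3, 2)
K1 = np.dot(X * np.sqrt(variances), (X2 * np.sqrt(variances)).T)
K2 = np.einsum('nd,md,d->nm', X, X2, variances)  # sum_i sigma2_i x_i y_i directly
assert np.allclose(K1, K2)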
@@ -1,108 +0,0 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from kernpart import kernpart
import numpy as np
class linear_ARD(kernpart):
"""
Linear ARD kernel
:param D: the number of input dimensions
:type D: int
:param variances: ARD variances
:type variances: None|np.ndarray
"""
def __init__(self,D,variances=None):
self.D = D
if variances is not None:
assert variances.shape==(self.D,)
else:
variances = np.ones(self.D)
self.Nparam = int(self.D)
self.name = 'linear'
self.set_param(variances)
def get_param(self):
return self.variances
def set_param(self,x):
assert x.size==(self.Nparam)
self.variances = x
def get_param_names(self):
if self.D==1:
return ['variance']
else:
return ['variance_%i'%i for i in range(self.variances.size)]
def K(self,X,X2,target):
XX = X*np.sqrt(self.variances)
XX2 = X2*np.sqrt(self.variances)
target += np.dot(XX, XX2.T)
def Kdiag(self,X,target):
np.add(target,np.sum(self.variances*np.square(X),-1),target)
def dK_dtheta(self,partial,X,X2,target):
product = X[:,None,:]*X2[None,:,:]
target += (partial[:,:,None]*product).sum(0).sum(0)
def dK_dX(self,partial,X,X2,target):
target += (((X2[:, None, :] * self.variances)) * partial[:,:, None]).sum(0)
def psi0(self,Z,mu,S,target):
expected = np.square(mu) + S
np.add(target,np.sum(self.variances*expected),target)
def dpsi0_dtheta(self,Z,mu,S,target):
expected = np.square(mu) + S
return -2.*np.sum(expected,0)
def dpsi0_dmuS(self,Z,mu,S,target_mu,target_S):
np.add(target_mu,2*mu*self.variances,target_mu)
np.add(target_S,self.variances,target_S)
def dpsi0_dZ(self,Z,mu,S,target):
pass
def psi1(self,Z,mu,S,target):
"""the variance, it does nothing"""
self.K(mu,Z,target)
def dpsi1_dtheta(self,Z,mu,S,target):
"""the variance, it does nothing"""
self.dK_dtheta(mu,Z,target)
def dpsi1_dmuS(self,Z,mu,S,target_mu,target_S):
"""Do nothing for S, it does not affect psi1"""
np.add(target_mu,Z/self.variances2,target_mu)
def dpsi1_dZ(self,Z,mu,S,target):
self.dK_dX(mu,Z,target)
def psi2(self,Z,mu,S,target):
"""Think N,M,M,Q """
mu2_S = np.square(mu)+S# N,Q,
ZZ = Z[:,None,:]*Z[None,:,:] # M,M,Q
psi2 = ZZ*np.square(self.variances)*mu2_S
np.add(target, psi2.sum(-1),target) # M,M
def dpsi2_dtheta(self,Z,mu,S,target):
mu2_S = np.square(mu)+S# N,Q,
ZZ = Z[:,None,:]*Z[None,:,:] # M,M,Q
target += 2.*ZZ*mu2_S*self.variances
def dpsi2_dmuS(self,Z,mu,S,target_mu,target_S):
"""Think N,M,M,Q """
mu2_S = np.sum(np.square(mu)+S,0)# Q,
ZZ = Z[:,None,:]*Z[None,:,:] # M,M,Q
tmp = ZZ*np.square(self.variances) # M,M,Q
np.add(target_mu, tmp*2.*mu[:,None,None,:],target_mu) #N,M,M,Q
np.add(target_S, tmp, target_S) #N,M,M,Q
def dpsi2_dZ(self,Z,mu,S,target):
mu2_S = np.sum(np.square(mu)+S,0)# Q,
target += Z[:,None,:]*np.square(self.variances)*mu2_S
@@ -0,0 +1,172 @@
from kernpart import kernpart
import numpy as np
from GPy.util.linalg import mdot, pdinv
class periodic_Matern32(kernpart):
"""
Kernel of the periodic subspace (up to a given frequency) of a Matern 3/2 RKHS. Only defined for D=1.
:param D: the number of input dimensions
:type D: int
:param variance: the variance of the Matern kernel
:type variance: float
:param lengthscale: the lengthscale of the Matern kernel
:type lengthscale: np.ndarray of size (D,)
:param period: the period
:type period: float
:param n_freq: the number of frequencies considered for the periodic subspace
:type n_freq: int
:rtype: kernel object
"""
def __init__(self,D=1,variance=1.,lengthscale=None,period=2*np.pi,n_freq=10,lower=0.,upper=4*np.pi):
assert D==1
self.name = 'periodic_Mat32'
self.D = D
if lengthscale is not None:
assert lengthscale.shape==(self.D,)
else:
lengthscale = np.ones(self.D)
self.lower,self.upper = lower, upper
self.Nparam = 3
self.n_freq = n_freq
self.n_basis = 2*n_freq
self._set_params(np.hstack((variance,lengthscale,period)))
def _cos(self,alpha,omega,phase):
def f(x):
return alpha*np.cos(omega*x+phase)
return f
def _cos_factorization(self,alpha,omega,phase):
r1 = np.sum(alpha*np.cos(phase),axis=1)[:,None]
r2 = np.sum(alpha*np.sin(phase),axis=1)[:,None]
r = np.sqrt(r1**2 + r2**2)
psi = np.where(r1 != 0, (np.arctan(r2/r1) + (r1<0.)*np.pi),np.arcsin(r2))
return r,omega[:,0:1], psi
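_cos_factorization collapses a sum of cosines sharing one frequency into a single cosine, :math:`\sum_j \alpha_j\cos(\omega x + \phi_j) = r\cos(\omega x + \psi)` with :math:`r\cos\psi = \sum_j \alpha_j\cos\phi_j` and :math:`r\sin\psi = \sum_j \alpha_j\sin\phi_j`; a standalone numeric check of that identity (hypothetical values):

import numpy as np
alpha = np.array([[1.0, 0.5]])
omega = np.array([[2.0, 2.0]])  # one shared frequency per row
phase = np.array([[0.3, 1.1]])
r1 = np.sum(alpha * np.cos(phase), axis=1)[:, None]
r2 = np.sum(alpha * np.sin(phase), axis=1)[:, None]
r = np.sqrt(r1**2 + r2**2)
psi = np.where(r1 != 0, np.arctan(r2 / r1) + (r1 < 0.) * np.pi, np.arcsin(r2))
x = np.linspace(0., 5., 7)
lhs = (alpha * np.cos(omega * x[:, None] + phase)).sum(1)
rhs = (r * np.cos(omega[:, 0:1] * x[None, :] + psi)).ravel()
assert np.allclose(lhs, rhs)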
def _int_computation(self,r1,omega1,phi1,r2,omega2,phi2):
Gint1 = 1./(omega1+omega2.T)*( np.sin((omega1+omega2.T)*self.upper+phi1+phi2.T) - np.sin((omega1+omega2.T)*self.lower+phi1+phi2.T)) + 1./(omega1-omega2.T)*( np.sin((omega1-omega2.T)*self.upper+phi1-phi2.T) - np.sin((omega1-omega2.T)*self.lower+phi1-phi2.T) )
Gint2 = 1./(omega1+omega2.T)*( np.sin((omega1+omega2.T)*self.upper+phi1+phi2.T) - np.sin((omega1+omega2.T)*self.lower+phi1+phi2.T)) + np.cos(phi1-phi2.T)*(self.upper-self.lower)
#Gint2[0,0] = 2.*(self.upper-self.lower)*np.cos(phi1[0,0])*np.cos(phi2[0,0])
Gint = np.dot(r1,r2.T)/2 * np.where(np.isnan(Gint1),Gint2,Gint1)
return Gint
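_int_computation evaluates :math:`\int_{lower}^{upper} r_1\cos(\omega_1 x+\phi_1)\,r_2\cos(\omega_2 x+\phi_2)\,dx` in closed form via the product-to-sum identity; the second branch handles the degenerate case :math:`\omega_1 = \omega_2`, where the difference term produces NaN. A standalone check against numerical quadrature (hypothetical scalar values):

import numpy as np
from scipy import integrate
lower, upper = 0., 4 * np.pi
w1, w2, p1, p2 = 2.0, 3.0, 0.1, 0.4
closed = 0.5 * (1. / (w1 + w2) * (np.sin((w1 + w2) * upper + p1 + p2) - np.sin((w1 + w2) * lower + p1 + p2))
                + 1. / (w1 - w2) * (np.sin((w1 - w2) * upper + p1 - p2) - np.sin((w1 - w2) * lower + p1 - p2)))
quad = integrate.quad(lambda x: np.cos(w1 * x + p1) * np.cos(w2 * x + p2), lower, upper)[0]
assert np.allclose(closed, quad)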
def _get_params(self):
"""return the value of the parameters."""
return np.hstack((self.variance,self.lengthscale,self.period))
def _set_params(self,x):
"""set the value of the parameters."""
assert x.size==3
self.variance = x[0]
self.lengthscale = x[1]
self.period = x[2]
self.a = [3./self.lengthscale**2, 2*np.sqrt(3)/self.lengthscale, 1.]
self.b = [1,self.lengthscale**2/3]
self.basis_alpha = np.ones((self.n_basis,))
self.basis_omega = np.array(sum([[i*2*np.pi/self.period]*2 for i in range(1,self.n_freq+1)],[]))
self.basis_phi = np.array(sum([[-np.pi/2, 0.] for i in range(1,self.n_freq+1)],[]))
self.G = self.Gram_matrix()
self.Gi = np.linalg.inv(self.G)
def _get_param_names(self):
"""return parameter names."""
return ['variance','lengthscale','period']
def Gram_matrix(self):
La = np.column_stack((self.a[0]*np.ones((self.n_basis,1)),self.a[1]*self.basis_omega,self.a[2]*self.basis_omega**2))
Lo = np.column_stack((self.basis_omega,self.basis_omega,self.basis_omega))
Lp = np.column_stack((self.basis_phi,self.basis_phi+np.pi/2,self.basis_phi+np.pi))
r,omega,phi = self._cos_factorization(La,Lo,Lp)
Gint = self._int_computation( r,omega,phi, r,omega,phi)
Flower = np.array(self._cos(self.basis_alpha,self.basis_omega,self.basis_phi)(self.lower))[:,None]
F1lower = np.array(self._cos(self.basis_alpha*self.basis_omega,self.basis_omega,self.basis_phi+np.pi/2)(self.lower))[:,None]
return(self.lengthscale**3/(12*np.sqrt(3)*self.variance) * Gint + 1./self.variance*np.dot(Flower,Flower.T) + self.lengthscale**2/(3.*self.variance)*np.dot(F1lower,F1lower.T))
def K(self,X,X2,target):
"""Compute the covariance matrix between X and X2."""
FX = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X)
if X2 is None:
FX2 = FX
else:
FX2 = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X2)
np.add(mdot(FX,self.Gi,FX2.T), target,target)
def Kdiag(self,X,target):
"""Compute the diagonal of the covariance matrix associated to X."""
FX = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X)
np.add(target,np.diag(mdot(FX,self.Gi,FX.T)),target)
def dK_dtheta(self,partial,X,X2,target):
"""derivative of the covariance matrix with respect to the parameters (shape is NxMxNparam)"""
if X2 is None: X2 = X
FX = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X)
FX2 = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X2)
La = np.column_stack((self.a[0]*np.ones((self.n_basis,1)),self.a[1]*self.basis_omega,self.a[2]*self.basis_omega**2))
Lo = np.column_stack((self.basis_omega,self.basis_omega,self.basis_omega))
Lp = np.column_stack((self.basis_phi,self.basis_phi+np.pi/2,self.basis_phi+np.pi))
r,omega,phi = self._cos_factorization(La,Lo,Lp)
Gint = self._int_computation( r,omega,phi, r,omega,phi)
Flower = np.array(self._cos(self.basis_alpha,self.basis_omega,self.basis_phi)(self.lower))[:,None]
F1lower = np.array(self._cos(self.basis_alpha*self.basis_omega,self.basis_omega,self.basis_phi+np.pi/2)(self.lower))[:,None]
#dK_dvar
dK_dvar = 1./self.variance*mdot(FX,self.Gi,FX2.T)
#dK_dlen
da_dlen = [-6/self.lengthscale**3,-2*np.sqrt(3)/self.lengthscale**2,0.]
db_dlen = [0.,2*self.lengthscale/3.]
dLa_dlen = np.column_stack((da_dlen[0]*np.ones((self.n_basis,1)),da_dlen[1]*self.basis_omega,da_dlen[2]*self.basis_omega**2))
r1,omega1,phi1 = self._cos_factorization(dLa_dlen,Lo,Lp)
dGint_dlen = self._int_computation(r1,omega1,phi1, r,omega,phi)
dGint_dlen = dGint_dlen + dGint_dlen.T
dG_dlen = self.lengthscale**2/(4*np.sqrt(3))*Gint + self.lengthscale**3/(12*np.sqrt(3))*dGint_dlen + db_dlen[0]*np.dot(Flower,Flower.T) + db_dlen[1]*np.dot(F1lower,F1lower.T)
dK_dlen = -mdot(FX,self.Gi,dG_dlen/self.variance,self.Gi,FX2.T)
#dK_dper
dFX_dper = self._cos(-self.basis_alpha[None,:]*self.basis_omega[None,:]/self.period*X ,self.basis_omega[None,:],self.basis_phi[None,:]+np.pi/2)(X)
dFX2_dper = self._cos(-self.basis_alpha[None,:]*self.basis_omega[None,:]/self.period*X2,self.basis_omega[None,:],self.basis_phi[None,:]+np.pi/2)(X2)
dLa_dper = np.column_stack((-self.a[0]*self.basis_omega/self.period, -self.a[1]*self.basis_omega**2/self.period, -self.a[2]*self.basis_omega**3/self.period))
dLp_dper = np.column_stack((self.basis_phi+np.pi/2,self.basis_phi+np.pi,self.basis_phi+np.pi*3/2))
r1,omega1,phi1 = self._cos_factorization(dLa_dper,Lo,dLp_dper)
IPPprim1 = self.upper*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi/2) + 1./(omega-omega1.T)*np.cos((omega-omega1.T)*self.upper+phi-phi1.T-np.pi/2))
IPPprim1 -= self.lower*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi/2) + 1./(omega-omega1.T)*np.cos((omega-omega1.T)*self.lower+phi-phi1.T-np.pi/2))
IPPprim2 = self.upper*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi/2) + self.upper*np.cos(phi-phi1.T))
IPPprim2 -= self.lower*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi/2) + self.lower*np.cos(phi-phi1.T))
#IPPprim2[0,0] = 2*(self.upper**2 - self.lower**2)*np.cos(phi[0,0])*np.cos(phi1[0,0])
IPPprim = np.where(np.isnan(IPPprim1),IPPprim2,IPPprim1)
IPPint1 = 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi) + 1./(omega-omega1.T)**2*np.cos((omega-omega1.T)*self.upper+phi-phi1.T-np.pi)
IPPint1 -= 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi) + 1./(omega-omega1.T)**2*np.cos((omega-omega1.T)*self.lower+phi-phi1.T-np.pi)
IPPint2 = 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi) + 1./2*self.upper**2*np.cos(phi-phi1.T)
IPPint2 -= 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi) + 1./2*self.lower**2*np.cos(phi-phi1.T)
#IPPint2[0,0] = (self.upper**2 - self.lower**2)*np.cos(phi[0,0])*np.cos(phi1[0,0])
IPPint = np.where(np.isnan(IPPint1),IPPint2,IPPint1)
dLa_dper2 = np.column_stack((-self.a[1]*self.basis_omega/self.period, -2*self.a[2]*self.basis_omega**2/self.period))
dLp_dper2 = np.column_stack((self.basis_phi+np.pi/2,self.basis_phi+np.pi))
r2,omega2,phi2 = self._cos_factorization(dLa_dper2,Lo[:,0:2],dLp_dper2)
dGint_dper = np.dot(r,r1.T)/2 * (IPPprim - IPPint) + self._int_computation(r2,omega2,phi2, r,omega,phi)
dGint_dper = dGint_dper + dGint_dper.T
dFlower_dper = np.array(self._cos(-self.lower*self.basis_alpha*self.basis_omega/self.period,self.basis_omega,self.basis_phi+np.pi/2)(self.lower))[:,None]
dF1lower_dper = np.array(self._cos(-self.lower*self.basis_alpha*self.basis_omega**2/self.period,self.basis_omega,self.basis_phi+np.pi)(self.lower)+self._cos(-self.basis_alpha*self.basis_omega/self.period,self.basis_omega,self.basis_phi+np.pi/2)(self.lower))[:,None]
dG_dper = 1./self.variance*(self.lengthscale**3/(12*np.sqrt(3))*dGint_dper + self.b[0]*(np.dot(dFlower_dper,Flower.T)+np.dot(Flower,dFlower_dper.T)) + self.b[1]*(np.dot(dF1lower_dper,F1lower.T)+np.dot(F1lower,dF1lower_dper.T)))
dK_dper = mdot(dFX_dper,self.Gi,FX2.T) - mdot(FX,self.Gi,dG_dper,self.Gi,FX2.T) + mdot(FX,self.Gi,dFX2_dper.T)
# np.add(target[:,:,0],dK_dvar, target[:,:,0])
target[0] += np.sum(dK_dvar*partial)
#np.add(target[:,:,1],dK_dlen, target[:,:,1])
target[1] += np.sum(dK_dlen*partial)
#np.add(target[:,:,2],dK_dper, target[:,:,2])
target[2] += np.sum(dK_dper*partial)
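Since the basis frequencies are integer multiples of :math:`2\pi/\mathrm{period}`, every basis function, and hence K itself, is invariant to shifting an input by one period. A minimal sketch of that property, assuming this commit's GPy is importable (data hypothetical):

import numpy as np
import GPy
k = GPy.kern.periodic_Matern32(1, period=np.pi, n_freq=10)
X = np.random.rand(20, 1) * 2 * np.pi
assert np.allclose(k.K(X), k.K(X + np.pi))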
@@ -0,0 +1,184 @@
from kernpart import kernpart
import numpy as np
from GPy.util.linalg import mdot, pdinv
class periodic_Matern52(kernpart):
"""
Kernel of the periodic subspace (up to a given frequency) of a Matern 5/2 RKHS. Only defined for D=1.
:param D: the number of input dimensions
:type D: int
:param variance: the variance of the Matern kernel
:type variance: float
:param lengthscale: the lengthscale of the Matern kernel
:type lengthscale: np.ndarray of size (D,)
:param period: the period
:type period: float
:param n_freq: the number of frequencies considered for the periodic subspace
:type n_freq: int
:rtype: kernel object
"""
def __init__(self,D=1,variance=1.,lengthscale=None,period=2*np.pi,n_freq=10,lower=0.,upper=4*np.pi):
assert D==1
self.name = 'periodic_Mat52'
self.D = D
if lengthscale is not None:
assert lengthscale.shape==(self.D,)
else:
lengthscale = np.ones(self.D)
self.lower,self.upper = lower, upper
self.Nparam = 3
self.n_freq = n_freq
self.n_basis = 2*n_freq
self._set_params(np.hstack((variance,lengthscale,period)))
def _cos(self,alpha,omega,phase):
def f(x):
return alpha*np.cos(omega*x+phase)
return f
def _cos_factorization(self,alpha,omega,phase):
r1 = np.sum(alpha*np.cos(phase),axis=1)[:,None]
r2 = np.sum(alpha*np.sin(phase),axis=1)[:,None]
r = np.sqrt(r1**2 + r2**2)
psi = np.where(r1 != 0, (np.arctan(r2/r1) + (r1<0.)*np.pi),np.arcsin(r2))
return r,omega[:,0:1], psi
def _int_computation(self,r1,omega1,phi1,r2,omega2,phi2):
Gint1 = 1./(omega1+omega2.T)*( np.sin((omega1+omega2.T)*self.upper+phi1+phi2.T) - np.sin((omega1+omega2.T)*self.lower+phi1+phi2.T)) + 1./(omega1-omega2.T)*( np.sin((omega1-omega2.T)*self.upper+phi1-phi2.T) - np.sin((omega1-omega2.T)*self.lower+phi1-phi2.T) )
Gint2 = 1./(omega1+omega2.T)*( np.sin((omega1+omega2.T)*self.upper+phi1+phi2.T) - np.sin((omega1+omega2.T)*self.lower+phi1+phi2.T)) + np.cos(phi1-phi2.T)*(self.upper-self.lower)
#Gint2[0,0] = 2.*(self.upper-self.lower)*np.cos(phi1[0,0])*np.cos(phi2[0,0])
Gint = np.dot(r1,r2.T)/2 * np.where(np.isnan(Gint1),Gint2,Gint1)
return Gint
def _get_params(self):
"""return the value of the parameters."""
return np.hstack((self.variance,self.lengthscale,self.period))
def _set_params(self,x):
"""set the value of the parameters."""
assert x.size==3
self.variance = x[0]
self.lengthscale = x[1]
self.period = x[2]
self.a = [5*np.sqrt(5)/self.lengthscale**3, 15./self.lengthscale**2,3*np.sqrt(5)/self.lengthscale, 1.]
self.b = [9./8, 9*self.lengthscale**4/200., 3*self.lengthscale**2/5., 3*self.lengthscale**2/(5*8.), 3*self.lengthscale**2/(5*8.)]
self.basis_alpha = np.ones((2*self.n_freq,))
self.basis_omega = np.array(sum([[i*2*np.pi/self.period]*2 for i in range(1,self.n_freq+1)],[]))
self.basis_phi = np.array(sum([[-np.pi/2, 0.] for i in range(1,self.n_freq+1)],[]))
self.G = self.Gram_matrix()
self.Gi = np.linalg.inv(self.G)
def _get_param_names(self):
"""return parameter names."""
return ['variance','lengthscale','period']
def Gram_matrix(self):
La = np.column_stack((self.a[0]*np.ones((self.n_basis,1)), self.a[1]*self.basis_omega, self.a[2]*self.basis_omega**2, self.a[3]*self.basis_omega**3))
Lo = np.column_stack((self.basis_omega, self.basis_omega, self.basis_omega, self.basis_omega))
Lp = np.column_stack((self.basis_phi, self.basis_phi+np.pi/2, self.basis_phi+np.pi, self.basis_phi+np.pi*3/2))
r,omega,phi = self._cos_factorization(La,Lo,Lp)
Gint = self._int_computation( r,omega,phi, r,omega,phi)
Flower = np.array(self._cos(self.basis_alpha,self.basis_omega,self.basis_phi)(self.lower))[:,None]
F1lower = np.array(self._cos(self.basis_alpha*self.basis_omega,self.basis_omega,self.basis_phi+np.pi/2)(self.lower))[:,None]
F2lower = np.array(self._cos(self.basis_alpha*self.basis_omega**2,self.basis_omega,self.basis_phi+np.pi)(self.lower))[:,None]
lower_terms = self.b[0]*np.dot(Flower,Flower.T) + self.b[1]*np.dot(F2lower,F2lower.T) + self.b[2]*np.dot(F1lower,F1lower.T) + self.b[3]*np.dot(F2lower,Flower.T) + self.b[4]*np.dot(Flower,F2lower.T)
return(3*self.lengthscale**5/(400*np.sqrt(5)*self.variance) * Gint + 1./self.variance*lower_terms)
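The Gram matrix returned above should be symmetric positive definite (it is inverted in _set_params); a quick sanity check, assuming periodic_Matern52 is importable from this module (parameters hypothetical):

import numpy as np
k = periodic_Matern52(D=1, variance=1., period=2 * np.pi, n_freq=5)
G = k.Gram_matrix()
assert np.allclose(G, G.T)
assert np.all(np.linalg.eigvalsh(G) > 0)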
def K(self,X,X2,target):
"""Compute the covariance matrix between X and X2."""
FX = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X)
if X2 is None:
FX2 = FX
else:
FX2 = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X2)
np.add(mdot(FX,self.Gi,FX2.T), target,target)
def Kdiag(self,X,target):
"""Compute the diagonal of the covariance matrix associated to X."""
FX = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X)
np.add(target,np.diag(mdot(FX,self.Gi,FX.T)),target)
def dK_dtheta(self,partial,X,X2,target):
"""derivative of the covariance matrix with respect to the parameters (shape is NxMxNparam)"""
if X2 is None: X2 = X
FX = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X)
FX2 = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X2)
La = np.column_stack((self.a[0]*np.ones((self.n_basis,1)), self.a[1]*self.basis_omega, self.a[2]*self.basis_omega**2, self.a[3]*self.basis_omega**3))
Lo = np.column_stack((self.basis_omega, self.basis_omega, self.basis_omega, self.basis_omega))
Lp = np.column_stack((self.basis_phi, self.basis_phi+np.pi/2, self.basis_phi+np.pi, self.basis_phi+np.pi*3/2))
r,omega,phi = self._cos_factorization(La,Lo,Lp)
Gint = self._int_computation( r,omega,phi, r,omega,phi)
Flower = np.array(self._cos(self.basis_alpha,self.basis_omega,self.basis_phi)(self.lower))[:,None]
F1lower = np.array(self._cos(self.basis_alpha*self.basis_omega,self.basis_omega,self.basis_phi+np.pi/2)(self.lower))[:,None]
F2lower = np.array(self._cos(self.basis_alpha*self.basis_omega**2,self.basis_omega,self.basis_phi+np.pi)(self.lower))[:,None]
#dK_dvar
dK_dvar = 1./self.variance*mdot(FX,self.Gi,FX2.T)
#dK_dlen
da_dlen = [-3*self.a[0]/self.lengthscale, -2*self.a[1]/self.lengthscale, -self.a[2]/self.lengthscale, 0.]
db_dlen = [0., 4*self.b[1]/self.lengthscale, 2*self.b[2]/self.lengthscale, 2*self.b[3]/self.lengthscale, 2*self.b[4]/self.lengthscale]
dLa_dlen = np.column_stack((da_dlen[0]*np.ones((self.n_basis,1)), da_dlen[1]*self.basis_omega, da_dlen[2]*self.basis_omega**2, da_dlen[3]*self.basis_omega**3))
r1,omega1,phi1 = self._cos_factorization(dLa_dlen,Lo,Lp)
dGint_dlen = self._int_computation(r1,omega1,phi1, r,omega,phi)
dGint_dlen = dGint_dlen + dGint_dlen.T
dlower_terms_dlen = db_dlen[0]*np.dot(Flower,Flower.T) + db_dlen[1]*np.dot(F2lower,F2lower.T) + db_dlen[2]*np.dot(F1lower,F1lower.T) + db_dlen[3]*np.dot(F2lower,Flower.T) + db_dlen[4]*np.dot(Flower,F2lower.T)
dG_dlen = 15*self.lengthscale**4/(400*np.sqrt(5))*Gint + 3*self.lengthscale**5/(400*np.sqrt(5))*dGint_dlen + dlower_terms_dlen
dK_dlen = -mdot(FX,self.Gi,dG_dlen/self.variance,self.Gi,FX2.T)
#dK_dper
dFX_dper = self._cos(-self.basis_alpha[None,:]*self.basis_omega[None,:]/self.period*X ,self.basis_omega[None,:],self.basis_phi[None,:]+np.pi/2)(X)
dFX2_dper = self._cos(-self.basis_alpha[None,:]*self.basis_omega[None,:]/self.period*X2,self.basis_omega[None,:],self.basis_phi[None,:]+np.pi/2)(X2)
dLa_dper = np.column_stack((-self.a[0]*self.basis_omega/self.period, -self.a[1]*self.basis_omega**2/self.period, -self.a[2]*self.basis_omega**3/self.period, -self.a[3]*self.basis_omega**4/self.period))
dLp_dper = np.column_stack((self.basis_phi+np.pi/2,self.basis_phi+np.pi,self.basis_phi+np.pi*3/2,self.basis_phi))
r1,omega1,phi1 = self._cos_factorization(dLa_dper,Lo,dLp_dper)
IPPprim1 = self.upper*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi/2) + 1./(omega-omega1.T)*np.cos((omega-omega1.T)*self.upper+phi-phi1.T-np.pi/2))
IPPprim1 -= self.lower*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi/2) + 1./(omega-omega1.T)*np.cos((omega-omega1.T)*self.lower+phi-phi1.T-np.pi/2))
IPPprim2 = self.upper*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi/2) + self.upper*np.cos(phi-phi1.T))
IPPprim2 -= self.lower*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi/2) + self.lower*np.cos(phi-phi1.T))
#IPPprim2[0,0] = 2*(self.upper**2 - self.lower**2)*np.cos(phi[0,0])*np.cos(phi1[0,0])
IPPprim = np.where(np.isnan(IPPprim1),IPPprim2,IPPprim1)
IPPint1 = 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi) + 1./(omega-omega1.T)**2*np.cos((omega-omega1.T)*self.upper+phi-phi1.T-np.pi)
IPPint1 -= 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi) + 1./(omega-omega1.T)**2*np.cos((omega-omega1.T)*self.lower+phi-phi1.T-np.pi)
IPPint2 = 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi) + 1./2*self.upper**2*np.cos(phi-phi1.T)
IPPint2 -= 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi) + 1./2*self.lower**2*np.cos(phi-phi1.T)
#IPPint2[0,0] = (self.upper**2 - self.lower**2)*np.cos(phi[0,0])*np.cos(phi1[0,0])
IPPint = np.where(np.isnan(IPPint1),IPPint2,IPPint1)
dLa_dper2 = np.column_stack((-self.a[1]*self.basis_omega/self.period, -2*self.a[2]*self.basis_omega**2/self.period, -3*self.a[3]*self.basis_omega**3/self.period))
dLp_dper2 = np.column_stack((self.basis_phi+np.pi/2, self.basis_phi+np.pi, self.basis_phi+np.pi*3/2))
r2,omega2,phi2 = self._cos_factorization(dLa_dper2,Lo[:,0:3],dLp_dper2)
dGint_dper = np.dot(r,r1.T)/2 * (IPPprim - IPPint) + self._int_computation(r2,omega2,phi2, r,omega,phi)
dGint_dper = dGint_dper + dGint_dper.T
dFlower_dper = np.array(self._cos(-self.lower*self.basis_alpha*self.basis_omega/self.period,self.basis_omega,self.basis_phi+np.pi/2)(self.lower))[:,None]
dF1lower_dper = np.array(self._cos(-self.lower*self.basis_alpha*self.basis_omega**2/self.period,self.basis_omega,self.basis_phi+np.pi)(self.lower)+self._cos(-self.basis_alpha*self.basis_omega/self.period,self.basis_omega,self.basis_phi+np.pi/2)(self.lower))[:,None]
dF2lower_dper = np.array(self._cos(-self.lower*self.basis_alpha*self.basis_omega**3/self.period,self.basis_omega,self.basis_phi+np.pi*3/2)(self.lower) + self._cos(-2*self.basis_alpha*self.basis_omega**2/self.period,self.basis_omega,self.basis_phi+np.pi)(self.lower))[:,None]
dlower_terms_dper = self.b[0] * (np.dot(dFlower_dper,Flower.T) + np.dot(Flower,dFlower_dper.T))
dlower_terms_dper += self.b[1] * (np.dot(dF2lower_dper,F2lower.T) + np.dot(F2lower,dF2lower_dper.T)) - 4*self.b[1]/self.period*np.dot(F2lower,F2lower.T)
dlower_terms_dper += self.b[2] * (np.dot(dF1lower_dper,F1lower.T) + np.dot(F1lower,dF1lower_dper.T)) - 2*self.b[2]/self.period*np.dot(F1lower,F1lower.T)
dlower_terms_dper += self.b[3] * (np.dot(dF2lower_dper,Flower.T) + np.dot(F2lower,dFlower_dper.T)) - 2*self.b[3]/self.period*np.dot(F2lower,Flower.T)
dlower_terms_dper += self.b[4] * (np.dot(dFlower_dper,F2lower.T) + np.dot(Flower,dF2lower_dper.T)) - 2*self.b[4]/self.period*np.dot(Flower,F2lower.T)
dG_dper = 1./self.variance*(3*self.lengthscale**5/(400*np.sqrt(5))*dGint_dper + 0.5*dlower_terms_dper)
dK_dper = mdot(dFX_dper,self.Gi,FX2.T) - mdot(FX,self.Gi,dG_dper,self.Gi,FX2.T) + mdot(FX,self.Gi,dFX2_dper.T)
# np.add(target[:,:,0],dK_dvar, target[:,:,0])
target[0] += np.sum(dK_dvar*partial)
#np.add(target[:,:,1],dK_dlen, target[:,:,1])
target[1] += np.sum(dK_dlen*partial)
#np.add(target[:,:,2],dK_dper, target[:,:,2])
target[2] += np.sum(dK_dper*partial)
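These period gradients are error-prone, and a finite-difference comparison is a cheap sanity check on dK_dper and its siblings. A minimal sketch (fd_check_dK_dtheta is a hypothetical helper, not part of the commit), assuming a kernpart-style object exposing _get_params/_set_params, K and dK_dtheta with the target-accumulation conventions used here:

import numpy as np

def fd_check_dK_dtheta(kern, X, eps=1e-6):
    # hypothetical helper: compare analytic and numeric gradients of sum(K)
    params = kern._get_params().copy()
    N = X.shape[0]
    partial = np.ones((N, N))               # weight every entry of K equally
    analytic = np.zeros(kern.Nparam)
    kern.dK_dtheta(partial, X, None, analytic)
    numeric = np.zeros(kern.Nparam)
    for i in range(kern.Nparam):
        for sign in (1., -1.):
            p = params.copy()
            p[i] += sign * eps
            kern._set_params(p)
            Kmat = np.zeros((N, N))
            kern.K(X, None, Kmat)           # target-style API fills Kmat in place
            numeric[i] += sign * Kmat.sum() / (2 * eps)
    kern._set_params(params)                # restore the original parameters
    return analytic, numeric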

View file

@ -0,0 +1,169 @@
from kernpart import kernpart
import numpy as np
from GPy.util.linalg import mdot, pdinv
class periodic_exponential(kernpart):
"""
Kernel of the periodic subspace (up to a given frequency) of an exponential (Matern 1/2) RKHS. Only defined for D=1.
:param D: the number of input dimensions
:type D: int
:param variance: the variance of the Matern kernel
:type variance: float
:param lengthscale: the lengthscale of the Matern kernel
:type lengthscale: np.ndarray of size (D,)
:param period: the period
:type period: float
:param n_freq: the number of frequencies considered for the periodic subspace
:type n_freq: int
:rtype: kernel object
"""
def __init__(self,D=1,variance=1.,lengthscale=None,period=2*np.pi,n_freq=10,lower=0.,upper=4*np.pi):
assert D==1
self.name = 'periodic_exp'
self.D = D
if lengthscale is not None:
assert lengthscale.shape==(self.D,)
else:
lengthscale = np.ones(self.D)
self.lower,self.upper = lower, upper
self.Nparam = 3
self.n_freq = n_freq
self.n_basis = 2*n_freq
self._set_params(np.hstack((variance,lengthscale,period)))
def _cos(self,alpha,omega,phase):
def f(x):
return alpha*np.cos(omega*x+phase)
return f
def _cos_factorization(self,alpha,omega,phase):
r1 = np.sum(alpha*np.cos(phase),axis=1)[:,None]
r2 = np.sum(alpha*np.sin(phase),axis=1)[:,None]
r = np.sqrt(r1**2 + r2**2)
psi = np.where(r1 != 0, (np.arctan(r2/r1) + (r1<0.)*np.pi),np.arcsin(r2))
return r,omega[:,0:1], psi
def _int_computation(self,r1,omega1,phi1,r2,omega2,phi2):
Gint1 = 1./(omega1+omega2.T)*( np.sin((omega1+omega2.T)*self.upper+phi1+phi2.T) - np.sin((omega1+omega2.T)*self.lower+phi1+phi2.T)) + 1./(omega1-omega2.T)*( np.sin((omega1-omega2.T)*self.upper+phi1-phi2.T) - np.sin((omega1-omega2.T)*self.lower+phi1-phi2.T) )
Gint2 = 1./(omega1+omega2.T)*( np.sin((omega1+omega2.T)*self.upper+phi1+phi2.T) - np.sin((omega1+omega2.T)*self.lower+phi1+phi2.T)) + np.cos(phi1-phi2.T)*(self.upper-self.lower)
#Gint2[0,0] = 2.*(self.upper-self.lower)*np.cos(phi1[0,0])*np.cos(phi2[0,0])
Gint = np.dot(r1,r2.T)/2 * np.where(np.isnan(Gint1),Gint2,Gint1)
return Gint
def _get_params(self):
"""return the value of the parameters."""
return np.hstack((self.variance,self.lengthscale,self.period))
def _set_params(self,x):
"""set the value of the parameters."""
assert x.size==3
self.variance = x[0]
self.lengthscale = x[1]
self.period = x[2]
self.a = [1./self.lengthscale, 1.]
self.b = [1]
self.basis_alpha = np.ones((self.n_basis,))
self.basis_omega = np.array(sum([[i*2*np.pi/self.period]*2 for i in range(1,self.n_freq+1)],[]))
self.basis_phi = np.array(sum([[-np.pi/2, 0.] for i in range(1,self.n_freq+1)],[]))
self.G = self.Gram_matrix()
self.Gi = np.linalg.inv(self.G)
def _get_param_names(self):
"""return parameter names."""
return ['variance','lengthscale','period']
def Gram_matrix(self):
La = np.column_stack((self.a[0]*np.ones((self.n_basis,1)),self.a[1]*self.basis_omega))
Lo = np.column_stack((self.basis_omega,self.basis_omega))
Lp = np.column_stack((self.basis_phi,self.basis_phi+np.pi/2))
r,omega,phi = self._cos_factorization(La,Lo,Lp)
Gint = self._int_computation( r,omega,phi, r,omega,phi)
Flower = np.array(self._cos(self.basis_alpha,self.basis_omega,self.basis_phi)(self.lower))[:,None]
return(self.lengthscale/(2*self.variance) * Gint + 1./self.variance*np.dot(Flower,Flower.T))
def K(self,X,X2,target):
"""Compute the covariance matrix between X and X2."""
FX = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X)
if X2 is None:
FX2 = FX
else:
FX2 = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X2)
np.add(mdot(FX,self.Gi,FX2.T), target,target)
def Kdiag(self,X,target):
"""Compute the diagonal of the covariance matrix associated to X."""
FX = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X)
np.add(target,np.diag(mdot(FX,self.Gi,FX.T)),target)
def dK_dtheta(self,partial,X,X2,target):
"""derivative of the covariance matrix with respect to the parameters (shape is NxMxNparam)"""
if X2 is None: X2 = X
FX = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X)
FX2 = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X2)
La = np.column_stack((self.a[0]*np.ones((self.n_basis,1)),self.a[1]*self.basis_omega))
Lo = np.column_stack((self.basis_omega,self.basis_omega))
Lp = np.column_stack((self.basis_phi,self.basis_phi+np.pi/2))
r,omega,phi = self._cos_factorization(La,Lo,Lp)
Gint = self._int_computation( r,omega,phi, r,omega,phi)
Flower = np.array(self._cos(self.basis_alpha,self.basis_omega,self.basis_phi)(self.lower))[:,None]
#dK_dvar
dK_dvar = 1./self.variance*mdot(FX,self.Gi,FX2.T)
#dK_dlen
da_dlen = [-1./self.lengthscale**2,0.]
dLa_dlen = np.column_stack((da_dlen[0]*np.ones((self.n_basis,1)),da_dlen[1]*self.basis_omega))
r1,omega1,phi1 = self._cos_factorization(dLa_dlen,Lo,Lp)
dGint_dlen = self._int_computation(r1,omega1,phi1, r,omega,phi)
dGint_dlen = dGint_dlen + dGint_dlen.T
dG_dlen = 1./2*Gint + self.lengthscale/2*dGint_dlen
dK_dlen = -mdot(FX,self.Gi,dG_dlen/self.variance,self.Gi,FX2.T)
#dK_dper
dFX_dper = self._cos(-self.basis_alpha[None,:]*self.basis_omega[None,:]/self.period*X ,self.basis_omega[None,:],self.basis_phi[None,:]+np.pi/2)(X)
dFX2_dper = self._cos(-self.basis_alpha[None,:]*self.basis_omega[None,:]/self.period*X2,self.basis_omega[None,:],self.basis_phi[None,:]+np.pi/2)(X2)
dLa_dper = np.column_stack((-self.a[0]*self.basis_omega/self.period, -self.a[1]*self.basis_omega**2/self.period))
dLp_dper = np.column_stack((self.basis_phi+np.pi/2,self.basis_phi+np.pi))
r1,omega1,phi1 = self._cos_factorization(dLa_dper,Lo,dLp_dper)
IPPprim1 = self.upper*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi/2) + 1./(omega-omega1.T)*np.cos((omega-omega1.T)*self.upper+phi-phi1.T-np.pi/2))
IPPprim1 -= self.lower*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi/2) + 1./(omega-omega1.T)*np.cos((omega-omega1.T)*self.lower+phi-phi1.T-np.pi/2))
IPPprim2 = self.upper*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi/2) + self.upper*np.cos(phi-phi1.T))
IPPprim2 -= self.lower*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi/2) + self.lower*np.cos(phi-phi1.T))
#IPPprim2[0,0] = 2*(self.upper**2 - self.lower**2)*np.cos(phi[0,0])*np.cos(phi1[0,0])
IPPprim = np.where(np.isnan(IPPprim1),IPPprim2,IPPprim1)
IPPint1 = 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi) + 1./(omega-omega1.T)**2*np.cos((omega-omega1.T)*self.upper+phi-phi1.T-np.pi)
IPPint1 -= 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi) + 1./(omega-omega1.T)**2*np.cos((omega-omega1.T)*self.lower+phi-phi1.T-np.pi)
IPPint2 = 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi) + 1./2*self.upper**2*np.cos(phi-phi1.T)
IPPint2 -= 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi) + 1./2*self.lower**2*np.cos(phi-phi1.T)
#IPPint2[0,0] = (self.upper**2 - self.lower**2)*np.cos(phi[0,0])*np.cos(phi1[0,0])
IPPint = np.where(np.isnan(IPPint1),IPPint2,IPPint1)
dLa_dper2 = np.column_stack((-self.a[1]*self.basis_omega/self.period))
dLp_dper2 = np.column_stack((self.basis_phi+np.pi/2))
r2,omega2,phi2 = dLa_dper2.T,Lo[:,0:1],dLp_dper2.T
dGint_dper = np.dot(r,r1.T)/2 * (IPPprim - IPPint) + self._int_computation(r2,omega2,phi2, r,omega,phi)
dGint_dper = dGint_dper + dGint_dper.T
dFlower_dper = np.array(self._cos(-self.lower*self.basis_alpha*self.basis_omega/self.period,self.basis_omega,self.basis_phi+np.pi/2)(self.lower))[:,None]
dG_dper = 1./self.variance*(self.lengthscale/2*dGint_dper + self.b[0]*(np.dot(dFlower_dper,Flower.T)+np.dot(Flower,dFlower_dper.T)))
dK_dper = mdot(dFX_dper,self.Gi,FX2.T) - mdot(FX,self.Gi,dG_dper,self.Gi,FX2.T) + mdot(FX,self.Gi,dFX2_dper.T)
# np.add(target[:,:,0],dK_dvar, target[:,:,0])
target[0] += np.sum(dK_dvar*partial)
#np.add(target[:,:,1],dK_dlen, target[:,:,1])
target[1] += np.sum(dK_dlen*partial)
#np.add(target[:,:,2],dK_dper, target[:,:,2])
target[2] += np.sum(dK_dper*partial)
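For orientation, a small usage sketch of the new kernpart (not part of the commit; parameter values are arbitrary):

import numpy as np
k = periodic_exponential(D=1, variance=1., period=2*np.pi, n_freq=10)
X = np.linspace(0., 4*np.pi, 50)[:, None]
K = np.zeros((50, 50))
k.K(X, None, K)                        # target-style API: K is accumulated in place
print K.shape, np.allclose(K, K.T)     # (50, 50) True -- symmetric, as expected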

View file

@ -8,46 +8,68 @@ import hashlib
 class rbf(kernpart):
     """
-    Radial Basis Function kernel, aka squared-exponential, exponentiated quadratic or Gaussian kernel.
+    Radial Basis Function kernel, aka squared-exponential, exponentiated quadratic or Gaussian kernel:

     .. math::

-       k(r) = \sigma^2 \exp(- \frac{r^2}{2\ell}) \qquad \qquad \\text{ where } r = \sqrt{\frac{\sum_{i=1}^d (x_i-x^\prime_i)^2}{\ell^2}}
+       k(r) = \sigma^2 \exp(- \frac{1}{2}r^2) \qquad \qquad \\text{ where } r^2 = \sum_{i=1}^d \frac{(x_i-x^\prime_i)^2}{\ell_i^2}

-    where \ell is the lengthscale, \sigma^2 the variance and d the dimensionality of the input.
+    where \ell_i is the lengthscale, \sigma^2 the variance and d the dimensionality of the input.

     :param D: the number of input dimensions
     :type D: int
     :param variance: the variance of the kernel
     :type variance: float
-    :param lengthscale: the lengthscale of the kernel
-    :type lengthscale: float
+    :param lengthscale: the vector of lengthscales of the kernel
+    :type lengthscale: np.ndarray of size (1,) or (D,) depending on ARD
+    :param ARD: Auto Relevance Determination. If equal to "False", the kernel is isotropic (ie. one single lengthscale parameter \ell), otherwise there is one lengthscale parameter per dimension.
+    :type ARD: Boolean
+    :rtype: kernel object

     .. Note: for rbf with different lengthscale on each dimension, see rbf_ARD
     """
-    def __init__(self,D,variance=1.,lengthscale=1.):
+    def __init__(self,D,variance=1.,lengthscale=None,ARD=False):
         self.D = D
-        self.Nparam = 2
-        self.name = 'rbf'
-        self.set_param(np.hstack((variance,lengthscale)))
+        self.ARD = ARD
+        if not ARD:
+            self.Nparam = 2
+            self.name = 'rbf'
+            if lengthscale is not None:
+                assert lengthscale.shape == (1,)
+            else:
+                lengthscale = np.ones(1)
+        else:
+            self.Nparam = self.D + 1
+            self.name = 'rbf_ARD'
+            if lengthscale is not None:
+                assert lengthscale.shape == (self.D,)
+            else:
+                lengthscale = np.ones(self.D)
+        self._set_params(np.hstack((variance,lengthscale)))
         #initialize cache
         self._Z, self._mu, self._S = np.empty(shape=(3,1))
         self._X, self._X2, self._params = np.empty(shape=(3,1))
-    def get_param(self):
+    def _get_params(self):
         return np.hstack((self.variance,self.lengthscale))
-    def set_param(self,x):
-        self.variance, self.lengthscale = x
+    def _set_params(self,x):
+        assert x.size==(self.Nparam)
+        self.variance = x[0]
+        self.lengthscale = x[1:]
         self.lengthscale2 = np.square(self.lengthscale)
         #reset cached results
         self._X, self._X2, self._params = np.empty(shape=(3,1))
         self._Z, self._mu, self._S = np.empty(shape=(3,1)) # cached versions of Z,mu,S
-    def get_param_names(self):
-        return ['variance','lengthscale']
+    def _get_param_names(self):
+        if self.Nparam == 2:
+            return ['variance','lengthscale']
+        else:
+            return ['variance']+['lengthscale_%i'%i for i in range(self.lengthscale.size)]
     def K(self,X,X2,target):
         if X2 is None:
@ -61,7 +83,12 @@ class rbf(kernpart):
     def dK_dtheta(self,partial,X,X2,target):
         self._K_computations(X,X2)
         target[0] += np.sum(self._K_dvar*partial)
-        target[1] += np.sum(self._K_dvar*self.variance*self._K_dist2/self.lengthscale*partial)
+        if self.ARD == True:
+            dl = self._K_dvar[:,:,None]*self.variance*self._K_dist2/self.lengthscale
+            target[1:] += (dl*partial[:,:,None]).sum(0).sum(0)
+        else:
+            target[1] += np.sum(self._K_dvar*self.variance*(self._K_dist2.sum(-1))/self.lengthscale*partial)
+            #np.sum(self._K_dvar*self.variance*self._K_dist2/self.lengthscale*partial)
     def dKdiag_dtheta(self,partial,X,target):
         #NB: derivative of diagonal elements wrt lengthscale is 0
@ -76,20 +103,10 @@ class rbf(kernpart):
     def dKdiag_dX(self,partial,X,target):
         pass
-    def _K_computations(self,X,X2):
-        if not (np.all(X==self._X) and np.all(X2==self._X2)):
-            self._X = X
-            self._X2 = X2
-            if X2 is None: X2 = X
-            XXT = np.dot(X,X2.T)
-            if X is X2:
-                self._K_dist2 = (-2.*XXT + np.diag(XXT)[:,np.newaxis] + np.diag(XXT)[np.newaxis,:])/self.lengthscale2
-            else:
-                self._K_dist2 = (-2.*XXT + np.sum(np.square(X),1)[:,None] + np.sum(np.square(X2),1)[None,:])/self.lengthscale2
-            # TODO Remove comments if this is fine.
-            # Commented out by Neil as doesn't seem to be used elsewhere.
-            #self._K_exponent = -0.5*self._K_dist2
-            self._K_dvar = np.exp(-0.5*self._K_dist2)
+    #---------------------------------------#
+    #             PSI statistics            #
+    #---------------------------------------#
     def psi0(self,Z,mu,S,target):
         target += self.variance
@ -109,7 +126,11 @@ class rbf(kernpart):
         denom_deriv = S[:,None,:]/(self.lengthscale**3+self.lengthscale*S[:,None,:])
         d_length = self._psi1[:,:,None]*(self.lengthscale*np.square(self._psi1_dist/(self.lengthscale2+S[:,None,:])) + denom_deriv)
         target[0] += np.sum(partial*self._psi1/self.variance)
-        target[1] += np.sum(d_length*partial[:,:,None])
+        dpsi1_dlength = d_length*partial[:,:,None]
+        if not self.ARD:
+            target[1] += dpsi1_dlength.sum()
+        else:
+            target[1:] += dpsi1_dlength.sum(0).sum(0)
     def dpsi1_dZ(self,partial,Z,mu,S,target):
         self._psi_computations(Z,mu,S)
@ -125,30 +146,52 @@ class rbf(kernpart):
     def psi2(self,Z,mu,S,target):
         self._psi_computations(Z,mu,S)
-        target += self._psi2.sum(0) #TODO: psi2 should be NxMxM (for het. noise)
+        target += self._psi2
     def dpsi2_dtheta(self,partial,Z,mu,S,target):
         """Shape N,M,M,Ntheta"""
         self._psi_computations(Z,mu,S)
-        d_var = np.sum(2.*self._psi2/self.variance,0)
+        d_var = 2.*self._psi2/self.variance
         d_length = self._psi2[:,:,:,None]*(0.5*self._psi2_Zdist_sq*self._psi2_denom + 2.*self._psi2_mudist_sq + 2.*S[:,None,None,:]/self.lengthscale2)/(self.lengthscale*self._psi2_denom)
         d_length = d_length.sum(0)
         target[0] += np.sum(partial*d_var)
-        target[1] += np.sum(d_length*partial[:,:,None])
+        dpsi2_dlength = d_length*partial[:,:,:,None]
+        if not self.ARD:
+            target[1] += dpsi2_dlength.sum()
+        else:
+            target[1:] += dpsi2_dlength.sum(0).sum(0).sum(0)
     def dpsi2_dZ(self,partial,Z,mu,S,target):
         self._psi_computations(Z,mu,S)
         term1 = 0.5*self._psi2_Zdist/self.lengthscale2 # M, M, Q
         term2 = self._psi2_mudist/self._psi2_denom/self.lengthscale2 # N, M, M, Q
         dZ = self._psi2[:,:,:,None] * (term1[None] + term2)
-        target += (partial[None,:,:,None]*dZ).sum(0).sum(0)
+        target += (partial[:,:,:,None]*dZ).sum(0).sum(0) # <-- TODO not sure about the first ':' here, should be a None (WAS a None in the debug branch)
     def dpsi2_dmuS(self,partial,Z,mu,S,target_mu,target_S):
         """Think N,M,M,Q """
         self._psi_computations(Z,mu,S)
         tmp = self._psi2[:,:,:,None]/self.lengthscale2/self._psi2_denom
-        target_mu += (partial[None,:,:,None]*-tmp*2.*self._psi2_mudist).sum(1).sum(1)
-        target_S += (partial[None,:,:,None]*tmp*(2.*self._psi2_mudist_sq-1)).sum(1).sum(1)
+        target_mu += (partial[:,:,:,None]*-tmp*2.*self._psi2_mudist).sum(1).sum(1)
+        target_S += (partial[:,:,:,None]*tmp*(2.*self._psi2_mudist_sq-1)).sum(1).sum(1)
+    #---------------------------------------#
+    #            Precomputations            #
+    #---------------------------------------#
+    def _K_computations(self,X,X2):
+        if not (np.all(X==self._X) and np.all(X2==self._X2)):
+            self._X = X
+            self._X2 = X2
+            if X2 is None: X2 = X
+            self._K_dist = X[:,None,:]-X2[None,:,:] # this can be computationally heavy
+            self._params = np.empty(shape=(1,0)) #ensure the next section gets called
+        if not np.all(self._params == self._get_params()):
+            self._params = self._get_params()
+            self._K_dist2 = np.square(self._K_dist/self.lengthscale)
+            self._K_dvar = np.exp(-0.5*self._K_dist2.sum(-1))
     def _psi_computations(self,Z,mu,S):
         #here are the "statistics" for psi1 and psi2
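A sketch of how the unified constructor behaves in the two modes (assumes the rbf kernpart above is importable; values are arbitrary):

import numpy as np
iso = rbf(2)                                 # one shared lengthscale, Nparam == 2
ard = rbf(2, ARD=True)                       # one lengthscale per input dimension
print iso._get_param_names()                 # ['variance', 'lengthscale']
print ard._get_param_names()                 # ['variance', 'lengthscale_0', 'lengthscale_1']
ard._set_params(np.array([1.5, 0.5, 2.0]))   # sigma^2, ell_0, ell_1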

View file

@ -1,253 +0,0 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from kernpart import kernpart
import numpy as np
import hashlib
class rbf_ARD(kernpart):
def __init__(self,D,variance=1.,lengthscales=None):
"""
Arguments
----------
D: int - the number of input dimensions
variance: float
lengthscales : np.ndarray of shape (D,)
"""
self.D = D
if lengthscales is not None:
assert lengthscales.shape==(self.D,)
else:
lengthscales = np.ones(self.D)
self.Nparam = self.D + 1
self.name = 'rbf_ARD'
self.set_param(np.hstack((variance,lengthscales)))
#initialize cache
self._Z, self._mu, self._S = np.empty(shape=(3,1))
self._X, self._X2, self._params = np.empty(shape=(3,1))
def get_param(self):
return np.hstack((self.variance,self.lengthscales))
def set_param(self,x):
assert x.size==(self.D+1)
self.variance = x[0]
self.lengthscales = x[1:]
self.lengthscales2 = np.square(self.lengthscales)
#reset cached results
self._Z, self._mu, self._S = np.empty(shape=(3,1)) # cached versions of Z,mu,S
def get_param_names(self):
if self.D==1:
return ['variance','lengthscale']
else:
return ['variance']+['lengthscale_%i'%i for i in range(self.lengthscales.size)]
def K(self,X,X2,target):
self._K_computations(X,X2)
np.add(self.variance*self._K_dvar, target,target)
def Kdiag(self,X,target):
np.add(target,self.variance,target)
def dK_dtheta(self,partial,X,X2,target):
self._K_computations(X,X2)
dl = self._K_dvar[:,:,None]*self.variance*self._K_dist2/self.lengthscales
target[0] += np.sum(self._K_dvar*partial)
target[1:] += (dl*partial[:,:,None]).sum(0).sum(0)
def dKdiag_dtheta(self,partial,X,target):
target[0] += np.sum(partial)
def dK_dX(self,partial,X,X2,target):
self._K_computations(X,X2)
dZ = self.variance*self._K_dvar[:,:,None]*self._K_dist/self.lengthscales2
dK_dX = -dZ.transpose(1,0,2)
target += np.sum(dK_dX*partial.T[:,:,None],0)
def dKdiag_dX(self,partial,X,target):
pass
def psi0(self,Z,mu,S,target):
target += self.variance
def dpsi0_dtheta(self,partial,Z,mu,S,target):
target[0] += np.sum(partial)
def dpsi0_dmuS(self,partial,Z,mu,S,target_mu,target_S):
pass
def psi1(self,Z,mu,S,target):
self._psi_computations(Z,mu,S)
np.add(target, self._psi1,target)
def dpsi1_dtheta(self,partial,Z,mu,S,target):
self._psi_computations(Z,mu,S)
denom_deriv = S[:,None,:]/(self.lengthscales**3+self.lengthscales*S[:,None,:])
d_length = self._psi1[:,:,None]*(self.lengthscales*np.square(self._psi1_dist/(self.lengthscales2+S[:,None,:])) + denom_deriv)
target[0] += np.sum(partial*self._psi1/self.variance)
target[1:] += (d_length*partial[:,:,None]).sum(0).sum(0)
def dpsi1_dZ(self,partial,Z,mu,S,target):
self._psi_computations(Z,mu,S)
# np.add(target,-self._psi1[:,:,None]*self._psi1_dist/self.lengthscales2/self._psi1_denom,target)
denominator = (self.lengthscales2*(self._psi1_denom))
dpsi1_dZ = - self._psi1[:,:,None] * ((self._psi1_dist/denominator))
target += np.sum(partial.T[:,:,None] * dpsi1_dZ, 0)
def dpsi1_dmuS(self,partial,Z,mu,S,target_mu,target_S):
"""return shapes are N,M,Q"""
self._psi_computations(Z,mu,S)
tmp = self._psi1[:,:,None]/self.lengthscales2/self._psi1_denom
target_mu += np.sum(partial.T[:, :, None]*tmp*self._psi1_dist,1)
target_S += np.sum(partial.T[:, :, None]*0.5*tmp*(self._psi1_dist_sq-1),1)
def psi2(self,Z,mu,S,target):
self._psi_computations(Z,mu,S)
target += self._psi2
def dpsi2_dtheta(self,partial,Z,mu,S,target):
"""Shape N,M,M,Ntheta"""
self._psi_computations(Z,mu,S)
d_var = 2.*self._psi2/self.variance
d_length = self._psi2[:,:,:,None]*(0.5*self._psi2_Zdist_sq*self._psi2_denom + 2.*self._psi2_mudist_sq + 2.*S[:,None,None,:]/self.lengthscales2)/(self.lengthscales*self._psi2_denom)
# d_length = d_length.sum(0)
target[0] += np.sum(partial*d_var)
target[1:] += (d_length*partial[:,:,:,None]).sum(0).sum(0).sum(0)
def dpsi2_dZ(self,partial,Z,mu,S,target):
"""Returns shape N,M,M,Q"""
self._psi_computations(Z,mu,S)
term1 = 0.5*self._psi2_Zdist/self.lengthscales2 # M, M, Q
term2 = self._psi2_mudist/self._psi2_denom/self.lengthscales2 # N, M, M, Q
dZ = self._psi2[:,:,:,None] * (term1[None] + term2)
target += (partial[:,:,:,None]*dZ).sum(0).sum(0)
def dpsi2_dmuS(self,partial,Z,mu,S,target_mu,target_S):
"""Think N,M,M,Q """
self._psi_computations(Z,mu,S)
tmp = self._psi2[:,:,:,None]/self.lengthscales2/self._psi2_denom
target_mu += (partial[:,:,:,None]*-tmp*2.*self._psi2_mudist).sum(1).sum(1)
target_S += (partial[:,:,:,None]*tmp*(2.*self._psi2_mudist_sq-1)).sum(1).sum(1)
def _K_computations(self,X,X2):
if not (np.all(X==self._X) and np.all(X2==self._X2)):
self._X = X
self._X2 = X2
if X2 is None: X2 = X
self._K_dist = X[:,None,:]-X2[None,:,:] # this can be computationally heavy
self._params = np.empty(shape=(1,0))#ensure the next section gets called
if not np.all(self._params == self.get_param()):
self._params = self.get_param()
self._K_dist2 = np.square(self._K_dist/self.lengthscales)
self._K_exponent = -0.5*self._K_dist2.sum(-1)
self._K_dvar = np.exp(-0.5*self._K_dist2.sum(-1))
def _psi_computations(self,Z,mu,S):
#here are the "statistics" for psi1 and psi2
if not np.all(Z==self._Z):
#Z has changed, compute Z specific stuff
self._psi2_Zhat = 0.5*(Z[:,None,:] +Z[None,:,:]) # M,M,Q
self._psi2_Zdist = Z[:,None,:]-Z[None,:,:] # M,M,Q
self._psi2_Zdist_sq = np.square(self._psi2_Zdist)/self.lengthscales2 # M,M,Q
self._Z = Z
if not (np.all(Z==self._Z) and np.all(mu==self._mu) and np.all(S==self._S)):
#something's changed. recompute EVERYTHING
#psi1
self._psi1_denom = S[:,None,:]/self.lengthscales2 + 1.
self._psi1_dist = Z[None,:,:]-mu[:,None,:]
self._psi1_dist_sq = np.square(self._psi1_dist)/self.lengthscales2/self._psi1_denom
self._psi1_exponent = -0.5*np.sum(self._psi1_dist_sq+np.log(self._psi1_denom),-1)
self._psi1 = self.variance*np.exp(self._psi1_exponent)
#psi2
self._psi2_denom = 2.*S[:,None,None,:]/self.lengthscales2+1. # N,M,M,Q
self._psi2_mudist = mu[:,None,None,:]-self._psi2_Zhat #N,M,M,Q
self._psi2_mudist_sq = np.square(self._psi2_mudist)/(self.lengthscales2*self._psi2_denom)
self._psi2_exponent = np.sum(-self._psi2_Zdist_sq/4. -self._psi2_mudist_sq -0.5*np.log(self._psi2_denom),-1) #N,M,M
self._psi2 = np.square(self.variance)*np.exp(self._psi2_exponent) # N,M,M
self._Z, self._mu, self._S = Z, mu,S
if __name__=='__main__':
#run some simple tests on the kernel (TODO:move these to unititest)
#TODO: these are broken in this new structure!
N = 10
M = 5
Q = 3
Z = np.random.randn(M,Q)
mu = np.random.randn(N,Q)
S = np.random.rand(N,Q)
var = 2.5
lengthscales = np.ones(Q)*0.7
k = rbf(Q,var,lengthscales)
from checkgrad import checkgrad
def k_theta_test(param,k):
k.set_param(param)
K = k.K(Z)
dK_dtheta = k.dK_dtheta(Z)
f = np.sum(K)
df = dK_dtheta.sum(0).sum(0)
return f,np.array(df)
print "dk_dtheta_test"
checkgrad(k_theta_test,np.random.randn(1+Q),args=(k,))
def psi1_mu_test(mu,k):
mu = mu.reshape(N,Q)
f = np.sum(k.psi1(Z,mu,S))
df = k.dpsi1_dmuS(Z,mu,S)[0].sum(1)
return f,df.flatten()
print "psi1_mu_test"
checkgrad(psi1_mu_test,np.random.randn(N*Q),args=(k,))
def psi1_S_test(S,k):
S = S.reshape(N,Q)
f = np.sum(k.psi1(Z,mu,S))
df = k.dpsi1_dmuS(Z,mu,S)[1].sum(1)
return f,df.flatten()
print "psi1_S_test"
checkgrad(psi1_S_test,np.random.rand(N*Q),args=(k,))
def psi1_theta_test(theta,k):
k.set_param(theta)
f = np.sum(k.psi1(Z,mu,S))
df = np.array([np.sum(grad) for grad in k.dpsi1_dtheta(Z,mu,S)])
return f,df
print "psi1_theta_test"
checkgrad(psi1_theta_test,np.random.rand(1+Q),args=(k,))
def psi2_mu_test(mu,k):
mu = mu.reshape(N,Q)
f = np.sum(k.psi2(Z,mu,S))
df = k.dpsi2_dmuS(Z,mu,S)[0].sum(1).sum(1)
return f,df.flatten()
print "psi2_mu_test"
checkgrad(psi2_mu_test,np.random.randn(N*Q),args=(k,))
def psi2_S_test(S,k):
S = S.reshape(N,Q)
f = np.sum(k.psi2(Z,mu,S))
df = k.dpsi2_dmuS(Z,mu,S)[1].sum(1).sum(1)
return f,df.flatten()
print "psi2_S_test"
checkgrad(psi2_S_test,np.random.rand(N*Q),args=(k,))
def psi2_theta_test(theta,k):
k.set_param(theta)
f = np.sum(k.psi2(Z,mu,S))
df = np.array([np.sum(grad) for grad in k.dpsi2_dtheta(Z,mu,S)])
return f,df
print "psi2_theta_test"
checkgrad(psi2_theta_test,np.random.rand(1+Q),args=(k,))
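This file is deleted because the ARD behaviour now lives in rbf itself (see the ARD flag above). A migration sketch, assuming the new rbf kernpart is in scope:

import numpy as np
D = 3
lengthscales = np.ones(D) * 0.7
k = rbf(D, variance=2.5, lengthscale=lengthscales, ARD=True)   # was: rbf_ARD(D, 2.5, lengthscales)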

View file

@ -25,15 +25,15 @@ class spline(kernpart):
         assert self.D==1
         self.Nparam = 1
         self.name = 'spline'
-        self.set_param(np.squeeze(variance))
+        self._set_params(np.squeeze(variance))
-    def get_param(self):
+    def _get_params(self):
         return self.variance
-    def set_param(self,x):
+    def _set_params(self,x):
         self.variance = x
-    def get_param_names(self):
+    def _get_param_names(self):
         return ['variance']
     def K(self,X,X2,target):

View file

@ -44,7 +44,7 @@ class spkern(kernpart):
         if param is None:
             param = np.ones(self.Nparam)
         assert param.size==self.Nparam
-        self.set_param(param)
+        self._set_params(param)
         #Differentiate!
         self._sp_dk_dtheta = [sp.diff(k,theta).simplify() for theta in self._sp_theta]
@ -247,12 +247,12 @@ class spkern(kernpart):
             Z = X
         weave.inline(self._dKdiag_dX_code,arg_names=['target','X','Z','param','partial'],**self.weave_kwargs)
-    def set_param(self,param):
+    def _set_params(self,param):
         #print param.flags['C_CONTIGUOUS']
         self._param = param.copy()
-    def get_param(self):
+    def _get_params(self):
         return self._param
-    def get_param_names(self):
+    def _get_param_names(self):
         return [x.name for x in self._sp_theta]

View file

@ -17,16 +17,16 @@ class white(kernpart):
         self.D = D
         self.Nparam = 1
         self.name = 'white'
-        self.set_param(np.array([variance]).flatten())
+        self._set_params(np.array([variance]).flatten())
-    def get_param(self):
+    def _get_params(self):
         return self.variance
-    def set_param(self,x):
+    def _set_params(self,x):
         assert x.shape==(1,)
         self.variance = x
-    def get_param_names(self):
+    def _get_param_names(self):
         return ['variance']
     def K(self,X,X2,target):
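The renames above (set_param -> _set_params, get_param -> _get_params, get_param_names -> _get_param_names) define the minimal contract a kernpart now exposes. A hypothetical toy kernpart illustrating that contract (constant_kern is illustrative only, not part of GPy):

import numpy as np

class constant_kern(object):
    """Hypothetical toy kernpart showing the renamed parameter interface."""
    def __init__(self, D, value=1.):
        self.D, self.Nparam, self.name = D, 1, 'constant'
        self._set_params(np.array([value]))
    def _get_params(self):
        return self.value
    def _set_params(self, x):
        self.value = np.asarray(x).flatten()
    def _get_param_names(self):
        return ['value']
    def K(self, X, X2, target):
        target += self.value       # accumulate into target, matching the convention above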

View file

@ -28,12 +28,12 @@ class Bayesian_GPLVM(sparse_GP_regression, GPLVM):
         sparse_GP_regression.__init__(self, X, Y, X_uncertainty = S, **kwargs)
-    def get_param_names(self):
+    def _get_param_names(self):
         X_names = sum([['X_%i_%i'%(n,q) for n in range(self.N)] for q in range(self.Q)],[])
         S_names = sum([['S_%i_%i'%(n,q) for n in range(self.N)] for q in range(self.Q)],[])
-        return (X_names + S_names + sparse_GP_regression.get_param_names(self))
+        return (X_names + S_names + sparse_GP_regression._get_param_names(self))
-    def get_param(self):
+    def _get_params(self):
         """
         Horizontally stacks the parameters in order to present them to the optimizer.
         The resulting 1-D array has this structure:
@ -43,13 +43,13 @@ class Bayesian_GPLVM(sparse_GP_regression, GPLVM):
         ===============================================================
         """
-        return np.hstack((self.X.flatten(), self.X_uncertainty.flatten(), sparse_GP_regression.get_param(self)))
+        return np.hstack((self.X.flatten(), self.X_uncertainty.flatten(), sparse_GP_regression._get_params(self)))
-    def set_param(self,x):
+    def _set_params(self,x):
         N, Q = self.N, self.Q
         self.X = x[:self.X.size].reshape(N,Q).copy()
         self.X_uncertainty = x[(N*Q):(2*N*Q)].reshape(N,Q).copy()
-        sparse_GP_regression.set_param(self, x[(2*N*Q):])
+        sparse_GP_regression._set_params(self, x[(2*N*Q):])
     def dL_dmuS(self):
         dL_dmu_psi0, dL_dS_psi0 = self.kern.dpsi1_dmuS(self.dL_dpsi1,self.Z,self.X,self.X_uncertainty)
@ -60,5 +60,6 @@ class Bayesian_GPLVM(sparse_GP_regression, GPLVM):
         return np.hstack((dL_dmu.flatten(), dL_dS.flatten()))
-    def log_likelihood_gradients(self):
-        return np.hstack((self.dL_dmuS().flatten(), sparse_GP_regression.log_likelihood_gradients(self)))
+    def _log_likelihood_gradients(self):
+        return np.hstack((self.dL_dmuS().flatten(), sparse_GP_regression._log_likelihood_gradients(self)))
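The packed vector returned by _get_params above is laid out as latent means, then latent variances, then the sparse-GP block. A slicing sketch (m is assumed to be a constructed Bayesian_GPLVM with N points and Q latent dimensions):

p = m._get_params()
N, Q = m.N, m.Q
X_flat  = p[:N*Q]          # latent means, reshaped to (N, Q) by _set_params
S_flat  = p[N*Q:2*N*Q]     # latent variances, reshaped to (N, Q)
gp_rest = p[2*N*Q:]        # inducing inputs, noise precision, kernel parameters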

View file

@ -33,18 +33,18 @@ class GPLVM(GP_regression):
         else:
             return np.random.randn(Y.shape[0], Q)
-    def get_param_names(self):
+    def _get_param_names(self):
         return (sum([['X_%i_%i'%(n,q) for n in range(self.N)] for q in range(self.Q)],[])
-                + self.kern.extract_param_names())
+                + self.kern._get_param_names_transformed())
-    def get_param(self):
-        return np.hstack((self.X.flatten(), self.kern.extract_param()))
+    def _get_params(self):
+        return np.hstack((self.X.flatten(), self.kern._get_params_transformed()))
-    def set_param(self,x):
+    def _set_params(self,x):
         self.X = x[:self.X.size].reshape(self.N,self.Q).copy()
-        GP_regression.set_param(self, x[self.X.size:])
+        GP_regression._set_params(self, x[self.X.size:])
-    def log_likelihood_gradients(self):
+    def _log_likelihood_gradients(self):
         dL_dK = self.dL_dK()
         dL_dtheta = self.kern.dK_dtheta(dL_dK,self.X)

View file

@ -41,14 +41,14 @@ class GP_EP(model):
         self.K = self.kernel.K(self.X)
         model.__init__(self)
-    def set_param(self,p):
-        self.kernel.expand_param(p)
+    def _set_params(self,p):
+        self.kernel._set_params_transformed(p)
-    def get_param(self):
-        return self.kernel.extract_param()
+    def _get_params(self):
+        return self.kernel._get_params_transformed()
-    def get_param_names(self):
-        return self.kernel.extract_param_names()
+    def _get_param_names(self):
+        return self.kernel._get_param_names_transformed()
     def approximate_likelihood(self):
         self.ep_approx = Full(self.K,self.likelihood,epsilon=self.epsilon_ep,powerep=[self.eta,self.delta])
@ -78,7 +78,7 @@ class GP_EP(model):
         L3 = sum(np.log(self.ep_approx.Z_hat))
         return L1 + L2A + L2B + L3
-    def log_likelihood_gradients(self):
+    def _log_likelihood_gradients(self):
         dK_dp = self.kernel.dK_dtheta(self.X)
         self.dK_dp = dK_dp
         aux1,info_1 = linalg.flapack.dtrtrs(self.L,np.dot(self.Sroot_tilde_K,self.ep_approx.v_tilde),lower=1)
@ -138,7 +138,7 @@ class GP_EP(model):
         """
         self.epsilon_em = epsilon
         log_likelihood_change = self.epsilon_em + 1.
-        self.parameters_path = [self.kernel.get_param()]
+        self.parameters_path = [self.kernel._get_params()]
         self.approximate_likelihood()
         self.site_approximations_path = [[self.ep_approx.tau_tilde,self.ep_approx.v_tilde]]
         self.log_likelihood_path = [self.log_likelihood()]
@ -150,11 +150,11 @@ class GP_EP(model):
             log_likelihood_change = log_likelihood_new - self.log_likelihood_path[-1]
             if log_likelihood_change < 0:
                 print 'log_likelihood decrement'
-                self.kernel.expand_param(self.parameters_path[-1])
-                self.kernM.expand_param(self.parameters_path[-1])
+                self.kernel._set_params_transformed(self.parameters_path[-1])
+                self.kernM._set_params_transformed(self.parameters_path[-1])
             else:
                 self.approximate_likelihood()
                 self.log_likelihood_path.append(self.log_likelihood())
-                self.parameters_path.append(self.kernel.get_param())
+                self.parameters_path.append(self.kernel._get_params())
                 self.site_approximations_path.append([self.ep_approx.tau_tilde,self.ep_approx.v_tilde])
                 iteration += 1
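The loop above only ever keeps updates that do not decrease the marginal likelihood, reverting to parameters_path[-1] otherwise. The same accept/reject pattern, isolated as a runnable toy (guarded_ascent, f and step are illustrative names, not GPy API):

import random

def guarded_ascent(f, x, step, n_iter=20):
    # keep only steps that do not decrease the objective, as the EM loop above does
    path, best = [x], f(x)
    for _ in range(n_iter):
        candidate = step(path[-1])
        if f(candidate) < best:
            continue                      # "log_likelihood decrement": reject, keep old state
        best = f(candidate)
        path.append(candidate)
    return path[-1]

print guarded_ascent(lambda x: -(x - 3.)**2, 0., lambda x: x + random.uniform(-0.5, 1.0))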

View file

@ -63,47 +63,47 @@ class GP_regression(model):
         self._Ystd = np.ones((1,self.Y.shape[1]))
         if self.D > self.N:
-            # then it's more efficient to store Youter
-            self.Youter = np.dot(self.Y, self.Y.T)
+            # then it's more efficient to store YYT
+            self.YYT = np.dot(self.Y, self.Y.T)
         else:
-            self.Youter = None
+            self.YYT = None
         model.__init__(self)
-    def set_param(self,p):
-        self.kern.expand_param(p)
+    def _set_params(self,p):
+        self.kern._set_params_transformed(p)
         self.K = self.kern.K(self.X,slices1=self.Xslices)
         self.Ki, self.L, self.Li, self.K_logdet = pdinv(self.K)
-    def get_param(self):
-        return self.kern.extract_param()
+    def _get_params(self):
+        return self.kern._get_params_transformed()
-    def get_param_names(self):
-        return self.kern.extract_param_names()
+    def _get_param_names(self):
+        return self.kern._get_param_names_transformed()
     def _model_fit_term(self):
         """
-        Computes the model fit using Youter if it's available
+        Computes the model fit using YYT if it's available
         """
-        if self.Youter is None:
+        if self.YYT is None:
             return -0.5*np.sum(np.square(np.dot(self.Li,self.Y)))
         else:
-            return -0.5*np.sum(np.multiply(self.Ki, self.Youter))
+            return -0.5*np.sum(np.multiply(self.Ki, self.YYT))
     def log_likelihood(self):
         complexity_term = -0.5*self.N*self.D*np.log(2.*np.pi) - 0.5*self.D*self.K_logdet
         return complexity_term + self._model_fit_term()
     def dL_dK(self):
-        if self.Youter is None:
+        if self.YYT is None:
             alpha = np.dot(self.Ki,self.Y)
             dL_dK = 0.5*(np.dot(alpha,alpha.T)-self.D*self.Ki)
         else:
-            dL_dK = 0.5*(mdot(self.Ki, self.Youter, self.Ki) - self.D*self.Ki)
+            dL_dK = 0.5*(mdot(self.Ki, self.YYT, self.Ki) - self.D*self.Ki)
         return dL_dK
-    def log_likelihood_gradients(self):
+    def _log_likelihood_gradients(self):
         return self.kern.dK_dtheta(partial=self.dL_dK(),X=self.X)
     def predict(self,Xnew, slices=None, full_cov=False):
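The YYT caching above rests on both branches computing the same quantity, -0.5 tr(K^{-1} Y Y^T): the Cholesky form pays O(N^2 D) per evaluation, while the cached YYT form pays O(N^2). A plain-numpy check of that identity (all names here are local, not GPy API):

import numpy as np
N, D = 5, 8                                            # D > N, so the YYT branch applies
rng = np.random.RandomState(0)
Y = rng.randn(N, D)
A = rng.randn(N, N)
K = np.dot(A, A.T) + np.eye(N)                         # a positive-definite covariance
Li = np.linalg.inv(np.linalg.cholesky(K))
Ki = np.linalg.inv(K)
YYT = np.dot(Y, Y.T)
fit_chol = -0.5 * np.sum(np.square(np.dot(Li, Y)))     # the D <= N branch
fit_yyt  = -0.5 * np.sum(Ki * YYT)                     # the D > N branch, reusing cached YYT
print np.allclose(fit_chol, fit_yyt)                   # True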

View file

@ -3,7 +3,7 @@
 from GP_regression import GP_regression
-from sparse_GP_regression import sparse_GP_regression, sgp_debugB, sgp_debugC, sgp_debugE
+from sparse_GP_regression import sparse_GP_regression
 from GPLVM import GPLVM
 from warped_GP import warpedGP
 from GP_EP import GP_EP

View file

@ -42,15 +42,15 @@ class generalized_FITC(model):
         self.jitter = 1e-12
         model.__init__(self)
-    def set_param(self,p):
-        self.kernel.expand_param(p[0:-self.Z.size])
+    def _set_params(self,p):
+        self.kernel._set_params_transformed(p[0:-self.Z.size])
         self.Z = p[-self.Z.size:].reshape(self.M,self.D)
-    def get_param(self):
-        return np.hstack([self.kernel.extract_param(),self.Z.flatten()])
+    def _get_params(self):
+        return np.hstack([self.kernel._get_params_transformed(),self.Z.flatten()])
-    def get_param_names(self):
-        return self.kernel.extract_param_names()+['iip_%i'%i for i in range(self.Z.size)]
+    def _get_param_names(self):
+        return self.kernel._get_param_names_transformed()+['iip_%i'%i for i in range(self.Z.size)]
     def approximate_likelihood(self):
         self.Kmm = self.kernel.K(self.Z)
@ -91,15 +91,15 @@ class generalized_FITC(model):
     def log_likelihood(self):
         self.posterior_param()
-        self.Youter = np.dot(self.mu_tilde,self.mu_tilde.T)
+        self.YYT = np.dot(self.mu_tilde,self.mu_tilde.T)
         A = -self.hld
-        B = -.5*np.sum(self.Qi*self.Youter)
+        B = -.5*np.sum(self.Qi*self.YYT)
         C = sum(np.log(self.ep_approx.Z_hat))
         D = .5*np.sum(np.log(1./self.ep_approx.tau_tilde + 1./self.ep_approx.tau_))
         E = .5*np.sum((self.ep_approx.v_/self.ep_approx.tau_ - self.mu_tilde.flatten())**2/(1./self.ep_approx.tau_ + 1./self.ep_approx.tau_tilde))
         return A + B + C + D + E
-    def log_likelihood_gradients(self):
+    def _log_likelihood_gradients(self):
         dKmm_dtheta = self.kernel.dK_dtheta(self.Z)
         dKnn_dtheta = self.kernel.dK_dtheta(self.X)
         dKmn_dtheta = self.kernel.dK_dtheta(self.Z,self.X)
@ -214,7 +214,7 @@ class generalized_FITC(model):
         """
         self.epsilon_em = epsilon
         log_likelihood_change = self.epsilon_em + 1.
-        self.parameters_path = [self.kernel.get_param()]
+        self.parameters_path = [self.kernel._get_params()]
         self.approximate_likelihood()
         self.site_approximations_path = [[self.ep_approx.tau_tilde,self.ep_approx.v_tilde]]
         self.inducing_inputs_path = [self.Z]
@ -227,7 +227,7 @@ class generalized_FITC(model):
             log_likelihood_change = log_likelihood_new - self.log_likelihood_path[-1]
             if log_likelihood_change < 0:
                 print 'log_likelihood decrement'
-                self.kernel.expand_param(self.parameters_path[-1])
+                self.kernel._set_params_transformed(self.parameters_path[-1])
                 self.kernM = self.kernel.copy()
                 self.kernM.expand_X(self.inducing_inputs_path[-1])
                 self.__init__(self.kernel,self.likelihood,kernM=self.kernM,powerep=[self.eta,self.delta],epsilon_ep = self.epsilon_ep, epsilon_em = self.epsilon_em)
@ -235,7 +235,7 @@ class generalized_FITC(model):
             else:
                 self.approximate_likelihood()
                 self.log_likelihood_path.append(self.log_likelihood())
-                self.parameters_path.append(self.kernel.get_param())
+                self.parameters_path.append(self.kernel._get_params())
                 self.site_approximations_path.append([self.ep_approx.tau_tilde,self.ep_approx.v_tilde])
                 self.inducing_inputs_path.append(self.Z)
                 iteration += 1

View file

@ -27,16 +27,16 @@ class sparse_GPLVM(sparse_GP_regression, GPLVM):
         X = self.initialise_latent(init, Q, Y)
         sparse_GP_regression.__init__(self, X, Y, **kwargs)
-    def get_param_names(self):
+    def _get_param_names(self):
         return (sum([['X_%i_%i'%(n,q) for n in range(self.N)] for q in range(self.Q)],[])
-                + sparse_GP_regression.get_param_names(self))
+                + sparse_GP_regression._get_param_names(self))
-    def get_param(self):
-        return np.hstack((self.X.flatten(), sparse_GP_regression.get_param(self)))
+    def _get_params(self):
+        return np.hstack((self.X.flatten(), sparse_GP_regression._get_params(self)))
-    def set_param(self,x):
+    def _set_params(self,x):
         self.X = x[:self.X.size].reshape(self.N,self.Q).copy()
-        sparse_GP_regression.set_param(self, x[self.X.size:])
+        sparse_GP_regression._set_params(self, x[self.X.size:])
     def log_likelihood(self):
         return sparse_GP_regression.log_likelihood(self)
@ -49,8 +49,8 @@ class sparse_GPLVM(sparse_GP_regression, GPLVM):
         return dL_dX
-    def log_likelihood_gradients(self):
-        return np.hstack((self.dL_dX().flatten(), sparse_GP_regression.log_likelihood_gradients(self)))
+    def _log_likelihood_gradients(self):
+        return np.hstack((self.dL_dX().flatten(), sparse_GP_regression._log_likelihood_gradients(self)))
     def plot(self):
         GPLVM.plot(self)

View file

@ -107,6 +107,20 @@ class sparse_GP_regression(GP_regression):
         self.dL_dKmm += -0.5 * self.D * (- self.C/sf2 - 2.*mdot(self.C, self.psi2_beta_scaled, self.Kmmi) + self.Kmmi) # dC
         self.dL_dKmm += np.dot(np.dot(self.E*sf2, self.psi2_beta_scaled) - np.dot(self.C, self.psi1VVpsi1), self.Kmmi) + 0.5*self.E # dD
+    def _set_params(self, p):
+        self.Z = p[:self.M*self.Q].reshape(self.M, self.Q)
+        self.beta = p[self.M*self.Q]
+        self.kern._set_params(p[self.Z.size + 1:])
+        self._computations()
+    def _get_params(self):
+        return np.hstack([self.Z.flatten(),self.beta,self.kern._get_params_transformed()])
+    def _get_param_names(self):
+        return sum([['iip_%i_%i'%(i,j) for i in range(self.Z.shape[0])] for j in range(self.Z.shape[1])],[]) + ['noise_precision']+self.kern._get_param_names_transformed()
     def log_likelihood(self):
         """ Compute the (lower bound on the) log marginal likelihood """
         sf2 = self.scale_factor**2
@ -116,17 +130,8 @@
         D = +0.5*np.sum(self.psi1VVpsi1 * self.C)
         return A+B+C+D
-    def set_param(self, p):
-        self.Z = p[:self.M*self.Q].reshape(self.M, self.Q)
-        self.beta = p[self.M*self.Q]
-        self.kern.set_param(p[self.Z.size + 1:])
-        self._computations()
-    def get_param(self):
-        return np.hstack([self.Z.flatten(),self.beta,self.kern.extract_param()])
-    def get_param_names(self):
-        return sum([['iip_%i_%i'%(i,j) for i in range(self.Z.shape[0])] for j in range(self.Z.shape[1])],[]) + ['noise_precision']+self.kern.extract_param_names()
+    def _log_likelihood_gradients(self):
+        return np.hstack([self.dL_dZ().flatten(), self.dL_dbeta(), self.dL_dtheta()])
     def dL_dbeta(self):
         """
@ -172,9 +177,6 @@
         dL_dZ += self.kern.dK_dX(dL_dpsi1,self.Z,self.X)
         return dL_dZ
-    def log_likelihood_gradients(self):
-        return np.hstack([self.dL_dZ().flatten(), self.dL_dbeta(), self.dL_dtheta()])
     def _raw_predict(self, Xnew, slices, full_cov=False):
         """Internal helper function for making predictions, does not account for normalisation"""
@ -201,94 +203,3 @@
             pb.errorbar(self.X[:,0], pb.ylim()[0]+np.zeros(self.N), xerr=2*np.sqrt(self.X_uncertainty.flatten()))
         if self.Q==2:
             pb.plot(self.Z[:,0],self.Z[:,1],'wo')
class sgp_debugB(sparse_GP_regression):
def _computations(self):
sparse_GP_regression._computations(self)
# Compute dL_dpsi
self.dL_dpsi0 = - 0.5 * self.D * self.beta * np.ones(self.N)
self.dL_dpsi1 = np.zeros_like(self.psi1)
self.dL_dpsi2 = - 0.5 * self.beta * (self.D*( - self.Kmmi))
# Compute dL_dKmm
self.dL_dKmm = -0.5 * self.D * mdot(self.Lmi.T, self.A, self.Lmi) # dB
def log_likelihood(self):
A = -0.5*self.N*self.D*(np.log(2.*np.pi) - np.log(self.beta))
B = -0.5*self.beta*self.D*self.trace_K
C = -0.5*self.D * self.B_logdet
D = -0.5*self.beta*self.trYYT
E = +0.5*np.sum(self.psi1VVpsi1 * self.LBL_inv)
return B
def dL_dbeta(self):
dA_dbeta = 0.5 * self.N*self.D/self.beta
dB_dbeta = - 0.5 * self.D * self.trace_K
dC_dbeta = - 0.5 * self.D * np.sum(self.Bi*self.A)/self.beta
dD_dbeta = - 0.5 * self.trYYT
tmp = mdot(self.LBi.T, self.LLambdai, self.psi1V)
dE_dbeta = (np.sum(np.square(self.C)) - 0.5 * np.sum(self.A * np.dot(tmp, tmp.T)))/self.beta
return np.squeeze(dB_dbeta)
class sgp_debugC(sparse_GP_regression):
def _computations(self):
sparse_GP_regression._computations(self)
# Compute dL_dpsi
self.dL_dpsi0 = np.zeros(self.N)
self.dL_dpsi1 = np.zeros_like(self.psi1)
self.dL_dpsi2 = - 0.5 * self.beta * (self.D*(self.LBL_inv))
# Compute dL_dKmm
self.dL_dKmm = -0.5 * self.D * (- self.LBL_inv - 2.*self.beta*mdot(self.LBL_inv, self.psi2, self.Kmmi) + self.Kmmi) # dC
def log_likelihood(self):
A = -0.5*self.N*self.D*(np.log(2.*np.pi) - np.log(self.beta))
B = -0.5*self.beta*self.D*self.trace_K
C = -0.5*self.D * self.B_logdet
D = -0.5*self.beta*self.trYYT
E = +0.5*np.sum(self.psi1VVpsi1 * self.LBL_inv)
return C
def dL_dbeta(self):
dA_dbeta = 0.5 * self.N*self.D/self.beta
dB_dbeta = - 0.5 * self.D * self.trace_K
dC_dbeta = - 0.5 * self.D * np.sum(self.Bi*self.A)/self.beta
dD_dbeta = - 0.5 * self.trYYT
tmp = mdot(self.LBi.T, self.LLambdai, self.psi1V)
dE_dbeta = (np.sum(np.square(self.C)) - 0.5 * np.sum(self.A * np.dot(tmp, tmp.T)))/self.beta
return np.squeeze(dC_dbeta)
class sgp_debugE(sparse_GP_regression):
def _computations(self):
sparse_GP_regression._computations(self)
# Compute dL_dpsi
self.dL_dpsi0 = np.zeros(self.N)
self.dL_dpsi1 = np.zeros_like(self.psi1)
self.dL_dpsi2 = - 0.5 * self.beta * (self.G)
# Compute dL_dKmm
tmp = mdot(self.beta*self.psi2, self.Kmmi, self.psi1VVpsi1)
self.dL_dKmm = -0.5*mdot(self.Kmmi,tmp + tmp.T + self.psi1VVpsi1,self.Kmmi)
#self.dL_dKmm = np.dot(np.dot(self.G,self.beta*self.psi2) - np.dot(self.LBL_inv, self.psi1VVpsi1), self.Kmmi) + 0.5*self.G # dE
def log_likelihood(self):
A = -0.5*self.N*self.D*(np.log(2.*np.pi) - np.log(self.beta))
B = -0.5*self.beta*self.D*self.trace_K
C = -0.5*self.D * self.B_logdet
D = -0.5*self.beta*self.trYYT
E = +0.5*np.sum(self.psi1VVpsi1 * self.LBL_inv)
return E
def dL_dbeta(self):
dA_dbeta = 0.5 * self.N*self.D/self.beta
dB_dbeta = - 0.5 * self.D * self.trace_K
dC_dbeta = - 0.5 * self.D * np.sum(self.Bi*self.A)/self.beta
dD_dbeta = - 0.5 * self.trYYT
tmp = mdot(self.LBi.T, self.LLambdai, self.psi1V)
dE_dbeta = (np.sum(np.square(self.C)) - 0.5 * np.sum(self.A * np.dot(tmp, tmp.T)))/self.beta
return np.squeeze(dE_dbeta)
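The deleted sgp_debug* subclasses isolated individual terms of the bound for gradient debugging. A finite-difference check on the full bound covers similar ground; a minimal sketch (fd_grad is a hypothetical helper, not part of the commit), assuming m is a sparse_GP_regression instance whose _set_params accepts the vector returned by _get_params:

import numpy as np

def fd_grad(m, eps=1e-6):
    p0 = m._get_params()
    g = np.zeros_like(p0)
    for i in range(p0.size):
        for sign in (1., -1.):
            p = p0.copy()
            p[i] += sign * eps
            m._set_params(p)
            g[i] += sign * m.log_likelihood() / (2 * eps)
    m._set_params(p0)                 # restore the original state
    return g
# compare: np.allclose(fd_grad(m), m._log_likelihood_gradients(), atol=1e-4)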

View file

@ -13,7 +13,7 @@ from GP_regression import GP_regression
 class warpedGP(GP_regression):
     """
-    TODO: fucking docstrings!
+    TODO: fecking docstrings!
     @nfusi: I've hacked a little on this, but no guarantees. J.
     """
@ -30,17 +30,17 @@ class warpedGP(GP_regression):
         self.transform_data()
         GP_regression.__init__(self, X, self.Y, **kwargs)
-    def set_param(self, x):
+    def _set_params(self, x):
         self.warping_params = x[:self.warping_function.num_parameters].reshape(self.warp_params_shape).copy()
         self.transform_data()
-        GP_regression.set_param(self, x[self.warping_function.num_parameters:].copy())
+        GP_regression._set_params(self, x[self.warping_function.num_parameters:].copy())
-    def get_param(self):
-        return np.hstack((self.warping_params.flatten().copy(), GP_regression.get_param(self).copy()))
+    def _get_params(self):
+        return np.hstack((self.warping_params.flatten().copy(), GP_regression._get_params(self).copy()))
-    def get_param_names(self):
-        warping_names = self.warping_function.get_param_names()
-        param_names = GP_regression.get_param_names(self)
+    def _get_param_names(self):
+        warping_names = self.warping_function._get_param_names()
+        param_names = GP_regression._get_param_names(self)
         return warping_names + param_names
     def transform_data(self):
@ -48,9 +48,9 @@ class warpedGP(GP_regression):
         # this supports the 'smart' behaviour in GP_regression
         if self.D > self.N:
-            self.Youter = np.dot(self.Y, self.Y.T)
+            self.YYT = np.dot(self.Y, self.Y.T)
         else:
-            self.Youter = None
+            self.YYT = None
         return self.Y
@ -59,8 +59,8 @@ class warpedGP(GP_regression):
         jacobian = self.warping_function.fgrad_y(self.Z, self.warping_params)
         return ll + np.log(jacobian).sum()
-    def log_likelihood_gradients(self):
-        ll_grads = GP_regression.log_likelihood_gradients(self)
+    def _log_likelihood_gradients(self):
+        ll_grads = GP_regression._log_likelihood_gradients(self)
         alpha = np.dot(self.Ki, self.Y.flatten())
         warping_grads = self.warping_function_gradients(alpha)
         return np.hstack((warping_grads.flatten(), ll_grads.flatten()))
@ -81,7 +81,7 @@ class warpedGP(GP_regression):
     def predict(self, X, in_unwarped_space = False, **kwargs):
         mu, var = GP_regression.predict(self, X, **kwargs)
-        # The plot() function calls set_param() before calling predict()
+        # The plot() function calls _set_params() before calling predict()
         # this is causing the observations to be plotted in the transformed
         # space (where Y lives), making the plot look very wrong
         # if the predictions are made in the untransformed space
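The Jacobian term added in log_likelihood above (ll + np.log(jacobian).sum()) is the standard warped-GP change-of-variables correction: with f the warping function applied to the raw targets y, the marginal likelihood being maximised is

.. math::

    \log p(y|X) = \log \mathcal{N}\big(f(y)\,|\,0,\, K\big) + \sum_{n=1}^N \log f'(y_n)

so the GP term is evaluated on the warped targets and each datum contributes the log-derivative of the warp.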

View file

@ -0,0 +1,61 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import unittest
import numpy as np
import GPy
class PriorTests(unittest.TestCase):
def test_lognormal(self):
xmin, xmax = 1, 2.5*np.pi
b, C, SNR = 1, 0, 0.1
X = np.linspace(xmin, xmax, 500)
y = b*X + C + 1*np.sin(X)
y += 0.05*np.random.randn(len(X))
X, y = X[:, None], y[:, None]
m = GPy.models.GP_regression(X, y)
m.ensure_default_constraints()
lognormal = GPy.priors.log_Gaussian(1, 2)
m.set_prior('rbf', lognormal)
m.randomize()
self.assertTrue(m.checkgrad())
def test_gamma(self):
xmin, xmax = 1, 2.5*np.pi
b, C, SNR = 1, 0, 0.1
X = np.linspace(xmin, xmax, 500)
y = b*X + C + 1*np.sin(X)
y += 0.05*np.random.randn(len(X))
X, y = X[:, None], y[:, None]
m = GPy.models.GP_regression(X, y)
m.ensure_default_constraints()
gamma = GPy.priors.gamma(1, 1)
m.set_prior('rbf', gamma)
m.randomize()
self.assertTrue(m.checkgrad())
def test_incompatibility(self):
xmin, xmax = 1, 2.5*np.pi
b, C, SNR = 1, 0, 0.1
X = np.linspace(xmin, xmax, 500)
y = b*X + C + 1*np.sin(X)
y += 0.05*np.random.randn(len(X))
X, y = X[:, None], y[:, None]
m = GPy.models.GP_regression(X, y)
m.ensure_default_constraints()
gaussian = GPy.priors.Gaussian(1, 1)
success = False
# setting a Gaussian prior on non-negative parameters
# should raise an assertionerror.
try:
m.set_prior('rbf', gaussian)
except AssertionError:
success = True
self.assertTrue(success)
if __name__ == "__main__":
print "Running unit tests, please be (very) patient..."
unittest.main()
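The manual success flag in test_incompatibility can equivalently use unittest's built-in helper; a sketch of the same check (same setup as the test above, abbreviated):

def test_incompatibility_assertRaises(self):
    # assertRaises replaces the manual try/except flag
    X = np.linspace(1, 2.5*np.pi, 500)[:, None]
    y = X + np.sin(X) + 0.05*np.random.randn(500, 1)
    m = GPy.models.GP_regression(X, y)
    m.ensure_default_constraints()
    gaussian = GPy.priors.Gaussian(1, 1)
    self.assertRaises(AssertionError, m.set_prior, 'rbf', gaussian)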

View file

@ -42,51 +42,66 @@ class GradientTests(unittest.TestCase):
# contrain all parameters to be positive # contrain all parameters to be positive
self.assertTrue(m.checkgrad()) self.assertTrue(m.checkgrad())
def test_gp_regression_rbf_white_kern_1d(self): def test_gp_regression_rbf_1d(self):
''' Testing the GP regression with rbf kernel with white kernel on 1d data ''' ''' Testing the GP regression with rbf kernel with white kernel on 1d data '''
rbf = GPy.kern.rbf(1) rbf = GPy.kern.rbf(1)
self.check_model_with_white(rbf, model_type='GP_regression', dimension=1) self.check_model_with_white(rbf, model_type='GP_regression', dimension=1)
def test_GP_regression_rbf_ARD_white_kern_2D(self): def test_GP_regression_rbf_2D(self):
''' Testing the GP regression with rbf and white kernel on 2d data '''
k = GPy.kern.rbf_ARD(2)
self.check_model_with_white(k, model_type='GP_regression', dimension=2)
def test_GP_regression_rbf_white_kern_2D(self):
''' Testing the GP regression with rbf and white kernel on 2d data ''' ''' Testing the GP regression with rbf and white kernel on 2d data '''
rbf = GPy.kern.rbf(2) rbf = GPy.kern.rbf(2)
self.check_model_with_white(rbf, model_type='GP_regression', dimension=2) self.check_model_with_white(rbf, model_type='GP_regression', dimension=2)
def test_GP_regression_matern52_kern_1D(self): def test_GP_regression_rbf_ARD_2D(self):
''' Testing the GP regression with rbf and white kernel on 2d data '''
k = GPy.kern.rbf(2,ARD=True)
self.check_model_with_white(k, model_type='GP_regression', dimension=2)
def test_GP_regression_matern52_1D(self):
''' Testing the GP regression with matern52 kernel on 1d data ''' ''' Testing the GP regression with matern52 kernel on 1d data '''
matern52 = GPy.kern.Matern52(1) matern52 = GPy.kern.Matern52(1)
self.check_model_with_white(matern52, model_type='GP_regression', dimension=1) self.check_model_with_white(matern52, model_type='GP_regression', dimension=1)
def test_GP_regression_matern52_kern_2D(self): def test_GP_regression_matern52_2D(self):
''' Testing the GP regression with matern52 kernel on 2d data ''' ''' Testing the GP regression with matern52 kernel on 2d data '''
matern52 = GPy.kern.Matern52(2) matern52 = GPy.kern.Matern52(2)
self.check_model_with_white(matern52, model_type='GP_regression', dimension=2) self.check_model_with_white(matern52, model_type='GP_regression', dimension=2)
def test_GP_regression_matern32_kern_1D(self): def test_GP_regression_matern52_ARD_2D(self):
''' Testing the GP regression with ARD matern52 kernel on 2d data '''
matern52 = GPy.kern.Matern52(2,ARD=True)
self.check_model_with_white(matern52, model_type='GP_regression', dimension=2)
def test_GP_regression_matern32_1D(self):
''' Testing the GP regression with matern32 kernel on 1d data ''' ''' Testing the GP regression with matern32 kernel on 1d data '''
matern32 = GPy.kern.Matern32(1) matern32 = GPy.kern.Matern32(1)
self.check_model_with_white(matern32, model_type='GP_regression', dimension=1) self.check_model_with_white(matern32, model_type='GP_regression', dimension=1)
def test_GP_regression_matern32_kern_2D(self): def test_GP_regression_matern32_2D(self):
''' Testing the GP regression with matern32 kernel on 2d data ''' ''' Testing the GP regression with matern32 kernel on 2d data '''
matern32 = GPy.kern.Matern32(2) matern32 = GPy.kern.Matern32(2)
self.check_model_with_white(matern32, model_type='GP_regression', dimension=2) self.check_model_with_white(matern32, model_type='GP_regression', dimension=2)
def test_GP_regression_exponential_kern_1D(self): def test_GP_regression_matern32_ARD_2D(self):
''' Testing the GP regression with ARD matern32 kernel on 2d data '''
matern32 = GPy.kern.Matern32(2,ARD=True)
self.check_model_with_white(matern32, model_type='GP_regression', dimension=2)
def test_GP_regression_exponential_1D(self):
''' Testing the GP regression with exponential kernel on 1d data ''' ''' Testing the GP regression with exponential kernel on 1d data '''
exponential = GPy.kern.exponential(1) exponential = GPy.kern.exponential(1)
self.check_model_with_white(exponential, model_type='GP_regression', dimension=1) self.check_model_with_white(exponential, model_type='GP_regression', dimension=1)
def test_GP_regression_exponential_kern_2D(self): def test_GP_regression_exponential_2D(self):
''' Testing the GP regression with exponential kernel on 2d data ''' ''' Testing the GP regression with exponential kernel on 2d data '''
exponential = GPy.kern.exponential(2) exponential = GPy.kern.exponential(2)
self.check_model_with_white(exponential, model_type='GP_regression', dimension=2) self.check_model_with_white(exponential, model_type='GP_regression', dimension=2)
def test_GP_regression_exponential_ARD_2D(self):
''' Testing the GP regression with ARD exponential kernel on 2d data '''
exponential = GPy.kern.exponential(2,ARD=True)
self.check_model_with_white(exponential, model_type='GP_regression', dimension=2)
def test_GP_regression_bias_kern_1D(self): def test_GP_regression_bias_kern_1D(self):
''' Testing the GP regression with bias kernel on 1d data ''' ''' Testing the GP regression with bias kernel on 1d data '''
bias = GPy.kern.bias(1) bias = GPy.kern.bias(1)
@ -121,7 +136,7 @@ class GradientTests(unittest.TestCase):
""" Testing GPLVM with rbf + bias and white kernel """ """ Testing GPLVM with rbf + bias and white kernel """
N, Q, D = 50, 1, 2 N, Q, D = 50, 1, 2
X = np.random.rand(N, Q) X = np.random.rand(N, Q)
k = GPy.kern.rbf(Q, 0.5, 0.9) + GPy.kern.bias(Q, 0.1) + GPy.kern.white(Q, 0.05) k = GPy.kern.rbf(Q, 0.5, 0.9*np.ones((1,))) + GPy.kern.bias(Q, 0.1) + GPy.kern.white(Q, 0.05)
K = k.K(X) K = k.K(X)
Y = np.random.multivariate_normal(np.zeros(N),K,D).T Y = np.random.multivariate_normal(np.zeros(N),K,D).T
m = GPy.models.GPLVM(Y, Q, kernel = k) m = GPy.models.GPLVM(Y, Q, kernel = k)
@ -151,6 +166,7 @@ class GradientTests(unittest.TestCase):
m.approximate_likelihood() m.approximate_likelihood()
self.assertTrue(m.checkgrad()) self.assertTrue(m.checkgrad())
@unittest.skip("FITC will be broken for a while")
def test_generalized_FITC(self): def test_generalized_FITC(self):
N = 20 N = 20
X = np.hstack([np.random.rand(N/2)+1,np.random.rand(N/2)-1])[:,None] X = np.hstack([np.random.rand(N/2)+1,np.random.rand(N/2)-1])[:,None]

View file

@ -90,7 +90,7 @@ def toy_rbf_1d(seed=default_seed):
N = 500 N = 500
X = np.random.uniform(low=-1.0, high=1.0, size=(N, numIn)) X = np.random.uniform(low=-1.0, high=1.0, size=(N, numIn))
X.sort(axis=0) X.sort(axis=0)
rbf = GPy.kern.rbf(numIn, variance=1., lengthscale=0.25) rbf = GPy.kern.rbf(numIn, variance=1., lengthscale=np.array((0.25,)))
white = GPy.kern.white(numIn, variance=1e-2) white = GPy.kern.white(numIn, variance=1e-2)
kernel = rbf + white kernel = rbf + white
K = kernel.K(X) K = kernel.K(X)
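For context, a toy dataset like this is completed by drawing the targets from the GP prior; a minimal sketch of that step, reusing the names N and K from the surrounding code (a hypothetical continuation, not necessarily the function's actual body):
# draw one function sample y ~ N(0, K) over the sorted inputs X
y = np.random.multivariate_normal(np.zeros(N), K)[:, None]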

View file

@ -33,7 +33,7 @@ class WarpingFunction(object):
"""inverse function transformation""" """inverse function transformation"""
raise NotImplementedError raise NotImplementedError
def get_param_names(self): def _get_param_names(self):
raise NotImplementedError raise NotImplementedError
def plot(self, psi, xmin, xmax): def plot(self, psi, xmin, xmax):
@ -151,7 +151,7 @@ class TanhWarpingFunction(WarpingFunction):
return gradients return gradients
def get_param_names(self): def _get_param_names(self):
variables = ['a', 'b', 'c'] variables = ['a', 'b', 'c']
names = sum([['warp_tanh_%s_t%i' % (variables[n],q) for n in range(3)] for q in range(self.n_terms)],[]) names = sum([['warp_tanh_%s_t%i' % (variables[n],q) for n in range(3)] for q in range(self.n_terms)],[])
return names return names
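The nested comprehension builds three warp parameter names per tanh term, with the outer loop running over terms; for example, with n_terms=2 it produces:
names = sum([['warp_tanh_%s_t%i' % (v, q) for v in ['a', 'b', 'c']] for q in range(2)], [])
# ['warp_tanh_a_t0', 'warp_tanh_b_t0', 'warp_tanh_c_t0',
#  'warp_tanh_a_t1', 'warp_tanh_b_t1', 'warp_tanh_c_t1']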

View file

@ -1,5 +1,7 @@
GPy GPy
=== ===
Gaussian processes framework in python A Gaussian process framework in Python.
* [Online documentation](https://gpy.readthedocs.org/en/latest/)
* [Unit tests (Travis-CI)](https://travis-ci.org/SheffieldML/GPy)

35
doc/GPy.core.rst Normal file
View file

@ -0,0 +1,35 @@
core Package
============
:mod:`core` Package
-------------------
.. automodule:: GPy.core
:members:
:undoc-members:
:show-inheritance:
:mod:`model` Module
-------------------
.. automodule:: GPy.core.model
:members:
:undoc-members:
:show-inheritance:
:mod:`parameterised` Module
---------------------------
.. automodule:: GPy.core.parameterised
:members:
:undoc-members:
:show-inheritance:
:mod:`priors` Module
--------------------
.. automodule:: GPy.core.priors
:members:
:undoc-members:
:show-inheritance:

35
doc/GPy.inference.rst Normal file
View file

@ -0,0 +1,35 @@
inference Package
=================
:mod:`Expectation_Propagation` Module
-------------------------------------
.. automodule:: GPy.inference.Expectation_Propagation
:members:
:undoc-members:
:show-inheritance:
:mod:`likelihoods` Module
-------------------------
.. automodule:: GPy.inference.likelihoods
:members:
:undoc-members:
:show-inheritance:
:mod:`optimization` Module
--------------------------
.. automodule:: GPy.inference.optimization
:members:
:undoc-members:
:show-inheritance:
:mod:`samplers` Module
----------------------
.. automodule:: GPy.inference.samplers
:members:
:undoc-members:
:show-inheritance:

139
doc/GPy.kern.rst Normal file
View file

@ -0,0 +1,139 @@
kern Package
============
:mod:`kern` Package
-------------------
.. automodule:: GPy.kern
:members:
:undoc-members:
:show-inheritance:
:mod:`Brownian` Module
----------------------
.. automodule:: GPy.kern.Brownian
:members:
:undoc-members:
:show-inheritance:
:mod:`Matern32` Module
----------------------
.. automodule:: GPy.kern.Matern32
:members:
:undoc-members:
:show-inheritance:
:mod:`Matern52` Module
----------------------
.. automodule:: GPy.kern.Matern52
:members:
:undoc-members:
:show-inheritance:
:mod:`bias` Module
------------------
.. automodule:: GPy.kern.bias
:members:
:undoc-members:
:show-inheritance:
:mod:`constructors` Module
--------------------------
.. automodule:: GPy.kern.constructors
:members:
:undoc-members:
:show-inheritance:
:mod:`exponential` Module
-------------------------
.. automodule:: GPy.kern.exponential
:members:
:undoc-members:
:show-inheritance:
:mod:`finite_dimensional` Module
--------------------------------
.. automodule:: GPy.kern.finite_dimensional
:members:
:undoc-members:
:show-inheritance:
:mod:`kern` Module
------------------
.. automodule:: GPy.kern.kern
:members:
:undoc-members:
:show-inheritance:
:mod:`kernpart` Module
----------------------
.. automodule:: GPy.kern.kernpart
:members:
:undoc-members:
:show-inheritance:
:mod:`linear` Module
--------------------
.. automodule:: GPy.kern.linear
:members:
:undoc-members:
:show-inheritance:
:mod:`linear_ARD` Module
------------------------
.. automodule:: GPy.kern.linear_ARD
:members:
:undoc-members:
:show-inheritance:
:mod:`rbf-testing` Module
-------------------------
.. automodule:: GPy.kern.rbf-testing
:members:
:undoc-members:
:show-inheritance:
:mod:`rbf` Module
-----------------
.. automodule:: GPy.kern.rbf
:members:
:undoc-members:
:show-inheritance:
:mod:`spline` Module
--------------------
.. automodule:: GPy.kern.spline
:members:
:undoc-members:
:show-inheritance:
:mod:`sympykern` Module
-----------------------
.. automodule:: GPy.kern.sympykern
:members:
:undoc-members:
:show-inheritance:
:mod:`white` Module
-------------------
.. automodule:: GPy.kern.white
:members:
:undoc-members:
:show-inheritance:

75
doc/GPy.models.rst Normal file
View file

@ -0,0 +1,75 @@
models Package
==============
:mod:`models` Package
---------------------
.. automodule:: GPy.models
:members:
:undoc-members:
:show-inheritance:
:mod:`GPLVM` Module
-------------------
.. automodule:: GPy.models.GPLVM
:members:
:undoc-members:
:show-inheritance:
:mod:`GP_EP` Module
-------------------
.. automodule:: GPy.models.GP_EP
:members:
:undoc-members:
:show-inheritance:
:mod:`GP_regression` Module
---------------------------
.. automodule:: GPy.models.GP_regression
:members:
:undoc-members:
:show-inheritance:
:mod:`generalized_FITC` Module
------------------------------
.. automodule:: GPy.models.generalized_FITC
:members:
:undoc-members:
:show-inheritance:
:mod:`sparse_GPLVM` Module
--------------------------
.. automodule:: GPy.models.sparse_GPLVM
:members:
:undoc-members:
:show-inheritance:
:mod:`sparse_GP_regression` Module
----------------------------------
.. automodule:: GPy.models.sparse_GP_regression
:members:
:undoc-members:
:show-inheritance:
:mod:`uncollapsed_sparse_GP` Module
-----------------------------------
.. automodule:: GPy.models.uncollapsed_sparse_GP
:members:
:undoc-members:
:show-inheritance:
:mod:`warped_GP` Module
-----------------------
.. automodule:: GPy.models.warped_GP
:members:
:undoc-members:
:show-inheritance:

22
doc/GPy.rst Normal file
View file

@ -0,0 +1,22 @@
GPy Package
===========
:mod:`GPy` Package
------------------
.. automodule:: GPy.__init__
:members:
:undoc-members:
:show-inheritance:
Subpackages
-----------
.. toctree::
GPy.core
GPy.inference
GPy.kern
GPy.models
GPy.util

67
doc/GPy.util.rst Normal file
View file

@ -0,0 +1,67 @@
util Package
============
:mod:`util` Package
-------------------
.. automodule:: GPy.util
:members:
:undoc-members:
:show-inheritance:
:mod:`Tango` Module
-------------------
.. automodule:: GPy.util.Tango
:members:
:undoc-members:
:show-inheritance:
:mod:`datasets` Module
----------------------
.. automodule:: GPy.util.datasets
:members:
:undoc-members:
:show-inheritance:
:mod:`linalg` Module
--------------------
.. automodule:: GPy.util.linalg
:members:
:undoc-members:
:show-inheritance:
:mod:`misc` Module
------------------
.. automodule:: GPy.util.misc
:members:
:undoc-members:
:show-inheritance:
:mod:`plot` Module
------------------
.. automodule:: GPy.util.plot
:members:
:undoc-members:
:show-inheritance:
:mod:`squashers` Module
-----------------------
.. automodule:: GPy.util.squashers
:members:
:undoc-members:
:show-inheritance:
:mod:`warping_functions` Module
-------------------------------
.. automodule:: GPy.util.warping_functions
:members:
:undoc-members:
:show-inheritance:

View file

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# #
# GPy documentation build configuration file, created by # GPy documentation build configuration file, created by
# sphinx-quickstart on Wed Jan 9 15:21:20 2013. # sphinx-quickstart on Fri Jan 18 15:30:28 2013.
# #
# This file is execfile()d with the current directory set to its containing dir. # This file is execfile()d with the current directory set to its containing dir.
# #
@ -25,7 +25,15 @@ import sys, os
# Add any Sphinx extension module names here, as strings. They can be extensions # Add any Sphinx extension module names here, as strings. They can be extensions
# coming with Sphinx (named 'sphinx.ext.*') or your custom ones. # coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
extensions = ['sphinx.ext.autodoc', 'sphinx.ext.todo', 'sphinx.ext.pngmath', 'sphinx.ext.mathjax', 'sphinx.ext.viewcode'] extensions = ['sphinx.ext.autodoc', 'sphinx.ext.viewcode']
# ----------------------- READTHEDOCS ------------------
on_rtd = os.environ.get('READTHEDOCS', None) == 'True'
if on_rtd:
sys.path.append("../GPy")
os.system("pwd")
os.system("sphinx-apidoc -f -o . ../GPy")
# Add any paths that contain templates here, relative to this directory. # Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates'] templates_path = ['_templates']
@ -41,16 +49,16 @@ master_doc = 'index'
# General information about the project. # General information about the project.
project = u'GPy' project = u'GPy'
copyright = u'2013, The GPy authors' copyright = u'2013, Author'
# The version info for the project you're documenting, acts as replacement for # The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the # |version| and |release|, also used in various other places throughout the
# built documents. # built documents.
# #
# The short X.Y version. # The short X.Y version.
version = '0.00001' version = ''
# The full version, including alpha/beta/rc tags. # The full version, including alpha/beta/rc tags.
release = '0.00001' release = ''
# The language for content autogenerated by Sphinx. Refer to documentation # The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages. # for a list of supported languages.
@ -184,7 +192,7 @@ latex_elements = {
# (source start file, target name, title, author, documentclass [howto/manual]). # (source start file, target name, title, author, documentclass [howto/manual]).
latex_documents = [ latex_documents = [
('index', 'GPy.tex', u'GPy Documentation', ('index', 'GPy.tex', u'GPy Documentation',
u'The GPy authors', 'manual'), u'Author', 'manual'),
] ]
# The name of an image file (relative to this directory) to place at the top of # The name of an image file (relative to this directory) to place at the top of
@ -214,7 +222,7 @@ latex_documents = [
# (source start file, name, description, authors, manual section). # (source start file, name, description, authors, manual section).
man_pages = [ man_pages = [
('index', 'gpy', u'GPy Documentation', ('index', 'gpy', u'GPy Documentation',
[u'The GPy authors'], 1) [u'Author'], 1)
] ]
# If true, show URL addresses after external links. # If true, show URL addresses after external links.
@ -228,7 +236,7 @@ man_pages = [
# dir menu entry, description, category) # dir menu entry, description, category)
texinfo_documents = [ texinfo_documents = [
('index', 'GPy', u'GPy Documentation', ('index', 'GPy', u'GPy Documentation',
u'The GPy authors', 'GPy', 'One line description of project.', u'Author', 'GPy', 'One line description of project.',
'Miscellaneous'), 'Miscellaneous'),
] ]
@ -240,3 +248,47 @@ texinfo_documents = [
# How to display URL addresses: 'footnote', 'no', or 'inline'. # How to display URL addresses: 'footnote', 'no', or 'inline'.
#texinfo_show_urls = 'footnote' #texinfo_show_urls = 'footnote'
# -- Options for Epub output ---------------------------------------------------
# Bibliographic Dublin Core info.
epub_title = u'GPy'
epub_author = u'Author'
epub_publisher = u'Author'
epub_copyright = u'2013, Author'
# The language of the text. It defaults to the language option
# or en if the language is not set.
#epub_language = ''
# The scheme of the identifier. Typical schemes are ISBN or URL.
#epub_scheme = ''
# The unique identifier of the text. This can be a ISBN number
# or the project homepage.
#epub_identifier = ''
# A unique identification for the text.
#epub_uid = ''
# A tuple containing the cover image and cover page html template filenames.
#epub_cover = ()
# HTML files that should be inserted before the pages created by sphinx.
# The format is a list of tuples containing the path and title.
#epub_pre_files = []
# HTML files that should be inserted after the pages created by sphinx.
# The format is a list of tuples containing the path and title.
#epub_post_files = []
# A list of files that should not be packed into the epub file.
#epub_exclude_files = []
# The depth of the table of contents in toc.ncx.
#epub_tocdepth = 3
# Allow duplicate toc entries.
#epub_tocdup = True

View file

@ -1,5 +1,5 @@
.. GPy documentation master file, created by .. GPy documentation master file, created by
sphinx-quickstart on Wed Jan 9 15:21:20 2013. sphinx-quickstart on Fri Jan 18 17:36:01 2013.
You can adapt this file completely to your liking, but it should at least You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive. contain the root `toctree` directive.
@ -9,8 +9,9 @@ Welcome to GPy's documentation!
Contents: Contents:
.. toctree:: .. toctree::
:maxdepth: 2 :maxdepth: 4
GPy
Indices and tables Indices and tables

190
doc/make.bat Normal file
View file

@ -0,0 +1,190 @@
@ECHO OFF
REM Command file for Sphinx documentation
if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
set BUILDDIR=_build
set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% .
set I18NSPHINXOPTS=%SPHINXOPTS% .
if NOT "%PAPER%" == "" (
set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS%
)
if "%1" == "" goto help
if "%1" == "help" (
:help
echo.Please use `make ^<target^>` where ^<target^> is one of
echo. html to make standalone HTML files
echo. dirhtml to make HTML files named index.html in directories
echo. singlehtml to make a single large HTML file
echo. pickle to make pickle files
echo. json to make JSON files
echo. htmlhelp to make HTML files and a HTML help project
echo. qthelp to make HTML files and a qthelp project
echo. devhelp to make HTML files and a Devhelp project
echo. epub to make an epub
echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter
echo. text to make text files
echo. man to make manual pages
echo. texinfo to make Texinfo files
echo. gettext to make PO message catalogs
echo. changes to make an overview over all changed/added/deprecated items
echo. linkcheck to check all external links for integrity
echo. doctest to run all doctests embedded in the documentation if enabled
goto end
)
if "%1" == "clean" (
for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
del /q /s %BUILDDIR%\*
goto end
)
if "%1" == "html" (
%SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The HTML pages are in %BUILDDIR%/html.
goto end
)
if "%1" == "dirhtml" (
%SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
goto end
)
if "%1" == "singlehtml" (
%SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml.
goto end
)
if "%1" == "pickle" (
%SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle
if errorlevel 1 exit /b 1
echo.
echo.Build finished; now you can process the pickle files.
goto end
)
if "%1" == "json" (
%SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json
if errorlevel 1 exit /b 1
echo.
echo.Build finished; now you can process the JSON files.
goto end
)
if "%1" == "htmlhelp" (
%SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp
if errorlevel 1 exit /b 1
echo.
echo.Build finished; now you can run HTML Help Workshop with the ^
.hhp project file in %BUILDDIR%/htmlhelp.
goto end
)
if "%1" == "qthelp" (
%SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp
if errorlevel 1 exit /b 1
echo.
echo.Build finished; now you can run "qcollectiongenerator" with the ^
.qhcp project file in %BUILDDIR%/qthelp, like this:
echo.^> qcollectiongenerator %BUILDDIR%\qthelp\GPy.qhcp
echo.To view the help file:
echo.^> assistant -collectionFile %BUILDDIR%\qthelp\GPy.qhc
goto end
)
if "%1" == "devhelp" (
%SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp
if errorlevel 1 exit /b 1
echo.
echo.Build finished.
goto end
)
if "%1" == "epub" (
%SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The epub file is in %BUILDDIR%/epub.
goto end
)
if "%1" == "latex" (
%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
if errorlevel 1 exit /b 1
echo.
echo.Build finished; the LaTeX files are in %BUILDDIR%/latex.
goto end
)
if "%1" == "text" (
%SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The text files are in %BUILDDIR%/text.
goto end
)
if "%1" == "man" (
%SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The manual pages are in %BUILDDIR%/man.
goto end
)
if "%1" == "texinfo" (
%SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo.
goto end
)
if "%1" == "gettext" (
%SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The message catalogs are in %BUILDDIR%/locale.
goto end
)
if "%1" == "changes" (
%SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes
if errorlevel 1 exit /b 1
echo.
echo.The overview file is in %BUILDDIR%/changes.
goto end
)
if "%1" == "linkcheck" (
%SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
if errorlevel 1 exit /b 1
echo.
echo.Link check complete; look for any errors in the above output ^
or in %BUILDDIR%/linkcheck/output.txt.
goto end
)
if "%1" == "doctest" (
%SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
if errorlevel 1 exit /b 1
echo.
echo.Testing of doctests in the sources finished, look at the ^
results in %BUILDDIR%/doctest/output.txt.
goto end
)
:end

7
doc/modules.rst Normal file
View file

@ -0,0 +1,7 @@
GPy
===
.. toctree::
:maxdepth: 4
GPy

105
doc/tuto_GP_regression.rst Normal file
View file

@ -0,0 +1,105 @@
*************************************
Gaussian process regression tutorial
*************************************
In this tutorial we cover the basics of building a 1-dimensional and a 2-dimensional Gaussian process model, also known as a kriging model.
We first import the libraries we will need: ::
import pylab as pb
pb.ion()
import numpy as np
import GPy
1 dimensional model
===================
For this toy example, we assume we have the following inputs and outputs::
X = np.random.uniform(-3.,3.,(20,1))
Y = np.sin(X) + np.random.randn(20,1)*0.05
Note that the observations Y include some noise.
The first step is to define the covariance kernel we want to use for the model. Here we choose a kernel based on the Gaussian kernel (i.e. rbf or squared exponential) plus some white noise::
Gaussian = GPy.kern.rbf(D=1)
noise = GPy.kern.white(D=1)
kernel = Gaussian + noise
The parameter D stands for the dimension of the input space. Note that many other kernels are implemented, such as the following (a quick check of the combined kernel is sketched after this list):
* linear (``GPy.kern.linear``)
* exponential kernel (``GPy.kern.exponential``)
* Matern 3/2 (``GPy.kern.Matern32``)
* Matern 5/2 (``GPy.kern.Matern52``)
* spline (``GPy.kern.spline``)
* and many others...
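Whichever kernels are chosen, their sum is itself a kernel object, so the combined covariance can be inspected directly. A quick sketch (the printed values depend on the kernels' default parameters)::

    Xtest = np.linspace(0., 1., 4)[:, None]
    print kernel.K(Xtest)   # 4x4 matrix: the rbf covariance plus the noise variance on the diagonal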
The inputs required for building the model are the observations and the kernel::
m = GPy.models.GP_regression(X,Y,kernel)
The functions ``print`` and ``plot`` can help us understand the model we have just built::
print m
m.plot()
The default values of the kernel parameters may not be suited to the data at hand (for example, the confidence intervals seem too wide in the previous figure). A common approach is to find the values of the parameters that maximize the likelihood of the data. There are two steps to doing this with GPy:
* Constrain the parameters of the kernel to ensure it remains a valid covariance structure (for example, we don't want variances to be negative!).
* Run the optimization
There are various ways to constrain the parameters of the kernel. The most basic is to constrain all the parameters to be positive::
m.constrain_positive('')
but it is also possible to bound a parameter within a range or to fix it to a given value. The argument of ``m.constrain_positive`` is a regular expression that matches the names of the parameters to be constrained (as shown by ``print m``). For example, if we want the variance to be positive, the lengthscale to lie in [1,10] and the noise variance to be fixed, we can write::
#m.unconstrain('') # Required if the model has been previously constrained
m.constrain_positive('rbf_variance')
m.constrain_bounded('lengthscale', 1., 10.)
m.constrain_fixed('white',0.0025)
Once the constraints have been imposed, the model can be optimized::
m.optimize()
If we want to perform several restarts to try to improve the result of the optimization, we can use the ``optimize_restarts`` function::
m.optimize_restarts(Nrestarts = 10)
m.plot()
print(m)
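Once optimized, the model can also be queried at new inputs. A minimal sketch, assuming ``predict`` returns the posterior mean and variance as its first two outputs (the exact return signature may differ between versions)::

    Xnew = np.linspace(-3., 3., 100)[:, None]
    out = m.predict(Xnew)
    mean, variance = out[0], out[1]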
2 dimensional example
=====================
Here is a 2 dimensional example::
import pylab as pb
pb.ion()
import numpy as np
import GPy
# sample inputs and outputs
X = np.random.uniform(-3.,3.,(50,2))
Y = np.sin(X[:,0:1]) * np.sin(X[:,1:2])+np.random.randn(50,1)*0.05
# define kernel
ker = GPy.kern.Matern52(2,ARD=True) + GPy.kern.white(2)
# create simple GP model
m = GPy.models.GP_regression(X,Y,ker)
# constrain all parameters to be positive # constrain all parameters to be positive
m.constrain_positive('')
# optimize and plot
pb.figure()
m.optimize('tnc', max_f_eval = 1000)
m.plot()
print(m)
The flag ``ARD=True`` in the definition of the Matern kernel specifies that we want one lengthscale parameter per input dimension (i.e. the GP is not isotropic).
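After optimization, the fitted lengthscales can then be compared across input dimensions to judge their relative importance. A minimal sketch using the model's regex-based ``get`` (the exact parameter names are version dependent; check ``print m``)::

    print m.get('lengthscale')   # with ARD, one lengthscale per input dimension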

View file

@ -24,9 +24,9 @@ setup(name = 'GPy',
package_data = {'GPy': ['GPy/examples']}, package_data = {'GPy': ['GPy/examples']},
py_modules = ['GPy.__init__'], py_modules = ['GPy.__init__'],
long_description=read('README.md'), long_description=read('README.md'),
ext_modules = [Extension(name = 'GPy.kern.lfmUpsilonf2py', #ext_modules = [Extension(name = 'GPy.kern.lfmUpsilonf2py',
sources = ['GPy/kern/src/lfmUpsilonf2py.f90'])], # sources = ['GPy/kern/src/lfmUpsilonf2py.f90'])],
install_requires=['numpy>=1.6', 'scipy>=0.9','matplotlib>=1.1'], install_requires=['sympy', 'numpy>=1.6', 'scipy>=0.9','matplotlib>=1.1'],
setup_requires=['sphinx'], setup_requires=['sphinx'],
cmdclass = {'build_sphinx': BuildDoc}, cmdclass = {'build_sphinx': BuildDoc},
classifiers=[ classifiers=[