merged trivial conflict

2026-07-23 17:01:06 +02:00 · 2013-01-18 17:49:50 +00:00 · 2013-01-18 17:49:50 +00:00 · b52c826387
commit b52c826387
parent 6d98a23077 9201bb8a89
47 changed files with 1043 additions and 612 deletions
--- a/.travis.yml
+++ b/.travis.yml
@ -0,0 +1,11 @@
 language: python
 python:
  - "2.7"
 # command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors
 install: 
  - sudo apt-get install python-scipy
  - pip install sphinx
  - pip install . --use-mirrors
 # command to run tests, e.g. python setup.py test
 script: 
  - nosetests --with-xcoverage --with-xunit --cover-package=GPy --cover-erase GPy/testing
--- a/GPy/init.py
+++ b/GPy/init.py
@ -6,5 +6,6 @@ import kern
 import models
 import inference
 import util
 #import examples
 #import examples TODO: discuss!
 from core import priors
--- a/GPy/core/model.py
+++ b/GPy/core/model.py
@ -14,18 +14,18 @@ from ..inference import optimization
 class model(parameterised):
    def __init__(self):
        parameterised.__init__(self)
-        self.priors = [None for i in range(self.get_param().size)]
+        self.priors = [None for i in range(self._get_params().size)]
        self.optimization_runs = []
        self.sampling_runs = []
-        self.set_param(self.get_param())
+        self._set_params(self._get_params())
        self.preferred_optimizer = 'tnc'
-    def get_param(self):
+    def _get_params(self):
        raise NotImplementedError, "this needs to be implemented to utilise the model class"
-    def set_param(self,x):
+    def _set_params(self,x):
        raise NotImplementedError, "this needs to be implemented to utilise the model class"
    def log_likelihood(self):
        raise NotImplementedError, "this needs to be implemented to utilise the model class"
-    def log_likelihood_gradients(self):
+    def _log_likelihood_gradients(self):
        raise NotImplementedError, "this needs to be implemented to utilise the model class"
    def set_prior(self,which,what):
@ -67,7 +67,7 @@ class model(parameterised):
            unconst = np.setdiff1d(which, self.constrained_positive_indices)
            if len(unconst):
                print "Warning: constraining parameters to be positive:"
-                print '\n'.join([n for i,n in enumerate(self.get_param_names()) if i in unconst])
+                print '\n'.join([n for i,n in enumerate(self._get_param_names()) if i in unconst])
                print '\n'
                self.constrain_positive(unconst)
        elif isinstance(what,priors.Gaussian):
@ -80,48 +80,65 @@ class model(parameterised):
        for w in which:
            self.priors[w] = what
-    def get(self,name):
+    def get(self,name, return_names=False):
        """
-        get a model parameter by name
+        Get a model parameter by name. The name is applied as a regular expression and all parameters that match that regular expression are returned. 
        """
        matches = self.grep_param_names(name)
        if len(matches):
-            return self.get_param()[matches]
+            if return_names:
                return self._get_params()[matches], np.asarray(self._get_param_names())[matches].tolist()
            else:
                return self._get_params()[matches]
        else:
            raise AttributeError, "no parameter matches %s"%name
    def set(self,name,val):
        """
-        Set a model parameter by name
+        Set model parameter(s) by name. The name is provided as a regular expression. All parameters matching that regular expression are set to ghe given value.
        """
        matches = self.grep_param_names(name)
        if len(matches):
-            x = self.get_param()
+            x = self._get_params()
            x[matches] = val
-            self.set_param(x)
+            self._set_params(x)
        else:
            raise AttributeError, "no parameter matches %s"%name
    def get_gradient(self,name, return_names=False):
        """
        Get model gradient(s) by name. The name is applied as a regular expression and all parameters that match that regular expression are returned. 
        """
        matches = self.grep_param_names(name)
        if len(matches):
            if return_names:
                return self._log_likelihood_gradients()[matches],  np.asarray(self._get_param_names())[matches].tolist()
            else:
                return self._log_likelihood_gradients()[matches]
        else:
            raise AttributeError, "no parameter matches %s"%name
    def log_prior(self):
        """evaluate the prior"""
-        return np.sum([p.lnpdf(x) for p, x in zip(self.priors,self.get_param()) if p is not None])
+        return np.sum([p.lnpdf(x) for p, x in zip(self.priors,self._get_params()) if p is not None])
-    def log_prior_gradients(self):
+    def _log_prior_gradients(self):
        """evaluate the gradients of the priors"""
-        x = self.get_param()
+        x = self._get_params()
        ret = np.zeros(x.size)
        [np.put(ret,i,p.lnpdf_grad(xx)) for i,(p,xx) in enumerate(zip(self.priors,x)) if not p is None]
        return ret
-    def extract_gradients(self):
+    def _log_likelihood_gradients_transformed(self):
        """
        Use self.log_likelihood_gradients and self.prior_gradients to get the gradients of the model.
        Adjust the gradient for constraints and ties, return.
        """
-        g = self.log_likelihood_gradients() + self.log_prior_gradients()
+        g = self._log_likelihood_gradients() + self._log_prior_gradients()
-        x = self.get_param()
+        x = self._get_params()
        g[self.constrained_positive_indices] = g[self.constrained_positive_indices]*x[self.constrained_positive_indices]
        g[self.constrained_negative_indices] = g[self.constrained_negative_indices]*x[self.constrained_negative_indices]
        [np.put(g,i,g[i]*(x[i]-l)*(h-x[i])/(h-l)) for i,l,h in zip(self.constrained_bounded_indices, self.constrained_bounded_lowers, self.constrained_bounded_uppers)]
@ -138,14 +155,14 @@ class model(parameterised):
        Make this draw from the prior if one exists, else draw from N(0,1)
        """
        #first take care of all parameters (from N(0,1))
-        x = self.extract_param()
+        x = self._get_params_transformed()
        x = np.random.randn(x.size)
-        self.expand_param(x)
+        self._set_params_transformed(x)
        #now draw from prior where possible
-        x = self.get_param()
+        x = self._get_params()
        [np.put(x,i,p.rvs(1)) for i,p in enumerate(self.priors) if not p is None]
-        self.set_param(x)
+        self._set_params(x)
-        self.expand_param(self.extract_param())#makes sure all of the tied parameters get the same init (since there's only one prior object...)
+        self._set_params_transformed(self._get_params_transformed())#makes sure all of the tied parameters get the same init (since there's only one prior object...)
    def optimize_restarts(self, Nrestarts=10, robust=False, verbose=True, **kwargs):
@ -165,7 +182,7 @@ class model(parameterised):
        :verbose: whether to show informations about the current restart
        """
-        initial_parameters = self.extract_param()
+        initial_parameters = self._get_params_transformed()
        for i in range(Nrestarts):
            try:
                self.randomize()
@ -182,9 +199,9 @@ class model(parameterised):
                    raise e
        if len(self.optimization_runs):
            i = np.argmin([o.f_opt for o in self.optimization_runs])
-            self.expand_param(self.optimization_runs[i].x_opt)
+            self._set_params_transformed(self.optimization_runs[i].x_opt)
        else:
-            self.expand_param(initial_parameters)
+            self._set_params_transformed(initial_parameters)
    def ensure_default_constraints(self,warn=False):
        """
@ -194,7 +211,7 @@ class model(parameterised):
        for s in positive_strings:
            for i in self.grep_param_names(s):
                if not (i in self.all_constrained_indices()):
-                    name = self.get_param_names()[i]
+                    name = self._get_param_names()[i]
                    self.constrain_positive(name)
                    if warn:
                        print "Warning! constraining %s postive"%name
@ -214,24 +231,24 @@ class model(parameterised):
            optimizer = self.preferred_optimizer
        def f(x):
-            self.expand_param(x)
+            self._set_params_transformed(x)
            return -self.log_likelihood()-self.log_prior()
        def fp(x):
-            self.expand_param(x)
+            self._set_params_transformed(x)
-            return -self.extract_gradients()
+            return -self._log_likelihood_gradients_transformed()
        def f_fp(x):
-            self.expand_param(x)
+            self._set_params_transformed(x)
-            return -self.log_likelihood()-self.log_prior(),-self.extract_gradients()
+            return -self.log_likelihood()-self.log_prior(),-self._log_likelihood_gradients_transformed()
        if start == None:
-            start = self.extract_param()
+            start = self._get_params_transformed()
        optimizer = optimization.get_optimizer(optimizer)
        opt = optimizer(start, model = self, **kwargs)
        opt.run(f_fp=f_fp, f=f, fp=fp)
        self.optimization_runs.append(opt)
-        self.expand_param(opt.x_opt)
+        self._set_params_transformed(opt.x_opt)
    def optimize_SGD(self, momentum = 0.1, learning_rate = 0.01, iterations = 20, **kwargs):
        # assert self.Y.shape[1] > 1, "SGD only works with D > 1"
@ -248,13 +265,13 @@ class model(parameterised):
        else:
            print "numerically calculating hessian. please be patient!"
-            x = self.get_param()
+            x = self._get_params()
            def f(x):
-                self.set_param(x)
+                self._set_params(x)
                return self.log_likelihood()
            h = ndt.Hessian(f)
            A = -h(x)
-            self.set_param(x)
+            self._set_params(x)
        # check for almost zero components on the diagonal which screw up the cholesky
        aa = np.nonzero((np.diag(A)<1e-6) & (np.diag(A)>0.))[0]
        A[aa,aa] = 0.
@ -268,7 +285,7 @@ class model(parameterised):
            hld = np.sum(np.log(np.diag(jitchol(A)[0])))
        except:
            return np.nan
-        return 0.5*self.get_param().size*np.log(2*np.pi) + self.log_likelihood() - hld
+        return 0.5*self._get_params().size*np.log(2*np.pi) + self.log_likelihood() - hld
    def __str__(self):
        s = parameterised.__str__(self).split('\n')
@ -292,18 +309,18 @@ class model(parameterised):
        If the overall gradient fails, invividual components are tested.
        """
-        x = self.extract_param().copy()
+        x = self._get_params_transformed().copy()
        #choose a random direction to step in:
        dx = step*np.sign(np.random.uniform(-1,1,x.size))
        #evaulate around the point x
-        self.expand_param(x+dx)
+        self._set_params_transformed(x+dx)
-        f1,g1 = self.log_likelihood() + self.log_prior(), self.extract_gradients()
+        f1,g1 = self.log_likelihood() + self.log_prior(), self._log_likelihood_gradients_transformed()
-        self.expand_param(x-dx)
+        self._set_params_transformed(x-dx)
-        f2,g2 = self.log_likelihood() + self.log_prior(), self.extract_gradients()
+        f2,g2 = self.log_likelihood() + self.log_prior(), self._log_likelihood_gradients_transformed()
-        self.expand_param(x)
+        self._set_params_transformed(x)
-        gradient = self.extract_gradients()
+        gradient = self._log_likelihood_gradients_transformed()
        numerical_gradient = (f1-f2)/(2*dx)
        ratio = (f1-f2)/(2*np.dot(dx,gradient))
@ -319,7 +336,7 @@ class model(parameterised):
                print "Global check failed. Testing individual gradients\n"
                try:
-                    names = self.extract_param_names()
+                    names = self._get_param_names_transformed()
                except NotImplementedError:
                    names = ['Variable %i'%i for i in range(len(x))]
@ -338,13 +355,13 @@ class model(parameterised):
            for i in range(len(x)):
                xx = x.copy()
                xx[i] += step
-                self.expand_param(xx)
+                self._set_params_transformed(xx)
-                f1,g1 = self.log_likelihood() + self.log_prior(), self.extract_gradients()[i]
+                f1,g1 = self.log_likelihood() + self.log_prior(), self._log_likelihood_gradients_transformed()[i]
                xx[i] -= 2.*step
-                self.expand_param(xx)
+                self._set_params_transformed(xx)
-                f2,g2 = self.log_likelihood() + self.log_prior(), self.extract_gradients()[i]
+                f2,g2 = self.log_likelihood() + self.log_prior(), self._log_likelihood_gradients_transformed()[i]
-                self.expand_param(x)
+                self._set_params_transformed(x)
-                gradient = self.extract_gradients()[i]
+                gradient = self._log_likelihood_gradients_transformed()[i]
                numerical_gradient = (f1-f2)/(2*step)
--- a/GPy/core/parameterised.py
+++ b/GPy/core/parameterised.py
@ -66,7 +66,7 @@ class parameterised(object):
        if hasattr(self,'prior'):
            pass
-        self.expand_param(self.extract_param())# sets tied parameters to single value
+        self._set_params_transformed(self._get_params_transformed())# sets tied parameters to single value
    def untie_everything(self):
        """Unties all parameters by setting tied_indices to an empty list."""
@ -87,7 +87,7 @@ class parameterised(object):
        Returns
        -------
-        the indices of self.get_param_names which match the regular expression.
+        the indices of self._get_param_names which match the regular expression.
        Notes
        -----
@ -96,9 +96,9 @@ class parameterised(object):
        if type(expr) is str:
            expr = re.compile(expr)
-            return np.nonzero([expr.search(name) for name in self.get_param_names()])[0]
+            return np.nonzero([expr.search(name) for name in self._get_param_names()])[0]
        elif type(expr) is re._pattern_type:
-            return np.nonzero([expr.search(name) for name in self.get_param_names()])[0]
+            return np.nonzero([expr.search(name) for name in self._get_param_names()])[0]
        else:
            return expr
@ -115,11 +115,11 @@ class parameterised(object):
        assert not np.any(matches[:,None]==self.all_constrained_indices()), "Some indices are already constrained"
        self.constrained_positive_indices = np.hstack((self.constrained_positive_indices, matches))
        #check to ensure constraint is in place
-        x = self.get_param()
+        x = self._get_params()
        for i,xx in enumerate(x):
            if (xx<0) & (i in matches):
                x[i] = -xx
-        self.set_param(x)
+        self._set_params(x)
    def unconstrain(self,which):
@ -163,11 +163,11 @@ class parameterised(object):
        assert not np.any(matches[:,None]==self.all_constrained_indices()), "Some indices are already constrained"
        self.constrained_negative_indices = np.hstack((self.constrained_negative_indices, matches))
        #check to ensure constraint is in place
-        x = self.get_param()
+        x = self._get_params()
        for i,xx in enumerate(x):
            if (xx>0.) and (i in matches):
                x[i] = -xx
-        self.set_param(x)
+        self._set_params(x)
@ -187,11 +187,11 @@ class parameterised(object):
        self.constrained_bounded_uppers.append(upper)
        self.constrained_bounded_lowers.append(lower)
        #check to ensure constraint is in place
-        x = self.get_param()
+        x = self._get_params()
        for i,xx in enumerate(x):
            if ((xx<=lower)|(xx>=upper)) & (i in matches):
                x[i] = sigmoid(xx)*(upper-lower) + lower
-        self.set_param(x)
+        self._set_params(x)
    def constrain_fixed(self, which, value = None):
@ -213,14 +213,14 @@ class parameterised(object):
        if value != None:
            self.constrained_fixed_values.append(value)
        else:
-            self.constrained_fixed_values.append(self.get_param()[self.constrained_fixed_indices[-1]])
+            self.constrained_fixed_values.append(self._get_params()[self.constrained_fixed_indices[-1]])
        #self.constrained_fixed_values.append(value)
-        self.expand_param(self.extract_param())
+        self._set_params_transformed(self._get_params_transformed())
-    def extract_param(self):
+    def _get_params_transformed(self):
-        """use self.get_param to get the 'true' parameters of the model, which are then tied, constrained and fixed"""
+        """use self._get_params to get the 'true' parameters of the model, which are then tied, constrained and fixed"""
-        x = self.get_param()
+        x = self._get_params()
        x[self.constrained_positive_indices] = np.log(x[self.constrained_positive_indices])
        x[self.constrained_negative_indices] = np.log(-x[self.constrained_negative_indices])
        [np.put(x,i,np.log(np.clip(x[i]-l,1e-10,np.inf)/np.clip(h-x[i],1e-10,np.inf))) for i,l,h in zip(self.constrained_bounded_indices, self.constrained_bounded_lowers, self.constrained_bounded_uppers)]
@ -232,8 +232,8 @@ class parameterised(object):
            return x
-    def expand_param(self,x):
+    def _set_params_transformed(self,x):
-        """ takes the vector x, which is then modified (by untying, reparameterising or inserting fixed values), and then call self.set_param"""
+        """ takes the vector x, which is then modified (by untying, reparameterising or inserting fixed values), and then call self._set_params"""
        #work out how many places are fixed, and where they are. tricky logic!
        Nfix_places = 0.
@ -257,14 +257,14 @@ class parameterised(object):
        xx[self.constrained_positive_indices] = np.exp(xx[self.constrained_positive_indices])
        xx[self.constrained_negative_indices] = -np.exp(xx[self.constrained_negative_indices])
        [np.put(xx,i,low+sigmoid(xx[i])*(high-low)) for i,low,high in zip(self.constrained_bounded_indices, self.constrained_bounded_lowers, self.constrained_bounded_uppers)]
-        self.set_param(xx)
+        self._set_params(xx)
-    def extract_param_names(self):
+    def _get_param_names_transformed(self):
        """
        Returns the parameter names as propagated after constraining,
-        tying or fixing, i.e. a list of the same length as extract_param()
+        tying or fixing, i.e. a list of the same length as _get_params_transformed()
        """
-        n =  self.get_param_names()
+        n =  self._get_param_names()
        #remove/concatenate the tied parameter names
        if len(self.tied_indices):
@ -294,13 +294,13 @@ class parameterised(object):
        """
        Return a string describing the parameter names and their ties and constraints
        """
-        names = self.get_param_names()
+        names = self._get_param_names()
        N = len(names)
        if not N:
            return "This object has no free parameters."
        header = ['Name','Value','Constraints','Ties']
-        values = self.get_param() #map(str,self.get_param())
+        values = self._get_params() #map(str,self._get_params())
        #sort out the constraints
        constraints = ['']*len(names)
        for i in self.constrained_positive_indices:
--- a/GPy/examples/GPLVM_demo.py
+++ b/GPy/examples/GPLVM_demo.py
@ -1,28 +0,0 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 import numpy as np
 import pylab as pb
 import GPy
 np.random.seed(1)
 print "GPLVM with RBF kernel"
 N = 100
 Q = 1
 D = 2
 X = np.random.rand(N, Q)
 k = GPy.kern.rbf(Q, 1.0, 2.0) + GPy.kern.white(Q, 0.00001)
 K = k.K(X)
 Y = np.random.multivariate_normal(np.zeros(N),K,D).T
 m = GPy.models.GPLVM(Y, Q)
 m.constrain_positive('(rbf|bias|white)')
 pb.figure()
 m.plot()
 pb.title('PCA initialisation')
 pb.figure()
 m.optimize(messages = 1)
 m.plot()
 pb.title('After optimisation')
--- a/GPy/examples/GP_regression_demo.py
+++ b/GPy/examples/GP_regression_demo.py
@ -1,51 +0,0 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 """
 Simple Gaussian Processes regression with an RBF kernel
 """
 import pylab as pb
 import numpy as np
 import GPy
 pb.ion()
 pb.close('all')
 ######################################
 ## 1 dimensional example
 # sample inputs and outputs
 X = np.random.uniform(-3.,3.,(20,1))
 Y = np.sin(X)+np.random.randn(20,1)*0.05
 # create simple GP model
 m = GPy.models.GP_regression(X,Y)
 # contrain all parameters to be positive
 m.constrain_positive('')
 # optimize and plot
 m.optimize('tnc', max_f_eval = 1000)
 m.plot()
 print(m)
 ######################################
 ## 2 dimensional example
 # sample inputs and outputs
 X = np.random.uniform(-3.,3.,(40,2))
 Y = np.sin(X[:,0:1]) * np.sin(X[:,1:2])+np.random.randn(40,1)*0.05
 # create simple GP model
 m = GPy.models.GP_regression(X,Y)
 # contrain all parameters to be positive
 m.constrain_positive('')
 # optimize and plot
 pb.figure()
 m.optimize('tnc', max_f_eval = 1000)
 m.plot()
 print(m)
--- a/GPy/examples/GP_regression_kern_demo.py
+++ b/GPy/examples/GP_regression_kern_demo.py
@ -1,33 +0,0 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 """
 Simple one-dimensional Gaussian Processes with assorted kernel functions
 """
 import pylab as pb
 import numpy as np
 import GPy
 # sample inputs and outputs
 D = 1
 X = np.random.randn(10,D)*2
 X = np.linspace(-1.5,1.5,5)[:,None]
 X = np.append(X,[[5]],0)
 Y = np.sin(np.pi*X/2) #+np.random.randn(X.shape[0],1)*0.05
 models = [GPy.models.GP_regression(X,Y, k) for k in (GPy.kern.rbf(D), GPy.kern.Matern52(D), GPy.kern.Matern32(D), GPy.kern.exponential(D), GPy.kern.linear(D) + GPy.kern.white(D),  GPy.kern.bias(D) + GPy.kern.white(D))]
 pb.figure(figsize=(12,8))
 for i,m in enumerate(models):
    m.constrain_positive('')
    m.optimize()
    pb.subplot(3,2,i+1)
    m.plot()
    #pb.title(m.kern.parts[0].name)
 GPy.util.plot.align_subplots(3,2,(-3,6),(-2.5,2.5))
 pb.show()
--- a/GPy/examples/classification.py
+++ b/GPy/examples/classification.py
@ -8,8 +8,6 @@ Simple Gaussian Processes classification
 import pylab as pb
 import numpy as np
 import GPy
 pb.ion()
 pb.close('all')
 default_seed=10000
 ######################################
@ -27,7 +25,7 @@ def crescent_data(model_type='Full', inducing=10, seed=default_seed):
    likelihood = GPy.inference.likelihoods.probit(data['Y'])
    if model_type=='Full':
-        m = GPy.models.simple_GP_EP(data['X'],likelihood)
+        m = GPy.models.GP_EP(data['X'],likelihood)
    else:
        # create sparse GP EP model
        m = GPy.models.sparse_GP_EP(data['X'],likelihood=likelihood,inducing=inducing,ep_proxy=model_type)
@ -49,7 +47,7 @@ def oil():
    likelihood = GPy.inference.likelihoods.probit(data['Y'][:, 0:1])
    # create simple GP model
-    m = GPy.models.simple_GP_EP(data['X'],likelihood)
+    m = GPy.models.GP_EP(data['X'],likelihood)
    # contrain all parameters to be positive
    m.constrain_positive('')
--- a/GPy/examples/oil_flow_demo.py
+++ b/GPy/examples/oil_flow_demo.py
@ -1,53 +0,0 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 import cPickle as pickle
 import numpy as np
 import pylab as pb
 import GPy
 import pylab as plt
 np.random.seed(1)
 def plot_oil(X, theta, labels, label):
    plt.figure()
    X = X[:,np.argsort(theta)[:2]]
    flow_type = (X[labels[:,0]==1])
    plt.plot(flow_type[:,0], flow_type[:,1], 'rx')
    flow_type = (X[labels[:,1]==1])
    plt.plot(flow_type[:,0], flow_type[:,1], 'gx')
    flow_type = (X[labels[:,2]==1])
    plt.plot(flow_type[:,0], flow_type[:,1], 'bx')
    plt.title(label)
 data = pickle.load(open('../util/datasets/oil_flow_3classes.pickle', 'r'))
 Y = data['DataTrn']
 N, D = Y.shape
 selected = np.random.permutation(N)[:200]
 labels = data['DataTrnLbls'][selected]
 Y = Y[selected]
 N, D = Y.shape
 Y -= Y.mean(axis=0)
 Y /= Y.std(axis=0)
 Q = 2
 m1 = GPy.models.sparse_GPLVM(Y, Q, M = 15)
 m1.constrain_positive('(rbf|bias|noise)')
 m1.constrain_bounded('white', 1e-6, 1.0)
 plot_oil(m1.X, np.array([1,1]), labels, 'PCA initialization')
 # m.optimize(messages = True)
 m1.optimize('bfgs', messages = True)
 plot_oil(m1.X, np.array([1,1]), labels, 'sparse GPLVM')
 # pb.figure()
 # m.plot()
 # pb.title('PCA initialisation')
 # pb.figure()
 # m.optimize(messages = 1)
 # m.plot()
 # pb.title('After optimisation')
 m = GPy.models.GPLVM(Y, Q)
 m.constrain_positive('(white|rbf|bias|noise)')
 m.optimize()
 plot_oil(m.X, np.array([1,1]), labels, 'GPLVM')
--- a/GPy/examples/regression.py
+++ b/GPy/examples/regression.py
@ -8,8 +8,6 @@ Gaussian Processes regression examples
 import pylab as pb
 import numpy as np
 import GPy
 pb.ion()
 pb.close('all')
 def toy_rbf_1d():
@ -19,10 +17,8 @@ def toy_rbf_1d():
    # create simple GP model
    m = GPy.models.GP_regression(data['X'],data['Y'])
    # contrain all parameters to be positive
    m.constrain_positive('')
    # optimize
    m.ensure_default_constraints()
    m.optimize()
    # plot
@ -37,10 +33,8 @@ def rogers_girolami_olympics():
    # create simple GP model
    m = GPy.models.GP_regression(data['X'],data['Y'])
    # contrain all parameters to be positive
    m.constrain_positive('')
    # optimize
    m.ensure_default_constraints()
    m.optimize()
    # plot
@ -48,6 +42,10 @@ def rogers_girolami_olympics():
    print(m)
    return m
 def della_gatta_TRP63_gene_expression(number=942):
    """Run a standard Gaussian process regression on the della Gatta et al TRP63 Gene Expression data set for a given gene number."""
 def toy_rbf_1d_50():
    """Run a simple demonstration of a standard Gaussian process fitting it to data sampled from an RBF covariance."""
    data = GPy.util.datasets.toy_rbf_1d_50()
@ -55,10 +53,8 @@ def toy_rbf_1d_50():
    # create simple GP model
    m = GPy.models.GP_regression(data['X'],data['Y'])
    # contrain all parameters to be positive
    m.constrain_positive('')
    # optimize
    m.ensure_default_constraints()
    m.optimize()
    # plot
@ -73,11 +69,95 @@ def silhouette():
    # create simple GP model
    m = GPy.models.GP_regression(data['X'],data['Y'])
    # contrain all parameters to be positive
    m.constrain_positive('')
    # optimize
    m.ensure_default_constraints()
    m.optimize()
    print(m)
    return m
 def multiple_optima(gene_number=937,resolution=80, model_restarts=10, seed=10000):
    """Show an example of a multimodal error surface for Gaussian process regression. Gene 939 has bimodal behaviour where the noisey mode is higher."""
    # Contour over a range of length scales and signal/noise ratios.
    length_scales = np.linspace(0.1, 60., resolution)
    log_SNRs = np.linspace(-3., 4., resolution)
    data = GPy.util.datasets.della_gatta_TRP63_gene_expression(gene_number)
    # Sub sample the data to ensure multiple optima
    #data['Y'] = data['Y'][0::2, :]
    #data['X'] = data['X'][0::2, :]
    # Remove the mean (no bias kernel to ensure signal/noise is in RBF/white)
    data['Y'] = data['Y'] - np.mean(data['Y'])
    lls = GPy.examples.regression.contour_data(data, length_scales, log_SNRs, GPy.kern.rbf)
    pb.contour(length_scales, log_SNRs, np.exp(lls), 20)
    ax = pb.gca()
    pb.xlabel('length scale')
    pb.ylabel('log_10 SNR')
    xlim = ax.get_xlim()
    ylim = ax.get_ylim()
    # Now run a few optimizations
    models = []
    optim_point_x = np.empty(2)
    optim_point_y = np.empty(2)
    np.random.seed(seed=seed)
    for i in range(0, model_restarts):
        kern = GPy.kern.rbf(1, variance=np.random.exponential(1.), lengthscale=np.random.exponential(50.)) + GPy.kern.white(1,variance=np.random.exponential(1.))
        m = GPy.models.GP_regression(data['X'],data['Y'], kernel=kern)
        optim_point_x[0] = m.get('rbf_lengthscale')
        optim_point_y[0] = np.log10(m.get('rbf_variance')) - np.log10(m.get('white_variance'));
        # optimize
        m.ensure_default_constraints()
        m.optimize(xtol=1e-6,ftol=1e-6)
        optim_point_x[1] = m.get('rbf_lengthscale')
        optim_point_y[1] = np.log10(m.get('rbf_variance')) - np.log10(m.get('white_variance'));
        pb.arrow(optim_point_x[0], optim_point_y[0], optim_point_x[1]-optim_point_x[0], optim_point_y[1]-optim_point_y[0], label=str(i), head_length=1, head_width=0.5, fc='k', ec='k')
        models.append(m)
    ax.set_xlim(xlim)
    ax.set_ylim(ylim)
    return (models, lls)
 def contour_data(data, length_scales, log_SNRs, signal_kernel_call=GPy.kern.rbf):
    """Evaluate the GP objective function for a given data set for a range of signal to noise ratios and a range of lengthscales.
    :data_set: A data set from the utils.datasets director.
    :length_scales: a list of length scales to explore for the contour plot.
    :log_SNRs: a list of base 10 logarithm signal to noise ratios to explore for the contour plot.
    :signal_kernel: a kernel to use for the 'signal' portion of the data."""
    lls = []
    total_var = np.var(data['Y'])
    for log_SNR in log_SNRs:
        SNR = 10**log_SNR
        length_scale_lls = []
        for length_scale in length_scales:
            noise_var = 1.
            signal_var = SNR
            noise_var = noise_var/(noise_var + signal_var)*total_var
            signal_var = signal_var/(noise_var + signal_var)*total_var
            signal_kernel = signal_kernel_call(1, variance=signal_var, lengthscale=length_scale)
            noise_kernel = GPy.kern.white(1, variance=noise_var)
            kernel = signal_kernel + noise_kernel
            K = kernel.K(data['X'])
            total_var = (np.dot(np.dot(data['Y'].T,GPy.util.linalg.pdinv(K)[0]), data['Y'])/data['Y'].shape[0])[0,0]
            noise_var *= total_var
            signal_var *= total_var
            kernel = signal_kernel_call(1, variance=signal_var, lengthscale=length_scale) + GPy.kern.white(1, variance=noise_var)
            model = GPy.models.GP_regression(data['X'], data['Y'], kernel=kernel)
            model.constrain_positive('')
            length_scale_lls.append(model.log_likelihood())
        lls.append(length_scale_lls)
    return np.array(lls)
--- a/GPy/examples/sparse_GPLVM_demo.py
+++ b/GPy/examples/sparse_GPLVM_demo.py
@ -10,11 +10,11 @@ print "sparse GPLVM with RBF kernel"
 N = 100
 M = 4
-Q = 1
+Q = 2
 D = 2
 #generate GPLVM-like data
 X = np.random.rand(N, Q)
-k = GPy.kern.rbf(Q, 1.0, 2.0) + GPy.kern.white(Q, 0.00001)
+k = GPy.kern.rbf(Q,1.,2*np.ones((1,))) + GPy.kern.white(Q, 0.00001)
 K = k.K(X)
 Y = np.random.multivariate_normal(np.zeros(N),K,D).T
--- a/GPy/examples/unsupervised.py
+++ b/GPy/examples/unsupervised.py
@ -0,0 +1,25 @@
 """
 Usupervised learning with Gaussian Processes.
 """
 import pylab as pb
 import numpy as np
 import GPy
 ######################################
 ## Oil data subsampled to 100 points.
 def oil_100():
    data = GPy.util.datasets.oil_100()
    # create simple GP model
    m = GPy.models.GPLVM(data['X'], 2)
    # optimize
    m.ensure_default_constraints()
    m.optimize()
    # plot
    print(m)
    return m
--- a/GPy/examples/warped_GP_demo.py
+++ b/GPy/examples/warped_GP_demo.py
@ -1,45 +0,0 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 import numpy as np
 import scipy as sp
 import pdb, sys, pickle
 import matplotlib.pylab as plt
 import GPy
 np.random.seed(1)
 N = 100
 # sample inputs and outputs
 X = np.random.uniform(-np.pi,np.pi,(N,1))
 Y = np.sin(X)+np.random.randn(N,1)*0.05
 # Y += np.abs(Y.min()) + 0.5
 Z = np.exp(Y)# Y**(1/3.0)
 # rescaling targets?
 Zmax = Z.max()
 Zmin = Z.min()
 Z = (Z-Zmin)/(Zmax-Zmin) - 0.5
 m = GPy.models.warpedGP(X, Z, warping_terms = 2)
 m.constrain_positive('(tanh_a|tanh_b|rbf|white|bias)')
 m.randomize()
 plt.figure()
 plt.xlabel('predicted f(Z)')
 plt.ylabel('actual f(Z)')
 plt.plot(m.Y, Y, 'o', alpha = 0.5, label = 'before training')
 m.optimize(messages = True)
 plt.plot(m.Y, Y, 'o', alpha = 0.5, label = 'after training')
 plt.legend(loc = 0)
 m.plot_warping()
 plt.figure()
 plt.title('warped GP fit')
 m.plot()
 m1 = GPy.models.GP_regression(X, Z)
 m1.constrain_positive('(rbf|white|bias)')
 m1.randomize()
 m1.optimize(messages = True)
 plt.figure()
 plt.title('GP fit')
 m1.plot()
--- a/GPy/inference/likelihoods.py
+++ b/GPy/inference/likelihoods.py
@ -99,6 +99,6 @@ class probit(likelihood):
    def predictive_mean(self,mu,variance):
        return stats.norm.cdf(mu/np.sqrt(1+variance))
-    def log_likelihood_gradients():
+    def _log_likelihood_gradients():
        raise NotImplementedError
--- a/GPy/inference/samplers.py
+++ b/GPy/inference/samplers.py
@ -17,7 +17,7 @@ class Metropolis_Hastings:
    def __init__(self,model,cov=None):
        """Metropolis Hastings, with tunings according to Gelman et al. """
        self.model = model
-        current = self.model.extract_param()
+        current = self.model._get_params_transformed()
        self.D = current.size
        self.chains = []
        if cov is None:
@ -32,19 +32,19 @@ class Metropolis_Hastings:
        if start is None:
            self.model.randomize()
        else:
-            self.model.expand_param(start)
+            self.model._set_params_transformed(start)
    def sample(self, Ntotal, Nburn, Nthin, tune=True, tune_throughout=False, tune_interval=400):
-        current = self.model.extract_param()
+        current = self.model._get_params_transformed()
        fcurrent = self.model.log_likelihood() + self.model.log_prior()
        accepted = np.zeros(Ntotal,dtype=np.bool)
        for it in range(Ntotal):
            print "sample %d of %d\r"%(it,Ntotal),
            sys.stdout.flush()
            prop = np.random.multivariate_normal(current, self.cov*self.scale*self.scale)
-            self.model.expand_param(prop)
+            self.model._set_params_transformed(prop)
            fprop = self.model.log_likelihood() + self.model.log_prior()
            if fprop>fcurrent:#sample accepted, going 'uphill'
@ -73,12 +73,12 @@ class Metropolis_Hastings:
    def predict(self,function,args):
        """Make a prediction for the function, to which we will pass the additional arguments"""
-        param = self.model.get_param()
+        param = self.model._get_params()
        fs = []
        for p in self.chain:
-            self.model.set_param(p)
+            self.model._set_params(p)
            fs.append(function(*args))
-        self.model.set_param(param)# reset model to starting state
+        self.model._set_params(param)# reset model to starting state
        return fs
--- a/GPy/kern/Brownian.py
+++ b/GPy/kern/Brownian.py
@ -23,16 +23,16 @@ class Brownian(kernpart):
        assert self.D==1, "Brownian motion in 1D only"
        self.Nparam = 1.
        self.name = 'Brownian'
-        self.set_param(np.array([variance]).flatten())
+        self._set_params(np.array([variance]).flatten())
-    def get_param(self):
+    def _get_params(self):
        return self.variance
-    def set_param(self,x):
+    def _set_params(self,x):
        assert x.shape==(1,)
        self.variance = x
-    def get_param_names(self):
+    def _get_param_names(self):
        return ['variance']
    def K(self,X,X2,target):
--- a/GPy/kern/Matern32.py
+++ b/GPy/kern/Matern32.py
@ -20,43 +20,54 @@ class Matern32(kernpart):
    :type D: int
    :param variance: the variance :math:`\sigma^2`
    :type variance: float
-    :param lengthscale: the lengthscales :math:`\ell_i`
+    :param lengthscale: the vector of lengthscale :math:`\ell_i`
-    :type lengthscale: np.ndarray of size (D,)
+    :type lengthscale: np.ndarray of size (1,) or (D,) depending on ARD
    :param ARD: Auto Relevance Determination. If equal to "False", the kernel is isotropic (ie. one single lengthscale parameter \ell), otherwise there is one lengthscale parameter per dimension.
    :type ARD: Boolean
    :rtype: kernel object
    """
-    def __init__(self,D,variance=1.,lengthscales=None):
+    def __init__(self,D,variance=1.,lengthscale=None,ARD=False):
        self.D = D
-        if lengthscales is not None:
+        self.ARD = ARD
-            assert lengthscales.shape==(self.D,)
+        if ARD == False:
            self.Nparam = 2
            self.name = 'Mat32'
            if lengthscale is not None:
                assert lengthscale.shape == (1,)
            else:
                lengthscale = np.ones(1)
        else:
-            lengthscales = np.ones(self.D)
+            self.Nparam = self.D + 1
-        self.Nparam = self.D + 1
+            self.name = 'Mat32_ARD'
-        self.name = 'Mat32'
+            if lengthscale is not None:
-        self.set_param(np.hstack((variance,lengthscales)))
+                assert lengthscale.shape == (self.D,)
            else:
                lengthscale = np.ones(self.D)
        self._set_params(np.hstack((variance,lengthscale)))
-    def get_param(self):
+    def _get_params(self):
        """return the value of the parameters."""
-        return np.hstack((self.variance,self.lengthscales))
+        return np.hstack((self.variance,self.lengthscale))
-    def set_param(self,x):
+    def _set_params(self,x):
        """set the value of the parameters."""
-        assert x.size==(self.D+1)
+        assert x.size == self.Nparam
        self.variance = x[0]
-        self.lengthscales = x[1:]
+        self.lengthscale = x[1:]
-    def get_param_names(self):
+    def _get_param_names(self):
        """return parameter names."""
-        if self.D==1:
+        if self.Nparam == 2:
            return ['variance','lengthscale']
        else:
-            return ['variance']+['lengthscale_%i'%i for i in range(self.lengthscales.size)]
+            return ['variance']+['lengthscale_%i'%i for i in range(self.lengthscale.size)]
    def K(self,X,X2,target):
        """Compute the covariance matrix between X and X2."""
        if X2 is None: X2 = X
-        dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscales),-1))
+        dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscale),-1))
        np.add(self.variance*(1+np.sqrt(3.)*dist)*np.exp(-np.sqrt(3.)*dist), target,target)
    def Kdiag(self,X,target):
@ -66,13 +77,20 @@ class Matern32(kernpart):
    def dK_dtheta(self,partial,X,X2,target):
        """derivative of the covariance matrix with respect to the parameters."""
        if X2 is None: X2 = X
-        dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscales),-1))
+        dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscale),-1))
        dvar = (1+np.sqrt(3.)*dist)*np.exp(-np.sqrt(3.)*dist)
        invdist = 1./np.where(dist!=0.,dist,np.inf)
-        dist2M = np.square(X[:,None,:]-X2[None,:,:])/self.lengthscales**3
+        dist2M = np.square(X[:,None,:]-X2[None,:,:])/self.lengthscale**3
-        dl = (self.variance* 3 * dist * np.exp(-np.sqrt(3.)*dist))[:,:,np.newaxis] * dist2M*invdist[:,:,np.newaxis]
+        #dl = (self.variance* 3 * dist * np.exp(-np.sqrt(3.)*dist))[:,:,np.newaxis] * dist2M*invdist[:,:,np.newaxis]
        target[0] += np.sum(dvar*partial)
-        target[1:] += (dl*partial[:,:,None]).sum(0).sum(0)
+        if self.ARD == True:
            dl = (self.variance* 3 * dist * np.exp(-np.sqrt(3.)*dist))[:,:,np.newaxis] * dist2M*invdist[:,:,np.newaxis]
            #dl = self.variance*dvar[:,:,None]*dist2M*invdist[:,:,None]
            target[1:] += (dl*partial[:,:,None]).sum(0).sum(0)
        else:
            dl = (self.variance* 3 * dist * np.exp(-np.sqrt(3.)*dist)) * dist2M.sum(-1)*invdist
            #dl = self.variance*dvar*dist2M.sum(-1)*invdist
            target[1] += np.sum(dl*partial)
    def dKdiag_dtheta(self,partial,X,target):
        """derivative of the diagonal of the covariance matrix with respect to the parameters."""
@ -81,8 +99,8 @@ class Matern32(kernpart):
    def dK_dX(self,partial,X,X2,target):
        """derivative of the covariance matrix with respect to X."""
        if X2 is None: X2 = X
-        dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscales),-1))[:,:,None]
+        dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscale),-1))[:,:,None]
-        ddist_dX = (X[:,None,:]-X2[None,:,:])/self.lengthscales**2/np.where(dist!=0.,dist,np.inf)
+        ddist_dX = (X[:,None,:]-X2[None,:,:])/self.lengthscale**2/np.where(dist!=0.,dist,np.inf)
        dK_dX = - np.transpose(3*self.variance*dist*np.exp(-np.sqrt(3)*dist)*ddist_dX,(1,0,2))
        target += np.sum(dK_dX*partial.T[:,:,None],0)
@ -104,7 +122,7 @@ class Matern32(kernpart):
        """
        assert self.D == 1
        def L(x,i):
-            return(3./self.lengthscales**2*F[i](x) + 2*np.sqrt(3)/self.lengthscales*F1[i](x) + F2[i](x))
+            return(3./self.lengthscale**2*F[i](x) + 2*np.sqrt(3)/self.lengthscale*F1[i](x) + F2[i](x))
        n = F.shape[0]
        G = np.zeros((n,n))
        for i in range(n):
@ -114,5 +132,5 @@ class Matern32(kernpart):
        F1lower = np.array([f(lower) for f in F1])[:,None]
        #print "OLD \n", np.dot(F1lower,F1lower.T), "\n \n"
        #return(G)
-        return(self.lengthscales**3/(12.*np.sqrt(3)*self.variance) * G + 1./self.variance*np.dot(Flower,Flower.T) + self.lengthscales**2/(3.*self.variance)*np.dot(F1lower,F1lower.T))
+        return(self.lengthscale**3/(12.*np.sqrt(3)*self.variance) * G + 1./self.variance*np.dot(Flower,Flower.T) + self.lengthscale**2/(3.*self.variance)*np.dot(F1lower,F1lower.T))
--- a/GPy/kern/Matern52.py
+++ b/GPy/kern/Matern52.py
@ -19,42 +19,53 @@ class Matern52(kernpart):
    :type D: int
    :param variance: the variance :math:`\sigma^2`
    :type variance: float
-    :param lengthscale: the lengthscales :math:`\ell_i`
+    :param lengthscale: the vector of lengthscale :math:`\ell_i`
-    :type lengthscale: np.ndarray of size (D,)
+    :type lengthscale: np.ndarray of size (1,) or (D,) depending on ARD
    :param ARD: Auto Relevance Determination. If equal to "False", the kernel is isotropic (ie. one single lengthscale parameter \ell), otherwise there is one lengthscale parameter per dimension.
    :type ARD: Boolean
    :rtype: kernel object
    """
-    def __init__(self,D,variance=1.,lengthscales=None):
+    def __init__(self,D,variance=1.,lengthscale=None,ARD=False):
        self.D = D
-        if lengthscales is not None:
+        self.ARD = ARD
-            assert lengthscales.shape==(self.D,)
+        if ARD == False:
            self.Nparam = 2
            self.name = 'Mat32'
            if lengthscale is not None:
                assert lengthscale.shape == (1,)
            else:
                lengthscale = np.ones(1)
        else:
-            lengthscales = np.ones(self.D)
+            self.Nparam = self.D + 1
-        self.Nparam = self.D + 1
+            self.name = 'Mat32_ARD'
-        self.name = 'Mat52'
+            if lengthscale is not None:
-        self.set_param(np.hstack((variance,lengthscales)))
+                assert lengthscale.shape == (self.D,)
            else:
                lengthscale = np.ones(self.D)
        self._set_params(np.hstack((variance,lengthscale)))
-    def get_param(self):
+    def _get_params(self):
        """return the value of the parameters."""
-        return np.hstack((self.variance,self.lengthscales))
+        return np.hstack((self.variance,self.lengthscale))
-    def set_param(self,x):
+    def _set_params(self,x):
        """set the value of the parameters."""
-        assert x.size==(self.D+1)
+        assert x.size == self.Nparam
        self.variance = x[0]
-        self.lengthscales = x[1:]
+        self.lengthscale = x[1:]
-    def get_param_names(self):
+    def _get_param_names(self):
        """return parameter names."""
-        if self.D==1:
+        if self.Nparam == 2:
            return ['variance','lengthscale']
        else:
-            return ['variance']+['lengthscale_%i'%i for i in range(self.lengthscales.size)]
+            return ['variance']+['lengthscale_%i'%i for i in range(self.lengthscale.size)]
    def K(self,X,X2,target):
        """Compute the covariance matrix between X and X2."""
        if X2 is None: X2 = X
-        dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscales),-1))
+        dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscale),-1))
        np.add(self.variance*(1+np.sqrt(5.)*dist+5./3*dist**2)*np.exp(-np.sqrt(5.)*dist), target,target)
    def Kdiag(self,X,target):
@ -64,13 +75,19 @@ class Matern52(kernpart):
    def dK_dtheta(self,partial,X,X2,target):
        """derivative of the covariance matrix with respect to the parameters."""
        if X2 is None: X2 = X
-        dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscales),-1))
+        dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscale),-1))
        invdist = 1./np.where(dist!=0.,dist,np.inf)
-        dist2M = np.square(X[:,None,:]-X2[None,:,:])/self.lengthscales**3
+        dist2M = np.square(X[:,None,:]-X2[None,:,:])/self.lengthscale**3
        dvar = (1+np.sqrt(5.)*dist+5./3*dist**2)*np.exp(-np.sqrt(5.)*dist)
        dl = (self.variance * 5./3 * dist * (1 + np.sqrt(5.)*dist ) * np.exp(-np.sqrt(5.)*dist))[:,:,np.newaxis] * dist2M*invdist[:,:,np.newaxis]
        target[0] += np.sum(dvar*partial)
-        target[1:] += (dl*partial[:,:,None]).sum(0).sum(0)
+        if self.ARD:
            dl = (self.variance * 5./3 * dist * (1 + np.sqrt(5.)*dist ) * np.exp(-np.sqrt(5.)*dist))[:,:,np.newaxis] * dist2M*invdist[:,:,np.newaxis]
            #dl = (self.variance* 3 * dist * np.exp(-np.sqrt(3.)*dist))[:,:,np.newaxis] * dist2M*invdist[:,:,np.newaxis]
            target[1:] += (dl*partial[:,:,None]).sum(0).sum(0)
        else:
            dl = (self.variance * 5./3 * dist * (1 + np.sqrt(5.)*dist ) * np.exp(-np.sqrt(5.)*dist)) * dist2M.sum(-1)*invdist
            #dl = (self.variance* 3 * dist * np.exp(-np.sqrt(3.)*dist)) * dist2M.sum(-1)*invdist
            target[1] += np.sum(dl*partial)
    def dKdiag_dtheta(self,X,target):
        """derivative of the diagonal of the covariance matrix with respect to the parameters."""
@ -79,8 +96,8 @@ class Matern52(kernpart):
    def dK_dX(self,partial,X,X2,target):
        """derivative of the covariance matrix with respect to X."""
        if X2 is None: X2 = X
-        dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscales),-1))[:,:,None]
+        dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscale),-1))[:,:,None]
-        ddist_dX = (X[:,None,:]-X2[None,:,:])/self.lengthscales**2/np.where(dist!=0.,dist,np.inf)
+        ddist_dX = (X[:,None,:]-X2[None,:,:])/self.lengthscale**2/np.where(dist!=0.,dist,np.inf)
        dK_dX = -  np.transpose(self.variance*5./3*dist*(1+np.sqrt(5)*dist)*np.exp(-np.sqrt(5)*dist)*ddist_dX,(1,0,2))
        target += np.sum(dK_dX*partial.T[:,:,None],0)
@ -104,18 +121,18 @@ class Matern52(kernpart):
        """
        assert self.D == 1
        def L(x,i):
-            return(5*np.sqrt(5)/self.lengthscales**3*F[i](x) + 15./self.lengthscales**2*F1[i](x)+ 3*np.sqrt(5)/self.lengthscales*F2[i](x) + F3[i](x))
+            return(5*np.sqrt(5)/self.lengthscale**3*F[i](x) + 15./self.lengthscale**2*F1[i](x)+ 3*np.sqrt(5)/self.lengthscale*F2[i](x) + F3[i](x))
        n = F.shape[0]
        G = np.zeros((n,n))
        for i in range(n):
            for j in range(i,n):
                G[i,j] = G[j,i] = integrate.quad(lambda x : L(x,i)*L(x,j),lower,upper)[0]
-        G_coef = 3.*self.lengthscales**5/(400*np.sqrt(5))
+        G_coef = 3.*self.lengthscale**5/(400*np.sqrt(5))
        Flower = np.array([f(lower) for f in F])[:,None]
        F1lower = np.array([f(lower) for f in F1])[:,None]
        F2lower = np.array([f(lower) for f in F2])[:,None]
-        orig = 9./8*np.dot(Flower,Flower.T) + 9.*self.lengthscales**4/200*np.dot(F2lower,F2lower.T)
+        orig = 9./8*np.dot(Flower,Flower.T) + 9.*self.lengthscale**4/200*np.dot(F2lower,F2lower.T)
-        orig2 = 3./5*self.lengthscales**2 * ( np.dot(F1lower,F1lower.T) + 1./8*np.dot(Flower,F2lower.T) + 1./8*np.dot(F2lower,Flower.T))
+        orig2 = 3./5*self.lengthscale**2 * ( np.dot(F1lower,F1lower.T) + 1./8*np.dot(Flower,F2lower.T) + 1./8*np.dot(F2lower,Flower.T))
        return(1./self.variance* (G_coef*G + orig + orig2))
--- a/GPy/kern/init.py
+++ b/GPy/kern/init.py
@ -2,5 +2,5 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
-from constructors import rbf, Matern32, Matern52, exponential, linear, white, bias, finite_dimensional, rbf_ARD, spline, Brownian, linear_ARD, rbf_sympy, sympykern
+from constructors import rbf, Matern32, Matern52, exponential, linear, white, bias, finite_dimensional, spline, Brownian, linear_ARD, rbf_sympy, sympykern
 from kern import kern
--- a/GPy/kern/bias.py
+++ b/GPy/kern/bias.py
@ -17,16 +17,16 @@ class bias(kernpart):
        self.D = D
        self.Nparam = 1
        self.name = 'bias'
-        self.set_param(np.array([variance]).flatten())
+        self._set_params(np.array([variance]).flatten())
-    def get_param(self):
+    def _get_params(self):
        return self.variance
-    def set_param(self,x):
+    def _set_params(self,x):
        assert x.shape==(1,)
        self.variance = x
-    def get_param_names(self):
+    def _get_param_names(self):
        return ['variance']
    def K(self,X,X2,target):
--- a/GPy/kern/constructors.py
+++ b/GPy/kern/constructors.py
@ -22,7 +22,7 @@ from Brownian import Brownian as Brownianpart
 #using meta-classes to make the objects construct properly wthout them.
-def rbf(D,variance=1., lengthscale=1.):
+def rbf(D,variance=1., lengthscale=None,ARD=False):
    """
    Construct an RBF kernel
@ -32,22 +32,10 @@ def rbf(D,variance=1., lengthscale=1.):
    :type variance: float
    :param lengthscale: the lengthscale of the kernel
    :type lengthscale: float
    :param ARD: Auto Relevance Determination (one lengthscale per dimension)
    :type ARD: Boolean
    """
-    part = rbfpart(D,variance,lengthscale)
+    part = rbfpart(D,variance,lengthscale,ARD)
    return kern(D, [part])
 def rbf_ARD(D,variance=1., lengthscales=None):
    """
    Construct an RBF kernel with Automatic Relevance Determination (ARD)
    :param D: dimensionality of the kernel, obligatory
    :type D: int
    :param variance: the variance of the kernel
    :type variance: float
    :param lengthscales: the lengthscales of the kernel
    :type lengthscales: None|np.ndarray
    """
    part = rbf_ARD_part(D,variance,lengthscales)
    return kern(D, [part])
 def linear(D,lengthscales=None):
@ -86,43 +74,52 @@ def white(D,variance=1.):
    part = whitepart(D,variance)
    return kern(D, [part])
-def exponential(D,variance=1., lengthscales=None):
+def exponential(D,variance=1., lengthscale=None, ARD=False):
    """
-     Construct a exponential kernel.
+    Construct an exponential kernel
-     Arguments
+    :param D: dimensionality of the kernel, obligatory
-     ---------
+    :type D: int
-     D (int), obligatory
+    :param variance: the variance of the kernel
-     variance (float)
+    :type variance: float
-     lengthscales (np.ndarray)
+    :param lengthscale: the lengthscale of the kernel
    :type lengthscale: float
    :param ARD: Auto Relevance Determination (one lengthscale per dimension)
    :type ARD: Boolean
    """
-    part = exponentialpart(D,variance, lengthscales)
+    part = exponentialpart(D,variance, lengthscale, ARD)
    return kern(D, [part])
-def Matern32(D,variance=1., lengthscales=None):
+def Matern32(D,variance=1., lengthscale=None, ARD=False):
    """
     Construct a Matern 3/2 kernel.
-     Arguments
+    :param D: dimensionality of the kernel, obligatory
-     ---------
+    :type D: int
-     D (int), obligatory
+    :param variance: the variance of the kernel
-     variance (float)
+    :type variance: float
-     lengthscales (np.ndarray)
+    :param lengthscale: the lengthscale of the kernel
    :type lengthscale: float
    :param ARD: Auto Relevance Determination (one lengthscale per dimension)
    :type ARD: Boolean
    """
-    part = Matern32part(D,variance, lengthscales)
+    part = Matern32part(D,variance, lengthscale, ARD)
    return kern(D, [part])
-def Matern52(D,variance=1., lengthscales=None):
+def Matern52(D,variance=1., lengthscale=None, ARD=False):
    """
     Construct a Matern 5/2 kernel.
-     Arguments
+    :param D: dimensionality of the kernel, obligatory
-     ---------
+    :type D: int
-     D (int), obligatory
+    :param variance: the variance of the kernel
-     variance (float)
+    :type variance: float
-     lengthscales (np.ndarray)
+    :param lengthscale: the lengthscale of the kernel
    :type lengthscale: float
    :param ARD: Auto Relevance Determination (one lengthscale per dimension)
    :type ARD: Boolean
    """
-    part = Matern52part(D,variance, lengthscales)
+    part = Matern52part(D,variance, lengthscale, ARD)
    return kern(D, [part])
 def bias(D,variance=1.):
--- a/GPy/kern/exponential.py
+++ b/GPy/kern/exponential.py
@ -19,42 +19,53 @@ class exponential(kernpart):
    :type D: int
    :param variance: the variance :math:`\sigma^2`
    :type variance: float
-    :param lengthscale: the lengthscales :math:`\ell_i`
+    :param lengthscale: the vector of lengthscale :math:`\ell_i`
-    :type lengthscale: np.ndarray of size (D,)
+    :type lengthscale: np.ndarray of size (1,) or (D,) depending on ARD
    :param ARD: Auto Relevance Determination. If equal to "False", the kernel is isotropic (ie. one single lengthscale parameter \ell), otherwise there is one lengthscale parameter per dimension.
    :type ARD: Boolean
    :rtype: kernel object
    """
-    def __init__(self,D,variance=1.,lengthscales=None):
+    def __init__(self,D,variance=1.,lengthscale=None,ARD=False):
        self.D = D
-        if lengthscales is not None:
+        self.ARD = ARD
-            assert lengthscales.shape==(self.D,)
+        if ARD == False:
            self.Nparam = 2
            self.name = 'exp'
            if lengthscale is not None:
                assert lengthscale.shape == (1,)
            else:
                lengthscale = np.ones(1)
        else:
-            lengthscales = np.ones(self.D)
+            self.Nparam = self.D + 1
-        self.Nparam = self.D + 1
+            self.name = 'exp_ARD'
-        self.name = 'exp'
+            if lengthscale is not None:
-        self.set_param(np.hstack((variance,lengthscales)))
+                assert lengthscale.shape == (self.D,)
            else:
                lengthscale = np.ones(self.D)
        self._set_params(np.hstack((variance,lengthscale)))
-    def get_param(self):
+    def _get_params(self):
        """return the value of the parameters."""
-        return np.hstack((self.variance,self.lengthscales))
+        return np.hstack((self.variance,self.lengthscale))
-    def set_param(self,x):
+    def _set_params(self,x):
        """set the value of the parameters."""
-        assert x.size==(self.D+1)
+        assert x.size == self.Nparam
        self.variance = x[0]
-        self.lengthscales = x[1:]
+        self.lengthscale = x[1:]
-    def get_param_names(self):
+    def _get_param_names(self):
        """return parameter names."""
-        if self.D==1:
+        if self.Nparam == 2:
            return ['variance','lengthscale']
        else:
-            return ['variance']+['lengthscale_%i'%i for i in range(self.lengthscales.size)]
+            return ['variance']+['lengthscale_%i'%i for i in range(self.lengthscale.size)]
    def K(self,X,X2,target):
        """Compute the covariance matrix between X and X2."""
        if X2 is None: X2 = X
-        dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscales),-1))
+        dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscale),-1))
        np.add(self.variance*np.exp(-dist), target,target)
    def Kdiag(self,X,target):
@ -64,13 +75,17 @@ class exponential(kernpart):
    def dK_dtheta(self,partial,X,X2,target):
        """derivative of the covariance matrix with respect to the parameters."""
        if X2 is None: X2 = X
-        dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscales),-1))
+        dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscale),-1))
        invdist = 1./np.where(dist!=0.,dist,np.inf)
-        dist2M = np.square(X[:,None,:]-X2[None,:,:])/self.lengthscales**3
+        dist2M = np.square(X[:,None,:]-X2[None,:,:])/self.lengthscale**3
        dvar = np.exp(-dist)
        dl = self.variance*dvar[:,:,None]*dist2M*invdist[:,:,None]
        target[0] += np.sum(dvar*partial)
-        target[1:] += (dl*partial[:,:,None]).sum(0).sum(0)
+        if self.ARD == True:
            dl = self.variance*dvar[:,:,None]*dist2M*invdist[:,:,None]
            target[1:] += (dl*partial[:,:,None]).sum(0).sum(0)
        else:
            dl = self.variance*dvar*dist2M.sum(-1)*invdist
            target[1] += np.sum(dl*partial)
    def dKdiag_dtheta(self,partial,X,target): 
        """derivative of the diagonal of the covariance matrix with respect to the parameters."""
@ -80,8 +95,8 @@ class exponential(kernpart):
    def dK_dX(self,partial,X,X2,target):
        """derivative of the covariance matrix with respect to X."""
        if X2 is None: X2 = X
-        dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscales),-1))[:,:,None]
+        dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscale),-1))[:,:,None]
-        ddist_dX = (X[:,None,:]-X2[None,:,:])/self.lengthscales**2/np.where(dist!=0.,dist,np.inf)
+        ddist_dX = (X[:,None,:]-X2[None,:,:])/self.lengthscale**2/np.where(dist!=0.,dist,np.inf)
        dK_dX = - np.transpose(self.variance*np.exp(-dist)*ddist_dX,(1,0,2))
        target += np.sum(dK_dX*partial.T[:,:,None],0)
@ -101,14 +116,14 @@ class exponential(kernpart):
        """
        assert self.D == 1
        def L(x,i):
-            return(1./self.lengthscales*F[i](x) + F1[i](x))
+            return(1./self.lengthscale*F[i](x) + F1[i](x))
        n = F.shape[0]
        G = np.zeros((n,n))
        for i in range(n):
            for j in range(i,n):
                G[i,j] = G[j,i] = integrate.quad(lambda x : L(x,i)*L(x,j),lower,upper)[0]
        Flower = np.array([f(lower) for f in F])[:,None]
-        return(self.lengthscales/2./self.variance * G + 1./self.variance * np.dot(Flower,Flower.T))
+        return(self.lengthscale/2./self.variance * G + 1./self.variance * np.dot(Flower,Flower.T))
--- a/GPy/kern/finite_dimensional.py
+++ b/GPy/kern/finite_dimensional.py
@ -27,15 +27,15 @@ class finite_dimensional(kernpart):
            weights = np.ones(self.n)
        self.Nparam = self.n + 1
        self.name = 'finite_dim'
-        self.set_param(np.hstack((variance,weights)))
+        self._set_params(np.hstack((variance,weights)))
-    def get_param(self):
+    def _get_params(self):
        return np.hstack((self.variance,self.weights))
-    def set_param(self,x):
+    def _set_params(self,x):
        assert x.size == (self.Nparam)
        self.variance = x[0]
        self.weights = x[1:]
-    def get_param_names(self):
+    def _get_param_names(self):
        if self.n==1:
            return ['variance','weight']
        else:
--- a/GPy/kern/kern.py
+++ b/GPy/kern/kern.py
@ -133,20 +133,20 @@ class kern(parameterised):
        newkern.tied_indices = self.tied_indices + [self.Nparam + x for x in other.tied_indices]
        return newkern
-    def get_param(self):
+    def _get_params(self):
-        return np.hstack([p.get_param() for p in self.parts])
+        return np.hstack([p._get_params() for p in self.parts])
-    def set_param(self,x):
+    def _set_params(self,x):
-        [p.set_param(x[s]) for p, s in zip(self.parts, self.param_slices)]
+        [p._set_params(x[s]) for p, s in zip(self.parts, self.param_slices)]
-    def get_param_names(self):
+    def _get_param_names(self):
        #this is a bit nasty: we wat to distinguish between parts with the same name by appending a count
        part_names = np.array([k.name for k in self.parts],dtype=np.str)
        counts = [np.sum(part_names==ni) for i, ni in enumerate(part_names)]
        cum_counts = [np.sum(part_names[i:]==ni) for i, ni in enumerate(part_names)]
        names = [name+'_'+str(cum_count) if count>1 else name for name,count,cum_count in zip(part_names,counts,cum_counts)]
-        return sum([[name+'_'+n for n in k.get_param_names()] for name,k in zip(names,self.parts)],[])
+        return sum([[name+'_'+n for n in k._get_param_names()] for name,k in zip(names,self.parts)],[])
    def K(self,X,X2=None,slices1=None,slices2=None):
        assert X.shape[1]==self.D
--- a/GPy/kern/kernpart.py
+++ b/GPy/kern/kernpart.py
@ -16,11 +16,11 @@ class kernpart(object):
        self.Nparam = 1
        self.name = 'unnamed'
-    def get_param(self):
+    def _get_params(self):
        raise NotImplementedError
-    def set_param(self,x):
+    def _set_params(self,x):
        raise NotImplementedError
-    def get_param_names(self):
+    def _get_param_names(self):
        raise NotImplementedError
    def K(self,X,X2,target):
        raise NotImplementedError
--- a/GPy/kern/linear.py
+++ b/GPy/kern/linear.py
@ -20,16 +20,16 @@ class linear(kernpart):
            variance = 1.0
        self.Nparam = 1
        self.name = 'linear'
-        self.set_param(variance)
+        self._set_params(variance)
        self._Xcache, self._X2cache = np.empty(shape=(2,))
-    def get_param(self):
+    def _get_params(self):
        return self.variance
-    def set_param(self,x):
+    def _set_params(self,x):
        self.variance = x
-    def get_param_names(self):
+    def _get_param_names(self):
        return ['variance']
    def K(self,X,X2,target):
--- a/GPy/kern/linear_ARD.py
+++ b/GPy/kern/linear_ARD.py
@ -23,16 +23,16 @@ class linear_ARD(kernpart):
            variances = np.ones(self.D)
        self.Nparam = int(self.D)
        self.name = 'linear'
-        self.set_param(variances)
+        self._set_params(variances)
-    def get_param(self):
+    def _get_params(self):
        return self.variances
-    def set_param(self,x):
+    def _set_params(self,x):
        assert x.size==(self.Nparam)
        self.variances = x
-    def get_param_names(self):
+    def _get_param_names(self):
        if self.D==1:
            return ['variance']
        else:
--- a/GPy/kern/rbf-testing.py
+++ b/GPy/kern/rbf-testing.py
@ -0,0 +1,178 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 from kernpart import kernpart
 import numpy as np
 import hashlib
 class rbf_testing(kernpart):
    """
    Radial Basis Function kernel, aka squared-exponential, exponentiated quadratic or Gaussian kernel.
    .. math::
       k(r) = \sigma^2 \exp(- \frac{r^2}{2\ell}) \qquad \qquad \\text{ where  } r = \sqrt{\frac{\sum_{i=1}^d (x_i-x^\prime_i)^2}{\ell^2}}
    where \ell is the lengthscale, \alpha the smoothness, \sigma^2 the variance and d the dimensionality of the input.
    :param D: the number of input dimensions
    :type D: int
    :param variance: the variance of the kernel
    :type variance: float
    :param lengthscale: the lengthscale of the kernel
    :type lengthscale: float
    .. Note: for rbf_testing with different lengthscale on each dimension, see rbf_testing_ARD
    """
    def __init__(self,D,variance=1.,lengthscale=1.):
        self.D = D
        self.Nparam = 2
        self.name = 'rbf_testing'
        self._set_params(np.hstack((variance,lengthscale)))
        #initialize cache
        self._Z, self._mu, self._S = np.empty(shape=(3,1))
        self._X, self._X2, self._params = np.empty(shape=(3,1))
    def _get_params(self):
        return np.hstack((self.variance,self.lengthscale))
    def _set_params(self,x):
        self.variance, self.lengthscale = x
        self.lengthscale2 = np.square(self.lengthscale)
        #reset cached results
        self._X, self._X2, self._params = np.empty(shape=(3,1))
        self._Z, self._mu, self._S = np.empty(shape=(3,1)) # cached versions of Z,mu,S
    def _get_param_names(self):
        return ['variance','lengthscale']
    def K(self,X,X2,target):
        if X2 is None:
            X2 = X
        self._K_computations(X,X2)
        np.add(self.variance*self._K_dvar, target,target)
    def Kdiag(self,X,target):
        np.add(target,self.variance,target)
    def dK_dtheta(self,partial,X,X2,target):
        self._K_computations(X,X2)
        target[0] += np.sum(self._K_dvar*partial)
        target[1] += np.sum(self._K_dvar*self.variance*self._K_dist2/self.lengthscale*partial)
    def dKdiag_dtheta(self,partial,X,target):
        #NB: derivative of diagonal elements wrt lengthscale is 0
        target[0] += np.sum(partial)
    def dK_dX(self,partial,X,X2,target):
        self._K_computations(X,X2)
        _K_dist = X[:,None,:]-X2[None,:,:]
        dK_dX = np.transpose(-self.variance*self._K_dvar[:,:,np.newaxis]*_K_dist/self.lengthscale2,(1,0,2))
        target += np.sum(dK_dX*partial.T[:,:,None],0)
    def dKdiag_dX(self,partial,X,target):
        pass
    def _K_computations(self,X,X2):
        if not (np.all(X==self._X) and np.all(X2==self._X2)):
            self._X = X
            self._X2 = X2
            if X2 is None: X2 = X
            XXT = np.dot(X,X2.T)
            if X is X2:
                self._K_dist2 = (-2.*XXT + np.diag(XXT)[:,np.newaxis] + np.diag(XXT)[np.newaxis,:])/self.lengthscale2
            else:
                self._K_dist2 = (-2.*XXT + np.sum(np.square(X),1)[:,None] + np.sum(np.square(X2),1)[None,:])/self.lengthscale2
            # TODO Remove comments if this is fine.
            # Commented out by Neil as doesn't seem to be used elsewhere.
            #self._K_exponent = -0.5*self._K_dist2
            self._K_dvar = np.exp(-0.5*self._K_dist2)
    def psi0(self,Z,mu,S,target):
        target += self.variance
    def dpsi0_dtheta(self,partial,Z,mu,S,target):
        target[0] += 1.
    def dpsi0_dmuS(self,Z,mu,S,target_mu,target_S):
        pass
    def psi1(self,Z,mu,S,target):
        self._psi_computations(Z,mu,S)
        target += self._psi1
    def dpsi1_dtheta(self,partial,Z,mu,S,target):
        self._psi_computations(Z,mu,S)
        denom_deriv = S[:,None,:]/(self.lengthscale**3+self.lengthscale*S[:,None,:])
        d_length = self._psi1[:,:,None]*(self.lengthscale*np.square(self._psi1_dist/(self.lengthscale2+S[:,None,:])) + denom_deriv)
        target[0] += np.sum(partial*self._psi1/self.variance)
        target[1] += np.sum(d_length*partial[:,:,None])
    def dpsi1_dZ(self,partial,Z,mu,S,target):
        self._psi_computations(Z,mu,S)
        target += np.sum(partial[:,:,None]*-self._psi1[:,:,None]*self._psi1_dist/self.lengthscale2/self._psi1_denom,0)
    def dpsi1_dmuS(self,partial,Z,mu,S,target_mu,target_S):
        self._psi_computations(Z,mu,S)
        tmp = self._psi1[:,:,None]/self.lengthscale2/self._psi1_denom
        target_mu += np.sum(partial*tmp*self._psi1_dist,1)
        target_S += np.sum(partial*0.5*tmp*(self._psi1_dist_sq-1),1)
    def psi2(self,Z,mu,S,target):
        self._psi_computations(Z,mu,S)
        target += self._psi2.sum(0) #TODO: psi2 should be NxMxM (for het. noise)
    def dpsi2_dtheta(self,partial,Z,mu,S,target):
        """Shape N,M,M,Ntheta"""
        self._psi_computations(Z,mu,S)
        d_var = np.sum(2.*self._psi2/self.variance,0)
        d_length = self._psi2[:,:,:,None]*(0.5*self._psi2_Zdist_sq*self._psi2_denom + 2.*self._psi2_mudist_sq + 2.*S[:,None,None,:]/self.lengthscale2)/(self.lengthscale*self._psi2_denom)
        d_length = d_length.sum(0)
        target[0] += np.sum(partial*d_var)
        target[1] += np.sum(d_length*partial)
    def dpsi2_dZ(self,partial,Z,mu,S,target):
        """Returns shape N,M,M,Q"""
        self._psi_computations(Z,mu,S)
        dZ = self._psi2[:,:,:,None]/self.lengthscale2*(-0.5*self._psi2_Zdist + self._psi2_mudist/self._psi2_denom)
        target += np.sum(partial[None,:,:,None]*dZ,0).sum(1)
    def dpsi2_dmuS(self,Z,mu,S,target_mu,target_S):
        """Think N,M,M,Q """
        self._psi_computations(Z,mu,S)
        tmp = self._psi2[:,:,:,None]/self.lengthscale2/self._psi2_denom
        target_mu += (partial*-tmp*2.*self._psi2_mudist).sum(1).sum(1)
        target_S += (partial*tmp*(2.*self._psi2_mudist_sq-1)).sum(1).sum(1)
    def _psi_computations(self,Z,mu,S):
        #here are the "statistics" for psi1 and psi2
        if not np.all(Z==self._Z):
            #Z has changed, compute Z specific stuff
            self._psi2_Zhat = 0.5*(Z[:,None,:] +Z[None,:,:]) # M,M,Q
            self._psi2_Zdist = Z[:,None,:]-Z[None,:,:] # M,M,Q
            self._psi2_Zdist_sq = np.square(self._psi2_Zdist)/self.lengthscale2 # M,M,Q
            self._Z = Z
        if not (np.all(Z==self._Z) and np.all(mu==self._mu) and np.all(S==self._S)):
            #something's changed. recompute EVERYTHING
            #TODO: make more efficient for large Q (using NDL's dot product trick)
            #psi1
            self._psi1_denom = S[:,None,:]/self.lengthscale2 + 1.
            self._psi1_dist = Z[None,:,:]-mu[:,None,:]
            self._psi1_dist_sq = np.square(self._psi1_dist)/self.lengthscale2/self._psi1_denom
            self._psi1_exponent = -0.5*np.sum(self._psi1_dist_sq+np.log(self._psi1_denom),-1)
            self._psi1 = self.variance*np.exp(self._psi1_exponent)
            #psi2
            self._psi2_denom = 2.*S[:,None,None,:]/self.lengthscale2+1. # N,M,M,Q
            self._psi2_mudist = mu[:,None,None,:]-self._psi2_Zhat #N,M,M,Q
            self._psi2_mudist_sq = np.square(self._psi2_mudist)/(self.lengthscale2*self._psi2_denom)
            self._psi2_exponent = np.sum(-self._psi2_Zdist_sq/4. -self._psi2_mudist_sq -0.5*np.log(self._psi2_denom),-1) #N,M,M
            self._psi2 = np.square(self.variance)*np.exp(self._psi2_exponent) # N,M,M
            self._Z, self._mu, self._S = Z, mu,S
--- a/GPy/kern/rbf.py
+++ b/GPy/kern/rbf.py
@ -8,46 +8,67 @@ import hashlib
 class rbf(kernpart):
    """
-    Radial Basis Function kernel, aka squared-exponential, exponentiated quadratic or Gaussian kernel.
+    Radial Basis Function kernel, aka squared-exponential, exponentiated quadratic or Gaussian kernel:
    .. math::
-       k(r) = \sigma^2 \exp(- \frac{r^2}{2\ell}) \qquad \qquad \\text{ where  } r = \sqrt{\frac{\sum_{i=1}^d (x_i-x^\prime_i)^2}{\ell^2}}
+       k(r) = \sigma^2 \exp(- \frac{1}{2}r^2) \qquad \qquad \\text{ where  } r^2 = \sum_{i=1}^d \frac{ (x_i-x^\prime_i)^2}{\ell_i^2}}
-    where \ell is the lengthscale, \alpha the smoothness, \sigma^2 the variance and d the dimensionality of the input.
+    where \ell_i is the lengthscale, \sigma^2 the variance and d the dimensionality of the input.
    :param D: the number of input dimensions
    :type D: int
    :param variance: the variance of the kernel
    :type variance: float
-    :param lengthscale: the lengthscale of the kernel
+    :param lengthscale: the vector of lengthscale of the kernel
-    :type lengthscale: float
+    :type lengthscale: np.ndarray od size (1,) or (D,) depending on ARD
    :param ARD: Auto Relevance Determination. If equal to "False", the kernel is isotropic (ie. one single lengthscale parameter \ell), otherwise there is one lengthscale parameter per dimension.
    :type ARD: Boolean
    :rtype: kernel object
    .. Note: for rbf with different lengthscale on each dimension, see rbf_ARD
    """
-    def __init__(self,D,variance=1.,lengthscale=1.):
+    def __init__(self,D,variance=1.,lengthscale=None,ARD=False):
        self.D = D
-        self.Nparam = 2
+        self.ARD = ARD
-        self.name = 'rbf'
+        if ARD == False:
-        self.set_param(np.hstack((variance,lengthscale)))
+            self.Nparam = 2
            self.name = 'rbf'
            if lengthscale is not None:
                assert lengthscale.shape == (1,)
            else:
                lengthscale = np.ones(1)     
        else:
            self.Nparam = self.D + 1
            self.name = 'rbf_ARD'
            if lengthscale is not None:
                assert lengthscale.shape == (self.D,)
            else:
                lengthscale = np.ones(self.D)
        self._set_params(np.hstack((variance,lengthscale)))        
        #initialize cache
        self._Z, self._mu, self._S = np.empty(shape=(3,1))
        self._X, self._X2, self._params = np.empty(shape=(3,1))
-    def get_param(self):
+    def _get_params(self):
        return np.hstack((self.variance,self.lengthscale))
-    def set_param(self,x):
+    def _set_params(self,x):
-        self.variance, self.lengthscale = x
+        assert x.size==(self.Nparam)
        self.variance = x[0]
        self.lengthscale = x[1:]
        self.lengthscale2 = np.square(self.lengthscale)
        #reset cached results
        self._X, self._X2, self._params = np.empty(shape=(3,1))
        self._Z, self._mu, self._S = np.empty(shape=(3,1)) # cached versions of Z,mu,S
-    def get_param_names(self):
+    def _get_param_names(self):
-        return ['variance','lengthscale']
+        if self.Nparam == 2:
            return ['variance','lengthscale']
        else:
            return ['variance']+['lengthscale_%i'%i for i in range(self.lengthscale.size)]        
    def K(self,X,X2,target):
        if X2 is None:
@ -61,7 +82,12 @@ class rbf(kernpart):
    def dK_dtheta(self,partial,X,X2,target):
        self._K_computations(X,X2)
        target[0] += np.sum(self._K_dvar*partial)
-        target[1] += np.sum(self._K_dvar*self.variance*self._K_dist2/self.lengthscale*partial)
+        if self.ARD == True:
            dl = self._K_dvar[:,:,None]*self.variance*self._K_dist2/self.lengthscale
            target[1:] += (dl*partial[:,:,None]).sum(0).sum(0)
        else:
            target[1] += np.sum(self._K_dvar*self.variance*(self._K_dist2.sum(-1))/self.lengthscale*partial)
        #np.sum(self._K_dvar*self.variance*self._K_dist2/self.lengthscale*partial)
    def dKdiag_dtheta(self,partial,X,target):
        #NB: derivative of diagonal elements wrt lengthscale is 0
@ -81,15 +107,12 @@ class rbf(kernpart):
            self._X = X
            self._X2 = X2
            if X2 is None: X2 = X
-            XXT = np.dot(X,X2.T)
+            self._K_dist = X[:,None,:]-X2[None,:,:] # this can be computationally heavy
-            if X is X2:
+            self._params = np.empty(shape=(1,0))    #ensure the next section gets called
-                self._K_dist2 = (-2.*XXT + np.diag(XXT)[:,np.newaxis] + np.diag(XXT)[np.newaxis,:])/self.lengthscale2
+        if not np.all(self._params == self._get_params()):
-            else:
+            self._params == self._get_params()
-                self._K_dist2 = (-2.*XXT + np.sum(np.square(X),1)[:,None] + np.sum(np.square(X2),1)[None,:])/self.lengthscale2
+            self._K_dist2 = np.square(self._K_dist/self.lengthscale)
-            # TODO Remove comments if this is fine.
+            self._K_dvar = np.exp(-0.5*self._K_dist2.sum(-1))
            # Commented out by Neil as doesn't seem to be used elsewhere.
            #self._K_exponent = -0.5*self._K_dist2
            self._K_dvar = np.exp(-0.5*self._K_dist2)
    def psi0(self,Z,mu,S,target):
        target += self.variance
@ -132,7 +155,7 @@ class rbf(kernpart):
        d_length = self._psi2[:,:,:,None]*(0.5*self._psi2_Zdist_sq*self._psi2_denom + 2.*self._psi2_mudist_sq + 2.*S[:,None,None,:]/self.lengthscale2)/(self.lengthscale*self._psi2_denom)
        d_length = d_length.sum(0)
        target[0] += np.sum(partial*d_var)
-        target[1] += np.sum(d_length*partial)
+        target[1:] += (d_length*partial[:,:,None]).sum(0).sum(0)
    def dpsi2_dZ(self,partial,Z,mu,S,target):
        """Returns shape N,M,M,Q"""
@ -175,4 +198,3 @@ class rbf(kernpart):
            self._psi2 = np.square(self.variance)*np.exp(self._psi2_exponent) # N,M,M
            self._Z, self._mu, self._S = Z, mu,S
--- a/GPy/kern/rbf_ARD.py
+++ b/GPy/kern/rbf_ARD.py
@ -22,16 +22,16 @@ class rbf_ARD(kernpart):
            lengthscales = np.ones(self.D)
        self.Nparam = self.D + 1
        self.name = 'rbf_ARD'
-        self.set_param(np.hstack((variance,lengthscales)))
+        self._set_params(np.hstack((variance,lengthscales)))
        #initialize cache
        self._Z, self._mu, self._S = np.empty(shape=(3,1))
        self._X, self._X2, self._params = np.empty(shape=(3,1))
-    def get_param(self):
+    def _get_params(self):
        return np.hstack((self.variance,self.lengthscales))
-    def set_param(self,x):
+    def _set_params(self,x):
        assert x.size==(self.D+1)
        self.variance = x[0]
        self.lengthscales = x[1:]
@ -39,7 +39,7 @@ class rbf_ARD(kernpart):
        #reset cached results
        self._Z, self._mu, self._S = np.empty(shape=(3,1)) # cached versions of Z,mu,S
-    def get_param_names(self):
+    def _get_param_names(self):
        if self.D==1:
            return ['variance','lengthscale']
        else:
@ -135,8 +135,8 @@ class rbf_ARD(kernpart):
            if X2 is None: X2 = X
            self._K_dist = X[:,None,:]-X2[None,:,:] # this can be computationally heavy
            self._params = np.empty(shape=(1,0))#ensure the next section gets called
-        if not np.all(self._params == self.get_param()):
+        if not np.all(self._params == self._get_params()):
-            self._params == self.get_param()
+            self._params == self._get_params()
            self._K_dist2 = np.square(self._K_dist/self.lengthscales)
            self._K_exponent = -0.5*self._K_dist2.sum(-1)
            self._K_dvar = np.exp(-0.5*self._K_dist2.sum(-1))
@ -189,7 +189,7 @@ if __name__=='__main__':
    from checkgrad import checkgrad
    def k_theta_test(param,k):
-        k.set_param(param)
+        k._set_params(param)
        K = k.K(Z)
        dK_dtheta = k.dK_dtheta(Z)
        f = np.sum(K)
@ -216,7 +216,7 @@ if __name__=='__main__':
    checkgrad(psi1_S_test,np.random.rand(N*Q),args=(k,))
    def psi1_theta_test(theta,k):
-        k.set_param(theta)
+        k._set_params(theta)
        f = np.sum(k.psi1(Z,mu,S))
        df = np.array([np.sum(grad) for grad in k.dpsi1_dtheta(Z,mu,S)])
        return f,df
@ -241,7 +241,7 @@ if __name__=='__main__':
    checkgrad(psi2_S_test,np.random.rand(N*Q),args=(k,))
    def psi2_theta_test(theta,k):
-        k.set_param(theta)
+        k._set_params(theta)
        f = np.sum(k.psi2(Z,mu,S))
        df = np.array([np.sum(grad) for grad in k.dpsi2_dtheta(Z,mu,S)])
        return f,df
--- a/GPy/kern/spline.py
+++ b/GPy/kern/spline.py
@ -25,15 +25,15 @@ class spline(kernpart):
        assert self.D==1
        self.Nparam = 1
        self.name = 'spline'
-        self.set_param(np.squeeze(variance))
+        self._set_params(np.squeeze(variance))
-    def get_param(self):
+    def _get_params(self):
        return self.variance
-    def set_param(self,x):
+    def _set_params(self,x):
        self.variance = x
-    def get_param_names(self):
+    def _get_param_names(self):
        return ['variance']
    def K(self,X,X2,target):
--- a/GPy/kern/sympykern.py
+++ b/GPy/kern/sympykern.py
@ -44,7 +44,7 @@ class spkern(kernpart):
        if param is None:
            param = np.ones(self.Nparam)
        assert param.size==self.Nparam
-        self.set_param(param)
+        self._set_params(param)
        #Differentiate!
        self._sp_dk_dtheta = [sp.diff(k,theta).simplify() for theta in self._sp_theta]
@ -247,12 +247,12 @@ class spkern(kernpart):
        Z = X
        weave.inline(self._dKdiag_dX_code,arg_names=['target','X','Z','param','partial'],**self.weave_kwargs)
-    def set_param(self,param):
+    def _set_params(self,param):
        #print param.flags['C_CONTIGUOUS']
        self._param = param.copy()
-    def get_param(self):
+    def _get_params(self):
        return self._param
-    def get_param_names(self):
+    def _get_param_names(self):
        return [x.name for x in self._sp_theta]
--- a/GPy/kern/white.py
+++ b/GPy/kern/white.py
@ -17,16 +17,16 @@ class white(kernpart):
        self.D = D
        self.Nparam = 1
        self.name = 'white'
-        self.set_param(np.array([variance]).flatten())
+        self._set_params(np.array([variance]).flatten())
-    def get_param(self):
+    def _get_params(self):
        return self.variance
-    def set_param(self,x):
+    def _set_params(self,x):
        assert x.shape==(1,)
        self.variance = x
-    def get_param_names(self):
+    def _get_param_names(self):
        return ['variance']
    def K(self,X,X2,target):
--- a/GPy/models/GPLVM.py
+++ b/GPy/models/GPLVM.py
@ -33,18 +33,18 @@ class GPLVM(GP_regression):
        else:
            return np.random.randn(Y.shape[0], Q)
-    def get_param_names(self):
+    def _get_param_names(self):
        return (sum([['X_%i_%i'%(n,q) for n in range(self.N)] for q in range(self.Q)],[])
-                + self.kern.extract_param_names())
+                + self.kern._get_param_names_transformed())
-    def get_param(self):
+    def _get_params(self):
-        return np.hstack((self.X.flatten(), self.kern.extract_param()))
+        return np.hstack((self.X.flatten(), self.kern._get_params_transformed()))
-    def set_param(self,x):
+    def _set_params(self,x):
        self.X = x[:self.X.size].reshape(self.N,self.Q).copy()
-        GP_regression.set_param(self, x[self.X.size:])
+        GP_regression._set_params(self, x[self.X.size:])
-    def log_likelihood_gradients(self):
+    def _log_likelihood_gradients(self):
        dL_dK = self.dL_dK()
        dL_dtheta = self.kern.dK_dtheta(dL_dK,self.X)
--- a/GPy/models/GP_EP.py
+++ b/GPy/models/GP_EP.py
@ -41,14 +41,14 @@ class GP_EP(model):
        self.K = self.kernel.K(self.X)
        model.__init__(self)
-    def set_param(self,p):
+    def _set_params(self,p):
-        self.kernel.expand_param(p)
+        self.kernel._set_params_transformed(p)
-    def get_param(self):
+    def _get_params(self):
-        return self.kernel.extract_param()
+        return self.kernel._get_params_transformed()
-    def get_param_names(self):
+    def _get_param_names(self):
-        return self.kernel.extract_param_names()
+        return self.kernel._get_param_names_transformed()
    def approximate_likelihood(self):
        self.ep_approx = Full(self.K,self.likelihood,epsilon=self.epsilon_ep,powerep=[self.eta,self.delta])
@ -78,7 +78,7 @@ class GP_EP(model):
        L3 = sum(np.log(self.ep_approx.Z_hat))
        return L1 + L2A + L2B + L3
-    def log_likelihood_gradients(self):
+    def _log_likelihood_gradients(self):
        dK_dp = self.kernel.dK_dtheta(self.X)
        self.dK_dp = dK_dp
        aux1,info_1 = linalg.flapack.dtrtrs(self.L,np.dot(self.Sroot_tilde_K,self.ep_approx.v_tilde),lower=1)
@ -138,7 +138,7 @@ class GP_EP(model):
        """
        self.epsilon_em = epsilon
        log_likelihood_change = self.epsilon_em + 1.
-        self.parameters_path = [self.kernel.get_param()]
+        self.parameters_path = [self.kernel._get_params()]
        self.approximate_likelihood()
        self.site_approximations_path = [[self.ep_approx.tau_tilde,self.ep_approx.v_tilde]]
        self.log_likelihood_path = [self.log_likelihood()]
@ -150,11 +150,11 @@ class GP_EP(model):
            log_likelihood_change = log_likelihood_new - self.log_likelihood_path[-1]
            if log_likelihood_change < 0:
                print 'log_likelihood decrement'
-                self.kernel.expand_param(self.parameters_path[-1])
+                self.kernel._set_params_transformed(self.parameters_path[-1])
-                self.kernM.expand_param(self.parameters_path[-1])
+                self.kernM._set_params_transformed(self.parameters_path[-1])
            else:
                self.approximate_likelihood()
                self.log_likelihood_path.append(self.log_likelihood())
-                self.parameters_path.append(self.kernel.get_param())
+                self.parameters_path.append(self.kernel._get_params())
                self.site_approximations_path.append([self.ep_approx.tau_tilde,self.ep_approx.v_tilde])
            iteration += 1
--- a/GPy/models/GP_regression.py
+++ b/GPy/models/GP_regression.py
@ -63,47 +63,47 @@ class GP_regression(model):
            self._Ystd = np.ones((1,self.Y.shape[1]))
        if self.D > self.N:
-            # then it's more efficient to store Youter
+            # then it's more efficient to store YYT
-            self.Youter = np.dot(self.Y, self.Y.T)
+            self.YYT = np.dot(self.Y, self.Y.T)
        else:
-            self.Youter = None
+            self.YYT = None
        model.__init__(self)
-    def set_param(self,p):
+    def _set_params(self,p):
-        self.kern.expand_param(p)
+        self.kern._set_params_transformed(p)
        self.K = self.kern.K(self.X,slices1=self.Xslices)
        self.Ki, self.L, self.Li, self.K_logdet = pdinv(self.K)
-    def get_param(self):
+    def _get_params(self):
-        return self.kern.extract_param()
+        return self.kern._get_params_transformed()
-    def get_param_names(self):
+    def _get_param_names(self):
-        return self.kern.extract_param_names()
+        return self.kern._get_param_names_transformed()
    def _model_fit_term(self):
        """
-        Computes the model fit using Youter if it's available
+        Computes the model fit using YYT if it's available
        """
-        if self.Youter is None:
+        if self.YYT is None:
            return -0.5*np.sum(np.square(np.dot(self.Li,self.Y)))
        else:
-            return -0.5*np.sum(np.multiply(self.Ki, self.Youter))
+            return -0.5*np.sum(np.multiply(self.Ki, self.YYT))
    def log_likelihood(self):
        complexity_term = -0.5*self.N*self.D*np.log(2.*np.pi) - 0.5*self.D*self.K_logdet
        return complexity_term + self._model_fit_term()
    def dL_dK(self):
-        if self.Youter is None:
+        if self.YYT is None:
            alpha = np.dot(self.Ki,self.Y)
            dL_dK = 0.5*(np.dot(alpha,alpha.T)-self.D*self.Ki)
        else:
-            dL_dK = 0.5*(mdot(self.Ki, self.Youter, self.Ki) - self.D*self.Ki)
+            dL_dK = 0.5*(mdot(self.Ki, self.YYT, self.Ki) - self.D*self.Ki)
        return dL_dK
-    def log_likelihood_gradients(self):
+    def _log_likelihood_gradients(self):
        return self.kern.dK_dtheta(partial=self.dL_dK(),X=self.X)
    def predict(self,Xnew, slices=None, full_cov=False):
--- a/GPy/models/generalized_FITC.py
+++ b/GPy/models/generalized_FITC.py
@ -42,15 +42,15 @@ class generalized_FITC(model):
        self.jitter = 1e-12
        model.__init__(self)
-    def set_param(self,p):
+    def _set_params(self,p):
-        self.kernel.expand_param(p[0:-self.Z.size])
+        self.kernel._set_params_transformed(p[0:-self.Z.size])
        self.Z = p[-self.Z.size:].reshape(self.M,self.D)
-    def get_param(self):
+    def _get_params(self):
-        return np.hstack([self.kernel.extract_param(),self.Z.flatten()])
+        return np.hstack([self.kernel._get_params_transformed(),self.Z.flatten()])
-    def get_param_names(self):
+    def _get_param_names(self):
-        return self.kernel.extract_param_names()+['iip_%i'%i for i in range(self.Z.size)]
+        return self.kernel._get_param_names_transformed()+['iip_%i'%i for i in range(self.Z.size)]
    def approximate_likelihood(self):
        self.Kmm = self.kernel.K(self.Z)
@ -91,15 +91,15 @@ class generalized_FITC(model):
    def log_likelihood(self):
        self.posterior_param()
-        self.Youter = np.dot(self.mu_tilde,self.mu_tilde.T)
+        self.YYT = np.dot(self.mu_tilde,self.mu_tilde.T)
        A = -self.hld
-        B = -.5*np.sum(self.Qi*self.Youter)
+        B = -.5*np.sum(self.Qi*self.YYT)
        C = sum(np.log(self.ep_approx.Z_hat))
        D = .5*np.sum(np.log(1./self.ep_approx.tau_tilde + 1./self.ep_approx.tau_))
        E = .5*np.sum((self.ep_approx.v_/self.ep_approx.tau_ - self.mu_tilde.flatten())**2/(1./self.ep_approx.tau_ + 1./self.ep_approx.tau_tilde))
        return  A + B + C + D + E
-    def log_likelihood_gradients(self):
+    def _log_likelihood_gradients(self):
        dKmm_dtheta = self.kernel.dK_dtheta(self.Z)
        dKnn_dtheta = self.kernel.dK_dtheta(self.X)
        dKmn_dtheta = self.kernel.dK_dtheta(self.Z,self.X)
@ -214,7 +214,7 @@ class generalized_FITC(model):
        """
        self.epsilon_em = epsilon
        log_likelihood_change = self.epsilon_em + 1.
-        self.parameters_path = [self.kernel.get_param()]
+        self.parameters_path = [self.kernel._get_params()]
        self.approximate_likelihood()
        self.site_approximations_path = [[self.ep_approx.tau_tilde,self.ep_approx.v_tilde]]
        self.inducing_inputs_path = [self.Z]
@ -227,7 +227,7 @@ class generalized_FITC(model):
            log_likelihood_change = log_likelihood_new - self.log_likelihood_path[-1]
            if log_likelihood_change < 0:
                print 'log_likelihood decrement'
-                self.kernel.expand_param(self.parameters_path[-1])
+                self.kernel._set_params_transformed(self.parameters_path[-1])
                self.kernM = self.kernel.copy()
                slef.kernM.expand_X(self.iducing_inputs_path[-1])
                self.__init__(self.kernel,self.likelihood,kernM=self.kernM,powerep=[self.eta,self.delta],epsilon_ep = self.epsilon_ep, epsilon_em = self.epsilon_em)
@ -235,7 +235,7 @@ class generalized_FITC(model):
            else:
                self.approximate_likelihood()
                self.log_likelihood_path.append(self.log_likelihood())
-                self.parameters_path.append(self.kernel.get_param())
+                self.parameters_path.append(self.kernel._get_params())
                self.site_approximations_path.append([self.ep_approx.tau_tilde,self.ep_approx.v_tilde])
                self.inducing_inputs_path.append(self.Z)
            iteration += 1
--- a/GPy/models/sparse_GPLVM.py
+++ b/GPy/models/sparse_GPLVM.py
@ -27,16 +27,16 @@ class sparse_GPLVM(sparse_GP_regression, GPLVM):
        X = self.initialise_latent(init, Q, Y)
        sparse_GP_regression.__init__(self, X, Y, **kwargs)
-    def get_param_names(self):
+    def _get_param_names(self):
        return (sum([['X_%i_%i'%(n,q) for n in range(self.N)] for q in range(self.Q)],[])
-                + sparse_GP_regression.get_param_names(self))
+                + sparse_GP_regression._get_param_names(self))
-    def get_param(self):
+    def _get_params(self):
-        return np.hstack((self.X.flatten(), sparse_GP_regression.get_param(self)))
+        return np.hstack((self.X.flatten(), sparse_GP_regression._get_params(self)))
-    def set_param(self,x):
+    def _set_params(self,x):
        self.X = x[:self.X.size].reshape(self.N,self.Q).copy()
-        sparse_GP_regression.set_param(self, x[self.X.size:])
+        sparse_GP_regression._set_params(self, x[self.X.size:])
    def log_likelihood(self):
        return sparse_GP_regression.log_likelihood(self)
@ -49,8 +49,8 @@ class sparse_GPLVM(sparse_GP_regression, GPLVM):
        return dL_dX
-    def log_likelihood_gradients(self):
+    def _log_likelihood_gradients(self):
-        return np.hstack((self.dL_dX().flatten(), sparse_GP_regression.log_likelihood_gradients(self)))
+        return np.hstack((self.dL_dX().flatten(), sparse_GP_regression._log_likelihood_gradients(self)))
    def plot(self):
        GPLVM.plot(self)
--- a/GPy/models/sparse_GP_regression.py
+++ b/GPy/models/sparse_GP_regression.py
@ -59,10 +59,10 @@ class sparse_GP_regression(GP_regression):
        if self.has_uncertain_inputs:
            self.X_uncertainty /= np.square(self._Xstd)
-    def set_param(self, p):
+    def _set_params(self, p):
        self.Z = p[:self.M*self.Q].reshape(self.M, self.Q)
        self.beta = p[self.M*self.Q]
-        self.kern.set_param(p[self.Z.size + 1:])
+        self.kern._set_params(p[self.Z.size + 1:])
        self.beta2 = self.beta**2
        self._compute_kernel_matrices()
        self._computations()
@ -106,11 +106,11 @@ class sparse_GP_regression(GP_regression):
        self.dL_dKmm += -0.5 * self.D * (- self.LBL_inv - 2.*self.beta*mdot(self.LBL_inv, self.psi2, self.Kmmi) + self.Kmmi) # dC
        self.dL_dKmm +=  np.dot(np.dot(self.G,self.beta*self.psi2) - np.dot(self.LBL_inv, self.psi1VVpsi1), self.Kmmi) + 0.5*self.G # dE
-    def get_param(self):
+    def _get_params(self):
-        return np.hstack([self.Z.flatten(),self.beta,self.kern.extract_param()])
+        return np.hstack([self.Z.flatten(),self.beta,self.kern._get_params_transformed()])
-    def get_param_names(self):
+    def _get_param_names(self):
-        return sum([['iip_%i_%i'%(i,j) for i in range(self.Z.shape[0])] for j in range(self.Z.shape[1])],[]) + ['noise_precision']+self.kern.extract_param_names()
+        return sum([['iip_%i_%i'%(i,j) for i in range(self.Z.shape[0])] for j in range(self.Z.shape[1])],[]) + ['noise_precision']+self.kern._get_param_names_transformed()
    def log_likelihood(self):
        """
@ -168,7 +168,7 @@ class sparse_GP_regression(GP_regression):
            dL_dZ += self.kern.dK_dX(dL_dpsi1,self.Z,self.X)
        return dL_dZ
-    def log_likelihood_gradients(self):
+    def _log_likelihood_gradients(self):
        return np.hstack([self.dL_dZ().flatten(), self.dL_dbeta(), self.dL_dtheta()])
    def _raw_predict(self, Xnew, slices, full_cov=False):
--- a/GPy/models/warped_GP.py
+++ b/GPy/models/warped_GP.py
@ -13,7 +13,7 @@ from GP_regression import GP_regression
 class warpedGP(GP_regression):
    """
-    TODO: fucking docstrings!
+    TODO: fecking docstrings!
    @nfusi: I'#ve hacked a little on this, but no guarantees. J.
    """
@ -30,17 +30,17 @@ class warpedGP(GP_regression):
        self.transform_data()
        GP_regression.__init__(self, X, self.Y, **kwargs)
-    def set_param(self, x):
+    def _set_params(self, x):
        self.warping_params = x[:self.warping_function.num_parameters].reshape(self.warp_params_shape).copy()
        self.transform_data()
-        GP_regression.set_param(self, x[self.warping_function.num_parameters:].copy())
+        GP_regression._set_params(self, x[self.warping_function.num_parameters:].copy())
-    def get_param(self):
+    def _get_params(self):
-        return np.hstack((self.warping_params.flatten().copy(), GP_regression.get_param(self).copy()))
+        return np.hstack((self.warping_params.flatten().copy(), GP_regression._get_params(self).copy()))
-    def get_param_names(self):
+    def _get_param_names(self):
-        warping_names = self.warping_function.get_param_names()
+        warping_names = self.warping_function._get_param_names()
-        param_names = GP_regression.get_param_names(self)
+        param_names = GP_regression._get_param_names(self)
        return warping_names + param_names
    def transform_data(self):
@ -48,9 +48,9 @@ class warpedGP(GP_regression):
        # this supports the 'smart' behaviour in GP_regression
        if self.D > self.N:
-            self.Youter = np.dot(self.Y, self.Y.T)
+            self.YYT = np.dot(self.Y, self.Y.T)
        else:
-            self.Youter = None
+            self.YYT = None
        return self.Y
@ -59,8 +59,8 @@ class warpedGP(GP_regression):
        jacobian = self.warping_function.fgrad_y(self.Z, self.warping_params)
        return ll + np.log(jacobian).sum()
-    def log_likelihood_gradients(self):
+    def _log_likelihood_gradients(self):
-        ll_grads = GP_regression.log_likelihood_gradients(self)
+        ll_grads = GP_regression._log_likelihood_gradients(self)
        alpha = np.dot(self.Ki, self.Y.flatten())
        warping_grads = self.warping_function_gradients(alpha)
        return np.hstack((warping_grads.flatten(), ll_grads.flatten()))
@ -81,7 +81,7 @@ class warpedGP(GP_regression):
    def predict(self, X, in_unwarped_space = False, **kwargs):
        mu, var = GP_regression.predict(self, X, **kwargs)
-        # The plot() function calls set_param() before calling predict()
+        # The plot() function calls _set_params() before calling predict()
        # this is causing the observations to be plotted in the transformed
        # space (where Y lives), making the plot looks very wrong
        # if the predictions are made in the untransformed space
--- a/GPy/testing/unit_tests.py
+++ b/GPy/testing/unit_tests.py
@ -42,51 +42,66 @@ class GradientTests(unittest.TestCase):
        # contrain all parameters to be positive
        self.assertTrue(m.checkgrad())
-    def test_gp_regression_rbf_white_kern_1d(self):
+    def test_gp_regression_rbf_1d(self):
        ''' Testing the GP regression with rbf kernel with white kernel on 1d data '''
        rbf = GPy.kern.rbf(1)
        self.check_model_with_white(rbf, model_type='GP_regression', dimension=1)
-    def test_GP_regression_rbf_ARD_white_kern_2D(self):
+    def test_GP_regression_rbf_2D(self):
        ''' Testing the GP regression with rbf and white kernel on 2d data '''
        k = GPy.kern.rbf_ARD(2)
        self.check_model_with_white(k, model_type='GP_regression', dimension=2)
    def test_GP_regression_rbf_white_kern_2D(self):
        ''' Testing the GP regression with rbf and white kernel on 2d data '''
        rbf = GPy.kern.rbf(2)
        self.check_model_with_white(rbf, model_type='GP_regression', dimension=2)
-    def test_GP_regression_matern52_kern_1D(self):
+    def test_GP_regression_rbf_ARD_2D(self):
        ''' Testing the GP regression with rbf and white kernel on 2d data '''
        k = GPy.kern.rbf(2,ARD=True)
        self.check_model_with_white(k, model_type='GP_regression', dimension=2)
    def test_GP_regression_matern52_1D(self):
        ''' Testing the GP regression with matern52 kernel on 1d data '''
        matern52 = GPy.kern.Matern52(1)
        self.check_model_with_white(matern52, model_type='GP_regression', dimension=1)
-    def test_GP_regression_matern52_kern_2D(self):
+    def test_GP_regression_matern52_2D(self):
        ''' Testing the GP regression with matern52 kernel on 2d data '''
        matern52 = GPy.kern.Matern52(2)
        self.check_model_with_white(matern52, model_type='GP_regression', dimension=2)
-    def test_GP_regression_matern32_kern_1D(self):
+    def test_GP_regression_matern52_ARD_2D(self):
        ''' Testing the GP regression with matern52 kernel on 2d data '''
        matern52 = GPy.kern.Matern52(2,ARD=True)
        self.check_model_with_white(matern52, model_type='GP_regression', dimension=2)
    def test_GP_regression_matern32_1D(self):
        ''' Testing the GP regression with matern32 kernel on 1d data '''
        matern32 = GPy.kern.Matern32(1)
        self.check_model_with_white(matern32, model_type='GP_regression', dimension=1)
-    def test_GP_regression_matern32_kern_2D(self):
+    def test_GP_regression_matern32_2D(self):
        ''' Testing the GP regression with matern32 kernel on 2d data '''
        matern32 = GPy.kern.Matern32(2)
        self.check_model_with_white(matern32, model_type='GP_regression', dimension=2)
-    def test_GP_regression_exponential_kern_1D(self):
+    def test_GP_regression_matern32_ARD_2D(self):
        ''' Testing the GP regression with matern32 kernel on 2d data '''
        matern32 = GPy.kern.Matern32(2,ARD=True)
        self.check_model_with_white(matern32, model_type='GP_regression', dimension=2)
    def test_GP_regression_exponential_1D(self):
        ''' Testing the GP regression with exponential kernel on 1d data '''
        exponential = GPy.kern.exponential(1)
        self.check_model_with_white(exponential, model_type='GP_regression', dimension=1)
-    def test_GP_regression_exponential_kern_2D(self):
+    def test_GP_regression_exponential_2D(self):
        ''' Testing the GP regression with exponential kernel on 2d data '''
        exponential = GPy.kern.exponential(2)
        self.check_model_with_white(exponential, model_type='GP_regression', dimension=2)
    def test_GP_regression_exponential_ARD_2D(self):
        ''' Testing the GP regression with exponential kernel on 2d data '''
        exponential = GPy.kern.exponential(2,ARD=True)
        self.check_model_with_white(exponential, model_type='GP_regression', dimension=2)
    def test_GP_regression_bias_kern_1D(self):
        ''' Testing the GP regression with bias kernel on 1d data '''
        bias = GPy.kern.bias(1)
@ -121,7 +136,7 @@ class GradientTests(unittest.TestCase):
        """ Testing GPLVM with rbf + bias and white kernel """
        N, Q, D = 50, 1, 2
        X = np.random.rand(N, Q)
-        k = GPy.kern.rbf(Q, 0.5, 0.9) + GPy.kern.bias(Q, 0.1) + GPy.kern.white(Q, 0.05)
+        k = GPy.kern.rbf(Q, 0.5, 0.9*np.ones((1,))) + GPy.kern.bias(Q, 0.1) + GPy.kern.white(Q, 0.05)
        K = k.K(X)
        Y = np.random.multivariate_normal(np.zeros(N),K,D).T
        m = GPy.models.GPLVM(Y, Q, kernel = k)
--- a/GPy/util/datasets.py
+++ b/GPy/util/datasets.py
@ -90,7 +90,7 @@ def toy_rbf_1d(seed=default_seed):
    N = 500
    X = np.random.uniform(low=-1.0, high=1.0, size=(N, numIn))
    X.sort(axis=0)
-    rbf = GPy.kern.rbf(numIn, variance=1., lengthscale=0.25)
+    rbf = GPy.kern.rbf(numIn, variance=1., lengthscale=np.array((0.25,)))
    white = GPy.kern.white(numIn, variance=1e-2)
    kernel = rbf + white
    K = kernel.K(X)
--- a/GPy/util/warping_functions.py
+++ b/GPy/util/warping_functions.py
@ -33,7 +33,7 @@ class WarpingFunction(object):
        """inverse function transformation"""
        raise NotImplementedError
-    def get_param_names(self):
+    def _get_param_names(self):
        raise NotImplementedError
    def plot(self, psi, xmin, xmax):
@ -151,7 +151,7 @@ class TanhWarpingFunction(WarpingFunction):
        return gradients
-    def get_param_names(self):
+    def _get_param_names(self):
        variables = ['a', 'b', 'c']
        names = sum([['warp_tanh_%s_t%i' % (variables[n],q) for n in range(3)] for q in range(self.n_terms)],[])
        return names
--- a/doc/conf.py
+++ b/doc/conf.py
@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 #
 # GPy documentation build configuration file, created by
-# sphinx-quickstart on Wed Jan  9 15:21:20 2013.
+# sphinx-quickstart on Fri Jan 18 15:30:28 2013.
 #
 # This file is execfile()d with the current directory set to its containing dir.
 #
@ -25,7 +25,7 @@ import sys, os
 # Add any Sphinx extension module names here, as strings. They can be extensions
 # coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
-extensions = ['sphinx.ext.autodoc', 'sphinx.ext.todo', 'sphinx.ext.pngmath', 'sphinx.ext.mathjax', 'sphinx.ext.viewcode']
+extensions = ['sphinx.ext.autodoc', 'sphinx.ext.viewcode']
 # Add any paths that contain templates here, relative to this directory.
 templates_path = ['_templates']
@ -41,16 +41,16 @@ master_doc = 'index'
 # General information about the project.
 project = u'GPy'
-copyright = u'2013, The GPy authors'
+copyright = u'2013, Author'
 # The version info for the project you're documenting, acts as replacement for
 # |version| and |release|, also used in various other places throughout the
 # built documents.
 #
 # The short X.Y version.
-version = '0.00001'
+version = ''
 # The full version, including alpha/beta/rc tags.
-release = '0.00001'
+release = ''
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
@ -184,7 +184,7 @@ latex_elements = {
 # (source start file, target name, title, author, documentclass [howto/manual]).
 latex_documents = [
  ('index', 'GPy.tex', u'GPy Documentation',
-   u'The GPy authors', 'manual'),
+   u'Author', 'manual'),
 ]
 # The name of an image file (relative to this directory) to place at the top of
@ -214,7 +214,7 @@ latex_documents = [
 # (source start file, name, description, authors, manual section).
 man_pages = [
    ('index', 'gpy', u'GPy Documentation',
-     [u'The GPy authors'], 1)
+     [u'Author'], 1)
 ]
 # If true, show URL addresses after external links.
@ -228,7 +228,7 @@ man_pages = [
 #  dir menu entry, description, category)
 texinfo_documents = [
  ('index', 'GPy', u'GPy Documentation',
-   u'The GPy authors', 'GPy', 'One line description of project.',
+   u'Author', 'GPy', 'One line description of project.',
   'Miscellaneous'),
 ]
@ -240,3 +240,53 @@ texinfo_documents = [
 # How to display URL addresses: 'footnote', 'no', or 'inline'.
 #texinfo_show_urls = 'footnote'
 # -- Options for Epub output ---------------------------------------------------
 # Bibliographic Dublin Core info.
 epub_title = u'GPy'
 epub_author = u'Author'
 epub_publisher = u'Author'
 epub_copyright = u'2013, Author'
 # The language of the text. It defaults to the language option
 # or en if the language is not set.
 #epub_language = ''
 # The scheme of the identifier. Typical schemes are ISBN or URL.
 #epub_scheme = ''
 # The unique identifier of the text. This can be a ISBN number
 # or the project homepage.
 #epub_identifier = ''
 # A unique identification for the text.
 #epub_uid = ''
 # A tuple containing the cover image and cover page html template filenames.
 #epub_cover = ()
 # HTML files that should be inserted before the pages created by sphinx.
 # The format is a list of tuples containing the path and title.
 #epub_pre_files = []
 # HTML files shat should be inserted after the pages created by sphinx.
 # The format is a list of tuples containing the path and title.
 #epub_post_files = []
 # A list of files that should not be packed into the epub file.
 #epub_exclude_files = []
 # The depth of the table of contents in toc.ncx.
 #epub_tocdepth = 3
 # Allow duplicate toc entries.
 #epub_tocdup = True
 # ----------------------- READTHEDOCS ------------------
 on_rtd = os.environ.get('READTHEDOCS', None) == 'True'
 if on_rtd:
  os.system("pwd")
  os.system("sphinx-apidoc -f -o . ../GPy")
--- a/doc/index.rst
+++ b/doc/index.rst
@ -1,16 +1,23 @@
 .. GPy documentation master file, created by
-   sphinx-quickstart on Wed Jan  9 15:21:20 2013.
+   sphinx-quickstart on Fri Jan 18 15:30:28 2013.
   You can adapt this file completely to your liking, but it should at least
   contain the root `toctree` directive.
 Welcome to GPy's documentation!
 ===============================
 Added old command again
 And another thing
 Now what if I add this other hting
 Just checking another thing
 Contents:
 .. toctree::
-   :maxdepth: 2
+   :maxdepth: 4
   GPy
   setup
 Indices and tables
--- a/doc/make.bat
+++ b/doc/make.bat
@ -0,0 +1,190 @@
@ECHO OFF
 REM Command file for Sphinx documentation
 if "%SPHINXBUILD%" == "" (
 	set SPHINXBUILD=sphinx-build
 )
 set BUILDDIR=_build
 set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% .
 set I18NSPHINXOPTS=%SPHINXOPTS% .
 if NOT "%PAPER%" == "" (
 	set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
 	set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS%
 )
 if "%1" == "" goto help
 if "%1" == "help" (
 	:help
 	echo.Please use `make ^<target^>` where ^<target^> is one of
 	echo.  html       to make standalone HTML files
 	echo.  dirhtml    to make HTML files named index.html in directories
 	echo.  singlehtml to make a single large HTML file
 	echo.  pickle     to make pickle files
 	echo.  json       to make JSON files
 	echo.  htmlhelp   to make HTML files and a HTML help project
 	echo.  qthelp     to make HTML files and a qthelp project
 	echo.  devhelp    to make HTML files and a Devhelp project
 	echo.  epub       to make an epub
 	echo.  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter
 	echo.  text       to make text files
 	echo.  man        to make manual pages
 	echo.  texinfo    to make Texinfo files
 	echo.  gettext    to make PO message catalogs
 	echo.  changes    to make an overview over all changed/added/deprecated items
 	echo.  linkcheck  to check all external links for integrity
 	echo.  doctest    to run all doctests embedded in the documentation if enabled
 	goto end
 )
 if "%1" == "clean" (
 	for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
 	del /q /s %BUILDDIR%\*
 	goto end
 )
 if "%1" == "html" (
 	%SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
 	if errorlevel 1 exit /b 1
 	echo.
 	echo.Build finished. The HTML pages are in %BUILDDIR%/html.
 	goto end
 )
 if "%1" == "dirhtml" (
 	%SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
 	if errorlevel 1 exit /b 1
 	echo.
 	echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
 	goto end
 )
 if "%1" == "singlehtml" (
 	%SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml
 	if errorlevel 1 exit /b 1
 	echo.
 	echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml.
 	goto end
 )
 if "%1" == "pickle" (
 	%SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle
 	if errorlevel 1 exit /b 1
 	echo.
 	echo.Build finished; now you can process the pickle files.
 	goto end
 )
 if "%1" == "json" (
 	%SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json
 	if errorlevel 1 exit /b 1
 	echo.
 	echo.Build finished; now you can process the JSON files.
 	goto end
 )
 if "%1" == "htmlhelp" (
 	%SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp
 	if errorlevel 1 exit /b 1
 	echo.
 	echo.Build finished; now you can run HTML Help Workshop with the ^
 .hhp project file in %BUILDDIR%/htmlhelp.
 	goto end
 )
 if "%1" == "qthelp" (
 	%SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp
 	if errorlevel 1 exit /b 1
 	echo.
 	echo.Build finished; now you can run "qcollectiongenerator" with the ^
 .qhcp project file in %BUILDDIR%/qthelp, like this:
 	echo.^> qcollectiongenerator %BUILDDIR%\qthelp\GPy.qhcp
 	echo.To view the help file:
 	echo.^> assistant -collectionFile %BUILDDIR%\qthelp\GPy.ghc
 	goto end
 )
 if "%1" == "devhelp" (
 	%SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp
 	if errorlevel 1 exit /b 1
 	echo.
 	echo.Build finished.
 	goto end
 )
 if "%1" == "epub" (
 	%SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub
 	if errorlevel 1 exit /b 1
 	echo.
 	echo.Build finished. The epub file is in %BUILDDIR%/epub.
 	goto end
 )
 if "%1" == "latex" (
 	%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
 	if errorlevel 1 exit /b 1
 	echo.
 	echo.Build finished; the LaTeX files are in %BUILDDIR%/latex.
 	goto end
 )
 if "%1" == "text" (
 	%SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text
 	if errorlevel 1 exit /b 1
 	echo.
 	echo.Build finished. The text files are in %BUILDDIR%/text.
 	goto end
 )
 if "%1" == "man" (
 	%SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man
 	if errorlevel 1 exit /b 1
 	echo.
 	echo.Build finished. The manual pages are in %BUILDDIR%/man.
 	goto end
 )
 if "%1" == "texinfo" (
 	%SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo
 	if errorlevel 1 exit /b 1
 	echo.
 	echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo.
 	goto end
 )
 if "%1" == "gettext" (
 	%SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale
 	if errorlevel 1 exit /b 1
 	echo.
 	echo.Build finished. The message catalogs are in %BUILDDIR%/locale.
 	goto end
 )
 if "%1" == "changes" (
 	%SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes
 	if errorlevel 1 exit /b 1
 	echo.
 	echo.The overview file is in %BUILDDIR%/changes.
 	goto end
 )
 if "%1" == "linkcheck" (
 	%SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
 	if errorlevel 1 exit /b 1
 	echo.
 	echo.Link check complete; look for any errors in the above output ^
 or in %BUILDDIR%/linkcheck/output.txt.
 	goto end
 )
 if "%1" == "doctest" (
 	%SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
 	if errorlevel 1 exit /b 1
 	echo.
 	echo.Testing of doctests in the sources finished, look at the ^
 results in %BUILDDIR%/doctest/output.txt.
 	goto end
 )
 :end
--- a/setup.py
+++ b/setup.py
@ -24,9 +24,9 @@ setup(name = 'GPy',
      package_data = {'GPy': ['GPy/examples']},
      py_modules = ['GPy.__init__'],
      long_description=read('README.md'),
-      ext_modules =  [Extension(name = 'GPy.kern.lfmUpsilonf2py',
+      #ext_modules =  [Extension(name = 'GPy.kern.lfmUpsilonf2py',
-                sources = ['GPy/kern/src/lfmUpsilonf2py.f90'])],
+      #          sources = ['GPy/kern/src/lfmUpsilonf2py.f90'])],
-      install_requires=['numpy>=1.6', 'scipy>=0.9','matplotlib>=1.1'],
+      install_requires=['sympy', 'numpy>=1.6', 'scipy>=0.9','matplotlib>=1.1'],
      setup_requires=['sphinx'],
      cmdclass = {'build_sphinx': BuildDoc},
      classifiers=[