From d1b6d18ddf341ee879083ded17d62ce90e4aa120 Mon Sep 17 00:00:00 2001 From: Neil Lawrence Date: Tue, 18 Feb 2014 18:49:13 -0500 Subject: [PATCH 01/38] Changes to sympy covariance. --- GPy/kern/parts/sympykern.py | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/GPy/kern/parts/sympykern.py b/GPy/kern/parts/sympykern.py index a09d4bfc..2d015b27 100644 --- a/GPy/kern/parts/sympykern.py +++ b/GPy/kern/parts/sympykern.py @@ -2,6 +2,7 @@ import numpy as np import sympy as sp from sympy.utilities.codegen import codegen from sympy.core.cache import clear_cache + from scipy import weave import re import os @@ -28,39 +29,47 @@ class spkern(Kernpart): - to handle multpile correlated outputs, you'll need to add parameters with an index, such as lengthscale_i and lengthscale_j. """ def __init__(self, input_dim, k=None, output_dim=1, name=None, param=None): + if name is None: - self.name='sympykern' - else: - self.name = name + name='sympykern' if k is None: raise ValueError, "You must provide an argument for the covariance function." + super(spkern, self).__init__(input_dim, name) + self._sp_k = k + + # pull the variable names out of the symbolic covariance function. sp_vars = [e for e in k.atoms() if e.is_Symbol] self._sp_x= sorted([e for e in sp_vars if e.name[0:2]=='x_'],key=lambda x:int(x.name[2:])) self._sp_z= sorted([e for e in sp_vars if e.name[0:2]=='z_'],key=lambda z:int(z.name[2:])) + # Check that variable names make sense. assert all([x.name=='x_%i'%i for i,x in enumerate(self._sp_x)]) assert all([z.name=='z_%i'%i for i,z in enumerate(self._sp_z)]) assert len(self._sp_x)==len(self._sp_z) - self.input_dim = len(self._sp_x) + assert len(self._sp_x)==input_dim + + # If it is a multi-output covariance, add an input for indexing the outputs. self._real_input_dim = self.input_dim if output_dim > 1: self.input_dim += 1 assert self.input_dim == input_dim self.output_dim = output_dim - # extract parameter names + + # extract parameter names from the covariance thetas = sorted([e for e in sp_vars if not (e.name[0:2]=='x_' or e.name[0:2]=='z_')],key=lambda e:e.name) - # Look for parameters with index. + # Look for parameters with index (subscripts), they are associated with different outputs. if self.output_dim>1: self._sp_theta_i = sorted([e for e in thetas if (e.name[-2:]=='_i')], key=lambda e:e.name) self._sp_theta_j = sorted([e for e in thetas if (e.name[-2:]=='_j')], key=lambda e:e.name) + # Make sure parameter appears with both indices! assert len(self._sp_theta_i)==len(self._sp_theta_j) assert all([theta_i.name[:-2]==theta_j.name[:-2] for theta_i, theta_j in zip(self._sp_theta_i, self._sp_theta_j)]) - # Extract names of shared parameters + # Extract names of shared parameters (those without a subscript) self._sp_theta = [theta for theta in thetas if theta not in self._sp_theta_i and theta not in self._sp_theta_j] self.num_split_params = len(self._sp_theta_i) @@ -77,7 +86,8 @@ class spkern(Kernpart): self._sp_theta = thetas self.num_shared_params = len(self._sp_theta) self.num_params = self.num_shared_params - + + # Add parameters to the model. for theta in self._sp_theta: val = 1.0 if param is not None: @@ -87,18 +97,22 @@ class spkern(Kernpart): #deal with param self._set_params(self._get_params()) - #Differentiate! + # Differentiate with respect to parameters. 
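# --- Editor's sketch (not part of the patch): how the symbol bookkeeping and
# differentiation above behave for a concrete covariance.  Assumes only that
# sympy is installed; the names follow the x_*/z_* convention this class requires.
import sympy as sp

x_0, z_0 = sp.symbols('x_0 z_0')
lengthscale, variance = sp.symbols('lengthscale variance', positive=True)

# an RBF-style covariance written as a sympy expression
k = variance * sp.exp(-(x_0 - z_0)**2 / (2 * lengthscale**2))

# split the free symbols into inputs and parameters, as the constructor does
sp_vars = [e for e in k.atoms() if e.is_Symbol]
xs = sorted([e for e in sp_vars if e.name[0:2] == 'x_'], key=lambda e: int(e.name[2:]))
zs = sorted([e for e in sp_vars if e.name[0:2] == 'z_'], key=lambda e: int(e.name[2:]))
thetas = sorted([e for e in sp_vars if e not in xs + zs], key=lambda e: e.name)

# symbolic parameter gradients, as computed for _sp_dk_dtheta
dk_dtheta = [sp.diff(k, theta).simplify() for theta in thetas]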
self._sp_dk_dtheta = [sp.diff(k,theta).simplify() for theta in self._sp_theta] if self.output_dim > 1: self._sp_dk_dtheta_i = [sp.diff(k,theta).simplify() for theta in self._sp_theta_i] - + + # differentiate with respect to input variables. self._sp_dk_dx = [sp.diff(k,xi).simplify() for xi in self._sp_x] + # psi_stats aren't yet implemented. if False: self.compute_psi_stats() + # generate the code for the covariance functions self._gen_code() + if weave if False: extra_compile_args = ['-ftree-vectorize', '-mssse3', '-ftree-vectorizer-verbose=5'] else: From f6484bcbd03110e5d7a0d27a84463e803038a9fc Mon Sep 17 00:00:00 2001 From: Neil Lawrence Date: Tue, 18 Feb 2014 19:37:53 -0500 Subject: [PATCH 02/38] Using params class with sympy covariance. Adding conditional statements for presence of weave. --- GPy/kern/parts/sympykern.py | 511 ++++++++++++++++++++---------------- 1 file changed, 284 insertions(+), 227 deletions(-) diff --git a/GPy/kern/parts/sympykern.py b/GPy/kern/parts/sympykern.py index 2d015b27..a5bb7b1d 100644 --- a/GPy/kern/parts/sympykern.py +++ b/GPy/kern/parts/sympykern.py @@ -1,17 +1,31 @@ -import numpy as np -import sympy as sp -from sympy.utilities.codegen import codegen -from sympy.core.cache import clear_cache +try: + import sympy as sp + sympy_available=True +except ImportError: + sympy_available=False + exit() + +from sympy.core.cache import clear_cache +from sympy.utilities.codegen import codegen + +try: + from scipy import weave + weave_available = True +except ImportError: + weave_available = False -from scipy import weave -import re import os -import sys current_dir = os.path.dirname(os.path.abspath(os.path.dirname(__file__))) +import sys +import numpy as np +import re import tempfile import pdb import ast + from kernpart import Kernpart +from ...core.parameterization import Param +from ...core.parameterization.transformations import Logexp class spkern(Kernpart): """ @@ -75,17 +89,20 @@ class spkern(Kernpart): self.num_split_params = len(self._sp_theta_i) self._split_theta_names = ["%s"%theta.name[:-2] for theta in self._sp_theta_i] for theta in self._split_theta_names: - setattr(self, theta, np.ones(self.output_dim)) + setattr(self, theta, Param(theta, np.ones(self.output_dim), None)) + self.add_parameters(getattr(self, theta)) + + #setattr(self, theta, np.ones(self.output_dim)) self.num_shared_params = len(self._sp_theta) - self.num_params = self.num_shared_params+self.num_split_params*self.output_dim + #self.num_params = self.num_shared_params+self.num_split_params*self.output_dim else: self.num_split_params = 0 self._split_theta_names = [] self._sp_theta = thetas self.num_shared_params = len(self._sp_theta) - self.num_params = self.num_shared_params + #self.num_params = self.num_shared_params # Add parameters to the model. for theta in self._sp_theta: @@ -93,9 +110,12 @@ class spkern(Kernpart): if param is not None: if param.has_key(theta): val = param[theta] - setattr(self, theta.name, val) + #setattr(self, theta.name, val) + setattr(self, theta.name, Param(theta.name, val, None)) + self.add_parameters(getattr(self, theta.name)) + self.parameters_changed() # initializes cache #deal with param - self._set_params(self._get_params()) + #self._set_params(self._get_params()) # Differentiate with respect to parameters. 
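# --- Editor's sketch (assumption: the parameterization API imported at the top
# of this patch).  A positive kernel parameter would typically be wrapped in a
# Param, optionally with a Logexp transform to keep it positive, and registered
# via add_parameters(); the loop above passes None (no transform).
from GPy.core.parameterization import Param
from GPy.core.parameterization.transformations import Logexp

variance = Param('variance', 1.0, Logexp())  # constrained positive
# inside a Parameterized/Kernpart subclass one would then call:
#     self.add_parameters(variance)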
self._sp_dk_dtheta = [sp.diff(k,theta).simplify() for theta in self._sp_theta] @@ -112,26 +132,26 @@ class spkern(Kernpart): # generate the code for the covariance functions self._gen_code() - if weave - if False: - extra_compile_args = ['-ftree-vectorize', '-mssse3', '-ftree-vectorizer-verbose=5'] - else: - extra_compile_args = [] + if weave_available: + if False: + extra_compile_args = ['-ftree-vectorize', '-mssse3', '-ftree-vectorizer-verbose=5'] + else: + extra_compile_args = [] - self.weave_kwargs = { - 'support_code':self._function_code, - 'include_dirs':[tempfile.gettempdir(), os.path.join(current_dir,'parts/')], - 'headers':['"sympy_helpers.h"'], - 'sources':[os.path.join(current_dir,"parts/sympy_helpers.cpp")], - 'extra_compile_args':extra_compile_args, - 'extra_link_args':['-lgomp'], - 'verbose':True} + self.weave_kwargs = { + 'support_code':self._function_code, + 'include_dirs':[tempfile.gettempdir(), os.path.join(current_dir,'parts/')], + 'headers':['"sympy_helpers.h"'], + 'sources':[os.path.join(current_dir,"parts/sympy_helpers.cpp")], + 'extra_compile_args':extra_compile_args, + 'extra_link_args':['-lgomp'], + 'verbose':True} def __add__(self,other): return spkern(self._sp_k+other._sp_k) def _gen_code(self): - #generate c functions from sympy objects + argument_sequence = self._sp_x+self._sp_z+self._sp_theta code_list = [('k',self._sp_k)] # gradients with respect to covariance input @@ -142,193 +162,224 @@ class spkern(Kernpart): if self.output_dim > 1: argument_sequence += self._sp_theta_i + self._sp_theta_j code_list += [('dk_d%s'%theta.name,dtheta) for theta,dtheta in zip(self._sp_theta_i,self._sp_dk_dtheta_i)] + # generate c functions from sympy objects + if weave_available: + code_type = "C" + else: + code_type = "PYTHON" (foo_c,self._function_code), (foo_h,self._function_header) = \ - codegen(code_list, "C",'foobar',argument_sequence=argument_sequence) - #put the header file where we can find it - f = file(os.path.join(tempfile.gettempdir(),'foobar.h'),'w') - f.write(self._function_header) - f.close() + codegen(code_list, + code_type, + self.name, + argument_sequence=argument_sequence) + + # Use weave to compute the underlying functions. + if weave_available: + # put the header file where we can find it + f = file(os.path.join(tempfile.gettempdir(), self.name + '.h'),'w') + f.write(self._function_header) + f.close() + + # Substitute any known derivatives which sympy doesn't compute self._function_code = re.sub('DiracDelta\(.+?,.+?\)','0.0',self._function_code) - # This is the basic argument construction for the C code. - #arg_list = (["X[i*input_dim+%s]"%x.name[2:] for x in self._sp_x] - # + ["Z[j*input_dim+%s]"%z.name[2:] for z in self._sp_z]) - arg_list = (["X2(i, %s)"%x.name[2:] for x in self._sp_x] - + ["Z2(j, %s)"%z.name[2:] for z in self._sp_z]) - if self.output_dim>1: - reverse_arg_list = list(arg_list) - reverse_arg_list.reverse() + if weave_available: + # arg_list will store the arguments required for the C code. 
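# --- Editor's sketch (not part of the patch): a standalone call to sympy's
# codegen, which _gen_code relies on above; the expression and argument names
# here are toy stand-ins.
import sympy as sp
from sympy.utilities.codegen import codegen

x_0, z_0, lengthscale = sp.symbols('x_0 z_0 lengthscale')
k = sp.exp(-(x_0 - z_0)**2 / (2 * lengthscale**2))

(c_name, c_code), (h_name, c_header) = codegen(
    [('k', k)], "C", 'sympykern_example',
    argument_sequence=[x_0, z_0, lengthscale])
# c_code now contains a C function roughly of the form
#     double k(double x_0, double z_0, double lengthscale) { ... }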
+ arg_list = (["X2(i, %s)"%x.name[2:] for x in self._sp_x] + + ["Z2(j, %s)"%z.name[2:] for z in self._sp_z]) - param_arg_list = [shared_params.name for shared_params in self._sp_theta] - arg_list += param_arg_list + # for multiple outputs reverse argument list is also required + if self.output_dim>1: + reverse_arg_list = list(arg_list) + reverse_arg_list.reverse() - precompute_list=[] - if self.output_dim > 1: - reverse_arg_list+=list(param_arg_list) - split_param_arg_list = ["%s1(%s)"%(theta.name[:-2].upper(),index) for index in ['ii', 'jj'] for theta in self._sp_theta_i] - split_param_reverse_arg_list = ["%s1(%s)"%(theta.name[:-2].upper(),index) for index in ['jj', 'ii'] for theta in self._sp_theta_i] - arg_list += split_param_arg_list - reverse_arg_list += split_param_reverse_arg_list - # Extract the right output indices from the inputs. - c_define_output_indices = [' '*16 + "int %s=(int)%s(%s, %i);"%(index, var, index2, self.input_dim-1) for index, var, index2 in zip(['ii', 'jj'], ['X2', 'Z2'], ['i', 'j'])] - precompute_list += c_define_output_indices - reverse_arg_string = ", ".join(reverse_arg_list) - arg_string = ", ".join(arg_list) - precompute_string = "\n".join(precompute_list) - # Here's the code to do the looping for K - self._K_code =\ - """ - // _K_code - // Code for computing the covariance function. - int i; - int j; - int N = target_array->dimensions[0]; - int num_inducing = target_array->dimensions[1]; - int input_dim = X_array->dimensions[1]; - //#pragma omp parallel for private(j) - for (i=0;i 1: + reverse_arg_list+=list(param_arg_list) + # For multiple outputs, also need the split parameters. + split_param_arg_list = ["%s1(%s)"%(theta.name[:-2].upper(),index) for index in ['ii', 'jj'] for theta in self._sp_theta_i] + split_param_reverse_arg_list = ["%s1(%s)"%(theta.name[:-2].upper(),index) for index in ['jj', 'ii'] for theta in self._sp_theta_i] + arg_list += split_param_arg_list + reverse_arg_list += split_param_reverse_arg_list + # Extract the right output indices from the inputs. + c_define_output_indices = [' '*16 + "int %s=(int)%s(%s, %i);"%(index, var, index2, self.input_dim-1) for index, var, index2 in zip(['ii', 'jj'], ['X2', 'Z2'], ['i', 'j'])] + precompute_list += c_define_output_indices + reverse_arg_string = ", ".join(reverse_arg_list) + arg_string = ", ".join(arg_list) + precompute_string = "\n".join(precompute_list) + + # Now we use the arguments in code that computes the separate parts. + + # Any precomputations will be done here eventually. + self._precompute = \ + """ + // Precompute code would go here. It will be called when parameters are updated. + """ + + # Here's the code to do the looping for K + self._K_code =\ + """ + // _K_code + // Code for computing the covariance function. 
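        // --- Editor's note (illustration, not part of the patch): weave exposes
        // each NumPy argument through uppercase macros named after the Python
        // variable, one index per array dimension, e.g.
        //     X2(i, q)       -> X[i, q]       (2-d input array)
        //     TARGET2(i, j)  -> target[i, j]  (2-d output array)
        // which is why the loop below writes TARGET2 and reads X2/Z2 directly.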
+ int i; + int j; + int N = target_array->dimensions[0]; + int num_inducing = target_array->dimensions[1]; + int input_dim = X_array->dimensions[1]; + //#pragma omp parallel for private(j) + for (i=0;idimensions[0]; - int input_dim = X_array->dimensions[1]; - //#pragma omp parallel for - for (i=0;i1: - grad_func_list += c_define_output_indices - grad_func_list += [' '*16 + 'TARGET1(%i+ii) += partial[i*num_inducing+j]*dk_d%s(%s);'%(self.num_shared_params+i*self.output_dim, theta.name, arg_string) for i, theta in enumerate(self._sp_theta_i)] - grad_func_list += [' '*16 + 'TARGET1(%i+jj) += partial[i*num_inducing+j]*dk_d%s(%s);'%(self.num_shared_params+i*self.output_dim, theta.name, reverse_arg_string) for i, theta in enumerate(self._sp_theta_i)] - grad_func_list += ([' '*16 + 'TARGET1(%i) += partial[i*num_inducing+j]*dk_d%s(%s);'%(i,theta.name,arg_string) for i,theta in enumerate(self._sp_theta)]) - grad_func_string = '\n'.join(grad_func_list) - - self._dK_dtheta_code =\ - """ - // _dK_dtheta_code - // Code for computing gradient of covariance with respect to parameters. - int i; - int j; - int N = partial_array->dimensions[0]; - int num_inducing = partial_array->dimensions[1]; - int input_dim = X_array->dimensions[1]; - //#pragma omp parallel for private(j) - for (i=0;idimensions[0]; + int input_dim = X_array->dimensions[1]; + //#pragma omp parallel for + for (i=0;i1: + grad_func_list += c_define_output_indices + grad_func_list += [' '*16 + 'TARGET1(%i+ii) += partial[i*num_inducing+j]*dk_d%s(%s);'%(self.num_shared_params+i*self.output_dim, theta.name, arg_string) for i, theta in enumerate(self._sp_theta_i)] + grad_func_list += [' '*16 + 'TARGET1(%i+jj) += partial[i*num_inducing+j]*dk_d%s(%s);'%(self.num_shared_params+i*self.output_dim, theta.name, reverse_arg_string) for i, theta in enumerate(self._sp_theta_i)] + grad_func_list += ([' '*16 + 'TARGET1(%i) += partial[i*num_inducing+j]*dk_d%s(%s);'%(i,theta.name,arg_string) for i,theta in enumerate(self._sp_theta)]) + grad_func_string = '\n'.join(grad_func_list) + + self._dK_dtheta_code =\ + """ + // _dK_dtheta_code + // Code for computing gradient of covariance with respect to parameters. + int i; + int j; + int N = partial_array->dimensions[0]; + int num_inducing = partial_array->dimensions[1]; + int input_dim = X_array->dimensions[1]; + //#pragma omp parallel for private(j) + for (i=0;idimensions[0]; - int input_dim = X_array->dimensions[1]; - for (i=0;idimensions[0]; + int input_dim = X_array->dimensions[1]; + for (i=0;i1: + gradX_func_list += c_define_output_indices + gradX_func_list += ["TARGET2(i, %i) += partial[i*num_inducing+j]*dk_dx_%i(%s);"%(q,q,arg_string) for q in range(self._real_input_dim)] + gradX_func_string = "\n".join(gradX_func_list) + + self._dK_dX_code = \ + """ + // _dK_dX_code + // Code for computing gradient of covariance with respect to inputs. + int i; + int j; + int N = partial_array->dimensions[0]; + int num_inducing = partial_array->dimensions[1]; + int input_dim = X_array->dimensions[1]; + //#pragma omp parallel for private(j) + for (i=0;i1: - gradX_func_list += c_define_output_indices - gradX_func_list += ["TARGET2(i, %i) += partial[i*num_inducing+j]*dk_dx_%i(%s);"%(q,q,arg_string) for q in range(self._real_input_dim)] - gradX_func_string = "\n".join(gradX_func_list) - - self._dK_dX_code = \ - """ - // _dK_dX_code - // Code for computing gradient of covariance with respect to inputs. 
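        // --- Editor's note (clarification, not part of the patch): these gradient
        // loops all apply the chain rule through the covariance entries, e.g. for a
        // shared parameter theta_p the code above accumulates
        //     dL/dtheta_p = sum_{i,j} partial[i, j] * d k(X_i, Z_j) / d theta_p,
        // and the gradient-wrt-X code accumulates the analogous sum with dk_dx_q.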
- int i; - int j; - int N = partial_array->dimensions[0]; - int num_inducing = partial_array->dimensions[1]; - int input_dim = X_array->dimensions[1]; - //#pragma omp parallel for private(j) - for (i=0;idimensions[0]; - int input_dim = X_array->dimensions[1]; - for (int i=0;idimensions[0]; + int input_dim = X_array->dimensions[1]; + for (int i=0;i1: - for i, split_params in enumerate(self._split_theta_names): - start = self.num_shared_params + i*self.output_dim - end = self.num_shared_params + (i+1)*self.output_dim - setattr(self, split_params, param[start:end]) + # if self.output_dim>1: + # for i, split_params in enumerate(self._split_theta_names): + # start = self.num_shared_params + i*self.output_dim + # end = self.num_shared_params + (i+1)*self.output_dim + # setattr(self, split_params, param[start:end]) - def _get_params(self): - params = np.zeros(0) - for shared_params in self._sp_theta: - params = np.hstack((params, getattr(self, shared_params.name))) - if self.output_dim>1: - for split_params in self._split_theta_names: - params = np.hstack((params, getattr(self, split_params).flatten())) - return params + # def _get_params(self): + # params = np.zeros(0) + # for shared_params in self._sp_theta: + # params = np.hstack((params, getattr(self, shared_params.name))) + # if self.output_dim>1: + # for split_params in self._split_theta_names: + # params = np.hstack((params, getattr(self, split_params).flatten())) + # return params - def _get_param_names(self): - if self.output_dim>1: - return [x.name for x in self._sp_theta] + [x.name[:-2] + str(i) for x in self._sp_theta_i for i in range(self.output_dim)] - else: - return [x.name for x in self._sp_theta] + # def _get_param_names(self): + # if self.output_dim>1: + # return [x.name for x in self._sp_theta] + [x.name[:-2] + str(i) for x in self._sp_theta_i for i in range(self.output_dim)] + # else: + # return [x.name for x in self._sp_theta] From 0082acad6392f98fd1d5c6335a7adb19d7679aca Mon Sep 17 00:00:00 2001 From: James Hensman Date: Wed, 19 Feb 2014 10:51:12 +0000 Subject: [PATCH 03/38] minor edits --- .../latent_function_inference/dtc.py | 83 ++++++++++++++++++- 1 file changed, 82 insertions(+), 1 deletion(-) diff --git a/GPy/inference/latent_function_inference/dtc.py b/GPy/inference/latent_function_inference/dtc.py index dbbff6d0..1a811de6 100644 --- a/GPy/inference/latent_function_inference/dtc.py +++ b/GPy/inference/latent_function_inference/dtc.py @@ -32,7 +32,7 @@ class DTC(object): #make sure the noise is not hetero beta = 1./np.squeeze(likelihood.variance) if beta.size <1: - raise NotImplementedError, "no hetero noise with this implementatino of DTC" + raise NotImplementedError, "no hetero noise with this implementation of DTC" Kmm = kern.K(Z) Knn = kern.Kdiag(X) @@ -89,4 +89,85 @@ class DTC(object): return post, log_marginal, grad_dict +class vDTC(object): + def __init__(self): + self.const_jitter = 1e-6 + + def inference(self, kern, X, X_variance, Z, likelihood, Y): + assert X_variance is None, "cannot use X_variance with DTC. Try varDTC." + + #TODO: MAX! fix this! 
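        # --- Editor's note (assumption): the quantity assembled below looks like the
        # standard variational-DTC lower bound (Titsias-style),
        #     log p(Y) >= log N(Y | 0, Knm Kmm^{-1} Kmn + (1/beta) I)
        #                 - (beta/2) * trace(Knn - Knm Kmm^{-1} Kmn),
        # with the trace correction carried in `trace_term`.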
+ from ...util.misc import param_to_array + Y = param_to_array(Y) + + num_inducing, _ = Z.shape + num_data, output_dim = Y.shape + + #make sure the noise is not hetero + beta = 1./np.squeeze(likelihood.variance) + if beta.size <1: + raise NotImplementedError, "no hetero noise with this implementation of DTC" + + Kmm = kern.K(Z) + Knn = kern.Kdiag(X) + Knm = kern.K(X, Z) + U = Knm + Uy = np.dot(U.T,Y) + + #factor Kmm + Kmmi, L, Li, _ = pdinv(Kmm) + + # Compute A + LiUTbeta = np.dot(Li, U.T)*np.sqrt(beta) + A_ = tdot(LiUTbeta) + trace_term = -0.5*(np.sum(Knn)*beta - np.trace(A_)) + A = A_ + np.eye(num_inducing) + + # factor A + LA = jitchol(A) + + # back substutue to get b, P, v + tmp, _ = dtrtrs(L, Uy, lower=1) + b, _ = dtrtrs(LA, tmp*beta, lower=1) + tmp, _ = dtrtrs(LA, b, lower=1, trans=1) + v, _ = dtrtrs(L, tmp, lower=1, trans=1) + tmp, _ = dtrtrs(LA, Li, lower=1, trans=0) + P = tdot(tmp.T) + + #compute log marginal + log_marginal = -0.5*num_data*output_dim*np.log(2*np.pi) + \ + -np.sum(np.log(np.diag(LA)))*output_dim + \ + 0.5*num_data*output_dim*np.log(beta) + \ + -0.5*beta*np.sum(np.square(Y)) + \ + 0.5*np.sum(np.square(b)) + \ + trace_term + + # Compute dL_dKmm + vvT_P = tdot(v.reshape(-1,1)) + P + LAL = Li.T.dot(A).dot(Li) + dL_dK = Kmmi - 0.5*(vvT_P + LAL) + + # Compute dL_dU + vY = np.dot(v.reshape(-1,1),Y.T) + #dL_dU = vY - np.dot(vvT_P, U.T) + dL_dU = vY - np.dot(vvT_P - Kmmi, U.T) + dL_dU *= beta + + #compute dL_dR + Uv = np.dot(U, v) + dL_dR = 0.5*(np.sum(U*np.dot(U,P), 1) - 1./beta + np.sum(np.square(Y), 1) - 2.*np.sum(Uv*Y, 1) + np.sum(np.square(Uv), 1) )*beta**2 + dL_dR -=beta*trace_term/num_data + + grad_dict = {'dL_dKmm': dL_dK, 'dL_dKdiag':np.zeros_like(Knn) + -0.5*beta, 'dL_dKnm':dL_dU.T} + + #update gradients + kern.update_gradients_sparse(X=X, Z=Z, **grad_dict) + likelihood.update_gradients(dL_dR) + + #construct a posterior object + post = Posterior(woodbury_inv=Kmmi-P, woodbury_vector=v, K=Kmm, mean=None, cov=None, K_chol=L) + + + return post, log_marginal, grad_dict + From 89e216b6a67cf7c8dd0c2e274299239e94d90ebe Mon Sep 17 00:00:00 2001 From: James Hensman Date: Wed, 19 Feb 2014 13:38:36 +0000 Subject: [PATCH 04/38] moved stuff. much breakage. Ow. 
--- GPy/kern/{parts => }/Brownian.py | 0 GPy/kern/{parts => }/Matern32.py | 0 GPy/kern/{parts => }/Matern52.py | 0 GPy/kern/{parts => }/ODE_1.py | 0 GPy/kern/__init__.py | 38 +++++++++++++++----- GPy/kern/__init__old.py | 9 +++++ GPy/kern/{parts => }/bias.py | 0 GPy/kern/{parts => }/coregionalize.py | 0 GPy/kern/{parts => }/eq_ode1.py | 0 GPy/kern/{parts => }/exponential.py | 0 GPy/kern/{parts => }/finite_dimensional.py | 0 GPy/kern/{parts => }/fixed.py | 0 GPy/kern/{parts => }/gibbs.py | 0 GPy/kern/{parts => }/hetero.py | 0 GPy/kern/{parts => }/hierarchical.py | 0 GPy/kern/{parts => }/independent_outputs.py | 0 GPy/kern/{parts => }/kernpart.py | 0 GPy/kern/{parts => }/linear.py | 0 GPy/kern/{parts => }/mlp.py | 0 GPy/kern/{parts => }/odekern1.c | 0 GPy/kern/parts/__init__.py | 29 --------------- GPy/kern/{parts => }/periodic_Matern32.py | 0 GPy/kern/{parts => }/periodic_Matern52.py | 0 GPy/kern/{parts => }/periodic_exponential.py | 0 GPy/kern/{parts => }/poly.py | 0 GPy/kern/{parts => }/prod.py | 0 GPy/kern/{parts => }/prod_orthogonal.py | 0 GPy/kern/{parts => }/rational_quadratic.py | 0 GPy/kern/{parts => }/rbf.py | 0 GPy/kern/{parts => }/rbf_inv.py | 0 GPy/kern/{parts => }/rbfcos.py | 0 GPy/kern/{parts => }/spline.py | 0 GPy/kern/{parts => }/ss_rbf.py | 0 GPy/kern/{parts => }/symmetric.py | 0 GPy/kern/{parts => }/sympy_helpers.cpp | 0 GPy/kern/{parts => }/sympy_helpers.h | 0 GPy/kern/{parts => }/sympykern.py | 0 GPy/kern/{parts => }/white.py | 0 38 files changed, 38 insertions(+), 38 deletions(-) rename GPy/kern/{parts => }/Brownian.py (100%) rename GPy/kern/{parts => }/Matern32.py (100%) rename GPy/kern/{parts => }/Matern52.py (100%) rename GPy/kern/{parts => }/ODE_1.py (100%) create mode 100644 GPy/kern/__init__old.py rename GPy/kern/{parts => }/bias.py (100%) rename GPy/kern/{parts => }/coregionalize.py (100%) rename GPy/kern/{parts => }/eq_ode1.py (100%) rename GPy/kern/{parts => }/exponential.py (100%) rename GPy/kern/{parts => }/finite_dimensional.py (100%) rename GPy/kern/{parts => }/fixed.py (100%) rename GPy/kern/{parts => }/gibbs.py (100%) rename GPy/kern/{parts => }/hetero.py (100%) rename GPy/kern/{parts => }/hierarchical.py (100%) rename GPy/kern/{parts => }/independent_outputs.py (100%) rename GPy/kern/{parts => }/kernpart.py (100%) rename GPy/kern/{parts => }/linear.py (100%) rename GPy/kern/{parts => }/mlp.py (100%) rename GPy/kern/{parts => }/odekern1.c (100%) delete mode 100644 GPy/kern/parts/__init__.py rename GPy/kern/{parts => }/periodic_Matern32.py (100%) rename GPy/kern/{parts => }/periodic_Matern52.py (100%) rename GPy/kern/{parts => }/periodic_exponential.py (100%) rename GPy/kern/{parts => }/poly.py (100%) rename GPy/kern/{parts => }/prod.py (100%) rename GPy/kern/{parts => }/prod_orthogonal.py (100%) rename GPy/kern/{parts => }/rational_quadratic.py (100%) rename GPy/kern/{parts => }/rbf.py (100%) rename GPy/kern/{parts => }/rbf_inv.py (100%) rename GPy/kern/{parts => }/rbfcos.py (100%) rename GPy/kern/{parts => }/spline.py (100%) rename GPy/kern/{parts => }/ss_rbf.py (100%) rename GPy/kern/{parts => }/symmetric.py (100%) rename GPy/kern/{parts => }/sympy_helpers.cpp (100%) rename GPy/kern/{parts => }/sympy_helpers.h (100%) rename GPy/kern/{parts => }/sympykern.py (100%) rename GPy/kern/{parts => }/white.py (100%) diff --git a/GPy/kern/parts/Brownian.py b/GPy/kern/Brownian.py similarity index 100% rename from GPy/kern/parts/Brownian.py rename to GPy/kern/Brownian.py diff --git a/GPy/kern/parts/Matern32.py b/GPy/kern/Matern32.py similarity index 100% rename 
from GPy/kern/parts/Matern32.py rename to GPy/kern/Matern32.py diff --git a/GPy/kern/parts/Matern52.py b/GPy/kern/Matern52.py similarity index 100% rename from GPy/kern/parts/Matern52.py rename to GPy/kern/Matern52.py diff --git a/GPy/kern/parts/ODE_1.py b/GPy/kern/ODE_1.py similarity index 100% rename from GPy/kern/parts/ODE_1.py rename to GPy/kern/ODE_1.py diff --git a/GPy/kern/__init__.py b/GPy/kern/__init__.py index eb4076c3..0a758f1e 100644 --- a/GPy/kern/__init__.py +++ b/GPy/kern/__init__.py @@ -1,9 +1,29 @@ -# Copyright (c) 2012, 2013 GPy authors (see AUTHORS.txt). -# Licensed under the BSD 3-clause license (see LICENSE.txt) - -from constructors import * -try: - from constructors import rbf_sympy, sympykern # these depend on sympy -except: - pass -from kern import * +import bias +import Brownian +import coregionalize +import exponential +import eq_ode1 +import finite_dimensional +import fixed +import gibbs +import hetero +import hierarchical +import independent_outputs +import linear +import Matern32 +import Matern52 +import mlp +import ODE_1 +import periodic_exponential +import periodic_Matern32 +import periodic_Matern52 +import poly +import prod_orthogonal +import prod +import rational_quadratic +import rbfcos +import rbf +import rbf_inv +import spline +import symmetric +import white diff --git a/GPy/kern/__init__old.py b/GPy/kern/__init__old.py new file mode 100644 index 00000000..eb4076c3 --- /dev/null +++ b/GPy/kern/__init__old.py @@ -0,0 +1,9 @@ +# Copyright (c) 2012, 2013 GPy authors (see AUTHORS.txt). +# Licensed under the BSD 3-clause license (see LICENSE.txt) + +from constructors import * +try: + from constructors import rbf_sympy, sympykern # these depend on sympy +except: + pass +from kern import * diff --git a/GPy/kern/parts/bias.py b/GPy/kern/bias.py similarity index 100% rename from GPy/kern/parts/bias.py rename to GPy/kern/bias.py diff --git a/GPy/kern/parts/coregionalize.py b/GPy/kern/coregionalize.py similarity index 100% rename from GPy/kern/parts/coregionalize.py rename to GPy/kern/coregionalize.py diff --git a/GPy/kern/parts/eq_ode1.py b/GPy/kern/eq_ode1.py similarity index 100% rename from GPy/kern/parts/eq_ode1.py rename to GPy/kern/eq_ode1.py diff --git a/GPy/kern/parts/exponential.py b/GPy/kern/exponential.py similarity index 100% rename from GPy/kern/parts/exponential.py rename to GPy/kern/exponential.py diff --git a/GPy/kern/parts/finite_dimensional.py b/GPy/kern/finite_dimensional.py similarity index 100% rename from GPy/kern/parts/finite_dimensional.py rename to GPy/kern/finite_dimensional.py diff --git a/GPy/kern/parts/fixed.py b/GPy/kern/fixed.py similarity index 100% rename from GPy/kern/parts/fixed.py rename to GPy/kern/fixed.py diff --git a/GPy/kern/parts/gibbs.py b/GPy/kern/gibbs.py similarity index 100% rename from GPy/kern/parts/gibbs.py rename to GPy/kern/gibbs.py diff --git a/GPy/kern/parts/hetero.py b/GPy/kern/hetero.py similarity index 100% rename from GPy/kern/parts/hetero.py rename to GPy/kern/hetero.py diff --git a/GPy/kern/parts/hierarchical.py b/GPy/kern/hierarchical.py similarity index 100% rename from GPy/kern/parts/hierarchical.py rename to GPy/kern/hierarchical.py diff --git a/GPy/kern/parts/independent_outputs.py b/GPy/kern/independent_outputs.py similarity index 100% rename from GPy/kern/parts/independent_outputs.py rename to GPy/kern/independent_outputs.py diff --git a/GPy/kern/parts/kernpart.py b/GPy/kern/kernpart.py similarity index 100% rename from GPy/kern/parts/kernpart.py rename to GPy/kern/kernpart.py diff --git 
a/GPy/kern/parts/linear.py b/GPy/kern/linear.py similarity index 100% rename from GPy/kern/parts/linear.py rename to GPy/kern/linear.py diff --git a/GPy/kern/parts/mlp.py b/GPy/kern/mlp.py similarity index 100% rename from GPy/kern/parts/mlp.py rename to GPy/kern/mlp.py diff --git a/GPy/kern/parts/odekern1.c b/GPy/kern/odekern1.c similarity index 100% rename from GPy/kern/parts/odekern1.c rename to GPy/kern/odekern1.c diff --git a/GPy/kern/parts/__init__.py b/GPy/kern/parts/__init__.py deleted file mode 100644 index 0a758f1e..00000000 --- a/GPy/kern/parts/__init__.py +++ /dev/null @@ -1,29 +0,0 @@ -import bias -import Brownian -import coregionalize -import exponential -import eq_ode1 -import finite_dimensional -import fixed -import gibbs -import hetero -import hierarchical -import independent_outputs -import linear -import Matern32 -import Matern52 -import mlp -import ODE_1 -import periodic_exponential -import periodic_Matern32 -import periodic_Matern52 -import poly -import prod_orthogonal -import prod -import rational_quadratic -import rbfcos -import rbf -import rbf_inv -import spline -import symmetric -import white diff --git a/GPy/kern/parts/periodic_Matern32.py b/GPy/kern/periodic_Matern32.py similarity index 100% rename from GPy/kern/parts/periodic_Matern32.py rename to GPy/kern/periodic_Matern32.py diff --git a/GPy/kern/parts/periodic_Matern52.py b/GPy/kern/periodic_Matern52.py similarity index 100% rename from GPy/kern/parts/periodic_Matern52.py rename to GPy/kern/periodic_Matern52.py diff --git a/GPy/kern/parts/periodic_exponential.py b/GPy/kern/periodic_exponential.py similarity index 100% rename from GPy/kern/parts/periodic_exponential.py rename to GPy/kern/periodic_exponential.py diff --git a/GPy/kern/parts/poly.py b/GPy/kern/poly.py similarity index 100% rename from GPy/kern/parts/poly.py rename to GPy/kern/poly.py diff --git a/GPy/kern/parts/prod.py b/GPy/kern/prod.py similarity index 100% rename from GPy/kern/parts/prod.py rename to GPy/kern/prod.py diff --git a/GPy/kern/parts/prod_orthogonal.py b/GPy/kern/prod_orthogonal.py similarity index 100% rename from GPy/kern/parts/prod_orthogonal.py rename to GPy/kern/prod_orthogonal.py diff --git a/GPy/kern/parts/rational_quadratic.py b/GPy/kern/rational_quadratic.py similarity index 100% rename from GPy/kern/parts/rational_quadratic.py rename to GPy/kern/rational_quadratic.py diff --git a/GPy/kern/parts/rbf.py b/GPy/kern/rbf.py similarity index 100% rename from GPy/kern/parts/rbf.py rename to GPy/kern/rbf.py diff --git a/GPy/kern/parts/rbf_inv.py b/GPy/kern/rbf_inv.py similarity index 100% rename from GPy/kern/parts/rbf_inv.py rename to GPy/kern/rbf_inv.py diff --git a/GPy/kern/parts/rbfcos.py b/GPy/kern/rbfcos.py similarity index 100% rename from GPy/kern/parts/rbfcos.py rename to GPy/kern/rbfcos.py diff --git a/GPy/kern/parts/spline.py b/GPy/kern/spline.py similarity index 100% rename from GPy/kern/parts/spline.py rename to GPy/kern/spline.py diff --git a/GPy/kern/parts/ss_rbf.py b/GPy/kern/ss_rbf.py similarity index 100% rename from GPy/kern/parts/ss_rbf.py rename to GPy/kern/ss_rbf.py diff --git a/GPy/kern/parts/symmetric.py b/GPy/kern/symmetric.py similarity index 100% rename from GPy/kern/parts/symmetric.py rename to GPy/kern/symmetric.py diff --git a/GPy/kern/parts/sympy_helpers.cpp b/GPy/kern/sympy_helpers.cpp similarity index 100% rename from GPy/kern/parts/sympy_helpers.cpp rename to GPy/kern/sympy_helpers.cpp diff --git a/GPy/kern/parts/sympy_helpers.h b/GPy/kern/sympy_helpers.h similarity index 100% rename from 
GPy/kern/parts/sympy_helpers.h rename to GPy/kern/sympy_helpers.h diff --git a/GPy/kern/parts/sympykern.py b/GPy/kern/sympykern.py similarity index 100% rename from GPy/kern/parts/sympykern.py rename to GPy/kern/sympykern.py diff --git a/GPy/kern/parts/white.py b/GPy/kern/white.py similarity index 100% rename from GPy/kern/parts/white.py rename to GPy/kern/white.py From 20f02a80b420696c131222e0fbf44046dcd2c3ab Mon Sep 17 00:00:00 2001 From: James Hensman Date: Wed, 19 Feb 2014 15:00:48 +0000 Subject: [PATCH 05/38] rbf and white seem to work --- GPy/core/gp.py | 2 +- GPy/kern/__init__.py | 61 +- GPy/kern/__init__old.py | 9 - GPy/kern/{ => _src}/Brownian.py | 0 GPy/kern/{ => _src}/Matern32.py | 0 GPy/kern/{ => _src}/Matern52.py | 0 GPy/kern/{ => _src}/ODE_1.py | 0 GPy/kern/_src/add.py | 264 ++++++++ GPy/kern/{ => _src}/bias.py | 0 GPy/kern/{ => _src}/constructors.py | 0 GPy/kern/{ => _src}/coregionalize.py | 0 GPy/kern/{ => _src}/eq_ode1.py | 0 GPy/kern/{ => _src}/exponential.py | 0 GPy/kern/{ => _src}/finite_dimensional.py | 0 GPy/kern/{ => _src}/fixed.py | 0 GPy/kern/{ => _src}/gibbs.py | 0 GPy/kern/{ => _src}/hetero.py | 0 GPy/kern/{ => _src}/hierarchical.py | 0 GPy/kern/{ => _src}/independent_outputs.py | 0 GPy/kern/_src/kern.py | 336 ++++++++++ GPy/kern/_src/kernpart.py | 60 ++ GPy/kern/{ => _src}/linear.py | 12 +- GPy/kern/{ => _src}/mlp.py | 0 GPy/kern/{ => _src}/odekern1.c | 0 GPy/kern/{ => _src}/periodic_Matern32.py | 0 GPy/kern/{ => _src}/periodic_Matern52.py | 0 GPy/kern/{ => _src}/periodic_exponential.py | 0 GPy/kern/{ => _src}/poly.py | 0 GPy/kern/{ => _src}/prod.py | 6 +- GPy/kern/{ => _src}/prod_orthogonal.py | 0 GPy/kern/{ => _src}/rational_quadratic.py | 0 GPy/kern/{ => _src}/rbf.py | 49 +- GPy/kern/{ => _src}/rbf_inv.py | 0 GPy/kern/{ => _src}/rbfcos.py | 0 GPy/kern/{ => _src}/spline.py | 0 GPy/kern/{ => _src}/ss_rbf.py | 0 GPy/kern/{ => _src}/symmetric.py | 0 GPy/kern/{ => _src}/sympy_helpers.cpp | 0 GPy/kern/{ => _src}/sympy_helpers.h | 0 GPy/kern/{ => _src}/sympykern.py | 0 GPy/kern/{ => _src}/white.py | 28 +- GPy/kern/kern.py | 680 -------------------- GPy/kern/kernpart.py | 176 ----- GPy/models/mrd.py | 6 +- GPy/plotting/matplot_dep/kernel_plots.py | 2 +- 45 files changed, 737 insertions(+), 954 deletions(-) delete mode 100644 GPy/kern/__init__old.py rename GPy/kern/{ => _src}/Brownian.py (100%) rename GPy/kern/{ => _src}/Matern32.py (100%) rename GPy/kern/{ => _src}/Matern52.py (100%) rename GPy/kern/{ => _src}/ODE_1.py (100%) create mode 100644 GPy/kern/_src/add.py rename GPy/kern/{ => _src}/bias.py (100%) rename GPy/kern/{ => _src}/constructors.py (100%) rename GPy/kern/{ => _src}/coregionalize.py (100%) rename GPy/kern/{ => _src}/eq_ode1.py (100%) rename GPy/kern/{ => _src}/exponential.py (100%) rename GPy/kern/{ => _src}/finite_dimensional.py (100%) rename GPy/kern/{ => _src}/fixed.py (100%) rename GPy/kern/{ => _src}/gibbs.py (100%) rename GPy/kern/{ => _src}/hetero.py (100%) rename GPy/kern/{ => _src}/hierarchical.py (100%) rename GPy/kern/{ => _src}/independent_outputs.py (100%) create mode 100644 GPy/kern/_src/kern.py create mode 100644 GPy/kern/_src/kernpart.py rename GPy/kern/{ => _src}/linear.py (98%) rename GPy/kern/{ => _src}/mlp.py (100%) rename GPy/kern/{ => _src}/odekern1.c (100%) rename GPy/kern/{ => _src}/periodic_Matern32.py (100%) rename GPy/kern/{ => _src}/periodic_Matern52.py (100%) rename GPy/kern/{ => _src}/periodic_exponential.py (100%) rename GPy/kern/{ => _src}/poly.py (100%) rename GPy/kern/{ => _src}/prod.py (98%) rename GPy/kern/{ => 
_src}/prod_orthogonal.py (100%) rename GPy/kern/{ => _src}/rational_quadratic.py (100%) rename GPy/kern/{ => _src}/rbf.py (92%) rename GPy/kern/{ => _src}/rbf_inv.py (100%) rename GPy/kern/{ => _src}/rbfcos.py (100%) rename GPy/kern/{ => _src}/spline.py (100%) rename GPy/kern/{ => _src}/ss_rbf.py (100%) rename GPy/kern/{ => _src}/symmetric.py (100%) rename GPy/kern/{ => _src}/sympy_helpers.cpp (100%) rename GPy/kern/{ => _src}/sympy_helpers.h (100%) rename GPy/kern/{ => _src}/sympykern.py (100%) rename GPy/kern/{ => _src}/white.py (77%) delete mode 100644 GPy/kern/kern.py delete mode 100644 GPy/kern/kernpart.py diff --git a/GPy/core/gp.py b/GPy/core/gp.py index d769678e..10ba8e6b 100644 --- a/GPy/core/gp.py +++ b/GPy/core/gp.py @@ -43,7 +43,7 @@ class GP(Model): else: self.Y_metadata = None - assert isinstance(kernel, kern.kern) + assert isinstance(kernel, kern.Kern) self.kern = kernel assert isinstance(likelihood, likelihoods.Likelihood) diff --git a/GPy/kern/__init__.py b/GPy/kern/__init__.py index 0a758f1e..2098bd76 100644 --- a/GPy/kern/__init__.py +++ b/GPy/kern/__init__.py @@ -1,29 +1,32 @@ -import bias -import Brownian -import coregionalize -import exponential -import eq_ode1 -import finite_dimensional -import fixed -import gibbs -import hetero -import hierarchical -import independent_outputs -import linear -import Matern32 -import Matern52 -import mlp -import ODE_1 -import periodic_exponential -import periodic_Matern32 -import periodic_Matern52 -import poly -import prod_orthogonal -import prod -import rational_quadratic -import rbfcos -import rbf -import rbf_inv -import spline -import symmetric -import white +from rbf import RBF +from white import White +from kern import Kern +#import bias +#import Brownian +#import coregionalize +#import exponential +#import eq_ode1 +#import finite_dimensional +#import fixed +#import gibbs +#import hetero +#import hierarchical +#import independent_outputs +#import linear +#import Matern32 +#import Matern52 +#import mlp +#import ODE_1 +#import periodic_exponential +#import periodic_Matern32 +#import periodic_Matern52 +#import poly +#import prod_orthogonal +#import prod +#import rational_quadratic +#import rbfcos +#import rbf +#import rbf_inv +#import spline +#import symmetric +#import white diff --git a/GPy/kern/__init__old.py b/GPy/kern/__init__old.py deleted file mode 100644 index eb4076c3..00000000 --- a/GPy/kern/__init__old.py +++ /dev/null @@ -1,9 +0,0 @@ -# Copyright (c) 2012, 2013 GPy authors (see AUTHORS.txt). -# Licensed under the BSD 3-clause license (see LICENSE.txt) - -from constructors import * -try: - from constructors import rbf_sympy, sympykern # these depend on sympy -except: - pass -from kern import * diff --git a/GPy/kern/Brownian.py b/GPy/kern/_src/Brownian.py similarity index 100% rename from GPy/kern/Brownian.py rename to GPy/kern/_src/Brownian.py diff --git a/GPy/kern/Matern32.py b/GPy/kern/_src/Matern32.py similarity index 100% rename from GPy/kern/Matern32.py rename to GPy/kern/_src/Matern32.py diff --git a/GPy/kern/Matern52.py b/GPy/kern/_src/Matern52.py similarity index 100% rename from GPy/kern/Matern52.py rename to GPy/kern/_src/Matern52.py diff --git a/GPy/kern/ODE_1.py b/GPy/kern/_src/ODE_1.py similarity index 100% rename from GPy/kern/ODE_1.py rename to GPy/kern/_src/ODE_1.py diff --git a/GPy/kern/_src/add.py b/GPy/kern/_src/add.py new file mode 100644 index 00000000..8d916941 --- /dev/null +++ b/GPy/kern/_src/add.py @@ -0,0 +1,264 @@ +# Copyright (c) 2012, GPy authors (see AUTHORS.txt). 
+# Licensed under the BSD 3-clause license (see LICENSE.txt) + +import sys +import numpy as np +import itertools +from linear import Linear +from ..core.parameterization import Parameterized +from GPy.core.parameterization.param import Param +from kern import Kern + +class Add(Kern): + def __init__(self, subkerns, tensor): + assert all([isinstance(k, Kern) for k in subkerns]) + if tensor: + input_dim = sum([k.input_dim for k in subkerns]) + self.input_slices = [] + n = 0 + for k in subkerns: + self.input_slices.append(slice(n, n+k.input_dim)) + n += k.input_dim + else: + assert all([k.input_dim == subkerns[0].input_dim for k in subkerns]) + input_dim = subkerns[0].input_dim + self.input_slices = [slice(None) for k in subkerns] + super(Add, self).__init__(input_dim, 'add') + self.add_parameters(*subkerns) + + + def K(self, X, X2=None, which_parts='all'): + """ + Compute the kernel function. + + :param X: the first set of inputs to the kernel + :param X2: (optional) the second set of arguments to the kernel. If X2 + is None, this is passed throgh to the 'part' object, which + handles this as X2 == X. + :param which_parts: a list of booleans detailing whether to include + each of the part functions. By default, 'all' + indicates all parts + """ + if which_parts == 'all': + which_parts = [True] * self.size + assert X.shape[1] == self.input_dim + if X2 is None: + target = np.zeros((X.shape[0], X.shape[0])) + [p.K(X[:, i_s], None, target=target) for p, i_s, part_i_used in zip(self._parameters_, self.input_slices, which_parts) if part_i_used] + else: + target = np.zeros((X.shape[0], X2.shape[0])) + [p.K(X[:, i_s], X2[:, i_s], target=target) for p, i_s, part_i_used in zip(self._parameters_, self.input_slices, which_parts) if part_i_used] + return target + + def update_gradients_full(self, dL_dK, X): + [p.update_gradients_full(dL_dK, X) for p in self._parameters_] + + def update_gradients_sparse(self, dL_dKmm, dL_dKnm, dL_dKdiag, X, Z): + [p.update_gradients_sparse(dL_dKmm, dL_dKnm, dL_dKdiag, X, Z) for p in self._parameters_] + + def update_gradients_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z): + [p.update_gradients_variational(dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z) for p in self._parameters_] + + def _param_grad_helper(self, dL_dK, X, X2=None): + """ + Compute the gradient of the covariance function with respect to the parameters. + + :param dL_dK: An array of gradients of the objective function with respect to the covariance function. + :type dL_dK: Np.ndarray (num_samples x num_inducing) + :param X: Observed data inputs + :type X: np.ndarray (num_samples x input_dim) + :param X2: Observed data inputs (optional, defaults to X) + :type X2: np.ndarray (num_inducing x input_dim) + + returns: dL_dtheta + """ + assert X.shape[1] == self.input_dim + target = np.zeros(self.size) + if X2 is None: + [p._param_grad_helper(dL_dK, X[:, i_s], None, target[ps]) for p, i_s, ps, in zip(self._parameters_, self.input_slices, self._param_slices_)] + else: + [p._param_grad_helper(dL_dK, X[:, i_s], X2[:, i_s], target[ps]) for p, i_s, ps, in zip(self._parameters_, self.input_slices, self._param_slices_)] + + return self._transform_gradients(target) + + def gradients_X(self, dL_dK, X, X2=None): + """Compute the gradient of the objective function with respect to X. + + :param dL_dK: An array of gradients of the objective function with respect to the covariance function. 
+ :type dL_dK: np.ndarray (num_samples x num_inducing) + :param X: Observed data inputs + :type X: np.ndarray (num_samples x input_dim) + :param X2: Observed data inputs (optional, defaults to X) + :type X2: np.ndarray (num_inducing x input_dim)""" + + target = np.zeros_like(X) + if X2 is None: + [p.gradients_X(dL_dK, X[:, i_s], None, target[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)] + else: + [p.gradients_X(dL_dK, X[:, i_s], X2[:, i_s], target[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)] + return target + + def Kdiag(self, X, which_parts='all'): + """Compute the diagonal of the covariance function for inputs X.""" + if which_parts == 'all': + which_parts = [True] * self.size + assert X.shape[1] == self.input_dim + target = np.zeros(X.shape[0]) + [p.Kdiag(X[:, i_s], target=target) for p, i_s, part_on in zip(self._parameters_, self.input_slices, which_parts) if part_on] + return target + + def dKdiag_dtheta(self, dL_dKdiag, X): + """Compute the gradient of the diagonal of the covariance function with respect to the parameters.""" + assert X.shape[1] == self.input_dim + assert dL_dKdiag.size == X.shape[0] + target = np.zeros(self.size) + [p.dKdiag_dtheta(dL_dKdiag, X[:, i_s], target[ps]) for p, i_s, ps in zip(self._parameters_, self.input_slices, self._param_slices_)] + return self._transform_gradients(target) + + def dKdiag_dX(self, dL_dKdiag, X): + assert X.shape[1] == self.input_dim + target = np.zeros_like(X) + [p.dKdiag_dX(dL_dKdiag, X[:, i_s], target[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)] + return target + + def psi0(self, Z, mu, S): + target = np.zeros(mu.shape[0]) + [p.psi0(Z[:, i_s], mu[:, i_s], S[:, i_s], target) for p, i_s in zip(self._parameters_, self.input_slices)] + return target + + def dpsi0_dtheta(self, dL_dpsi0, Z, mu, S): + target = np.zeros(self.size) + [p.dpsi0_dtheta(dL_dpsi0, Z[:, i_s], mu[:, i_s], S[:, i_s], target[ps]) for p, ps, i_s in zip(self._parameters_, self._param_slices_, self.input_slices)] + return self._transform_gradients(target) + + def dpsi0_dmuS(self, dL_dpsi0, Z, mu, S): + target_mu, target_S = np.zeros_like(mu), np.zeros_like(S) + [p.dpsi0_dmuS(dL_dpsi0, Z[:, i_s], mu[:, i_s], S[:, i_s], target_mu[:, i_s], target_S[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)] + return target_mu, target_S + + def psi1(self, Z, mu, S): + target = np.zeros((mu.shape[0], Z.shape[0])) + [p.psi1(Z[:, i_s], mu[:, i_s], S[:, i_s], target) for p, i_s in zip(self._parameters_, self.input_slices)] + return target + + def dpsi1_dtheta(self, dL_dpsi1, Z, mu, S): + target = np.zeros((self.size)) + [p.dpsi1_dtheta(dL_dpsi1, Z[:, i_s], mu[:, i_s], S[:, i_s], target[ps]) for p, ps, i_s in zip(self._parameters_, self._param_slices_, self.input_slices)] + return self._transform_gradients(target) + + def dpsi1_dZ(self, dL_dpsi1, Z, mu, S): + target = np.zeros_like(Z) + [p.dpsi1_dZ(dL_dpsi1, Z[:, i_s], mu[:, i_s], S[:, i_s], target[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)] + return target + + def dpsi1_dmuS(self, dL_dpsi1, Z, mu, S): + """return shapes are num_samples,num_inducing,input_dim""" + target_mu, target_S = np.zeros((2, mu.shape[0], mu.shape[1])) + [p.dpsi1_dmuS(dL_dpsi1, Z[:, i_s], mu[:, i_s], S[:, i_s], target_mu[:, i_s], target_S[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)] + return target_mu, target_S + + def psi2(self, Z, mu, S): + """ + Computer the psi2 statistics for the covariance function. 
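        (Editor's note, an assumption based on the standard GP-LVM psi-statistics
        convention: psi0[n] = E_{q(x_n)}[k(x_n, x_n)], psi1[n, m] = E_{q(x_n)}[k(x_n, Z_m)]
        and psi2[n, m, m'] = E_{q(x_n)}[k(x_n, Z_m) k(x_n, Z_m')], where
        q(x_n) = N(mu_n, diag(S_n)).)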
+ + :param Z: np.ndarray of inducing inputs (num_inducing x input_dim) + :param mu, S: np.ndarrays of means and variances (each num_samples x input_dim) + :returns psi2: np.ndarray (num_samples,num_inducing,num_inducing) + + """ + target = np.zeros((mu.shape[0], Z.shape[0], Z.shape[0])) + [p.psi2(Z[:, i_s], mu[:, i_s], S[:, i_s], target) for p, i_s in zip(self._parameters_, self.input_slices)] + + # compute the "cross" terms + # TODO: input_slices needed + crossterms = 0 + + for [p1, i_s1], [p2, i_s2] in itertools.combinations(zip(self._parameters_, self.input_slices), 2): + if i_s1 == i_s2: + # TODO psi1 this must be faster/better/precached/more nice + tmp1 = np.zeros((mu.shape[0], Z.shape[0])) + p1.psi1(Z[:, i_s1], mu[:, i_s1], S[:, i_s1], tmp1) + tmp2 = np.zeros((mu.shape[0], Z.shape[0])) + p2.psi1(Z[:, i_s2], mu[:, i_s2], S[:, i_s2], tmp2) + + prod = np.multiply(tmp1, tmp2) + crossterms += prod[:, :, None] + prod[:, None, :] + + target += crossterms + return target + + def dpsi2_dtheta(self, dL_dpsi2, Z, mu, S): + """Gradient of the psi2 statistics with respect to the parameters.""" + target = np.zeros(self.size) + [p.dpsi2_dtheta(dL_dpsi2, Z[:, i_s], mu[:, i_s], S[:, i_s], target[ps]) for p, i_s, ps in zip(self._parameters_, self.input_slices, self._param_slices_)] + + # compute the "cross" terms + # TODO: better looping, input_slices + for i1, i2 in itertools.permutations(range(len(self._parameters_)), 2): + p1, p2 = self._parameters_[i1], self._parameters_[i2] +# ipsl1, ipsl2 = self.input_slices[i1], self.input_slices[i2] + ps1, ps2 = self._param_slices_[i1], self._param_slices_[i2] + + tmp = np.zeros((mu.shape[0], Z.shape[0])) + p1.psi1(Z, mu, S, tmp) + p2.dpsi1_dtheta((tmp[:, None, :] * dL_dpsi2).sum(1) * 2., Z, mu, S, target[ps2]) + + return self._transform_gradients(target) + + def dpsi2_dZ(self, dL_dpsi2, Z, mu, S): + target = np.zeros_like(Z) + [p.dpsi2_dZ(dL_dpsi2, Z[:, i_s], mu[:, i_s], S[:, i_s], target[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)] + # target *= 2 + + # compute the "cross" terms + # TODO: we need input_slices here. + for p1, p2 in itertools.permutations(self._parameters_, 2): +# if p1.name == 'linear' and p2.name == 'linear': +# raise NotImplementedError("We don't handle linear/linear cross-terms") + tmp = np.zeros((mu.shape[0], Z.shape[0])) + p1.psi1(Z, mu, S, tmp) + p2.dpsi1_dZ((tmp[:, None, :] * dL_dpsi2).sum(1), Z, mu, S, target) + + return target * 2 + + def dpsi2_dmuS(self, dL_dpsi2, Z, mu, S): + target_mu, target_S = np.zeros((2, mu.shape[0], mu.shape[1])) + [p.dpsi2_dmuS(dL_dpsi2, Z[:, i_s], mu[:, i_s], S[:, i_s], target_mu[:, i_s], target_S[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)] + + # compute the "cross" terms + # TODO: we need input_slices here. + for p1, p2 in itertools.permutations(self._parameters_, 2): +# if p1.name == 'linear' and p2.name == 'linear': +# raise NotImplementedError("We don't handle linear/linear cross-terms") + tmp = np.zeros((mu.shape[0], Z.shape[0])) + p1.psi1(Z, mu, S, tmp) + p2.dpsi1_dmuS((tmp[:, None, :] * dL_dpsi2).sum(1) * 2., Z, mu, S, target_mu, target_S) + + return target_mu, target_S + + def plot(self, *args, **kwargs): + """ + See GPy.plotting.matplot_dep.plot + """ + assert "matplotlib" in sys.modules, "matplotlib package has not been imported." 
+ from ..plotting.matplot_dep import kernel_plots + kernel_plots.plot(self,*args) + + def _getstate(self): + """ + Get the current state of the class, + here just all the indices, rest can get recomputed + """ + return Parameterized._getstate(self) + [#self._parameters_, + self.input_dim, + self.input_slices, + self._param_slices_ + ] + + def _setstate(self, state): + self._param_slices_ = state.pop() + self.input_slices = state.pop() + self.input_dim = state.pop() + Parameterized._setstate(self, state) + + diff --git a/GPy/kern/bias.py b/GPy/kern/_src/bias.py similarity index 100% rename from GPy/kern/bias.py rename to GPy/kern/_src/bias.py diff --git a/GPy/kern/constructors.py b/GPy/kern/_src/constructors.py similarity index 100% rename from GPy/kern/constructors.py rename to GPy/kern/_src/constructors.py diff --git a/GPy/kern/coregionalize.py b/GPy/kern/_src/coregionalize.py similarity index 100% rename from GPy/kern/coregionalize.py rename to GPy/kern/_src/coregionalize.py diff --git a/GPy/kern/eq_ode1.py b/GPy/kern/_src/eq_ode1.py similarity index 100% rename from GPy/kern/eq_ode1.py rename to GPy/kern/_src/eq_ode1.py diff --git a/GPy/kern/exponential.py b/GPy/kern/_src/exponential.py similarity index 100% rename from GPy/kern/exponential.py rename to GPy/kern/_src/exponential.py diff --git a/GPy/kern/finite_dimensional.py b/GPy/kern/_src/finite_dimensional.py similarity index 100% rename from GPy/kern/finite_dimensional.py rename to GPy/kern/_src/finite_dimensional.py diff --git a/GPy/kern/fixed.py b/GPy/kern/_src/fixed.py similarity index 100% rename from GPy/kern/fixed.py rename to GPy/kern/_src/fixed.py diff --git a/GPy/kern/gibbs.py b/GPy/kern/_src/gibbs.py similarity index 100% rename from GPy/kern/gibbs.py rename to GPy/kern/_src/gibbs.py diff --git a/GPy/kern/hetero.py b/GPy/kern/_src/hetero.py similarity index 100% rename from GPy/kern/hetero.py rename to GPy/kern/_src/hetero.py diff --git a/GPy/kern/hierarchical.py b/GPy/kern/_src/hierarchical.py similarity index 100% rename from GPy/kern/hierarchical.py rename to GPy/kern/_src/hierarchical.py diff --git a/GPy/kern/independent_outputs.py b/GPy/kern/_src/independent_outputs.py similarity index 100% rename from GPy/kern/independent_outputs.py rename to GPy/kern/_src/independent_outputs.py diff --git a/GPy/kern/_src/kern.py b/GPy/kern/_src/kern.py new file mode 100644 index 00000000..af362498 --- /dev/null +++ b/GPy/kern/_src/kern.py @@ -0,0 +1,336 @@ +# Copyright (c) 2012, GPy authors (see AUTHORS.txt). +# Licensed under the BSD 3-clause license (see LICENSE.txt) + +import sys +import numpy as np +import itertools +from ..core.parameterization import Parameterized +from GPy.core.parameterization.param import Param + + +class Kern(Parameterized): + def __init__(self,input_dim,name): + """ + The base class for a kernel: a positive definite function + which forms of a covariance function (kernel). + + :param input_dim: the number of input dimensions to the function + :type input_dim: int + + Do not instantiate. + """ + super(Kern, self).__init__(name) + self.input_dim = input_dim + + def K(self,X,X2,target): + raise NotImplementedError + def Kdiag(self,X,target): + raise NotImplementedError + def _param_grad_helper(self,dL_dK,X,X2,target): + raise NotImplementedError + def dKdiag_dtheta(self,dL_dKdiag,X,target): # TODO: Max?? + # In the base case compute this by calling _param_grad_helper. Need to + # override for stationary covariances (for example) to save + # time. 
+ for i in range(X.shape[0]): + self._param_grad_helper(dL_dKdiag[i], X[i, :][None, :], X2=None, target=target) + def psi0(self,Z,mu,S,target): + raise NotImplementedError + def dpsi0_dtheta(self,dL_dpsi0,Z,mu,S,target): + raise NotImplementedError + def dpsi0_dmuS(self,dL_dpsi0,Z,mu,S,target_mu,target_S): + raise NotImplementedError + def psi1(self,Z,mu,S,target): + raise NotImplementedError + def dpsi1_dtheta(self,Z,mu,S,target): + raise NotImplementedError + def dpsi1_dZ(self,dL_dpsi1,Z,mu,S,target): + raise NotImplementedError + def dpsi1_dmuS(self,dL_dpsi1,Z,mu,S,target_mu,target_S): + raise NotImplementedError + def psi2(self,Z,mu,S,target): + raise NotImplementedError + def dpsi2_dZ(self,dL_dpsi2,Z,mu,S,target): + raise NotImplementedError + def dpsi2_dtheta(self,dL_dpsi2,Z,mu,S,target): + raise NotImplementedError + def dpsi2_dmuS(self,dL_dpsi2,Z,mu,S,target_mu,target_S): + raise NotImplementedError + def gradients_X(self, dL_dK, X, X2, target): + raise NotImplementedError + def dKdiag_dX(self, dL_dK, X, target): + raise NotImplementedError + def update_gradients_full(self, dL_dK, X): + """Set the gradients of all parameters when doing full (N) inference.""" + raise NotImplementedError + def update_gradients_sparse(self, dL_dKmm, dL_dKnm, dL_dKdiag, X, Z): + """Set the gradients of all parameters when doing sparse (M) inference.""" + raise NotImplementedError + def update_gradients_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z): + """Set the gradients of all parameters when doing variational (M) inference with uncertain inputs.""" + raise NotImplementedError + + def plot_ARD(self, *args): + """If an ARD kernel is present, plot a bar representation using matplotlib + + See GPy.plotting.matplot_dep.plot_ARD + """ + assert "matplotlib" in sys.modules, "matplotlib package has not been imported." + from ..plotting.matplot_dep import kernel_plots + return kernel_plots.plot_ARD(self,*args) + + + def __add__(self, other): + """ Overloading of the '+' operator. for more control, see self.add """ + return self.add(other) + + def add(self, other, tensor=False): + """ + Add another kernel to this one. + + If Tensor is False, both kernels are defined on the same _space_. then + the created kernel will have the same number of inputs as self and + other (which must be the same). + + If Tensor is True, then the dimensions are stacked 'horizontally', so + that the resulting kernel has self.input_dim + other.input_dim + + :param other: the other kernel to be added + :type other: GPy.kern + + """ + assert isinstance(other, Kern), "only kernels can be added to kernels..." + from add import Add + return Add([self, other], tensor) + + def __call__(self, X, X2=None): + return self.K(X, X2) + + def __mul__(self, other): + """ Here we overload the '*' operator. See self.prod for more information""" + return self.prod(other) + + def __pow__(self, other, tensor=False): + """ + Shortcut for tensor `prod`. + """ + return self.prod(other, tensor=True) + + def prod(self, other, tensor=False): + """ + Multiply two kernels (either on the same space, or on the tensor product of the input space). + + :param other: the other kernel to be added + :type other: GPy.kern + :param tensor: whether or not to use the tensor space (default is false). + :type tensor: bool + + """ + assert isinstance(other, Kern), "only kernels can be added to kernels..." 
+ from prod import Prod + return Prod(self, other, tensor) + + +from GPy.core.model import Model + +class Kern_check_model(Model): + """This is a dummy model class used as a base class for checking that the gradients of a given kernel are implemented correctly. It enables checkgradient() to be called independently on a kernel.""" + def __init__(self, kernel=None, dL_dK=None, X=None, X2=None): + Model.__init__(self, 'kernel_test_model') + num_samples = 20 + num_samples2 = 10 + if kernel==None: + kernel = GPy.kern.rbf(1) + if X==None: + X = np.random.randn(num_samples, kernel.input_dim) + if dL_dK==None: + if X2==None: + dL_dK = np.ones((X.shape[0], X.shape[0])) + else: + dL_dK = np.ones((X.shape[0], X2.shape[0])) + + self.kernel=kernel + self.add_parameter(kernel) + self.X = X + self.X2 = X2 + self.dL_dK = dL_dK + + def is_positive_definite(self): + v = np.linalg.eig(self.kernel.K(self.X))[0] + if any(v<-10*sys.float_info.epsilon): + return False + else: + return True + + def log_likelihood(self): + return (self.dL_dK*self.kernel.K(self.X, self.X2)).sum() + + def _log_likelihood_gradients(self): + raise NotImplementedError, "This needs to be implemented to use the kern_check_model class." + +class Kern_check_dK_dtheta(Kern_check_model): + """This class allows gradient checks for the gradient of a kernel with respect to parameters. """ + def __init__(self, kernel=None, dL_dK=None, X=None, X2=None): + Kern_check_model.__init__(self,kernel=kernel,dL_dK=dL_dK, X=X, X2=X2) + + def _log_likelihood_gradients(self): + return self.kernel._param_grad_helper(self.dL_dK, self.X, self.X2) + + + + + +class Kern_check_dKdiag_dtheta(Kern_check_model): + """This class allows gradient checks of the gradient of the diagonal of a kernel with respect to the parameters.""" + def __init__(self, kernel=None, dL_dK=None, X=None): + Kern_check_model.__init__(self,kernel=kernel,dL_dK=dL_dK, X=X, X2=None) + if dL_dK==None: + self.dL_dK = np.ones((self.X.shape[0])) + def parameters_changed(self): + self.kernel.update_gradients_full(self.dL_dK, self.X) + + def log_likelihood(self): + return (self.dL_dK*self.kernel.Kdiag(self.X)).sum() + + def _log_likelihood_gradients(self): + return self.kernel.dKdiag_dtheta(self.dL_dK, self.X) + +class Kern_check_dK_dX(Kern_check_model): + """This class allows gradient checks for the gradient of a kernel with respect to X. """ + def __init__(self, kernel=None, dL_dK=None, X=None, X2=None): + Kern_check_model.__init__(self,kernel=kernel,dL_dK=dL_dK, X=X, X2=X2) + self.remove_parameter(kernel) + self.X = Param('X', self.X) + self.add_parameter(self.X) + def _log_likelihood_gradients(self): + return self.kernel.gradients_X(self.dL_dK, self.X, self.X2).flatten() + +class Kern_check_dKdiag_dX(Kern_check_dK_dX): + """This class allows gradient checks for the gradient of a kernel diagonal with respect to X. """ + def __init__(self, kernel=None, dL_dK=None, X=None, X2=None): + Kern_check_dK_dX.__init__(self,kernel=kernel,dL_dK=dL_dK, X=X, X2=None) + if dL_dK==None: + self.dL_dK = np.ones((self.X.shape[0])) + + def log_likelihood(self): + return (self.dL_dK*self.kernel.Kdiag(self.X)).sum() + + def _log_likelihood_gradients(self): + return self.kernel.dKdiag_dX(self.dL_dK, self.X).flatten() + +def kern_test(kern, X=None, X2=None, output_ind=None, verbose=False): + """ + This function runs on kernels to check the correctness of their + implementation. It checks that the covariance function is positive definite + for a randomly generated data set. + + :param kern: the kernel to be tested. 
+ :type kern: GPy.kern.Kernpart + :param X: X input values to test the covariance function. + :type X: ndarray + :param X2: X2 input values to test the covariance function. + :type X2: ndarray + + """ + pass_checks = True + if X==None: + X = np.random.randn(10, kern.input_dim) + if output_ind is not None: + X[:, output_ind] = np.random.randint(kern.output_dim, X.shape[0]) + if X2==None: + X2 = np.random.randn(20, kern.input_dim) + if output_ind is not None: + X2[:, output_ind] = np.random.randint(kern.output_dim, X2.shape[0]) + + if verbose: + print("Checking covariance function is positive definite.") + result = Kern_check_model(kern, X=X).is_positive_definite() + if result and verbose: + print("Check passed.") + if not result: + print("Positive definite check failed for " + kern.name + " covariance function.") + pass_checks = False + return False + + if verbose: + print("Checking gradients of K(X, X) wrt theta.") + result = Kern_check_dK_dtheta(kern, X=X, X2=None).checkgrad(verbose=verbose) + if result and verbose: + print("Check passed.") + if not result: + print("Gradient of K(X, X) wrt theta failed for " + kern.name + " covariance function. Gradient values as follows:") + Kern_check_dK_dtheta(kern, X=X, X2=None).checkgrad(verbose=True) + pass_checks = False + return False + + if verbose: + print("Checking gradients of K(X, X2) wrt theta.") + result = Kern_check_dK_dtheta(kern, X=X, X2=X2).checkgrad(verbose=verbose) + if result and verbose: + print("Check passed.") + if not result: + print("Gradient of K(X, X) wrt theta failed for " + kern.name + " covariance function. Gradient values as follows:") + Kern_check_dK_dtheta(kern, X=X, X2=X2).checkgrad(verbose=True) + pass_checks = False + return False + + if verbose: + print("Checking gradients of Kdiag(X) wrt theta.") + result = Kern_check_dKdiag_dtheta(kern, X=X).checkgrad(verbose=verbose) + if result and verbose: + print("Check passed.") + if not result: + print("Gradient of Kdiag(X) wrt theta failed for " + kern.name + " covariance function. Gradient values as follows:") + Kern_check_dKdiag_dtheta(kern, X=X).checkgrad(verbose=True) + pass_checks = False + return False + + if verbose: + print("Checking gradients of K(X, X) wrt X.") + try: + result = Kern_check_dK_dX(kern, X=X, X2=None).checkgrad(verbose=verbose) + except NotImplementedError: + result=True + if verbose: + print("gradients_X not implemented for " + kern.name) + if result and verbose: + print("Check passed.") + if not result: + print("Gradient of K(X, X) wrt X failed for " + kern.name + " covariance function. Gradient values as follows:") + Kern_check_dK_dX(kern, X=X, X2=None).checkgrad(verbose=True) + pass_checks = False + return False + + if verbose: + print("Checking gradients of K(X, X2) wrt X.") + try: + result = Kern_check_dK_dX(kern, X=X, X2=X2).checkgrad(verbose=verbose) + except NotImplementedError: + result=True + if verbose: + print("gradients_X not implemented for " + kern.name) + if result and verbose: + print("Check passed.") + if not result: + print("Gradient of K(X, X) wrt X failed for " + kern.name + " covariance function. 
Gradient values as follows:") + Kern_check_dK_dX(kern, X=X, X2=X2).checkgrad(verbose=True) + pass_checks = False + return False + + if verbose: + print("Checking gradients of Kdiag(X) wrt X.") + try: + result = Kern_check_dKdiag_dX(kern, X=X).checkgrad(verbose=verbose) + except NotImplementedError: + result=True + if verbose: + print("gradients_X not implemented for " + kern.name) + if result and verbose: + print("Check passed.") + if not result: + print("Gradient of Kdiag(X) wrt X failed for " + kern.name + " covariance function. Gradient values as follows:") + Kern_check_dKdiag_dX(kern, X=X).checkgrad(verbose=True) + pass_checks = False + return False + + return pass_checks diff --git a/GPy/kern/_src/kernpart.py b/GPy/kern/_src/kernpart.py new file mode 100644 index 00000000..097ed741 --- /dev/null +++ b/GPy/kern/_src/kernpart.py @@ -0,0 +1,60 @@ +# Copyright (c) 2012, GPy authors (see AUTHORS.txt). +# Licensed under the BSD 3-clause license (see LICENSE.txt) +#from ...core.parameterized.Parameterized import set_as_parameter +from ...core.parameterization import Parameterized + +class Kernpart_stationary(Kernpart): + def __init__(self, input_dim, lengthscale=None, ARD=False): + self.input_dim = input_dim + self.ARD = ARD + if not ARD: + self.num_params = 2 + if lengthscale is not None: + self.lengthscale = np.asarray(lengthscale) + assert self.lengthscale.size == 1, "Only one lengthscale needed for non-ARD kernel" + else: + self.lengthscale = np.ones(1) + else: + self.num_params = self.input_dim + 1 + if lengthscale is not None: + self.lengthscale = np.asarray(lengthscale) + assert self.lengthscale.size == self.input_dim, "bad number of lengthscales" + else: + self.lengthscale = np.ones(self.input_dim) + + # initialize cache + self._Z, self._mu, self._S = np.empty(shape=(3, 1)) + self._X, self._X2, self._parameters_ = np.empty(shape=(3, 1)) + + def _set_params(self, x): + self.lengthscale = x + self.lengthscale2 = np.square(self.lengthscale) + # reset cached results + self._X, self._X2, self._parameters_ = np.empty(shape=(3, 1)) + self._Z, self._mu, self._S = np.empty(shape=(3, 1)) # cached versions of Z,mu,S + + + def dKdiag_dtheta(self, dL_dKdiag, X, target): + # For stationary covariances, derivative of diagonal elements + # wrt lengthscale is 0. + target[0] += np.sum(dL_dKdiag) + + def dKdiag_dX(self, dL_dK, X, target): + pass # true for all stationary kernels + + +class Kernpart_inner(Kernpart): + def __init__(self,input_dim): + """ + The base class for a kernpart_inner: a positive definite function which forms part of a kernel that is based on the inner product between inputs. + + :param input_dim: the number of input dimensions to the function + :type input_dim: int + + Do not instantiate. 
+ """ + Kernpart.__init__(self, input_dim) + + # initialize cache + self._Z, self._mu, self._S = np.empty(shape=(3, 1)) + self._X, self._X2, self._parameters_ = np.empty(shape=(3, 1)) diff --git a/GPy/kern/linear.py b/GPy/kern/_src/linear.py similarity index 98% rename from GPy/kern/linear.py rename to GPy/kern/_src/linear.py index 828ece11..ab77d4e6 100644 --- a/GPy/kern/linear.py +++ b/GPy/kern/_src/linear.py @@ -4,13 +4,13 @@ import numpy as np from scipy import weave -from kernpart import Kernpart -from ...util.linalg import tdot -from ...util.misc import fast_array_equal, param_to_array -from ...core.parameterization import Param -from ...core.parameterization.transformations import Logexp +from kern import Kern +from ..util.linalg import tdot +from ..util.misc import fast_array_equal, param_to_array +from ..core.parameterization import Param +from ..core.parameterization.transformations import Logexp -class Linear(Kernpart): +class Linear(Kern): """ Linear kernel diff --git a/GPy/kern/mlp.py b/GPy/kern/_src/mlp.py similarity index 100% rename from GPy/kern/mlp.py rename to GPy/kern/_src/mlp.py diff --git a/GPy/kern/odekern1.c b/GPy/kern/_src/odekern1.c similarity index 100% rename from GPy/kern/odekern1.c rename to GPy/kern/_src/odekern1.c diff --git a/GPy/kern/periodic_Matern32.py b/GPy/kern/_src/periodic_Matern32.py similarity index 100% rename from GPy/kern/periodic_Matern32.py rename to GPy/kern/_src/periodic_Matern32.py diff --git a/GPy/kern/periodic_Matern52.py b/GPy/kern/_src/periodic_Matern52.py similarity index 100% rename from GPy/kern/periodic_Matern52.py rename to GPy/kern/_src/periodic_Matern52.py diff --git a/GPy/kern/periodic_exponential.py b/GPy/kern/_src/periodic_exponential.py similarity index 100% rename from GPy/kern/periodic_exponential.py rename to GPy/kern/_src/periodic_exponential.py diff --git a/GPy/kern/poly.py b/GPy/kern/_src/poly.py similarity index 100% rename from GPy/kern/poly.py rename to GPy/kern/_src/poly.py diff --git a/GPy/kern/prod.py b/GPy/kern/_src/prod.py similarity index 98% rename from GPy/kern/prod.py rename to GPy/kern/_src/prod.py index 364c91b3..08221de7 100644 --- a/GPy/kern/prod.py +++ b/GPy/kern/_src/prod.py @@ -1,17 +1,17 @@ # Copyright (c) 2012, GPy authors (see AUTHORS.txt). 
# Licensed under the BSD 3-clause license (see LICENSE.txt) -from kernpart import Kernpart +from kern import Kern from coregionalize import Coregionalize import numpy as np import hashlib -class Prod(Kernpart): +class Prod(Kern): """ Computes the product of 2 kernels :param k1, k2: the kernels to multiply - :type k1, k2: Kernpart + :type k1, k2: Kern :param tensor: The kernels are either multiply as functions defined on the same input space (default) or on the product of the input spaces :type tensor: Boolean :rtype: kernel object diff --git a/GPy/kern/prod_orthogonal.py b/GPy/kern/_src/prod_orthogonal.py similarity index 100% rename from GPy/kern/prod_orthogonal.py rename to GPy/kern/_src/prod_orthogonal.py diff --git a/GPy/kern/rational_quadratic.py b/GPy/kern/_src/rational_quadratic.py similarity index 100% rename from GPy/kern/rational_quadratic.py rename to GPy/kern/_src/rational_quadratic.py diff --git a/GPy/kern/rbf.py b/GPy/kern/_src/rbf.py similarity index 92% rename from GPy/kern/rbf.py rename to GPy/kern/_src/rbf.py index 027aa382..36e454e3 100644 --- a/GPy/kern/rbf.py +++ b/GPy/kern/_src/rbf.py @@ -4,13 +4,13 @@ import numpy as np from scipy import weave -from kernpart import Kernpart -from ...util.linalg import tdot -from ...util.misc import fast_array_equal, param_to_array -from ...core.parameterization import Param -from ...core.parameterization.transformations import Logexp +from kern import Kern +from ..util.linalg import tdot +from ..util.misc import fast_array_equal, param_to_array +from ..core.parameterization import Param +from ..core.parameterization.transformations import Logexp -class RBF(Kernpart): +class RBF(Kern): """ Radial Basis Function kernel, aka squared-exponential, exponentiated quadratic or Gaussian kernel: @@ -52,30 +52,16 @@ class RBF(Kernpart): lengthscale = np.ones(self.input_dim) self.variance = Param('variance', variance, Logexp()) - + self.lengthscale = Param('lengthscale', lengthscale, Logexp()) self.lengthscale.add_observer(self, self.update_lengthscale) self.update_lengthscale(self.lengthscale) - + self.add_parameters(self.variance, self.lengthscale) self.parameters_changed() # initializes cache - #self.update_inv_lengthscale(self.lengthscale) - #self.parameters_changed() - # initialize cache - #self._Z, self._mu, self._S = np.empty(shape=(3, 1)) - #self._X, self._X2, self._params_save = np.empty(shape=(3, 1)) - - # a set of optional args to pass to weave - # self.weave_options = {'headers' : [''], - # 'extra_compile_args': ['-fopenmp -O3'], # -march=native'], - # 'extra_link_args' : ['-lgomp']} self.weave_options = {} - def on_input_change(self, X): - #self._K_computations(X, None) - pass - def update_lengthscale(self, l): self.lengthscale2 = np.square(self.lengthscale) @@ -84,13 +70,16 @@ class RBF(Kernpart): self._X, self._X2 = np.empty(shape=(2, 1)) self._Z, self._mu, self._S = np.empty(shape=(3, 1)) # cached versions of Z,mu,S - def K(self, X, X2, target): + def K(self, X, X2=None): self._K_computations(X, X2) - target += self.variance * self._K_dvar + return self.variance * self._K_dvar - def Kdiag(self, X, target): - np.add(target, self.variance, target) + def Kdiag(self, X): + ret = np.ones(X.shape[0]) + ret[:] = self.variance + return ret + #TODO: remove TARGET! 
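(Illustrative aside.) The RBF hunks above replace the old in-place convention, where results were accumulated into a caller-supplied `target` array, with methods that allocate and return their result. A rough sketch of the calling pattern implied by the new signatures, with constructor details assumed rather than taken from a released API:

    import numpy as np
    import GPy

    X = np.random.randn(5, 2)
    k = GPy.kern.RBF(2)        # assumed: variance and lengthscale default to 1

    K = k.K(X)                 # (5, 5) covariance matrix, returned directly
    kdiag = k.Kdiag(X)         # (5,) vector filled with the variance
    assert np.allclose(np.diag(K), kdiag)
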
def psi0(self, Z, mu, S, target): target += self.variance @@ -165,7 +154,7 @@ class RBF(Kernpart): else: self.lengthscale.gradient += (self.variance / self.lengthscale) * np.sum(self._K_dvar * self._K_dist2 * dL_dKmm) - def gradients_X(self, dL_dK, X, X2, target): + def gradients_X(self, dL_dK, X, X2): #if self._X is None or X.base is not self._X.base or X2 is not None: self._K_computations(X, X2) if X2 is None: @@ -173,10 +162,10 @@ class RBF(Kernpart): else: _K_dist = X[:, None, :] - X2[None, :, :] # don't cache this in _K_computations because it is high memory. If this function is being called, chances are we're not in the high memory arena. gradients_X = (-self.variance / self.lengthscale2) * np.transpose(self._K_dvar[:, :, np.newaxis] * _K_dist, (1, 0, 2)) - target += np.sum(gradients_X * dL_dK.T[:, :, None], 0) + return np.sum(gradients_X * dL_dK.T[:, :, None], 0) - def dKdiag_dX(self, dL_dKdiag, X, target): - pass + def dKdiag_dX(self, dL_dKdiag, X): + return np.zeros(X.shape[0]) #---------------------------------------# # PSI statistics # diff --git a/GPy/kern/rbf_inv.py b/GPy/kern/_src/rbf_inv.py similarity index 100% rename from GPy/kern/rbf_inv.py rename to GPy/kern/_src/rbf_inv.py diff --git a/GPy/kern/rbfcos.py b/GPy/kern/_src/rbfcos.py similarity index 100% rename from GPy/kern/rbfcos.py rename to GPy/kern/_src/rbfcos.py diff --git a/GPy/kern/spline.py b/GPy/kern/_src/spline.py similarity index 100% rename from GPy/kern/spline.py rename to GPy/kern/_src/spline.py diff --git a/GPy/kern/ss_rbf.py b/GPy/kern/_src/ss_rbf.py similarity index 100% rename from GPy/kern/ss_rbf.py rename to GPy/kern/_src/ss_rbf.py diff --git a/GPy/kern/symmetric.py b/GPy/kern/_src/symmetric.py similarity index 100% rename from GPy/kern/symmetric.py rename to GPy/kern/_src/symmetric.py diff --git a/GPy/kern/sympy_helpers.cpp b/GPy/kern/_src/sympy_helpers.cpp similarity index 100% rename from GPy/kern/sympy_helpers.cpp rename to GPy/kern/_src/sympy_helpers.cpp diff --git a/GPy/kern/sympy_helpers.h b/GPy/kern/_src/sympy_helpers.h similarity index 100% rename from GPy/kern/sympy_helpers.h rename to GPy/kern/_src/sympy_helpers.h diff --git a/GPy/kern/sympykern.py b/GPy/kern/_src/sympykern.py similarity index 100% rename from GPy/kern/sympykern.py rename to GPy/kern/_src/sympykern.py diff --git a/GPy/kern/white.py b/GPy/kern/_src/white.py similarity index 77% rename from GPy/kern/white.py rename to GPy/kern/_src/white.py index c7e4c6dd..7750267f 100644 --- a/GPy/kern/white.py +++ b/GPy/kern/_src/white.py @@ -1,12 +1,12 @@ # Copyright (c) 2012, GPy authors (see AUTHORS.txt). # Licensed under the BSD 3-clause license (see LICENSE.txt) -from kernpart import Kernpart +from kern import Kern import numpy as np -from ...core.parameterization import Param -from ...core.parameterization.transformations import Logexp +from ..core.parameterization import Param +from ..core.parameterization.transformations import Logexp -class White(Kernpart): +class White(Kern): """ White noise kernel. 
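(Illustrative aside.) The white-noise kernel is ported to the same return-based interface in the hunk that follows: `K(X)` becomes the variance times the identity and `Kdiag(X)` a constant vector. A small sketch of the expected behaviour, with the constructor and default variance assumed:

    import numpy as np
    import GPy

    X = np.random.randn(4, 1)
    k = GPy.kern.White(1)      # assumed constructor, default variance of 1
    assert np.allclose(k.K(X), np.eye(4) * k.variance)
    assert np.allclose(k.Kdiag(X), float(k.variance))
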
@@ -22,12 +22,14 @@ class White(Kernpart): self.add_parameters(self.variance) self._psi1 = 0 # TODO: more elegance here - def K(self,X,X2,target): + def K(self,X,X2): if X2 is None: - target += np.eye(X.shape[0])*self.variance + return np.eye(X.shape[0])*self.variance - def Kdiag(self,X,target): - target += self.variance + def Kdiag(self,X): + ret = np.ones(X.shape[0]) + ret[:] = self.variance + return ret def update_gradients_full(self, dL_dK, X): self.variance.gradient = np.trace(dL_dK) @@ -38,14 +40,8 @@ class White(Kernpart): def update_gradients_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z): raise NotImplementedError - def dKdiag_dtheta(self,dL_dKdiag,X,target): - target += np.sum(dL_dKdiag) - - def gradients_X(self,dL_dK,X,X2,target): - pass - - def dKdiag_dX(self,dL_dKdiag,X,target): - pass + def gradients_X(self,dL_dK,X,X2): + return np.zeros_like(X) def psi0(self,Z,mu,S,target): pass # target += self.variance diff --git a/GPy/kern/kern.py b/GPy/kern/kern.py deleted file mode 100644 index 53728d0d..00000000 --- a/GPy/kern/kern.py +++ /dev/null @@ -1,680 +0,0 @@ -# Copyright (c) 2012, GPy authors (see AUTHORS.txt). -# Licensed under the BSD 3-clause license (see LICENSE.txt) - -import sys -import numpy as np -import itertools -from parts.prod import Prod as prod -from parts.linear import Linear -from parts.kernpart import Kernpart -from ..core.parameterization import Parameterized -from GPy.core.parameterization.param import Param - -class kern(Parameterized): - def __init__(self, input_dim, parts=[], input_slices=None): - """ - This is the main kernel class for GPy. It handles multiple - (additive) kernel functions, and keeps track of various things - like which parameters live where. - - The technical code for kernels is divided into _parts_ (see - e.g. rbf.py). This object contains a list of parts, which are - computed additively. For multiplication, special _prod_ parts - are used. 
- - :param input_dim: The dimensionality of the kernel's input space - :type input_dim: int - :param parts: the 'parts' (PD functions) of the kernel - :type parts: list of Kernpart objects - :param input_slices: the slices on the inputs which apply to each kernel - :type input_slices: list of slice objects, or list of bools - - """ - super(kern, self).__init__('kern') - self.add_parameters(*parts) - self.input_dim = input_dim - - if input_slices is None: - self.input_slices = [slice(None) for p in self._parameters_] - else: - assert len(input_slices) == len(self._parameters_) - self.input_slices = [sl if type(sl) is slice else slice(None) for sl in input_slices] - - for p in self._parameters_: - assert isinstance(p, Kernpart), "bad kernel part" - - def parameters_changed(self): - [p.parameters_changed() for p in self._parameters_] - - def connect_input(self, Xparam): - [p.connect_input(Xparam) for p in self._parameters_] - - def _getstate(self): - """ - Get the current state of the class, - here just all the indices, rest can get recomputed - """ - return Parameterized._getstate(self) + [#self._parameters_, - #self.num_params, - self.input_dim, - self.input_slices, - self._param_slices_ - ] - - def _setstate(self, state): - self._param_slices_ = state.pop() - self.input_slices = state.pop() - self.input_dim = state.pop() - #self.num_params = state.pop() - #self._parameters_ = state.pop() - Parameterized._setstate(self, state) - - - def plot_ARD(self, *args): - """If an ARD kernel is present, plot a bar representation using matplotlib - - See GPy.plotting.matplot_dep.plot_ARD - """ - assert "matplotlib" in sys.modules, "matplotlib package has not been imported." - from ..plotting.matplot_dep import kernel_plots - return kernel_plots.plot_ARD(self,*args) - -# def _transform_gradients(self, g): -# """ -# Apply the transformations of the kernel so that the returned vector -# represents the gradient in the transformed space (i.e. that given by -# get_params_transformed()) -# -# :param g: the gradient vector for the current model, usually created by _param_grad_helper -# """ -# x = self._get_params() -# [np.place(g, index, g[index] * constraint.gradfactor(x[index])) -# for constraint, index in self.constraints.iteritems() if constraint is not __fixed__] -# # for constraint, index in self.constraints.iteritems(): -# # if constraint != __fixed__: -# # g[index] = g[index] * constraint.gradfactor(x[index]) -# #[np.put(g, i, v) for i, v in [(t[0], np.sum(g[t])) for t in self.tied_indices]] -# [np.put(g, i, v) for i, v in [[i, t.sum()] for p in self._parameters_ for t,i in p._tied_to_me_.iteritems()]] -# # if len(self.tied_indices) or len(self.fixed_indices): -# # to_remove = np.hstack((self.fixed_indices + [t[1:] for t in self.tied_indices])) -# # return np.delete(g, to_remove) -# # else: -# if self._fixes_ is not None: return g[self._fixes_] -# return g -# x = self._get_params() -# [np.put(x, i, x * t.gradfactor(x[i])) for i, t in zip(self.constrained_indices, self.constraints)] -# [np.put(g, i, v) for i, v in [(t[0], np.sum(g[t])) for t in self.tied_indices]] -# if len(self.tied_indices) or len(self.fixed_indices): -# to_remove = np.hstack((self.fixed_indices + [t[1:] for t in self.tied_indices])) -# return np.delete(g, to_remove) -# else: -# return g - - def __add__(self, other): - """ Overloading of the '+' operator. for more control, see self.add """ - return self.add(other) - - def add(self, other, tensor=False): - """ - Add another kernel to this one. 
- - If Tensor is False, both kernels are defined on the same _space_. then - the created kernel will have the same number of inputs as self and - other (which must be the same). - - If Tensor is True, then the dimensions are stacked 'horizontally', so - that the resulting kernel has self.input_dim + other.input_dim - - :param other: the other kernel to be added - :type other: GPy.kern - - """ - if tensor: - D = self.input_dim + other.input_dim - self_input_slices = [slice(*sl.indices(self.input_dim)) for sl in self.input_slices] - other_input_indices = [sl.indices(other.input_dim) for sl in other.input_slices] - other_input_slices = [slice(i[0] + self.input_dim, i[1] + self.input_dim, i[2]) for i in other_input_indices] - - newkern = kern(D, self._parameters_ + other._parameters_, self_input_slices + other_input_slices) - - # transfer constraints: -# newkern.constrained_indices = self.constrained_indices + [x + self.num_params for x in other.constrained_indices] -# newkern.constraints = self.constraints + other.constraints -# newkern.fixed_indices = self.fixed_indices + [self.num_params + x for x in other.fixed_indices] -# newkern.fixed_values = self.fixed_values + other.fixed_values -# newkern.constraints = self.constraints + other.constraints -# newkern.tied_indices = self.tied_indices + [self.num_params + x for x in other.tied_indices] - else: - assert self.input_dim == other.input_dim - newkern = kern(self.input_dim, self._parameters_ + other._parameters_, self.input_slices + other.input_slices) - # transfer constraints: -# newkern.constrained_indices = self.constrained_indices + [i + self.num_params for i in other.constrained_indices] -# newkern.constraints = self.constraints + other.constraints -# newkern.fixed_indices = self.fixed_indices + [self.num_params + x for x in other.fixed_indices] -# newkern.fixed_values = self.fixed_values + other.fixed_values -# newkern.tied_indices = self.tied_indices + [self.num_params + x for x in other.tied_indices] - - [newkern.constraints.add(transform, ind) for transform, ind in self.constraints.iteritems()] - [newkern.constraints.add(transform, ind+self.size) for transform, ind in other.constraints.iteritems()] - newkern._fixes_ = ((self._fixes_ or 0) + (other._fixes_ or 0)) or None - - return newkern - - def __call__(self, X, X2=None): - return self.K(X, X2) - - def __mul__(self, other): - """ Here we overload the '*' operator. See self.prod for more information""" - return self.prod(other) - - def __pow__(self, other, tensor=False): - """ - Shortcut for tensor `prod`. - """ - return self.prod(other, tensor=True) - - def prod(self, other, tensor=False): - """ - Multiply two kernels (either on the same space, or on the tensor product of the input space). - - :param other: the other kernel to be added - :type other: GPy.kern - :param tensor: whether or not to use the tensor space (default is false). 
- :type tensor: bool - - """ - K1 = self - K2 = other - #K1 = self.copy() - #K2 = other.copy() - - slices = [] - for sl1, sl2 in itertools.product(K1.input_slices, K2.input_slices): - s1, s2 = [False] * K1.input_dim, [False] * K2.input_dim - s1[sl1], s2[sl2] = [True], [True] - slices += [s1 + s2] - - newkernparts = [prod(k1, k2, tensor) for k1, k2 in itertools.product(K1._parameters_, K2._parameters_)] - - if tensor: - newkern = kern(K1.input_dim + K2.input_dim, newkernparts, slices) - else: - newkern = kern(K1.input_dim, newkernparts, slices) - - #newkern._follow_constrains(K1, K2) - return newkern - -# def _follow_constrains(self, K1, K2): -# -# # Build the array that allows to go from the initial indices of the param to the new ones -# K1_param = [] -# n = 0 -# for k1 in K1.parts: -# K1_param += [range(n, n + k1.num_params)] -# n += k1.num_params -# n = 0 -# K2_param = [] -# for k2 in K2.parts: -# K2_param += [range(K1.num_params + n, K1.num_params + n + k2.num_params)] -# n += k2.num_params -# index_param = [] -# for p1 in K1_param: -# for p2 in K2_param: -# index_param += p1 + p2 -# index_param = np.array(index_param) -# -# # Get the ties and constrains of the kernels before the multiplication -# prev_ties = K1.tied_indices + [arr + K1.num_params for arr in K2.tied_indices] -# -# prev_constr_ind = [K1.constrained_indices] + [K1.num_params + i for i in K2.constrained_indices] -# prev_constr = K1.constraints + K2.constraints -# -# # prev_constr_fix = K1.fixed_indices + [arr + K1.num_params for arr in K2.fixed_indices] -# # prev_constr_fix_values = K1.fixed_values + K2.fixed_values -# -# # follow the previous ties -# for arr in prev_ties: -# for j in arr: -# index_param[np.where(index_param == j)[0]] = arr[0] -# -# # ties and constrains -# for i in range(K1.num_params + K2.num_params): -# index = np.where(index_param == i)[0] -# if index.size > 1: -# self.tie_params(index) -# for i, t in zip(prev_constr_ind, prev_constr): -# self.constrain(np.where(index_param == i)[0], t) -# -# def _get_params(self): -# return np.hstack(self._parameters_) -# return np.hstack([p._get_params() for p in self._parameters_]) - -# def _set_params(self, x): -# import ipdb;ipdb.set_trace() -# [p._set_params(x[s]) for p, s in zip(self._parameters_, self._param_slices_)] - -# def _get_param_names(self): -# # this is a bit nasty: we want to distinguish between parts with the same name by appending a count -# part_names = np.array([k.name for k in self._parameters_], dtype=np.str) -# counts = [np.sum(part_names == ni) for i, ni in enumerate(part_names)] -# cum_counts = [np.sum(part_names[i:] == ni) for i, ni in enumerate(part_names)] -# names = [name + '_' + str(cum_count) if count > 1 else name for name, count, cum_count in zip(part_names, counts, cum_counts)] -# -# return sum([[name + '_' + n for n in k._get_param_names()] for name, k in zip(names, self._parameters_)], []) - - def K(self, X, X2=None, which_parts='all'): - """ - Compute the kernel function. - - :param X: the first set of inputs to the kernel - :param X2: (optional) the second set of arguments to the kernel. If X2 - is None, this is passed throgh to the 'part' object, which - handles this as X2 == X. - :param which_parts: a list of booleans detailing whether to include - each of the part functions. 
By default, 'all' - indicates all parts - """ - if which_parts == 'all': - which_parts = [True] * self.size - assert X.shape[1] == self.input_dim - if X2 is None: - target = np.zeros((X.shape[0], X.shape[0])) - [p.K(X[:, i_s], None, target=target) for p, i_s, part_i_used in zip(self._parameters_, self.input_slices, which_parts) if part_i_used] - else: - target = np.zeros((X.shape[0], X2.shape[0])) - [p.K(X[:, i_s], X2[:, i_s], target=target) for p, i_s, part_i_used in zip(self._parameters_, self.input_slices, which_parts) if part_i_used] - return target - - def update_gradients_full(self, dL_dK, X): - [p.update_gradients_full(dL_dK, X) for p in self._parameters_] - - def update_gradients_sparse(self, dL_dKmm, dL_dKnm, dL_dKdiag, X, Z): - [p.update_gradients_sparse(dL_dKmm, dL_dKnm, dL_dKdiag, X, Z) for p in self._parameters_] - - def update_gradients_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z): - [p.update_gradients_variational(dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z) for p in self._parameters_] - - def _param_grad_helper(self, dL_dK, X, X2=None): - """ - Compute the gradient of the covariance function with respect to the parameters. - - :param dL_dK: An array of gradients of the objective function with respect to the covariance function. - :type dL_dK: Np.ndarray (num_samples x num_inducing) - :param X: Observed data inputs - :type X: np.ndarray (num_samples x input_dim) - :param X2: Observed data inputs (optional, defaults to X) - :type X2: np.ndarray (num_inducing x input_dim) - - returns: dL_dtheta - """ - assert X.shape[1] == self.input_dim - target = np.zeros(self.size) - if X2 is None: - [p._param_grad_helper(dL_dK, X[:, i_s], None, target[ps]) for p, i_s, ps, in zip(self._parameters_, self.input_slices, self._param_slices_)] - else: - [p._param_grad_helper(dL_dK, X[:, i_s], X2[:, i_s], target[ps]) for p, i_s, ps, in zip(self._parameters_, self.input_slices, self._param_slices_)] - - return self._transform_gradients(target) - - def gradients_X(self, dL_dK, X, X2=None): - """Compute the gradient of the objective function with respect to X. - - :param dL_dK: An array of gradients of the objective function with respect to the covariance function. 
- :type dL_dK: np.ndarray (num_samples x num_inducing) - :param X: Observed data inputs - :type X: np.ndarray (num_samples x input_dim) - :param X2: Observed data inputs (optional, defaults to X) - :type X2: np.ndarray (num_inducing x input_dim)""" - - target = np.zeros_like(X) - if X2 is None: - [p.gradients_X(dL_dK, X[:, i_s], None, target[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)] - else: - [p.gradients_X(dL_dK, X[:, i_s], X2[:, i_s], target[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)] - return target - - def Kdiag(self, X, which_parts='all'): - """Compute the diagonal of the covariance function for inputs X.""" - if which_parts == 'all': - which_parts = [True] * self.size - assert X.shape[1] == self.input_dim - target = np.zeros(X.shape[0]) - [p.Kdiag(X[:, i_s], target=target) for p, i_s, part_on in zip(self._parameters_, self.input_slices, which_parts) if part_on] - return target - - def dKdiag_dtheta(self, dL_dKdiag, X): - """Compute the gradient of the diagonal of the covariance function with respect to the parameters.""" - assert X.shape[1] == self.input_dim - assert dL_dKdiag.size == X.shape[0] - target = np.zeros(self.size) - [p.dKdiag_dtheta(dL_dKdiag, X[:, i_s], target[ps]) for p, i_s, ps in zip(self._parameters_, self.input_slices, self._param_slices_)] - return self._transform_gradients(target) - - def dKdiag_dX(self, dL_dKdiag, X): - assert X.shape[1] == self.input_dim - target = np.zeros_like(X) - [p.dKdiag_dX(dL_dKdiag, X[:, i_s], target[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)] - return target - - def psi0(self, Z, mu, S): - target = np.zeros(mu.shape[0]) - [p.psi0(Z[:, i_s], mu[:, i_s], S[:, i_s], target) for p, i_s in zip(self._parameters_, self.input_slices)] - return target - - def dpsi0_dtheta(self, dL_dpsi0, Z, mu, S): - target = np.zeros(self.size) - [p.dpsi0_dtheta(dL_dpsi0, Z[:, i_s], mu[:, i_s], S[:, i_s], target[ps]) for p, ps, i_s in zip(self._parameters_, self._param_slices_, self.input_slices)] - return self._transform_gradients(target) - - def dpsi0_dmuS(self, dL_dpsi0, Z, mu, S): - target_mu, target_S = np.zeros_like(mu), np.zeros_like(S) - [p.dpsi0_dmuS(dL_dpsi0, Z[:, i_s], mu[:, i_s], S[:, i_s], target_mu[:, i_s], target_S[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)] - return target_mu, target_S - - def psi1(self, Z, mu, S): - target = np.zeros((mu.shape[0], Z.shape[0])) - [p.psi1(Z[:, i_s], mu[:, i_s], S[:, i_s], target) for p, i_s in zip(self._parameters_, self.input_slices)] - return target - - def dpsi1_dtheta(self, dL_dpsi1, Z, mu, S): - target = np.zeros((self.size)) - [p.dpsi1_dtheta(dL_dpsi1, Z[:, i_s], mu[:, i_s], S[:, i_s], target[ps]) for p, ps, i_s in zip(self._parameters_, self._param_slices_, self.input_slices)] - return self._transform_gradients(target) - - def dpsi1_dZ(self, dL_dpsi1, Z, mu, S): - target = np.zeros_like(Z) - [p.dpsi1_dZ(dL_dpsi1, Z[:, i_s], mu[:, i_s], S[:, i_s], target[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)] - return target - - def dpsi1_dmuS(self, dL_dpsi1, Z, mu, S): - """return shapes are num_samples,num_inducing,input_dim""" - target_mu, target_S = np.zeros((2, mu.shape[0], mu.shape[1])) - [p.dpsi1_dmuS(dL_dpsi1, Z[:, i_s], mu[:, i_s], S[:, i_s], target_mu[:, i_s], target_S[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)] - return target_mu, target_S - - def psi2(self, Z, mu, S): - """ - Computer the psi2 statistics for the covariance function. 
- - :param Z: np.ndarray of inducing inputs (num_inducing x input_dim) - :param mu, S: np.ndarrays of means and variances (each num_samples x input_dim) - :returns psi2: np.ndarray (num_samples,num_inducing,num_inducing) - - """ - target = np.zeros((mu.shape[0], Z.shape[0], Z.shape[0])) - [p.psi2(Z[:, i_s], mu[:, i_s], S[:, i_s], target) for p, i_s in zip(self._parameters_, self.input_slices)] - - # compute the "cross" terms - # TODO: input_slices needed - crossterms = 0 - - for [p1, i_s1], [p2, i_s2] in itertools.combinations(zip(self._parameters_, self.input_slices), 2): - if i_s1 == i_s2: - # TODO psi1 this must be faster/better/precached/more nice - tmp1 = np.zeros((mu.shape[0], Z.shape[0])) - p1.psi1(Z[:, i_s1], mu[:, i_s1], S[:, i_s1], tmp1) - tmp2 = np.zeros((mu.shape[0], Z.shape[0])) - p2.psi1(Z[:, i_s2], mu[:, i_s2], S[:, i_s2], tmp2) - - prod = np.multiply(tmp1, tmp2) - crossterms += prod[:, :, None] + prod[:, None, :] - - target += crossterms - return target - - def dpsi2_dtheta(self, dL_dpsi2, Z, mu, S): - """Gradient of the psi2 statistics with respect to the parameters.""" - target = np.zeros(self.size) - [p.dpsi2_dtheta(dL_dpsi2, Z[:, i_s], mu[:, i_s], S[:, i_s], target[ps]) for p, i_s, ps in zip(self._parameters_, self.input_slices, self._param_slices_)] - - # compute the "cross" terms - # TODO: better looping, input_slices - for i1, i2 in itertools.permutations(range(len(self._parameters_)), 2): - p1, p2 = self._parameters_[i1], self._parameters_[i2] -# ipsl1, ipsl2 = self.input_slices[i1], self.input_slices[i2] - ps1, ps2 = self._param_slices_[i1], self._param_slices_[i2] - - tmp = np.zeros((mu.shape[0], Z.shape[0])) - p1.psi1(Z, mu, S, tmp) - p2.dpsi1_dtheta((tmp[:, None, :] * dL_dpsi2).sum(1) * 2., Z, mu, S, target[ps2]) - - return self._transform_gradients(target) - - def dpsi2_dZ(self, dL_dpsi2, Z, mu, S): - target = np.zeros_like(Z) - [p.dpsi2_dZ(dL_dpsi2, Z[:, i_s], mu[:, i_s], S[:, i_s], target[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)] - # target *= 2 - - # compute the "cross" terms - # TODO: we need input_slices here. - for p1, p2 in itertools.permutations(self._parameters_, 2): -# if p1.name == 'linear' and p2.name == 'linear': -# raise NotImplementedError("We don't handle linear/linear cross-terms") - tmp = np.zeros((mu.shape[0], Z.shape[0])) - p1.psi1(Z, mu, S, tmp) - p2.dpsi1_dZ((tmp[:, None, :] * dL_dpsi2).sum(1), Z, mu, S, target) - - return target * 2 - - def dpsi2_dmuS(self, dL_dpsi2, Z, mu, S): - target_mu, target_S = np.zeros((2, mu.shape[0], mu.shape[1])) - [p.dpsi2_dmuS(dL_dpsi2, Z[:, i_s], mu[:, i_s], S[:, i_s], target_mu[:, i_s], target_S[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)] - - # compute the "cross" terms - # TODO: we need input_slices here. - for p1, p2 in itertools.permutations(self._parameters_, 2): -# if p1.name == 'linear' and p2.name == 'linear': -# raise NotImplementedError("We don't handle linear/linear cross-terms") - tmp = np.zeros((mu.shape[0], Z.shape[0])) - p1.psi1(Z, mu, S, tmp) - p2.dpsi1_dmuS((tmp[:, None, :] * dL_dpsi2).sum(1) * 2., Z, mu, S, target_mu, target_S) - - return target_mu, target_S - - def plot(self, *args, **kwargs): - """ - See GPy.plotting.matplot_dep.plot - """ - assert "matplotlib" in sys.modules, "matplotlib package has not been imported." 
- from ..plotting.matplot_dep import kernel_plots - kernel_plots.plot(self,*args) - -from GPy.core.model import Model - -class Kern_check_model(Model): - """This is a dummy model class used as a base class for checking that the gradients of a given kernel are implemented correctly. It enables checkgradient() to be called independently on a kernel.""" - def __init__(self, kernel=None, dL_dK=None, X=None, X2=None): - Model.__init__(self, 'kernel_test_model') - num_samples = 20 - num_samples2 = 10 - if kernel==None: - kernel = GPy.kern.rbf(1) - if X==None: - X = np.random.randn(num_samples, kernel.input_dim) - if dL_dK==None: - if X2==None: - dL_dK = np.ones((X.shape[0], X.shape[0])) - else: - dL_dK = np.ones((X.shape[0], X2.shape[0])) - - self.kernel=kernel - self.add_parameter(kernel) - self.X = X - self.X2 = X2 - self.dL_dK = dL_dK - - def is_positive_definite(self): - v = np.linalg.eig(self.kernel.K(self.X))[0] - if any(v<-10*sys.float_info.epsilon): - return False - else: - return True - - def log_likelihood(self): - return (self.dL_dK*self.kernel.K(self.X, self.X2)).sum() - - def _log_likelihood_gradients(self): - raise NotImplementedError, "This needs to be implemented to use the kern_check_model class." - -class Kern_check_dK_dtheta(Kern_check_model): - """This class allows gradient checks for the gradient of a kernel with respect to parameters. """ - def __init__(self, kernel=None, dL_dK=None, X=None, X2=None): - Kern_check_model.__init__(self,kernel=kernel,dL_dK=dL_dK, X=X, X2=X2) - - def _log_likelihood_gradients(self): - return self.kernel._param_grad_helper(self.dL_dK, self.X, self.X2) - -class Kern_check_dKdiag_dtheta(Kern_check_model): - """This class allows gradient checks of the gradient of the diagonal of a kernel with respect to the parameters.""" - def __init__(self, kernel=None, dL_dK=None, X=None): - Kern_check_model.__init__(self,kernel=kernel,dL_dK=dL_dK, X=X, X2=None) - if dL_dK==None: - self.dL_dK = np.ones((self.X.shape[0])) - def parameters_changed(self): - self.kernel.update_gradients_full(self.dL_dK, self.X) - - def log_likelihood(self): - return (self.dL_dK*self.kernel.Kdiag(self.X)).sum() - - def _log_likelihood_gradients(self): - return self.kernel.dKdiag_dtheta(self.dL_dK, self.X) - -class Kern_check_dK_dX(Kern_check_model): - """This class allows gradient checks for the gradient of a kernel with respect to X. """ - def __init__(self, kernel=None, dL_dK=None, X=None, X2=None): - Kern_check_model.__init__(self,kernel=kernel,dL_dK=dL_dK, X=X, X2=X2) - self.remove_parameter(kernel) - self.X = Param('X', self.X) - self.add_parameter(self.X) - def _log_likelihood_gradients(self): - return self.kernel.gradients_X(self.dL_dK, self.X, self.X2).flatten() - -class Kern_check_dKdiag_dX(Kern_check_dK_dX): - """This class allows gradient checks for the gradient of a kernel diagonal with respect to X. """ - def __init__(self, kernel=None, dL_dK=None, X=None, X2=None): - Kern_check_dK_dX.__init__(self,kernel=kernel,dL_dK=dL_dK, X=X, X2=None) - if dL_dK==None: - self.dL_dK = np.ones((self.X.shape[0])) - - def log_likelihood(self): - return (self.dL_dK*self.kernel.Kdiag(self.X)).sum() - - def _log_likelihood_gradients(self): - return self.kernel.dKdiag_dX(self.dL_dK, self.X).flatten() - -def kern_test(kern, X=None, X2=None, output_ind=None, verbose=False): - """ - This function runs on kernels to check the correctness of their - implementation. It checks that the covariance function is positive definite - for a randomly generated data set. 
- - :param kern: the kernel to be tested. - :type kern: GPy.kern.Kernpart - :param X: X input values to test the covariance function. - :type X: ndarray - :param X2: X2 input values to test the covariance function. - :type X2: ndarray - - """ - pass_checks = True - if X==None: - X = np.random.randn(10, kern.input_dim) - if output_ind is not None: - X[:, output_ind] = np.random.randint(kern.output_dim, X.shape[0]) - if X2==None: - X2 = np.random.randn(20, kern.input_dim) - if output_ind is not None: - X2[:, output_ind] = np.random.randint(kern.output_dim, X2.shape[0]) - - if verbose: - print("Checking covariance function is positive definite.") - result = Kern_check_model(kern, X=X).is_positive_definite() - if result and verbose: - print("Check passed.") - if not result: - print("Positive definite check failed for " + kern.name + " covariance function.") - pass_checks = False - return False - - if verbose: - print("Checking gradients of K(X, X) wrt theta.") - result = Kern_check_dK_dtheta(kern, X=X, X2=None).checkgrad(verbose=verbose) - if result and verbose: - print("Check passed.") - if not result: - print("Gradient of K(X, X) wrt theta failed for " + kern.name + " covariance function. Gradient values as follows:") - Kern_check_dK_dtheta(kern, X=X, X2=None).checkgrad(verbose=True) - pass_checks = False - return False - - if verbose: - print("Checking gradients of K(X, X2) wrt theta.") - result = Kern_check_dK_dtheta(kern, X=X, X2=X2).checkgrad(verbose=verbose) - if result and verbose: - print("Check passed.") - if not result: - print("Gradient of K(X, X) wrt theta failed for " + kern.name + " covariance function. Gradient values as follows:") - Kern_check_dK_dtheta(kern, X=X, X2=X2).checkgrad(verbose=True) - pass_checks = False - return False - - if verbose: - print("Checking gradients of Kdiag(X) wrt theta.") - result = Kern_check_dKdiag_dtheta(kern, X=X).checkgrad(verbose=verbose) - if result and verbose: - print("Check passed.") - if not result: - print("Gradient of Kdiag(X) wrt theta failed for " + kern.name + " covariance function. Gradient values as follows:") - Kern_check_dKdiag_dtheta(kern, X=X).checkgrad(verbose=True) - pass_checks = False - return False - - if verbose: - print("Checking gradients of K(X, X) wrt X.") - try: - result = Kern_check_dK_dX(kern, X=X, X2=None).checkgrad(verbose=verbose) - except NotImplementedError: - result=True - if verbose: - print("gradients_X not implemented for " + kern.name) - if result and verbose: - print("Check passed.") - if not result: - print("Gradient of K(X, X) wrt X failed for " + kern.name + " covariance function. Gradient values as follows:") - Kern_check_dK_dX(kern, X=X, X2=None).checkgrad(verbose=True) - pass_checks = False - return False - - if verbose: - print("Checking gradients of K(X, X2) wrt X.") - try: - result = Kern_check_dK_dX(kern, X=X, X2=X2).checkgrad(verbose=verbose) - except NotImplementedError: - result=True - if verbose: - print("gradients_X not implemented for " + kern.name) - if result and verbose: - print("Check passed.") - if not result: - print("Gradient of K(X, X) wrt X failed for " + kern.name + " covariance function. 
Gradient values as follows:") - Kern_check_dK_dX(kern, X=X, X2=X2).checkgrad(verbose=True) - pass_checks = False - return False - - if verbose: - print("Checking gradients of Kdiag(X) wrt X.") - try: - result = Kern_check_dKdiag_dX(kern, X=X).checkgrad(verbose=verbose) - except NotImplementedError: - result=True - if verbose: - print("gradients_X not implemented for " + kern.name) - if result and verbose: - print("Check passed.") - if not result: - print("Gradient of Kdiag(X) wrt X failed for " + kern.name + " covariance function. Gradient values as follows:") - Kern_check_dKdiag_dX(kern, X=X).checkgrad(verbose=True) - pass_checks = False - return False - - return pass_checks diff --git a/GPy/kern/kernpart.py b/GPy/kern/kernpart.py deleted file mode 100644 index 06f1446b..00000000 --- a/GPy/kern/kernpart.py +++ /dev/null @@ -1,176 +0,0 @@ -# Copyright (c) 2012, GPy authors (see AUTHORS.txt). -# Licensed under the BSD 3-clause license (see LICENSE.txt) -#from ...core.parameterized.Parameterized import set_as_parameter -from ...core.parameterization import Parameterized - -class Kernpart(Parameterized): - def __init__(self,input_dim,name): - """ - The base class for a kernpart: a positive definite function - which forms part of a covariance function (kernel). - - :param input_dim: the number of input dimensions to the function - :type input_dim: int - - Do not instantiate. - """ - super(Kernpart, self).__init__(name) - # the input dimensionality for the covariance - self.input_dim = input_dim - # the number of optimisable parameters - # the name of the covariance function. - # link to parameterized objects - #self._X = None - - def connect_input(self, X): - X.add_observer(self, self.on_input_change) - #self._X = X - - def on_input_change(self, X): - """ - During optimization this function will be called when - the inputs X changed. Use this to update caches dependent - on the inputs X. - """ - # overwrite this to update kernel when inputs X change - pass - - -# def set_as_parameter_named(self, name, gradient, index=None, *args, **kwargs): -# """ -# :param names: name of parameter to set as parameter -# :param gradient: gradient method to get the gradient of this parameter -# :param index: index of where to place parameter in printing -# :param args, kwargs: additional arguments to gradient -# -# Convenience method to connect Kernpart parameters: -# parameter with name (attribute of this Kernpart) will be set as parameter with following name: -# -# kernel_name + _ + parameter_name -# -# To add the kernels name to the parameter name use this method to -# add parameters. -# """ -# self.set_as_parameter(name, getattr(self, name), gradient, index, *args, **kwargs) -# def set_as_parameter(self, name, array, gradient, index=None, *args, **kwargs): -# """ -# See :py:func:`GPy.core.parameterized.Parameterized.set_as_parameter` -# -# Note: this method adds the kernels name in front of the parameter. 
-# """ -# p = Param(self.name+"_"+name, array, gradient, *args, **kwargs) -# if index is None: -# self._parameters_.append(p) -# else: -# self._parameters_.insert(index, p) -# self.__dict__[name] = p - #set_as_parameter.__doc__ += set_as_parameter.__doc__ # @UndefinedVariable -# def _get_params(self): -# raise NotImplementedError -# def _set_params(self,x): -# raise NotImplementedError -# def _get_param_names(self): -# raise NotImplementedError - def K(self,X,X2,target): - raise NotImplementedError - def Kdiag(self,X,target): - raise NotImplementedError - def _param_grad_helper(self,dL_dK,X,X2,target): - raise NotImplementedError - def dKdiag_dtheta(self,dL_dKdiag,X,target): - # In the base case compute this by calling _param_grad_helper. Need to - # override for stationary covariances (for example) to save - # time. - for i in range(X.shape[0]): - self._param_grad_helper(dL_dKdiag[i], X[i, :][None, :], X2=None, target=target) - def psi0(self,Z,mu,S,target): - raise NotImplementedError - def dpsi0_dtheta(self,dL_dpsi0,Z,mu,S,target): - raise NotImplementedError - def dpsi0_dmuS(self,dL_dpsi0,Z,mu,S,target_mu,target_S): - raise NotImplementedError - def psi1(self,Z,mu,S,target): - raise NotImplementedError - def dpsi1_dtheta(self,Z,mu,S,target): - raise NotImplementedError - def dpsi1_dZ(self,dL_dpsi1,Z,mu,S,target): - raise NotImplementedError - def dpsi1_dmuS(self,dL_dpsi1,Z,mu,S,target_mu,target_S): - raise NotImplementedError - def psi2(self,Z,mu,S,target): - raise NotImplementedError - def dpsi2_dZ(self,dL_dpsi2,Z,mu,S,target): - raise NotImplementedError - def dpsi2_dtheta(self,dL_dpsi2,Z,mu,S,target): - raise NotImplementedError - def dpsi2_dmuS(self,dL_dpsi2,Z,mu,S,target_mu,target_S): - raise NotImplementedError - def gradients_X(self, dL_dK, X, X2, target): - raise NotImplementedError - def dKdiag_dX(self, dL_dK, X, target): - raise NotImplementedError - def update_gradients_full(self, dL_dK, X): - """Set the gradients of all parameters when doing full (N) inference.""" - raise NotImplementedError - def update_gradients_sparse(self, dL_dKmm, dL_dKnm, dL_dKdiag, X, Z): - """Set the gradients of all parameters when doing sparse (M) inference.""" - raise NotImplementedError - def update_gradients_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z): - """Set the gradients of all parameters when doing variational (M) inference with uncertain inputs.""" - raise NotImplementedError - -class Kernpart_stationary(Kernpart): - def __init__(self, input_dim, lengthscale=None, ARD=False): - self.input_dim = input_dim - self.ARD = ARD - if not ARD: - self.num_params = 2 - if lengthscale is not None: - self.lengthscale = np.asarray(lengthscale) - assert self.lengthscale.size == 1, "Only one lengthscale needed for non-ARD kernel" - else: - self.lengthscale = np.ones(1) - else: - self.num_params = self.input_dim + 1 - if lengthscale is not None: - self.lengthscale = np.asarray(lengthscale) - assert self.lengthscale.size == self.input_dim, "bad number of lengthscales" - else: - self.lengthscale = np.ones(self.input_dim) - - # initialize cache - self._Z, self._mu, self._S = np.empty(shape=(3, 1)) - self._X, self._X2, self._parameters_ = np.empty(shape=(3, 1)) - - def _set_params(self, x): - self.lengthscale = x - self.lengthscale2 = np.square(self.lengthscale) - # reset cached results - self._X, self._X2, self._parameters_ = np.empty(shape=(3, 1)) - self._Z, self._mu, self._S = np.empty(shape=(3, 1)) # cached versions of Z,mu,S - - - def dKdiag_dtheta(self, dL_dKdiag, X, target): - # 
For stationary covariances, derivative of diagonal elements - # wrt lengthscale is 0. - target[0] += np.sum(dL_dKdiag) - - def dKdiag_dX(self, dL_dK, X, target): - pass # true for all stationary kernels - - -class Kernpart_inner(Kernpart): - def __init__(self,input_dim): - """ - The base class for a kernpart_inner: a positive definite function which forms part of a kernel that is based on the inner product between inputs. - - :param input_dim: the number of input dimensions to the function - :type input_dim: int - - Do not instantiate. - """ - Kernpart.__init__(self, input_dim) - - # initialize cache - self._Z, self._mu, self._S = np.empty(shape=(3, 1)) - self._X, self._X2, self._parameters_ = np.empty(shape=(3, 1)) diff --git a/GPy/models/mrd.py b/GPy/models/mrd.py index 3e105785..b4f987ea 100644 --- a/GPy/models/mrd.py +++ b/GPy/models/mrd.py @@ -7,7 +7,7 @@ from GPy.util.linalg import PCA import numpy import itertools import pylab -from GPy.kern.kern import kern +from GPy.kern.kern import Kern from GPy.models.bayesian_gplvm import BayesianGPLVM class MRD(Model): @@ -48,11 +48,11 @@ class MRD(Model): # sort out the kernels if kernels is None: kernels = [None] * len(likelihood_or_Y_list) - elif isinstance(kernels, kern): + elif isinstance(kernels, Kern): kernels = [kernels.copy() for i in range(len(likelihood_or_Y_list))] else: assert len(kernels) == len(likelihood_or_Y_list), "need one kernel per output" - assert all([isinstance(k, kern) for k in kernels]), "invalid kernel object detected!" + assert all([isinstance(k, Kern) for k in kernels]), "invalid kernel object detected!" assert not ('kernel' in kw), "pass kernels through `kernels` argument" self.input_dim = input_dim diff --git a/GPy/plotting/matplot_dep/kernel_plots.py b/GPy/plotting/matplot_dep/kernel_plots.py index 19c96bc0..80350475 100644 --- a/GPy/plotting/matplot_dep/kernel_plots.py +++ b/GPy/plotting/matplot_dep/kernel_plots.py @@ -7,7 +7,7 @@ import pylab as pb import Tango from matplotlib.textpath import TextPath from matplotlib.transforms import offset_copy -from ...kern.parts.linear import Linear +from ...kern.linear import Linear def plot_ARD(kernel, fignum=None, ax=None, title='', legend=False): From 493506408ca09dc62e9871b8d3c06019a046fa75 Mon Sep 17 00:00:00 2001 From: James Hensman Date: Wed, 19 Feb 2014 15:01:35 +0000 Subject: [PATCH 06/38] =?UTF-8?q?init=20for=20src=20dir=C2=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- GPy/kern/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/GPy/kern/__init__.py b/GPy/kern/__init__.py index 2098bd76..7760f48f 100644 --- a/GPy/kern/__init__.py +++ b/GPy/kern/__init__.py @@ -1,6 +1,6 @@ -from rbf import RBF -from white import White -from kern import Kern +from _src.rbf import RBF +from _src.white import White +from _src.kern import Kern #import bias #import Brownian #import coregionalize From c4f6b0dbe7e391256a5ae7f729ea649ce48efcf1 Mon Sep 17 00:00:00 2001 From: Max Zwiessele Date: Wed, 19 Feb 2014 15:32:16 +0000 Subject: [PATCH 07/38] copy and missing data --- GPy/core/__init__.py | 4 +- GPy/core/model.py | 6 +- GPy/core/parameterization/param.py | 13 ++- GPy/core/parameterization/parameter_core.py | 82 +++++++++++++++++-- GPy/core/parameterization/parameterized.py | 57 ++----------- GPy/core/sparse_gp.py | 17 ++-- GPy/examples/dimensionality_reduction.py | 11 ++- .../latent_function_inference/__init__.py | 16 +++- .../latent_function_inference/var_dtc.py | 54 +++++++----- GPy/models/mrd.py 
| 16 ++++ 10 files changed, 179 insertions(+), 97 deletions(-) diff --git a/GPy/core/__init__.py b/GPy/core/__init__.py index 839529d6..a42d76ed 100644 --- a/GPy/core/__init__.py +++ b/GPy/core/__init__.py @@ -2,7 +2,9 @@ # Licensed under the BSD 3-clause license (see LICENSE.txt) from model import * -from parameterization.parameterized import * +from parameterization.parameterized import adjust_name_for_printing, Parameterizable +from parameterization.param import Param, ParamConcatenation + from gp import GP from sparse_gp import SparseGP from svigp import SVIGP diff --git a/GPy/core/model.py b/GPy/core/model.py index 55083aaf..c067d51d 100644 --- a/GPy/core/model.py +++ b/GPy/core/model.py @@ -4,12 +4,8 @@ from .. import likelihoods from ..inference import optimization -from ..util.linalg import jitchol from ..util.misc import opt_wrapper from parameterization import Parameterized -from parameterization.parameterized import UNFIXED -from parameterization.domains import _POSITIVE, _REAL -from parameterization.index_operations import ParameterIndexOperations import multiprocessing as mp import numpy as np from numpy.linalg.linalg import LinAlgError @@ -240,7 +236,7 @@ class Model(Parameterized): constrained positive. """ raise DeprecationWarning, 'parameters now have default constraints' - positive_strings = ['variance', 'lengthscale', 'precision', 'kappa', 'sensitivity'] + #positive_strings = ['variance', 'lengthscale', 'precision', 'kappa', 'sensitivity'] # param_names = self._get_param_names() # for s in positive_strings: diff --git a/GPy/core/parameterization/param.py b/GPy/core/parameterization/param.py index f54c0117..49d6682c 100644 --- a/GPy/core/parameterization/param.py +++ b/GPy/core/parameterization/param.py @@ -3,7 +3,7 @@ import itertools import numpy -from parameter_core import Constrainable, Gradcheckable, Indexable, Parameterizable, adjust_name_for_printing +from parameter_core import Constrainable, Gradcheckable, Indexable, Parentable, adjust_name_for_printing from array_core import ObservableArray, ParamList ###### printing @@ -15,7 +15,7 @@ __precision__ = numpy.get_printoptions()['precision'] # numpy printing precision __print_threshold__ = 5 ###### -class Param(ObservableArray, Constrainable, Gradcheckable, Indexable, Parameterizable): +class Param(ObservableArray, Constrainable, Gradcheckable, Indexable, Parentable): """ Parameter object for GPy models. 
@@ -114,7 +114,14 @@ class Param(ObservableArray, Constrainable, Gradcheckable, Indexable, Parameteri self._parent_index_ = state.pop() self._direct_parent_ = state.pop() self.name = state.pop() - + + def copy(self, *args): + constr = self.constraints.copy() + priors = self.priors.copy() + p = Param(self.name, self.view(numpy.ndarray).copy(), self._default_constraint_) + p.constraints = constr + p.priors = priors + return p #=========================================================================== # get/set parameters #=========================================================================== diff --git a/GPy/core/parameterization/parameter_core.py b/GPy/core/parameterization/parameter_core.py index 275198b2..9002adc3 100644 --- a/GPy/core/parameterization/parameter_core.py +++ b/GPy/core/parameterization/parameter_core.py @@ -68,6 +68,10 @@ class Parentable(object): return self return self._direct_parent_._highest_parent_ + def _notify_parameters_changed(self): + if self.has_parent(): + self._direct_parent_._notify_parameters_changed() + class Nameable(Parentable): _name = None def __init__(self, name, direct_parent=None, parent_index=None): @@ -80,22 +84,47 @@ class Nameable(Parentable): @name.setter def name(self, name): from_name = self.name + assert isinstance(name, str) self._name = name if self.has_parent(): - self._direct_parent_._name_changed(self, from_name) - + self._direct_parent_._name_changed(self, from_name) class Parameterizable(Parentable): def __init__(self, *args, **kwargs): super(Parameterizable, self).__init__(*args, **kwargs) from GPy.core.parameterization.array_core import ParamList _parameters_ = ParamList() + self._added_names_ = set() def parameter_names(self, add_name=False): if add_name: return [adjust_name_for_printing(self.name) + "." 
+ xi for x in self._parameters_ for xi in x.parameter_names(add_name=True)] return [xi for x in self._parameters_ for xi in x.parameter_names(add_name=True)] + + def _add_parameter_name(self, param): + pname = adjust_name_for_printing(param.name) + # and makes sure to not delete programmatically added parameters + if pname in self.__dict__: + if not (param is self.__dict__[pname]): + if pname in self._added_names_: + del self.__dict__[pname] + self._add_parameter_name(param) + else: + self.__dict__[pname] = param + self._added_names_.add(pname) + + def _remove_parameter_name(self, param=None, pname=None): + assert param is None or pname is None, "can only delete either param by name, or the name of a param" + pname = adjust_name_for_printing(pname) or adjust_name_for_printing(param.name) + if pname in self._added_names_: + del self.__dict__[pname] + self._added_names_.remove(pname) + self._connect_parameters() + def _name_changed(self, param, old_name): + self._remove_parameter_name(None, old_name) + self._add_parameter_name(param) + def _collect_gradient(self, target): import itertools [p._collect_gradient(target[s]) for p, s in itertools.izip(self._parameters_, self._param_slices_)] @@ -113,6 +142,38 @@ class Parameterizable(Parentable): [p._set_params(params[s], update=update) for p, s in itertools.izip(self._parameters_, self._param_slices_)] self.parameters_changed() + def copy(self): + """Returns a (deep) copy of the current model""" + import copy + from .index_operations import ParameterIndexOperations, ParameterIndexOperationsView + from .array_core import ParamList + dc = dict() + for k, v in self.__dict__.iteritems(): + if k not in ['_direct_parent_', '_parameters_', '_parent_index_'] + self.parameter_names(): + if isinstance(v, (Constrainable, ParameterIndexOperations, ParameterIndexOperationsView)): + dc[k] = v.copy() + else: + dc[k] = copy.deepcopy(v) + if k == '_parameters_': + params = [p.copy() for p in v] + #dc = copy.deepcopy(self.__dict__) + dc['_direct_parent_'] = None + dc['_parent_index_'] = None + dc['_parameters_'] = ParamList() + s = self.__new__(self.__class__) + s.__dict__ = dc + #import ipdb;ipdb.set_trace() + for p in params: + s.add_parameter(p) + #dc._notify_parent_change() + return s + #return copy.deepcopy(self) + + def _notify_parameters_changed(self): + self.parameters_changed() + if self.has_parent(): + self._direct_parent_._notify_parameters_changed() + def parameters_changed(self): """ This method gets called when parameters have changed. @@ -122,11 +183,6 @@ class Parameterizable(Parentable): """ pass - def _notify_parameters_changed(self): - self.parameters_changed() - if self.has_parent(): - self._direct_parent_._notify_parameters_changed() - class Gradcheckable(Parentable): #=========================================================================== @@ -157,7 +213,7 @@ class Indexable(object): """ raise NotImplementedError, "shouldnt happen, raveld index transformation required from non parameterization object?" 
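(Illustrative aside.) The new `copy` implementations above, on `Param` and on `Parameterizable`, aim to return a deep copy in which parameter values, constraints and priors are duplicated and the copy is detached from the original's parent. A hypothetical check of that behaviour, with attribute names taken from the diff rather than from a released API:

    import GPy

    k = GPy.kern.RBF(1)
    k2 = k.copy()                      # independent object: parameters and constraints duplicated

    k2.lengthscale[:] = 5.0            # editing the copy ...
    assert float(k.lengthscale) != float(k2.lengthscale)  # ... leaves the original untouched
    assert k2._direct_parent_ is None  # the copy starts out without a parent
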
-class Constrainable(Nameable, Indexable, Parameterizable): +class Constrainable(Nameable, Indexable, Parentable): def __init__(self, name, default_constraint=None): super(Constrainable,self).__init__(name) self._default_constraint_ = default_constraint @@ -167,6 +223,16 @@ class Constrainable(Nameable, Indexable, Parameterizable): if self._default_constraint_ is not None: self.constrain(self._default_constraint_) + def _disconnect_parent(self, constr=None): + if constr is None: + constr = self.constraints.copy() + self.constraints.clear() + self.constraints = constr + self._direct_parent_ = None + self._parent_index_ = None + self._connect_fixes() + self._notify_parent_change() + #=========================================================================== # Fixing Parameters: #=========================================================================== diff --git a/GPy/core/parameterization/parameterized.py b/GPy/core/parameterization/parameterized.py index c8a841c0..cef1daa2 100644 --- a/GPy/core/parameterization/parameterized.py +++ b/GPy/core/parameterization/parameterized.py @@ -3,16 +3,15 @@ import numpy; np = numpy -import copy import cPickle import itertools from re import compile, _pattern_type -from param import ParamConcatenation, Param -from parameter_core import Constrainable, Pickleable, Observable, adjust_name_for_printing, Gradcheckable -from transformations import __fixed__, FIXED, UNFIXED +from param import ParamConcatenation +from parameter_core import Constrainable, Pickleable, Observable, Parameterizable, adjust_name_for_printing, Gradcheckable +from transformations import __fixed__ from array_core import ParamList -class Parameterized(Constrainable, Pickleable, Observable, Gradcheckable): +class Parameterized(Constrainable, Pickleable, Observable, Gradcheckable, Parameterizable): """ Parameterized class @@ -63,7 +62,6 @@ class Parameterized(Constrainable, Pickleable, Observable, Gradcheckable): self._fixes_ = None self._param_slices_ = [] self._connect_parameters() - self._added_names_ = set() del self._in_init_ def add_parameter(self, param, index=None): @@ -117,17 +115,10 @@ class Parameterized(Constrainable, Pickleable, Observable, Gradcheckable): raise RuntimeError, "Parameter {} does not belong to this object, remove parameters directly from their respective parents".format(param._short()) del self._parameters_[param._parent_index_] self.size -= param.size - constr = param.constraints.copy() - param.constraints.clear() - param.constraints = constr - param._direct_parent_ = None - param._parent_index_ = None - param._connect_fixes() - param._notify_parent_change() - pname = adjust_name_for_printing(param.name) - if pname in self._added_names_: - del self.__dict__[pname] - self._connect_parameters() + + param._disconnect_parent() + self._remove_parameter_name(param) + #self._notify_parent_change() self._connect_fixes() @@ -145,19 +136,9 @@ class Parameterized(Constrainable, Pickleable, Observable, Gradcheckable): for i, p in enumerate(self._parameters_): p._direct_parent_ = self p._parent_index_ = i - not_unique = [] sizes.append(p.size + sizes[-1]) self._param_slices_.append(slice(sizes[-2], sizes[-1])) - pname = adjust_name_for_printing(p.name) - # and makes sure to not delete programmatically added parameters - if pname in self.__dict__: - if isinstance(self.__dict__[pname], (Parameterized, Param)): - if not p is self.__dict__[pname]: - not_unique.append(pname) - del self.__dict__[pname] - elif not (pname in not_unique): - self.__dict__[pname] = p - 
self._added_names_.add(pname) + self._add_parameter_name(p) #=========================================================================== # Pickling operations @@ -174,19 +155,7 @@ class Parameterized(Constrainable, Pickleable, Observable, Gradcheckable): cPickle.dump(self, f, protocol) else: cPickle.dump(self, f, protocol) - def copy(self): - """Returns a (deep) copy of the current model """ - # dc = dict() - # for k, v in self.__dict__.iteritems(): - # if k not in ['_highest_parent_', '_direct_parent_']: - # dc[k] = copy.deepcopy(v) - # dc = copy.deepcopy(self.__dict__) - # dc['_highest_parent_'] = None - # dc['_direct_parent_'] = None - # s = self.__class__.new() - # s.__dict__ = dc - return copy.deepcopy(self) def __getstate__(self): if self._has_get_set_state(): return self._getstate() @@ -265,14 +234,6 @@ class Parameterized(Constrainable, Pickleable, Observable, Gradcheckable): if self._has_fixes(): tmp = self._get_params(); tmp[self._fixes_] = p; p = tmp; del tmp [numpy.put(p, ind, c.f(p[ind])) for c, ind in self.constraints.iteritems() if c != __fixed__] return p - def _name_changed(self, param, old_name): - if hasattr(self, old_name) and old_name in self._added_names_: - delattr(self, old_name) - self._added_names_.remove(old_name) - pname = adjust_name_for_printing(param.name) - if pname not in self.__dict__: - self._added_names_.add(pname) - self.__dict__[pname] = param #=========================================================================== # Indexable Handling #=========================================================================== diff --git a/GPy/core/sparse_gp.py b/GPy/core/sparse_gp.py index edb8d8f6..1d436c53 100644 --- a/GPy/core/sparse_gp.py +++ b/GPy/core/sparse_gp.py @@ -33,12 +33,12 @@ class SparseGP(GP): def __init__(self, X, Y, Z, kernel, likelihood, inference_method=None, X_variance=None, name='sparse gp'): - #pick a sensible inference method + # pick a sensible inference method if inference_method is None: if isinstance(likelihood, likelihoods.Gaussian): inference_method = var_dtc.VarDTC() else: - #inference_method = ?? + # inference_method = ?? raise NotImplementedError, "what to do what to do?" 
print "defaulting to ", inference_method, "for latent function inference" @@ -54,7 +54,7 @@ class SparseGP(GP): self.parameters_changed() def _update_gradients_Z(self, add=False): - #The derivative of the bound wrt the inducing inputs Z ( unless they're all fixed) + # The derivative of the bound wrt the inducing inputs Z ( unless they're all fixed) if not self.Z.is_fixed: if add: self.Z.gradient += self.kern.gradients_X(self.grad_dict['dL_dKmm'], self.Z) else: self.Z.gradient = self.kern.gradients_X(self.grad_dict['dL_dKmm'], self.Z) @@ -77,13 +77,14 @@ class SparseGP(GP): mu = np.dot(Kx.T, self.posterior.woodbury_vector) if full_cov: Kxx = self.kern.K(Xnew, which_parts=which_parts) - var = Kxx - mdot(Kx.T, self.posterior.woodbury_inv, Kx) # NOTE this won't work for plotting + var = Kxx[:,:,None] - np.tensordot(np.dot(np.atleast_3d(self.posterior.woodbury_inv).T, Kx), Kx, [1,0]).T else: - Kxx = self.kern.Kdiag(Xnew, which_parts=which_parts) - var = Kxx - np.sum(Kx * np.dot(self.posterior.woodbury_inv, Kx), 0) + Kxx = self.kern.Kdiag(Xnew, which_parts=which_parts)[:, None] + #import ipdb;ipdb.set_trace() + var = Kxx - (np.dot(np.atleast_3d(self.posterior.woodbury_inv).T, Kx).T * Kx.T[:,:,None]).sum(1) else: # assert which_parts=='all', "swithching out parts of variational kernels is not implemented" - Kx = self.kern.psi1(self.Z, Xnew, X_variance_new) # , which_parts=which_parts) TODO: which_parts + Kx = self.kern.psi1(self.Z, Xnew, X_variance_new) # , which_parts=which_parts) TODO: which_parts mu = np.dot(Kx, self.Cpsi1V) if full_cov: raise NotImplementedError, "TODO" @@ -91,7 +92,7 @@ class SparseGP(GP): Kxx = self.kern.psi0(self.Z, Xnew, X_variance_new) psi2 = self.kern.psi2(self.Z, Xnew, X_variance_new) var = Kxx - np.sum(np.sum(psi2 * Kmmi_LmiBLmi[None, :, :], 1), 1) - return mu, var[:,None] + return mu, var def _getstate(self): diff --git a/GPy/examples/dimensionality_reduction.py b/GPy/examples/dimensionality_reduction.py index a7eb0adb..2924386f 100644 --- a/GPy/examples/dimensionality_reduction.py +++ b/GPy/examples/dimensionality_reduction.py @@ -1,9 +1,9 @@ # Copyright (c) 2012, GPy authors (see AUTHORS.txt). # Licensed under the BSD 3-clause license (see LICENSE.txt) import numpy as _np -default_seed = _np.random.seed(123344) +#default_seed = _np.random.seed(123344) -def bgplvm_test_model(seed=default_seed, optimize=False, verbose=1, plot=False, output_dim=200, nan=False): +def bgplvm_test_model(optimize=False, verbose=1, plot=False, output_dim=200, nan=False): """ model for testing purposes. Samples from a GP with rbf kernel and learns the samples with a new kernel. 
Normally not for optimization, just model cheking @@ -41,7 +41,7 @@ def bgplvm_test_model(seed=default_seed, optimize=False, verbose=1, plot=False, if nan: m.inference_method = GPy.inference.latent_function_inference.var_dtc.VarDTCMissingData() - m.Y[_np.random.binomial(1,p,size=(Y.shape))] = _np.nan + m.Y[_np.random.binomial(1,p,size=(Y.shape)).astype(bool)] = _np.nan m.parameters_changed() #=========================================================================== @@ -186,6 +186,8 @@ def bgplvm_oil(optimize=True, verbose=1, plot=True, N=200, Q=7, num_inducing=40, return m def _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim=False): + _np.random.seed(1234) + x = _np.linspace(0, 4 * _np.pi, N)[:, None] s1 = _np.vectorize(lambda x: _np.sin(x)) s2 = _np.vectorize(lambda x: _np.cos(x)) @@ -293,10 +295,11 @@ def bgplvm_simulation_missing_data(optimize=True, verbose=1, Y = Ylist[0] k = kern.linear(Q, ARD=True)# + kern.white(Q, _np.exp(-2)) # + kern.bias(Q) - inan = _np.random.binomial(1, .3, size=Y.shape) + inan = _np.random.binomial(1, .6, size=Y.shape).astype(bool) m = BayesianGPLVM(Y, Q, init="random", num_inducing=num_inducing, kernel=k) m.inference_method = VarDTCMissingData() m.Y[inan] = _np.nan + m.q.variance *= .1 m.parameters_changed() if optimize: diff --git a/GPy/inference/latent_function_inference/__init__.py b/GPy/inference/latent_function_inference/__init__.py index 337a8477..a633c381 100644 --- a/GPy/inference/latent_function_inference/__init__.py +++ b/GPy/inference/latent_function_inference/__init__.py @@ -16,7 +16,9 @@ If the likelihood object is something other than Gaussian, then exact inference is not tractable. We then resort to a Laplace approximation (laplace.py) or expectation propagation (ep.py). -The inference methods return a "Posterior" instance, which is a simple +The inference methods return a +:class:`~GPy.inference.latent_function_inference.posterior.Posterior` +instance, which is a simple structure which contains a summary of the posterior. The model classes can then use this posterior object for making predictions, optimizing hyper-parameters, etc. @@ -29,3 +31,15 @@ expectation_propagation = 'foo' # TODO from GPy.inference.latent_function_inference.var_dtc import VarDTC from dtc import DTC from fitc import FITC + +# class FullLatentFunctionData(object): +# +# +# class LatentFunctionInference(object): +# def inference(self, kern, X, likelihood, Y, Y_metadata=None): +# """ +# Do inference on the latent functions given a covariance function `kern`, +# inputs and outputs `X` and `Y`, and a likelihood `likelihood`. +# Additional metadata for the outputs `Y` can be given in `Y_metadata`. 
+# """ +# raise NotImplementedError, "Abstract base class for full inference" \ No newline at end of file diff --git a/GPy/inference/latent_function_inference/var_dtc.py b/GPy/inference/latent_function_inference/var_dtc.py index 264f7fc3..2f11cb08 100644 --- a/GPy/inference/latent_function_inference/var_dtc.py +++ b/GPy/inference/latent_function_inference/var_dtc.py @@ -139,7 +139,8 @@ class VarDTCMissingData(object): dL_dpsi2_all = np.zeros((X.shape[0], num_inducing, num_inducing)) partial_for_likelihood = 0 - LB_all = Cpsi1Vf_all = 0 + woodbury_vector = np.zeros((num_inducing, Y.shape[1])) + woodbury_inv_all = np.zeros((num_inducing, num_inducing, Y.shape[1])) dL_dKmm = 0 log_marginal = 0 @@ -153,6 +154,8 @@ class VarDTCMissingData(object): VVT_factor_all = np.empty(Y.shape) full_VVT_factor = VVT_factor_all.shape[1] == Y.shape[1] + if not full_VVT_factor: + psi1V = np.dot(Y.T*beta_all, psi1_all).T for y, trYYT, [v, ind] in itertools.izip(Ys, traces, self._subarray_indices): if het_noise: beta = beta_all[ind] @@ -185,8 +188,7 @@ class VarDTCMissingData(object): psi1Vf = psi1.T.dot(VVT_factor) _LBi_Lmi_psi1Vf, Cpsi1Vf = _compute_psi1Vf(Lm, LB, psi1Vf) - if full_VVT_factor: Cpsi1Vf_all += Cpsi1Vf - LB_all += LB + #LB_all[ind, :,:] = LB # data fit and derivative of L w.r.t. Kmm delit = tdot(_LBi_Lmi_psi1Vf) @@ -219,6 +221,21 @@ class VarDTCMissingData(object): psi0, psi1, beta, data_fit, num_data, output_dim, trYYT) + if full_VVT_factor: woodbury_vector[:, ind] = Cpsi1Vf + else: + print 'foobar' + tmp, _ = dtrtrs(Lm, psi1V, lower=1, trans=0) + tmp, _ = dpotrs(LB, tmp, lower=1) + woodbury_vector[:, ind] = dtrtrs(Lm, tmp, lower=1, trans=1)[0] + + #import ipdb;ipdb.set_trace() + Bi, _ = dpotri(LB, lower=1) + symmetrify(Bi) + Bi = -dpotri(LB, lower=1)[0] + from ...util import diag + diag.add(Bi, 1) + woodbury_inv_all[:, :, ind] = backsub_both_sides(Lm, Bi)[:,:,None] + # gradients: likelihood.update_gradients(partial_for_likelihood) @@ -231,23 +248,22 @@ class VarDTCMissingData(object): #get sufficient things for posterior prediction #TODO: do we really want to do this in the loop? 
- if full_VVT_factor: - woodbury_vector = Cpsi1Vf_all # == Cpsi1V - else: - print 'foobar' - psi1V = np.dot(Y.T*beta_all, psi1_all).T - tmp, _ = dtrtrs(Lm, psi1V, lower=1, trans=0) - tmp, _ = dpotrs(LB_all, tmp, lower=1) - woodbury_vector, _ = dtrtrs(Lm, tmp, lower=1, trans=1) - - Bi, _ = dpotri(LB_all, lower=1) - symmetrify(Bi) - Bi = -dpotri(LB_all, lower=1)[0] - from ...util import diag - diag.add(Bi, 1) + #if not full_VVT_factor: + # print 'foobar' + # psi1V = np.dot(Y.T*beta_all, psi1_all).T + # tmp, _ = dtrtrs(Lm, psi1V, lower=1, trans=0) + # tmp, _ = dpotrs(LB_all, tmp, lower=1) + # woodbury_vector, _ = dtrtrs(Lm, tmp, lower=1, trans=1) + #import ipdb;ipdb.set_trace() + #Bi, _ = dpotri(LB_all, lower=1) + #symmetrify(Bi) + #Bi = -dpotri(LB_all, lower=1)[0] + #from ...util import diag + #diag.add(Bi, 1) - woodbury_inv = backsub_both_sides(Lm, Bi) - post = Posterior(woodbury_inv=woodbury_inv, woodbury_vector=woodbury_vector, K=Kmm, mean=None, cov=None, K_chol=Lm) + #woodbury_inv = backsub_both_sides(Lm, Bi) + + post = Posterior(woodbury_inv=woodbury_inv_all, woodbury_vector=woodbury_vector, K=Kmm, mean=None, cov=None, K_chol=Lm) return post, log_marginal, grad_dict diff --git a/GPy/models/mrd.py b/GPy/models/mrd.py index 3e105785..511ce5aa 100644 --- a/GPy/models/mrd.py +++ b/GPy/models/mrd.py @@ -10,6 +10,22 @@ import pylab from GPy.kern.kern import kern from GPy.models.bayesian_gplvm import BayesianGPLVM +class MRD2(Model): + """ + Apply MRD to all given datasets Y in Ylist. + + Y_i in [n x p_i] + + The samples n in the datasets need + to match up, whereas the dimensionality p_d can differ. + + :param [array-like] Ylist: List of datasets to apply MRD on + :param array-like q_mean: mean of starting latent space q in [n x q] + :param array-like q_variance: variance of starting latent space q in [n x q] + :param :class:`~GPy.inference.latent_function_inference + """ + + class MRD(Model): """ Do MRD on given Datasets in Ylist. 
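The VarDTCMissingData and SparseGP changes above replace the single shared Woodbury matrix with one Woodbury inverse per output column, so the predictive variance is computed for all outputs at once. A minimal NumPy sketch of that per-output computation follows (illustrative only, not part of the patches; the names M, N, P, woodbury_inv, Kx and Kxx_diag are assumptions standing in for num_inducing, num_test, output_dim, posterior.woodbury_inv, kern.K(Z, Xnew) and kern.Kdiag(Xnew)):

    import numpy as np

    M, N, P = 5, 7, 3                                # inducing points, test points, outputs
    rng = np.random.RandomState(0)
    A = rng.randn(P, M, M)
    woodbury_inv = np.einsum('pij,pkj->ikp', A, A)   # (M, M, P): one SPD matrix per output
    Kx = rng.randn(M, N)                             # stands in for kern.K(Z, Xnew)
    Kxx_diag = rng.rand(N) + 1.0                     # stands in for kern.Kdiag(Xnew)

    # per-output loop: var[:, p] = Kxx - diag(Kx.T  W_p  Kx)
    var_loop = np.array([Kxx_diag - np.sum(Kx * woodbury_inv[:, :, p].dot(Kx), 0)
                         for p in range(P)]).T

    # batched form matching SparseGP._raw_predict after the patch above
    var_batch = Kxx_diag[:, None] - (np.dot(np.atleast_3d(woodbury_inv).T, Kx).T
                                     * Kx.T[:, :, None]).sum(1)

    assert np.allclose(var_loop, var_batch)          # both give an (N, P) variance array

Both forms give the same (N, P) array; the batched version simply avoids looping over the output dimension inside _raw_predict.
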
From fd0dd8df85a3045bba76b8567ecdcd96d2b583c7 Mon Sep 17 00:00:00 2001 From: Max Zwiessele Date: Wed, 19 Feb 2014 15:50:13 +0000 Subject: [PATCH 08/38] updated naming to be consistent --- GPy/core/parameterization/param.py | 7 +------ GPy/core/parameterization/parameter_core.py | 4 ++++ GPy/core/parameterization/parameterized.py | 20 ++++++-------------- 3 files changed, 11 insertions(+), 20 deletions(-) diff --git a/GPy/core/parameterization/param.py b/GPy/core/parameterization/param.py index 49d6682c..e49dbe2e 100644 --- a/GPy/core/parameterization/param.py +++ b/GPy/core/parameterization/param.py @@ -238,11 +238,6 @@ class Param(ObservableArray, Constrainable, Gradcheckable, Indexable, Parentable @property def _ties_str(self): return [t._short() for t in self._tied_to_] or [''] - @property - def name_hirarchical(self): - if self.has_parent(): - return self._direct_parent_.hirarchy_name() + adjust_name_for_printing(self.name) - return adjust_name_for_printing(self.name) def __repr__(self, *args, **kwargs): name = "\033[1m{x:s}\033[0;0m:\n".format( x=self.name_hirarchical) @@ -284,7 +279,7 @@ class Param(ObservableArray, Constrainable, Gradcheckable, Indexable, Parentable return reduce(lambda a, b:max(a, len(str(b))), ind, len(__index_name__)) def _short(self): # short string to print - name = self._direct_parent_.hirarchy_name() + adjust_name_for_printing(self.name) + name = self.hirarchy_name() if self._realsize_ < 2: return name ind = self._indices() diff --git a/GPy/core/parameterization/parameter_core.py b/GPy/core/parameterization/parameter_core.py index 9002adc3..2b2283c2 100644 --- a/GPy/core/parameterization/parameter_core.py +++ b/GPy/core/parameterization/parameter_core.py @@ -88,6 +88,10 @@ class Nameable(Parentable): self._name = name if self.has_parent(): self._direct_parent_._name_changed(self, from_name) + def hirarchy_name(self): + if self.has_parent(): + return self._direct_parent_.hirarchy_name() + "." 
+ adjust_name_for_printing(self.name) + return adjust_name_for_printing(self.name) class Parameterizable(Parentable): def __init__(self, *args, **kwargs): diff --git a/GPy/core/parameterization/parameterized.py b/GPy/core/parameterization/parameterized.py index cef1daa2..f510d330 100644 --- a/GPy/core/parameterization/parameterized.py +++ b/GPy/core/parameterization/parameterized.py @@ -7,11 +7,11 @@ import cPickle import itertools from re import compile, _pattern_type from param import ParamConcatenation -from parameter_core import Constrainable, Pickleable, Observable, Parameterizable, adjust_name_for_printing, Gradcheckable +from parameter_core import Constrainable, Pickleable, Observable, Parameterizable, Parentable, adjust_name_for_printing, Gradcheckable from transformations import __fixed__ from array_core import ParamList -class Parameterized(Constrainable, Pickleable, Observable, Gradcheckable, Parameterizable): +class Parameterized(Constrainable, Pickleable, Observable, Gradcheckable, Parameterizable, Parentable): """ Parameterized class @@ -212,7 +212,7 @@ class Parameterized(Constrainable, Pickleable, Observable, Gradcheckable, Parame # Optimization handles: #=========================================================================== def _get_param_names(self): - n = numpy.array([p.name_hirarchical + '[' + str(i) + ']' for p in self.flattened_parameters for i in p._indices()]) + n = numpy.array([p.hirarchy_name() + '[' + str(i) + ']' for p in self.flattened_parameters for i in p._indices()]) return n def _get_param_names_transformed(self): n = self._get_param_names() @@ -296,10 +296,6 @@ class Parameterized(Constrainable, Pickleable, Observable, Gradcheckable, Parame # you can retrieve the original param through this method, by passing # the copy here return self._parameters_[param._parent_index_] - def hirarchy_name(self): - if self.has_parent(): - return self._direct_parent_.hirarchy_name() + adjust_name_for_printing(self.name) + "." 
- return '' #=========================================================================== # Get/set parameters: #=========================================================================== @@ -309,8 +305,8 @@ class Parameterized(Constrainable, Pickleable, Observable, Gradcheckable, Parame """ if not isinstance(regexp, _pattern_type): regexp = compile(regexp) found_params = [] - for p in self._parameters_: - if regexp.match(p.name) is not None: + for p in self.flattened_parameters: + if regexp.match(p.hirarchy_name()) is not None: found_params.append(p) if isinstance(p, Parameterized): found_params.extend(p.grep_param_names(regexp)) @@ -352,11 +348,7 @@ class Parameterized(Constrainable, Pickleable, Observable, Gradcheckable, Parame # Printing: #=========================================================================== def _short(self): - # short string to print - if self.has_parent(): - return self._direct_parent_.hirarchy_name() + adjust_name_for_printing(self.name) - else: - return adjust_name_for_printing(self.name) + return self.hirarchy_name() @property def flattened_parameters(self): return [xi for x in self._parameters_ for xi in x.flattened_parameters] From 1c3fe0c51e9a6741f3ca25496cf24471aaf37686 Mon Sep 17 00:00:00 2001 From: Max Zwiessele Date: Wed, 19 Feb 2014 16:54:25 +0000 Subject: [PATCH 09/38] regexp now on all parameters --- GPy/core/parameterization/param.py | 12 +++++---- GPy/core/parameterization/parameter_core.py | 19 ++++++++------ GPy/core/parameterization/parameterized.py | 28 ++++++--------------- 3 files changed, 27 insertions(+), 32 deletions(-) diff --git a/GPy/core/parameterization/param.py b/GPy/core/parameterization/param.py index e49dbe2e..75d9faf2 100644 --- a/GPy/core/parameterization/param.py +++ b/GPy/core/parameterization/param.py @@ -221,7 +221,9 @@ class Param(ObservableArray, Constrainable, Gradcheckable, Indexable, Parentable def _description_str(self): if self.size <= 1: return ["%f" % self] else: return [str(self.shape)] - def parameter_names(self, add_name=False): + def parameter_names(self, add_self=False, adjust_for_printing=False): + if adjust_for_printing: + return [adjust_name_for_printing(self.name)] return [self.name] @property def flattened_parameters(self): @@ -240,7 +242,7 @@ class Param(ObservableArray, Constrainable, Gradcheckable, Indexable, Parentable return [t._short() for t in self._tied_to_] or [''] def __repr__(self, *args, **kwargs): name = "\033[1m{x:s}\033[0;0m:\n".format( - x=self.name_hirarchical) + x=self.hirarchy_name()) return name + super(Param, self).__repr__(*args, **kwargs) def _ties_for(self, rav_index): # size = sum(p.size for p in self._tied_to_) @@ -274,7 +276,7 @@ class Param(ObservableArray, Constrainable, Gradcheckable, Indexable, Parentable gen = map(lambda x: " ".join(map(str, x)), gen) return reduce(lambda a, b:max(a, len(b)), gen, len(header)) def _max_len_values(self): - return reduce(lambda a, b:max(a, len("{x:=.{0}g}".format(__precision__, x=b))), self.flat, len(self.name_hirarchical)) + return reduce(lambda a, b:max(a, len("{x:=.{0}g}".format(__precision__, x=b))), self.flat, len(self.hirarchy_name())) def _max_len_index(self, ind): return reduce(lambda a, b:max(a, len(str(b))), ind, len(__index_name__)) def _short(self): @@ -302,8 +304,8 @@ class Param(ObservableArray, Constrainable, Gradcheckable, Indexable, Parentable if lp is None: lp = self._max_len_names(prirs, __tie_name__) sep = '-' header_format = " {i:{5}^{2}s} | \033[1m{x:{5}^{1}s}\033[0;0m | {c:{5}^{0}s} | {p:{5}^{4}s} | {t:{5}^{3}s}" - if 
only_name: header = header_format.format(lc, lx, li, lt, lp, ' ', x=self.name_hirarchical, c=sep*lc, i=sep*li, t=sep*lt, p=sep*lp) # nice header for printing - else: header = header_format.format(lc, lx, li, lt, lp, ' ', x=self.name_hirarchical, c=__constraints_name__, i=__index_name__, t=__tie_name__, p=__priors_name__) # nice header for printing + if only_name: header = header_format.format(lc, lx, li, lt, lp, ' ', x=self.hirarchy_name(), c=sep*lc, i=sep*li, t=sep*lt, p=sep*lp) # nice header for printing + else: header = header_format.format(lc, lx, li, lt, lp, ' ', x=self.hirarchy_name(), c=__constraints_name__, i=__index_name__, t=__tie_name__, p=__priors_name__) # nice header for printing if not ties: ties = itertools.cycle(['']) return "\n".join([header] + [" {i!s:^{3}s} | {x: >{1}.{2}g} | {c:^{0}s} | {p:^{5}s} | {t:^{4}s} ".format(lc, lx, __precision__, li, lt, lp, x=x, c=" ".join(map(str, c)), p=" ".join(map(str, p)), t=(t or ''), i=i) for i, x, c, t, p in itertools.izip(indices, vals, constr_matrix, ties, prirs)]) # return all the constraints with right indices # except: return super(Param, self).__str__() diff --git a/GPy/core/parameterization/parameter_core.py b/GPy/core/parameterization/parameter_core.py index 2b2283c2..9a10f317 100644 --- a/GPy/core/parameterization/parameter_core.py +++ b/GPy/core/parameterization/parameter_core.py @@ -88,10 +88,12 @@ class Nameable(Parentable): self._name = name if self.has_parent(): self._direct_parent_._name_changed(self, from_name) - def hirarchy_name(self): + def hirarchy_name(self, adjust_for_printing=True): + if adjust_for_printing: adjust = lambda x: adjust_name_for_printing(x) + else: adjust = lambda x: x if self.has_parent(): - return self._direct_parent_.hirarchy_name() + "." + adjust_name_for_printing(self.name) - return adjust_name_for_printing(self.name) + return self._direct_parent_.hirarchy_name() + "." + adjust(self.name) + return adjust(self.name) class Parameterizable(Parentable): def __init__(self, *args, **kwargs): @@ -100,10 +102,13 @@ class Parameterizable(Parentable): _parameters_ = ParamList() self._added_names_ = set() - def parameter_names(self, add_name=False): - if add_name: - return [adjust_name_for_printing(self.name) + "." + xi for x in self._parameters_ for xi in x.parameter_names(add_name=True)] - return [xi for x in self._parameters_ for xi in x.parameter_names(add_name=True)] + def parameter_names(self, add_self=False, adjust_for_printing=False, recursive=True): + if adjust_for_printing: adjust = lambda x: adjust_name_for_printing(x) + else: adjust = lambda x: x + if recursive: names = [xi for x in self._parameters_ for xi in x.parameter_names(add_self=True, adjust_for_printing=adjust_for_printing)] + else: names = [adjust(x.name) for x in self._parameters_] + if add_self: names = map(lambda x: adjust(self.name) + "." 
+ x, names) + return names def _add_parameter_name(self, param): pname = adjust_name_for_printing(param.name) diff --git a/GPy/core/parameterization/parameterized.py b/GPy/core/parameterization/parameterized.py index f510d330..12bf936c 100644 --- a/GPy/core/parameterization/parameterized.py +++ b/GPy/core/parameterization/parameterized.py @@ -305,13 +305,11 @@ class Parameterized(Constrainable, Pickleable, Observable, Gradcheckable, Parame """ if not isinstance(regexp, _pattern_type): regexp = compile(regexp) found_params = [] - for p in self.flattened_parameters: - if regexp.match(p.hirarchy_name()) is not None: + for n, p in itertools.izip(self.parameter_names(False, False, True), self.flattened_parameters): + if regexp.match(n) is not None: found_params.append(p) - if isinstance(p, Parameterized): - found_params.extend(p.grep_param_names(regexp)) return found_params - return [param for param in self._parameters_ if regexp.match(param.name) is not None] + def __getitem__(self, name, paramlist=None): if paramlist is None: paramlist = self.grep_param_names(name) @@ -323,26 +321,16 @@ class Parameterized(Constrainable, Pickleable, Observable, Gradcheckable, Parame return ParamConcatenation(paramlist) return paramlist[-1] return ParamConcatenation(paramlist) + def __setitem__(self, name, value, paramlist=None): try: param = self.__getitem__(name, paramlist) except AttributeError as a: raise a param[:] = value -# def __getattr__(self, name): -# return self.__getitem__(name) -# def __getattribute__(self, name): -# #try: -# return object.__getattribute__(self, name) - # except AttributeError: - # _, a, tb = sys.exc_info() - # try: - # return self.__getitem__(name) - # except AttributeError: - # raise AttributeError, a.message, tb def __setattr__(self, name, val): - # override the default behaviour, if setting a param, so broadcasting can by used - if hasattr(self, "_parameters_"): - paramlist = self.grep_param_names(name) - if len(paramlist) == 1: self.__setitem__(name, val, paramlist); return + # override the default behaviour, if setting a param, so broadcasting can by used + if hasattr(self, '_parameters_'): + pnames = self.parameter_names(False, adjust_for_printing=True, recursive=False) + if name in pnames: self._parameters_[pnames.index(name)][:] = val; return object.__setattr__(self, name, val); #=========================================================================== # Printing: From 92d71384b77aca1a5a2190ce2062624670fea9a8 Mon Sep 17 00:00:00 2001 From: James Hensman Date: Wed, 19 Feb 2014 17:37:18 +0000 Subject: [PATCH 10/38] deleted kernpart, prod and add seem to work okay. 
--- GPy/core/gp.py | 26 ++++----- GPy/examples/regression.py | 38 ++++++------ GPy/kern/__init__.py | 1 + GPy/kern/_src/add.py | 70 ++++------------------ GPy/kern/_src/coregionalize.py | 12 ++-- GPy/kern/_src/kern.py | 4 +- GPy/kern/_src/kernpart.py | 60 ------------------- GPy/kern/_src/linear.py | 8 +-- GPy/kern/_src/prod.py | 74 +++++++----------------- GPy/kern/_src/rbf.py | 8 +-- GPy/kern/_src/white.py | 6 +- GPy/models/gp_regression.py | 2 +- GPy/models/mrd.py | 2 +- GPy/plotting/matplot_dep/kernel_plots.py | 2 +- GPy/plotting/matplot_dep/models_plots.py | 16 +++-- GPy/util/datasets.py | 4 +- 16 files changed, 95 insertions(+), 238 deletions(-) delete mode 100644 GPy/kern/_src/kernpart.py diff --git a/GPy/core/gp.py b/GPy/core/gp.py index 10ba8e6b..2dcf0e14 100644 --- a/GPy/core/gp.py +++ b/GPy/core/gp.py @@ -70,7 +70,7 @@ class GP(Model): def log_likelihood(self): return self._log_marginal_likelihood - def _raw_predict(self, _Xnew, which_parts='all', full_cov=False, stop=False): + def _raw_predict(self, _Xnew, full_cov=False): """ Internal helper function for making predictions, does not account for normalization or likelihood @@ -80,29 +80,27 @@ class GP(Model): diagonal of the covariance is returned. """ - Kx = self.kern.K(_Xnew, self.X, which_parts=which_parts).T + Kx = self.kern.K(_Xnew, self.X).T #LiKx, _ = dtrtrs(self.posterior.woodbury_chol, np.asfortranarray(Kx), lower=1) WiKx = np.dot(self.posterior.woodbury_inv, Kx) mu = np.dot(Kx.T, self.posterior.woodbury_vector) if full_cov: - Kxx = self.kern.K(_Xnew, which_parts=which_parts) + Kxx = self.kern.K(_Xnew) #var = Kxx - tdot(LiKx.T) var = np.dot(Kx.T, WiKx) else: - Kxx = self.kern.Kdiag(_Xnew, which_parts=which_parts) + Kxx = self.kern.Kdiag(_Xnew) #var = Kxx - np.sum(LiKx*LiKx, 0) var = Kxx - np.sum(WiKx*Kx, 0) var = var.reshape(-1, 1) return mu, var - def predict(self, Xnew, which_parts='all', full_cov=False, **likelihood_args): + def predict(self, Xnew, full_cov=False, **likelihood_args): """ Predict the function(s) at the new point(s) Xnew. :param Xnew: The points at which to make a prediction :type Xnew: np.ndarray, Nnew x self.input_dim - :param which_parts: specifies which outputs kernel(s) to use in prediction - :type which_parts: ('all', list of bools) :param full_cov: whether to return the full covariance matrix, or just the diagonal :type full_cov: bool @@ -118,13 +116,13 @@ class GP(Model): """ #predict the latent function values - mu, var = self._raw_predict(Xnew, full_cov=full_cov, which_parts=which_parts) + mu, var = self._raw_predict(Xnew, full_cov=full_cov) # now push through likelihood mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov, **likelihood_args) return mean, var, _025pm, _975pm - def posterior_samples_f(self,X,size=10,which_parts='all',full_cov=True): + def posterior_samples_f(self,X,size=10, full_cov=True): """ Samples the posterior GP at the points X. @@ -132,13 +130,11 @@ class GP(Model): :type X: np.ndarray, Nnew x self.input_dim. :param size: the number of a posteriori samples. :type size: int. - :param which_parts: which of the kernel functions to use (additively). - :type which_parts: 'all', or list of bools. :param full_cov: whether to return the full covariance matrix, or just the diagonal. :type full_cov: bool. :returns: Ysim: set of simulations, a Numpy array (N x samples). 
""" - m, v = self._raw_predict(X, which_parts=which_parts, full_cov=full_cov) + m, v = self._raw_predict(X, full_cov=full_cov) v = v.reshape(m.size,-1) if len(v.shape)==3 else v if not full_cov: Ysim = np.random.multivariate_normal(m.flatten(), np.diag(v.flatten()), size).T @@ -147,7 +143,7 @@ class GP(Model): return Ysim - def posterior_samples(self,X,size=10,which_parts='all',full_cov=True,noise_model=None): + def posterior_samples(self,X,size=10, full_cov=True,noise_model=None): """ Samples the posterior GP at the points X. @@ -155,15 +151,13 @@ class GP(Model): :type X: np.ndarray, Nnew x self.input_dim. :param size: the number of a posteriori samples. :type size: int. - :param which_parts: which of the kernel functions to use (additively). - :type which_parts: 'all', or list of bools. :param full_cov: whether to return the full covariance matrix, or just the diagonal. :type full_cov: bool. :param noise_model: for mixed noise likelihood, the noise model to use in the samples. :type noise_model: integer. :returns: Ysim: set of simulations, a Numpy array (N x samples). """ - Ysim = self.posterior_samples_f(X, size, which_parts=which_parts, full_cov=full_cov) + Ysim = self.posterior_samples_f(X, size, full_cov=full_cov) if isinstance(self.likelihood, Gaussian): noise_std = np.sqrt(self.likelihood._get_params()) Ysim += np.random.normal(0,noise_std,Ysim.shape) diff --git a/GPy/examples/regression.py b/GPy/examples/regression.py index 55567051..5cac1857 100644 --- a/GPy/examples/regression.py +++ b/GPy/examples/regression.py @@ -41,7 +41,7 @@ def coregionalization_toy2(optimize=True, plot=True): Y = np.vstack((Y1, Y2)) #build the kernel - k1 = GPy.kern.rbf(1) + GPy.kern.bias(1) + k1 = GPy.kern.RBF(1) + GPy.kern.bias(1) k2 = GPy.kern.coregionalize(2,1) k = k1**k2 m = GPy.models.GPRegression(X, Y, kernel=k) @@ -68,7 +68,7 @@ def coregionalization_toy2(optimize=True, plot=True): # Y2 = -np.sin(X2) + np.random.randn(*X2.shape) * 0.05 # Y = np.vstack((Y1, Y2)) # -# k1 = GPy.kern.rbf(1) +# k1 = GPy.kern.RBF(1) # m = GPy.models.GPMultioutputRegression(X_list=[X1,X2],Y_list=[Y1,Y2],kernel_list=[k1]) # m.constrain_fixed('.*rbf_var', 1.) 
# m.optimize(max_iters=100) @@ -127,7 +127,7 @@ def epomeo_gpx(max_iters=200, optimize=True, plot=True): Z = np.hstack((np.linspace(t[:,0].min(), t[:, 0].max(), num_inducing)[:, None], np.random.randint(0, 4, num_inducing)[:, None])) - k1 = GPy.kern.rbf(1) + k1 = GPy.kern.RBF(1) k2 = GPy.kern.coregionalize(output_dim=5, rank=5) k = k1**k2 @@ -156,7 +156,7 @@ def multiple_optima(gene_number=937, resolution=80, model_restarts=10, seed=1000 data['Y'] = data['Y'] - np.mean(data['Y']) - lls = GPy.examples.regression._contour_data(data, length_scales, log_SNRs, GPy.kern.rbf) + lls = GPy.examples.regression._contour_data(data, length_scales, log_SNRs, GPy.kern.RBF) if plot: pb.contour(length_scales, log_SNRs, np.exp(lls), 20, cmap=pb.cm.jet) ax = pb.gca() @@ -172,8 +172,8 @@ def multiple_optima(gene_number=937, resolution=80, model_restarts=10, seed=1000 optim_point_y = np.empty(2) np.random.seed(seed=seed) for i in range(0, model_restarts): - # kern = GPy.kern.rbf(1, variance=np.random.exponential(1.), lengthscale=np.random.exponential(50.)) - kern = GPy.kern.rbf(1, variance=np.random.uniform(1e-3, 1), lengthscale=np.random.uniform(5, 50)) + # kern = GPy.kern.RBF(1, variance=np.random.exponential(1.), lengthscale=np.random.exponential(50.)) + kern = GPy.kern.RBF(1, variance=np.random.uniform(1e-3, 1), lengthscale=np.random.uniform(5, 50)) m = GPy.models.GPRegression(data['X'], data['Y'], kernel=kern) m['noise_variance'] = np.random.uniform(1e-3, 1) @@ -196,7 +196,7 @@ def multiple_optima(gene_number=937, resolution=80, model_restarts=10, seed=1000 ax.set_ylim(ylim) return m # (models, lls) -def _contour_data(data, length_scales, log_SNRs, kernel_call=GPy.kern.rbf): +def _contour_data(data, length_scales, log_SNRs, kernel_call=GPy.kern.RBF): """ Evaluate the GP objective function for a given data set for a range of signal to noise ratios and a range of lengthscales. 
@@ -278,10 +278,10 @@ def toy_poisson_rbf_1d_laplace(optimize=True, plot=True): optimizer='scg' x_len = 30 X = np.linspace(0, 10, x_len)[:, None] - f_true = np.random.multivariate_normal(np.zeros(x_len), GPy.kern.rbf(1).K(X)) + f_true = np.random.multivariate_normal(np.zeros(x_len), GPy.kern.RBF(1).K(X)) Y = np.array([np.random.poisson(np.exp(f)) for f in f_true])[:,None] - kern = GPy.kern.rbf(1) + kern = GPy.kern.RBF(1) poisson_lik = GPy.likelihoods.Poisson() laplace_inf = GPy.inference.latent_function_inference.LaplaceInference() @@ -319,10 +319,10 @@ def toy_ARD(max_iters=1000, kernel_type='linear', num_samples=300, D=4, optimize if kernel_type == 'linear': kernel = GPy.kern.linear(X.shape[1], ARD=1) elif kernel_type == 'rbf_inv': - kernel = GPy.kern.rbf_inv(X.shape[1], ARD=1) + kernel = GPy.kern.RBF_inv(X.shape[1], ARD=1) else: - kernel = GPy.kern.rbf(X.shape[1], ARD=1) - kernel += GPy.kern.white(X.shape[1]) + GPy.kern.bias(X.shape[1]) + kernel = GPy.kern.RBF(X.shape[1], ARD=1) + kernel += GPy.kern.White(X.shape[1]) + GPy.kern.bias(X.shape[1]) m = GPy.models.GPRegression(X, Y, kernel) # len_prior = GPy.priors.inverse_gamma(1,18) # 1, 25 # m.set_prior('.*lengthscale',len_prior) @@ -358,9 +358,9 @@ def toy_ARD_sparse(max_iters=1000, kernel_type='linear', num_samples=300, D=4, o if kernel_type == 'linear': kernel = GPy.kern.linear(X.shape[1], ARD=1) elif kernel_type == 'rbf_inv': - kernel = GPy.kern.rbf_inv(X.shape[1], ARD=1) + kernel = GPy.kern.RBF_inv(X.shape[1], ARD=1) else: - kernel = GPy.kern.rbf(X.shape[1], ARD=1) + kernel = GPy.kern.RBF(X.shape[1], ARD=1) #kernel += GPy.kern.bias(X.shape[1]) X_variance = np.ones(X.shape) * 0.5 m = GPy.models.SparseGPRegression(X, Y, kernel, X_variance=X_variance) @@ -421,7 +421,7 @@ def sparse_GP_regression_1D(num_samples=400, num_inducing=5, max_iters=100, opti X = np.random.uniform(-3., 3., (num_samples, 1)) Y = np.sin(X) + np.random.randn(num_samples, 1) * 0.05 # construct kernel - rbf = GPy.kern.rbf(1) + rbf = GPy.kern.RBF(1) # create simple GP Model m = GPy.models.SparseGPRegression(X, Y, kernel=rbf, num_inducing=num_inducing) m.checkgrad(verbose=1) @@ -444,7 +444,7 @@ def sparse_GP_regression_2D(num_samples=400, num_inducing=50, max_iters=100, opt Y[inan] = np.nan # construct kernel - rbf = GPy.kern.rbf(2) + rbf = GPy.kern.RBF(2) # create simple GP Model m = GPy.models.SparseGPRegression(X, Y, kernel=rbf, num_inducing=num_inducing) @@ -476,9 +476,9 @@ def uncertain_inputs_sparse_regression(max_iters=200, optimize=True, plot=True): # likelihood = GPy.likelihoods.Gaussian(Y) Z = np.random.uniform(-3., 3., (7, 1)) - k = GPy.kern.rbf(1) + k = GPy.kern.RBF(1) # create simple GP Model - no input uncertainty on this one - m = GPy.models.SparseGPRegression(X, Y, kernel=GPy.kern.rbf(1), Z=Z) + m = GPy.models.SparseGPRegression(X, Y, kernel=GPy.kern.RBF(1), Z=Z) if optimize: m.optimize('scg', messages=1, max_iters=max_iters) @@ -489,7 +489,7 @@ def uncertain_inputs_sparse_regression(max_iters=200, optimize=True, plot=True): print m # the same Model with uncertainty - m = GPy.models.SparseGPRegression(X, Y, kernel=GPy.kern.rbf(1), Z=Z, X_variance=S) + m = GPy.models.SparseGPRegression(X, Y, kernel=GPy.kern.RBF(1), Z=Z, X_variance=S) if optimize: m.optimize('scg', messages=1, max_iters=max_iters) if plot: diff --git a/GPy/kern/__init__.py b/GPy/kern/__init__.py index 7760f48f..214e230f 100644 --- a/GPy/kern/__init__.py +++ b/GPy/kern/__init__.py @@ -1,6 +1,7 @@ from _src.rbf import RBF from _src.white import White from _src.kern import Kern +Linear = 
'foo' #import bias #import Brownian #import coregionalize diff --git a/GPy/kern/_src/add.py b/GPy/kern/_src/add.py index 8d916941..8d81674b 100644 --- a/GPy/kern/_src/add.py +++ b/GPy/kern/_src/add.py @@ -5,8 +5,8 @@ import sys import numpy as np import itertools from linear import Linear -from ..core.parameterization import Parameterized -from GPy.core.parameterization.param import Param +from ...core.parameterization import Parameterized +from ...core.parameterization.param import Param from kern import Kern class Add(Kern): @@ -27,7 +27,7 @@ class Add(Kern): self.add_parameters(*subkerns) - def K(self, X, X2=None, which_parts='all'): + def K(self, X, X2=None): """ Compute the kernel function. @@ -35,52 +35,22 @@ class Add(Kern): :param X2: (optional) the second set of arguments to the kernel. If X2 is None, this is passed throgh to the 'part' object, which handles this as X2 == X. - :param which_parts: a list of booleans detailing whether to include - each of the part functions. By default, 'all' - indicates all parts """ - if which_parts == 'all': - which_parts = [True] * self.size assert X.shape[1] == self.input_dim if X2 is None: - target = np.zeros((X.shape[0], X.shape[0])) - [p.K(X[:, i_s], None, target=target) for p, i_s, part_i_used in zip(self._parameters_, self.input_slices, which_parts) if part_i_used] + return sum([p.K(X[:, i_s], None) for p, i_s in zip(self._parameters_, self.input_slices)]) else: - target = np.zeros((X.shape[0], X2.shape[0])) - [p.K(X[:, i_s], X2[:, i_s], target=target) for p, i_s, part_i_used in zip(self._parameters_, self.input_slices, which_parts) if part_i_used] - return target + return sum([p.K(X[:, i_s], X2[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)]) def update_gradients_full(self, dL_dK, X): - [p.update_gradients_full(dL_dK, X) for p in self._parameters_] + [p.update_gradients_full(dL_dK, X[:,i_s]) for p, i_s in zip(self._parameters_, self.input_slices)] def update_gradients_sparse(self, dL_dKmm, dL_dKnm, dL_dKdiag, X, Z): - [p.update_gradients_sparse(dL_dKmm, dL_dKnm, dL_dKdiag, X, Z) for p in self._parameters_] + [p.update_gradients_sparse(dL_dKmm, dL_dKnm, dL_dKdiag, X[:,i_s], Z[:,i_s]) for p, i_s in zip(self._parameters_, i_s)] def update_gradients_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z): [p.update_gradients_variational(dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z) for p in self._parameters_] - def _param_grad_helper(self, dL_dK, X, X2=None): - """ - Compute the gradient of the covariance function with respect to the parameters. - - :param dL_dK: An array of gradients of the objective function with respect to the covariance function. - :type dL_dK: Np.ndarray (num_samples x num_inducing) - :param X: Observed data inputs - :type X: np.ndarray (num_samples x input_dim) - :param X2: Observed data inputs (optional, defaults to X) - :type X2: np.ndarray (num_inducing x input_dim) - - returns: dL_dtheta - """ - assert X.shape[1] == self.input_dim - target = np.zeros(self.size) - if X2 is None: - [p._param_grad_helper(dL_dK, X[:, i_s], None, target[ps]) for p, i_s, ps, in zip(self._parameters_, self.input_slices, self._param_slices_)] - else: - [p._param_grad_helper(dL_dK, X[:, i_s], X2[:, i_s], target[ps]) for p, i_s, ps, in zip(self._parameters_, self.input_slices, self._param_slices_)] - - return self._transform_gradients(target) - def gradients_X(self, dL_dK, X, X2=None): """Compute the gradient of the objective function with respect to X. 
@@ -93,33 +63,15 @@ class Add(Kern): target = np.zeros_like(X) if X2 is None: - [p.gradients_X(dL_dK, X[:, i_s], None, target[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)] + [np.add(target[:,i_s], p.gradients_X(dL_dK, X[:, i_s], None), target[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)] else: - [p.gradients_X(dL_dK, X[:, i_s], X2[:, i_s], target[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)] + [np.add(target[:,i_s], p.gradients_X(dL_dK, X[:, i_s], X2[:,i_s]), target[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)] return target - def Kdiag(self, X, which_parts='all'): + def Kdiag(self, X): """Compute the diagonal of the covariance function for inputs X.""" - if which_parts == 'all': - which_parts = [True] * self.size assert X.shape[1] == self.input_dim - target = np.zeros(X.shape[0]) - [p.Kdiag(X[:, i_s], target=target) for p, i_s, part_on in zip(self._parameters_, self.input_slices, which_parts) if part_on] - return target - - def dKdiag_dtheta(self, dL_dKdiag, X): - """Compute the gradient of the diagonal of the covariance function with respect to the parameters.""" - assert X.shape[1] == self.input_dim - assert dL_dKdiag.size == X.shape[0] - target = np.zeros(self.size) - [p.dKdiag_dtheta(dL_dKdiag, X[:, i_s], target[ps]) for p, i_s, ps in zip(self._parameters_, self.input_slices, self._param_slices_)] - return self._transform_gradients(target) - - def dKdiag_dX(self, dL_dKdiag, X): - assert X.shape[1] == self.input_dim - target = np.zeros_like(X) - [p.dKdiag_dX(dL_dKdiag, X[:, i_s], target[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)] - return target + return sum([p.Kdiag(X[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)]) def psi0(self, Z, mu, S): target = np.zeros(mu.shape[0]) diff --git a/GPy/kern/_src/coregionalize.py b/GPy/kern/_src/coregionalize.py index 8b2f17e8..69fc27ef 100644 --- a/GPy/kern/_src/coregionalize.py +++ b/GPy/kern/_src/coregionalize.py @@ -1,12 +1,12 @@ # Copyright (c) 2012, James Hensman and Ricardo Andrade # Licensed under the BSD 3-clause license (see LICENSE.txt) -from kernpart import Kernpart +from kern import Kern import numpy as np from scipy import weave from ...core.parameterization import Param -class Coregionalize(Kernpart): +class Coregionalize(Kern): """ Covariance function for intrinsic/linear coregionalization models @@ -133,6 +133,8 @@ class Coregionalize(Kernpart): #dkappa = dL_dKdiag_small #target += np.hstack([dW.flatten(),dkappa]) - def gradients_X(self,dL_dK,X,X2,target): - #NOTE In this case, pass is equivalent to returning zero. - pass + def gradients_X(self,dL_dK,X,X2): + if X2 is None: + return np.zeros((X.shape[0], X.shape[0])) + else: + return np.zeros((X.shape[0], X2.shape[0])) diff --git a/GPy/kern/_src/kern.py b/GPy/kern/_src/kern.py index af362498..b5b84305 100644 --- a/GPy/kern/_src/kern.py +++ b/GPy/kern/_src/kern.py @@ -4,8 +4,8 @@ import sys import numpy as np import itertools -from ..core.parameterization import Parameterized -from GPy.core.parameterization.param import Param +from ...core.parameterization import Parameterized +from ...core.parameterization.param import Param class Kern(Parameterized): diff --git a/GPy/kern/_src/kernpart.py b/GPy/kern/_src/kernpart.py deleted file mode 100644 index 097ed741..00000000 --- a/GPy/kern/_src/kernpart.py +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright (c) 2012, GPy authors (see AUTHORS.txt). 
-# Licensed under the BSD 3-clause license (see LICENSE.txt) -#from ...core.parameterized.Parameterized import set_as_parameter -from ...core.parameterization import Parameterized - -class Kernpart_stationary(Kernpart): - def __init__(self, input_dim, lengthscale=None, ARD=False): - self.input_dim = input_dim - self.ARD = ARD - if not ARD: - self.num_params = 2 - if lengthscale is not None: - self.lengthscale = np.asarray(lengthscale) - assert self.lengthscale.size == 1, "Only one lengthscale needed for non-ARD kernel" - else: - self.lengthscale = np.ones(1) - else: - self.num_params = self.input_dim + 1 - if lengthscale is not None: - self.lengthscale = np.asarray(lengthscale) - assert self.lengthscale.size == self.input_dim, "bad number of lengthscales" - else: - self.lengthscale = np.ones(self.input_dim) - - # initialize cache - self._Z, self._mu, self._S = np.empty(shape=(3, 1)) - self._X, self._X2, self._parameters_ = np.empty(shape=(3, 1)) - - def _set_params(self, x): - self.lengthscale = x - self.lengthscale2 = np.square(self.lengthscale) - # reset cached results - self._X, self._X2, self._parameters_ = np.empty(shape=(3, 1)) - self._Z, self._mu, self._S = np.empty(shape=(3, 1)) # cached versions of Z,mu,S - - - def dKdiag_dtheta(self, dL_dKdiag, X, target): - # For stationary covariances, derivative of diagonal elements - # wrt lengthscale is 0. - target[0] += np.sum(dL_dKdiag) - - def dKdiag_dX(self, dL_dK, X, target): - pass # true for all stationary kernels - - -class Kernpart_inner(Kernpart): - def __init__(self,input_dim): - """ - The base class for a kernpart_inner: a positive definite function which forms part of a kernel that is based on the inner product between inputs. - - :param input_dim: the number of input dimensions to the function - :type input_dim: int - - Do not instantiate. 
- """ - Kernpart.__init__(self, input_dim) - - # initialize cache - self._Z, self._mu, self._S = np.empty(shape=(3, 1)) - self._X, self._X2, self._parameters_ = np.empty(shape=(3, 1)) diff --git a/GPy/kern/_src/linear.py b/GPy/kern/_src/linear.py index ab77d4e6..5083c8de 100644 --- a/GPy/kern/_src/linear.py +++ b/GPy/kern/_src/linear.py @@ -5,10 +5,10 @@ import numpy as np from scipy import weave from kern import Kern -from ..util.linalg import tdot -from ..util.misc import fast_array_equal, param_to_array -from ..core.parameterization import Param -from ..core.parameterization.transformations import Logexp +from ...util.linalg import tdot +from ...util.misc import fast_array_equal, param_to_array +from ...core.parameterization import Param +from ...core.parameterization.transformations import Logexp class Linear(Kern): """ diff --git a/GPy/kern/_src/prod.py b/GPy/kern/_src/prod.py index 08221de7..e0d069b2 100644 --- a/GPy/kern/_src/prod.py +++ b/GPy/kern/_src/prod.py @@ -35,64 +35,36 @@ class Prod(Kern): self._X, self._X2 = np.empty(shape=(2,1)) self._params = None - def K(self,X,X2,target): + def K(self, X, X2=None): self._K_computations(X,X2) - target += self._K1 * self._K2 - - def K1(self,X, X2): - """Compute the part of the kernel associated with k1.""" - self._K_computations(X, X2) - return self._K1 - - def K2(self, X, X2): - """Compute the part of the kernel associated with k2.""" - self._K_computations(X, X2) - return self._K2 + return self._K1 * self._K2 def update_gradients_full(self, dL_dK, X): self._K_computations(X, None) self.k1.update_gradients_full(dL_dK*self._K2, X[:,self.slice1]) self.k2.update_gradients_full(dL_dK*self._K1, X[:,self.slice2]) - def _param_grad_helper(self,dL_dK,X,X2,target): - """Derivative of the covariance matrix with respect to the parameters.""" - self._K_computations(X,X2) - if X2 is None: - self.k1._param_grad_helper(dL_dK*self._K2, X[:,self.slice1], None, target[:self.k1.num_params]) - self.k2._param_grad_helper(dL_dK*self._K1, X[:,self.slice2], None, target[self.k1.num_params:]) - else: - self.k1._param_grad_helper(dL_dK*self._K2, X[:,self.slice1], X2[:,self.slice1], target[:self.k1.num_params]) - self.k2._param_grad_helper(dL_dK*self._K1, X[:,self.slice2], X2[:,self.slice2], target[self.k1.num_params:]) - - def Kdiag(self,X,target): + def Kdiag(self, X): """Compute the diagonal of the covariance matrix associated to X.""" - target1 = np.zeros(X.shape[0]) - target2 = np.zeros(X.shape[0]) - self.k1.Kdiag(X[:,self.slice1],target1) - self.k2.Kdiag(X[:,self.slice2],target2) - target += target1 * target2 + return self.k1.Kdiag(X[:,self.slice1]) * self.k2.Kdiag(X[:,self.slice2]) + def update_gradients_sparse(self): + pass + #wtf goes here?? 
+ #def dKdiag_dtheta(self,dL_dKdiag,X,target): + #K1 = np.zeros(X.shape[0]) + #K2 = np.zeros(X.shape[0]) + #self.k1.Kdiag(X[:,self.slice1],K1) + #self.k2.Kdiag(X[:,self.slice2],K2) + #self.k1.dKdiag_dtheta(dL_dKdiag*K2,X[:,self.slice1],target[:self.k1.num_params]) + #self.k2.dKdiag_dtheta(dL_dKdiag*K1,X[:,self.slice2],target[self.k1.num_params:]) - def dKdiag_dtheta(self,dL_dKdiag,X,target): - K1 = np.zeros(X.shape[0]) - K2 = np.zeros(X.shape[0]) - self.k1.Kdiag(X[:,self.slice1],K1) - self.k2.Kdiag(X[:,self.slice2],K2) - self.k1.dKdiag_dtheta(dL_dKdiag*K2,X[:,self.slice1],target[:self.k1.num_params]) - self.k2.dKdiag_dtheta(dL_dKdiag*K1,X[:,self.slice2],target[self.k1.num_params:]) - - def gradients_X(self,dL_dK,X,X2,target): + def gradients_X(self,dL_dK,X,X2): """derivative of the covariance matrix with respect to X.""" self._K_computations(X,X2) if X2 is None: - if not isinstance(self.k1,Coregionalize) and not isinstance(self.k2,Coregionalize): - self.k1.gradients_X(dL_dK*self._K2, X[:,self.slice1], None, target[:,self.slice1]) - self.k2.gradients_X(dL_dK*self._K1, X[:,self.slice2], None, target[:,self.slice2]) - else:#if isinstance(self.k1,Coregionalize) or isinstance(self.k2,Coregionalize): - #NOTE The indices column in the inputs makes the ki.gradients_X fail when passing None instead of X[:,self.slicei] - X2 = X - self.k1.gradients_X(2.*dL_dK*self._K2, X[:,self.slice1], X2[:,self.slice1], target[:,self.slice1]) - self.k2.gradients_X(2.*dL_dK*self._K1, X[:,self.slice2], X2[:,self.slice2], target[:,self.slice2]) + self.k1.gradients_X(dL_dK*self._K2, X[:,self.slice1], None, target[:,self.slice1]) + self.k2.gradients_X(dL_dK*self._K1, X[:,self.slice2], None, target[:,self.slice2]) else: self.k1.gradients_X(dL_dK*self._K2, X[:,self.slice1], X2[:,self.slice1], target[:,self.slice1]) self.k2.gradients_X(dL_dK*self._K1, X[:,self.slice2], X2[:,self.slice2], target[:,self.slice2]) @@ -112,14 +84,10 @@ class Prod(Kern): self._params == self._get_params().copy() if X2 is None: self._X2 = None - self._K1 = np.zeros((X.shape[0],X.shape[0])) - self._K2 = np.zeros((X.shape[0],X.shape[0])) - self.k1.K(X[:,self.slice1],None,self._K1) - self.k2.K(X[:,self.slice2],None,self._K2) + self._K1 = self.k1.K(X[:,self.slice1],None) + self._K2 = self.k2.K(X[:,self.slice2],None) else: self._X2 = X2.copy() - self._K1 = np.zeros((X.shape[0],X2.shape[0])) - self._K2 = np.zeros((X.shape[0],X2.shape[0])) - self.k1.K(X[:,self.slice1],X2[:,self.slice1],self._K1) - self.k2.K(X[:,self.slice2],X2[:,self.slice2],self._K2) + self._K1 = self.k1.K(X[:,self.slice1],X2[:,self.slice1]) + self._K2 = self.k2.K(X[:,self.slice2],X2[:,self.slice2]) diff --git a/GPy/kern/_src/rbf.py b/GPy/kern/_src/rbf.py index 36e454e3..eb713433 100644 --- a/GPy/kern/_src/rbf.py +++ b/GPy/kern/_src/rbf.py @@ -5,10 +5,10 @@ import numpy as np from scipy import weave from kern import Kern -from ..util.linalg import tdot -from ..util.misc import fast_array_equal, param_to_array -from ..core.parameterization import Param -from ..core.parameterization.transformations import Logexp +from ...util.linalg import tdot +from ...util.misc import fast_array_equal, param_to_array +from ...core.parameterization import Param +from ...core.parameterization.transformations import Logexp class RBF(Kern): """ diff --git a/GPy/kern/_src/white.py b/GPy/kern/_src/white.py index 7750267f..2be73389 100644 --- a/GPy/kern/_src/white.py +++ b/GPy/kern/_src/white.py @@ -3,8 +3,8 @@ from kern import Kern import numpy as np -from ..core.parameterization import Param -from 
..core.parameterization.transformations import Logexp +from ...core.parameterization import Param +from ...core.parameterization.transformations import Logexp class White(Kern): """ @@ -25,6 +25,8 @@ class White(Kern): def K(self,X,X2): if X2 is None: return np.eye(X.shape[0])*self.variance + else: + return np.zeros((X.shape[0], X2.shape[0])) def Kdiag(self,X): ret = np.ones(X.shape[0]) diff --git a/GPy/models/gp_regression.py b/GPy/models/gp_regression.py index a72acc1a..f8957906 100644 --- a/GPy/models/gp_regression.py +++ b/GPy/models/gp_regression.py @@ -23,7 +23,7 @@ class GPRegression(GP): def __init__(self, X, Y, kernel=None): if kernel is None: - kernel = kern.rbf(X.shape[1]) + kernel = kern.RBF(X.shape[1]) likelihood = likelihoods.Gaussian() diff --git a/GPy/models/mrd.py b/GPy/models/mrd.py index b4f987ea..acc6e11a 100644 --- a/GPy/models/mrd.py +++ b/GPy/models/mrd.py @@ -7,7 +7,7 @@ from GPy.util.linalg import PCA import numpy import itertools import pylab -from GPy.kern.kern import Kern +from GPy.kern import Kern from GPy.models.bayesian_gplvm import BayesianGPLVM class MRD(Model): diff --git a/GPy/plotting/matplot_dep/kernel_plots.py b/GPy/plotting/matplot_dep/kernel_plots.py index 80350475..30157294 100644 --- a/GPy/plotting/matplot_dep/kernel_plots.py +++ b/GPy/plotting/matplot_dep/kernel_plots.py @@ -7,7 +7,7 @@ import pylab as pb import Tango from matplotlib.textpath import TextPath from matplotlib.transforms import offset_copy -from ...kern.linear import Linear +from ...kern import Linear def plot_ARD(kernel, fignum=None, ax=None, title='', legend=False): diff --git a/GPy/plotting/matplot_dep/models_plots.py b/GPy/plotting/matplot_dep/models_plots.py index c9896116..75ba39d9 100644 --- a/GPy/plotting/matplot_dep/models_plots.py +++ b/GPy/plotting/matplot_dep/models_plots.py @@ -9,7 +9,7 @@ from ...util.misc import param_to_array def plot_fit(model, plot_limits=None, which_data_rows='all', - which_data_ycols='all', which_parts='all', fixed_inputs=[], + which_data_ycols='all', fixed_inputs=[], levels=20, samples=0, fignum=None, ax=None, resolution=None, plot_raw=False, linecol=Tango.colorsHex['darkBlue'],fillcol=Tango.colorsHex['lightBlue']): @@ -20,7 +20,7 @@ def plot_fit(model, plot_limits=None, which_data_rows='all', - In higher dimensions, use fixed_inputs to plot the GP with some of the inputs fixed. Can plot only part of the data and part of the posterior functions - using which_data_rowsm which_data_ycols and which_parts + using which_data_rowsm which_data_ycols. :param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaluts to data limits :type plot_limits: np.array @@ -28,8 +28,6 @@ def plot_fit(model, plot_limits=None, which_data_rows='all', :type which_data_rows: 'all' or a slice object to slice model.X, model.Y :param which_data_ycols: when the data has several columns (independant outputs), only plot these :type which_data_rows: 'all' or a list of integers - :param which_parts: which of the kernel functions to plot (additively) - :type which_parts: 'all', or list of bools :param fixed_inputs: a list of tuple [(i,v), (i,v)...], specifying that input index i should be set to value v. :type fixed_inputs: a list of tuples :param resolution: the number of intervals to sample the GP on. 
Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D @@ -76,12 +74,12 @@ def plot_fit(model, plot_limits=None, which_data_rows='all', #make a prediction on the frame and plot it if plot_raw: - m, v = model._raw_predict(Xgrid, which_parts=which_parts) + m, v = model._raw_predict(Xgrid) lower = m - 2*np.sqrt(v) upper = m + 2*np.sqrt(v) Y = model.Y else: - m, v, lower, upper = model.predict(Xgrid, which_parts=which_parts) + m, v, lower, upper = model.predict(Xgrid) Y = model.Y for d in which_data_ycols: gpplot(Xnew, m[:, d], lower[:, d], upper[:, d], axes=ax, edgecol=linecol, fillcol=fillcol) @@ -89,7 +87,7 @@ def plot_fit(model, plot_limits=None, which_data_rows='all', #optionally plot some samples if samples: #NOTE not tested with fixed_inputs - Ysim = model.posterior_samples(Xgrid, samples, which_parts=which_parts) + Ysim = model.posterior_samples(Xgrid, samples) for yi in Ysim.T: ax.plot(Xnew, yi[:,None], Tango.colorsHex['darkBlue'], linewidth=0.25) #ax.plot(Xnew, yi[:,None], marker='x', linestyle='--',color=Tango.colorsHex['darkBlue']) #TODO apply this line for discrete outputs. @@ -131,10 +129,10 @@ def plot_fit(model, plot_limits=None, which_data_rows='all', #predict on the frame and plot if plot_raw: - m, _ = model._raw_predict(Xgrid, which_parts=which_parts) + m, _ = model._raw_predict(Xgrid) Y = model.Y else: - m, _, _, _ = model.predict(Xgrid, which_parts=which_parts) + m, _, _, _ = model.predict(Xgrid) Y = model.data for d in which_data_ycols: m_d = m[:,d].reshape(resolution, resolution).T diff --git a/GPy/util/datasets.py b/GPy/util/datasets.py index 059a39c3..23f5d0c8 100644 --- a/GPy/util/datasets.py +++ b/GPy/util/datasets.py @@ -513,8 +513,8 @@ def toy_rbf_1d(seed=default_seed, num_samples=500): num_in = 1 X = np.random.uniform(low= -1.0, high=1.0, size=(num_samples, num_in)) X.sort(axis=0) - rbf = GPy.kern.rbf(num_in, variance=1., lengthscale=np.array((0.25,))) - white = GPy.kern.white(num_in, variance=1e-2) + rbf = GPy.kern.RBF(num_in, variance=1., lengthscale=np.array((0.25,))) + white = GPy.kern.White(num_in, variance=1e-2) kernel = rbf + white K = kernel.K(X) y = np.reshape(np.random.multivariate_normal(np.zeros(num_samples), K), (num_samples, 1)) From de51ad638a0ea12469a881d32f7524eeb4ac3082 Mon Sep 17 00:00:00 2001 From: James Hensman Date: Wed, 19 Feb 2014 22:23:07 +0000 Subject: [PATCH 11/38] prod now seems to work for sparse --- GPy/core/sparse_gp.py | 13 ++++++------- GPy/kern/_src/add.py | 2 +- GPy/kern/_src/linear.py | 16 ++++++++-------- GPy/kern/_src/prod.py | 39 +++++++++++++++++---------------------- GPy/kern/_src/rbf.py | 2 +- GPy/kern/_src/white.py | 3 +-- 6 files changed, 34 insertions(+), 41 deletions(-) diff --git a/GPy/core/sparse_gp.py b/GPy/core/sparse_gp.py index edb8d8f6..128dfca3 100644 --- a/GPy/core/sparse_gp.py +++ b/GPy/core/sparse_gp.py @@ -68,22 +68,21 @@ class SparseGP(GP): self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.X_variance, self.Z, self.likelihood, self.Y) self._update_gradients_Z(add=False) - def _raw_predict(self, Xnew, X_variance_new=None, which_parts='all', full_cov=False): + def _raw_predict(self, Xnew, X_variance_new=None, full_cov=False): """ Make a prediction for the latent function values """ if X_variance_new is None: - Kx = self.kern.K(self.Z, Xnew, which_parts=which_parts) + Kx = self.kern.K(self.Z, Xnew) mu = np.dot(Kx.T, self.posterior.woodbury_vector) if full_cov: - Kxx = self.kern.K(Xnew, which_parts=which_parts) - var = Kxx - mdot(Kx.T, 
self.posterior.woodbury_inv, Kx) # NOTE this won't work for plotting + Kxx = self.kern.K(Xnew) + var = Kxx - mdot(Kx.T, self.posterior.woodbury_inv, Kx) else: - Kxx = self.kern.Kdiag(Xnew, which_parts=which_parts) + Kxx = self.kern.Kdiag(Xnew) var = Kxx - np.sum(Kx * np.dot(self.posterior.woodbury_inv, Kx), 0) else: - # assert which_parts=='all', "swithching out parts of variational kernels is not implemented" - Kx = self.kern.psi1(self.Z, Xnew, X_variance_new) # , which_parts=which_parts) TODO: which_parts + Kx = self.kern.psi1(self.Z, Xnew, X_variance_new) mu = np.dot(Kx, self.Cpsi1V) if full_cov: raise NotImplementedError, "TODO" diff --git a/GPy/kern/_src/add.py b/GPy/kern/_src/add.py index 8d81674b..edb82ef0 100644 --- a/GPy/kern/_src/add.py +++ b/GPy/kern/_src/add.py @@ -46,7 +46,7 @@ class Add(Kern): [p.update_gradients_full(dL_dK, X[:,i_s]) for p, i_s in zip(self._parameters_, self.input_slices)] def update_gradients_sparse(self, dL_dKmm, dL_dKnm, dL_dKdiag, X, Z): - [p.update_gradients_sparse(dL_dKmm, dL_dKnm, dL_dKdiag, X[:,i_s], Z[:,i_s]) for p, i_s in zip(self._parameters_, i_s)] + [p.update_gradients_sparse(dL_dKmm, dL_dKnm, dL_dKdiag, X[:,i_s], Z[:,i_s]) for p, i_s in zip(self._parameters_, self.input_slices)] def update_gradients_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z): [p.update_gradients_variational(dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z) for p in self._parameters_] diff --git a/GPy/kern/_src/linear.py b/GPy/kern/_src/linear.py index 5083c8de..b3765774 100644 --- a/GPy/kern/_src/linear.py +++ b/GPy/kern/_src/linear.py @@ -43,16 +43,16 @@ class Linear(Kern): assert variances.size == self.input_dim, "bad number of variances, need one ARD variance per input_dim" else: variances = np.ones(self.input_dim) - + self.variances = Param('variances', variances, Logexp()) - self.variances.gradient = np.zeros(self.variances.shape) + #TODO: remove?self.variances.gradient = np.zeros(self.variances.shape) self.add_parameter(self.variances) self.variances.add_observer(self, self.update_variance) # initialize cache self._Z, self._mu, self._S = np.empty(shape=(3, 1)) self._X, self._X2 = np.empty(shape=(2, 1)) - + def update_variance(self, v): self.variances2 = np.square(self.variances) @@ -62,7 +62,7 @@ class Linear(Kern): def update_gradients_full(self, dL_dK, X): self.variances.gradient[:] = 0 self._param_grad_helper(dL_dK, X, None, self.variances.gradient) - + def update_gradients_sparse(self, dL_dKmm, dL_dKnm, dL_dKdiag, X, Z): tmp = dL_dKdiag[:, None] * X ** 2 if self.ARD: @@ -71,7 +71,7 @@ class Linear(Kern): self.variances.gradient = tmp.sum() self._param_grad_helper(dL_dKmm, Z, None, self.variances.gradient) self._param_grad_helper(dL_dKnm, X, Z, self.variances.gradient) - + def update_gradients_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z): self._psi_computations(Z, mu, S) # psi0: @@ -87,7 +87,7 @@ class Linear(Kern): #from Kmm self._K_computations(Z, None) self._param_grad_helper(dL_dKmm, Z, None, self.variances.gradient) - + def K(self, X, X2, target): if self.ARD: XX = X * np.sqrt(self.variances) @@ -224,7 +224,7 @@ class Linear(Kern): weave_options = {'headers' : [''], 'extra_compile_args': ['-fopenmp -O3'], #-march=native'], 'extra_link_args' : ['-lgomp']} - + N,num_inducing,input_dim,mu = mu.shape[0],Z.shape[0],mu.shape[1],param_to_array(mu) weave.inline(code, support_code=support_code, libraries=['gomp'], arg_names=['N','num_inducing','input_dim','mu','AZZA','AZZA_2','target_mu','target_S','dL_dpsi2'], @@ -281,7 +281,7 
@@ class Linear(Kern): self._X2 = None else: self._X2 = X2.copy() - self._dot_product = np.dot(param_to_array(X), param_to_array(X2.T)) + self._dot_product = np.dot(param_to_array(X), param_to_array(X2.T)) def _psi_computations(self, Z, mu, S): # here are the "statistics" for psi1 and psi2 diff --git a/GPy/kern/_src/prod.py b/GPy/kern/_src/prod.py index e0d069b2..67637770 100644 --- a/GPy/kern/_src/prod.py +++ b/GPy/kern/_src/prod.py @@ -36,38 +36,33 @@ class Prod(Kern): self._params = None def K(self, X, X2=None): - self._K_computations(X,X2) + self._K_computations(X, X2) return self._K1 * self._K2 + def Kdiag(self, X): + return self.k1.Kdiag(X[:,self.slice1]) * self.k2.Kdiag(X[:,self.slice2]) + def update_gradients_full(self, dL_dK, X): self._K_computations(X, None) self.k1.update_gradients_full(dL_dK*self._K2, X[:,self.slice1]) self.k2.update_gradients_full(dL_dK*self._K1, X[:,self.slice2]) - def Kdiag(self, X): - """Compute the diagonal of the covariance matrix associated to X.""" - return self.k1.Kdiag(X[:,self.slice1]) * self.k2.Kdiag(X[:,self.slice2]) + def update_gradients_sparse(self, dL_dKmm, dL_dKnm, dL_dKdiag, X, Z): + self.k1.update_gradients_sparse(dL_dKmm * self.k2.K(Z[:,self.slice2]), dL_dKnm * self.k2(X[:,self.slice2], Z[:,self.slice2]), dL_dKdiag * self.k2.Kdiag(X[:,self.slice2]), X[:,self.slice1], Z[:,self.slice1] ) + self.k2.update_gradients_sparse(dL_dKmm * self.k1.K(Z[:,self.slice1]), dL_dKnm * self.k1(X[:,self.slice1], Z[:,self.slice1]), dL_dKdiag * self.k1.Kdiag(X[:,self.slice1]), X[:,self.slice2], Z[:,self.slice2] ) - def update_gradients_sparse(self): - pass - #wtf goes here?? - #def dKdiag_dtheta(self,dL_dKdiag,X,target): - #K1 = np.zeros(X.shape[0]) - #K2 = np.zeros(X.shape[0]) - #self.k1.Kdiag(X[:,self.slice1],K1) - #self.k2.Kdiag(X[:,self.slice2],K2) - #self.k1.dKdiag_dtheta(dL_dKdiag*K2,X[:,self.slice1],target[:self.k1.num_params]) - #self.k2.dKdiag_dtheta(dL_dKdiag*K1,X[:,self.slice2],target[self.k1.num_params:]) - - def gradients_X(self,dL_dK,X,X2): + def gradients_X(self, dL_dK, X, X2=None): """derivative of the covariance matrix with respect to X.""" - self._K_computations(X,X2) + self._K_computations(X, X2) + target = np.zeros(X.shape) if X2 is None: - self.k1.gradients_X(dL_dK*self._K2, X[:,self.slice1], None, target[:,self.slice1]) - self.k2.gradients_X(dL_dK*self._K1, X[:,self.slice2], None, target[:,self.slice2]) + target[:,self.slice1] += self.k1.gradients_X(dL_dK*self._K2, X[:,self.slice1], None) + target[:,self.slice2] += self.k2.gradients_X(dL_dK*self._K1, X[:,self.slice2], None) else: - self.k1.gradients_X(dL_dK*self._K2, X[:,self.slice1], X2[:,self.slice1], target[:,self.slice1]) - self.k2.gradients_X(dL_dK*self._K1, X[:,self.slice2], X2[:,self.slice2], target[:,self.slice2]) + target[:,self.slice1] += self.k1.gradients_X(dL_dK*self._K2, X[:,self.slice1], X2[:,self.slice1]) + target[:,self.slice2] += self.k2.gradients_X(dL_dK*self._K1, X[:,self.slice2], X2[:,self.slice2]) + + return target def dKdiag_dX(self, dL_dKdiag, X, target): K1 = np.zeros(X.shape[0]) @@ -78,7 +73,7 @@ class Prod(Kern): self.k1.gradients_X(dL_dKdiag*K2, X[:,self.slice1], target[:,self.slice1]) self.k2.gradients_X(dL_dKdiag*K1, X[:,self.slice2], target[:,self.slice2]) - def _K_computations(self,X,X2): + def _K_computations(self, X, X2): if not (np.array_equal(X,self._X) and np.array_equal(X2,self._X2) and np.array_equal(self._params , self._get_params())): self._X = X.copy() self._params == self._get_params().copy() diff --git a/GPy/kern/_src/rbf.py b/GPy/kern/_src/rbf.py 
index eb713433..02640fdc 100644 --- a/GPy/kern/_src/rbf.py +++ b/GPy/kern/_src/rbf.py @@ -154,7 +154,7 @@ class RBF(Kern): else: self.lengthscale.gradient += (self.variance / self.lengthscale) * np.sum(self._K_dvar * self._K_dist2 * dL_dKmm) - def gradients_X(self, dL_dK, X, X2): + def gradients_X(self, dL_dK, X, X2=None): #if self._X is None or X.base is not self._X.base or X2 is not None: self._K_computations(X, X2) if X2 is None: diff --git a/GPy/kern/_src/white.py b/GPy/kern/_src/white.py index 2be73389..d20e2fe1 100644 --- a/GPy/kern/_src/white.py +++ b/GPy/kern/_src/white.py @@ -20,9 +20,8 @@ class White(Kern): self.input_dim = input_dim self.variance = Param('variance', variance, Logexp()) self.add_parameters(self.variance) - self._psi1 = 0 # TODO: more elegance here - def K(self,X,X2): + def K(self, X, X2=None): if X2 is None: return np.eye(X.shape[0])*self.variance else: From 5214c3c1ac46d60b1818e614394e10106e117bc8 Mon Sep 17 00:00:00 2001 From: Neil Lawrence Date: Wed, 19 Feb 2014 19:39:24 -0500 Subject: [PATCH 12/38] Adding update_gradients to sympy.py. --- GPy/kern/parts/rbf.py | 2 +- GPy/kern/parts/sympykern.py | 199 +++++++++++++++++++++--------------- 2 files changed, 115 insertions(+), 86 deletions(-) diff --git a/GPy/kern/parts/rbf.py b/GPy/kern/parts/rbf.py index 027aa382..8811b74a 100644 --- a/GPy/kern/parts/rbf.py +++ b/GPy/kern/parts/rbf.py @@ -109,7 +109,7 @@ class RBF(Kernpart): self.lengthscale.gradient = self._dL_dlengthscales_via_K(dL_dK, X, None) else: self.lengthscale.gradient = (self.variance / self.lengthscale) * np.sum(self._K_dvar * self._K_dist2 * dL_dK) - +b def update_gradients_sparse(self, dL_dKmm, dL_dKnm, dL_dKdiag, X, Z): #contributions from Kdiag self.variance.gradient = np.sum(dL_dKdiag) diff --git a/GPy/kern/parts/sympykern.py b/GPy/kern/parts/sympykern.py index a5bb7b1d..52813ecd 100644 --- a/GPy/kern/parts/sympykern.py +++ b/GPy/kern/parts/sympykern.py @@ -26,6 +26,8 @@ import ast from kernpart import Kernpart from ...core.parameterization import Param from ...core.parameterization.transformations import Logexp +# TODO have this set up in a set up file! +user_code_storage = tempfile.gettempdir() class spkern(Kernpart): """ @@ -61,13 +63,12 @@ class spkern(Kernpart): assert all([x.name=='x_%i'%i for i,x in enumerate(self._sp_x)]) assert all([z.name=='z_%i'%i for i,z in enumerate(self._sp_z)]) assert len(self._sp_x)==len(self._sp_z) - assert len(self._sp_x)==input_dim + x_dim=len(self._sp_x) # If it is a multi-output covariance, add an input for indexing the outputs. 
- self._real_input_dim = self.input_dim - if output_dim > 1: - self.input_dim += 1 - assert self.input_dim == input_dim + self._real_input_dim = x_dim + # Check input dim is number of xs + 1 if output_dim is >1 + assert self.input_dim == x_dim + int(output_dim > 1) self.output_dim = output_dim # extract parameter names from the covariance @@ -113,7 +114,6 @@ class spkern(Kernpart): #setattr(self, theta.name, val) setattr(self, theta.name, Param(theta.name, val, None)) self.add_parameters(getattr(self, theta.name)) - self.parameters_changed() # initializes cache #deal with param #self._set_params(self._get_params()) @@ -139,13 +139,15 @@ class spkern(Kernpart): extra_compile_args = [] self.weave_kwargs = { - 'support_code':self._function_code, - 'include_dirs':[tempfile.gettempdir(), os.path.join(current_dir,'parts/')], - 'headers':['"sympy_helpers.h"'], - 'sources':[os.path.join(current_dir,"parts/sympy_helpers.cpp")], + 'support_code': None, #self._function_code, + 'include_dirs':[user_code_storage, os.path.join(current_dir,'parts/')], + 'headers':['"sympy_helpers.h"', '"'+self.name+'.h"'], + 'sources':[os.path.join(current_dir,"parts/sympy_helpers.cpp"), os.path.join(user_code_storage, self.name+'.cpp')], 'extra_compile_args':extra_compile_args, 'extra_link_args':['-lgomp'], 'verbose':True} + self.parameters_changed() # initializes caches + def __add__(self,other): return spkern(self._sp_k+other._sp_k) @@ -177,31 +179,39 @@ class spkern(Kernpart): # Use weave to compute the underlying functions. if weave_available: # put the header file where we can find it - f = file(os.path.join(tempfile.gettempdir(), self.name + '.h'),'w') + f = file(os.path.join(user_code_storage, self.name + '.h'),'w') f.write(self._function_header) f.close() - - # Substitute any known derivatives which sympy doesn't compute - self._function_code = re.sub('DiracDelta\(.+?,.+?\)','0.0',self._function_code) + + if weave_available: + # Substitute any known derivatives which sympy doesn't compute + self._function_code = re.sub('DiracDelta\(.+?,.+?\)','0.0',self._function_code) + # put the cpp file in user code storage (defaults to temp file location) + f = file(os.path.join(user_code_storage, self.name + '.cpp'),'w') + else: + # put the python file in user code storage + f = file(os.path.join(user_code_storage, self.name + '.py'),'w') + f.write(self._function_code) + f.close() if weave_available: # arg_list will store the arguments required for the C code. - arg_list = (["X2(i, %s)"%x.name[2:] for x in self._sp_x] + input_arg_list = (["X2(i, %s)"%x.name[2:] for x in self._sp_x] + ["Z2(j, %s)"%z.name[2:] for z in self._sp_z]) # for multiple outputs reverse argument list is also required if self.output_dim>1: - reverse_arg_list = list(arg_list) - reverse_arg_list.reverse() + reverse_input_arg_list = list(input_arg_list) + reverse_input_arg_list.reverse() # This gives the parameters for the arg list. param_arg_list = [shared_params.name for shared_params in self._sp_theta] - arg_list += param_arg_list + arg_list = input_arg_list + param_arg_list precompute_list=[] if self.output_dim > 1: - reverse_arg_list+=list(param_arg_list) + reverse_arg_list= reverse_input_arg_list + list(param_arg_list) # For multiple outputs, also need the split parameters. 
split_param_arg_list = ["%s1(%s)"%(theta.name[:-2].upper(),index) for index in ['ii', 'jj'] for theta in self._sp_theta_i] split_param_reverse_arg_list = ["%s1(%s)"%(theta.name[:-2].upper(),index) for index in ['jj', 'ii'] for theta in self._sp_theta_i] @@ -218,9 +228,9 @@ class spkern(Kernpart): # Any precomputations will be done here eventually. self._precompute = \ - """ - // Precompute code would go here. It will be called when parameters are updated. - """ + """ + // Precompute code would go here. It will be called when parameters are updated. + """ # Here's the code to do the looping for K self._K_code =\ @@ -229,11 +239,11 @@ class spkern(Kernpart): // Code for computing the covariance function. int i; int j; - int N = target_array->dimensions[0]; + int n = target_array->dimensions[0]; int num_inducing = target_array->dimensions[1]; int input_dim = X_array->dimensions[1]; //#pragma omp parallel for private(j) - for (i=0;idimensions[0]; + int n = target_array->dimensions[0]; int input_dim = X_array->dimensions[1]; //#pragma omp parallel for - for (i=0;i1: grad_func_list += c_define_output_indices - grad_func_list += [' '*16 + 'TARGET1(%i+ii) += partial[i*num_inducing+j]*dk_d%s(%s);'%(self.num_shared_params+i*self.output_dim, theta.name, arg_string) for i, theta in enumerate(self._sp_theta_i)] - grad_func_list += [' '*16 + 'TARGET1(%i+jj) += partial[i*num_inducing+j]*dk_d%s(%s);'%(self.num_shared_params+i*self.output_dim, theta.name, reverse_arg_string) for i, theta in enumerate(self._sp_theta_i)] - grad_func_list += ([' '*16 + 'TARGET1(%i) += partial[i*num_inducing+j]*dk_d%s(%s);'%(i,theta.name,arg_string) for i,theta in enumerate(self._sp_theta)]) + grad_func_list += [' '*16 + 'TARGET1(%i+ii) += PARTIAL2(i, j)*dk_d%s(%s);'%(self.num_shared_params+i*self.output_dim, theta.name, arg_string) for i, theta in enumerate(self._sp_theta_i)] + grad_func_list += [' '*16 + 'TARGET1(%i+jj) += PARTIAL2(i, j)*dk_d%s(%s);'%(self.num_shared_params+i*self.output_dim, theta.name, reverse_arg_string) for i, theta in enumerate(self._sp_theta_i)] + grad_func_list += ([' '*16 + 'TARGET1(%i) += PARTIAL2(i, j)*dk_d%s(%s);'%(i,theta.name,arg_string) for i,theta in enumerate(self._sp_theta)]) grad_func_string = '\n'.join(grad_func_list) self._dK_dtheta_code =\ @@ -285,17 +296,18 @@ class spkern(Kernpart): // Code for computing gradient of covariance with respect to parameters. 
int i; int j; - int N = partial_array->dimensions[0]; + int n = partial_array->dimensions[0]; int num_inducing = partial_array->dimensions[1]; int input_dim = X_array->dimensions[1]; //#pragma omp parallel for private(j) - for (i=0;idimensions[0]; + int n = partial_array->dimensions[0]; int input_dim = X_array->dimensions[1]; - for (i=0;idimensions[0]; + int n = partial_array->dimensions[0]; int num_inducing = partial_array->dimensions[1]; int input_dim = X_array->dimensions[1]; //#pragma omp parallel for private(j) - for (i=0;idimensions[0]; + int n = partial_array->dimensions[0]; int input_dim = X_array->dimensions[1]; - for (int i=0;i1: - # for i, split_params in enumerate(self._split_theta_names): - # start = self.num_shared_params + i*self.output_dim - # end = self.num_shared_params + (i+1)*self.output_dim - # setattr(self, split_params, param[start:end]) + #---------------------------------------# + # Precomputations # + #---------------------------------------# - - # def _get_params(self): - # params = np.zeros(0) - # for shared_params in self._sp_theta: - # params = np.hstack((params, getattr(self, shared_params.name))) - # if self.output_dim>1: - # for split_params in self._split_theta_names: - # params = np.hstack((params, getattr(self, split_params).flatten())) - # return params - - # def _get_param_names(self): - # if self.output_dim>1: - # return [x.name for x in self._sp_theta] + [x.name[:-2] + str(i) for x in self._sp_theta_i for i in range(self.output_dim)] - # else: - # return [x.name for x in self._sp_theta] + def _K_computations(self, X, Z): + if Z is None: + self._generate_inline(self._precompute, X) + else: + self._generate_inline(self._precompute, X, Z=Z) From 46f59f9f6427af69db0097957e1374c1a03f27d6 Mon Sep 17 00:00:00 2001 From: Max Zwiessele Date: Thu, 20 Feb 2014 08:38:14 +0000 Subject: [PATCH 13/38] gradients now lazy instantiated --- GPy/core/parameterization/param.py | 11 +++++++++-- GPy/examples/dimensionality_reduction.py | 3 ++- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/GPy/core/parameterization/param.py b/GPy/core/parameterization/param.py index 75d9faf2..c052099d 100644 --- a/GPy/core/parameterization/param.py +++ b/GPy/core/parameterization/param.py @@ -54,7 +54,7 @@ class Param(ObservableArray, Constrainable, Gradcheckable, Indexable, Parentable obj._tied_to_me_ = SetDict() obj._tied_to_ = [] obj._original_ = True - obj.gradient = None + obj._gradient_ = None return obj def __init__(self, name, input_array, default_constraint=None): @@ -76,10 +76,17 @@ class Param(ObservableArray, Constrainable, Gradcheckable, Indexable, Parentable self._updated_ = getattr(obj, '_updated_', None) self._original_ = getattr(obj, '_original_', None) self._name = getattr(obj, 'name', None) - self.gradient = getattr(obj, 'gradient', None) + self._gradient_ = getattr(obj, '_gradient_', None) self.constraints = getattr(obj, 'constraints', None) self.priors = getattr(obj, 'priors', None) + + @property + def gradient(self): + if self._gradient_ is None: + self._gradient_ = numpy.zeros(self._realshape_) + return self._gradient_ + #=========================================================================== # Pickling operations #=========================================================================== diff --git a/GPy/examples/dimensionality_reduction.py b/GPy/examples/dimensionality_reduction.py index 2924386f..4d42026d 100644 --- a/GPy/examples/dimensionality_reduction.py +++ b/GPy/examples/dimensionality_reduction.py @@ -296,11 +296,12 @@ def 
bgplvm_simulation_missing_data(optimize=True, verbose=1, k = kern.linear(Q, ARD=True)# + kern.white(Q, _np.exp(-2)) # + kern.bias(Q) inan = _np.random.binomial(1, .6, size=Y.shape).astype(bool) - m = BayesianGPLVM(Y, Q, init="random", num_inducing=num_inducing, kernel=k) + m = BayesianGPLVM(Y.copy(), Q, init="random", num_inducing=num_inducing, kernel=k) m.inference_method = VarDTCMissingData() m.Y[inan] = _np.nan m.q.variance *= .1 m.parameters_changed() + m.Yreal = Y if optimize: print "Optimizing model:" From d636c8c30ce696ad27360b4f31a439263b98c2b5 Mon Sep 17 00:00:00 2001 From: James Hensman Date: Thu, 20 Feb 2014 14:04:16 +0000 Subject: [PATCH 14/38] everything is broken --- GPy/core/gp.py | 1 + GPy/core/parameterization/param.py | 10 +- GPy/core/parameterization/parameter_core.py | 46 ++--- GPy/core/sparse_gp.py | 15 +- GPy/examples/dimensionality_reduction.py | 26 +-- GPy/kern/__init__.py | 2 +- GPy/kern/_src/add.py | 203 ++++++++++---------- GPy/kern/_src/kern.py | 16 +- GPy/kern/_src/linear.py | 85 ++++---- GPy/kern/_src/prod.py | 59 ++---- GPy/kern/_src/rbf.py | 78 ++++---- GPy/models/bayesian_gplvm.py | 14 +- GPy/util/caching.py | 93 ++++++--- 13 files changed, 325 insertions(+), 323 deletions(-) diff --git a/GPy/core/gp.py b/GPy/core/gp.py index 2dcf0e14..13336ef5 100644 --- a/GPy/core/gp.py +++ b/GPy/core/gp.py @@ -44,6 +44,7 @@ class GP(Model): self.Y_metadata = None assert isinstance(kernel, kern.Kern) + assert self.input_dim == kernel.input_dim self.kern = kernel assert isinstance(likelihood, likelihoods.Likelihood) diff --git a/GPy/core/parameterization/param.py b/GPy/core/parameterization/param.py index f54c0117..016ecbf6 100644 --- a/GPy/core/parameterization/param.py +++ b/GPy/core/parameterization/param.py @@ -23,7 +23,7 @@ class Param(ObservableArray, Constrainable, Gradcheckable, Indexable, Parameteri :param input_array: array which this parameter handles :type input_array: numpy.ndarray :param default_constraint: The default constraint for this parameter - :type default_constraint: + :type default_constraint: You can add/remove constraints by calling constrain on the parameter itself, e.g: @@ -59,7 +59,7 @@ class Param(ObservableArray, Constrainable, Gradcheckable, Indexable, Parameteri def __init__(self, name, input_array, default_constraint=None): super(Param, self).__init__(name=name, default_constraint=default_constraint) - + def __array_finalize__(self, obj): # see InfoArray.__array_finalize__ for comments if obj is None: return @@ -192,7 +192,7 @@ class Param(ObservableArray, Constrainable, Gradcheckable, Indexable, Parameteri return numpy.r_[a] return numpy.r_[:b] return itertools.imap(f, itertools.izip_longest(slice_index[:self._realndim_], self._realshape_, fillvalue=slice(self.size))) - + #=========================================================================== # Convenience #=========================================================================== @@ -260,7 +260,7 @@ class Param(ObservableArray, Constrainable, Gradcheckable, Indexable, Parameteri clean_curr_slice = [s for s in slice_index if numpy.any(s != Ellipsis)] for i in range(self._realndim_-len(clean_curr_slice)): i+=len(clean_curr_slice) - clean_curr_slice += range(self._realshape_[i]) + clean_curr_slice += range(self._realshape_[i]) if (all(isinstance(n, (numpy.ndarray, list, tuple)) for n in clean_curr_slice) and len(set(map(len, clean_curr_slice))) <= 1): return numpy.fromiter(itertools.izip(*clean_curr_slice), @@ -426,4 +426,4 @@ class ParamConcatenation(object): start = False return 
"\n".join(strings) def __repr__(self): - return "\n".join(map(repr,self.params)) \ No newline at end of file + return "\n".join(map(repr,self.params)) diff --git a/GPy/core/parameterization/parameter_core.py b/GPy/core/parameterization/parameter_core.py index 275198b2..5acdec58 100644 --- a/GPy/core/parameterization/parameter_core.py +++ b/GPy/core/parameterization/parameter_core.py @@ -18,8 +18,8 @@ class Observable(object): def remove_observer(self, observer): del self._observers_[observer] def _notify_observers(self): - [callble(self) for callble in self._observers_.itervalues()] - + [callble(self) for callble in self._observers_.values()] + class Pickleable(object): def _getstate(self): """ @@ -51,7 +51,7 @@ class Parentable(object): super(Parentable,self).__init__() self._direct_parent_ = direct_parent self._parent_index_ = parent_index - + def has_parent(self): return self._direct_parent_ is not None @@ -82,7 +82,7 @@ class Nameable(Parentable): from_name = self.name self._name = name if self.has_parent(): - self._direct_parent_._name_changed(self, from_name) + self._direct_parent_._name_changed(self, from_name) class Parameterizable(Parentable): @@ -90,7 +90,7 @@ class Parameterizable(Parentable): super(Parameterizable, self).__init__(*args, **kwargs) from GPy.core.parameterization.array_core import ParamList _parameters_ = ParamList() - + def parameter_names(self, add_name=False): if add_name: return [adjust_name_for_printing(self.name) + "." + xi for x in self._parameters_ for xi in x.parameter_names(add_name=True)] @@ -142,21 +142,21 @@ class Gradcheckable(Parentable): class Indexable(object): def _raveled_index(self): raise NotImplementedError, "Need to be able to get the raveled Index" - + def _internal_offset(self): return 0 - + def _offset_for(self, param): raise NotImplementedError, "shouldnt happen, offset required from non parameterization object?" - + def _raveled_index_for(self, param): """ get the raveled index for a param that is an int array, containing the indexes for the flattened param inside this parameterized logic. """ - raise NotImplementedError, "shouldnt happen, raveld index transformation required from non parameterization object?" - + raise NotImplementedError, "shouldnt happen, raveld index transformation required from non parameterization object?" + class Constrainable(Nameable, Indexable, Parameterizable): def __init__(self, name, default_constraint=None): super(Constrainable,self).__init__(name) @@ -166,7 +166,7 @@ class Constrainable(Nameable, Indexable, Parameterizable): self.priors = ParameterIndexOperations() if self._default_constraint_ is not None: self.constrain(self._default_constraint_) - + #=========================================================================== # Fixing Parameters: #=========================================================================== @@ -182,21 +182,21 @@ class Constrainable(Nameable, Indexable, Parameterizable): rav_i = self._highest_parent_._raveled_index_for(self) self._highest_parent_._set_fixed(rav_i) fix = constrain_fixed - + def unconstrain_fixed(self): """ This parameter will no longer be fixed. 
""" unconstrained = self.unconstrain(__fixed__) - self._highest_parent_._set_unfixed(unconstrained) + self._highest_parent_._set_unfixed(unconstrained) unfix = unconstrain_fixed - + def _set_fixed(self, index): import numpy as np if not self._has_fixes(): self._fixes_ = np.ones(self.size, dtype=bool) self._fixes_[index] = FIXED if np.all(self._fixes_): self._fixes_ = None # ==UNFIXED - + def _set_unfixed(self, index): import numpy as np if not self._has_fixes(): self._fixes_ = np.ones(self.size, dtype=bool) @@ -212,7 +212,7 @@ class Constrainable(Nameable, Indexable, Parameterizable): self._fixes_[fixed_indices] = FIXED else: self._fixes_ = None - + def _has_fixes(self): return hasattr(self, "_fixes_") and self._fixes_ is not None @@ -222,17 +222,17 @@ class Constrainable(Nameable, Indexable, Parameterizable): def set_prior(self, prior, warning=True, update=True): repriorized = self.unset_priors() self._add_to_index_operations(self.priors, repriorized, prior, warning, update) - + def unset_priors(self, *priors): return self._remove_from_index_operations(self.priors, priors) - + def log_prior(self): """evaluate the prior""" if self.priors.size > 0: x = self._get_params() return reduce(lambda a,b: a+b, [p.lnpdf(x[ind]).sum() for p, ind in self.priors.iteritems()], 0) return 0. - + def _log_prior_gradients(self): """evaluate the gradients of the priors""" import numpy as np @@ -242,7 +242,7 @@ class Constrainable(Nameable, Indexable, Parameterizable): [np.put(ret, ind, p.lnpdf_grad(x[ind])) for p, ind in self.priors.iteritems()] return ret return 0. - + #=========================================================================== # Constrain operations -> done #=========================================================================== @@ -269,7 +269,7 @@ class Constrainable(Nameable, Indexable, Parameterizable): transformats of this parameter object. """ return self._remove_from_index_operations(self.constraints, transforms) - + def constrain_positive(self, warning=True, update=True): """ :param warning: print a warning if re-constraining parameters. 
@@ -314,7 +314,7 @@ class Constrainable(Nameable, Indexable, Parameterizable): Remove (lower, upper) bounded constrain from this parameter/ """ self.unconstrain(Logistic(lower, upper)) - + def _parent_changed(self, parent): from index_operations import ParameterIndexOperationsView self.constraints = ParameterIndexOperationsView(parent.constraints, parent._offset_for(self), self.size) @@ -340,7 +340,7 @@ class Constrainable(Nameable, Indexable, Parameterizable): removed = np.union1d(removed, unconstrained) if t is __fixed__: self._highest_parent_._set_unfixed(unconstrained) - + return removed diff --git a/GPy/core/sparse_gp.py b/GPy/core/sparse_gp.py index 128dfca3..c72de182 100644 --- a/GPy/core/sparse_gp.py +++ b/GPy/core/sparse_gp.py @@ -53,20 +53,19 @@ class SparseGP(GP): self.add_parameter(self.Z, index=0) self.parameters_changed() - def _update_gradients_Z(self, add=False): - #The derivative of the bound wrt the inducing inputs Z ( unless they're all fixed) + def _gradients_Z(self): + #The derivative of the bound wrt the inducing inputs Z ( unless they're all fixed) if not self.Z.is_fixed: - if add: self.Z.gradient += self.kern.gradients_X(self.grad_dict['dL_dKmm'], self.Z) - else: self.Z.gradient = self.kern.gradients_X(self.grad_dict['dL_dKmm'], self.Z) if self.X_variance is None: - self.Z.gradient += self.kern.gradients_X(self.grad_dict['dL_dKnm'].T, self.Z, self.X) + self.Z.gradient = self.kern.gradients_Z_sparse(X=self.X, Z=self.Z, **self.grad_dict) else: - self.Z.gradient += self.kern.dpsi1_dZ(self.grad_dict['dL_dpsi1'], self.Z, self.X, self.X_variance) - self.Z.gradient += self.kern.dpsi2_dZ(self.grad_dict['dL_dpsi2'], self.Z, self.X, self.X_variance) + self.Z.gradient = self.kern.gradients_Z_variational(mu=self.X, S=self.X_variance, Z=self.Z, **self.grad_dict) + print self.Z.gradient + print id(self.Z) def parameters_changed(self): self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.X_variance, self.Z, self.likelihood, self.Y) - self._update_gradients_Z(add=False) + self.Z.gradient = self._gradients_Z() def _raw_predict(self, Xnew, X_variance_new=None, full_cov=False): """ diff --git a/GPy/examples/dimensionality_reduction.py b/GPy/examples/dimensionality_reduction.py index a7eb0adb..a5e8615d 100644 --- a/GPy/examples/dimensionality_reduction.py +++ b/GPy/examples/dimensionality_reduction.py @@ -22,18 +22,18 @@ def bgplvm_test_model(seed=default_seed, optimize=False, verbose=1, plot=False, # generate GPLVM-like data X = _np.random.rand(num_inputs, input_dim) lengthscales = _np.random.rand(input_dim) - k = (GPy.kern.rbf(input_dim, .5, lengthscales, ARD=True) + k = (GPy.kern.RBF(input_dim, .5, lengthscales, ARD=True) #+ GPy.kern.white(input_dim, 0.01) ) K = k.K(X) Y = _np.random.multivariate_normal(_np.zeros(num_inputs), K, (output_dim,)).T - # k = GPy.kern.rbf_inv(input_dim, .5, _np.ones(input_dim) * 2., ARD=True) + GPy.kern.bias(input_dim) + GPy.kern.white(input_dim) - k = GPy.kern.linear(input_dim)# + GPy.kern.bias(input_dim) + GPy.kern.white(input_dim, 0.00001) - # k = GPy.kern.rbf(input_dim, ARD = False) + GPy.kern.white(input_dim, 0.00001) - # k = GPy.kern.rbf(input_dim, .5, _np.ones(input_dim) * 2., ARD=True) + GPy.kern.rbf(input_dim, .3, _np.ones(input_dim) * .2, ARD=True) - # k = GPy.kern.rbf(input_dim, .5, 2., ARD=0) + GPy.kern.rbf(input_dim, .3, .2, ARD=0) - # k = GPy.kern.rbf(input_dim, .5, _np.ones(input_dim) * 2., ARD=True) + GPy.kern.linear(input_dim, _np.ones(input_dim) * .2, ARD=True) + # k = 
GPy.kern.RBF_inv(input_dim, .5, _np.ones(input_dim) * 2., ARD=True) + GPy.kern.bias(input_dim) + GPy.kern.white(input_dim) + #k = GPy.kern.linear(input_dim)# + GPy.kern.bias(input_dim) + GPy.kern.white(input_dim, 0.00001) + # k = GPy.kern.RBF(input_dim, ARD = False) + GPy.kern.white(input_dim, 0.00001) + # k = GPy.kern.RBF(input_dim, .5, _np.ones(input_dim) * 2., ARD=True) + GPy.kern.RBF(input_dim, .3, _np.ones(input_dim) * .2, ARD=True) + # k = GPy.kern.RBF(input_dim, .5, 2., ARD=0) + GPy.kern.RBF(input_dim, .3, .2, ARD=0) + # k = GPy.kern.RBF(input_dim, .5, _np.ones(input_dim) * 2., ARD=True) + GPy.kern.linear(input_dim, _np.ones(input_dim) * .2, ARD=True) p = .3 @@ -73,7 +73,7 @@ def gplvm_oil_100(optimize=True, verbose=1, plot=True): data = GPy.util.datasets.oil_100() Y = data['X'] # create simple GP model - kernel = GPy.kern.rbf(6, ARD=True) + GPy.kern.bias(6) + kernel = GPy.kern.RBF(6, ARD=True) + GPy.kern.bias(6) m = GPy.models.GPLVM(Y, 6, kernel=kernel) m.data_labels = data['Y'].argmax(axis=1) if optimize: m.optimize('scg', messages=verbose) @@ -88,7 +88,7 @@ def sparse_gplvm_oil(optimize=True, verbose=0, plot=True, N=100, Q=6, num_induci Y = Y - Y.mean(0) Y /= Y.std(0) # Create the model - kernel = GPy.kern.rbf(Q, ARD=True) + GPy.kern.bias(Q) + kernel = GPy.kern.RBF(Q, ARD=True) + GPy.kern.bias(Q) m = GPy.models.SparseGPLVM(Y, Q, kernel=kernel, num_inducing=num_inducing) m.data_labels = data['Y'][:N].argmax(axis=1) @@ -138,7 +138,7 @@ def swiss_roll(optimize=True, verbose=1, plot=True, N=1000, num_inducing=15, Q=4 (1 - var))) + .001 Z = _np.random.permutation(X)[:num_inducing] - kernel = GPy.kern.rbf(Q, ARD=True) + GPy.kern.bias(Q, _np.exp(-2)) + GPy.kern.white(Q, _np.exp(-2)) + kernel = GPy.kern.RBF(Q, ARD=True) + GPy.kern.bias(Q, _np.exp(-2)) + GPy.kern.white(Q, _np.exp(-2)) m = BayesianGPLVM(Y, Q, X=X, X_variance=S, num_inducing=num_inducing, Z=Z, kernel=kernel) m.data_colors = c @@ -164,7 +164,7 @@ def bgplvm_oil(optimize=True, verbose=1, plot=True, N=200, Q=7, num_inducing=40, _np.random.seed(0) data = GPy.util.datasets.oil() - kernel = GPy.kern.rbf_inv(Q, 1., [.1] * Q, ARD=True) + GPy.kern.bias(Q, _np.exp(-2)) + kernel = GPy.kern.RBF_inv(Q, 1., [.1] * Q, ARD=True) + GPy.kern.bias(Q, _np.exp(-2)) Y = data['X'][:N] Yn = Gaussian(Y, normalize=True) m = GPy.models.BayesianGPLVM(Yn, Q, kernel=kernel, num_inducing=num_inducing, **k) @@ -435,7 +435,7 @@ def bcgplvm_stick(kernel=None, optimize=True, verbose=True, plot=True): data = GPy.util.datasets.osu_run1() # optimize - back_kernel=GPy.kern.rbf(data['Y'].shape[1], lengthscale=5.) + back_kernel=GPy.kern.RBF(data['Y'].shape[1], lengthscale=5.) 
mapping = GPy.mappings.Kernel(X=data['Y'], output_dim=2, kernel=back_kernel) m = GPy.models.BCGPLVM(data['Y'], 2, kernel=kernel, mapping=mapping) if optimize: m.optimize(messages=verbose, max_f_eval=10000) @@ -470,7 +470,7 @@ def stick_bgplvm(model=None, optimize=True, verbose=True, plot=True): data = GPy.util.datasets.osu_run1() Q = 6 - kernel = GPy.kern.rbf(Q, ARD=True) + GPy.kern.bias(Q, _np.exp(-2)) + GPy.kern.white(Q, _np.exp(-2)) + kernel = GPy.kern.RBF(Q, ARD=True) + GPy.kern.bias(Q, _np.exp(-2)) + GPy.kern.white(Q, _np.exp(-2)) m = BayesianGPLVM(data['Y'], Q, init="PCA", num_inducing=20, kernel=kernel) # optimize m.ensure_default_constraints() diff --git a/GPy/kern/__init__.py b/GPy/kern/__init__.py index 214e230f..630d74da 100644 --- a/GPy/kern/__init__.py +++ b/GPy/kern/__init__.py @@ -1,7 +1,7 @@ from _src.rbf import RBF from _src.white import White from _src.kern import Kern -Linear = 'foo' +from _src.linear import Linear #import bias #import Brownian #import coregionalize diff --git a/GPy/kern/_src/add.py b/GPy/kern/_src/add.py index edb82ef0..acc69fd4 100644 --- a/GPy/kern/_src/add.py +++ b/GPy/kern/_src/add.py @@ -34,7 +34,7 @@ class Add(Kern): :param X: the first set of inputs to the kernel :param X2: (optional) the second set of arguments to the kernel. If X2 is None, this is passed throgh to the 'part' object, which - handles this as X2 == X. + handLes this as X2 == X. """ assert X.shape[1] == self.input_dim if X2 is None: @@ -48,9 +48,6 @@ class Add(Kern): def update_gradients_sparse(self, dL_dKmm, dL_dKnm, dL_dKdiag, X, Z): [p.update_gradients_sparse(dL_dKmm, dL_dKnm, dL_dKdiag, X[:,i_s], Z[:,i_s]) for p, i_s in zip(self._parameters_, self.input_slices)] - def update_gradients_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z): - [p.update_gradients_variational(dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z) for p in self._parameters_] - def gradients_X(self, dL_dK, X, X2=None): """Compute the gradient of the objective function with respect to X. 
@@ -69,123 +66,125 @@ class Add(Kern): return target def Kdiag(self, X): - """Compute the diagonal of the covariance function for inputs X.""" assert X.shape[1] == self.input_dim return sum([p.Kdiag(X[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)]) + def psi0(self, Z, mu, S): - target = np.zeros(mu.shape[0]) - [p.psi0(Z[:, i_s], mu[:, i_s], S[:, i_s], target) for p, i_s in zip(self._parameters_, self.input_slices)] - return target - - def dpsi0_dtheta(self, dL_dpsi0, Z, mu, S): - target = np.zeros(self.size) - [p.dpsi0_dtheta(dL_dpsi0, Z[:, i_s], mu[:, i_s], S[:, i_s], target[ps]) for p, ps, i_s in zip(self._parameters_, self._param_slices_, self.input_slices)] - return self._transform_gradients(target) - - def dpsi0_dmuS(self, dL_dpsi0, Z, mu, S): - target_mu, target_S = np.zeros_like(mu), np.zeros_like(S) - [p.dpsi0_dmuS(dL_dpsi0, Z[:, i_s], mu[:, i_s], S[:, i_s], target_mu[:, i_s], target_S[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)] - return target_mu, target_S + return np.sum([p.psi0(Z[:, i_s], mu[:, i_s], S[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)], 0) def psi1(self, Z, mu, S): - target = np.zeros((mu.shape[0], Z.shape[0])) - [p.psi1(Z[:, i_s], mu[:, i_s], S[:, i_s], target) for p, i_s in zip(self._parameters_, self.input_slices)] - return target - - def dpsi1_dtheta(self, dL_dpsi1, Z, mu, S): - target = np.zeros((self.size)) - [p.dpsi1_dtheta(dL_dpsi1, Z[:, i_s], mu[:, i_s], S[:, i_s], target[ps]) for p, ps, i_s in zip(self._parameters_, self._param_slices_, self.input_slices)] - return self._transform_gradients(target) - - def dpsi1_dZ(self, dL_dpsi1, Z, mu, S): - target = np.zeros_like(Z) - [p.dpsi1_dZ(dL_dpsi1, Z[:, i_s], mu[:, i_s], S[:, i_s], target[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)] - return target - - def dpsi1_dmuS(self, dL_dpsi1, Z, mu, S): - """return shapes are num_samples,num_inducing,input_dim""" - target_mu, target_S = np.zeros((2, mu.shape[0], mu.shape[1])) - [p.dpsi1_dmuS(dL_dpsi1, Z[:, i_s], mu[:, i_s], S[:, i_s], target_mu[:, i_s], target_S[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)] - return target_mu, target_S + return np.sum([p.psi1(Z[:, i_s], mu[:, i_s], S[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)], 0) def psi2(self, Z, mu, S): - """ - Computer the psi2 statistics for the covariance function.
- - :param Z: np.ndarray of inducing inputs (num_inducing x input_dim) - :param mu, S: np.ndarrays of means and variances (each num_samples x input_dim) - :returns psi2: np.ndarray (num_samples,num_inducing,num_inducing) - - """ - target = np.zeros((mu.shape[0], Z.shape[0], Z.shape[0])) - [p.psi2(Z[:, i_s], mu[:, i_s], S[:, i_s], target) for p, i_s in zip(self._parameters_, self.input_slices)] + psi2 = np.sum([p.psi2(Z[:, i_s], mu[:, i_s], S[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)], 0) # compute the "cross" terms - # TODO: input_slices needed - crossterms = 0 + from white import White + from rbf import RBF + #from rbf_inv import RBFInv + #from bias import Bias + from linear import Linear + #from fixed import Fixed - for [p1, i_s1], [p2, i_s2] in itertools.combinations(zip(self._parameters_, self.input_slices), 2): - if i_s1 == i_s2: - # TODO psi1 this must be faster/better/precached/more nice - tmp1 = np.zeros((mu.shape[0], Z.shape[0])) - p1.psi1(Z[:, i_s1], mu[:, i_s1], S[:, i_s1], tmp1) - tmp2 = np.zeros((mu.shape[0], Z.shape[0])) - p2.psi1(Z[:, i_s2], mu[:, i_s2], S[:, i_s2], tmp2) + for (p1, i1), (p2, i2) in itertools.combinations(itertools.izip(self._parameters_, self.input_slices), 2): + # white doesn't combine with anything + if isinstance(p1, White) or isinstance(p2, White): + pass + # rbf X bias + #elif isinstance(p1, (Bias, Fixed)) and isinstance(p2, (RBF, RBFInv)): + elif isinstance(p1, Bias) and isinstance(p2, (RBF, Linear)): + tmp = p2.psi1(Z[:,i2], mu[:,i2], S[:,i2]) + psi2 += p1.variance * (tmp[:, :, None] + tmp[:, None, :]) + #elif isinstance(p2, (Bias, Fixed)) and isinstance(p1, (RBF, RBFInv)): + elif isinstance(p2, Bias) and isinstance(p1, (RBF, Linear)): + tmp = p1.psi1(Z[:,i1], mu[:,i1], S[:,i1]) + psi2 += p2.variance * (tmp[:, :, None] + tmp[:, None, :]) + else: + raise NotImplementedError, "psi2 cannot be computed for this kernel" + return psi2 - prod = np.multiply(tmp1, tmp2) - crossterms += prod[:, :, None] + prod[:, None, :] + def update_gradients_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z): + from white import White + from rbf import RBF + #from rbf_inv import RBFInv + #from bias import Bias + from linear import Linear + #from fixed import Fixed - target += crossterms + for p1, is1 in zip(self._parameters_, self.input_slices): + + #compute the effective dL_dpsi1. Extra terms appear because of the cross terms in psi2! + eff_dL_dpsi1 = dL_dpsi1.copy() + for p2, is2 in zip(self._parameters_, self.input_slices): + if p2 is p1: + continue + if isinstance(p2, White): + continue + elif isinstance(p2, Bias): + eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.variance * 2. + else: + eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.psi1(Z[:,is2], mu[:,is2], S[:,is2]) * 2. + + + p1.update_gradients_variational(dL_dKmm, dL_dpsi0, eff_dL_dpsi1, dL_dpsi2, mu[:,is1], S[:,is1], Z[:,is1]) + + + def gradients_Z_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z): + from white import white + from rbf import rbf + #from rbf_inv import rbfinv + #from bias import bias + from linear import linear + #from fixed import fixed + + target = np.zeros(Z.shape) + for p1, is1 in zip(self._parameters_, self.input_slices): + + #compute the effective dL_dpsi1. extra terms appear because of the cross terms in psi2! + eff_dL_dpsi1 = dL_dpsi1.copy() + for p2, is2 in zip(self._parameters_, self.input_slices): + if p2 is p1: + continue + if isinstance(p2, white): + continue + elif isinstance(p2, bias): + eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.variance * 2.
+ else: + eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.psi1(z[:,is2], mu[:,is2], s[:,is2]) * 2. + + + target += p1.gradients_z_variational(dL_dkmm, dL_dpsi0, eff_dL_dpsi1, dL_dpsi2, mu[:,is1], s[:,is1], z[:,is1]) return target - def dpsi2_dtheta(self, dL_dpsi2, Z, mu, S): - """Gradient of the psi2 statistics with respect to the parameters.""" - target = np.zeros(self.size) - [p.dpsi2_dtheta(dL_dpsi2, Z[:, i_s], mu[:, i_s], S[:, i_s], target[ps]) for p, i_s, ps in zip(self._parameters_, self.input_slices, self._param_slices_)] + def gradients_muS_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z): + from white import white + from rbf import rbf + #from rbf_inv import rbfinv + #from bias import bias + from linear import linear + #from fixed import fixed - # compute the "cross" terms - # TODO: better looping, input_slices - for i1, i2 in itertools.permutations(range(len(self._parameters_)), 2): - p1, p2 = self._parameters_[i1], self._parameters_[i2] -# ipsl1, ipsl2 = self.input_slices[i1], self.input_slices[i2] - ps1, ps2 = self._param_slices_[i1], self._param_slices_[i2] + target_mu = np.zeros(mu.shape) + target_S = np.zeros(S.shape) + for p1, is1 in zip(self._parameters_, self.input_slices): - tmp = np.zeros((mu.shape[0], Z.shape[0])) - p1.psi1(Z, mu, S, tmp) - p2.dpsi1_dtheta((tmp[:, None, :] * dL_dpsi2).sum(1) * 2., Z, mu, S, target[ps2]) + #compute the effective dL_dpsi1. extra terms appear because of the cross terms in psi2! + eff_dL_dpsi1 = dL_dpsi1.copy() + for p2, is2 in zip(self._parameters_, self.input_slices): + if p2 is p1: + continue + if isinstance(p2, white): + continue + elif isinstance(p2, bias): + eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.variance * 2.
- for p1, p2 in itertools.permutations(self._parameters_, 2): -# if p1.name == 'linear' and p2.name == 'linear': -# raise NotImplementedError("We don't handle linear/linear cross-terms") - tmp = np.zeros((mu.shape[0], Z.shape[0])) - p1.psi1(Z, mu, S, tmp) - p2.dpsi1_dmuS((tmp[:, None, :] * dL_dpsi2).sum(1) * 2., Z, mu, S, target_mu, target_S) + a, b = p1.gradients_muS_variational(dL_dkmm, dL_dpsi0, eff_dL_dpsi1, dL_dpsi2, mu[:,is1], s[:,is1], z[:,is1]) + target_mu += a + target_S += b return target_mu, target_S def plot(self, *args, **kwargs): diff --git a/GPy/kern/_src/kern.py b/GPy/kern/_src/kern.py index b5b84305..dd87200e 100644 --- a/GPy/kern/_src/kern.py +++ b/GPy/kern/_src/kern.py @@ -9,7 +9,7 @@ from ...core.parameterization.param import Param class Kern(Parameterized): - def __init__(self,input_dim,name): + def __init__(self, input_dim, name): """ The base class for a kernel: a positive definite function which forms of a covariance function (kernel). @@ -22,21 +22,15 @@ class Kern(Parameterized): super(Kern, self).__init__(name) self.input_dim = input_dim - def K(self,X,X2,target): + def K(self, X, X2, target): raise NotImplementedError - def Kdiag(self,X,target): + def Kdiag(self, Xa ,target): raise NotImplementedError - def _param_grad_helper(self,dL_dK,X,X2,target): + def _param_grad_helper(self, dL_dK,X, X2, target): raise NotImplementedError - def dKdiag_dtheta(self,dL_dKdiag,X,target): # TODO: Max?? - # In the base case compute this by calling _param_grad_helper. Need to - # override for stationary covariances (for example) to save - # time. - for i in range(X.shape[0]): - self._param_grad_helper(dL_dKdiag[i], X[i, :][None, :], X2=None, target=target) def psi0(self,Z,mu,S,target): raise NotImplementedError - def dpsi0_dtheta(self,dL_dpsi0,Z,mu,S,target): + def dpsi0_dtheta(self,dL_dpsi0, Z,mu,S,target): raise NotImplementedError def dpsi0_dmuS(self,dL_dpsi0,Z,mu,S,target_mu,target_S): raise NotImplementedError diff --git a/GPy/kern/_src/linear.py b/GPy/kern/_src/linear.py index b3765774..7f5d43d3 100644 --- a/GPy/kern/_src/linear.py +++ b/GPy/kern/_src/linear.py @@ -9,6 +9,7 @@ from ...util.linalg import tdot from ...util.misc import fast_array_equal, param_to_array from ...core.parameterization import Param from ...core.parameterization.transformations import Logexp +from ...util.caching import Cacher, cache_this class Linear(Kern): """ @@ -45,22 +46,35 @@ class Linear(Kern): variances = np.ones(self.input_dim) self.variances = Param('variances', variances, Logexp()) - #TODO: remove?self.variances.gradient = np.zeros(self.variances.shape) self.add_parameter(self.variances) - self.variances.add_observer(self, self.update_variance) + self.variances.add_observer(self, self._on_changed) - # initialize cache - self._Z, self._mu, self._S = np.empty(shape=(3, 1)) - self._X, self._X2 = np.empty(shape=(2, 1)) + def _on_changed(self, obj): + self._notify_observers() - def update_variance(self, v): - self.variances2 = np.square(self.variances) + @cache_this(limit=3, reset_on_self=True) + def K(self, X, X2=None): + if self.ARD: + if X2 is None: + return tdot(X*np.sqrt(self.variances)) + else: + rv = np.sqrt(self.variances) + return np.dot(X*rv, (X2*rv).T) + else: + return self._dot_product(X, X2) * self.variances - def on_input_change(self, X): - self._K_computations(X, None) + @cache_this(limit=3, reset_on_self=False) + def _dot_product(self, X, X2=None): + if X2 is None: + return tdot(X) + else: + return np.dot(X, X2.T) + + def Kdiag(self, X): + return np.sum(self.variances * 
np.square(X), -1) def update_gradients_full(self, dL_dK, X): - self.variances.gradient[:] = 0 + self.variances.gradient = np.zeros(self.variances.size) self._param_grad_helper(dL_dK, X, None, self.variances.gradient) def update_gradients_sparse(self, dL_dKmm, dL_dKnm, dL_dKdiag, X, Z): @@ -68,7 +82,7 @@ class Linear(Kern): if self.ARD: self.variances.gradient = tmp.sum(0) else: - self.variances.gradient = tmp.sum() + self.variances.gradient = np.atleast_1d(tmp.sum()) self._param_grad_helper(dL_dKmm, Z, None, self.variances.gradient) self._param_grad_helper(dL_dKnm, X, Z, self.variances.gradient) @@ -85,25 +99,8 @@ class Linear(Kern): if self.ARD: self.variances.gradient += tmp.sum(0).sum(0).sum(0) else: self.variances.gradient += tmp.sum() #from Kmm - self._K_computations(Z, None) self._param_grad_helper(dL_dKmm, Z, None, self.variances.gradient) - def K(self, X, X2, target): - if self.ARD: - XX = X * np.sqrt(self.variances) - if X2 is None: - target += tdot(XX) - else: - XX2 = X2 * np.sqrt(self.variances) - target += np.dot(XX, XX2.T) - else: - if X is not self._X or X2 is not None: - self._K_computations(X, X2) - target += self.variances * self._dot_product - - def Kdiag(self, X, target): - np.add(target, np.sum(self.variances * np.square(X), -1), target) - def _param_grad_helper(self, dL_dK, X, X2, target): if self.ARD: if X2 is None: @@ -112,18 +109,16 @@ class Linear(Kern): product = X[:, None, :] * X2[None, :, :] target += (dL_dK[:, :, None] * product).sum(0).sum(0) else: - if X is not self._X or X2 is not None: - self._K_computations(X, X2) - target += np.sum(self._dot_product * dL_dK) + target += np.sum(self._dot_product(X, X2) * dL_dK) - def gradients_X(self, dL_dK, X, X2, target): + def gradients_X(self, dL_dK, X, X2=None): if X2 is None: - target += 2*(((X[None,:, :] * self.variances)) * dL_dK[:, :, None]).sum(1) + return 2.*(((X[None,:, :] * self.variances)) * dL_dK[:, :, None]).sum(1) else: - target += (((X2[None,:, :] * self.variances)) * dL_dK[:, :, None]).sum(1) + return (((X2[None,:, :] * self.variances)) * dL_dK[:, :, None]).sum(1) - def dKdiag_dX(self,dL_dKdiag,X,target): - target += 2.*self.variances*dL_dKdiag[:,None]*X + def gradients_X_diag(self, dL_dKdiag, X): + return 2.*self.variances*dL_dKdiag[:,None]*X #---------------------------------------# # PSI statistics # @@ -273,15 +268,15 @@ class Linear(Kern): # Precomputations # #---------------------------------------# - def _K_computations(self, X, X2): - if not (fast_array_equal(X, self._X) and fast_array_equal(X2, self._X2)): - self._X = X.copy() - if X2 is None: - self._dot_product = tdot(param_to_array(X)) - self._X2 = None - else: - self._X2 = X2.copy() - self._dot_product = np.dot(param_to_array(X), param_to_array(X2.T)) + #def _K_computations(self, X, X2): + #if not (fast_array_equal(X, self._X) and fast_array_equal(X2, self._X2)): + #self._X = X.copy() + #if X2 is None: + ##self._dot_product = tdot(param_to_array(X)) + #self._X2 = None + #else: + #self._X2 = X2.copy() + #self._dot_product = np.dot(param_to_array(X), param_to_array(X2.T)) def _psi_computations(self, Z, mu, S): # here are the "statistics" for psi1 and psi2 diff --git a/GPy/kern/_src/prod.py b/GPy/kern/_src/prod.py index 67637770..1d033f70 100644 --- a/GPy/kern/_src/prod.py +++ b/GPy/kern/_src/prod.py @@ -2,9 +2,7 @@ # Licensed under the BSD 3-clause license (see LICENSE.txt) from kern import Kern -from coregionalize import Coregionalize import numpy as np -import hashlib class Prod(Kern): """ @@ -17,7 +15,7 @@ class Prod(Kern): :rtype: kernel 
object """ - def __init__(self,k1,k2,tensor=False): + def __init__(self, k1, k2, tensor=False): if tensor: super(Prod, self).__init__(k1.input_dim + k2.input_dim, k1.name + '_xx_' + k2.name) self.slice1 = slice(0,k1.input_dim) @@ -25,64 +23,43 @@ class Prod(Kern): else: assert k1.input_dim == k2.input_dim, "Error: The input spaces of the kernels to multiply don't have the same dimension." super(Prod, self).__init__(k1.input_dim, k1.name + '_x_' + k2.name) - self.slice1 = slice(0,self.input_dim) - self.slice2 = slice(0,self.input_dim) + self.slice1 = slice(0, self.input_dim) + self.slice2 = slice(0, self.input_dim) self.k1 = k1 self.k2 = k2 self.add_parameters(self.k1, self.k2) - #initialize cache - self._X, self._X2 = np.empty(shape=(2,1)) - self._params = None - def K(self, X, X2=None): - self._K_computations(X, X2) - return self._K1 * self._K2 + if X2 is None: + return self.k1.K(X[:,self.slice1], None) * self.k2.K(X[:,self.slice2], None) + else: + return self.k1.K(X[:,self.slice1], X2[:,self.slice1]) * self.k2.K(X[:,self.slice2], X2[:,self.slice2]) def Kdiag(self, X): return self.k1.Kdiag(X[:,self.slice1]) * self.k2.Kdiag(X[:,self.slice2]) def update_gradients_full(self, dL_dK, X): - self._K_computations(X, None) - self.k1.update_gradients_full(dL_dK*self._K2, X[:,self.slice1]) - self.k2.update_gradients_full(dL_dK*self._K1, X[:,self.slice2]) + self.k1.update_gradients_full(dL_dK*self.k2(X[:,self.slice2]), X[:,self.slice1]) + self.k2.update_gradients_full(dL_dK*self.k1(X[:,self.slice1]), X[:,self.slice2]) def update_gradients_sparse(self, dL_dKmm, dL_dKnm, dL_dKdiag, X, Z): self.k1.update_gradients_sparse(dL_dKmm * self.k2.K(Z[:,self.slice2]), dL_dKnm * self.k2(X[:,self.slice2], Z[:,self.slice2]), dL_dKdiag * self.k2.Kdiag(X[:,self.slice2]), X[:,self.slice1], Z[:,self.slice1] ) self.k2.update_gradients_sparse(dL_dKmm * self.k1.K(Z[:,self.slice1]), dL_dKnm * self.k1(X[:,self.slice1], Z[:,self.slice1]), dL_dKdiag * self.k1.Kdiag(X[:,self.slice1]), X[:,self.slice2], Z[:,self.slice2] ) def gradients_X(self, dL_dK, X, X2=None): - """derivative of the covariance matrix with respect to X.""" - self._K_computations(X, X2) target = np.zeros(X.shape) if X2 is None: - target[:,self.slice1] += self.k1.gradients_X(dL_dK*self._K2, X[:,self.slice1], None) - target[:,self.slice2] += self.k2.gradients_X(dL_dK*self._K1, X[:,self.slice2], None) + target[:,self.slice1] += self.k1.gradients_X(dL_dK*self.k2(X[:,self.slice2]), X[:,self.slice1], None) + target[:,self.slice2] += self.k2.gradients_X(dL_dK*self.k1(X[:,self.slice1]), X[:,self.slice2], None) else: - target[:,self.slice1] += self.k1.gradients_X(dL_dK*self._K2, X[:,self.slice1], X2[:,self.slice1]) - target[:,self.slice2] += self.k2.gradients_X(dL_dK*self._K1, X[:,self.slice2], X2[:,self.slice2]) - + target[:,self.slice1] += self.k1.gradients_X(dL_dK*self.k2(X[:,self.slice2], X2[:,self.slice2]), X[:,self.slice1], X2[:,self.slice1]) + target[:,self.slice2] += self.k2.gradients_X(dL_dK*self.k1(X[:,self.slice1], X2[:,self.slice1]), X[:,self.slice2], X2[:,self.slice2]) return target - def dKdiag_dX(self, dL_dKdiag, X, target): - K1 = np.zeros(X.shape[0]) - K2 = np.zeros(X.shape[0]) - self.k1.Kdiag(X[:,self.slice1],K1) - self.k2.Kdiag(X[:,self.slice2],K2) + def gradients_X_diag(self, dL_dKdiag, X): + target = np.zeros(X.shape) + target[:,self.slice1] = self.k1.gradients_X(dL_dKdiag*self.k2.Kdiag(X[:,self.slice2]), X[:,self.slice1]) + target[:,self.slice2] += self.k2.gradients_X(dL_dKdiag*self.k1.Kdiag(X[:,self.slice1]), X[:,self.slice2]) + return target - 
self.k1.gradients_X(dL_dKdiag*K2, X[:,self.slice1], target[:,self.slice1]) - self.k2.gradients_X(dL_dKdiag*K1, X[:,self.slice2], target[:,self.slice2]) - - def _K_computations(self, X, X2): - if not (np.array_equal(X,self._X) and np.array_equal(X2,self._X2) and np.array_equal(self._params , self._get_params())): - self._X = X.copy() - self._params == self._get_params().copy() - if X2 is None: - self._X2 = None - self._K1 = self.k1.K(X[:,self.slice1],None) - self._K2 = self.k2.K(X[:,self.slice2],None) - else: - self._X2 = X2.copy() - self._K1 = self.k1.K(X[:,self.slice1],X2[:,self.slice1]) - self._K2 = self.k2.K(X[:,self.slice2],X2[:,self.slice2]) diff --git a/GPy/kern/_src/rbf.py b/GPy/kern/_src/rbf.py index 02640fdc..0508436f 100644 --- a/GPy/kern/_src/rbf.py +++ b/GPy/kern/_src/rbf.py @@ -79,17 +79,18 @@ class RBF(Kern): ret[:] = self.variance return ret - #TODO: remove TARGET! - def psi0(self, Z, mu, S, target): - target += self.variance + def psi0(self, Z, mu, S): + ret = np.empty(mu.shape[0], dtype=np.float64) + ret[:] = self.variance + return ret - def psi1(self, Z, mu, S, target): + def psi1(self, Z, mu, S): self._psi_computations(Z, mu, S) - target += self._psi1 + return self._psi1 - def psi2(self, Z, mu, S, target): + def psi2(self, Z, mu, S): self._psi_computations(Z, mu, S) - target += self._psi2 + return self._psi2 def update_gradients_full(self, dL_dK, X): self._K_computations(X, None) @@ -154,6 +155,37 @@ class RBF(Kern): else: self.lengthscale.gradient += (self.variance / self.lengthscale) * np.sum(self._K_dvar * self._K_dist2 * dL_dKmm) + def gradients_Z_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z): + self._psi_computations(Z, mu, S) + + #psi1 + denominator = (self.lengthscale2 * (self._psi1_denom)) + dpsi1_dZ = -self._psi1[:, :, None] * ((self._psi1_dist / denominator)) + grad = np.sum(dL_dpsi1[:, :, None] * dpsi1_dZ, 0) + + #psi2 + term1 = self._psi2_Zdist / self.lengthscale2 # num_inducing, num_inducing, input_dim + term2 = self._psi2_mudist / self._psi2_denom / self.lengthscale2 # N, num_inducing, num_inducing, input_dim + dZ = self._psi2[:, :, :, None] * (term1[None] + term2) + grad += (dL_dpsi2[:, :, :, None] * dZ).sum(0).sum(0) + + return grad + + def gradients_muS_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z): + self._psi_computations(Z, mu, S) + #psi1 + tmp = self._psi1[:, :, None] / self.lengthscale2 / self._psi1_denom + grad_mu = np.sum(dL_dpsi1[:, :, None] * tmp * self._psi1_dist, 1) + grad_S = np.sum(dL_dpsi1[:, :, None] * 0.5 * tmp * (self._psi1_dist_sq - 1), 1) + + tmp = self._psi2[:, :, :, None] / self.lengthscale2 / self._psi2_denom + grad_mu += -2.*(dL_dpsi2[:, :, :, None] * tmp * self._psi2_mudist).sum(1).sum(1) + grad_S += (dL_dpsi2[:, :, :, None] * tmp * (2.*self._psi2_mudist_sq - 1)).sum(1).sum(1) + + return grad_mu, grad_S + + + def gradients_X(self, dL_dK, X, X2=None): #if self._X is None or X.base is not self._X.base or X2 is not None: self._K_computations(X, X2) @@ -171,36 +203,7 @@ class RBF(Kern): # PSI statistics # #---------------------------------------# - def dpsi0_dmuS(self, dL_dpsi0, Z, mu, S, target_mu, target_S): - pass - - def dpsi1_dZ(self, dL_dpsi1, Z, mu, S, target): - self._psi_computations(Z, mu, S) - denominator = (self.lengthscale2 * (self._psi1_denom)) - dpsi1_dZ = -self._psi1[:, :, None] * ((self._psi1_dist / denominator)) - target += np.sum(dL_dpsi1[:, :, None] * dpsi1_dZ, 0) - - def dpsi1_dmuS(self, dL_dpsi1, Z, mu, S, target_mu, target_S): - self._psi_computations(Z, mu, S) - tmp = 
self._psi1[:, :, None] / self.lengthscale2 / self._psi1_denom - target_mu += np.sum(dL_dpsi1[:, :, None] * tmp * self._psi1_dist, 1) - target_S += np.sum(dL_dpsi1[:, :, None] * 0.5 * tmp * (self._psi1_dist_sq - 1), 1) - - def dpsi2_dZ(self, dL_dpsi2, Z, mu, S, target): - self._psi_computations(Z, mu, S) - term1 = self._psi2_Zdist / self.lengthscale2 # num_inducing, num_inducing, input_dim - term2 = self._psi2_mudist / self._psi2_denom / self.lengthscale2 # N, num_inducing, num_inducing, input_dim - dZ = self._psi2[:, :, :, None] * (term1[None] + term2) - target += (dL_dpsi2[:, :, :, None] * dZ).sum(0).sum(0) - - def dpsi2_dmuS(self, dL_dpsi2, Z, mu, S, target_mu, target_S): - """Think N,num_inducing,num_inducing,input_dim """ - self._psi_computations(Z, mu, S) - tmp = self._psi2[:, :, :, None] / self.lengthscale2 / self._psi2_denom - target_mu += -2.*(dL_dpsi2[:, :, :, None] * tmp * self._psi2_mudist).sum(1).sum(1) - target_S += (dL_dpsi2[:, :, :, None] * tmp * (2.*self._psi2_mudist_sq - 1)).sum(1).sum(1) - - #---------------------------------------# + #---------------------------------------# # Precomputations # #---------------------------------------# @@ -362,6 +365,7 @@ class RBF(Kern): #include #include """ + mu = param_to_array(mu) weave.inline(code, support_code=support_code, libraries=['gomp'], arg_names=['N', 'num_inducing', 'input_dim', 'mu', 'Zhat', 'mudist_sq', 'mudist', 'lengthscale2', '_psi2_denom', 'psi2_Zdist_sq', 'psi2_exponent', 'half_log_psi2_denom', 'psi2', 'variance_sq'], type_converters=weave.converters.blitz, **self.weave_options) diff --git a/GPy/models/bayesian_gplvm.py b/GPy/models/bayesian_gplvm.py index 914ca4ae..5fb1ca59 100644 --- a/GPy/models/bayesian_gplvm.py +++ b/GPy/models/bayesian_gplvm.py @@ -57,26 +57,16 @@ class BayesianGPLVM(SparseGP, GPLVM): self.init = state.pop() SparseGP._setstate(self, state) - def dL_dmuS(self): - dL_dmu_psi0, dL_dS_psi0 = self.kern.dpsi0_dmuS(self.grad_dict['dL_dpsi0'], self.Z, self.X, self.X_variance) - dL_dmu_psi1, dL_dS_psi1 = self.kern.dpsi1_dmuS(self.grad_dict['dL_dpsi1'], self.Z, self.X, self.X_variance) - dL_dmu_psi2, dL_dS_psi2 = self.kern.dpsi2_dmuS(self.grad_dict['dL_dpsi2'], self.Z, self.X, self.X_variance) - dL_dmu = dL_dmu_psi0 + dL_dmu_psi1 + dL_dmu_psi2 - dL_dS = dL_dS_psi0 + dL_dS_psi1 + dL_dS_psi2 - - return dL_dmu, dL_dS - def KL_divergence(self): var_mean = np.square(self.X).sum() var_S = np.sum(self.X_variance - np.log(self.X_variance)) return 0.5 * (var_mean + var_S) - 0.5 * self.input_dim * self.num_data def parameters_changed(self): - self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.X_variance, self.Z, self.likelihood, self.Y) - self._update_gradients_Z(add=False) + super(BayesianGPLVM, self).parameters_changed() self._log_marginal_likelihood -= self.KL_divergence() - dL_dmu, dL_dS = self.dL_dmuS() + dL_dmu, dL_dS = self.kern.gradients_muS_variational(mu=self.X, S=self.X_variance, Z=self.Z, **self.grad_dict) # dL: self.q.mean.gradient = dL_dmu diff --git a/GPy/util/caching.py b/GPy/util/caching.py index 51ba56f3..1f10cd64 100644 --- a/GPy/util/caching.py +++ b/GPy/util/caching.py @@ -1,46 +1,89 @@ -from ..core.parameterization.array_core import ObservableArray, ParamList +from ..core.parameterization.parameter_core import Observable +from ..core.parameterization.array_core import ParamList + class Cacher(object): - def __init__(self, operation, limit=5): + def __init__(self, operation, limit=5, reset_on_first=False): self.limit = 
int(limit) + self._reset_on_first = reset_on_first self.operation=operation - self.cached_inputs = ParamList([]) + self.cached_inputs = [] self.cached_outputs = [] self.inputs_changed = [] - def __call__(self, X): - assert isinstance(X, ObservableArray) - if X in self.cached_inputs: - i = self.cached_inputs.index(X) + def __call__(self, *args): + if self._reset_on_first: + assert isinstance(args[0], Observable) + args[0].add_observer(args[0], self.reset) + cached_args = args + else: + cached_args = args[1:] + + + if not all([isinstance(arg, Observable) for arg in cached_args]): + return self.operation(*args) + if cached_args in self.cached_inputs: + i = self.cached_inputs.index(cached_args) if self.inputs_changed[i]: - self.cached_outputs[i] = self.operation(X) + self.cached_outputs[i] = self.operation(*args) self.inputs_changed[i] = False return self.cached_outputs[i] else: if len(self.cached_inputs) == self.limit: - X_ = self.cached_inputs.pop(0) - X_.remove_observer(self) + args_ = self.cached_inputs.pop(0) + [a.remove_observer(self) for a in args_] self.inputs_changed.pop(0) self.cached_outputs.pop(0) - self.cached_inputs.append(X) - self.cached_outputs.append(self.operation(X)) + self.cached_inputs.append(cached_args) + self.cached_outputs.append(self.operation(*args)) self.inputs_changed.append(False) - X.add_observer(self, self.on_cache_changed) + [a.add_observer(self, self.on_cache_changed) for a in args] return self.cached_outputs[-1] - def on_cache_changed(self, X): - #print id(X) - Xbase = X - while Xbase is not None: - try: - i = self.cached_inputs.index(X) - break - except ValueError: - Xbase = X.base - continue - self.inputs_changed[i] = True + def on_cache_changed(self, arg): + self.inputs_changed = [any([a is arg for a in args]) or old_ic for args, old_ic in zip(self.cached_inputs, self.inputs_changed)] + + def reset(self, obj): + [[a.remove_observer(self) for a in args] for args in self.cached_inputs] + self.cached_inputs = [] + self.cached_outputs = [] + self.inputs_changed = [] + + + + +def cache_this(limit=5, reset_on_self=False): + def limited_cache(f): + c = Cacher(f, limit, reset_on_first=reset_on_self) + def f_wrap(*args): + return c(*args) + f_wrap._cacher = c + return f_wrap + return limited_cache + + + + + + + + + + + + + #Xbase = X + #while Xbase is not None: + #try: + #i = self.cached_inputs.index(X) + #break + #except ValueError: + #Xbase = X.base + #continue + #self.inputs_changed[i] = True + + - From 52ab456bfe9ffea60f8509826f6edeb2366c9337 Mon Sep 17 00:00:00 2001 From: Max Zwiessele Date: Thu, 20 Feb 2014 14:09:20 +0000 Subject: [PATCH 15/38] posterior with one covariance per dimension and param gradient fix --- GPy/core/parameterization/param.py | 3 +++ GPy/inference/latent_function_inference/posterior.py | 7 +++++-- GPy/util/warping_functions.py | 3 +-- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/GPy/core/parameterization/param.py b/GPy/core/parameterization/param.py index c052099d..7ab7e2b4 100644 --- a/GPy/core/parameterization/param.py +++ b/GPy/core/parameterization/param.py @@ -86,6 +86,9 @@ class Param(ObservableArray, Constrainable, Gradcheckable, Indexable, Parentable if self._gradient_ is None: self._gradient_ = numpy.zeros(self._realshape_) return self._gradient_ + @gradient.setter + def gradient(self, val): + self.gradient[:] = val #=========================================================================== # Pickling operations diff --git a/GPy/inference/latent_function_inference/posterior.py 
b/GPy/inference/latent_function_inference/posterior.py index f28bf9d1..73741a13 100644 --- a/GPy/inference/latent_function_inference/posterior.py +++ b/GPy/inference/latent_function_inference/posterior.py @@ -81,13 +81,16 @@ class Posterior(object): def covariance(self): if self._covariance is None: #LiK, _ = dtrtrs(self.woodbury_chol, self._K, lower=1) - self._covariance = self._K - self._K.dot(self.woodbury_inv).dot(self._K) + self._covariance = np.tensordot(np.dot(np.atleast_3d(self.woodbury_inv).T, self._K), self._K, [1,0]).T + #self._covariance = self._K - self._K.dot(self.woodbury_inv).dot(self._K) return self._covariance @property def precision(self): if self._precision is None: - self._precision, _, _, _ = pdinv(self.covariance) + self._precision = np.zeros(np.atleast_3d(self.covariance).shape) # if one covariance per dimension + for p in xrange(self.covariance.shape[-1]): + self._precision[:,:,p] = pdinv(self.covariance[:,:,p])[0] return self._precision @property diff --git a/GPy/util/warping_functions.py b/GPy/util/warping_functions.py index 35ad3b80..a0a385e0 100644 --- a/GPy/util/warping_functions.py +++ b/GPy/util/warping_functions.py @@ -3,8 +3,6 @@ import numpy as np -import scipy as sp -import pylab as plt class WarpingFunction(object): """ @@ -39,6 +37,7 @@ class WarpingFunction(object): def plot(self, psi, xmin, xmax): y = np.arange(xmin, xmax, 0.01) f_y = self.f(y, psi) + from matplotlib import pyplot as plt plt.figure() plt.plot(y, f_y) plt.xlabel('y') From 41b8b7edd814f191fadaf96af3f3c9e7f7f182fb Mon Sep 17 00:00:00 2001 From: James Hensman Date: Thu, 20 Feb 2014 14:10:36 +0000 Subject: [PATCH 16/38] empty init file --- GPy/kern/_src/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 GPy/kern/_src/__init__.py diff --git a/GPy/kern/_src/__init__.py b/GPy/kern/_src/__init__.py new file mode 100644 index 00000000..e69de29b From 87ce8fea0b192510045f12fe221b796de2315a97 Mon Sep 17 00:00:00 2001 From: James Hensman Date: Thu, 20 Feb 2014 14:24:41 +0000 Subject: [PATCH 17/38] weird Max related stuff is happening --- GPy/core/sparse_gp.py | 1 + GPy/kern/_src/add.py | 4 ++-- GPy/kern/_src/kern.py | 26 ++++++-------------------- 3 files changed, 9 insertions(+), 22 deletions(-) diff --git a/GPy/core/sparse_gp.py b/GPy/core/sparse_gp.py index c72de182..1ae72556 100644 --- a/GPy/core/sparse_gp.py +++ b/GPy/core/sparse_gp.py @@ -58,6 +58,7 @@ class SparseGP(GP): if not self.Z.is_fixed: if self.X_variance is None: self.Z.gradient = self.kern.gradients_Z_sparse(X=self.X, Z=self.Z, **self.grad_dict) + print self.Z.gradient else: self.Z.gradient = self.kern.gradients_Z_variational(mu=self.X, S=self.X_variance, Z=self.Z, **self.grad_dict) print self.Z.gradient diff --git a/GPy/kern/_src/add.py b/GPy/kern/_src/add.py index acc69fd4..d5515d98 100644 --- a/GPy/kern/_src/add.py +++ b/GPy/kern/_src/add.py @@ -71,7 +71,7 @@ class Add(Kern): def psi0(self, Z, mu, S): - return np.sum([p.psi0(Z[:, i_s], mu[:, i_s], S[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices))],0) + return np.sum([p.psi0(Z[:, i_s], mu[:, i_s], S[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)],0) def psi1(self, Z, mu, S): return np.sum([p.psi1(Z[:, i_s], mu[:, i_s], S[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)], 0) @@ -93,7 +93,7 @@ class Add(Kern): pass # rbf X bias #elif isinstance(p1, (Bias, Fixed)) and isinstance(p2, (RBF, RBFInv)): - elif isinstance(p1, Bias) and isinstance(p2, (RBF, Linear))): + elif isinstance(p1, Bias) and 
isinstance(p2, (RBF, Linear)): tmp = p2.psi1(Z[:,i2], mu[:,i2], S[:,i2]) psi2 += p1.variance * (tmp[:, :, None] + tmp[:, None, :]) #elif isinstance(p2, (Bias, Fixed)) and isinstance(p1, (RBF, RBFInv)): diff --git a/GPy/kern/_src/kern.py b/GPy/kern/_src/kern.py index dd87200e..63c5b458 100644 --- a/GPy/kern/_src/kern.py +++ b/GPy/kern/_src/kern.py @@ -26,33 +26,15 @@ class Kern(Parameterized): raise NotImplementedError def Kdiag(self, Xa ,target): raise NotImplementedError - def _param_grad_helper(self, dL_dK,X, X2, target): - raise NotImplementedError def psi0(self,Z,mu,S,target): raise NotImplementedError - def dpsi0_dtheta(self,dL_dpsi0, Z,mu,S,target): - raise NotImplementedError - def dpsi0_dmuS(self,dL_dpsi0,Z,mu,S,target_mu,target_S): - raise NotImplementedError def psi1(self,Z,mu,S,target): raise NotImplementedError - def dpsi1_dtheta(self,Z,mu,S,target): - raise NotImplementedError - def dpsi1_dZ(self,dL_dpsi1,Z,mu,S,target): - raise NotImplementedError - def dpsi1_dmuS(self,dL_dpsi1,Z,mu,S,target_mu,target_S): - raise NotImplementedError def psi2(self,Z,mu,S,target): raise NotImplementedError - def dpsi2_dZ(self,dL_dpsi2,Z,mu,S,target): + def gradients_X(self, dL_dK, X, X2): raise NotImplementedError - def dpsi2_dtheta(self,dL_dpsi2,Z,mu,S,target): - raise NotImplementedError - def dpsi2_dmuS(self,dL_dpsi2,Z,mu,S,target_mu,target_S): - raise NotImplementedError - def gradients_X(self, dL_dK, X, X2, target): - raise NotImplementedError - def dKdiag_dX(self, dL_dK, X, target): + def gradients_X_diag(self, dL_dK, X): raise NotImplementedError def update_gradients_full(self, dL_dK, X): """Set the gradients of all parameters when doing full (N) inference.""" @@ -63,6 +45,10 @@ class Kern(Parameterized): def update_gradients_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z): """Set the gradients of all parameters when doing variational (M) inference with uncertain inputs.""" raise NotImplementedError + def gradients_Z_sparse(self, dL_dKmm, dL_dKnm, dL_dKdiag, X, Z): + grad = self.gradients_X(dL_dKmm, Z) + grad += self.gradients_X(dL_dKnm.T, Z, X) + return grad def plot_ARD(self, *args): """If an ARD kernel is present, plot a bar representation using matplotlib From e03b8284666ab466fbea5726869087a5c6fb88fe Mon Sep 17 00:00:00 2001 From: James Hensman Date: Thu, 20 Feb 2014 14:34:14 +0000 Subject: [PATCH 18/38] foo --- GPy/core/sparse_gp.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/GPy/core/sparse_gp.py b/GPy/core/sparse_gp.py index 1ae72556..e619ad4d 100644 --- a/GPy/core/sparse_gp.py +++ b/GPy/core/sparse_gp.py @@ -53,20 +53,17 @@ class SparseGP(GP): self.add_parameter(self.Z, index=0) self.parameters_changed() - def _gradients_Z(self): + def update_gradients_Z(self): #The derivative of the bound wrt the inducing inputs Z ( unless they're all fixed) if not self.Z.is_fixed: if self.X_variance is None: self.Z.gradient = self.kern.gradients_Z_sparse(X=self.X, Z=self.Z, **self.grad_dict) - print self.Z.gradient else: self.Z.gradient = self.kern.gradients_Z_variational(mu=self.X, S=self.X_variance, Z=self.Z, **self.grad_dict) - print self.Z.gradient - print id(self.Z) def parameters_changed(self): self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.X_variance, self.Z, self.likelihood, self.Y) - self.Z.gradient = self._gradients_Z() + self.update_gradients_Z() def _raw_predict(self, Xnew, X_variance_new=None, full_cov=False): """ From 4fb4a38cd11f9f6532e2dd55223566994c928323 Mon 
Sep 17 00:00:00 2001 From: James Hensman Date: Thu, 20 Feb 2014 17:11:44 +0000 Subject: [PATCH 19/38] spellings --- GPy/kern/_src/rbf.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/GPy/kern/_src/rbf.py b/GPy/kern/_src/rbf.py index 0508436f..78b9ffc4 100644 --- a/GPy/kern/_src/rbf.py +++ b/GPy/kern/_src/rbf.py @@ -169,6 +169,8 @@ class RBF(Kern): dZ = self._psi2[:, :, :, None] * (term1[None] + term2) grad += (dL_dpsi2[:, :, :, None] * dZ).sum(0).sum(0) + grad += self.gradients_X(dL_dKmm, Z, None) + return grad def gradients_muS_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z): @@ -184,8 +186,6 @@ class RBF(Kern): return grad_mu, grad_S - - def gradients_X(self, dL_dK, X, X2=None): #if self._X is None or X.base is not self._X.base or X2 is not None: self._K_computations(X, X2) @@ -203,7 +203,7 @@ class RBF(Kern): # PSI statistics # #---------------------------------------# - #---------------------------------------# + #---------------------------------------# # Precomputations # #---------------------------------------# From 8ea40a4a1354098cf4d720a585bff65c1d62c646 Mon Sep 17 00:00:00 2001 From: Max Zwiessele Date: Fri, 21 Feb 2014 08:03:44 +0000 Subject: [PATCH 20/38] rbf psi 2 --- GPy/core/parameterization/param.py | 2 +- GPy/kern/_src/rbf.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/GPy/core/parameterization/param.py b/GPy/core/parameterization/param.py index 15b077a9..4c2cb469 100644 --- a/GPy/core/parameterization/param.py +++ b/GPy/core/parameterization/param.py @@ -85,7 +85,7 @@ class Param(ObservableArray, Constrainable, Gradcheckable, Indexable, Parentable def gradient(self): if self._gradient_ is None: self._gradient_ = numpy.zeros(self._realshape_) - return self._gradient_ + return self._gradient_[self._current_slice_] @gradient.setter def gradient(self, val): self.gradient[:] = val diff --git a/GPy/kern/_src/rbf.py b/GPy/kern/_src/rbf.py index 0508436f..65b65120 100644 --- a/GPy/kern/_src/rbf.py +++ b/GPy/kern/_src/rbf.py @@ -177,7 +177,7 @@ class RBF(Kern): tmp = self._psi1[:, :, None] / self.lengthscale2 / self._psi1_denom grad_mu = np.sum(dL_dpsi1[:, :, None] * tmp * self._psi1_dist, 1) grad_S = np.sum(dL_dpsi1[:, :, None] * 0.5 * tmp * (self._psi1_dist_sq - 1), 1) - + #psi2 tmp = self._psi2[:, :, :, None] / self.lengthscale2 / self._psi2_denom grad_mu += -2.*(dL_dpsi2[:, :, :, None] * tmp * self._psi2_mudist).sum(1).sum(1) grad_S += (dL_dpsi2[:, :, :, None] * tmp * (2.*self._psi2_mudist_sq - 1)).sum(1).sum(1) From 0c92fca31abf7a35d992502235bd571d26377904 Mon Sep 17 00:00:00 2001 From: Max Zwiessele Date: Fri, 21 Feb 2014 09:14:31 +0000 Subject: [PATCH 21/38] linear without caching, derivatives done --- GPy/core/parameterization/array_core.py | 4 +- GPy/core/parameterization/parameter_core.py | 12 ++-- GPy/core/sparse_gp.py | 26 ++++----- .../latent_function_inference/var_dtc.py | 2 - GPy/kern/_src/kern.py | 16 ++++-- GPy/kern/_src/linear.py | 56 ++++++++++++------- GPy/util/caching.py | 9 ++- 7 files changed, 71 insertions(+), 54 deletions(-) diff --git a/GPy/core/parameterization/array_core.py b/GPy/core/parameterization/array_core.py index 7892e94a..b12ca59b 100644 --- a/GPy/core/parameterization/array_core.py +++ b/GPy/core/parameterization/array_core.py @@ -30,12 +30,12 @@ class ObservableArray(np.ndarray, Observable): def __new__(cls, input_array): obj = np.atleast_1d(input_array).view(cls) cls.__name__ = "ObservableArray\n " - obj._observers_ = {} + obj._observer_callables_ = {} return obj def 
__array_finalize__(self, obj): # see InfoArray.__array_finalize__ for comments if obj is None: return - self._observers_ = getattr(obj, '_observers_', None) + self._observer_callables_ = getattr(obj, '_observer_callables_', None) def __array_wrap__(self, out_arr, context=None): return out_arr.view(np.ndarray) diff --git a/GPy/core/parameterization/parameter_core.py b/GPy/core/parameterization/parameter_core.py index 9a10f317..f8d83edd 100644 --- a/GPy/core/parameterization/parameter_core.py +++ b/GPy/core/parameterization/parameter_core.py @@ -11,14 +11,14 @@ def adjust_name_for_printing(name): return '' class Observable(object): - _observers_ = {} - def add_observer(self, observer, callble): - self._observers_[observer] = callble + _observer_callables_ = {} + def add_observer(self, callble): + self._observer_callables_.append(callble) #callble(self) - def remove_observer(self, observer): - del self._observers_[observer] + def remove_observer(self, callble): + del self._observer_callables_[callble] def _notify_observers(self): - [callble(self) for callble in self._observers_.itervalues()] + [callble(self) for callble in self._observer_callables_] class Pickleable(object): def _getstate(self): diff --git a/GPy/core/sparse_gp.py b/GPy/core/sparse_gp.py index e619ad4d..5ab13251 100644 --- a/GPy/core/sparse_gp.py +++ b/GPy/core/sparse_gp.py @@ -44,26 +44,26 @@ class SparseGP(GP): self.Z = Param('inducing inputs', Z) self.num_inducing = Z.shape[0] - - if not (X_variance is None): - assert X_variance.shape == X.shape + self.X_variance = X_variance - + if self.has_uncertain_inputs(): + assert X_variance.shape == X.shape + GP.__init__(self, X, Y, kernel, likelihood, inference_method=inference_method, name=name) self.add_parameter(self.Z, index=0) self.parameters_changed() - def update_gradients_Z(self): - #The derivative of the bound wrt the inducing inputs Z ( unless they're all fixed) - if not self.Z.is_fixed: - if self.X_variance is None: - self.Z.gradient = self.kern.gradients_Z_sparse(X=self.X, Z=self.Z, **self.grad_dict) - else: - self.Z.gradient = self.kern.gradients_Z_variational(mu=self.X, S=self.X_variance, Z=self.Z, **self.grad_dict) + def has_uncertain_inputs(self): + return not (self.X_variance is None) def parameters_changed(self): self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.X_variance, self.Z, self.likelihood, self.Y) - self.update_gradients_Z() + if self.has_uncertain_inputs(): + self.kern.update_gradients_variational(mu=self.X, S=self.X_variance, Z=self.Z, **self.grad_dict) + self.Z.gradient = self.kern.gradients_Z_variational(mu=self.X, S=self.X_variance, Z=self.Z, **self.grad_dict) + else: + self.kern.update_gradients_sparse(X=self.X, Z=self.Z, **self.grad_dict) + self.Z.gradient = self.kern.gradients_Z_sparse(X=self.X, Z=self.Z, **self.grad_dict) def _raw_predict(self, Xnew, X_variance_new=None, full_cov=False): """ @@ -97,12 +97,10 @@ class SparseGP(GP): """ return GP._getstate(self) + [self.Z, self.num_inducing, - self.has_uncertain_inputs, self.X_variance] def _setstate(self, state): self.X_variance = state.pop() - self.has_uncertain_inputs = state.pop() self.num_inducing = state.pop() self.Z = state.pop() GP._setstate(self, state) diff --git a/GPy/inference/latent_function_inference/var_dtc.py b/GPy/inference/latent_function_inference/var_dtc.py index 2f11cb08..24f4a5b6 100644 --- a/GPy/inference/latent_function_inference/var_dtc.py +++ b/GPy/inference/latent_function_inference/var_dtc.py @@ -70,10 
+70,8 @@ class VarDTC(object): if uncertain_inputs: grad_dict = {'dL_dKmm': dL_dKmm, 'dL_dpsi0':dL_dpsi0, 'dL_dpsi1':dL_dpsi1, 'dL_dpsi2':dL_dpsi2} - kern.update_gradients_variational(mu=X, S=X_variance, Z=Z, **grad_dict) else: grad_dict = {'dL_dKmm': dL_dKmm, 'dL_dKdiag':dL_dpsi0, 'dL_dKnm':dL_dpsi1} - kern.update_gradients_sparse(X=X, Z=Z, **grad_dict) #get sufficient things for posterior prediction #TODO: do we really want to do this in the loop? diff --git a/GPy/kern/_src/kern.py b/GPy/kern/_src/kern.py index 63c5b458..6e9199dd 100644 --- a/GPy/kern/_src/kern.py +++ b/GPy/kern/_src/kern.py @@ -22,15 +22,15 @@ class Kern(Parameterized): super(Kern, self).__init__(name) self.input_dim = input_dim - def K(self, X, X2, target): + def K(self, X, X2): raise NotImplementedError - def Kdiag(self, Xa ,target): + def Kdiag(self, Xa): raise NotImplementedError - def psi0(self,Z,mu,S,target): + def psi0(self,Z,mu,S): raise NotImplementedError - def psi1(self,Z,mu,S,target): + def psi1(self,Z,mu,S): raise NotImplementedError - def psi2(self,Z,mu,S,target): + def psi2(self,Z,mu,S): raise NotImplementedError def gradients_X(self, dL_dK, X, X2): raise NotImplementedError @@ -49,7 +49,11 @@ class Kern(Parameterized): grad = self.gradients_X(dL_dKmm, Z) grad += self.gradients_X(dL_dKnm.T, Z, X) return grad - + def gradients_Z_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z): + raise NotImplementedError + def gradients_muS_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z): + raise NotImplementedError + def plot_ARD(self, *args): """If an ARD kernel is present, plot a bar representation using matplotlib diff --git a/GPy/kern/_src/linear.py b/GPy/kern/_src/linear.py index 7f5d43d3..e8cf2e87 100644 --- a/GPy/kern/_src/linear.py +++ b/GPy/kern/_src/linear.py @@ -119,34 +119,55 @@ class Linear(Kern): def gradients_X_diag(self, dL_dKdiag, X): return 2.*self.variances*dL_dKdiag[:,None]*X - + #---------------------------------------# # PSI statistics # + # variational # #---------------------------------------# - def psi0(self, Z, mu, S, target): - self._psi_computations(Z, mu, S) - target += np.sum(self.variances * self.mu2_S, 1) + def gradients_Z_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z): + # Kmm + grad = self.gradients_X(dL_dKmm, Z, None) + #psi1 + grad += self.gradients_X(dL_dpsi1.T, Z, mu) + #psi2 + self._weave_dpsi2_dZ(dL_dpsi2, Z, mu, S, grad) + return grad + def gradients_muS_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z): + target_mu, target_S = np.zeros(mu.shape), np.zeros(mu.shape) + # psi0 + target_mu += dL_dpsi0[:, None] * (2.0 * mu * self.variances) + target_S += dL_dpsi0[:, None] * self.variances + # psi1 + target_mu += (dL_dpsi1[:, :, None] * (Z * self.variances)).sum(1) + # psi2 + self._weave_dpsi2_dmuS(dL_dpsi2, Z, mu, S, target_mu, target_S) + + return target_mu, target_S + + def psi0(self, Z, mu, S): + self._psi_computations(Z, mu, S) + return np.sum(self.variances * self.mu2_S, 1) + + def psi1(self, Z, mu, S): + """the variance, it does nothing""" + self._psi1 = self.K(mu, Z) + return self._psi1 + + def psi2(self, Z, mu, S): + self._psi_computations(Z, mu, S) + return self._psi2 + def dpsi0_dmuS(self, dL_dpsi0, Z, mu, S, target_mu, target_S): target_mu += dL_dpsi0[:, None] * (2.0 * mu * self.variances) target_S += dL_dpsi0[:, None] * self.variances - def psi1(self, Z, mu, S, target): - """the variance, it does nothing""" - self._psi1 = self.K(mu, Z, target) - def dpsi1_dmuS(self, dL_dpsi1, Z, mu, S, target_mu, 
target_S): """Do nothing for S, it does not affect psi1""" self._psi_computations(Z, mu, S) target_mu += (dL_dpsi1[:, :, None] * (Z * self.variances)).sum(1) - def dpsi1_dZ(self, dL_dpsi1, Z, mu, S, target): - self.gradients_X(dL_dpsi1.T, Z, mu, target) - - def psi2(self, Z, mu, S, target): - self._psi_computations(Z, mu, S) - target += self._psi2 def psi2_new(self,Z,mu,S,target): tmp = np.zeros((mu.shape[0], Z.shape[0])) @@ -172,7 +193,7 @@ class Linear(Kern): Zs_sq = Zs[:,None,:]*Zs[None,:,:] target_S += (dL_dpsi2[:,:,:,None]*Zs_sq[None,:,:,:]).sum(1).sum(1) - def dpsi2_dmuS(self, dL_dpsi2, Z, mu, S, target_mu, target_S): + def _weave_dpsi2_dmuS(self, dL_dpsi2, Z, mu, S, target_mu, target_S): """Think N,num_inducing,num_inducing,input_dim """ self._psi_computations(Z, mu, S) AZZA = self.ZA.T[:, None, :, None] * self.ZA[None, :, None, :] @@ -226,7 +247,7 @@ class Linear(Kern): type_converters=weave.converters.blitz,**weave_options) - def dpsi2_dZ(self, dL_dpsi2, Z, mu, S, target): + def _weave_dpsi2_dZ(self, dL_dpsi2, Z, mu, S, target): self._psi_computations(Z, mu, S) #psi2_dZ = dL_dpsi2[:, :, :, None] * self.variances * self.ZAinner[:, :, None, :] #dummy_target = np.zeros_like(target) @@ -261,9 +282,6 @@ class Linear(Kern): type_converters=weave.converters.blitz,**weave_options) - - - #---------------------------------------# # Precomputations # #---------------------------------------# diff --git a/GPy/util/caching.py b/GPy/util/caching.py index 1f10cd64..6bf9aab1 100644 --- a/GPy/util/caching.py +++ b/GPy/util/caching.py @@ -1,5 +1,4 @@ from ..core.parameterization.parameter_core import Observable -from ..core.parameterization.array_core import ParamList class Cacher(object): def __init__(self, operation, limit=5, reset_on_first=False): @@ -13,7 +12,7 @@ class Cacher(object): def __call__(self, *args): if self._reset_on_first: assert isinstance(args[0], Observable) - args[0].add_observer(args[0], self.reset) + args[0].add_observer(self.reset) cached_args = args else: cached_args = args[1:] @@ -30,21 +29,21 @@ class Cacher(object): else: if len(self.cached_inputs) == self.limit: args_ = self.cached_inputs.pop(0) - [a.remove_observer(self) for a in args_] + [a.remove_observer(self.on_cache_changed) for a in args_] self.inputs_changed.pop(0) self.cached_outputs.pop(0) self.cached_inputs.append(cached_args) self.cached_outputs.append(self.operation(*args)) self.inputs_changed.append(False) - [a.add_observer(self, self.on_cache_changed) for a in args] + [a.add_observer(self.on_cache_changed) for a in args] return self.cached_outputs[-1] def on_cache_changed(self, arg): self.inputs_changed = [any([a is arg for a in args]) or old_ic for args, old_ic in zip(self.cached_inputs, self.inputs_changed)] def reset(self, obj): - [[a.remove_observer(self) for a in args] for args in self.cached_inputs] + [[a.remove_observer(self.reset) for a in args] for args in self.cached_inputs] self.cached_inputs = [] self.cached_outputs = [] self.inputs_changed = [] From b19f9b9f33b671ff0a95e111f0fed6318d8d4663 Mon Sep 17 00:00:00 2001 From: Max Zwiessele Date: Fri, 21 Feb 2014 10:38:11 +0000 Subject: [PATCH 22/38] gradient operations and cachong --- GPy/core/parameterization/array_core.py | 6 +- GPy/core/parameterization/param.py | 11 +- GPy/core/parameterization/parameter_core.py | 248 ++++++++++---------- GPy/core/parameterization/parameterized.py | 8 +- GPy/kern/_src/kern.py | 4 +- GPy/kern/_src/linear.py | 2 +- GPy/models/bayesian_gplvm.py | 2 +- GPy/util/caching.py | 8 +- 8 files changed, 151 insertions(+), 
138 deletions(-) diff --git a/GPy/core/parameterization/array_core.py b/GPy/core/parameterization/array_core.py index b12ca59b..dffe2ed1 100644 --- a/GPy/core/parameterization/array_core.py +++ b/GPy/core/parameterization/array_core.py @@ -30,12 +30,16 @@ class ObservableArray(np.ndarray, Observable): def __new__(cls, input_array): obj = np.atleast_1d(input_array).view(cls) cls.__name__ = "ObservableArray\n " - obj._observer_callables_ = {} return obj + + def __init__(self, *a, **kw): + super(ObservableArray, self).__init__(*a, **kw) + def __array_finalize__(self, obj): # see InfoArray.__array_finalize__ for comments if obj is None: return self._observer_callables_ = getattr(obj, '_observer_callables_', None) + def __array_wrap__(self, out_arr, context=None): return out_arr.view(np.ndarray) diff --git a/GPy/core/parameterization/param.py b/GPy/core/parameterization/param.py index 4c2cb469..c2c70f5c 100644 --- a/GPy/core/parameterization/param.py +++ b/GPy/core/parameterization/param.py @@ -15,7 +15,7 @@ __precision__ = numpy.get_printoptions()['precision'] # numpy printing precision __print_threshold__ = 5 ###### -class Param(ObservableArray, Constrainable, Gradcheckable, Indexable, Parentable): +class Param(Constrainable, ObservableArray, Gradcheckable, Indexable): """ Parameter object for GPy models. @@ -57,8 +57,8 @@ class Param(ObservableArray, Constrainable, Gradcheckable, Indexable, Parentable obj._gradient_ = None return obj - def __init__(self, name, input_array, default_constraint=None): - super(Param, self).__init__(name=name, default_constraint=default_constraint) + def __init__(self, name, input_array, default_constraint=None, *a, **kw): + super(Param, self).__init__(name=name, default_constraint=default_constraint, *a, **kw) def __array_finalize__(self, obj): # see InfoArray.__array_finalize__ for comments @@ -144,7 +144,10 @@ class Param(ObservableArray, Constrainable, Gradcheckable, Indexable, Parentable return self.flat def _collect_gradient(self, target): - target[:] = self.gradient.flat + target += self.gradient.flat + + def _set_gradient(self, g): + self.gradient = g #=========================================================================== # Array operations -> done diff --git a/GPy/core/parameterization/parameter_core.py b/GPy/core/parameterization/parameter_core.py index f8d83edd..5e5e5432 100644 --- a/GPy/core/parameterization/parameter_core.py +++ b/GPy/core/parameterization/parameter_core.py @@ -7,18 +7,24 @@ __updated__ = '2013-12-16' def adjust_name_for_printing(name): if name is not None: - return name.replace(" ", "_").replace(".", "_").replace("-","").replace("+","").replace("!","").replace("*","").replace("/","") + return name.replace(" ", "_").replace(".", "_").replace("-", "").replace("+", "").replace("!", "").replace("*", "").replace("/", "") return '' class Observable(object): - _observer_callables_ = {} - def add_observer(self, callble): - self._observer_callables_.append(callble) - #callble(self) - def remove_observer(self, callble): - del self._observer_callables_[callble] + def __init__(self, *args, **kwargs): + from collections import defaultdict + self._observer_callables_ = defaultdict(list) + + def add_observer(self, observer, callble): + self._observer_callables_[observer].append(callble) + # callble(self) + + def remove_observer(self, observer, callble): + del self._observer_callables_[observer][callble] + def _notify_observers(self): - [callble(self) for callble in self._observer_callables_] + [[callble(self) for callble in callables] + for 
callables in self._observer_callables_.itervalues()] class Pickleable(object): def _getstate(self): @@ -47,10 +53,8 @@ class Pickleable(object): #=============================================================================== class Parentable(object): - def __init__(self, direct_parent=None, parent_index=None): - super(Parentable,self).__init__() - self._direct_parent_ = direct_parent - self._parent_index_ = parent_index + _direct_parent_ = None + _parent_index_ = None def has_parent(self): return self._direct_parent_ is not None @@ -73,9 +77,8 @@ class Parentable(object): self._direct_parent_._notify_parameters_changed() class Nameable(Parentable): - _name = None - def __init__(self, name, direct_parent=None, parent_index=None): - super(Nameable,self).__init__(direct_parent, parent_index) + def __init__(self, name, *a, **kw): + super(Nameable, self).__init__(*a, **kw) self._name = name or self.__class__.__name__ @property @@ -95,108 +98,10 @@ class Nameable(Parentable): return self._direct_parent_.hirarchy_name() + "." + adjust(self.name) return adjust(self.name) -class Parameterizable(Parentable): - def __init__(self, *args, **kwargs): - super(Parameterizable, self).__init__(*args, **kwargs) - from GPy.core.parameterization.array_core import ParamList - _parameters_ = ParamList() - self._added_names_ = set() - - def parameter_names(self, add_self=False, adjust_for_printing=False, recursive=True): - if adjust_for_printing: adjust = lambda x: adjust_name_for_printing(x) - else: adjust = lambda x: x - if recursive: names = [xi for x in self._parameters_ for xi in x.parameter_names(add_self=True, adjust_for_printing=adjust_for_printing)] - else: names = [adjust(x.name) for x in self._parameters_] - if add_self: names = map(lambda x: adjust(self.name) + "." + x, names) - return names - - def _add_parameter_name(self, param): - pname = adjust_name_for_printing(param.name) - # and makes sure to not delete programmatically added parameters - if pname in self.__dict__: - if not (param is self.__dict__[pname]): - if pname in self._added_names_: - del self.__dict__[pname] - self._add_parameter_name(param) - else: - self.__dict__[pname] = param - self._added_names_.add(pname) - - def _remove_parameter_name(self, param=None, pname=None): - assert param is None or pname is None, "can only delete either param by name, or the name of a param" - pname = adjust_name_for_printing(pname) or adjust_name_for_printing(param.name) - if pname in self._added_names_: - del self.__dict__[pname] - self._added_names_.remove(pname) - self._connect_parameters() - - def _name_changed(self, param, old_name): - self._remove_parameter_name(None, old_name) - self._add_parameter_name(param) - - def _collect_gradient(self, target): - import itertools - [p._collect_gradient(target[s]) for p, s in itertools.izip(self._parameters_, self._param_slices_)] - - def _get_params(self): - import numpy as np - # don't overwrite this anymore! - if not self.size: - return np.empty(shape=(0,), dtype=np.float64) - return np.hstack([x._get_params() for x in self._parameters_ if x.size > 0]) - - def _set_params(self, params, update=True): - # don't overwrite this anymore! 
- import itertools - [p._set_params(params[s], update=update) for p, s in itertools.izip(self._parameters_, self._param_slices_)] - self.parameters_changed() - - def copy(self): - """Returns a (deep) copy of the current model""" - import copy - from .index_operations import ParameterIndexOperations, ParameterIndexOperationsView - from .array_core import ParamList - dc = dict() - for k, v in self.__dict__.iteritems(): - if k not in ['_direct_parent_', '_parameters_', '_parent_index_'] + self.parameter_names(): - if isinstance(v, (Constrainable, ParameterIndexOperations, ParameterIndexOperationsView)): - dc[k] = v.copy() - else: - dc[k] = copy.deepcopy(v) - if k == '_parameters_': - params = [p.copy() for p in v] - #dc = copy.deepcopy(self.__dict__) - dc['_direct_parent_'] = None - dc['_parent_index_'] = None - dc['_parameters_'] = ParamList() - s = self.__new__(self.__class__) - s.__dict__ = dc - #import ipdb;ipdb.set_trace() - for p in params: - s.add_parameter(p) - #dc._notify_parent_change() - return s - #return copy.deepcopy(self) - - def _notify_parameters_changed(self): - self.parameters_changed() - if self.has_parent(): - self._direct_parent_._notify_parameters_changed() - - def parameters_changed(self): - """ - This method gets called when parameters have changed. - Another way of listening to param changes is to - add self as a listener to the param, such that - updates get passed through. See :py:function:``GPy.core.param.Observable.add_observer`` - """ - pass - class Gradcheckable(Parentable): - #=========================================================================== - # Gradchecking - #=========================================================================== + def __init__(self, *a, **kw): + super(Gradcheckable, self).__init__(*a, **kw) def checkgrad(self, verbose=0, step=1e-6, tolerance=1e-3): if self.has_parent(): return self._highest_parent_._checkgrad(self, verbose=verbose, step=step, tolerance=tolerance) @@ -204,6 +109,7 @@ class Gradcheckable(Parentable): def _checkgrad(self, param): raise NotImplementedError, "Need log likelihood to check gradient against" + class Indexable(object): def _raveled_index(self): raise NotImplementedError, "Need to be able to get the raveled Index" @@ -222,9 +128,10 @@ class Indexable(object): """ raise NotImplementedError, "shouldnt happen, raveld index transformation required from non parameterization object?" 
-class Constrainable(Nameable, Indexable, Parentable): - def __init__(self, name, default_constraint=None): - super(Constrainable,self).__init__(name) + +class Constrainable(Nameable, Indexable): + def __init__(self, name, default_constraint=None, *a, **kw): + super(Constrainable, self).__init__(name=name, *a, **kw) self._default_constraint_ = default_constraint from index_operations import ParameterIndexOperations self.constraints = ParameterIndexOperations() @@ -275,7 +182,7 @@ class Constrainable(Nameable, Indexable, Parentable): def _set_unfixed(self, index): import numpy as np if not self._has_fixes(): self._fixes_ = np.ones(self.size, dtype=bool) - #rav_i = self._raveled_index_for(param)[index] + # rav_i = self._raveled_index_for(param)[index] self._fixes_[index] = UNFIXED if np.all(self._fixes_): self._fixes_ = None # ==UNFIXED @@ -305,7 +212,7 @@ class Constrainable(Nameable, Indexable, Parentable): """evaluate the prior""" if self.priors.size > 0: x = self._get_params() - return reduce(lambda a,b: a+b, [p.lnpdf(x[ind]).sum() for p, ind in self.priors.iteritems()], 0) + return reduce(lambda a, b: a + b, [p.lnpdf(x[ind]).sum() for p, ind in self.priors.iteritems()], 0) return 0. def _log_prior_gradients(self): @@ -409,7 +316,7 @@ class Constrainable(Nameable, Indexable, Parentable): if len(transforms) == 0: transforms = which.properties() import numpy as np - removed = np.empty((0, ), dtype=int) + removed = np.empty((0,), dtype=int) for t in transforms: unconstrained = which.remove(t, self._raveled_index()) removed = np.union1d(removed, unconstrained) @@ -419,5 +326,104 @@ class Constrainable(Nameable, Indexable, Parentable): return removed +class Parameterizable(Constrainable): + def __init__(self, *args, **kwargs): + super(Parameterizable, self).__init__(*args, **kwargs) + from GPy.core.parameterization.array_core import ParamList + _parameters_ = ParamList() + self._added_names_ = set() + + def parameter_names(self, add_self=False, adjust_for_printing=False, recursive=True): + if adjust_for_printing: adjust = lambda x: adjust_name_for_printing(x) + else: adjust = lambda x: x + if recursive: names = [xi for x in self._parameters_ for xi in x.parameter_names(add_self=True, adjust_for_printing=adjust_for_printing)] + else: names = [adjust(x.name) for x in self._parameters_] + if add_self: names = map(lambda x: adjust(self.name) + "." 
+ x, names) + return names + + def _add_parameter_name(self, param): + pname = adjust_name_for_printing(param.name) + # and makes sure to not delete programmatically added parameters + if pname in self.__dict__: + if not (param is self.__dict__[pname]): + if pname in self._added_names_: + del self.__dict__[pname] + self._add_parameter_name(param) + else: + self.__dict__[pname] = param + self._added_names_.add(pname) + + def _remove_parameter_name(self, param=None, pname=None): + assert param is None or pname is None, "can only delete either param by name, or the name of a param" + pname = adjust_name_for_printing(pname) or adjust_name_for_printing(param.name) + if pname in self._added_names_: + del self.__dict__[pname] + self._added_names_.remove(pname) + self._connect_parameters() + def _name_changed(self, param, old_name): + self._remove_parameter_name(None, old_name) + self._add_parameter_name(param) + + def _collect_gradient(self, target): + import itertools + [p._collect_gradient(target[s]) for p, s in itertools.izip(self._parameters_, self._param_slices_)] + + def _set_gradient(self, g): + import itertools + [p._set_gradient(g[s]) for p, s in itertools.izip(self._parameters_, self._param_slices_)] + + def _get_params(self): + import numpy as np + # don't overwrite this anymore! + if not self.size: + return np.empty(shape=(0,), dtype=np.float64) + return np.hstack([x._get_params() for x in self._parameters_ if x.size > 0]) + + def _set_params(self, params, update=True): + # don't overwrite this anymore! + import itertools + [p._set_params(params[s], update=update) for p, s in itertools.izip(self._parameters_, self._param_slices_)] + self.parameters_changed() + + def copy(self): + """Returns a (deep) copy of the current model""" + import copy + from .index_operations import ParameterIndexOperations, ParameterIndexOperationsView + from .array_core import ParamList + dc = dict() + for k, v in self.__dict__.iteritems(): + if k not in ['_direct_parent_', '_parameters_', '_parent_index_'] + self.parameter_names(): + if isinstance(v, (Constrainable, ParameterIndexOperations, ParameterIndexOperationsView)): + dc[k] = v.copy() + else: + dc[k] = copy.deepcopy(v) + if k == '_parameters_': + params = [p.copy() for p in v] + # dc = copy.deepcopy(self.__dict__) + dc['_direct_parent_'] = None + dc['_parent_index_'] = None + dc['_parameters_'] = ParamList() + s = self.__new__(self.__class__) + s.__dict__ = dc + # import ipdb;ipdb.set_trace() + for p in params: + s.add_parameter(p) + # dc._notify_parent_change() + return s + # return copy.deepcopy(self) + + def _notify_parameters_changed(self): + self.parameters_changed() + if self.has_parent(): + self._direct_parent_._notify_parameters_changed() + + def parameters_changed(self): + """ + This method gets called when parameters have changed. + Another way of listening to param changes is to + add self as a listener to the param, such that + updates get passed through. 
See :py:function:``GPy.core.param.Observable.add_observer`` + """ + pass diff --git a/GPy/core/parameterization/parameterized.py b/GPy/core/parameterization/parameterized.py index 12bf936c..177cc217 100644 --- a/GPy/core/parameterization/parameterized.py +++ b/GPy/core/parameterization/parameterized.py @@ -7,11 +7,11 @@ import cPickle import itertools from re import compile, _pattern_type from param import ParamConcatenation -from parameter_core import Constrainable, Pickleable, Observable, Parameterizable, Parentable, adjust_name_for_printing, Gradcheckable +from parameter_core import Constrainable, Pickleable, Parentable, Observable, Parameterizable, adjust_name_for_printing, Gradcheckable from transformations import __fixed__ from array_core import ParamList -class Parameterized(Constrainable, Pickleable, Observable, Gradcheckable, Parameterizable, Parentable): +class Parameterized(Parameterizable, Pickleable, Observable, Gradcheckable): """ Parameterized class @@ -53,8 +53,8 @@ class Parameterized(Constrainable, Pickleable, Observable, Gradcheckable, Parame If you want to operate on all parameters use m[''] to wildcard select all paramters and concatenate them. Printing m[''] will result in printing of all parameters in detail. """ - def __init__(self, name=None): - super(Parameterized, self).__init__(name=name) + def __init__(self, name=None, *a, **kw): + super(Parameterized, self).__init__(name=name, parent=None, parent_index=None, *a, **kw) self._in_init_ = True self._parameters_ = ParamList() self.size = sum(p.size for p in self._parameters_) diff --git a/GPy/kern/_src/kern.py b/GPy/kern/_src/kern.py index 6e9199dd..9e98b97b 100644 --- a/GPy/kern/_src/kern.py +++ b/GPy/kern/_src/kern.py @@ -9,7 +9,7 @@ from ...core.parameterization.param import Param class Kern(Parameterized): - def __init__(self, input_dim, name): + def __init__(self, input_dim, name, *a, **kw): """ The base class for a kernel: a positive definite function which forms of a covariance function (kernel). @@ -19,7 +19,7 @@ class Kern(Parameterized): Do not instantiate. 
""" - super(Kern, self).__init__(name) + super(Kern, self).__init__(name=name, *a, **kw) self.input_dim = input_dim def K(self, X, X2): diff --git a/GPy/kern/_src/linear.py b/GPy/kern/_src/linear.py index e8cf2e87..7822a1f6 100644 --- a/GPy/kern/_src/linear.py +++ b/GPy/kern/_src/linear.py @@ -9,7 +9,7 @@ from ...util.linalg import tdot from ...util.misc import fast_array_equal, param_to_array from ...core.parameterization import Param from ...core.parameterization.transformations import Logexp -from ...util.caching import Cacher, cache_this +from ...util.caching import cache_this class Linear(Kern): """ diff --git a/GPy/models/bayesian_gplvm.py b/GPy/models/bayesian_gplvm.py index 5fb1ca59..8aa378ce 100644 --- a/GPy/models/bayesian_gplvm.py +++ b/GPy/models/bayesian_gplvm.py @@ -36,7 +36,7 @@ class BayesianGPLVM(SparseGP, GPLVM): assert Z.shape[1] == X.shape[1] if kernel is None: - kernel = kern.rbf(input_dim) # + kern.white(input_dim) + kernel = kern.RBF(input_dim) # + kern.white(input_dim) if likelihood is None: likelihood = Gaussian() diff --git a/GPy/util/caching.py b/GPy/util/caching.py index 6bf9aab1..55e546df 100644 --- a/GPy/util/caching.py +++ b/GPy/util/caching.py @@ -12,7 +12,7 @@ class Cacher(object): def __call__(self, *args): if self._reset_on_first: assert isinstance(args[0], Observable) - args[0].add_observer(self.reset) + args[0].add_observer(self, self.reset) cached_args = args else: cached_args = args[1:] @@ -29,21 +29,21 @@ class Cacher(object): else: if len(self.cached_inputs) == self.limit: args_ = self.cached_inputs.pop(0) - [a.remove_observer(self.on_cache_changed) for a in args_] + [a.remove_observer(self, self.on_cache_changed) for a in args_] self.inputs_changed.pop(0) self.cached_outputs.pop(0) self.cached_inputs.append(cached_args) self.cached_outputs.append(self.operation(*args)) self.inputs_changed.append(False) - [a.add_observer(self.on_cache_changed) for a in args] + [a.add_observer(self, self.on_cache_changed) for a in args] return self.cached_outputs[-1] def on_cache_changed(self, arg): self.inputs_changed = [any([a is arg for a in args]) or old_ic for args, old_ic in zip(self.cached_inputs, self.inputs_changed)] def reset(self, obj): - [[a.remove_observer(self.reset) for a in args] for args in self.cached_inputs] + [[a.remove_observer(self, self.reset) for a in args] for args in self.cached_inputs] self.cached_inputs = [] self.cached_outputs = [] self.inputs_changed = [] From 8b2f39450bffa5f6701924f0161a496829c46b65 Mon Sep 17 00:00:00 2001 From: James Hensman Date: Fri, 21 Feb 2014 10:38:47 +0000 Subject: [PATCH 23/38] workin gon linear kernel --- GPy/core/parameterization/array_core.py | 2 +- GPy/core/parameterization/param.py | 2 +- GPy/core/parameterization/parameter_core.py | 2 +- GPy/examples/dimensionality_reduction.py | 11 +- GPy/kern/_src/linear.py | 204 +++++++------------- GPy/kern/_src/rbf.py | 2 +- 6 files changed, 83 insertions(+), 140 deletions(-) diff --git a/GPy/core/parameterization/array_core.py b/GPy/core/parameterization/array_core.py index b12ca59b..642ea823 100644 --- a/GPy/core/parameterization/array_core.py +++ b/GPy/core/parameterization/array_core.py @@ -30,7 +30,7 @@ class ObservableArray(np.ndarray, Observable): def __new__(cls, input_array): obj = np.atleast_1d(input_array).view(cls) cls.__name__ = "ObservableArray\n " - obj._observer_callables_ = {} + obj._observer_callables_ = [] return obj def __array_finalize__(self, obj): # see InfoArray.__array_finalize__ for comments diff --git a/GPy/core/parameterization/param.py 
b/GPy/core/parameterization/param.py index 4c2cb469..44a27bdf 100644 --- a/GPy/core/parameterization/param.py +++ b/GPy/core/parameterization/param.py @@ -144,7 +144,7 @@ class Param(ObservableArray, Constrainable, Gradcheckable, Indexable, Parentable return self.flat def _collect_gradient(self, target): - target[:] = self.gradient.flat + target += self.gradient.flat #=========================================================================== # Array operations -> done diff --git a/GPy/core/parameterization/parameter_core.py b/GPy/core/parameterization/parameter_core.py index f8d83edd..d9f7c616 100644 --- a/GPy/core/parameterization/parameter_core.py +++ b/GPy/core/parameterization/parameter_core.py @@ -11,7 +11,7 @@ def adjust_name_for_printing(name): return '' class Observable(object): - _observer_callables_ = {} + _observer_callables_ = [] def add_observer(self, callble): self._observer_callables_.append(callble) #callble(self) diff --git a/GPy/examples/dimensionality_reduction.py b/GPy/examples/dimensionality_reduction.py index 80e77c57..3b5dcbf0 100644 --- a/GPy/examples/dimensionality_reduction.py +++ b/GPy/examples/dimensionality_reduction.py @@ -21,10 +21,11 @@ def bgplvm_test_model(optimize=False, verbose=1, plot=False, output_dim=200, nan # generate GPLVM-like data X = _np.random.rand(num_inputs, input_dim) - lengthscales = _np.random.rand(input_dim) - k = (GPy.kern.RBF(input_dim, .5, lengthscales, ARD=True) - #+ GPy.kern.white(input_dim, 0.01) - ) + #lengthscales = _np.random.rand(input_dim) + #k = (GPy.kern.RBF(input_dim, .5, lengthscales, ARD=True) + ##+ GPy.kern.white(input_dim, 0.01) + #) + k = GPy.kern.Linear(input_dim)# + GPy.kern.bias(input_dim) + GPy.kern.white(input_dim, 0.00001) K = k.K(X) Y = _np.random.multivariate_normal(_np.zeros(num_inputs), K, (output_dim,)).T @@ -48,7 +49,7 @@ def bgplvm_test_model(optimize=False, verbose=1, plot=False, output_dim=200, nan # randomly obstruct data with percentage p #=========================================================================== #m2 = GPy.models.BayesianGPLVMWithMissingData(Y_obstruct, input_dim, kernel=k, num_inducing=num_inducing) - m.lengthscales = lengthscales + #m.lengthscales = lengthscales if plot: import matplotlib.pyplot as pb diff --git a/GPy/kern/_src/linear.py b/GPy/kern/_src/linear.py index e8cf2e87..1454e684 100644 --- a/GPy/kern/_src/linear.py +++ b/GPy/kern/_src/linear.py @@ -47,12 +47,13 @@ class Linear(Kern): self.variances = Param('variances', variances, Logexp()) self.add_parameter(self.variances) - self.variances.add_observer(self, self._on_changed) + self.variances.add_observer(self._on_changed) def _on_changed(self, obj): + #TODO: move this to base class? isnt it jst for the caching? 
self._notify_observers() - @cache_this(limit=3, reset_on_self=True) + #@cache_this(limit=3, reset_on_self=True) def K(self, X, X2=None): if self.ARD: if X2 is None: @@ -63,7 +64,7 @@ class Linear(Kern): else: return self._dot_product(X, X2) * self.variances - @cache_this(limit=3, reset_on_self=False) + #@cache_this(limit=3, reset_on_self=False) def _dot_product(self, X, X2=None): if X2 is None: return tdot(X) @@ -73,43 +74,33 @@ class Linear(Kern): def Kdiag(self, X): return np.sum(self.variances * np.square(X), -1) - def update_gradients_full(self, dL_dK, X): - self.variances.gradient = np.zeros(self.variances.size) - self._param_grad_helper(dL_dK, X, None, self.variances.gradient) - def update_gradients_sparse(self, dL_dKmm, dL_dKnm, dL_dKdiag, X, Z): + target = np.zeros(self.size) + self.update_gradients_diag(dL_dKdiag, X) + self._collect_gradient(target) + self.update_gradients_full(dL_dKnm, X, Z) + self._collect_gradient(target) + self.update_gradients_full(dL_dKmm, Z, None) + self._collect_gradient(target) + return target + + def update_gradients_full(self, dL_dK, X): + if self.ARD: + if X2 is None: + self.variances.gradient = np.array([np.sum(dL_dK * tdot(X[:, i:i + 1])) for i in range(self.input_dim)]) + else: + product = X[:, None, :] * X2[None, :, :] + self.variances.gradient = (dL_dK[:, :, None] * product).sum(0).sum(0) + else: + self.variances.gradient = np.sum(self._dot_product(X, X2) * dL_dK) + + def update_gradients_diag(self, dL_dKdiag, X): tmp = dL_dKdiag[:, None] * X ** 2 if self.ARD: self.variances.gradient = tmp.sum(0) else: self.variances.gradient = np.atleast_1d(tmp.sum()) - self._param_grad_helper(dL_dKmm, Z, None, self.variances.gradient) - self._param_grad_helper(dL_dKnm, X, Z, self.variances.gradient) - def update_gradients_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z): - self._psi_computations(Z, mu, S) - # psi0: - tmp = dL_dpsi0[:, None] * self.mu2_S - if self.ARD: self.variances.gradient[:] = tmp.sum(0) - else: self.variances.gradient[:] = tmp.sum() - #psi1 - self._param_grad_helper(dL_dpsi1, mu, Z, self.variances.gradient) - #psi2 - tmp = dL_dpsi2[:, :, :, None] * (self.ZAinner[:, :, None, :] * (2 * Z)[None, None, :, :]) - if self.ARD: self.variances.gradient += tmp.sum(0).sum(0).sum(0) - else: self.variances.gradient += tmp.sum() - #from Kmm - self._param_grad_helper(dL_dKmm, Z, None, self.variances.gradient) - - def _param_grad_helper(self, dL_dK, X, X2, target): - if self.ARD: - if X2 is None: - [np.add(target[i:i + 1], np.sum(dL_dK * tdot(X[:, i:i + 1])), target[i:i + 1]) for i in range(self.input_dim)] - else: - product = X[:, None, :] * X2[None, :, :] - target += (dL_dK[:, :, None] * product).sum(0).sum(0) - else: - target += np.sum(self._dot_product(X, X2) * dL_dK) def gradients_X(self, dL_dK, X, X2=None): if X2 is None: @@ -119,12 +110,37 @@ class Linear(Kern): def gradients_X_diag(self, dL_dKdiag, X): return 2.*self.variances*dL_dKdiag[:,None]*X - + #---------------------------------------# # PSI statistics # # variational # #---------------------------------------# + def psi0(self, Z, mu, S): + return np.sum(self.variances * self._mu2S(mu, S), 1) + + def psi1(self, Z, mu, S): + return self.K(mu, Z) #the variance, it does nothing + + def psi2(self, Z, mu, S): + ZA = Z * self.variances + ZAinner = self._ZAinner(mu, S, Z) + return np.dot(ZAinner, ZA.T) + + def update_gradients_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z): + # psi0: + tmp = dL_dpsi0[:, None] * self._mu2S(mu, S) + if self.ARD: 
self.variances.gradient[:] = tmp.sum(0) + else: self.variances.gradient[:] = tmp.sum() + #psi1 + self.variances.gradient += self._param_grad_helper(dL_dpsi1, mu, Z) + #psi2 + tmp = dL_dpsi2[:, :, :, None] * (self._ZAinner(mu, S, Z)[:, :, None, :] * (2. * Z)[None, None, :, :]) + if self.ARD: self.variances.gradient += tmp.sum(0).sum(0).sum(0) + else: self.variances.gradient += tmp.sum() + #from Kmm + self.variances.gradient += self._param_grad_helper(dL_dKmm, Z, None) + def gradients_Z_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z): # Kmm grad = self.gradients_X(dL_dKmm, Z, None) @@ -135,76 +151,30 @@ class Linear(Kern): return grad def gradients_muS_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z): - target_mu, target_S = np.zeros(mu.shape), np.zeros(mu.shape) + grad_mu, grad_S = np.zeros(mu.shape), np.zeros(mu.shape) # psi0 - target_mu += dL_dpsi0[:, None] * (2.0 * mu * self.variances) - target_S += dL_dpsi0[:, None] * self.variances + grad_mu += dL_dpsi0[:, None] * (2.0 * mu * self.variances) + grad_S += dL_dpsi0[:, None] * self.variances # psi1 - target_mu += (dL_dpsi1[:, :, None] * (Z * self.variances)).sum(1) + grad_mu += (dL_dpsi1[:, :, None] * (Z * self.variances)).sum(1) # psi2 - self._weave_dpsi2_dmuS(dL_dpsi2, Z, mu, S, target_mu, target_S) - - return target_mu, target_S - - def psi0(self, Z, mu, S): - self._psi_computations(Z, mu, S) - return np.sum(self.variances * self.mu2_S, 1) + self._weave_dpsi2_dmuS(dL_dpsi2, Z, mu, S, grad_mu, grad_S) - def psi1(self, Z, mu, S): - """the variance, it does nothing""" - self._psi1 = self.K(mu, Z) - return self._psi1 + return grad_mu, grad_S - def psi2(self, Z, mu, S): - self._psi_computations(Z, mu, S) - return self._psi2 - - def dpsi0_dmuS(self, dL_dpsi0, Z, mu, S, target_mu, target_S): - target_mu += dL_dpsi0[:, None] * (2.0 * mu * self.variances) - target_S += dL_dpsi0[:, None] * self.variances + #--------------------------------------------------# + # Helpers for psi statistics # + #--------------------------------------------------# - def dpsi1_dmuS(self, dL_dpsi1, Z, mu, S, target_mu, target_S): - """Do nothing for S, it does not affect psi1""" - self._psi_computations(Z, mu, S) - target_mu += (dL_dpsi1[:, :, None] * (Z * self.variances)).sum(1) - - - def psi2_new(self,Z,mu,S,target): - tmp = np.zeros((mu.shape[0], Z.shape[0])) - self.K(mu,Z,tmp) - target += tmp[:,:,None]*tmp[:,None,:] + np.sum(S[:,None,None,:]*self.variances**2*Z[None,:,None,:]*Z[None,None,:,:],-1) - - def dpsi2_dtheta_new(self, dL_dpsi2, Z, mu, S, target): - tmp = np.zeros((mu.shape[0], Z.shape[0])) - self.K(mu,Z,tmp) - self._param_grad_helper(2.*np.sum(dL_dpsi2*tmp[:,None,:],2),mu,Z,target) - result= 2.*(dL_dpsi2[:,:,:,None]*S[:,None,None,:]*self.variances*Z[None,:,None,:]*Z[None,None,:,:]).sum(0).sum(0).sum(0) - if self.ARD: - target += result.sum(0).sum(0).sum(0) - else: - target += result.sum() - - def dpsi2_dmuS_new(self, dL_dpsi2, Z, mu, S, target_mu, target_S): - tmp = np.zeros((mu.shape[0], Z.shape[0])) - self.K(mu,Z,tmp) - self.gradients_X(2.*np.sum(dL_dpsi2*tmp[:,None,:],2),mu,Z,target_mu) - - Zs = Z*self.variances - Zs_sq = Zs[:,None,:]*Zs[None,:,:] - target_S += (dL_dpsi2[:,:,:,None]*Zs_sq[None,:,:,:]).sum(1).sum(1) def _weave_dpsi2_dmuS(self, dL_dpsi2, Z, mu, S, target_mu, target_S): - """Think N,num_inducing,num_inducing,input_dim """ - self._psi_computations(Z, mu, S) - AZZA = self.ZA.T[:, None, :, None] * self.ZA[None, :, None, :] + # Think N,num_inducing,num_inducing,input_dim + ZA = Z * self.variances + AZZA = 
ZA.T[:, None, :, None] * ZA[None, :, None, :] AZZA = AZZA + AZZA.swapaxes(1, 2) AZZA_2 = AZZA/2. - #muAZZA = np.tensordot(mu,AZZA,(-1,0)) - #target_mu_dummy, target_S_dummy = np.zeros_like(target_mu), np.zeros_like(target_S) - #target_mu_dummy += (dL_dpsi2[:, :, :, None] * muAZZA).sum(1).sum(1) - #target_S_dummy += (dL_dpsi2[:, :, :, None] * self.ZA[None, :, None, :] * self.ZA[None, None, :, :]).sum(1).sum(1) - #Using weave, we can exploiut the symmetry of this problem: + #Using weave, we can exploit the symmetry of this problem: code = """ int n, m, mm,q,qq; double factor,tmp; @@ -248,12 +218,8 @@ class Linear(Kern): def _weave_dpsi2_dZ(self, dL_dpsi2, Z, mu, S, target): - self._psi_computations(Z, mu, S) - #psi2_dZ = dL_dpsi2[:, :, :, None] * self.variances * self.ZAinner[:, :, None, :] - #dummy_target = np.zeros_like(target) - #dummy_target += psi2_dZ.sum(0).sum(0) - AZA = self.variances*self.ZAinner + AZA = self.variances*self._ZAinner(mu, S, Z) code=""" int n,m,mm,q; #pragma omp parallel for private(n,mm,q) @@ -282,38 +248,14 @@ class Linear(Kern): type_converters=weave.converters.blitz,**weave_options) - #---------------------------------------# - # Precomputations # - #---------------------------------------# + def _mu2S(self, mu, S): + return np.square(mu) + S - #def _K_computations(self, X, X2): - #if not (fast_array_equal(X, self._X) and fast_array_equal(X2, self._X2)): - #self._X = X.copy() - #if X2 is None: - ##self._dot_product = tdot(param_to_array(X)) - #self._X2 = None - #else: - #self._X2 = X2.copy() - #self._dot_product = np.dot(param_to_array(X), param_to_array(X2.T)) + def _ZAinner(self, mu, S, Z): + ZA = Z*self.variances + inner = (mu[:, None, :] * mu[:, :, None]) + diag_indices = np.diag_indices(mu.shape[1], 2) + inner[:, diag_indices[0], diag_indices[1]] += S + + return np.dot(ZA, inner).swapaxes(0, 1) # NOTE: self.ZAinner \in [num_inducing x N x input_dim]! - def _psi_computations(self, Z, mu, S): - # here are the "statistics" for psi1 and psi2 - Zv_changed = not (fast_array_equal(Z, self._Z) and fast_array_equal(self.variances, self._variances)) - muS_changed = not (fast_array_equal(mu, self._mu) and fast_array_equal(S, self._S)) - if Zv_changed: - # Z has changed, compute Z specific stuff - # self.ZZ = Z[:,None,:]*Z[None,:,:] # num_inducing,num_inducing,input_dim -# self.ZZ = np.empty((Z.shape[0], Z.shape[0], Z.shape[1]), order='F') -# [tdot(Z[:, i:i + 1], self.ZZ[:, :, i].T) for i in xrange(Z.shape[1])] - self.ZA = Z * self.variances - self._Z = Z.copy() - self._variances = self.variances.copy() - if muS_changed: - self.mu2_S = np.square(mu) + S - self.inner = (mu[:, None, :] * mu[:, :, None]) - diag_indices = np.diag_indices(mu.shape[1], 2) - self.inner[:, diag_indices[0], diag_indices[1]] += S - self._mu, self._S = mu.copy(), S.copy() - if Zv_changed or muS_changed: - self.ZAinner = np.dot(self.ZA, self.inner).swapaxes(0, 1) # NOTE: self.ZAinner \in [num_inducing x N x input_dim]! 
- self._psi2 = np.dot(self.ZAinner, self.ZA.T) diff --git a/GPy/kern/_src/rbf.py b/GPy/kern/_src/rbf.py index 4fc2b591..807cac32 100644 --- a/GPy/kern/_src/rbf.py +++ b/GPy/kern/_src/rbf.py @@ -54,7 +54,7 @@ class RBF(Kern): self.variance = Param('variance', variance, Logexp()) self.lengthscale = Param('lengthscale', lengthscale, Logexp()) - self.lengthscale.add_observer(self, self.update_lengthscale) + self.lengthscale.add_observer(self.update_lengthscale) self.update_lengthscale(self.lengthscale) self.add_parameters(self.variance, self.lengthscale) From 0dc9a32ba3d1b7034978930af228adb63c04d72b Mon Sep 17 00:00:00 2001 From: James Hensman Date: Fri, 21 Feb 2014 11:25:33 +0000 Subject: [PATCH 24/38] non-working grads in linear --- GPy/core/parameterization/param.py | 2 +- GPy/examples/dimensionality_reduction.py | 2 +- GPy/kern/_src/linear.py | 21 ++++++++++++--------- 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/GPy/core/parameterization/param.py b/GPy/core/parameterization/param.py index 6fc58bff..ccbc76d5 100644 --- a/GPy/core/parameterization/param.py +++ b/GPy/core/parameterization/param.py @@ -147,7 +147,7 @@ class Param(Constrainable, ObservableArray, Gradcheckable, Indexable): target += self.gradient.flat def _set_gradient(self, g): - self.gradient = g + self.gradient = g.reshape(self._realshape_) #=========================================================================== # Array operations -> done diff --git a/GPy/examples/dimensionality_reduction.py b/GPy/examples/dimensionality_reduction.py index 3b5dcbf0..c8e79e6c 100644 --- a/GPy/examples/dimensionality_reduction.py +++ b/GPy/examples/dimensionality_reduction.py @@ -25,7 +25,7 @@ def bgplvm_test_model(optimize=False, verbose=1, plot=False, output_dim=200, nan #k = (GPy.kern.RBF(input_dim, .5, lengthscales, ARD=True) ##+ GPy.kern.white(input_dim, 0.01) #) - k = GPy.kern.Linear(input_dim)# + GPy.kern.bias(input_dim) + GPy.kern.white(input_dim, 0.00001) + k = GPy.kern.Linear(input_dim, ARD=1)# + GPy.kern.bias(input_dim) + GPy.kern.white(input_dim, 0.00001) K = k.K(X) Y = _np.random.multivariate_normal(_np.zeros(num_inputs), K, (output_dim,)).T diff --git a/GPy/kern/_src/linear.py b/GPy/kern/_src/linear.py index 2e568d81..049b26f1 100644 --- a/GPy/kern/_src/linear.py +++ b/GPy/kern/_src/linear.py @@ -47,7 +47,7 @@ class Linear(Kern): self.variances = Param('variances', variances, Logexp()) self.add_parameter(self.variances) - self.variances.add_observer(self._on_changed) + self.variances.add_observer(self, self._on_changed) def _on_changed(self, obj): #TODO: move this to base class? isnt it jst for the caching? 
@@ -82,9 +82,9 @@ class Linear(Kern): self._collect_gradient(target) self.update_gradients_full(dL_dKmm, Z, None) self._collect_gradient(target) - return target + self._set_gradient(target) - def update_gradients_full(self, dL_dK, X): + def update_gradients_full(self, dL_dK, X, X2=None): if self.ARD: if X2 is None: self.variances.gradient = np.array([np.sum(dL_dK * tdot(X[:, i:i + 1])) for i in range(self.input_dim)]) @@ -130,16 +130,19 @@ class Linear(Kern): def update_gradients_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z): # psi0: tmp = dL_dpsi0[:, None] * self._mu2S(mu, S) - if self.ARD: self.variances.gradient[:] = tmp.sum(0) - else: self.variances.gradient[:] = tmp.sum() + if self.ARD: grad = tmp.sum(0) + else: grad = np.atleast_1d(tmp.sum()) #psi1 - self.variances.gradient += self._param_grad_helper(dL_dpsi1, mu, Z) + self.update_gradients_full(dL_dpsi1, mu, Z) + grad += self.variances.gradient #psi2 tmp = dL_dpsi2[:, :, :, None] * (self._ZAinner(mu, S, Z)[:, :, None, :] * (2. * Z)[None, None, :, :]) - if self.ARD: self.variances.gradient += tmp.sum(0).sum(0).sum(0) - else: self.variances.gradient += tmp.sum() + if self.ARD: grad += tmp.sum(0).sum(0).sum(0) + else: grad += tmp.sum() #from Kmm - self.variances.gradient += self._param_grad_helper(dL_dKmm, Z, None) + self.update_gradients_full(dL_dpsi1, mu, Z) + grad += self.variances.gradient + self._set_gradient(grad) def gradients_Z_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z): # Kmm From 365bc4214010bbce65c25c3023074903859f0d61 Mon Sep 17 00:00:00 2001 From: James Hensman Date: Fri, 21 Feb 2014 12:25:36 +0000 Subject: [PATCH 25/38] added Brownian motion --- GPy/kern/__init__.py | 4 +- GPy/kern/_src/Brownian.py | 65 ------------------------ GPy/kern/_src/brownian.py | 50 ++++++++++++++++++ GPy/kern/_src/kern.py | 11 +++- GPy/kern/_src/linear.py | 10 ---- GPy/plotting/matplot_dep/models_plots.py | 8 +-- 6 files changed, 65 insertions(+), 83 deletions(-) delete mode 100644 GPy/kern/_src/Brownian.py create mode 100644 GPy/kern/_src/brownian.py diff --git a/GPy/kern/__init__.py b/GPy/kern/__init__.py index 630d74da..16c13066 100644 --- a/GPy/kern/__init__.py +++ b/GPy/kern/__init__.py @@ -2,8 +2,8 @@ from _src.rbf import RBF from _src.white import White from _src.kern import Kern from _src.linear import Linear -#import bias -#import Brownian +from _src.brownian import Brownian +#from _src.bias import Bias #import coregionalize #import exponential #import eq_ode1 diff --git a/GPy/kern/_src/Brownian.py b/GPy/kern/_src/Brownian.py deleted file mode 100644 index 488e9b7a..00000000 --- a/GPy/kern/_src/Brownian.py +++ /dev/null @@ -1,65 +0,0 @@ -# Copyright (c) 2012, GPy authors (see AUTHORS.txt). -# Licensed under the BSD 3-clause license (see LICENSE.txt) - - -from kernpart import Kernpart -import numpy as np - -def theta(x): - """Heavisdie step function""" - return np.where(x>=0.,1.,0.) - -class Brownian(Kernpart): - """ - Brownian Motion kernel. 
- - :param input_dim: the number of input dimensions - :type input_dim: int - :param variance: - :type variance: float - """ - def __init__(self,input_dim,variance=1.): - self.input_dim = input_dim - assert self.input_dim==1, "Brownian motion in 1D only" - self.num_params = 1 - self.name = 'Brownian' - self._set_params(np.array([variance]).flatten()) - - def _get_params(self): - return self.variance - - def _set_params(self,x): - assert x.shape==(1,) - self.variance = x - - def _get_param_names(self): - return ['variance'] - - def K(self,X,X2,target): - if X2 is None: - X2 = X - target += self.variance*np.fmin(X,X2.T) - - def Kdiag(self,X,target): - target += self.variance*X.flatten() - - def _param_grad_helper(self,dL_dK,X,X2,target): - if X2 is None: - X2 = X - target += np.sum(np.fmin(X,X2.T)*dL_dK) - - def dKdiag_dtheta(self,dL_dKdiag,X,target): - target += np.dot(X.flatten(), dL_dKdiag) - - def gradients_X(self,dL_dK,X,X2,target): - raise NotImplementedError, "TODO" - #target += self.variance - #target -= self.variance*theta(X-X2.T) - #if X.shape==X2.shape: - #if np.all(X==X2): - #np.add(target[:,:,0],self.variance*np.diag(X2.flatten()-X.flatten()),target[:,:,0]) - - - def dKdiag_dX(self,dL_dKdiag,X,target): - target += self.variance*dL_dKdiag[:,None] - diff --git a/GPy/kern/_src/brownian.py b/GPy/kern/_src/brownian.py new file mode 100644 index 00000000..81b57a25 --- /dev/null +++ b/GPy/kern/_src/brownian.py @@ -0,0 +1,50 @@ +# Copyright (c) 2012, GPy authors (see AUTHORS.txt). +# Licensed under the BSD 3-clause license (see LICENSE.txt) + +from kern import Kern +from ...core.parameterization import Param +from ...core.parameterization.transformations import Logexp +import numpy as np + +class Brownian(Kern): + """ + Brownian motion in 1D only. + + Negative times are treated as a separate (backwards!) Brownian motion. + + :param input_dim: the number of input dimensions + :type input_dim: int + :param variance: + :type variance: float + """ + def __init__(self, input_dim=1, variance=1., name='Brownian'): + assert input_dim==1, "Brownian motion in 1D only" + super(Brownian, self).__init__(input_dim, name) + + self.variance = Param('variance', variance, Logexp()) + self.add_parameters(self.variance) + + def K(self,X,X2=None): + if X2 is None: + X2 = X + return self.variance*np.where(np.sign(X)==np.sign(X2.T),np.fmin(np.abs(X),np.abs(X2.T)), 0.) 
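
Aside (illustrative sketch, not part of the patch): the K method above encodes cov(B_s, B_t) = variance * min(|s|, |t|) when s and t share a sign, and zero otherwise, so negative times form an independent backwards motion. A standalone NumPy version with a hypothetical brownian_K name, plus a quick symmetry and positive-semi-definiteness check:

import numpy as np

def brownian_K(X, X2=None, variance=1.):
    # X, X2: column vectors of times; zero covariance across the sign change
    if X2 is None:
        X2 = X
    same_sign = np.sign(X) == np.sign(X2.T)
    return variance * np.where(same_sign, np.fmin(np.abs(X), np.abs(X2.T)), 0.)

times = np.linspace(-1., 1., 9)[:, None]
K = brownian_K(times, variance=2.)
assert np.allclose(K, K.T)                        # symmetric
assert np.all(np.linalg.eigvalsh(K) >= -1e-10)    # positive semi-definite
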
+ + def Kdiag(self,X): + return self.variance*np.abs(X.flatten()) + + def update_gradients_full(self, dL_dK, X, X2=None): + if X2 is None: + X2 = X + self.variance.gradient = np.sum(dL_dK * np.where(np.sign(X)==np.sign(X2.T),np.fmin(np.abs(X),np.abs(X2.T)), 0.)) + + #def update_gradients_diag(self, dL_dKdiag, X): + #self.variance.gradient = np.dot(np.abs(X.flatten()), dL_dKdiag) + + #def gradients_X(self, dL_dK, X, X2=None): + #if X2 is None: + #return np.sum(self.variance*dL_dK*np.abs(X),1)[:,None] + #else: + #return np.sum(np.where(np.logical_and(np.abs(X) Date: Fri, 21 Feb 2014 12:29:28 +0000 Subject: [PATCH 26/38] linear and rbf fix for variational gradients in Z --- GPy/core/sparse_gp.py | 1 + .../latent_function_inference/var_dtc.py | 194 +++++++++--------- GPy/kern/_src/linear.py | 10 +- GPy/kern/_src/rbf.py | 4 +- GPy/plotting/matplot_dep/models_plots.py | 6 +- 5 files changed, 102 insertions(+), 113 deletions(-) diff --git a/GPy/core/sparse_gp.py b/GPy/core/sparse_gp.py index 5ab13251..61a664fe 100644 --- a/GPy/core/sparse_gp.py +++ b/GPy/core/sparse_gp.py @@ -58,6 +58,7 @@ class SparseGP(GP): def parameters_changed(self): self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.X_variance, self.Z, self.likelihood, self.Y) + self.likelihood.update_gradients(self.grad_dict.pop('partial_for_likelihood')) if self.has_uncertain_inputs(): self.kern.update_gradients_variational(mu=self.X, S=self.X_variance, Z=self.Z, **self.grad_dict) self.Z.gradient = self.kern.gradients_Z_variational(mu=self.X, S=self.X_variance, Z=self.Z, **self.grad_dict) diff --git a/GPy/inference/latent_function_inference/var_dtc.py b/GPy/inference/latent_function_inference/var_dtc.py index 24f4a5b6..a81bb711 100644 --- a/GPy/inference/latent_function_inference/var_dtc.py +++ b/GPy/inference/latent_function_inference/var_dtc.py @@ -60,18 +60,88 @@ class VarDTC(object): trYYT = self.get_trYYT(Y) # do the inference: - dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, Cpsi1Vf, \ - psi1, Lm, LB, log_marginal, Kmm, partial_for_likelihood = _do_inference_on( - kern, X, X_variance, Z, likelihood, - uncertain_inputs, output_dim, - beta, VVT_factor, trYYT) + het_noise = beta.size < 1 + num_inducing = Z.shape[0] + num_data = X.shape[0] + # kernel computations, using BGPLVM notation + Kmm = kern.K(Z) + psi0, psi1, psi2 = _compute_psi(kern, X, X_variance, Z, uncertain_inputs) + + Lm = jitchol(Kmm) + + # The rather complex computations of A + if uncertain_inputs: + if het_noise: + psi2_beta = psi2 * (beta.flatten().reshape(num_data, 1, 1)).sum(0) + else: + psi2_beta = psi2.sum(0) * beta + #if 0: + # evals, evecs = linalg.eigh(psi2_beta) + # clipped_evals = np.clip(evals, 0., 1e6) # TODO: make clipping configurable + # if not np.array_equal(evals, clipped_evals): + # pass # print evals + # tmp = evecs * np.sqrt(clipped_evals) + # tmp = tmp.T + # no backsubstitution because of bound explosion on tr(A) if not... 
+ LmInv = dtrtri(Lm) + A = LmInv.dot(psi2_beta.dot(LmInv.T)) + else: + if het_noise: + tmp = psi1 * (np.sqrt(beta.reshape(num_data, 1))) + else: + tmp = psi1 * (np.sqrt(beta)) + tmp, _ = dtrtrs(Lm, tmp.T, lower=1) + A = tdot(tmp) #print A.sum() - likelihood.update_gradients(partial_for_likelihood) + # factor B + B = np.eye(num_inducing) + A + LB = jitchol(B) + psi1Vf = np.dot(psi1.T, VVT_factor) + # back substutue C into psi1Vf + tmp, _ = dtrtrs(Lm, psi1Vf, lower=1, trans=0) + _LBi_Lmi_psi1Vf, _ = dtrtrs(LB, tmp, lower=1, trans=0) + tmp, _ = dtrtrs(LB, _LBi_Lmi_psi1Vf, lower=1, trans=1) + Cpsi1Vf, _ = dtrtrs(Lm, tmp, lower=1, trans=1) + + # data fit and derivative of L w.r.t. Kmm + delit = tdot(_LBi_Lmi_psi1Vf) + data_fit = np.trace(delit) + DBi_plus_BiPBi = backsub_both_sides(LB, output_dim * np.eye(num_inducing) + delit) + delit = -0.5 * DBi_plus_BiPBi + delit += -0.5 * B * output_dim + delit += output_dim * np.eye(num_inducing) + # Compute dL_dKmm + dL_dKmm = backsub_both_sides(Lm, delit) + + # derivatives of L w.r.t. psi + dL_dpsi0, dL_dpsi1, dL_dpsi2 = _compute_dL_dpsi(num_inducing, num_data, output_dim, beta, Lm, + VVT_factor, Cpsi1Vf, DBi_plus_BiPBi, + psi1, het_noise, uncertain_inputs) + + # log marginal likelihood + log_marginal = _compute_log_marginal_likelihood(likelihood, num_data, output_dim, beta, het_noise, + psi0, A, LB, trYYT, data_fit) + + #put the gradients in the right places + partial_for_likelihood = _compute_partial_for_likelihood(likelihood, + het_noise, uncertain_inputs, LB, + _LBi_Lmi_psi1Vf, DBi_plus_BiPBi, Lm, A, + psi0, psi1, beta, + data_fit, num_data, output_dim, trYYT) + + #likelihood.update_gradients(partial_for_likelihood) if uncertain_inputs: - grad_dict = {'dL_dKmm': dL_dKmm, 'dL_dpsi0':dL_dpsi0, 'dL_dpsi1':dL_dpsi1, 'dL_dpsi2':dL_dpsi2} + grad_dict = {'dL_dKmm': dL_dKmm, + 'dL_dpsi0':dL_dpsi0, + 'dL_dpsi1':dL_dpsi1, + 'dL_dpsi2':dL_dpsi2, + 'partial_for_likelihood':partial_for_likelihood} else: - grad_dict = {'dL_dKmm': dL_dKmm, 'dL_dKdiag':dL_dpsi0, 'dL_dKnm':dL_dpsi1} + grad_dict = {'dL_dKmm': dL_dKmm, + 'dL_dKdiag':dL_dpsi0, + 'dL_dKnm':dL_dpsi1, + 'partial_for_likelihood':partial_for_likelihood} #get sufficient things for posterior prediction #TODO: do we really want to do this in the loop? @@ -184,9 +254,10 @@ class VarDTCMissingData(object): LB = jitchol(B) psi1Vf = psi1.T.dot(VVT_factor) - _LBi_Lmi_psi1Vf, Cpsi1Vf = _compute_psi1Vf(Lm, LB, psi1Vf) - - #LB_all[ind, :,:] = LB + tmp, _ = dtrtrs(Lm, psi1Vf, lower=1, trans=0) + _LBi_Lmi_psi1Vf, _ = dtrtrs(LB, tmp, lower=1, trans=0) + tmp, _ = dtrtrs(LB, _LBi_Lmi_psi1Vf, lower=1, trans=1) + Cpsi1Vf, _ = dtrtrs(Lm, tmp, lower=1, trans=1) # data fit and derivative of L w.r.t. 
Kmm delit = tdot(_LBi_Lmi_psi1Vf) @@ -233,16 +304,19 @@ class VarDTCMissingData(object): from ...util import diag diag.add(Bi, 1) woodbury_inv_all[:, :, ind] = backsub_both_sides(Lm, Bi)[:,:,None] - - # gradients: - likelihood.update_gradients(partial_for_likelihood) + # gradients: if uncertain_inputs: - grad_dict = {'dL_dKmm': dL_dKmm, 'dL_dpsi0':dL_dpsi0_all, 'dL_dpsi1':dL_dpsi1_all, 'dL_dpsi2':dL_dpsi2_all} - kern.update_gradients_variational(mu=X, S=X_variance, Z=Z, **grad_dict) + grad_dict = {'dL_dKmm': dL_dKmm, + 'dL_dpsi0':dL_dpsi0, + 'dL_dpsi1':dL_dpsi1, + 'dL_dpsi2':dL_dpsi2, + 'partial_for_likelihood':partial_for_likelihood} else: - grad_dict = {'dL_dKmm': dL_dKmm, 'dL_dKdiag':dL_dpsi0_all, 'dL_dKnm':dL_dpsi1_all} - kern.update_gradients_sparse(X=X, Z=Z, **grad_dict) + grad_dict = {'dL_dKmm': dL_dKmm, + 'dL_dKdiag':dL_dpsi0, + 'dL_dKnm':dL_dpsi1, + 'partial_for_likelihood':partial_for_likelihood} #get sufficient things for posterior prediction #TODO: do we really want to do this in the loop? @@ -266,33 +340,6 @@ class VarDTCMissingData(object): return post, log_marginal, grad_dict -def _compute_A(num_data, uncertain_inputs, beta, het_noise, psi1, psi2, Lm): -# The rather complex computations of A - if uncertain_inputs: - if het_noise: - psi2_beta = psi2 * (beta.flatten().reshape(num_data, 1, 1)).sum(0) - else: - psi2_beta = psi2.sum(0) * beta - #if 0: - # evals, evecs = linalg.eigh(psi2_beta) - # clipped_evals = np.clip(evals, 0., 1e6) # TODO: make clipping configurable - # if not np.array_equal(evals, clipped_evals): - # pass # print evals - # tmp = evecs * np.sqrt(clipped_evals) - # tmp = tmp.T - # no backsubstitution because of bound explosion on tr(A) if not... - LmInv = dtrtri(Lm) - A = LmInv.dot(psi2_beta.dot(LmInv.T)) - else: - if het_noise: - tmp = psi1 * (np.sqrt(beta.reshape(num_data, 1))) - else: - tmp = psi1 * (np.sqrt(beta)) - tmp, _ = dtrtrs(Lm, tmp.T, lower=1) - A = tdot(tmp) #print A.sum() - return A - - def _compute_psi(kern, X, X_variance, Z, uncertain_inputs): if uncertain_inputs: psi0 = kern.psi0(Z, X, X_variance) @@ -304,22 +351,6 @@ def _compute_psi(kern, X, X_variance, Z, uncertain_inputs): psi2 = None return psi0, psi1, psi2 -def _compute_Kmm(kern, X, X_variance, Z, uncertain_inputs): - Kmm = kern.K(Z) - psi0, psi1, psi2 = _compute_psi(kern, X, X_variance, Z, uncertain_inputs) - return Kmm, psi0, psi1, psi2 - -def _compute_dL_dKmm(num_inducing, output_dim, Lm, B, LB, _LBi_Lmi_psi1Vf): - # Compute dL_dKmm - delit = tdot(_LBi_Lmi_psi1Vf) - data_fit = np.trace(delit) - DBi_plus_BiPBi = backsub_both_sides(LB, output_dim * np.eye(num_inducing) + delit) - delit = -0.5 * DBi_plus_BiPBi - delit += -0.5 * B * output_dim - delit += output_dim * np.eye(num_inducing) - dL_dKmm = backsub_both_sides(Lm, delit) - return DBi_plus_BiPBi, data_fit, dL_dKmm - def _compute_dL_dpsi(num_inducing, num_data, output_dim, beta, Lm, VVT_factor, Cpsi1Vf, DBi_plus_BiPBi, psi1, het_noise, uncertain_inputs): dL_dpsi0 = -0.5 * output_dim * (beta * np.ones([num_data, 1])).flatten() dL_dpsi1 = np.dot(VVT_factor, Cpsi1Vf.T) @@ -343,15 +374,6 @@ def _compute_dL_dpsi(num_inducing, num_data, output_dim, beta, Lm, VVT_factor, C return dL_dpsi0, dL_dpsi1, dL_dpsi2 -def _compute_psi1Vf(Lm, LB, psi1Vf): - # back substutue C into psi1Vf - tmp, _ = dtrtrs(Lm, psi1Vf, lower=1, trans=0) - _LBi_Lmi_psi1Vf, _ = dtrtrs(LB, tmp, lower=1, trans=0) - tmp, _ = dtrtrs(LB, _LBi_Lmi_psi1Vf, lower=1, trans=1) - Cpsi1Vf, _ = dtrtrs(Lm, tmp, lower=1, trans=1) - return _LBi_Lmi_psi1Vf, Cpsi1Vf - - def 
_compute_partial_for_likelihood(likelihood, het_noise, uncertain_inputs, LB, _LBi_Lmi_psi1Vf, DBi_plus_BiPBi, Lm, A, psi0, psi1, beta, data_fit, num_data, output_dim, trYYT): # the partial derivative vector for the likelihood if likelihood.size == 0: @@ -393,35 +415,3 @@ def _compute_log_marginal_likelihood(likelihood, num_data, output_dim, beta, het lik_4 = 0.5 * data_fit log_marginal = lik_1 + lik_2 + lik_3 + lik_4 return log_marginal - -def _do_inference_on(kern, X, X_variance, Z, likelihood, uncertain_inputs, output_dim, beta, VVT_factor, trYYT): - het_noise = beta.size < 1 - num_inducing = Z.shape[0] - num_data = X.shape[0] - # kernel computations, using BGPLVM notation - Kmm, psi0, psi1, psi2 = _compute_Kmm(kern, X, X_variance, Z, uncertain_inputs) - #factor Kmm # TODO: cache? - Lm = jitchol(Kmm) - A = _compute_A(num_data, uncertain_inputs, beta, het_noise, psi1, psi2, Lm) - # factor B - B = np.eye(num_inducing) + A - LB = jitchol(B) - psi1Vf = np.dot(psi1.T, VVT_factor) - _LBi_Lmi_psi1Vf, Cpsi1Vf = _compute_psi1Vf(Lm, LB, psi1Vf) - # data fit and derivative of L w.r.t. Kmm - DBi_plus_BiPBi, data_fit, dL_dKmm = _compute_dL_dKmm(num_inducing, output_dim, - Lm, B, LB, _LBi_Lmi_psi1Vf) - # derivatives of L w.r.t. psi - dL_dpsi0, dL_dpsi1, dL_dpsi2 = _compute_dL_dpsi(num_inducing, num_data, output_dim, beta, Lm, - VVT_factor, Cpsi1Vf, DBi_plus_BiPBi, - psi1, het_noise, uncertain_inputs) - # log marginal likelihood - log_marginal = _compute_log_marginal_likelihood(likelihood, num_data, output_dim, beta, het_noise, - psi0, A, LB, trYYT, data_fit) - #put the gradients in the right places - partial_for_likelihood = _compute_partial_for_likelihood(likelihood, - het_noise, uncertain_inputs, LB, - _LBi_Lmi_psi1Vf, DBi_plus_BiPBi, Lm, A, - psi0, psi1, beta, - data_fit, num_data, output_dim, trYYT) - return dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, Cpsi1Vf, psi1, Lm, LB, log_marginal, Kmm, partial_for_likelihood diff --git a/GPy/kern/_src/linear.py b/GPy/kern/_src/linear.py index 049b26f1..312440b8 100644 --- a/GPy/kern/_src/linear.py +++ b/GPy/kern/_src/linear.py @@ -140,9 +140,8 @@ class Linear(Kern): if self.ARD: grad += tmp.sum(0).sum(0).sum(0) else: grad += tmp.sum() #from Kmm - self.update_gradients_full(dL_dpsi1, mu, Z) - grad += self.variances.gradient - self._set_gradient(grad) + self.update_gradients_full(dL_dKmm, Z, None) + self.variances.gradient += grad def gradients_Z_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z): # Kmm @@ -221,7 +220,6 @@ class Linear(Kern): def _weave_dpsi2_dZ(self, dL_dpsi2, Z, mu, S, target): - AZA = self.variances*self._ZAinner(mu, S, Z) code=""" int n,m,mm,q; @@ -230,7 +228,7 @@ class Linear(Kern): for(q=0;q Date: Fri, 21 Feb 2014 17:06:06 +0000 Subject: [PATCH 27/38] removed materns --- GPy/kern/__init__.py | 1 + GPy/kern/_src/Matern32.py | 139 ---------------------- GPy/kern/_src/Matern52.py | 145 ----------------------- GPy/kern/_src/exponential.py | 129 -------------------- GPy/kern/_src/stationary.py | 221 +++++++++++++++++++++++++++++++++++ GPy/util/__init__.py | 1 + GPy/util/diag.py | 40 ++++--- 7 files changed, 246 insertions(+), 430 deletions(-) delete mode 100644 GPy/kern/_src/Matern32.py delete mode 100644 GPy/kern/_src/Matern52.py delete mode 100644 GPy/kern/_src/exponential.py create mode 100644 GPy/kern/_src/stationary.py diff --git a/GPy/kern/__init__.py b/GPy/kern/__init__.py index 16c13066..e5dc6d35 100644 --- a/GPy/kern/__init__.py +++ b/GPy/kern/__init__.py @@ -3,6 +3,7 @@ from _src.white import White from _src.kern import 
Kern from _src.linear import Linear from _src.brownian import Brownian +from _src.stationary import Exponential, Matern32, Matern52, ExpQuad #from _src.bias import Bias #import coregionalize #import exponential diff --git a/GPy/kern/_src/Matern32.py b/GPy/kern/_src/Matern32.py deleted file mode 100644 index 08fa452c..00000000 --- a/GPy/kern/_src/Matern32.py +++ /dev/null @@ -1,139 +0,0 @@ -# Copyright (c) 2012, GPy authors (see AUTHORS.txt). -# Licensed under the BSD 3-clause license (see LICENSE.txt) - - -from kernpart import Kernpart -import numpy as np -from scipy import integrate - -class Matern32(Kernpart): - """ - Matern 3/2 kernel: - - .. math:: - - k(r) = \\sigma^2 (1 + \\sqrt{3} r) \exp(- \sqrt{3} r) \\ \\ \\ \\ \\text{ where } r = \sqrt{\sum_{i=1}^input_dim \\frac{(x_i-y_i)^2}{\ell_i^2} } - - :param input_dim: the number of input dimensions - :type input_dim: int - :param variance: the variance :math:`\sigma^2` - :type variance: float - :param lengthscale: the vector of lengthscale :math:`\ell_i` - :type lengthscale: array or list of the appropriate size (or float if there is only one lengthscale parameter) - :param ARD: Auto Relevance Determination. If equal to "False", the kernel is isotropic (ie. one single lengthscale parameter \ell), otherwise there is one lengthscale parameter per dimension. - :type ARD: Boolean - :rtype: kernel object - - """ - - def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False): - self.input_dim = input_dim - self.ARD = ARD - if ARD == False: - self.num_params = 2 - self.name = 'Mat32' - if lengthscale is not None: - lengthscale = np.asarray(lengthscale) - assert lengthscale.size == 1, "Only one lengthscale needed for non-ARD kernel" - else: - lengthscale = np.ones(1) - else: - self.num_params = self.input_dim + 1 - self.name = 'Mat32' - if lengthscale is not None: - lengthscale = np.asarray(lengthscale) - assert lengthscale.size == self.input_dim, "bad number of lengthscales" - else: - lengthscale = np.ones(self.input_dim) - self._set_params(np.hstack((variance, lengthscale.flatten()))) - - def _get_params(self): - """return the value of the parameters.""" - return np.hstack((self.variance, self.lengthscale)) - - def _set_params(self, x): - """set the value of the parameters.""" - assert x.size == self.num_params - self.variance = x[0] - self.lengthscale = x[1:] - - def _get_param_names(self): - """return parameter names.""" - if self.num_params == 2: - return ['variance', 'lengthscale'] - else: - return ['variance'] + ['lengthscale_%i' % i for i in range(self.lengthscale.size)] - - def K(self, X, X2, target): - """Compute the covariance matrix between X and X2.""" - if X2 is None: X2 = X - dist = np.sqrt(np.sum(np.square((X[:, None, :] - X2[None, :, :]) / self.lengthscale), -1)) - np.add(self.variance * (1 + np.sqrt(3.) * dist) * np.exp(-np.sqrt(3.) * dist), target, target) - - def Kdiag(self, X, target): - """Compute the diagonal of the covariance matrix associated to X.""" - np.add(target, self.variance, target) - - def _param_grad_helper(self, dL_dK, X, X2, target): - """derivative of the covariance matrix with respect to the parameters.""" - if X2 is None: X2 = X - dist = np.sqrt(np.sum(np.square((X[:, None, :] - X2[None, :, :]) / self.lengthscale), -1)) - dvar = (1 + np.sqrt(3.) * dist) * np.exp(-np.sqrt(3.) * dist) - invdist = 1. 
/ np.where(dist != 0., dist, np.inf) - dist2M = np.square(X[:, None, :] - X2[None, :, :]) / self.lengthscale ** 3 - # dl = (self.variance* 3 * dist * np.exp(-np.sqrt(3.)*dist))[:,:,np.newaxis] * dist2M*invdist[:,:,np.newaxis] - target[0] += np.sum(dvar * dL_dK) - if self.ARD == True: - dl = (self.variance * 3 * dist * np.exp(-np.sqrt(3.) * dist))[:, :, np.newaxis] * dist2M * invdist[:, :, np.newaxis] - # dl = self.variance*dvar[:,:,None]*dist2M*invdist[:,:,None] - target[1:] += (dl * dL_dK[:, :, None]).sum(0).sum(0) - else: - dl = (self.variance * 3 * dist * np.exp(-np.sqrt(3.) * dist)) * dist2M.sum(-1) * invdist - # dl = self.variance*dvar*dist2M.sum(-1)*invdist - target[1] += np.sum(dl * dL_dK) - - def dKdiag_dtheta(self, dL_dKdiag, X, target): - """derivative of the diagonal of the covariance matrix with respect to the parameters.""" - target[0] += np.sum(dL_dKdiag) - - def gradients_X(self, dL_dK, X, X2, target): - """derivative of the covariance matrix with respect to X.""" - if X2 is None: - dist = np.sqrt(np.sum(np.square((X[:, None, :] - X[None, :, :]) / self.lengthscale), -1))[:, :, None] - ddist_dX = 2*(X[:, None, :] - X[None, :, :]) / self.lengthscale ** 2 / np.where(dist != 0., dist, np.inf) - - else: - dist = np.sqrt(np.sum(np.square((X[:, None, :] - X2[None, :, :]) / self.lengthscale), -1))[:, :, None] - ddist_dX = (X[:, None, :] - X2[None, :, :]) / self.lengthscale ** 2 / np.where(dist != 0., dist, np.inf) - gradients_X = -np.transpose(3 * self.variance * dist * np.exp(-np.sqrt(3) * dist) * ddist_dX, (1, 0, 2)) - target += np.sum(gradients_X * dL_dK.T[:, :, None], 0) - - def dKdiag_dX(self, dL_dKdiag, X, target): - pass - - def Gram_matrix(self, F, F1, F2, lower, upper): - """ - Return the Gram matrix of the vector of functions F with respect to the RKHS norm. The use of this function is limited to input_dim=1. - - :param F: vector of functions - :type F: np.array - :param F1: vector of derivatives of F - :type F1: np.array - :param F2: vector of second derivatives of F - :type F2: np.array - :param lower,upper: boundaries of the input domain - :type lower,upper: floats - """ - assert self.input_dim == 1 - def L(x, i): - return(3. / self.lengthscale ** 2 * F[i](x) + 2 * np.sqrt(3) / self.lengthscale * F1[i](x) + F2[i](x)) - n = F.shape[0] - G = np.zeros((n, n)) - for i in range(n): - for j in range(i, n): - G[i, j] = G[j, i] = integrate.quad(lambda x : L(x, i) * L(x, j), lower, upper)[0] - Flower = np.array([f(lower) for f in F])[:, None] - F1lower = np.array([f(lower) for f in F1])[:, None] - # print "OLD \n", np.dot(F1lower,F1lower.T), "\n \n" - # return(G) - return(self.lengthscale ** 3 / (12.*np.sqrt(3) * self.variance) * G + 1. / self.variance * np.dot(Flower, Flower.T) + self.lengthscale ** 2 / (3.*self.variance) * np.dot(F1lower, F1lower.T)) diff --git a/GPy/kern/_src/Matern52.py b/GPy/kern/_src/Matern52.py deleted file mode 100644 index 7d36254c..00000000 --- a/GPy/kern/_src/Matern52.py +++ /dev/null @@ -1,145 +0,0 @@ -# Copyright (c) 2012, GPy authors (see AUTHORS.txt). -# Licensed under the BSD 3-clause license (see LICENSE.txt) - - -from kernpart import Kernpart -import numpy as np -import hashlib -from scipy import integrate - -class Matern52(Kernpart): - """ - Matern 5/2 kernel: - - .. 
math:: - - k(r) = \sigma^2 (1 + \sqrt{5} r + \\frac53 r^2) \exp(- \sqrt{5} r) \ \ \ \ \ \\text{ where } r = \sqrt{\sum_{i=1}^input_dim \\frac{(x_i-y_i)^2}{\ell_i^2} } - - :param input_dim: the number of input dimensions - :type input_dim: int - :param variance: the variance :math:`\sigma^2` - :type variance: float - :param lengthscale: the vector of lengthscale :math:`\ell_i` - :type lengthscale: array or list of the appropriate size (or float if there is only one lengthscale parameter) - :param ARD: Auto Relevance Determination. If equal to "False", the kernel is isotropic (ie. one single lengthscale parameter \ell), otherwise there is one lengthscale parameter per dimension. - :type ARD: Boolean - :rtype: kernel object - - """ - def __init__(self,input_dim,variance=1.,lengthscale=None,ARD=False): - self.input_dim = input_dim - self.ARD = ARD - if ARD == False: - self.num_params = 2 - self.name = 'Mat52' - if lengthscale is not None: - lengthscale = np.asarray(lengthscale) - assert lengthscale.size == 1, "Only one lengthscale needed for non-ARD kernel" - else: - lengthscale = np.ones(1) - else: - self.num_params = self.input_dim + 1 - self.name = 'Mat52' - if lengthscale is not None: - lengthscale = np.asarray(lengthscale) - assert lengthscale.size == self.input_dim, "bad number of lengthscales" - else: - lengthscale = np.ones(self.input_dim) - self._set_params(np.hstack((variance,lengthscale.flatten()))) - - def _get_params(self): - """return the value of the parameters.""" - return np.hstack((self.variance,self.lengthscale)) - - def _set_params(self,x): - """set the value of the parameters.""" - assert x.size == self.num_params - self.variance = x[0] - self.lengthscale = x[1:] - - def _get_param_names(self): - """return parameter names.""" - if self.num_params == 2: - return ['variance','lengthscale'] - else: - return ['variance']+['lengthscale_%i'%i for i in range(self.lengthscale.size)] - - def K(self,X,X2,target): - """Compute the covariance matrix between X and X2.""" - if X2 is None: X2 = X - dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscale),-1)) - np.add(self.variance*(1+np.sqrt(5.)*dist+5./3*dist**2)*np.exp(-np.sqrt(5.)*dist), target,target) - - def Kdiag(self,X,target): - """Compute the diagonal of the covariance matrix associated to X.""" - np.add(target,self.variance,target) - - def _param_grad_helper(self,dL_dK,X,X2,target): - """derivative of the covariance matrix with respect to the parameters.""" - if X2 is None: X2 = X - dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscale),-1)) - invdist = 1./np.where(dist!=0.,dist,np.inf) - dist2M = np.square(X[:,None,:]-X2[None,:,:])/self.lengthscale**3 - dvar = (1+np.sqrt(5.)*dist+5./3*dist**2)*np.exp(-np.sqrt(5.)*dist) - dl = (self.variance * 5./3 * dist * (1 + np.sqrt(5.)*dist ) * np.exp(-np.sqrt(5.)*dist))[:,:,np.newaxis] * dist2M*invdist[:,:,np.newaxis] - target[0] += np.sum(dvar*dL_dK) - if self.ARD: - dl = (self.variance * 5./3 * dist * (1 + np.sqrt(5.)*dist ) * np.exp(-np.sqrt(5.)*dist))[:,:,np.newaxis] * dist2M*invdist[:,:,np.newaxis] - #dl = (self.variance* 3 * dist * np.exp(-np.sqrt(3.)*dist))[:,:,np.newaxis] * dist2M*invdist[:,:,np.newaxis] - target[1:] += (dl*dL_dK[:,:,None]).sum(0).sum(0) - else: - dl = (self.variance * 5./3 * dist * (1 + np.sqrt(5.)*dist ) * np.exp(-np.sqrt(5.)*dist)) * dist2M.sum(-1)*invdist - #dl = (self.variance* 3 * dist * np.exp(-np.sqrt(3.)*dist)) * dist2M.sum(-1)*invdist - target[1] += np.sum(dl*dL_dK) - - def 
dKdiag_dtheta(self,dL_dKdiag,X,target): - """derivative of the diagonal of the covariance matrix with respect to the parameters.""" - target[0] += np.sum(dL_dKdiag) - - def gradients_X(self,dL_dK,X,X2,target): - """derivative of the covariance matrix with respect to X.""" - if X2 is None: - dist = np.sqrt(np.sum(np.square((X[:,None,:]-X[None,:,:])/self.lengthscale),-1))[:,:,None] - ddist_dX = 2*(X[:,None,:]-X[None,:,:])/self.lengthscale**2/np.where(dist!=0.,dist,np.inf) - else: - dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscale),-1))[:,:,None] - ddist_dX = (X[:,None,:]-X2[None,:,:])/self.lengthscale**2/np.where(dist!=0.,dist,np.inf) - gradients_X = - np.transpose(self.variance*5./3*dist*(1+np.sqrt(5)*dist)*np.exp(-np.sqrt(5)*dist)*ddist_dX,(1,0,2)) - target += np.sum(gradients_X*dL_dK.T[:,:,None],0) - - def dKdiag_dX(self,dL_dKdiag,X,target): - pass - - def Gram_matrix(self,F,F1,F2,F3,lower,upper): - """ - Return the Gram matrix of the vector of functions F with respect to the RKHS norm. The use of this function is limited to input_dim=1. - - :param F: vector of functions - :type F: np.array - :param F1: vector of derivatives of F - :type F1: np.array - :param F2: vector of second derivatives of F - :type F2: np.array - :param F3: vector of third derivatives of F - :type F3: np.array - :param lower,upper: boundaries of the input domain - :type lower,upper: floats - """ - assert self.input_dim == 1 - def L(x,i): - return(5*np.sqrt(5)/self.lengthscale**3*F[i](x) + 15./self.lengthscale**2*F1[i](x)+ 3*np.sqrt(5)/self.lengthscale*F2[i](x) + F3[i](x)) - n = F.shape[0] - G = np.zeros((n,n)) - for i in range(n): - for j in range(i,n): - G[i,j] = G[j,i] = integrate.quad(lambda x : L(x,i)*L(x,j),lower,upper)[0] - G_coef = 3.*self.lengthscale**5/(400*np.sqrt(5)) - Flower = np.array([f(lower) for f in F])[:,None] - F1lower = np.array([f(lower) for f in F1])[:,None] - F2lower = np.array([f(lower) for f in F2])[:,None] - orig = 9./8*np.dot(Flower,Flower.T) + 9.*self.lengthscale**4/200*np.dot(F2lower,F2lower.T) - orig2 = 3./5*self.lengthscale**2 * ( np.dot(F1lower,F1lower.T) + 1./8*np.dot(Flower,F2lower.T) + 1./8*np.dot(F2lower,Flower.T)) - return(1./self.variance* (G_coef*G + orig + orig2)) - - - diff --git a/GPy/kern/_src/exponential.py b/GPy/kern/_src/exponential.py deleted file mode 100644 index 372d4d9b..00000000 --- a/GPy/kern/_src/exponential.py +++ /dev/null @@ -1,129 +0,0 @@ -# Copyright (c) 2012, GPy authors (see AUTHORS.txt). -# Licensed under the BSD 3-clause license (see LICENSE.txt) - - -from kernpart import Kernpart -import numpy as np -from scipy import integrate - -class Exponential(Kernpart): - """ - Exponential kernel (aka Ornstein-Uhlenbeck or Matern 1/2) - - .. math:: - - k(r) = \sigma^2 \exp(- r) \ \ \ \ \ \\text{ where } r = \sqrt{\sum_{i=1}^input_dim \\frac{(x_i-y_i)^2}{\ell_i^2} } - - :param input_dim: the number of input dimensions - :type input_dim: int - :param variance: the variance :math:`\sigma^2` - :type variance: float - :param lengthscale: the vector of lengthscale :math:`\ell_i` - :type lengthscale: array or list of the appropriate size (or float if there is only one lengthscale parameter) - :param ARD: Auto Relevance Determination. If equal to "False", the kernel is isotropic (ie. one single lengthscale parameter \ell), otherwise there is one lengthscale parameter per dimension. 
- :type ARD: Boolean - :param name: the name of the kernel - :rtype: kernel object - - """ - def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False, name='exp'): - self.input_dim = input_dim - self.ARD = ARD - self.variance = variance - self.name = name - if ARD == False: - self.num_params = 2 - if lengthscale is not None: - lengthscale = np.asarray(lengthscale) - assert lengthscale.size == 1, "Only one lengthscale needed for non-ARD kernel" - else: - lengthscale = np.ones(1) - else: - self.num_params = self.input_dim + 1 - if lengthscale is not None: - lengthscale = np.asarray(lengthscale) - assert lengthscale.size == self.input_dim, "bad number of lengthscales" - else: - lengthscale = np.ones(self.input_dim) - #self._set_params(np.hstack((variance, lengthscale.flatten()))) - self.set_as_parameter('variance', 'lengthscale') - -# def _get_params(self): -# """return the value of the parameters.""" -# return np.hstack((self.variance, self.lengthscale)) -# -# def _set_params(self, x): -# """set the value of the parameters.""" -# assert x.size == self.num_params -# self.variance = x[0] -# self.lengthscale = x[1:] -# -# def _get_param_names(self): -# """return parameter names.""" -# if self.num_params == 2: -# return ['variance', 'lengthscale'] -# else: -# return ['variance'] + ['lengthscale_%i' % i for i in range(self.lengthscale.size)] - - def K(self, X, X2, target): - """Compute the covariance matrix between X and X2.""" - if X2 is None: X2 = X - dist = np.sqrt(np.sum(np.square((X[:, None, :] - X2[None, :, :]) / self.lengthscale), -1)) - np.add(self.variance * np.exp(-dist), target, target) - - def Kdiag(self, X, target): - """Compute the diagonal of the covariance matrix associated to X.""" - np.add(target, self.variance, target) - - def _param_grad_helper(self, dL_dK, X, X2, target): - """derivative of the covariance matrix with respect to the parameters.""" - if X2 is None: X2 = X - dist = np.sqrt(np.sum(np.square((X[:, None, :] - X2[None, :, :]) / self.lengthscale), -1)) - invdist = 1. / np.where(dist != 0., dist, np.inf) - dist2M = np.square(X[:, None, :] - X2[None, :, :]) / self.lengthscale ** 3 - dvar = np.exp(-dist) - target[0] += np.sum(dvar * dL_dK) - if self.ARD == True: - dl = self.variance * dvar[:, :, None] * dist2M * invdist[:, :, None] - target[1:] += (dl * dL_dK[:, :, None]).sum(0).sum(0) - else: - dl = self.variance * dvar * dist2M.sum(-1) * invdist - target[1] += np.sum(dl * dL_dK) - - def dKdiag_dtheta(self, dL_dKdiag, X, target): - """derivative of the diagonal of the covariance matrix with respect to the parameters.""" - # NB: derivative of diagonal elements wrt lengthscale is 0 - target[0] += np.sum(dL_dKdiag) - - def gradients_X(self, dL_dK, X, X2, target): - """derivative of the covariance matrix with respect to X.""" - if X2 is None: X2 = X - dist = np.sqrt(np.sum(np.square((X[:, None, :] - X2[None, :, :]) / self.lengthscale), -1))[:, :, None] - ddist_dX = (X[:, None, :] - X2[None, :, :]) / self.lengthscale ** 2 / np.where(dist != 0., dist, np.inf) - gradients_X = -np.transpose(self.variance * np.exp(-dist) * ddist_dX, (1, 0, 2)) - target += np.sum(gradients_X * dL_dK.T[:, :, None], 0) - - def dKdiag_dX(self, dL_dKdiag, X, target): - pass - - def Gram_matrix(self, F, F1, lower, upper): - """ - Return the Gram matrix of the vector of functions F with respect to the RKHS norm. The use of this function is limited to input_dim=1. 
- - :param F: vector of functions - :type F: np.array - :param F1: vector of derivatives of F - :type F1: np.array - :param lower,upper: boundaries of the input domain - :type lower,upper: floats - """ - assert self.input_dim == 1 - def L(x, i): - return(1. / self.lengthscale * F[i](x) + F1[i](x)) - n = F.shape[0] - G = np.zeros((n, n)) - for i in range(n): - for j in range(i, n): - G[i, j] = G[j, i] = integrate.quad(lambda x : L(x, i) * L(x, j), lower, upper)[0] - Flower = np.array([f(lower) for f in F])[:, None] - return(self.lengthscale / 2. / self.variance * G + 1. / self.variance * np.dot(Flower, Flower.T)) diff --git a/GPy/kern/_src/stationary.py b/GPy/kern/_src/stationary.py new file mode 100644 index 00000000..aaa534ac --- /dev/null +++ b/GPy/kern/_src/stationary.py @@ -0,0 +1,221 @@ +# Copyright (c) 2012, GPy authors (see AUTHORS.txt). +# Licensed under the BSD 3-clause license (see LICENSE.txt) + + +from kern import Kern +from ...core.parameterization import Param +from ...core.parameterization.transformations import Logexp +from ... import util +import numpy as np +from scipy import integrate + +class Stationary(Kern): + def __init__(self, input_dim, variance, lengthscale, ARD, name): + super(Stationary, self).__init__(input_dim, name) + self.ARD = ARD + if not ARD: + if lengthscale is None: + lengthscale = np.ones(1) + else: + lengthscale = np.asarray(lengthscale) + assert lengthscale.size == 1 "Only lengthscale needed for non-ARD kernel" + else: + if lengthscale is not None: + lengthscale = np.asarray(lengthscale) + assert lengthscale.size in [1, input_dim], "Bad lengthscales" + if lengthscale.size != input_dim: + lengthscale = np.ones(input_dim)*lengthscale + else: + lengthscale = np.ones(self.input_dim) + self.lengthscale = Param('lengthscale', lengthscale, Logexp()) + self.variance = Param('variance', variance, Logexp()) + assert self.variance.size==1 + self.add_parameters(self.variance, self.lengthscale) + + def _dist(self, X, X2): + if X2 is None: + X2 = X + return X[:, None, :] - X2[None, :, :] + + def _scaled_dist(self, X, X2=None): + return np.sqrt(np.sum(np.square(self._dist(X, X2) / self.lengthscale), -1)) + + def Kdiag(self, X): + ret = np.empty(X.shape[0]) + ret[:] = self.variance + return ret + + def update_gradients_diag(self, dL_dKdiag, X): + self.variance.gradient = np.sum(dL_dKdiag) + self.lengthscale.gradient = 0. + + def gradients_X_diag(self, dL_dKdiag, X): + return np.zeros(X.shape) + + def update_gradients_full(self, dL_dK, X, X2=None): + K = self.K(X, X2) + self.variance.gradient = np.sum(K * dL_dK)/self.variance + + rinv = self._inv_dist(X, X2) + dL_dr = self.dK_dr(X, X2) * dL_dK + x_xl3 = np.square(self._dist(X, X2)) / self.lengthscale**3 + + if self.ARD: + self.lengthscale.gradient = -((dL_dr*rinv)[:,:,None]*x_xl3).sum(0).sum(0) + else: + self.lengthscale.gradient = -((dL_dr*rinv)[:,:,None]*x_xl3).sum() + + def _inv_dist(self, X, X2=None): + dist = self._scaled_dist(X, X2) + if X2 is None: + nondiag = util.diag.offdiag_view(dist) + nondiag[:] = 1./nondiag + return dist + else: + return 1./np.where(dist != 0., dist, np.inf) + + def gradients_X(self, dL_dK, X, X2=None): + dL_dr = self.dK_dr(X, X2) * dL_dK + invdist = self._inv_dist(X, X2) + ret = np.sum((invdist*dL_dr)[:,:,None]*self._dist(X, X2),1)/self.lengthscale**2 + if X2 is None: + ret *= 2. 
+ return ret + + + + +class Exponential(Stationary): + def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False, name='Exponential'): + super(Exponential, self).__init__(input_dim, variance, lengthscale, ARD, name) + + def K(self, X, X2=None): + dist = self._scaled_dist(X, X2) + return self.variance * np.exp(-0.5 * dist) + + def dK_dr(self, X, X2): + return -0.5*self.K(X, X2) + +class Matern32(Stationary): + """ + Matern 3/2 kernel: + + .. math:: + + k(r) = \\sigma^2 (1 + \\sqrt{3} r) \exp(- \sqrt{3} r) \\ \\ \\ \\ \\text{ where } r = \sqrt{\sum_{i=1}^input_dim \\frac{(x_i-y_i)^2}{\ell_i^2} } + + :param input_dim: the number of input dimensions + :type input_dim: int + :param variance: the variance :math:`\sigma^2` + :type variance: float + :param lengthscale: the vector of lengthscale :math:`\ell_i` + :type lengthscale: array or list of the appropriate size (or float if there is only one lengthscale parameter) + :param ARD: Auto Relevance Determination. If equal to "False", the kernel is isotropic (ie. one single lengthscale parameter \ell), otherwise there is one lengthscale parameter per dimension. + :type ARD: Boolean + :rtype: kernel object + + """ + + def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False, name='Mat32'): + super(Matern32, self).__init__(input_dim, variance, lengthscale, ARD, name) + + def K(self, X, X2=None): + dist = self._scaled_dist(X, X2) + return self.variance * (1. + np.sqrt(3.) * dist) * np.exp(-np.sqrt(3.) * dist) + + def dK_dr(self, X, X2): + dist = self._scaled_dist(X, X2) + return -3.*self.variance*dist*np.exp(-np.sqrt(3.)*dist) + + def Gram_matrix(self, F, F1, F2, lower, upper): + """ + Return the Gram matrix of the vector of functions F with respect to the + RKHS norm. The use of this function is limited to input_dim=1. + + :param F: vector of functions + :type F: np.array + :param F1: vector of derivatives of F + :type F1: np.array + :param F2: vector of second derivatives of F + :type F2: np.array + :param lower,upper: boundaries of the input domain + :type lower,upper: floats + """ + assert self.input_dim == 1 + def L(x, i): + return(3. / self.lengthscale ** 2 * F[i](x) + 2 * np.sqrt(3) / self.lengthscale * F1[i](x) + F2[i](x)) + n = F.shape[0] + G = np.zeros((n, n)) + for i in range(n): + for j in range(i, n): + G[i, j] = G[j, i] = integrate.quad(lambda x : L(x, i) * L(x, j), lower, upper)[0] + Flower = np.array([f(lower) for f in F])[:, None] + F1lower = np.array([f(lower) for f in F1])[:, None] + return(self.lengthscale ** 3 / (12.*np.sqrt(3) * self.variance) * G + 1. / self.variance * np.dot(Flower, Flower.T) + self.lengthscale ** 2 / (3.*self.variance) * np.dot(F1lower, F1lower.T)) + + +class Matern52(Stationary): + """ + Matern 5/2 kernel: + + .. math:: + + k(r) = \sigma^2 (1 + \sqrt{5} r + \\frac53 r^2) \exp(- \sqrt{5} r) \ \ \ \ \ \\text{ where } r = \sqrt{\sum_{i=1}^input_dim \\frac{(x_i-y_i)^2}{\ell_i^2} } + """ + + def K(self, X, X2=None): + r = self._scaled_dist(X, X2) + return self.variance*(1+np.sqrt(5.)*r+5./3*r**2)*np.exp(-np.sqrt(5.)*r) + + def dK_dr(self, X, X2): + r = self._scaled_dist(X, X2) + return self.variance*(10./3*r -5.*r -5.*np.sqrt(5.)/3*r**2)*np.exp(-np.sqrt(5.)*r) + + def Gram_matrix(self,F,F1,F2,F3,lower,upper): + """ + Return the Gram matrix of the vector of functions F with respect to the RKHS norm. The use of this function is limited to input_dim=1. 
+ + :param F: vector of functions + :type F: np.array + :param F1: vector of derivatives of F + :type F1: np.array + :param F2: vector of second derivatives of F + :type F2: np.array + :param F3: vector of third derivatives of F + :type F3: np.array + :param lower,upper: boundaries of the input domain + :type lower,upper: floats + """ + assert self.input_dim == 1 + def L(x,i): + return(5*np.sqrt(5)/self.lengthscale**3*F[i](x) + 15./self.lengthscale**2*F1[i](x)+ 3*np.sqrt(5)/self.lengthscale*F2[i](x) + F3[i](x)) + n = F.shape[0] + G = np.zeros((n,n)) + for i in range(n): + for j in range(i,n): + G[i,j] = G[j,i] = integrate.quad(lambda x : L(x,i)*L(x,j),lower,upper)[0] + G_coef = 3.*self.lengthscale**5/(400*np.sqrt(5)) + Flower = np.array([f(lower) for f in F])[:,None] + F1lower = np.array([f(lower) for f in F1])[:,None] + F2lower = np.array([f(lower) for f in F2])[:,None] + orig = 9./8*np.dot(Flower,Flower.T) + 9.*self.lengthscale**4/200*np.dot(F2lower,F2lower.T) + orig2 = 3./5*self.lengthscale**2 * ( np.dot(F1lower,F1lower.T) + 1./8*np.dot(Flower,F2lower.T) + 1./8*np.dot(F2lower,Flower.T)) + return(1./self.variance* (G_coef*G + orig + orig2)) + + + + +class ExpQuad(Stationary): + def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False, name='ExpQuad'): + super(ExpQuad, self).__init__(input_dim, variance, lengthscale, ARD, name) + + def K(self, X, X2=None): + r = self._scaled_dist(X, X2) + return self.variance * np.exp(-0.5 * r**2) + + def dK_dr(self, X, X2): + dist = self._scaled_dist(X, X2) + return -dist*self.K(X, X2) + + + diff --git a/GPy/util/__init__.py b/GPy/util/__init__.py index c10fea4c..f93bb0ec 100644 --- a/GPy/util/__init__.py +++ b/GPy/util/__init__.py @@ -12,6 +12,7 @@ import decorators import classification import subarray_and_sorting import caching +import diag try: import sympy diff --git a/GPy/util/diag.py b/GPy/util/diag.py index 3d6b4dc9..3044ed54 100644 --- a/GPy/util/diag.py +++ b/GPy/util/diag.py @@ -11,14 +11,14 @@ import numpy as np def view(A, offset=0): """ Get a view on the diagonal elements of a 2D array. - - This is actually a view (!) on the diagonal of the array, so you can + + This is actually a view (!) on the diagonal of the array, so you can in-place adjust the view. - + :param :class:`ndarray` A: 2 dimensional numpy array :param int offset: view offset to give back (negative entries allowed) :rtype: :class:`ndarray` view of diag(A) - + >>> import numpy as np >>> X = np.arange(9).reshape(3,3) >>> view(X) @@ -36,7 +36,7 @@ def view(A, offset=0): """ from numpy.lib.stride_tricks import as_strided assert A.ndim == 2, "only implemented for 2 dimensions" - assert A.shape[0] == A.shape[1], "attempting to get the view of non-square matrix?!" + assert A.shape[0] == A.shape[1], "attempting to get the view of non-square matrix?!" 
if offset > 0: return as_strided(A[0, offset:], shape=(A.shape[0] - offset, ), strides=((A.shape[0]+1)*A.itemsize, )) elif offset < 0: @@ -44,6 +44,12 @@ def view(A, offset=0): else: return as_strided(A, shape=(A.shape[0], ), strides=((A.shape[0]+1)*A.itemsize, )) +def offdiag_view(A, offset=0): + from numpy.lib.stride_tricks import as_strided + assert A.ndim == 2, "only implemented for 2 dimensions" + Af = as_strided(A, shape=(A.size,), strides=(A.itemsize,)) + return as_strided(Af[(1+offset):], shape=(A.shape[0]-1, A.shape[1]), strides=(A.strides[0] + A.itemsize, A.strides[1])) + def _diag_ufunc(A,b,offset,func): dA = view(A, offset); func(dA,b,dA) return A @@ -51,11 +57,11 @@ def _diag_ufunc(A,b,offset,func): def times(A, b, offset=0): """ Times the view of A with b in place (!). - Returns modified A + Returns modified A Broadcasting is allowed, thus b can be scalar. - + if offset is not zero, make sure b is of right shape! - + :param ndarray A: 2 dimensional array :param ndarray-like b: either one dimensional or scalar :param int offset: same as in view. @@ -67,11 +73,11 @@ multiply = times def divide(A, b, offset=0): """ Divide the view of A by b in place (!). - Returns modified A + Returns modified A Broadcasting is allowed, thus b can be scalar. - + if offset is not zero, make sure b is of right shape! - + :param ndarray A: 2 dimensional array :param ndarray-like b: either one dimensional or scalar :param int offset: same as in view. @@ -84,9 +90,9 @@ def add(A, b, offset=0): Add b to the view of A in place (!). Returns modified A. Broadcasting is allowed, thus b can be scalar. - + if offset is not zero, make sure b is of right shape! - + :param ndarray A: 2 dimensional array :param ndarray-like b: either one dimensional or scalar :param int offset: same as in view. @@ -99,16 +105,16 @@ def subtract(A, b, offset=0): Subtract b from the view of A in place (!). Returns modified A. Broadcasting is allowed, thus b can be scalar. - + if offset is not zero, make sure b is of right shape! - + :param ndarray A: 2 dimensional array :param ndarray-like b: either one dimensional or scalar :param int offset: same as in view. :rtype: view of A, which is adjusted inplace """ return _diag_ufunc(A, b, offset, np.subtract) - + if __name__ == '__main__': import doctest - doctest.testmod() \ No newline at end of file + doctest.testmod() From fddc663f286e94feef218c74a4f555903e097bee Mon Sep 17 00:00:00 2001 From: James Hensman Date: Fri, 21 Feb 2014 17:32:40 +0000 Subject: [PATCH 28/38] working on coregionalize --- GPy/kern/_src/coregionalize.py | 91 ++++++++++++++++------------------ 1 file changed, 44 insertions(+), 47 deletions(-) diff --git a/GPy/kern/_src/coregionalize.py b/GPy/kern/_src/coregionalize.py index 69fc27ef..0d99ce21 100644 --- a/GPy/kern/_src/coregionalize.py +++ b/GPy/kern/_src/coregionalize.py @@ -5,6 +5,7 @@ from kern import Kern import numpy as np from scipy import weave from ...core.parameterization import Param +from ...core.parameterization.transformations import Logexp class Coregionalize(Kern): """ @@ -20,7 +21,7 @@ class Coregionalize(Kern): k_2(x, y)=\mathbf{B} k(x, y) it is obtained as the tensor product between a covariance function - k(x,y) and B. + k(x, y) and B. 
:param output_dim: number of outputs to coregionalize :type output_dim: int @@ -29,7 +30,7 @@ class Coregionalize(Kern): :param W: a low rank matrix that determines the correlations between the different outputs, together with kappa it forms the coregionalization matrix B :type W: numpy array of dimensionality (num_outpus, W_columns) :param kappa: a vector which allows the outputs to behave independently - :type kappa: numpy array of dimensionality (output_dim,) + :type kappa: numpy array of dimensionality (output_dim, ) .. note: see coregionalization examples in GPy.examples.regression for some usage. """ @@ -37,18 +38,18 @@ class Coregionalize(Kern): super(Coregionalize, self).__init__(input_dim=1, name=name) self.output_dim = output_dim self.rank = rank - if self.rank>output_dim-1: + if self.rank>output_dim: print("Warning: Unusual choice of rank, it should normally be less than the output_dim.") if W is None: - W = 0.5*np.random.randn(self.output_dim,self.rank)/np.sqrt(self.rank) + W = 0.5*np.random.randn(self.output_dim, self.rank)/np.sqrt(self.rank) else: - assert W.shape==(self.output_dim,self.rank) - self.W = Param('W',W) + assert W.shape==(self.output_dim, self.rank) + self.W = Param('W', W) if kappa is None: kappa = 0.5*np.ones(self.output_dim) else: - assert kappa.shape==(self.output_dim,) - self.kappa = Param('kappa', kappa) + assert kappa.shape==(self.output_dim, ) + self.kappa = Param('kappa', kappa, Logexp()) self.add_parameters(self.W, self.kappa) self.parameters_changed() @@ -56,54 +57,58 @@ class Coregionalize(Kern): def parameters_changed(self): self.B = np.dot(self.W, self.W.T) + np.diag(self.kappa) - def K(self,index,index2,target): - index = np.asarray(index,dtype=np.int) + def K(self, X, X2=None): + index = np.asarray(X, dtype=np.int) #here's the old code (numpy) #if index2 is None: #index2 = index #else: - #index2 = np.asarray(index2,dtype=np.int) + #index2 = np.asarray(index2, dtype=np.int) #false_target = target.copy() - #ii,jj = np.meshgrid(index,index2) - #ii,jj = ii.T, jj.T - #false_target += self.B[ii,jj] + #ii, jj = np.meshgrid(index, index2) + #ii, jj = ii.T, jj.T + #false_target += self.B[ii, jj] - if index2 is None: + + if X2 is None: + target = np.empty((X.shape[0], X.shape[0]), dtype=np.float64) code=""" for(int i=0;i Date: Fri, 21 Feb 2014 17:39:02 +0000 Subject: [PATCH 29/38] tidying --- GPy/kern/_src/coregionalize.py | 4 ++++ GPy/kern/_src/stationary.py | 16 +++------------- 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/GPy/kern/_src/coregionalize.py b/GPy/kern/_src/coregionalize.py index 0d99ce21..74cd2a1d 100644 --- a/GPy/kern/_src/coregionalize.py +++ b/GPy/kern/_src/coregionalize.py @@ -135,3 +135,7 @@ class Coregionalize(Kern): def gradients_X(self, dL_dK, X, X2=None): return np.zeros(X.shape) + + def gradients_X_diag(self, dL_dKdiag, X): + return np.zeros(X.shape) + diff --git a/GPy/kern/_src/stationary.py b/GPy/kern/_src/stationary.py index aaa534ac..7cc2e695 100644 --- a/GPy/kern/_src/stationary.py +++ b/GPy/kern/_src/stationary.py @@ -49,9 +49,6 @@ class Stationary(Kern): self.variance.gradient = np.sum(dL_dKdiag) self.lengthscale.gradient = 0. - def gradients_X_diag(self, dL_dKdiag, X): - return np.zeros(X.shape) - def update_gradients_full(self, dL_dK, X, X2=None): K = self.K(X, X2) self.variance.gradient = np.sum(K * dL_dK)/self.variance @@ -82,6 +79,9 @@ class Stationary(Kern): ret *= 2. 
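        # Note: the factor of two above is presumably the usual symmetric
        # correction when X2 is None, where X appears in both arguments of
        # K(X, X); the two equal contributions are computed once and doubled.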
return ret + def gradients_X_diag(self, dL_dKdiag, X): + return np.zeros(X.shape) + @@ -104,16 +104,6 @@ class Matern32(Stationary): k(r) = \\sigma^2 (1 + \\sqrt{3} r) \exp(- \sqrt{3} r) \\ \\ \\ \\ \\text{ where } r = \sqrt{\sum_{i=1}^input_dim \\frac{(x_i-y_i)^2}{\ell_i^2} } - :param input_dim: the number of input dimensions - :type input_dim: int - :param variance: the variance :math:`\sigma^2` - :type variance: float - :param lengthscale: the vector of lengthscale :math:`\ell_i` - :type lengthscale: array or list of the appropriate size (or float if there is only one lengthscale parameter) - :param ARD: Auto Relevance Determination. If equal to "False", the kernel is isotropic (ie. one single lengthscale parameter \ell), otherwise there is one lengthscale parameter per dimension. - :type ARD: Boolean - :rtype: kernel object - """ def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False, name='Mat32'): From 659643038fe0c6937c69e48cf12c4efd32e41edf Mon Sep 17 00:00:00 2001 From: Max Zwiessele Date: Fri, 21 Feb 2014 17:53:44 +0000 Subject: [PATCH 30/38] parameterized now supports deleting of parameters --- GPy/core/model.py | 17 ++++------ GPy/core/parameterization/index_operations.py | 18 +++++++--- GPy/core/parameterization/parameter_core.py | 12 ++++--- GPy/core/parameterization/parameterized.py | 18 ++++++---- GPy/examples/dimensionality_reduction.py | 19 +++++++---- .../latent_function_inference/var_dtc.py | 10 +++--- GPy/kern/_src/kern.py | 2 +- .../matplot_dep/dim_reduction_plots.py | 20 +++++------ GPy/plotting/matplot_dep/kernel_plots.py | 34 +++++++++---------- GPy/plotting/matplot_dep/models_plots.py | 23 +++++++------ GPy/testing/index_operations_tests.py | 7 ++++ GPy/testing/parameterized_tests.py | 16 ++++----- 12 files changed, 113 insertions(+), 83 deletions(-) diff --git a/GPy/core/model.py b/GPy/core/model.py index c067d51d..21bcf0c7 100644 --- a/GPy/core/model.py +++ b/GPy/core/model.py @@ -485,20 +485,17 @@ class Model(Parameterized): if not hasattr(self, 'kern'): raise ValueError, "this model has no kernel" - k = [p for p in self.kern._parameters_ if hasattr(p, "ARD") and p.ARD] - if (not len(k) == 1): - raise ValueError, "cannot determine sensitivity for this kernel" - k = k[0] - from ..kern.parts.rbf import RBF - from ..kern.parts.rbf_inv import RBFInv - from ..kern.parts.linear import Linear + k = self.kern#[p for p in self.kern._parameters_ if hasattr(p, "ARD") and p.ARD] + from ..kern import RBF, Linear#, RBFInv + if isinstance(k, RBF): return 1. 
/ k.lengthscale - elif isinstance(k, RBFInv): - return k.inv_lengthscale + #elif isinstance(k, RBFInv): + # return k.inv_lengthscale elif isinstance(k, Linear): return k.variances - + else: + raise ValueError, "cannot determine sensitivity for this kernel" def pseudo_EM(self, stop_crit=.1, **kwargs): """ diff --git a/GPy/core/parameterization/index_operations.py b/GPy/core/parameterization/index_operations.py index bfd0bf21..b5399741 100644 --- a/GPy/core/parameterization/index_operations.py +++ b/GPy/core/parameterization/index_operations.py @@ -83,11 +83,21 @@ class ParameterIndexOperations(object): def iterproperties(self): return self._properties.iterkeys() - def shift(self, start, size): + def shift_right(self, start, size): for ind in self.iterindices(): toshift = ind>=start - if toshift.size > 0: - ind[toshift] += size + ind[toshift] += size + + def shift_left(self, start, size): + for v, ind in self.items(): + todelete = (ind>=start) * (ind=start + if toshift.size != 0: + ind[toshift] -= size + if ind.size != 0: self._properties[v] = ind + else: del self._properties[v] def clear(self): self._properties.clear() @@ -183,7 +193,7 @@ class ParameterIndexOperationsView(object): yield i - def shift(self, start, size): + def shift_right(self, start, size): raise NotImplementedError, 'Shifting only supported in original ParamIndexOperations' diff --git a/GPy/core/parameterization/parameter_core.py b/GPy/core/parameterization/parameter_core.py index 45b57eab..c2c8a05a 100644 --- a/GPy/core/parameterization/parameter_core.py +++ b/GPy/core/parameterization/parameter_core.py @@ -390,6 +390,7 @@ class Parameterizable(Constrainable): import copy from .index_operations import ParameterIndexOperations, ParameterIndexOperationsView from .array_core import ParamList + dc = dict() for k, v in self.__dict__.iteritems(): if k not in ['_direct_parent_', '_parameters_', '_parent_index_'] + self.parameter_names(): @@ -399,18 +400,21 @@ class Parameterizable(Constrainable): dc[k] = copy.deepcopy(v) if k == '_parameters_': params = [p.copy() for p in v] - # dc = copy.deepcopy(self.__dict__) + dc['_direct_parent_'] = None dc['_parent_index_'] = None dc['_parameters_'] = ParamList() + dc['constraints'].clear() + dc['priors'].clear() + dc['size'] = 0 + s = self.__new__(self.__class__) s.__dict__ = dc - # import ipdb;ipdb.set_trace() + for p in params: s.add_parameter(p) - # dc._notify_parent_change() + return s - # return copy.deepcopy(self) def _notify_parameters_changed(self): self.parameters_changed() diff --git a/GPy/core/parameterization/parameterized.py b/GPy/core/parameterization/parameterized.py index 177cc217..d463ed43 100644 --- a/GPy/core/parameterization/parameterized.py +++ b/GPy/core/parameterization/parameterized.py @@ -87,8 +87,8 @@ class Parameterized(Parameterizable, Pickleable, Observable, Gradcheckable): self._parameters_.append(param) else: start = sum(p.size for p in self._parameters_[:index]) - self.constraints.shift(start, param.size) - self.priors.shift(start, param.size) + self.constraints.shift_right(start, param.size) + self.priors.shift_right(start, param.size) self.constraints.update(param.constraints, start) self.priors.update(param.priors, start) self._parameters_.insert(index, param) @@ -113,15 +113,19 @@ class Parameterized(Parameterizable, Pickleable, Observable, Gradcheckable): """ if not param in self._parameters_: raise RuntimeError, "Parameter {} does not belong to this object, remove parameters directly from their respective parents".format(param._short()) - del 
self._parameters_[param._parent_index_] + + start = sum([p.size for p in self._parameters_[:param._parent_index_]]) + self._remove_parameter_name(param) self.size -= param.size + del self._parameters_[param._parent_index_] param._disconnect_parent() - self._remove_parameter_name(param) - - #self._notify_parent_change() + self.constraints.shift_left(start, param.size) self._connect_fixes() - + self._connect_parameters() + self._notify_parent_change() + + def _connect_parameters(self): # connect parameterlist to this parameterized object # This just sets up the right connection for the params objects diff --git a/GPy/examples/dimensionality_reduction.py b/GPy/examples/dimensionality_reduction.py index c8e79e6c..3ba54d34 100644 --- a/GPy/examples/dimensionality_reduction.py +++ b/GPy/examples/dimensionality_reduction.py @@ -74,7 +74,7 @@ def gplvm_oil_100(optimize=True, verbose=1, plot=True): data = GPy.util.datasets.oil_100() Y = data['X'] # create simple GP model - kernel = GPy.kern.RBF(6, ARD=True) + GPy.kern.bias(6) + kernel = GPy.kern.RBF(6, ARD=True) + GPy.kern.Bias(6) m = GPy.models.GPLVM(Y, 6, kernel=kernel) m.data_labels = data['Y'].argmax(axis=1) if optimize: m.optimize('scg', messages=verbose) @@ -190,17 +190,22 @@ def _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim=False): _np.random.seed(1234) x = _np.linspace(0, 4 * _np.pi, N)[:, None] - s1 = _np.vectorize(lambda x: _np.sin(x)) + s1 = _np.vectorize(lambda x: -_np.sin(x)) s2 = _np.vectorize(lambda x: _np.cos(x)) s3 = _np.vectorize(lambda x:-_np.exp(-_np.cos(2 * x))) - sS = _np.vectorize(lambda x: _np.sin(2 * x)) + sS = _np.vectorize(lambda x: x*_np.sin(x)) s1 = s1(x) s2 = s2(x) s3 = s3(x) sS = sS(x) - S1 = _np.hstack([s1, sS]) + s1 -= s1.mean(); s1 /= s1.std(0) + s2 -= s2.mean(); s2 /= s2.std(0) + s3 -= s3.mean(); s3 /= s3.std(0) + sS -= sS.mean(); sS /= sS.std(0) + + S1 = _np.hstack([s1, s2, sS]) S2 = _np.hstack([s2, s3, sS]) S3 = _np.hstack([s3, sS]) @@ -271,7 +276,7 @@ def bgplvm_simulation(optimize=True, verbose=1, D1, D2, D3, N, num_inducing, Q = 15, 5, 8, 30, 3, 10 _, _, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim) Y = Ylist[0] - k = kern.linear(Q, ARD=True)# + kern.white(Q, _np.exp(-2)) # + kern.bias(Q) + k = kern.Linear(Q, ARD=True)# + kern.white(Q, _np.exp(-2)) # + kern.bias(Q) m = BayesianGPLVM(Y, Q, init="PCA", num_inducing=num_inducing, kernel=k) if optimize: @@ -291,10 +296,10 @@ def bgplvm_simulation_missing_data(optimize=True, verbose=1, from GPy.models import BayesianGPLVM from GPy.inference.latent_function_inference.var_dtc import VarDTCMissingData - D1, D2, D3, N, num_inducing, Q = 15, 5, 8, 30, 3, 10 + D1, D2, D3, N, num_inducing, Q = 15, 5, 8, 30, 5, 9 _, _, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim) Y = Ylist[0] - k = kern.linear(Q, ARD=True)# + kern.white(Q, _np.exp(-2)) # + kern.bias(Q) + k = kern.Linear(Q, ARD=True)# + kern.white(Q, _np.exp(-2)) # + kern.bias(Q) inan = _np.random.binomial(1, .6, size=Y.shape).astype(bool) m = BayesianGPLVM(Y.copy(), Q, init="random", num_inducing=num_inducing, kernel=k) diff --git a/GPy/inference/latent_function_inference/var_dtc.py b/GPy/inference/latent_function_inference/var_dtc.py index a81bb711..5e88569c 100644 --- a/GPy/inference/latent_function_inference/var_dtc.py +++ b/GPy/inference/latent_function_inference/var_dtc.py @@ -308,14 +308,14 @@ class VarDTCMissingData(object): # gradients: if uncertain_inputs: grad_dict = {'dL_dKmm': dL_dKmm, - 'dL_dpsi0':dL_dpsi0, - 'dL_dpsi1':dL_dpsi1, - 'dL_dpsi2':dL_dpsi2, + 
'dL_dpsi0':dL_dpsi0_all, + 'dL_dpsi1':dL_dpsi1_all, + 'dL_dpsi2':dL_dpsi2_all, 'partial_for_likelihood':partial_for_likelihood} else: grad_dict = {'dL_dKmm': dL_dKmm, - 'dL_dKdiag':dL_dpsi0, - 'dL_dKnm':dL_dpsi1, + 'dL_dKdiag':dL_dpsi0_all, + 'dL_dKnm':dL_dpsi1_all, 'partial_for_likelihood':partial_for_likelihood} #get sufficient things for posterior prediction diff --git a/GPy/kern/_src/kern.py b/GPy/kern/_src/kern.py index b3ee57cd..f436d322 100644 --- a/GPy/kern/_src/kern.py +++ b/GPy/kern/_src/kern.py @@ -67,7 +67,7 @@ class Kern(Parameterized): See GPy.plotting.matplot_dep.plot_ARD """ assert "matplotlib" in sys.modules, "matplotlib package has not been imported." - from ..plotting.matplot_dep import kernel_plots + from ...plotting.matplot_dep import kernel_plots return kernel_plots.plot_ARD(self,*args) diff --git a/GPy/plotting/matplot_dep/dim_reduction_plots.py b/GPy/plotting/matplot_dep/dim_reduction_plots.py index 74292c05..3f4ea9b0 100644 --- a/GPy/plotting/matplot_dep/dim_reduction_plots.py +++ b/GPy/plotting/matplot_dep/dim_reduction_plots.py @@ -1,8 +1,8 @@ import pylab as pb import numpy as np -from ... import util from latent_space_visualizations.controllers.imshow_controller import ImshowController,ImAnnotateController -from GPy.util.misc import param_to_array +from ...util.misc import param_to_array +from .base_plots import x_frame2D import itertools import Tango from matplotlib.cm import get_cmap @@ -37,7 +37,7 @@ def plot_latent(model, labels=None, which_indices=None, if ax is None: fig = pb.figure(num=fignum) ax = fig.add_subplot(111) - util.plot.Tango.reset() + Tango.reset() if labels is None: labels = np.ones(model.num_data) @@ -46,7 +46,7 @@ def plot_latent(model, labels=None, which_indices=None, X = param_to_array(model.X) # first, plot the output variance as a function of the latent space - Xtest, xx, yy, xmin, xmax = util.plot.x_frame2D(X[:, [input_1, input_2]], resolution=resolution) + Xtest, xx, yy, xmin, xmax = x_frame2D(X[:, [input_1, input_2]], resolution=resolution) Xtest_full = np.zeros((Xtest.shape[0], model.X.shape[1])) def plot_function(x): @@ -87,7 +87,7 @@ def plot_latent(model, labels=None, which_indices=None, else: x = X[index, input_1] y = X[index, input_2] - ax.scatter(x, y, marker=m, s=s, color=util.plot.Tango.nextMedium(), label=this_label) + ax.scatter(x, y, marker=m, s=s, color=Tango.nextMedium(), label=this_label) ax.set_xlabel('latent dimension %i' % input_1) ax.set_ylabel('latent dimension %i' % input_2) @@ -120,7 +120,7 @@ def plot_magnification(model, labels=None, which_indices=None, if ax is None: fig = pb.figure(num=fignum) ax = fig.add_subplot(111) - util.plot.Tango.reset() + Tango.reset() if labels is None: labels = np.ones(model.num_data) @@ -128,7 +128,7 @@ def plot_magnification(model, labels=None, which_indices=None, input_1, input_2 = most_significant_input_dimensions(model, which_indices) # first, plot the output variance as a function of the latent space - Xtest, xx, yy, xmin, xmax = util.plot.x_frame2D(model.X[:, [input_1, input_2]], resolution=resolution) + Xtest, xx, yy, xmin, xmax = x_frame2D(model.X[:, [input_1, input_2]], resolution=resolution) Xtest_full = np.zeros((Xtest.shape[0], model.X.shape[1])) def plot_function(x): @@ -165,7 +165,7 @@ def plot_magnification(model, labels=None, which_indices=None, else: x = model.X[index, input_1] y = model.X[index, input_2] - ax.scatter(x, y, marker=m, s=s, color=util.plot.Tango.nextMedium(), label=this_label) + ax.scatter(x, y, marker=m, s=s, color=Tango.nextMedium(), 
label=this_label) ax.set_xlabel('latent dimension %i' % input_1) ax.set_ylabel('latent dimension %i' % input_2) @@ -205,7 +205,7 @@ def plot_steepest_gradient_map(model, fignum=None, ax=None, which_indices=None, return dmu_dX[indices, argmax], np.array(labels)[argmax] if ax is None: - fig = pyplot.figure(num=fignum) + fig = pb.figure(num=fignum) ax = fig.add_subplot(111) if data_labels is None: @@ -241,7 +241,7 @@ def plot_steepest_gradient_map(model, fignum=None, ax=None, which_indices=None, ax.legend() ax.figure.tight_layout() if updates: - pyplot.show() + pb.show() clear = raw_input('Enter to continue') if clear.lower() in 'yes' or clear == '': controller.deactivate() diff --git a/GPy/plotting/matplot_dep/kernel_plots.py b/GPy/plotting/matplot_dep/kernel_plots.py index 30157294..3436c4ff 100644 --- a/GPy/plotting/matplot_dep/kernel_plots.py +++ b/GPy/plotting/matplot_dep/kernel_plots.py @@ -1,7 +1,6 @@ # Copyright (c) 2012, GPy authors (see AUTHORS.txt). # Licensed under the BSD 3-clause license (see LICENSE.txt) -import sys import numpy as np import pylab as pb import Tango @@ -29,22 +28,23 @@ def plot_ARD(kernel, fignum=None, ax=None, title='', legend=False): xticklabels = [] bars = [] x0 = 0 - for p in kernel._parameters_: - c = Tango.nextMedium() - if hasattr(p, 'ARD') and p.ARD: - if title is None: - ax.set_title('ARD parameters, %s kernel' % p.name) - else: - ax.set_title(title) - if isinstance(p, Linear): - ard_params = p.variances - else: - ard_params = 1. / p.lengthscale - - x = np.arange(x0, x0 + len(ard_params)) - bars.append(ax.bar(x, ard_params, align='center', color=c, edgecolor='k', linewidth=1.2, label=p.name.replace("_"," "))) - xticklabels.extend([r"$\mathrm{{{name}}}\ {x}$".format(name=p.name, x=i) for i in np.arange(len(ard_params))]) - x0 += len(ard_params) + #for p in kernel._parameters_: + p = kernel + c = Tango.nextMedium() + if hasattr(p, 'ARD') and p.ARD: + if title is None: + ax.set_title('ARD parameters, %s kernel' % p.name) + else: + ax.set_title(title) + if isinstance(p, Linear): + ard_params = p.variances + else: + ard_params = 1. 
/ p.lengthscale + x = np.arange(x0, x0 + len(ard_params)) + from ...util.misc import param_to_array + bars.append(ax.bar(x, param_to_array(ard_params), align='center', color=c, edgecolor='k', linewidth=1.2, label=p.name.replace("_"," "))) + xticklabels.extend([r"$\mathrm{{{name}}}\ {x}$".format(name=p.name, x=i) for i in np.arange(len(ard_params))]) + x0 += len(ard_params) x = np.arange(x0) transOffset = offset_copy(ax.transData, fig=fig, x=0., y= -2., units='points') diff --git a/GPy/plotting/matplot_dep/models_plots.py b/GPy/plotting/matplot_dep/models_plots.py index 47c8642e..59c32775 100644 --- a/GPy/plotting/matplot_dep/models_plots.py +++ b/GPy/plotting/matplot_dep/models_plots.py @@ -56,7 +56,10 @@ def plot_fit(model, plot_limits=None, which_data_rows='all', if ax is None: fig = pb.figure(num=fignum) ax = fig.add_subplot(111) - + + X, Y = param_to_array(model.X, model.Y) + if model.has_uncertain_inputs(): X_variance = model.X_variance + #work out what the inputs are for plotting (1D or 2D) fixed_dims = np.array([i for i,v in fixed_inputs]) free_dims = np.setdiff1d(np.arange(model.input_dim),fixed_dims) @@ -66,7 +69,7 @@ def plot_fit(model, plot_limits=None, which_data_rows='all', #define the frame on which to plot resolution = resolution or 200 - Xnew, xmin, xmax = x_frame1D(model.X[:,free_dims], plot_limits=plot_limits) + Xnew, xmin, xmax = x_frame1D(X[:,free_dims], plot_limits=plot_limits) Xgrid = np.empty((Xnew.shape[0],model.input_dim)) Xgrid[:,free_dims] = Xnew for i,v in fixed_inputs: @@ -77,13 +80,13 @@ def plot_fit(model, plot_limits=None, which_data_rows='all', m, v = model._raw_predict(Xgrid) lower = m - 2*np.sqrt(v) upper = m + 2*np.sqrt(v) - Y = model.Y + Y = Y else: m, v, lower, upper = model.predict(Xgrid) - Y = model.Y + Y = Y for d in which_data_ycols: gpplot(Xnew, m[:, d], lower[:, d], upper[:, d], axes=ax, edgecol=linecol, fillcol=fillcol) - ax.plot(model.X[which_data_rows,free_dims], Y[which_data_rows, d], 'kx', mew=1.5) + ax.plot(X[which_data_rows,free_dims], Y[which_data_rows, d], 'kx', mew=1.5) #optionally plot some samples if samples: #NOTE not tested with fixed_inputs @@ -95,8 +98,8 @@ def plot_fit(model, plot_limits=None, which_data_rows='all', #add error bars for uncertain (if input uncertainty is being modelled) if hasattr(model,"has_uncertain_inputs") and model.has_uncertain_inputs(): - ax.errorbar(model.X[which_data_rows, free_dims], model.Y[which_data_rows, which_data_ycols], - xerr=2 * np.sqrt(model.X_variance[which_data_rows, free_dims]), + ax.errorbar(X[which_data_rows, free_dims].flatten(), Y[which_data_rows, which_data_ycols].flatten(), + xerr=2 * np.sqrt(X_variance[which_data_rows, free_dims].flatten()), ecolor='k', fmt=None, elinewidth=.5, alpha=.5) @@ -120,7 +123,7 @@ def plot_fit(model, plot_limits=None, which_data_rows='all', #define the frame for plotting on resolution = resolution or 50 - Xnew, _, _, xmin, xmax = x_frame2D(model.X[:,free_dims], plot_limits, resolution) + Xnew, _, _, xmin, xmax = x_frame2D(X[:,free_dims], plot_limits, resolution) Xgrid = np.empty((Xnew.shape[0],model.input_dim)) Xgrid[:,free_dims] = Xnew for i,v in fixed_inputs: @@ -130,14 +133,14 @@ def plot_fit(model, plot_limits=None, which_data_rows='all', #predict on the frame and plot if plot_raw: m, _ = model._raw_predict(Xgrid) - Y = model.Y + Y = Y else: m, _, _, _ = model.predict(Xgrid) Y = model.data for d in which_data_ycols: m_d = m[:,d].reshape(resolution, resolution).T ax.contour(x, y, m_d, levels, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet) - 
ax.scatter(model.X[which_data_rows, free_dims[0]], model.X[which_data_rows, free_dims[1]], 40, Y[which_data_rows, d], cmap=pb.cm.jet, vmin=m.min(), vmax=m.max(), linewidth=0.) + ax.scatter(X[which_data_rows, free_dims[0]], X[which_data_rows, free_dims[1]], 40, Y[which_data_rows, d], cmap=pb.cm.jet, vmin=m.min(), vmax=m.max(), linewidth=0.) #set the limits of the plot to some sensible values ax.set_xlim(xmin[0], xmax[0]) diff --git a/GPy/testing/index_operations_tests.py b/GPy/testing/index_operations_tests.py index d5ef7007..171db5cc 100644 --- a/GPy/testing/index_operations_tests.py +++ b/GPy/testing/index_operations_tests.py @@ -24,6 +24,13 @@ class Test(unittest.TestCase): self.param_index.remove(one, [1]) self.assertListEqual(self.param_index[one].tolist(), [3]) + def test_shift_left(self): + self.param_index.shift_left(1, 2) + self.assertListEqual(self.param_index[three].tolist(), [2,5]) + self.assertListEqual(self.param_index[two].tolist(), [0,3]) + self.assertListEqual(self.param_index[one].tolist(), [1]) + + def test_index_view(self): #======================================================================= # 0 1 2 3 4 5 6 7 8 9 diff --git a/GPy/testing/parameterized_tests.py b/GPy/testing/parameterized_tests.py index ff57606a..6f13d294 100644 --- a/GPy/testing/parameterized_tests.py +++ b/GPy/testing/parameterized_tests.py @@ -10,8 +10,8 @@ import numpy as np class Test(unittest.TestCase): def setUp(self): - self.rbf = GPy.kern.rbf(1) - self.white = GPy.kern.white(1) + self.rbf = GPy.kern.RBF(1) + self.white = GPy.kern.White(1) from GPy.core.parameterization import Param from GPy.core.parameterization.transformations import Logistic self.param = Param('param', np.random.rand(25,2), Logistic(0, 1)) @@ -39,14 +39,13 @@ class Test(unittest.TestCase): def test_remove_parameter(self): - from GPy.core.parameterization.transformations import FIXED, UNFIXED, __fixed__ + from GPy.core.parameterization.transformations import FIXED, UNFIXED, __fixed__, Logexp self.white.fix() self.test1.remove_parameter(self.white) self.assertIs(self.test1._fixes_,None) self.assertListEqual(self.white._fixes_.tolist(), [FIXED]) - self.assertIs(self.white.constraints,self.white.white.constraints._param_index_ops) - self.assertEquals(self.white.white.constraints._offset, 0) + self.assertEquals(self.white.constraints._offset, 0) self.assertIs(self.test1.constraints, self.rbf.constraints._param_index_ops) self.assertIs(self.test1.constraints, self.param.constraints._param_index_ops) @@ -57,18 +56,19 @@ class Test(unittest.TestCase): self.assertListEqual(self.test1.constraints[__fixed__].tolist(), [0]) self.assertIs(self.white._fixes_,None) self.assertListEqual(self.test1._fixes_.tolist(),[FIXED] + [UNFIXED] * 52) + self.test1.remove_parameter(self.white) self.assertIs(self.test1._fixes_,None) self.assertListEqual(self.white._fixes_.tolist(), [FIXED]) - self.assertIs(self.white.constraints,self.white.white.constraints._param_index_ops) self.assertIs(self.test1.constraints, self.rbf.constraints._param_index_ops) - self.assertIs(self.test1.constraints, self.param.constraints._param_index_ops) + self.assertIs(self.test1.constraints, self.param.constraints._param_index_ops) + self.assertListEqual(self.test1.constraints[Logexp()].tolist(), [0,1]) def test_add_parameter_already_in_hirarchy(self): self.test1.add_parameter(self.white._parameters_[0]) def test_default_constraints(self): - self.assertIs(self.rbf.rbf.variance.constraints._param_index_ops, self.rbf.constraints._param_index_ops) + 
self.assertIs(self.rbf.variance.constraints._param_index_ops, self.rbf.constraints._param_index_ops) self.assertIs(self.test1.constraints, self.rbf.constraints._param_index_ops) self.assertListEqual(self.rbf.constraints.indices()[0].tolist(), range(2)) from GPy.core.parameterization.transformations import Logexp From 99c6a2095fa603f7842efa2a1f0e6e8d25354dd0 Mon Sep 17 00:00:00 2001 From: Zhenwen Dai Date: Fri, 21 Feb 2014 17:56:37 +0000 Subject: [PATCH 31/38] adapt the new interface of the variational posterior distribution. --- GPy/core/parameterization/variational.py | 26 ++ GPy/core/sparse_gp.py | 9 +- .../latent_function_inference/var_dtc.py | 61 +-- GPy/kern/_src/kern.py | 12 +- GPy/kern/_src/rbf.py | 23 +- GPy/kern/_src/ss_rbf.py | 352 ------------------ GPy/models/bayesian_gplvm.py | 2 +- 7 files changed, 96 insertions(+), 389 deletions(-) delete mode 100644 GPy/kern/_src/ss_rbf.py diff --git a/GPy/core/parameterization/variational.py b/GPy/core/parameterization/variational.py index a7b26a80..5fe63052 100644 --- a/GPy/core/parameterization/variational.py +++ b/GPy/core/parameterization/variational.py @@ -29,3 +29,29 @@ class Normal(Parameterized): assert "matplotlib" in sys.modules, "matplotlib package has not been imported." from ...plotting.matplot_dep import variational_plots return variational_plots.plot(self,*args) + + +class SpikeAndSlab(Parameterized): + ''' + The SpikeAndSlab distribution for variational approximations. + ''' + def __init__(self, means, variances, binary_prob, name='latent space'): + """ + binary_prob : the probability of the distribution on the slab part. + """ + Parameterized.__init__(self, name=name) + self.mean = Param("mean", means) + self.variance = Param('variance', variances, Logexp()) + self.gamma = Param("binary_prob",binary_prob,) + self.add_parameters(self.mean, self.variance, self.gamma) + + def plot(self, *args): + """ + Plot latent space X in 1D: + + See GPy.plotting.matplot_dep.variational_plots + """ + import sys + assert "matplotlib" in sys.modules, "matplotlib package has not been imported." 
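The SpikeAndSlab distribution defined above is parameterised by a mean, a variance and the slab probability gamma. A minimal, purely illustrative construction (the shapes below are assumptions, they are not fixed by this patch) could look like:

    q = SpikeAndSlab(means=np.random.randn(100, 3),
                     variances=np.ones((100, 3)),
                     binary_prob=0.5 * np.ones((100, 3)))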
+ from ...plotting.matplot_dep import variational_plots + return variational_plots.plot(self,*args) diff --git a/GPy/core/sparse_gp.py b/GPy/core/sparse_gp.py index 61a664fe..71053867 100644 --- a/GPy/core/sparse_gp.py +++ b/GPy/core/sparse_gp.py @@ -57,11 +57,14 @@ class SparseGP(GP): return not (self.X_variance is None) def parameters_changed(self): - self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.X_variance, self.Z, self.likelihood, self.Y) + if self.has_uncertain_inputs(): + self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference_latent(self.kern, self.q, self.Z, self.likelihood, self.Y) + else: + self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.X_variance, self.Z, self.likelihood, self.Y) self.likelihood.update_gradients(self.grad_dict.pop('partial_for_likelihood')) if self.has_uncertain_inputs(): - self.kern.update_gradients_variational(mu=self.X, S=self.X_variance, Z=self.Z, **self.grad_dict) - self.Z.gradient = self.kern.gradients_Z_variational(mu=self.X, S=self.X_variance, Z=self.Z, **self.grad_dict) + self.kern.update_gradients_variational(posterior_variational=self.q, Z=self.Z, **self.grad_dict) + self.Z.gradient = self.kern.gradients_Z_variational(posterior_variational=self.q, Z=self.Z, **self.grad_dict) else: self.kern.update_gradients_sparse(X=self.X, Z=self.Z, **self.grad_dict) self.Z.gradient = self.kern.gradients_Z_sparse(X=self.X, Z=self.Z, **self.grad_dict) diff --git a/GPy/inference/latent_function_inference/var_dtc.py b/GPy/inference/latent_function_inference/var_dtc.py index a81bb711..c2f179ac 100644 --- a/GPy/inference/latent_function_inference/var_dtc.py +++ b/GPy/inference/latent_function_inference/var_dtc.py @@ -43,9 +43,20 @@ class VarDTC(object): return Y * prec # TODO chache this, and make it effective def inference(self, kern, X, X_variance, Z, likelihood, Y): + """Inference for normal sparseGP""" + uncertain_inputs = False + psi0, psi1, psi2 = _compute_psi(kern, X, X_variance, Z, uncertain_inputs) + return self._inference(kern, psi0, psi1, psi2, Z, likelihood, Y, uncertain_inputs) + + def inference_latent(self, kern, posterior_variational, Z, likelihood, Y): + """Inference for GPLVM with uncertain inputs""" + uncertain_inputs = True + psi0, psi1, psi2 = _compute_psi_latent(kern, posterior_variational, Z) + return self._inference(kern, psi0, psi1, psi2, Z, likelihood, Y, uncertain_inputs) + + def _inference(self, kern, psi0, psi1, psi2, Z, likelihood, Y, uncertain_inputs): #see whether we're using variational uncertain inputs - uncertain_inputs = not (X_variance is None) _, output_dim = Y.shape @@ -62,10 +73,9 @@ class VarDTC(object): # do the inference: het_noise = beta.size < 1 num_inducing = Z.shape[0] - num_data = X.shape[0] + num_data = Y.shape[0] # kernel computations, using BGPLVM notation - Kmm = kern.K(Z) - psi0, psi1, psi2 = _compute_psi(kern, X, X_variance, Z, uncertain_inputs) + Kmm = kern.K(Z) Lm = jitchol(Kmm) @@ -191,20 +201,31 @@ class VarDTCMissingData(object): else: self._subarray_indices = [[slice(None),slice(None)]] return [Y], [(Y**2).sum()] - + def inference(self, kern, X, X_variance, Z, likelihood, Y): + """Inference for normal sparseGP""" + uncertain_inputs = False + psi0, psi1, psi2 = _compute_psi(kern, X, X_variance, Z, uncertain_inputs) + return self._inference(kern, psi0, psi1, psi2, Z, likelihood, Y, uncertain_inputs) + + def inference_latent(self, 
kern, posterior_variational, Z, likelihood, Y): + """Inference for GPLVM with uncertain inputs""" + uncertain_inputs = True + psi0, psi1, psi2 = _compute_psi_latent(kern, posterior_variational, Z) + return self._inference(kern, psi0, psi1, psi2, Z, likelihood, Y, uncertain_inputs) + + def _inference(self, kern, psi0_all, psi1_all, psi2_all, Z, likelihood, Y, uncertain_inputs): Ys, traces = self._Y(Y) beta_all = 1./likelihood.variance - uncertain_inputs = not (X_variance is None) het_noise = beta_all.size != 1 import itertools num_inducing = Z.shape[0] - dL_dpsi0_all = np.zeros(X.shape[0]) - dL_dpsi1_all = np.zeros((X.shape[0], num_inducing)) + dL_dpsi0_all = np.zeros(Y.shape[0]) + dL_dpsi1_all = np.zeros((Y.shape[0], num_inducing)) if uncertain_inputs: - dL_dpsi2_all = np.zeros((X.shape[0], num_inducing, num_inducing)) + dL_dpsi2_all = np.zeros((Y.shape[0], num_inducing, num_inducing)) partial_for_likelihood = 0 woodbury_vector = np.zeros((num_inducing, Y.shape[1])) @@ -217,9 +238,6 @@ class VarDTCMissingData(object): Lm = jitchol(Kmm) if uncertain_inputs: LmInv = dtrtri(Lm) - # kernel computations, using BGPLVM notation - psi0_all, psi1_all, psi2_all = _compute_psi(kern, X, X_variance, Z, uncertain_inputs) - VVT_factor_all = np.empty(Y.shape) full_VVT_factor = VVT_factor_all.shape[1] == Y.shape[1] if not full_VVT_factor: @@ -340,15 +358,16 @@ class VarDTCMissingData(object): return post, log_marginal, grad_dict -def _compute_psi(kern, X, X_variance, Z, uncertain_inputs): - if uncertain_inputs: - psi0 = kern.psi0(Z, X, X_variance) - psi1 = kern.psi1(Z, X, X_variance) - psi2 = kern.psi2(Z, X, X_variance) - else: - psi0 = kern.Kdiag(X) - psi1 = kern.K(X, Z) - psi2 = None +def _compute_psi(kern, X, X_variance, Z): + psi0 = kern.Kdiag(X) + psi1 = kern.K(X, Z) + psi2 = None + return psi0, psi1, psi2 + +def _compute_psi_latent(kern, posterior_variational, Z): + psi0 = kern.psi0(Z, posterior_variational) + psi1 = kern.psi1(Z, posterior_variational) + psi2 = kern.psi2(Z, posterior_variational) return psi0, psi1, psi2 def _compute_dL_dpsi(num_inducing, num_data, output_dim, beta, Lm, VVT_factor, Cpsi1Vf, DBi_plus_BiPBi, psi1, het_noise, uncertain_inputs): diff --git a/GPy/kern/_src/kern.py b/GPy/kern/_src/kern.py index b3ee57cd..5fe29d51 100644 --- a/GPy/kern/_src/kern.py +++ b/GPy/kern/_src/kern.py @@ -26,11 +26,11 @@ class Kern(Parameterized): raise NotImplementedError def Kdiag(self, Xa): raise NotImplementedError - def psi0(self,Z,mu,S): + def psi0(self,Z,posterior_variational): raise NotImplementedError - def psi1(self,Z,mu,S): + def psi1(self,Z,posterior_variational): raise NotImplementedError - def psi2(self,Z,mu,S): + def psi2(self,Z,posterior_variational): raise NotImplementedError def gradients_X(self, dL_dK, X, X2): raise NotImplementedError @@ -49,16 +49,16 @@ class Kern(Parameterized): self._collect_gradient(target) self._set_gradient(target) - def update_gradients_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z): + def update_gradients_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, posterior_variational): """Set the gradients of all parameters when doing variational (M) inference with uncertain inputs.""" raise NotImplementedError def gradients_Z_sparse(self, dL_dKmm, dL_dKnm, dL_dKdiag, X, Z): grad = self.gradients_X(dL_dKmm, Z) grad += self.gradients_X(dL_dKnm.T, Z, X) return grad - def gradients_Z_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z): + def gradients_Z_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, 
posterior_variational): raise NotImplementedError - def gradients_muS_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z): + def gradients_q_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, posterior_variational): raise NotImplementedError def plot_ARD(self, *args): diff --git a/GPy/kern/_src/rbf.py b/GPy/kern/_src/rbf.py index c4d595d0..0c8588a2 100644 --- a/GPy/kern/_src/rbf.py +++ b/GPy/kern/_src/rbf.py @@ -79,16 +79,21 @@ class RBF(Kern): ret[:] = self.variance return ret - def psi0(self, Z, mu, S): + def psi0(self, Z, posterior_variational): + mu = posterior_variational.mean ret = np.empty(mu.shape[0], dtype=np.float64) ret[:] = self.variance return ret - def psi1(self, Z, mu, S): + def psi1(self, Z, posterior_variational): + mu = posterior_variational.mean + S = posterior_variational.variance self._psi_computations(Z, mu, S) return self._psi1 - def psi2(self, Z, mu, S): + def psi2(self, Z, posterior_variational): + mu = posterior_variational.mean + S = posterior_variational.variance self._psi_computations(Z, mu, S) return self._psi2 @@ -121,7 +126,9 @@ class RBF(Kern): else: self.lengthscale.gradient += (self.variance / self.lengthscale) * np.sum(self._K_dvar * self._K_dist2 * dL_dKmm) - def update_gradients_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z): + def update_gradients_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, posterior_variational): + mu = posterior_variational.mean + S = posterior_variational.variance self._psi_computations(Z, mu, S) #contributions from psi0: @@ -155,7 +162,9 @@ class RBF(Kern): else: self.lengthscale.gradient += (self.variance / self.lengthscale) * np.sum(self._K_dvar * self._K_dist2 * dL_dKmm) - def gradients_Z_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z): + def gradients_Z_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, posterior_variational): + mu = posterior_variational.mean + S = posterior_variational.variance self._psi_computations(Z, mu, S) #psi1 @@ -173,7 +182,9 @@ class RBF(Kern): return grad - def gradients_muS_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z): + def gradients_q_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, posterior_variational): + mu = posterior_variational.mean + S = posterior_variational.variance self._psi_computations(Z, mu, S) #psi1 tmp = self._psi1[:, :, None] / self.lengthscale2 / self._psi1_denom diff --git a/GPy/kern/_src/ss_rbf.py b/GPy/kern/_src/ss_rbf.py deleted file mode 100644 index cab8fd11..00000000 --- a/GPy/kern/_src/ss_rbf.py +++ /dev/null @@ -1,352 +0,0 @@ -# Copyright (c) 2012, GPy authors (see AUTHORS.txt). -# Licensed under the BSD 3-clause license (see LICENSE.txt) - - -import numpy as np -from kernpart import Kernpart -from ...util.linalg import tdot -from ...util.misc import fast_array_equal, param_to_array -from ...core.parameterization import Param - -class SS_RBF(Kernpart): - """ - The RBF kernel for Spike-and-Slab GPLVM - Radial Basis Function kernel, aka squared-exponential, exponentiated quadratic or Gaussian kernel: - - .. math:: - - k(r) = \sigma^2 \exp \\bigg(- \\frac{1}{2} r^2 \\bigg) \ \ \ \ \ \\text{ where } r^2 = \sum_{i=1}^d \\frac{ (x_i-x^\prime_i)^2}{\ell_i^2} - - where \ell_i is the lengthscale, \sigma^2 the variance and d the dimensionality of the input. 
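With this patch the psi statistics of the remaining kernels are computed from a variational posterior object instead of separate mu and S arrays (see psi0/psi1/psi2 above and _compute_psi_latent in var_dtc.py). A rough sketch of the calling convention; the construction of the posterior is schematic, since its exact constructor signature is not shown in this hunk:

    q = Normal(mu, S)        # variational posterior, GPy.core.parameterization.variational
    psi0 = kern.psi0(Z, q)   # uses q.mean / q.variance internally
    psi1 = kern.psi1(Z, q)
    psi2 = kern.psi2(Z, q)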
- - :param input_dim: the number of input dimensions - :type input_dim: int - :param variance: the variance of the kernel - :type variance: float - :param lengthscale: the vector of lengthscale of the kernel - :type lengthscale: array or list of the appropriate size (or float if there is only one lengthscale parameter) - :rtype: kernel object - """ - - def __init__(self, input_dim, variance=1., lengthscale=None, name='rbf'): - super(RBF, self).__init__(input_dim, name) - self.input_dim = input_dim - - if lengthscale is not None: - lengthscale = np.asarray(lengthscale) - assert lengthscale.size == self.input_dim, "bad number of lengthscales" - else: - lengthscale = np.ones(self.input_dim) - - self.variance = Param('variance', variance) - self.lengthscale = Param('lengthscale', lengthscale) - self.lengthscale.add_observer(self, self.update_lengthscale) - self.add_parameters(self.variance, self.lengthscale) - self.parameters_changed() # initializes cache - - def on_input_change(self, X): - #self._K_computations(X, None) - pass - - def update_lengthscale(self, l): - self.lengthscale2 = np.square(self.lengthscale) - - def parameters_changed(self): - # reset cached results - self._X, self._X2 = np.empty(shape=(2, 1)) - self._Z, self._mu, self._S = np.empty(shape=(3, 1)) # cached versions of Z,mu,S - - def K(self, X, X2, target): - self._K_computations(X, X2) - target += self.variance * self._K_dvar - - def Kdiag(self, X, target): - np.add(target, self.variance, target) - - def psi0(self, Z, mu, S, target): - target += self.variance - - def psi1(self, Z, mu, S, target): - self._psi_computations(Z, mu, S) - target += self._psi1 - - def psi2(self, Z, mu, S, target): - self._psi_computations(Z, mu, S) - target += self._psi2 - - def update_gradients_full(self, dL_dK, X): - self._K_computations(X, None) - self.variance.gradient = np.sum(self._K_dvar * dL_dK) - if self.ARD: - self.lengthscale.gradient = self._dL_dlengthscales_via_K(dL_dK, X, None) - else: - self.lengthscale.gradient = (self.variance / self.lengthscale) * np.sum(self._K_dvar * self._K_dist2 * dL_dK) - - def update_gradients_sparse(self, dL_dKmm, dL_dKnm, dL_dKdiag, X, Z): - #contributions from Kdiag - self.variance.gradient = np.sum(dL_dKdiag) - - #from Knm - self._K_computations(X, Z) - self.variance.gradient += np.sum(dL_dKnm * self._K_dvar) - if self.ARD: - self.lengthscales.gradient = self._dL_dlengthscales_via_K(dL_dKnm, X, Z) - - else: - self.lengthscale.gradient = (self.variance / self.lengthscale) * np.sum(self._K_dvar * self._K_dist2 * dL_dKmm) - - #from Kmm - self._K_computations(Z, None) - self.variance.gradient += np.sum(dL_dKmm * self._K_dvar) - if self.ARD: - self.lengthscales.gradient += self._dL_dlengthscales_via_K(dL_dKmm, Z, None) - else: - self.lengthscale.gradient += (self.variance / self.lengthscale) * np.sum(self._K_dvar * self._K_dist2 * dL_dKmm) - - def update_gradients_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z): - self._psi_computations(Z, mu, S) - - #contributions from psi0: - self.variance.gradient = np.sum(dL_dpsi0) - - #from psi1 - self.variance.gradient += np.sum(dL_dpsi1 * self._psi1 / self.variance) - d_length = self._psi1[:,:,None] * ((self._psi1_dist_sq - 1.)/(self.lengthscale*self._psi1_denom) +1./self.lengthscale) - dpsi1_dlength = d_length * dL_dpsi1[:, :, None] - if not self.ARD: - self.lengthscale.gradeint = dpsi1_dlength.sum() - else: - self.lengthscale.gradient = dpsi1_dlength.sum(0).sum(0) - - #from psi2 - d_var = 2.*self._psi2 / self.variance - d_length = 
2.*self._psi2[:, :, :, None] * (self._psi2_Zdist_sq * self._psi2_denom + self._psi2_mudist_sq + S[:, None, None, :] / self.lengthscale2) / (self.lengthscale * self._psi2_denom) - - self.variance.gradient += np.sum(dL_dpsi2 * d_var) - dpsi2_dlength = d_length * dL_dpsi2[:, :, :, None] - if not self.ARD: - self.lengthscale.gradient += dpsi2_dlength.sum() - else: - self.lengthscale.gradient += dpsi2_dlength.sum(0).sum(0).sum(0) - - #from Kmm - self._K_computations(Z, None) - self.variance.gradient += np.sum(dL_dKmm * self._K_dvar) - if self.ARD: - self.lengthscales.gradient += self._dL_dlengthscales_via_K(dL_dKmm, Z, None) - else: - self.lengthscale.gradient += (self.variance / self.lengthscale) * np.sum(self._K_dvar * self._K_dist2 * dL_dK) - - def gradients_X(self, dL_dK, X, X2, target): - #if self._X is None or X.base is not self._X.base or X2 is not None: - self._K_computations(X, X2) - if X2 is None: - _K_dist = 2*(X[:, None, :] - X[None, :, :]) - else: - _K_dist = X[:, None, :] - X2[None, :, :] # don't cache this in _K_computations because it is high memory. If this function is being called, chances are we're not in the high memory arena. - gradients_X = (-self.variance / self.lengthscale2) * np.transpose(self._K_dvar[:, :, np.newaxis] * _K_dist, (1, 0, 2)) - target += np.sum(gradients_X * dL_dK.T[:, :, None], 0) - - def dKdiag_dX(self, dL_dKdiag, X, target): - pass - - #---------------------------------------# - # PSI statistics # - #---------------------------------------# - - def dpsi0_dmuS(self, dL_dpsi0, Z, mu, S, target_mu, target_S): - pass - - def dpsi1_dZ(self, dL_dpsi1, Z, mu, S, target): - self._psi_computations(Z, mu, S) - denominator = (self.lengthscale2 * (self._psi1_denom)) - dpsi1_dZ = -self._psi1[:, :, None] * ((self._psi1_dist / denominator)) - target += np.sum(dL_dpsi1[:, :, None] * dpsi1_dZ, 0) - - def dpsi1_dmuS(self, dL_dpsi1, Z, mu, S, target_mu, target_S): - self._psi_computations(Z, mu, S) - tmp = self._psi1[:, :, None] / self.lengthscale2 / self._psi1_denom - target_mu += np.sum(dL_dpsi1[:, :, None] * tmp * self._psi1_dist, 1) - target_S += np.sum(dL_dpsi1[:, :, None] * 0.5 * tmp * (self._psi1_dist_sq - 1), 1) - - def dpsi2_dZ(self, dL_dpsi2, Z, mu, S, target): - self._psi_computations(Z, mu, S) - term1 = self._psi2_Zdist / self.lengthscale2 # num_inducing, num_inducing, input_dim - term2 = self._psi2_mudist / self._psi2_denom / self.lengthscale2 # N, num_inducing, num_inducing, input_dim - dZ = self._psi2[:, :, :, None] * (term1[None] + term2) - target += (dL_dpsi2[:, :, :, None] * dZ).sum(0).sum(0) - - def dpsi2_dmuS(self, dL_dpsi2, Z, mu, S, target_mu, target_S): - """Think N,num_inducing,num_inducing,input_dim """ - self._psi_computations(Z, mu, S) - tmp = self._psi2[:, :, :, None] / self.lengthscale2 / self._psi2_denom - target_mu += -2.*(dL_dpsi2[:, :, :, None] * tmp * self._psi2_mudist).sum(1).sum(1) - target_S += (dL_dpsi2[:, :, :, None] * tmp * (2.*self._psi2_mudist_sq - 1)).sum(1).sum(1) - - #---------------------------------------# - # Precomputations # - #---------------------------------------# - - def _K_computations(self, X, X2): - #params = self._get_params() - if not (fast_array_equal(X, self._X) and fast_array_equal(X2, self._X2)):# and fast_array_equal(self._params_save , params)): - #self._X = X.copy() - #self._params_save = params.copy() - if X2 is None: - self._X2 = None - X = X / self.lengthscale - Xsquare = np.sum(np.square(X), 1) - self._K_dist2 = -2.*tdot(X) + (Xsquare[:, None] + Xsquare[None, :]) - else: - self._X2 = X2.copy() - X = X 
/ self.lengthscale - X2 = X2 / self.lengthscale - self._K_dist2 = -2.*np.dot(X, X2.T) + (np.sum(np.square(X), 1)[:, None] + np.sum(np.square(X2), 1)[None, :]) - self._K_dvar = np.exp(-0.5 * self._K_dist2) - - def _dL_dlengthscales_via_K(self, dL_dK, X, X2): - """ - A helper function for update_gradients_* methods - - Computes the derivative of the objective L wrt the lengthscales via - - dL_dl = sum_{i,j}(dL_dK_{ij} dK_dl) - - assumes self._K_computations has just been called. - - This is only valid if self.ARD=True - """ - target = np.zeros(self.input_dim) - dvardLdK = self._K_dvar * dL_dK - var_len3 = self.variance / np.power(self.lengthscale, 3) - if X2 is None: - # save computation for the symmetrical case - dvardLdK = dvardLdK + dvardLdK.T - code = """ - int q,i,j; - double tmp; - for(q=0; q - #include - """ - weave.inline(code, support_code=support_code, libraries=['gomp'], - arg_names=['N', 'num_inducing', 'input_dim', 'mu', 'Zhat', 'mudist_sq', 'mudist', 'lengthscale2', '_psi2_denom', 'psi2_Zdist_sq', 'psi2_exponent', 'half_log_psi2_denom', 'psi2', 'variance_sq'], - type_converters=weave.converters.blitz, **self.weave_options) - - return mudist, mudist_sq, psi2_exponent, psi2 diff --git a/GPy/models/bayesian_gplvm.py b/GPy/models/bayesian_gplvm.py index 8aa378ce..cc68de68 100644 --- a/GPy/models/bayesian_gplvm.py +++ b/GPy/models/bayesian_gplvm.py @@ -66,7 +66,7 @@ class BayesianGPLVM(SparseGP, GPLVM): super(BayesianGPLVM, self).parameters_changed() self._log_marginal_likelihood -= self.KL_divergence() - dL_dmu, dL_dS = self.kern.gradients_muS_variational(mu=self.X, S=self.X_variance, Z=self.Z, **self.grad_dict) + dL_dmu, dL_dS = self.kern.gradients_q_variational(posterior_variational=self.q, Z=self.Z, **self.grad_dict) # dL: self.q.mean.gradient = dL_dmu From ea5d19bb4ede5bd952a63f776e2df9c2678843c1 Mon Sep 17 00:00:00 2001 From: James Hensman Date: Fri, 21 Feb 2014 18:09:05 +0000 Subject: [PATCH 32/38] bias now looks in shape --- GPy/kern/_src/bias.py | 82 ++++++++++++++++--------------------------- 1 file changed, 31 insertions(+), 51 deletions(-) diff --git a/GPy/kern/_src/bias.py b/GPy/kern/_src/bias.py index d2301bcd..d45561f8 100644 --- a/GPy/kern/_src/bias.py +++ b/GPy/kern/_src/bias.py @@ -2,80 +2,60 @@ # Licensed under the BSD 3-clause license (see LICENSE.txt) -from kernpart import Kernpart +from kern import Kern from ...core.parameterization import Param +from ...core.parameterization.transformations import Logexp -class Bias(Kernpart): +class Bias(Kern): def __init__(self,input_dim,variance=1.,name=None): - """ - :param input_dim: the number of input dimensions - :type input_dim: int - :param variance: the variance of the kernel - :type variance: float - """ super(Bias, self).__init__(input_dim, name) - from ...core.parameterization.transformations import Logexp self.variance = Param("variance", variance, Logexp()) self.add_parameter(self.variance) - def K(self,X,X2,target): - target += self.variance + def K(self, X, X2=None): + shape = (X.shape[0], X.shape[0] if X2 is None else X2.shape[0]) + ret = np.empty(shape, dtype=np.float64) + ret[:] = self.variance + return ret - def Kdiag(self,X,target): - target += self.variance + def Kdiag(self,X): + ret = np.empty((X.shape[0],), dtype=np.float64) + ret[:] = self.variance + return ret - #def dK_dtheta(self,dL_dKdiag,X,X2,target): - #target += dL_dKdiag.sum() - def update_gradients_full(self, dL_dK, X): + def update_gradients_full(self, dL_dK, X, X2=None): self.variance.gradient = dL_dK.sum() - def 
dKdiag_dtheta(self,dL_dKdiag,X,target): - target += dL_dKdiag.sum() + def update_gradients_diag(self, dL_dKdiag, X): + self.variance.gradient = dL_dK.sum() def gradients_X(self, dL_dK,X, X2, target): - pass + return np.zeros(X.shape) - def dKdiag_dX(self,dL_dKdiag,X,target): - pass + def gradients_X_diag(self,dL_dKdiag,X,target): + return np.zeros(X.shape) #---------------------------------------# # PSI statistics # #---------------------------------------# - def psi0(self, Z, mu, S, target): - target += self.variance + def psi0(self, Z, mu, S): + return self.Kdiag(mu) def psi1(self, Z, mu, S, target): - self._psi1 = self.variance - target += self._psi1 - + return self.K(mu, S) + def psi2(self, Z, mu, S, target): - target += self.variance**2 + ret = np.empty((mu.shape[0], Z.shape[0], Z.shape[0]), dtype=np.float64) + ret[:] = self.variance**2 + return ret - def dpsi0_dtheta(self, dL_dpsi0, Z, mu, S, target): - target += dL_dpsi0.sum() + def update_gradients_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z): + self.variance.gradient = dL_dKmm.sum() + dL_dpsi0.sum() + dL_dpsi1.sum() + 2.*self.variance*dL_dpsi2.sum() - def dpsi1_dtheta(self, dL_dpsi1, Z, mu, S, target): - target += dL_dpsi1.sum() + def gradients_Z_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z): + return np.zeros(Z.shape) - def dpsi2_dtheta(self, dL_dpsi2, Z, mu, S, target): - target += 2.*self.variance*dL_dpsi2.sum() - - def dpsi0_dZ(self, dL_dpsi0, Z, mu, S, target): - pass - - def dpsi0_dmuS(self, dL_dpsi0, Z, mu, S, target_mu, target_S): - pass - - def dpsi1_dZ(self, dL_dpsi1, Z, mu, S, target): - pass - - def dpsi1_dmuS(self, dL_dpsi1, Z, mu, S, target_mu, target_S): - pass - - def dpsi2_dZ(self, dL_dpsi2, Z, mu, S, target): - pass - - def dpsi2_dmuS(self, dL_dpsi2, Z, mu, S, target_mu, target_S): - pass + def gradients_muS_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z): + return np.zeros(mu.shape), np.zeros(S.shape) From 61a101ed0547b158388abddc029edf62370b7182 Mon Sep 17 00:00:00 2001 From: Neil Lawrence Date: Sun, 23 Feb 2014 11:02:20 -0500 Subject: [PATCH 33/38] Changes to sympykern.py --- GPy/kern/parts/rbf.py | 2 +- GPy/kern/parts/sympykern.py | 204 +++++++++++++++++++++--------------- 2 files changed, 123 insertions(+), 83 deletions(-) diff --git a/GPy/kern/parts/rbf.py b/GPy/kern/parts/rbf.py index 8811b74a..027aa382 100644 --- a/GPy/kern/parts/rbf.py +++ b/GPy/kern/parts/rbf.py @@ -109,7 +109,7 @@ class RBF(Kernpart): self.lengthscale.gradient = self._dL_dlengthscales_via_K(dL_dK, X, None) else: self.lengthscale.gradient = (self.variance / self.lengthscale) * np.sum(self._K_dvar * self._K_dist2 * dL_dK) -b + def update_gradients_sparse(self, dL_dKmm, dL_dKnm, dL_dKdiag, X, Z): #contributions from Kdiag self.variance.gradient = np.sum(dL_dKdiag) diff --git a/GPy/kern/parts/sympykern.py b/GPy/kern/parts/sympykern.py index 52813ecd..3d6517a8 100644 --- a/GPy/kern/parts/sympykern.py +++ b/GPy/kern/parts/sympykern.py @@ -1,3 +1,4 @@ +# Check Matthew Rocklin's blog post. try: import sympy as sp sympy_available=True @@ -129,6 +130,8 @@ class spkern(Kernpart): if False: self.compute_psi_stats() + self._code = {} + # generate the code for the covariance functions self._gen_code() @@ -169,6 +172,7 @@ class spkern(Kernpart): code_type = "C" else: code_type = "PYTHON" + # Need to add the sympy_helpers header in here. 
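sympy's codegen takes a list of (name, expression) pairs and returns the generated source and header as (filename, contents) tuples, which is how it is unpacked just below. A stand-alone sketch of that call pattern (the symbols and the expression here are made up):

    import sympy as sp
    from sympy.utilities.codegen import codegen
    x, z = sp.symbols('x z')
    (c_name, c_source), (h_name, c_header) = codegen([('k', sp.exp(-(x - z)**2))], 'C', 'k')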
(foo_c,self._function_code), (foo_h,self._function_header) = \ codegen(code_list, code_type, @@ -233,7 +237,7 @@ class spkern(Kernpart): """ # Here's the code to do the looping for K - self._K_code =\ + self._code['K'] =\ """ // _K_code // Code for computing the covariance function. @@ -254,7 +258,7 @@ class spkern(Kernpart): """%(precompute_string,arg_string,"/*"+str(self._sp_k)+"*/") # adding a string representation of the function in the # comment forces recompile when needed - self._K_code_X = self._K_code.replace('Z2(', 'X2(') + self._code['K_X'] = self._code['K'].replace('Z2(', 'X2(') # Code to compute diagonal of covariance. @@ -265,9 +269,9 @@ class spkern(Kernpart): diag_precompute_string = re.sub('Z','X',diag_precompute_string) diag_precompute_string = re.sub('j','i',diag_precompute_string) # Code to do the looping for Kdiag - self._Kdiag_code =\ + self._code['Kdiag'] =\ """ - // _Kdiag_code + // _code['Kdiag'] // Code for computing diagonal of covariance function. int i; int n = target_array->dimensions[0]; @@ -282,51 +286,88 @@ class spkern(Kernpart): """%(diag_precompute_string,diag_arg_string,"/*"+str(self._sp_k)+"*/") #adding a string representation forces recompile when needed # Code to compute gradients - grad_func_list = [] if self.output_dim>1: - grad_func_list += c_define_output_indices - grad_func_list += [' '*16 + 'TARGET1(%i+ii) += PARTIAL2(i, j)*dk_d%s(%s);'%(self.num_shared_params+i*self.output_dim, theta.name, arg_string) for i, theta in enumerate(self._sp_theta_i)] - grad_func_list += [' '*16 + 'TARGET1(%i+jj) += PARTIAL2(i, j)*dk_d%s(%s);'%(self.num_shared_params+i*self.output_dim, theta.name, reverse_arg_string) for i, theta in enumerate(self._sp_theta_i)] - grad_func_list += ([' '*16 + 'TARGET1(%i) += PARTIAL2(i, j)*dk_d%s(%s);'%(i,theta.name,arg_string) for i,theta in enumerate(self._sp_theta)]) - grad_func_string = '\n'.join(grad_func_list) + for i, theta in enumerate(self._sp_theta_i): + grad_func_list = [' '*26 + 'TARGET1(ii) += PARTIAL2(i, j)*dk_d%s(%s);'%(theta.name, arg_string)] + grad_func_list += [' '*26 + 'TARGET1(jj) += PARTIAL2(i, j)*dk_d%s(%s);'%(theta.name, reverse_arg_string)] + grad_func_list = c_define_output_indices+grad_func_list - self._dK_dtheta_code =\ - """ - // _dK_dtheta_code - // Code for computing gradient of covariance with respect to parameters. 
- int i; - int j; - int n = partial_array->dimensions[0]; - int num_inducing = partial_array->dimensions[1]; - int input_dim = X_array->dimensions[1]; - //#pragma omp parallel for private(j) - for (i=0;idimensions[0]; + int num_inducing = partial_array->dimensions[1]; + int input_dim = X_array->dimensions[1]; + //#pragma omp parallel for private(j) + for (i=0;idimensions[0]; + int input_dim = X_array->dimensions[1]; + for (i=0;idimensions[0]; + int num_inducing = partial_array->dimensions[1]; + int input_dim = X_array->dimensions[1]; + //#pragma omp parallel for private(j) + for (i=0;idimensions[0]; + int input_dim = X_array->dimensions[1]; + for (i=0;idimensions[0]; - int input_dim = X_array->dimensions[1]; - for (i=0;i Date: Mon, 24 Feb 2014 08:22:06 +0000 Subject: [PATCH 34/38] minor fixes in kerns --- GPy/kern/__init__.py | 2 +- GPy/kern/_src/bias.py | 1 + GPy/kern/_src/stationary.py | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/GPy/kern/__init__.py b/GPy/kern/__init__.py index e5dc6d35..594ff6d3 100644 --- a/GPy/kern/__init__.py +++ b/GPy/kern/__init__.py @@ -2,9 +2,9 @@ from _src.rbf import RBF from _src.white import White from _src.kern import Kern from _src.linear import Linear +from _src.bias import Bias from _src.brownian import Brownian from _src.stationary import Exponential, Matern32, Matern52, ExpQuad -#from _src.bias import Bias #import coregionalize #import exponential #import eq_ode1 diff --git a/GPy/kern/_src/bias.py b/GPy/kern/_src/bias.py index d45561f8..e1938c95 100644 --- a/GPy/kern/_src/bias.py +++ b/GPy/kern/_src/bias.py @@ -5,6 +5,7 @@ from kern import Kern from ...core.parameterization import Param from ...core.parameterization.transformations import Logexp +import numpy as np class Bias(Kern): def __init__(self,input_dim,variance=1.,name=None): diff --git a/GPy/kern/_src/stationary.py b/GPy/kern/_src/stationary.py index 7cc2e695..a6ff9424 100644 --- a/GPy/kern/_src/stationary.py +++ b/GPy/kern/_src/stationary.py @@ -18,7 +18,7 @@ class Stationary(Kern): lengthscale = np.ones(1) else: lengthscale = np.asarray(lengthscale) - assert lengthscale.size == 1 "Only lengthscale needed for non-ARD kernel" + assert lengthscale.size == 1, "Only lengthscale needed for non-ARD kernel" else: if lengthscale is not None: lengthscale = np.asarray(lengthscale) From b20beaa8630034adfefaf3561f3cad6ec88d323e Mon Sep 17 00:00:00 2001 From: James Hensman Date: Mon, 24 Feb 2014 08:55:18 +0000 Subject: [PATCH 35/38] some work pon EP (uninished) --- GPy/inference/latent_function_inference/ep.py | 421 +++--------------- 1 file changed, 61 insertions(+), 360 deletions(-) diff --git a/GPy/inference/latent_function_inference/ep.py b/GPy/inference/latent_function_inference/ep.py index aa106067..87c08221 100644 --- a/GPy/inference/latent_function_inference/ep.py +++ b/GPy/inference/latent_function_inference/ep.py @@ -3,390 +3,91 @@ from scipy import stats from ..util.linalg import pdinv,mdot,jitchol,chol_inv,DSYR,tdot,dtrtrs from likelihood import likelihood -class EP(likelihood): - def __init__(self,data,noise_model): - """ - Expectation Propagation - - :param data: data to model - :type data: numpy array - :param noise_model: noise distribution - :type noise_model: A GPy noise model - - """ - self.noise_model = noise_model - self.data = data - self.num_data, self.output_dim = self.data.shape - self.is_heteroscedastic = True - self.num_params = 0 - - #Initial values - Likelihood approximation parameters: - #p(y|f) = t(f|tau_tilde,v_tilde) - self.tau_tilde = 
np.zeros(self.num_data) - self.v_tilde = np.zeros(self.num_data) - - #initial values for the GP variables - self.Y = np.zeros((self.num_data,1)) - self.covariance_matrix = np.eye(self.num_data) - self.precision = np.ones(self.num_data)[:,None] - self.Z = 0 - self.YYT = None - self.V = self.precision * self.Y - self.VVT_factor = self.V - self.trYYT = 0. - - super(EP, self).__init__() - - def restart(self): - self.tau_tilde = np.zeros(self.num_data) - self.v_tilde = np.zeros(self.num_data) - self.Y = np.zeros((self.num_data,1)) - self.covariance_matrix = np.eye(self.num_data) - self.precision = np.ones(self.num_data)[:,None] - self.Z = 0 - self.YYT = None - self.V = self.precision * self.Y - self.VVT_factor = self.V - self.trYYT = 0. - - def predictive_values(self,mu,var,full_cov,**noise_args): - if full_cov: - raise NotImplementedError, "Cannot make correlated predictions with an EP likelihood" - return self.noise_model.predictive_values(mu,var,**noise_args) - - def log_predictive_density(self, y_test, mu_star, var_star): - """ - Calculation of the log predictive density - - .. math: - p(y_{*}|D) = p(y_{*}|f_{*})p(f_{*}|\mu_{*}\\sigma^{2}_{*}) - - :param y_test: test observations (y_{*}) - :type y_test: (Nx1) array - :param mu_star: predictive mean of gaussian p(f_{*}|mu_{*}, var_{*}) - :type mu_star: (Nx1) array - :param var_star: predictive variance of gaussian p(f_{*}|mu_{*}, var_{*}) - :type var_star: (Nx1) array - """ - return self.noise_model.log_predictive_density(y_test, mu_star, var_star) - - def _get_params(self): - #return np.zeros(0) - return self.noise_model._get_params() - - def _get_param_names(self): - #return [] - return self.noise_model._get_param_names() - - def _set_params(self,p): - #pass # TODO: the EP likelihood might want to take some parameters... - self.noise_model._set_params(p) - - def _gradients(self,partial): - #return np.zeros(0) # TODO: the EP likelihood might want to take some parameters... - return self.noise_model._gradients(partial) - - def _compute_GP_variables(self): - #Variables to be called from GP - mu_tilde = self.v_tilde/self.tau_tilde #When calling EP, this variable is used instead of Y in the GP model - sigma_sum = 1./self.tau_ + 1./self.tau_tilde - mu_diff_2 = (self.v_/self.tau_ - mu_tilde)**2 - self.Z = np.sum(np.log(self.Z_hat)) + 0.5*np.sum(np.log(sigma_sum)) + 0.5*np.sum(mu_diff_2/sigma_sum) #Normalization constant, aka Z_ep - self.Z += 0.5*self.num_data*np.log(2*np.pi) - - self.Y = mu_tilde[:,None] - self.YYT = np.dot(self.Y,self.Y.T) - self.covariance_matrix = np.diag(1./self.tau_tilde) - self.precision = self.tau_tilde[:,None] - self.V = self.precision * self.Y - self.VVT_factor = self.V - self.trYYT = np.trace(self.YYT) - - def fit_full(self, K, epsilon=1e-3,power_ep=[1.,1.]): +class EP(object): + def __init__(self, epsilon=1e-6, eta=1., delta=1.): """ The expectation-propagation algorithm. For nomenclature see Rasmussen & Williams 2006. :param epsilon: Convergence criterion, maximum squared difference allowed between mean updates to stop iterations (float) :type epsilon: float - :param power_ep: Power EP parameters - :type power_ep: list of floats - + :param eta: Power EP thing TODO: Ricardo: what, exactly? + :type eta: float64 + :param delta: Power EP thing TODO: Ricardo: what, exactly? 
+ :type delta: float64 """ - self.epsilon = epsilon - self.eta, self.delta = power_ep + self.epsilon, self.eta, self.delta = epsilon, eta, delta + self.reset() + + def reset(self): + self.old_mutilde, self.old_vtilde = None, None + + def inference(self, kern, X, likelihood, Y, Y_metadata=None): + + K = kern.K(X) + + mu, Sigma, mu_tilde, tau_tilde = self.expectation_propagation(K, Y, Y_metadata, likelihood) + + + def expectation_propagation(self, K, Y, Y_metadata, likelihood): + + num_data, data_dim = Y.shape + assert data_dim == 1, "This EP method only works for 1D outputs" + #Initial values - Posterior distribution parameters: q(f|X,Y) = N(f|mu,Sigma) mu = np.zeros(self.num_data) Sigma = K.copy() - """ - Initial values - Cavity distribution parameters: - q_(f|mu_,sigma2_) = Product{q_i(f|mu_i,sigma2_i)} - sigma_ = 1./tau_ - mu_ = v_/tau_ - """ - self.tau_ = np.empty(self.num_data,dtype=float) - self.v_ = np.empty(self.num_data,dtype=float) - #Initial values - Marginal moments - z = np.empty(self.num_data,dtype=float) - self.Z_hat = np.empty(self.num_data,dtype=float) - phi = np.empty(self.num_data,dtype=float) - mu_hat = np.empty(self.num_data,dtype=float) - sigma2_hat = np.empty(self.num_data,dtype=float) + Z_hat = np.empty(num_data,dtype=np.float64) + mu_hat = np.empty(num_data,dtype=np.float64) + sigma2_hat = np.empty(num_data,dtype=np.float64) + + #initial values - Gaussian factors + if self.old_mutilde is None: + tau_tilde, mu_tilde, v_tilde = np.zeros((3, num_data)) + else: + assert self.old_mutilde.size == num_data, "data size mis-match: did you change the data? try resetting!" + mu_tilde, v_tilde = self.old_mutilde, self.old_vtilde + tau_tilde = v_tilde/mu_tilde #Approximation epsilon_np1 = self.epsilon + 1. epsilon_np2 = self.epsilon + 1. - self.iterations = 0 - self.np1 = [self.tau_tilde.copy()] - self.np2 = [self.v_tilde.copy()] - while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon: - update_order = np.random.permutation(self.num_data) + iterations = 0 + while (epsilon_np1 > self.epsilon) or (epsilon_np2 > self.epsilon): + update_order = np.random.permutation(num_data) for i in update_order: #Cavity distribution parameters - self.tau_[i] = 1./Sigma[i,i] - self.eta*self.tau_tilde[i] - self.v_[i] = mu[i]/Sigma[i,i] - self.eta*self.v_tilde[i] + tau_cav = 1./Sigma[i,i] - self.eta*tau_tilde[i] + v_cav = mu[i]/Sigma[i,i] - self.eta*v_tilde[i] #Marginal moments - self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.noise_model.moments_match(self.data[i],self.tau_[i],self.v_[i]) + Z_hat[i], mu_hat[i], sigma2_hat[i] = likelihood.moments_match(Y[i], tau_cav, v_cav, Y_metadata=(None if Y_metadata is None else Y_metadata[i])) #Site parameters update - Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma[i,i]) - Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma[i,i]) - self.tau_tilde[i] += Delta_tau - self.v_tilde[i] += Delta_v + delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma[i,i]) + delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma[i,i]) + tau_tilde[i] += delta_tau + v_tilde[i] += delta_v #Posterior distribution parameters update - DSYR(Sigma,Sigma[:,i].copy(), -float(Delta_tau/(1.+ Delta_tau*Sigma[i,i]))) - mu = np.dot(Sigma,self.v_tilde) - self.iterations += 1 + DSYR(Sigma, Sigma[:,i].copy(), -delta_tau/(1.+ delta_tau*Sigma[i,i])) + mu = np.dot(Sigma, v_tilde) + iterations += 1 + +
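# A minimal sketch, not from the patch: one single-site power-EP update, with a Gaussian
# likelihood standing in for likelihood.moments_match so the matched moments are analytic.
# Assumption (hedged): eta is the fraction of the site removed to form the cavity and
# delta damps the site update, matching how self.eta and self.delta are used above.
import numpy as np

def ep_site_update(y, sigma2_lik, Sigma_ii, mu_i, tau_tilde_i, v_tilde_i, eta=1., delta=1.):
    # cavity distribution: remove a fraction eta of the current site from the marginal
    tau_cav = 1. / Sigma_ii - eta * tau_tilde_i
    v_cav = mu_i / Sigma_ii - eta * v_tilde_i
    sigma2_cav, mu_cav = 1. / tau_cav, v_cav / tau_cav
    # moment matching against N(y | f, sigma2_lik): a product of two Gaussians
    sigma2_hat = 1. / (1. / sigma2_cav + 1. / sigma2_lik)
    mu_hat = sigma2_hat * (mu_cav / sigma2_cav + y / sigma2_lik)
    # damped update of the site natural parameters
    delta_tau = delta / eta * (1. / sigma2_hat - 1. / Sigma_ii)
    delta_v = delta / eta * (mu_hat / sigma2_hat - mu_i / Sigma_ii)
    return tau_tilde_i + delta_tau, v_tilde_i + delta_v

tau_new, v_new = ep_site_update(y=0.3, sigma2_lik=0.1, Sigma_ii=1., mu_i=0.,
                                tau_tilde_i=0., v_tilde_i=0.)
print(tau_new, v_new)   # for a Gaussian likelihood these equal 1/sigma2_lik and y/sigma2_lik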
#(re) compute Sigma and mu using full Cholesky decompy + tau_tilde_root = np.sqrt(tau_tilde) + Sroot_tilde_K = tau_tilde_root[:,None] * K + B = np.eye(num_data) + Sroot_tilde_K * tau_tilde_root[None,:] L = jitchol(B) - V,info = dtrtrs(L,Sroot_tilde_K,lower=1) + V, _ = dtrtrs(L, Sroot_tilde_K, lower=1) Sigma = K - np.dot(V.T,V) - mu = np.dot(Sigma,self.v_tilde) - epsilon_np1 = sum((self.tau_tilde-self.np1[-1])**2)/self.num_data - epsilon_np2 = sum((self.v_tilde-self.np2[-1])**2)/self.num_data - self.np1.append(self.tau_tilde.copy()) - self.np2.append(self.v_tilde.copy()) + mu = np.dot(Sigma,v_tilde) - return self._compute_GP_variables() + #monitor convergence + epsilon_np1 = np.mean(np.square(tau_tilde-tau_tilde_old)) + epsilon_np2 = np.mean(np.square(v_tilde-v_tilde_old)) + tau_tilde_old = tau_tilde.copy() + v_tilde_old = v_tilde.copy() - def fit_DTC(self, Kmm, Kmn, epsilon=1e-3,power_ep=[1.,1.]): - """ - The expectation-propagation algorithm with sparse pseudo-input. - For nomenclature see ... 2013. + return mu, Sigma, mu_tilde, tau_tilde - :param epsilon: Convergence criterion, maximum squared difference allowed between mean updates to stop iterations (float) - :type epsilon: float - :param power_ep: Power EP parameters - :type power_ep: list of floats - - """ - self.epsilon = epsilon - self.eta, self.delta = power_ep - - num_inducing = Kmm.shape[0] - - #TODO: this doesn't work with uncertain inputs! - - """ - Prior approximation parameters: - q(f|X) = int_{df}{N(f|KfuKuu_invu,diag(Kff-Qff)*N(u|0,Kuu)} = N(f|0,Sigma0) - Sigma0 = Qnn = Knm*Kmmi*Kmn - """ - KmnKnm = np.dot(Kmn,Kmn.T) - Lm = jitchol(Kmm) - Lmi = chol_inv(Lm) - Kmmi = np.dot(Lmi.T,Lmi) - KmmiKmn = np.dot(Kmmi,Kmn) - Qnn_diag = np.sum(Kmn*KmmiKmn,-2) - LLT0 = Kmm.copy() - - #Kmmi, Lm, Lmi, Kmm_logdet = pdinv(Kmm) - #KmnKnm = np.dot(Kmn, Kmn.T) - #KmmiKmn = np.dot(Kmmi,Kmn) - #Qnn_diag = np.sum(Kmn*KmmiKmn,-2) - #LLT0 = Kmm.copy() - - """ - Posterior approximation: q(f|y) = N(f| mu, Sigma) - Sigma = Diag + P*R.T*R*P.T + K - mu = w + P*Gamma - """ - mu = np.zeros(self.num_data) - LLT = Kmm.copy() - Sigma_diag = Qnn_diag.copy() - - """ - Initial values - Cavity distribution parameters: - q_(g|mu_,sigma2_) = Product{q_i(g|mu_i,sigma2_i)} - sigma_ = 1./tau_ - mu_ = v_/tau_ - """ - self.tau_ = np.empty(self.num_data,dtype=float) - self.v_ = np.empty(self.num_data,dtype=float) - - #Initial values - Marginal moments - z = np.empty(self.num_data,dtype=float) - self.Z_hat = np.empty(self.num_data,dtype=float) - phi = np.empty(self.num_data,dtype=float) - mu_hat = np.empty(self.num_data,dtype=float) - sigma2_hat = np.empty(self.num_data,dtype=float) - - #Approximation - epsilon_np1 = 1 - epsilon_np2 = 1 - self.iterations = 0 - np1 = [self.tau_tilde.copy()] - np2 = [self.v_tilde.copy()] - while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon: - update_order = np.random.permutation(self.num_data) - for i in update_order: - #Cavity distribution parameters - self.tau_[i] = 1./Sigma_diag[i] - self.eta*self.tau_tilde[i] - self.v_[i] = mu[i]/Sigma_diag[i] - self.eta*self.v_tilde[i] - #Marginal moments - self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.noise_model.moments_match(self.data[i],self.tau_[i],self.v_[i]) - #Site parameters update - Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma_diag[i]) - Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma_diag[i]) - self.tau_tilde[i] += Delta_tau - self.v_tilde[i] += Delta_v - #Posterior distribution parameters update - DSYR(LLT,Kmn[:,i].copy(),Delta_tau) #LLT = 
LLT + np.outer(Kmn[:,i],Kmn[:,i])*Delta_tau - L = jitchol(LLT) - #cholUpdate(L,Kmn[:,i]*np.sqrt(Delta_tau)) - V,info = dtrtrs(L,Kmn,lower=1) - Sigma_diag = np.sum(V*V,-2) - si = np.sum(V.T*V[:,i],-1) - mu += (Delta_v-Delta_tau*mu[i])*si - self.iterations += 1 - #Sigma recomputation with Cholesky decompositon - LLT = LLT0 + np.dot(Kmn*self.tau_tilde[None,:],Kmn.T) - L = jitchol(LLT) - V,info = dtrtrs(L,Kmn,lower=1) - V2,info = dtrtrs(L.T,V,lower=0) - Sigma_diag = np.sum(V*V,-2) - Knmv_tilde = np.dot(Kmn,self.v_tilde) - mu = np.dot(V2.T,Knmv_tilde) - epsilon_np1 = sum((self.tau_tilde-np1[-1])**2)/self.num_data - epsilon_np2 = sum((self.v_tilde-np2[-1])**2)/self.num_data - np1.append(self.tau_tilde.copy()) - np2.append(self.v_tilde.copy()) - - self._compute_GP_variables() - - def fit_FITC(self, Kmm, Kmn, Knn_diag, epsilon=1e-3,power_ep=[1.,1.]): - """ - The expectation-propagation algorithm with sparse pseudo-input. - For nomenclature see Naish-Guzman and Holden, 2008. - - :param epsilon: Convergence criterion, maximum squared difference allowed between mean updates to stop iterations (float) - :type epsilon: float - :param power_ep: Power EP parameters - :type power_ep: list of floats - """ - self.epsilon = epsilon - self.eta, self.delta = power_ep - - num_inducing = Kmm.shape[0] - - """ - Prior approximation parameters: - q(f|X) = int_{df}{N(f|KfuKuu_invu,diag(Kff-Qff)*N(u|0,Kuu)} = N(f|0,Sigma0) - Sigma0 = diag(Knn-Qnn) + Qnn, Qnn = Knm*Kmmi*Kmn - """ - Lm = jitchol(Kmm) - Lmi = chol_inv(Lm) - Kmmi = np.dot(Lmi.T,Lmi) - P0 = Kmn.T - KmnKnm = np.dot(P0.T, P0) - KmmiKmn = np.dot(Kmmi,P0.T) - Qnn_diag = np.sum(P0.T*KmmiKmn,-2) - Diag0 = Knn_diag - Qnn_diag - R0 = jitchol(Kmmi).T - - """ - Posterior approximation: q(f|y) = N(f| mu, Sigma) - Sigma = Diag + P*R.T*R*P.T + K - mu = w + P*Gamma - """ - self.w = np.zeros(self.num_data) - self.Gamma = np.zeros(num_inducing) - mu = np.zeros(self.num_data) - P = P0.copy() - R = R0.copy() - Diag = Diag0.copy() - Sigma_diag = Knn_diag - RPT0 = np.dot(R0,P0.T) - - """ - Initial values - Cavity distribution parameters: - q_(g|mu_,sigma2_) = Product{q_i(g|mu_i,sigma2_i)} - sigma_ = 1./tau_ - mu_ = v_/tau_ - """ - self.tau_ = np.empty(self.num_data,dtype=float) - self.v_ = np.empty(self.num_data,dtype=float) - - #Initial values - Marginal moments - z = np.empty(self.num_data,dtype=float) - self.Z_hat = np.empty(self.num_data,dtype=float) - phi = np.empty(self.num_data,dtype=float) - mu_hat = np.empty(self.num_data,dtype=float) - sigma2_hat = np.empty(self.num_data,dtype=float) - - #Approximation - epsilon_np1 = 1 - epsilon_np2 = 1 - self.iterations = 0 - self.np1 = [self.tau_tilde.copy()] - self.np2 = [self.v_tilde.copy()] - while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon: - update_order = np.random.permutation(self.num_data) - for i in update_order: - #Cavity distribution parameters - self.tau_[i] = 1./Sigma_diag[i] - self.eta*self.tau_tilde[i] - self.v_[i] = mu[i]/Sigma_diag[i] - self.eta*self.v_tilde[i] - #Marginal moments - self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.noise_model.moments_match(self.data[i],self.tau_[i],self.v_[i]) - #Site parameters update - Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma_diag[i]) - Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma_diag[i]) - self.tau_tilde[i] += Delta_tau - self.v_tilde[i] += Delta_v - #Posterior distribution parameters update - dtd1 = Delta_tau*Diag[i] + 1. 
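# A minimal sketch, not part of the patch: both sparse fits above start from the low-rank
# prior term Qnn = Knm Kmm^{-1} Kmn (fit_FITC additionally keeps Diag0 = Knn_diag - Qnn_diag).
# The toy RBF covariance and the jitter value below are assumptions for illustration only.
import numpy as np
from scipy.linalg import cholesky, solve_triangular

def rbf(A, B, variance=1.0, lengthscale=1.0):
    sq = np.sum(A**2, 1)[:, None] + np.sum(B**2, 1)[None, :] - 2. * np.dot(A, B.T)
    return variance * np.exp(-0.5 * sq / lengthscale**2)

X = np.random.randn(50, 1)                    # data inputs
Z = np.random.randn(8, 1)                     # inducing inputs
Kmm = rbf(Z, Z) + 1e-8 * np.eye(8)            # jitter keeps the Cholesky well conditioned
Kmn = rbf(Z, X)

Lm = cholesky(Kmm, lower=True)
V = solve_triangular(Lm, Kmn, lower=True)     # V = Lm^{-1} Kmn
Qnn_diag = np.sum(V * V, 0)                   # diag(Knm Kmm^{-1} Kmn), one entry per data point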
- dii = Diag[i] - Diag[i] = dii - (Delta_tau * dii**2.)/dtd1 - pi_ = P[i,:].reshape(1,num_inducing) - P[i,:] = pi_ - (Delta_tau*dii)/dtd1 * pi_ - Rp_i = np.dot(R,pi_.T) - RTR = np.dot(R.T,np.dot(np.eye(num_inducing) - Delta_tau/(1.+Delta_tau*Sigma_diag[i]) * np.dot(Rp_i,Rp_i.T),R)) - R = jitchol(RTR).T - self.w[i] += (Delta_v - Delta_tau*self.w[i])*dii/dtd1 - self.Gamma += (Delta_v - Delta_tau*mu[i])*np.dot(RTR,P[i,:].T) - RPT = np.dot(R,P.T) - Sigma_diag = Diag + np.sum(RPT.T*RPT.T,-1) - mu = self.w + np.dot(P,self.Gamma) - self.iterations += 1 - #Sigma recomptutation with Cholesky decompositon - Iplus_Dprod_i = 1./(1.+ Diag0 * self.tau_tilde) - Diag = Diag0 * Iplus_Dprod_i - P = Iplus_Dprod_i[:,None] * P0 - safe_diag = np.where(Diag0 < self.tau_tilde, self.tau_tilde/(1.+Diag0*self.tau_tilde), (1. - Iplus_Dprod_i)/Diag0) - L = jitchol(np.eye(num_inducing) + np.dot(RPT0,safe_diag[:,None]*RPT0.T)) - R,info = dtrtrs(L,R0,lower=1) - RPT = np.dot(R,P.T) - Sigma_diag = Diag + np.sum(RPT.T*RPT.T,-1) - self.w = Diag * self.v_tilde - self.Gamma = np.dot(R.T, np.dot(RPT,self.v_tilde)) - mu = self.w + np.dot(P,self.Gamma) - epsilon_np1 = sum((self.tau_tilde-self.np1[-1])**2)/self.num_data - epsilon_np2 = sum((self.v_tilde-self.np2[-1])**2)/self.num_data - self.np1.append(self.tau_tilde.copy()) - self.np2.append(self.v_tilde.copy()) - - return self._compute_GP_variables() From 1eb8cc5eab01b9a0448f0bd46e5c1e1ab767e633 Mon Sep 17 00:00:00 2001 From: Max Zwiessele Date: Mon, 24 Feb 2014 09:49:29 +0000 Subject: [PATCH 36/38] variational posterior and prior added, linear updated --- GPy/core/gp.py | 5 +- GPy/core/parameterization/array_core.py | 4 +- GPy/core/parameterization/variational.py | 56 +++++++++++++----- GPy/core/sparse_gp.py | 16 ++--- .../latent_function_inference/posterior.py | 18 +++--- GPy/kern/_src/kern.py | 3 +- GPy/kern/_src/linear.py | 58 ++++++++++--------- GPy/kern/_src/stationary.py | 2 +- GPy/models/bayesian_gplvm.py | 34 +++++------ 9 files changed, 118 insertions(+), 78 deletions(-) diff --git a/GPy/core/gp.py b/GPy/core/gp.py index 13336ef5..d8d1a87a 100644 --- a/GPy/core/gp.py +++ b/GPy/core/gp.py @@ -30,7 +30,10 @@ class GP(Model): super(GP, self).__init__(name) assert X.ndim == 2 - self.X = ObservableArray(X) + if isinstance(X, ObservableArray): + self.X = self.X = X + else: self.X = ObservableArray(X) + self.num_data, self.input_dim = self.X.shape assert Y.ndim == 2 diff --git a/GPy/core/parameterization/array_core.py b/GPy/core/parameterization/array_core.py index dffe2ed1..e8be0f77 100644 --- a/GPy/core/parameterization/array_core.py +++ b/GPy/core/parameterization/array_core.py @@ -28,7 +28,9 @@ class ObservableArray(np.ndarray, Observable): """ __array_priority__ = -1 # Never give back ObservableArray def __new__(cls, input_array): - obj = np.atleast_1d(input_array).view(cls) + if not isinstance(input_array, ObservableArray): + obj = np.atleast_1d(input_array).view(cls) + else: obj = input_array cls.__name__ = "ObservableArray\n " return obj diff --git a/GPy/core/parameterization/variational.py b/GPy/core/parameterization/variational.py index 5fe63052..d1c0faf8 100644 --- a/GPy/core/parameterization/variational.py +++ b/GPy/core/parameterization/variational.py @@ -3,21 +3,54 @@ Created on 6 Nov 2013 @author: maxz ''' + +import numpy as np from parameterized import Parameterized from param import Param from transformations import Logexp -class Normal(Parameterized): +class VariationalPrior(object): + def KL_divergence(self, variational_posterior): + raise 
NotImplementedError, "override this for variational inference of latent space" + + def update_gradients_KL(self, variational_posterior): + """ + updates the gradients for mean and variance **in place** + """ + raise NotImplementedError, "override this for variational inference of latent space" + +class NormalPrior(VariationalPrior): + def KL_divergence(self, variational_posterior): + var_mean = np.square(variational_posterior.mean).sum() + var_S = (variational_posterior.variance - np.log(variational_posterior.variance)).sum() + return 0.5 * (var_mean + var_S) - 0.5 * variational_posterior.input_dim * variational_posterior.num_data + + def update_gradients_KL(self, variational_posterior): + # dL: + variational_posterior.mean.gradient -= variational_posterior.mean + variational_posterior.variance.gradient -= (1. - (1. / (variational_posterior.variance))) * 0.5 + + +class VariationalPosterior(Parameterized): + def __init__(self, means=None, variances=None, name=None, **kw): + super(VariationalPosterior, self).__init__(name=name, **kw) + self.mean = Param("mean", means) + self.variance = Param("variance", variances, Logexp()) + self.add_parameters(self.mean, self.variance) + self.num_data, self.input_dim = self.mean.shape + if self.has_uncertain_inputs(): + assert self.variance.shape == self.mean.shape, "need one variance per sample and dimenion" + + def has_uncertain_inputs(self): + return not self.variance is None + + +class NormalPosterior(VariationalPosterior): ''' - Normal distribution for variational approximations. + NormalPosterior distribution for variational approximations. holds the means and variances for a factorizing multivariate normal distribution ''' - def __init__(self, means, variances, name='latent space'): - Parameterized.__init__(self, name=name) - self.mean = Param("mean", means) - self.variance = Param('variance', variances, Logexp()) - self.add_parameters(self.mean, self.variance) def plot(self, *args): """ @@ -30,8 +63,7 @@ class Normal(Parameterized): from ...plotting.matplot_dep import variational_plots return variational_plots.plot(self,*args) - -class SpikeAndSlab(Parameterized): +class SpikeAndSlab(VariationalPosterior): ''' The SpikeAndSlab distribution for variational approximations. ''' @@ -39,11 +71,9 @@ class SpikeAndSlab(Parameterized): """ binary_prob : the probability of the distribution on the slab part. """ - Parameterized.__init__(self, name=name) - self.mean = Param("mean", means) - self.variance = Param('variance', variances, Logexp()) + super(SpikeAndSlab, self).__init__(means, variances, name) self.gamma = Param("binary_prob",binary_prob,) - self.add_parameters(self.mean, self.variance, self.gamma) + self.add_parameter(self.gamma) def plot(self, *args): """ diff --git a/GPy/core/sparse_gp.py b/GPy/core/sparse_gp.py index 71053867..37f2baf8 100644 --- a/GPy/core/sparse_gp.py +++ b/GPy/core/sparse_gp.py @@ -5,8 +5,9 @@ import numpy as np from ..util.linalg import mdot from gp import GP from parameterization.param import Param -from GPy.inference.latent_function_inference import var_dtc +from ..inference.latent_function_inference import var_dtc from .. 
import likelihoods +from parameterization.variational import NormalPosterior class SparseGP(GP): """ @@ -45,16 +46,14 @@ class SparseGP(GP): self.Z = Param('inducing inputs', Z) self.num_inducing = Z.shape[0] - self.X_variance = X_variance - if self.has_uncertain_inputs(): - assert X_variance.shape == X.shape + self.q = NormalPosterior(X, X_variance) - GP.__init__(self, X, Y, kernel, likelihood, inference_method=inference_method, name=name) + GP.__init__(self, self.q.mean, Y, kernel, likelihood, inference_method=inference_method, name=name) self.add_parameter(self.Z, index=0) self.parameters_changed() def has_uncertain_inputs(self): - return not (self.X_variance is None) + return self.q.has_uncertain_inputs() def parameters_changed(self): if self.has_uncertain_inputs(): @@ -81,7 +80,10 @@ class SparseGP(GP): var = Kxx - mdot(Kx.T, self.posterior.woodbury_inv, Kx) else: Kxx = self.kern.Kdiag(Xnew) - var = Kxx - np.sum(Kx * np.dot(self.posterior.woodbury_inv, Kx), 0) + WKx_old = np.dot(np.atleast_3d(self.posterior.woodbury_inv)[:,:,0], Kx) + WKx = np.tensordot(np.atleast_3d(self.posterior.woodbury_inv), Kx, [0,0]) + import ipdb;ipdb.set_trace() + var = Kxx - np.sum(Kx * WKx, 0) else: Kx = self.kern.psi1(self.Z, Xnew, X_variance_new) mu = np.dot(Kx, self.Cpsi1V) diff --git a/GPy/inference/latent_function_inference/posterior.py b/GPy/inference/latent_function_inference/posterior.py index 73741a13..a996e1df 100644 --- a/GPy/inference/latent_function_inference/posterior.py +++ b/GPy/inference/latent_function_inference/posterior.py @@ -2,7 +2,7 @@ # Licensed under the BSD 3-clause license (see LICENSE.txt) import numpy as np -from ...util.linalg import pdinv, dpotrs, tdot, dtrtrs, dpotri, symmetrify, jitchol, dtrtri +from ...util.linalg import pdinv, dpotrs, dpotri, symmetrify, jitchol class Posterior(object): """ @@ -83,14 +83,15 @@ class Posterior(object): #LiK, _ = dtrtrs(self.woodbury_chol, self._K, lower=1) self._covariance = np.tensordot(np.dot(np.atleast_3d(self.woodbury_inv).T, self._K), self._K, [1,0]).T #self._covariance = self._K - self._K.dot(self.woodbury_inv).dot(self._K) - return self._covariance + return self._covariance.squeeze() @property def precision(self): if self._precision is None: - self._precision = np.zeros(np.atleast_3d(self.covariance).shape) # if one covariance per dimension - for p in xrange(self.covariance.shape[-1]): - self._precision[:,:,p] = pdinv(self.covariance[:,:,p])[0] + cov = np.atleast_3d(self.covariance) + self._precision = np.zeros(cov.shape) # if one covariance per dimension + for p in xrange(cov.shape[-1]): + self._precision[:,:,p] = pdinv(cov[:,:,p])[0] return self._precision @property @@ -98,7 +99,10 @@ class Posterior(object): if self._woodbury_chol is None: #compute woodbury chol from if self._woodbury_inv is not None: - _, _, self._woodbury_chol, _ = pdinv(self._woodbury_inv) + winv = np.atleast_3d(self._woodbury_inv) + self._woodbury_chol = np.zeros(winv.shape) + for p in xrange(winv.shape[-1]): + self._woodbury_chol[:,:,p] = pdinv(winv[:,:,p])[2] #Li = jitchol(self._woodbury_inv) #self._woodbury_chol, _ = dtrtri(Li) #W, _, _, _, = pdinv(self._woodbury_inv) @@ -132,7 +136,7 @@ class Posterior(object): @property def K_chol(self): if self._K_chol is None: - self._K_chol = dportf(self._K) + self._K_chol = jitchol(self._K) return self._K_chol diff --git a/GPy/kern/_src/kern.py b/GPy/kern/_src/kern.py index 3ef231b3..8bd9b6d1 100644 --- a/GPy/kern/_src/kern.py +++ b/GPy/kern/_src/kern.py @@ -127,11 +127,12 @@ from GPy.core.model import Model class 
Kern_check_model(Model): """This is a dummy model class used as a base class for checking that the gradients of a given kernel are implemented correctly. It enables checkgradient() to be called independently on a kernel.""" def __init__(self, kernel=None, dL_dK=None, X=None, X2=None): + from GPy.kern import RBF Model.__init__(self, 'kernel_test_model') num_samples = 20 num_samples2 = 10 if kernel==None: - kernel = GPy.kern.rbf(1) + kernel = RBF(1) if X==None: X = np.random.randn(num_samples, kernel.input_dim) if dL_dK==None: diff --git a/GPy/kern/_src/linear.py b/GPy/kern/_src/linear.py index 61a1dbd3..a66b3705 100644 --- a/GPy/kern/_src/linear.py +++ b/GPy/kern/_src/linear.py @@ -106,51 +106,52 @@ class Linear(Kern): # variational # #---------------------------------------# - def psi0(self, Z, mu, S): - return np.sum(self.variances * self._mu2S(mu, S), 1) + def psi0(self, Z, posterior_variational): + return np.sum(self.variances * self._mu2S(posterior_variational), 1) - def psi1(self, Z, mu, S): - return self.K(mu, Z) #the variance, it does nothing + def psi1(self, Z, posterior_variational): + return self.K(posterior_variational.mean, Z) #the variance, it does nothing - def psi2(self, Z, mu, S): + def psi2(self, Z, posterior_variational): ZA = Z * self.variances - ZAinner = self._ZAinner(mu, S, Z) + ZAinner = self._ZAinner(posterior_variational, Z) return np.dot(ZAinner, ZA.T) - def update_gradients_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z): + def update_gradients_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, posterior_variational, Z): + mu, S = posterior_variational.mean, posterior_variational.variance # psi0: - tmp = dL_dpsi0[:, None] * self._mu2S(mu, S) + tmp = dL_dpsi0[:, None] * self._mu2S(posterior_variational) if self.ARD: grad = tmp.sum(0) else: grad = np.atleast_1d(tmp.sum()) #psi1 self.update_gradients_full(dL_dpsi1, mu, Z) grad += self.variances.gradient #psi2 - tmp = dL_dpsi2[:, :, :, None] * (self._ZAinner(mu, S, Z)[:, :, None, :] * (2. * Z)[None, None, :, :]) + tmp = dL_dpsi2[:, :, :, None] * (self._ZAinner(posterior_variational, Z)[:, :, None, :] * (2. 
* Z)[None, None, :, :]) if self.ARD: grad += tmp.sum(0).sum(0).sum(0) else: grad += tmp.sum() #from Kmm self.update_gradients_full(dL_dKmm, Z, None) self.variances.gradient += grad - def gradients_Z_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z): + def gradients_Z_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, posterior_variational, Z): # Kmm grad = self.gradients_X(dL_dKmm, Z, None) #psi1 - grad += self.gradients_X(dL_dpsi1.T, Z, mu) + grad += self.gradients_X(dL_dpsi1.T, Z, posterior_variational.mean) #psi2 - self._weave_dpsi2_dZ(dL_dpsi2, Z, mu, S, grad) + self._weave_dpsi2_dZ(dL_dpsi2, Z, posterior_variational, grad) return grad - def gradients_muS_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z): - grad_mu, grad_S = np.zeros(mu.shape), np.zeros(mu.shape) + def gradients_q_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, posterior_variational, Z): + grad_mu, grad_S = np.zeros(posterior_variational.mean.shape), np.zeros(posterior_variational.mean.shape) # psi0 - grad_mu += dL_dpsi0[:, None] * (2.0 * mu * self.variances) + grad_mu += dL_dpsi0[:, None] * (2.0 * posterior_variational.mean * self.variances) grad_S += dL_dpsi0[:, None] * self.variances # psi1 grad_mu += (dL_dpsi1[:, :, None] * (Z * self.variances)).sum(1) # psi2 - self._weave_dpsi2_dmuS(dL_dpsi2, Z, mu, S, grad_mu, grad_S) + self._weave_dpsi2_dmuS(dL_dpsi2, Z, posterior_variational, grad_mu, grad_S) return grad_mu, grad_S @@ -159,7 +160,7 @@ class Linear(Kern): #--------------------------------------------------# - def _weave_dpsi2_dmuS(self, dL_dpsi2, Z, mu, S, target_mu, target_S): + def _weave_dpsi2_dmuS(self, dL_dpsi2, Z, pv, target_mu, target_S): # Think N,num_inducing,num_inducing,input_dim ZA = Z * self.variances AZZA = ZA.T[:, None, :, None] * ZA[None, :, None, :] @@ -202,15 +203,16 @@ class Linear(Kern): weave_options = {'headers' : [''], 'extra_compile_args': ['-fopenmp -O3'], #-march=native'], 'extra_link_args' : ['-lgomp']} - + + mu = pv.mean N,num_inducing,input_dim,mu = mu.shape[0],Z.shape[0],mu.shape[1],param_to_array(mu) weave.inline(code, support_code=support_code, libraries=['gomp'], arg_names=['N','num_inducing','input_dim','mu','AZZA','AZZA_2','target_mu','target_S','dL_dpsi2'], type_converters=weave.converters.blitz,**weave_options) - def _weave_dpsi2_dZ(self, dL_dpsi2, Z, mu, S, target): - AZA = self.variances*self._ZAinner(mu, S, Z) + def _weave_dpsi2_dZ(self, dL_dpsi2, Z, pv, target): + AZA = self.variances*self._ZAinner(pv, Z) code=""" int n,m,mm,q; #pragma omp parallel for private(n,mm,q) @@ -232,21 +234,21 @@ class Linear(Kern): 'extra_compile_args': ['-fopenmp -O3'], #-march=native'], 'extra_link_args' : ['-lgomp']} - N,num_inducing,input_dim = mu.shape[0],Z.shape[0],mu.shape[1] - mu = param_to_array(mu) + N,num_inducing,input_dim = pv.mean.shape[0],Z.shape[0],pv.mean.shape[1] + mu = param_to_array(pv.mean) weave.inline(code, support_code=support_code, libraries=['gomp'], arg_names=['N','num_inducing','input_dim','AZA','target','dL_dpsi2'], type_converters=weave.converters.blitz,**weave_options) - def _mu2S(self, mu, S): - return np.square(mu) + S + def _mu2S(self, pv): + return np.square(pv.mean) + pv.variance - def _ZAinner(self, mu, S, Z): + def _ZAinner(self, pv, Z): ZA = Z*self.variances - inner = (mu[:, None, :] * mu[:, :, None]) - diag_indices = np.diag_indices(mu.shape[1], 2) - inner[:, diag_indices[0], diag_indices[1]] += S + inner = (pv.mean[:, None, :] * pv.mean[:, :, None]) + diag_indices = np.diag_indices(pv.mean.shape[1], 2) + 
inner[:, diag_indices[0], diag_indices[1]] += pv.variance return np.dot(ZA, inner).swapaxes(0, 1) # NOTE: self.ZAinner \in [num_inducing x N x input_dim]! diff --git a/GPy/kern/_src/stationary.py b/GPy/kern/_src/stationary.py index 7cc2e695..a6ff9424 100644 --- a/GPy/kern/_src/stationary.py +++ b/GPy/kern/_src/stationary.py @@ -18,7 +18,7 @@ class Stationary(Kern): lengthscale = np.ones(1) else: lengthscale = np.asarray(lengthscale) - assert lengthscale.size == 1 "Only lengthscale needed for non-ARD kernel" + assert lengthscale.size == 1, "Only lengthscale needed for non-ARD kernel" else: if lengthscale is not None: lengthscale = np.asarray(lengthscale) diff --git a/GPy/models/bayesian_gplvm.py b/GPy/models/bayesian_gplvm.py index cc68de68..7b09e0b1 100644 --- a/GPy/models/bayesian_gplvm.py +++ b/GPy/models/bayesian_gplvm.py @@ -8,7 +8,7 @@ from ..core import SparseGP from ..likelihoods import Gaussian from ..inference.optimization import SCG from ..util import linalg -from ..core.parameterization.variational import Normal +from ..core.parameterization.variational import NormalPosterior, NormalPrior class BayesianGPLVM(SparseGP, GPLVM): """ @@ -29,7 +29,7 @@ class BayesianGPLVM(SparseGP, GPLVM): self.init = init if X_variance is None: - X_variance = np.clip((np.ones_like(X) * 0.5) + .01 * np.random.randn(*X.shape), 0.001, 1) + X_variance = np.random.uniform(0,.1,X.shape) if Z is None: Z = np.random.permutation(X.copy())[:num_inducing] @@ -40,7 +40,9 @@ class BayesianGPLVM(SparseGP, GPLVM): if likelihood is None: likelihood = Gaussian() - self.q = Normal(X, X_variance) + self.q = NormalPosterior(X, X_variance) + self.variational_prior = NormalPrior() + SparseGP.__init__(self, X, Y, Z, kernel, likelihood, inference_method, X_variance, name, **kwargs) self.add_parameter(self.q, index=0) #self.ensure_default_constraints() @@ -57,24 +59,17 @@ class BayesianGPLVM(SparseGP, GPLVM): self.init = state.pop() SparseGP._setstate(self, state) - def KL_divergence(self): - var_mean = np.square(self.X).sum() - var_S = np.sum(self.X_variance - np.log(self.X_variance)) - return 0.5 * (var_mean + var_S) - 0.5 * self.input_dim * self.num_data - def parameters_changed(self): super(BayesianGPLVM, self).parameters_changed() - - self._log_marginal_likelihood -= self.KL_divergence() - dL_dmu, dL_dS = self.kern.gradients_q_variational(posterior_variational=self.q, Z=self.Z, **self.grad_dict) - - # dL: - self.q.mean.gradient = dL_dmu - self.q.variance.gradient = dL_dS - - # dKL: - self.q.mean.gradient -= self.X - self.q.variance.gradient -= (1. - (1. / (self.X_variance))) * 0.5 + self._log_marginal_likelihood -= self.variational_prior.KL_divergence(self.q) + + # TODO: This has to go into kern + # maybe a update_gradients_q_variational? + self.q.mean.gradient, self.q.variance.gradient = self.kern.gradients_q_variational(posterior_variational=self.q, Z=self.Z, **self.grad_dict) + + # update for the KL divergence + self.variational_prior.update_gradients_KL(self.q) + def plot_latent(self, plot_inducing=True, *args, **kwargs): """ @@ -147,6 +142,7 @@ class BayesianGPLVM(SparseGP, GPLVM): """ See GPy.plotting.matplot_dep.dim_reduction_plots.plot_steepest_gradient_map """ + import sys assert "matplotlib" in sys.modules, "matplotlib package has not been imported." 
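# A minimal sketch, not part of the patch, of the linear-kernel psi statistics defined in
# linear.py above, written as plain numpy expectations under a factorised Gaussian q(X)
# with mean mu (N x Q) and variance S (N x Q); a stands for the vector of kernel variances.
import numpy as np

N, M, Q = 5, 4, 3
mu = np.random.randn(N, Q)
S = np.random.rand(N, Q)
Z = np.random.randn(M, Q)
a = np.random.rand(Q)

psi0 = np.sum(a * (mu**2 + S), axis=1)            # E_q[k(x_n, x_n)],             shape (N,)
psi1 = (mu * a).dot(Z.T)                          # E_q[k(x_n, z_m)],             shape (N, M)

ZA = Z * a                                        # (M, Q)
xxT = mu[:, :, None] * mu[:, None, :]             # (N, Q, Q) second moments, mean part
xxT[:, np.arange(Q), np.arange(Q)] += S           # add the variances on the diagonal
psi2 = np.einsum('mq,nqr,kr->nmk', ZA, xxT, ZA)   # E_q[k(z_m, x_n) k(x_n, z_k)], shape (N, M, M)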
from ..plotting.matplot_dep import dim_reduction_plots From f311bfdf17c78bc4f56f03514d4e28b26e2e5057 Mon Sep 17 00:00:00 2001 From: Zhenwen Dai Date: Mon, 24 Feb 2014 11:33:58 +0000 Subject: [PATCH 37/38] changed to 'update_gradients_q_variational' --- GPy/core/parameterization/variational.py | 4 ++-- GPy/kern/_src/rbf.py | 7 ++++--- GPy/models/bayesian_gplvm.py | 4 +--- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/GPy/core/parameterization/variational.py b/GPy/core/parameterization/variational.py index d1c0faf8..05ce2109 100644 --- a/GPy/core/parameterization/variational.py +++ b/GPy/core/parameterization/variational.py @@ -63,7 +63,7 @@ class NormalPosterior(VariationalPosterior): from ...plotting.matplot_dep import variational_plots return variational_plots.plot(self,*args) -class SpikeAndSlab(VariationalPosterior): +class SpikeAndSlabPosterior(VariationalPosterior): ''' The SpikeAndSlab distribution for variational approximations. ''' @@ -71,7 +71,7 @@ class SpikeAndSlab(VariationalPosterior): """ binary_prob : the probability of the distribution on the slab part. """ - super(SpikeAndSlab, self).__init__(means, variances, name) + super(SpikeAndSlabPosterior, self).__init__(means, variances, name) self.gamma = Param("binary_prob",binary_prob,) self.add_parameter(self.gamma) diff --git a/GPy/kern/_src/rbf.py b/GPy/kern/_src/rbf.py index 0c8588a2..e23e9e2c 100644 --- a/GPy/kern/_src/rbf.py +++ b/GPy/kern/_src/rbf.py @@ -182,7 +182,7 @@ class RBF(Kern): return grad - def gradients_q_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, posterior_variational): + def update_gradients_q_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, posterior_variational): mu = posterior_variational.mean S = posterior_variational.variance self._psi_computations(Z, mu, S) @@ -194,8 +194,9 @@ class RBF(Kern): tmp = self._psi2[:, :, :, None] / self.lengthscale2 / self._psi2_denom grad_mu += -2.*(dL_dpsi2[:, :, :, None] * tmp * self._psi2_mudist).sum(1).sum(1) grad_S += (dL_dpsi2[:, :, :, None] * tmp * (2.*self._psi2_mudist_sq - 1)).sum(1).sum(1) - - return grad_mu, grad_S + + posterior_variational.mean.gradient = grad_mu + posterior_variational.variance.gradient = grad_S def gradients_X(self, dL_dK, X, X2=None): #if self._X is None or X.base is not self._X.base or X2 is not None: diff --git a/GPy/models/bayesian_gplvm.py b/GPy/models/bayesian_gplvm.py index 7b09e0b1..a8d643b9 100644 --- a/GPy/models/bayesian_gplvm.py +++ b/GPy/models/bayesian_gplvm.py @@ -63,9 +63,7 @@ class BayesianGPLVM(SparseGP, GPLVM): super(BayesianGPLVM, self).parameters_changed() self._log_marginal_likelihood -= self.variational_prior.KL_divergence(self.q) - # TODO: This has to go into kern - # maybe a update_gradients_q_variational? 
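# A minimal sketch, not part of the patch: parameters_changed above subtracts the KL term
# computed by NormalPrior.KL_divergence, which is the closed-form KL( N(mu, S) || N(0, I) )
# summed over all N x Q entries of the factorised posterior. The check below mirrors that
# formula; the variable names are illustrative only.
import numpy as np

N, Q = 10, 3
mean = np.random.randn(N, Q)
variance = np.random.rand(N, Q) + 0.1

kl = 0.5 * (np.square(mean).sum() + (variance - np.log(variance)).sum()) - 0.5 * N * Q
kl_check = 0.5 * np.sum(np.square(mean) + variance - np.log(variance) - 1.)   # elementwise form
assert np.allclose(kl, kl_check)

# gradients of the KL term, as applied (with a minus sign) by update_gradients_KL above
dKL_dmean = mean
dKL_dvariance = 0.5 * (1. - 1. / variance)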
- self.q.mean.gradient, self.q.variance.gradient = self.kern.gradients_q_variational(posterior_variational=self.q, Z=self.Z, **self.grad_dict) + self.kern.update_gradients_q_variational(posterior_variational=self.q, Z=self.Z, **self.grad_dict) # update for the KL divergence self.variational_prior.update_gradients_KL(self.q) From 8dbb65ab504fc6cd2c8743646e5c3e1ca30d571c Mon Sep 17 00:00:00 2001 From: Max Zwiessele Date: Mon, 24 Feb 2014 11:34:22 +0000 Subject: [PATCH 38/38] 2d plotting --- GPy/core/sparse_gp.py | 10 ++-- GPy/examples/dimensionality_reduction.py | 66 ++++++++++++------------ GPy/plotting/matplot_dep/models_plots.py | 18 +++---- GPy/testing/index_operations_tests.py | 5 ++ 4 files changed, 50 insertions(+), 49 deletions(-) diff --git a/GPy/core/sparse_gp.py b/GPy/core/sparse_gp.py index 37f2baf8..bb3116ba 100644 --- a/GPy/core/sparse_gp.py +++ b/GPy/core/sparse_gp.py @@ -77,13 +77,11 @@ class SparseGP(GP): mu = np.dot(Kx.T, self.posterior.woodbury_vector) if full_cov: Kxx = self.kern.K(Xnew) - var = Kxx - mdot(Kx.T, self.posterior.woodbury_inv, Kx) + #var = Kxx - mdot(Kx.T, self.posterior.woodbury_inv, Kx) + var = Kxx - np.tensordot(np.dot(np.atleast_3d(self.posterior.woodbury_inv).T, Kx).T, Kx, [1,0]).swapaxes(1,2) else: Kxx = self.kern.Kdiag(Xnew) - WKx_old = np.dot(np.atleast_3d(self.posterior.woodbury_inv)[:,:,0], Kx) - WKx = np.tensordot(np.atleast_3d(self.posterior.woodbury_inv), Kx, [0,0]) - import ipdb;ipdb.set_trace() - var = Kxx - np.sum(Kx * WKx, 0) + var = (Kxx - np.sum(np.dot(np.atleast_3d(self.posterior.woodbury_inv).T, Kx) * Kx[None,:,:], 1)).T else: Kx = self.kern.psi1(self.Z, Xnew, X_variance_new) mu = np.dot(Kx, self.Cpsi1V) @@ -93,7 +91,7 @@ class SparseGP(GP): Kxx = self.kern.psi0(self.Z, Xnew, X_variance_new) psi2 = self.kern.psi2(self.Z, Xnew, X_variance_new) var = Kxx - np.sum(np.sum(psi2 * Kmmi_LmiBLmi[None, :, :], 1), 1) - return mu, var[:,None] + return mu, var def _getstate(self): diff --git a/GPy/examples/dimensionality_reduction.py b/GPy/examples/dimensionality_reduction.py index 3ba54d34..b6030eb7 100644 --- a/GPy/examples/dimensionality_reduction.py +++ b/GPy/examples/dimensionality_reduction.py @@ -89,7 +89,7 @@ def sparse_gplvm_oil(optimize=True, verbose=0, plot=True, N=100, Q=6, num_induci Y = Y - Y.mean(0) Y /= Y.std(0) # Create the model - kernel = GPy.kern.RBF(Q, ARD=True) + GPy.kern.bias(Q) + kernel = GPy.kern.RBF(Q, ARD=True) + GPy.kern.Bias(Q) m = GPy.models.SparseGPLVM(Y, Q, kernel=kernel, num_inducing=num_inducing) m.data_labels = data['Y'][:N].argmax(axis=1) @@ -139,7 +139,7 @@ def swiss_roll(optimize=True, verbose=1, plot=True, N=1000, num_inducing=15, Q=4 (1 - var))) + .001 Z = _np.random.permutation(X)[:num_inducing] - kernel = GPy.kern.RBF(Q, ARD=True) + GPy.kern.bias(Q, _np.exp(-2)) + GPy.kern.white(Q, _np.exp(-2)) + kernel = GPy.kern.RBF(Q, ARD=True) + GPy.kern.Bias(Q, _np.exp(-2)) + GPy.kern.White(Q, _np.exp(-2)) m = BayesianGPLVM(Y, Q, X=X, X_variance=S, num_inducing=num_inducing, Z=Z, kernel=kernel) m.data_colors = c @@ -159,28 +159,26 @@ def swiss_roll(optimize=True, verbose=1, plot=True, N=1000, num_inducing=15, Q=4 def bgplvm_oil(optimize=True, verbose=1, plot=True, N=200, Q=7, num_inducing=40, max_iters=1000, **k): import GPy - from GPy.likelihoods import Gaussian from matplotlib import pyplot as plt _np.random.seed(0) data = GPy.util.datasets.oil() - kernel = GPy.kern.RBF_inv(Q, 1., [.1] * Q, ARD=True) + GPy.kern.bias(Q, _np.exp(-2)) + kernel = GPy.kern.RBF(Q, 1., [.1] * Q, ARD=True)# + GPy.kern.Bias(Q, _np.exp(-2)) Y = 
data['X'][:N] - Yn = Gaussian(Y, normalize=True) - m = GPy.models.BayesianGPLVM(Yn, Q, kernel=kernel, num_inducing=num_inducing, **k) + m = GPy.models.BayesianGPLVM(Y, Q, kernel=kernel, num_inducing=num_inducing, **k) m.data_labels = data['Y'][:N].argmax(axis=1) - m['noise'] = Yn.Y.var() / 100. + m['.*noise.var'] = Y.var() / 100. if optimize: m.optimize('scg', messages=verbose, max_iters=max_iters, gtol=.05) if plot: - y = m.likelihood.Y[0, :] + y = m.Y[0, :] fig, (latent_axes, sense_axes) = plt.subplots(1, 2) m.plot_latent(ax=latent_axes) - data_show = GPy.util.visualize.vector_show(y) - lvm_visualizer = GPy.util.visualize.lvm_dimselect(m.X[0, :], # @UnusedVariable + data_show = GPy.plotting.matplot_dep.visualize.vector_show(y) + lvm_visualizer = GPy.plotting.matplot_dep.visualize.lvm_dimselect(m.X[0, :], # @UnusedVariable m, data_show, latent_axes=latent_axes, sense_axes=sense_axes) raw_input('Press enter to finish') plt.close(fig) @@ -190,8 +188,8 @@ def _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim=False): _np.random.seed(1234) x = _np.linspace(0, 4 * _np.pi, N)[:, None] - s1 = _np.vectorize(lambda x: -_np.sin(x)) - s2 = _np.vectorize(lambda x: _np.cos(x)) + s1 = _np.vectorize(lambda x: -_np.sin(_np.exp(x))) + s2 = _np.vectorize(lambda x: _np.cos(x)**2) s3 = _np.vectorize(lambda x:-_np.exp(-_np.cos(2 * x))) sS = _np.vectorize(lambda x: x*_np.sin(x)) @@ -328,7 +326,7 @@ def mrd_simulation(optimize=True, verbose=True, plot=True, plot_sim=True, **kw): _, _, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim) likelihood_list = [Gaussian(x, normalize=True) for x in Ylist] - k = kern.linear(Q, ARD=True) + kern.bias(Q, _np.exp(-2)) + kern.white(Q, _np.exp(-2)) + k = kern.Linear(Q, ARD=True) + kern.Bias(Q, _np.exp(-2)) + kern.White(Q, _np.exp(-2)) m = MRD(likelihood_list, input_dim=Q, num_inducing=num_inducing, kernels=k, initx="", initz='permute', **kw) m.ensure_default_constraints() @@ -355,15 +353,15 @@ def brendan_faces(optimize=True, verbose=True, plot=True): m = GPy.models.GPLVM(Yn, Q) # optimize - m.constrain('rbf|noise|white', GPy.core.transformations.logexp_clipped()) + m.constrain('rbf|noise|white', GPy.transformations.LogexpClipped()) if optimize: m.optimize('scg', messages=verbose, max_iters=1000) if plot: ax = m.plot_latent(which_indices=(0, 1)) y = m.likelihood.Y[0, :] - data_show = GPy.util.visualize.image_show(y[None, :], dimensions=(20, 28), transpose=True, order='F', invert=False, scale=False) - GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax) + data_show = GPy.plotting.matplot_dep.visualize.image_show(y[None, :], dimensions=(20, 28), transpose=True, order='F', invert=False, scale=False) + GPy.plotting.matplot_dep.visualize.lvm(m.X[0, :].copy(), m, data_show, ax) raw_input('Press enter to finish') return m @@ -382,8 +380,8 @@ def olivetti_faces(optimize=True, verbose=True, plot=True): if plot: ax = m.plot_latent(which_indices=(0, 1)) y = m.likelihood.Y[0, :] - data_show = GPy.util.visualize.image_show(y[None, :], dimensions=(112, 92), transpose=False, invert=False, scale=False) - GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax) + data_show = GPy.plotting.matplot_dep.visualize.image_show(y[None, :], dimensions=(112, 92), transpose=False, invert=False, scale=False) + GPy.plotting.matplot_dep.visualize.lvm(m.X[0, :].copy(), m, data_show, ax) raw_input('Press enter to finish') return m @@ -398,8 +396,8 @@ def stick_play(range=None, frame_rate=15, optimize=False, verbose=True, plot=Tru Y = data['Y'][range[0]:range[1], :].copy() if 
plot: y = Y[0, :] - data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect']) - GPy.util.visualize.data_play(Y, data_show, frame_rate) + data_show = GPy.plotting.matplot_dep.visualize.stick_show(y[None, :], connect=data['connect']) + GPy.plotting.matplot_dep.visualize.data_play(Y, data_show, frame_rate) return Y def stick(kernel=None, optimize=True, verbose=True, plot=True): @@ -410,12 +408,12 @@ def stick(kernel=None, optimize=True, verbose=True, plot=True): # optimize m = GPy.models.GPLVM(data['Y'], 2, kernel=kernel) if optimize: m.optimize(messages=verbose, max_f_eval=10000) - if plot and GPy.util.visualize.visual_available: + if plot and GPy.plotting.matplot_dep.visualize.visual_available: plt.clf ax = m.plot_latent() y = m.likelihood.Y[0, :] - data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect']) - GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax) + data_show = GPy.plotting.matplot_dep.visualize.stick_show(y[None, :], connect=data['connect']) + GPy.plotting.matplot_dep.visualize.lvm(m.X[0, :].copy(), m, data_show, ax) raw_input('Press enter to finish') return m @@ -429,12 +427,12 @@ def bcgplvm_linear_stick(kernel=None, optimize=True, verbose=True, plot=True): mapping = GPy.mappings.Linear(data['Y'].shape[1], 2) m = GPy.models.BCGPLVM(data['Y'], 2, kernel=kernel, mapping=mapping) if optimize: m.optimize(messages=verbose, max_f_eval=10000) - if plot and GPy.util.visualize.visual_available: + if plot and GPy.plotting.matplot_dep.visualize.visual_available: plt.clf ax = m.plot_latent() y = m.likelihood.Y[0, :] - data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect']) - GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax) + data_show = GPy.plotting.matplot_dep.visualize.stick_show(y[None, :], connect=data['connect']) + GPy.plotting.matplot_dep.visualize.lvm(m.X[0, :].copy(), m, data_show, ax) raw_input('Press enter to finish') return m @@ -449,12 +447,12 @@ def bcgplvm_stick(kernel=None, optimize=True, verbose=True, plot=True): mapping = GPy.mappings.Kernel(X=data['Y'], output_dim=2, kernel=back_kernel) m = GPy.models.BCGPLVM(data['Y'], 2, kernel=kernel, mapping=mapping) if optimize: m.optimize(messages=verbose, max_f_eval=10000) - if plot and GPy.util.visualize.visual_available: + if plot and GPy.plotting.matplot_dep.visualize.visual_available: plt.clf ax = m.plot_latent() y = m.likelihood.Y[0, :] - data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect']) - GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax) + data_show = GPy.plotting.matplot_dep.visualize.stick_show(y[None, :], connect=data['connect']) + GPy.plotting.matplot_dep.visualize.lvm(m.X[0, :].copy(), m, data_show, ax) raw_input('Press enter to finish') return m @@ -480,7 +478,7 @@ def stick_bgplvm(model=None, optimize=True, verbose=True, plot=True): data = GPy.util.datasets.osu_run1() Q = 6 - kernel = GPy.kern.RBF(Q, ARD=True) + GPy.kern.bias(Q, _np.exp(-2)) + GPy.kern.white(Q, _np.exp(-2)) + kernel = GPy.kern.RBF(Q, ARD=True) + GPy.kern.Bias(Q, _np.exp(-2)) + GPy.kern.White(Q, _np.exp(-2)) m = BayesianGPLVM(data['Y'], Q, init="PCA", num_inducing=20, kernel=kernel) # optimize m.ensure_default_constraints() @@ -491,8 +489,8 @@ def stick_bgplvm(model=None, optimize=True, verbose=True, plot=True): plt.sca(latent_axes) m.plot_latent() y = m.likelihood.Y[0, :].copy() - data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect']) - GPy.util.visualize.lvm_dimselect(m.X[0, :].copy(), m, data_show, 
latent_axes=latent_axes, sense_axes=sense_axes) + data_show = GPy.plotting.matplot_dep.visualize.stick_show(y[None, :], connect=data['connect']) + GPy.plotting.matplot_dep.visualize.lvm_dimselect(m.X[0, :].copy(), m, data_show, latent_axes=latent_axes, sense_axes=sense_axes) raw_input('Press enter to finish') return m @@ -511,8 +509,8 @@ def cmu_mocap(subject='35', motion=['01'], in_place=True, optimize=True, verbose if plot: ax = m.plot_latent() y = m.likelihood.Y[0, :] - data_show = GPy.util.visualize.skeleton_show(y[None, :], data['skel']) - lvm_visualizer = GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax) + data_show = GPy.plotting.matplot_dep.visualize.skeleton_show(y[None, :], data['skel']) + lvm_visualizer = GPy.plotting.matplot_dep.visualize.lvm(m.X[0, :].copy(), m, data_show, ax) raw_input('Press enter to finish') lvm_visualizer.close() diff --git a/GPy/plotting/matplot_dep/models_plots.py b/GPy/plotting/matplot_dep/models_plots.py index 59c32775..3d019bfd 100644 --- a/GPy/plotting/matplot_dep/models_plots.py +++ b/GPy/plotting/matplot_dep/models_plots.py @@ -57,8 +57,8 @@ def plot_fit(model, plot_limits=None, which_data_rows='all', fig = pb.figure(num=fignum) ax = fig.add_subplot(111) - X, Y = param_to_array(model.X, model.Y) - if model.has_uncertain_inputs(): X_variance = model.X_variance + X, Y, Z = param_to_array(model.X, model.Y, model.Z) + if model.has_uncertain_inputs(): X_variance = param_to_array(model.q.variance) #work out what the inputs are for plotting (1D or 2D) fixed_dims = np.array([i for i,v in fixed_inputs]) @@ -97,10 +97,10 @@ def plot_fit(model, plot_limits=None, which_data_rows='all', #add error bars for uncertain (if input uncertainty is being modelled) - if hasattr(model,"has_uncertain_inputs") and model.has_uncertain_inputs(): - ax.errorbar(X[which_data_rows, free_dims].flatten(), Y[which_data_rows, which_data_ycols].flatten(), - xerr=2 * np.sqrt(X_variance[which_data_rows, free_dims].flatten()), - ecolor='k', fmt=None, elinewidth=.5, alpha=.5) + #if hasattr(model,"has_uncertain_inputs") and model.has_uncertain_inputs(): + # ax.errorbar(X[which_data_rows, free_dims].flatten(), Y[which_data_rows, which_data_ycols].flatten(), + # xerr=2 * np.sqrt(X_variance[which_data_rows, free_dims].flatten()), + # ecolor='k', fmt=None, elinewidth=.5, alpha=.5) #set the limits of the plot to some sensible values @@ -112,7 +112,7 @@ def plot_fit(model, plot_limits=None, which_data_rows='all', #add inducing inputs (if a sparse model is used) if hasattr(model,"Z"): #Zu = model.Z[:,free_dims] * model._Xscale[:,free_dims] + model._Xoffset[:,free_dims] - Zu = param_to_array(model.Z[:,free_dims]) + Zu = Z[:,free_dims] z_height = ax.get_ylim()[0] ax.plot(Zu, np.zeros_like(Zu) + z_height, 'r|', mew=1.5, markersize=12) @@ -136,7 +136,7 @@ def plot_fit(model, plot_limits=None, which_data_rows='all', Y = Y else: m, _, _, _ = model.predict(Xgrid) - Y = model.data + Y = Y for d in which_data_ycols: m_d = m[:,d].reshape(resolution, resolution).T ax.contour(x, y, m_d, levels, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet) @@ -152,7 +152,7 @@ def plot_fit(model, plot_limits=None, which_data_rows='all', #add inducing inputs (if a sparse model is used) if hasattr(model,"Z"): #Zu = model.Z[:,free_dims] * model._Xscale[:,free_dims] + model._Xoffset[:,free_dims] - Zu = model.Z[:,free_dims] + Zu = Z[:,free_dims] ax.plot(Zu[:,free_dims[0]], Zu[:,free_dims[1]], 'wo') else: diff --git a/GPy/testing/index_operations_tests.py b/GPy/testing/index_operations_tests.py index 171db5cc..64b0c908 
100644 --- a/GPy/testing/index_operations_tests.py +++ b/GPy/testing/index_operations_tests.py @@ -30,6 +30,11 @@ class Test(unittest.TestCase): self.assertListEqual(self.param_index[two].tolist(), [0,3]) self.assertListEqual(self.param_index[one].tolist(), [1]) + def test_shift_right(self): + self.param_index.shift_right(5, 2) + self.assertListEqual(self.param_index[three].tolist(), [2,4,9]) + self.assertListEqual(self.param_index[two].tolist(), [0,7]) + self.assertListEqual(self.param_index[one].tolist(), [3]) def test_index_view(self): #=======================================================================
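# A minimal sketch, not part of the patch: the diagonal predictive variance used for the
# sparse GP above is diag(Kxx) - diag(Kx^T W Kx), with W the Woodbury inverse and Kx the
# cross-covariance K(Z, Xnew). The einsum form below is equivalent to the broadcasting
# expression in sparse_gp.py for a single output; all names here are illustrative only.
import numpy as np

M, Nnew = 6, 4
A = np.random.randn(M, M)
W = A.dot(A.T)                   # stand-in for the symmetric Woodbury inverse
Kx = np.random.randn(M, Nnew)    # stand-in for K(Z, Xnew)
Kxx_diag = np.ones(Nnew)         # stand-in for Kdiag(Xnew)

var = Kxx_diag - np.einsum('mn,mk,kn->n', Kx, W, Kx)

var_loop = np.array([Kxx_diag[n] - Kx[:, n].dot(W).dot(Kx[:, n]) for n in range(Nnew)])
assert np.allclose(var, var_loop)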