diff --git a/GPy/core/model.py b/GPy/core/model.py
index a1b2abe4..57d41602 100644
--- a/GPy/core/model.py
+++ b/GPy/core/model.py
@@ -260,17 +260,18 @@ class Model(Parameterized):
         """
         positive_strings = ['variance', 'lengthscale', 'precision', 'kappa', 'sensitivity']
         # param_names = self._get_param_names()
-        for s in positive_strings:
-            paramlist = self.grep_param_names(".*"+s)
-            for param in paramlist:
-                for p in param.flattened_parameters:
-                    rav_i = set(self._raveled_index_for(p))
-                    for constraint in self.constraints.iter_properties():
-                        rav_i -= set(self._constraint_indices(p, constraint))
-                    rav_i -= set(np.nonzero(self._fixes_for(p)!=UNFIXED)[0])
-                    ind = self._backtranslate_index(p, np.array(list(rav_i), dtype=int))
-                    if ind.size != 0:
-                        p[np.unravel_index(ind, p.shape)].constrain_positive()
+
+#        for s in positive_strings:
+#            paramlist = self.grep_param_names(".*"+s)
+#            for param in paramlist:
+#                for p in param.flattened_parameters:
+#                    rav_i = set(self._raveled_index_for(p))
+#                    for constraint in self.constraints.iter_properties():
+#                        rav_i -= set(self._constraint_indices(p, constraint))
+#                    rav_i -= set(np.nonzero(self._fixes_for(p)!=UNFIXED)[0])
+#                    ind = self._backtranslate_index(p, np.array(list(rav_i), dtype=int))
+#                    if ind.size != 0:
+#                        p[np.unravel_index(ind, p.shape)].constrain_positive(warning=warning)
 #        if paramlist:
 #            self.__getitem__(None, paramlist).constrain_positive(warning=warning)
 #        currently_constrained = self.all_constrained_indices()
@@ -380,7 +381,7 @@ class Model(Parameterized):
             sgd.run()
         self.optimization_runs.append(sgd)

-    def checkgrad(self, target_param=None, verbose=False, step=1e-6, tolerance=1e-3):
+    def _checkgrad(self, target_param=None, verbose=False, step=1e-6, tolerance=1e-3):
         """
         Check the gradient of the model by comparing to a numerical
         estimate. If the verbose flag is passed, individual
@@ -405,18 +406,18 @@ class Model(Parameterized):
             dx = step * np.sign(np.random.uniform(-1, 1, x.size))

             # evaluate around the point x
-            f1, g1 = self.objective_and_gradients(x + dx)
-            f2, g2 = self.objective_and_gradients(x - dx)
+            f1 = self.objective_function(x + dx)
+            f2 = self.objective_function(x - dx)
             gradient = self.objective_function_gradients(x)

             numerical_gradient = (f1 - f2) / (2 * dx)
             global_ratio = (f1 - f2) / (2 * np.dot(dx, np.where(gradient==0, 1e-32, gradient)))
-
+
             return (np.abs(1. - global_ratio) < tolerance) or (np.abs(gradient - numerical_gradient).mean() < tolerance)
         else:
             # check the gradient of each parameter individually, and do some pretty printing
             try:
-                names = self._get_param_names_transformed()
+                names = self._get_param_names()
             except NotImplementedError:
                 names = ['Variable %i' % i for i in range(len(x))]
             # Prepare for pretty-printing
@@ -430,39 +431,46 @@ class Model(Parameterized):
             header_string = map(lambda x: '|'.join(x), [header_string])
             separator = '-' * len(header_string[0])
             print '\n'.join([header_string[0], separator])
-
             if target_param is None:
                 param_list = range(len(x))
             else:
-                param_list = self.grep_param_names(target_param, transformed=True, search=True)
-                if not np.any(param_list):
+                param_list = self._raveled_index_for(target_param)
+                if self._has_fixes():
+                    param_list = np.intersect1d(param_list, np.r_[:self.size][self._fixes_], True)
+
+            if param_list.size == 0:
                 print "No free parameters to check"
                 return
-
-            for i in param_list:
+            gradient = self.objective_function_gradients(x)
+            gradient = np.where(gradient==0, 1e-312, gradient)
+            ret = True
+            for i, ind in enumerate(param_list):
                 xx = x.copy()
-                xx[i] += step
-                f1, g1 = self.objective_and_gradients(xx)
-                xx[i] -= 2.*step
-                f2, g2 = self.objective_and_gradients(xx)
-                gradient = self.objective_function_gradients(x)[i]
-
+                xx[ind] += step
+                f1 = self.objective_function(xx)
+                xx[ind] -= 2.*step
+                f2 = self.objective_function(xx)
+
                 numerical_gradient = (f1 - f2) / (2 * step)
-                ratio = (f1 - f2) / (2 * step * np.where(gradient==0, 1e-312, gradient))
-                difference = np.abs((f1 - f2) / 2 / step - gradient)
+                ratio = (f1 - f2) / (2 * step * gradient[ind])
+                difference = np.abs((f1 - f2) / 2 / step - gradient[ind])

                 if (np.abs(1. - ratio) < tolerance) or np.abs(difference) < tolerance:
-                    formatted_name = "\033[92m {0} \033[0m".format(names[i])
+                    formatted_name = "\033[92m {0} \033[0m".format(names[ind])
+                    ret &= True
                 else:
-                    formatted_name = "\033[91m {0} \033[0m".format(names[i])
+                    formatted_name = "\033[91m {0} \033[0m".format(names[ind])
+                    ret &= False
+
                 r = '%.6f' % float(ratio)
                 d = '%.6f' % float(difference)
-                g = '%.6f' % gradient
+                g = '%.6f' % gradient[ind]
                 ng = '%.6f' % float(numerical_gradient)
                 grad_string = "{0:<{c0}}|{1:^{c1}}|{2:^{c2}}|{3:^{c3}}|{4:^{c4}}".format(formatted_name, r, d, g, ng, c0=cols[0] + 9, c1=cols[1], c2=cols[2], c3=cols[3], c4=cols[4])
                 print grad_string
-
+            return ret
+
     def input_sensitivity(self):
         """
         return an array describing the sensitivity of the model to each input
diff --git a/GPy/core/parameterization/array_core.py b/GPy/core/parameterization/array_core.py
index 4c31f23b..4b5b7700 100644
--- a/GPy/core/parameterization/array_core.py
+++ b/GPy/core/parameterization/array_core.py
@@ -15,8 +15,36 @@ class ListArray(np.ndarray):
     def __new__(cls, input_array):
         obj = np.asanyarray(input_array).view(cls)
         return obj
-    def __eq__(self, other):
-        return other is self
+    #def __eq__(self, other):
+    #    return other is self
+
+class ParamList(list):
+
+    def __contains__(self, other):
+        for el in self:
+            if el is other:
+                return True
+        return False
+
+    pass
+class C(np.ndarray):
+    __array_priority__ = 1.
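Note: class C (continued directly below) exists to strip its own wrapper from every ufunc result, by combining a raised __array_priority__ (so numpy asks C to wrap outputs) with __array_prepare__/__array_wrap__ methods that unwrap down to the base array. A minimal standalone sketch of the same mechanism — the name Plain and the demo values are illustrative, not part of the patch:

    import numpy as np

    class Plain(np.ndarray):
        # high priority: numpy lets this class wrap ufunc outputs...
        __array_priority__ = 1.
        def __new__(cls, array):
            return np.asanyarray(array).view(cls)
        def __array_wrap__(self, out_arr, context=None):
            # ...and the wrap hands results back as plain ndarrays
            return np.asarray(out_arr)

    x = Plain(np.ones(3))
    print type(x + 1)  # <type 'numpy.ndarray'>, not Plain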
+    def __new__(cls, array):
+        obj = array.view(cls)
+        return obj
+    #def __array_finalize__(self, obj):
+    #    #print 'finalize'
+    #    return obj
+    def __array_prepare__(self, out_arr, context):
+        #print 'prepare'
+        while type(out_arr) is C:
+            out_arr = out_arr.base
+        return out_arr
+    def __array_wrap__(self, out_arr, context):
+        #print 'wrap', type(self), type(out_arr), context
+        while type(out_arr) is C:
+            out_arr = out_arr.base
+        return out_arr

 class ObservableArray(ListArray, Observable):
     """
@@ -25,7 +53,7 @@ class ObservableArray(ListArray, Observable):
     will be called every time this array changes. The callable
     takes exactly one argument, which is this array itself.
     """
-    __array_priority__ = 0 # Never give back Param
+    __array_priority__ = -1 # Never give back ObservableArray
     def __new__(cls, input_array):
         cls.__name__ = "ObservableArray\n "
         obj = super(ObservableArray, cls).__new__(cls, input_array).view(cls)
@@ -36,16 +64,19 @@ class ObservableArray(ListArray, Observable):
         if obj is None: return
         self._observers_ = getattr(obj, '_observers_', None)
     def __setitem__(self, s, val, update=True):
-        if self.ndim:
-            if not np.all(np.equal(self[s], val)):
-                super(ObservableArray, self).__setitem__(s, val)
-                if update:
-                    self._notify_observers()
-        else:
-            if not np.all(np.equal(self, val)):
-                super(ObservableArray, self).__setitem__(Ellipsis, val)
-                if update:
-                    self._notify_observers()
+        super(ObservableArray, self).__setitem__(s, val)
+        if update:
+            self._notify_observers()
+#        if self.ndim:
+#            if not np.all(np.equal(self[s], val)):
+#                super(ObservableArray, self).__setitem__(s, val)
+#                if update:
+#                    self._notify_observers()
+#        else:
+#            if not np.all(np.equal(self, val)):
+#                super(ObservableArray, self).__setitem__(Ellipsis, val)
+#                if update:
+#                    self._notify_observers()
     def __getslice__(self, start, stop):
         return self.__getitem__(slice(start, stop))
     def __setslice__(self, start, stop, val):
diff --git a/GPy/core/parameterization/index_operations.py b/GPy/core/parameterization/index_operations.py
index 99b5a4de..d52211c5 100644
--- a/GPy/core/parameterization/index_operations.py
+++ b/GPy/core/parameterization/index_operations.py
@@ -90,11 +90,6 @@ class ParameterIndexOperations(object):
         return self._properties.values()

     def properties_for(self, index):
-#        already_seen = dict()
-#        for ni in index:
-#            if ni not in already_seen:
-#                already_seen[ni] = [prop for prop in self.iter_properties() if ni in self._properties[prop]]
-#            yield already_seen[ni]
         return vectorize(lambda i: [prop for prop in self.iter_properties() if i in self._properties[prop]], otypes=[list])(index)

     def add(self, prop, indices):
@@ -111,70 +106,11 @@ class ParameterIndexOperations(object):
                 self._properties[prop] = diff
             else:
                 del self._properties[prop]
-            #[self._reverse[i].remove(prop) for i in removed if prop in self._reverse[i]]
             return removed.astype(int)
-#        else:
-#            for a in self.properties():
-#                if numpy.all(a==prop) and a._parent_index_ == prop._parent_index_:
-#                    ind = create_raveled_indices(indices, shape, offset)
-#                    diff = remove_indices(self[a], ind)
-#                    removed = numpy.intersect1d(self[a], ind, True)
-#                    if not index_empty(diff):
-#                        self._properties[a] = diff
-#                    else:
-#                        del self._properties[a]
-#                    [self._reverse[i].remove(a) for i in removed if a in self._reverse[i]]
-#                    return removed.astype(int)
         return numpy.array([]).astype(int)

     def __getitem__(self, prop):
         return self._properties[prop]

-# class TieIndexOperations(object):
-#     def __init__(self, params):
-#         self.params = params
-#         self.tied_from = ParameterIndexOperations()
-#         self.tied_to = ParameterIndexOperations()
-#     def add(self, tied_from, tied_to):
-#         rav_from = self.params._raveled_index_for(tied_from)
-#         rav_to = self.params._raveled_index_for(tied_to)
-#         self.tied_from.add(tied_to, rav_from)
-#         self.tied_to.add(tied_to, rav_to)
-#         return rav_from, rav_to
-#     def remove(self, tied_from, tied_to):
-#         rav_from = self.params._raveled_index_for(tied_from)
-#         rav_to = self.params._raveled_index_for(tied_to)
-#         rem_from = self.tied_from.remove(tied_to, rav_from)
-#         rem_to = self.tied_to.remove(tied_to, rav_to)
-#         left_from = self.tied_from._properties.pop(tied_to)
-#         left_to = self.tied_to._properties.pop(tied_to)
-#         self.tied_from[numpy.delete(tied_to, rem_from)] = left_from
-#         self.tied_to[numpy.delete(tied_to, rem_to)] = left_to
-#         return rav_from, rav_to
-#     def from_to_for(self, index):
-#         return self.tied_from.properties_for(index), self.tied_to.properties_for(index)
-#     def iter_from_to_indices(self):
-#         for k, f in self.tied_from.iteritems():
-#             yield f, self.tied_to[k]
-#     def iter_to_indices(self):
-#         return self.tied_to.iterindices()
-#     def iter_from_indices(self):
-#         return self.tied_from.iterindices()
-#     def iter_from_items(self):
-#         for f, i in self.tied_from.iteritems():
-#             yield f, i
-#     def iter_properties(self):
-#         return self.tied_from.iter_properties()
-#     def properties(self):
-#         return self.tied_from.properties()
-#     def from_to_indices(self, param):
-#         return self.tied_from[param], self.tied_to[param]
-#
-# # def create_raveled_indices(index, shape, offset=0):
-# #     if isinstance(index, (tuple, list)): i = [slice(None)] + list(index)
-# #     else: i = [slice(None), index]
-# #     ind = numpy.array(numpy.ravel_multi_index(numpy.indices(shape)[i], shape)).flat + numpy.int_(offset)
-# #     return ind
-
 def combine_indices(arr1, arr2):
     return numpy.union1d(arr1, arr2)
diff --git a/GPy/core/parameterization/param.py b/GPy/core/parameterization/param.py
index 462210dc..b76b3858 100644
--- a/GPy/core/parameterization/param.py
+++ b/GPy/core/parameterization/param.py
@@ -3,8 +3,8 @@
 import itertools
 import numpy

-from parameter_core import Constrainable, adjust_name_for_printing
-from array_core import ObservableArray
+from parameter_core import Constrainable, Gradcheckable, adjust_name_for_printing
+from array_core import ObservableArray, ParamList

 ###### printing
 __constraints_name__ = "Constraint"
@@ -12,47 +12,41 @@
 __index_name__ = "Index"
 __tie_name__ = "Tied to"
 __precision__ = numpy.get_printoptions()['precision'] # numpy printing precision used, subclassing numpy ndarray after all
 __print_threshold__ = 5
-######
+######

 class Float(numpy.float64, Constrainable):
     def __init__(self, f, base):
         super(Float,self).__init__(f)
         self._base = base
-
-
-class Param(ObservableArray, Constrainable):
+
+
+class Param(ObservableArray, Constrainable, Gradcheckable):
     """
     Parameter object for GPy models.

-    :param name: name of the parameter to be printed
-    :param input_array: array which this parameter handles
-
+    :param str name: name of the parameter to be printed
+    :param input_array: array which this parameter handles
+    :type input_array: numpy.ndarray
+    :param default_constraint: The default constraint for this parameter
+    :type default_constraint:
+
     You can add/remove constraints by calling constrain on the parameter itself, e.g:
-
+
     - self[:,1].constrain_positive()
     - self[0].tie_to(other)
     - self.untie()
     - self[:3,:].unconstrain()
     - self[1].fix()
-
+
     Fixing parameters will fix them to the value they are right now.
If you change the fixed value, it will be fixed to the new value! - + See :py:class:`GPy.core.parameterized.Parameterized` for more details on constraining etc. - This ndarray can be stored in lists and checked if it is in. - - >>> import numpy as np - >>> x = np.random.normal(size=(10,3)) - >>> x in [[1], x, [3]] - True - - WARNING: This overrides the functionality of x==y!!! - Use numpy.equal(x,y) for element-wise equality testing. """ - __array_priority__ = 0 # Never give back Param + __array_priority__ = -1 # Never give back Param _fixes_ = None - def __new__(cls, name, input_array, *args, **kwargs): + def __new__(cls, name, input_array, default_constraint=None): obj = numpy.atleast_1d(super(Param, cls).__new__(cls, input_array=input_array)) obj._current_slice_ = (slice(obj.shape[0]),) obj._realshape_ = obj.shape @@ -66,16 +60,16 @@ class Param(ObservableArray, Constrainable): obj.gradient = None return obj - def __init__(self, name, input_array): - super(Param, self).__init__(name=name) - + def __init__(self, name, input_array, default_constraint=None): + super(Param, self).__init__(name=name, default_constraint=default_constraint) + def __array_finalize__(self, obj): # see InfoArray.__array_finalize__ for comments if obj is None: return super(Param, self).__array_finalize__(obj) self._direct_parent_ = getattr(obj, '_direct_parent_', None) self._parent_index_ = getattr(obj, '_parent_index_', None) - self._highest_parent_ = getattr(obj, '_highest_parent_', None) + self._default_constraint_ = getattr(obj, '_default_constraint_', None) self._current_slice_ = getattr(obj, '_current_slice_', None) self._tied_to_me_ = getattr(obj, '_tied_to_me_', None) self._tied_to_ = getattr(obj, '_tied_to_', None) @@ -86,7 +80,7 @@ class Param(ObservableArray, Constrainable): self._original_ = getattr(obj, '_original_', None) self._name = getattr(obj, 'name', None) self.gradient = getattr(obj, 'gradient', None) - + def __array_wrap__(self, out_arr, context=None): return out_arr.view(numpy.ndarray) #=========================================================================== @@ -94,11 +88,11 @@ class Param(ObservableArray, Constrainable): #=========================================================================== def __reduce_ex__(self): func, args, state = super(Param, self).__reduce__() - return func, args, (state, + return func, args, (state, (self.name, self._direct_parent_, self._parent_index_, - self._highest_parent_, + self._default_constraint_, self._current_slice_, self._realshape_, self._realsize_, @@ -119,7 +113,7 @@ class Param(ObservableArray, Constrainable): self._realsize_ = state.pop() self._realshape_ = state.pop() self._current_slice_ = state.pop() - self._highest_parent_ = state.pop() + self._default_constraint_ = state.pop() self._parent_index_ = state.pop() self._direct_parent_ = state.pop() self.name = state.pop() @@ -132,13 +126,13 @@ class Param(ObservableArray, Constrainable): self.flat = param self._notify_tied_parameters() self._notify_observers() - + def _get_params(self): return self.flat # @property # def name(self): # """ -# Name of this parameter. +# Name of this parameter. # This can be a callable without parameters. The callable will be called # every time the name property is accessed. 
# """ @@ -153,28 +147,9 @@ class Param(ObservableArray, Constrainable): @property def _parameters_(self): return [] - def _connect_highest_parent(self, highest_parent): - self._highest_parent_ = highest_parent def _collect_gradient(self, target): target[:] = self.gradient.flat #=========================================================================== - # Fixing Parameters: - #=========================================================================== - def constrain_fixed(self, warning=True): - """ - Constrain this paramter to be fixed to the current value it carries. - - :param warning: print a warning for overwriting constraints. - """ - self._highest_parent_._fix(self,warning) - fix = constrain_fixed - def unconstrain_fixed(self): - """ - This parameter will no longer be fixed. - """ - self._highest_parent_._unfix(self) - unfix = unconstrain_fixed - #=========================================================================== # Tying operations -> bugged, TODO #=========================================================================== def tie_to(self, param): @@ -190,22 +165,22 @@ class Param(ObservableArray, Constrainable): Note: For now only one parameter can have ties, so all of a parameter will be removed, when re-tieing! """ - #Note: this method will tie to the parameter which is the last in + #Note: this method will tie to the parameter which is the last in # the chain of ties. Thus, if you tie to a tied parameter, # this tie will be created to the parameter the param is tied # to. - assert isinstance(param, Param), "Argument {1} not of type {0}".format(Param,param.__class__) + assert isinstance(param, Param), "Argument {1} not of type {0}".format(Param, param.__class__) param = numpy.atleast_1d(param) if param.size != 1: raise NotImplementedError, "Broadcast tying is not implemented yet" try: - if self._original_: + if self._original_: self[:] = param else: # this happens when indexing created a copy of the array self._direct_parent_._get_original(self)[self._current_slice_] = param except ValueError: - raise ValueError("Trying to tie {} with shape {} to {} with shape {}".format(self.name, self.shape, param.name, param.shape)) + raise ValueError("Trying to tie {} with shape {} to {} with shape {}".format(self.name, self.shape, param.name, param.shape)) if param is self: raise RuntimeError, 'Cyclic tieing is not allowed' # if len(param._tied_to_) > 0: @@ -248,7 +223,7 @@ class Param(ObservableArray, Constrainable): t_rav_i = t._raveled_index() tr_rav_i = tied_to_me._raveled_index() new_index = list(set(t_rav_i) | set(tr_rav_i)) - tmp = t._direct_parent_._get_original(t)[numpy.unravel_index(new_index,t._realshape_)] + tmp = t._direct_parent_._get_original(t)[numpy.unravel_index(new_index, t._realshape_)] self._tied_to_me_[tmp] = self._tied_to_me_[t] | set(self._raveled_index()) del self._tied_to_me_[t] return @@ -261,7 +236,7 @@ class Param(ObservableArray, Constrainable): import ipdb;ipdb.set_trace() new_index = list(set(t_rav_i) - set(tr_rav_i)) if new_index: - tmp = t._direct_parent_._get_original(t)[numpy.unravel_index(new_index,t._realshape_)] + tmp = t._direct_parent_._get_original(t)[numpy.unravel_index(new_index, t._realshape_)] self._tied_to_me_[tmp] = self._tied_to_me_[t] del self._tied_to_me_[t] if len(self._tied_to_me_[tmp]) == 0: @@ -269,12 +244,12 @@ class Param(ObservableArray, Constrainable): else: del self._tied_to_me_[t] def _on_tied_parameter_changed(self, val, ind): - if not self._updated_: #not fast_array_equal(self, val[ind]): + if not self._updated_: # not 
fast_array_equal(self, val[ind]): val = numpy.atleast_1d(val) self._updated_ = True if self._original_: self.__setitem__(slice(None), val[ind], update=False) - else: # this happens when indexing created a copy of the array + else: # this happens when indexing created a copy of the array self._direct_parent_._get_original(self).__setitem__(self._current_slice_, val[ind], update=False) self._notify_tied_parameters() self._updated_ = False @@ -293,7 +268,7 @@ class Param(ObservableArray, Constrainable): def unset_prior(self, *priors): """ :param priors: priors to remove from this parameter - + Remove all priors from this parameter """ self._highest_parent_._remove_prior(self, *priors) @@ -303,11 +278,11 @@ class Param(ObservableArray, Constrainable): def __getitem__(self, s, *args, **kwargs): if not isinstance(s, tuple): s = (s,) - if not reduce(lambda a,b: a or numpy.any(b is Ellipsis), s, False) and len(s) <= self.ndim: + if not reduce(lambda a, b: a or numpy.any(b is Ellipsis), s, False) and len(s) <= self.ndim: s += (Ellipsis,) new_arr = super(Param, self).__getitem__(s, *args, **kwargs) try: new_arr._current_slice_ = s; new_arr._original_ = self.base is new_arr.base - except AttributeError: pass# returning 0d array or float, double etc + except AttributeError: pass # returning 0d array or float, double etc return new_arr def __setitem__(self, s, val, update=True): super(Param, self).__setitem__(s, val, update=update) @@ -324,12 +299,12 @@ class Param(ObservableArray, Constrainable): if numpy.all(si == Ellipsis): continue if isinstance(si, slice): - a = si.indices(self._realshape_[i])[0] + a = si.indices(self._realshape_[i])[0] elif isinstance(si, (list,numpy.ndarray,tuple)): a = si[0] else: a = si - if a<0: - a = self._realshape_[i]+a + if a < 0: + a = self._realshape_[i] + a internal_offset += a * extended_realshape[i] return internal_offset def _raveled_index(self, slice_index=None): @@ -337,8 +312,8 @@ class Param(ObservableArray, Constrainable): # of this object extended_realshape = numpy.cumprod((1,) + self._realshape_[:0:-1])[::-1] ind = self._indices(slice_index) - if ind.ndim < 2: ind=ind[:,None] - return numpy.asarray(numpy.apply_along_axis(lambda x: numpy.sum(extended_realshape*x), 1, ind), dtype=int) + if ind.ndim < 2: ind = ind[:, None] + return numpy.asarray(numpy.apply_along_axis(lambda x: numpy.sum(extended_realshape * x), 1, ind), dtype=int) def _expand_index(self, slice_index=None): # this calculates the full indexing arrays from the slicing objects given by get_item for _real..._ attributes # it basically translates slices to their respective index arrays and turns negative indices around @@ -351,11 +326,11 @@ class Param(ObservableArray, Constrainable): if isinstance(a, slice): start, stop, step = a.indices(b) return numpy.r_[start:stop:step] - elif isinstance(a, (list,numpy.ndarray,tuple)): + elif isinstance(a, (list, numpy.ndarray, tuple)): a = numpy.asarray(a, dtype=int) - a[a<0] = b + a[a<0] - elif a<0: - a = b+a + a[a < 0] = b + a[a < 0] + elif a < 0: + a = b + a return numpy.r_[a] return numpy.r_[:b] return itertools.imap(f, itertools.izip_longest(slice_index[:self._realndim_], self._realshape_, fillvalue=slice(self.size))) @@ -379,7 +354,7 @@ class Param(ObservableArray, Constrainable): #=========================================================================== @property def _description_str(self): - if self.size <= 1: return ["%f"%self] + if self.size <= 1: return ["%f" % self] else: return [str(self.shape)] def _parameter_names(self, add_name): return 
[self.name] @@ -391,31 +366,31 @@ class Param(ObservableArray, Constrainable): return [self.shape] @property def _constraints_str(self): - return [' '.join(map(lambda c: str(c[0]) if c[1].size==self._realsize_ else "{"+str(c[0])+"}", self._highest_parent_._constraints_iter_items(self)))] + return [' '.join(map(lambda c: str(c[0]) if c[1].size == self._realsize_ else "{" + str(c[0]) + "}", self._highest_parent_._constraints_iter_items(self)))] @property def _ties_str(self): return [t._short() for t in self._tied_to_] or [''] @property def name_hirarchical(self): if self.has_parent(): - return self._direct_parent_.hirarchy_name()+adjust_name_for_printing(self.name) + return self._direct_parent_.hirarchy_name() + adjust_name_for_printing(self.name) return adjust_name_for_printing(self.name) def __repr__(self, *args, **kwargs): name = "\033[1m{x:s}\033[0;0m:\n".format( x=self.name_hirarchical) - return name + super(Param, self).__repr__(*args,**kwargs) + return name + super(Param, self).__repr__(*args, **kwargs) def _ties_for(self, rav_index): - #size = sum(p.size for p in self._tied_to_) + # size = sum(p.size for p in self._tied_to_) ties = numpy.empty(shape=(len(self._tied_to_), numpy.size(rav_index)), dtype=Param) for i, tied_to in enumerate(self._tied_to_): for t, ind in tied_to._tied_to_me_.iteritems(): if t._parent_index_ == self._parent_index_: - matches = numpy.where(rav_index[:,None] == t._raveled_index()[None, :]) + matches = numpy.where(rav_index[:, None] == t._raveled_index()[None, :]) tt_rav_index = tied_to._raveled_index() ind_rav_matches = numpy.where(tt_rav_index == numpy.array(list(ind)))[0] if len(ind) != 1: ties[i, matches[0][ind_rav_matches]] = numpy.take(tt_rav_index, matches[1], mode='wrap')[ind_rav_matches] else: ties[i, matches[0]] = numpy.take(tt_rav_index, matches[1], mode='wrap') - return map(lambda a: sum(a,[]), zip(*[[[tie.flatten()] if tx!=None else [] for tx in t] for t,tie in zip(ties,self._tied_to_)])) + return map(lambda a: sum(a, []), zip(*[[[tie.flatten()] if tx != None else [] for tx in t] for t, tie in zip(ties, self._tied_to_)])) def _constraints_for(self, rav_index): return self._highest_parent_._constraints_for(self, rav_index) def _indices(self, slice_index=None): @@ -424,13 +399,13 @@ class Param(ObservableArray, Constrainable): slice_index = self._current_slice_ if isinstance(slice_index, (tuple, list)): clean_curr_slice = [s for s in slice_index if numpy.any(s != Ellipsis)] - if (all(isinstance(n, (numpy.ndarray, list, tuple)) for n in clean_curr_slice) - and len(set(map(len,clean_curr_slice))) <= 1): + if (all(isinstance(n, (numpy.ndarray, list, tuple)) for n in clean_curr_slice) + and len(set(map(len, clean_curr_slice))) <= 1): return numpy.fromiter(itertools.izip(*clean_curr_slice), - dtype=[('',int)]*self._realndim_,count=len(clean_curr_slice[0])).view((int, self._realndim_)) + dtype=[('', int)] * self._realndim_, count=len(clean_curr_slice[0])).view((int, self._realndim_)) expanded_index = list(self._expand_index(slice_index)) return numpy.fromiter(itertools.product(*expanded_index), - dtype=[('',int)]*self._realndim_,count=reduce(lambda a,b: a*b.size,expanded_index,1)).view((int, self._realndim_)) + dtype=[('', int)] * self._realndim_, count=reduce(lambda a, b: a * b.size, expanded_index, 1)).view((int, self._realndim_)) def _max_len_names(self, gen, header): return reduce(lambda a, b:max(a, len(b)), gen, len(header)) def _max_len_values(self): @@ -443,9 +418,9 @@ class Param(ObservableArray, Constrainable): if self._realsize_ < 2: return name 
ind = self._indices() - if ind.size > 4: indstr = ','.join(map(str,ind[:2])) + "..." + ','.join(map(str,ind[-2:])) - else: indstr = ','.join(map(str,ind)) - return name+'['+indstr+']' + if ind.size > 4: indstr = ','.join(map(str, ind[:2])) + "..." + ','.join(map(str, ind[-2:])) + else: indstr = ','.join(map(str, ind)) + return name + '[' + indstr + ']' def __str__(self, constr_matrix=None, indices=None, ties=None, lc=None, lx=None, li=None, lt=None): filter_ = self._current_slice_ vals = self.flat @@ -458,10 +433,10 @@ class Param(ObservableArray, Constrainable): if lx is None: lx = self._max_len_values() if li is None: li = self._max_len_index(indices) if lt is None: lt = self._max_len_names(ties, __tie_name__) - header = " {i:^{2}s} | \033[1m{x:^{1}s}\033[0;0m | {c:^{0}s} | {t:^{3}s}".format(lc,lx,li,lt, x=self.name_hirarchical, c=__constraints_name__, i=__index_name__, t=__tie_name__) # nice header for printing + header = " {i:^{2}s} | \033[1m{x:^{1}s}\033[0;0m | {c:^{0}s} | {t:^{3}s}".format(lc, lx, li, lt, x=self.name_hirarchical, c=__constraints_name__, i=__index_name__, t=__tie_name__) # nice header for printing if not ties: ties = itertools.cycle(['']) - return "\n".join([header]+[" {i!s:^{3}s} | {x: >{1}.{2}g} | {c:^{0}s} | {t:^{4}s} ".format(lc,lx,__precision__,li,lt, x=x, c=" ".join(map(str,c)), t=(t or ''), i=i) for i,x,c,t in itertools.izip(indices,vals,constr_matrix,ties)]) # return all the constraints with right indices - #except: return super(Param, self).__str__() + return "\n".join([header] + [" {i!s:^{3}s} | {x: >{1}.{2}g} | {c:^{0}s} | {t:^{4}s} ".format(lc, lx, __precision__, li, lt, x=x, c=" ".join(map(str, c)), t=(t or ''), i=i) for i, x, c, t in itertools.izip(indices, vals, constr_matrix, ties)]) # return all the constraints with right indices + # except: return super(Param, self).__str__() class ParamConcatenation(object): def __init__(self, params): @@ -472,12 +447,12 @@ class ParamConcatenation(object): See :py:class:`GPy.core.parameter.Param` for more details on constraining. 
""" - #self.params = params - self.params = [] + # self.params = params + self.params = ParamList([]) for p in params: for p in p.flattened_parameters: if p not in self.params: - self.params.append(p) + self.params.append(p) self._param_sizes = [p.size for p in self.params] startstops = numpy.cumsum([0] + self._param_sizes) self._param_slices_ = [slice(start, stop) for start,stop in zip(startstops, startstops[1:])] @@ -485,15 +460,15 @@ class ParamConcatenation(object): # Get/set items, enable broadcasting #=========================================================================== def __getitem__(self, s): - ind = numpy.zeros(sum(self._param_sizes), dtype=bool); ind[s] = True; + ind = numpy.zeros(sum(self._param_sizes), dtype=bool); ind[s] = True; params = [p._get_params()[ind[ps]] for p,ps in zip(self.params, self._param_slices_) if numpy.any(p._get_params()[ind[ps]])] if len(params)==1: return params[0] return ParamConcatenation(params) def __setitem__(self, s, val, update=True): - ind = numpy.zeros(sum(self._param_sizes), dtype=bool); ind[s] = True; + ind = numpy.zeros(sum(self._param_sizes), dtype=bool); ind[s] = True; vals = self._vals(); vals[s] = val; del val - [numpy.place(p, ind[ps], vals[ps]) and p._notify_tied_parameters() - for p, ps in zip(self.params, self._param_slices_)] + [numpy.place(p, ind[ps], vals[ps]) and p._notify_tied_parameters() + for p, ps in zip(self.params, self._param_slices_)] if update: self.params[0]._highest_parent_.parameters_changed() def _vals(self): @@ -501,46 +476,68 @@ class ParamConcatenation(object): #=========================================================================== # parameter operations: #=========================================================================== + def update_all_params(self): + self.params[0]._highest_parent_.parameters_changed() + def constrain(self, constraint, warning=True): - [param.constrain(constraint) for param in self.params] + [param.constrain(constraint, update=False) for param in self.params] + self.update_all_params() constrain.__doc__ = Param.constrain.__doc__ + def constrain_positive(self, warning=True): - [param.constrain_positive(warning) for param in self.params] + [param.constrain_positive(warning, update=False) for param in self.params] + self.update_all_params() constrain_positive.__doc__ = Param.constrain_positive.__doc__ + def constrain_fixed(self, warning=True): [param.constrain_fixed(warning) for param in self.params] constrain_fixed.__doc__ = Param.constrain_fixed.__doc__ fix = constrain_fixed + def constrain_negative(self, warning=True): - [param.constrain_negative(warning) for param in self.params] + [param.constrain_negative(warning, update=False) for param in self.params] + self.update_all_params() constrain_negative.__doc__ = Param.constrain_negative.__doc__ + def constrain_bounded(self, lower, upper, warning=True): - [param.constrain_bounded(lower, upper, warning) for param in self.params] + [param.constrain_bounded(lower, upper, warning, update=False) for param in self.params] + self.update_all_params() constrain_bounded.__doc__ = Param.constrain_bounded.__doc__ + def unconstrain(self, *constraints): [param.unconstrain(*constraints) for param in self.params] unconstrain.__doc__ = Param.unconstrain.__doc__ + def unconstrain_negative(self): [param.unconstrain_negative() for param in self.params] unconstrain_negative.__doc__ = Param.unconstrain_negative.__doc__ + def unconstrain_positive(self): [param.unconstrain_positive() for param in self.params] unconstrain_positive.__doc__ = 
Param.unconstrain_positive.__doc__
+
     def unconstrain_fixed(self):
         [param.unconstrain_fixed() for param in self.params]
     unconstrain_fixed.__doc__ = Param.unconstrain_fixed.__doc__
     unfix = unconstrain_fixed
+
     def unconstrain_bounded(self, lower, upper):
         [param.unconstrain_bounded(lower, upper) for param in self.params]
     unconstrain_bounded.__doc__ = Param.unconstrain_bounded.__doc__
+
     def untie(self, *ties):
         [param.untie(*ties) for param in self.params]
-    __lt__ = lambda self, val: self._vals()<val
-    __le__ = lambda self, val: self._vals()<=val
-    __eq__ = lambda self, val: self._vals()==val
-    __ne__ = lambda self, val: self._vals()!=val
-    __gt__ = lambda self, val: self._vals()>val
-    __ge__ = lambda self, val: self._vals()>=val
+
+    def checkgrad(self, verbose=0, step=1e-6, tolerance=1e-3):
+        return self.params[0]._highest_parent_._checkgrad(self, verbose, step, tolerance)
+    #checkgrad.__doc__ = Gradcheckable.checkgrad.__doc__
+
+    __lt__ = lambda self, val: self._vals() < val
+    __le__ = lambda self, val: self._vals() <= val
+    __eq__ = lambda self, val: self._vals() == val
+    __ne__ = lambda self, val: self._vals() != val
+    __gt__ = lambda self, val: self._vals() > val
+    __ge__ = lambda self, val: self._vals() >= val

     def __str__(self, *args, **kwargs):
         def f(p):
             ind = p._raveled_index()
@@ -557,9 +554,9 @@ class ParamConcatenation(object):
         return "\n{}\n".format(" -"+"- | -".join(['-'*l for l in [li,lx,lc,lt]])).join(strings)
     def __repr__(self):
         return "\n".join(map(repr,self.params))
-
+
 if __name__ == '__main__':
-
+
     from GPy.core.parameterized import Parameterized
     from GPy.core.parameter import Param

@@ -569,17 +566,17 @@
     print "random done"
     p = Param("q_mean", X)
     p1 = Param("q_variance", numpy.random.rand(*p.shape))
-    p2 = Param("Y", numpy.random.randn(p.shape[0],1))
-
+    p2 = Param("Y", numpy.random.randn(p.shape[0], 1))
+
     p3 = Param("variance", numpy.random.rand())
     p4 = Param("lengthscale", numpy.random.rand(2))
-
+
     m = Parameterized()
     rbf = Parameterized(name='rbf')
-
+
     rbf.add_parameter(p3,p4)
     m.add_parameter(p,p1,rbf)

     print "setting params"
     #print m.q_v[3:5,[1,4,5]]
     print "constraining variance"
diff --git a/GPy/core/parameterization/parameter_core.py b/GPy/core/parameterization/parameter_core.py
index a826b10c..1075d808 100644
--- a/GPy/core/parameterization/parameter_core.py
+++ b/GPy/core/parameterization/parameter_core.py
@@ -20,7 +20,7 @@ class Observable(object):
     def _notify_observers(self):
         [callble(self) for callble in self._observers_.itervalues()]
-
+
 class Pickleable(object):
     def _getstate(self):
         """
@@ -36,9 +36,9 @@ class Pickleable(object):
         Set the state (memento pattern) of this class to the given
         state. Usually this is just the counterpart to _getstate,
         such that an object is a copy of another when calling
-
+
             copy = .__new__(*args,**kw)._setstate(._getstate())
-
+
         See python doc "pickling" (`__getstate__` and `__setstate__`) for details.
""" raise NotImplementedError, "To be able to use pickling you need to implement this method" @@ -48,19 +48,24 @@ class Pickleable(object): #=============================================================================== class Parentable(object): - def __init__(self, direct_parent=None, highest_parent=None, parent_index=None): - super(Parentable,self).__init__() + def __init__(self, direct_parent=None, parent_index=None): + super(Parentable,self).__init__() self._direct_parent_ = direct_parent self._parent_index_ = parent_index - self._highest_parent_ = highest_parent - + def has_parent(self): - return self._direct_parent_ is not None and self._highest_parent_ is not None - + return self._direct_parent_ is not None + + @property + def _highest_parent_(self): + if self._direct_parent_ is None: + return self + return self._direct_parent_._highest_parent_ + class Nameable(Parentable): _name = None - def __init__(self, name, direct_parent=None, highest_parent=None, parent_index=None): - super(Nameable,self).__init__(direct_parent, highest_parent, parent_index) + def __init__(self, name, direct_parent=None, parent_index=None): + super(Nameable,self).__init__(direct_parent, parent_index) self._name = name or self.__class__.__name__ @property @@ -69,13 +74,45 @@ class Nameable(Parentable): @name.setter def name(self, name): from_name = self.name - self._name = name + self._name = name if self.has_parent(): - self._direct_parent_._name_changed(self, from_name) - + self._direct_parent_._name_changed(self, from_name) + +class Gradcheckable(Parentable): + #=========================================================================== + # Gradchecking + #=========================================================================== + def checkgrad(self, verbose=0, step=1e-6, tolerance=1e-3): + if self.has_parent(): + return self._highest_parent_._checkgrad(self, verbose=verbose, step=step, tolerance=tolerance) + return self._checkgrad(self[''], verbose=verbose, step=step, tolerance=tolerance) + def _checkgrad(self, param): + raise NotImplementedError, "Need log likelihood to check gradient against" + + class Constrainable(Nameable): - def __init__(self, name): + def __init__(self, name, default_constraint=None): super(Constrainable,self).__init__(name) + self._default_constraint_ = default_constraint + #=========================================================================== + # Fixing Parameters: + #=========================================================================== + def constrain_fixed(self, value=None, warning=True): + """ + Constrain this paramter to be fixed to the current value it carries. + + :param warning: print a warning for overwriting constraints. + """ + if value is not None: + self[:] = value + self._highest_parent_._fix(self,warning) + fix = constrain_fixed + def unconstrain_fixed(self): + """ + This parameter will no longer be fixed. + """ + self._highest_parent_._unfix(self) + unfix = unconstrain_fixed #=========================================================================== # Constrain operations -> done #=========================================================================== @@ -84,7 +121,7 @@ class Constrainable(Nameable): :param transform: the :py:class:`GPy.core.transformations.Transformation` to constrain the this parameter to. :param warning: print a warning if re-constraining parameters. - + Constrain the parameter to the given :py:class:`GPy.core.transformations.Transformation`. 
""" @@ -97,37 +134,37 @@ class Constrainable(Nameable): self._add_constrain(p, transform, warning) if update: self.parameters_changed() - - def constrain_positive(self, warning=True): + + def constrain_positive(self, warning=True, update=True): """ :param warning: print a warning if re-constraining parameters. - + Constrain this parameter to the default positive constraint. """ - self.constrain(Logexp(), warning) + self.constrain(Logexp(), warning=warning, update=update) - def constrain_negative(self, warning=True): + def constrain_negative(self, warning=True, update=True): """ :param warning: print a warning if re-constraining parameters. - + Constrain this parameter to the default negative constraint. """ - self.constrain(NegativeLogexp(), warning) + self.constrain(NegativeLogexp(), warning=warning, update=update) - def constrain_bounded(self, lower, upper, warning=True): + def constrain_bounded(self, lower, upper, warning=True, update=True): """ :param lower, upper: the limits to bound this parameter to :param warning: print a warning if re-constraining parameters. - + Constrain this parameter to lie within the given range. """ - self.constrain(Logistic(lower, upper), warning) + self.constrain(Logistic(lower, upper), warning=warning, update=update) def unconstrain(self, *transforms): """ :param transforms: The transformations to unconstrain from. - - remove all :py:class:`GPy.core.transformations.Transformation` + + remove all :py:class:`GPy.core.transformations.Transformation` transformats of this parameter object. """ if self.has_parent(): @@ -138,20 +175,20 @@ class Constrainable(Nameable): def unconstrain_positive(self): """ - Remove positive constraint of this parameter. + Remove positive constraint of this parameter. """ self.unconstrain(Logexp()) def unconstrain_negative(self): """ - Remove negative constraint of this parameter. + Remove negative constraint of this parameter. 
""" self.unconstrain(NegativeLogexp()) def unconstrain_bounded(self, lower, upper): """ :param lower, upper: the limits to unbound this parameter from - + Remove (lower, upper) bounded constrain from this parameter/ """ self.unconstrain(Logistic(lower, upper)) diff --git a/GPy/core/parameterization/parameterized.py b/GPy/core/parameterization/parameterized.py index 7abaf4a3..678b119b 100644 --- a/GPy/core/parameterization/parameterized.py +++ b/GPy/core/parameterization/parameterized.py @@ -8,9 +8,10 @@ import cPickle import itertools from re import compile, _pattern_type from param import ParamConcatenation, Param -from parameter_core import Constrainable, Pickleable, Observable, adjust_name_for_printing +from parameter_core import Constrainable, Pickleable, Observable, adjust_name_for_printing, Gradcheckable from index_operations import ParameterIndexOperations,\ index_empty +from array_core import ParamList #=============================================================================== # Printing: @@ -23,7 +24,7 @@ FIXED = False UNFIXED = True #=============================================================================== -class Parameterized(Constrainable, Pickleable, Observable): +class Parameterized(Constrainable, Pickleable, Observable, Gradcheckable): """ Parameterized class @@ -69,7 +70,7 @@ class Parameterized(Constrainable, Pickleable, Observable): super(Parameterized, self).__init__(name=name) self._in_init_ = True self._constraints_ = None#ParameterIndexOperations() - self._parameters_ = [] + self._parameters_ = ParamList() self.size = sum(p.size for p in self._parameters_) if not self._has_fixes(): self._fixes_ = None @@ -170,6 +171,8 @@ class Parameterized(Constrainable, Pickleable, Observable): for t, ind in param._constraints_.iteritems(): self.constraints.add(t, ind+self._offset_for(param)) param._constraints_.clear() + if param._default_constraint_ is not None: + self._add_constrain(param, param._default_constraint_, False) if self._has_fixes() and np.all(self._fixes_): # ==UNFIXED self._fixes_= None @@ -188,10 +191,10 @@ class Parameterized(Constrainable, Pickleable, Observable): note: if it is a string object it will not (!) be regexp-matched automatically. 
""" - self._parameters_ = [p for p in self._parameters_ + self._parameters_ = ParamList([p for p in self._parameters_ if not (p._parent_index_ in names_params_indices or p.name in names_params_indices - or p in names_params_indices)] + or p in names_params_indices)]) self._connect_parameters() def parameters_changed(self): @@ -216,7 +219,6 @@ class Parameterized(Constrainable, Pickleable, Observable): for i,p in enumerate(self._parameters_): p._direct_parent_ = self p._parent_index_ = i - p._connect_highest_parent(self) not_unique = [] sizes.append(p.size+sizes[-1]) self._param_slices_.append(slice(sizes[-2], sizes[-1])) @@ -230,15 +232,7 @@ class Parameterized(Constrainable, Pickleable, Observable): elif not (pname in not_unique): self.__dict__[pname] = p self._added_names_.add(pname) - - def _connect_highest_parent(self, highest_parent): - self._highest_parent_ = highest_parent - if not hasattr(self, "_parameters_") or len(self._parameters_) < 1: - # no parameters for this class - return - for p in self._parameters_: - p._connect_highest_parent(highest_parent) - + #=========================================================================== # Pickling operations #=========================================================================== @@ -256,6 +250,16 @@ class Parameterized(Constrainable, Pickleable, Observable): cPickle.dump(self, f, protocol) def copy(self): """Returns a (deep) copy of the current model """ + #dc = dict() + #for k, v in self.__dict__.iteritems(): + #if k not in ['_highest_parent_', '_direct_parent_']: + #dc[k] = copy.deepcopy(v) + + #dc = copy.deepcopy(self.__dict__) + #dc['_highest_parent_'] = None + #dc['_direct_parent_'] = None + #s = self.__class__.new() + #s.__dict__ = dc return copy.deepcopy(self) def __getstate__(self): if self._has_get_set_state(): @@ -310,8 +314,11 @@ class Parameterized(Constrainable, Pickleable, Observable): #=========================================================================== # Optimization handles: #=========================================================================== - def _get_param_names_transformed(self): + def _get_param_names(self): n = numpy.array([p.name_hirarchical+'['+str(i)+']' for p in self.flattened_parameters for i in p._indices()]) + return n + def _get_param_names_transformed(self): + n = self._get_param_names() if self._has_fixes(): return n[self._fixes_] return n @@ -372,6 +379,8 @@ class Parameterized(Constrainable, Pickleable, Observable): that is an int array, containing the indexes for the flattened param inside this parameterized logic. 
""" + if isinstance(param, ParamConcatenation): + return numpy.hstack((self._raveled_index_for(p) for p in param.params)) return param._raveled_index() + self._offset_for(param) def _raveled_index(self): @@ -381,7 +390,7 @@ class Parameterized(Constrainable, Pickleable, Observable): """ return numpy.r_[:self.size] #=========================================================================== - # Handle ties: + # Fixing parameters: #=========================================================================== def _set_fixed(self, param_or_index): if not self._has_fixes(): self._fixes_ = numpy.ones(self.size, dtype=bool) @@ -406,9 +415,6 @@ class Parameterized(Constrainable, Pickleable, Observable): if self._has_fixes(): return self._fixes_[self._raveled_index_for(param)] return numpy.ones(self.size, dtype=bool)[self._raveled_index_for(param)] - #=========================================================================== - # Fixing parameters: - #=========================================================================== def _fix(self, param, warning=True): f = self._add_constrain(param, __fixed__, warning) self._set_fixed(f) @@ -419,6 +425,8 @@ class Parameterized(Constrainable, Pickleable, Observable): #=========================================================================== # Convenience for fixed, tied checking of param: #=========================================================================== + def fixed_indices(self): + return np.array([x.is_fixed for x in self._parameters_]) def _is_fixed(self, param): # returns if the whole param is fixed if not self._has_fixes(): @@ -448,7 +456,8 @@ class Parameterized(Constrainable, Pickleable, Observable): # if removing constraints before adding new is not wanted, just delete the above line! self.constraints.add(transform, rav_i) param = self._get_original(param) - param._set_params(transform.initialize(param._get_params())) + if not (transform == __fixed__): + param._set_params(transform.initialize(param._get_params()), update=False) if warning and any(reconstrained): # if you want to print the whole params object, which was reconstrained use: # m = str(param[self._backtranslate_index(param, reconstrained)]) diff --git a/GPy/core/parameterization/transformations.py b/GPy/core/parameterization/transformations.py index fd2c3ee5..c4cab1e9 100644 --- a/GPy/core/parameterization/transformations.py +++ b/GPy/core/parameterization/transformations.py @@ -4,9 +4,9 @@ import numpy as np from domains import _POSITIVE,_NEGATIVE, _BOUNDED -import sys +import sys import weakref -_lim_val = -np.log(sys.float_info.epsilon) +_lim_val = -np.log(sys.float_info.epsilon) class Transformation(object): domain = None @@ -94,7 +94,7 @@ class LogexpClipped(Logexp): def __str__(self): return '+ve_c' - + class Exponent(Transformation): # TODO: can't allow this to go to zero, need to set a lower bound. Similar with negative Exponent below. See old MATLAB code. domain = _POSITIVE @@ -162,9 +162,11 @@ class Logistic(Transformation): def initialize(self, f): if np.any(np.logical_or(f < self.lower, f > self.upper)): print "Warning: changing parameters to satisfy constraints" - return np.where(np.logical_or(f < self.lower, f > self.upper), self.f(f * 0.), f) + #return np.where(np.logical_or(f < self.lower, f > self.upper), self.f(f * 0.), f) + #FIXME: Max, zeros_like right? 
+ return np.where(np.logical_or(f < self.lower, f > self.upper), self.f(np.zeros_like(f)), f) def __str__(self): return '{},{}'.format(self.lower, self.upper) - + diff --git a/GPy/core/parameterization/variational.py b/GPy/core/parameterization/variational.py index e9868b82..b73e25da 100644 --- a/GPy/core/parameterization/variational.py +++ b/GPy/core/parameterization/variational.py @@ -3,10 +3,8 @@ Created on 6 Nov 2013 @author: maxz ''' -import numpy as np from parameterized import Parameterized from param import Param -from ...util.misc import param_to_array class Normal(Parameterized): ''' @@ -26,6 +24,7 @@ class Normal(Parameterized): See GPy.plotting.matplot_dep.variational_plots """ + import sys assert "matplotlib" in sys.modules, "matplotlib package has not been imported." - from ..plotting.matplot_dep import variational_plots + from ...plotting.matplot_dep import variational_plots return variational_plots.plot(self,*args) diff --git a/GPy/core/svigp.py b/GPy/core/svigp.py index 9e4f3b12..a2c7acee 100644 --- a/GPy/core/svigp.py +++ b/GPy/core/svigp.py @@ -154,13 +154,13 @@ class SVIGP(GP): self.psi2 = None def dL_dtheta(self): - dL_dtheta = self.kern.dK_dtheta(self.dL_dKmm, self.Z) + dL_dtheta = self.kern._param_grad_helper(self.dL_dKmm, self.Z) if self.has_uncertain_inputs: dL_dtheta += self.kern.dpsi0_dtheta(self.dL_dpsi0, self.Z, self.X_batch, self.X_variance_batch) dL_dtheta += self.kern.dpsi1_dtheta(self.dL_dpsi1, self.Z, self.X_batch, self.X_variance_batch) dL_dtheta += self.kern.dpsi2_dtheta(self.dL_dpsi2, self.Z, self.X_batch, self.X_variance_batch) else: - dL_dtheta += self.kern.dK_dtheta(self.dL_dpsi1, self.X_batch, self.Z) + dL_dtheta += self.kern._param_grad_helper(self.dL_dpsi1, self.X_batch, self.Z) dL_dtheta += self.kern.dKdiag_dtheta(self.dL_dpsi0, self.X_batch) return dL_dtheta diff --git a/GPy/examples/dimensionality_reduction.py b/GPy/examples/dimensionality_reduction.py index 46fc6797..e2ba4912 100644 --- a/GPy/examples/dimensionality_reduction.py +++ b/GPy/examples/dimensionality_reduction.py @@ -15,54 +15,54 @@ def bgplvm_test_model(seed=default_seed, optimize=False, verbose=1, plot=False): num_inducing = 5 if plot: output_dim = 1 - input_dim = 2 + input_dim = 3 else: - input_dim = 2 + input_dim = 1 output_dim = 25 # generate GPLVM-like data X = _np.random.rand(num_inputs, input_dim) lengthscales = _np.random.rand(input_dim) k = (GPy.kern.rbf(input_dim, .5, lengthscales, ARD=True) - + GPy.kern.white(input_dim, 0.01)) + #+ GPy.kern.white(input_dim, 0.01) + ) K = k.K(X) Y = _np.random.multivariate_normal(_np.zeros(num_inputs), K, output_dim).T - lik = Gaussian(Y, normalize=True) - k = GPy.kern.rbf_inv(input_dim, .5, _np.ones(input_dim) * 2., ARD=True) + GPy.kern.bias(input_dim) + GPy.kern.white(input_dim) - # k = GPy.kern.linear(input_dim) + GPy.kern.bias(input_dim) + GPy.kern.white(input_dim, 0.00001) + # k = GPy.kern.rbf_inv(input_dim, .5, _np.ones(input_dim) * 2., ARD=True) + GPy.kern.bias(input_dim) + GPy.kern.white(input_dim) + k = GPy.kern.linear(input_dim)# + GPy.kern.bias(input_dim) + GPy.kern.white(input_dim, 0.00001) # k = GPy.kern.rbf(input_dim, ARD = False) + GPy.kern.white(input_dim, 0.00001) # k = GPy.kern.rbf(input_dim, .5, _np.ones(input_dim) * 2., ARD=True) + GPy.kern.rbf(input_dim, .3, _np.ones(input_dim) * .2, ARD=True) # k = GPy.kern.rbf(input_dim, .5, 2., ARD=0) + GPy.kern.rbf(input_dim, .3, .2, ARD=0) # k = GPy.kern.rbf(input_dim, .5, _np.ones(input_dim) * 2., ARD=True) + GPy.kern.linear(input_dim, _np.ones(input_dim) * .2, ARD=True) - m = 
GPy.models.BayesianGPLVM(lik, input_dim, kernel=k, num_inducing=num_inducing)
+    m = GPy.models.BayesianGPLVM(Y, input_dim, kernel=k, num_inducing=num_inducing)
     #===========================================================================
     # randomly obstruct data with percentage p
     p = .8
     Y_obstruct = Y.copy()
     Y_obstruct[_np.random.uniform(size=(Y.shape)) < p] = _np.nan
     #===========================================================================
-    m2 = GPy.models.BayesianGPLVMWithMissingData(Y_obstruct, input_dim, kernel=k, num_inducing=num_inducing)
+    #m2 = GPy.models.BayesianGPLVMWithMissingData(Y_obstruct, input_dim, kernel=k, num_inducing=num_inducing)
     m.lengthscales = lengthscales

     if plot:
         import matplotlib.pyplot as pb
         m.plot()
         pb.title('PCA initialisation')
-        m2.plot()
-        pb.title('PCA initialisation')
+        #m2.plot()
+        #pb.title('PCA initialisation')

     if optimize:
         m.optimize('scg', messages=verbose)
-        m2.optimize('scg', messages=verbose)
+        #m2.optimize('scg', messages=verbose)

     if plot:
         m.plot()
         pb.title('After optimisation')
-        m2.plot()
-        pb.title('After optimisation')
+        #m2.plot()
+        #pb.title('After optimisation')

-    return m, m2
+    return m

 def gplvm_oil_100(optimize=True, verbose=1, plot=True):
     import GPy
@@ -264,16 +264,16 @@ def bgplvm_simulation(optimize=True, verbose=1,
     D1, D2, D3, N, num_inducing, Q = 15, 5, 8, 30, 3, 10
     _, _, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim)
     Y = Ylist[0]
-    k = kern.linear(Q, ARD=True) + kern.bias(Q, _np.exp(-2)) + kern.white(Q, _np.exp(-2)) # + kern.bias(Q)
+    k = kern.linear(Q, ARD=True)# + kern.white(Q, _np.exp(-2)) # + kern.bias(Q)
     m = BayesianGPLVM(Y, Q, init="PCA", num_inducing=num_inducing, kernel=k)
-    m['noise'] = Y.var() / 100.
+    m.Gaussian_noise = Y.var() / 100.

     if optimize:
         print "Optimizing model:"
         m.optimize('scg', messages=verbose, max_iters=max_iters, gtol=.05)
     if plot:
-        m.plot_X_1d("BGPLVM Latent Space 1D")
+        m.q.plot("BGPLVM Latent Space 1D")
         m.kern.plot_ARD('BGPLVM Simulation ARD Parameters')
     return m

diff --git a/GPy/examples/non_gaussian.py b/GPy/examples/non_gaussian.py
index bda80137..23122691 100644
--- a/GPy/examples/non_gaussian.py
+++ b/GPy/examples/non_gaussian.py
@@ -37,39 +37,43 @@ def student_t_approx(optimize=True, plot=True):
     # Kernel object
     kernel1 = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
-    kernel2 = kernel1.copy()
-    kernel3 = kernel1.copy()
-    kernel4 = kernel1.copy()
+    kernel2 = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
+    kernel3 = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
+    kernel4 = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])

     #Gaussian GP model on clean data
     m1 = GPy.models.GPRegression(X, Y.copy(), kernel=kernel1)
     # optimize
     m1.ensure_default_constraints()
-    m1.constrain_fixed('white', 1e-5)
+    m1['white'] = 1e-5
+    m1['white'].constrain_fixed()
     m1.randomize()

     #Gaussian GP model on corrupt data
     m2 = GPy.models.GPRegression(X, Yc.copy(), kernel=kernel2)
     m2.ensure_default_constraints()
-    m2.constrain_fixed('white', 1e-5)
+    m2['white'] = 1e-5
+    m2['white'].constrain_fixed()
     m2.randomize()

     #Student t GP model on clean data
-    t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=edited_real_sd)
-    stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution)
-    m3 = GPy.models.GPRegression(X, Y.copy(), kernel3, likelihood=stu_t_likelihood)
+    t_distribution = GPy.likelihoods.StudentT(deg_free=deg_free, sigma2=edited_real_sd)
+    laplace_inf = GPy.inference.latent_function_inference.LaplaceInference()
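Note: in isolation, the new construction pattern for non-Gaussian likelihoods used in this example reads as below; the sketch is assembled only from calls appearing in this diff, and the data, deg_free and sigma2 values are placeholders:

    import numpy as np
    import GPy

    X = np.linspace(0., 1., 40)[:, None]
    Y = np.sin(6. * X) + 0.1 * np.random.randn(40, 1)

    kernel = GPy.kern.rbf(1) + GPy.kern.white(1)
    t_distribution = GPy.likelihoods.StudentT(deg_free=5, sigma2=0.1)
    laplace_inf = GPy.inference.latent_function_inference.LaplaceInference()

    # likelihood and inference method are passed to the GP core directly,
    # instead of wrapping the noise model in GPy.likelihoods.Laplace
    m = GPy.core.GP(X, Y, kernel, likelihood=t_distribution, inference_method=laplace_inf)
    m.ensure_default_constraints()
    m['t_noise'].constrain_bounded(1e-6, 10.)
    m.optimize()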
+ m3 = GPy.core.GP(X, Y.copy(), kernel3, likelihood=t_distribution, inference_method=laplace_inf) m3.ensure_default_constraints() - m3.constrain_bounded('t_noise', 1e-6, 10.) - m3.constrain_fixed('white', 1e-5) + m3['t_noise'].constrain_bounded(1e-6, 10.) + m3['white'] = 1e-5 + m3['white'].constrain_fixed() m3.randomize() #Student t GP model on corrupt data - t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=edited_real_sd) - corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution) - m4 = GPy.models.GPRegression(X, Yc.copy(), kernel4, likelihood=corrupt_stu_t_likelihood) + t_distribution = GPy.likelihoods.StudentT(deg_free=deg_free, sigma2=edited_real_sd) + laplace_inf = GPy.inference.latent_function_inference.LaplaceInference() + m4 = GPy.core.GP(X, Yc.copy(), kernel4, likelihood=t_distribution, inference_method=laplace_inf) m4.ensure_default_constraints() - m4.constrain_bounded('t_noise', 1e-6, 10.) - m4.constrain_fixed('white', 1e-5) + m4['t_noise'].constrain_bounded(1e-6, 10.) + m4['white'] = 1e-5 + m4['white'].constrain_fixed() m4.randomize() if optimize: diff --git a/GPy/examples/regression.py b/GPy/examples/regression.py index 65a50f0e..4dea1342 100644 --- a/GPy/examples/regression.py +++ b/GPy/examples/regression.py @@ -281,11 +281,12 @@ def toy_poisson_rbf_1d_laplace(optimize=True, plot=True): f_true = np.random.multivariate_normal(np.zeros(x_len), GPy.kern.rbf(1).K(X)) Y = np.array([np.random.poisson(np.exp(f)) for f in f_true])[:,None] - noise_model = GPy.likelihoods.poisson() - likelihood = GPy.likelihoods.Laplace(Y,noise_model) + kern = GPy.kern.rbf(1) + poisson_lik = GPy.likelihoods.Poisson() + laplace_inf = GPy.inference.latent_function_inference.LaplaceInference() # create simple GP Model - m = GPy.models.GPRegression(X, Y, likelihood=likelihood) + m = GPy.core.GP(X, Y, kernel=kern, likelihood=poisson_lik, inference_method=laplace_inf) if optimize: m.optimize(optimizer) diff --git a/GPy/inference/latent_function_inference/fitc.py b/GPy/inference/latent_function_inference/fitc.py index ac725294..82d0973f 100644 --- a/GPy/inference/latent_function_inference/fitc.py +++ b/GPy/inference/latent_function_inference/fitc.py @@ -97,10 +97,10 @@ class VarDTC(object): K_pp_K = np.dot(Kmmipsi1[:,i:(i+1)],Kmmipsi1[:,i:(i+1)].T) _dpsi1 = (-V_n**2 - alpha_n + 2.*gamma_k - gamma_n**2) * Kmmipsi1.T[i:(i+1),:] _dKmm = .5*(V_n**2 + alpha_n + gamma_n**2 - 2.*gamma_k) * K_pp_K #Diag_dD_dKmm - self._dpsi1_dtheta += self.kern.dK_dtheta(_dpsi1,self.X[i:i+1,:],self.Z) - self._dKmm_dtheta += self.kern.dK_dtheta(_dKmm,self.Z) - self._dKmm_dX += self.kern.dK_dX(_dKmm ,self.Z) - self._dpsi1_dX += self.kern.dK_dX(_dpsi1.T,self.Z,self.X[i:i+1,:]) + self._dpsi1_dtheta += self.kern._param_grad_helper(_dpsi1,self.X[i:i+1,:],self.Z) + self._dKmm_dtheta += self.kern._param_grad_helper(_dKmm,self.Z) + self._dKmm_dX += self.kern.gradients_X(_dKmm ,self.Z) + self._dpsi1_dX += self.kern.gradients_X(_dpsi1.T,self.Z,self.X[i:i+1,:]) # the partial derivative vector for the likelihood if self.likelihood.num_params == 0: @@ -144,15 +144,15 @@ class VarDTC(object): def dL_dtheta(self): dL_dtheta = self.kern.dKdiag_dtheta(self._dL_dpsi0,self.X) - dL_dtheta += self.kern.dK_dtheta(self._dL_dpsi1,self.X,self.Z) - dL_dtheta += self.kern.dK_dtheta(self._dL_dKmm,X=self.Z) + dL_dtheta += self.kern._param_grad_helper(self._dL_dpsi1,self.X,self.Z) + dL_dtheta += self.kern._param_grad_helper(self._dL_dKmm,X=self.Z) dL_dtheta += self._dKmm_dtheta dL_dtheta += 
self._dpsi1_dtheta return dL_dtheta def dL_dZ(self): - dL_dZ = self.kern.dK_dX(self._dL_dpsi1.T,self.Z,self.X) - dL_dZ += self.kern.dK_dX(self._dL_dKmm,X=self.Z) + dL_dZ = self.kern.gradients_X(self._dL_dpsi1.T,self.Z,self.X) + dL_dZ += self.kern.gradients_X(self._dL_dKmm,X=self.Z) dL_dZ += self._dpsi1_dX dL_dZ += self._dKmm_dX return dL_dZ diff --git a/GPy/inference/latent_function_inference/laplace.py b/GPy/inference/latent_function_inference/laplace.py index 6e252406..bc81a86a 100644 --- a/GPy/inference/latent_function_inference/laplace.py +++ b/GPy/inference/latent_function_inference/laplace.py @@ -11,9 +11,8 @@ #http://gaussianprocess.org/gpml/code. import numpy as np -from ...util.linalg import mdot, jitchol, pddet, dpotrs, dtrtrs, dpotri, symmetrify +from ...util.linalg import mdot, jitchol, dpotrs, dtrtrs, dpotri, symmetrify from ...util.misc import param_to_array -from functools import partial as partial_func from posterior import Posterior import warnings from scipy import optimize @@ -32,7 +31,7 @@ class LaplaceInference(object): self._mode_finding_tolerance = 1e-7 self._mode_finding_max_iter = 40 self.bad_fhat = True - + self._previous_Ki_fhat = None def inference(self, kern, X, likelihood, Y, Y_metadata=None): """ @@ -50,16 +49,17 @@ class LaplaceInference(object): Ki_f_init = np.zeros_like(Y) else: Ki_f_init = self._previous_Ki_fhat + f_hat, Ki_fhat = self.rasm_mode(K, Y, likelihood, Ki_f_init, Y_metadata=Y_metadata) #Compute hessian and other variables at mode - log_marginal, Ki_W_i, K_Wi_i, dL_dK, woodbury_vector = self.mode_computations(f_hat, Ki_fhat, K, Y, likelihood, Y_metadata) + log_marginal, woodbury_vector, woodbury_inv, dL_dK, dL_dthetaL = self.mode_computations(f_hat, Ki_fhat, K, Y, likelihood, kern, Y_metadata) - #likelihood.gradient = self.likelihood_gradients() kern.update_gradients_full(dL_dK, X) + likelihood.update_gradients(dL_dthetaL) self._previous_Ki_fhat = Ki_fhat.copy() - return Posterior(woodbury_vector=woodbury_vector, woodbury_inv = K_Wi_i, K=K), log_marginal, {'dL_dK':dL_dK} + return Posterior(woodbury_vector=woodbury_vector, woodbury_inv=woodbury_inv, K=K), log_marginal, {'dL_dK':dL_dK} def rasm_mode(self, K, Y, likelihood, Ki_f_init, Y_metadata=None): """ @@ -84,7 +84,6 @@ class LaplaceInference(object): Ki_f = Ki_f_init.copy() f = np.dot(K, Ki_f) - #define the objective function (to be maximised) def obj(Ki_f, f): return -0.5*np.dot(Ki_f.flatten(), f.flatten()) + likelihood.logpdf(f, Y, extra_data=Y_metadata) @@ -133,13 +132,15 @@ class LaplaceInference(object): return f, Ki_f - - def mode_computations(self, f_hat, Ki_f, K, Y, likelihood, Y_metadata): + def mode_computations(self, f_hat, Ki_f, K, Y, likelihood, kern, Y_metadata): """ At the mode, compute the hessian and effective covariance matrix. 
        returns: logZ : approximation to the marginal likelihood
-                 Cov : the approximation to the covariance matrix
+                 woodbury_vector : variable required for calculating the approximation to the covariance matrix
+                 woodbury_inv : variable required for calculating the approximation to the covariance matrix
+                 dL_dK : array of derivatives (1 x num_kernel_params)
+                 dL_dthetaL : array of derivatives (1 x num_likelihood_params)
        """
        #At this point get the hessian matrix (or vector as W is diagonal)
        W = -likelihood.d2logpdf_df2(f_hat, Y, extra_data=Y_metadata)
@@ -153,45 +154,54 @@ class LaplaceInference(object):
        #compute the log marginal
        log_marginal = -0.5*np.dot(Ki_f.flatten(), f_hat.flatten()) + likelihood.logpdf(f_hat, Y, extra_data=Y_metadata) - np.sum(np.log(np.diag(L)))

-        #compute dL_dK
-        explicit_part = 0.5*(np.dot(Ki_f, Ki_f.T) - K_Wi_i)
-
-        #Implicit
-        d3lik_d3fhat = likelihood.d3logpdf_df3(f_hat, Y, extra_data=Y_metadata)
-        dL_dfhat = 0.5*(np.diag(Ki_W_i)[:, None]*d3lik_d3fhat) #why isn't this -0.5? s2 in R&W p126 line 9.
+        #Compute vital matrices for derivatives
+        dW_df = -likelihood.d3logpdf_df3(f_hat, Y, extra_data=Y_metadata) # -d3lik_d3fhat
        woodbury_vector = likelihood.dlogpdf_df(f_hat, Y, extra_data=Y_metadata)
-        implicit_part = np.dot(woodbury_vector, dL_dfhat.T).dot(np.eye(Y.shape[0]) - np.dot(K, K_Wi_i))
+        dL_dfhat = -0.5*(np.diag(Ki_W_i)[:, None]*dW_df) #equals 0.5*diag(Ki_W_i)*d3logpdf_df3 since dW_df = -d3logpdf_df3, i.e. s2 in R&W p126 line 9
+        #BiK, _ = dpotrs(L, K, lower=1)
+        #dL_dfhat = 0.5*np.diag(BiK)[:, None]*dW_df
+        I_KW_i = np.eye(Y.shape[0]) - np.dot(K, K_Wi_i)

-        dL_dK = explicit_part + implicit_part
-
-        return log_marginal, Ki_W_i, K_Wi_i, dL_dK, woodbury_vector
-
-
-    def likelihood_gradients(self):
-        """
-        Gradients with respect to likelihood parameters (dL_dthetaL)
-
-        :rtype: array of derivatives (1 x num_likelihood_params)
-        """
-        dL_dfhat, I_KW_i = self._shared_gradients_components()
-        dlik_dthetaL, dlik_grad_dthetaL, dlik_hess_dthetaL = likelihood._laplace_gradients(self.f_hat, self.data, extra_data=self.extra_data)
-
-        num_params = len(self._get_param_names())
-        # make space for one derivative for each likelihood parameter
-        dL_dthetaL = np.zeros(num_params)
-        for thetaL_i in range(num_params):
+        ####################
+        #compute dL_dK#
+        ####################
+        if kern.size > 0 and not kern.is_fixed:
            #Explicit
-            dL_dthetaL_exp = ( np.sum(dlik_dthetaL[:, thetaL_i])
-                    #- 0.5*np.trace(mdot(self.Ki_W_i, (self.K, np.diagflat(dlik_hess_dthetaL[thetaL_i]))))
-                    + np.dot(0.5*np.diag(self.Ki_W_i)[:,None].T, dlik_hess_dthetaL[:, thetaL_i])
-                    )
+            explicit_part = 0.5*(np.dot(Ki_f, Ki_f.T) - K_Wi_i)
            #Implicit
-            dfhat_dthetaL = mdot(I_KW_i, self.K, dlik_grad_dthetaL[:, thetaL_i])
-            dL_dthetaL_imp = np.dot(dL_dfhat, dfhat_dthetaL)
-            dL_dthetaL[thetaL_i] = dL_dthetaL_exp + dL_dthetaL_imp
+            implicit_part = np.dot(woodbury_vector, dL_dfhat.T).dot(I_KW_i)

-        return dL_dthetaL
+            dL_dK = explicit_part + implicit_part
+        else:
+            dL_dK = np.zeros(K.shape)
+
+        ####################
+        #compute dL_dthetaL#
+        ####################
+        if likelihood.size > 0 and not likelihood.is_fixed:
+            dlik_dthetaL, dlik_grad_dthetaL, dlik_hess_dthetaL = likelihood._laplace_gradients(f_hat, Y, extra_data=Y_metadata)
+
+            num_params = likelihood.size
+            # make space for one derivative for each likelihood parameter
+            dL_dthetaL = np.zeros(num_params)
+            for thetaL_i in range(num_params):
+                #Explicit
+                dL_dthetaL_exp = ( np.sum(dlik_dthetaL[thetaL_i])
+                                   # The + comes from the fact that dlik_hess_dthetaL == -dW_dthetaL
+                                   + 0.5*np.sum(np.diag(Ki_W_i).flatten()*dlik_hess_dthetaL[:, thetaL_i].flatten())
+                                 )
+
+                #Implicit
+                dfhat_dthetaL = mdot(I_KW_i, K, dlik_grad_dthetaL[:, thetaL_i])
+                #dfhat_dthetaL = mdot(Ki_W_i, dlik_grad_dthetaL[:, thetaL_i])
+                dL_dthetaL_imp = np.dot(dL_dfhat.T, dfhat_dthetaL)
+                dL_dthetaL[thetaL_i] = dL_dthetaL_exp + dL_dthetaL_imp
+
+        else:
+            dL_dthetaL = np.zeros(likelihood.size)
+
+        return log_marginal, woodbury_vector, K_Wi_i, dL_dK, dL_dthetaL

    def _compute_B_statistics(self, K, W, log_concave):
        """
@@ -219,12 +229,11 @@ class LaplaceInference(object):
        LiW12, _ = dtrtrs(L, np.diagflat(W_12), lower=1, trans=0)
        K_Wi_i = np.dot(LiW12.T, LiW12) # R = W12BiW12, in R&W p 126, eq 5.25
-        #here's a better way to compute the required matrix. 
+        #here's a better way to compute the required matrix.
        # you could do the mode finding with a backsub, instead of a dot...
        #L2 = L/W_12
        #K_Wi_i_2 , _= dpotri(L2)
        #symmetrify(K_Wi_i_2)
-
        return K_Wi_i, L, LiW12

diff --git a/GPy/inference/latent_function_inference/varDTC.py b/GPy/inference/latent_function_inference/varDTC.py
index 6d46ad14..07ae17c5 100644
--- a/GPy/inference/latent_function_inference/varDTC.py
+++ b/GPy/inference/latent_function_inference/varDTC.py
@@ -80,7 +80,7 @@ class VarDTC(object):
            # no backsubstitution because of bound explosion on tr(A) if not...
            LmInv, _ = dtrtri(Lm, lower=1)
            A = LmInv.T.dot(psi2_beta.dot(LmInv))
-            print A.sum()
+            #print A.sum()
        else:
            if het_noise:
                tmp = psi1 * (np.sqrt(beta.reshape(num_data, 1)))

diff --git a/GPy/kern/kern.py b/GPy/kern/kern.py
index 6a8bc745..98945c2d 100644
--- a/GPy/kern/kern.py
+++ b/GPy/kern/kern.py
@@ -8,6 +8,7 @@ from parts.prod import Prod as prod
 from parts.linear import Linear
 from parts.kernpart import Kernpart
 from ..core.parameterization import Parameterized
+from GPy.core.parameterization.param import Param

 class kern(Parameterized):
     def __init__(self, input_dim, parts=[], input_slices=None):
@@ -84,7 +85,7 @@ class kern(Parameterized):
#        represents the gradient in the transformed space (i.e. that given by
#        get_params_transformed())
#
-#        :param g: the gradient vector for the current model, usually created by dK_dtheta
+#        :param g: the gradient vector for the current model, usually created by _param_grad_helper
#        """
#        x = self._get_params()
#        [np.place(g, index, g[index] * constraint.gradfactor(x[index]))
@@ -160,6 +161,9 @@ class kern(Parameterized):
         return newkern

+    def __call__(self, X, X2=None):
+        return self.K(X, X2)
+
     def __mul__(self, other):
         """ Here we overload the '*' operator. See self.prod for more information"""
         return self.prod(other)
@@ -291,7 +295,7 @@ class kern(Parameterized):
     def update_gradients_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z):
         [p.update_gradients_variational(dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z) for p in self._parameters_]

-    def dK_dtheta(self, dL_dK, X, X2=None):
+    def _param_grad_helper(self, dL_dK, X, X2=None):
         """
         Compute the gradient of the covariance function with respect to the parameters.
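A quick numerical note on _compute_B_statistics in the laplace.py hunk above:
with B = I + W^{1/2} K W^{1/2}, the matrix it returns, W^{1/2} B^{-1} W^{1/2},
equals (K + W^{-1})^{-1} (R&W p126, eq 5.25), which is what ends up stored as
the posterior's woodbury_inv. A self-contained check -- an illustrative sketch
only, none of these names are GPy API:

    import numpy as np
    rng = np.random.RandomState(0)
    n = 5
    G = rng.randn(n, n)
    K = G.dot(G.T) + n * np.eye(n)      # a positive definite "kernel" matrix
    W = np.diag(rng.rand(n) + 0.1)      # diagonal negative log-lik Hessian
    W12 = np.sqrt(W)                    # elementwise sqrt is fine: W is diagonal
    B = np.eye(n) + W12.dot(K).dot(W12)
    K_Wi_i = W12.dot(np.linalg.solve(B, W12))   # W^1/2 B^-1 W^1/2
    assert np.allclose(K_Wi_i, np.linalg.inv(K + np.linalg.inv(W)))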
@@ -307,9 +311,9 @@ class kern(Parameterized): assert X.shape[1] == self.input_dim target = np.zeros(self.size) if X2 is None: - [p.dK_dtheta(dL_dK, X[:, i_s], None, target[ps]) for p, i_s, ps, in zip(self._parameters_, self.input_slices, self._param_slices_)] + [p._param_grad_helper(dL_dK, X[:, i_s], None, target[ps]) for p, i_s, ps, in zip(self._parameters_, self.input_slices, self._param_slices_)] else: - [p.dK_dtheta(dL_dK, X[:, i_s], X2[:, i_s], target[ps]) for p, i_s, ps, in zip(self._parameters_, self.input_slices, self._param_slices_)] + [p._param_grad_helper(dL_dK, X[:, i_s], X2[:, i_s], target[ps]) for p, i_s, ps, in zip(self._parameters_, self.input_slices, self._param_slices_)] return self._transform_gradients(target) @@ -481,6 +485,7 @@ from GPy.core.model import Model class Kern_check_model(Model): """This is a dummy model class used as a base class for checking that the gradients of a given kernel are implemented correctly. It enables checkgradient() to be called independently on a kernel.""" def __init__(self, kernel=None, dL_dK=None, X=None, X2=None): + Model.__init__(self, 'kernel_test_model') num_samples = 20 num_samples2 = 10 if kernel==None: @@ -492,14 +497,12 @@ class Kern_check_model(Model): dL_dK = np.ones((X.shape[0], X.shape[0])) else: dL_dK = np.ones((X.shape[0], X2.shape[0])) - + self.kernel=kernel + self.add_parameter(kernel) self.X = X self.X2 = X2 self.dL_dK = dL_dK - #self.constrained_indices=[] - #self.constraints=[] - Model.__init__(self, 'kernel_test_model') def is_positive_definite(self): v = np.linalg.eig(self.kernel.K(self.X))[0] @@ -508,15 +511,6 @@ class Kern_check_model(Model): else: return True - def _get_params(self): - return self.kernel._get_params() - - def _get_param_names(self): - return self.kernel._get_param_names() - - def _set_params(self, x): - self.kernel._set_params(x) - def log_likelihood(self): return (self.dL_dK*self.kernel.K(self.X, self.X2)).sum() @@ -529,7 +523,7 @@ class Kern_check_dK_dtheta(Kern_check_model): Kern_check_model.__init__(self,kernel=kernel,dL_dK=dL_dK, X=X, X2=X2) def _log_likelihood_gradients(self): - return self.kernel.dK_dtheta(self.dL_dK, self.X, self.X2) + return self.kernel._param_grad_helper(self.dL_dK, self.X, self.X2) class Kern_check_dKdiag_dtheta(Kern_check_model): """This class allows gradient checks of the gradient of the diagonal of a kernel with respect to the parameters.""" @@ -537,6 +531,8 @@ class Kern_check_dKdiag_dtheta(Kern_check_model): Kern_check_model.__init__(self,kernel=kernel,dL_dK=dL_dK, X=X, X2=None) if dL_dK==None: self.dL_dK = np.ones((self.X.shape[0])) + def parameters_changed(self): + self.kernel.update_gradients_full(self.dL_dK, self.X) def log_likelihood(self): return (self.dL_dK*self.kernel.Kdiag(self.X)).sum() @@ -548,41 +544,25 @@ class Kern_check_dK_dX(Kern_check_model): """This class allows gradient checks for the gradient of a kernel with respect to X. 
""" def __init__(self, kernel=None, dL_dK=None, X=None, X2=None): Kern_check_model.__init__(self,kernel=kernel,dL_dK=dL_dK, X=X, X2=X2) - + self.remove_parameter(kernel) + self.X = Param('X', self.X) + self.add_parameter(self.X) def _log_likelihood_gradients(self): - return self.kernel.dK_dX(self.dL_dK, self.X, self.X2).flatten() + return self.kernel.gradients_X(self.dL_dK, self.X, self.X2).flatten() - def _get_param_names(self): - return ['X_' +str(i) + ','+str(j) for j in range(self.X.shape[1]) for i in range(self.X.shape[0])] - - def _get_params(self): - return self.X.flatten() - - def _set_params(self, x): - self.X=x.reshape(self.X.shape) - -class Kern_check_dKdiag_dX(Kern_check_model): +class Kern_check_dKdiag_dX(Kern_check_dK_dX): """This class allows gradient checks for the gradient of a kernel diagonal with respect to X. """ def __init__(self, kernel=None, dL_dK=None, X=None, X2=None): - Kern_check_model.__init__(self,kernel=kernel,dL_dK=dL_dK, X=X, X2=None) + Kern_check_dK_dX.__init__(self,kernel=kernel,dL_dK=dL_dK, X=X, X2=None) if dL_dK==None: self.dL_dK = np.ones((self.X.shape[0])) - + def log_likelihood(self): return (self.dL_dK*self.kernel.Kdiag(self.X)).sum() def _log_likelihood_gradients(self): return self.kernel.dKdiag_dX(self.dL_dK, self.X).flatten() - def _get_param_names(self): - return ['X_' +str(i) + ','+str(j) for j in range(self.X.shape[1]) for i in range(self.X.shape[0])] - - def _get_params(self): - return self.X.flatten() - - def _set_params(self, x): - self.X=x.reshape(self.X.shape) - def kern_test(kern, X=None, X2=None, output_ind=None, verbose=False): """ This function runs on kernels to check the correctness of their @@ -657,7 +637,7 @@ def kern_test(kern, X=None, X2=None, output_ind=None, verbose=False): except NotImplementedError: result=True if verbose: - print("dK_dX not implemented for " + kern.name) + print("gradients_X not implemented for " + kern.name) if result and verbose: print("Check passed.") if not result: @@ -673,7 +653,7 @@ def kern_test(kern, X=None, X2=None, output_ind=None, verbose=False): except NotImplementedError: result=True if verbose: - print("dK_dX not implemented for " + kern.name) + print("gradients_X not implemented for " + kern.name) if result and verbose: print("Check passed.") if not result: @@ -689,7 +669,7 @@ def kern_test(kern, X=None, X2=None, output_ind=None, verbose=False): except NotImplementedError: result=True if verbose: - print("dK_dX not implemented for " + kern.name) + print("gradients_X not implemented for " + kern.name) if result and verbose: print("Check passed.") if not result: diff --git a/GPy/kern/parts/Brownian.py b/GPy/kern/parts/Brownian.py index bdfa0df5..488e9b7a 100644 --- a/GPy/kern/parts/Brownian.py +++ b/GPy/kern/parts/Brownian.py @@ -43,7 +43,7 @@ class Brownian(Kernpart): def Kdiag(self,X,target): target += self.variance*X.flatten() - def dK_dtheta(self,dL_dK,X,X2,target): + def _param_grad_helper(self,dL_dK,X,X2,target): if X2 is None: X2 = X target += np.sum(np.fmin(X,X2.T)*dL_dK) @@ -51,7 +51,7 @@ class Brownian(Kernpart): def dKdiag_dtheta(self,dL_dKdiag,X,target): target += np.dot(X.flatten(), dL_dKdiag) - def dK_dX(self,dL_dK,X,X2,target): + def gradients_X(self,dL_dK,X,X2,target): raise NotImplementedError, "TODO" #target += self.variance #target -= self.variance*theta(X-X2.T) diff --git a/GPy/kern/parts/Matern32.py b/GPy/kern/parts/Matern32.py index 40da79f0..08fa452c 100644 --- a/GPy/kern/parts/Matern32.py +++ b/GPy/kern/parts/Matern32.py @@ -74,7 +74,7 @@ class Matern32(Kernpart): 
"""Compute the diagonal of the covariance matrix associated to X.""" np.add(target, self.variance, target) - def dK_dtheta(self, dL_dK, X, X2, target): + def _param_grad_helper(self, dL_dK, X, X2, target): """derivative of the covariance matrix with respect to the parameters.""" if X2 is None: X2 = X dist = np.sqrt(np.sum(np.square((X[:, None, :] - X2[None, :, :]) / self.lengthscale), -1)) @@ -96,7 +96,7 @@ class Matern32(Kernpart): """derivative of the diagonal of the covariance matrix with respect to the parameters.""" target[0] += np.sum(dL_dKdiag) - def dK_dX(self, dL_dK, X, X2, target): + def gradients_X(self, dL_dK, X, X2, target): """derivative of the covariance matrix with respect to X.""" if X2 is None: dist = np.sqrt(np.sum(np.square((X[:, None, :] - X[None, :, :]) / self.lengthscale), -1))[:, :, None] @@ -105,8 +105,8 @@ class Matern32(Kernpart): else: dist = np.sqrt(np.sum(np.square((X[:, None, :] - X2[None, :, :]) / self.lengthscale), -1))[:, :, None] ddist_dX = (X[:, None, :] - X2[None, :, :]) / self.lengthscale ** 2 / np.where(dist != 0., dist, np.inf) - dK_dX = -np.transpose(3 * self.variance * dist * np.exp(-np.sqrt(3) * dist) * ddist_dX, (1, 0, 2)) - target += np.sum(dK_dX * dL_dK.T[:, :, None], 0) + gradients_X = -np.transpose(3 * self.variance * dist * np.exp(-np.sqrt(3) * dist) * ddist_dX, (1, 0, 2)) + target += np.sum(gradients_X * dL_dK.T[:, :, None], 0) def dKdiag_dX(self, dL_dKdiag, X, target): pass diff --git a/GPy/kern/parts/Matern52.py b/GPy/kern/parts/Matern52.py index 4bf4a1a8..7d36254c 100644 --- a/GPy/kern/parts/Matern52.py +++ b/GPy/kern/parts/Matern52.py @@ -74,7 +74,7 @@ class Matern52(Kernpart): """Compute the diagonal of the covariance matrix associated to X.""" np.add(target,self.variance,target) - def dK_dtheta(self,dL_dK,X,X2,target): + def _param_grad_helper(self,dL_dK,X,X2,target): """derivative of the covariance matrix with respect to the parameters.""" if X2 is None: X2 = X dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscale),-1)) @@ -96,7 +96,7 @@ class Matern52(Kernpart): """derivative of the diagonal of the covariance matrix with respect to the parameters.""" target[0] += np.sum(dL_dKdiag) - def dK_dX(self,dL_dK,X,X2,target): + def gradients_X(self,dL_dK,X,X2,target): """derivative of the covariance matrix with respect to X.""" if X2 is None: dist = np.sqrt(np.sum(np.square((X[:,None,:]-X[None,:,:])/self.lengthscale),-1))[:,:,None] @@ -104,8 +104,8 @@ class Matern52(Kernpart): else: dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscale),-1))[:,:,None] ddist_dX = (X[:,None,:]-X2[None,:,:])/self.lengthscale**2/np.where(dist!=0.,dist,np.inf) - dK_dX = - np.transpose(self.variance*5./3*dist*(1+np.sqrt(5)*dist)*np.exp(-np.sqrt(5)*dist)*ddist_dX,(1,0,2)) - target += np.sum(dK_dX*dL_dK.T[:,:,None],0) + gradients_X = - np.transpose(self.variance*5./3*dist*(1+np.sqrt(5)*dist)*np.exp(-np.sqrt(5)*dist)*ddist_dX,(1,0,2)) + target += np.sum(gradients_X*dL_dK.T[:,:,None],0) def dKdiag_dX(self,dL_dKdiag,X,target): pass diff --git a/GPy/kern/parts/ODE_1.py b/GPy/kern/parts/ODE_1.py index 416278e3..15faf108 100644 --- a/GPy/kern/parts/ODE_1.py +++ b/GPy/kern/parts/ODE_1.py @@ -90,7 +90,7 @@ class ODE_1(Kernpart): np.add(self.varianceU*self.varianceY*(k1+k2+k3), target, target) - def dK_dtheta(self, dL_dK, X, X2, target): + def _param_grad_helper(self, dL_dK, X, X2, target): """derivative of the covariance matrix with respect to the parameters.""" if X2 is None: X2 = X dist = np.abs(X - X2.T) diff --git 
a/GPy/kern/parts/eq_ode1.py b/GPy/kern/parts/eq_ode1.py index 70e3c49d..bf0ca7e4 100644 --- a/GPy/kern/parts/eq_ode1.py +++ b/GPy/kern/parts/eq_ode1.py @@ -124,7 +124,7 @@ class Eq_ode1(Kernpart): #target += np.diag(self.B)[np.asarray(index,dtype=np.int).flatten()] pass - def dK_dtheta(self,dL_dK,X,X2,target): + def _param_grad_helper(self,dL_dK,X,X2,target): # First extract times and indices. self._extract_t_indices(X, X2, dL_dK=dL_dK) @@ -193,7 +193,7 @@ class Eq_ode1(Kernpart): def dKdiag_dtheta(self,dL_dKdiag,index,target): pass - def dK_dX(self,dL_dK,X,X2,target): + def gradients_X(self,dL_dK,X,X2,target): pass def _extract_t_indices(self, X, X2=None, dL_dK=None): diff --git a/GPy/kern/parts/exponential.py b/GPy/kern/parts/exponential.py index f568b66b..372d4d9b 100644 --- a/GPy/kern/parts/exponential.py +++ b/GPy/kern/parts/exponential.py @@ -75,7 +75,7 @@ class Exponential(Kernpart): """Compute the diagonal of the covariance matrix associated to X.""" np.add(target, self.variance, target) - def dK_dtheta(self, dL_dK, X, X2, target): + def _param_grad_helper(self, dL_dK, X, X2, target): """derivative of the covariance matrix with respect to the parameters.""" if X2 is None: X2 = X dist = np.sqrt(np.sum(np.square((X[:, None, :] - X2[None, :, :]) / self.lengthscale), -1)) @@ -95,13 +95,13 @@ class Exponential(Kernpart): # NB: derivative of diagonal elements wrt lengthscale is 0 target[0] += np.sum(dL_dKdiag) - def dK_dX(self, dL_dK, X, X2, target): + def gradients_X(self, dL_dK, X, X2, target): """derivative of the covariance matrix with respect to X.""" if X2 is None: X2 = X dist = np.sqrt(np.sum(np.square((X[:, None, :] - X2[None, :, :]) / self.lengthscale), -1))[:, :, None] ddist_dX = (X[:, None, :] - X2[None, :, :]) / self.lengthscale ** 2 / np.where(dist != 0., dist, np.inf) - dK_dX = -np.transpose(self.variance * np.exp(-dist) * ddist_dX, (1, 0, 2)) - target += np.sum(dK_dX * dL_dK.T[:, :, None], 0) + gradients_X = -np.transpose(self.variance * np.exp(-dist) * ddist_dX, (1, 0, 2)) + target += np.sum(gradients_X * dL_dK.T[:, :, None], 0) def dKdiag_dX(self, dL_dKdiag, X, target): pass diff --git a/GPy/kern/parts/finite_dimensional.py b/GPy/kern/parts/finite_dimensional.py index 6cc2325f..9e54fb34 100644 --- a/GPy/kern/parts/finite_dimensional.py +++ b/GPy/kern/parts/finite_dimensional.py @@ -50,7 +50,7 @@ class FiniteDimensional(Kernpart): def Kdiag(self,X,target): product = np.diag(self.K(X, X)) np.add(target,product,target) - def dK_dtheta(self,X,X2,target): + def _param_grad_helper(self,X,X2,target): """Return shape is NxMx(Ntheta)""" if X2 is None: X2 = X FX = np.column_stack([f(X) for f in self.F]) diff --git a/GPy/kern/parts/fixed.py b/GPy/kern/parts/fixed.py index 67baea91..680f0b14 100644 --- a/GPy/kern/parts/fixed.py +++ b/GPy/kern/parts/fixed.py @@ -31,10 +31,10 @@ class Fixed(Kernpart): def K(self, X, X2, target): target += self.variance * self.fixed_K - def dK_dtheta(self, partial, X, X2, target): + def _param_grad_helper(self, partial, X, X2, target): target += (partial * self.fixed_K).sum() - def dK_dX(self, partial, X, X2, target): + def gradients_X(self, partial, X, X2, target): pass def dKdiag_dX(self, partial, X, target): diff --git a/GPy/kern/parts/gibbs.py b/GPy/kern/parts/gibbs.py index f47144e1..68241245 100644 --- a/GPy/kern/parts/gibbs.py +++ b/GPy/kern/parts/gibbs.py @@ -85,7 +85,7 @@ class Gibbs(Kernpart): """Compute the diagonal of the covariance matrix for X.""" np.add(target, self.variance, target) - def dK_dtheta(self, dL_dK, X, X2, target): + def 
_param_grad_helper(self, dL_dK, X, X2, target): """Derivative of the covariance with respect to the parameters.""" self._K_computations(X, X2) self._dK_computations(dL_dK) @@ -97,7 +97,7 @@ class Gibbs(Kernpart): target+= np.hstack([(dL_dK*self._K_dvar).sum(), gmapping]) - def dK_dX(self, dL_dK, X, X2, target): + def gradients_X(self, dL_dK, X, X2, target): """Derivative of the covariance matrix with respect to X.""" # First account for gradients arising from presence of X in exponent. self._K_computations(X, X2) @@ -105,8 +105,8 @@ class Gibbs(Kernpart): _K_dist = 2*(X[:, None, :] - X[None, :, :]) else: _K_dist = X[:, None, :] - X2[None, :, :] # don't cache this in _K_co - dK_dX = (-2.*self.variance)*np.transpose((self._K_dvar/self._w2)[:, :, None]*_K_dist, (1, 0, 2)) - target += np.sum(dK_dX*dL_dK.T[:, :, None], 0) + gradients_X = (-2.*self.variance)*np.transpose((self._K_dvar/self._w2)[:, :, None]*_K_dist, (1, 0, 2)) + target += np.sum(gradients_X*dL_dK.T[:, :, None], 0) # Now account for gradients arising from presence of X in lengthscale. self._dK_computations(dL_dK) if X2 is None: diff --git a/GPy/kern/parts/hetero.py b/GPy/kern/parts/hetero.py index d3939563..507f6251 100644 --- a/GPy/kern/parts/hetero.py +++ b/GPy/kern/parts/hetero.py @@ -80,7 +80,7 @@ class Hetero(Kernpart): """Helper function for computing the diagonal elements of the covariance.""" return self.mapping.f(X).flatten()**2 - def dK_dtheta(self, dL_dK, X, X2, target): + def _param_grad_helper(self, dL_dK, X, X2, target): """Derivative of the covariance with respect to the parameters.""" if (X2 is None) or (X2 is X): dL_dKdiag = dL_dK.flat[::dL_dK.shape[0]+1] @@ -90,7 +90,7 @@ class Hetero(Kernpart): """Gradient of diagonal of covariance with respect to parameters.""" target += 2.*self.mapping.df_dtheta(dL_dKdiag[:, None]*self.mapping.f(X), X) - def dK_dX(self, dL_dK, X, X2, target): + def gradients_X(self, dL_dK, X, X2, target): """Derivative of the covariance matrix with respect to X.""" if X2==None or X2 is X: dL_dKdiag = dL_dK.flat[::dL_dK.shape[0]+1] diff --git a/GPy/kern/parts/hierarchical.py b/GPy/kern/parts/hierarchical.py index c629f6b9..3ca6b444 100644 --- a/GPy/kern/parts/hierarchical.py +++ b/GPy/kern/parts/hierarchical.py @@ -50,19 +50,19 @@ class Hierarchical(Kernpart): #X,slices = X[:,:-1],index_to_slices(X[:,-1]) #[[self.k.Kdiag(X[s],target[s]) for s in slices_i] for slices_i in slices] - def dK_dtheta(self,dL_dK,X,X2,target): + def _param_grad_helper(self,dL_dK,X,X2,target): X, X2, slices, slices2 = self._sort_slices(X,X2) - [[[[k.dK_dtheta(dL_dK[s,s2],X[s],X2[s2],target[p_start:p_stop]) for s in slices_i] for s2 in slices_j] for slices_i,slices_j in zip(slices_, slices2_)] for k, p_start, p_stop, slices_, slices2_ in zip(self.parts, self.param_starts, self.param_stops, slices, slices2)] + [[[[k._param_grad_helper(dL_dK[s,s2],X[s],X2[s2],target[p_start:p_stop]) for s in slices_i] for s2 in slices_j] for slices_i,slices_j in zip(slices_, slices2_)] for k, p_start, p_stop, slices_, slices2_ in zip(self.parts, self.param_starts, self.param_stops, slices, slices2)] - def dK_dX(self,dL_dK,X,X2,target): + def gradients_X(self,dL_dK,X,X2,target): raise NotImplementedError #X,slices = X[:,:-1],index_to_slices(X[:,-1]) #if X2 is None: #X2,slices2 = X,slices #else: #X2,slices2 = X2[:,:-1],index_to_slices(X2[:,-1]) - #[[[self.k.dK_dX(dL_dK[s,s2],X[s],X2[s2],target[s,:-1]) for s in slices_i] for s2 in slices_j] for slices_i,slices_j in zip(slices,slices2)] + 
#[[[self.k.gradients_X(dL_dK[s,s2],X[s],X2[s2],target[s,:-1]) for s in slices_i] for s2 in slices_j] for slices_i,slices_j in zip(slices,slices2)] # def dKdiag_dX(self,dL_dKdiag,X,target): raise NotImplementedError diff --git a/GPy/kern/parts/independent_outputs.py b/GPy/kern/parts/independent_outputs.py index f88b0ff5..98f1203d 100644 --- a/GPy/kern/parts/independent_outputs.py +++ b/GPy/kern/parts/independent_outputs.py @@ -70,22 +70,22 @@ class IndependentOutputs(Kernpart): X,slices = X[:,:-1],index_to_slices(X[:,-1]) [[self.k.Kdiag(X[s],target[s]) for s in slices_i] for slices_i in slices] - def dK_dtheta(self,dL_dK,X,X2,target): + def _param_grad_helper(self,dL_dK,X,X2,target): X,slices = X[:,:-1],index_to_slices(X[:,-1]) if X2 is None: X2,slices2 = X,slices else: X2,slices2 = X2[:,:-1],index_to_slices(X2[:,-1]) - [[[self.k.dK_dtheta(dL_dK[s,s2],X[s],X2[s2],target) for s in slices_i] for s2 in slices_j] for slices_i,slices_j in zip(slices,slices2)] + [[[self.k._param_grad_helper(dL_dK[s,s2],X[s],X2[s2],target) for s in slices_i] for s2 in slices_j] for slices_i,slices_j in zip(slices,slices2)] - def dK_dX(self,dL_dK,X,X2,target): + def gradients_X(self,dL_dK,X,X2,target): X,slices = X[:,:-1],index_to_slices(X[:,-1]) if X2 is None: X2,slices2 = X,slices else: X2,slices2 = X2[:,:-1],index_to_slices(X2[:,-1]) - [[[self.k.dK_dX(dL_dK[s,s2],X[s],X2[s2],target[s,:-1]) for s in slices_i] for s2 in slices_j] for slices_i,slices_j in zip(slices,slices2)] + [[[self.k.gradients_X(dL_dK[s,s2],X[s],X2[s2],target[s,:-1]) for s in slices_i] for s2 in slices_j] for slices_i,slices_j in zip(slices,slices2)] def dKdiag_dX(self,dL_dKdiag,X,target): X,slices = X[:,:-1],index_to_slices(X[:,-1]) diff --git a/GPy/kern/parts/kernpart.py b/GPy/kern/parts/kernpart.py index 8314e7a7..06f1446b 100644 --- a/GPy/kern/parts/kernpart.py +++ b/GPy/kern/parts/kernpart.py @@ -20,7 +20,6 @@ class Kernpart(Parameterized): # the number of optimisable parameters # the name of the covariance function. # link to parameterized objects - self._parameters_ = [] #self._X = None def connect_input(self, X): @@ -76,14 +75,14 @@ class Kernpart(Parameterized): raise NotImplementedError def Kdiag(self,X,target): raise NotImplementedError - def dK_dtheta(self,dL_dK,X,X2,target): + def _param_grad_helper(self,dL_dK,X,X2,target): raise NotImplementedError def dKdiag_dtheta(self,dL_dKdiag,X,target): - # In the base case compute this by calling dK_dtheta. Need to + # In the base case compute this by calling _param_grad_helper. Need to # override for stationary covariances (for example) to save # time. 
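         # (What the loop below does: each diagonal element K(x_i, x_i) is
         # treated as its own 1x1 covariance block, so dL_dKdiag[i] is passed
         # as a 1x1 dL_dK and the parameter gradients accumulate into `target`
         # one data point at a time -- correct for any kernel part, but O(N)
         # kernel calls, hence the advice to override.)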
         for i in range(X.shape[0]):
-            self.dK_dtheta(dL_dKdiag[i], X[i, :][None, :], X2=None, target=target)
+            self._param_grad_helper(dL_dKdiag[i], X[i, :][None, :], X2=None, target=target)
     def psi0(self,Z,mu,S,target):
         raise NotImplementedError
     def dpsi0_dtheta(self,dL_dpsi0,Z,mu,S,target):
         raise NotImplementedError
@@ -106,12 +105,19 @@ class Kernpart(Parameterized):
         raise NotImplementedError
     def dpsi2_dmuS(self,dL_dpsi2,Z,mu,S,target_mu,target_S):
         raise NotImplementedError
-    def dK_dX(self, dL_dK, X, X2, target):
+    def gradients_X(self, dL_dK, X, X2, target):
         raise NotImplementedError
     def dKdiag_dX(self, dL_dK, X, target):
         raise NotImplementedError
-
-
+    def update_gradients_full(self, dL_dK, X):
+        """Set the gradients of all parameters when doing full (N) inference."""
+        raise NotImplementedError
+    def update_gradients_sparse(self, dL_dKmm, dL_dKnm, dL_dKdiag, X, Z):
+        """Set the gradients of all parameters when doing sparse (M) inference."""
+        raise NotImplementedError
+    def update_gradients_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z):
+        """Set the gradients of all parameters when doing variational (M) inference with uncertain inputs."""
+        raise NotImplementedError

 class Kernpart_stationary(Kernpart):
     def __init__(self, input_dim, lengthscale=None, ARD=False):

diff --git a/GPy/kern/parts/linear.py b/GPy/kern/parts/linear.py
index 62f1ac36..1b805219 100644
--- a/GPy/kern/parts/linear.py
+++ b/GPy/kern/parts/linear.py
@@ -8,6 +8,7 @@ from kernpart import Kernpart
 from ...util.linalg import tdot
 from ...util.misc import fast_array_equal, param_to_array
 from ...core.parameterization import Param
+from ...core.parameterization.transformations import Logexp

 class Linear(Kernpart):
     """
@@ -43,8 +44,9 @@ class Linear(Kernpart):
         else:
             variances = np.ones(self.input_dim)
-        self.variances = Param('variances', variances)
-        self.add_parameters(self.variances)
+        self.variances = Param('variances', variances, Logexp())
+        self.variances.gradient = np.zeros(self.variances.shape)
+        self.add_parameter(self.variances)
         self.variances.add_observer(self, self.update_variance)

         # initialize cache
@@ -57,21 +59,35 @@ class Linear(Kernpart):
     def on_input_change(self, X):
         self._K_computations(X, None)

-#     def _get_params(self):
-#         return self.variances
-#
-#     def _set_params(self, x):
-#         assert x.size == (self.num_params)
-#         self.variances = x
-
-    #def parameters_changed(self):
-    #    self.variances2 = np.square(self.variances)
-#
-#     def _get_param_names(self):
-#         if self.num_params == 1:
-#             return ['variance']
-#         else:
-#             return ['variance_%i' % i for i in range(self.variances.size)]
-
+    def update_gradients_full(self, dL_dK, X):
+        #self.variances.gradient[:] = 0
+        self._param_grad_helper(dL_dK, X, None, self.variances.gradient)
+
+    def update_gradients_sparse(self, dL_dKmm, dL_dKnm, dL_dKdiag, X, Z):
+        tmp = dL_dKdiag[:, None] * X ** 2
+        if self.ARD:
self.variances.gradient += tmp.sum(0).sum(0).sum(0) + else: self.variances.gradient += tmp.sum() + #from Kmm + self._K_computations(Z, None) + self._param_grad_helper(dL_dKmm, Z, None, self.variances.gradient) + def K(self, X, X2, target): if self.ARD: XX = X * np.sqrt(self.variances) @@ -88,7 +104,7 @@ class Linear(Kernpart): def Kdiag(self, X, target): np.add(target, np.sum(self.variances * np.square(X), -1), target) - def dK_dtheta(self, dL_dK, X, X2, target): + def _param_grad_helper(self, dL_dK, X, X2, target): if self.ARD: if X2 is None: [np.add(target[i:i + 1], np.sum(dL_dK * tdot(X[:, i:i + 1])), target[i:i + 1]) for i in range(self.input_dim)] @@ -100,14 +116,7 @@ class Linear(Kernpart): self._K_computations(X, X2) target += np.sum(self._dot_product * dL_dK) - def dKdiag_dtheta(self, dL_dKdiag, X, target): - tmp = dL_dKdiag[:, None] * X ** 2 - if self.ARD: - target += tmp.sum(0) - else: - target += tmp.sum() - - def dK_dX(self, dL_dK, X, X2, target): + def gradients_X(self, dL_dK, X, X2, target): if X2 is None: target += 2*(((X[None,:, :] * self.variances)) * dL_dK[:, :, None]).sum(1) else: @@ -124,14 +133,6 @@ class Linear(Kernpart): self._psi_computations(Z, mu, S) target += np.sum(self.variances * self.mu2_S, 1) - def dpsi0_dtheta(self, dL_dpsi0, Z, mu, S, target): - self._psi_computations(Z, mu, S) - tmp = dL_dpsi0[:, None] * self.mu2_S - if self.ARD: - target += tmp.sum(0) - else: - target += tmp.sum() - def dpsi0_dmuS(self, dL_dpsi0, Z, mu, S, target_mu, target_S): target_mu += dL_dpsi0[:, None] * (2.0 * mu * self.variances) target_S += dL_dpsi0[:, None] * self.variances @@ -140,17 +141,13 @@ class Linear(Kernpart): """the variance, it does nothing""" self._psi1 = self.K(mu, Z, target) - def dpsi1_dtheta(self, dL_dpsi1, Z, mu, S, target): - """the variance, it does nothing""" - self.dK_dtheta(dL_dpsi1, mu, Z, target) - def dpsi1_dmuS(self, dL_dpsi1, Z, mu, S, target_mu, target_S): """Do nothing for S, it does not affect psi1""" self._psi_computations(Z, mu, S) target_mu += (dL_dpsi1[:, :, None] * (Z * self.variances)).sum(1) def dpsi1_dZ(self, dL_dpsi1, Z, mu, S, target): - self.dK_dX(dL_dpsi1.T, Z, mu, target) + self.gradients_X(dL_dpsi1.T, Z, mu, target) def psi2(self, Z, mu, S, target): self._psi_computations(Z, mu, S) @@ -164,25 +161,17 @@ class Linear(Kernpart): def dpsi2_dtheta_new(self, dL_dpsi2, Z, mu, S, target): tmp = np.zeros((mu.shape[0], Z.shape[0])) self.K(mu,Z,tmp) - self.dK_dtheta(2.*np.sum(dL_dpsi2*tmp[:,None,:],2),mu,Z,target) + self._param_grad_helper(2.*np.sum(dL_dpsi2*tmp[:,None,:],2),mu,Z,target) result= 2.*(dL_dpsi2[:,:,:,None]*S[:,None,None,:]*self.variances*Z[None,:,None,:]*Z[None,None,:,:]).sum(0).sum(0).sum(0) if self.ARD: target += result.sum(0).sum(0).sum(0) else: target += result.sum() - def dpsi2_dtheta(self, dL_dpsi2, Z, mu, S, target): - self._psi_computations(Z, mu, S) - tmp = dL_dpsi2[:, :, :, None] * (self.ZAinner[:, :, None, :] * (2 * Z)[None, None, :, :]) - if self.ARD: - target += tmp.sum(0).sum(0).sum(0) - else: - target += tmp.sum() - def dpsi2_dmuS_new(self, dL_dpsi2, Z, mu, S, target_mu, target_S): tmp = np.zeros((mu.shape[0], Z.shape[0])) self.K(mu,Z,tmp) - self.dK_dX(2.*np.sum(dL_dpsi2*tmp[:,None,:],2),mu,Z,target_mu) + self.gradients_X(2.*np.sum(dL_dpsi2*tmp[:,None,:],2),mu,Z,target_mu) Zs = Z*self.variances Zs_sq = Zs[:,None,:]*Zs[None,:,:] @@ -288,11 +277,11 @@ class Linear(Kernpart): if not (fast_array_equal(X, self._X) and fast_array_equal(X2, self._X2)): self._X = X.copy() if X2 is None: - self._dot_product = tdot(X) + 
self._dot_product = tdot(param_to_array(X)) self._X2 = None else: self._X2 = X2.copy() - self._dot_product = np.dot(X, X2.T) + self._dot_product = np.dot(param_to_array(X), param_to_array(X2.T)) def _psi_computations(self, Z, mu, S): # here are the "statistics" for psi1 and psi2 diff --git a/GPy/kern/parts/mlp.py b/GPy/kern/parts/mlp.py index e68aaa72..59979a62 100644 --- a/GPy/kern/parts/mlp.py +++ b/GPy/kern/parts/mlp.py @@ -77,7 +77,7 @@ class MLP(Kernpart): self._K_diag_computations(X) target+= self.variance*self._K_diag_dvar - def dK_dtheta(self, dL_dK, X, X2, target): + def _param_grad_helper(self, dL_dK, X, X2, target): """Derivative of the covariance with respect to the parameters.""" self._K_computations(X, X2) denom3 = self._K_denom*self._K_denom*self._K_denom @@ -107,7 +107,7 @@ class MLP(Kernpart): target[0] += np.sum(self._K_dvar*dL_dK) - def dK_dX(self, dL_dK, X, X2, target): + def gradients_X(self, dL_dK, X, X2, target): """Derivative of the covariance matrix with respect to X""" self._K_computations(X, X2) arg = self._K_asin_arg diff --git a/GPy/kern/parts/periodic_Matern32.py b/GPy/kern/parts/periodic_Matern32.py index 0de57f82..24ec45f9 100644 --- a/GPy/kern/parts/periodic_Matern32.py +++ b/GPy/kern/parts/periodic_Matern32.py @@ -112,7 +112,7 @@ class PeriodicMatern32(Kernpart): np.add(target,np.diag(mdot(FX,self.Gi,FX.T)),target) @silence_errors - def dK_dtheta(self,dL_dK,X,X2,target): + def _param_grad_helper(self,dL_dK,X,X2,target): """derivative of the covariance matrix with respect to the parameters (shape is num_data x num_inducing x num_params)""" if X2 is None: X2 = X FX = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X) diff --git a/GPy/kern/parts/periodic_Matern52.py b/GPy/kern/parts/periodic_Matern52.py index 882084fd..1f9d90b3 100644 --- a/GPy/kern/parts/periodic_Matern52.py +++ b/GPy/kern/parts/periodic_Matern52.py @@ -114,7 +114,7 @@ class PeriodicMatern52(Kernpart): np.add(target,np.diag(mdot(FX,self.Gi,FX.T)),target) @silence_errors - def dK_dtheta(self,dL_dK,X,X2,target): + def _param_grad_helper(self,dL_dK,X,X2,target): """derivative of the covariance matrix with respect to the parameters (shape is num_data x num_inducing x num_params)""" if X2 is None: X2 = X FX = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X) diff --git a/GPy/kern/parts/periodic_exponential.py b/GPy/kern/parts/periodic_exponential.py index d8c193e0..4562cd56 100644 --- a/GPy/kern/parts/periodic_exponential.py +++ b/GPy/kern/parts/periodic_exponential.py @@ -110,7 +110,7 @@ class PeriodicExponential(Kernpart): np.add(target,np.diag(mdot(FX,self.Gi,FX.T)),target) @silence_errors - def dK_dtheta(self,dL_dK,X,X2,target): + def _param_grad_helper(self,dL_dK,X,X2,target): """derivative of the covariance matrix with respect to the parameters (shape is N x num_inducing x num_params)""" if X2 is None: X2 = X FX = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X) diff --git a/GPy/kern/parts/poly.py b/GPy/kern/parts/poly.py index 98c520f0..0deb11f4 100644 --- a/GPy/kern/parts/poly.py +++ b/GPy/kern/parts/poly.py @@ -86,7 +86,7 @@ class POLY(Kernpart): self._K_diag_computations(X) target+= self.variance*self._K_diag_dvar - def dK_dtheta(self, dL_dK, X, X2, target): + def _param_grad_helper(self, dL_dK, X, X2, target): """Derivative of the covariance with respect to the parameters.""" self._K_computations(X, X2) base = self.variance*self.degree*self._K_poly_arg**(self.degree-1) @@ 
-99,7 +99,7 @@ class POLY(Kernpart): target[2] += base_cov_grad.sum() - def dK_dX(self, dL_dK, X, X2, target): + def gradients_X(self, dL_dK, X, X2, target): """Derivative of the covariance matrix with respect to X""" self._K_computations(X, X2) arg = self._K_poly_arg diff --git a/GPy/kern/parts/prod.py b/GPy/kern/parts/prod.py index 62eed2aa..364c91b3 100644 --- a/GPy/kern/parts/prod.py +++ b/GPy/kern/parts/prod.py @@ -54,15 +54,15 @@ class Prod(Kernpart): self.k1.update_gradients_full(dL_dK*self._K2, X[:,self.slice1]) self.k2.update_gradients_full(dL_dK*self._K1, X[:,self.slice2]) - def dK_dtheta(self,dL_dK,X,X2,target): + def _param_grad_helper(self,dL_dK,X,X2,target): """Derivative of the covariance matrix with respect to the parameters.""" self._K_computations(X,X2) if X2 is None: - self.k1.dK_dtheta(dL_dK*self._K2, X[:,self.slice1], None, target[:self.k1.num_params]) - self.k2.dK_dtheta(dL_dK*self._K1, X[:,self.slice2], None, target[self.k1.num_params:]) + self.k1._param_grad_helper(dL_dK*self._K2, X[:,self.slice1], None, target[:self.k1.num_params]) + self.k2._param_grad_helper(dL_dK*self._K1, X[:,self.slice2], None, target[self.k1.num_params:]) else: - self.k1.dK_dtheta(dL_dK*self._K2, X[:,self.slice1], X2[:,self.slice1], target[:self.k1.num_params]) - self.k2.dK_dtheta(dL_dK*self._K1, X[:,self.slice2], X2[:,self.slice2], target[self.k1.num_params:]) + self.k1._param_grad_helper(dL_dK*self._K2, X[:,self.slice1], X2[:,self.slice1], target[:self.k1.num_params]) + self.k2._param_grad_helper(dL_dK*self._K1, X[:,self.slice2], X2[:,self.slice2], target[self.k1.num_params:]) def Kdiag(self,X,target): """Compute the diagonal of the covariance matrix associated to X.""" @@ -81,21 +81,21 @@ class Prod(Kernpart): self.k1.dKdiag_dtheta(dL_dKdiag*K2,X[:,self.slice1],target[:self.k1.num_params]) self.k2.dKdiag_dtheta(dL_dKdiag*K1,X[:,self.slice2],target[self.k1.num_params:]) - def dK_dX(self,dL_dK,X,X2,target): + def gradients_X(self,dL_dK,X,X2,target): """derivative of the covariance matrix with respect to X.""" self._K_computations(X,X2) if X2 is None: if not isinstance(self.k1,Coregionalize) and not isinstance(self.k2,Coregionalize): - self.k1.dK_dX(dL_dK*self._K2, X[:,self.slice1], None, target[:,self.slice1]) - self.k2.dK_dX(dL_dK*self._K1, X[:,self.slice2], None, target[:,self.slice2]) + self.k1.gradients_X(dL_dK*self._K2, X[:,self.slice1], None, target[:,self.slice1]) + self.k2.gradients_X(dL_dK*self._K1, X[:,self.slice2], None, target[:,self.slice2]) else:#if isinstance(self.k1,Coregionalize) or isinstance(self.k2,Coregionalize): - #NOTE The indices column in the inputs makes the ki.dK_dX fail when passing None instead of X[:,self.slicei] + #NOTE The indices column in the inputs makes the ki.gradients_X fail when passing None instead of X[:,self.slicei] X2 = X - self.k1.dK_dX(2.*dL_dK*self._K2, X[:,self.slice1], X2[:,self.slice1], target[:,self.slice1]) - self.k2.dK_dX(2.*dL_dK*self._K1, X[:,self.slice2], X2[:,self.slice2], target[:,self.slice2]) + self.k1.gradients_X(2.*dL_dK*self._K2, X[:,self.slice1], X2[:,self.slice1], target[:,self.slice1]) + self.k2.gradients_X(2.*dL_dK*self._K1, X[:,self.slice2], X2[:,self.slice2], target[:,self.slice2]) else: - self.k1.dK_dX(dL_dK*self._K2, X[:,self.slice1], X2[:,self.slice1], target[:,self.slice1]) - self.k2.dK_dX(dL_dK*self._K1, X[:,self.slice2], X2[:,self.slice2], target[:,self.slice2]) + self.k1.gradients_X(dL_dK*self._K2, X[:,self.slice1], X2[:,self.slice1], target[:,self.slice1]) + self.k2.gradients_X(dL_dK*self._K1, X[:,self.slice2], 
X2[:,self.slice2], target[:,self.slice2]) def dKdiag_dX(self, dL_dKdiag, X, target): K1 = np.zeros(X.shape[0]) @@ -103,8 +103,8 @@ class Prod(Kernpart): self.k1.Kdiag(X[:,self.slice1],K1) self.k2.Kdiag(X[:,self.slice2],K2) - self.k1.dK_dX(dL_dKdiag*K2, X[:,self.slice1], target[:,self.slice1]) - self.k2.dK_dX(dL_dKdiag*K1, X[:,self.slice2], target[:,self.slice2]) + self.k1.gradients_X(dL_dKdiag*K2, X[:,self.slice1], target[:,self.slice1]) + self.k2.gradients_X(dL_dKdiag*K1, X[:,self.slice2], target[:,self.slice2]) def _K_computations(self,X,X2): if not (np.array_equal(X,self._X) and np.array_equal(X2,self._X2) and np.array_equal(self._params , self._get_params())): diff --git a/GPy/kern/parts/prod_orthogonal.py b/GPy/kern/parts/prod_orthogonal.py index 237c9557..e7dd1fdc 100644 --- a/GPy/kern/parts/prod_orthogonal.py +++ b/GPy/kern/parts/prod_orthogonal.py @@ -41,15 +41,15 @@ class prod_orthogonal(Kernpart): self._K_computations(X,X2) target += self._K1 * self._K2 - def dK_dtheta(self,dL_dK,X,X2,target): + def _param_grad_helper(self,dL_dK,X,X2,target): """derivative of the covariance matrix with respect to the parameters.""" self._K_computations(X,X2) if X2 is None: - self.k1.dK_dtheta(dL_dK*self._K2, X[:,:self.k1.input_dim], None, target[:self.k1.num_params]) - self.k2.dK_dtheta(dL_dK*self._K1, X[:,self.k1.input_dim:], None, target[self.k1.num_params:]) + self.k1._param_grad_helper(dL_dK*self._K2, X[:,:self.k1.input_dim], None, target[:self.k1.num_params]) + self.k2._param_grad_helper(dL_dK*self._K1, X[:,self.k1.input_dim:], None, target[self.k1.num_params:]) else: - self.k1.dK_dtheta(dL_dK*self._K2, X[:,:self.k1.input_dim], X2[:,:self.k1.input_dim], target[:self.k1.num_params]) - self.k2.dK_dtheta(dL_dK*self._K1, X[:,self.k1.input_dim:], X2[:,self.k1.input_dim:], target[self.k1.num_params:]) + self.k1._param_grad_helper(dL_dK*self._K2, X[:,:self.k1.input_dim], X2[:,:self.k1.input_dim], target[:self.k1.num_params]) + self.k2._param_grad_helper(dL_dK*self._K1, X[:,self.k1.input_dim:], X2[:,self.k1.input_dim:], target[self.k1.num_params:]) def Kdiag(self,X,target): """Compute the diagonal of the covariance matrix associated to X.""" @@ -67,11 +67,11 @@ class prod_orthogonal(Kernpart): self.k1.dKdiag_dtheta(dL_dKdiag*K2,X[:,:self.k1.input_dim],target[:self.k1.num_params]) self.k2.dKdiag_dtheta(dL_dKdiag*K1,X[:,self.k1.input_dim:],target[self.k1.num_params:]) - def dK_dX(self,dL_dK,X,X2,target): + def gradients_X(self,dL_dK,X,X2,target): """derivative of the covariance matrix with respect to X.""" self._K_computations(X,X2) - self.k1.dK_dX(dL_dK*self._K2, X[:,:self.k1.input_dim], X2[:,:self.k1.input_dim], target) - self.k2.dK_dX(dL_dK*self._K1, X[:,self.k1.input_dim:], X2[:,self.k1.input_dim:], target) + self.k1.gradients_X(dL_dK*self._K2, X[:,:self.k1.input_dim], X2[:,:self.k1.input_dim], target) + self.k2.gradients_X(dL_dK*self._K1, X[:,self.k1.input_dim:], X2[:,self.k1.input_dim:], target) def dKdiag_dX(self, dL_dKdiag, X, target): K1 = np.zeros(X.shape[0]) @@ -79,8 +79,8 @@ class prod_orthogonal(Kernpart): self.k1.Kdiag(X[:,0:self.k1.input_dim],K1) self.k2.Kdiag(X[:,self.k1.input_dim:],K2) - self.k1.dK_dX(dL_dKdiag*K2, X[:,:self.k1.input_dim], target) - self.k2.dK_dX(dL_dKdiag*K1, X[:,self.k1.input_dim:], target) + self.k1.gradients_X(dL_dKdiag*K2, X[:,:self.k1.input_dim], target) + self.k2.gradients_X(dL_dKdiag*K1, X[:,self.k1.input_dim:], target) def _K_computations(self,X,X2): if not (np.array_equal(X,self._X) and np.array_equal(X2,self._X2) and np.array_equal(self._params , 
self._get_params())): diff --git a/GPy/kern/parts/rational_quadratic.py b/GPy/kern/parts/rational_quadratic.py index a75a5b11..c36cee9f 100644 --- a/GPy/kern/parts/rational_quadratic.py +++ b/GPy/kern/parts/rational_quadratic.py @@ -52,7 +52,7 @@ class RationalQuadratic(Kernpart): def Kdiag(self,X,target): target += self.variance - def dK_dtheta(self,dL_dK,X,X2,target): + def _param_grad_helper(self,dL_dK,X,X2,target): if X2 is None: X2 = X dist2 = np.square((X-X2.T)/self.lengthscale) @@ -68,7 +68,7 @@ class RationalQuadratic(Kernpart): target[0] += np.sum(dL_dKdiag) # here self.lengthscale and self.power have no influence on Kdiag so target[1:] are unchanged - def dK_dX(self,dL_dK,X,X2,target): + def gradients_X(self,dL_dK,X,X2,target): """derivative of the covariance matrix with respect to X.""" if X2 is None: dist2 = np.square((X-X.T)/self.lengthscale) diff --git a/GPy/kern/parts/rbf.py b/GPy/kern/parts/rbf.py index 4247eb9c..4b38bd0f 100644 --- a/GPy/kern/parts/rbf.py +++ b/GPy/kern/parts/rbf.py @@ -8,6 +8,7 @@ from kernpart import Kernpart from ...util.linalg import tdot from ...util.misc import fast_array_equal, param_to_array from ...core.parameterization import Param +from ...core.parameterization.transformations import Logexp class RBF(Kernpart): """ @@ -50,9 +51,12 @@ class RBF(Kernpart): else: lengthscale = np.ones(self.input_dim) - self.variance = Param('variance', variance) + self.variance = Param('variance', variance, Logexp()) + self.lengthscale = Param('lengthscale', lengthscale) self.lengthscale.add_observer(self, self.update_lengthscale) + self.update_lengthscale(self.lengthscale) + self.add_parameters(self.variance, self.lengthscale) self.parameters_changed() # initializes cache @@ -114,7 +118,7 @@ class RBF(Kernpart): self._K_computations(X, Z) self.variance.gradient += np.sum(dL_dKnm * self._K_dvar) if self.ARD: - self.lengthscales.gradient = self._dL_dlengthscales_via_K(dL_dKnm, X, Z) + self.lengthscale.gradient = self._dL_dlengthscales_via_K(dL_dKnm, X, Z) else: self.lengthscale.gradient = (self.variance / self.lengthscale) * np.sum(self._K_dvar * self._K_dist2 * dL_dKnm) @@ -123,7 +127,7 @@ class RBF(Kernpart): self._K_computations(Z, None) self.variance.gradient += np.sum(dL_dKmm * self._K_dvar) if self.ARD: - self.lengthscales.gradient += self._dL_dlengthscales_via_K(dL_dKmm, Z, None) + self.lengthscale.gradient += self._dL_dlengthscales_via_K(dL_dKmm, Z, None) else: self.lengthscale.gradient += (self.variance / self.lengthscale) * np.sum(self._K_dvar * self._K_dist2 * dL_dKmm) @@ -138,7 +142,7 @@ class RBF(Kernpart): d_length = self._psi1[:,:,None] * ((self._psi1_dist_sq - 1.)/(self.lengthscale*self._psi1_denom) +1./self.lengthscale) dpsi1_dlength = d_length * dL_dpsi1[:, :, None] if not self.ARD: - self.lengthscale.gradeint = dpsi1_dlength.sum() + self.lengthscale.gradient = dpsi1_dlength.sum() else: self.lengthscale.gradient = dpsi1_dlength.sum(0).sum(0) @@ -157,7 +161,7 @@ class RBF(Kernpart): self._K_computations(Z, None) self.variance.gradient += np.sum(dL_dKmm * self._K_dvar) if self.ARD: - self.lengthscales.gradient += self._dL_dlengthscales_via_K(dL_dKmm, Z, None) + self.lengthscale.gradient += self._dL_dlengthscales_via_K(dL_dKmm, Z, None) else: self.lengthscale.gradient += (self.variance / self.lengthscale) * np.sum(self._K_dvar * self._K_dist2 * dL_dKmm) @@ -168,8 +172,8 @@ class RBF(Kernpart): _K_dist = 2*(X[:, None, :] - X[None, :, :]) else: _K_dist = X[:, None, :] - X2[None, :, :] # don't cache this in _K_computations because it is high memory. 
If this function is being called, chances are we're not in the high memory arena. - dK_dX = (-self.variance / self.lengthscale2) * np.transpose(self._K_dvar[:, :, np.newaxis] * _K_dist, (1, 0, 2)) - target += np.sum(dK_dX * dL_dK.T[:, :, None], 0) + gradients_X = (-self.variance / self.lengthscale2) * np.transpose(self._K_dvar[:, :, np.newaxis] * _K_dist, (1, 0, 2)) + target += np.sum(gradients_X * dL_dK.T[:, :, None], 0) def dKdiag_dX(self, dL_dKdiag, X, target): pass diff --git a/GPy/kern/parts/rbf_inv.py b/GPy/kern/parts/rbf_inv.py index c4461267..8405ae84 100644 --- a/GPy/kern/parts/rbf_inv.py +++ b/GPy/kern/parts/rbf_inv.py @@ -97,7 +97,7 @@ class RBFInv(RBF): # return ['variance'] + ['inv_lengthscale%i' % i for i in range(self.inv_lengthscale.size)] # TODO: Rewrite computations so that lengthscale is not needed (but only inv. lengthscale) - def dK_dtheta(self, dL_dK, X, X2, target): + def _param_grad_helper(self, dL_dK, X, X2, target): self._K_computations(X, X2) target[0] += np.sum(self._K_dvar * dL_dK) if self.ARD: @@ -142,14 +142,14 @@ class RBFInv(RBF): else: target[1] += (self.variance / self.lengthscale) * np.sum(self._K_dvar * self._K_dist2 * dL_dK) * (-self.lengthscale2) - def dK_dX(self, dL_dK, X, X2, target): + def gradients_X(self, dL_dK, X, X2, target): self._K_computations(X, X2) if X2 is None: _K_dist = 2*(X[:, None, :] - X[None, :, :]) else: _K_dist = X[:, None, :] - X2[None, :, :] # don't cache this in _K_computations because it is high memory. If this function is being called, chances are we're not in the high memory arena. - dK_dX = (-self.variance * self.inv_lengthscale2) * np.transpose(self._K_dvar[:, :, np.newaxis] * _K_dist, (1, 0, 2)) - target += np.sum(dK_dX * dL_dK.T[:, :, None], 0) + gradients_X = (-self.variance * self.inv_lengthscale2) * np.transpose(self._K_dvar[:, :, np.newaxis] * _K_dist, (1, 0, 2)) + target += np.sum(gradients_X * dL_dK.T[:, :, None], 0) def dKdiag_dX(self, dL_dKdiag, X, target): pass diff --git a/GPy/kern/parts/rbfcos.py b/GPy/kern/parts/rbfcos.py index b6411e0a..9a4b8ab2 100644 --- a/GPy/kern/parts/rbfcos.py +++ b/GPy/kern/parts/rbfcos.py @@ -73,7 +73,7 @@ class RBFCos(Kernpart): def Kdiag(self,X,target): np.add(target,self.variance,target) - def dK_dtheta(self,dL_dK,X,X2,target): + def _param_grad_helper(self,dL_dK,X,X2,target): self._K_computations(X,X2) target[0] += np.sum(dL_dK*self._dvar) if self.ARD: @@ -88,7 +88,7 @@ class RBFCos(Kernpart): def dKdiag_dtheta(self,dL_dKdiag,X,target): target[0] += np.sum(dL_dKdiag) - def dK_dX(self,dL_dK,X,X2,target): + def gradients_X(self,dL_dK,X,X2,target): #TODO!!! raise NotImplementedError diff --git a/GPy/kern/parts/spline.py b/GPy/kern/parts/spline.py index c31258f3..3e57b8a9 100644 --- a/GPy/kern/parts/spline.py +++ b/GPy/kern/parts/spline.py @@ -50,7 +50,7 @@ class Spline(Kernpart): def Kdiag(self,X,target): target += self.variance*X.flatten()**3/3. - def dK_dtheta(self,X,X2,target): + def _param_grad_helper(self,X,X2,target): target += 0.5*(t*s**2) - s**3/6. + (s_t)**3*theta(s_t)/6. def dKdiag_dtheta(self,X,target): diff --git a/GPy/kern/parts/ss_rbf.py b/GPy/kern/parts/ss_rbf.py index a234d428..cab8fd11 100644 --- a/GPy/kern/parts/ss_rbf.py +++ b/GPy/kern/parts/ss_rbf.py @@ -144,8 +144,8 @@ class SS_RBF(Kernpart): _K_dist = 2*(X[:, None, :] - X[None, :, :]) else: _K_dist = X[:, None, :] - X2[None, :, :] # don't cache this in _K_computations because it is high memory. If this function is being called, chances are we're not in the high memory arena. 
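         # Closed form used below: for an RBF-family kernel,
         # dK(x, z)/dx = -((x - z)/lengthscale**2) * K(x, z); the transpose and
         # sum merely chain this through dL_dK. A standalone 1-d check --
         # illustrative only, all names local:
         #     import numpy as np
         #     x, z, var, ell = 0.3, 1.1, 2.0, 0.7
         #     k = lambda a, b: var*np.exp(-0.5*((a - b)/ell)**2)
         #     num = (k(x + 1e-6, z) - k(x - 1e-6, z))/2e-6
         #     assert np.allclose(num, -((x - z)/ell**2)*k(x, z))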
-        dK_dX = (-self.variance / self.lengthscale2) * np.transpose(self._K_dvar[:, :, np.newaxis] * _K_dist, (1, 0, 2))
-        target += np.sum(dK_dX * dL_dK.T[:, :, None], 0)
+        gradients_X = (-self.variance / self.lengthscale2) * np.transpose(self._K_dvar[:, :, np.newaxis] * _K_dist, (1, 0, 2))
+        target += np.sum(gradients_X * dL_dK.T[:, :, None], 0)

     def dKdiag_dX(self, dL_dKdiag, X, target):
         pass

diff --git a/GPy/kern/parts/symmetric.py b/GPy/kern/parts/symmetric.py
index d47fbd9d..8eec2acc 100644
--- a/GPy/kern/parts/symmetric.py
+++ b/GPy/kern/parts/symmetric.py
@@ -40,7 +40,7 @@ class Symmetric(Kernpart):
         self.k.K(X,AX2,target)
         self.k.K(AX,AX2,target)

-    def dK_dtheta(self,dL_dK,X,X2,target):
+    def _param_grad_helper(self,dL_dK,X,X2,target):
         """derivative of the covariance matrix with respect to the parameters."""
         AX = np.dot(X,self.transform)
         if X2 is None:
@@ -48,13 +48,13 @@ class Symmetric(Kernpart):
             ZX2 = AX
         else:
             AX2 = np.dot(X2, self.transform)
-        self.k.dK_dtheta(dL_dK,X,X2,target)
-        self.k.dK_dtheta(dL_dK,AX,X2,target)
-        self.k.dK_dtheta(dL_dK,X,AX2,target)
-        self.k.dK_dtheta(dL_dK,AX,AX2,target)
+        self.k._param_grad_helper(dL_dK,X,X2,target)
+        self.k._param_grad_helper(dL_dK,AX,X2,target)
+        self.k._param_grad_helper(dL_dK,X,AX2,target)
+        self.k._param_grad_helper(dL_dK,AX,AX2,target)

-    def dK_dX(self,dL_dK,X,X2,target):
+    def gradients_X(self,dL_dK,X,X2,target):
         """derivative of the covariance matrix with respect to X."""
         AX = np.dot(X,self.transform)
         if X2 is None:
@@ -62,10 +62,10 @@ class Symmetric(Kernpart):
             ZX2 = AX
         else:
             AX2 = np.dot(X2, self.transform)
-        self.k.dK_dX(dL_dK, X, X2, target)
-        self.k.dK_dX(dL_dK, AX, X2, target)
-        self.k.dK_dX(dL_dK, X, AX2, target)
-        self.k.dK_dX(dL_dK, AX ,AX2, target)
+        self.k.gradients_X(dL_dK, X, X2, target)
+        self.k.gradients_X(dL_dK, AX, X2, target)
+        self.k.gradients_X(dL_dK, X, AX2, target)
+        self.k.gradients_X(dL_dK, AX ,AX2, target)

     def Kdiag(self,X,target):
         """Compute the diagonal of the covariance matrix associated to X."""

diff --git a/GPy/kern/parts/sympykern.py b/GPy/kern/parts/sympykern.py
index ea603eab..a09d4bfc 100644
--- a/GPy/kern/parts/sympykern.py
+++ b/GPy/kern/parts/sympykern.py
@@ -348,7 +348,7 @@ class spkern(Kernpart):
     def Kdiag(self,X,target):
         self._weave_inline(self._Kdiag_code, X, target)

-    def dK_dtheta(self,partial,X,Z,target):
+    def _param_grad_helper(self,partial,X,Z,target):
         if Z is None:
             self._weave_inline(self._dK_dtheta_code_X, X, target, Z, partial)
         else:
@@ -357,7 +357,7 @@ class spkern(Kernpart):
     def dKdiag_dtheta(self,partial,X,target):
         self._weave_inline(self._dKdiag_dtheta_code, X, target, Z=None, partial=partial)

-    def dK_dX(self,partial,X,Z,target):
+    def gradients_X(self,partial,X,Z,target):
         if Z is None:
             self._weave_inline(self._dK_dX_code_X, X, target, Z, partial)
         else:

diff --git a/GPy/likelihoods/gaussian.py b/GPy/likelihoods/gaussian.py
index f77eafbe..071fe62d 100644
--- a/GPy/likelihoods/gaussian.py
+++ b/GPy/likelihoods/gaussian.py
@@ -5,7 +5,7 @@
 """
 A lot of this code assumes that the link function is the identity.

-I think laplace code is okay, but I'm quite sure that the EP moments will only work if the link is identity. 
+I think laplace code is okay, but I'm quite sure that the EP moments will only work if the link is identity.
 Furthermore, exact Gaussian inference can only be done for the identity link, so we should be asserting so for all calls which relate to that.
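The hunk below recomputes N and the log-determinant from y.shape instead of
the cached self.N/self.ln_det_K attributes, which keeps logpdf_link correct
when the data size changes. A standalone check of the closed form it uses,
assuming i.i.d. Gaussian noise (illustrative sketch, not GPy API):

    import numpy as np
    from scipy.stats import norm
    y = np.array([0.1, -0.4, 2.0])[:, None]
    f = np.array([0.0, -0.5, 1.5])[:, None]
    sigma2, N = 0.3, 3
    logpdf = -0.5*(np.sum((y - f)**2/sigma2) + N*np.log(sigma2) + N*np.log(2*np.pi))
    assert np.isclose(logpdf, norm.logpdf(y, loc=f, scale=np.sqrt(sigma2)).sum())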
@@ -130,7 +130,10 @@ class Gaussian(Likelihood): :rtype: float """ assert np.asarray(link_f).shape == np.asarray(y).shape - return -0.5*(np.sum((y-link_f)**2/self.variance) + self.ln_det_K + self.N*np.log(2.*np.pi)) + N = y.shape[0] + ln_det_cov = N*np.log(self.variance) + + return -0.5*(np.sum((y-link_f)**2/self.variance) + ln_det_cov + N*np.log(2.*np.pi)) def dlogpdf_dlink(self, link_f, y, extra_data=None): """ @@ -175,7 +178,8 @@ class Gaussian(Likelihood): (the distribution for y_i depends only on link(f_i) not on link(f_(j!=i)) """ assert np.asarray(link_f).shape == np.asarray(y).shape - hess = -(1.0/self.variance)*np.ones((self.N, 1)) + N = y.shape[0] + hess = -(1.0/self.variance)*np.ones((N, 1)) return hess def d3logpdf_dlink3(self, link_f, y, extra_data=None): """ @@ -194,7 +198,8 @@ class Gaussian(Likelihood): :rtype: Nx1 array """ assert np.asarray(link_f).shape == np.asarray(y).shape - d3logpdf_dlink3 = np.diagonal(0*self.I)[:, None] + N = y.shape[0] + d3logpdf_dlink3 = np.zeros((N,1)) return d3logpdf_dlink3 def dlogpdf_link_dvar(self, link_f, y, extra_data=None): """ @@ -215,7 +220,8 @@ class Gaussian(Likelihood): assert np.asarray(link_f).shape == np.asarray(y).shape e = y - link_f s_4 = 1.0/(self.variance**2) - dlik_dsigma = -0.5*self.N/self.variance + 0.5*s_4*np.sum(np.square(e)) + N = y.shape[0] + dlik_dsigma = -0.5*N/self.variance + 0.5*s_4*np.sum(np.square(e)) return np.sum(dlik_dsigma) # Sure about this sum? def dlogpdf_dlink_dvar(self, link_f, y, extra_data=None): @@ -255,7 +261,8 @@ class Gaussian(Likelihood): """ assert np.asarray(link_f).shape == np.asarray(y).shape s_4 = 1.0/(self.variance**2) - d2logpdf_dlink2_dvar = np.diag(s_4*self.I)[:, None] + N = y.shape[0] + d2logpdf_dlink2_dvar = np.ones((N,1))*s_4 return d2logpdf_dlink2_dvar def dlogpdf_link_dtheta(self, f, y, extra_data=None): diff --git a/GPy/likelihoods/likelihood.py b/GPy/likelihoods/likelihood.py index b0ecfc37..701a5a2f 100644 --- a/GPy/likelihoods/likelihood.py +++ b/GPy/likelihoods/likelihood.py @@ -13,12 +13,12 @@ from ..core.parameterization import Parameterized class Likelihood(Parameterized): """ - Likelihood base class, used to defing p(y|f). + Likelihood base class, used to define p(y|f). All instances use _inverse_ link functions, which can be swapped out. It is expected that inherriting classes define a default inverse link function - To use this class, inherrit and define missing functionality. + To use this class, inherit and define missing functionality. Inherriting classes *must* implement: pdf_link : a bound method which turns the output of the link function into the pdf @@ -27,7 +27,7 @@ class Likelihood(Parameterized): To enable use with EP, inherriting classes *must* define: TODO: a suitable derivative function for any parameters of the class It is also desirable to define: - moments_match_ep : a function to compute the EP moments If this isn't defined, the moments will be computed using 1D quadrature. + moments_match_ep : a function to compute the EP moments. If this isn't defined, the moments will be computed using 1D quadrature. To enable use with Laplace approximation, inherriting classes *must* define: Some derivative functions *AS TODO* @@ -36,7 +36,7 @@ class Likelihood(Parameterized): """ def __init__(self, gp_link, name): - super(Likelihood, self).__init__(name) + super(Likelihood, self).__init__(name) assert isinstance(gp_link,link_functions.GPTransformation), "gp_link is not a valid GPTransformation."
self.gp_link = gp_link self.log_concave = False @@ -44,6 +44,10 @@ class Likelihood(Parameterized): def _gradients(self,partial): return np.zeros(0) + def update_gradients(self, partial): + if self.size > 0: + raise NotImplementedError('Must be implemented for likelihoods with parameters to be optimized') + def _preprocess_values(self,Y): """ In case it is needed, this function assess the output values or makes any pertinent transformation on them. @@ -303,31 +307,31 @@ class Likelihood(Parameterized): """ TODO: Doc strings """ - if len(self._get_param_names()) > 0: + if self.size > 0: link_f = self.gp_link.transf(f) return self.dlogpdf_link_dtheta(link_f, y, extra_data=extra_data) else: #Is no parameters so return an empty array for its derivatives - return np.empty([1, 0]) + return np.zeros([1, 0]) def dlogpdf_df_dtheta(self, f, y, extra_data=None): """ TODO: Doc strings """ - if len(self._get_param_names()) > 0: + if self.size > 0: link_f = self.gp_link.transf(f) dlink_df = self.gp_link.dtransf_df(f) dlogpdf_dlink_dtheta = self.dlogpdf_dlink_dtheta(link_f, y, extra_data=extra_data) return chain_1(dlogpdf_dlink_dtheta, dlink_df) else: #Is no parameters so return an empty array for its derivatives - return np.empty([f.shape[0], 0]) + return np.zeros([f.shape[0], 0]) def d2logpdf_df2_dtheta(self, f, y, extra_data=None): """ TODO: Doc strings """ - if len(self._get_param_names()) > 0: + if self.size > 0: link_f = self.gp_link.transf(f) dlink_df = self.gp_link.dtransf_df(f) d2link_df2 = self.gp_link.d2transf_df2(f) @@ -336,7 +340,7 @@ class Likelihood(Parameterized): return chain_2(d2logpdf_dlink2_dtheta, dlink_df, dlogpdf_dlink_dtheta, d2link_df2) else: #Is no parameters so return an empty array for its derivatives - return np.empty([f.shape[0], 0]) + return np.zeros([f.shape[0], 0]) def _laplace_gradients(self, f, y, extra_data=None): dlogpdf_dtheta = self.dlogpdf_dtheta(f, y, extra_data=extra_data) @@ -345,9 +349,12 @@ class Likelihood(Parameterized): #Parameters are stacked vertically. Must be listed in same order as 'get_param_names' # ensure we have gradients for every parameter we want to optimize - assert dlogpdf_dtheta.shape[1] == len(self._get_param_names()) - assert dlogpdf_df_dtheta.shape[1] == len(self._get_param_names()) - assert d2logpdf_df2_dtheta.shape[1] == len(self._get_param_names()) + try: + assert len(dlogpdf_dtheta) == self.size #1 x num_param array + assert dlogpdf_df_dtheta.shape[1] == self.size #f x num_param matrix + assert d2logpdf_df2_dtheta.shape[1] == self.size #f x num_param matrix + except Exception as e: + import ipdb; ipdb.set_trace() # XXX BREAKPOINT return dlogpdf_dtheta, dlogpdf_df_dtheta, d2logpdf_df2_dtheta diff --git a/GPy/likelihoods/poisson.py b/GPy/likelihoods/poisson.py index 355516bb..ba6915b8 100644 --- a/GPy/likelihoods/poisson.py +++ b/GPy/likelihoods/poisson.py @@ -19,8 +19,11 @@ class Poisson(Likelihood): .. 
Note:: Y is expected to take values in {0,1,2,...} """ - def __init__(self,gp_link=None,analytical_mean=False,analytical_variance=False): - super(Poisson, self).__init__(gp_link,analytical_mean,analytical_variance) + def __init__(self, gp_link=None): + if gp_link is None: + gp_link = link_functions.Log_ex_1() + + super(Poisson, self).__init__(gp_link, name='Poisson') def _preprocess_values(self,Y): return Y diff --git a/GPy/likelihoods/student_t.py b/GPy/likelihoods/student_t.py index 587e1b23..ac93f204 100644 --- a/GPy/likelihoods/student_t.py +++ b/GPy/likelihoods/student_t.py @@ -8,6 +8,7 @@ import link_functions from scipy import stats, integrate from scipy.special import gammaln, gamma from likelihood import Likelihood +from ..core.parameterization import Param class StudentT(Likelihood): """ @@ -19,26 +20,30 @@ class StudentT(Likelihood): p(y_{i}|\\lambda(f_{i})) = \\frac{\\Gamma\\left(\\frac{v+1}{2}\\right)}{\\Gamma\\left(\\frac{v}{2}\\right)\\sqrt{v\\pi\\sigma^{2}}}\\left(1 + \\frac{1}{v}\\left(\\frac{(y_{i} - f_{i})^{2}}{\\sigma^{2}}\\right)\\right)^{\\frac{-v+1}{2}} """ - def __init__(self,gp_link=None,analytical_mean=True,analytical_variance=True, deg_free=5, sigma2=2): - self.v = deg_free - self.sigma2 = sigma2 + def __init__(self,gp_link=None, deg_free=5, sigma2=2): + if gp_link is None: + gp_link = link_functions.Identity() + + super(StudentT, self).__init__(gp_link, name='Student_T') + + self.sigma2 = Param('t_noise', float(sigma2)) + self.v = Param('deg_free', float(deg_free)) + self.add_parameter(self.sigma2) + self.add_parameter(self.v) + self.v.constrain_fixed() - self._set_params(np.asarray(sigma2)) - super(StudentT, self).__init__(gp_link,analytical_mean,analytical_variance) self.log_concave = False - def _get_params(self): - return np.asarray(self.sigma2) + def parameters_changed(self): + self.variance = (self.v / float(self.v - 2)) * self.sigma2 - def _get_param_names(self): - return ["t_noise_std2"] - - def _set_params(self, x): - self.sigma2 = float(x) - - @property - def variance(self, extra_data=None): - return (self.v / float(self.v - 2)) * self.sigma2 + def update_gradients(self, partial): + """ + Pull out the gradients, be careful as the order must match the order + in which the parameters are added + """ + self.sigma2.gradient = partial[0] + self.v.gradient = partial[1] def pdf_link(self, link_f, y, extra_data=None): """ @@ -82,10 +87,14 @@ class StudentT(Likelihood): """ assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape e = y - link_f + #FIXME: + #Why does np.log(1 + (1/self.v)*((y-link_f)**2)/self.sigma2) suppress the divide by zero?! 
+ #But np.log(1 + (1/float(self.v))*((y-link_f)**2)/self.sigma2) throws it correctly + #print - 0.5*(self.v + 1)*np.log(1 + (1/np.float(self.v))*((e**2)/self.sigma2)) objective = (+ gammaln((self.v + 1) * 0.5) - - gammaln(self.v * 0.5) - - 0.5*np.log(self.sigma2 * self.v * np.pi) - - 0.5*(self.v + 1)*np.log(1 + (1/np.float(self.v))*((e**2)/self.sigma2)) + - gammaln(self.v * 0.5) + - 0.5*np.log(self.sigma2 * self.v * np.pi) + - 0.5*(self.v + 1)*np.log(1 + (1/np.float(self.v))*((e**2)/self.sigma2)) ) return np.sum(objective) @@ -222,17 +231,20 @@ class StudentT(Likelihood): def dlogpdf_link_dtheta(self, f, y, extra_data=None): dlogpdf_dvar = self.dlogpdf_link_dvar(f, y, extra_data=extra_data) - return np.asarray([[dlogpdf_dvar]]) + dlogpdf_dv = np.zeros_like(dlogpdf_dvar) #FIXME: Not done yet + return np.hstack((dlogpdf_dvar, dlogpdf_dv)) def dlogpdf_dlink_dtheta(self, f, y, extra_data=None): dlogpdf_dlink_dvar = self.dlogpdf_dlink_dvar(f, y, extra_data=extra_data) - return dlogpdf_dlink_dvar + dlogpdf_dlink_dv = np.zeros_like(dlogpdf_dlink_dvar) #FIXME: Not done yet + return np.hstack((dlogpdf_dlink_dvar, dlogpdf_dlink_dv)) def d2logpdf_dlink2_dtheta(self, f, y, extra_data=None): d2logpdf_dlink2_dvar = self.d2logpdf_dlink2_dvar(f, y, extra_data=extra_data) - return d2logpdf_dlink2_dvar + d2logpdf_dlink2_dv = np.zeros_like(d2logpdf_dlink2_dvar) #FIXME: Not done yet + return np.hstack((d2logpdf_dlink2_dvar, d2logpdf_dlink2_dv)) - def _predictive_variance_analytical(self, mu, sigma, predictive_mean=None): + def predictive_variance(self, mu, sigma, predictive_mean=None): """ Compute predictive variance of student_t*normal p(y*|f*)p(f*) @@ -252,7 +264,7 @@ class StudentT(Likelihood): return true_var - def _predictive_mean_analytical(self, mu, sigma): + def predictive_mean(self, mu, sigma): """ Compute mean of the prediction """ diff --git a/GPy/mappings/kernel.py b/GPy/mappings/kernel.py index ccd1462a..94ce203f 100644 --- a/GPy/mappings/kernel.py +++ b/GPy/mappings/kernel.py @@ -57,4 +57,4 @@ class Kernel(Mapping): return np.hstack((self._df_dA.flatten(), self._df_dbias)) def df_dX(self, dL_df, X): - return self.kern.dK_dX((dL_df[:, None, :]*self.A[None, :, :]).sum(2), X, self.X) + return self.kern.gradients_X((dL_df[:, None, :]*self.A[None, :, :]).sum(2), X, self.X) diff --git a/GPy/models/bayesian_gplvm.py b/GPy/models/bayesian_gplvm.py index 7a22b5ea..78851147 100644 --- a/GPy/models/bayesian_gplvm.py +++ b/GPy/models/bayesian_gplvm.py @@ -2,7 +2,6 @@ # Licensed under the BSD 3-clause license (see LICENSE.txt) import numpy as np -import itertools from gplvm import GPLVM from .. 
import kern from ..core import SparseGP @@ -23,15 +22,10 @@ class BayesianGPLVM(SparseGP, GPLVM): :type init: 'PCA'|'random' """ - def __init__(self, likelihood_or_Y, input_dim, X=None, X_variance=None, init='PCA', num_inducing=10, - Z=None, kernel=None, name='bayesian gplvm', **kwargs): - if type(likelihood_or_Y) is np.ndarray: - likelihood = Gaussian(likelihood_or_Y) - else: - likelihood = likelihood_or_Y - + def __init__(self, Y, input_dim, X=None, X_variance=None, init='PCA', num_inducing=10, + Z=None, kernel=None, inference_method=None, likelihood=Gaussian(), name='bayesian gplvm', **kwargs): if X == None: - X = self.initialise_latent(init, input_dim, likelihood.Y) + X = self.initialise_latent(init, input_dim, Y) self.init = init if X_variance is None: @@ -44,9 +38,9 @@ class BayesianGPLVM(SparseGP, GPLVM): if kernel is None: kernel = kern.rbf(input_dim) # + kern.white(input_dim) - SparseGP.__init__(self, X=X, likelihood=likelihood, kernel=kernel, Z=Z, X_variance=X_variance, name=name, **kwargs) - self.q = Normal(self.X, self.X_variance) - self.add_parameter(self.q, gradient=self._dbound_dmuS, index=0) + self.q = Normal(X, X_variance) + SparseGP.__init__(self, X, Y, Z, kernel, likelihood, inference_method, X_variance, name, **kwargs) + self.add_parameter(self.q, index=0) self.ensure_default_constraints() def _getstate(self): @@ -94,9 +88,9 @@ class BayesianGPLVM(SparseGP, GPLVM): return dKL_dmu, dKL_dS def dL_dmuS(self): - dL_dmu_psi0, dL_dS_psi0 = self.kern.dpsi0_dmuS(self.dL_dpsi0, self.Z, self.X, self.X_variance) - dL_dmu_psi1, dL_dS_psi1 = self.kern.dpsi1_dmuS(self.dL_dpsi1, self.Z, self.X, self.X_variance) - dL_dmu_psi2, dL_dS_psi2 = self.kern.dpsi2_dmuS(self.dL_dpsi2, self.Z, self.X, self.X_variance) + dL_dmu_psi0, dL_dS_psi0 = self.kern.dpsi0_dmuS(self.grad_dict['dL_dpsi0'], self.Z, self.X, self.X_variance) + dL_dmu_psi1, dL_dS_psi1 = self.kern.dpsi1_dmuS(self.grad_dict['dL_dpsi1'], self.Z, self.X, self.X_variance) + dL_dmu_psi2, dL_dS_psi2 = self.kern.dpsi2_dmuS(self.grad_dict['dL_dpsi2'], self.Z, self.X, self.X_variance) dL_dmu = dL_dmu_psi0 + dL_dmu_psi1 + dL_dmu_psi2 dL_dS = dL_dS_psi0 + dL_dS_psi1 + dL_dS_psi2 @@ -107,10 +101,25 @@ class BayesianGPLVM(SparseGP, GPLVM): var_S = np.sum(self.X_variance - np.log(self.X_variance)) return 0.5 * (var_mean + var_S) - 0.5 * self.input_dim * self.num_data - def log_likelihood(self): - ll = SparseGP.log_likelihood(self) - kl = self.KL_divergence() - return ll - kl + def parameters_changed(self): + self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.X_variance, self.Z, self.likelihood, self.Y) + self._log_marginal_likelihood -= self.KL_divergence() + + #The derivative of the bound wrt the inducing inputs Z + self.Z.gradient = self.kern.gradients_X(self.grad_dict['dL_dKmm'], self.Z) + self.Z.gradient += self.kern.dpsi1_dZ(self.grad_dict['dL_dpsi1'], self.Z, self.X, self.X_variance) + self.Z.gradient += self.kern.dpsi2_dZ(self.grad_dict['dL_dpsi2'], self.Z, self.X, self.X_variance) + + dL_dmu, dL_dS = self.dL_dmuS() + dKL_dmu, dKL_dS = self.dKL_dmuS() + self.q.means.gradient = dL_dmu - dKL_dmu + self.q.variances.gradient = dL_dS - dKL_dS + + +# def log_likelihood(self): +# ll = SparseGP.log_likelihood(self) +# kl = self.KL_divergence() +# return ll - kl def _dbound_dmuS(self): dKL_dmu, dKL_dS = self.dKL_dmuS() @@ -181,18 +190,18 @@ class BayesianGPLVM(SparseGP, GPLVM): """ dmu_dX = np.zeros_like(Xnew) for i in range(self.Z.shape[0]): - dmu_dX += 
self.kern.dK_dX(self.Cpsi1Vf[i:i + 1, :], Xnew, self.Z[i:i + 1, :]) + dmu_dX += self.kern.gradients_X(self.Cpsi1Vf[i:i + 1, :], Xnew, self.Z[i:i + 1, :]) return dmu_dX def dmu_dXnew(self, Xnew): """ Individual gradient of prediction at Xnew w.r.t. each sample in Xnew """ - dK_dX = np.zeros((Xnew.shape[0], self.num_inducing)) + gradients_X = np.zeros((Xnew.shape[0], self.num_inducing)) ones = np.ones((1, 1)) for i in range(self.Z.shape[0]): - dK_dX[:, i] = self.kern.dK_dX(ones, Xnew, self.Z[i:i + 1, :]).sum(-1) - return np.dot(dK_dX, self.Cpsi1Vf) + gradients_X[:, i] = self.kern.gradients_X(ones, Xnew, self.Z[i:i + 1, :]).sum(-1) + return np.dot(gradients_X, self.Cpsi1Vf) def plot_steepest_gradient_map(self, *args, ** kwargs): """ diff --git a/GPy/models/bcgplvm.py b/GPy/models/bcgplvm.py index 9f5866c3..f21a01f4 100644 --- a/GPy/models/bcgplvm.py +++ b/GPy/models/bcgplvm.py @@ -44,7 +44,7 @@ class BCGPLVM(GPLVM): GP._set_params(self, x[self.mapping.num_params:]) def _log_likelihood_gradients(self): - dL_df = self.kern.dK_dX(self.dL_dK, self.X) + dL_df = self.kern.gradients_X(self.dL_dK, self.X) dL_dtheta = self.mapping.df_dtheta(dL_df, self.likelihood.Y) return np.hstack((dL_dtheta.flatten(), GP._log_likelihood_gradients(self))) diff --git a/GPy/models/gplvm.py b/GPy/models/gplvm.py index fc328ff2..06481b81 100644 --- a/GPy/models/gplvm.py +++ b/GPy/models/gplvm.py @@ -60,7 +60,7 @@ class GPLVM(GP): def jacobian(self,X): target = np.zeros((X.shape[0],X.shape[1],self.output_dim)) for i in range(self.output_dim): - target[:,:,i]=self.kern.dK_dX(np.dot(self.Ki,self.likelihood.Y[:,i])[None, :],X,self.X) + target[:,:,i]=self.kern.gradients_X(np.dot(self.Ki,self.likelihood.Y[:,i])[None, :],X,self.X) return target def magnification(self,X): diff --git a/GPy/models/gradient_checker.py b/GPy/models/gradient_checker.py index 775334ac..b7c78449 100644 --- a/GPy/models/gradient_checker.py +++ b/GPy/models/gradient_checker.py @@ -1,9 +1,10 @@ # ## Copyright (c) 2012, GPy authors (see AUTHORS.txt). # Licensed under the BSD 3-clause license (see LICENSE.txt) -from GPy.core.model import Model +from ..core.model import Model import itertools import numpy +from ..core.parameterization import Param def get_shape(x): if isinstance(x, numpy.ndarray): @@ -24,42 +25,42 @@ class GradientChecker(Model): """ :param f: Function to check gradient for :param df: Gradient of function to check - :param x0: + :param x0: Initial guess for inputs x (if it has a shape (a,b) this will be reflected in the parameter names). Can be a list of arrays, if takes a list of arrays. This list will be passed + Can be a list of arrays, if f takes a list of arrays. This list will be passed to f and df in the same order as given here. If only one argument, make sure not to pass a list!!! - + :type x0: [array-like] | array-like | float | int :param names: Names to print, when performing gradcheck. If a list was passed to x0 a list of names with the same length is expected.
:param args: Arguments passed as f(x, *args, **kwargs) and df(x, *args, **kwargs) - + Examples: --------- from GPy.models import GradientChecker N, M, Q = 10, 5, 3 - + Sinusoid: - + X = numpy.random.rand(N, Q) grad = GradientChecker(numpy.sin,numpy.cos,X,'x') grad.checkgrad(verbose=1) - + Using GPy: - + X, Z = numpy.random.randn(N,Q), numpy.random.randn(M,Q) kern = GPy.kern.linear(Q, ARD=True) + GPy.kern.rbf(Q, ARD=True) - grad = GradientChecker(kern.K, + grad = GradientChecker(kern.K, lambda x: 2*kern.dK_dX(numpy.ones((1,1)), x), x0 = X.copy(), - names='X') + names='X') grad.checkgrad(verbose=1) grad.randomize() - grad.checkgrad(verbose=1) + grad.checkgrad(verbose=1) """ - Model.__init__(self) + Model.__init__(self, 'GradientChecker') if isinstance(x0, (list, tuple)) and names is None: self.shapes = [get_shape(xi) for xi in x0] self.names = ['X{i}'.format(i=i) for i in range(len(x0))] @@ -72,8 +73,10 @@ class GradientChecker(Model): else: self.names = names self.shapes = [get_shape(x0)] + for name, xi in zip(self.names, at_least_one_element(x0)): - self.__setattr__(name, xi) + self.__setattr__(name, Param(name, xi)) + self.add_parameter(self.__getattribute__(name)) # self._param_names = [] # for name, shape in zip(self.names, self.shapes): # self._param_names.extend(map(lambda nameshape: ('_'.join(nameshape)).strip('_'), itertools.izip(itertools.repeat(name), itertools.imap(lambda t: '_'.join(map(str, t)), itertools.product(*map(lambda xi: range(xi), shape)))))) @@ -93,20 +96,18 @@ class GradientChecker(Model): def _log_likelihood_gradients(self): return numpy.atleast_1d(self.df(*self._get_x(), **self.kwargs)).flatten() + #def _get_params(self): + #return numpy.atleast_1d(numpy.hstack(map(lambda name: flatten_if_needed(self.__getattribute__(name)), self.names))) - def _get_params(self): - return numpy.atleast_1d(numpy.hstack(map(lambda name: flatten_if_needed(self.__getattribute__(name)), self.names))) + #def _set_params(self, x): + #current_index = 0 + #for name, shape in zip(self.names, self.shapes): + #current_size = numpy.prod(shape) + #self.__setattr__(name, x[current_index:current_index + current_size].reshape(shape)) + #current_index += current_size - - def _set_params(self, x): - current_index = 0 - for name, shape in zip(self.names, self.shapes): - current_size = numpy.prod(shape) - self.__setattr__(name, x[current_index:current_index + current_size].reshape(shape)) - current_index += current_size - - def _get_param_names(self): - _param_names = [] - for name, shape in zip(self.names, self.shapes): - _param_names.extend(map(lambda nameshape: ('_'.join(nameshape)).strip('_'), itertools.izip(itertools.repeat(name), itertools.imap(lambda t: '_'.join(map(str, t)), itertools.product(*map(lambda xi: range(xi), shape)))))) - return _param_names + #def _get_param_names(self): + #_param_names = [] + #for name, shape in zip(self.names, self.shapes): + #_param_names.extend(map(lambda nameshape: ('_'.join(nameshape)).strip('_'), itertools.izip(itertools.repeat(name), itertools.imap(lambda t: '_'.join(map(str, t)), itertools.product(*map(lambda xi: range(xi), shape)))))) + #return _param_names diff --git a/GPy/models/sparse_gplvm.py b/GPy/models/sparse_gplvm.py index e3024264..5c10d0b8 100644 --- a/GPy/models/sparse_gplvm.py +++ b/GPy/models/sparse_gplvm.py @@ -52,7 +52,7 @@ class SparseGPLVM(SparseGPRegression, GPLVM): def dL_dX(self): dL_dX = self.kern.dKdiag_dX(self.dL_dpsi0, self.X) - dL_dX += self.kern.dK_dX(self.dL_dpsi1, self.X, self.Z) + dL_dX += self.kern.gradients_X(self.dL_dpsi1, 
self.X, self.Z) return dL_dX diff --git a/GPy/plotting/matplot_dep/variational_plots.py b/GPy/plotting/matplot_dep/variational_plots.py index 9f791dd1..7c89a088 100644 --- a/GPy/plotting/matplot_dep/variational_plots.py +++ b/GPy/plotting/matplot_dep/variational_plots.py @@ -1,4 +1,5 @@ -import pylab as pb +import pylab as pb, numpy as np +from ...util.misc import param_to_array def plot(parameterized, fignum=None, ax=None, colors=None): """ diff --git a/GPy/testing/kernel_tests.py b/GPy/testing/kernel_tests.py index 0fceac60..40cd66dd 100644 --- a/GPy/testing/kernel_tests.py +++ b/GPy/testing/kernel_tests.py @@ -5,7 +5,7 @@ import unittest import numpy as np import GPy -verbose = False +verbose = True try: import sympy diff --git a/GPy/testing/likelihood_tests.py b/GPy/testing/likelihood_tests.py index d14c9a41..7f48ac95 100644 --- a/GPy/testing/likelihood_tests.py +++ b/GPy/testing/likelihood_tests.py @@ -4,10 +4,11 @@ import GPy from GPy.models import GradientChecker import functools import inspect -from GPy.likelihoods.noise_models import gp_transformations +from GPy.likelihoods import link_functions +from ..core.parameterization import Param from functools import partial #np.random.seed(300) -np.random.seed(7) +#np.random.seed(7) def dparam_partial(inst_func, *args): """ @@ -22,12 +23,14 @@ def dparam_partial(inst_func, *args): the f or Y that are being used in the function whilst we tweak the param """ - def param_func(param, inst_func, args): - inst_func.im_self._set_params(param) + def param_func(param_val, param_name, inst_func, args): + #inst_func.im_self._set_params(param) + #inst_func.im_self.add_parameter(Param(param_name, param_val)) + inst_func.im_self[param_name] = param_val return inst_func(*args) return functools.partial(param_func, inst_func=inst_func, args=args) -def dparam_checkgrad(func, dfunc, params, args, constraints=None, randomize=False, verbose=False): +def dparam_checkgrad(func, dfunc, params, params_names, args, constraints=None, randomize=False, verbose=False): """ checkgrad expects a f: R^N -> R^1 and df: R^N -> R^N However if we are holding other parameters fixed and moving something else @@ -38,27 +41,34 @@ def dparam_checkgrad(func, dfunc, params, args, constraints=None, randomize=Fals The number of parameters and N is the number of data Need to take a slice out from f and a slice out of df """ - #print "\n{} likelihood: {} vs {}".format(func.im_self.__class__.__name__, - #func.__name__, dfunc.__name__) + print "\n{} likelihood: {} vs {}".format(func.im_self.__class__.__name__, + func.__name__, dfunc.__name__) partial_f = dparam_partial(func, *args) partial_df = dparam_partial(dfunc, *args) gradchecking = True - for param in params: - fnum = np.atleast_1d(partial_f(param)).shape[0] - dfnum = np.atleast_1d(partial_df(param)).shape[0] + zipped_params = zip(params, params_names) + for param_ind, (param_val, param_name) in enumerate(zipped_params): + #Check one parameter at a time, make sure it is 2d (as some gradients only return arrays) then strip out the parameter + fnum = np.atleast_2d(partial_f(param_val, param_name))[:, param_ind].shape[0] + dfnum = np.atleast_2d(partial_df(param_val, param_name))[:, param_ind].shape[0] for fixed_val in range(dfnum): #dlik and dlik_dvar gives back 1 value for each f_ind = min(fnum, fixed_val+1) - 1 print "fnum: {} dfnum: {} f_ind: {} fixed_val: {}".format(fnum, dfnum, f_ind, fixed_val) #Make grad checker with this param moving, note that set_params is NOT being called #The parameter is being set directly with 
__setattr__ - grad = GradientChecker(lambda x: np.atleast_1d(partial_f(x))[f_ind], - lambda x : np.atleast_1d(partial_df(x))[fixed_val], - param, 'p') - #This is not general for more than one param... + #Check only the parameter and function value we wish to check at a time + grad = GradientChecker(lambda p_val: np.atleast_2d(partial_f(p_val, param_name))[f_ind, param_ind], + lambda p_val: np.atleast_2d(partial_df(p_val, param_name))[fixed_val, param_ind], + param_val, [param_name]) + if constraints is not None: - for constraint in constraints: - constraint('p', grad) + for constrain_param, constraint in constraints: + if grad.grep_param_names(constrain_param): + constraint(constrain_param, grad) + else: + print "parameter didn't exist" + print constrain_param, " ", constraint if randomize: grad.randomize() if verbose: @@ -76,7 +86,7 @@ class TestNoiseModels(object): Generic model checker """ def setUp(self): - self.N = 5 + self.N = 15 self.D = 3 self.X = np.random.rand(self.N, self.D)*10 @@ -94,7 +104,7 @@ class TestNoiseModels(object): self.var = np.random.rand(1) #Make a bigger step as lower bound can be quite curved - self.step = 1e-3 + self.step = 1e-4 def tearDown(self): self.Y = None @@ -107,17 +117,20 @@ class TestNoiseModels(object): #################################################### # Constraint wrappers so we can just list them off # #################################################### + def constrain_fixed(regex, model): + model[regex].constrain_fixed() + def constrain_negative(regex, model): - model.constrain_negative(regex) + model[regex].constrain_negative() def constrain_positive(regex, model): - model.constrain_positive(regex) + model[regex].constrain_positive() def constrain_bounded(regex, model, lower, upper): """ Used like: partial(constrain_bounded, lower=0, upper=1) """ - model.constrain_bounded(regex, lower, upper) + model[regex].constrain_bounded(lower, upper) """ Dictionary where we nest models we would like to check @@ -134,71 +147,81 @@ class TestNoiseModels(object): } """ noise_models = {"Student_t_default": { - "model": GPy.likelihoods.student_t(deg_free=5, sigma2=self.var), + "model": GPy.likelihoods.StudentT(deg_free=5, sigma2=self.var), "grad_params": { "names": ["t_noise"], "vals": [self.var], - "constraints": [constrain_positive] + "constraints": [("t_noise", constrain_positive), ("deg_free", constrain_fixed)] + #"constraints": [("t_noise", constrain_positive), ("deg_free", partial(constrain_fixed, value=5))] }, "laplace": True }, "Student_t_1_var": { - "model": GPy.likelihoods.student_t(deg_free=5, sigma2=self.var), + "model": GPy.likelihoods.StudentT(deg_free=5, sigma2=self.var), "grad_params": { "names": ["t_noise"], "vals": [1.0], - "constraints": [constrain_positive] + "constraints": [("t_noise", constrain_positive), ("deg_free", constrain_fixed)] + }, + "laplace": True + }, + "Student_t_small_deg_free": { + "model": GPy.likelihoods.StudentT(deg_free=1.5, sigma2=self.var), + "grad_params": { + "names": ["t_noise"], + "vals": [self.var], + "constraints": [("t_noise", constrain_positive), ("deg_free", constrain_fixed)] }, "laplace": True }, "Student_t_small_var": { - "model": GPy.likelihoods.student_t(deg_free=5, sigma2=self.var), + "model": GPy.likelihoods.StudentT(deg_free=5, sigma2=self.var), "grad_params": { "names": ["t_noise"], - "vals": [0.01], - "constraints": [constrain_positive] + "vals": [0.0001], + "constraints": [("t_noise", constrain_positive), ("deg_free", constrain_fixed)] }, "laplace": True }, "Student_t_large_var": { - "model": 
GPy.likelihoods.student_t(deg_free=5, sigma2=self.var), + "model": GPy.likelihoods.StudentT(deg_free=5, sigma2=self.var), "grad_params": { "names": ["t_noise"], "vals": [10.0], - "constraints": [constrain_positive] + "constraints": [("t_noise", constrain_positive), ("deg_free", constrain_fixed)] }, "laplace": True }, "Student_t_approx_gauss": { - "model": GPy.likelihoods.student_t(deg_free=1000, sigma2=self.var), + "model": GPy.likelihoods.StudentT(deg_free=1000, sigma2=self.var), "grad_params": { "names": ["t_noise"], "vals": [self.var], - "constraints": [constrain_positive] + "constraints": [("t_noise", constrain_positive), ("deg_free", constrain_fixed)] }, "laplace": True }, "Student_t_log": { - "model": GPy.likelihoods.student_t(gp_link=gp_transformations.Log(), deg_free=5, sigma2=self.var), + "model": GPy.likelihoods.StudentT(gp_link=link_functions.Log(), deg_free=5, sigma2=self.var), "grad_params": { "names": ["t_noise"], "vals": [self.var], - "constraints": [constrain_positive] + "constraints": [("t_noise", constrain_positive), ("deg_free", constrain_fixed)] }, "laplace": True }, "Gaussian_default": { - "model": GPy.likelihoods.gaussian(variance=self.var, D=self.D, N=self.N), + "model": GPy.likelihoods.Gaussian(variance=self.var), "grad_params": { - "names": ["noise_model_variance"], + "names": ["variance"], "vals": [self.var], - "constraints": [constrain_positive] + "constraints": [("variance", constrain_positive)] }, "laplace": True, "ep": True }, #"Gaussian_log": { - #"model": GPy.likelihoods.gaussian(gp_link=gp_transformations.Log(), variance=self.var, D=self.D, N=self.N), + #"model": GPy.likelihoods.gaussian(gp_link=link_functions.Log(), variance=self.var, D=self.D, N=self.N), #"grad_params": { #"names": ["noise_model_variance"], #"vals": [self.var], @@ -207,7 +230,7 @@ class TestNoiseModels(object): #"laplace": True #}, #"Gaussian_probit": { - #"model": GPy.likelihoods.gaussian(gp_link=gp_transformations.Probit(), variance=self.var, D=self.D, N=self.N), + #"model": GPy.likelihoods.gaussian(gp_link=link_functions.Probit(), variance=self.var, D=self.D, N=self.N), #"grad_params": { #"names": ["noise_model_variance"], #"vals": [self.var], @@ -216,7 +239,7 @@ class TestNoiseModels(object): #"laplace": True #}, #"Gaussian_log_ex": { - #"model": GPy.likelihoods.gaussian(gp_link=gp_transformations.Log_ex_1(), variance=self.var, D=self.D, N=self.N), + #"model": GPy.likelihoods.gaussian(gp_link=link_functions.Log_ex_1(), variance=self.var, D=self.D, N=self.N), #"grad_params": { #"names": ["noise_model_variance"], #"vals": [self.var], @@ -225,31 +248,31 @@ class TestNoiseModels(object): #"laplace": True #}, "Bernoulli_default": { - "model": GPy.likelihoods.bernoulli(), + "model": GPy.likelihoods.Bernoulli(), "link_f_constraints": [partial(constrain_bounded, lower=0, upper=1)], "laplace": True, "Y": self.binary_Y, "ep": True }, - "Exponential_default": { - "model": GPy.likelihoods.exponential(), - "link_f_constraints": [constrain_positive], - "Y": self.positive_Y, - "laplace": True, - }, - "Poisson_default": { - "model": GPy.likelihoods.poisson(), - "link_f_constraints": [constrain_positive], - "Y": self.integer_Y, - "laplace": True, - "ep": False #Should work though... 
- }, - "Gamma_default": { - "model": GPy.likelihoods.gamma(), - "link_f_constraints": [constrain_positive], - "Y": self.positive_Y, - "laplace": True - } + #"Exponential_default": { + #"model": GPy.likelihoods.exponential(), + #"link_f_constraints": [constrain_positive], + #"Y": self.positive_Y, + #"laplace": True, + #}, + #"Poisson_default": { + #"model": GPy.likelihoods.poisson(), + #"link_f_constraints": [constrain_positive], + #"Y": self.integer_Y, + #"laplace": True, + #"ep": False #Should work though... + #}, + #"Gamma_default": { + #"model": GPy.likelihoods.gamma(), + #"link_f_constraints": [constrain_positive], + #"Y": self.positive_Y, + #"laplace": True + #} } for name, attributes in noise_models.iteritems(): @@ -286,8 +309,8 @@ class TestNoiseModels(object): else: ep = False - if len(param_vals) > 1: - raise NotImplementedError("Cannot support multiple params in likelihood yet!") + #if len(param_vals) > 1: + #raise NotImplementedError("Cannot support multiple params in likelihood yet!") #Required by all #Normal derivatives @@ -302,13 +325,13 @@ class TestNoiseModels(object): yield self.t_d3logpdf_df3, model, Y, f yield self.t_d3logpdf_dlink3, model, Y, f, link_f_constraints #Params - yield self.t_dlogpdf_dparams, model, Y, f, param_vals, param_constraints - yield self.t_dlogpdf_df_dparams, model, Y, f, param_vals, param_constraints - yield self.t_d2logpdf2_df2_dparams, model, Y, f, param_vals, param_constraints + yield self.t_dlogpdf_dparams, model, Y, f, param_vals, param_names, param_constraints + yield self.t_dlogpdf_df_dparams, model, Y, f, param_vals, param_names, param_constraints + yield self.t_d2logpdf2_df2_dparams, model, Y, f, param_vals, param_names, param_constraints #Link params - yield self.t_dlogpdf_link_dparams, model, Y, f, param_vals, param_constraints - yield self.t_dlogpdf_dlink_dparams, model, Y, f, param_vals, param_constraints - yield self.t_d2logpdf2_dlink2_dparams, model, Y, f, param_vals, param_constraints + yield self.t_dlogpdf_link_dparams, model, Y, f, param_vals, param_names, param_constraints + yield self.t_dlogpdf_dlink_dparams, model, Y, f, param_vals, param_names, param_constraints + yield self.t_d2logpdf2_dlink2_dparams, model, Y, f, param_vals, param_names, param_constraints #laplace likelihood gradcheck yield self.t_laplace_fit_rbf_white, model, self.X, Y, f, self.step, param_vals, param_names, param_constraints @@ -370,33 +393,33 @@ class TestNoiseModels(object): # df_dparams # ############## @with_setup(setUp, tearDown) - def t_dlogpdf_dparams(self, model, Y, f, params, param_constraints): + def t_dlogpdf_dparams(self, model, Y, f, params, params_names, param_constraints): print "\n{}".format(inspect.stack()[0][3]) print model assert ( dparam_checkgrad(model.logpdf, model.dlogpdf_dtheta, - params, args=(f, Y), constraints=param_constraints, - randomize=True, verbose=True) + params, params_names, args=(f, Y), constraints=param_constraints, + randomize=False, verbose=True) ) @with_setup(setUp, tearDown) - def t_dlogpdf_df_dparams(self, model, Y, f, params, param_constraints): + def t_dlogpdf_df_dparams(self, model, Y, f, params, params_names, param_constraints): print "\n{}".format(inspect.stack()[0][3]) print model assert ( dparam_checkgrad(model.dlogpdf_df, model.dlogpdf_df_dtheta, - params, args=(f, Y), constraints=param_constraints, - randomize=True, verbose=True) + params, params_names, args=(f, Y), constraints=param_constraints, + randomize=False, verbose=True) ) @with_setup(setUp, tearDown) - def t_d2logpdf2_df2_dparams(self, model, Y, 
f, params, param_constraints): + def t_d2logpdf2_df2_dparams(self, model, Y, f, params, params_names, param_constraints): print "\n{}".format(inspect.stack()[0][3]) print model assert ( dparam_checkgrad(model.d2logpdf_df2, model.d2logpdf_df2_dtheta, - params, args=(f, Y), constraints=param_constraints, - randomize=True, verbose=True) + params, params_names, args=(f, Y), constraints=param_constraints, + randomize=False, verbose=True) ) ################ @@ -454,32 +477,32 @@ class TestNoiseModels(object): # dlink_dparams # ################# @with_setup(setUp, tearDown) - def t_dlogpdf_link_dparams(self, model, Y, f, params, param_constraints): + def t_dlogpdf_link_dparams(self, model, Y, f, params, param_names, param_constraints): print "\n{}".format(inspect.stack()[0][3]) print model assert ( dparam_checkgrad(model.logpdf_link, model.dlogpdf_link_dtheta, - params, args=(f, Y), constraints=param_constraints, + params, param_names, args=(f, Y), constraints=param_constraints, randomize=False, verbose=True) ) @with_setup(setUp, tearDown) - def t_dlogpdf_dlink_dparams(self, model, Y, f, params, param_constraints): + def t_dlogpdf_dlink_dparams(self, model, Y, f, params, param_names, param_constraints): print "\n{}".format(inspect.stack()[0][3]) print model assert ( dparam_checkgrad(model.dlogpdf_dlink, model.dlogpdf_dlink_dtheta, - params, args=(f, Y), constraints=param_constraints, + params, param_names, args=(f, Y), constraints=param_constraints, randomize=False, verbose=True) ) @with_setup(setUp, tearDown) - def t_d2logpdf2_dlink2_dparams(self, model, Y, f, params, param_constraints): + def t_d2logpdf2_dlink2_dparams(self, model, Y, f, params, param_names, param_constraints): print "\n{}".format(inspect.stack()[0][3]) print model assert ( dparam_checkgrad(model.d2logpdf_dlink2, model.d2logpdf_dlink2_dtheta, - params, args=(f, Y), constraints=param_constraints, + params, param_names, args=(f, Y), constraints=param_constraints, randomize=False, verbose=True) ) @@ -493,18 +516,23 @@ class TestNoiseModels(object): Y = Y/Y.max() white_var = 1e-6 kernel = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1]) - laplace_likelihood = GPy.likelihoods.Laplace(Y.copy(), model) - m = GPy.models.GPRegression(X.copy(), Y.copy(), kernel, likelihood=laplace_likelihood) + laplace_likelihood = GPy.inference.latent_function_inference.LaplaceInference() + m = GPy.core.GP(X.copy(), Y.copy(), kernel, likelihood=model, inference_method=laplace_likelihood) m.ensure_default_constraints() - m.constrain_fixed('white', white_var) + m['white'].constrain_fixed(white_var) - for param_num in range(len(param_names)): - name = param_names[param_num] - m[name] = param_vals[param_num] - constraints[param_num](name, m) + #Set constraints + for constrain_param, constraint in constraints: + constraint(constrain_param, m) print m m.randomize() + + #Set params + for param_num in range(len(param_names)): + name = param_names[param_num] + m[name] = param_vals[param_num] + #m.optimize(max_iters=8) print m m.checkgrad(verbose=1, step=step) @@ -525,10 +553,10 @@ class TestNoiseModels(object): Y = Y/Y.max() white_var = 1e-6 kernel = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1]) - ep_likelihood = GPy.likelihoods.EP(Y.copy(), model) - m = GPy.models.GPRegression(X.copy(), Y.copy(), kernel, likelihood=ep_likelihood) + ep_inf = GPy.inference.latent_function_inference.EP() + m = GPy.core.GP(X.copy(), Y.copy(), kernel=kernel, likelihood=model, inference_method=ep_inf) m.ensure_default_constraints() - m.constrain_fixed('white', white_var) 
+ m['white'].constrain_fixed(white_var) for param_num in range(len(param_names)): name = param_names[param_num] @@ -559,8 +587,8 @@ class LaplaceTests(unittest.TestCase): self.var = 0.2 self.var = np.random.rand(1) - self.stu_t = GPy.likelihoods.student_t(deg_free=5, sigma2=self.var) - self.gauss = GPy.likelihoods.gaussian(gp_transformations.Log(), variance=self.var, D=self.D, N=self.N) + self.stu_t = GPy.likelihoods.StudentT(deg_free=5, sigma2=self.var) + self.gauss = GPy.likelihoods.Gaussian(gp_link=link_functions.Log(), variance=self.var) #Make a bigger step as lower bound can be quite curved self.step = 1e-6 @@ -584,7 +612,7 @@ class LaplaceTests(unittest.TestCase): noise = np.random.randn(*self.X.shape)*self.real_std self.Y = np.sin(self.X*2*np.pi) + noise self.f = np.random.rand(self.N, 1) - self.gauss = GPy.likelihoods.gaussian(variance=self.var, D=self.D, N=self.N) + self.gauss = GPy.likelihoods.Gaussian(variance=self.var) dlogpdf_df = functools.partial(self.gauss.dlogpdf_df, y=self.Y) d2logpdf_df2 = functools.partial(self.gauss.d2logpdf_df2, y=self.Y) @@ -605,23 +633,27 @@ class LaplaceTests(unittest.TestCase): #Yc = Y.copy() #Yc[75:80] += 1 kernel1 = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1]) - kernel2 = kernel1.copy() + #FIXME: Make sure you can copy kernels when params is fixed + #kernel2 = kernel1.copy() + kernel2 = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1]) - m1 = GPy.models.GPRegression(X, Y.copy(), kernel=kernel1) - m1.constrain_fixed('white', 1e-6) - m1['noise'] = initial_var_guess - m1.constrain_bounded('noise', 1e-4, 10) - m1.constrain_bounded('rbf', 1e-4, 10) + gauss_distr1 = GPy.likelihoods.Gaussian(variance=initial_var_guess) + exact_inf = GPy.inference.latent_function_inference.ExactGaussianInference() + m1 = GPy.core.GP(X, Y.copy(), kernel=kernel1, likelihood=gauss_distr1, inference_method=exact_inf) + m1['white'].constrain_fixed(1e-6) + m1['variance'] = initial_var_guess + m1['variance'].constrain_bounded(1e-4, 10) + m1['rbf'].constrain_bounded(1e-4, 10) m1.ensure_default_constraints() m1.randomize() - gauss_distr = GPy.likelihoods.gaussian(variance=initial_var_guess, D=1, N=Y.shape[0]) - laplace_likelihood = GPy.likelihoods.Laplace(Y.copy(), gauss_distr) - m2 = GPy.models.GPRegression(X, Y.copy(), kernel=kernel2, likelihood=laplace_likelihood) + gauss_distr2 = GPy.likelihoods.Gaussian(variance=initial_var_guess) + laplace_inf = GPy.inference.latent_function_inference.LaplaceInference() + m2 = GPy.core.GP(X, Y.copy(), kernel=kernel2, likelihood=gauss_distr2, inference_method=laplace_inf) m2.ensure_default_constraints() - m2.constrain_fixed('white', 1e-6) - m2.constrain_bounded('rbf', 1e-4, 10) - m2.constrain_bounded('noise', 1e-4, 10) + m2['white'].constrain_fixed(1e-6) + m2['rbf'].constrain_bounded(1e-4, 10) + m2['variance'].constrain_bounded(1e-4, 10) m2.randomize() if debug: @@ -667,7 +699,7 @@ class LaplaceTests(unittest.TestCase): #Check Y's are the same - np.testing.assert_almost_equal(Y, m2.likelihood.Y, decimal=5) + np.testing.assert_almost_equal(m1.Y, m2.Y, decimal=5) #Check marginals are the same np.testing.assert_almost_equal(m1.log_likelihood(), m2.log_likelihood(), decimal=2) #Check marginals are the same with random diff --git a/GPy/testing/psi_stat_gradient_tests.py b/GPy/testing/psi_stat_gradient_tests.py index e373aaa3..56586d3b 100644 --- a/GPy/testing/psi_stat_gradient_tests.py +++ b/GPy/testing/psi_stat_gradient_tests.py @@ -9,42 +9,44 @@ import numpy import GPy import itertools from GPy.core import Model +from 
GPy.core.parameterization.param import Param +from GPy.core.parameterization.transformations import Logexp class PsiStatModel(Model): def __init__(self, which, X, X_variance, Z, num_inducing, kernel): + super(PsiStatModel, self).__init__(name='psi stat test') self.which = which - self.X = X - self.X_variance = X_variance - self.Z = Z + self.X = Param("X", X) + self.X_variance = Param('X_variance', X_variance, Logexp()) + self.Z = Param("Z", Z) self.N, self.input_dim = X.shape self.num_inducing, input_dim = Z.shape assert self.input_dim == input_dim, "shape missmatch: Z:{!s} X:{!s}".format(Z.shape, X.shape) self.kern = kernel - super(PsiStatModel, self).__init__() self.psi_ = self.kern.__getattribute__(self.which)(self.Z, self.X, self.X_variance) - def _get_param_names(self): - Xnames = ["{}_{}_{}".format(what, i, j) for what, i, j in itertools.product(['X', 'X_variance'], range(self.N), range(self.input_dim))] - Znames = ["Z_{}_{}".format(i, j) for i, j in itertools.product(range(self.num_inducing), range(self.input_dim))] - return Xnames + Znames + self.kern._get_param_names() - def _get_params(self): - return numpy.hstack([self.X.flatten(), self.X_variance.flatten(), self.Z.flatten(), self.kern._get_params()]) - def _set_params(self, x, save_old=True, save_count=0): - start, end = 0, self.X.size - self.X = x[start:end].reshape(self.N, self.input_dim) - start, end = end, end + self.X_variance.size - self.X_variance = x[start: end].reshape(self.N, self.input_dim) - start, end = end, end + self.Z.size - self.Z = x[start: end].reshape(self.num_inducing, self.input_dim) - self.kern._set_params(x[end:]) + self.add_parameters(self.X, self.X_variance, self.Z, self.kern) + def log_likelihood(self): return self.kern.__getattribute__(self.which)(self.Z, self.X, self.X_variance).sum() - def _log_likelihood_gradients(self): + + def parameters_changed(self): psimu, psiS = self.kern.__getattribute__("d" + self.which + "_dmuS")(numpy.ones_like(self.psi_), self.Z, self.X, self.X_variance) + self.X.gradient = psimu + self.X_variance.gradient = psiS #psimu, psiS = numpy.ones(self.N * self.input_dim), numpy.ones(self.N * self.input_dim) - psiZ = self.kern.__getattribute__("d" + self.which + "_dZ")(numpy.ones_like(self.psi_), self.Z, self.X, self.X_variance) + try: psiZ = self.kern.__getattribute__("d" + self.which + "_dZ")(numpy.ones_like(self.psi_), self.Z, self.X, self.X_variance) + except AttributeError: psiZ = numpy.zeros_like(self.Z) + self.Z.gradient = psiZ #psiZ = numpy.ones(self.num_inducing * self.input_dim) - thetagrad = self.kern.__getattribute__("d" + self.which + "_dtheta")(numpy.ones_like(self.psi_), self.Z, self.X, self.X_variance).flatten() - return numpy.hstack((psimu.flatten(), psiS.flatten(), psiZ.flatten(), thetagrad)) + N,M = self.X.shape[0], self.Z.shape[0] + dL_dpsi0, dL_dpsi1, dL_dpsi2 = numpy.zeros([N]), numpy.zeros([N,M]), numpy.zeros([N,M,M]) + if self.which == 'psi0': dL_dpsi0 += 1 + if self.which == 'psi1': dL_dpsi1 += 1 + if self.which == 'psi2': dL_dpsi2 += 1 + self.kern.update_gradients_variational(numpy.zeros([1,1]), + dL_dpsi0, + dL_dpsi1, + dL_dpsi2, self.X, self.X_variance, self.Z) class DPsiStatTest(unittest.TestCase): input_dim = 5 @@ -57,61 +59,66 @@ class DPsiStatTest(unittest.TestCase): Y = X.dot(numpy.random.randn(input_dim, input_dim)) # kernels = [GPy.kern.linear(input_dim, ARD=True, variances=numpy.random.rand(input_dim)), GPy.kern.rbf(input_dim, ARD=True), GPy.kern.bias(input_dim)] - kernels = [GPy.kern.linear(input_dim), GPy.kern.rbf(input_dim), 
GPy.kern.bias(input_dim), - GPy.kern.linear(input_dim) + GPy.kern.bias(input_dim), - GPy.kern.rbf(input_dim) + GPy.kern.bias(input_dim)] + kernels = [ + GPy.kern.linear(input_dim), + GPy.kern.rbf(input_dim), + #GPy.kern.bias(input_dim), + #GPy.kern.linear(input_dim) + GPy.kern.bias(input_dim), + #GPy.kern.rbf(input_dim) + GPy.kern.bias(input_dim) + ] def testPsi0(self): for k in self.kernels: m = PsiStatModel('psi0', X=self.X, X_variance=self.X_var, Z=self.Z,\ num_inducing=self.num_inducing, kernel=k) - m.ensure_default_constraints() + #m.ensure_default_constraints(warning=0) m.randomize() - assert m.checkgrad(), "{} x psi0".format("+".join(map(lambda x: x.name, k.parts))) + import ipdb;ipdb.set_trace() + assert m.checkgrad(), "{} x psi0".format("+".join(map(lambda x: x.name, k._parameters_))) def testPsi1(self): for k in self.kernels: m = PsiStatModel('psi1', X=self.X, X_variance=self.X_var, Z=self.Z, num_inducing=self.num_inducing, kernel=k) - m.ensure_default_constraints() + m.ensure_default_constraints(warning=0) m.randomize() - assert m.checkgrad(), "{} x psi1".format("+".join(map(lambda x: x.name, k.parts))) + assert m.checkgrad(), "{} x psi1".format("+".join(map(lambda x: x.name, k._parameters_))) def testPsi2_lin(self): k = self.kernels[0] m = PsiStatModel('psi2', X=self.X, X_variance=self.X_var, Z=self.Z, num_inducing=self.num_inducing, kernel=k) - m.ensure_default_constraints() + m.ensure_default_constraints(warning=0) m.randomize() - assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k.parts))) + assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k._parameters_))) def testPsi2_lin_bia(self): k = self.kernels[3] m = PsiStatModel('psi2', X=self.X, X_variance=self.X_var, Z=self.Z, num_inducing=self.num_inducing, kernel=k) - m.ensure_default_constraints() + m.ensure_default_constraints(warning=0) m.randomize() - assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k.parts))) + assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k._parameters_))) def testPsi2_rbf(self): k = self.kernels[1] m = PsiStatModel('psi2', X=self.X, X_variance=self.X_var, Z=self.Z, num_inducing=self.num_inducing, kernel=k) - m.ensure_default_constraints() + m.ensure_default_constraints(warning=0) m.randomize() - assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k.parts))) + assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k._parameters_))) def testPsi2_rbf_bia(self): k = self.kernels[-1] m = PsiStatModel('psi2', X=self.X, X_variance=self.X_var, Z=self.Z, num_inducing=self.num_inducing, kernel=k) - m.ensure_default_constraints() + m.ensure_default_constraints(warning=0) m.randomize() - assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k.parts))) + assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k._parameters_))) def testPsi2_bia(self): k = self.kernels[2] m = PsiStatModel('psi2', X=self.X, X_variance=self.X_var, Z=self.Z, num_inducing=self.num_inducing, kernel=k) - m.ensure_default_constraints() + m.ensure_default_constraints(warning=0) m.randomize() - assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k.parts))) + assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k._parameters_))) if __name__ == "__main__": diff --git a/GPy/util/caching.py b/GPy/util/caching.py index d8893021..374f9600 100644 --- a/GPy/util/caching.py +++ b/GPy/util/caching.py @@ -1,10 +1,9 @@ -import numpy as np -from 
..core.parameterization.array_core import ObservableArray +from ..core.parameterization.array_core import ObservableArray, ParamList class Cacher(object): def __init__(self, operation, limit=5): self.limit = int(limit) self.operation=operation - self.cached_inputs = [] + self.cached_inputs = ParamList([]) self.cached_outputs = [] self.inputs_changed = []
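A closing note on this caching change: cached_inputs previously held raw arrays in a plain list, and list membership falls back to ==, which is elementwise for ndarrays, so an "is this input already cached?" test can raise instead of returning a bool. Presumably ParamList is used so that membership is decided by object identity instead. A standalone sketch of the failure mode and the identity-based fix, with IdentityList as an illustrative stand-in rather than GPy's class:

import numpy as np

class IdentityList(list):
    # Membership by object identity, so ndarrays can serve as cache keys.
    def __contains__(self, other):
        return any(el is other for el in self)

a = np.ones(3)
b = np.ones(3)  # equal values, different object

try:
    b in [a]  # plain list compares with ==; ndarray.__eq__ is elementwise, so bool() raises
    ambiguous = False
except ValueError:
    ambiguous = True
assert ambiguous

cache = IdentityList([a])
assert a in cache       # same object: cache hit
assert b not in cache   # equal but distinct object: identity, not value, decides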