diff --git a/GPy/core/__init__.py b/GPy/core/__init__.py index a42d76ed..25651827 100644 --- a/GPy/core/__init__.py +++ b/GPy/core/__init__.py @@ -4,6 +4,7 @@ from model import * from parameterization.parameterized import adjust_name_for_printing, Parameterizable from parameterization.param import Param, ParamConcatenation +from parameterization.observable_array import ObsAr from gp import GP from sparse_gp import SparseGP diff --git a/GPy/core/gp.py b/GPy/core/gp.py index 5be3e944..490bcc72 100644 --- a/GPy/core/gp.py +++ b/GPy/core/gp.py @@ -208,27 +208,9 @@ class GP(Model): from ..plotting.matplot_dep import models_plots return models_plots.plot_fit(self,*args,**kwargs) - def _getstate(self): + def input_sensitivity(self): """ - - Get the current state of the class, here we return everything that is - needed to recompute the model. - + Returns the sensitivity for each dimension of this model """ + return self.kern.input_sensitivity() - return Model._getstate(self) + [self.X, - self.num_data, - self.input_dim, - self.kern, - self.likelihood, - self.output_dim, - ] - - def _setstate(self, state): - self.output_dim = state.pop() - self.likelihood = state.pop() - self.kern = state.pop() - self.input_dim = state.pop() - self.num_data = state.pop() - self.X = state.pop() - Model._setstate(self, state) diff --git a/GPy/core/mapping.py b/GPy/core/mapping.py index efd9476f..6eaaf96c 100644 --- a/GPy/core/mapping.py +++ b/GPy/core/mapping.py @@ -34,7 +34,7 @@ class Mapping(Parameterized): raise NotImplementedError def df_dtheta(self, dL_df, X): - """The gradient of the outputs of the multi-layer perceptron with respect to each of the parameters. + """The gradient of the outputs of the mapping with respect to each of the parameters. :param dL_df: gradient of the objective with respect to the function. :type dL_df: ndarray (num_data x output_dim) @@ -50,7 +50,7 @@ class Mapping(Parameterized): """ Plots the mapping associated with the model. - In one dimension, the function is plotted. - - In two dimsensions, a contour-plot shows the function + - In two dimensions, a contour-plot shows the function - In higher dimensions, we've not implemented this yet !TODO! Can plot only part of the data and part of the posterior functions diff --git a/GPy/core/model.py b/GPy/core/model.py index 1f53885c..38e8d4cf 100644 --- a/GPy/core/model.py +++ b/GPy/core/model.py @@ -24,37 +24,9 @@ class Model(Parameterized): def log_likelihood(self): raise NotImplementedError, "this needs to be implemented to use the model class" - def _log_likelihood_gradients(self): return self.gradient - def _getstate(self): - """ - Get the current state of the class. - Inherited from Parameterized, so add those parameters to the state - - :return: list of states from the model. - - """ - return Parameterized._getstate(self) + \ - [self.priors, self.optimization_runs, - self.sampling_runs, self.preferred_optimizer] - - def _setstate(self, state): - """ - set state from previous call to _getstate - call Parameterized with the rest of the state - - :param state: the state of the model. - :type state: list as returned from _getstate. 
- - """ - self.preferred_optimizer = state.pop() - self.sampling_runs = state.pop() - self.optimization_runs = state.pop() - self.priors = state.pop() - Parameterized._setstate(self, state) - def optimize_restarts(self, num_restarts=10, robust=False, verbose=True, parallel=False, num_processes=None, **kwargs): """ Perform random restarts of the model, and set the model to the best @@ -142,13 +114,60 @@ class Model(Parameterized): """ raise DeprecationWarning, 'parameters now have default constraints' - def input_sensitivity(self): + def objective_function(self): """ - Returns the sensitivity for each dimension of this kernel. - """ - return self.kern.input_sensitivity() + The objective function for the given algorithm. - def objective_function(self, x): + This function is the true objective, which wants to be minimized. + Note that all parameters are already set and in place, so you just need + to return the objective function here. + + For probabilistic models this is the negative log_likelihood + (including the MAP prior), so we return it here. If your model is not + probabilistic, just return your objective here! + """ + return -float(self.log_likelihood()) - self.log_prior() + + def objective_function_gradients(self): + """ + The gradients for the objective function for the given algorithm. + + You can find the gradient for the parameters in self.gradient at all times. + This is the place, where gradients get stored for parameters. + + This function is the true objective, which wants to be minimized. + Note that all parameters are already set and in place, so you just need + to return the gradient here. + + For probabilistic models this is the gradient of the negative log_likelihood + (including the MAP prior), so we return it here. If your model is not + probabilistic, just return your gradient here! + """ + return -(self._log_likelihood_gradients() + self._log_prior_gradients()) + + def _grads(self, x): + """ + Gets the gradients from the likelihood and the priors. + + Failures are handled robustly. The algorithm will try several times to + return the gradients, and will raise the original exception if + the objective cannot be computed. + + :param x: the parameters of the model. + :type x: np.array + """ + try: + self._set_params_transformed(x) + obj_grads = self._transform_gradients(self.objective_function_gradients()) + self._fail_count = 0 + except (LinAlgError, ZeroDivisionError, ValueError): + if self._fail_count >= self._allowed_failures: + raise + self._fail_count += 1 + obj_grads = np.clip(self._transform_gradients(self.objective_function_gradients()), -1e100, 1e100) + return obj_grads + + def _objective(self, x): """ The objective function passed to the optimizer. It combines the likelihood and the priors. @@ -162,55 +181,26 @@ class Model(Parameterized): """ try: self._set_params_transformed(x) + obj = self.objective_function() self._fail_count = 0 - except (LinAlgError, ZeroDivisionError, ValueError) as e: + except (LinAlgError, ZeroDivisionError, ValueError): if self._fail_count >= self._allowed_failures: - raise e + raise self._fail_count += 1 return np.inf - return -float(self.log_likelihood()) - self.log_prior() + return obj - def objective_function_gradients(self, x): - """ - Gets the gradients from the likelihood and the priors. - - Failures are handled robustly. The algorithm will try several times to - return the gradients, and will raise the original exception if - the objective cannot be computed. - - :param x: the parameters of the model. 
- :type x: np.array - """ + def _objective_grads(self, x): try: self._set_params_transformed(x) - obj_grads = -self._transform_gradients(self._log_likelihood_gradients() + self._log_prior_gradients()) + obj_f, obj_grads = self.objective_function(), self._transform_gradients(self.objective_function_gradients()) self._fail_count = 0 - except (LinAlgError, ZeroDivisionError, ValueError) as e: + except (LinAlgError, ZeroDivisionError, ValueError): if self._fail_count >= self._allowed_failures: - raise e - self._fail_count += 1 - obj_grads = np.clip(-self._transform_gradients(self._log_likelihood_gradients() + self._log_prior_gradients()), -1e100, 1e100) - return obj_grads - - def objective_and_gradients(self, x): - """ - Compute the objective function of the model and the gradient of the model at the point given by x. - - :param x: the point at which gradients are to be computed. - :type x: np.array - """ - - try: - self._set_params_transformed(x) - obj_f = -float(self.log_likelihood()) - self.log_prior() - obj_grads = -self._transform_gradients(self._log_likelihood_gradients() + self._log_prior_gradients()) - self._fail_count = 0 - except (LinAlgError, ZeroDivisionError, ValueError) as e: - if self._fail_count >= self._allowed_failures: - raise e + raise self._fail_count += 1 obj_f = np.inf - obj_grads = np.clip(-self._transform_gradients(self._log_likelihood_gradients() + self._log_prior_gradients()), -1e100, 1e100) + obj_grads = np.clip(self._transform_gradients(self.objective_function_gradients()), -1e100, 1e100) return obj_f, obj_grads def optimize(self, optimizer=None, start=None, **kwargs): @@ -241,7 +231,7 @@ class Model(Parameterized): optimizer = optimization.get_optimizer(optimizer) opt = optimizer(start, model=self, **kwargs) - opt.run(f_fp=self.objective_and_gradients, f=self.objective_function, fp=self.objective_function_gradients) + opt.run(f_fp=self._objective_grads, f=self._objective, fp=self._grads) self.optimization_runs.append(opt) @@ -289,19 +279,22 @@ class Model(Parameterized): # just check the global ratio dx = np.zeros(x.shape) - dx[transformed_index] = step * np.sign(np.random.uniform(-1, 1, transformed_index.size)) + dx[transformed_index] = step * (np.sign(np.random.uniform(-1, 1, transformed_index.size)) if transformed_index.size != 2 else 1.) # evaulate around the point x - f1 = self.objective_function(x + dx) - f2 = self.objective_function(x - dx) - gradient = self.objective_function_gradients(x) + f1 = self._objective(x + dx) + f2 = self._objective(x - dx) + gradient = self._grads(x) dx = dx[transformed_index] gradient = gradient[transformed_index] denominator = (2 * np.dot(dx, gradient)) global_ratio = (f1 - f2) / np.where(denominator==0., 1e-32, denominator) - return np.abs(1. - global_ratio) < tolerance or np.abs(f1-f2).sum() + np.abs((2 * np.dot(dx, gradient))).sum() < tolerance + global_diff = np.abs(f1 - f2) < tolerance and np.allclose(gradient, 0, atol=tolerance) + if global_ratio is np.nan: + global_ratio = 0 + return np.abs(1. 
- global_ratio) < tolerance or global_diff else: # check the gradient of each parameter individually, and do some pretty printing try: @@ -337,15 +330,15 @@ class Model(Parameterized): print "No free parameters to check" return - gradient = self.objective_function_gradients(x).copy() + gradient = self._grads(x).copy() np.where(gradient == 0, 1e-312, gradient) ret = True for nind, xind in itertools.izip(param_index, transformed_index): xx = x.copy() xx[xind] += step - f1 = self.objective_function(xx) + f1 = self._objective(xx) xx[xind] -= 2.*step - f2 = self.objective_function(xx) + f2 = self._objective(xx) numerical_gradient = (f1 - f2) / (2 * step) if np.all(gradient[xind]==0): ratio = (f1-f2) == gradient[xind] else: ratio = (f1 - f2) / (2 * step * gradient[xind]) diff --git a/GPy/core/parameterization/index_operations.py b/GPy/core/parameterization/index_operations.py index c22d8b6b..12b3a298 100644 --- a/GPy/core/parameterization/index_operations.py +++ b/GPy/core/parameterization/index_operations.py @@ -24,12 +24,6 @@ class ParameterIndexOperations(object): for t, i in constraints.iteritems(): self.add(t, i) - def __getstate__(self): - return self._properties - - def __setstate__(self, state): - self._properties = state - def iteritems(self): return self._properties.iteritems() @@ -92,13 +86,18 @@ class ParameterIndexOperations(object): for i, v in parameter_index_view.iteritems(): self.add(i, v+offset) - def copy(self): + return self.__deepcopy__(None) + + def __deepcopy__(self, memo): return ParameterIndexOperations(dict(self.iteritems())) def __getitem__(self, prop): return self._properties[prop] + def __delitem__(self, prop): + del self._properties[prop] + def __str__(self, *args, **kwargs): import pprint return pprint.pformat(dict(self._properties)) @@ -183,7 +182,7 @@ class ParameterIndexOperationsView(object): def remove(self, prop, indices): - removed = self._param_index_ops.remove(prop, indices+self._offset) + removed = self._param_index_ops.remove(prop, numpy.array(indices)+self._offset) if removed.size > 0: return removed - self._size + 1 return removed @@ -193,6 +192,9 @@ class ParameterIndexOperationsView(object): ind = self._filter_index(self._param_index_ops[prop]) return ind + def __delitem__(self, prop): + self.remove(prop, self[prop]) + def __str__(self, *args, **kwargs): import pprint return pprint.pformat(dict(self.iteritems())) @@ -203,6 +205,9 @@ class ParameterIndexOperationsView(object): def copy(self): + return self.__deepcopy__(None) + + def __deepcopy__(self, memo): return ParameterIndexOperations(dict(self.iteritems())) pass diff --git a/GPy/core/parameterization/lists_and_dicts.py b/GPy/core/parameterization/lists_and_dicts.py index ca0589c9..604d0a01 100644 --- a/GPy/core/parameterization/lists_and_dicts.py +++ b/GPy/core/parameterization/lists_and_dicts.py @@ -5,6 +5,7 @@ Created on 27 Feb 2014 ''' from collections import defaultdict +import weakref def intarray_default_factory(): import numpy as np @@ -28,4 +29,90 @@ class ArrayList(list): return True return False + def index(self, item): + index = 0 + for el in self: + if el is item: + return index + index += 1 + raise ValueError, "{} is not in list".format(item) + pass + +class ObservablesList(object): + def __init__(self): + self._poc = [] + + def __getitem__(self, ind): + p,o,c = self._poc[ind] + return p, o(), c + + def remove(self, priority, observable, callble): + """ + """ + self.flush() + for i in range(len(self) - 1, -1, -1): + p,o,c = self[i] + if priority==p and observable==o and callble==c: 
+ del self._poc[i] + + def __repr__(self): + return self._poc.__repr__() + + def add(self, priority, observable, callble): + ins = 0 + for pr, _, _ in self: + if priority > pr: + break + ins += 1 + self._poc.insert(ins, (priority, weakref.ref(observable), callble)) + + def __str__(self): + ret = [] + curr_p = None + for p, o, c in self: + curr = '' + if curr_p != p: + pre = "{!s}: ".format(p) + curr_pre = pre + else: curr_pre = " "*len(pre) + curr_p = p + curr += curr_pre + ret.append(curr + ", ".join(map(repr, [o,c]))) + return '\n'.join(ret) + + def flush(self): + self._poc = [(p,o,c) for p,o,c in self._poc if o() is not None] + + def __iter__(self): + self.flush() + for p, o, c in self._poc: + if o() is not None: + yield p, o(), c + + def __len__(self): + self.flush() + return self._poc.__len__() + + def __deepcopy__(self, memo): + self.flush() + s = ObservablesList() + import copy + s._poc = copy.deepcopy(self._poc, memo) + return s + + def __getstate__(self): + self.flush() + from ...util.caching import Cacher + obs = [] + for p, o, c in self: + if (getattr(o, c.__name__, None) is not None + and not isinstance(o, Cacher)): + obs.append((p,o,c.__name__)) + return obs + + def __setstate__(self, state): + self._poc = [] + for p, o, c in state: + self.add(p,o,getattr(o, c)) + pass diff --git a/GPy/core/parameterization/array_core.py b/GPy/core/parameterization/observable_array.py similarity index 52% rename from GPy/core/parameterization/array_core.py rename to GPy/core/parameterization/observable_array.py index 780367c8..56d33bfc 100644 --- a/GPy/core/parameterization/array_core.py +++ b/GPy/core/parameterization/observable_array.py @@ -1,12 +1,12 @@ # Copyright (c) 2012, GPy authors (see AUTHORS.txt). # Licensed under the BSD 3-clause license (see LICENSE.txt) -__updated__ = '2014-03-21' +__updated__ = '2014-03-31' import numpy as np -from parameter_core import Observable +from parameter_core import Observable, Pickleable -class ObsAr(np.ndarray, Observable): +class ObsAr(np.ndarray, Pickleable, Observable): """ An ndarray which reports changes to its observers. The observers can add themselves with a callable, which @@ -25,37 +25,34 @@ class ObsAr(np.ndarray, Observable): def __array_finalize__(self, obj): # see InfoArray.__array_finalize__ for comments if obj is None: return - self._observer_callables_ = getattr(obj, '_observer_callables_', None) + self.observers = getattr(obj, 'observers', None) def __array_wrap__(self, out_arr, context=None): return out_arr.view(np.ndarray) + def copy(self): + memo = {} + memo[id(self)] = self + return self.__deepcopy__(memo) + + def __deepcopy__(self, memo): + s = self.__new__(self.__class__, input_array=self.view(np.ndarray).copy()) + memo[id(self)] = s + import copy + s.__dict__.update(copy.deepcopy(self.__dict__, memo)) + return s + def __reduce__(self): - func, args, state = np.ndarray.__reduce__(self) - return func, args, (state, Observable._getstate(self)) + func, args, state = super(ObsAr, self).__reduce__() + return func, args, (state, Pickleable.__getstate__(self)) def __setstate__(self, state): np.ndarray.__setstate__(self, state[0]) - Observable._setstate(self, state[1]) - - def _s_not_empty(self, s): - # this checks whether there is something picked by this slice. 
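# A minimal sketch of the weakref-based observer storage that ObservablesList
# implements above: observers are held through weak references and dead
# entries are flushed, so being registered does not keep an observer alive.
# Subject/Observer/on_change are illustrative names, not GPy API.
import weakref

class Subject(object):
    def __init__(self):
        self._poc = []  # (priority, weakref(observer), callable) triples

    def add(self, priority, observer, callble):
        ins = 0
        for pr, _, _ in self._poc:  # keep sorted by descending priority
            if priority > pr:
                break
            ins += 1
        self._poc.insert(ins, (priority, weakref.ref(observer), callble))

    def flush(self):
        # drop entries whose observer has been garbage collected
        self._poc = [(p, o, c) for p, o, c in self._poc if o() is not None]

    def notify(self):
        self.flush()
        for _, o, c in self._poc:
            c(o(), which=self)

class Observer(object):
    pass

def on_change(observer, which):
    print("%r changed" % (which,))

o, s = Observer(), Subject()
s.add(0, o, on_change)
s.notify()          # fires while the observer is alive
del o               # CPython frees it immediately (refcounting)
s.flush()
print(len(s._poc))  # 0 -- the dead entry was flushed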
- return True - # TODO: disarmed, for performance increase, - if not isinstance(s, (list,tuple,np.ndarray)): - return True - if isinstance(s, (list,tuple)): - return len(s)!=0 - if isinstance(s, np.ndarray): - if s.dtype is bool: - return np.all(s) - else: - return s.size != 0 + Pickleable.__setstate__(self, state[1]) def __setitem__(self, s, val): - if self._s_not_empty(s): - super(ObsAr, self).__setitem__(s, val) - self.notify_observers() + super(ObsAr, self).__setitem__(s, val) + self.notify_observers() def __getslice__(self, start, stop): return self.__getitem__(slice(start, stop)) @@ -63,12 +60,6 @@ class ObsAr(np.ndarray, Observable): def __setslice__(self, start, stop, val): return self.__setitem__(slice(start, stop), val) - def __copy__(self, *args): - return ObsAr(self.view(np.ndarray).copy()) - - def copy(self, *args): - return self.__copy__(*args) - def __ilshift__(self, *args, **kwargs): r = np.ndarray.__ilshift__(self, *args, **kwargs) self.notify_observers() @@ -143,77 +134,4 @@ class ObsAr(np.ndarray, Observable): def __imul__(self, *args, **kwargs): r = np.ndarray.__imul__(self, *args, **kwargs) self.notify_observers() - return r - - -# def __rrshift__(self, *args, **kwargs): -# r = np.ndarray.__rrshift__(self, *args, **kwargs) -# self.notify_observers() -# return r - - -# def __ror__(self, *args, **kwargs): -# r = np.ndarray.__ror__(self, *args, **kwargs) -# self.notify_observers() -# return r - - -# def __rxor__(self, *args, **kwargs): -# r = np.ndarray.__rxor__(self, *args, **kwargs) -# self.notify_observers() -# return r - - - -# def __rdivmod__(self, *args, **kwargs): -# r = np.ndarray.__rdivmod__(self, *args, **kwargs) -# self.notify_observers() -# return r - - -# def __radd__(self, *args, **kwargs): -# r = np.ndarray.__radd__(self, *args, **kwargs) -# self.notify_observers() -# return r - - -# def __rdiv__(self, *args, **kwargs): -# r = np.ndarray.__rdiv__(self, *args, **kwargs) -# self.notify_observers() -# return r - - -# def __rtruediv__(self, *args, **kwargs): -# r = np.ndarray.__rtruediv__(self, *args, **kwargs) -# self.notify_observers() -# return r - - -# def __rshift__(self, *args, **kwargs): -# r = np.ndarray.__rshift__(self, *args, **kwargs) -# self.notify_observers() -# return r - - -# def __rmul__(self, *args, **kwargs): -# r = np.ndarray.__rmul__(self, *args, **kwargs) -# self.notify_observers() -# return r - - -# def __rpow__(self, *args, **kwargs): -# r = np.ndarray.__rpow__(self, *args, **kwargs) -# self.notify_observers() -# return r - - -# def __rsub__(self, *args, **kwargs): -# r = np.ndarray.__rsub__(self, *args, **kwargs) -# self.notify_observers() -# return r - -# def __rfloordiv__(self, *args, **kwargs): -# r = np.ndarray.__rfloordiv__(self, *args, **kwargs) -# self.notify_observers() -# return r - + return r \ No newline at end of file diff --git a/GPy/core/parameterization/param.py b/GPy/core/parameterization/param.py index 984fc950..9c3d7bd3 100644 --- a/GPy/core/parameterization/param.py +++ b/GPy/core/parameterization/param.py @@ -4,7 +4,7 @@ import itertools import numpy from parameter_core import OptimizationHandlable, adjust_name_for_printing -from array_core import ObsAr +from observable_array import ObsAr ###### printing __constraints_name__ = "Constraint" @@ -43,14 +43,13 @@ class Param(OptimizationHandlable, ObsAr): _fixes_ = None _parameters_ = [] def __new__(cls, name, input_array, default_constraint=None): - obj = numpy.atleast_1d(super(Param, cls).__new__(cls, input_array=input_array, name=name, 
default_constraint=default_constraint)) + obj = numpy.atleast_1d(super(Param, cls).__new__(cls, input_array=input_array)) cls.__name__ = "Param" obj._current_slice_ = (slice(obj.shape[0]),) obj._realshape_ = obj.shape obj._realsize_ = obj.size obj._realndim_ = obj.ndim obj._original_ = True - obj._gradient_array_ = numpy.zeros(obj.shape, dtype=numpy.float64) return obj def __init__(self, name, input_array, default_constraint=None, *a, **kw): @@ -60,7 +59,7 @@ class Param(OptimizationHandlable, ObsAr): import pydot node = pydot.Node(id(self), shape='record', label=self.name) G.add_node(node) - for o in self._observer_callables_.keys(): + for o in self.observers.keys(): label = o.name if hasattr(o, 'name') else str(o) observed_node = pydot.Node(id(o), label=label) G.add_node(observed_node) @@ -87,74 +86,24 @@ class Param(OptimizationHandlable, ObsAr): self.priors = getattr(obj, 'priors', None) @property - def _param_array_(self): + def param_array(self): return self @property def gradient(self): + """ + Return a view on the gradient, which is in the same shape as this parameter is. + Note: this is not the real gradient array, it is just a view on it. + + To work on the real gradient array use: self.full_gradient + """ + if getattr(self, '_gradient_array_', None) is None: + self._gradient_array_ = numpy.empty(self._realshape_, dtype=numpy.float64) return self._gradient_array_[self._current_slice_] @gradient.setter def gradient(self, val): - self.gradient[:] = val - - #=========================================================================== - # Pickling operations - #=========================================================================== - def __reduce__(self): - func, args, state = super(Param, self).__reduce__() - return func, args, (state, - (self._name, - self._parent_, - self._parent_index_, - self._default_constraint_, - self._current_slice_, - self._realshape_, - self._realsize_, - self._realndim_, - self.constraints, - self.priors - ) - ) - - def __setstate__(self, state): - super(Param, self).__setstate__(state[0]) - state = list(state[1]) - self.priors = state.pop() - self.constraints = state.pop() - self._realndim_ = state.pop() - self._realsize_ = state.pop() - self._realshape_ = state.pop() - self._current_slice_ = state.pop() - self._default_constraint_ = state.pop() - self._parent_index_ = state.pop() - self._parent_ = state.pop() - self._name = state.pop() - - def copy(self, *args): - constr = self.constraints.copy() - priors = self.priors.copy() - p = Param(self.name, self.view(numpy.ndarray).copy(), self._default_constraint_) - p.constraints = constr - p.priors = priors - return p - #=========================================================================== - # get/set parameters - #=========================================================================== -# def _set_params(self, param, trigger_parent=True): -# self.flat = param -# if trigger_parent: min_priority = None -# else: min_priority = -numpy.inf -# self.notify_observers(None, min_priority) -# -# def _get_params(self): -# return self.flat -# -# def _collect_gradient(self, target): -# target += self.gradient.flat -# -# def _set_gradient(self, g): -# self.gradient = g.reshape(self._realshape_) + self._gradient_array_[self._current_slice_] = val #=========================================================================== # Array operations -> done @@ -172,24 +121,6 @@ class Param(OptimizationHandlable, ObsAr): def __setitem__(self, s, val): super(Param, self).__setitem__(s, val) - 
#=========================================================================== - # Index Operations: - #=========================================================================== - #def _internal_offset(self): - # internal_offset = 0 - # extended_realshape = numpy.cumprod((1,) + self._realshape_[:0:-1])[::-1] - # for i, si in enumerate(self._current_slice_[:self._realndim_]): - # if numpy.all(si == Ellipsis): - # continue - # if isinstance(si, slice): - # a = si.indices(self._realshape_[i])[0] - # elif isinstance(si, (list,numpy.ndarray,tuple)): - # a = si[0] - # else: a = si - # if a < 0: - # a = self._realshape_[i] + a - # internal_offset += a * extended_realshape[i] - # return internal_offset def _raveled_index(self, slice_index=None): # return an index array on the raveled array, which is formed by the current_slice @@ -235,13 +166,21 @@ def is_fixed(self): from transformations import __fixed__ return self.constraints[__fixed__].size == self.size - #def round(self, decimals=0, out=None): - # view = super(Param, self).round(decimals, out).view(Param) - # view.__array_finalize__(self) - # return view - #round.__doc__ = numpy.round.__doc__ + def _get_original(self, param): return self + + #=========================================================================== + # Pickling and copying + #=========================================================================== + def __deepcopy__(self, memo): + s = self.__new__(self.__class__, name=self.name, input_array=self.view(numpy.ndarray).copy()) + memo[id(self)] = s + import copy + s.__dict__.update(copy.deepcopy(self.__dict__, memo)) + return s + + #=========================================================================== # Printing -> done #=========================================================================== @@ -250,7 +189,8 @@ if self.size <= 1: return [str(self.view(numpy.ndarray)[0])] else: return [str(self.shape)] - def parameter_names(self, add_self=False, adjust_for_printing=False): + def parameter_names(self, add_self=False, adjust_for_printing=False, recursive=True): + # this just overwrites the parameterized call to parameter names, in order to maintain OOP if adjust_for_printing: return [adjust_name_for_printing(self.name)] return [self.name] @@ -261,6 +201,9 @@ def parameter_shapes(self): return [self.shape] @property + def num_params(self): + return 0 + @property def _constraints_str(self): return [' '.join(map(lambda c: str(c[0]) if c[1].size == self._realsize_ else "{" + str(c[0]) + "}", self.constraints.iteritems()))] @property @@ -282,8 +225,8 @@ if isinstance(slice_index, (tuple, list)): clean_curr_slice = [s for s in slice_index if numpy.any(s != Ellipsis)] for i in range(self._realndim_-len(clean_curr_slice)): - i+=len(clean_curr_slice) - clean_curr_slice += range(self._realshape_[i]) + i+=1 + clean_curr_slice += [range(self._realshape_[i])] if (all(isinstance(n, (numpy.ndarray, list, tuple)) for n in clean_curr_slice) and len(set(map(len, clean_curr_slice))) <= 1): return numpy.fromiter(itertools.izip(*clean_curr_slice), @@ -368,7 +311,7 @@ class ParamConcatenation(object): #=========================================================================== def __getitem__(self, s): ind = numpy.zeros(sum(self._param_sizes), dtype=bool); ind[s] = True; - params = [p._param_array_[ind[ps]] for p,ps in zip(self.params, self._param_slices_) if
numpy.any(p._param_array_[ind[ps]])] + params = [p.param_array[ind[ps]] for p,ps in zip(self.params, self._param_slices_) if numpy.any(p.param_array[ind[ps]])] if len(params)==1: return params[0] return ParamConcatenation(params) def __setitem__(self, s, val, update=True): @@ -381,7 +324,7 @@ class ParamConcatenation(object): if update: self.update_all_params() def values(self): - return numpy.hstack([p._param_array_ for p in self.params]) + return numpy.hstack([p.param_array.flat for p in self.params]) #=========================================================================== # parameter operations: #=========================================================================== diff --git a/GPy/core/parameterization/parameter_core.py b/GPy/core/parameterization/parameter_core.py index 1cdeee0b..43bc7177 100644 --- a/GPy/core/parameterization/parameter_core.py +++ b/GPy/core/parameterization/parameter_core.py @@ -13,10 +13,10 @@ Observable Pattern for patameterization """ -from transformations import Transformation, Logexp, NegativeLogexp, Logistic, __fixed__, FIXED, UNFIXED +from transformations import Logexp, NegativeLogexp, Logistic, __fixed__, FIXED, UNFIXED import numpy as np -__updated__ = '2014-03-24' +__updated__ = '2014-03-31' class HierarchyError(Exception): """ @@ -31,71 +31,8 @@ def adjust_name_for_printing(name): return name.replace(" ", "_").replace(".", "_").replace("-", "_m_").replace("+", "_p_").replace("!", "_I_").replace("**", "_xx_").replace("*", "_x_").replace("/", "_l_").replace("@", '_at_') return '' -class InterfacePickleFunctions(object): - def __init__(self, *a, **kw): - super(InterfacePickleFunctions, self).__init__() - def _getstate(self): - """ - Returns the state of this class in a memento pattern. - The state must be a list-like structure of all the fields - this class needs to run. - - See python doc "pickling" (`__getstate__` and `__setstate__`) for details. - """ - raise NotImplementedError, "To be able to use pickling you need to implement this method" - def _setstate(self, state): - """ - Set the state (memento pattern) of this class to the given state. - Usually this is just the counterpart to _getstate, such that - an object is a copy of another when calling - - copy = .__new__(*args,**kw)._setstate(._getstate()) - - See python doc "pickling" (`__getstate__` and `__setstate__`) for details. - """ - raise NotImplementedError, "To be able to use pickling you need to implement this method" - -class Pickleable(InterfacePickleFunctions): - """ - Make an object pickleable (See python doc 'pickling'). - - This class allows for pickling support by Memento pattern. - _getstate returns a memento of the class, which gets pickled. - _setstate() (re-)sets the state of the class to the memento - """ - def __init__(self, *a, **kw): - super(Pickleable, self).__init__() - #=========================================================================== - # Pickling operations - #=========================================================================== - def pickle(self, f, protocol=-1): - """ - :param f: either filename or open file object to write to. - if it is an open buffer, you have to make sure to close - it properly. - :param protocol: pickling protocol to use, python-pickle for details. 
- """ - import cPickle - if isinstance(f, str): - with open(f, 'w') as f: - cPickle.dump(self, f, protocol) - else: - cPickle.dump(self, f, protocol) - def __getstate__(self): - if self._has_get_set_state(): - return self._getstate() - return self.__dict__ - def __setstate__(self, state): - if self._has_get_set_state(): - self._setstate(state) - # TODO: maybe parameters_changed() here? - return - self.__dict__ = state - def _has_get_set_state(self): - return '_getstate' in vars(self.__class__) and '_setstate' in vars(self.__class__) - -class Observable(Pickleable): +class Observable(object): """ Observable pattern for parameterization. @@ -105,23 +42,25 @@ class Observable(Pickleable): """ _updated = True def __init__(self, *args, **kwargs): - super(Observable, self).__init__(*args, **kwargs) - self._observer_callables_ = [] + super(Observable, self).__init__() + from lists_and_dicts import ObservablesList + self.observers = ObservablesList() def add_observer(self, observer, callble, priority=0): - self._insert_sorted(priority, observer, callble) + self.observers.add(priority, observer, callble) def remove_observer(self, observer, callble=None): to_remove = [] - for p, obs, clble in self._observer_callables_: + for poc in self.observers: + _, obs, clble = poc if callble is not None: if (obs == observer) and (callble == clble): - to_remove.append((p, obs, clble)) + to_remove.append(poc) else: if obs is observer: - to_remove.append((p, obs, clble)) + to_remove.append(poc) for r in to_remove: - self._observer_callables_.remove(r) + self.observers.remove(*r) def notify_observers(self, which=None, min_priority=None): """ @@ -136,32 +75,18 @@ class Observable(Pickleable): if which is None: which = self if min_priority is None: - [callble(self, which=which) for _, _, callble in self._observer_callables_] + [callble(self, which=which) for _, _, callble in self.observers] else: - for p, _, callble in self._observer_callables_: + for p, _, callble in self.observers: if p <= min_priority: break callble(self, which=which) - def _insert_sorted(self, p, o, c): - ins = 0 - for pr, _, _ in self._observer_callables_: - if p > pr: - break - ins += 1 - self._observer_callables_.insert(ins, (p, o, c)) - - def _getstate(self): - return [self._observer_callables_] - - def _setstate(self, state): - self._observer_callables_ = state.pop() - #=============================================================================== # Foundation framework for parameterized and param objects: #=============================================================================== -class Parentable(Observable): +class Parentable(object): """ Enable an Object to have a parent. @@ -171,7 +96,7 @@ class Parentable(Observable): _parent_ = None _parent_index_ = None def __init__(self, *args, **kwargs): - super(Parentable, self).__init__(*args, **kwargs) + super(Parentable, self).__init__() def has_parent(self): """ @@ -207,7 +132,84 @@ class Parentable(Observable): """ pass -class Gradcheckable(Parentable): +class Pickleable(object): + """ + Make an object pickleable (See python doc 'pickling'). + + This class allows for pickling support by Memento pattern. + _getstate returns a memento of the class, which gets pickled. 
+ _setstate() (re-)sets the state of the class to the memento + """ + def __init__(self, *a, **kw): + super(Pickleable, self).__init__() + #=========================================================================== + # Pickling operations + #=========================================================================== + def pickle(self, f, protocol=-1): + """ + :param f: either filename or open file object to write to. + if it is an open buffer, you have to make sure to close + it properly. + :param protocol: pickling protocol to use, python-pickle for details. + """ + import cPickle as pickle + import pickle #TODO: cPickle + if isinstance(f, str): + with open(f, 'w') as f: + pickle.dump(self, f, protocol) + else: + pickle.dump(self, f, protocol) + + #=========================================================================== + # copy and pickling + #=========================================================================== + def copy(self): + """Returns a (deep) copy of the current model""" + #raise NotImplementedError, "Copy is not yet implemented, TODO: Observable hierarchy" + import copy + memo = {} + memo[id(self._parent_)] = None + memo[id(self.gradient)] = None + memo[id(self.param_array)] = None + memo[id(self._fixes_)] = None + c = copy.deepcopy(self, memo) + c._parent_index_ = None + return c + + def __deepcopy__(self, memo): + s = self.__new__(self.__class__) + memo[id(self)] = s + import copy + s.__dict__.update(copy.deepcopy(self.__dict__, memo)) + return s + + def __getstate__(self): + ignore_list = ([#'_parent_', '_parent_index_', + #'observers', + '_param_array_', '_gradient_array_', '_fixes_', + '_Cacher_wrap__cachers'] + #+ self.parameter_names(recursive=False) + ) + dc = dict() + for k,v in self.__dict__.iteritems(): + if k not in ignore_list: + #if hasattr(v, "__getstate__"): + #dc[k] = v.__getstate__() + #else: + dc[k] = v + return dc + + def __setstate__(self, state): + self.__dict__.update(state) + return self + + #def __getstate__(self, memo): + # raise NotImplementedError, "get state must be implemented to be able to pickle objects" + + #def __setstate__(self, memo): + # raise NotImplementedError, "set state must be implemented to be able to pickle objects" + +class Gradcheckable(Pickleable, Parentable): """ Adds the functionality for an object to be gradcheckable. It is just a thin wrapper of a call to the highest parent for now. @@ -312,7 +314,7 @@ class Indexable(object): raise NotImplementedError, "shouldnt happen, raveld index transformation required from non parameterization object?" -class Constrainable(Nameable, Indexable): +class Constrainable(Nameable, Indexable, Observable): """ Make an object constrainable with Priors and Transformations. TODO: Mappings!! @@ -394,6 +396,7 @@ class Constrainable(Nameable, Indexable): self._fixes_[fixed_indices] = FIXED else: self._fixes_ = None + del self.constraints[__fixed__] def _has_fixes(self): return hasattr(self, "_fixes_") and self._fixes_ is not None and self._fixes_.size == self.size @@ -429,14 +432,14 @@ class Constrainable(Nameable, Indexable): def log_prior(self): """evaluate the prior""" if self.priors.size > 0: - x = self._param_array_ + x = self.param_array return reduce(lambda a, b: a + b, (p.lnpdf(x[ind]).sum() for p, ind in self.priors.iteritems()), 0) return 0. 
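# A minimal sketch of the memo trick used by Pickleable.copy() above:
# pre-seeding copy.deepcopy's memo with id(obj) -> replacement prevents those
# objects from being traversed (here the parent link is cut to None), which is
# how the hierarchy avoids copying parents, gradients and fixes.
# Node is an illustrative class, not GPy API.
import copy
import numpy as np

class Node(object):
    def __init__(self, values, parent=None):
        self.values = np.array(values, dtype=float)
        self.parent = parent

    def copy(self):
        memo = {id(self.parent): None}  # do not copy (or keep) the parent link
        return copy.deepcopy(self, memo)

root = Node([0.])
child = Node([1., 2.], parent=root)
c = child.copy()
c.values[0] = 99.
print(child.values[0], c.values[0], c.parent)  # 1.0 99.0 None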
def _log_prior_gradients(self): """evaluate the gradients of the priors""" if self.priors.size > 0: - x = self._param_array_ + x = self.param_array ret = np.zeros(x.size) [np.put(ret, ind, p.lnpdf_grad(x[ind])) for p, ind in self.priors.iteritems()] return ret @@ -455,7 +458,7 @@ class Constrainable(Nameable, Indexable): Constrain the parameter to the given :py:class:`GPy.core.transformations.Transformation`. """ - self._param_array_[:] = transform.initialize(self._param_array_) + self.param_array[:] = transform.initialize(self.param_array) reconstrained = self.unconstrain() self._add_to_index_operations(self.constraints, reconstrained, transform, warning) self.notify_observers(self, None if trigger_parent else -np.inf) @@ -565,14 +568,14 @@ class OptimizationHandlable(Constrainable): super(OptimizationHandlable, self).__init__(name, default_constraint=default_constraint, *a, **kw) def transform(self): - [np.put(self._param_array_, ind, c.finv(self._param_array_.flat[ind])) for c, ind in self.constraints.iteritems() if c != __fixed__] + [np.put(self.param_array, ind, c.finv(self.param_array.flat[ind])) for c, ind in self.constraints.iteritems() if c != __fixed__] def untransform(self): - [np.put(self._param_array_, ind, c.f(self._param_array_.flat[ind])) for c, ind in self.constraints.iteritems() if c != __fixed__] + [np.put(self.param_array, ind, c.f(self.param_array.flat[ind])) for c, ind in self.constraints.iteritems() if c != __fixed__] def _get_params_transformed(self): # transformed parameters (apply transformation rules) - p = self._param_array_.copy() + p = self.param_array.copy() [np.put(p, ind, c.finv(p[ind])) for c, ind in self.constraints.iteritems() if c != __fixed__] if self.has_parent() and self.constraints[__fixed__].size != 0: fixes = np.ones(self.size).astype(bool) @@ -583,14 +586,14 @@ class OptimizationHandlable(Constrainable): return p def _set_params_transformed(self, p): - if p is self._param_array_: + if p is self.param_array: p = p.copy() if self.has_parent() and self.constraints[__fixed__].size != 0: fixes = np.ones(self.size).astype(bool) fixes[self.constraints[__fixed__]] = FIXED - self._param_array_.flat[fixes] = p - elif self._has_fixes(): self._param_array_.flat[self._fixes_] = p - else: self._param_array_.flat = p + self.param_array.flat[fixes] = p + elif self._has_fixes(): self.param_array.flat[self._fixes_] = p + else: self.param_array.flat = p self.untransform() self._trigger_params_changed() @@ -600,36 +603,29 @@ class OptimizationHandlable(Constrainable): def _size_transformed(self): return self.size - self.constraints[__fixed__].size -# -# def _untransform_params(self, p): -# # inverse apply transformations for parameters -# #p = p.copy() -# if self._has_fixes(): tmp = self._get_params(); tmp[self._fixes_] = p; p = tmp; del tmp -# [np.put(p, ind, c.f(p[ind])) for c, ind in self.constraints.iteritems() if c != __fixed__] -# return p -# -# def _get_params(self): -# """ -# get all parameters -# """ -# return self._param_array_ -# p = np.empty(self.size, dtype=np.float64) -# if self.size == 0: -# return p -# [np.put(p, ind, par._get_params()) for ind, par in itertools.izip(self._param)] -# return p -# def _set_params(self, params, trigger_parent=True): -# self._param_array_.flat = params -# if trigger_parent: min_priority = None -# else: min_priority = -np.inf -# self.notify_observers(None, min_priority) - # don't overwrite this anymore! 
- # raise NotImplementedError, "Abstract superclass: This needs to be implemented in Param and Parameterizable" + @property + def num_params(self): + """ + Return the number of parameters of this parameter_handle. + Param objects will allways return 0. + """ + raise NotImplemented, "Abstract, please implement in respective classes" - #=========================================================================== - # Optimization handles: - #=========================================================================== + def parameter_names(self, add_self=False, adjust_for_printing=False, recursive=True): + """ + Get the names of all parameters of this model. + + :param bool add_self: whether to add the own name in front of names + :param bool adjust_for_printing: whether to call `adjust_name_for_printing` on names + :param bool recursive: whether to traverse through hierarchy and append leaf node names + """ + if adjust_for_printing: adjust = lambda x: adjust_name_for_printing(x) + else: adjust = lambda x: x + if recursive: names = [xi for x in self._parameters_ for xi in x.parameter_names(add_self=True, adjust_for_printing=adjust_for_printing)] + else: names = [adjust(x.name) for x in self._parameters_] + if add_self: names = map(lambda x: adjust(self.name) + "." + x, names) + return names def _get_param_names(self): n = np.array([p.hierarchy_name() + '[' + str(i) + ']' for p in self.flattened_parameters for i in p._indices()]) return n @@ -663,16 +659,30 @@ class OptimizationHandlable(Constrainable): # For shared memory arrays. This does nothing in Param, but sets the memory # for all parameterized objects #=========================================================================== + @property + def full_gradient(self): + """ + Note to users: + This does not return the gradient in the right shape! Use self.gradient + for the right gradient array. + + To work on the gradient array, use this as the gradient handle. + This method exists for in memory use of parameters. + When trying to access the true gradient array, use this. + """ + self.gradient # <<< ensure _gradient_array_ + return self._gradient_array_ + def _propagate_param_grad(self, parray, garray): pi_old_size = 0 for pi in self._parameters_: pislice = slice(pi_old_size, pi_old_size + pi.size) - self._param_array_[pislice] = pi._param_array_.flat # , requirements=['C', 'W']).flat - self._gradient_array_[pislice] = pi._gradient_array_.flat # , requirements=['C', 'W']).flat + self.param_array[pislice] = pi.param_array.flat # , requirements=['C', 'W']).flat + self.full_gradient[pislice] = pi.full_gradient.flat # , requirements=['C', 'W']).flat - pi._param_array_.data = parray[pislice].data - pi._gradient_array_.data = garray[pislice].data + pi.param_array.data = parray[pislice].data + pi.full_gradient.data = garray[pislice].data pi._propagate_param_grad(parray[pislice], garray[pislice]) pi_old_size += pi.size @@ -681,26 +691,32 @@ class Parameterizable(OptimizationHandlable): def __init__(self, *args, **kwargs): super(Parameterizable, self).__init__(*args, **kwargs) from GPy.core.parameterization.lists_and_dicts import ArrayList - _parameters_ = ArrayList() + self._parameters_ = ArrayList() self.size = 0 - self._param_array_ = np.empty(self.size, dtype=np.float64) - self._gradient_array_ = np.empty(self.size, dtype=np.float64) self._added_names_ = set() - def parameter_names(self, add_self=False, adjust_for_printing=False, recursive=True): - """ - Get the names of all parameters of this model. 
+ @property + def param_array(self): + if not hasattr(self, '_param_array_'): + self._param_array_ = np.empty(self.size, dtype=np.float64) + return self._param_array_ - :param bool add_self: whether to add the own name in front of names - :param bool adjust_for_printing: whether to call `adjust_name_for_printing` on names - :param bool recursive: whether to traverse through hierarchy and append leaf node names - """ - if adjust_for_printing: adjust = lambda x: adjust_name_for_printing(x) - else: adjust = lambda x: x - if recursive: names = [xi for x in self._parameters_ for xi in x.parameter_names(add_self=True, adjust_for_printing=adjust_for_printing)] - else: names = [adjust(x.name) for x in self._parameters_] - if add_self: names = map(lambda x: adjust(self.name) + "." + x, names) - return names + @param_array.setter + def param_array(self, arr): + self._param_array_ = arr + + #========================================================================= + # Gradient handling + #========================================================================= + @property + def gradient(self): + if not hasattr(self, '_gradient_array_'): + self._gradient_array_ = np.empty(self.size, dtype=np.float64) + return self._gradient_array_ + + @gradient.setter + def gradient(self, val): + self._gradient_array_[:] = val @property def num_params(self): @@ -737,34 +753,6 @@ class Parameterizable(OptimizationHandlable): self._remove_parameter_name(None, old_name) self._add_parameter_name(param) - #========================================================================= - # Gradient handling - #========================================================================= - @property - def gradient(self): - return self._gradient_array_ - - @gradient.setter - def gradient(self, val): - self._gradient_array_[:] = val - #=========================================================================== - # def _collect_gradient(self, target): - # [p._collect_gradient(target[s]) for p, s in itertools.izip(self._parameters_, self._param_slices_)] - #=========================================================================== - - #=========================================================================== - # def _set_params(self, params, trigger_parent=True): - # [p._set_params(params[s], trigger_parent=False) for p, s in itertools.izip(self._parameters_, self._param_slices_)] - # if trigger_parent: min_priority = None - # else: min_priority = -np.inf - # self.notify_observers(None, min_priority) - #=========================================================================== - - #=========================================================================== - # def _set_gradient(self, g): - # [p._set_gradient(g[s]) for p, s in itertools.izip(self._parameters_, self._param_slices_)] - #=========================================================================== - def add_parameter(self, param, index=None, _ignore_added_names=False): """ :param parameters: the parameters to add @@ -864,7 +852,7 @@ class Parameterizable(OptimizationHandlable): # no parameters for this class return old_size = 0 - self._param_array_ = np.empty(self.size, dtype=np.float64) + self.param_array = np.empty(self.size, dtype=np.float64) self._gradient_array_ = np.empty(self.size, dtype=np.float64) self._param_slices_ = [] @@ -874,15 +862,15 @@ class Parameterizable(OptimizationHandlable): pslice = slice(old_size, old_size + p.size) # first connect all children - p._propagate_param_grad(self._param_array_[pslice], self._gradient_array_[pslice]) + 
p._propagate_param_grad(self.param_array[pslice], self.full_gradient[pslice]) # then connect children to self - self._param_array_[pslice] = p._param_array_.flat # , requirements=['C', 'W']).ravel(order='C') - self._gradient_array_[pslice] = p._gradient_array_.flat # , requirements=['C', 'W']).ravel(order='C') + self.param_array[pslice] = p.param_array.flat # , requirements=['C', 'W']).ravel(order='C') + self.full_gradient[pslice] = p.full_gradient.flat # , requirements=['C', 'W']).ravel(order='C') - if not p._param_array_.flags['C_CONTIGUOUS']: - import ipdb;ipdb.set_trace() - p._param_array_.data = self._param_array_[pslice].data - p._gradient_array_.data = self._gradient_array_[pslice].data + if not p.param_array.flags['C_CONTIGUOUS']: + raise ValueError, "This should not happen! Please write an email to the developers with the code, which reproduces this error. All parameter arrays must be C_CONTIGUOUS" + p.param_array.data = self.param_array[pslice].data + p.full_gradient.data = self.full_gradient[pslice].data self._param_slices_.append(pslice) @@ -898,41 +886,22 @@ class Parameterizable(OptimizationHandlable): self.notify_observers(which=which) #=========================================================================== - # TODO: not working yet + # Pickling #=========================================================================== + def __setstate__(self, state): + super(Parameterizable, self).__setstate__(state) + self._connect_parameters() + self._connect_fixes() + self._notify_parent_change() + + self.parameters_changed() + def copy(self): - """Returns a (deep) copy of the current model""" - raise NotImplementedError, "Copy is not yet implemented, TODO: Observable hierarchy" - import copy - from .index_operations import ParameterIndexOperations, ParameterIndexOperationsView - from .lists_and_dicts import ArrayList - - dc = dict() - for k, v in self.__dict__.iteritems(): - if k not in ['_parent_', '_parameters_', '_parent_index_', '_observer_callables_'] + self.parameter_names(recursive=False): - if isinstance(v, (Constrainable, ParameterIndexOperations, ParameterIndexOperationsView)): - dc[k] = v.copy() - else: - dc[k] = copy.deepcopy(v) - if k == '_parameters_': - params = [p.copy() for p in v] - - dc['_parent_'] = None - dc['_parent_index_'] = None - dc['_observer_callables_'] = [] - dc['_parameters_'] = ArrayList() - dc['constraints'].clear() - dc['priors'].clear() - dc['size'] = 0 - - s = self.__new__(self.__class__) - s.__dict__ = dc - - for p in params: - s.add_parameter(p, _ignore_added_names=True) - - return s - + c = super(Parameterizable, self).copy() + c._connect_parameters() + c._connect_fixes() + c._notify_parent_change() + return c #=========================================================================== # From being parentable, we have to define the parent_change notification #=========================================================================== diff --git a/GPy/core/parameterization/parameterized.py b/GPy/core/parameterization/parameterized.py index 6460c988..a794ab40 100644 --- a/GPy/core/parameterization/parameterized.py +++ b/GPy/core/parameterization/parameterized.py @@ -17,7 +17,7 @@ class ParametersChangedMeta(type): instance.parameters_changed() return instance -class Parameterized(Parameterizable, Pickleable): +class Parameterized(Parameterizable): """ Parameterized class @@ -63,7 +63,7 @@ class Parameterized(Parameterizable, Pickleable): # Metaclass for parameters changed after init. 
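# A sketch of the memory sharing that _propagate_param_grad and
# _connect_parameters set up above. The real code re-points each child's
# buffer via `param_array.data = parent_slice.data`, which is why the parent
# slice must be C_CONTIGUOUS; the observable effect is that of a numpy view:
import numpy as np

parent = np.zeros(5)
parent[1:4] = np.arange(3, dtype=float)
child = parent[1:4]            # view into the parent's contiguous buffer
child[0] = 42.
print(parent)                  # [  0.  42.   1.   2.   0.]
parent[3] = -1.
print(child)                   # [ 42.   1.  -1.]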
# This makes sure, that parameters changed will always be called after __init__ # **Never** call parameters_changed() yourself - __metaclass__ = ParametersChangedMeta + __metaclass__ = ParametersChangedMeta #=========================================================================== def __init__(self, name=None, parameters=[], *a, **kw): super(Parameterized, self).__init__(name=name, *a, **kw) @@ -90,7 +90,7 @@ class Parameterized(Parameterizable, Pickleable): child_node = child.build_pydot(G) G.add_edge(pydot.Edge(node, child_node)) - for o in self._observer_callables_.keys(): + for o in self.observers.keys(): label = o.name if hasattr(o, 'name') else str(o) observed_node = pydot.Node(id(o), label=label) G.add_node(observed_node) @@ -101,48 +101,13 @@ class Parameterized(Parameterizable, Pickleable): return G return node - def _getstate(self): - """ - Get the current state of the class, - here just all the indices, rest can get recomputed - For inheriting from Parameterized: - - Allways append the state of the inherited object - and call down to the inherited object in _setstate!! - """ - return [ - self._fixes_, - self.priors, - self.constraints, - self._parameters_, - self._name, - self._added_names_, - ] - - def _setstate(self, state): - self._added_names_ = state.pop() - self._name = state.pop() - self._parameters_ = state.pop() - self.constraints = state.pop() - self.priors = state.pop() - self._fixes_ = state.pop() - self._connect_parameters() - self.parameters_changed() - #=========================================================================== - # Override copy to handle programmatically added observers - #=========================================================================== - def copy(self): - c = super(Pickleable, self).copy() - c.add_observer(c, c._parameters_changed_notification, -100) - return c - #=========================================================================== # Gradient control #=========================================================================== def _transform_gradients(self, g): if self.has_parent(): return g - [numpy.put(g, i, g[i] * c.gradfactor(self._param_array_[i])) for c, i in self.constraints.iteritems() if c != __fixed__] + [numpy.put(g, i, g[i] * c.gradfactor(self.param_array[i])) for c, i in self.constraints.iteritems() if c != __fixed__] if self._has_fixes(): return g[self._fixes_] return g @@ -174,7 +139,7 @@ class Parameterized(Parameterizable, Pickleable): this is not in the global view of things! 
""" return numpy.r_[:self.size] - + #=========================================================================== # Convenience for fixed, tied checking of param: #=========================================================================== @@ -189,7 +154,7 @@ class Parameterized(Parameterizable, Pickleable): # you can retrieve the original param through this method, by passing # the copy here return self._parameters_[param._parent_index_] - + #=========================================================================== # Get/set parameters: #=========================================================================== @@ -206,7 +171,7 @@ class Parameterized(Parameterizable, Pickleable): def __getitem__(self, name, paramlist=None): if isinstance(name, (int, slice, tuple, np.ndarray)): - return self._param_array_[name] + return self.param_array[name] else: if paramlist is None: paramlist = self.grep_param_names(name) @@ -222,7 +187,7 @@ class Parameterized(Parameterizable, Pickleable): def __setitem__(self, name, value, paramlist=None): if isinstance(name, (slice, tuple, np.ndarray)): try: - self._param_array_[name] = value + self.param_array[name] = value except: raise ValueError, "Setting by slice or index only allowed with array-like" self._trigger_params_changed() diff --git a/GPy/core/parameterization/variational.py b/GPy/core/parameterization/variational.py index ac1dfc63..3730baed 100644 --- a/GPy/core/parameterization/variational.py +++ b/GPy/core/parameterization/variational.py @@ -66,7 +66,7 @@ class SpikeAndSlabPrior(VariationalPrior): self.pi.gradient = (gamma/self.pi - (1.-gamma)/(1.-self.pi)).sum(axis=0) class VariationalPosterior(Parameterized): - def __init__(self, means=None, variances=None, name=None, *a, **kw): + def __init__(self, means=None, variances=None, name='latent space', *a, **kw): super(VariationalPosterior, self).__init__(name=name, *a, **kw) self.mean = Param("mean", means) self.variance = Param("variance", variances, Logexp()) @@ -124,6 +124,7 @@ class NormalPosterior(VariationalPosterior): import sys assert "matplotlib" in sys.modules, "matplotlib package has not been imported." 
from ...plotting.matplot_dep import variational_plots + import matplotlib return variational_plots.plot(self,*args) class SpikeAndSlabPosterior(VariationalPosterior): diff --git a/GPy/core/sparse_gp.py b/GPy/core/sparse_gp.py index 7bf0ca2a..7552b8ac 100644 --- a/GPy/core/sparse_gp.py +++ b/GPy/core/sparse_gp.py @@ -106,15 +106,3 @@ class SparseGP(GP): return mu, var - def _getstate(self): - """ - Get the current state of the class, - """ - return GP._getstate(self) + [ - self.Z, - self.num_inducing] - - def _setstate(self, state): - self.num_inducing = state.pop() - self.Z = state.pop() - GP._setstate(self, state) diff --git a/GPy/core/svigp.py b/GPy/core/svigp.py index a2c7acee..60e8371c 100644 --- a/GPy/core/svigp.py +++ b/GPy/core/svigp.py @@ -89,57 +89,57 @@ class SVIGP(GP): self._param_steplength_trace = [] self._vb_steplength_trace = [] - def _getstate(self): - steplength_params = [self.hbar_t, self.tau_t, self.gbar_t, self.gbar_t1, self.gbar_t2, self.hbar_tp, self.tau_tp, self.gbar_tp, self.adapt_param_steplength, self.adapt_vb_steplength, self.vb_steplength, self.param_steplength] - return GP._getstate(self) + \ - [self.get_vb_param(), - self.Z, - self.num_inducing, - self.has_uncertain_inputs, - self.X_variance, - self.X_batch, - self.X_variance_batch, - steplength_params, - self.batchcounter, - self.batchsize, - self.epochs, - self.momentum, - self.data_prop, - self._param_trace, - self._param_steplength_trace, - self._vb_steplength_trace, - self._ll_trace, - self._grad_trace, - self.Y, - self._permutation, - self.iterations - ] - - def _setstate(self, state): - self.iterations = state.pop() - self._permutation = state.pop() - self.Y = state.pop() - self._grad_trace = state.pop() - self._ll_trace = state.pop() - self._vb_steplength_trace = state.pop() - self._param_steplength_trace = state.pop() - self._param_trace = state.pop() - self.data_prop = state.pop() - self.momentum = state.pop() - self.epochs = state.pop() - self.batchsize = state.pop() - self.batchcounter = state.pop() - steplength_params = state.pop() - (self.hbar_t, self.tau_t, self.gbar_t, self.gbar_t1, self.gbar_t2, self.hbar_tp, self.tau_tp, self.gbar_tp, self.adapt_param_steplength, self.adapt_vb_steplength, self.vb_steplength, self.param_steplength) = steplength_params - self.X_variance_batch = state.pop() - self.X_batch = state.pop() - self.X_variance = state.pop() - self.has_uncertain_inputs = state.pop() - self.num_inducing = state.pop() - self.Z = state.pop() - vb_param = state.pop() - GP._setstate(self, state) - self.set_vb_param(vb_param) +# def _getstate(self): +# steplength_params = [self.hbar_t, self.tau_t, self.gbar_t, self.gbar_t1, self.gbar_t2, self.hbar_tp, self.tau_tp, self.gbar_tp, self.adapt_param_steplength, self.adapt_vb_steplength, self.vb_steplength, self.param_steplength] +# return GP._getstate(self) + \ +# [self.get_vb_param(), +# self.Z, +# self.num_inducing, +# self.has_uncertain_inputs, +# self.X_variance, +# self.X_batch, +# self.X_variance_batch, +# steplength_params, +# self.batchcounter, +# self.batchsize, +# self.epochs, +# self.momentum, +# self.data_prop, +# self._param_trace, +# self._param_steplength_trace, +# self._vb_steplength_trace, +# self._ll_trace, +# self._grad_trace, +# self.Y, +# self._permutation, +# self.iterations +# ] +# +# def _setstate(self, state): +# self.iterations = state.pop() +# self._permutation = state.pop() +# self.Y = state.pop() +# self._grad_trace = state.pop() +# self._ll_trace = state.pop() +# self._vb_steplength_trace = state.pop() +# 
self._param_steplength_trace = state.pop() +# self._param_trace = state.pop() +# self.data_prop = state.pop() +# self.momentum = state.pop() +# self.epochs = state.pop() +# self.batchsize = state.pop() +# self.batchcounter = state.pop() +# steplength_params = state.pop() +# (self.hbar_t, self.tau_t, self.gbar_t, self.gbar_t1, self.gbar_t2, self.hbar_tp, self.tau_tp, self.gbar_tp, self.adapt_param_steplength, self.adapt_vb_steplength, self.vb_steplength, self.param_steplength) = steplength_params +# self.X_variance_batch = state.pop() +# self.X_batch = state.pop() +# self.X_variance = state.pop() +# self.has_uncertain_inputs = state.pop() +# self.num_inducing = state.pop() +# self.Z = state.pop() +# vb_param = state.pop() +# GP._setstate(self, state) +# self.set_vb_param(vb_param) def _compute_kernel_matrices(self): # kernel computations, using BGPLVM notation diff --git a/GPy/examples/dimensionality_reduction.py b/GPy/examples/dimensionality_reduction.py index ea997d63..07623d6b 100644 --- a/GPy/examples/dimensionality_reduction.py +++ b/GPy/examples/dimensionality_reduction.py @@ -277,7 +277,9 @@ def bgplvm_simulation(optimize=True, verbose=1, k = kern.Linear(Q, ARD=True)# + kern.white(Q, _np.exp(-2)) # + kern.bias(Q) #k = kern.RBF(Q, ARD=True, lengthscale=10.) m = BayesianGPLVM(Y, Q, init="PCA", num_inducing=num_inducing, kernel=k) - + m.X.variance[:] = _np.random.uniform(0,.01,m.X.shape) + m.likelihood.variance = .1 + if optimize: print "Optimizing model:" m.optimize('bfgs', messages=verbose, max_iters=max_iters, @@ -299,15 +301,16 @@ def bgplvm_simulation_missing_data(optimize=True, verbose=1, _, _, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim) Y = Ylist[0] k = kern.Linear(Q, ARD=True)# + kern.white(Q, _np.exp(-2)) # + kern.bias(Q) - + inan = _np.random.binomial(1, .6, size=Y.shape).astype(bool) m = BayesianGPLVM(Y.copy(), Q, init="random", num_inducing=num_inducing, kernel=k) m.inference_method = VarDTCMissingData() m.Y[inan] = _np.nan - m.X.variance *= .1 + m.X.variance[:] = _np.random.uniform(0,.01,m.X.shape) + m.likelihood.variance = .01 m.parameters_changed() m.Yreal = Y - + if optimize: print "Optimizing model:" m.optimize('bfgs', messages=verbose, max_iters=max_iters, @@ -321,18 +324,15 @@ def bgplvm_simulation_missing_data(optimize=True, verbose=1, def mrd_simulation(optimize=True, verbose=True, plot=True, plot_sim=True, **kw): from GPy import kern from GPy.models import MRD - from GPy.likelihoods import Gaussian D1, D2, D3, N, num_inducing, Q = 60, 20, 36, 60, 6, 5 _, _, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim) - + #Ylist = [Ylist[0]] - k = [kern.Linear(Q, ARD=True) + kern.White(Q, 1e-4) for _ in range(len(Ylist))] - m = MRD(Ylist, input_dim=Q, num_inducing=num_inducing, kernel=k, initx="", initz='permute', **kw) - - m['.*noise'] = [Y.var()/500. for Y in Ylist] - #for i, Y in enumerate(Ylist): - # m['.*Y_{}.*Gaussian.*noise'.format(i)] = Y.var(1) / 500. + k = kern.Linear(Q, ARD=True) + m = MRD(Ylist, input_dim=Q, num_inducing=num_inducing, kernel=k, initx="PCA_concat", initz='permute', **kw) + + m['.*noise'] = [Y.var()/40. 
for Y in Ylist]

     if optimize:
         print "Optimizing Model:"
diff --git a/GPy/inference/latent_function_inference/exact_gaussian_inference.py b/GPy/inference/latent_function_inference/exact_gaussian_inference.py
index bd3fcefb..074b67a6 100644
--- a/GPy/inference/latent_function_inference/exact_gaussian_inference.py
+++ b/GPy/inference/latent_function_inference/exact_gaussian_inference.py
@@ -32,7 +32,8 @@ class ExactGaussianInference(object):
             return Y
         else:
             #if Y in self.cache, return self.Cache[Y], else store Y in cache and return L.
-            raise NotImplementedError, 'TODO' #TODO
+            print "WARNING: N > D for Y; caching of a factor L with L L^T = Y Y^T is not yet implemented, returning Y instead!"
+            return Y

     def inference(self, kern, X, likelihood, Y, Y_metadata=None):
         """
diff --git a/GPy/inference/latent_function_inference/laplace.py b/GPy/inference/latent_function_inference/laplace.py
index 12315a29..9ba3f83f 100644
--- a/GPy/inference/latent_function_inference/laplace.py
+++ b/GPy/inference/latent_function_inference/laplace.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
+# Copyright (c) 2013, 2014 GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 #
 #Parts of this file were influenced by the Matlab GPML framework written by
@@ -91,7 +91,11 @@ class Laplace(object):
         iteration = 0
         while difference > self._mode_finding_tolerance and iteration < self._mode_finding_max_iter:
             W = -likelihood.d2logpdf_df2(f, Y, Y_metadata=Y_metadata)
+            if np.any(np.isnan(W)):
+                raise ValueError('One or more element(s) of W is NaN')
             grad = likelihood.dlogpdf_df(f, Y, Y_metadata=Y_metadata)
+            if np.any(np.isnan(grad)):
+                raise ValueError('One or more element(s) of grad is NaN')
             W_f = W*f
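For reference, the loop above is the Newton-style mode finding of the Laplace approximation, which is where the new NaN checks fire. A minimal self-contained sketch of one numerically stable Newton step in the spirit of Rasmussen & Williams, Algorithm 3.1, is given below; the Bernoulli-logit likelihood here is an assumed stand-in, since the real code obtains W and grad from the likelihood object:

import numpy as np

def newton_step(K, f, y):
    # Bernoulli likelihood with logistic inverse link, y in {0, 1}
    pi = 1. / (1. + np.exp(-f))
    grad = y - pi                 # dlogpdf_df
    W = pi * (1. - pi)            # -d2logpdf_df2 (diagonal)
    if np.any(np.isnan(W)) or np.any(np.isnan(grad)):
        raise ValueError('One or more element(s) of W or grad is NaN')
    W12 = np.sqrt(W)
    # B = I + W^0.5 K W^0.5 stays well conditioned; solve via its Cholesky factor
    B = np.eye(K.shape[0]) + W12[:, None] * K * W12[None, :]
    L = np.linalg.cholesky(B)
    b = W * f + grad
    v = np.linalg.solve(L, W12 * K.dot(b))
    a = b - W12 * np.linalg.solve(L.T, v)
    return K.dot(a)               # new f; iterate until the change drops below tolerance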
@@ -141,25 +145,30 @@ class Laplace(object):
         """
         #At this point get the hessian matrix (or vector as W is diagonal)
         W = -likelihood.d2logpdf_df2(f_hat, Y, Y_metadata=Y_metadata)
+        if np.any(np.isnan(W)):
+            raise ValueError('One or more element(s) of W is NaN')
         K_Wi_i, L, LiW12 = self._compute_B_statistics(K, W, likelihood.log_concave)

         #compute vital matrices
         C = np.dot(LiW12, K)
-        Ki_W_i = K - C.T.dot(C) #Could this be wrong?
+        Ki_W_i = K - C.T.dot(C)

         #compute the log marginal
         log_marginal = -0.5*np.dot(Ki_f.flatten(), f_hat.flatten()) + likelihood.logpdf(f_hat, Y, Y_metadata=Y_metadata) - np.sum(np.log(np.diag(L)))

-        #Compute vival matrices for derivatives
+        # Compute matrices for derivatives
         dW_df = -likelihood.d3logpdf_df3(f_hat, Y, Y_metadata=Y_metadata) # -d3lik_d3fhat
-        dL_dfhat = -0.5*(np.diag(Ki_W_i)[:, None]*dW_df) #why isn't this -0.5? s2 in R&W p126 line 9.
+        if np.any(np.isnan(dW_df)):
+            raise ValueError('One or more element(s) of dW_df is NaN')
+
+        dL_dfhat = -0.5*(np.diag(Ki_W_i)[:, None]*dW_df) # s2 in R&W p126 line 9.
         #BiK, _ = dpotrs(L, K, lower=1)
         #dL_dfhat = 0.5*np.diag(BiK)[:, None]*dW_df
         I_KW_i = np.eye(Y.shape[0]) - np.dot(K, K_Wi_i)

         ####################
-        #compute dL_dK#
+        # compute dL_dK #
         ####################
         if kern.size > 0 and not kern.is_fixed:
             #Explicit
@@ -202,12 +211,12 @@ class Laplace(object):
     def _compute_B_statistics(self, K, W, log_concave):
         """
         Rasmussen suggests the use of a numerically stable positive definite matrix B
-        Which has a positive diagonal element and can be easyily inverted
+        which has positive diagonal elements and can be easily inverted
        :param K: Prior Covariance matrix evaluated at locations X
        :type K: NxN matrix
        :param W: Negative hessian at a point (diagonal matrix)
-       :type W: Vector of diagonal values of hessian (1xN)
+       :type W: Vector of diagonal values of Hessian (1xN)
        :returns: (W12BiW12, L_B, Li_W12)
        """
        if not log_concave:
@@ -218,7 +227,8 @@ class Laplace(object):
            # If the likelihood is non-log-concave, we want to say that there is a negative variance
            # To cause the posterior to become less certain than the prior and likelihood,
            # This is a property only held by non-log-concave likelihoods
-
+        if np.any(np.isnan(W)):
+            raise ValueError('One or more element(s) of W is NaN')
        #W is diagonal so its sqrt is just the sqrt of the diagonal elements
        W_12 = np.sqrt(W)
        B = np.eye(K.shape[0]) + W_12*K*W_12.T
diff --git a/GPy/inference/latent_function_inference/var_dtc.py b/GPy/inference/latent_function_inference/var_dtc.py
index ee2d6250..7344b204 100644
--- a/GPy/inference/latent_function_inference/var_dtc.py
+++ b/GPy/inference/latent_function_inference/var_dtc.py
@@ -23,6 +23,7 @@ class VarDTC(object):
     def __init__(self, limit=1):
         #self._YYTfactor_cache = caching.cache()
         from ...util.caching import Cacher
+        self.limit = limit
         self.get_trYYT = Cacher(self._get_trYYT, limit)
         self.get_YYTfactor = Cacher(self._get_YYTfactor, limit)
@@ -33,6 +34,17 @@ class VarDTC(object):
     def _get_trYYT(self, Y):
         return param_to_array(np.sum(np.square(Y)))

+    def __getstate__(self):
+        # has to be overridden, as Cacher objects cannot be pickled.
+        return self.limit
+
+    def __setstate__(self, state):
+        # has to be overridden, as Cacher objects cannot be pickled.
+        self.limit = state
+        from ...util.caching import Cacher
+        self.get_trYYT = Cacher(self._get_trYYT, self.limit)
+        self.get_YYTfactor = Cacher(self._get_YYTfactor, self.limit)
+
     def _get_YYTfactor(self, Y):
         """
         find a matrix L which satisfies LLT = YYT.
@@ -126,7 +138,7 @@ class VarDTC(object):
         delit += output_dim * np.eye(num_inducing)
         # Compute dL_dKmm
         dL_dKmm = backsub_both_sides(Lm, delit)
-
+        # derivatives of L w.r.t.
psi dL_dpsi0, dL_dpsi1, dL_dpsi2 = _compute_dL_dpsi(num_inducing, num_data, output_dim, beta, Lm, VVT_factor, Cpsi1Vf, DBi_plus_BiPBi, @@ -179,6 +191,7 @@ class VarDTC(object): return post, log_marginal, grad_dict class VarDTCMissingData(object): + const_jitter = 1e-6 def __init__(self, limit=1): from ...util.caching import Cacher self._Y = Cacher(self._subarray_computations, limit) @@ -250,7 +263,7 @@ class VarDTCMissingData(object): for y, trYYT, [v, ind] in itertools.izip(Ys, traces, self._subarray_indices): if het_noise: beta = beta_all[ind] - else: beta = beta_all[0] + else: beta = beta_all VVT_factor = (beta*y) VVT_factor_all[v, ind].flat = VVT_factor.flat @@ -311,7 +324,7 @@ class VarDTCMissingData(object): het_noise, uncertain_inputs, LB, _LBi_Lmi_psi1Vf, DBi_plus_BiPBi, Lm, A, psi0, psi1, beta, - data_fit, num_data, output_dim, trYYT) + data_fit, num_data, output_dim, trYYT, Y) if full_VVT_factor: woodbury_vector[:, ind] = Cpsi1Vf else: diff --git a/GPy/kern/__init__.py b/GPy/kern/__init__.py index 0e265a64..37cd71ec 100644 --- a/GPy/kern/__init__.py +++ b/GPy/kern/__init__.py @@ -1,9 +1,9 @@ from _src.kern import Kern from _src.rbf import RBF -from _src.linear import Linear +from _src.linear import Linear, LinearFull from _src.static import Bias, White from _src.brownian import Brownian -from _src.sympykern import Sympykern +from _src.symbolic import Symbolic from _src.stationary import Exponential, Matern32, Matern52, ExpQuad, RatQuad, Cosine from _src.mlp import MLP from _src.periodic import PeriodicExponential, PeriodicMatern32, PeriodicMatern52 diff --git a/GPy/kern/_src/add.py b/GPy/kern/_src/add.py index 57e611ed..fb0e114b 100644 --- a/GPy/kern/_src/add.py +++ b/GPy/kern/_src/add.py @@ -23,7 +23,6 @@ class Add(CombinationKernel): If a list of parts (of this kernel!) `which_parts` is given, only the parts of the list are taken to compute the covariance. 
""" - assert X.shape[1] > max(np.r_[self.active_dims]) if which_parts is None: which_parts = self.parts elif not isinstance(which_parts, (list, tuple)): @@ -33,7 +32,6 @@ class Add(CombinationKernel): @Cache_this(limit=2, force_kwargs=['which_parts']) def Kdiag(self, X, which_parts=None): - assert X.shape[1] > max(np.r_[self.active_dims]) if which_parts is None: which_parts = self.parts elif not isinstance(which_parts, (list, tuple)): @@ -160,21 +158,13 @@ class Add(CombinationKernel): target_S += b return target_mu, target_S - def _getstate(self): - """ - Get the current state of the class, - here just all the indices, rest can get recomputed - """ - return super(Add, self)._getstate() - - def _setstate(self, state): - super(Add, self)._setstate(state) - def add(self, other, name='sum'): if isinstance(other, Add): other_params = other._parameters_[:] for p in other_params: other.remove_parameter(p) self.add_parameters(*other_params) - else: self.add_parameter(other) + else: + self.add_parameter(other) + self.input_dim, self.active_dims = self.get_input_dim_active_dims(self.parts) return self \ No newline at end of file diff --git a/GPy/kern/_src/independent_outputs.py b/GPy/kern/_src/independent_outputs.py index cf015d02..4a9671aa 100644 --- a/GPy/kern/_src/independent_outputs.py +++ b/GPy/kern/_src/independent_outputs.py @@ -54,85 +54,93 @@ class IndependentOutputs(CombinationKernel): self.kern = kernels super(IndependentOutputs, self).__init__(kernels=kernels, extra_dims=[index_dim], name=name) self.index_dim = index_dim - self.kerns = kernels if len(kernels) != 1 else itertools.repeat(kernels[0]) def K(self,X ,X2=None): slices = index_to_slices(X[:,self.index_dim]) + kerns = itertools.repeat(self.kern) if self.single_kern else self.kern if X2 is None: target = np.zeros((X.shape[0], X.shape[0])) - [[target.__setitem__((s,ss), kern.K(X[s,:], X[ss,:])) for s,ss in itertools.product(slices_i, slices_i)] for kern, slices_i in zip(self.kerns, slices)] + [[target.__setitem__((s,ss), kern.K(X[s,:], X[ss,:])) for s,ss in itertools.product(slices_i, slices_i)] for kern, slices_i in zip(kerns, slices)] else: slices2 = index_to_slices(X2[:,self.index_dim]) target = np.zeros((X.shape[0], X2.shape[0])) - [[target.__setitem__((s,s2), kern.K(X[s,:],X2[s2,:])) for s,s2 in itertools.product(slices_i, slices_j)] for kern, slices_i,slices_j in zip(self.kerns, slices,slices2)] + [[target.__setitem__((s,s2), kern.K(X[s,:],X2[s2,:])) for s,s2 in itertools.product(slices_i, slices_j)] for kern, slices_i,slices_j in zip(kerns, slices,slices2)] return target def Kdiag(self,X): slices = index_to_slices(X[:,self.index_dim]) + kerns = itertools.repeat(self.kern) if self.single_kern else self.kern target = np.zeros(X.shape[0]) - [[np.copyto(target[s], kern.Kdiag(X[s])) for s in slices_i] for kern, slices_i in zip(self.kerns, slices)] + [[np.copyto(target[s], kern.Kdiag(X[s])) for s in slices_i] for kern, slices_i in zip(kerns, slices)] return target def update_gradients_full(self,dL_dK,X,X2=None): slices = index_to_slices(X[:,self.index_dim]) - if self.single_kern: target = np.zeros(self.kern.size) - else: target = [np.zeros(kern.size) for kern, _ in zip(self.kerns, slices)] + if self.single_kern: + target = np.zeros(self.kern.size) + kerns = itertools.repeat(self.kern) + else: + kerns = self.kern + target = [np.zeros(kern.size) for kern, _ in zip(kerns, slices)] def collate_grads(kern, i, dL, X, X2): kern.update_gradients_full(dL,X,X2) if self.single_kern: target[:] += kern.gradient else: target[i][:] += 
kern.gradient if X2 is None: - [[collate_grads(kern, i, dL_dK[s,ss], X[s], X[ss]) for s,ss in itertools.product(slices_i, slices_i)] for i,(kern,slices_i) in enumerate(zip(self.kerns,slices))] + [[collate_grads(kern, i, dL_dK[s,ss], X[s], X[ss]) for s,ss in itertools.product(slices_i, slices_i)] for i,(kern,slices_i) in enumerate(zip(kerns,slices))] else: slices2 = index_to_slices(X2[:,self.index_dim]) - [[[collate_grads(kern, i, dL_dK[s,s2],X[s],X2[s2]) for s in slices_i] for s2 in slices_j] for i,(kern,slices_i,slices_j) in enumerate(zip(self.kerns,slices,slices2))] + [[[collate_grads(kern, i, dL_dK[s,s2],X[s],X2[s2]) for s in slices_i] for s2 in slices_j] for i,(kern,slices_i,slices_j) in enumerate(zip(kerns,slices,slices2))] if self.single_kern: kern.gradient = target - else:[kern.gradient.__setitem__(Ellipsis, target[i]) for i, [kern, _] in enumerate(zip(self.kerns, slices))] + else:[kern.gradient.__setitem__(Ellipsis, target[i]) for i, [kern, _] in enumerate(zip(kerns, slices))] def gradients_X(self,dL_dK, X, X2=None): target = np.zeros(X.shape) + kerns = itertools.repeat(self.kern) if self.single_kern else self.kern if X2 is None: # TODO: make use of index_to_slices values = np.unique(X[:,self.index_dim]) slices = [X[:,self.index_dim]==i for i in values] [target.__setitem__(s, kern.gradients_X(dL_dK[s,s],X[s],None)) - for kern, s in zip(self.kerns, slices)] + for kern, s in zip(kerns, slices)] #slices = index_to_slices(X[:,self.index_dim]) #[[np.add(target[s], kern.gradients_X(dL_dK[s,s], X[s]), out=target[s]) - # for s in slices_i] for kern, slices_i in zip(self.kerns, slices)] + # for s in slices_i] for kern, slices_i in zip(kerns, slices)] #import ipdb;ipdb.set_trace() #[[(np.add(target[s ], kern.gradients_X(dL_dK[s ,ss],X[s ], X[ss]), out=target[s ]), # np.add(target[ss], kern.gradients_X(dL_dK[ss,s ],X[ss], X[s ]), out=target[ss])) - # for s, ss in itertools.combinations(slices_i, 2)] for kern, slices_i in zip(self.kerns, slices)] + # for s, ss in itertools.combinations(slices_i, 2)] for kern, slices_i in zip(kerns, slices)] else: values = np.unique(X[:,self.index_dim]) slices = [X[:,self.index_dim]==i for i in values] slices2 = [X2[:,self.index_dim]==i for i in values] [target.__setitem__(s, kern.gradients_X(dL_dK[s, :][:, s2],X[s],X2[s2])) - for kern, s, s2 in zip(self.kerns, slices, slices2)] + for kern, s, s2 in zip(kerns, slices, slices2)] # TODO: make work with index_to_slices #slices = index_to_slices(X[:,self.index_dim]) #slices2 = index_to_slices(X2[:,self.index_dim]) - #[[target.__setitem__(s, target[s] + kern.gradients_X(dL_dK[s,s2], X[s], X2[s2])) for s, s2 in itertools.product(slices_i, slices_j)] for kern, slices_i,slices_j in zip(self.kerns, slices,slices2)] + #[[target.__setitem__(s, target[s] + kern.gradients_X(dL_dK[s,s2], X[s], X2[s2])) for s, s2 in itertools.product(slices_i, slices_j)] for kern, slices_i,slices_j in zip(kerns, slices,slices2)] return target def gradients_X_diag(self, dL_dKdiag, X): slices = index_to_slices(X[:,self.index_dim]) + kerns = itertools.repeat(self.kern) if self.single_kern else self.kern target = np.zeros(X.shape) - [[target.__setitem__(s, kern.gradients_X_diag(dL_dKdiag[s],X[s])) for s in slices_i] for kern, slices_i in zip(self.kerns, slices)] + [[target.__setitem__(s, kern.gradients_X_diag(dL_dKdiag[s],X[s])) for s in slices_i] for kern, slices_i in zip(kerns, slices)] return target def update_gradients_diag(self, dL_dKdiag, X): slices = index_to_slices(X[:,self.index_dim]) + kerns = itertools.repeat(self.kern) if 
self.single_kern else self.kern if self.single_kern: target = np.zeros(self.kern.size) - else: target = [np.zeros(kern.size) for kern, _ in zip(self.kerns, slices)] + else: target = [np.zeros(kern.size) for kern, _ in zip(kerns, slices)] def collate_grads(kern, i, dL, X): kern.update_gradients_diag(dL,X) if self.single_kern: target[:] += kern.gradient else: target[i][:] += kern.gradient - [[collate_grads(kern, i, dL_dKdiag[s], X[s,:]) for s in slices_i] for i, (kern, slices_i) in enumerate(zip(self.kerns, slices))] + [[collate_grads(kern, i, dL_dKdiag[s], X[s,:]) for s in slices_i] for i, (kern, slices_i) in enumerate(zip(kerns, slices))] if self.single_kern: kern.gradient = target - else:[kern.gradient.__setitem__(Ellipsis, target[i]) for i, [kern, _] in enumerate(zip(self.kerns, slices))] + else:[kern.gradient.__setitem__(Ellipsis, target[i]) for i, [kern, _] in enumerate(zip(kerns, slices))] class Hierarchical(CombinationKernel): """ @@ -148,30 +156,30 @@ class Hierarchical(CombinationKernel): def __init__(self, kern, name='hierarchy'): assert all([k.input_dim==kerns[0].input_dim for k in kerns]) super(Hierarchical, self).__init__(kerns[0].input_dim + len(kerns) - 1, name) - self.kerns = kerns - self.add_parameters(self.kerns) + kerns = kerns + self.add_parameters(kerns) def K(self,X ,X2=None): - X, slices = X[:,:-self.levels], [index_to_slices(X[:,i]) for i in range(self.kerns[0].input_dim, self.input_dim)] - K = self.kerns[0].K(X, X2) + X, slices = X[:,:-self.levels], [index_to_slices(X[:,i]) for i in range(kerns[0].input_dim, self.input_dim)] + K = kerns[0].K(X, X2) if X2 is None: - [[[np.copyto(K[s,s], k.K(X[s], None)) for s in slices_i] for slices_i in slices_k] for k, slices_k in zip(self.kerns[1:], slices)] + [[[np.copyto(K[s,s], k.K(X[s], None)) for s in slices_i] for slices_i in slices_k] for k, slices_k in zip(kerns[1:], slices)] else: X2, slices2 = X2[:,:-1],index_to_slices(X2[:,-1]) - [[[[np.copyto(K[s, s2], self.kern.K(X[s],X2[s2])) for s in slices_i] for s2 in slices_j] for slices_i,slices_j in zip(slices_k,slices_k2)] for k, slices_k, slices_k2 in zip(self.kerns[1:], slices, slices2)] + [[[[np.copyto(K[s, s2], self.kern.K(X[s],X2[s2])) for s in slices_i] for s2 in slices_j] for slices_i,slices_j in zip(slices_k,slices_k2)] for k, slices_k, slices_k2 in zip(kerns[1:], slices, slices2)] return target def Kdiag(self,X): - X, slices = X[:,:-self.levels], [index_to_slices(X[:,i]) for i in range(self.kerns[0].input_dim, self.input_dim)] - K = self.kerns[0].K(X, X2) - [[[np.copyto(target[s], self.kern.Kdiag(X[s])) for s in slices_i] for slices_i in slices_k] for k, slices_k in zip(self.kerns[1:], slices)] + X, slices = X[:,:-self.levels], [index_to_slices(X[:,i]) for i in range(kerns[0].input_dim, self.input_dim)] + K = kerns[0].K(X, X2) + [[[np.copyto(target[s], self.kern.Kdiag(X[s])) for s in slices_i] for slices_i in slices_k] for k, slices_k in zip(kerns[1:], slices)] return target def update_gradients_full(self,dL_dK,X,X2=None): X,slices = X[:,:-1],index_to_slices(X[:,-1]) if X2 is None: - self.kerns[0].update_gradients_full(dL_dK, X, None) - for k, slices_k in zip(self.kerns[1:], slices): + kerns[0].update_gradients_full(dL_dK, X, None) + for k, slices_k in zip(kerns[1:], slices): target = np.zeros(k.size) def collate_grads(dL, X, X2): k.update_gradients_full(dL,X,X2) @@ -180,8 +188,8 @@ class Hierarchical(CombinationKernel): k._set_gradient(target) else: X2, slices2 = X2[:,:-1], index_to_slices(X2[:,-1]) - self.kerns[0].update_gradients_full(dL_dK, X, None) - for k, 
slices_k in zip(self.kerns[1:], slices): + kerns[0].update_gradients_full(dL_dK, X, None) + for k, slices_k in zip(kerns[1:], slices): target = np.zeros(k.size) def collate_grads(dL, X, X2): k.update_gradients_full(dL,X,X2) diff --git a/GPy/kern/_src/kern.py b/GPy/kern/_src/kern.py index be8a15b2..dbe4c1f8 100644 --- a/GPy/kern/_src/kern.py +++ b/GPy/kern/_src/kern.py @@ -11,13 +11,12 @@ from ...util.caching import Cache_this class Kern(Parameterized): #=========================================================================== - # This adds input slice support. The rather ugly code for slicing can be + # This adds input slice support. The rather ugly code for slicing can be # found in kernel_slice_operations __metaclass__ = KernCallsViaSlicerMeta #=========================================================================== - _debug=False _support_GPU=False - def __init__(self, input_dim, active_dims, name, useGPU=False,*a, **kw): + def __init__(self, input_dim, active_dims, name, useGPU=False, *a, **kw): """ The base class for a kernel: a positive definite function which forms of a covariance function (kernel). @@ -178,22 +177,6 @@ class Kern(Parameterized): #else: kernels.append(other) return Prod([self, other], name) - def _getstate(self): - """ - Get the current state of the class, - here just all the indices, rest can get recomputed - """ - return super(Kern, self)._getstate() + [ - self.active_dims, - self.input_dim, - self._sliced_X] - - def _setstate(self, state): - self._sliced_X = state.pop() - self.input_dim = state.pop() - self.active_dims = state.pop() - super(Kern, self)._setstate(state) - class CombinationKernel(Kern): """ Abstract super class for combination kernels. @@ -211,9 +194,7 @@ class CombinationKernel(Kern): :param array-like|slice extra_dims: if needed extra dimensions for the combination kernel to work on """ assert all([isinstance(k, Kern) for k in kernels]) - active_dims = reduce(np.union1d, (np.r_[x.active_dims] for x in kernels), np.array([], dtype=int)) - input_dim = active_dims.max()+1 + len(extra_dims) - active_dims = slice(active_dims.max()+1+len(extra_dims)) + input_dim, active_dims = self.get_input_dim_active_dims(kernels, extra_dims) # initialize the kernel with the full input_dim super(CombinationKernel, self).__init__(input_dim, active_dims, name) self.extra_dims = extra_dims @@ -223,6 +204,12 @@ class CombinationKernel(Kern): def parts(self): return self._parameters_ + def get_input_dim_active_dims(self, kernels, extra_dims = None): + active_dims = reduce(np.union1d, (np.r_[x.active_dims] for x in kernels), np.array([], dtype=int)) + input_dim = active_dims.max()+1 + (len(np.r_[extra_dims]) if extra_dims is not None else 0) + active_dims = slice(0, input_dim, 1) + return input_dim, active_dims + def input_sensitivity(self): in_sen = np.zeros((self.num_params, self.input_dim)) for i, p in enumerate(self.parts): diff --git a/GPy/kern/_src/kernel_slice_operations.py b/GPy/kern/_src/kernel_slice_operations.py index 9beb40ab..a4bb8f62 100644 --- a/GPy/kern/_src/kernel_slice_operations.py +++ b/GPy/kern/_src/kernel_slice_operations.py @@ -5,130 +5,135 @@ Created on 11 Mar 2014 ''' from ...core.parameterization.parameterized import ParametersChangedMeta import numpy as np +from functools import wraps + +def put_clean(dct, name, func): + if name in dct: + dct['_clean_{}'.format(name)] = dct[name] + dct[name] = func(dct[name]) class KernCallsViaSlicerMeta(ParametersChangedMeta): - def __call__(self, *args, **kw): - instance = super(ParametersChangedMeta, 
self).__call__(*args, **kw) - instance.K = _slice_wrapper(instance, instance.K) - instance.Kdiag = _slice_wrapper(instance, instance.Kdiag, diag=True) - instance.update_gradients_full = _slice_wrapper(instance, instance.update_gradients_full, diag=False, derivative=True) - instance.update_gradients_diag = _slice_wrapper(instance, instance.update_gradients_diag, diag=True, derivative=True) - instance.gradients_X = _slice_wrapper(instance, instance.gradients_X, diag=False, derivative=True, ret_X=True) - instance.gradients_X_diag = _slice_wrapper(instance, instance.gradients_X_diag, diag=True, derivative=True, ret_X=True) - instance.psi0 = _slice_wrapper(instance, instance.psi0, diag=False, derivative=False) - instance.psi1 = _slice_wrapper(instance, instance.psi1, diag=False, derivative=False) - instance.psi2 = _slice_wrapper(instance, instance.psi2, diag=False, derivative=False) - instance.update_gradients_expectations = _slice_wrapper(instance, instance.update_gradients_expectations, derivative=True, psi_stat=True) - instance.gradients_Z_expectations = _slice_wrapper(instance, instance.gradients_Z_expectations, derivative=True, psi_stat_Z=True, ret_X=True) - instance.gradients_qX_expectations = _slice_wrapper(instance, instance.gradients_qX_expectations, derivative=True, psi_stat=True, ret_X=True) - instance.parameters_changed() - return instance + def __new__(cls, name, bases, dct): + put_clean(dct, 'K', _slice_K) + put_clean(dct, 'Kdiag', _slice_Kdiag) + put_clean(dct, 'update_gradients_full', _slice_update_gradients_full) + put_clean(dct, 'update_gradients_diag', _slice_update_gradients_diag) + put_clean(dct, 'gradients_X', _slice_gradients_X) + put_clean(dct, 'gradients_X_diag', _slice_gradients_X_diag) -def _slice_wrapper(kern, operation, diag=False, derivative=False, psi_stat=False, psi_stat_Z=False, ret_X=False): - """ - This method wraps the functions in kernel to make sure all kernels allways see their respective input dimension. 
- The different switches are: - diag: if X2 exists - derivative: if first arg is dL_dK - psi_stat: if first 3 args are dL_dpsi0..2 - psi_stat_Z: if first 2 args are dL_dpsi1..2 - """ - if derivative: - if diag: - def x_slice_wrapper(dL_dKdiag, X): - ret_X_not_sliced = ret_X and kern._sliced_X == 0 - if ret_X_not_sliced: - ret = np.zeros(X.shape) - X = kern._slice_X(X) if not kern._sliced_X else X - # if the return value is of shape X.shape, we need to make sure to return the right shape - kern._sliced_X += 1 - try: - if ret_X_not_sliced: ret[:, kern.active_dims] = operation(dL_dKdiag, X) - else: ret = operation(dL_dKdiag, X) - except: - raise - finally: - kern._sliced_X -= 1 - return ret - elif psi_stat: - def x_slice_wrapper(dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): - ret_X_not_sliced = ret_X and kern._sliced_X == 0 - if ret_X_not_sliced: - ret1, ret2 = np.zeros(variational_posterior.shape), np.zeros(variational_posterior.shape) - Z, variational_posterior = kern._slice_X(Z) if not kern._sliced_X else Z, kern._slice_X(variational_posterior) if not kern._sliced_X else variational_posterior - kern._sliced_X += 1 - # if the return value is of shape X.shape, we need to make sure to return the right shape - try: - if ret_X_not_sliced: - ret = list(operation(dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior)) - r2 = ret[:2] - ret[0] = ret1 - ret[1] = ret2 - ret[0][:, kern.active_dims] = r2[0] - ret[1][:, kern.active_dims] = r2[1] - del r2 - else: ret = operation(dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior) - except: - raise - finally: - kern._sliced_X -= 1 - return ret - elif psi_stat_Z: - def x_slice_wrapper(dL_dpsi1, dL_dpsi2, Z, variational_posterior): - ret_X_not_sliced = ret_X and kern._sliced_X == 0 - if ret_X_not_sliced: ret = np.zeros(Z.shape) - Z, variational_posterior = kern._slice_X(Z) if not kern._sliced_X else Z, kern._slice_X(variational_posterior) if not kern._sliced_X else variational_posterior - kern._sliced_X += 1 - try: - if ret_X_not_sliced: - ret[:, kern.active_dims] = operation(dL_dpsi1, dL_dpsi2, Z, variational_posterior) - else: ret = operation(dL_dpsi1, dL_dpsi2, Z, variational_posterior) - except: - raise - finally: - kern._sliced_X -= 1 - return ret + put_clean(dct, 'psi0', _slice_psi) + put_clean(dct, 'psi1', _slice_psi) + put_clean(dct, 'psi2', _slice_psi) + put_clean(dct, 'update_gradients_expectations', _slice_update_gradients_expectations) + put_clean(dct, 'gradients_Z_expectations', _slice_gradients_Z_expectations) + put_clean(dct, 'gradients_qX_expectations', _slice_gradients_qX_expectations) + return super(KernCallsViaSlicerMeta, cls).__new__(cls, name, bases, dct) + +class _Slice_wrap(object): + def __init__(self, k, X, X2=None): + self.k = k + self.shape = X.shape + if self.k._sliced_X == 0: + assert X.shape[1] > max(np.r_[self.k.active_dims]), "At least {} dimensional X needed".format(max(np.r_[self.k.active_dims])) + self.X = self.k._slice_X(X) + self.X2 = self.k._slice_X(X2) if X2 is not None else X2 + self.ret = True else: - def x_slice_wrapper(dL_dK, X, X2=None): - ret_X_not_sliced = ret_X and kern._sliced_X == 0 - if ret_X_not_sliced: - ret = np.zeros(X.shape) - X, X2 = kern._slice_X(X) if not kern._sliced_X else X, kern._slice_X(X2) if X2 is not None and not kern._sliced_X else X2 - kern._sliced_X += 1 - try: - if ret_X_not_sliced: ret[:, kern.active_dims] = operation(dL_dK, X, X2) - else: ret = operation(dL_dK, X, X2) - except: - raise - finally: - kern._sliced_X -= 1 - return ret - else: - if diag: - def 
x_slice_wrapper(X, *args, **kw): - X = kern._slice_X(X) if not kern._sliced_X else X - kern._sliced_X += 1 - try: - ret = operation(X, *args, **kw) - except: - raise - finally: - kern._sliced_X -= 1 - return ret - else: - def x_slice_wrapper(X, X2=None, *args, **kw): - X, X2 = kern._slice_X(X) if not kern._sliced_X else X, kern._slice_X(X2) if X2 is not None and not kern._sliced_X else X2 - kern._sliced_X += 1 - try: - ret = operation(X, X2, *args, **kw) - except: raise - finally: - kern._sliced_X -= 1 - return ret - x_slice_wrapper._operation = operation - x_slice_wrapper.__name__ = ("slicer("+str(operation) - +(","+str(bool(diag)) if diag else'') - +(','+str(bool(derivative)) if derivative else '') - +')') - x_slice_wrapper.__doc__ = "**sliced**\n" + (operation.__doc__ or "") - return x_slice_wrapper \ No newline at end of file + self.X = X + self.X2 = X2 + self.ret = False + def __enter__(self): + self.k._sliced_X += 1 + return self + def __exit__(self, *a): + self.k._sliced_X -= 1 + def handle_return_array(self, return_val): + if self.ret: + ret = np.zeros(self.shape) + ret[:, self.k.active_dims] = return_val + return ret + return return_val + +def _slice_K(f): + @wraps(f) + def wrap(self, X, X2 = None, *a, **kw): + with _Slice_wrap(self, X, X2) as s: + ret = f(self, s.X, s.X2, *a, **kw) + return ret + return wrap + +def _slice_Kdiag(f): + @wraps(f) + def wrap(self, X, *a, **kw): + with _Slice_wrap(self, X, None) as s: + ret = f(self, s.X, *a, **kw) + return ret + return wrap + +def _slice_update_gradients_full(f): + @wraps(f) + def wrap(self, dL_dK, X, X2=None): + with _Slice_wrap(self, X, X2) as s: + ret = f(self, dL_dK, s.X, s.X2) + return ret + return wrap + +def _slice_update_gradients_diag(f): + @wraps(f) + def wrap(self, dL_dKdiag, X): + with _Slice_wrap(self, X, None) as s: + ret = f(self, dL_dKdiag, s.X) + return ret + return wrap + +def _slice_gradients_X(f): + @wraps(f) + def wrap(self, dL_dK, X, X2=None): + with _Slice_wrap(self, X, X2) as s: + ret = s.handle_return_array(f(self, dL_dK, s.X, s.X2)) + return ret + return wrap + +def _slice_gradients_X_diag(f): + @wraps(f) + def wrap(self, dL_dKdiag, X): + with _Slice_wrap(self, X, None) as s: + ret = s.handle_return_array(f(self, dL_dKdiag, s.X)) + return ret + return wrap + +def _slice_psi(f): + @wraps(f) + def wrap(self, Z, variational_posterior): + with _Slice_wrap(self, Z, variational_posterior) as s: + ret = f(self, s.X, s.X2) + return ret + return wrap + +def _slice_update_gradients_expectations(f): + @wraps(f) + def wrap(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): + with _Slice_wrap(self, Z, variational_posterior) as s: + ret = f(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, s.X, s.X2) + return ret + return wrap + +def _slice_gradients_Z_expectations(f): + @wraps(f) + def wrap(self, dL_dpsi1, dL_dpsi2, Z, variational_posterior): + with _Slice_wrap(self, Z, variational_posterior) as s: + ret = s.handle_return_array(f(self, dL_dpsi1, dL_dpsi2, s.X, s.X2)) + return ret + return wrap + +def _slice_gradients_qX_expectations(f): + @wraps(f) + def wrap(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): + with _Slice_wrap(self, variational_posterior, Z) as s: + ret = list(f(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, s.X2, s.X)) + r2 = ret[:2] + ret[0] = s.handle_return_array(r2[0]) + ret[1] = s.handle_return_array(r2[1]) + del r2 + return ret + return wrap diff --git a/GPy/kern/_src/linear.py b/GPy/kern/_src/linear.py index 7d9eeac2..f9dacf02 100644 --- a/GPy/kern/_src/linear.py +++ b/GPy/kern/_src/linear.py @@ 
-121,7 +121,7 @@ class Linear(Kern): gamma = variational_posterior.binary_prob mu = variational_posterior.mean return np.einsum('nq,q,mq,nq->nm',gamma,self.variances,Z,mu) -# return (self.variances*gamma*mu).sum(axis=1) +# return (self.variances*gamma*mu).sum(axis=1) else: return self.K(variational_posterior.mean, Z) #the variance, it does nothing @@ -177,7 +177,7 @@ class Linear(Kern): grad = np.einsum('nm,nq,q,nq->mq',dL_dpsi1,gamma, self.variances,mu) +\ np.einsum('nmo,noq->mq',dL_dpsi2,_dpsi2_dZ) - + return grad else: #psi1 @@ -191,15 +191,15 @@ class Linear(Kern): gamma = variational_posterior.binary_prob mu = variational_posterior.mean S = variational_posterior.variance - mu2S = np.square(mu)+S + mu2S = np.square(mu)+S _, _dpsi2_dgamma, _dpsi2_dmu, _dpsi2_dS, _ = linear_psi_comp._psi2computations(self.variances, Z, mu, S, gamma) - + grad_gamma = np.einsum('n,q,nq->nq',dL_dpsi0,self.variances,mu2S) + np.einsum('nm,q,mq,nq->nq',dL_dpsi1,self.variances,Z,mu) +\ np.einsum('nmo,nmoq->nq',dL_dpsi2,_dpsi2_dgamma) grad_mu = np.einsum('n,nq,q,nq->nq',dL_dpsi0,gamma,2.*self.variances,mu) + np.einsum('nm,nq,q,mq->nq',dL_dpsi1,gamma,self.variances,Z) +\ np.einsum('nmo,nmoq->nq',dL_dpsi2,_dpsi2_dmu) grad_S = np.einsum('n,nq,q->nq',dL_dpsi0,gamma,self.variances) + np.einsum('nmo,nmoq->nq',dL_dpsi2,_dpsi2_dS) - + return grad_mu, grad_S, grad_gamma else: grad_mu, grad_S = np.zeros(variational_posterior.mean.shape), np.zeros(variational_posterior.mean.shape) @@ -210,7 +210,7 @@ class Linear(Kern): grad_mu += (dL_dpsi1[:, :, None] * (Z * self.variances)).sum(1) # psi2 self._weave_dpsi2_dmuS(dL_dpsi2, Z, variational_posterior, grad_mu, grad_S) - + return grad_mu, grad_S #--------------------------------------------------# @@ -313,3 +313,47 @@ class Linear(Kern): def input_sensitivity(self): return np.ones(self.input_dim) * self.variances + +class LinearFull(Kern): + def __init__(self, input_dim, rank, W=None, kappa=None, active_dims=None, name='linear_full'): + super(LinearFull, self).__init__(input_dim, active_dims, name) + if W is None: + W = np.ones((input_dim, rank)) + if kappa is None: + kappa = np.ones(input_dim) + assert W.shape == (input_dim, rank) + assert kappa.shape == (input_dim,) + + self.W = Param('W', W) + self.kappa = Param('kappa', kappa, Logexp()) + self.add_parameters(self.W, self.kappa) + + def K(self, X, X2=None): + P = np.dot(self.W, self.W.T) + np.diag(self.kappa) + return np.einsum('ij,jk,lk->il', X, P, X if X2 is None else X2) + + def update_gradients_full(self, dL_dK, X, X2=None): + self.kappa.gradient = np.einsum('ij,ik,kj->j', X, dL_dK, X if X2 is None else X2) + self.W.gradient = np.einsum('ij,kl,ik,lm->jm', X, X if X2 is None else X2, dL_dK, self.W) + self.W.gradient += np.einsum('ij,kl,ik,jm->lm', X, X if X2 is None else X2, dL_dK, self.W) + + def Kdiag(self, X): + P = np.dot(self.W, self.W.T) + np.diag(self.kappa) + return np.einsum('ij,jk,ik->i', X, P, X) + + def update_gradients_diag(self, dL_dKdiag, X): + self.kappa.gradient = np.einsum('ij,i->j', np.square(X), dL_dKdiag) + self.W.gradient = 2.*np.einsum('ij,ik,jl,i->kl', X, X, self.W, dL_dKdiag) + + def gradients_X(self, dL_dK, X, X2=None): + P = np.dot(self.W, self.W.T) + np.diag(self.kappa) + if X2 is None: + return 2.*np.einsum('ij,jk,kl->il', dL_dK, X, P) + else: + return np.einsum('ij,jk,kl->il', dL_dK, X2, P) + + def gradients_X_diag(self, dL_dKdiag, X): + P = np.dot(self.W, self.W.T) + np.diag(self.kappa) + return 2.*np.einsum('jk,i,ij->ik', P, dL_dKdiag, X) + + diff --git a/GPy/kern/_src/prod.py 
b/GPy/kern/_src/prod.py index e00f38c3..b8f92f27 100644 --- a/GPy/kern/_src/prod.py +++ b/GPy/kern/_src/prod.py @@ -23,7 +23,6 @@ class Prod(CombinationKernel): @Cache_this(limit=2, force_kwargs=['which_parts']) def K(self, X, X2=None, which_parts=None): - assert X.shape[1] == self.input_dim if which_parts is None: which_parts = self.parts elif not isinstance(which_parts, (list, tuple)): @@ -33,7 +32,6 @@ class Prod(CombinationKernel): @Cache_this(limit=2, force_kwargs=['which_parts']) def Kdiag(self, X, which_parts=None): - assert X.shape[1] == self.input_dim if which_parts is None: which_parts = self.parts return reduce(np.multiply, (p.Kdiag(X) for p in which_parts)) @@ -58,8 +56,6 @@ class Prod(CombinationKernel): def gradients_X_diag(self, dL_dKdiag, X): target = np.zeros(X.shape) for k1,k2 in itertools.combinations(self.parts, 2): - target += k1.gradients_X(dL_dKdiag*k2.Kdiag(X), X) - target += k2.gradients_X(dL_dKdiag*k1.Kdiag(X), X) + target += k1.gradients_X_diag(dL_dKdiag*k2.Kdiag(X), X) + target += k2.gradients_X_diag(dL_dKdiag*k1.Kdiag(X), X) return target - - diff --git a/GPy/kern/_src/rbf.py b/GPy/kern/_src/rbf.py index 2534ad9b..e08d94f9 100644 --- a/GPy/kern/_src/rbf.py +++ b/GPy/kern/_src/rbf.py @@ -74,9 +74,6 @@ class RBF(Stationary): # Spike-and-Slab GPLVM if isinstance(variational_posterior, variational.SpikeAndSlabPosterior): if self.useGPU: -# dL_dpsi0_gpu = gpuarray.to_gpu(np.asfortranarray(dL_dpsi0)) -# dL_dpsi1_gpu = gpuarray.to_gpu(np.asfortranarray(dL_dpsi1)) -# dL_dpsi2_gpu = gpuarray.to_gpu(np.asfortranarray(dL_dpsi2)) self.psicomp.update_gradients_expectations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior) else: @@ -99,7 +96,7 @@ class RBF(Stationary): self.lengthscale.gradient += (dL_dpsi2[:,:,:,None] * _dpsi2_dlengthscale).reshape(-1,self.input_dim).sum(axis=0) else: self.lengthscale.gradient += (dL_dpsi2[:,:,:,None] * _dpsi2_dlengthscale).sum() - + elif isinstance(variational_posterior, variational.NormalPosterior): l2 = self.lengthscale**2 if l2.size != self.input_dim: @@ -107,8 +104,6 @@ class RBF(Stationary): #contributions from psi0: self.variance.gradient = np.sum(dL_dpsi0) - if self._debug: - num_grad = self.lengthscale.gradient.copy() self.lengthscale.gradient = 0. 
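For reference, the psi1 and psi2 contributions below are gradients of the RBF kernel expectations under the variational posterior. A minimal NumPy sketch of the standard ARD-RBF psi0/psi1 statistics for q(X) = N(mu, S), as in e.g. Titsias & Lawrence (2010), is shown here; the function name and signature are illustrative, not GPy's internal psicomp API:

import numpy as np

def rbf_psi01(variance, lengthscale, Z, mu, S):
    # mu, S: (N, Q) variational means/variances; Z: (M, Q) inducing inputs
    l2 = lengthscale ** 2
    psi0 = variance * np.ones(mu.shape[0])                   # E_q[k(x_n, x_n)]
    dist = (mu[:, None, :] - Z[None, :, :]) ** 2             # (N, M, Q)
    denom = S[:, None, :] + l2[None, None, :]                # lengthscale^2 + S_nq
    scale = np.sqrt(np.prod(1. + S / l2, axis=-1))[:, None]  # (N, 1)
    psi1 = variance * np.exp(-0.5 * np.sum(dist / denom, axis=-1)) / scale
    return psi0, psi1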
        #from psi1
@@ -128,8 +123,6 @@ class RBF(Stationary):
         else:
             self.lengthscale.gradient += self._weave_psi2_lengthscale_grads(dL_dpsi2, psi2, Zdist_sq, S, mudist_sq, l2)

-        if self._debug:
-            import ipdb;ipdb.set_trace()
         self.variance.gradient += 2.*np.sum(dL_dpsi2 * psi2)/self.variance

         else:
@@ -139,8 +132,6 @@ class RBF(Stationary):
         # Spike-and-Slab GPLVM
         if isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
             if self.useGPU:
-#                dL_dpsi1_gpu = gpuarray.to_gpu(np.asfortranarray(dL_dpsi1))
-#                dL_dpsi2_gpu = gpuarray.to_gpu(np.asfortranarray(dL_dpsi2))
                 return self.psicomp.gradients_Z_expectations(dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior)
             else:
                 _, _, _, _, _, _dpsi1_dZ, _ = ssrbf_psi_comp._psi1computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
@@ -177,8 +168,6 @@ class RBF(Stationary):
         # Spike-and-Slab GPLVM
         if isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
             if self.useGPU:
-#                dL_dpsi1_gpu = gpuarray.to_gpu(np.asfortranarray(dL_dpsi1))
-#                dL_dpsi2_gpu = gpuarray.to_gpu(np.asfortranarray(dL_dpsi2))
                 return self.psicomp.gradients_qX_expectations(dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior)
             else:
                 ndata = variational_posterior.mean.shape[0]
diff --git a/GPy/kern/_src/sympykern.py b/GPy/kern/_src/symbolic.py
similarity index 56%
rename from GPy/kern/_src/sympykern.py
rename to GPy/kern/_src/symbolic.py
index 6f066e98..c7bbae73 100644
--- a/GPy/kern/_src/sympykern.py
+++ b/GPy/kern/_src/symbolic.py
@@ -11,9 +11,9 @@ from kern import Kern
 from ...core.parameterization import Param
 from ...core.parameterization.transformations import Logexp

-class Sympykern(Kern):
+class Symbolic(Kern):
     """
-    A kernel object, where all the hard work in done by sympy.
+    A kernel object, where all the hard work is done by sympy.

     :param k: the covariance function
     :type k: a positive definite sympy function of x_0, z_0, x_1, z_1, x_2, z_2...
@@ -26,10 +26,8 @@ class Sympykern(Kern):
     - to handle multiple inputs, call them x_1, z_1, etc
     - to handle multiple correlated outputs, you'll need to add parameters with an index, such as lengthscale_i and lengthscale_j.
     """
-    def __init__(self, input_dim, k=None, output_dim=1, name=None, param=None, active_dims=None):
+    def __init__(self, input_dim, k=None, output_dim=1, name='symbolic', param=None, active_dims=None, operators=None):

-        if name is None:
-            name='sympykern'
         if k is None:
             raise ValueError, "You must provide an argument for the covariance function."
-        super(Sympykern, self).__init__(input_dim, active_dims, name)
+        super(Symbolic, self).__init__(input_dim, active_dims, name)
@@ -60,7 +58,6 @@ class Sympykern(Kern):
         # extract parameter names from the covariance
         thetas = sorted([e for e in sp_vars if not (e.name[0:2]=='x_' or e.name[0:2]=='z_')],key=lambda e:e.name)
-
         # Look for parameters with index (subscripts), they are associated with different outputs.
         if self.output_dim>1:
             self._sp_theta_i = sorted([e for e in thetas if (e.name[-2:]=='_i')], key=lambda e:e.name)
@@ -97,8 +94,8 @@ class Sympykern(Kern):
                 val = 1.0
                 # TODO: what if user has passed a parameter vector, how should that be stored and interpreted? This is the old way before params class.
if param is not None: - if param.has_key(theta): - val = param[theta] + if param.has_key(theta.name): + val = param[theta.name] setattr(self, theta.name, Param(theta.name, val, None)) self.add_parameters(getattr(self, theta.name)) @@ -117,6 +114,12 @@ class Sympykern(Kern): self.arg_list += self._sp_theta_i + self._sp_theta_j self.diag_arg_list += self._sp_theta_i + # Check if there are additional linear operators on the covariance. + self._sp_operators = operators + # TODO: Deal with linear operators + #if self._sp_operators: + # for operator in self._sp_operators: + # psi_stats aren't yet implemented. if False: self.compute_psi_stats() @@ -254,3 +257,176 @@ class Sympykern(Kern): self._reverse_arguments[theta_i.name] = self._arguments[theta_j.name].T self._reverse_arguments[theta_j.name] = self._arguments[theta_i.name].T +if False: + class Symcombine(CombinationKernel): + """ + Combine list of given sympy covariances together with the provided operations. + """ + def __init__(self, subkerns, operations, name='sympy_combine'): + super(Symcombine, self).__init__(subkerns, name) + for subkern, operation in zip(subkerns, operations): + self._sp_k += self._k_double_operate(subkern._sp_k, operation) + + #def _double_operate(self, k, operation): + + + @Cache_this(limit=2, force_kwargs=['which_parts']) + def K(self, X, X2=None, which_parts=None): + """ + Combine covariances with a linear operator. + """ + assert X.shape[1] == self.input_dim + if which_parts is None: + which_parts = self.parts + elif not isinstance(which_parts, (list, tuple)): + # if only one part is given + which_parts = [which_parts] + return reduce(np.add, (p.K(X, X2) for p in which_parts)) + + @Cache_this(limit=2, force_kwargs=['which_parts']) + def Kdiag(self, X, which_parts=None): + assert X.shape[1] == self.input_dim + if which_parts is None: + which_parts = self.parts + elif not isinstance(which_parts, (list, tuple)): + # if only one part is given + which_parts = [which_parts] + return reduce(np.add, (p.Kdiag(X) for p in which_parts)) + + def update_gradients_full(self, dL_dK, X, X2=None): + [p.update_gradients_full(dL_dK, X, X2) for p in self.parts] + + def update_gradients_diag(self, dL_dK, X): + [p.update_gradients_diag(dL_dK, X) for p in self.parts] + + def gradients_X(self, dL_dK, X, X2=None): + """Compute the gradient of the objective function with respect to X. + + :param dL_dK: An array of gradients of the objective function with respect to the covariance function. 
+            :type dL_dK: np.ndarray (num_samples x num_inducing)
+            :param X: Observed data inputs
+            :type X: np.ndarray (num_samples x input_dim)
+            :param X2: Observed data inputs (optional, defaults to X)
+            :type X2: np.ndarray (num_inducing x input_dim)"""
+
+            target = np.zeros(X.shape)
+            [target.__iadd__(p.gradients_X(dL_dK, X, X2)) for p in self.parts]
+            return target
+
+        def gradients_X_diag(self, dL_dKdiag, X):
+            target = np.zeros(X.shape)
+            [target.__iadd__(p.gradients_X_diag(dL_dKdiag, X)) for p in self.parts]
+            return target
+
+        def psi0(self, Z, variational_posterior):
+            return reduce(np.add, (p.psi0(Z, variational_posterior) for p in self.parts))
+
+        def psi1(self, Z, variational_posterior):
+            return reduce(np.add, (p.psi1(Z, variational_posterior) for p in self.parts))
+
+        def psi2(self, Z, variational_posterior):
+            psi2 = reduce(np.add, (p.psi2(Z, variational_posterior) for p in self.parts))
+            #return psi2
+            # compute the "cross" terms
+            from static import White, Bias
+            from rbf import RBF
+            #from rbf_inv import RBFInv
+            from linear import Linear
+            #from fixed import Fixed
+
+            for p1, p2 in itertools.combinations(self.parts, 2):
+                # i1, i2 = p1.active_dims, p2.active_dims
+                # white doesn't combine with anything
+                if isinstance(p1, White) or isinstance(p2, White):
+                    pass
+                # rbf X bias
+                #elif isinstance(p1, (Bias, Fixed)) and isinstance(p2, (RBF, RBFInv)):
+                elif isinstance(p1, Bias) and isinstance(p2, (RBF, Linear)):
+                    tmp = p2.psi1(Z, variational_posterior)
+                    psi2 += p1.variance * (tmp[:, :, None] + tmp[:, None, :])
+                #elif isinstance(p2, (Bias, Fixed)) and isinstance(p1, (RBF, RBFInv)):
+                elif isinstance(p2, Bias) and isinstance(p1, (RBF, Linear)):
+                    tmp = p1.psi1(Z, variational_posterior)
+                    psi2 += p2.variance * (tmp[:, :, None] + tmp[:, None, :])
+                elif isinstance(p2, (RBF, Linear)) and isinstance(p1, (RBF, Linear)):
+                    assert np.intersect1d(p1.active_dims, p2.active_dims).size == 0, "only non overlapping kernel dimensions allowed so far"
+                    tmp1 = p1.psi1(Z, variational_posterior)
+                    tmp2 = p2.psi1(Z, variational_posterior)
+                    psi2 += (tmp1[:, :, None] * tmp2[:, None, :]) + (tmp2[:, :, None] * tmp1[:, None, :])
+                else:
+                    raise NotImplementedError, "psi2 cannot be computed for this kernel"
+            return psi2
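For reference, psi2 of a sum kernel is not just the sum of the parts' psi2 statistics: the expectation of the product picks up cross terms, which is what the branches above assemble. A minimal sketch of the symmetrised cross term for two parts on non-overlapping input dimensions, with psi1_a and psi1_b standing for the (N x M) arrays E_q[k_a(x_n, z_m)] and E_q[k_b(x_n, z_m)]:

import numpy as np

def psi2_cross(psi1_a, psi1_b):
    # (N, M, M) cross contribution: psi1_a[n, m] * psi1_b[n, m'] symmetrised in (m, m')
    return psi1_a[:, :, None] * psi1_b[:, None, :] + psi1_b[:, :, None] * psi1_a[:, None, :]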
+
+        def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
+            from static import White, Bias
+            for p1 in self.parts:
+                #compute the effective dL_dpsi1. Extra terms appear because of the cross terms in psi2!
+                eff_dL_dpsi1 = dL_dpsi1.copy()
+                for p2 in self.parts:
+                    if p2 is p1:
+                        continue
+                    if isinstance(p2, White):
+                        continue
+                    elif isinstance(p2, Bias):
+                        eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.variance * 2.
+                    else:# np.setdiff1d(p1.active_dims, ar2, assume_unique): # TODO: Careful, not correct for overlapping active_dims
+                        eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.psi1(Z, variational_posterior) * 2.
+                p1.update_gradients_expectations(dL_dpsi0, eff_dL_dpsi1, dL_dpsi2, Z, variational_posterior)
+
+        def gradients_Z_expectations(self, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
+            from static import White, Bias
+            target = np.zeros(Z.shape)
+            for p1 in self.parts:
+                #compute the effective dL_dpsi1. extra terms appear because of the cross terms in psi2!
+                eff_dL_dpsi1 = dL_dpsi1.copy()
+                for p2 in self.parts:
+                    if p2 is p1:
+                        continue
+                    if isinstance(p2, White):
+                        continue
+                    elif isinstance(p2, Bias):
+                        eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.variance * 2.
+                    else:
+                        eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.psi1(Z, variational_posterior) * 2.
+                target += p1.gradients_Z_expectations(eff_dL_dpsi1, dL_dpsi2, Z, variational_posterior)
+            return target
+
+        def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
+            from static import White, Bias
+            target_mu = np.zeros(variational_posterior.shape)
+            target_S = np.zeros(variational_posterior.shape)
+            for p1 in self._parameters_:
+                #compute the effective dL_dpsi1. extra terms appear because of the cross terms in psi2!
+                eff_dL_dpsi1 = dL_dpsi1.copy()
+                for p2 in self._parameters_:
+                    if p2 is p1:
+                        continue
+                    if isinstance(p2, White):
+                        continue
+                    elif isinstance(p2, Bias):
+                        eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.variance * 2.
+                    else:
+                        eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.psi1(Z, variational_posterior) * 2.
+                a, b = p1.gradients_qX_expectations(dL_dpsi0, eff_dL_dpsi1, dL_dpsi2, Z, variational_posterior)
+                target_mu += a
+                target_S += b
+            return target_mu, target_S
+
+        def _getstate(self):
+            """
+            Get the current state of the class,
+            here just all the indices, rest can get recomputed
+            """
+            return super(Add, self)._getstate()
+
+        def _setstate(self, state):
+            super(Add, self)._setstate(state)
+
+        def add(self, other, name='sum'):
+            if isinstance(other, Add):
+                other_params = other._parameters_.copy()
+                for p in other_params:
+                    other.remove_parameter(p)
+                self.add_parameters(*other_params)
+            else: self.add_parameter(other)
+            return self
diff --git a/GPy/likelihoods/.DS_Store b/GPy/likelihoods/.DS_Store
deleted file mode 100644
index 8228ae90..00000000
Binary files a/GPy/likelihoods/.DS_Store and /dev/null differ
diff --git a/GPy/likelihoods/__init__.py b/GPy/likelihoods/__init__.py
index 28e44541..d7ad5753 100644
--- a/GPy/likelihoods/__init__.py
+++ b/GPy/likelihoods/__init__.py
@@ -6,3 +6,17 @@ from poisson import Poisson
 from student_t import StudentT
 from likelihood import Likelihood
 from mixed_noise import MixedNoise
+# TODO need to fix this in a config file.
+try:
+    import sympy as sym
+    sympy_available=True
+except ImportError:
+    sympy_available=False
+if sympy_available:
+    # These are likelihoods that rely on symbolic.
+    from symbolic import Symbolic
+    #from sstudent_t import SstudentT
+    from negative_binomial import Negative_binomial
+    #from skew_normal import Skew_normal
+    #from skew_exponential import Skew_exponential
+    #from null_category import Null_category
diff --git a/GPy/likelihoods/bernoulli.py b/GPy/likelihoods/bernoulli.py
index 371fbe63..7b867954 100644
--- a/GPy/likelihoods/bernoulli.py
+++ b/GPy/likelihoods/bernoulli.py
@@ -15,7 +15,7 @@ class Bernoulli(Likelihood):
     p(y_{i}|\\lambda(f_{i})) = \\lambda(f_{i})^{y_{i}}(1-\\lambda(f_{i}))^{1-y_{i}}

     .. Note::
-        Y is expected to take values in {-1, 1} TODO: {0, 1}??
+        Y takes values in either {-1, 1} or {0, 1}.
         link function should have the domain [0, 1], e.g. probit (default) or Heaviside

     .. See also::
@@ -54,10 +54,10 @@ class Bernoulli(Likelihood):
         """
         if Y_i == 1:
             sign = 1.
-        elif Y_i == 0:
+        elif Y_i == 0 or Y_i == -1:
             sign = -1
         else:
-            raise ValueError("bad value for Bernouilli observation (0, 1)")
+            raise ValueError("bad value for Bernoulli observation; must be in {-1, 1} or {0, 1}")
         if isinstance(self.gp_link, link_functions.Probit):
             z = sign*v_i/np.sqrt(tau_i**2 + tau_i)
             Z_hat = std_norm_cdf(z)
@@ -95,15 +95,15 @@ class Bernoulli(Likelihood):
         else:
             return np.nan

-    def pdf_link(self, link_f, y, Y_metadata=None):
+    def pdf_link(self, inv_link_f, y, Y_metadata=None):
         """
-        Likelihood function given link(f)
+        Likelihood function given inverse link of f.

         ..
math:: p(y_{i}|\\lambda(f_{i})) = \\lambda(f_{i})^{y_{i}}(1-f_{i})^{1-y_{i}} - :param link_f: latent variables link(f) - :type link_f: Nx1 array + :param inv_link_f: latent variables inverse link of f. + :type inv_link_f: Nx1 array :param y: data :type y: Nx1 array :param Y_metadata: Y_metadata not used in bernoulli @@ -113,102 +113,106 @@ class Bernoulli(Likelihood): .. Note: Each y_i must be in {0, 1} """ - assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape - #objective = (link_f**y) * ((1.-link_f)**(1.-y)) - objective = np.where(y, link_f, 1.-link_f) + assert np.atleast_1d(inv_link_f).shape == np.atleast_1d(y).shape + #objective = (inv_link_f**y) * ((1.-inv_link_f)**(1.-y)) + objective = np.where(y, inv_link_f, 1.-inv_link_f) return np.exp(np.sum(np.log(objective))) - def logpdf_link(self, link_f, y, Y_metadata=None): + def logpdf_link(self, inv_link_f, y, Y_metadata=None): """ - Log Likelihood function given link(f) + Log Likelihood function given inverse link of f. .. math:: \\ln p(y_{i}|\\lambda(f_{i})) = y_{i}\\log\\lambda(f_{i}) + (1-y_{i})\\log (1-f_{i}) - :param link_f: latent variables link(f) - :type link_f: Nx1 array + :param inv_link_f: latent variables inverse link of f. + :type inv_link_f: Nx1 array :param y: data :type y: Nx1 array :param Y_metadata: Y_metadata not used in bernoulli - :returns: log likelihood evaluated at points link(f) + :returns: log likelihood evaluated at points inverse link of f. :rtype: float """ - assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape - #objective = y*np.log(link_f) + (1.-y)*np.log(link_f) + assert np.atleast_1d(inv_link_f).shape == np.atleast_1d(y).shape + #objective = y*np.log(inv_link_f) + (1.-y)*np.log(inv_link_f) state = np.seterr(divide='ignore') - objective = np.where(y==1, np.log(link_f), np.log(1-link_f)) + # TODO check y \in {0, 1} or {-1, 1} + objective = np.where(y==1, np.log(inv_link_f), np.log(1-inv_link_f)) np.seterr(**state) return np.sum(objective) - def dlogpdf_dlink(self, link_f, y, Y_metadata=None): + def dlogpdf_dlink(self, inv_link_f, y, Y_metadata=None): """ - Gradient of the pdf at y, given link(f) w.r.t link(f) + Gradient of the pdf at y, given inverse link of f w.r.t inverse link of f. .. math:: \\frac{d\\ln p(y_{i}|\\lambda(f_{i}))}{d\\lambda(f)} = \\frac{y_{i}}{\\lambda(f_{i})} - \\frac{(1 - y_{i})}{(1 - \\lambda(f_{i}))} - :param link_f: latent variables link(f) - :type link_f: Nx1 array + :param inv_link_f: latent variables inverse link of f. + :type inv_link_f: Nx1 array :param y: data :type y: Nx1 array :param Y_metadata: Y_metadata not used in bernoulli - :returns: gradient of log likelihood evaluated at points link(f) + :returns: gradient of log likelihood evaluated at points inverse link of f. :rtype: Nx1 array """ - assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape - #grad = (y/link_f) - (1.-y)/(1-link_f) + assert np.atleast_1d(inv_link_f).shape == np.atleast_1d(y).shape + #grad = (y/inv_link_f) - (1.-y)/(1-inv_link_f) state = np.seterr(divide='ignore') - grad = np.where(y, 1./link_f, -1./(1-link_f)) + # TODO check y \in {0, 1} or {-1, 1} + grad = np.where(y, 1./inv_link_f, -1./(1-inv_link_f)) np.seterr(**state) return grad - def d2logpdf_dlink2(self, link_f, y, Y_metadata=None): + def d2logpdf_dlink2(self, inv_link_f, y, Y_metadata=None): """ - Hessian at y, given link_f, w.r.t link_f the hessian will be 0 unless i == j - i.e. 
second derivative logpdf at y given link(f_i) link(f_j) w.r.t link(f_i) and link(f_j) + Hessian at y, given inv_link_f, w.r.t inv_link_f the hessian will be 0 unless i == j + i.e. second derivative logpdf at y given inverse link of f_i and inverse link of f_j w.r.t inverse link of f_i and inverse link of f_j. .. math:: \\frac{d^{2}\\ln p(y_{i}|\\lambda(f_{i}))}{d\\lambda(f)^{2}} = \\frac{-y_{i}}{\\lambda(f)^{2}} - \\frac{(1-y_{i})}{(1-\\lambda(f))^{2}} - :param link_f: latent variables link(f) - :type link_f: Nx1 array + :param inv_link_f: latent variables inverse link of f. + :type inv_link_f: Nx1 array :param y: data :type y: Nx1 array :param Y_metadata: Y_metadata not used in bernoulli - :returns: Diagonal of log hessian matrix (second derivative of log likelihood evaluated at points link(f)) + :returns: Diagonal of log hessian matrix (second derivative of log likelihood evaluated at points inverse link of f. :rtype: Nx1 array .. Note:: Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases - (the distribution for y_i depends only on link(f_i) not on link(f_(j!=i)) + (the distribution for y_i depends only on inverse link of f_i not on inverse link of f_(j!=i) """ - assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape - #d2logpdf_dlink2 = -y/(link_f**2) - (1-y)/((1-link_f)**2) + assert np.atleast_1d(inv_link_f).shape == np.atleast_1d(y).shape + #d2logpdf_dlink2 = -y/(inv_link_f**2) - (1-y)/((1-inv_link_f)**2) state = np.seterr(divide='ignore') - d2logpdf_dlink2 = np.where(y, -1./np.square(link_f), -1./np.square(1.-link_f)) + # TODO check y \in {0, 1} or {-1, 1} + d2logpdf_dlink2 = np.where(y, -1./np.square(inv_link_f), -1./np.square(1.-inv_link_f)) np.seterr(**state) return d2logpdf_dlink2 - def d3logpdf_dlink3(self, link_f, y, Y_metadata=None): + def d3logpdf_dlink3(self, inv_link_f, y, Y_metadata=None): """ - Third order derivative log-likelihood function at y given link(f) w.r.t link(f) + Third order derivative log-likelihood function at y given inverse link of f w.r.t inverse link of f .. math:: \\frac{d^{3} \\ln p(y_{i}|\\lambda(f_{i}))}{d^{3}\\lambda(f)} = \\frac{2y_{i}}{\\lambda(f)^{3}} - \\frac{2(1-y_{i}}{(1-\\lambda(f))^{3}} - :param link_f: latent variables link(f) - :type link_f: Nx1 array + :param inv_link_f: latent variables passed through inverse link of f. + :type inv_link_f: Nx1 array :param y: data :type y: Nx1 array :param Y_metadata: Y_metadata not used in bernoulli - :returns: third derivative of log likelihood evaluated at points link(f) + :returns: third derivative of log likelihood evaluated at points inverse_link(f) :rtype: Nx1 array """ - assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape - #d3logpdf_dlink3 = 2*(y/(link_f**3) - (1-y)/((1-link_f)**3)) + assert np.atleast_1d(inv_link_f).shape == np.atleast_1d(y).shape + #d3logpdf_dlink3 = 2*(y/(inv_link_f**3) - (1-y)/((1-inv_link_f)**3)) state = np.seterr(divide='ignore') - d3logpdf_dlink3 = np.where(y, 2./(link_f**3), -2./((1.-link_f)**3)) + # TODO check y \in {0, 1} or {-1, 1} + d3logpdf_dlink3 = np.where(y, 2./(inv_link_f**3), -2./((1.-inv_link_f)**3)) np.seterr(**state) return d3logpdf_dlink3 diff --git a/GPy/likelihoods/likelihood.py b/GPy/likelihoods/likelihood.py index aabe93ef..33b51536 100644 --- a/GPy/likelihoods/likelihood.py +++ b/GPy/likelihoods/likelihood.py @@ -16,20 +16,20 @@ class Likelihood(Parameterized): Likelihood base class, used to defing p(y|f). All instances use _inverse_ link functions, which can be swapped out. 
It is - expected that inherriting classes define a default inverse link function + expected that inheriting classes define a default inverse link function - To use this class, inherrit and define missing functionality. + To use this class, inherit and define missing functionality. - Inherriting classes *must* implement: + Inheriting classes *must* implement: pdf_link : a bound method which turns the output of the link function into the pdf logpdf_link : the logarithm of the above - To enable use with EP, inherriting classes *must* define: + To enable use with EP, inheriting classes *must* define: TODO: a suitable derivative function for any parameters of the class It is also desirable to define: moments_match_ep : a function to compute the EP moments If this isn't defined, the moments will be computed using 1D quadrature. - To enable use with Laplace approximation, inherriting classes *must* define: + To enable use with Laplace approximation, inheriting classes *must* define: Some derivative functions *AS TODO* For exact Gaussian inference, define *JH TODO* @@ -159,7 +159,7 @@ class Likelihood(Parameterized): def predictive_variance(self, mu,variance, predictive_mean=None, Y_metadata=None): """ - Numerical approximation to the predictive variance: V(Y_star) + Approximation to the predictive variance: V(Y_star) The following variance decomposition is used: V(Y_star) = E( V(Y_star|f_star) ) + V( E(Y_star|f_star) ) @@ -208,28 +208,28 @@ class Likelihood(Parameterized): # V(Y_star) = E[ V(Y_star|f_star) ] + E(Y_star**2|f_star) - E[Y_star|f_star]**2 return exp_var + var_exp - def pdf_link(self, link_f, y, Y_metadata=None): + def pdf_link(self, inv_link_f, y, Y_metadata=None): raise NotImplementedError - def logpdf_link(self, link_f, y, Y_metadata=None): + def logpdf_link(self, inv_link_f, y, Y_metadata=None): raise NotImplementedError - def dlogpdf_dlink(self, link_f, y, Y_metadata=None): + def dlogpdf_dlink(self, inv_link_f, y, Y_metadata=None): raise NotImplementedError - def d2logpdf_dlink2(self, link_f, y, Y_metadata=None): + def d2logpdf_dlink2(self, inv_link_f, y, Y_metadata=None): raise NotImplementedError - def d3logpdf_dlink3(self, link_f, y, Y_metadata=None): + def d3logpdf_dlink3(self, inv_link_f, y, Y_metadata=None): raise NotImplementedError - def dlogpdf_link_dtheta(self, link_f, y, Y_metadata=None): + def dlogpdf_link_dtheta(self, inv_link_f, y, Y_metadata=None): raise NotImplementedError - def dlogpdf_dlink_dtheta(self, link_f, y, Y_metadata=None): + def dlogpdf_dlink_dtheta(self, inv_link_f, y, Y_metadata=None): raise NotImplementedError - def d2logpdf_dlink2_dtheta(self, link_f, y, Y_metadata=None): + def d2logpdf_dlink2_dtheta(self, inv_link_f, y, Y_metadata=None): raise NotImplementedError def pdf(self, f, y, Y_metadata=None): @@ -247,8 +247,8 @@ class Likelihood(Parameterized): :returns: likelihood evaluated for this point :rtype: float """ - link_f = self.gp_link.transf(f) - return self.pdf_link(link_f, y, Y_metadata=Y_metadata) + inv_link_f = self.gp_link.transf(f) + return self.pdf_link(inv_link_f, y, Y_metadata=Y_metadata) def logpdf(self, f, y, Y_metadata=None): """ @@ -265,8 +265,8 @@ class Likelihood(Parameterized): :returns: log likelihood evaluated for this point :rtype: float """ - link_f = self.gp_link.transf(f) - return self.logpdf_link(link_f, y, Y_metadata=Y_metadata) + inv_link_f = self.gp_link.transf(f) + return self.logpdf_link(inv_link_f, y, Y_metadata=Y_metadata) def dlogpdf_df(self, f, y, Y_metadata=None): """ @@ -284,8 +284,8 @@ class 
Likelihood(Parameterized): :returns: derivative of log likelihood evaluated for this point :rtype: 1xN array """ - link_f = self.gp_link.transf(f) - dlogpdf_dlink = self.dlogpdf_dlink(link_f, y, Y_metadata=Y_metadata) + inv_link_f = self.gp_link.transf(f) + dlogpdf_dlink = self.dlogpdf_dlink(inv_link_f, y, Y_metadata=Y_metadata) dlink_df = self.gp_link.dtransf_df(f) return chain_1(dlogpdf_dlink, dlink_df) @@ -305,10 +305,10 @@ class Likelihood(Parameterized): :returns: second derivative of log likelihood evaluated for this point (diagonal only) :rtype: 1xN array """ - link_f = self.gp_link.transf(f) - d2logpdf_dlink2 = self.d2logpdf_dlink2(link_f, y, Y_metadata=Y_metadata) + inv_link_f = self.gp_link.transf(f) + d2logpdf_dlink2 = self.d2logpdf_dlink2(inv_link_f, y, Y_metadata=Y_metadata) dlink_df = self.gp_link.dtransf_df(f) - dlogpdf_dlink = self.dlogpdf_dlink(link_f, y, Y_metadata=Y_metadata) + dlogpdf_dlink = self.dlogpdf_dlink(inv_link_f, y, Y_metadata=Y_metadata) d2link_df2 = self.gp_link.d2transf_df2(f) return chain_2(d2logpdf_dlink2, dlink_df, dlogpdf_dlink, d2link_df2) @@ -328,12 +328,12 @@ class Likelihood(Parameterized): :returns: third derivative of log likelihood evaluated for this point :rtype: float """ - link_f = self.gp_link.transf(f) - d3logpdf_dlink3 = self.d3logpdf_dlink3(link_f, y, Y_metadata=Y_metadata) + inv_link_f = self.gp_link.transf(f) + d3logpdf_dlink3 = self.d3logpdf_dlink3(inv_link_f, y, Y_metadata=Y_metadata) dlink_df = self.gp_link.dtransf_df(f) - d2logpdf_dlink2 = self.d2logpdf_dlink2(link_f, y, Y_metadata=Y_metadata) + d2logpdf_dlink2 = self.d2logpdf_dlink2(inv_link_f, y, Y_metadata=Y_metadata) d2link_df2 = self.gp_link.d2transf_df2(f) - dlogpdf_dlink = self.dlogpdf_dlink(link_f, y, Y_metadata=Y_metadata) + dlogpdf_dlink = self.dlogpdf_dlink(inv_link_f, y, Y_metadata=Y_metadata) d3link_df3 = self.gp_link.d3transf_df3(f) return chain_3(d3logpdf_dlink3, dlink_df, d2logpdf_dlink2, d2link_df2, dlogpdf_dlink, d3link_df3) @@ -342,10 +342,10 @@ class Likelihood(Parameterized): TODO: Doc strings """ if self.size > 0: - link_f = self.gp_link.transf(f) - return self.dlogpdf_link_dtheta(link_f, y, Y_metadata=Y_metadata) + inv_link_f = self.gp_link.transf(f) + return self.dlogpdf_link_dtheta(inv_link_f, y, Y_metadata=Y_metadata) else: - #Is no parameters so return an empty array for its derivatives + # There are no parameters so return an empty array for derivatives return np.zeros([1, 0]) def dlogpdf_df_dtheta(self, f, y, Y_metadata=None): @@ -353,12 +353,12 @@ class Likelihood(Parameterized): TODO: Doc strings """ if self.size > 0: - link_f = self.gp_link.transf(f) + inv_link_f = self.gp_link.transf(f) dlink_df = self.gp_link.dtransf_df(f) - dlogpdf_dlink_dtheta = self.dlogpdf_dlink_dtheta(link_f, y, Y_metadata=Y_metadata) + dlogpdf_dlink_dtheta = self.dlogpdf_dlink_dtheta(inv_link_f, y, Y_metadata=Y_metadata) return chain_1(dlogpdf_dlink_dtheta, dlink_df) else: - #Is no parameters so return an empty array for its derivatives + # There are no parameters so return an empty array for derivatives return np.zeros([f.shape[0], 0]) def d2logpdf_df2_dtheta(self, f, y, Y_metadata=None): @@ -366,14 +366,14 @@ class Likelihood(Parameterized): TODO: Doc strings """ if self.size > 0: - link_f = self.gp_link.transf(f) + inv_link_f = self.gp_link.transf(f) dlink_df = self.gp_link.dtransf_df(f) d2link_df2 = self.gp_link.d2transf_df2(f) - d2logpdf_dlink2_dtheta = self.d2logpdf_dlink2_dtheta(link_f, y, Y_metadata=Y_metadata) - dlogpdf_dlink_dtheta = self.dlogpdf_dlink_dtheta(link_f, 
y, Y_metadata=Y_metadata) + d2logpdf_dlink2_dtheta = self.d2logpdf_dlink2_dtheta(inv_link_f, y, Y_metadata=Y_metadata) + dlogpdf_dlink_dtheta = self.dlogpdf_dlink_dtheta(inv_link_f, y, Y_metadata=Y_metadata) return chain_2(d2logpdf_dlink2_dtheta, dlink_df, dlogpdf_dlink_dtheta, d2link_df2) else: - #Is no parameters so return an empty array for its derivatives + # There are no parameters so return an empty array for derivatives return np.zeros([f.shape[0], 0]) def _laplace_gradients(self, f, y, Y_metadata=None): @@ -383,12 +383,9 @@ class Likelihood(Parameterized): #Parameters are stacked vertically. Must be listed in same order as 'get_param_names' # ensure we have gradients for every parameter we want to optimize - try: - assert len(dlogpdf_dtheta) == self.size #1 x num_param array - assert dlogpdf_df_dtheta.shape[1] == self.size #f x num_param matrix - assert d2logpdf_df2_dtheta.shape[1] == self.size #f x num_param matrix - except Exception as e: - import ipdb; ipdb.set_trace() # XXX BREAKPOINT + assert len(dlogpdf_dtheta) == self.size #1 x num_param array + assert dlogpdf_df_dtheta.shape[1] == self.size #f x num_param matrix + assert d2logpdf_df2_dtheta.shape[1] == self.size #f x num_param matrix return dlogpdf_dtheta, dlogpdf_df_dtheta, d2logpdf_df2_dtheta @@ -411,7 +408,10 @@ class Likelihood(Parameterized): #compute the quantiles by sampling!!! N_samp = 1000 s = np.random.randn(mu.shape[0], N_samp)*np.sqrt(var) + mu + #ss_f = s.flatten() + #ss_y = self.samples(ss_f, Y_metadata) ss_y = self.samples(s, Y_metadata) + #ss_y = ss_y.reshape(mu.shape[0], N_samp) return [np.percentile(ss_y, q, axis=1)[:,None] for q in quantiles] diff --git a/GPy/likelihoods/link_functions.py b/GPy/likelihoods/link_functions.py index 942fe2f4..86384155 100644 --- a/GPy/likelihoods/link_functions.py +++ b/GPy/likelihoods/link_functions.py @@ -71,6 +71,7 @@ class Probit(GPTransformation): g(f) = \\Phi^{-1} (mu) + """ def transf(self,f): return std_norm_cdf(f) diff --git a/GPy/likelihoods/negative_binomial.py b/GPy/likelihoods/negative_binomial.py new file mode 100644 index 00000000..5bc5b727 --- /dev/null +++ b/GPy/likelihoods/negative_binomial.py @@ -0,0 +1,46 @@ +# Copyright (c) 2014 The GPy authors (see AUTHORS.txt) +# Licensed under the BSD 3-clause license (see LICENSE.txt) + + +try: + import sympy as sym + sympy_available=True + from sympy.utilities.lambdify import lambdify + from GPy.util.symbolic import gammaln, ln_cum_gaussian, cum_gaussian +except ImportError: + sympy_available=False + +import numpy as np +from ..util.univariate_Gaussian import std_norm_pdf, std_norm_cdf +import link_functions +from symbolic import Symbolic +from scipy import stats + +if sympy_available: + class Negative_binomial(Symbolic): + """ + Negative binomial + + .. math:: + p(y_{i}|\pi(f_{i})) = \left(\frac{r}{r+f_i}\right)^r \frac{\Gamma(r+y_i)}{y_i!\Gamma(r)}\left(\frac{f_i}{r+f_i}\right)^{y_i} + + .. Note:: + Y takes non-negative integer values. + link function should have a positive domain, e.g. log (default). + + ..
See also:: + symbolic.py, for the parent class + """ + def __init__(self, gp_link=None): + if gp_link is None: + gp_link = link_functions.Log() + + dispersion = sym.Symbol('dispersion', positive=True, real=True) + y = sym.Symbol('y', nonnegative=True, integer=True) + f = sym.Symbol('f', positive=True, real=True) + log_pdf = dispersion*sym.log(dispersion) - (dispersion+y)*sym.log(dispersion+f) + gammaln(y+dispersion) - gammaln(y+1) - gammaln(dispersion) + y*sym.log(f) + super(Negative_binomial, self).__init__(log_pdf=log_pdf, gp_link=gp_link, name='Negative_binomial') + + # TODO: Check this. + self.log_concave = False + diff --git a/GPy/likelihoods/student_t.py b/GPy/likelihoods/student_t.py index 47efd443..c057e789 100644 --- a/GPy/likelihoods/student_t.py +++ b/GPy/likelihoods/student_t.py @@ -26,8 +26,8 @@ class StudentT(Likelihood): gp_link = link_functions.Identity() super(StudentT, self).__init__(gp_link, name='Student_T') - - self.sigma2 = Param('t_noise', float(sigma2), Logexp()) + # sigma2 is not a noise parameter, it is a squared scale. + self.sigma2 = Param('t_scale2', float(sigma2), Logexp()) self.v = Param('deg_free', float(deg_free)) self.add_parameter(self.sigma2) self.add_parameter(self.v) @@ -46,23 +46,23 @@ class StudentT(Likelihood): self.sigma2.gradient = grads[0] self.v.gradient = grads[1] - def pdf_link(self, link_f, y, Y_metadata=None): + def pdf_link(self, inv_link_f, y, Y_metadata=None): """ Likelihood function given link(f) .. math:: p(y_{i}|\\lambda(f_{i})) = \\frac{\\Gamma\\left(\\frac{v+1}{2}\\right)}{\\Gamma\\left(\\frac{v}{2}\\right)\\sqrt{v\\pi\\sigma^{2}}}\\left(1 + \\frac{1}{v}\\left(\\frac{(y_{i} - \\lambda(f_{i}))^{2}}{\\sigma^{2}}\\right)\\right)^{-\\frac{v+1}{2}} - :param link_f: latent variables link(f) - :type link_f: Nx1 array + :param inv_link_f: latent variables link(f) + :type inv_link_f: Nx1 array :param y: data :type y: Nx1 array :param Y_metadata: Y_metadata which is not used in student t distribution :returns: likelihood evaluated for this point :rtype: float """ - assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape - e = y - link_f + assert np.atleast_1d(inv_link_f).shape == np.atleast_1d(y).shape + e = y - inv_link_f #Careful gamma(big_number) is infinity! objective = ((np.exp(gammaln((self.v + 1)*0.5) - gammaln(self.v * 0.5)) / (np.sqrt(self.v * np.pi * self.sigma2))) @@ -70,15 +70,15 @@ class StudentT(Likelihood): ) return np.prod(objective) - def logpdf_link(self, link_f, y, Y_metadata=None): + def logpdf_link(self, inv_link_f, y, Y_metadata=None): """ Log Likelihood Function given link(f) .. math:: \\ln p(y_{i}|\lambda(f_{i})) = \\ln \\Gamma\\left(\\frac{v+1}{2}\\right) - \\ln \\Gamma\\left(\\frac{v}{2}\\right) - \\ln \\sqrt{v \\pi\\sigma^{2}} - \\frac{v+1}{2}\\ln \\left(1 + \\frac{1}{v}\\left(\\frac{(y_{i} - \lambda(f_{i}))^{2}}{\\sigma^{2}}\\right)\\right) - :param link_f: latent variables (link(f)) - :type link_f: Nx1 array + :param inv_link_f: latent variables (link(f)) + :type inv_link_f: Nx1 array :param y: data :type y: Nx1 array :param Y_metadata: Y_metadata which is not used in student t distribution @@ -86,11 +86,11 @@ class StudentT(Likelihood): :rtype: float """ - assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape - e = y - link_f + assert np.atleast_1d(inv_link_f).shape == np.atleast_1d(y).shape + e = y - inv_link_f #FIXME: - #Why does np.log(1 + (1/self.v)*((y-link_f)**2)/self.sigma2) suppress the divide by zero?!
- #But np.log(1 + (1/float(self.v))*((y-link_f)**2)/self.sigma2) throws it correctly + #Why does np.log(1 + (1/self.v)*((y-inv_link_f)**2)/self.sigma2) suppress the divide by zero?! + #But np.log(1 + (1/float(self.v))*((y-inv_link_f)**2)/self.sigma2) throws it correctly #print - 0.5*(self.v + 1)*np.log(1 + (1/np.float(self.v))*((e**2)/self.sigma2)) objective = (+ gammaln((self.v + 1) * 0.5) - gammaln(self.v * 0.5) @@ -99,15 +99,15 @@ class StudentT(Likelihood): ) return np.sum(objective) - def dlogpdf_dlink(self, link_f, y, Y_metadata=None): + def dlogpdf_dlink(self, inv_link_f, y, Y_metadata=None): """ Gradient of the log likelihood function at y, given link(f) w.r.t link(f) .. math:: \\frac{d \\ln p(y_{i}|\lambda(f_{i}))}{d\\lambda(f)} = \\frac{(v+1)(y_{i}-\lambda(f_{i}))}{(y_{i}-\lambda(f_{i}))^{2} + \\sigma^{2}v} - :param link_f: latent variables (f) - :type link_f: Nx1 array + :param inv_link_f: latent variables (f) + :type inv_link_f: Nx1 array :param y: data :type y: Nx1 array :param Y_metadata: Y_metadata which is not used in student t distribution @@ -115,12 +115,12 @@ class StudentT(Likelihood): :rtype: Nx1 array """ - assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape - e = y - link_f + assert np.atleast_1d(inv_link_f).shape == np.atleast_1d(y).shape + e = y - inv_link_f grad = ((self.v + 1) * e) / (self.v * self.sigma2 + (e**2)) return grad - def d2logpdf_dlink2(self, link_f, y, Y_metadata=None): + def d2logpdf_dlink2(self, inv_link_f, y, Y_metadata=None): """ Hessian at y, given link(f), w.r.t link(f) i.e. second derivative logpdf at y given link(f_i) and link(f_j) w.r.t link(f_i) and link(f_j) @@ -129,8 +129,8 @@ class StudentT(Likelihood): .. math:: \\frac{d^{2} \\ln p(y_{i}|\lambda(f_{i}))}{d^{2}\\lambda(f)} = \\frac{(v+1)((y_{i}-\lambda(f_{i}))^{2} - \\sigma^{2}v)}{((y_{i}-\lambda(f_{i}))^{2} + \\sigma^{2}v)^{2}} - :param link_f: latent variables link(f) - :type link_f: Nx1 array + :param inv_link_f: latent variables inv_link(f) + :type inv_link_f: Nx1 array :param y: data :type y: Nx1 array :param Y_metadata: Y_metadata which is not used in student t distribution @@ -141,90 +141,90 @@ class StudentT(Likelihood): Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases (the distribution for y_i depends only on link(f_i) not on link(f_(j!=i)) """ - assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape - e = y - link_f + assert np.atleast_1d(inv_link_f).shape == np.atleast_1d(y).shape + e = y - inv_link_f hess = ((self.v + 1)*(e**2 - self.v*self.sigma2)) / ((self.sigma2*self.v + e**2)**2) return hess - def d3logpdf_dlink3(self, link_f, y, Y_metadata=None): + def d3logpdf_dlink3(self, inv_link_f, y, Y_metadata=None): """ Third order derivative log-likelihood function at y given link(f) w.r.t link(f) .. 
math:: \\frac{d^{3} \\ln p(y_{i}|\lambda(f_{i}))}{d^{3}\\lambda(f)} = \\frac{2(v+1)((y_{i} - \lambda(f_{i}))^{3} - 3(y_{i} - \lambda(f_{i})) \\sigma^{2} v)}{((y_{i} - \lambda(f_{i}))^{2} + \\sigma^{2} v)^{3}} - :param link_f: latent variables link(f) - :type link_f: Nx1 array + :param inv_link_f: latent variables link(f) + :type inv_link_f: Nx1 array :param y: data :type y: Nx1 array :param Y_metadata: Y_metadata which is not used in student t distribution :returns: third derivative of likelihood evaluated at points f :rtype: Nx1 array """ - assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape - e = y - link_f + assert np.atleast_1d(inv_link_f).shape == np.atleast_1d(y).shape + e = y - inv_link_f d3lik_dlink3 = ( -(2*(self.v + 1)*(-e)*(e**2 - 3*self.v*self.sigma2)) / ((e**2 + self.sigma2*self.v)**3) ) return d3lik_dlink3 - def dlogpdf_link_dvar(self, link_f, y, Y_metadata=None): + def dlogpdf_link_dvar(self, inv_link_f, y, Y_metadata=None): """ Gradient of the log-likelihood function at y given f, w.r.t variance parameter (t_scale2) .. math:: \\frac{d \\ln p(y_{i}|\lambda(f_{i}))}{d\\sigma^{2}} = \\frac{v((y_{i} - \lambda(f_{i}))^{2} - \\sigma^{2})}{2\\sigma^{2}(\\sigma^{2}v + (y_{i} - \lambda(f_{i}))^{2})} - :param link_f: latent variables link(f) - :type link_f: Nx1 array + :param inv_link_f: latent variables link(f) + :type inv_link_f: Nx1 array :param y: data :type y: Nx1 array :param Y_metadata: Y_metadata which is not used in student t distribution :returns: derivative of likelihood evaluated at points f w.r.t variance parameter :rtype: float """ - assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape - e = y - link_f + assert np.atleast_1d(inv_link_f).shape == np.atleast_1d(y).shape + e = y - inv_link_f dlogpdf_dvar = self.v*(e**2 - self.sigma2)/(2*self.sigma2*(self.sigma2*self.v + e**2)) return np.sum(dlogpdf_dvar) - def dlogpdf_dlink_dvar(self, link_f, y, Y_metadata=None): + def dlogpdf_dlink_dvar(self, inv_link_f, y, Y_metadata=None): """ Derivative of the dlogpdf_dlink w.r.t variance parameter (t_scale2) .. math:: \\frac{d}{d\\sigma^{2}}(\\frac{d \\ln p(y_{i}|\lambda(f_{i}))}{df}) = \\frac{-v(v + 1)(y_{i}-\lambda(f_{i}))}{((y_{i}-\lambda(f_{i}))^2 + \\sigma^2 v)^2} - :param link_f: latent variables link_f - :type link_f: Nx1 array + :param inv_link_f: latent variables inv_link_f + :type inv_link_f: Nx1 array :param y: data :type y: Nx1 array :param Y_metadata: Y_metadata which is not used in student t distribution :returns: derivative of likelihood evaluated at points f w.r.t variance parameter :rtype: Nx1 array """ - assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape - e = y - link_f + assert np.atleast_1d(inv_link_f).shape == np.atleast_1d(y).shape + e = y - inv_link_f dlogpdf_dlink_dvar = (self.v*(self.v+1)*(-e))/((self.sigma2*self.v + e**2)**2) return dlogpdf_dlink_dvar - def d2logpdf_dlink2_dvar(self, link_f, y, Y_metadata=None): + def d2logpdf_dlink2_dvar(self, inv_link_f, y, Y_metadata=None): """ Gradient of the hessian (d2logpdf_dlink2) w.r.t variance parameter (t_scale2) ..
math:: \\frac{d}{d\\sigma^{2}}(\\frac{d^{2} \\ln p(y_{i}|\lambda(f_{i}))}{d^{2}f}) = \\frac{v(v+1)(\\sigma^{2}v - 3(y_{i} - \lambda(f_{i}))^{2})}{(\\sigma^{2}v + (y_{i} - \lambda(f_{i}))^{2})^{3}} - :param link_f: latent variables link(f) - :type link_f: Nx1 array + :param inv_link_f: latent variables link(f) + :type inv_link_f: Nx1 array :param y: data :type y: Nx1 array :param Y_metadata: Y_metadata which is not used in student t distribution :returns: derivative of hessian evaluated at points f and f_j w.r.t variance parameter :rtype: Nx1 array """ - assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape - e = y - link_f + assert np.atleast_1d(inv_link_f).shape == np.atleast_1d(y).shape + e = y - inv_link_f d2logpdf_dlink2_dvar = ( (self.v*(self.v+1)*(self.sigma2*self.v - 3*(e**2))) / ((self.sigma2*self.v + (e**2))**3) ) @@ -246,11 +246,12 @@ class StudentT(Likelihood): return np.hstack((d2logpdf_dlink2_dvar, d2logpdf_dlink2_dv)) def predictive_mean(self, mu, sigma, Y_metadata=None): - return self.gp_link.transf(mu) # only true in link is monotoci, which it is. + # The comment here confuses mean and median. + return self.gp_link.transf(mu) # only true if link is monotonic, which it is. def predictive_variance(self, mu,variance, predictive_mean=None, Y_metadata=None): - if self.deg_free <2.: - return np.empty(mu.shape)*np.nan #not defined for small degress fo freedom + if self.deg_free<=2.: + return np.empty(mu.shape)*np.nan # does not exist for degrees of freedom <= 2. else: return super(StudentT, self).predictive_variance(mu, variance, predictive_mean, Y_metadata) diff --git a/GPy/likelihoods/symbolic.py b/GPy/likelihoods/symbolic.py new file mode 100644 index 00000000..5d052119 --- /dev/null +++ b/GPy/likelihoods/symbolic.py @@ -0,0 +1,298 @@ +# Copyright (c) 2014 GPy Authors +# Licensed under the BSD 3-clause license (see LICENSE.txt) + +try: + import sympy as sym + sympy_available=True + from sympy.utilities.lambdify import lambdify +except ImportError: + sympy_available=False + +import numpy as np +import link_functions +from scipy import stats, integrate +from scipy.special import gammaln, gamma, erf, polygamma +from GPy.util.functions import cum_gaussian, ln_cum_gaussian +from likelihood import Likelihood +from ..core.parameterization import Param + + +if sympy_available: + class Symbolic(Likelihood): + """ + Symbolic likelihood. + + Likelihood where the form of the likelihood is provided by a sympy expression. + + """ + def __init__(self, log_pdf=None, logZ=None, missing_log_pdf=None, gp_link=None, name='symbolic', log_concave=False, param=None, func_modules=[]): + + if gp_link is None: + gp_link = link_functions.Identity() + + if log_pdf is None: + raise ValueError, "You must provide an argument for the log pdf." 
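An editorial sketch of how a likelihood is meant to be specified through this class; the Poisson expression and symbol choices below are illustrative assumptions (mirroring negative_binomial.py), not part of this changeset:

    import sympy as sym
    import GPy.likelihoods.link_functions as link_functions
    from GPy.likelihoods.symbolic import Symbolic
    from GPy.util.symbolic import gammaln  # symbolic gammaln, as used in negative_binomial.py

    f = sym.Symbol('f', positive=True, real=True)        # latent rate
    y = sym.Symbol('y', nonnegative=True, integer=True)  # observed count
    # Poisson log pmf with rate f: y*log(f) - f - log(y!)
    log_pdf = y*sym.log(f) - f - gammaln(y + 1)
    lik = Symbolic(log_pdf=log_pdf, gp_link=link_functions.Log())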
+ + self.func_modules = func_modules + [{'gamma':gamma, 'gammaln':gammaln, 'erf':erf,'polygamma':polygamma, 'cum_gaussian':cum_gaussian, 'ln_cum_gaussian':ln_cum_gaussian}, 'numpy'] + + super(Symbolic, self).__init__(gp_link, name=name) + self.missing_data = False + self._sym_log_pdf = log_pdf + if missing_log_pdf: + self.missing_data = True + self._sym_missing_log_pdf = missing_log_pdf + + # pull the variable names out of the symbolic pdf + sym_vars = [e for e in self._sym_log_pdf.atoms() if e.is_Symbol] + self._sym_f = [e for e in sym_vars if e.name=='f'] + if not self._sym_f: + raise ValueError('No variable f in log pdf.') + self._sym_y = [e for e in sym_vars if e.name=='y'] + if not self._sym_y: + raise ValueError('No variable y in log pdf.') + self._sym_theta = sorted([e for e in sym_vars if not (e.name=='f' or e.name=='y')],key=lambda e:e.name) + + theta_names = [theta.name for theta in self._sym_theta] + if self.missing_data: + # pull the variable names out of missing data + sym_vars = [e for e in self._sym_missing_log_pdf.atoms() if e.is_Symbol] + sym_f = [e for e in sym_vars if e.name=='f'] + if not sym_f: + raise ValueError('No variable f in missing log pdf.') + sym_y = [e for e in sym_vars if e.name=='y'] + if sym_y: + raise ValueError('Data is present in missing data portion of likelihood.') + # additional missing data parameters + missing_theta = sorted([e for e in sym_vars if not (e.name=='f' or e.name=='missing' or e.name in theta_names)],key=lambda e:e.name) + self._sym_theta += missing_theta + self._sym_theta = sorted(self._sym_theta, key=lambda e:e.name) + + # These are all the arguments needed to compute likelihoods. + self.arg_list = self._sym_y + self._sym_f + self._sym_theta + + # these are arguments for computing derivatives. + derivative_arguments = self._sym_f + self._sym_theta + + # Do symbolic work to compute derivatives. + self._log_pdf_derivatives = {theta.name : sym.diff(self._sym_log_pdf,theta).simplify() for theta in derivative_arguments} + self._log_pdf_second_derivatives = {theta.name : sym.diff(self._log_pdf_derivatives['f'],theta).simplify() for theta in derivative_arguments} + self._log_pdf_third_derivatives = {theta.name : sym.diff(self._log_pdf_second_derivatives['f'],theta).simplify() for theta in derivative_arguments} + + if self.missing_data: + # Do symbolic work to compute derivatives. + self._missing_log_pdf_derivatives = {theta.name : sym.diff(self._sym_missing_log_pdf,theta).simplify() for theta in derivative_arguments} + self._missing_log_pdf_second_derivatives = {theta.name : sym.diff(self._missing_log_pdf_derivatives['f'],theta).simplify() for theta in derivative_arguments} + self._missing_log_pdf_third_derivatives = {theta.name : sym.diff(self._missing_log_pdf_second_derivatives['f'],theta).simplify() for theta in derivative_arguments} + + + # Add parameters to the model. + for theta in self._sym_theta: + val = 1.0 + # TODO: need to decide how to handle user passing values for these parameter vectors. + if param is not None: + if param.has_key(theta.name): + val = param[theta.name] + setattr(self, theta.name, Param(theta.name, val, None)) + self.add_parameters(getattr(self, theta.name)) + + + # Is there some way to check whether the pdf is log + # concave? For the moment, need user to specify.
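# Editorial note: no symbolic check of log-concavity is attempted here.
# A log-concave p(y|f) gives a concave log posterior and hence a unique
# mode for the Laplace approximation, which is why the caller must
# assert it explicitly through the log_concave flag.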
+ self.log_concave = log_concave + + # initialise code arguments + self._arguments = {} + + # generate the code for the pdf and derivatives + self._gen_code() + + def _gen_code(self): + """Generate the code from the symbolic parts that will be used for likelihood computation.""" + # TODO: Check here whether theano is available and set up + # functions accordingly. + self._log_pdf_function = lambdify(self.arg_list, self._sym_log_pdf, self.func_modules) + + # compute code for derivatives (for implicit likelihood terms + # we need up to 3rd derivatives) + setattr(self, '_first_derivative_code', {key: lambdify(self.arg_list, self._log_pdf_derivatives[key], self.func_modules) for key in self._log_pdf_derivatives.keys()}) + setattr(self, '_second_derivative_code', {key: lambdify(self.arg_list, self._log_pdf_second_derivatives[key], self.func_modules) for key in self._log_pdf_second_derivatives.keys()}) + setattr(self, '_third_derivative_code', {key: lambdify(self.arg_list, self._log_pdf_third_derivatives[key], self.func_modules) for key in self._log_pdf_third_derivatives.keys()}) + + if self.missing_data: + # the missing-data log pdf itself is needed by logpdf_link below + self._missing_log_pdf_function = lambdify(self.arg_list, self._sym_missing_log_pdf, self.func_modules) + setattr(self, '_missing_first_derivative_code', {key: lambdify(self.arg_list, self._missing_log_pdf_derivatives[key], self.func_modules) for key in self._missing_log_pdf_derivatives.keys()}) + setattr(self, '_missing_second_derivative_code', {key: lambdify(self.arg_list, self._missing_log_pdf_second_derivatives[key], self.func_modules) for key in self._missing_log_pdf_second_derivatives.keys()}) + setattr(self, '_missing_third_derivative_code', {key: lambdify(self.arg_list, self._missing_log_pdf_third_derivatives[key], self.func_modules) for key in self._missing_log_pdf_third_derivatives.keys()}) + + # TODO: compute EP code parts based on logZ. We need dlogZ/dmu, d2logZ/dmu2 and dlogZ/dtheta + + def parameters_changed(self): + pass + + def update_gradients(self, grads): + """ + Pull out the gradients, be careful as the order must match the order + in which the parameters are added + """ + # The way the Laplace approximation is run requires the + # covariance function to compute the true gradient (because it + # is dependent on the mode). This means we actually compute + # the gradient outside this object. This function would + # normally ask the object to update its gradients internally, + # but here it provides them externally, because they are + # computed in the inference code. TODO: Thought: How does this + # affect EP? Shouldn't this be done by a separate + # Laplace-approximation specific call? + for grad, theta in zip(grads, self._sym_theta): + parameter = getattr(self, theta.name) + setattr(parameter, 'gradient', grad) + + def _arguments_update(self, f, y): + """Set up argument lists for the derivatives.""" + # If we do make use of Theano, then at this point we would + # need to do a lot of precomputation to ensure that the + # likelihoods and gradients are computed together, then check + # for parameter changes before updating. + for i, fvar in enumerate(self._sym_f): + self._arguments[fvar.name] = f + for i, yvar in enumerate(self._sym_y): + self._arguments[yvar.name] = y + for theta in self._sym_theta: + self._arguments[theta.name] = np.asarray(getattr(self, theta.name)) + + def pdf_link(self, inv_link_f, y, Y_metadata=None): + """ + Likelihood function given inverse link of f. + + :param inv_link_f: inverse link of latent variables.
+ :type inv_link_f: Nx1 array + :param y: data + :type y: Nx1 array + :param Y_metadata: Y_metadata + :returns: likelihood evaluated for this point + :rtype: float + """ + return np.exp(self.logpdf_link(inv_link_f, y, Y_metadata=Y_metadata)) + + def logpdf_link(self, inv_link_f, y, Y_metadata=None): + """ + Log Likelihood Function given inverse link of latent variables. + + :param inv_link_f: latent variables (inverse link of f) + :type inv_link_f: Nx1 array + :param y: data + :type y: Nx1 array + :param Y_metadata: Y_metadata + :returns: log likelihood evaluated for this point + :rtype: float + + """ + assert np.atleast_1d(inv_link_f).shape == np.atleast_1d(y).shape + self._arguments_update(inv_link_f, y) + if self.missing_data: + ll = np.where(np.isnan(y), self._missing_log_pdf_function(**self._arguments), self._log_pdf_function(**self._arguments)) + else: + ll = np.where(np.isnan(y), 0., self._log_pdf_function(**self._arguments)) + return np.sum(ll) + + def dlogpdf_dlink(self, inv_link_f, y, Y_metadata=None): + """ + Gradient of log likelihood with respect to the inverse link function. + + :param inv_link_f: latent variables (inverse link of f) + :type inv_link_f: Nx1 array + :param y: data + :type y: Nx1 array + :param Y_metadata: Y_metadata + :returns: gradient of log likelihood with respect to each point. + :rtype: Nx1 array + + """ + assert np.atleast_1d(inv_link_f).shape == np.atleast_1d(y).shape + self._arguments_update(inv_link_f, y) + if self.missing_data: + return np.where(np.isnan(y), self._missing_first_derivative_code['f'](**self._arguments), self._first_derivative_code['f'](**self._arguments)) + else: + return np.where(np.isnan(y), 0., self._first_derivative_code['f'](**self._arguments)) + + def d2logpdf_dlink2(self, inv_link_f, y, Y_metadata=None): + """ + Hessian of log likelihood given inverse link of latent variables with respect to that inverse link. + i.e. second derivative logpdf at y given inv_link(f_i) and inv_link(f_j) w.r.t inv_link(f_i) and inv_link(f_j). + + + :param inv_link_f: inverse link of the latent variables. + :type inv_link_f: Nx1 array + :param y: data + :type y: Nx1 array + :param Y_metadata: Y_metadata + :returns: Diagonal of Hessian matrix (second derivative of log likelihood evaluated at points f) + :rtype: Nx1 array + + ..
Note:: + Returns diagonal of Hessian, since everywhere else it is + 0, as the likelihood factorizes over cases (the + distribution for y_i depends only on link(f_i) not on + link(f_(j!=i))) + """ + assert np.atleast_1d(inv_link_f).shape == np.atleast_1d(y).shape + self._arguments_update(inv_link_f, y) + if self.missing_data: + return np.where(np.isnan(y), self._missing_second_derivative_code['f'](**self._arguments), self._second_derivative_code['f'](**self._arguments)) + else: + return np.where(np.isnan(y), 0., self._second_derivative_code['f'](**self._arguments)) + + def d3logpdf_dlink3(self, inv_link_f, y, Y_metadata=None): + assert np.atleast_1d(inv_link_f).shape == np.atleast_1d(y).shape + self._arguments_update(inv_link_f, y) + if self.missing_data: + return np.where(np.isnan(y), self._missing_third_derivative_code['f'](**self._arguments), self._third_derivative_code['f'](**self._arguments)) + else: + return np.where(np.isnan(y), 0., self._third_derivative_code['f'](**self._arguments)) + + def dlogpdf_link_dtheta(self, inv_link_f, y, Y_metadata=None): + assert np.atleast_1d(inv_link_f).shape == np.atleast_1d(y).shape + self._arguments_update(inv_link_f, y) + g = np.zeros((np.atleast_1d(y).shape[0], len(self._sym_theta))) + for i, theta in enumerate(self._sym_theta): + if self.missing_data: + g[:, i:i+1] = np.where(np.isnan(y), self._missing_first_derivative_code[theta.name](**self._arguments), self._first_derivative_code[theta.name](**self._arguments)) + else: + g[:, i:i+1] = np.where(np.isnan(y), 0., self._first_derivative_code[theta.name](**self._arguments)) + return g.sum(0) + + def dlogpdf_dlink_dtheta(self, inv_link_f, y, Y_metadata=None): + assert np.atleast_1d(inv_link_f).shape == np.atleast_1d(y).shape + self._arguments_update(inv_link_f, y) + g = np.zeros((np.atleast_1d(y).shape[0], len(self._sym_theta))) + for i, theta in enumerate(self._sym_theta): + if self.missing_data: + g[:, i:i+1] = np.where(np.isnan(y), self._missing_second_derivative_code[theta.name](**self._arguments), self._second_derivative_code[theta.name](**self._arguments)) + else: + g[:, i:i+1] = np.where(np.isnan(y), 0., self._second_derivative_code[theta.name](**self._arguments)) + return g + + def d2logpdf_dlink2_dtheta(self, inv_link_f, y, Y_metadata=None): + assert np.atleast_1d(inv_link_f).shape == np.atleast_1d(y).shape + self._arguments_update(inv_link_f, y) + g = np.zeros((np.atleast_1d(y).shape[0], len(self._sym_theta))) + for i, theta in enumerate(self._sym_theta): + if self.missing_data: + g[:, i:i+1] = np.where(np.isnan(y), self._missing_third_derivative_code[theta.name](**self._arguments), self._third_derivative_code[theta.name](**self._arguments)) + else: + g[:, i:i+1] = np.where(np.isnan(y), 0., self._third_derivative_code[theta.name](**self._arguments)) + return g + + def predictive_mean(self, mu, sigma, Y_metadata=None): + raise NotImplementedError + + def predictive_variance(self, mu,variance, predictive_mean=None, Y_metadata=None): + raise NotImplementedError + + def conditional_mean(self, gp): + raise NotImplementedError + + def conditional_variance(self, gp): + raise NotImplementedError + + def samples(self, gp, Y_metadata=None): + raise NotImplementedError diff --git a/GPy/models/bayesian_gplvm.py b/GPy/models/bayesian_gplvm.py index 1f01d4d5..03cd361c 100644 --- a/GPy/models/bayesian_gplvm.py +++ b/GPy/models/bayesian_gplvm.py @@ -27,7 +27,10 @@ class BayesianGPLVM(SparseGP): Z=None, kernel=None, inference_method=None, likelihood=None, name='bayesian gplvm', **kwargs): if X
== None: from ..util.initialization import initialize_latent - X = initialize_latent(init, input_dim, Y) + X, fracs = initialize_latent(init, input_dim, Y) + else: + fracs = np.ones(input_dim) + self.init = init if X_variance is None: @@ -39,7 +42,7 @@ class BayesianGPLVM(SparseGP): assert Z.shape[1] == X.shape[1] if kernel is None: - kernel = kern.RBF(input_dim) # + kern.white(input_dim) + kernel = kern.RBF(input_dim, lengthscale=fracs, ARD=True) # + kern.white(input_dim) if likelihood is None: likelihood = Gaussian() @@ -48,21 +51,17 @@ class BayesianGPLVM(SparseGP): self.variational_prior = NormalPrior() X = NormalPosterior(X, X_variance) + if inference_method is None: + if np.any(np.isnan(Y)): + from ..inference.latent_function_inference.var_dtc import VarDTCMissingData + inference_method = VarDTCMissingData() + else: + from ..inference.latent_function_inference.var_dtc import VarDTC + inference_method = VarDTC() + SparseGP.__init__(self, X, Y, Z, kernel, likelihood, inference_method, name, **kwargs) self.add_parameter(self.X, index=0) - def _getstate(self): - """ - Get the current state of the class, - here just all the indices, rest can get recomputed - """ - return SparseGP._getstate(self) + [self.init] - - def _setstate(self, state): - self._const_jitter = None - self.init = state.pop() - SparseGP._setstate(self, state) - def set_X_gradients(self, X, X_grad): """Set the gradients of the posterior distribution of X in its specific form.""" X.mean.gradient, X.variance.gradient = X_grad diff --git a/GPy/models/gp_classification.py b/GPy/models/gp_classification.py index 9d918cda..2a4193ab 100644 --- a/GPy/models/gp_classification.py +++ b/GPy/models/gp_classification.py @@ -2,10 +2,10 @@ # Copyright (c) 2013, the GPy Authors (see AUTHORS.txt) # Licensed under the BSD 3-clause license (see LICENSE.txt) -import numpy as np from ..core import GP from .. import likelihoods from .. import kern +from ..inference.latent_function_inference.expectation_propagation import EP class GPClassification(GP): """ @@ -27,4 +27,4 @@ class GPClassification(GP): likelihood = likelihoods.Bernoulli() - GP.__init__(self, X=X, Y=Y, kernel=kernel, likelihood=likelihood, name='gp_classification') + GP.__init__(self, X=X, Y=Y, kernel=kernel, likelihood=likelihood, inference_method=EP(), name='gp_classification') diff --git a/GPy/models/gp_regression.py b/GPy/models/gp_regression.py index 86e64a54..d56e72b9 100644 --- a/GPy/models/gp_regression.py +++ b/GPy/models/gp_regression.py @@ -29,8 +29,3 @@ class GPRegression(GP): super(GPRegression, self).__init__(X, Y, kernel, likelihood, name='GP regression', Y_metadata=Y_metadata) - def _getstate(self): - return GP._getstate(self) - - def _setstate(self, state): - return GP._setstate(self, state) diff --git a/GPy/models/gplvm.py b/GPy/models/gplvm.py index b85540ce..542dcd31 100644 --- a/GPy/models/gplvm.py +++ b/GPy/models/gplvm.py @@ -5,7 +5,6 @@ import numpy as np import pylab as pb from .. import kern -from ..util.linalg import PCA from ..core import GP, Param from ..likelihoods import Gaussian from .. 
import util @@ -29,9 +28,11 @@ class GPLVM(GP): """ if X is None: from ..util.initialization import initialize_latent - X = initialize_latent(init, input_dim, Y) + X, fracs = initialize_latent(init, input_dim, Y) + else: + fracs = np.ones(input_dim) if kernel is None: - kernel = kern.RBF(input_dim, ARD=input_dim > 1) + kern.Bias(input_dim, np.exp(-2)) + kernel = kern.RBF(input_dim, lengthscale=fracs, ARD=input_dim > 1) + kern.Bias(input_dim, np.exp(-2)) likelihood = Gaussian() @@ -43,12 +44,6 @@ class GPLVM(GP): super(GPLVM, self).parameters_changed() self.X.gradient = self.kern.gradients_X(self.grad_dict['dL_dK'], self.X, None) - def _getstate(self): - return GP._getstate(self) - - def _setstate(self, state): - GP._setstate(self, state) - def jacobian(self,X): target = np.zeros((X.shape[0],X.shape[1],self.output_dim)) for i in range(self.output_dim): diff --git a/GPy/models/mrd.py b/GPy/models/mrd.py index ac2ef9cd..458a70a1 100644 --- a/GPy/models/mrd.py +++ b/GPy/models/mrd.py @@ -6,12 +6,12 @@ import itertools import pylab from ..core import Model -from ..util.linalg import PCA from ..kern import Kern from ..core.parameterization.variational import NormalPosterior, NormalPrior from ..core.parameterization import Param, Parameterized from ..inference.latent_function_inference.var_dtc import VarDTCMissingData, VarDTC from ..likelihoods import Gaussian +from GPy.util.initialization import initialize_latent class MRD(Model): """ @@ -51,27 +51,31 @@ class MRD(Model): inference_method=None, likelihood=None, name='mrd', Ynames=None): super(MRD, self).__init__(name) - # sort out the kernels - if kernel is None: - from ..kern import RBF - self.kern = [RBF(input_dim, ARD=1, name='rbf'.format(i)) for i in range(len(Ylist))] - elif isinstance(kernel, Kern): - self.kern = [kernel.copy(name='{}'.format(kernel.name, i)) for i in range(len(Ylist))] - else: - assert len(kernel) == len(Ylist), "need one kernel per output" - assert all([isinstance(k, Kern) for k in kernel]), "invalid kernel object detected!" - self.kern = kernel self.input_dim = input_dim self.num_inducing = num_inducing self.Ylist = Ylist self._in_init_ = True - X = self._init_X(initx, Ylist) + X, fracs = self._init_X(initx, Ylist) self.Z = Param('inducing inputs', self._init_Z(initz, X)) self.num_inducing = self.Z.shape[0] # ensure M==N if M>N + # sort out the kernels + if kernel is None: + from ..kern import RBF + self.kern = [RBF(input_dim, ARD=1, lengthscale=fracs[i], name='rbf_{}'.format(i)) for i in range(len(Ylist))] + elif isinstance(kernel, Kern): + self.kern = [] + for i in range(len(Ylist)): + k = kernel.copy() + self.kern.append(k) + else: + assert len(kernel) == len(Ylist), "need one kernel per output" + assert all([isinstance(k, Kern) for k in kernel]), "invalid kernel object detected!" + self.kern = kernel + if X_variance is None: - X_variance = np.random.uniform(0, .2, X.shape) + X_variance = np.random.uniform(0.1, 0.2, X.shape) self.variational_prior = NormalPrior() self.X = NormalPosterior(X, X_variance) @@ -107,9 +111,8 @@ class MRD(Model): def parameters_changed(self): self._log_marginal_likelihood = 0 self.posteriors = [] - self.Z.gradient = 0. - self.X.mean.gradient = 0. - self.X.variance.gradient = 0. + self.Z.gradient[:] = 0. + self.X.gradient[:] = 0.
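# Editorial note: X and Z are shared across all output views, so their
# gradient buffers are zeroed in place above (slice assignment keeps the
# underlying arrays alive) so that each view's contribution can be
# accumulated in the loop below.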
for y, k, l, i in itertools.izip(self.Ylist, self.kern, self.likelihood, self.inference_method): posterior, lml, grad_dict = i.inference(k, self.X, self.Z, l, y) @@ -147,14 +150,22 @@ class MRD(Model): if Ylist is None: Ylist = self.Ylist if init in "PCA_concat": - X = PCA(np.hstack(Ylist), self.input_dim)[0] + X, fracs = initialize_latent('PCA', self.input_dim, np.hstack(Ylist)) + fracs = [fracs]*self.input_dim elif init in "PCA_single": X = np.zeros((Ylist[0].shape[0], self.input_dim)) + fracs = [] for qs, Y in itertools.izip(np.array_split(np.arange(self.input_dim), len(Ylist)), Ylist): - X[:, qs] = PCA(Y, len(qs))[0] + x,frcs = initialize_latent('PCA', len(qs), Y) + X[:, qs] = x + fracs.append(frcs) else: # init == 'random': X = np.random.randn(Ylist[0].shape[0], self.input_dim) - return X + fracs = X.var(0) + fracs = [fracs]*self.input_dim + X -= X.mean() + X /= X.std() + return X, fracs def _init_Z(self, init="permute", X=None): if X is None: diff --git a/GPy/models/sparse_gp_classification.py b/GPy/models/sparse_gp_classification.py index 96f7ac5a..e2c77d95 100644 --- a/GPy/models/sparse_gp_classification.py +++ b/GPy/models/sparse_gp_classification.py @@ -46,11 +46,3 @@ class SparseGPClassification(SparseGP): SparseGP.__init__(self, X, likelihood, kernel, Z=Z, normalize_X=normalize_X) self.ensure_default_constraints() - def _getstate(self): - return SparseGP._getstate(self) - - - def _setstate(self, state): - return SparseGP._setstate(self, state) - - pass diff --git a/GPy/models/sparse_gp_regression.py b/GPy/models/sparse_gp_regression.py index 7edb93e4..f4d5513e 100644 --- a/GPy/models/sparse_gp_regression.py +++ b/GPy/models/sparse_gp_regression.py @@ -51,14 +51,6 @@ class SparseGPRegression(SparseGP): SparseGP.__init__(self, X, Y, Z, kernel, likelihood, inference_method=VarDTC()) - def _getstate(self): - return SparseGP._getstate(self) - - def _setstate(self, state): - return SparseGP._setstate(self, state) - - - class SparseGPRegressionUncertainInput(SparseGP): """ Gaussian Process model for regression with Gaussian variance on the inputs (X_variance) diff --git a/GPy/models/sparse_gplvm.py b/GPy/models/sparse_gplvm.py index 5c10d0b8..638da63e 100644 --- a/GPy/models/sparse_gplvm.py +++ b/GPy/models/sparse_gplvm.py @@ -28,14 +28,6 @@ class SparseGPLVM(SparseGPRegression, GPLVM): SparseGPRegression.__init__(self, X, Y, kernel=kernel, num_inducing=num_inducing) self.ensure_default_constraints() - def _getstate(self): - return SparseGPRegression._getstate(self) - - - def _setstate(self, state): - return SparseGPRegression._setstate(self, state) - - def _get_param_names(self): return (sum([['X_%i_%i' % (n, q) for q in range(self.input_dim)] for n in range(self.num_data)], []) + SparseGPRegression._get_param_names(self)) diff --git a/GPy/models/svigp_regression.py b/GPy/models/svigp_regression.py index 3faa1cab..3397e31e 100644 --- a/GPy/models/svigp_regression.py +++ b/GPy/models/svigp_regression.py @@ -43,10 +43,3 @@ class SVIGPRegression(SVIGP): SVIGP.__init__(self, X, likelihood, kernel, Z, q_u=q_u, batchsize=batchsize) self.load_batch() - def _getstate(self): - return GPBase._getstate(self) - - - def _setstate(self, state): - return GPBase._setstate(self, state) - diff --git a/GPy/models/warped_gp.py b/GPy/models/warped_gp.py index d78f31df..4b982ed2 100644 --- a/GPy/models/warped_gp.py +++ b/GPy/models/warped_gp.py @@ -30,14 +30,6 @@ class WarpedGP(GP): GP.__init__(self, X, likelihood, kernel, normalize_X=normalize_X) self._set_params(self._get_params()) - def 
_getstate(self): - return GP._getstate(self) - - - def _setstate(self, state): - return GP._setstate(self, state) - - def _scale_data(self, Y): self._Ymax = Y.max() self._Ymin = Y.min() diff --git a/GPy/plotting/matplot_dep/models_plots.py b/GPy/plotting/matplot_dep/models_plots.py index b626758f..57b64ae5 100644 --- a/GPy/plotting/matplot_dep/models_plots.py +++ b/GPy/plotting/matplot_dep/models_plots.py @@ -53,8 +53,8 @@ def plot_fit(model, plot_limits=None, which_data_rows='all', which_data_rows = slice(None) if which_data_ycols == 'all': which_data_ycols = np.arange(model.output_dim) - if len(which_data_ycols)==0: - raise ValueError('No data selected for plotting') + #if len(which_data_ycols)==0: + #raise ValueError('No data selected for plotting') if ax is None: fig = pb.figure(num=fignum) ax = fig.add_subplot(111) diff --git a/GPy/testing/index_operations_tests.py b/GPy/testing/index_operations_tests.py index 12602879..37cec10b 100644 --- a/GPy/testing/index_operations_tests.py +++ b/GPy/testing/index_operations_tests.py @@ -33,6 +33,8 @@ class Test(unittest.TestCase): self.assertListEqual(self.param_index[one].tolist(), [3]) self.assertListEqual(self.param_index.remove('not in there', [2,3,4]).tolist(), []) + self.assertListEqual(self.view.remove('not in there', [2,3,4]).tolist(), []) + def test_shift_left(self): self.view.shift_left(0, 2) self.assertListEqual(self.param_index[three].tolist(), [2,5]) @@ -82,6 +84,10 @@ class Test(unittest.TestCase): self.assertEqual(self.param_index.size, 6) self.assertEqual(self.view.size, 5) + def test_print(self): + print self.param_index + print self.view + if __name__ == "__main__": #import sys;sys.argv = ['', 'Test.test_index_view'] unittest.main() \ No newline at end of file diff --git a/GPy/testing/kernel_tests.py b/GPy/testing/kernel_tests.py index 9ed218d8..91683edc 100644 --- a/GPy/testing/kernel_tests.py +++ b/GPy/testing/kernel_tests.py @@ -5,9 +5,11 @@ import unittest import numpy as np import GPy import sys +from GPy.core.parameterization.param import Param verbose = 0 +np.random.seed(50) class Kern_check_model(GPy.core.Model): @@ -29,7 +31,7 @@ class Kern_check_model(GPy.core.Model): dL_dK = np.ones((X.shape[0], X2.shape[0])) self.kernel = kernel - self.X = GPy.core.parameterization.Param('X',X) + self.X = X self.X2 = X2 self.dL_dK = dL_dK @@ -76,10 +78,11 @@ class Kern_check_dK_dX(Kern_check_model): """This class allows gradient checks for the gradient of a kernel with respect to X. """ def __init__(self, kernel=None, dL_dK=None, X=None, X2=None): Kern_check_model.__init__(self,kernel=kernel,dL_dK=dL_dK, X=X, X2=X2) + self.X = Param('X',X) self.add_parameter(self.X) def parameters_changed(self): - self.X.gradient = self.kernel.gradients_X(self.dL_dK, self.X, self.X2) + self.X.gradient[:] = self.kernel.gradients_X(self.dL_dK, self.X, self.X2) class Kern_check_dKdiag_dX(Kern_check_dK_dX): """This class allows gradient checks for the gradient of a kernel diagonal with respect to X. 
""" @@ -90,7 +93,7 @@ class Kern_check_dKdiag_dX(Kern_check_dK_dX): return (np.diag(self.dL_dK)*self.kernel.Kdiag(self.X)).sum() def parameters_changed(self): - self.X.gradient = self.kernel.gradients_X_diag(self.dL_dK.diagonal(), self.X) + self.X.gradient[:] = self.kernel.gradients_X_diag(self.dL_dK.diagonal(), self.X) @@ -126,6 +129,7 @@ def check_kernel_gradient_functions(kern, X=None, X2=None, output_ind=None, verb if not result: print("Positive definite check failed for " + kern.name + " covariance function.") pass_checks = False + assert(result) return False if verbose: @@ -137,6 +141,7 @@ def check_kernel_gradient_functions(kern, X=None, X2=None, output_ind=None, verb print("Gradient of K(X, X) wrt theta failed for " + kern.name + " covariance function. Gradient values as follows:") Kern_check_dK_dtheta(kern, X=X, X2=None).checkgrad(verbose=True) pass_checks = False + assert(result) return False if verbose: @@ -148,6 +153,7 @@ def check_kernel_gradient_functions(kern, X=None, X2=None, output_ind=None, verb print("Gradient of K(X, X) wrt theta failed for " + kern.name + " covariance function. Gradient values as follows:") Kern_check_dK_dtheta(kern, X=X, X2=X2).checkgrad(verbose=True) pass_checks = False + assert(result) return False if verbose: @@ -164,6 +170,7 @@ def check_kernel_gradient_functions(kern, X=None, X2=None, output_ind=None, verb print("Gradient of Kdiag(X) wrt theta failed for " + kern.name + " covariance function. Gradient values as follows:") Kern_check_dKdiag_dtheta(kern, X=X).checkgrad(verbose=True) pass_checks = False + assert(result) return False if verbose: @@ -182,6 +189,8 @@ def check_kernel_gradient_functions(kern, X=None, X2=None, output_ind=None, verb if not result: print("Gradient of K(X, X) wrt X failed for " + kern.name + " covariance function. Gradient values as follows:") testmodel.checkgrad(verbose=True) + import ipdb;ipdb.set_trace() + assert(result) pass_checks = False return False @@ -201,6 +210,7 @@ def check_kernel_gradient_functions(kern, X=None, X2=None, output_ind=None, verb if not result: print("Gradient of K(X, X2) wrt X failed for " + kern.name + " covariance function. Gradient values as follows:") testmodel.checkgrad(verbose=True) + assert(result) pass_checks = False return False @@ -218,6 +228,7 @@ def check_kernel_gradient_functions(kern, X=None, X2=None, output_ind=None, verb print("Gradient of Kdiag(X) wrt X failed for " + kern.name + " covariance function. 
Gradient values as follows:") Kern_check_dKdiag_dX(kern, X=X).checkgrad(verbose=True) pass_checks = False + assert(result) return False return pass_checks @@ -226,7 +237,7 @@ def check_kernel_gradient_functions(kern, X=None, X2=None, output_ind=None, verb class KernelGradientTestsContinuous(unittest.TestCase): def setUp(self): - self.N, self.D = 100, 5 + self.N, self.D = 10, 5 self.X = np.random.randn(self.N,self.D) self.X2 = np.random.randn(self.N+10,self.D) @@ -243,6 +254,17 @@ class KernelGradientTestsContinuous(unittest.TestCase): k.randomize() self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose)) + def test_Prod2(self): + k = (GPy.kern.RBF(2, active_dims=[0,4]) * GPy.kern.Linear(self.D)) + k.randomize() + self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose)) + + def test_Prod3(self): + k = GPy.kern.Matern32(2, active_dims=[2,3]) * (GPy.kern.RBF(2, active_dims=[0,4]) + GPy.kern.Linear(self.D)) + k = (GPy.kern.RBF(2, active_dims=[0,4]) * GPy.kern.Linear(self.D)) + k.randomize() + self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose)) + def test_Add(self): k = GPy.kern.Matern32(2, active_dims=[2,3]) + GPy.kern.RBF(2, active_dims=[0,4]) + GPy.kern.Linear(self.D) k += GPy.kern.Matern32(2, active_dims=[2,3]) + GPy.kern.RBF(2, active_dims=[0,4]) + GPy.kern.Linear(self.D) @@ -276,6 +298,11 @@ class KernelGradientTestsContinuous(unittest.TestCase): k.randomize() self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose)) + def test_LinearFull(self): + k = GPy.kern.LinearFull(self.D, self.D-1) + k.randomize() + self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose)) + #TODO: turn off grad checkingwrt X for indexed kernels like coregionalize # class KernelGradientTestsContinuous1D(unittest.TestCase): # def setUp(self): @@ -338,7 +365,7 @@ class KernelTestsNonContinuous(unittest.TestCase): self.X2 = np.random.randn((N0+N1)*2, self.D+1) self.X2[:(N0*2), -1] = 0 self.X2[(N0*2):, -1] = 1 - + def test_IndependentOutputs(self): k = GPy.kern.RBF(self.D) kern = GPy.kern.IndependentOutputs(k, -1, 'ind_single') @@ -346,7 +373,7 @@ class KernelTestsNonContinuous(unittest.TestCase): k = [GPy.kern.RBF(1, active_dims=[1], name='rbf1'), GPy.kern.RBF(self.D, name='rbf012'), GPy.kern.RBF(2, active_dims=[0,2], name='rbf02')] kern = GPy.kern.IndependentOutputs(k, -1, name='ind_split') self.assertTrue(check_kernel_gradient_functions(kern, X=self.X, X2=self.X2, verbose=verbose, fixed_X_dims=-1)) - + def test_ODE_UY(self): kern = GPy.kern.ODE_UY(2, active_dims=[0, self.D]) X = self.X[self.X[:,-1]!=2] @@ -356,25 +383,25 @@ class KernelTestsNonContinuous(unittest.TestCase): if __name__ == "__main__": print "Running unit tests, please be (very) patient..." 
- #unittest.main() - np.random.seed(0) - N0 = 3 - N1 = 9 - N2 = 4 - N = N0+N1+N2 - D = 3 - X = np.random.randn(N, D+1) - indices = np.random.random_integers(0, 2, size=N) - X[indices==0, -1] = 0 - X[indices==1, -1] = 1 - X[indices==2, -1] = 2 - #X = X[X[:, -1].argsort(), :] - X2 = np.random.randn((N0+N1)*2, D+1) - X2[:(N0*2), -1] = 0 - X2[(N0*2):, -1] = 1 - k = [GPy.kern.RBF(1, active_dims=[1], name='rbf1'), GPy.kern.RBF(D, name='rbf012'), GPy.kern.RBF(2, active_dims=[0,2], name='rbf02')] - kern = GPy.kern.IndependentOutputs(k, -1, name='ind_split') - assert(check_kernel_gradient_functions(kern, X=X, X2=X2, verbose=verbose, fixed_X_dims=-1)) - k = GPy.kern.RBF(D) - kern = GPy.kern.IndependentOutputs(k, -1, 'ind_single') - assert(check_kernel_gradient_functions(kern, X=X, X2=X2, verbose=verbose, fixed_X_dims=-1)) + unittest.main() +# np.random.seed(0) +# N0 = 3 +# N1 = 9 +# N2 = 4 +# N = N0+N1+N2 +# D = 3 +# X = np.random.randn(N, D+1) +# indices = np.random.random_integers(0, 2, size=N) +# X[indices==0, -1] = 0 +# X[indices==1, -1] = 1 +# X[indices==2, -1] = 2 +# #X = X[X[:, -1].argsort(), :] +# X2 = np.random.randn((N0+N1)*2, D+1) +# X2[:(N0*2), -1] = 0 +# X2[(N0*2):, -1] = 1 +# k = [GPy.kern.RBF(1, active_dims=[1], name='rbf1'), GPy.kern.RBF(D, name='rbf012'), GPy.kern.RBF(2, active_dims=[0,2], name='rbf02')] +# kern = GPy.kern.IndependentOutputs(k, -1, name='ind_split') +# assert(check_kernel_gradient_functions(kern, X=X, X2=X2, verbose=verbose, fixed_X_dims=-1)) +# k = GPy.kern.RBF(D) +# kern = GPy.kern.IndependentOutputs(k, -1, 'ind_single') +# assert(check_kernel_gradient_functions(kern, X=X, X2=X2, verbose=verbose, fixed_X_dims=-1)) diff --git a/GPy/testing/likelihood_tests.py b/GPy/testing/likelihood_tests.py index 7276e108..867851a7 100644 --- a/GPy/testing/likelihood_tests.py +++ b/GPy/testing/likelihood_tests.py @@ -112,7 +112,7 @@ class TestNoiseModels(object): self.f = None self.X = None - def test_noise_models(self): + def test_scale2_models(self): self.setUp() #################################################### @@ -150,64 +150,64 @@ class TestNoiseModels(object): noise_models = {"Student_t_default": { "model": GPy.likelihoods.StudentT(deg_free=5, sigma2=self.var), "grad_params": { - "names": [".*t_noise"], + "names": [".*t_scale2"], "vals": [self.var], - "constraints": [(".*t_noise", constrain_positive), (".*deg_free", constrain_fixed)] - #"constraints": [("t_noise", constrain_positive), ("deg_free", partial(constrain_fixed, value=5))] + "constraints": [(".*t_scale2", constrain_positive), (".*deg_free", constrain_fixed)] + #"constraints": [("t_scale2", constrain_positive), ("deg_free", partial(constrain_fixed, value=5))] }, "laplace": True }, "Student_t_1_var": { "model": GPy.likelihoods.StudentT(deg_free=5, sigma2=self.var), "grad_params": { - "names": [".*t_noise"], + "names": [".*t_scale2"], "vals": [1.0], - "constraints": [(".*t_noise", constrain_positive), (".*deg_free", constrain_fixed)] + "constraints": [(".*t_scale2", constrain_positive), (".*deg_free", constrain_fixed)] }, "laplace": True }, "Student_t_small_deg_free": { "model": GPy.likelihoods.StudentT(deg_free=1.5, sigma2=self.var), "grad_params": { - "names": [".*t_noise"], + "names": [".*t_scale2"], "vals": [self.var], - "constraints": [(".*t_noise", constrain_positive), (".*deg_free", constrain_fixed)] + "constraints": [(".*t_scale2", constrain_positive), (".*deg_free", constrain_fixed)] }, "laplace": True }, "Student_t_small_var": { "model": GPy.likelihoods.StudentT(deg_free=5, sigma2=self.var), 
"grad_params": { - "names": [".*t_noise"], + "names": [".*t_scale2"], "vals": [0.001], - "constraints": [(".*t_noise", constrain_positive), (".*deg_free", constrain_fixed)] + "constraints": [(".*t_scale2", constrain_positive), (".*deg_free", constrain_fixed)] }, "laplace": True }, "Student_t_large_var": { "model": GPy.likelihoods.StudentT(deg_free=5, sigma2=self.var), "grad_params": { - "names": [".*t_noise"], + "names": [".*t_scale2"], "vals": [10.0], - "constraints": [(".*t_noise", constrain_positive), (".*deg_free", constrain_fixed)] + "constraints": [(".*t_scale2", constrain_positive), (".*deg_free", constrain_fixed)] }, "laplace": True }, "Student_t_approx_gauss": { "model": GPy.likelihoods.StudentT(deg_free=1000, sigma2=self.var), "grad_params": { - "names": [".*t_noise"], + "names": [".*t_scale2"], "vals": [self.var], - "constraints": [(".*t_noise", constrain_positive), (".*deg_free", constrain_fixed)] + "constraints": [(".*t_scale2", constrain_positive), (".*deg_free", constrain_fixed)] }, "laplace": True }, "Student_t_log": { "model": GPy.likelihoods.StudentT(gp_link=link_functions.Log(), deg_free=5, sigma2=self.var), "grad_params": { - "names": [".*t_noise"], + "names": [".*t_scale2"], "vals": [self.var], - "constraints": [(".*t_noise", constrain_positive), (".*deg_free", constrain_fixed)] + "constraints": [(".*t_scale2", constrain_positive), (".*deg_free", constrain_fixed)] }, "laplace": True }, diff --git a/GPy/testing/model_tests.py b/GPy/testing/model_tests.py index 3c39c5e0..4d20035d 100644 --- a/GPy/testing/model_tests.py +++ b/GPy/testing/model_tests.py @@ -130,7 +130,14 @@ class MiscTests(unittest.TestCase): m2.kern[:] = m.kern[''].values() np.testing.assert_equal(m.log_likelihood(), m2.log_likelihood()) -class GradientTests(unittest.TestCase): + def test_model_optimize(self): + X = np.random.uniform(-3., 3., (20, 1)) + Y = np.sin(X) + np.random.randn(20, 1) * 0.05 + m = GPy.models.GPRegression(X,Y) + m.optimize() + print m + +class GradientTests(np.testing.TestCase): def setUp(self): ###################################### # # 1 dimensional example diff --git a/GPy/testing/observable_tests.py b/GPy/testing/observable_tests.py index 90623703..05794dc3 100644 --- a/GPy/testing/observable_tests.py +++ b/GPy/testing/observable_tests.py @@ -93,12 +93,12 @@ class Test(unittest.TestCase): def test_set_params(self): self.assertEqual(self.par.params_changed_count, 0, 'no params changed yet') - self.par._param_array_[:] = 1 + self.par.param_array[:] = 1 self.par._trigger_params_changed() self.assertEqual(self.par.params_changed_count, 1, 'now params changed') self.assertEqual(self.parent.parent_changed_count, self.par.params_changed_count) - self.par._param_array_[:] = 2 + self.par.param_array[:] = 2 self.par._trigger_params_changed() self.assertEqual(self.par.params_changed_count, 2, 'now params changed') self.assertEqual(self.parent.parent_changed_count, self.par.params_changed_count) diff --git a/GPy/testing/parameterized_tests.py b/GPy/testing/parameterized_tests.py index dc59449f..8bfaab4e 100644 --- a/GPy/testing/parameterized_tests.py +++ b/GPy/testing/parameterized_tests.py @@ -7,7 +7,7 @@ import unittest import GPy import numpy as np from GPy.core.parameterization.parameter_core import HierarchyError -from GPy.core.parameterization.array_core import ObsAr +from GPy.core.parameterization.observable_array import ObsAr class ArrayCoreTest(unittest.TestCase): def setUp(self): @@ -34,6 +34,7 @@ class ParameterizedTest(unittest.TestCase): self.param = Param('param', 
np.random.rand(25,2), Logistic(0, 1)) self.test1 = GPy.core.Parameterized("test model") + self.test1.param = self.param self.test1.kern = self.rbf+self.white self.test1.add_parameter(self.test1.kern) self.test1.add_parameter(self.param, 0) @@ -58,6 +59,9 @@ class ParameterizedTest(unittest.TestCase): self.assertListEqual(self.test1._fixes_.tolist(),[FIXED,UNFIXED,UNFIXED]) self.test1.kern.rbf.fix() self.assertListEqual(self.test1._fixes_.tolist(),[FIXED]*3) + self.test1.fix() + self.assertTrue(self.test1.is_fixed) + self.assertListEqual(self.test1._fixes_.tolist(),[FIXED]*self.test1.size) def test_remove_parameter(self): from GPy.core.parameterization.transformations import FIXED, UNFIXED, __fixed__, Logexp @@ -86,9 +90,9 @@ class ParameterizedTest(unittest.TestCase): self.assertListEqual(self.test1.constraints[Logexp()].tolist(), range(self.param.size, self.param.size+self.rbf.size)) def test_remove_parameter_param_array_grad_array(self): - val = self.test1.kern._param_array_.copy() + val = self.test1.kern.param_array.copy() self.test1.kern.remove_parameter(self.white) - self.assertListEqual(self.test1.kern._param_array_.tolist(), val[:2].tolist()) + self.assertListEqual(self.test1.kern.param_array.tolist(), val[:2].tolist()) def test_add_parameter_already_in_hirarchy(self): self.assertRaises(HierarchyError, self.test1.add_parameter, self.white._parameters_[0]) diff --git a/GPy/testing/pickle_tests.py b/GPy/testing/pickle_tests.py new file mode 100644 index 00000000..d975aaa3 --- /dev/null +++ b/GPy/testing/pickle_tests.py @@ -0,0 +1,207 @@ +''' +Created on 13 Mar 2014 + +@author: maxz +''' +import unittest, itertools +import cPickle as pickle +import numpy as np +from GPy.core.parameterization.index_operations import ParameterIndexOperations,\ + ParameterIndexOperationsView +import tempfile +from GPy.core.parameterization.param import Param +from GPy.core.parameterization.observable_array import ObsAr +from GPy.core.parameterization.priors import Gaussian +from GPy.kern._src.rbf import RBF +from GPy.kern._src.linear import Linear +from GPy.kern._src.static import Bias, White +from GPy.examples.dimensionality_reduction import mrd_simulation,\ + bgplvm_simulation +from GPy.examples.regression import toy_rbf_1d_50 +from GPy.core.parameterization.variational import NormalPosterior +from GPy.models.gp_regression import GPRegression + +class ListDictTestCase(unittest.TestCase): + def assertListDictEquals(self, d1, d2, msg=None): + for k,v in d1.iteritems(): + self.assertListEqual(list(v), list(d2[k]), msg) + def assertArrayListEquals(self, l1, l2): + for a1, a2 in itertools.izip(l1,l2): + np.testing.assert_array_equal(a1, a2) + +class Test(ListDictTestCase): + def test_parameter_index_operations(self): + pio = ParameterIndexOperations(dict(test1=np.array([4,3,1,6,4]), test2=np.r_[2:130])) + piov = ParameterIndexOperationsView(pio, 20, 250) + self.assertListDictEquals(dict(piov.items()), dict(piov.copy().iteritems())) + self.assertListDictEquals(dict(pio.iteritems()), dict(pio.copy().items())) + + self.assertArrayListEquals(pio.copy().indices(), pio.indices()) + self.assertArrayListEquals(piov.copy().indices(), piov.indices()) + + with tempfile.TemporaryFile('w+b') as f: + pickle.dump(pio, f) + f.seek(0) + pio2 = pickle.load(f) + self.assertListDictEquals(pio._properties, pio2._properties) + + with tempfile.TemporaryFile('w+b') as f: + pickle.dump(piov, f) + f.seek(0) + pio2 = pickle.load(f) + self.assertListDictEquals(dict(piov.items()), dict(pio2.iteritems())) + + def test_param(self): + 
param = Param('test', np.arange(4*2).reshape(4,2)) + param[0].constrain_positive() + param[1].fix() + param[2].set_prior(Gaussian(0,1)) + pcopy = param.copy() + self.assertListEqual(param.tolist(), pcopy.tolist()) + self.assertListEqual(str(param).split('\n'), str(pcopy).split('\n')) + self.assertIsNot(param, pcopy) + with tempfile.TemporaryFile('w+b') as f: + pickle.dump(param, f) + f.seek(0) + pcopy = pickle.load(f) + self.assertListEqual(param.tolist(), pcopy.tolist()) + self.assertSequenceEqual(str(param), str(pcopy)) + + def test_observable_array(self): + obs = ObsAr(np.arange(4*2).reshape(4,2)) + pcopy = obs.copy() + self.assertListEqual(obs.tolist(), pcopy.tolist()) + with tempfile.TemporaryFile('w+b') as f: + pickle.dump(obs, f) + f.seek(0) + pcopy = pickle.load(f) + self.assertListEqual(obs.tolist(), pcopy.tolist()) + self.assertSequenceEqual(str(obs), str(pcopy)) + + def test_parameterized(self): + par = RBF(1, active_dims=[1]) + Linear(2, active_dims=[0,2]) + Bias(3) + White(3) + par.gradient = 10 + par.randomize() + pcopy = par.copy() + self.assertIsInstance(pcopy.constraints, ParameterIndexOperations) + self.assertIsInstance(pcopy.rbf.constraints, ParameterIndexOperationsView) + self.assertIs(pcopy.constraints, pcopy.rbf.constraints._param_index_ops) + self.assertIs(pcopy.constraints, pcopy.rbf.lengthscale.constraints._param_index_ops) + self.assertIs(pcopy.constraints, pcopy.linear.constraints._param_index_ops) + self.assertListEqual(par.param_array.tolist(), pcopy.param_array.tolist()) + self.assertListEqual(par.full_gradient.tolist(), pcopy.full_gradient.tolist()) + self.assertSequenceEqual(str(par), str(pcopy)) + self.assertIsNot(par.param_array, pcopy.param_array) + self.assertIsNot(par.full_gradient, pcopy.full_gradient) + with tempfile.TemporaryFile('w+b') as f: + par.pickle(f) + f.seek(0) + pcopy = pickle.load(f) + self.assertListEqual(par.param_array.tolist(), pcopy.param_array.tolist()) + pcopy.gradient = 10 + np.testing.assert_allclose(par.linear.full_gradient, pcopy.linear.full_gradient) + np.testing.assert_allclose(pcopy.linear.full_gradient, 10) + self.assertSequenceEqual(str(par), str(pcopy)) + + def test_model(self): + par = toy_rbf_1d_50(optimize=0, plot=0) + pcopy = par.copy() + self.assertListEqual(par.param_array.tolist(), pcopy.param_array.tolist()) + self.assertListEqual(par.full_gradient.tolist(), pcopy.full_gradient.tolist()) + self.assertSequenceEqual(str(par), str(pcopy)) + self.assertIsNot(par.param_array, pcopy.param_array) + self.assertIsNot(par.full_gradient, pcopy.full_gradient) + self.assertTrue(pcopy.checkgrad()) + self.assert_(np.any(pcopy.gradient!=0.0)) + with tempfile.TemporaryFile('w+b') as f: + par.pickle(f) + f.seek(0) + pcopy = pickle.load(f) + self.assertListEqual(par.param_array.tolist(), pcopy.param_array.tolist()) + np.testing.assert_allclose(par.full_gradient, pcopy.full_gradient) + self.assertSequenceEqual(str(par), str(pcopy)) + self.assert_(pcopy.checkgrad()) + + def test_modelrecreation(self): + par = toy_rbf_1d_50(optimize=0, plot=0) + pcopy = GPRegression(par.X.copy(), par.Y.copy(), kernel=par.kern.copy()) + self.assertListEqual(par.param_array.tolist(), pcopy.param_array.tolist()) + self.assertListEqual(par.full_gradient.tolist(), pcopy.full_gradient.tolist()) + self.assertSequenceEqual(str(par), str(pcopy)) + self.assertIsNot(par.param_array, pcopy.param_array) + self.assertIsNot(par.full_gradient, pcopy.full_gradient) + self.assertTrue(pcopy.checkgrad()) + self.assert_(np.any(pcopy.gradient!=0.0)) + with 
tempfile.TemporaryFile('w+b') as f:
+            par.pickle(f)
+            f.seek(0)
+            pcopy = pickle.load(f)
+        self.assertListEqual(par.param_array.tolist(), pcopy.param_array.tolist())
+        np.testing.assert_allclose(par.full_gradient, pcopy.full_gradient)
+        self.assertSequenceEqual(str(par), str(pcopy))
+        self.assert_(pcopy.checkgrad())
+
+    def test_posterior(self):
+        X = np.random.randn(3,5)
+        Xv = np.random.rand(*X.shape)
+        par = NormalPosterior(X,Xv)
+        par.gradient = 10
+        pcopy = par.copy()
+        self.assertListEqual(par.param_array.tolist(), pcopy.param_array.tolist())
+        self.assertListEqual(par.full_gradient.tolist(), pcopy.full_gradient.tolist())
+        self.assertSequenceEqual(str(par), str(pcopy))
+        self.assertIsNot(par.param_array, pcopy.param_array)
+        self.assertIsNot(par.full_gradient, pcopy.full_gradient)
+        with tempfile.TemporaryFile('w+b') as f:
+            par.pickle(f)
+            f.seek(0)
+            pcopy = pickle.load(f)
+        self.assertListEqual(par.param_array.tolist(), pcopy.param_array.tolist())
+        pcopy.gradient = 10
+        np.testing.assert_allclose(par.full_gradient, pcopy.full_gradient)
+        np.testing.assert_allclose(pcopy.mean.full_gradient, 10)
+        self.assertSequenceEqual(str(par), str(pcopy))
+
+    def test_model_concat(self):
+        par = mrd_simulation(optimize=0, plot=0, plot_sim=0)
+        par.randomize()
+        pcopy = par.copy()
+        self.assertListEqual(par.param_array.tolist(), pcopy.param_array.tolist())
+        self.assertListEqual(par.full_gradient.tolist(), pcopy.full_gradient.tolist())
+        self.assertSequenceEqual(str(par), str(pcopy))
+        self.assertIsNot(par.param_array, pcopy.param_array)
+        self.assertIsNot(par.full_gradient, pcopy.full_gradient)
+        self.assertTrue(pcopy.checkgrad())
+        self.assert_(np.any(pcopy.gradient!=0.0))
+        with tempfile.TemporaryFile('w+b') as f:
+            par.pickle(f)
+            f.seek(0)
+            pcopy = pickle.load(f)
+        self.assertListEqual(par.param_array.tolist(), pcopy.param_array.tolist())
+        np.testing.assert_allclose(par.full_gradient, pcopy.full_gradient)
+        self.assertSequenceEqual(str(par), str(pcopy))
+        self.assert_(pcopy.checkgrad())
+
+    def _callback(self, what, which):
+        what.count += 1
+
+    def test_add_observer(self):
+        par = toy_rbf_1d_50(optimize=0, plot=0)
+        par.name = "original"
+        par.count = 0
+        par.add_observer(self, self._callback, 1)
+        pcopy = GPRegression(par.X.copy(), par.Y.copy(), kernel=par.kern.copy())
+        self.assertNotIn(par.observers[0], pcopy.observers)
+        pcopy = par.copy()
+        pcopy.name = "copy"
+        self.assertTrue(par.checkgrad())
+        self.assertTrue(pcopy.checkgrad())
+        self.assertTrue(pcopy.kern.checkgrad())
+        self.assertIn(par.observers[0], pcopy.observers)
+        self.assertEqual(par.count, 3)
+        self.assertEqual(pcopy.count, 6) # 3 for each call to checkgrad
+
+if __name__ == "__main__":
+    #import sys;sys.argv = ['', 'Test.test_parameter_index_operations']
+    unittest.main()
\ No newline at end of file
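A minimal sketch of the pickling round-trip the tests above exercise, using only
calls that appear in them (the model's own pickle() helper, plain pickle.load,
and toy_rbf_1d_50 from GPy.examples.regression):

    import cPickle as pickle
    import tempfile
    from GPy.examples.regression import toy_rbf_1d_50

    m = toy_rbf_1d_50(optimize=0, plot=0)
    with tempfile.TemporaryFile('w+b') as f:
        m.pickle(f)            # the model serializes itself to the open file
        f.seek(0)
        m2 = pickle.load(f)    # standard pickle restores an equivalent model
    assert m.param_array.tolist() == m2.param_array.tolist()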
diff --git a/GPy/util/caching.py b/GPy/util/caching.py
index 676c3ab8..bb162ee3 100644
--- a/GPy/util/caching.py
+++ b/GPy/util/caching.py
@@ -48,7 +48,7 @@ class Cacher(object):
             if k in kw and kw[k] is not None:
                 return self.operation(*args, **kw)
         # TODO: WARNING !!! Cache OFFSWITCH !!! WARNING
-        #return self.operation(*args)
+        # return self.operation(*args, **kw)
         #if the result is cached, return the cached computation
         state = [all(a is b for a, b in itertools.izip_longest(args, cached_i))
                  for cached_i in self.cached_inputs]
@@ -98,11 +98,20 @@ class Cacher(object):
         self.cached_outputs = []
         self.inputs_changed = []
 
+    def __deepcopy__(self, memo=None):
+        # a deep copy gets a fresh Cacher around the same operation, with an empty cache
+        return Cacher(self.operation, self.limit, self.ignore_args, self.force_kwargs)
+
+    def __getstate__(self):
+        raise NotImplementedError, "Trying to pickle Cacher object with function {}; pickling functions is not possible.".format(str(self.operation))
+
+    def __setstate__(self, state):
+        raise NotImplementedError, "Trying to pickle Cacher object with function {}; pickling functions is not possible.".format(str(self.operation))
+
     @property
     def __name__(self):
         return self.operation.__name__
 
-from functools import partial
+from functools import partial, update_wrapper
 
 class Cacher_wrap(object):
     def __init__(self, f, limit, ignore_args, force_kwargs):
@@ -110,6 +119,7 @@
         self.ignore_args = ignore_args
         self.force_kwargs = force_kwargs
         self.f = f
+        update_wrapper(self, self.f)
     def __get__(self, obj, objtype=None):
         return partial(self, obj)
     def __call__(self, *args, **kwargs):
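The deepcopy/pickle split above is deliberate: a copied Cacher starts with an
empty cache, while pickling is refused because the wrapped callable (usually a
bound method) cannot be serialized. A short sketch; the constructor call is
taken from __deepcopy__ above, everything else is standard library:

    import copy
    import cPickle as pickle
    from GPy.util.caching import Cacher

    def op(x):
        return x ** 2

    c = Cacher(op, 5, (), ())    # (operation, limit, ignore_args, force_kwargs)
    c2 = copy.deepcopy(c)        # fine: a fresh Cacher with an empty cache
    try:
        pickle.dumps(c)          # raises: __getstate__ refuses by design
    except NotImplementedError:
        pass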
diff --git a/GPy/util/datasets.py b/GPy/util/datasets.py
index 54e42733..f53163f4 100644
--- a/GPy/util/datasets.py
+++ b/GPy/util/datasets.py
@@ -331,7 +331,7 @@ def football_data(season='1314', data_set='football_data'):
 
 # This will be for downloading google trends data.
 def google_trends(query_terms=['big data', 'machine learning', 'data science'], data_set='google_trends'):
-    """Data downloaded from Google trends for given query terms."""
+    """Data downloaded from Google trends for given query terms. Warning: calling this function many times in a row may get you blocked by Google for violating its terms of service."""
     # Inspired by this notebook:
     # http://nbviewer.ipython.org/github/sahuguet/notebooks/blob/master/GoogleTrends%20meet%20Notebook.ipynb
diff --git a/GPy/util/functions.py b/GPy/util/functions.py
new file mode 100644
index 00000000..a9ee1084
--- /dev/null
+++ b/GPy/util/functions.py
@@ -0,0 +1,18 @@
+import numpy as np
+from scipy.special import erf, erfcx
+import sys
+
+epsilon = sys.float_info.epsilon
+lim_val = -np.log(epsilon)
+
+def cum_gaussian(x):
+    # standard normal CDF, clipped away from exactly 0 and 1 so logs stay finite
+    g = 0.5*(1 + erf(x/np.sqrt(2)))
+    return np.where(g == 0, epsilon, np.where(g == 1, 1 - epsilon, g))
+
+def ln_cum_gaussian(x):
+    # for x < 0, log Phi(x) = -x**2/2 + log(.5) + log(erfcx(-x/sqrt(2))),
+    # which stays accurate far into the left tail
+    return np.where(x < 0, -.5*x*x + np.log(.5) + np.log(erfcx(-np.sqrt(2)/2*x)), np.log(cum_gaussian(x)))
+
+def clip_exp(x):
+    # clip the argument to [-lim_val, lim_val] before exponentiating,
+    # so the result neither overflows nor underflows to exactly 0
+    if np.any(x >= lim_val) or np.any(x <= -lim_val):
+        return np.exp(np.clip(x, -lim_val, lim_val))
+    return np.exp(x)
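Why the erfcx branch matters: a direct np.log of the clipped CDF saturates at
log(epsilon) (about -36.04), while the erfcx form keeps following the true log
CDF into the tail. An illustrative check (values are what scipy produces):

    import numpy as np
    from GPy.util.functions import cum_gaussian, ln_cum_gaussian

    x = np.array([-40.0])
    print np.log(cum_gaussian(x))   # ~ -36.04, floored at log(epsilon)
    print ln_cum_gaussian(x)        # ~ -804.6, the true value of log Phi(-40)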
+ """ + def __init__(self, X): + self.mu = X.mean(0) + self.sigma = X.std(0) + + X = self.center(X) + + # self.X = input + if X.shape[0] >= X.shape[1]: + # print "N >= D: using primal" + self.eigvals, self.eigvectors = self._primal_eig(X) + else: + # print "N < D: using dual" + self.eigvals, self.eigvectors = self._dual_eig(X) + self.sort = numpy.argsort(self.eigvals)[::-1] + self.eigvals = self.eigvals[self.sort] + self.eigvectors = self.eigvectors[:, self.sort] + self.fracs = self.eigvals / self.eigvals.sum() + self.Q = self.eigvals.shape[0] + + def center(self, X): + """ + Center `X` in pca space. + """ + X = X - self.mu + X = X / numpy.where(self.sigma == 0, 1e-30, self.sigma) + return X + + def _primal_eig(self, X): + return numpy.linalg.eigh(numpy.einsum('ji,jk->ik',X,X)) + + def _dual_eig(self, X): + dual_eigvals, dual_eigvects = numpy.linalg.eigh(numpy.einsum('ij,kj->ik',X,X)) + relevant_dimensions = numpy.argsort(numpy.abs(dual_eigvals))[-X.shape[1]:] + eigvals = dual_eigvals[relevant_dimensions] + eigvects = dual_eigvects[:, relevant_dimensions] + eigvects = (1. / numpy.sqrt(X.shape[0] * numpy.abs(eigvals))) * X.T.dot(eigvects) + eigvects /= numpy.sqrt(numpy.diag(eigvects.T.dot(eigvects))) + return eigvals, eigvects + + def project(self, X, Q=None): + """ + Project X into pca space, defined by the Q highest eigenvalues. + Y = X dot V + """ + if Q is None: + Q = self.Q + if Q > X.shape[1]: + raise IndexError("requested dimension larger then input dimension") + X = self.center(X) + return X.dot(self.eigvectors[:, :Q]) + + def plot_fracs(self, Q=None, ax=None, fignum=None): + """ + Plot fractions of Eigenvalues sorted in descending order. + """ + if ax is None: + fig = pylab.figure(fignum) + ax = fig.add_subplot(111) + if Q is None: + Q = self.Q + ticks = numpy.arange(Q) + bar = ax.bar(ticks - .4, self.fracs[:Q]) + ax.set_xticks(ticks, map(lambda x: r"${}$".format(x), ticks + 1)) + ax.set_ylabel("Eigenvalue fraction") + ax.set_xlabel("PC") + ax.set_ylim(0, ax.get_ylim()[1]) + ax.set_xlim(ticks.min() - .5, ticks.max() + .5) + try: + pylab.tight_layout() + except: + pass + return bar + + def plot_2d(self, X, labels=None, s=20, marker='o', + dimensions=(0, 1), ax=None, colors=None, + fignum=None, cmap=matplotlib.cm.jet, # @UndefinedVariable + ** kwargs): + """ + Plot dimensions `dimensions` with given labels against each other in + PC space. Labels can be any sequence of labels of dimensions X.shape[0]. 
diff --git a/GPy/util/symbolic.py b/GPy/util/symbolic.py
index 49c8c33a..5b3ac312 100644
--- a/GPy/util/symbolic.py
+++ b/GPy/util/symbolic.py
@@ -1,5 +1,55 @@
-from sympy import Function, S, oo, I, cos, sin, asin, log, erf, pi, exp, sqrt, sign
+from sympy import Function, S, oo, I, cos, sin, asin, log, erf, pi, exp, sqrt, sign, gamma, polygamma
 
+class gammaln(Function):
+    nargs = 1
+
+    def fdiff(self, argindex=1):
+        # d/dx log Gamma(x) is the digamma function polygamma(0, x)
+        x = self.args[0]
+        return polygamma(0, x)
+
+    @classmethod
+    def eval(cls, x):
+        if x.is_Number:
+            return log(gamma(x))
+
+class ln_cum_gaussian(Function):
+    nargs = 1
+
+    def fdiff(self, argindex=1):
+        # d/dx log Phi(x) = phi(x)/Phi(x), written as exp(-log Phi(x) - x**2/2)/sqrt(2*pi)
+        x = self.args[0]
+        return exp(-ln_cum_gaussian(x) - 0.5*x*x)/sqrt(2*pi)
+
+    @classmethod
+    def eval(cls, x):
+        if x.is_Number:
+            return log(cum_gaussian(x))
+
+    def _eval_is_real(self):
+        return self.args[0].is_real
+
+class cum_gaussian(Function):
+    nargs = 1
+
+    def fdiff(self, argindex=1):
+        # d/dx Phi(x) is the standard normal density
+        x = self.args[0]
+        return gaussian(x)
+
+    @classmethod
+    def eval(cls, x):
+        if x.is_Number:
+            return 0.5*(1 + erf(sqrt(2)/2*x))
+
+    def _eval_is_real(self):
+        return self.args[0].is_real
+
+class gaussian(Function):
+    nargs = 1
+
+    @classmethod
+    def eval(cls, x):
+        # always evaluates straight to the density expression
+        return 1/sqrt(2*pi)*exp(-0.5*x*x)
+
+    def _eval_is_real(self):
+        return True
 
 class ln_diff_erf(Function):
     nargs = 2
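The derivative chain these sympy classes encode, written out: with
phi(x) = exp(-x**2/2)/sqrt(2*pi) the normal density and Phi(x) the normal CDF,

    d/dx Phi(x)     = phi(x)
    d/dx log Phi(x) = phi(x)/Phi(x) = exp(log phi(x) - log Phi(x))

which is exactly the exp(-ln_cum_gaussian(x) - 0.5*x*x)/sqrt(2*pi) returned by
fdiff above. A short interactive check; a sketch assuming a sympy version
compatible with this Function API:

    from sympy import Symbol
    from GPy.util.symbolic import cum_gaussian, ln_cum_gaussian

    x = Symbol('x', real=True)
    print cum_gaussian(x).diff(x)
    # -> sqrt(2)*exp(-x**2/2)/(2*sqrt(pi)), since gaussian() auto-evaluates
    print ln_cum_gaussian(x).diff(x)
    # -> exp(-x**2/2 - ln_cum_gaussian(x))/sqrt(2*pi)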