diff --git a/GPy/core/__init__.py b/GPy/core/__init__.py
index 839529d6..a42d76ed 100644
--- a/GPy/core/__init__.py
+++ b/GPy/core/__init__.py
@@ -2,7 +2,9 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
 from model import *
-from parameterization.parameterized import *
+from parameterization.parameterized import adjust_name_for_printing, Parameterizable
+from parameterization.param import Param, ParamConcatenation
+
 from gp import GP
 from sparse_gp import SparseGP
 from svigp import SVIGP
diff --git a/GPy/core/model.py b/GPy/core/model.py
index 55083aaf..c067d51d 100644
--- a/GPy/core/model.py
+++ b/GPy/core/model.py
@@ -4,12 +4,8 @@
 
 from .. import likelihoods
 from ..inference import optimization
-from ..util.linalg import jitchol
 from ..util.misc import opt_wrapper
 from parameterization import Parameterized
-from parameterization.parameterized import UNFIXED
-from parameterization.domains import _POSITIVE, _REAL
-from parameterization.index_operations import ParameterIndexOperations
 import multiprocessing as mp
 import numpy as np
 from numpy.linalg.linalg import LinAlgError
@@ -240,7 +236,7 @@ class Model(Parameterized):
         constrained positive.
         """
         raise DeprecationWarning, 'parameters now have default constraints'
-        positive_strings = ['variance', 'lengthscale', 'precision', 'kappa', 'sensitivity']
+        #positive_strings = ['variance', 'lengthscale', 'precision', 'kappa', 'sensitivity']
         # param_names = self._get_param_names()
         
 #         for s in positive_strings:
diff --git a/GPy/core/parameterization/array_core.py b/GPy/core/parameterization/array_core.py
index 9f2c7ae6..7892e94a 100644
--- a/GPy/core/parameterization/array_core.py
+++ b/GPy/core/parameterization/array_core.py
@@ -36,6 +36,8 @@ class ObservableArray(np.ndarray, Observable):
         # see InfoArray.__array_finalize__ for comments
         if obj is None: return
         self._observers_ = getattr(obj, '_observers_', None)
+    def __array_wrap__(self, out_arr, context=None):
+        return out_arr.view(np.ndarray)
 
     def _s_not_empty(self, s):
         # this checks whether there is something picked by this slice.
@@ -68,11 +70,6 @@ class ObservableArray(np.ndarray, Observable):
     def copy(self, *args):
         return self.__copy__(*args)
 
-    def __ror__(self, *args, **kwargs):
-        r =  np.ndarray.__ror__(self, *args, **kwargs)
-        self._notify_observers()
-        return r
-
     def __ilshift__(self, *args, **kwargs):
         r = np.ndarray.__ilshift__(self, *args, **kwargs)
         self._notify_observers()
@@ -83,71 +80,19 @@ class ObservableArray(np.ndarray, Observable):
         self._notify_observers()
         return r
 
-    def __rrshift__(self, *args, **kwargs):
-        r = np.ndarray.__rrshift__(self, *args, **kwargs)
-        self._notify_observers()
-        return r
 
     def __ixor__(self, *args, **kwargs):
         r = np.ndarray.__ixor__(self, *args, **kwargs)
         self._notify_observers()
         return r
 
-    def __rxor__(self, *args, **kwargs):
-        r = np.ndarray.__rxor__(self, *args, **kwargs)
-        self._notify_observers()
-        return r
-
-
-
-    def __rdivmod__(self, *args, **kwargs):
-        r = np.ndarray.__rdivmod__(self, *args, **kwargs)
-        self._notify_observers()
-        return r
-
-
-    def __radd__(self, *args, **kwargs):
-        r = np.ndarray.__radd__(self, *args, **kwargs)
-        self._notify_observers()
-        return r
-
-
-    def __rdiv__(self, *args, **kwargs):
-        r = np.ndarray.__rdiv__(self, *args, **kwargs)
-        self._notify_observers()
-        return r
-
-
-    def __rtruediv__(self, *args, **kwargs):
-        r = np.ndarray.__rtruediv__(self, *args, **kwargs)
-        self._notify_observers()
-        return r
-
 
     def __ipow__(self, *args, **kwargs):
         r = np.ndarray.__ipow__(self, *args, **kwargs)
         self._notify_observers()
         return r
 
-
-    def __rmul__(self, *args, **kwargs):
-        r = np.ndarray.__rmul__(self, *args, **kwargs)
-        self._notify_observers()
-        return r
-
-
-    def __rpow__(self, *args, **kwargs):
-        r = np.ndarray.__rpow__(self, *args, **kwargs)
-        self._notify_observers()
-        return r
-
-
-    def __rsub__(self, *args, **kwargs):
-        r = np.ndarray.__rsub__(self, *args, **kwargs)
-        self._notify_observers()
-        return r
-
-
+    
     def __ifloordiv__(self, *args, **kwargs):
         r = np.ndarray.__ifloordiv__(self, *args, **kwargs)
         self._notify_observers()
@@ -178,12 +123,6 @@ class ObservableArray(np.ndarray, Observable):
         return r
 
 
-    def __rfloordiv__(self, *args, **kwargs):
-        r = np.ndarray.__rfloordiv__(self, *args, **kwargs)
-        self._notify_observers()
-        return r
-
-
     def __iand__(self, *args, **kwargs):
         r = np.ndarray.__iand__(self, *args, **kwargs)
         self._notify_observers()
@@ -208,8 +147,74 @@ class ObservableArray(np.ndarray, Observable):
         return r
 
 
-    def __rshift__(self, *args, **kwargs):
-        r = np.ndarray.__rshift__(self, *args, **kwargs)
-        self._notify_observers()
-        return r
+#     def __rrshift__(self, *args, **kwargs):
+#         r = np.ndarray.__rrshift__(self, *args, **kwargs)
+#         self._notify_observers()
+#         return r
+
+
+#     def __ror__(self, *args, **kwargs):
+#         r =  np.ndarray.__ror__(self, *args, **kwargs)
+#         self._notify_observers()
+#         return r
+
+
+#     def __rxor__(self, *args, **kwargs):
+#         r = np.ndarray.__rxor__(self, *args, **kwargs)
+#         self._notify_observers()
+#         return r
+
+
+
+#     def __rdivmod__(self, *args, **kwargs):
+#         r = np.ndarray.__rdivmod__(self, *args, **kwargs)
+#         self._notify_observers()
+#         return r
+
+
+#     def __radd__(self, *args, **kwargs):
+#         r = np.ndarray.__radd__(self, *args, **kwargs)
+#         self._notify_observers()
+#         return r
+
+
+#     def __rdiv__(self, *args, **kwargs):
+#         r = np.ndarray.__rdiv__(self, *args, **kwargs)
+#         self._notify_observers()
+#         return r
+
+
+#     def __rtruediv__(self, *args, **kwargs):
+#         r = np.ndarray.__rtruediv__(self, *args, **kwargs)
+#         self._notify_observers()
+#         return r
+
+
+#     def __rshift__(self, *args, **kwargs):
+#         r = np.ndarray.__rshift__(self, *args, **kwargs)
+#         self._notify_observers()
+#         return r
+
+
+#     def __rmul__(self, *args, **kwargs):
+#         r = np.ndarray.__rmul__(self, *args, **kwargs)
+#         self._notify_observers()
+#         return r
+
+
+#     def __rpow__(self, *args, **kwargs):
+#         r = np.ndarray.__rpow__(self, *args, **kwargs)
+#         self._notify_observers()
+#         return r
+
+
+#     def __rsub__(self, *args, **kwargs):
+#         r = np.ndarray.__rsub__(self, *args, **kwargs)
+#         self._notify_observers()
+#         return r
+
+#     def __rfloordiv__(self, *args, **kwargs):
+#         r = np.ndarray.__rfloordiv__(self, *args, **kwargs)
+#         self._notify_observers()
+#         return r
 
diff --git a/GPy/core/parameterization/param.py b/GPy/core/parameterization/param.py
index 37e710a8..75d9faf2 100644
--- a/GPy/core/parameterization/param.py
+++ b/GPy/core/parameterization/param.py
@@ -3,7 +3,7 @@
 
 import itertools
 import numpy
-from parameter_core import Constrainable, Gradcheckable, Indexable, Parameterizable, adjust_name_for_printing
+from parameter_core import Constrainable, Gradcheckable, Indexable, Parentable, adjust_name_for_printing
 from array_core import ObservableArray, ParamList
 
 ###### printing
@@ -15,7 +15,7 @@ __precision__ = numpy.get_printoptions()['precision'] # numpy printing precision
 __print_threshold__ = 5
 ######
 
-class Param(ObservableArray, Constrainable, Gradcheckable, Indexable, Parameterizable):
+class Param(ObservableArray, Constrainable, Gradcheckable, Indexable, Parentable):
     """
     Parameter object for GPy models.
 
@@ -80,8 +80,6 @@ class Param(ObservableArray, Constrainable, Gradcheckable, Indexable, Parameteri
         self.constraints = getattr(obj, 'constraints', None)
         self.priors = getattr(obj, 'priors', None)
 
-    #def __array_wrap__(self, out_arr, context=None):
-    #    return out_arr.view(numpy.ndarray)
     #===========================================================================
     # Pickling operations
     #===========================================================================
@@ -116,7 +114,14 @@ class Param(ObservableArray, Constrainable, Gradcheckable, Indexable, Parameteri
         self._parent_index_ = state.pop()
         self._direct_parent_ = state.pop()
         self.name = state.pop()
-
+    
+    def copy(self, *args):
+        constr = self.constraints.copy()
+        priors = self.priors.copy()
+        p = Param(self.name, self.view(numpy.ndarray).copy(), self._default_constraint_)
+        p.constraints = constr
+        p.priors = priors
+        return p
     #===========================================================================
     # get/set parameters
     #===========================================================================
@@ -216,7 +221,9 @@ class Param(ObservableArray, Constrainable, Gradcheckable, Indexable, Parameteri
     def _description_str(self):
         if self.size <= 1: return ["%f" % self]
         else: return [str(self.shape)]
-    def parameter_names(self, add_name=False):
+    def parameter_names(self, add_self=False, adjust_for_printing=False):
+        if adjust_for_printing:
+            return [adjust_name_for_printing(self.name)]
         return [self.name]
     @property
     def flattened_parameters(self):
@@ -233,14 +240,9 @@ class Param(ObservableArray, Constrainable, Gradcheckable, Indexable, Parameteri
     @property
     def _ties_str(self):
         return [t._short() for t in self._tied_to_] or ['']
-    @property
-    def name_hirarchical(self):
-        if self.has_parent():
-            return self._direct_parent_.hirarchy_name() + adjust_name_for_printing(self.name)
-        return adjust_name_for_printing(self.name)
     def __repr__(self, *args, **kwargs):
         name = "\033[1m{x:s}\033[0;0m:\n".format(
-                            x=self.name_hirarchical)
+                            x=self.hirarchy_name())
         return name + super(Param, self).__repr__(*args, **kwargs)
     def _ties_for(self, rav_index):
         # size = sum(p.size for p in self._tied_to_)
@@ -274,12 +276,12 @@ class Param(ObservableArray, Constrainable, Gradcheckable, Indexable, Parameteri
         gen = map(lambda x: " ".join(map(str, x)), gen)
         return reduce(lambda a, b:max(a, len(b)), gen, len(header))
     def _max_len_values(self):
-        return reduce(lambda a, b:max(a, len("{x:=.{0}g}".format(__precision__, x=b))), self.flat, len(self.name_hirarchical))
+        return reduce(lambda a, b:max(a, len("{x:=.{0}g}".format(__precision__, x=b))), self.flat, len(self.hirarchy_name()))
     def _max_len_index(self, ind):
         return reduce(lambda a, b:max(a, len(str(b))), ind, len(__index_name__))
     def _short(self):
         # short string to print
-        name = self._direct_parent_.hirarchy_name() + adjust_name_for_printing(self.name)
+        name = self.hirarchy_name()
         if self._realsize_ < 2:
             return name
         ind = self._indices()
@@ -302,8 +304,8 @@ class Param(ObservableArray, Constrainable, Gradcheckable, Indexable, Parameteri
         if lp is None: lp = self._max_len_names(prirs, __tie_name__)
         sep = '-'
         header_format = "  {i:{5}^{2}s}  |  \033[1m{x:{5}^{1}s}\033[0;0m  |  {c:{5}^{0}s}  |  {p:{5}^{4}s}  |  {t:{5}^{3}s}"
-        if only_name: header = header_format.format(lc, lx, li, lt, lp, ' ', x=self.name_hirarchical, c=sep*lc, i=sep*li, t=sep*lt, p=sep*lp)  # nice header for printing
-        else: header = header_format.format(lc, lx, li, lt, lp, ' ', x=self.name_hirarchical, c=__constraints_name__, i=__index_name__, t=__tie_name__, p=__priors_name__)  # nice header for printing
+        if only_name: header = header_format.format(lc, lx, li, lt, lp, ' ', x=self.hirarchy_name(), c=sep*lc, i=sep*li, t=sep*lt, p=sep*lp)  # nice header for printing
+        else: header = header_format.format(lc, lx, li, lt, lp, ' ', x=self.hirarchy_name(), c=__constraints_name__, i=__index_name__, t=__tie_name__, p=__priors_name__)  # nice header for printing
         if not ties: ties = itertools.cycle([''])
         return "\n".join([header] + ["  {i!s:^{3}s}  |  {x: >{1}.{2}g}  |  {c:^{0}s}  |  {p:^{5}s}  |  {t:^{4}s}  ".format(lc, lx, __precision__, li, lt, lp, x=x, c=" ".join(map(str, c)), p=" ".join(map(str, p)), t=(t or ''), i=i) for i, x, c, t, p in itertools.izip(indices, vals, constr_matrix, ties, prirs)])  # return all the constraints with right indices
         # except: return super(Param, self).__str__()
@@ -335,19 +337,20 @@ class ParamConcatenation(object):
         if len(params)==1: return params[0]
         return ParamConcatenation(params)
     def __setitem__(self, s, val, update=True):
+        if isinstance(val, ParamConcatenation):
+            val = val._vals()
         ind = numpy.zeros(sum(self._param_sizes), dtype=bool); ind[s] = True;
         vals = self._vals(); vals[s] = val; del val
-        [numpy.place(p, ind[ps], vals[ps])# and p._notify_tied_parameters()
+        [numpy.place(p, ind[ps], vals[ps]) and update and p._notify_parameters_changed()
          for p, ps in zip(self.params, self._param_slices_)]
-        if update:
-            self.params[0]._notify_parameters_changed()
     def _vals(self):
         return numpy.hstack([p._get_params() for p in self.params])
     #===========================================================================
     # parameter operations:
     #===========================================================================
     def update_all_params(self):
-        self.params[0]._notify_parameters_changed()
+        for p in self.params:
+            p._notify_parameters_changed()
 
     def constrain(self, constraint, warning=True):
         [param.constrain(constraint, update=False) for param in self.params]
diff --git a/GPy/core/parameterization/parameter_core.py b/GPy/core/parameterization/parameter_core.py
index 275198b2..9a10f317 100644
--- a/GPy/core/parameterization/parameter_core.py
+++ b/GPy/core/parameterization/parameter_core.py
@@ -68,6 +68,10 @@ class Parentable(object):
             return self
         return self._direct_parent_._highest_parent_
 
+    def _notify_parameters_changed(self):
+        if self.has_parent():
+            self._direct_parent_._notify_parameters_changed()
+
 class Nameable(Parentable):
     _name = None
     def __init__(self, name, direct_parent=None, parent_index=None):
@@ -80,22 +84,56 @@ class Nameable(Parentable):
     @name.setter
     def name(self, name):
         from_name = self.name
+        assert isinstance(name, str)
         self._name = name
         if self.has_parent():
-            self._direct_parent_._name_changed(self, from_name)    
-
+            self._direct_parent_._name_changed(self, from_name)
+    def hirarchy_name(self, adjust_for_printing=True):
+        if adjust_for_printing: adjust = lambda x: adjust_name_for_printing(x)
+        else: adjust = lambda x: x
+        if self.has_parent():
+            return self._direct_parent_.hirarchy_name() + "." + adjust(self.name)
+        return adjust(self.name)
 
 class Parameterizable(Parentable):
     def __init__(self, *args, **kwargs):
         super(Parameterizable, self).__init__(*args, **kwargs)
         from GPy.core.parameterization.array_core import ParamList
         _parameters_ = ParamList()
+        self._added_names_ = set()
     
-    def parameter_names(self, add_name=False):
-        if add_name:
-            return [adjust_name_for_printing(self.name) + "." + xi for x in self._parameters_ for xi in x.parameter_names(add_name=True)]
-        return [xi for x in self._parameters_ for xi in x.parameter_names(add_name=True)]
+    def parameter_names(self, add_self=False, adjust_for_printing=False, recursive=True):
+        if adjust_for_printing: adjust = lambda x: adjust_name_for_printing(x)
+        else: adjust = lambda x: x
+        if recursive: names = [xi for x in self._parameters_ for xi in x.parameter_names(add_self=True, adjust_for_printing=adjust_for_printing)]
+        else: names = [adjust(x.name) for x in self._parameters_]
+        if add_self: names = map(lambda x: adjust(self.name) + "." + x, names)
+        return names
+    
+    def _add_parameter_name(self, param):
+        pname = adjust_name_for_printing(param.name)
+        # and makes sure to not delete programmatically added parameters
+        if pname in self.__dict__:
+            if not (param is self.__dict__[pname]):
+                if pname in self._added_names_:
+                    del self.__dict__[pname]
+                    self._add_parameter_name(param)
+        else:
+            self.__dict__[pname] = param
+            self._added_names_.add(pname)
+            
+    def _remove_parameter_name(self, param=None, pname=None):
+        assert param is None or pname is None, "can only delete either param by name, or the name of a param"
+        pname = adjust_name_for_printing(pname) or adjust_name_for_printing(param.name)
+        if pname in self._added_names_:
+            del self.__dict__[pname]
+            self._added_names_.remove(pname)
+        self._connect_parameters()
 
+    def _name_changed(self, param, old_name):
+        self._remove_parameter_name(None, old_name)
+        self._add_parameter_name(param)
+            
     def _collect_gradient(self, target):
         import itertools
         [p._collect_gradient(target[s]) for p, s in itertools.izip(self._parameters_, self._param_slices_)]
@@ -113,6 +151,38 @@ class Parameterizable(Parentable):
         [p._set_params(params[s], update=update) for p, s in itertools.izip(self._parameters_, self._param_slices_)]
         self.parameters_changed()
 
+    def copy(self):
+        """Returns a (deep) copy of the current model"""
+        import copy
+        from .index_operations import ParameterIndexOperations, ParameterIndexOperationsView
+        from .array_core import ParamList
+        dc = dict()
+        for k, v in self.__dict__.iteritems():
+            if k not in ['_direct_parent_', '_parameters_', '_parent_index_'] + self.parameter_names():
+                if isinstance(v, (Constrainable, ParameterIndexOperations, ParameterIndexOperationsView)):
+                    dc[k] = v.copy()
+                else:
+                    dc[k] = copy.deepcopy(v)
+            if k == '_parameters_':
+                params = [p.copy() for p in v]
+        #dc = copy.deepcopy(self.__dict__)
+        dc['_direct_parent_'] = None
+        dc['_parent_index_'] = None
+        dc['_parameters_'] = ParamList()
+        s = self.__new__(self.__class__)
+        s.__dict__ = dc
+        #import ipdb;ipdb.set_trace()
+        for p in params:
+            s.add_parameter(p)
+        #dc._notify_parent_change()
+        return s
+        #return copy.deepcopy(self)
+
+    def _notify_parameters_changed(self):
+        self.parameters_changed()
+        if self.has_parent():
+            self._direct_parent_._notify_parameters_changed()
+
     def parameters_changed(self):
         """
         This method gets called when parameters have changed.
@@ -122,11 +192,6 @@ class Parameterizable(Parentable):
         """
         pass
 
-    def _notify_parameters_changed(self):
-        self.parameters_changed()
-        if self.has_parent():
-            self._direct_parent_._notify_parameters_changed()
-
 
 class Gradcheckable(Parentable):
     #===========================================================================
@@ -157,7 +222,7 @@ class Indexable(object):
         """
         raise NotImplementedError, "shouldnt happen, raveld index transformation required from non parameterization object?"        
         
-class Constrainable(Nameable, Indexable, Parameterizable):
+class Constrainable(Nameable, Indexable, Parentable):
     def __init__(self, name, default_constraint=None):
         super(Constrainable,self).__init__(name)
         self._default_constraint_ = default_constraint
@@ -167,6 +232,16 @@ class Constrainable(Nameable, Indexable, Parameterizable):
         if self._default_constraint_ is not None:
             self.constrain(self._default_constraint_)
     
+    def _disconnect_parent(self, constr=None):
+        if constr is None:
+            constr = self.constraints.copy()
+        self.constraints.clear()
+        self.constraints = constr
+        self._direct_parent_ = None
+        self._parent_index_ = None
+        self._connect_fixes()
+        self._notify_parent_change()
+        
     #===========================================================================
     # Fixing Parameters:
     #===========================================================================
diff --git a/GPy/core/parameterization/parameterized.py b/GPy/core/parameterization/parameterized.py
index c8a841c0..12bf936c 100644
--- a/GPy/core/parameterization/parameterized.py
+++ b/GPy/core/parameterization/parameterized.py
@@ -3,16 +3,15 @@
 
 
 import numpy; np = numpy
-import copy
 import cPickle
 import itertools
 from re import compile, _pattern_type
-from param import ParamConcatenation, Param
-from parameter_core import Constrainable, Pickleable, Observable, adjust_name_for_printing, Gradcheckable
-from transformations import __fixed__, FIXED, UNFIXED
+from param import ParamConcatenation
+from parameter_core import Constrainable, Pickleable, Observable, Parameterizable, Parentable, adjust_name_for_printing, Gradcheckable
+from transformations import __fixed__
 from array_core import ParamList
 
-class Parameterized(Constrainable, Pickleable, Observable, Gradcheckable):
+class Parameterized(Constrainable, Pickleable, Observable, Gradcheckable, Parameterizable, Parentable):
     """
     Parameterized class
 
@@ -63,7 +62,6 @@ class Parameterized(Constrainable, Pickleable, Observable, Gradcheckable):
             self._fixes_ = None
         self._param_slices_ = []
         self._connect_parameters()
-        self._added_names_ = set()
         del self._in_init_
 
     def add_parameter(self, param, index=None):
@@ -117,17 +115,10 @@ class Parameterized(Constrainable, Pickleable, Observable, Gradcheckable):
             raise RuntimeError, "Parameter {} does not belong to this object, remove parameters directly from their respective parents".format(param._short())
         del self._parameters_[param._parent_index_]
         self.size -= param.size
-        constr = param.constraints.copy()
-        param.constraints.clear()
-        param.constraints = constr
-        param._direct_parent_ = None
-        param._parent_index_ = None
-        param._connect_fixes()
-        param._notify_parent_change()
-        pname = adjust_name_for_printing(param.name)
-        if pname in self._added_names_:
-            del self.__dict__[pname]
-        self._connect_parameters()
+        
+        param._disconnect_parent()
+        self._remove_parameter_name(param)
+        
         #self._notify_parent_change()
         self._connect_fixes()
 
@@ -145,19 +136,9 @@ class Parameterized(Constrainable, Pickleable, Observable, Gradcheckable):
         for i, p in enumerate(self._parameters_):
             p._direct_parent_ = self
             p._parent_index_ = i
-            not_unique = []
             sizes.append(p.size + sizes[-1])
             self._param_slices_.append(slice(sizes[-2], sizes[-1]))
-            pname = adjust_name_for_printing(p.name)
-            # and makes sure to not delete programmatically added parameters
-            if pname in self.__dict__:
-                if isinstance(self.__dict__[pname], (Parameterized, Param)):
-                    if not p is self.__dict__[pname]:
-                        not_unique.append(pname)
-                        del self.__dict__[pname]
-            elif not (pname in not_unique):
-                self.__dict__[pname] = p
-                self._added_names_.add(pname)
+            self._add_parameter_name(p)
 
     #===========================================================================
     # Pickling operations
@@ -174,19 +155,7 @@ class Parameterized(Constrainable, Pickleable, Observable, Gradcheckable):
                 cPickle.dump(self, f, protocol)
         else:
             cPickle.dump(self, f, protocol)
-    def copy(self):
-        """Returns a (deep) copy of the current model """
-        # dc = dict()
-        # for k, v in self.__dict__.iteritems():
-            # if k not in ['_highest_parent_', '_direct_parent_']:
-                # dc[k] = copy.deepcopy(v)
 
-        # dc = copy.deepcopy(self.__dict__)
-        # dc['_highest_parent_'] = None
-        # dc['_direct_parent_'] = None
-        # s = self.__class__.new()
-        # s.__dict__ = dc
-        return copy.deepcopy(self)
     def __getstate__(self):
         if self._has_get_set_state():
             return self._getstate()
@@ -243,7 +212,7 @@ class Parameterized(Constrainable, Pickleable, Observable, Gradcheckable):
     # Optimization handles:
     #===========================================================================
     def _get_param_names(self):
-        n = numpy.array([p.name_hirarchical + '[' + str(i) + ']' for p in self.flattened_parameters for i in p._indices()])
+        n = numpy.array([p.hirarchy_name() + '[' + str(i) + ']' for p in self.flattened_parameters for i in p._indices()])
         return n
     def _get_param_names_transformed(self):
         n = self._get_param_names()
@@ -265,14 +234,6 @@ class Parameterized(Constrainable, Pickleable, Observable, Gradcheckable):
         if self._has_fixes(): tmp = self._get_params(); tmp[self._fixes_] = p; p = tmp; del tmp
         [numpy.put(p, ind, c.f(p[ind])) for c, ind in self.constraints.iteritems() if c != __fixed__]
         return p
-    def _name_changed(self, param, old_name):
-        if hasattr(self, old_name) and old_name in self._added_names_:
-            delattr(self, old_name)
-            self._added_names_.remove(old_name)
-        pname = adjust_name_for_printing(param.name)
-        if pname not in self.__dict__:
-            self._added_names_.add(pname)
-            self.__dict__[pname] = param
     #===========================================================================
     # Indexable Handling
     #===========================================================================
@@ -335,10 +296,6 @@ class Parameterized(Constrainable, Pickleable, Observable, Gradcheckable):
         # you can retrieve the original param through this method, by passing
         # the copy here
         return self._parameters_[param._parent_index_]
-    def hirarchy_name(self):
-        if self.has_parent():
-            return self._direct_parent_.hirarchy_name() + adjust_name_for_printing(self.name) + "."
-        return ''
     #===========================================================================
     # Get/set parameters:
     #===========================================================================
@@ -348,13 +305,11 @@ class Parameterized(Constrainable, Pickleable, Observable, Gradcheckable):
         """
         if not isinstance(regexp, _pattern_type): regexp = compile(regexp)
         found_params = []
-        for p in self._parameters_:
-            if regexp.match(p.name) is not None:
+        for n, p in itertools.izip(self.parameter_names(False, False, True), self.flattened_parameters):
+            if regexp.match(n) is not None:
                 found_params.append(p)
-            if isinstance(p, Parameterized):
-                found_params.extend(p.grep_param_names(regexp))
         return found_params
-        return [param for param in self._parameters_ if regexp.match(param.name) is not None]
+
     def __getitem__(self, name, paramlist=None):
         if paramlist is None:
             paramlist = self.grep_param_names(name)
@@ -366,36 +321,22 @@ class Parameterized(Constrainable, Pickleable, Observable, Gradcheckable):
                     return ParamConcatenation(paramlist)
             return paramlist[-1]
         return ParamConcatenation(paramlist)
+    
     def __setitem__(self, name, value, paramlist=None):
         try: param = self.__getitem__(name, paramlist)
         except AttributeError as a: raise a
         param[:] = value
-#     def __getattr__(self, name):
-#         return self.__getitem__(name)
-#     def __getattribute__(self, name):
-#         #try:
-#             return object.__getattribute__(self, name)
-        # except AttributeError:
-        #    _, a, tb = sys.exc_info()
-        #    try:
-        #        return self.__getitem__(name)
-        #    except AttributeError:
-        #        raise AttributeError, a.message, tb
     def __setattr__(self, name, val):
-        # override the default behaviour, if setting a param, so broadcasting can by used
-        if hasattr(self, "_parameters_"):
-            paramlist = self.grep_param_names(name)
-            if len(paramlist) == 1: self.__setitem__(name, val, paramlist); return
+        # override the default behaviour, if setting a param, so broadcasting can by used        
+        if hasattr(self, '_parameters_'):
+            pnames = self.parameter_names(False, adjust_for_printing=True, recursive=False)
+            if name in pnames: self._parameters_[pnames.index(name)][:] = val; return
         object.__setattr__(self, name, val);
     #===========================================================================
     # Printing:
     #===========================================================================
     def _short(self):
-        # short string to print
-        if self.has_parent():
-            return self._direct_parent_.hirarchy_name() + adjust_name_for_printing(self.name)
-        else:
-            return adjust_name_for_printing(self.name)
+        return self.hirarchy_name()
     @property
     def flattened_parameters(self):
         return [xi for x in self._parameters_ for xi in x.flattened_parameters]
diff --git a/GPy/core/sparse_gp.py b/GPy/core/sparse_gp.py
index dd0f6c2c..1d436c53 100644
--- a/GPy/core/sparse_gp.py
+++ b/GPy/core/sparse_gp.py
@@ -2,12 +2,11 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
 import numpy as np
-from ..util.linalg import mdot, tdot, symmetrify, backsub_both_sides, dtrtrs, dpotrs, dpotri
+from ..util.linalg import mdot
 from gp import GP
 from parameterization.param import Param
-from ..inference.latent_function_inference import varDTC
+from GPy.inference.latent_function_inference import var_dtc
 from .. import likelihoods
-from GPy.util.misc import param_to_array
 
 class SparseGP(GP):
     """
@@ -34,12 +33,12 @@ class SparseGP(GP):
 
     def __init__(self, X, Y, Z, kernel, likelihood, inference_method=None, X_variance=None, name='sparse gp'):
 
-        #pick a sensible inference method
+        # pick a sensible inference method
         if inference_method is None:
             if isinstance(likelihood, likelihoods.Gaussian):
-                inference_method = varDTC.VarDTC()
+                inference_method = var_dtc.VarDTC()
             else:
-                #inference_method = ??
+                # inference_method = ??
                 raise NotImplementedError, "what to do what to do?"
             print "defaulting to ", inference_method, "for latent function inference"
 
@@ -55,7 +54,7 @@ class SparseGP(GP):
         self.parameters_changed()
 
     def _update_gradients_Z(self, add=False):
-    #The derivative of the bound wrt the inducing inputs Z ( unless they're all fixed)
+    # The derivative of the bound wrt the inducing inputs Z ( unless they're all fixed)
         if not self.Z.is_fixed:
             if add: self.Z.gradient += self.kern.gradients_X(self.grad_dict['dL_dKmm'], self.Z)
             else: self.Z.gradient = self.kern.gradients_X(self.grad_dict['dL_dKmm'], self.Z)
@@ -78,13 +77,14 @@ class SparseGP(GP):
             mu = np.dot(Kx.T, self.posterior.woodbury_vector)
             if full_cov:
                 Kxx = self.kern.K(Xnew, which_parts=which_parts)
-                var = Kxx - mdot(Kx.T, self.posterior.woodbury_inv, Kx) # NOTE this won't work for plotting
+                var = Kxx[:,:,None] - np.tensordot(np.dot(np.atleast_3d(self.posterior.woodbury_inv).T, Kx), Kx, [1,0]).T
             else:
-                Kxx = self.kern.Kdiag(Xnew, which_parts=which_parts)
-                var = Kxx - np.sum(Kx * np.dot(self.posterior.woodbury_inv, Kx), 0)
+                Kxx = self.kern.Kdiag(Xnew, which_parts=which_parts)[:, None]
+                #import ipdb;ipdb.set_trace()
+                var = Kxx - (np.dot(np.atleast_3d(self.posterior.woodbury_inv).T, Kx).T * Kx.T[:,:,None]).sum(1)
         else:
             # assert which_parts=='all', "swithching out parts of variational kernels is not implemented"
-            Kx = self.kern.psi1(self.Z, Xnew, X_variance_new) # , which_parts=which_parts) TODO: which_parts
+            Kx = self.kern.psi1(self.Z, Xnew, X_variance_new)  # , which_parts=which_parts) TODO: which_parts
             mu = np.dot(Kx, self.Cpsi1V)
             if full_cov:
                 raise NotImplementedError, "TODO"
@@ -92,7 +92,7 @@ class SparseGP(GP):
                 Kxx = self.kern.psi0(self.Z, Xnew, X_variance_new)
                 psi2 = self.kern.psi2(self.Z, Xnew, X_variance_new)
                 var = Kxx - np.sum(np.sum(psi2 * Kmmi_LmiBLmi[None, :, :], 1), 1)
-        return mu, var[:,None]
+        return mu, var
 
 
     def _getstate(self):
diff --git a/GPy/examples/dimensionality_reduction.py b/GPy/examples/dimensionality_reduction.py
index f612ecd7..2924386f 100644
--- a/GPy/examples/dimensionality_reduction.py
+++ b/GPy/examples/dimensionality_reduction.py
@@ -1,14 +1,13 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 import numpy as _np
-default_seed = _np.random.seed(123344)
+#default_seed = _np.random.seed(123344)
 
-def bgplvm_test_model(seed=default_seed, optimize=False, verbose=1, plot=False, output_dim=1e4):
+def bgplvm_test_model(optimize=False, verbose=1, plot=False, output_dim=200, nan=False):
     """
     model for testing purposes. Samples from a GP with rbf kernel and learns
     the samples with a new kernel. Normally not for optimization, just model cheking
     """
-    from GPy.likelihoods.gaussian import Gaussian
     import GPy
 
     num_inputs = 13
@@ -36,12 +35,17 @@ def bgplvm_test_model(seed=default_seed, optimize=False, verbose=1, plot=False,
     # k = GPy.kern.rbf(input_dim, .5, 2., ARD=0) + GPy.kern.rbf(input_dim, .3, .2, ARD=0)
     # k = GPy.kern.rbf(input_dim, .5, _np.ones(input_dim) * 2., ARD=True) + GPy.kern.linear(input_dim, _np.ones(input_dim) * .2, ARD=True)
 
+    p = .3
+    
     m = GPy.models.BayesianGPLVM(Y, input_dim, kernel=k, num_inducing=num_inducing)
+
+    if nan:
+        m.inference_method = GPy.inference.latent_function_inference.var_dtc.VarDTCMissingData()
+        m.Y[_np.random.binomial(1,p,size=(Y.shape)).astype(bool)] = _np.nan
+        m.parameters_changed()
+
     #===========================================================================
     # randomly obstruct data with percentage p
-    p = .8
-    Y_obstruct = Y.copy()
-    Y_obstruct[_np.random.uniform(size=(Y.shape)) < p] = _np.nan
     #===========================================================================
     #m2 = GPy.models.BayesianGPLVMWithMissingData(Y_obstruct, input_dim, kernel=k, num_inducing=num_inducing)
     m.lengthscales = lengthscales
@@ -182,6 +186,8 @@ def bgplvm_oil(optimize=True, verbose=1, plot=True, N=200, Q=7, num_inducing=40,
     return m
 
 def _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim=False):
+    _np.random.seed(1234)
+    
     x = _np.linspace(0, 4 * _np.pi, N)[:, None]
     s1 = _np.vectorize(lambda x: _np.sin(x))
     s2 = _np.vectorize(lambda x: _np.cos(x))
@@ -276,6 +282,36 @@ def bgplvm_simulation(optimize=True, verbose=1,
         m.kern.plot_ARD('BGPLVM Simulation ARD Parameters')
     return m
 
+def bgplvm_simulation_missing_data(optimize=True, verbose=1,
+                      plot=True, plot_sim=False,
+                      max_iters=2e4,
+                      ):
+    from GPy import kern
+    from GPy.models import BayesianGPLVM
+    from GPy.inference.latent_function_inference.var_dtc import VarDTCMissingData
+
+    D1, D2, D3, N, num_inducing, Q = 15, 5, 8, 30, 3, 10
+    _, _, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim)
+    Y = Ylist[0]
+    k = kern.linear(Q, ARD=True)# + kern.white(Q, _np.exp(-2)) # + kern.bias(Q)
+    
+    inan = _np.random.binomial(1, .6, size=Y.shape).astype(bool)
+    m = BayesianGPLVM(Y, Q, init="random", num_inducing=num_inducing, kernel=k)
+    m.inference_method = VarDTCMissingData()
+    m.Y[inan] = _np.nan
+    m.q.variance *= .1
+    m.parameters_changed()
+    
+    if optimize:
+        print "Optimizing model:"
+        m.optimize('bfgs', messages=verbose, max_iters=max_iters,
+                   gtol=.05)
+    if plot:
+        m.q.plot("BGPLVM Latent Space 1D")
+        m.kern.plot_ARD('BGPLVM Simulation ARD Parameters')
+    return m
+
+
 def mrd_simulation(optimize=True, verbose=True, plot=True, plot_sim=True, **kw):
     from GPy import kern
     from GPy.models import MRD
diff --git a/GPy/examples/regression.py b/GPy/examples/regression.py
index c073369f..55567051 100644
--- a/GPy/examples/regression.py
+++ b/GPy/examples/regression.py
@@ -361,7 +361,7 @@ def toy_ARD_sparse(max_iters=1000, kernel_type='linear', num_samples=300, D=4, o
         kernel = GPy.kern.rbf_inv(X.shape[1], ARD=1)
     else:
         kernel = GPy.kern.rbf(X.shape[1], ARD=1)
-    kernel += GPy.kern.bias(X.shape[1])
+    #kernel += GPy.kern.bias(X.shape[1])
     X_variance = np.ones(X.shape) * 0.5
     m = GPy.models.SparseGPRegression(X, Y, kernel, X_variance=X_variance)
     # len_prior = GPy.priors.inverse_gamma(1,18) # 1, 25
@@ -434,10 +434,14 @@ def sparse_GP_regression_1D(num_samples=400, num_inducing=5, max_iters=100, opti
 
     return m
 
-def sparse_GP_regression_2D(num_samples=400, num_inducing=50, max_iters=100, optimize=True, plot=True):
+def sparse_GP_regression_2D(num_samples=400, num_inducing=50, max_iters=100, optimize=True, plot=True, nan=False):
     """Run a 2D example of a sparse GP regression."""
+    np.random.seed(1234)
     X = np.random.uniform(-3., 3., (num_samples, 2))
     Y = np.sin(X[:, 0:1]) * np.sin(X[:, 1:2]) + np.random.randn(num_samples, 1) * 0.05
+    if nan:
+        inan = np.random.binomial(1,.2,size=Y.shape)
+        Y[inan] = np.nan
 
     # construct kernel
     rbf = GPy.kern.rbf(2)
diff --git a/GPy/inference/__init__.py b/GPy/inference/__init__.py
index e69de29b..f1ffd595 100644
--- a/GPy/inference/__init__.py
+++ b/GPy/inference/__init__.py
@@ -0,0 +1,2 @@
+import latent_function_inference
+import optimization
\ No newline at end of file
diff --git a/GPy/inference/latent_function_inference/__init__.py b/GPy/inference/latent_function_inference/__init__.py
index 4d635c46..a633c381 100644
--- a/GPy/inference/latent_function_inference/__init__.py
+++ b/GPy/inference/latent_function_inference/__init__.py
@@ -16,7 +16,9 @@ If the likelihood object is something other than Gaussian, then exact inference
 is not tractable. We then resort to a Laplace approximation (laplace.py) or
 expectation propagation (ep.py).
 
-The inference methods return a "Posterior" instance, which is a simple
+The inference methods return a 
+:class:`~GPy.inference.latent_function_inference.posterior.Posterior` 
+instance, which is a simple
 structure which contains a summary of the posterior. The model classes can then
 use this posterior object for making predictions, optimizing hyper-parameters,
 etc.
@@ -26,6 +28,18 @@ etc.
 from exact_gaussian_inference import ExactGaussianInference
 from laplace import Laplace
 expectation_propagation = 'foo' # TODO
-from varDTC import VarDTC
+from GPy.inference.latent_function_inference.var_dtc import VarDTC
 from dtc import DTC
 from fitc import FITC
+
+# class FullLatentFunctionData(object):
+#     
+# 
+# class LatentFunctionInference(object):
+#     def inference(self, kern, X, likelihood, Y, Y_metadata=None):
+#         """
+#         Do inference on the latent functions given a covariance function `kern`,
+#         inputs and outputs `X` and `Y`, and a likelihood `likelihood`. 
+#         Additional metadata for the outputs `Y` can be given in `Y_metadata`.
+#         """
+#         raise NotImplementedError, "Abstract base class for full inference"
\ No newline at end of file
diff --git a/GPy/inference/latent_function_inference/dtc.py b/GPy/inference/latent_function_inference/dtc.py
index dbbff6d0..1a811de6 100644
--- a/GPy/inference/latent_function_inference/dtc.py
+++ b/GPy/inference/latent_function_inference/dtc.py
@@ -32,7 +32,7 @@ class DTC(object):
         #make sure the noise is not hetero
         beta = 1./np.squeeze(likelihood.variance)
         if beta.size <1:
-            raise NotImplementedError, "no hetero noise with this implementatino of DTC"
+            raise NotImplementedError, "no hetero noise with this implementation of DTC"
 
         Kmm = kern.K(Z)
         Knn = kern.Kdiag(X)
@@ -89,4 +89,85 @@ class DTC(object):
 
         return post, log_marginal, grad_dict
 
+class vDTC(object):
+    def __init__(self):
+        self.const_jitter = 1e-6
+
+    def inference(self, kern, X, X_variance, Z, likelihood, Y):
+        assert X_variance is None, "cannot use X_variance with DTC. Try varDTC."
+
+        #TODO: MAX! fix this!
+        from ...util.misc import param_to_array
+        Y = param_to_array(Y)
+
+        num_inducing, _ = Z.shape
+        num_data, output_dim = Y.shape
+
+        #make sure the noise is not hetero
+        beta = 1./np.squeeze(likelihood.variance)
+        if beta.size <1:
+            raise NotImplementedError, "no hetero noise with this implementation of DTC"
+
+        Kmm = kern.K(Z)
+        Knn = kern.Kdiag(X)
+        Knm = kern.K(X, Z)
+        U = Knm
+        Uy = np.dot(U.T,Y)
+
+        #factor Kmm 
+        Kmmi, L, Li, _ = pdinv(Kmm)
+
+        # Compute A
+        LiUTbeta = np.dot(Li, U.T)*np.sqrt(beta)
+        A_ = tdot(LiUTbeta)
+        trace_term = -0.5*(np.sum(Knn)*beta - np.trace(A_))
+        A = A_ + np.eye(num_inducing)
+
+        # factor A
+        LA = jitchol(A)
+
+        # back substutue to get b, P, v
+        tmp, _ = dtrtrs(L, Uy, lower=1)
+        b, _ = dtrtrs(LA, tmp*beta, lower=1)
+        tmp, _ = dtrtrs(LA, b, lower=1, trans=1)
+        v, _ = dtrtrs(L, tmp, lower=1, trans=1)
+        tmp, _ = dtrtrs(LA, Li, lower=1, trans=0)
+        P = tdot(tmp.T)
+
+        #compute log marginal
+        log_marginal = -0.5*num_data*output_dim*np.log(2*np.pi) + \
+                       -np.sum(np.log(np.diag(LA)))*output_dim + \
+                       0.5*num_data*output_dim*np.log(beta) + \
+                       -0.5*beta*np.sum(np.square(Y)) + \
+                       0.5*np.sum(np.square(b)) + \
+                       trace_term
+
+        # Compute dL_dKmm
+        vvT_P = tdot(v.reshape(-1,1)) + P
+        LAL = Li.T.dot(A).dot(Li)
+        dL_dK = Kmmi - 0.5*(vvT_P + LAL)
+
+        # Compute dL_dU
+        vY = np.dot(v.reshape(-1,1),Y.T)
+        #dL_dU = vY - np.dot(vvT_P, U.T)
+        dL_dU = vY - np.dot(vvT_P - Kmmi, U.T)
+        dL_dU *= beta
+
+        #compute dL_dR
+        Uv = np.dot(U, v)
+        dL_dR = 0.5*(np.sum(U*np.dot(U,P), 1) - 1./beta + np.sum(np.square(Y), 1) - 2.*np.sum(Uv*Y, 1) + np.sum(np.square(Uv), 1) )*beta**2
+        dL_dR -=beta*trace_term/num_data
+
+        grad_dict = {'dL_dKmm': dL_dK, 'dL_dKdiag':np.zeros_like(Knn) + -0.5*beta, 'dL_dKnm':dL_dU.T}
+
+        #update gradients
+        kern.update_gradients_sparse(X=X, Z=Z, **grad_dict)
+        likelihood.update_gradients(dL_dR)
+
+        #construct a posterior object
+        post = Posterior(woodbury_inv=Kmmi-P, woodbury_vector=v, K=Kmm, mean=None, cov=None, K_chol=L)
+
+
+        return post, log_marginal, grad_dict
+
 
diff --git a/GPy/inference/latent_function_inference/laplace.py b/GPy/inference/latent_function_inference/laplace.py
index 906a7867..50a40449 100644
--- a/GPy/inference/latent_function_inference/laplace.py
+++ b/GPy/inference/latent_function_inference/laplace.py
@@ -216,9 +216,9 @@ class Laplace(object):
         """
         if not log_concave:
             #print "Under 1e-10: {}".format(np.sum(W < 1e-6))
-            # W[W<1e-6] = 1e-6
+            W[W<1e-6] = 1e-6
             # NOTE: when setting a parameter inside parameters_changed it will allways come to closed update circles!!!
-            W.__setitem__(W < 1e-6, 1e-6, update=False)  # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
+            #W.__setitem__(W < 1e-6, 1e-6, update=False)  # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
                                 # If the likelihood is non-log-concave. We wan't to say that there is a negative variance
                                 # To cause the posterior to become less certain than the prior and likelihood,
                                 # This is a property only held by non-log-concave likelihoods
diff --git a/GPy/inference/latent_function_inference/varDTC.py b/GPy/inference/latent_function_inference/varDTC.py
deleted file mode 100644
index 90ca7a6a..00000000
--- a/GPy/inference/latent_function_inference/varDTC.py
+++ /dev/null
@@ -1,216 +0,0 @@
-# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-from posterior import Posterior
-from ...util.linalg import jitchol, backsub_both_sides, tdot, dtrtrs, dtrtri, dpotri, dpotrs, symmetrify
-import numpy as np
-from ...util.caching import Cacher
-from ...util.misc import param_to_array
-log_2_pi = np.log(2*np.pi)
-
-class VarDTC(object):
-    """
-    An object for inference when the likelihood is Gaussian, but we want to do sparse inference.
-
-    The function self.inference returns a Posterior object, which summarizes
-    the posterior.
-
-    For efficiency, we sometimes work with the cholesky of Y*Y.T. To save repeatedly recomputing this, we cache it.
-
-    """
-    def __init__(self):
-        #self._YYTfactor_cache = caching.cache()
-        self.const_jitter = 1e-6
-        self.get_trYYT = Cacher(self._get_trYYT, 1)
-        self.get_YYTfactor = Cacher(self._get_YYTfactor, 1)
-    
-    def _get_trYYT(self, Y):
-        return param_to_array(np.sum(np.square(Y)))
-
-    def _get_YYTfactor(self, Y):
-        """
-        find a matrix L which satisfies LLT = YYT. 
-
-        Note that L may have fewer columns than Y.
-        """
-        N, D = Y.shape
-        if (N>=D):
-            return param_to_array(Y)
-        else:
-            return jitchol(tdot(Y))
-            
-    def get_VVTfactor(self, Y, prec):
-        return Y * prec # TODO chache this, and make it effective
-
-    def inference(self, kern, X, X_variance, Z, likelihood, Y):
-
-        num_inducing, _ = Z.shape
-        num_data, output_dim = Y.shape
-
-        #see whether we're using variational uncertain inputs
-        uncertain_inputs = not (X_variance is None)
-
-        #see whether we've got a different noise variance for each datum
-        beta = 1./np.squeeze(likelihood.variance)
-        het_noise = False
-        if beta.size <1:
-            het_noise = True
-
-        # kernel computations, using BGPLVM notation
-        Kmm = kern.K(Z)
-        if uncertain_inputs:
-            psi0 = kern.psi0(Z, X, X_variance)
-            psi1 = kern.psi1(Z, X, X_variance)
-            psi2 = kern.psi2(Z, X, X_variance)
-        else:
-            psi0 = kern.Kdiag(X)
-            psi1 = kern.K(X, Z)
-
-        #factor Kmm # TODO: cache?
-        Lm = jitchol(Kmm)
-
-        # The rather complex computations of A
-        if uncertain_inputs:
-            if het_noise:
-                psi2_beta = (psi2 * (beta.flatten().reshape(num_data, 1, 1))).sum(0)
-            else:
-                psi2_beta = psi2.sum(0) * beta
-            if 0:
-                evals, evecs = linalg.eigh(psi2_beta)
-                clipped_evals = np.clip(evals, 0., 1e6) # TODO: make clipping configurable
-                if not np.array_equal(evals, clipped_evals):
-                    pass # print evals
-                tmp = evecs * np.sqrt(clipped_evals)
-                tmp = tmp.T
-            # no backsubstitution because of bound explosion on tr(A) if not...
-            LmInv, _ = dtrtri(Lm, lower=1)
-            A = LmInv.dot(psi2_beta.dot(LmInv.T))
-            #print A.sum()
-        else:
-            if het_noise:
-                tmp = psi1 * (np.sqrt(beta.reshape(num_data, 1)))
-            else:
-                tmp = psi1 * (np.sqrt(beta))
-            tmp, _ = dtrtrs(Lm, tmp.T, lower=1)
-            A = tdot(tmp)
-
-        # factor B
-        B = np.eye(num_inducing) + A
-        self.A = A
-        LB = jitchol(B)
-
-        # VVT_factor is a matrix such that tdot(VVT_factor) = VVT...this is for efficiency!
-        #self.YYTfactor = self.get_YYTfactor(Y)
-        #VVT_factor = self.get_VVTfactor(self.YYTfactor, beta)
-        VVT_factor = beta*param_to_array(Y)
-        trYYT = self.get_trYYT(Y)
-
-        psi1Vf = np.dot(psi1.T, VVT_factor)
-
-        # back substutue C into psi1Vf
-        tmp, info1 = dtrtrs(Lm, psi1Vf, lower=1, trans=0)
-        _LBi_Lmi_psi1Vf, _ = dtrtrs(LB, tmp, lower=1, trans=0)
-        tmp, info2 = dtrtrs(LB, _LBi_Lmi_psi1Vf, lower=1, trans=1)
-        Cpsi1Vf, info3 = dtrtrs(Lm, tmp, lower=1, trans=1)
-
-
-        # Compute dL_dKmm
-        tmp = tdot(_LBi_Lmi_psi1Vf)
-        data_fit = np.trace(tmp)
-        DBi_plus_BiPBi = backsub_both_sides(LB, output_dim * np.eye(num_inducing) + tmp)
-        tmp = -0.5 * DBi_plus_BiPBi
-        tmp += -0.5 * B * output_dim
-        tmp += output_dim * np.eye(num_inducing)
-        dL_dKmm = backsub_both_sides(Lm, tmp)
-
-        # Compute dL_dpsi 
-        dL_dpsi0 = -0.5 * output_dim * (beta * np.ones([num_data, 1])).flatten()
-        dL_dpsi1 = np.dot(VVT_factor, Cpsi1Vf.T)
-        dL_dpsi2_beta = 0.5 * backsub_both_sides(Lm, output_dim * np.eye(num_inducing) - DBi_plus_BiPBi)
-
-        if het_noise:
-            if uncertain_inputs:
-                dL_dpsi2 = beta[:, None, None] * dL_dpsi2_beta[None, :, :]
-            else:
-                dL_dpsi1 += 2.*np.dot(dL_dpsi2_beta, (psi1 * beta.reshape(num_data, 1)).T).T
-                dL_dpsi2 = None
-        else:
-            dL_dpsi2 = beta * dL_dpsi2_beta
-            if uncertain_inputs:
-                # repeat for each of the N psi_2 matrices
-                dL_dpsi2 = np.repeat(dL_dpsi2[None, :, :], num_data, axis=0)
-            else:
-                # subsume back into psi1 (==Kmn)
-                dL_dpsi1 += 2.*np.dot(psi1, dL_dpsi2)
-                dL_dpsi2 = None
-
-
-        # the partial derivative vector for the likelihood
-        if likelihood.size == 0:
-            # save computation here.
-            partial_for_likelihood = None
-        elif het_noise:
-            if uncertain_inputs:
-                raise NotImplementedError, "heteroscedatic derivates with uncertain inputs not implemented"
-            else:
-                LBi = chol_inv(LB)
-                Lmi_psi1, nil = dtrtrs(Lm, psi1.T, lower=1, trans=0)
-                _LBi_Lmi_psi1, _ = dtrtrs(LB, Lmi_psi1, lower=1, trans=0)
-
-                partial_for_likelihood = -0.5 * beta + 0.5 * likelihood.V**2
-                partial_for_likelihood += 0.5 * output_dim * (psi0 - np.sum(Lmi_psi1**2,0))[:,None] * beta**2
-
-                partial_for_likelihood += 0.5*np.sum(mdot(LBi.T,LBi,Lmi_psi1)*Lmi_psi1,0)[:,None]*beta**2
-
-                partial_for_likelihood += -np.dot(_LBi_Lmi_psi1Vf.T,_LBi_Lmi_psi1).T * likelihood.Y * beta**2
-                partial_for_likelihood += 0.5*np.dot(_LBi_Lmi_psi1Vf.T,_LBi_Lmi_psi1).T**2 * beta**2
-
-        else:
-            # likelihood is not heteroscedatic
-            partial_for_likelihood = -0.5 * num_data * output_dim * beta + 0.5 * trYYT * beta ** 2
-            partial_for_likelihood += 0.5 * output_dim * (psi0.sum() * beta ** 2 - np.trace(A) * beta)
-            partial_for_likelihood += beta * (0.5 * np.sum(A * DBi_plus_BiPBi) - data_fit)
-
-        #compute log marginal likelihood
-        if het_noise:
-            lik_1 = -0.5 * num_data * output_dim * np.log(2.*np.pi) + 0.5 * np.sum(np.log(beta)) - 0.5 * np.sum(likelihood.V * likelihood.Y)
-            lik_2 = -0.5 * output_dim * (np.sum(beta * psi0) - np.trace(A))
-        else:
-            lik_1 = -0.5 * num_data * output_dim * (np.log(2.*np.pi) - np.log(beta)) - 0.5 * beta * trYYT
-            lik_2 = -0.5 * output_dim * (np.sum(beta * psi0) - np.trace(A))
-        lik_3 = -output_dim * (np.sum(np.log(np.diag(LB))))
-        lik_4 = 0.5 * data_fit
-        log_marginal = lik_1 + lik_2 + lik_3 + lik_4
-
-        #put the gradients in the right places
-        likelihood.update_gradients(partial_for_likelihood)
-
-        if uncertain_inputs:
-            grad_dict = {'dL_dKmm': dL_dKmm, 'dL_dpsi0':dL_dpsi0, 'dL_dpsi1':dL_dpsi1, 'dL_dpsi2':dL_dpsi2}
-            kern.update_gradients_variational(mu=X, S=X_variance, Z=Z, **grad_dict)
-        else:
-            grad_dict = {'dL_dKmm': dL_dKmm, 'dL_dKdiag':dL_dpsi0, 'dL_dKnm':dL_dpsi1}
-            kern.update_gradients_sparse(X=X, Z=Z, **grad_dict)
-
-        #get sufficient things for posterior prediction
-        #TODO: do we really want to do this in  the loop?
-        if VVT_factor.shape[1] == Y.shape[1]:
-            woodbury_vector = Cpsi1Vf # == Cpsi1V
-        else:
-            print 'foobar'
-            psi1V = np.dot(Y.T*beta, psi1).T
-            tmp, _ = dtrtrs(Lm, psi1V, lower=1, trans=0)
-            tmp, _ = dpotrs(LB, tmp, lower=1)
-            woodbury_vector, _ = dtrtrs(Lm, tmp, lower=1, trans=1)
-        Bi, _ = dpotri(LB, lower=1)
-        symmetrify(Bi)
-        Bi = dpotri(LB, lower=1)[0]
-        woodbury_inv = backsub_both_sides(Lm, np.eye(num_inducing) - Bi)
-
-
-        #construct a posterior object
-        post = Posterior(woodbury_inv=woodbury_inv, woodbury_vector=woodbury_vector, K=Kmm, mean=None, cov=None, K_chol=Lm)
-
-        return post, log_marginal, grad_dict
-
-
diff --git a/GPy/inference/latent_function_inference/var_dtc.py b/GPy/inference/latent_function_inference/var_dtc.py
new file mode 100644
index 00000000..2f11cb08
--- /dev/null
+++ b/GPy/inference/latent_function_inference/var_dtc.py
@@ -0,0 +1,429 @@
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+from posterior import Posterior
+from ...util.linalg import jitchol, backsub_both_sides, tdot, dtrtrs, dtrtri, dpotri, dpotrs, symmetrify
+import numpy as np
+from ...util.misc import param_to_array
+log_2_pi = np.log(2*np.pi)
+
+class VarDTC(object):
+    """
+    An object for inference when the likelihood is Gaussian, but we want to do sparse inference.
+
+    The function self.inference returns a Posterior object, which summarizes
+    the posterior.
+
+    For efficiency, we sometimes work with the cholesky of Y*Y.T. To save repeatedly recomputing this, we cache it.
+
+    """
+    const_jitter = 1e-6
+    def __init__(self):
+        #self._YYTfactor_cache = caching.cache()
+        from ...util.caching import Cacher
+        self.get_trYYT = Cacher(self._get_trYYT, 1)
+        self.get_YYTfactor = Cacher(self._get_YYTfactor, 1)
+    
+    def _get_trYYT(self, Y):
+        return param_to_array(np.sum(np.square(Y)))
+
+    def _get_YYTfactor(self, Y):
+        """
+        find a matrix L which satisfies LLT = YYT. 
+
+        Note that L may have fewer columns than Y.
+        """
+        N, D = Y.shape
+        if (N>=D):
+            return param_to_array(Y)
+        else:
+            return jitchol(tdot(Y))
+            
+    def get_VVTfactor(self, Y, prec):
+        return Y * prec # TODO chache this, and make it effective
+    
+    def inference(self, kern, X, X_variance, Z, likelihood, Y):
+
+        #see whether we're using variational uncertain inputs
+        uncertain_inputs = not (X_variance is None)
+        
+        _, output_dim = Y.shape
+        
+        #see whether we've got a different noise variance for each datum
+        beta = 1./np.squeeze(likelihood.variance)
+
+        # VVT_factor is a matrix such that tdot(VVT_factor) = VVT...this is for efficiency!
+        #self.YYTfactor = self.get_YYTfactor(Y)
+        #VVT_factor = self.get_VVTfactor(self.YYTfactor, beta)
+        VVT_factor = beta*Y
+        #VVT_factor = beta*Y
+        trYYT = self.get_trYYT(Y)
+    
+        # do the inference:
+        dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, Cpsi1Vf, \
+         psi1, Lm, LB, log_marginal, Kmm, partial_for_likelihood  = _do_inference_on(
+                kern, X, X_variance, Z, likelihood, 
+                uncertain_inputs, output_dim, 
+                beta, VVT_factor, trYYT)
+
+        likelihood.update_gradients(partial_for_likelihood)
+
+        if uncertain_inputs:
+            grad_dict = {'dL_dKmm': dL_dKmm, 'dL_dpsi0':dL_dpsi0, 'dL_dpsi1':dL_dpsi1, 'dL_dpsi2':dL_dpsi2}
+            kern.update_gradients_variational(mu=X, S=X_variance, Z=Z, **grad_dict)
+        else:
+            grad_dict = {'dL_dKmm': dL_dKmm, 'dL_dKdiag':dL_dpsi0, 'dL_dKnm':dL_dpsi1}
+            kern.update_gradients_sparse(X=X, Z=Z, **grad_dict)
+
+        #get sufficient things for posterior prediction
+        #TODO: do we really want to do this in  the loop?
+        if VVT_factor.shape[1] == Y.shape[1]:
+            woodbury_vector = Cpsi1Vf # == Cpsi1V
+        else:
+            print 'foobar'
+            psi1V = np.dot(Y.T*beta, psi1).T
+            tmp, _ = dtrtrs(Lm, psi1V, lower=1, trans=0)
+            tmp, _ = dpotrs(LB, tmp, lower=1)
+            woodbury_vector, _ = dtrtrs(Lm, tmp, lower=1, trans=1)
+        Bi, _ = dpotri(LB, lower=1)
+        symmetrify(Bi)
+        Bi = -dpotri(LB, lower=1)[0]
+        from ...util import diag
+        diag.add(Bi, 1)
+
+        woodbury_inv = backsub_both_sides(Lm, Bi)
+
+        #construct a posterior object
+        post = Posterior(woodbury_inv=woodbury_inv, woodbury_vector=woodbury_vector, K=Kmm, mean=None, cov=None, K_chol=Lm)
+
+        return post, log_marginal, grad_dict
+
+class VarDTCMissingData(object):
+    def __init__(self):
+        from ...util.caching import Cacher
+        self._Y = Cacher(self._subarray_computations, 1)
+        pass
+    
+    def _subarray_computations(self, Y):
+        inan = np.isnan(Y)
+        has_none = inan.any()
+        if has_none:
+            from ...util.subarray_and_sorting import common_subarrays
+            self._subarray_indices = []
+            for v,ind in common_subarrays(inan, 1).iteritems():
+                if not np.all(v):
+                    v = ~np.array(v, dtype=bool)
+                    ind = np.array(ind, dtype=int)
+                    if ind.size == Y.shape[1]:
+                        ind = slice(None)
+                    self._subarray_indices.append([v,ind])
+            Ys = [Y[v, :][:, ind] for v, ind in self._subarray_indices]
+            traces = [(y**2).sum() for y in Ys]
+            return Ys, traces
+        else:
+            self._subarray_indices = [[slice(None),slice(None)]]
+            return [Y], [(Y**2).sum()]
+    
+    def inference(self, kern, X, X_variance, Z, likelihood, Y):
+        Ys, traces = self._Y(Y)
+        beta_all = 1./likelihood.variance
+        uncertain_inputs = not (X_variance is None)
+        het_noise = beta_all.size != 1
+
+        import itertools
+        num_inducing = Z.shape[0]
+
+        dL_dpsi0_all = np.zeros(X.shape[0])
+        dL_dpsi1_all = np.zeros((X.shape[0], num_inducing))
+        if uncertain_inputs:
+            dL_dpsi2_all = np.zeros((X.shape[0], num_inducing, num_inducing))
+        
+        partial_for_likelihood = 0
+        woodbury_vector = np.zeros((num_inducing, Y.shape[1]))
+        woodbury_inv_all = np.zeros((num_inducing, num_inducing, Y.shape[1]))
+        dL_dKmm = 0
+        log_marginal = 0
+        
+        Kmm = kern.K(Z)
+        #factor Kmm 
+        Lm = jitchol(Kmm)
+        if uncertain_inputs: LmInv = dtrtri(Lm)
+
+        # kernel computations, using BGPLVM notation
+        psi0_all, psi1_all, psi2_all = _compute_psi(kern, X, X_variance, Z, uncertain_inputs)
+
+        VVT_factor_all = np.empty(Y.shape)
+        full_VVT_factor = VVT_factor_all.shape[1] == Y.shape[1]
+        if not full_VVT_factor:
+            psi1V = np.dot(Y.T*beta_all, psi1_all).T
+        
+        for y, trYYT, [v, ind] in itertools.izip(Ys, traces, self._subarray_indices):
+            if het_noise: beta = beta_all[ind]
+            else: beta = beta_all[0]
+            
+            VVT_factor = (beta*y)
+            VVT_factor_all[v, ind].flat = VVT_factor.flat
+            output_dim = y.shape[1]
+
+            psi0 = psi0_all[v]
+            psi1 = psi1_all[v, :]
+            if uncertain_inputs: psi2 = psi2_all[v, :]
+            else: psi2 = None
+            num_data = psi1.shape[0]
+            
+            if uncertain_inputs:
+                if het_noise: psi2_beta = psi2 * (beta.flatten().reshape(num_data, 1, 1)).sum(0)
+                else: psi2_beta = psi2.sum(0) * beta
+                A = LmInv.dot(psi2_beta.dot(LmInv.T))
+            else:
+                if het_noise: tmp = psi1 * (np.sqrt(beta.reshape(num_data, 1)))
+                else: tmp = psi1 * (np.sqrt(beta))
+                tmp, _ = dtrtrs(Lm, tmp.T, lower=1)
+                A = tdot(tmp) #print A.sum()
+
+            # factor B
+            B = np.eye(num_inducing) + A
+            LB = jitchol(B)
+            
+            psi1Vf = psi1.T.dot(VVT_factor)
+            _LBi_Lmi_psi1Vf, Cpsi1Vf = _compute_psi1Vf(Lm, LB, psi1Vf)
+            
+            #LB_all[ind, :,:] = LB    
+            
+            # data fit and derivative of L w.r.t. Kmm
+            delit = tdot(_LBi_Lmi_psi1Vf)
+            data_fit = np.trace(delit)
+            DBi_plus_BiPBi = backsub_both_sides(LB, output_dim * np.eye(num_inducing) + delit)
+            delit = -0.5 * DBi_plus_BiPBi
+            delit += -0.5 * B * output_dim
+            delit += output_dim * np.eye(num_inducing)
+            dL_dKmm += backsub_both_sides(Lm, delit)
+
+            # derivatives of L w.r.t. psi
+            dL_dpsi0, dL_dpsi1, dL_dpsi2 = _compute_dL_dpsi(num_inducing, num_data, output_dim, beta, Lm, 
+                VVT_factor, Cpsi1Vf, DBi_plus_BiPBi, 
+                psi1, het_noise, uncertain_inputs)
+            
+            #import ipdb;ipdb.set_trace()
+            dL_dpsi0_all[v] += dL_dpsi0
+            dL_dpsi1_all[v, :] += dL_dpsi1
+            if uncertain_inputs:
+                dL_dpsi2_all[v, :] += dL_dpsi2
+            
+            # log marginal likelihood
+            log_marginal += _compute_log_marginal_likelihood(likelihood, num_data, output_dim, beta, het_noise, 
+                psi0, A, LB, trYYT, data_fit)
+
+            #put the gradients in the right places
+            partial_for_likelihood += _compute_partial_for_likelihood(likelihood, 
+                het_noise, uncertain_inputs, LB, 
+                _LBi_Lmi_psi1Vf, DBi_plus_BiPBi, Lm, A, 
+                psi0, psi1, beta, 
+                data_fit, num_data, output_dim, trYYT)
+            
+            if full_VVT_factor: woodbury_vector[:, ind] = Cpsi1Vf
+            else:
+                print 'foobar'
+                tmp, _ = dtrtrs(Lm, psi1V, lower=1, trans=0)
+                tmp, _ = dpotrs(LB, tmp, lower=1)
+                woodbury_vector[:, ind] = dtrtrs(Lm, tmp, lower=1, trans=1)[0]
+                
+            #import ipdb;ipdb.set_trace()
+            Bi, _ = dpotri(LB, lower=1)
+            symmetrify(Bi)
+            Bi = -dpotri(LB, lower=1)[0]
+            from ...util import diag
+            diag.add(Bi, 1)
+            woodbury_inv_all[:, :, ind] = backsub_both_sides(Lm, Bi)[:,:,None]
+            
+        # gradients:
+        likelihood.update_gradients(partial_for_likelihood)
+
+        if uncertain_inputs:
+            grad_dict = {'dL_dKmm': dL_dKmm, 'dL_dpsi0':dL_dpsi0_all, 'dL_dpsi1':dL_dpsi1_all, 'dL_dpsi2':dL_dpsi2_all}
+            kern.update_gradients_variational(mu=X, S=X_variance, Z=Z, **grad_dict)
+        else:
+            grad_dict = {'dL_dKmm': dL_dKmm, 'dL_dKdiag':dL_dpsi0_all, 'dL_dKnm':dL_dpsi1_all}
+            kern.update_gradients_sparse(X=X, Z=Z, **grad_dict)
+
+        #get sufficient things for posterior prediction
+        #TODO: do we really want to do this in  the loop?
+        #if not full_VVT_factor:
+        #    print 'foobar'
+        #    psi1V = np.dot(Y.T*beta_all, psi1_all).T
+        #    tmp, _ = dtrtrs(Lm, psi1V, lower=1, trans=0)
+        #    tmp, _ = dpotrs(LB_all, tmp, lower=1)
+        #    woodbury_vector, _ = dtrtrs(Lm, tmp, lower=1, trans=1)
+        #import ipdb;ipdb.set_trace()
+        #Bi, _ = dpotri(LB_all, lower=1)
+        #symmetrify(Bi)
+        #Bi = -dpotri(LB_all, lower=1)[0]
+        #from ...util import diag
+        #diag.add(Bi, 1)
+    
+        #woodbury_inv = backsub_both_sides(Lm, Bi)
+        
+        post = Posterior(woodbury_inv=woodbury_inv_all, woodbury_vector=woodbury_vector, K=Kmm, mean=None, cov=None, K_chol=Lm)
+
+        return post, log_marginal, grad_dict
+
+
+def _compute_A(num_data, uncertain_inputs, beta, het_noise, psi1, psi2, Lm):
+# The rather complex computations of A
+    if uncertain_inputs:
+        if het_noise:
+            psi2_beta = psi2 * (beta.flatten().reshape(num_data, 1, 1)).sum(0)
+        else:
+            psi2_beta = psi2.sum(0) * beta
+        #if 0:
+        #    evals, evecs = linalg.eigh(psi2_beta)
+        #    clipped_evals = np.clip(evals, 0., 1e6) # TODO: make clipping configurable
+        #    if not np.array_equal(evals, clipped_evals):
+        #        pass # print evals
+        #    tmp = evecs * np.sqrt(clipped_evals)
+        #    tmp = tmp.T
+        # no backsubstitution because of bound explosion on tr(A) if not...
+        LmInv = dtrtri(Lm)
+        A = LmInv.dot(psi2_beta.dot(LmInv.T))
+    else:
+        if het_noise:
+            tmp = psi1 * (np.sqrt(beta.reshape(num_data, 1)))
+        else:
+            tmp = psi1 * (np.sqrt(beta))
+        tmp, _ = dtrtrs(Lm, tmp.T, lower=1)
+        A = tdot(tmp) #print A.sum()
+    return A
+
+
+def _compute_psi(kern, X, X_variance, Z, uncertain_inputs):
+    if uncertain_inputs:
+        psi0 = kern.psi0(Z, X, X_variance)
+        psi1 = kern.psi1(Z, X, X_variance)
+        psi2 = kern.psi2(Z, X, X_variance)
+    else:
+        psi0 = kern.Kdiag(X)
+        psi1 = kern.K(X, Z)
+        psi2 = None
+    return psi0, psi1, psi2
+
+def _compute_Kmm(kern, X, X_variance, Z, uncertain_inputs):
+    Kmm = kern.K(Z)
+    psi0, psi1, psi2 = _compute_psi(kern, X, X_variance, Z, uncertain_inputs) 
+    return Kmm, psi0, psi1, psi2
+
+def _compute_dL_dKmm(num_inducing, output_dim, Lm, B, LB, _LBi_Lmi_psi1Vf):
+    # Compute dL_dKmm
+    delit = tdot(_LBi_Lmi_psi1Vf)
+    data_fit = np.trace(delit)
+    DBi_plus_BiPBi = backsub_both_sides(LB, output_dim * np.eye(num_inducing) + delit)
+    delit = -0.5 * DBi_plus_BiPBi
+    delit += -0.5 * B * output_dim
+    delit += output_dim * np.eye(num_inducing)
+    dL_dKmm = backsub_both_sides(Lm, delit)
+    return DBi_plus_BiPBi, data_fit, dL_dKmm
+
+def _compute_dL_dpsi(num_inducing, num_data, output_dim, beta, Lm, VVT_factor, Cpsi1Vf, DBi_plus_BiPBi, psi1, het_noise, uncertain_inputs):
+    dL_dpsi0 = -0.5 * output_dim * (beta * np.ones([num_data, 1])).flatten()
+    dL_dpsi1 = np.dot(VVT_factor, Cpsi1Vf.T)
+    dL_dpsi2_beta = 0.5 * backsub_both_sides(Lm, output_dim * np.eye(num_inducing) - DBi_plus_BiPBi)
+    if het_noise:
+        if uncertain_inputs:
+            dL_dpsi2 = beta[:, None, None] * dL_dpsi2_beta[None, :, :]
+        else:
+            dL_dpsi1 += 2.*np.dot(dL_dpsi2_beta, (psi1 * beta.reshape(num_data, 1)).T).T
+            dL_dpsi2 = None
+    else:
+        dL_dpsi2 = beta * dL_dpsi2_beta
+        if uncertain_inputs:
+            # repeat for each of the N psi_2 matrices
+            dL_dpsi2 = np.repeat(dL_dpsi2[None, :, :], num_data, axis=0)
+        else:
+            # subsume back into psi1 (==Kmn)
+            dL_dpsi1 += 2.*np.dot(psi1, dL_dpsi2)
+            dL_dpsi2 = None
+
+    return dL_dpsi0, dL_dpsi1, dL_dpsi2
+
+
+def _compute_psi1Vf(Lm, LB, psi1Vf):
+    # back substutue C into psi1Vf
+    tmp, _ = dtrtrs(Lm, psi1Vf, lower=1, trans=0)
+    _LBi_Lmi_psi1Vf, _ = dtrtrs(LB, tmp, lower=1, trans=0)
+    tmp, _ = dtrtrs(LB, _LBi_Lmi_psi1Vf, lower=1, trans=1)
+    Cpsi1Vf, _ = dtrtrs(Lm, tmp, lower=1, trans=1)
+    return _LBi_Lmi_psi1Vf, Cpsi1Vf
+
+
+def _compute_partial_for_likelihood(likelihood, het_noise, uncertain_inputs, LB, _LBi_Lmi_psi1Vf, DBi_plus_BiPBi, Lm, A, psi0, psi1, beta, data_fit, num_data, output_dim, trYYT):
+    # the partial derivative vector for the likelihood
+    if likelihood.size == 0:
+        # save computation here.
+        partial_for_likelihood = None
+    elif het_noise:
+        if uncertain_inputs:
+            raise NotImplementedError, "heteroscedatic derivates with uncertain inputs not implemented"
+        else:
+            from ...util.linalg import chol_inv
+            LBi = chol_inv(LB)
+            Lmi_psi1, nil = dtrtrs(Lm, psi1.T, lower=1, trans=0)
+            _LBi_Lmi_psi1, _ = dtrtrs(LB, Lmi_psi1, lower=1, trans=0)
+
+            partial_for_likelihood = -0.5 * beta + 0.5 * likelihood.V**2
+            partial_for_likelihood += 0.5 * output_dim * (psi0 - np.sum(Lmi_psi1**2,0))[:,None] * beta**2
+
+            partial_for_likelihood += 0.5*np.sum(mdot(LBi.T,LBi,Lmi_psi1)*Lmi_psi1,0)[:,None]*beta**2
+
+            partial_for_likelihood += -np.dot(_LBi_Lmi_psi1Vf.T,_LBi_Lmi_psi1).T * likelihood.Y * beta**2
+            partial_for_likelihood += 0.5*np.dot(_LBi_Lmi_psi1Vf.T,_LBi_Lmi_psi1).T**2 * beta**2
+
+    else:
+        # likelihood is not heteroscedatic
+        partial_for_likelihood = -0.5 * num_data * output_dim * beta + 0.5 * trYYT * beta ** 2
+        partial_for_likelihood += 0.5 * output_dim * (psi0.sum() * beta ** 2 - np.trace(A) * beta)
+        partial_for_likelihood += beta * (0.5 * np.sum(A * DBi_plus_BiPBi) - data_fit)
+    return partial_for_likelihood
+
+def _compute_log_marginal_likelihood(likelihood, num_data, output_dim, beta, het_noise, psi0, A, LB, trYYT, data_fit):
+#compute log marginal likelihood
+    if het_noise:
+        lik_1 = -0.5 * num_data * output_dim * np.log(2. * np.pi) + 0.5 * np.sum(np.log(beta)) - 0.5 * np.sum(likelihood.V * likelihood.Y)
+        lik_2 = -0.5 * output_dim * (np.sum(beta * psi0) - np.trace(A))
+    else:
+        lik_1 = -0.5 * num_data * output_dim * (np.log(2. * np.pi) - np.log(beta)) - 0.5 * beta * trYYT
+        lik_2 = -0.5 * output_dim * (np.sum(beta * psi0) - np.trace(A))
+    lik_3 = -output_dim * (np.sum(np.log(np.diag(LB))))
+    lik_4 = 0.5 * data_fit
+    log_marginal = lik_1 + lik_2 + lik_3 + lik_4
+    return log_marginal
+
+def _do_inference_on(kern, X, X_variance, Z, likelihood, uncertain_inputs, output_dim, beta, VVT_factor, trYYT):
+    het_noise = beta.size < 1
+    num_inducing = Z.shape[0]
+    num_data = X.shape[0]
+    # kernel computations, using BGPLVM notation
+    Kmm, psi0, psi1, psi2 = _compute_Kmm(kern, X, X_variance, Z, uncertain_inputs)
+    #factor Kmm # TODO: cache?
+    Lm = jitchol(Kmm)
+    A = _compute_A(num_data, uncertain_inputs, beta, het_noise, psi1, psi2, Lm)
+    # factor B
+    B = np.eye(num_inducing) + A
+    LB = jitchol(B)
+    psi1Vf = np.dot(psi1.T, VVT_factor)
+    _LBi_Lmi_psi1Vf, Cpsi1Vf = _compute_psi1Vf(Lm, LB, psi1Vf)
+    # data fit and derivative of L w.r.t. Kmm
+    DBi_plus_BiPBi, data_fit, dL_dKmm = _compute_dL_dKmm(num_inducing, output_dim, 
+        Lm, B, LB, _LBi_Lmi_psi1Vf)
+    # derivatives of L w.r.t. psi
+    dL_dpsi0, dL_dpsi1, dL_dpsi2 = _compute_dL_dpsi(num_inducing, num_data, output_dim, beta, Lm, 
+        VVT_factor, Cpsi1Vf, DBi_plus_BiPBi, 
+        psi1, het_noise, uncertain_inputs)
+    # log marginal likelihood
+    log_marginal = _compute_log_marginal_likelihood(likelihood, num_data, output_dim, beta, het_noise, 
+        psi0, A, LB, trYYT, data_fit)
+    #put the gradients in the right places
+    partial_for_likelihood = _compute_partial_for_likelihood(likelihood, 
+        het_noise, uncertain_inputs, LB, 
+        _LBi_Lmi_psi1Vf, DBi_plus_BiPBi, Lm, A, 
+        psi0, psi1, beta, 
+        data_fit, num_data, output_dim, trYYT)
+    return dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, Cpsi1Vf, psi1, Lm, LB, log_marginal, Kmm, partial_for_likelihood
diff --git a/GPy/kern/constructors.py b/GPy/kern/constructors.py
index a362aff8..96afc78a 100644
--- a/GPy/kern/constructors.py
+++ b/GPy/kern/constructors.py
@@ -136,7 +136,7 @@ def poly(input_dim,variance=1., weight_variance=None,bias_variance=1.,degree=2,
     part = parts.poly.POLY(input_dim,variance,weight_variance,bias_variance,degree,ARD)
     return kern(input_dim, [part])
 
-def white(input_dim,variance=1.):
+def white(input_dim,variance=1.,name='white'):
     """
      Construct a white kernel.
 
@@ -146,7 +146,7 @@ def white(input_dim,variance=1.):
     :type variance: float
 
     """
-    part = parts.white.White(input_dim,variance)
+    part = parts.white.White(input_dim,variance,name=name)
     return kern(input_dim, [part])
 
 def eq_ode1(output_dim, W=None, rank=1,  kappa=None, length_scale=1., decay=None, delay=None):
diff --git a/GPy/kern/parts/linear.py b/GPy/kern/parts/linear.py
index db7c9834..828ece11 100644
--- a/GPy/kern/parts/linear.py
+++ b/GPy/kern/parts/linear.py
@@ -60,7 +60,7 @@ class Linear(Kernpart):
         self._K_computations(X, None)
 
     def update_gradients_full(self, dL_dK, X):
-        #self.variances.gradient[:] = 0
+        self.variances.gradient[:] = 0
         self._param_grad_helper(dL_dK, X, None, self.variances.gradient)
     
     def update_gradients_sparse(self, dL_dKmm, dL_dKnm, dL_dKdiag, X, Z):
diff --git a/GPy/kern/parts/white.py b/GPy/kern/parts/white.py
index b0e961c7..c7e4c6dd 100644
--- a/GPy/kern/parts/white.py
+++ b/GPy/kern/parts/white.py
@@ -15,8 +15,8 @@ class White(Kernpart):
     :param variance:
     :type variance: float
     """
-    def __init__(self,input_dim,variance=1.):
-        super(White, self).__init__(input_dim, 'white')
+    def __init__(self,input_dim,variance=1., name='white'):
+        super(White, self).__init__(input_dim, name)
         self.input_dim = input_dim
         self.variance = Param('variance', variance, Logexp())
         self.add_parameters(self.variance)
diff --git a/GPy/models/gp_regression.py b/GPy/models/gp_regression.py
index cd24ac18..a72acc1a 100644
--- a/GPy/models/gp_regression.py
+++ b/GPy/models/gp_regression.py
@@ -28,6 +28,7 @@ class GPRegression(GP):
         likelihood = likelihoods.Gaussian()
 
         super(GPRegression, self).__init__(X, Y, kernel, likelihood, name='GP regression')
+        self.parameters_changed()
 
     def _getstate(self):
         return GP._getstate(self)
diff --git a/GPy/models/gplvm.py b/GPy/models/gplvm.py
index 06481b81..630b1e8c 100644
--- a/GPy/models/gplvm.py
+++ b/GPy/models/gplvm.py
@@ -44,7 +44,7 @@ class GPLVM(GP):
             PC = PCA(Y, input_dim)[0]
             Xr[:PC.shape[0], :PC.shape[1]] = PC
         else:
-            raise NotImplementedError
+            pass
         return Xr
 
     def parameters_changed(self):
diff --git a/GPy/models/mrd.py b/GPy/models/mrd.py
index 3e105785..511ce5aa 100644
--- a/GPy/models/mrd.py
+++ b/GPy/models/mrd.py
@@ -10,6 +10,22 @@ import pylab
 from GPy.kern.kern import kern
 from GPy.models.bayesian_gplvm import BayesianGPLVM
 
+class MRD2(Model):
+    """
+    Apply MRD to all given datasets Y in Ylist. 
+    
+    Y_i in [n x p_i]
+    
+    The samples n in the datasets need 
+    to match up, whereas the dimensionality p_d can differ.
+    
+    :param [array-like] Ylist: List of datasets to apply MRD on
+    :param array-like q_mean: mean of starting latent space q in [n x q]
+    :param array-like q_variance: variance of starting latent space q in [n x q]
+    :param :class:`~GPy.inference.latent_function_inference
+    """
+    
+
 class MRD(Model):
     """
     Do MRD on given Datasets in Ylist.
diff --git a/GPy/plotting/matplot_dep/models_plots.py b/GPy/plotting/matplot_dep/models_plots.py
index 4e6589b4..c9896116 100644
--- a/GPy/plotting/matplot_dep/models_plots.py
+++ b/GPy/plotting/matplot_dep/models_plots.py
@@ -132,10 +132,10 @@ def plot_fit(model, plot_limits=None, which_data_rows='all',
         #predict on the frame and plot
         if plot_raw:
             m, _ = model._raw_predict(Xgrid, which_parts=which_parts)
-            Y = model.likelihood.Y
+            Y = model.Y
         else:
             m, _, _, _ = model.predict(Xgrid, which_parts=which_parts)
-            Y = model.likelihood.data
+            Y = model.data
         for d in which_data_ycols:
             m_d = m[:,d].reshape(resolution, resolution).T
             ax.contour(x, y, m_d, levels, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet)
diff --git a/GPy/util/caching.py b/GPy/util/caching.py
index 8806e6d3..51ba56f3 100644
--- a/GPy/util/caching.py
+++ b/GPy/util/caching.py
@@ -30,7 +30,14 @@ class Cacher(object):
 
     def on_cache_changed(self, X):
         #print id(X)
-        i = self.cached_inputs.index(X)
+        Xbase = X
+        while Xbase is not None:
+            try:
+                i = self.cached_inputs.index(X)
+                break
+            except ValueError:
+                Xbase = X.base
+                continue
         self.inputs_changed[i] = True
 
                 
diff --git a/GPy/util/linalg.py b/GPy/util/linalg.py
index 6bcddc3e..22b4f86c 100644
--- a/GPy/util/linalg.py
+++ b/GPy/util/linalg.py
@@ -101,17 +101,17 @@ def jitchol(A, maxtries=5):
 
 
 
-def dtrtri(L, lower=1):
-    """
-    Wrapper for lapack dtrtri function
-    Inverse of L
-
-    :param L: Triangular Matrix L
-    :param lower: is matrix lower (true) or upper (false)
-    :returns: Li, info
-    """
-    L = force_F_ordered(L)
-    return lapack.dtrtri(L, lower=lower)
+# def dtrtri(L, lower=1):
+#     """
+#     Wrapper for lapack dtrtri function
+#     Inverse of L
+# 
+#     :param L: Triangular Matrix L
+#     :param lower: is matrix lower (true) or upper (false)
+#     :returns: Li, info
+#     """
+#     L = force_F_ordered(L)
+#     return lapack.dtrtri(L, lower=lower)
 
 def dtrtrs(A, B, lower=1, trans=0, unitdiag=0):
     """
diff --git a/GPy/util/subarray_and_sorting.py b/GPy/util/subarray_and_sorting.py
index 49385771..ab7b7672 100644
--- a/GPy/util/subarray_and_sorting.py
+++ b/GPy/util/subarray_and_sorting.py
@@ -17,7 +17,7 @@ def common_subarrays(X, axis=0):
     
     :param :class:`np.ndarray` X: 2d array to check for common subarrays in
     :param int axis: axis to apply subarray detection over. 
-        When the index is 0, compare rows, columns, otherwise.   
+        When the index is 0, compare rows -- columns, otherwise.   
     
     Examples:
     =========