diff --git a/GPy/__init__.py b/GPy/__init__.py index 32f0c1c4..d5520aa4 100644 --- a/GPy/__init__.py +++ b/GPy/__init__.py @@ -4,8 +4,6 @@ import warnings warnings.filterwarnings("ignore", category=DeprecationWarning) from . import core -from .core.parameterization import transformations, priors -constraints = transformations from . import models from . import mappings from . import inference @@ -13,16 +11,16 @@ from . import util from . import examples from . import likelihoods from . import testing -from numpy.testing import Tester from . import kern from . import plotting # Direct imports for convenience: -from .core import Model -from .core.parameterization import Param, Parameterized, ObsAr +from .core import ProbabilisticModel, priors +from paramz import Param, Parameterized, ObsAr, transformations as constraints from .__version__ import __version__ +from numpy.testing import Tester #@nottest try: #Get rid of nose dependency by only ignoring if you have nose installed diff --git a/GPy/core/__init__.py b/GPy/core/__init__.py index 4559d86d..ae1e6769 100644 --- a/GPy/core/__init__.py +++ b/GPy/core/__init__.py @@ -1,10 +1,8 @@ # Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt). # Licensed under the BSD 3-clause license (see LICENSE.txt) -from GPy.core.probabilistic_model import * -from .parameterization.parameterized import adjust_name_for_printing, Parameterizable -from .parameterization.param import Param, ParamConcatenation -from .parameterization.observable_array import ObsAr +from .probabilistic_model import ProbabilisticModel +from .parameterization import Param, Parameterized from .gp import GP from .svgp import SVGP diff --git a/GPy/core/gp.py b/GPy/core/gp.py index 27cf0eea..dfb0f6b7 100644 --- a/GPy/core/gp.py +++ b/GPy/core/gp.py @@ -2,21 +2,20 @@ # Licensed under the BSD 3-clause license (see LICENSE.txt) import numpy as np -import sys from .. import kern -from GPy.core.probabilistic_model import Model -from .parameterization import ObsAr +from .probabilistic_model import ProbabilisticModel +from paramz import ObsAr from .mapping import Mapping from .. import likelihoods from ..inference.latent_function_inference import exact_gaussian_inference, expectation_propagation -from .parameterization.variational import VariationalPosterior +from .variational import VariationalPosterior import logging import warnings from GPy.util.normalizer import MeanNorm logger = logging.getLogger("GP") -class GP(Model): +class GP(ProbabilisticModel): """ General purpose Gaussian process model diff --git a/GPy/core/parameterization/__init__.py b/GPy/core/parameterization/__init__.py new file mode 100644 index 00000000..813d12fe --- /dev/null +++ b/GPy/core/parameterization/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt). 
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+from .param import Param
+from .parameterized import Parameterized
\ No newline at end of file
diff --git a/GPy/core/parameterization/param.py b/GPy/core/parameterization/param.py
new file mode 100644
index 00000000..ee7b9bbd
--- /dev/null
+++ b/GPy/core/parameterization/param.py
@@ -0,0 +1,8 @@
+# Copyright (c) 2014, Max Zwiessele
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+from paramz import Param
+from .priorizable import Priorizable
+
+class Param(Param, Priorizable):
+    pass
\ No newline at end of file
diff --git a/GPy/core/parameterization/parameterized.py b/GPy/core/parameterization/parameterized.py
new file mode 100644
index 00000000..5e19e592
--- /dev/null
+++ b/GPy/core/parameterization/parameterized.py
@@ -0,0 +1,52 @@
+# Copyright (c) 2014, Max Zwiessele, James Hensman
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+from paramz import Parameterized
+from .priorizable import Priorizable
+
+import logging
+logger = logging.getLogger("parameters changed meta")
+
+class Parameterized(Parameterized, Priorizable):
+    """
+    Parameterized class
+
+    Say m is a handle to a parameterized class.
+
+    Printing parameters:
+
+    - print m: prints a nice summary over all parameters
+    - print m.name: prints details for the param with name 'name'
+    - print m[regexp]: prints details for all the parameters
+      which match (!) the regexp
+    - print m['']: prints details for all parameters
+
+    Fields:
+
+    Name: The name of the param; can be renamed!
+    Value: Shape, or value if one-valued
+    Constrain: constraint of the param; curly "{c}" brackets indicate
+    that some parameters are constrained by c. See the detailed print
+    to get the exact constraints.
+    Tied_to: which parameter it is tied to.
+
+    Getting and setting parameters:
+
+    Set all values in param to one:
+
+    m.name.to.param = 1
+
+    Handling of constraining, fixing and tying parameters:
+
+    You can constrain parameters by calling constrain on the param itself, e.g.:
+
+    - m.name[:,1].constrain_positive()
+    - m.name[0].tie_to(m.name[1])
+
+    Fixing parameters will fix them to their current value. If you change
+    the parameter's value, the param will be fixed to the new value!
+
+    If you want to operate on all parameters, use m[''] to wildcard-select all parameters
+    and concatenate them. Printing m[''] will print all parameters in detail.
+    """
+    pass
\ No newline at end of file
diff --git a/GPy/core/parameterization/priorizable.py b/GPy/core/parameterization/priorizable.py
new file mode 100644
index 00000000..390fee71
--- /dev/null
+++ b/GPy/core/parameterization/priorizable.py
@@ -0,0 +1,80 @@
+# Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+import numpy as np
+from paramz.transformations import Transformation, __fixed__
+from paramz.core.parameter_core import Parameterizable
+
+class Priorizable(Parameterizable):
+    def __init__(self, name, default_prior=None, *a, **kw):
+        super(Priorizable, self).__init__(name=name, *a, **kw)
+        self._default_prior_ = default_prior
+        from paramz.core.index_operations import ParameterIndexOperations
+        self.add_index_operation('priors', ParameterIndexOperations())
+        if self._default_prior_ is not None:
+            self.set_prior(self._default_prior_)
+
+    #===========================================================================
+    # Prior Operations
+    #===========================================================================
+    def set_prior(self, prior, warning=True):
+        """
+        Set the prior for this object to prior.
+        :param :class:`~GPy.priors.Prior` prior: a prior to set for this parameter
+        :param bool warning: whether to warn if another prior was set for this parameter
+        """
+        repriorized = self.unset_priors()
+        self._add_to_index_operations(self.priors, repriorized, prior, warning)
+
+        from paramz.domains import _REAL, _POSITIVE, _NEGATIVE
+        if prior.domain is _POSITIVE:
+            self.constrain_positive(warning)
+        elif prior.domain is _NEGATIVE:
+            self.constrain_negative(warning)
+        elif prior.domain is _REAL:
+            rav_i = self._raveled_index()
+            assert all(all(False if c is __fixed__ else c.domain is _REAL for c in con) for con in self.constraints.properties_for(rav_i)), 'Domain of prior and constraint have to match, please unconstrain if you REALLY wish to use this prior'
+
+    def unset_priors(self, *priors):
+        """
+        Un-set all priors given (in *priors) from this parameter handle.
+        """
+        return self._remove_from_index_operations(self.priors, priors)
+
+    def log_prior(self):
+        """evaluate the prior"""
+        if self.priors.size == 0:
+            return 0.
+        x = self.param_array
+        # evaluate the prior log densities
+        log_p = sum(p.lnpdf(x[ind]).sum() for p, ind in self.priors.items())  # sum() is Py2/3-safe; reduce is not a builtin in Python 3
+
+        # account for the transformation by evaluating the log Jacobian (where things are transformed)
+        log_j = 0.
+        priored_indexes = np.hstack([i for p, i in self.priors.items()])
+        for c, j in self.constraints.items():
+            if not isinstance(c, Transformation): continue
+            for jj in j:
+                if jj in priored_indexes:
+                    log_j += c.log_jacobian(x[jj])
+        return log_p + log_j
+
+    def _log_prior_gradients(self):
+        """evaluate the gradients of the priors"""
+        if self.priors.size == 0:
+            return 0.
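+        # Example (assuming the paramz Transformation API): for a parameter
+        # with a Logexp constraint and a Gamma prior, ret combines the prior
+        # gradient d/dx ln Ga(x | a, b) with the Jacobian correction
+        # d/dx ln |d Logexp(x) / dx| at the current parameter values.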
+        x = self.param_array
+        ret = np.zeros(x.size)
+        # compute derivative of prior density
+        [np.put(ret, ind, p.lnpdf_grad(x[ind])) for p, ind in self.priors.items()]
+        # add in jacobian derivatives if transformed
+        priored_indexes = np.hstack([i for p, i in self.priors.items()])
+        for c, j in self.constraints.items():
+            if not isinstance(c, Transformation): continue
+            for jj in j:
+                if jj in priored_indexes:
+                    ret[jj] += c.log_jacobian_grad(x[jj])
+        return ret
diff --git a/GPy/core/parameterization/transformations.py b/GPy/core/parameterization/transformations.py
new file mode 100644
index 00000000..1799a06d
--- /dev/null
+++ b/GPy/core/parameterization/transformations.py
@@ -0,0 +1,4 @@
+# Copyright (c) 2014, Max Zwiessele, James Hensman
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+from paramz.transformations import *
\ No newline at end of file
diff --git a/GPy/core/priors.py b/GPy/core/priors.py
new file mode 100644
index 00000000..df2173ea
--- /dev/null
+++ b/GPy/core/priors.py
@@ -0,0 +1,1313 @@
+# Copyright (c) 2012 - 2014, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+
+import numpy as np
+from scipy.special import gammaln, digamma
+from ..util.linalg import pdinv
+from paramz.domains import _REAL, _POSITIVE
+import warnings
+import weakref
+
+
+class Prior(object):
+    domain = None
+    _instance = None
+    def __new__(cls, *args, **kwargs):
+        if not cls._instance or cls._instance.__class__ is not cls:
+            newfunc = super(Prior, cls).__new__
+            if newfunc is object.__new__:
+                cls._instance = newfunc(cls)
+            else:
+                cls._instance = newfunc(cls, *args, **kwargs)
+        return cls._instance
+
+    def pdf(self, x):
+        return np.exp(self.lnpdf(x))
+
+    def plot(self):
+        import sys
+
+        assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
+        from ..plotting.matplot_dep import priors_plots  # this module lives in GPy/core, so plotting is two levels up
+
+        priors_plots.univariate_plot(self)
+
+    def __repr__(self, *args, **kwargs):
+        return self.__str__()
+
+
+class Gaussian(Prior):
+    """
+    Implementation of the univariate Gaussian probability function, coupled with random variables.
+
+    :param mu: mean
+    :param sigma: standard deviation
+
+    .. Note:: Bishop 2006 notation is used throughout the code
+
+    """
+    domain = _REAL
+    _instances = []
+
+    def __new__(cls, mu=0, sigma=1):  # Singleton:
+        if cls._instances:
+            cls._instances[:] = [instance for instance in cls._instances if instance()]
+            for instance in cls._instances:
+                if instance().mu == mu and instance().sigma == sigma:
+                    return instance()
+        newfunc = super(Prior, cls).__new__
+        if newfunc is object.__new__:
+            o = newfunc(cls)
+        else:
+            o = newfunc(cls, mu, sigma)
+        cls._instances.append(weakref.ref(o))
+        return cls._instances[-1]()
+
+    def __init__(self, mu, sigma):
+        self.mu = float(mu)
+        self.sigma = float(sigma)
+        self.sigma2 = np.square(self.sigma)
+        self.constant = -0.5 * np.log(2 * np.pi * self.sigma2)
+
+    def __str__(self):
+        return "N({:.2g}, {:.2g})".format(self.mu, self.sigma)
+
+    def lnpdf(self, x):
+        return self.constant - 0.5 * np.square(x - self.mu) / self.sigma2
+
+    def lnpdf_grad(self, x):
+        return -(x - self.mu) / self.sigma2
+
+    def rvs(self, n):
+        return np.random.randn(n) * self.sigma + self.mu
+
+#     def __getstate__(self):
+#         return self.mu, self.sigma
+#
+#     def __setstate__(self, state):
+#         self.mu = state[0]
+#         self.sigma = state[1]
+#         self.sigma2 = np.square(self.sigma)
+#         self.constant = -0.5 * np.log(2 * np.pi * self.sigma2)
+
+class Uniform(Prior):
+    domain = _REAL
+    _instances = []
+
+    def __new__(cls, lower=0, upper=1):  # Singleton:
+        if cls._instances:
+            cls._instances[:] = [instance for instance in cls._instances if instance()]
+            for instance in cls._instances:
+                if instance().lower == lower and instance().upper == upper:
+                    return instance()
+        o = super(Prior, cls).__new__(cls)  # object.__new__ takes no extra arguments in Python 3
+        cls._instances.append(weakref.ref(o))
+        return cls._instances[-1]()
+
+    def __init__(self, lower, upper):
+        self.lower = float(lower)
+        self.upper = float(upper)
+
+    def __str__(self):
+        return "[{:.2g}, {:.2g}]".format(self.lower, self.upper)
+
+    def lnpdf(self, x):
+        region = (x >= self.lower) * (x <= self.upper)
+        return region  # NB: an indicator of the support, not a normalised log-density
+
+    def lnpdf_grad(self, x):
+        return np.zeros(x.shape)
+
+    def rvs(self, n):
+        return np.random.uniform(self.lower, self.upper, size=n)
+
+#     def __getstate__(self):
+#         return self.lower, self.upper
+#
+#     def __setstate__(self, state):
+#         self.lower = state[0]
+#         self.upper = state[1]
+
+class LogGaussian(Gaussian):
+    """
+    Implementation of the univariate *log*-Gaussian probability function, coupled with random variables.
+
+    :param mu: mean
+    :param sigma: standard deviation
+
+    .. Note:: Bishop 2006 notation is used throughout the code
+
+    """
+    domain = _POSITIVE
+    _instances = []
+
+    def __new__(cls, mu=0, sigma=1):  # Singleton:
+        if cls._instances:
+            cls._instances[:] = [instance for instance in cls._instances if instance()]
+            for instance in cls._instances:
+                if instance().mu == mu and instance().sigma == sigma:
+                    return instance()
+        newfunc = super(Prior, cls).__new__
+        if newfunc is object.__new__:
+            o = newfunc(cls)
+        else:
+            o = newfunc(cls, mu, sigma)
+        cls._instances.append(weakref.ref(o))
+        return cls._instances[-1]()
+
+    def __init__(self, mu, sigma):
+        self.mu = float(mu)
+        self.sigma = float(sigma)
+        self.sigma2 = np.square(self.sigma)
+        self.constant = -0.5 * np.log(2 * np.pi * self.sigma2)
+
+    def __str__(self):
+        return "lnN({:.2g}, {:.2g})".format(self.mu, self.sigma)
+
+    def lnpdf(self, x):
+        return self.constant - 0.5 * np.square(np.log(x) - self.mu) / self.sigma2 - np.log(x)
+
+    def lnpdf_grad(self, x):
+        return -((np.log(x) - self.mu) / self.sigma2 + 1.) / x
+
+    def rvs(self, n):
+        return np.exp(np.random.randn(int(n)) * self.sigma + self.mu)
+
+
+class MultivariateGaussian(Prior):
+    """
+    Implementation of the multivariate Gaussian probability function, coupled with random variables.
+
+    :param mu: mean (N-dimensional array)
+    :param var: covariance matrix (NxN)
+
+    .. Note:: Bishop 2006 notation is used throughout the code
+
+    """
+    domain = _REAL
+    _instances = []
+
+    def __new__(cls, mu=0, var=1):  # Singleton:
+        if cls._instances:
+            cls._instances[:] = [instance for instance in cls._instances if instance()]
+            for instance in cls._instances:
+                if np.all(instance().mu == mu) and np.all(instance().var == var):
+                    return instance()
+        o = super(Prior, cls).__new__(cls)  # object.__new__ takes no extra arguments in Python 3
+        cls._instances.append(weakref.ref(o))
+        return cls._instances[-1]()
+
+    def __init__(self, mu, var):
+        self.mu = np.array(mu).flatten()
+        self.var = np.array(var)
+        assert len(self.var.shape) == 2
+        assert self.var.shape[0] == self.var.shape[1]
+        assert self.var.shape[0] == self.mu.size
+        self.input_dim = self.mu.size
+        self.inv, self.hld = pdinv(self.var)
+        self.constant = -0.5 * self.input_dim * np.log(2 * np.pi) - self.hld
+
+    def summary(self):
+        raise NotImplementedError
+
+    def pdf(self, x):
+        return np.exp(self.lnpdf(x))
+
+    def lnpdf(self, x):
+        d = x - self.mu
+        return self.constant - 0.5 * np.sum(d * np.dot(d, self.inv), 1)
+
+    def lnpdf_grad(self, x):
+        d = x - self.mu
+        return -np.dot(self.inv, d)
+
+    def rvs(self, n):
+        return np.random.multivariate_normal(self.mu, self.var, n)
+
+    def plot(self):
+        import sys
+
+        assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
+        from ..plotting.matplot_dep import priors_plots
+
+        priors_plots.multivariate_plot(self)
+
+    def __getstate__(self):
+        return self.mu, self.var
+
+    def __setstate__(self, state):
+        self.mu = state[0]
+        self.var = state[1]
+        assert len(self.var.shape) == 2
+        assert self.var.shape[0] == self.var.shape[1]
+        assert self.var.shape[0] == self.mu.size
+        self.input_dim = self.mu.size
+        self.inv, self.hld = pdinv(self.var)
+        self.constant = -0.5 * self.input_dim * np.log(2 * np.pi) - self.hld
+
+def gamma_from_EV(E, V):
+    warnings.warn("use Gamma.from_EV to create Gamma Prior", FutureWarning)
+    return Gamma.from_EV(E, V)
+
+
+class Gamma(Prior):
+    """
+    Implementation of the Gamma probability function, coupled with random variables.
+
+    :param a: shape parameter
+    :param b: rate parameter (warning: it's the *inverse* of the scale)
+
+    .. Note:: Bishop 2006 notation is used throughout the code
+
+    """
+    domain = _POSITIVE
+    _instances = []
+
+    def __new__(cls, a=1, b=.5):  # Singleton:
+        if cls._instances:
+            cls._instances[:] = [instance for instance in cls._instances if instance()]
+            for instance in cls._instances:
+                if instance().a == a and instance().b == b:
+                    return instance()
+        newfunc = super(Prior, cls).__new__
+        if newfunc is object.__new__:
+            o = newfunc(cls)
+        else:
+            o = newfunc(cls, a, b)
+        cls._instances.append(weakref.ref(o))
+        return cls._instances[-1]()
+
+    def __init__(self, a, b):
+        self.a = float(a)
+        self.b = float(b)
+        self.constant = -gammaln(self.a) + a * np.log(b)
+
+    def __str__(self):
+        return "Ga({:.2g}, {:.2g})".format(self.a, self.b)
+
+    def summary(self):
+        ret = {"E[x]": self.a / self.b, \
+               "E[ln x]": digamma(self.a) - np.log(self.b), \
+               "var[x]": self.a / self.b / self.b, \
+               "Entropy": gammaln(self.a) - (self.a - 1.) * digamma(self.a) - np.log(self.b) + self.a}
+        if self.a > 1:
+            ret['Mode'] = (self.a - 1.) / self.b
+        else:
+            ret['Mode'] = np.nan
+        return ret
+
+    def lnpdf(self, x):
+        return self.constant + (self.a - 1) * np.log(x) - self.b * x
+
+    def lnpdf_grad(self, x):
+        return (self.a - 1.) / x - self.b
+
+    def rvs(self, n):
+        return np.random.gamma(scale=1. / self.b, shape=self.a, size=n)
+
+    @staticmethod
+    def from_EV(E, V):
+        """
+        Creates an instance of a Gamma Prior by specifying the Expected value(s)
+        and Variance(s) of the distribution.
+
+        :param E: expected value
+        :param V: variance
+        """
+        a = np.square(E) / V
+        b = E / V
+        return Gamma(a, b)
+
+    def __getstate__(self):
+        return self.a, self.b
+
+    def __setstate__(self, state):
+        self.a = state[0]
+        self.b = state[1]
+        self.constant = -gammaln(self.a) + self.a * np.log(self.b)
+
+class InverseGamma(Gamma):
+    """
+    Implementation of the inverse-Gamma probability function, coupled with random variables.
+
+    :param a: shape parameter
+    :param b: scale parameter
+
+    .. Note:: Bishop 2006 notation is used throughout the code
+
+    """
+    domain = _POSITIVE
+    _instances = []
+    def __new__(cls, a=1, b=.5):  # Singleton:
+        if cls._instances:
+            cls._instances[:] = [instance for instance in cls._instances if instance()]
+            for instance in cls._instances:
+                if instance().a == a and instance().b == b:
+                    return instance()
+        o = super(Prior, cls).__new__(cls)  # object.__new__ takes no extra arguments in Python 3
+        cls._instances.append(weakref.ref(o))
+        return cls._instances[-1]()
+
+    def __init__(self, a, b):
+        self.a = float(a)
+        self.b = float(b)
+        self.constant = -gammaln(self.a) + a * np.log(b)
+
+    def __str__(self):
+        return "iGa({:.2g}, {:.2g})".format(self.a, self.b)
+
+    def lnpdf(self, x):
+        return self.constant - (self.a + 1) * np.log(x) - self.b / x
+
+    def lnpdf_grad(self, x):
+        return -(self.a + 1.) / x + self.b / x ** 2
+
+    def rvs(self, n):
+        return 1. / np.random.gamma(scale=1. / self.b, shape=self.a, size=n)
+
+
+class DGPLVM_KFDA(Prior):
+    """
+    Implementation of the Discriminative Gaussian Process Latent Variable model, using
+    Kernel Fisher Discriminant Analysis (after Seung-Jean Kim), as used in the face
+    verification paper by Chaochao Lu.
+
+    :param lambdaa: constant
+    :param sigma2: constant
+
+    .. Note:: DGPLVM implementation for the 'Surpassing Human-Level Face Verification' paper
+
+    """
+    domain = _REAL
+    # _instances = []
+    # def __new__(cls, lambdaa, sigma2): # Singleton:
+    #     if cls._instances:
+    #         cls._instances[:] = [instance for instance in cls._instances if instance()]
+    #         for instance in cls._instances:
+    #             if instance().mu == mu and instance().sigma == sigma:
+    #                 return instance()
+    #     o = super(Prior, cls).__new__(cls, mu, sigma)
+    #     cls._instances.append(weakref.ref(o))
+    #     return cls._instances[-1]()
+
+    def __init__(self, lambdaa, sigma2, lbl, kern, x_shape):
+        """Initialize from the constants lambdaa and sigma2, one-hot labels lbl, a kernel and the latent shape x_shape."""
+        self.datanum = lbl.shape[0]
+        self.classnum = lbl.shape[1]
+        self.lambdaa = lambdaa
+        self.sigma2 = sigma2
+        self.lbl = lbl
+        self.kern = kern
+        lst_ni = self.compute_lst_ni()
+        self.a = self.compute_a(lst_ni)
+        self.A = self.compute_A(lst_ni)
+        self.x_shape = x_shape
+
+    def get_class_label(self, y):
+        for idx, v in enumerate(y):
+            if v == 1:
+                return idx
+        return -1
+
+    # This function assigns each data point to its own class
+    # and returns the dictionary which contains the class name and parameters.
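+    # (Toy example, assuming one-hot labels: lbl = [[1, 0], [0, 1], [1, 0]]
+    # gives cls = {0: [x[0], x[2]], 1: [x[1]]}.)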
+    def compute_cls(self, x):
+        cls = {}
+        # Appending each data point to its proper class
+        for j in range(self.datanum):
+            class_label = self.get_class_label(self.lbl[j])
+            if class_label not in cls:
+                cls[class_label] = []
+            cls[class_label].append(x[j])
+        if len(cls) > 2:
+            for i in range(2, self.classnum):
+                del cls[i]
+        return cls
+
+    def x_reduced(self, cls):
+        x1 = cls[0]
+        x2 = cls[1]
+        x = np.concatenate((x1, x2), axis=0)
+        return x
+
+    def compute_lst_ni(self):
+        lst_ni = []
+        lst_ni1 = []
+        lst_ni2 = []
+        f1 = (np.where(self.lbl[:, 0] == 1)[0])
+        f2 = (np.where(self.lbl[:, 1] == 1)[0])
+        for idx in f1:
+            lst_ni1.append(idx)
+        for idx in f2:
+            lst_ni2.append(idx)
+        lst_ni.append(len(lst_ni1))
+        lst_ni.append(len(lst_ni2))
+        return lst_ni
+
+    def compute_a(self, lst_ni):
+        a = np.ones((self.datanum, 1))
+        count = 0
+        for N_i in lst_ni:
+            if N_i == lst_ni[0]:
+                a[count:count + N_i] = (float(1) / N_i) * a[count]
+                count += N_i
+            else:
+                if N_i == lst_ni[1]:
+                    a[count: count + N_i] = -(float(1) / N_i) * a[count]
+                    count += N_i
+        return a
+
+    def compute_A(self, lst_ni):
+        A = np.zeros((self.datanum, self.datanum))
+        idx = 0
+        for N_i in lst_ni:
+            B = float(1) / np.sqrt(N_i) * (np.eye(N_i) - ((float(1) / N_i) * np.ones((N_i, N_i))))
+            A[idx:idx + N_i, idx:idx + N_i] = B
+            idx += N_i
+        return A
+
+    # Log of the prior
+    def lnpdf(self, x):
+        x = x.reshape(self.x_shape)
+        K = self.kern.K(x)
+        a_trans = np.transpose(self.a)
+        paran = self.lambdaa * np.eye(x.shape[0]) + self.A.dot(K).dot(self.A)
+        inv_part = pdinv(paran)[0]
+        J = a_trans.dot(K).dot(self.a) - a_trans.dot(K).dot(self.A).dot(inv_part).dot(self.A).dot(K).dot(self.a)
+        J_star = (1. / self.lambdaa) * J
+        return (-1. / self.sigma2) * J_star
+
+    # Gradient of the log prior
+    def lnpdf_grad(self, x):
+        x = x.reshape(self.x_shape)
+        K = self.kern.K(x)
+        paran = self.lambdaa * np.eye(x.shape[0]) + self.A.dot(K).dot(self.A)
+        inv_part = pdinv(paran)[0]
+        b = self.A.dot(inv_part).dot(self.A).dot(K).dot(self.a)
+        a_Minus_b = self.a - b
+        a_b_trans = np.transpose(a_Minus_b)
+        DJ_star_DK = (1. / self.lambdaa) * (a_Minus_b.dot(a_b_trans))
+        DJ_star_DX = self.kern.gradients_X(DJ_star_DK, x)
+        return (-1. / self.sigma2) * DJ_star_DX
+
+    def rvs(self, n):
+        return np.random.rand(n)  # A WRONG implementation
+
+    def __str__(self):
+        return 'DGPLVM_prior'
+
+    def __getstate__(self):
+        return self.lbl, self.lambdaa, self.sigma2, self.kern, self.x_shape
+
+    def __setstate__(self, state):
+        lbl, lambdaa, sigma2, kern, x_shape = state
+        self.datanum = lbl.shape[0]
+        self.classnum = lbl.shape[1]
+        self.lambdaa = lambdaa
+        self.sigma2 = sigma2
+        self.lbl = lbl
+        self.kern = kern
+        lst_ni = self.compute_lst_ni()
+        self.a = self.compute_a(lst_ni)
+        self.A = self.compute_A(lst_ni)
+        self.x_shape = x_shape
+
+
+class DGPLVM(Prior):
+    """
+    Implementation of the Discriminative Gaussian Process Latent Variable model paper by Raquel Urtasun.
+
+    :param sigma2: constant
+
+    .. Note:: DGPLVM for Classification paper implementation
+
+    """
+    domain = _REAL
+
+    def __new__(cls, sigma2, lbl, x_shape):
+        return super(Prior, cls).__new__(cls)  # object.__new__ takes no extra arguments in Python 3
+
+    def __init__(self, sigma2, lbl, x_shape):
+        self.sigma2 = sigma2
+        # self.x = x
+        self.lbl = lbl
+        self.classnum = lbl.shape[1]
+        self.datanum = lbl.shape[0]
+        self.x_shape = x_shape
+        self.dim = x_shape[1]
+
+    def get_class_label(self, y):
+        for idx, v in enumerate(y):
+            if v == 1:
+                return idx
+        return -1
+
+    # This function assigns each data point to its own class
+    # and returns the dictionary which contains the class name and parameters.
+    def compute_cls(self, x):
+        cls = {}
+        # Appending each data point to its proper class
+        for j in range(self.datanum):
+            class_label = self.get_class_label(self.lbl[j])
+            if class_label not in cls:
+                cls[class_label] = []
+            cls[class_label].append(x[j])
+        return cls
+
+    # This function computes the mean of each class. The mean is calculated through each dimension
+    def compute_Mi(self, cls):
+        M_i = np.zeros((self.classnum, self.dim))
+        for i in cls:
+            # Mean of each class
+            class_i = cls[i]
+            M_i[i] = np.mean(class_i, axis=0)
+        return M_i
+
+    # Adding data points as tuples to the dictionary so that we can access indices
+    def compute_indices(self, x):
+        data_idx = {}
+        for j in range(self.datanum):
+            class_label = self.get_class_label(self.lbl[j])
+            if class_label not in data_idx:
+                data_idx[class_label] = []
+            t = (j, x[j])
+            data_idx[class_label].append(t)
+        return data_idx
+
+    # Adding indices to the list so we can access all the indices
+    def compute_listIndices(self, data_idx):
+        lst_idx = []
+        lst_idx_all = []
+        for i in data_idx:
+            if len(lst_idx) == 0:
+                pass
+                # Do nothing: the first time through, the list is already empty
+            else:
+                lst_idx = []
+            # Here we put the indices of each class into the list called lst_idx_all
+            for m in range(len(data_idx[i])):
+                lst_idx.append(data_idx[i][m][0])
+            lst_idx_all.append(lst_idx)
+        return lst_idx_all
+
+    # This function calculates the between-class variance
+    def compute_Sb(self, cls, M_i, M_0):
+        Sb = np.zeros((self.dim, self.dim))
+        for i in cls:
+            B = (M_i[i] - M_0).reshape(self.dim, 1)
+            B_trans = B.transpose()
+            Sb += (float(len(cls[i])) / self.datanum) * B.dot(B_trans)
+        return Sb
+
+    # This function calculates the within-class variance
+    def compute_Sw(self, cls, M_i):
+        Sw = np.zeros((self.dim, self.dim))
+        for i in cls:
+            N_i = float(len(cls[i]))
+            W_WT = np.zeros((self.dim, self.dim))
+            for xk in cls[i]:
+                W = (xk - M_i[i])
+                W_WT += np.outer(W, W)
+            Sw += (N_i / self.datanum) * ((1. 
/ N_i) * W_WT) + return Sw + + # Calculating beta and Bi for Sb + def compute_sig_beta_Bi(self, data_idx, M_i, M_0, lst_idx_all): + # import pdb + # pdb.set_trace() + B_i = np.zeros((self.classnum, self.dim)) + Sig_beta_B_i_all = np.zeros((self.datanum, self.dim)) + for i in data_idx: + # pdb.set_trace() + # Calculating Bi + B_i[i] = (M_i[i] - M_0).reshape(1, self.dim) + for k in range(self.datanum): + for i in data_idx: + N_i = float(len(data_idx[i])) + if k in lst_idx_all[i]: + beta = (float(1) / N_i) - (float(1) / self.datanum) + Sig_beta_B_i_all[k] += float(N_i) / self.datanum * (beta * B_i[i]) + else: + beta = -(float(1) / self.datanum) + Sig_beta_B_i_all[k] += float(N_i) / self.datanum * (beta * B_i[i]) + Sig_beta_B_i_all = Sig_beta_B_i_all.transpose() + return Sig_beta_B_i_all + + + # Calculating W_j s separately so we can access all the W_j s anytime + def compute_wj(self, data_idx, M_i): + W_i = np.zeros((self.datanum, self.dim)) + for i in data_idx: + N_i = float(len(data_idx[i])) + for tpl in data_idx[i]: + xj = tpl[1] + j = tpl[0] + W_i[j] = (xj - M_i[i]) + return W_i + + # Calculating alpha and Wj for Sw + def compute_sig_alpha_W(self, data_idx, lst_idx_all, W_i): + Sig_alpha_W_i = np.zeros((self.datanum, self.dim)) + for i in data_idx: + N_i = float(len(data_idx[i])) + for tpl in data_idx[i]: + k = tpl[0] + for j in lst_idx_all[i]: + if k == j: + alpha = 1 - (float(1) / N_i) + Sig_alpha_W_i[k] += (alpha * W_i[j]) + else: + alpha = 0 - (float(1) / N_i) + Sig_alpha_W_i[k] += (alpha * W_i[j]) + Sig_alpha_W_i = (1. / self.datanum) * np.transpose(Sig_alpha_W_i) + return Sig_alpha_W_i + + # This function calculates log of our prior + def lnpdf(self, x): + x = x.reshape(self.x_shape) + cls = self.compute_cls(x) + M_0 = np.mean(x, axis=0) + M_i = self.compute_Mi(cls) + Sb = self.compute_Sb(cls, M_i, M_0) + Sw = self.compute_Sw(cls, M_i) + # sb_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1)) + #Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1) + #Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0] + Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.1)[0] + return (-1 / self.sigma2) * np.trace(Sb_inv_N.dot(Sw)) + + # This function calculates derivative of the log of prior function + def lnpdf_grad(self, x): + x = x.reshape(self.x_shape) + cls = self.compute_cls(x) + M_0 = np.mean(x, axis=0) + M_i = self.compute_Mi(cls) + Sb = self.compute_Sb(cls, M_i, M_0) + Sw = self.compute_Sw(cls, M_i) + data_idx = self.compute_indices(x) + lst_idx_all = self.compute_listIndices(data_idx) + Sig_beta_B_i_all = self.compute_sig_beta_Bi(data_idx, M_i, M_0, lst_idx_all) + W_i = self.compute_wj(data_idx, M_i) + Sig_alpha_W_i = self.compute_sig_alpha_W(data_idx, lst_idx_all, W_i) + + # Calculating inverse of Sb and its transpose and minus + # Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1)) + #Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1) + #Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0] + Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.1)[0] + Sb_inv_N_trans = np.transpose(Sb_inv_N) + Sb_inv_N_trans_minus = -1 * Sb_inv_N_trans + Sw_trans = np.transpose(Sw) + + # Calculating DJ/DXk + DJ_Dxk = 2 * ( + Sb_inv_N_trans_minus.dot(Sw_trans).dot(Sb_inv_N_trans).dot(Sig_beta_B_i_all) + Sb_inv_N_trans.dot( + Sig_alpha_W_i)) + # Calculating derivative of the log of the prior + DPx_Dx = ((-1 / self.sigma2) * DJ_Dxk) + return DPx_Dx.T + + # def frb(self, x): + # from functools import partial + # from GPy.models import 
GradientChecker
+    #     f = partial(self.lnpdf)
+    #     df = partial(self.lnpdf_grad)
+    #     grad = GradientChecker(f, df, x, 'X')
+    #     grad.checkgrad(verbose=1)
+
+    def rvs(self, n):
+        return np.random.rand(n)  # A WRONG implementation
+
+    def __str__(self):
+        return 'DGPLVM_prior_Raq'
+
+
+# ******************************************
+
+from . import Parameterized
+from . import Param
+
+class DGPLVM_Lamda(Prior, Parameterized):
+    """
+    Implementation of the Discriminative Gaussian Process Latent Variable model paper by Raquel Urtasun.
+
+    :param sigma2: constant
+
+    .. Note:: DGPLVM for Classification paper implementation
+
+    """
+    domain = _REAL
+    # _instances = []
+    # def __new__(cls, mu, sigma): # Singleton:
+    #     if cls._instances:
+    #         cls._instances[:] = [instance for instance in cls._instances if instance()]
+    #         for instance in cls._instances:
+    #             if instance().mu == mu and instance().sigma == sigma:
+    #                 return instance()
+    #     o = super(Prior, cls).__new__(cls, mu, sigma)
+    #     cls._instances.append(weakref.ref(o))
+    #     return cls._instances[-1]()
+
+    def __init__(self, sigma2, lbl, x_shape, lamda, name='DP_prior'):
+        super(DGPLVM_Lamda, self).__init__(name=name)
+        self.sigma2 = sigma2
+        # self.x = x
+        self.lbl = lbl
+        self.lamda = lamda
+        self.classnum = lbl.shape[1]
+        self.datanum = lbl.shape[0]
+        self.x_shape = x_shape
+        self.dim = x_shape[1]
+        self.lamda = Param('lamda', np.diag(lamda))
+        self.link_parameter(self.lamda)
+
+    def get_class_label(self, y):
+        for idx, v in enumerate(y):
+            if v == 1:
+                return idx
+        return -1
+
+    # This function assigns each data point to its own class
+    # and returns the dictionary which contains the class name and parameters.
+    def compute_cls(self, x):
+        cls = {}
+        # Appending each data point to its proper class
+        for j in range(self.datanum):
+            class_label = self.get_class_label(self.lbl[j])
+            if class_label not in cls:
+                cls[class_label] = []
+            cls[class_label].append(x[j])
+        return cls
+
+    # This function computes the mean of each class. The mean is calculated through each dimension
+    def compute_Mi(self, cls):
+        M_i = np.zeros((self.classnum, self.dim))
+        for i in cls:
+            # Mean of each class
+            class_i = cls[i]
+            M_i[i] = np.mean(class_i, axis=0)
+        return M_i
+
+    # Adding data points as tuples to the dictionary so that we can access indices
+    def compute_indices(self, x):
+        data_idx = {}
+        for j in range(self.datanum):
+            class_label = self.get_class_label(self.lbl[j])
+            if class_label not in data_idx:
+                data_idx[class_label] = []
+            t = (j, x[j])
+            data_idx[class_label].append(t)
+        return data_idx
+
+    # Adding indices to the list so we can access all the indices
+    def compute_listIndices(self, data_idx):
+        lst_idx = []
+        lst_idx_all = []
+        for i in data_idx:
+            if len(lst_idx) == 0:
+                pass
+                # Do nothing: the first time through, the list is already empty
+            else:
+                lst_idx = []
+            # Here we put the indices of each class into the list called lst_idx_all
+            for m in range(len(data_idx[i])):
+                lst_idx.append(data_idx[i][m][0])
+            lst_idx_all.append(lst_idx)
+        return lst_idx_all
+
+    # This function calculates the between-class variance
+    def compute_Sb(self, cls, M_i, M_0):
+        Sb = np.zeros((self.dim, self.dim))
+        for i in cls:
+            B = (M_i[i] - M_0).reshape(self.dim, 1)
+            B_trans = B.transpose()
+            Sb += (float(len(cls[i])) / self.datanum) * B.dot(B_trans)
+        return Sb
+
+    # This function calculates the within-class variance
+    def compute_Sw(self, cls, M_i):
+        Sw = np.zeros((self.dim, self.dim))
+        for i in cls:
+            N_i = float(len(cls[i]))
+            W_WT = np.zeros((self.dim, self.dim))
+            for xk in cls[i]:
+                W = (xk - M_i[i])
+                W_WT += np.outer(W, W)
+            Sw += (N_i / self.datanum) * ((1. / N_i) * W_WT)
+        return Sw
+
+    # Calculating beta and Bi for Sb
+    def compute_sig_beta_Bi(self, data_idx, M_i, M_0, lst_idx_all):
+        # import pdb
+        # pdb.set_trace()
+        B_i = np.zeros((self.classnum, self.dim))
+        Sig_beta_B_i_all = np.zeros((self.datanum, self.dim))
+        for i in data_idx:
+            # pdb.set_trace()
+            # Calculating Bi
+            B_i[i] = (M_i[i] - M_0).reshape(1, self.dim)
+        for k in range(self.datanum):
+            for i in data_idx:
+                N_i = float(len(data_idx[i]))
+                if k in lst_idx_all[i]:
+                    beta = (float(1) / N_i) - (float(1) / self.datanum)
+                    Sig_beta_B_i_all[k] += float(N_i) / self.datanum * (beta * B_i[i])
+                else:
+                    beta = -(float(1) / self.datanum)
+                    Sig_beta_B_i_all[k] += float(N_i) / self.datanum * (beta * B_i[i])
+        Sig_beta_B_i_all = Sig_beta_B_i_all.transpose()
+        return Sig_beta_B_i_all
+
+
+    # Calculating W_j s separately so we can access all the W_j s anytime
+    def compute_wj(self, data_idx, M_i):
+        W_i = np.zeros((self.datanum, self.dim))
+        for i in data_idx:
+            N_i = float(len(data_idx[i]))
+            for tpl in data_idx[i]:
+                xj = tpl[1]
+                j = tpl[0]
+                W_i[j] = (xj - M_i[i])
+        return W_i
+
+    # Calculating alpha and Wj for Sw
+    def compute_sig_alpha_W(self, data_idx, lst_idx_all, W_i):
+        Sig_alpha_W_i = np.zeros((self.datanum, self.dim))
+        for i in data_idx:
+            N_i = float(len(data_idx[i]))
+            for tpl in data_idx[i]:
+                k = tpl[0]
+                for j in lst_idx_all[i]:
+                    if k == j:
+                        alpha = 1 - (float(1) / N_i)
+                        Sig_alpha_W_i[k] += (alpha * W_i[j])
+                    else:
+                        alpha = 0 - (float(1) / N_i)
+                        Sig_alpha_W_i[k] += (alpha * W_i[j])
+        Sig_alpha_W_i = (1. / self.datanum) * np.transpose(Sig_alpha_W_i)
+        return Sig_alpha_W_i
+
+    # This function calculates log of our prior
+    def lnpdf(self, x):
+        x = x.reshape(self.x_shape)
+
+        #!!!!!!!!!!!!!!!!!!!!!!!!!!!
+        #self.lamda.values[:] = self.lamda.values/self.lamda.values.sum()
+
+        xprime = x.dot(np.diagflat(self.lamda))
+        x = xprime
+        # print x
+        cls = self.compute_cls(x)
+        M_0 = np.mean(x, axis=0)
+        M_i = self.compute_Mi(cls)
+        Sb = self.compute_Sb(cls, M_i, M_0)
+        Sw = self.compute_Sw(cls, M_i)
+        # Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))
+        #Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
+        #Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.5))[0]
+        Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.9)[0]
+        return (-1 / self.sigma2) * np.trace(Sb_inv_N.dot(Sw))
+
+    # This function calculates derivative of the log of prior function
+    def lnpdf_grad(self, x):
+        x = x.reshape(self.x_shape)
+        xprime = x.dot(np.diagflat(self.lamda))
+        x = xprime
+        # print x
+        cls = self.compute_cls(x)
+        M_0 = np.mean(x, axis=0)
+        M_i = self.compute_Mi(cls)
+        Sb = self.compute_Sb(cls, M_i, M_0)
+        Sw = self.compute_Sw(cls, M_i)
+        data_idx = self.compute_indices(x)
+        lst_idx_all = self.compute_listIndices(data_idx)
+        Sig_beta_B_i_all = self.compute_sig_beta_Bi(data_idx, M_i, M_0, lst_idx_all)
+        W_i = self.compute_wj(data_idx, M_i)
+        Sig_alpha_W_i = self.compute_sig_alpha_W(data_idx, lst_idx_all, W_i)
+
+        # Calculating inverse of Sb and its transpose and minus
+        # Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))
+        #Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
+        #Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.5))[0]
+        Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.9)[0]
+        Sb_inv_N_trans = np.transpose(Sb_inv_N)
+        Sb_inv_N_trans_minus = -1 * Sb_inv_N_trans
+        Sw_trans = np.transpose(Sw)
+
+        # Calculating DJ/DXk
+        DJ_Dxk = 2 * (
+            Sb_inv_N_trans_minus.dot(Sw_trans).dot(Sb_inv_N_trans).dot(Sig_beta_B_i_all) + Sb_inv_N_trans.dot(
+                Sig_alpha_W_i))
+        # Calculating derivative of the log of the prior
+        DPx_Dx = ((-1 / self.sigma2) * DJ_Dxk)
+
+        DPxprim_Dx = np.diagflat(self.lamda).dot(DPx_Dx)
+
+        # Because GPy uses denominator layout, we transpose so the gradient gets the same shape as our matrix
+        DPxprim_Dx = DPxprim_Dx.T
+
+        DPxprim_Dlamda = DPx_Dx.dot(x)
+
+        # Because GPy uses denominator layout, we transpose so the gradient gets the same shape as our matrix
+        DPxprim_Dlamda = DPxprim_Dlamda.T
+
+        self.lamda.gradient = np.diag(DPxprim_Dlamda)
+        # print DPxprim_Dx
+        return DPxprim_Dx
+
+
+    # def frb(self, x):
+    #     from functools import partial
+    #     from GPy.models import GradientChecker
+    #     f = partial(self.lnpdf)
+    #     df = partial(self.lnpdf_grad)
+    #     grad = GradientChecker(f, df, x, 'X')
+    #     grad.checkgrad(verbose=1)
+
+    def rvs(self, n):
+        return np.random.rand(n)  # A WRONG implementation
+
+    def __str__(self):
+        return 'DGPLVM_prior_Raq_Lamda'
+
+# ******************************************
+
+class DGPLVM_T(Prior):
+    """
+    Implementation of the Discriminative Gaussian Process Latent Variable model paper by Raquel Urtasun.
+
+    :param sigma2: constant
+
+    .. Note:: DGPLVM for Classification paper implementation
+
+    """
+    domain = _REAL
+    # _instances = []
+    # def __new__(cls, mu, sigma): # Singleton:
+    #     if cls._instances:
+    #         cls._instances[:] = [instance for instance in cls._instances if instance()]
+    #         for instance in cls._instances:
+    #             if instance().mu == mu and instance().sigma == sigma:
+    #                 return instance()
+    #     o = super(Prior, cls).__new__(cls, mu, sigma)
+    #     cls._instances.append(weakref.ref(o))
+    #     return cls._instances[-1]()
+
+    def __init__(self, sigma2, lbl, x_shape, vec):
+        self.sigma2 = sigma2
+        # self.x = x
+        self.lbl = lbl
+        self.classnum = lbl.shape[1]
+        self.datanum = lbl.shape[0]
+        self.x_shape = x_shape
+        self.dim = x_shape[1]
+        self.vec = vec
+
+
+    def get_class_label(self, y):
+        for idx, v in enumerate(y):
+            if v == 1:
+                return idx
+        return -1
+
+    # This function assigns each data point to its own class
+    # and returns the dictionary which contains the class name and parameters.
+    def compute_cls(self, x):
+        cls = {}
+        # Appending each data point to its proper class
+        for j in range(self.datanum):
+            class_label = self.get_class_label(self.lbl[j])
+            if class_label not in cls:
+                cls[class_label] = []
+            cls[class_label].append(x[j])
+        return cls
+
+    # This function computes the mean of each class. The mean is calculated through each dimension
+    def compute_Mi(self, cls):
+        M_i = np.zeros((self.classnum, self.dim))
+        for i in cls:
+            # Mean of each class
+            # class_i = np.multiply(cls[i],vec)
+            class_i = cls[i]
+            M_i[i] = np.mean(class_i, axis=0)
+        return M_i
+
+    # Adding data points as tuples to the dictionary so that we can access indices
+    def compute_indices(self, x):
+        data_idx = {}
+        for j in range(self.datanum):
+            class_label = self.get_class_label(self.lbl[j])
+            if class_label not in data_idx:
+                data_idx[class_label] = []
+            t = (j, x[j])
+            data_idx[class_label].append(t)
+        return data_idx
+
+    # Adding indices to the list so we can access all the indices
+    def compute_listIndices(self, data_idx):
+        lst_idx = []
+        lst_idx_all = []
+        for i in data_idx:
+            if len(lst_idx) == 0:
+                pass
+                # Do nothing: the first time through, the list is already empty
+            else:
+                lst_idx = []
+            # Here we put the indices of each class into the list called lst_idx_all
+            for m in range(len(data_idx[i])):
+                lst_idx.append(data_idx[i][m][0])
+            lst_idx_all.append(lst_idx)
+        return lst_idx_all
+
+    # This function calculates the between-class variance
+    def compute_Sb(self, cls, M_i, M_0):
+        Sb = np.zeros((self.dim, self.dim))
+        for i in cls:
+            B = (M_i[i] - M_0).reshape(self.dim, 1)
+            B_trans = B.transpose()
+            Sb += (float(len(cls[i])) / self.datanum) * B.dot(B_trans)
+        return Sb
+
+    # This function calculates the within-class variance
+    def compute_Sw(self, cls, M_i):
+        Sw = np.zeros((self.dim, self.dim))
+        for i in cls:
+            N_i = float(len(cls[i]))
+            W_WT = np.zeros((self.dim, self.dim))
+            for xk in cls[i]:
+                W = (xk - M_i[i])
+                W_WT += np.outer(W, W)
+            Sw += (N_i / self.datanum) * ((1. 
/ N_i) * W_WT) + return Sw + + # Calculating beta and Bi for Sb + def compute_sig_beta_Bi(self, data_idx, M_i, M_0, lst_idx_all): + # import pdb + # pdb.set_trace() + B_i = np.zeros((self.classnum, self.dim)) + Sig_beta_B_i_all = np.zeros((self.datanum, self.dim)) + for i in data_idx: + # pdb.set_trace() + # Calculating Bi + B_i[i] = (M_i[i] - M_0).reshape(1, self.dim) + for k in range(self.datanum): + for i in data_idx: + N_i = float(len(data_idx[i])) + if k in lst_idx_all[i]: + beta = (float(1) / N_i) - (float(1) / self.datanum) + Sig_beta_B_i_all[k] += float(N_i) / self.datanum * (beta * B_i[i]) + else: + beta = -(float(1) / self.datanum) + Sig_beta_B_i_all[k] += float(N_i) / self.datanum * (beta * B_i[i]) + Sig_beta_B_i_all = Sig_beta_B_i_all.transpose() + return Sig_beta_B_i_all + + + # Calculating W_j s separately so we can access all the W_j s anytime + def compute_wj(self, data_idx, M_i): + W_i = np.zeros((self.datanum, self.dim)) + for i in data_idx: + N_i = float(len(data_idx[i])) + for tpl in data_idx[i]: + xj = tpl[1] + j = tpl[0] + W_i[j] = (xj - M_i[i]) + return W_i + + # Calculating alpha and Wj for Sw + def compute_sig_alpha_W(self, data_idx, lst_idx_all, W_i): + Sig_alpha_W_i = np.zeros((self.datanum, self.dim)) + for i in data_idx: + N_i = float(len(data_idx[i])) + for tpl in data_idx[i]: + k = tpl[0] + for j in lst_idx_all[i]: + if k == j: + alpha = 1 - (float(1) / N_i) + Sig_alpha_W_i[k] += (alpha * W_i[j]) + else: + alpha = 0 - (float(1) / N_i) + Sig_alpha_W_i[k] += (alpha * W_i[j]) + Sig_alpha_W_i = (1. / self.datanum) * np.transpose(Sig_alpha_W_i) + return Sig_alpha_W_i + + # This function calculates log of our prior + def lnpdf(self, x): + x = x.reshape(self.x_shape) + xprim = x.dot(self.vec) + x = xprim + # print x + cls = self.compute_cls(x) + M_0 = np.mean(x, axis=0) + M_i = self.compute_Mi(cls) + Sb = self.compute_Sb(cls, M_i, M_0) + Sw = self.compute_Sw(cls, M_i) + # Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1)) + #Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1) + #print 'SB_inv: ', Sb_inv_N + #Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0] + Sb_inv_N = pdinv(Sb+np.eye(Sb.shape[0])*0.1)[0] + return (-1 / self.sigma2) * np.trace(Sb_inv_N.dot(Sw)) + + # This function calculates derivative of the log of prior function + def lnpdf_grad(self, x): + x = x.reshape(self.x_shape) + xprim = x.dot(self.vec) + x = xprim + # print x + cls = self.compute_cls(x) + M_0 = np.mean(x, axis=0) + M_i = self.compute_Mi(cls) + Sb = self.compute_Sb(cls, M_i, M_0) + Sw = self.compute_Sw(cls, M_i) + data_idx = self.compute_indices(x) + lst_idx_all = self.compute_listIndices(data_idx) + Sig_beta_B_i_all = self.compute_sig_beta_Bi(data_idx, M_i, M_0, lst_idx_all) + W_i = self.compute_wj(data_idx, M_i) + Sig_alpha_W_i = self.compute_sig_alpha_W(data_idx, lst_idx_all, W_i) + + # Calculating inverse of Sb and its transpose and minus + # Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1)) + #Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1) + #print 'SB_inv: ',Sb_inv_N + #Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0] + Sb_inv_N = pdinv(Sb+np.eye(Sb.shape[0])*0.1)[0] + Sb_inv_N_trans = np.transpose(Sb_inv_N) + Sb_inv_N_trans_minus = -1 * Sb_inv_N_trans + Sw_trans = np.transpose(Sw) + + # Calculating DJ/DXk + DJ_Dxk = 2 * ( + Sb_inv_N_trans_minus.dot(Sw_trans).dot(Sb_inv_N_trans).dot(Sig_beta_B_i_all) + Sb_inv_N_trans.dot( + Sig_alpha_W_i)) + # Calculating derivative of the 
log of the prior
+        DPx_Dx = ((-1 / self.sigma2) * DJ_Dxk)
+        return DPx_Dx.T
+
+    # def frb(self, x):
+    #     from functools import partial
+    #     from GPy.models import GradientChecker
+    #     f = partial(self.lnpdf)
+    #     df = partial(self.lnpdf_grad)
+    #     grad = GradientChecker(f, df, x, 'X')
+    #     grad.checkgrad(verbose=1)
+
+    def rvs(self, n):
+        return np.random.rand(n)  # A WRONG implementation
+
+    def __str__(self):
+        return 'DGPLVM_prior_Raq_TTT'
+
+
+
+
+class HalfT(Prior):
+    """
+    Implementation of the half Student-t probability function, coupled with random variables.
+
+    :param A: scale parameter
+    :param nu: degrees of freedom
+
+    """
+    domain = _POSITIVE
+    _instances = []
+
+    def __new__(cls, A, nu):  # Singleton:
+        if cls._instances:
+            cls._instances[:] = [instance for instance in cls._instances if instance()]
+            for instance in cls._instances:
+                if instance().A == A and instance().nu == nu:
+                    return instance()
+        o = super(Prior, cls).__new__(cls)  # object.__new__ takes no extra arguments in Python 3
+        cls._instances.append(weakref.ref(o))
+        return cls._instances[-1]()
+
+    def __init__(self, A, nu):
+        self.A = float(A)
+        self.nu = float(nu)
+        self.constant = gammaln(.5*(self.nu+1.)) - gammaln(.5*self.nu) - .5*np.log(np.pi*self.A*self.nu)
+
+    def __str__(self):
+        return "hT({:.2g}, {:.2g})".format(self.A, self.nu)
+
+    def lnpdf(self, theta):
+        return (theta > 0) * (self.constant - .5*(self.nu + 1) * np.log(1. + (1./self.nu) * (theta/self.A)**2))
+
+        # theta = theta if isinstance(theta,np.ndarray) else np.array([theta])
+        # lnpdfs = np.zeros_like(theta)
+        # theta = np.array([theta])
+        # above_zero = theta.flatten()>1e-6
+        # v = self.nu
+        # sigma2=self.A
+        # stop
+        # lnpdfs[above_zero] = (+ gammaln((v + 1) * 0.5)
+        #                       - gammaln(v * 0.5)
+        #                       - 0.5*np.log(sigma2 * v * np.pi)
+        #                       - 0.5*(v + 1)*np.log(1 + (1/np.float(v))*((theta[above_zero][0]**2)/sigma2))
+        #                       )
+        # return lnpdfs
+
+    def lnpdf_grad(self, theta):
+        theta = theta if isinstance(theta, np.ndarray) else np.array([theta])
+        grad = np.zeros_like(theta)
+        above_zero = theta > 1e-6
+        v = self.nu
+        sigma2 = self.A
+        grad[above_zero] = -0.5*(v+1)*(2*theta[above_zero])/(v*sigma2 + theta[above_zero][0]**2)
+        return grad
+
+    def rvs(self, n):
+        # return np.random.randn(n) * self.sigma + self.mu
+        from scipy.stats import t
+        # [np.abs(x) for x in t.rvs(df=4,loc=0,scale=50, size=10000)])
+        ret = t.rvs(self.nu, loc=0, scale=self.A, size=n)
+        ret[ret < 0] = 0
+        return ret
+
+
+class Exponential(Prior):
+    """
+    Implementation of the Exponential probability function,
+    coupled with random variables.
+
+    :param l: rate parameter
+
+    """
+    domain = _POSITIVE
+    _instances = []
+
+    def __new__(cls, l):  # Singleton:
+        if cls._instances:
+            cls._instances[:] = [instance for instance in cls._instances if instance()]
+            for instance in cls._instances:
+                if instance().l == l:
+                    return instance()
+        o = super(Prior, cls).__new__(cls)  # object.__new__ takes no extra arguments in Python 3
+        cls._instances.append(weakref.ref(o))
+        return cls._instances[-1]()
+
+    def __init__(self, l):
+        self.l = l
+
+    def __str__(self):
+        return "Exp({:.2g})".format(self.l)
+
+    def summary(self):
+        ret = {"E[x]": 1. / self.l,
+               "E[ln x]": np.nan,
+               "var[x]": 1. / self.l**2,
+               "Entropy": 1. - np.log(self.l),
+               "Mode": 0.}
+        return ret
+
+    def lnpdf(self, x):
+        return np.log(self.l) - self.l * x
+
+    def lnpdf_grad(self, x):
+        return - self.l
+
+    def rvs(self, n):
+        return np.random.exponential(scale=1. / self.l, size=n)  # numpy's scale is 1/rate, matching lnpdf above
diff --git a/GPy/core/probabilistic_model.py b/GPy/core/probabilistic_model.py
index 6f5242cc..d87fd29d 100644
--- a/GPy/core/probabilistic_model.py
+++ b/GPy/core/probabilistic_model.py
@@ -1,22 +1,9 @@
 # Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
+from .parameterization.priorizable import Priorizable
+from paramz import Model
-
-from .. import likelihoods
-from ..inference import optimization
-from ..util.misc import opt_wrapper
-from .parameterization import Parameterized
-import multiprocessing as mp
-import numpy as np
-from numpy.linalg.linalg import LinAlgError
-import itertools
-import sys
-from .verbose_optimization import VerboseOptimization
-# import numdifftools as ndt
-from functools import reduce
-from paramz.model import Model
-
-class ProbabilisticModel(Model):
+class ProbabilisticModel(Model, Priorizable):
     def __init__(self, name):
         super(ProbabilisticModel, self).__init__(name)
         # Parameterized.__init__(self)
@@ -25,8 +12,7 @@ class ProbabilisticModel(Model):
         raise NotImplementedError("this needs to be implemented to use the model class")
 
     def _log_likelihood_gradients(self):
-        return self.gradient.copy()
-
+        return self.gradient#.copy()
 
     def objective_function(self):
         """
diff --git a/GPy/core/sparse_gp.py b/GPy/core/sparse_gp.py
index 6b364ab0..761b517d 100644
--- a/GPy/core/sparse_gp.py
+++ b/GPy/core/sparse_gp.py
@@ -6,11 +6,9 @@ from .gp import GP
 from .parameterization.param import Param
 from ..inference.latent_function_inference import var_dtc
 from .. 
import likelihoods -from .parameterization.variational import VariationalPosterior, NormalPosterior -from ..util.linalg import mdot +from .variational import VariationalPosterior import logging -import itertools logger = logging.getLogger("sparse gp") class SparseGP(GP): diff --git a/GPy/core/svgp.py b/GPy/core/svgp.py index b87fd493..a678a1fd 100644 --- a/GPy/core/svgp.py +++ b/GPy/core/svgp.py @@ -38,7 +38,7 @@ class SVGP(SparseGP): #create the SVI inference method inf_method = svgp_inf() - SparseGP.__init__(self, X_batch, Y_batch, Z, kernel, likelihood, mean_function=mean_function, inference_method=inf_method, + super(SVGP, self).__init__(X_batch, Y_batch, Z, kernel, likelihood, mean_function=mean_function, inference_method=inf_method, name=name, Y_metadata=Y_metadata, normalizer=False) #assume the number of latent functions is one per col of Y unless specified diff --git a/GPy/core/variational.py b/GPy/core/variational.py index a9585c86..3cc9d00a 100644 --- a/GPy/core/variational.py +++ b/GPy/core/variational.py @@ -5,11 +5,9 @@ Created on 6 Nov 2013 ''' import numpy as np -from .parameterized import Parameterized -from .param import Param -from .transformations import Logexp, Logistic,__fixed__ -from GPy.util.misc import param_to_array -from GPy.util.caching import Cache_this +from .parameterization.parameterized import Parameterized +from .parameterization.param import Param +from paramz.transformations import Logexp, Logistic,__fixed__ class VariationalPrior(Parameterized): def __init__(self, name='latent space', **kw): diff --git a/GPy/core/verbose_optimization.py b/GPy/core/verbose_optimization.py deleted file mode 100644 index c4539736..00000000 --- a/GPy/core/verbose_optimization.py +++ /dev/null @@ -1,185 +0,0 @@ -# Copyright (c) 2012-2014, Max Zwiessele. 
-# Licensed under the BSD 3-clause license (see LICENSE.txt) - -from __future__ import print_function -import numpy as np -import sys -import time -import datetime - -def exponents(fnow, current_grad): - exps = [np.abs(np.float(fnow)), 1 if current_grad is np.nan else current_grad] - return np.sign(exps) * np.log10(exps).astype(int) - -class VerboseOptimization(object): - def __init__(self, model, opt, maxiters, verbose=False, current_iteration=0, ipython_notebook=True, clear_after_finish=False): - self.verbose = verbose - if self.verbose: - self.model = model - self.iteration = current_iteration - self.p_iter = self.iteration - self.maxiters = maxiters - self.len_maxiters = len(str(maxiters)) - self.opt_name = opt.opt_name - self.model.add_observer(self, self.print_status) - self.status = 'running' - self.clear = clear_after_finish - - self.update() - - try: - from IPython.display import display - from IPython.html.widgets import IntProgress, HTML, Box, VBox, HBox, FlexBox - self.text = HTML(width='100%') - self.progress = IntProgress(min=0, max=maxiters) - #self.progresstext = Text(width='100%', disabled=True, value='0/{}'.format(maxiters)) - self.model_show = HTML() - self.ipython_notebook = ipython_notebook - except: - # Not in Ipython notebook - self.ipython_notebook = False - - if self.ipython_notebook: - left_col = VBox(children=[self.progress, self.text], padding=2, width='40%') - right_col = Box(children=[self.model_show], padding=2, width='60%') - self.hor_align = FlexBox(children = [left_col, right_col], width='100%', orientation='horizontal') - - display(self.hor_align) - - try: - self.text.set_css('width', '100%') - left_col.set_css({ - 'padding': '2px', - 'width': "100%", - }) - - right_col.set_css({ - 'padding': '2px', - }) - - self.hor_align.set_css({ - 'width': "100%", - }) - - self.hor_align.remove_class('vbox') - self.hor_align.add_class('hbox') - - left_col.add_class("box-flex1") - right_col.add_class('box-flex0') - - except: - pass - - #self.text.add_class('box-flex2') - #self.progress.add_class('box-flex1') - else: - self.exps = exponents(self.fnow, self.current_gradient) - print('Running {} Code:'.format(self.opt_name)) - print(' {3:7s} {0:{mi}s} {1:11s} {2:11s}'.format("i", "f", "|g|", "runtime", mi=self.len_maxiters)) - - def __enter__(self): - self.start = time.time() - self._time = self.start - return self - - def print_out(self, seconds): - if seconds<60: - ms = (seconds%1)*100 - self.timestring = "{s:0>2d}s{ms:0>2d}".format(s=int(seconds), ms=int(ms)) - else: - m, s = divmod(seconds, 60) - if m>59: - h, m = divmod(m, 60) - if h>23: - d, h = divmod(h, 24) - self.timestring = '{d:0>2d}d{h:0>2d}h{m:0>2d}'.format(m=int(m), h=int(h), d=int(d)) - else: - self.timestring = '{h:0>2d}h{m:0>2d}m{s:0>2d}'.format(m=int(m), s=int(s), h=int(h)) - else: - ms = (seconds%1)*100 - self.timestring = '{m:0>2d}m{s:0>2d}s{ms:0>2d}'.format(m=int(m), s=int(s), ms=int(ms)) - if self.ipython_notebook: - names_vals = [['optimizer', "{:s}".format(self.opt_name)], - ['runtime', "{:>s}".format(self.timestring)], - ['evaluation', "{:>0{l}}".format(self.iteration, l=self.len_maxiters)], - ['objective', "{: > 12.3E}".format(self.fnow)], - ['||gradient||', "{: >+12.3E}".format(float(self.current_gradient))], - ['status', "{:s}".format(self.status)], - ] - #message = "Lik:{:5.3E} Grad:{:5.3E} Lik:{:5.3E} Len:{!s}".format(float(m.log_likelihood()), np.einsum('i,i->', grads, grads), float(m.likelihood.variance), " ".join(["{:3.2E}".format(l) for l in m.kern.lengthscale.values])) - html_begin = """ 
- """ - html_end = "
" - html_body = "" - for name, val in names_vals: - html_body += "" - html_body += "{}".format(name) - html_body += "{}".format(val) - html_body += "" - self.text.value = html_begin + html_body + html_end - self.progress.value = (self.iteration+1) - #self.progresstext.value = '0/{}'.format((self.iteration+1)) - self.model_show.value = self.model._repr_html_() - else: - n_exps = exponents(self.fnow, self.current_gradient) - if self.iteration - self.p_iter >= 20 * np.random.rand(): - a = self.iteration >= self.p_iter * 2.78 - b = np.any(n_exps < self.exps) - if a or b: - self.p_iter = self.iteration - print('') - if b: - self.exps = n_exps - print('\r', end=' ') - print('{3:} {0:>0{mi}g} {1:> 12e} {2:> 12e}'.format(self.iteration, float(self.fnow), float(self.current_gradient), "{:>8s}".format(self.timestring), mi=self.len_maxiters), end=' ') # print 'Iteration:', iteration, ' Objective:', fnow, ' Scale:', beta, '\r', - sys.stdout.flush() - - def print_status(self, me, which=None): - self.update() - - t = time.time() - seconds = t-self.start - #sys.stdout.write(" "*len(self.message)) - if t-self._time > .3 or seconds < .3: - self.print_out(seconds) - self._time = t - - self.iteration += 1 - - def update(self): - self.fnow = self.model.objective_function() - if self.model.obj_grads is not None: - grad = self.model.obj_grads - self.current_gradient = np.dot(grad, grad) - else: - self.current_gradient = np.nan - - def finish(self, opt): - self.status = opt.status - if self.verbose and self.ipython_notebook: - if 'conv' in self.status.lower(): - self.progress.bar_style = 'success' - elif self.iteration >= self.maxiters: - self.progress.bar_style = 'warning' - else: - self.progress.bar_style = 'danger' - - def __exit__(self, type, value, traceback): - if self.verbose: - self.stop = time.time() - self.model.remove_observer(self) - self.print_out(self.stop - self.start) - - if not self.ipython_notebook: - print() - print('Runtime: {}'.format("{:>9s}".format(self.timestring))) - print('Optimization status: {0}'.format(self.status)) - print() - elif self.clear: - self.hor_align.close() diff --git a/GPy/inference/latent_function_inference/expectation_propagation.py b/GPy/inference/latent_function_inference/expectation_propagation.py index d293d4de..6017bda3 100644 --- a/GPy/inference/latent_function_inference/expectation_propagation.py +++ b/GPy/inference/latent_function_inference/expectation_propagation.py @@ -2,7 +2,7 @@ # Licensed under the BSD 3-clause license (see LICENSE.txt) import numpy as np from ...util.linalg import jitchol, DSYR, dtrtrs, dtrtri -from ...core.parameterization.observable_array import ObsAr +from paramz import ObsAr from . 
import ExactGaussianInference, VarDTC from ...util import diag diff --git a/GPy/inference/latent_function_inference/inferenceX.py b/GPy/inference/latent_function_inference/inferenceX.py index f253a31e..0c8a2513 100644 --- a/GPy/inference/latent_function_inference/inferenceX.py +++ b/GPy/inference/latent_function_inference/inferenceX.py @@ -2,10 +2,9 @@ # Licensed under the BSD 3-clause license (see LICENSE.txt) import numpy as np -from ...core import Model -from ...core.parameterization import variational +from ...core import ProbabilisticModel +from ...core import variational from ...util.linalg import tdot -from GPy.core.parameterization.variational import VariationalPosterior def infer_newX(model, Y_new, optimize=True, init='L2'): """ @@ -27,7 +26,7 @@ def infer_newX(model, Y_new, optimize=True, init='L2'): return infr_m.X, infr_m -class InferenceX(Model): +class InferenceX(ProbabilisticModel): """ The model class for inference of new X with given new Y. (replacing the "do_test_latent" in Bayesian GPLVM) It is a tiny inference model created from the original GP model. The kernel, likelihood (only Gaussian is supported at the moment) @@ -62,14 +61,12 @@ class InferenceX(Model): # self.kern.GPU(True) from copy import deepcopy self.posterior = deepcopy(model.posterior) - from ...core.parameterization.variational import VariationalPosterior - if isinstance(model.X, VariationalPosterior): + if isinstance(model.X, variational.VariationalPosterior): self.uncertain_input = True from ...models.ss_gplvm import IBPPrior from ...models.ss_mrd import IBPPrior_SSMRD if isinstance(model.variational_prior, IBPPrior) or isinstance(model.variational_prior, IBPPrior_SSMRD): - from ...core.parameterization.variational import SpikeAndSlabPrior - self.variational_prior = SpikeAndSlabPrior(pi=0.5, learnPi=False, group_spike=False) + self.variational_prior = variational.SpikeAndSlabPrior(pi=0.5, learnPi=False, group_spike=False) else: self.variational_prior = model.variational_prior.copy() else: @@ -105,17 +102,16 @@ class InferenceX(Model): idx = dist.argmin(axis=1) from ...models import SSGPLVM - from ...util.misc import param_to_array if isinstance(model, SSGPLVM): - X = variational.SpikeAndSlabPosterior(param_to_array(model.X.mean[idx]), param_to_array(model.X.variance[idx]), param_to_array(model.X.gamma[idx])) + X = variational.SpikeAndSlabPosterior((model.X.mean[idx].values), (model.X.variance[idx].values), (model.X.gamma[idx].values)) if model.group_spike: X.gamma.fix() else: if self.uncertain_input and self.sparse_gp: - X = variational.NormalPosterior(param_to_array(model.X.mean[idx]), param_to_array(model.X.variance[idx])) + X = variational.NormalPosterior((model.X.mean[idx].values), (model.X.variance[idx].values)) else: from ...core import Param - X = Param('latent mean',param_to_array(model.X[idx]).copy()) + X = Param('latent mean',(model.X[idx].values).copy()) return X @@ -160,8 +156,7 @@ class InferenceX(Model): self.X.gradient = X_grad if self.uncertain_input: - from ...core.parameterization.variational import SpikeAndSlabPrior - if isinstance(self.variational_prior, SpikeAndSlabPrior): + if isinstance(self.variational_prior, variational.SpikeAndSlabPrior): # Update Log-likelihood KL_div = self.variational_prior.KL_divergence(self.X) # update for the KL divergence diff --git a/GPy/inference/latent_function_inference/var_dtc.py b/GPy/inference/latent_function_inference/var_dtc.py index bb114050..13ef954d 100644 --- a/GPy/inference/latent_function_inference/var_dtc.py +++ 
b/GPy/inference/latent_function_inference/var_dtc.py @@ -4,7 +4,7 @@ from .posterior import Posterior from ...util.linalg import mdot, jitchol, backsub_both_sides, tdot, dtrtrs, dtrtri, dpotri, dpotrs, symmetrify from ...util import diag -from ...core.parameterization.variational import VariationalPosterior +from ...core.variational import VariationalPosterior import numpy as np from . import LatentFunctionInference log_2_pi = np.log(2*np.pi) @@ -23,8 +23,7 @@ class VarDTC(LatentFunctionInference): """ const_jitter = 1e-8 def __init__(self, limit=1): - #self._YYTfactor_cache = caching.cache() - from ...util.caching import Cacher + from paramz.caching import Cacher self.limit = limit self.get_trYYT = Cacher(self._get_trYYT, limit) self.get_YYTfactor = Cacher(self._get_YYTfactor, limit) @@ -45,7 +44,7 @@ class VarDTC(LatentFunctionInference): def __setstate__(self, state): # has to be overridden, as Cacher objects cannot be pickled. self.limit = state - from ...util.caching import Cacher + from paramz.caching import Cacher self.get_trYYT = Cacher(self._get_trYYT, self.limit) self.get_YYTfactor = Cacher(self._get_YYTfactor, self.limit) diff --git a/GPy/inference/latent_function_inference/var_dtc_parallel.py b/GPy/inference/latent_function_inference/var_dtc_parallel.py index 457ede66..4c0e1c7b 100644 --- a/GPy/inference/latent_function_inference/var_dtc_parallel.py +++ b/GPy/inference/latent_function_inference/var_dtc_parallel.py @@ -4,7 +4,7 @@ from .posterior import Posterior from ...util.linalg import jitchol, backsub_both_sides, tdot, dtrtrs, dtrtri,pdinv from ...util import diag -from ...core.parameterization.variational import VariationalPosterior +from ...core.variational import VariationalPosterior import numpy as np from . import LatentFunctionInference log_2_pi = np.log(2*np.pi) diff --git a/GPy/kern/src/ODE_UY.py b/GPy/kern/src/ODE_UY.py index ae9c4574..19fb1e94 100644 --- a/GPy/kern/src/ODE_UY.py +++ b/GPy/kern/src/ODE_UY.py @@ -4,7 +4,7 @@ from .kern import Kern from .independent_outputs import index_to_slices from ...core.parameterization import Param -from ...core.parameterization.transformations import Logexp +from paramz.transformations import Logexp import numpy as np class ODE_UY(Kern): diff --git a/GPy/kern/src/ODE_UYC.py b/GPy/kern/src/ODE_UYC.py index ff75a328..d02eb1d9 100644 --- a/GPy/kern/src/ODE_UYC.py +++ b/GPy/kern/src/ODE_UYC.py @@ -3,7 +3,7 @@ from .kern import Kern from ...core.parameterization import Param -from ...core.parameterization.transformations import Logexp +from paramz.transformations import Logexp import numpy as np from .independent_outputs import index_to_slices diff --git a/GPy/kern/src/ODE_st.py b/GPy/kern/src/ODE_st.py index afa46d09..f9d4e684 100644 --- a/GPy/kern/src/ODE_st.py +++ b/GPy/kern/src/ODE_st.py @@ -2,7 +2,7 @@ # Licensed under the BSD 3-clause license (see LICENSE.txt) from .kern import Kern from ...core.parameterization import Param -from ...core.parameterization.transformations import Logexp +from paramz.transformations import Logexp import numpy as np from .independent_outputs import index_to_slices diff --git a/GPy/kern/src/ODE_t.py b/GPy/kern/src/ODE_t.py index 80625f51..ffd349ec 100644 --- a/GPy/kern/src/ODE_t.py +++ b/GPy/kern/src/ODE_t.py @@ -1,6 +1,6 @@ from .kern import Kern from ...core.parameterization import Param -from ...core.parameterization.transformations import Logexp +from paramz.transformations import Logexp import numpy as np from .independent_outputs import index_to_slices diff --git a/GPy/kern/src/add.py 
b/GPy/kern/src/add.py index 9f80ac9d..1f07306e 100644 --- a/GPy/kern/src/add.py +++ b/GPy/kern/src/add.py @@ -3,7 +3,7 @@ import numpy as np import itertools -from ...util.caching import Cache_this +from paramz.caching import Cache_this from .kern import CombinationKernel, Kern from functools import reduce diff --git a/GPy/kern/src/basis_funcs.py b/GPy/kern/src/basis_funcs.py index 3d644af2..7a5f84dd 100644 --- a/GPy/kern/src/basis_funcs.py +++ b/GPy/kern/src/basis_funcs.py @@ -3,8 +3,8 @@ import numpy as np from .kern import Kern from ...core.parameterization.param import Param -from ...core.parameterization.transformations import Logexp -from ...util.caching import Cache_this +from paramz.transformations import Logexp +from paramz.caching import Cache_this from ...util.linalg import tdot, mdot class BasisFuncKernel(Kern): diff --git a/GPy/kern/src/brownian.py b/GPy/kern/src/brownian.py index d403fce7..68da4435 100644 --- a/GPy/kern/src/brownian.py +++ b/GPy/kern/src/brownian.py @@ -3,7 +3,7 @@ from .kern import Kern from ...core.parameterization import Param -from ...core.parameterization.transformations import Logexp +from paramz.transformations import Logexp import numpy as np class Brownian(Kern): diff --git a/GPy/kern/src/coregionalize.py b/GPy/kern/src/coregionalize.py index 1ce4bff6..197d7ece 100644 --- a/GPy/kern/src/coregionalize.py +++ b/GPy/kern/src/coregionalize.py @@ -4,7 +4,7 @@ from .kern import Kern import numpy as np from ...core.parameterization import Param -from ...core.parameterization.transformations import Logexp +from paramz.transformations import Logexp from ...util.config import config # for assesing whether to use cython try: from . import coregionalize_cython diff --git a/GPy/kern/src/eq_ode2.py b/GPy/kern/src/eq_ode2.py index 2d42a3e6..ef71ffe0 100644 --- a/GPy/kern/src/eq_ode2.py +++ b/GPy/kern/src/eq_ode2.py @@ -5,8 +5,8 @@ import numpy as np from scipy.special import wofz from .kern import Kern from ...core.parameterization import Param -from ...core.parameterization.transformations import Logexp -from ...util.caching import Cache_this +from paramz.transformations import Logexp +from paramz.caching import Cache_this class EQ_ODE2(Kern): """ diff --git a/GPy/kern/src/kern.py b/GPy/kern/src/kern.py index 4d535b60..8e7ab4a4 100644 --- a/GPy/kern/src/kern.py +++ b/GPy/kern/src/kern.py @@ -3,8 +3,8 @@ import sys import numpy as np from ...core.parameterization.parameterized import Parameterized -from ...core.parameterization.observable_array import ObsAr -from ...util.caching import Cache_this +from paramz.core.observable_array import ObsAr +from paramz.caching import Cache_this from .kernel_slice_operations import KernCallsViaSlicerMeta from functools import reduce import six @@ -30,18 +30,16 @@ class Kern(Parameterized): tight dimensionality of inputs. You most likely want this to be the integer telling the number of input dimensions of the kernel. - If this is not an integer (!) we will work on the whole input matrix X, - and not check whether dimensions match or not (!). - _all_dims_active: + active_dims: is the active_dimensions of inputs X we will work on. All kernels will get sliced Xes as inputs, if _all_dims_active is not None - Only positive integers are allowed in _all_dims_active! - if _all_dims_active is None, slicing is switched off and all X will be passed through as given. + Only positive integers are allowed in active_dims! + if active_dims is None, slicing is switched off and all X will be passed through as given. 
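# Usage sketch (illustrative, not part of the patch): the renamed active_dims
# docstring above is the public face of kernel input slicing. Assuming the
# public GPy API, with arbitrary data and kernel choices:
import numpy as np
import GPy

X = np.random.randn(50, 3)                            # three input columns
Y = np.sin(X[:, :1]) + 0.05 * np.random.randn(50, 1)

# The RBF part is only ever handed columns 0 and 2 of X (input_dim=2 matches
# the length of active_dims); the Bias part sees all three columns.
k = GPy.kern.RBF(input_dim=2, active_dims=[0, 2]) + GPy.kern.Bias(input_dim=3)
m = GPy.models.GPRegression(X, Y, kernel=k)
m.optimize()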
:param int input_dim: the number of input dimensions to the function - :param array-like|None _all_dims_active: list of indices on which dimensions this kernel works on, or none if no slicing + :param array-like|None active_dims: list of indices on which dimensions this kernel works on, or none if no slicing Do not instantiate. """ diff --git a/GPy/kern/src/kernel_slice_operations.py b/GPy/kern/src/kernel_slice_operations.py index 2bd1f923..57b34de9 100644 --- a/GPy/kern/src/kernel_slice_operations.py +++ b/GPy/kern/src/kernel_slice_operations.py @@ -7,9 +7,9 @@ This module provides a meta class for the kernels. The meta class is for slicing the inputs (X, X2) for the kernels, before K (or any other method involving X) gets calls. The `_all_dims_active` of a kernel decide which dimensions the kernel works on. ''' -from ...core.parameterization.parameterized import ParametersChangedMeta import numpy as np from functools import wraps +from paramz.parameterized import ParametersChangedMeta def put_clean(dct, name, func): if name in dct: diff --git a/GPy/kern/src/linear.py b/GPy/kern/src/linear.py index 25f6299d..d0bc2e6f 100644 --- a/GPy/kern/src/linear.py +++ b/GPy/kern/src/linear.py @@ -6,8 +6,8 @@ import numpy as np from .kern import Kern from ...util.linalg import tdot from ...core.parameterization import Param -from ...core.parameterization.transformations import Logexp -from ...util.caching import Cache_this +from paramz.transformations import Logexp +from paramz.caching import Cache_this from .psi_comp import PSICOMP_Linear class Linear(Kern): diff --git a/GPy/kern/src/mlp.py b/GPy/kern/src/mlp.py index 8f9a276c..d86e5b15 100644 --- a/GPy/kern/src/mlp.py +++ b/GPy/kern/src/mlp.py @@ -3,9 +3,9 @@ from .kern import Kern from ...core.parameterization import Param -from ...core.parameterization.transformations import Logexp +from paramz.transformations import Logexp import numpy as np -from ...util.caching import Cache_this +from paramz.caching import Cache_this four_over_tau = 2./np.pi class MLP(Kern): diff --git a/GPy/kern/src/periodic.py b/GPy/kern/src/periodic.py index 4c4d2234..fe0c6670 100644 --- a/GPy/kern/src/periodic.py +++ b/GPy/kern/src/periodic.py @@ -7,7 +7,7 @@ from .kern import Kern from ...util.linalg import mdot from ...util.decorators import silence_errors from ...core.parameterization.param import Param -from ...core.parameterization.transformations import Logexp +from paramz.transformations import Logexp class Periodic(Kern): def __init__(self, input_dim, variance, lengthscale, period, n_freq, lower, upper, active_dims, name): diff --git a/GPy/kern/src/poly.py b/GPy/kern/src/poly.py index a5306c2a..216e3a00 100644 --- a/GPy/kern/src/poly.py +++ b/GPy/kern/src/poly.py @@ -4,7 +4,8 @@ import numpy as np from .kern import Kern from ...core.parameterization import Param -from ...core.parameterization.transformations import Logexp +from paramz.transformations import Logexp + class Poly(Kern): """ Polynomial kernel diff --git a/GPy/kern/src/prod.py b/GPy/kern/src/prod.py index 68883af4..ae00a949 100644 --- a/GPy/kern/src/prod.py +++ b/GPy/kern/src/prod.py @@ -3,7 +3,7 @@ import numpy as np from .kern import CombinationKernel -from ...util.caching import Cache_this +from paramz.caching import Cache_this import itertools from functools import reduce diff --git a/GPy/kern/src/psi_comp/__init__.py b/GPy/kern/src/psi_comp/__init__.py index 90ceca6b..c1706679 100644 --- a/GPy/kern/src/psi_comp/__init__.py +++ b/GPy/kern/src/psi_comp/__init__.py @@ -1,9 +1,9 @@ # Copyright (c) 2012, 
GPy authors (see AUTHORS.txt). # Licensed under the BSD 3-clause license (see LICENSE.txt) -from ....core.parameterization.parameter_core import Pickleable -from ....util.caching import Cache_this -from ....core.parameterization import variational +from paramz.core.pickleable import Pickleable +from paramz.caching import Cache_this +from ....core import variational #from linear_psi_comp import LINEAr class PSICOMP(Pickleable): diff --git a/GPy/kern/src/psi_comp/gaussherm.py b/GPy/kern/src/psi_comp/gaussherm.py index 35eab724..8600081a 100644 --- a/GPy/kern/src/psi_comp/gaussherm.py +++ b/GPy/kern/src/psi_comp/gaussherm.py @@ -8,7 +8,7 @@ An approximated psi-statistics implementation based on Gauss-Hermite Quadrature import numpy as np from ....core.parameterization import Param -from ....util.caching import Cache_this +from paramz.caching import Cache_this from ....util.linalg import tdot from . import PSICOMP @@ -30,7 +30,7 @@ class PSICOMP_GH(PSICOMP): @Cache_this(limit=10, ignore_args=(0,)) def comp_K(self, Z, qX): if self.Xs is None or self.Xs.shape != qX.mean.shape: - from ....core.parameterization import ObsAr + from paramz import ObsAr self.Xs = ObsAr(np.empty((self.degree,)+qX.mean.shape)) mu, S = qX.mean.values, qX.variance.values S_sq = np.sqrt(S) diff --git a/GPy/kern/src/psi_comp/rbf_psi_comp.py b/GPy/kern/src/psi_comp/rbf_psi_comp.py index 5667eec6..6abaa93e 100644 --- a/GPy/kern/src/psi_comp/rbf_psi_comp.py +++ b/GPy/kern/src/psi_comp/rbf_psi_comp.py @@ -3,7 +3,7 @@ The module for psi-statistics for RBF kernel """ import numpy as np -from GPy.util.caching import Cacher +from paramz.caching import Cacher def psicomputations(variance, lengthscale, Z, variational_posterior, return_psi2_n=False): # here are the "statistics" for psi0, psi1 and psi2 diff --git a/GPy/kern/src/psi_comp/rbf_psi_gpucomp.py b/GPy/kern/src/psi_comp/rbf_psi_gpucomp.py index 50945db6..c415ed4f 100644 --- a/GPy/kern/src/psi_comp/rbf_psi_gpucomp.py +++ b/GPy/kern/src/psi_comp/rbf_psi_gpucomp.py @@ -3,7 +3,7 @@ The module for psi-statistics for RBF kernel """ import numpy as np -from ....util.caching import Cache_this +from paramz.caching import Cache_this from . import PSICOMP_RBF from ....util import gpu_init diff --git a/GPy/kern/src/psi_comp/ssrbf_psi_gpucomp.py b/GPy/kern/src/psi_comp/ssrbf_psi_gpucomp.py index 46f4a06e..844f944e 100644 --- a/GPy/kern/src/psi_comp/ssrbf_psi_gpucomp.py +++ b/GPy/kern/src/psi_comp/ssrbf_psi_gpucomp.py @@ -4,7 +4,7 @@ The module for psi-statistics for RBF kernel for Spike-and-Slab GPLVM """ import numpy as np -from ....util.caching import Cache_this +from paramz.caching import Cache_this from . 
import PSICOMP_RBF diff --git a/GPy/kern/src/rbf.py b/GPy/kern/src/rbf.py index 3607bea9..ff3ee277 100644 --- a/GPy/kern/src/rbf.py +++ b/GPy/kern/src/rbf.py @@ -6,7 +6,7 @@ import numpy as np from .stationary import Stationary from .psi_comp import PSICOMP_RBF, PSICOMP_RBF_GPU from ...core import Param -from ...core.parameterization.transformations import Logexp +from paramz.transformations import Logexp class RBF(Stationary): """ diff --git a/GPy/kern/src/spline.py b/GPy/kern/src/spline.py index c1b28764..2d822399 100644 --- a/GPy/kern/src/spline.py +++ b/GPy/kern/src/spline.py @@ -4,7 +4,7 @@ import numpy as np from .kern import Kern from ...core.parameterization import Param -from ...core.parameterization.transformations import Logexp +from paramz.transformations import Logexp class Spline(Kern): """ diff --git a/GPy/kern/src/standard_periodic.py b/GPy/kern/src/standard_periodic.py index 3da7a124..bc27107e 100644 --- a/GPy/kern/src/standard_periodic.py +++ b/GPy/kern/src/standard_periodic.py @@ -15,7 +15,7 @@ Neural Networks and Machine Learning, pages 133-165. Springer, 1998. from .kern import Kern from ...core.parameterization import Param -from ...core.parameterization.transformations import Logexp +from paramz.transformations import Logexp import numpy as np diff --git a/GPy/kern/src/static.py b/GPy/kern/src/static.py index bcfec4a7..dc6fe7a0 100644 --- a/GPy/kern/src/static.py +++ b/GPy/kern/src/static.py @@ -5,7 +5,7 @@ from .kern import Kern import numpy as np from ...core.parameterization import Param -from ...core.parameterization.transformations import Logexp +from paramz.transformations import Logexp class Static(Kern): def __init__(self, input_dim, variance, active_dims, name): diff --git a/GPy/kern/src/stationary.py b/GPy/kern/src/stationary.py index d5f26798..106e0098 100644 --- a/GPy/kern/src/stationary.py +++ b/GPy/kern/src/stationary.py @@ -2,15 +2,15 @@ # Licensed under the BSD 3-clause license (see LICENSE.txt) -from .kern import Kern -from ...core.parameterization import Param -from ...core.parameterization.transformations import Logexp -from ...util.linalg import tdot -from ... import util import numpy as np from scipy import integrate +from .kern import Kern +from ...core.parameterization import Param +from ...util.linalg import tdot +from ... import util from ...util.config import config # for assesing whether to use cython -from ...util.caching import Cache_this +from paramz.caching import Cache_this +from paramz.transformations import Logexp try: from . import stationary_cython diff --git a/GPy/kern/src/trunclinear.py b/GPy/kern/src/trunclinear.py index 81d7376f..3a35744f 100644 --- a/GPy/kern/src/trunclinear.py +++ b/GPy/kern/src/trunclinear.py @@ -5,8 +5,8 @@ import numpy as np from .kern import Kern from ...core.parameterization import Param -from ...core.parameterization.transformations import Logexp -from ...util.caching import Cache_this +from paramz.transformations import Logexp +from paramz.caching import Cache_this class TruncLinear(Kern): """ diff --git a/GPy/likelihoods/gaussian.py b/GPy/likelihoods/gaussian.py index e1299f73..eec0bbd0 100644 --- a/GPy/likelihoods/gaussian.py +++ b/GPy/likelihoods/gaussian.py @@ -16,7 +16,7 @@ from scipy import stats, special from . 
import link_functions from .likelihood import Likelihood from ..core.parameterization import Param -from ..core.parameterization.transformations import Logexp +from paramz.transformations import Logexp from scipy import stats class Gaussian(Likelihood): diff --git a/GPy/likelihoods/mixed_noise.py b/GPy/likelihoods/mixed_noise.py index 84b3001d..db230b13 100644 --- a/GPy/likelihoods/mixed_noise.py +++ b/GPy/likelihoods/mixed_noise.py @@ -7,7 +7,7 @@ from . import link_functions from .likelihood import Likelihood from .gaussian import Gaussian from ..core.parameterization import Param -from ..core.parameterization.transformations import Logexp +from paramz.transformations import Logexp from ..core.parameterization import Parameterized import itertools diff --git a/GPy/likelihoods/student_t.py b/GPy/likelihoods/student_t.py index 79745ff6..e8de3c40 100644 --- a/GPy/likelihoods/student_t.py +++ b/GPy/likelihoods/student_t.py @@ -9,7 +9,7 @@ from scipy import stats, integrate from scipy.special import gammaln, gamma from .likelihood import Likelihood from ..core.parameterization import Param -from ..core.parameterization.transformations import Logexp +from paramz.transformations import Logexp from scipy.special import psi as digamma class StudentT(Likelihood): diff --git a/GPy/models/bayesian_gplvm.py b/GPy/models/bayesian_gplvm.py index fd02cb3e..daca864a 100644 --- a/GPy/models/bayesian_gplvm.py +++ b/GPy/models/bayesian_gplvm.py @@ -5,7 +5,7 @@ import numpy as np from .. import kern from ..core.sparse_gp_mpi import SparseGP_MPI from ..likelihoods import Gaussian -from ..core.parameterization.variational import NormalPosterior, NormalPrior +from ..core.variational import NormalPosterior, NormalPrior from ..inference.latent_function_inference.var_dtc_parallel import VarDTC_minibatch import logging diff --git a/GPy/models/bayesian_gplvm_minibatch.py b/GPy/models/bayesian_gplvm_minibatch.py index 3ef43753..9df9e266 100644 --- a/GPy/models/bayesian_gplvm_minibatch.py +++ b/GPy/models/bayesian_gplvm_minibatch.py @@ -2,14 +2,12 @@ # Licensed under the BSD 3-clause license (see LICENSE.txt) import numpy as np +import logging from .. import kern from ..likelihoods import Gaussian -from ..core.parameterization.variational import NormalPosterior, NormalPrior -from ..inference.latent_function_inference.var_dtc_parallel import VarDTC_minibatch -import logging -from GPy.models.sparse_gp_minibatch import SparseGPMiniBatch -from GPy.core.parameterization.param import Param -from GPy.core.parameterization.observable_array import ObsAr +from ..core.variational import NormalPosterior, NormalPrior +from .sparse_gp_minibatch import SparseGPMiniBatch +from ..core.parameterization.param import Param class BayesianGPLVMMiniBatch(SparseGPMiniBatch): """ diff --git a/GPy/models/dpgplvm.py b/GPy/models/dpgplvm.py index 7f947c53..b4b1bbef 100644 --- a/GPy/models/dpgplvm.py +++ b/GPy/models/dpgplvm.py @@ -1,10 +1,7 @@ # Copyright (c) 2015 the GPy Austhors (see AUTHORS.txt) # Licensed under the BSD 3-clause license (see LICENSE.txt) -import numpy as np -from .. 
import kern from .bayesian_gplvm import BayesianGPLVM -from ..core.parameterization.variational import NormalPosterior, NormalPrior class DPBayesianGPLVM(BayesianGPLVM): """ diff --git a/GPy/models/gp_kronecker_gaussian_regression.py b/GPy/models/gp_kronecker_gaussian_regression.py index 158ec178..dc623d9c 100644 --- a/GPy/models/gp_kronecker_gaussian_regression.py +++ b/GPy/models/gp_kronecker_gaussian_regression.py @@ -2,11 +2,11 @@ # Licensed under the BSD 3-clause license (see LICENSE.txt) import numpy as np -from GPy.core.probabilistic_model import Model -from ..core.parameterization import ObsAr +from ..core import ProbabilisticModel +from paramz import ObsAr from .. import likelihoods -class GPKroneckerGaussianRegression(Model): +class GPKroneckerGaussianRegression(ProbabilisticModel): """ Kronecker GP regression @@ -29,7 +29,7 @@ class GPKroneckerGaussianRegression(Model): """ def __init__(self, X1, X2, Y, kern1, kern2, noise_var=1., name='KGPR'): - Model.__init__(self, name=name) + ProbabilisticModel.__init__(self, name=name) # accept the construction arguments self.X1 = ObsAr(X1) self.X2 = ObsAr(X2) diff --git a/GPy/models/gp_var_gauss.py b/GPy/models/gp_var_gauss.py index 6cce8640..05c55625 100644 --- a/GPy/models/gp_var_gauss.py +++ b/GPy/models/gp_var_gauss.py @@ -3,8 +3,6 @@ import numpy as np from ..core import GP -from ..core.parameterization import ObsAr -from .. import kern from ..core.parameterization.param import Param from ..inference.latent_function_inference import VarGauss diff --git a/GPy/models/gradient_checker.py b/GPy/models/gradient_checker.py index c3815bfc..96c62c94 100644 --- a/GPy/models/gradient_checker.py +++ b/GPy/models/gradient_checker.py @@ -1,11 +1,11 @@ # ## Copyright (c) 2012, GPy authors (see AUTHORS.txt). 
# Licensed under the BSD 3-clause license (see LICENSE.txt) -from GPy.core.probabilistic_model import Model -import itertools import numpy -from ..core.parameterization import Param np = numpy + +from ..core.parameterization import Param +from ..core.probabilistic_model import ProbabilisticModel from ..util.block_matrices import get_blocks, get_block_shapes, unblock, get_blocks_3d, get_block_shapes_3d def get_shape(x): @@ -21,7 +21,7 @@ def at_least_one_element(x): def flatten_if_needed(x): return numpy.atleast_1d(x).flatten() -class GradientChecker(Model): +class GradientChecker(ProbabilisticModel): def __init__(self, f, df, x0, names=None, *args, **kwargs): """ @@ -62,7 +62,7 @@ class GradientChecker(Model): grad.randomize() grad.checkgrad(verbose=1) """ - Model.__init__(self, 'GradientChecker') + super(GradientChecker, self).__init__(name='GradientChecker') if isinstance(x0, (list, tuple)) and names is None: self.shapes = [get_shape(xi) for xi in x0] self.names = ['X{i}'.format(i=i) for i in range(len(x0))] diff --git a/GPy/models/mrd.py b/GPy/models/mrd.py index 7832e155..f8ce8997 100644 --- a/GPy/models/mrd.py +++ b/GPy/models/mrd.py @@ -5,18 +5,14 @@ import numpy as np import itertools, logging from ..kern import Kern -from ..core.parameterization.variational import NormalPosterior, NormalPrior -from ..core.parameterization import Param, Parameterized -from ..core.parameterization.observable_array import ObsAr +from ..core.variational import NormalPrior +from ..core.parameterization import Param +from paramz import ObsAr from ..inference.latent_function_inference.var_dtc import VarDTC from ..inference.latent_function_inference import InferenceMethodList from ..likelihoods import Gaussian from ..util.initialization import initialize_latent -from ..core.sparse_gp import SparseGP, GP -from GPy.core.parameterization.variational import VariationalPosterior from GPy.models.bayesian_gplvm_minibatch import BayesianGPLVMMiniBatch -from GPy.models.bayesian_gplvm import BayesianGPLVM -from GPy.models.sparse_gp_minibatch import SparseGPMiniBatch class MRD(BayesianGPLVMMiniBatch): """ diff --git a/GPy/models/one_vs_all_classification.py b/GPy/models/one_vs_all_classification.py index 10457d75..d8024019 100644 --- a/GPy/models/one_vs_all_classification.py +++ b/GPy/models/one_vs_all_classification.py @@ -1,7 +1,6 @@ # Copyright (c) 2013, the GPy Authors (see AUTHORS.txt) # Licensed under the BSD 3-clause license (see LICENSE.txt) -from ..core import GP from . import SparseGPClassification from .. import likelihoods from .. import kern diff --git a/GPy/models/sparse_gp_classification.py b/GPy/models/sparse_gp_classification.py index e1c468d1..e00a3533 100644 --- a/GPy/models/sparse_gp_classification.py +++ b/GPy/models/sparse_gp_classification.py @@ -62,7 +62,7 @@ class SparseGPClassificationUncertainInput(SparseGP): .. 
Note:: Multiple independent outputs are allowed using columns of Y """ def __init__(self, X, X_variance, Y, kernel=None, Z=None, num_inducing=10, Y_metadata=None, normalizer=None): - from ..core.parameterization.variational import NormalPosterior + from ..core.variational import NormalPosterior if kernel is None: kernel = kern.RBF(X.shape[1]) diff --git a/GPy/models/sparse_gp_coregionalized_regression.py b/GPy/models/sparse_gp_coregionalized_regression.py index 797d8b30..1e5049e2 100644 --- a/GPy/models/sparse_gp_coregionalized_regression.py +++ b/GPy/models/sparse_gp_coregionalized_regression.py @@ -4,7 +4,6 @@ import numpy as np from ..core import SparseGP from ..inference.latent_function_inference import VarDTC -from .. import likelihoods from .. import kern from .. import util diff --git a/GPy/models/sparse_gp_minibatch.py b/GPy/models/sparse_gp_minibatch.py index 54160e6f..69c65381 100644 --- a/GPy/models/sparse_gp_minibatch.py +++ b/GPy/models/sparse_gp_minibatch.py @@ -4,18 +4,15 @@ from __future__ import print_function import numpy as np from ..core.parameterization.param import Param +from ..core.variational import VariationalPosterior from ..core.sparse_gp import SparseGP from ..core.gp import GP from ..inference.latent_function_inference import var_dtc from .. import likelihoods -from ..core.parameterization.variational import VariationalPosterior import logging -from GPy.inference.latent_function_inference.posterior import Posterior -from GPy.inference.optimization.stochastics import SparseGPStochastics,\ - SparseGPMissing -#no stochastics.py file added! from GPy.inference.optimization.stochastics import SparseGPStochastics,\ - #SparseGPMissing +from ..inference.latent_function_inference.posterior import Posterior +from ..inference.optimization.stochastics import SparseGPStochastics, SparseGPMissing logger = logging.getLogger("sparse gp") class SparseGPMiniBatch(SparseGP): diff --git a/GPy/models/sparse_gp_regression.py b/GPy/models/sparse_gp_regression.py index faca7e9e..5d649452 100644 --- a/GPy/models/sparse_gp_regression.py +++ b/GPy/models/sparse_gp_regression.py @@ -3,12 +3,11 @@ import numpy as np -from ..core import SparseGP from ..core.sparse_gp_mpi import SparseGP_MPI from .. import likelihoods from .. import kern from ..inference.latent_function_inference import VarDTC -from ..core.parameterization.variational import NormalPosterior +from ..core.variational import NormalPosterior class SparseGPRegression(SparseGP_MPI): """ diff --git a/GPy/models/sparse_gplvm.py b/GPy/models/sparse_gplvm.py index d1ad5884..22852d93 100644 --- a/GPy/models/sparse_gplvm.py +++ b/GPy/models/sparse_gplvm.py @@ -2,9 +2,8 @@ # Licensed under the BSD 3-clause license (see LICENSE.txt) -import numpy as np import sys -from GPy.models.sparse_gp_regression import SparseGPRegression +from .sparse_gp_regression import SparseGPRegression class SparseGPLVM(SparseGPRegression): """ diff --git a/GPy/models/ss_gplvm.py b/GPy/models/ss_gplvm.py index 4ccf58ac..dfbbcbd4 100644 --- a/GPy/models/ss_gplvm.py +++ b/GPy/models/ss_gplvm.py @@ -7,7 +7,7 @@ from ..core.sparse_gp_mpi import SparseGP_MPI from .. 
import kern from ..core.parameterization import Param from ..likelihoods import Gaussian -from ..core.parameterization.variational import SpikeAndSlabPrior, SpikeAndSlabPosterior,VariationalPrior +from ..core.variational import SpikeAndSlabPrior, SpikeAndSlabPosterior,VariationalPrior from ..inference.latent_function_inference.var_dtc_parallel import update_gradients, VarDTC_minibatch from ..kern.src.psi_comp.ssrbf_psi_gpucomp import PSICOMP_SSRBF_GPU @@ -19,7 +19,7 @@ class IBPPosterior(SpikeAndSlabPosterior): """ binary_prob : the probability of the distribution on the slab part. """ - from ..core.parameterization.transformations import Logexp + from paramz.transformations import Logexp super(IBPPosterior, self).__init__(means, variances, binary_prob, group_spike=True, name=name) self.sharedX = sharedX if sharedX: @@ -60,7 +60,7 @@ class IBPPosterior(SpikeAndSlabPosterior): class IBPPrior(VariationalPrior): def __init__(self, input_dim, alpha =2., name='IBPPrior', **kw): super(IBPPrior, self).__init__(name=name, **kw) - from ..core.parameterization.transformations import Logexp, __fixed__ + from paramz.transformations import Logexp, __fixed__ self.input_dim = input_dim self.variance = 1. self.alpha = Param('alpha', alpha, __fixed__) diff --git a/GPy/models/ss_mrd.py b/GPy/models/ss_mrd.py index 41289d3f..182ee60b 100644 --- a/GPy/models/ss_mrd.py +++ b/GPy/models/ss_mrd.py @@ -3,15 +3,15 @@ The Maniforld Relevance Determination model with the spike-and-slab prior """ import numpy as np -from ..core import Model +from ..core import ProbabilisticModel from .ss_gplvm import SSGPLVM -from ..core.parameterization.variational import SpikeAndSlabPrior,NormalPosterior,VariationalPrior +from ..core.variational import SpikeAndSlabPrior,NormalPosterior,VariationalPrior from ..util.misc import param_to_array from ..kern import RBF from ..core import Param from numpy.linalg.linalg import LinAlgError -class SSMRD(Model): +class SSMRD(ProbabilisticModel): def __init__(self, Ylist, input_dim, X=None, X_variance=None, Gammas=None, initx = 'PCA_concat', initz = 'permute', num_inducing=10, Zs=None, kernels=None, inference_methods=None, likelihoods=None, group_spike=True, @@ -117,13 +117,13 @@ class SSMRD(Model): Gammas.append(gamma) return X, X_variance, Gammas, fracs - @Model.optimizer_array.setter + @ProbabilisticModel.optimizer_array.setter def optimizer_array(self, p): if self.mpi_comm != None: if self._IN_OPTIMIZATION_ and self.mpi_comm.rank==0: self.mpi_comm.Bcast(np.int32(1),root=0) self.mpi_comm.Bcast(p, root=0) - Model.optimizer_array.fset(self,p) + ProbabilisticModel.optimizer_array.fset(self,p) def optimize(self, optimizer=None, start=None, **kwargs): self._IN_OPTIMIZATION_ = True @@ -214,7 +214,7 @@ class SpikeAndSlabPrior_SSMRD(SpikeAndSlabPrior): class IBPPrior_SSMRD(VariationalPrior): def __init__(self, nModels, input_dim, alpha =2., tau=None, name='IBPPrior', **kw): super(IBPPrior_SSMRD, self).__init__(name=name, **kw) - from ..core.parameterization.transformations import Logexp, __fixed__ + from paramz.transformations import Logexp, __fixed__ self.nModels = nModels self._b_prob_all = 0.5 self.input_dim = input_dim diff --git a/GPy/plotting/gpy_plot/plot_util.py b/GPy/plotting/gpy_plot/plot_util.py index 22d6627c..a2d4a7c0 100644 --- a/GPy/plotting/gpy_plot/plot_util.py +++ b/GPy/plotting/gpy_plot/plot_util.py @@ -334,7 +334,7 @@ def x_frame1D(X,plot_limits=None,resolution=None): """ assert X.shape[1] ==1, "x_frame1D is defined for one-dimensional inputs" if plot_limits is None: - from 
...core.parameterization.variational import VariationalPosterior + from ...core.variational import VariationalPosterior if isinstance(X, VariationalPosterior): xmin,xmax = X.mean.min(0),X.mean.max(0) else: diff --git a/GPy/plotting/matplot_dep/visualize.py b/GPy/plotting/matplot_dep/visualize.py index c5d2fe14..46558763 100644 --- a/GPy/plotting/matplot_dep/visualize.py +++ b/GPy/plotting/matplot_dep/visualize.py @@ -1,6 +1,6 @@ import numpy as np import time -from ...core.parameterization.variational import VariationalPosterior +from ...core.variational import VariationalPosterior try: import matplotlib.pyplot as plt import matplotlib as mpl diff --git a/GPy/testing/cacher_tests.py b/GPy/testing/cacher_tests.py deleted file mode 100644 index 60f79ba2..00000000 --- a/GPy/testing/cacher_tests.py +++ /dev/null @@ -1,37 +0,0 @@ -''' -Created on 4 Sep 2015 - -@author: maxz -''' -import unittest -from GPy.util.caching import Cacher -from pickle import PickleError - - -class Test(unittest.TestCase): - def setUp(self): - def op(x): - return x - self.cache = Cacher(op, 1) - - def test_pickling(self): - self.assertRaises(PickleError, self.cache.__getstate__) - self.assertRaises(PickleError, self.cache.__setstate__) - - def test_copy(self): - tmp = self.cache.__deepcopy__() - assert(tmp.operation is self.cache.operation) - self.assertEqual(tmp.limit, self.cache.limit) - - def test_reset(self): - self.cache.reset() - self.assertDictEqual(self.cache.cached_input_ids, {}, ) - self.assertDictEqual(self.cache.cached_outputs, {}, ) - self.assertDictEqual(self.cache.inputs_changed, {}, ) - - def test_name(self): - assert(self.cache.__name__ == self.cache.operation.__name__) - -if __name__ == "__main__": - #import sys;sys.argv = ['', 'Test.testName'] - unittest.main() \ No newline at end of file diff --git a/GPy/testing/gp_tests.py b/GPy/testing/gp_tests.py index 63345c18..5cde5e98 100644 --- a/GPy/testing/gp_tests.py +++ b/GPy/testing/gp_tests.py @@ -5,7 +5,7 @@ Created on 4 Sep 2015 ''' import unittest import numpy as np, GPy -from GPy.core.parameterization.variational import NormalPosterior +from ..core.variational import NormalPosterior class Test(unittest.TestCase): diff --git a/GPy/testing/index_operations_tests.py b/GPy/testing/index_operations_tests.py deleted file mode 100644 index a97f1beb..00000000 --- a/GPy/testing/index_operations_tests.py +++ /dev/null @@ -1,137 +0,0 @@ -# Copyright (c) 2014, Max Zwiessele -# Licensed under the BSD 3-clause license (see LICENSE.txt) - -import unittest -import numpy as np -from GPy.core.parameterization.index_operations import ParameterIndexOperations,\ - ParameterIndexOperationsView - -one, two, three = 'one', 'two', 'three' - -class Test(unittest.TestCase): - - def setUp(self): - self.param_index = ParameterIndexOperations() - self.param_index.add(one, [3,9]) - self.param_index.add(two, [0,5]) - self.param_index.add(three, [2,4,7,10]) - self.view = ParameterIndexOperationsView(self.param_index, 2, 6) - - def test_clear(self): - self.param_index.clear() - self.assertDictEqual(self.param_index._properties, {}) - - def test_remove(self): - removed = self.param_index.remove(three, np.r_[3:13]) - self.assertListEqual(removed.tolist(), [4,7,10]) - self.assertListEqual(self.param_index[three].tolist(), [2]) - removed = self.param_index.remove(one, [1]) - self.assertListEqual(removed.tolist(), []) - self.assertListEqual(self.param_index[one].tolist(), [3,9]) - self.assertListEqual(self.param_index.remove('not in there', []).tolist(), []) - removed =
self.param_index.remove(one, [9]) - self.assertListEqual(removed.tolist(), [9]) - self.assertListEqual(self.param_index[one].tolist(), [3]) - self.assertListEqual(self.param_index.remove('not in there', [2,3,4]).tolist(), []) - self.assertListEqual(self.view.remove('not in there', [2,3,4]).tolist(), []) - - def test_shift_left(self): - self.view.shift_left(0, 2) - self.assertListEqual(self.param_index[three].tolist(), [2,5,8]) - self.assertListEqual(self.param_index[two].tolist(), [0,3]) - self.assertListEqual(self.param_index[one].tolist(), [7]) - #======================================================================= - # 0 1 2 3 4 5 6 7 8 9 10 - # one - # two two - # three three three - # view: [0 1 2 3 4 5 ] - #======================================================================= - self.assertListEqual(self.view[three].tolist(), [0,3]) - self.assertListEqual(self.view[two].tolist(), [1]) - self.assertListEqual(self.view[one].tolist(), [5]) - self.param_index.shift_left(7, 1) - #======================================================================= - # 0 1 2 3 4 5 6 7 8 9 10 - # - # two two - # three three three - # view: [0 1 2 3 4 5 ] - #======================================================================= - self.assertListEqual(self.param_index[three].tolist(), [2,5,7]) - self.assertListEqual(self.param_index[two].tolist(), [0,3]) - self.assertListEqual(self.param_index[one].tolist(), []) - self.assertListEqual(self.view[three].tolist(), [0,3,5]) - self.assertListEqual(self.view[two].tolist(), [1]) - self.assertListEqual(self.view[one].tolist(), []) - - def test_shift_right(self): - self.view.shift_right(3, 2) - self.assertListEqual(self.param_index[three].tolist(), [2,4,9,12]) - self.assertListEqual(self.param_index[two].tolist(), [0,7]) - self.assertListEqual(self.param_index[one].tolist(), [3,11]) - - def test_index_view(self): - #======================================================================= - # 0 1 2 3 4 5 6 7 8 9 10 - # one one - # two two - # three three three three - # view: [0 1 2 3 4 5 ] - #======================================================================= - self.view = ParameterIndexOperationsView(self.param_index, 2, 6) - self.assertSetEqual(set(self.view.properties()), set([one, two, three])) - for v,p in zip(self.view.properties_for(np.r_[:6]), self.param_index.properties_for(np.r_[2:2+6])): - self.assertEqual(v, p) - self.assertSetEqual(set(self.view[two]), set([3])) - self.assertSetEqual(set(self.param_index[two]), set([0, 5])) - self.view.add(two, np.array([0])) - self.assertSetEqual(set(self.view[two]), set([0,3])) - self.assertSetEqual(set(self.param_index[two]), set([0, 2, 5])) - self.view.clear() - for v,p in zip(self.view.properties_for(np.r_[:6]), self.param_index.properties_for(np.r_[2:2+6])): - self.assertEqual(v, p) - self.assertEqual(v, []) - param_index = ParameterIndexOperations() - param_index.add(one, [3,9]) - param_index.add(two, [0,5]) - param_index.add(three, [2,4,7,10]) - view2 = ParameterIndexOperationsView(param_index, 2, 8) - self.view.update(view2) - for [i,v],[i2,v2] in zip(sorted(param_index.items()), sorted(self.param_index.items())): - self.assertEqual(i, i2) - np.testing.assert_equal(v, v2) - - def test_view_of_view(self): - #======================================================================= - # 0 1 2 3 4 5 6 7 8 9 10 - # one one - # two two - # three three three three - # view: [0 1 2 3 4 5 ] - # view2: [0 1 2 3 4 5 ] - #======================================================================= - view2 = 
ParameterIndexOperationsView(self.view, 2, 6) - view2.shift_right(0, 2) - - def test_indexview_remove(self): - removed = self.view.remove(two, [3]) - self.assertListEqual(removed.tolist(), [3]) - removed = self.view.remove(three, np.r_[:5]) - self.assertListEqual(removed.tolist(), [0, 2]) - - def test_misc(self): - #py3 fix - #for k,v in self.param_index.copy()._properties.iteritems(): - for k,v in self.param_index.copy()._properties.items(): - self.assertListEqual(self.param_index[k].tolist(), v.tolist()) - self.assertEqual(self.param_index.size, 8) - self.assertEqual(self.view.size, 5) - - def test_print(self): - print(self.param_index) - print(self.view) - -if __name__ == "__main__": - #import sys;sys.argv = ['', 'Test.test_index_view'] - unittest.main() diff --git a/GPy/testing/inference_tests.py b/GPy/testing/inference_tests.py index 92def798..58ee0288 100644 --- a/GPy/testing/inference_tests.py +++ b/GPy/testing/inference_tests.py @@ -5,7 +5,7 @@ The test cases for various inference algorithms """ -import unittest, itertools +import unittest import numpy as np import GPy #np.seterr(invalid='raise') diff --git a/GPy/testing/kernel_tests.py b/GPy/testing/kernel_tests.py index e81ce2cf..efb313a2 100644 --- a/GPy/testing/kernel_tests.py +++ b/GPy/testing/kernel_tests.py @@ -4,7 +4,6 @@ import unittest import numpy as np import GPy -import sys from GPy.core.parameterization.param import Param from ..util.config import config @@ -17,14 +16,14 @@ except ImportError: config.set('cython', 'working', 'False') -class Kern_check_model(GPy.core.Model): +class Kern_check_model(GPy.core.ProbabilisticModel): """ This is a dummy model class used as a base class for checking that the gradients of a given kernel are implemented correctly. It enables checkgrad() to be called independently on a kernel. """ def __init__(self, kernel=None, dL_dK=None, X=None, X2=None): - GPy.core.Model.__init__(self, 'kernel_test_model') + super(Kern_check_model, self).__init__('kernel_test_model') if kernel==None: kernel = GPy.kern.RBF(1) kernel.randomize(loc=1, scale=0.1) @@ -457,7 +456,7 @@ class KernelTestsProductWithZeroValues(unittest.TestCase): class Kernel_Psi_statistics_GradientTests(unittest.TestCase): def setUp(self): - from GPy.core.parameterization.variational import NormalPosterior + from GPy.core.variational import NormalPosterior N,M,Q = 100,20,3 X = np.random.randn(N,Q) diff --git a/GPy/testing/mapping_tests.py b/GPy/testing/mapping_tests.py index 2ff0e2d8..7c327f97 100644 --- a/GPy/testing/mapping_tests.py +++ b/GPy/testing/mapping_tests.py @@ -5,7 +5,7 @@ import unittest import numpy as np import GPy -class MappingGradChecker(GPy.core.Model): +class MappingGradChecker(GPy.core.ProbabilisticModel): """ This class has everything we need to check the gradient of a mapping. 
It implement a simple likelihood which is a weighted sum of the outputs of the diff --git a/GPy/testing/observable_tests.py b/GPy/testing/observable_tests.py deleted file mode 100644 index 84059d98..00000000 --- a/GPy/testing/observable_tests.py +++ /dev/null @@ -1,132 +0,0 @@ -# Copyright (c) 2014, Max Zwiessele -# Licensed under the BSD 3-clause license (see LICENSE.txt) -import unittest -from GPy.core.parameterization.parameterized import Parameterized -from GPy.core.parameterization.param import Param -import numpy - -# One trigger in init -_trigger_start = -1 - -class ParamTestParent(Parameterized): - parent_changed_count = _trigger_start - def parameters_changed(self): - self.parent_changed_count += 1 - -class ParameterizedTest(Parameterized): - # One trigger after initialization - params_changed_count = _trigger_start - def parameters_changed(self): - self.params_changed_count += 1 - -class Test(unittest.TestCase): - - def setUp(self): - self.parent = ParamTestParent('test parent') - self.par = ParameterizedTest('test model') - self.par2 = ParameterizedTest('test model 2') - self.p = Param('test parameter', numpy.random.normal(1,2,(10,3))) - - self.par.link_parameter(self.p) - self.par.link_parameter(Param('test1', numpy.random.normal(0,1,(1,)))) - self.par.link_parameter(Param('test2', numpy.random.normal(0,1,(1,)))) - - self.par2.link_parameter(Param('par2 test1', numpy.random.normal(0,1,(1,)))) - self.par2.link_parameter(Param('par2 test2', numpy.random.normal(0,1,(1,)))) - - self.parent.link_parameter(self.par) - self.parent.link_parameter(self.par2) - - self._observer_triggered = None - self._trigger_count = 0 - self._first = None - self._second = None - - def _trigger(self, me, which): - self._observer_triggered = which - self._trigger_count += 1 - if self._first is not None: - self._second = self._trigger - else: - self._first = self._trigger - - def _trigger_priority(self, me, which): - if self._first is not None: - self._second = self._trigger_priority - else: - self._first = self._trigger_priority - - def test_observable(self): - self.par.add_observer(self, self._trigger, -1) - self.assertEqual(self.par.params_changed_count, 0, 'no params changed yet') - self.assertEqual(self.par.params_changed_count, self.parent.parent_changed_count, 'parent should be triggered as often as param') - - self.p[0,1] = 3 # trigger observers - self.assertIs(self._observer_triggered, self.p, 'observer should have triggered') - self.assertEqual(self._trigger_count, 1, 'observer should have triggered once') - self.assertEqual(self.par.params_changed_count, 1, 'params changed once') - self.assertEqual(self.par.params_changed_count, self.parent.parent_changed_count, 'parent should be triggered as often as param') - - self.par.remove_observer(self) - self.p[0,1] = 4 - self.assertIs(self._observer_triggered, self.p, 'observer should not have triggered') - self.assertEqual(self._trigger_count, 1, 'observer should have triggered once') - self.assertEqual(self.par.params_changed_count, 2, 'params changed second') - self.assertEqual(self.par.params_changed_count, self.parent.parent_changed_count, 'parent should be triggered as often as param') - - self.par.add_observer(self, self._trigger, -1) - self.p[0,1] = 4 - self.assertIs(self._observer_triggered, self.p, 'observer should have triggered') - self.assertEqual(self._trigger_count, 2, 'observer should have triggered once') - self.assertEqual(self.par.params_changed_count, 3, 'params changed second') - self.assertEqual(self.par.params_changed_count, 
self.parent.parent_changed_count, 'parent should be triggered as often as param') - - self.par.remove_observer(self, self._trigger) - self.p[0,1] = 3 - self.assertIs(self._observer_triggered, self.p, 'observer should not have triggered') - self.assertEqual(self._trigger_count, 2, 'observer should have triggered once') - self.assertEqual(self.par.params_changed_count, 4, 'params changed second') - self.assertEqual(self.par.params_changed_count, self.parent.parent_changed_count, 'parent should be triggered as often as param') - - def test_set_params(self): - self.assertEqual(self.par.params_changed_count, 0, 'no params changed yet') - self.par.param_array[:] = 1 - self.par._trigger_params_changed() - self.assertEqual(self.par.params_changed_count, 1, 'now params changed') - self.assertEqual(self.parent.parent_changed_count, self.par.params_changed_count) - - self.par.param_array[:] = 2 - self.par._trigger_params_changed() - self.assertEqual(self.par.params_changed_count, 2, 'now params changed') - self.assertEqual(self.parent.parent_changed_count, self.par.params_changed_count) - - - def test_priority_notify(self): - self.assertEqual(self.par.params_changed_count, 0) - self.par.notify_observers(0, None) - self.assertEqual(self.par.params_changed_count, 1) - self.assertEqual(self.parent.parent_changed_count, self.par.params_changed_count) - - self.par.notify_observers(0, -numpy.inf) - self.assertEqual(self.par.params_changed_count, 2) - self.assertEqual(self.parent.parent_changed_count, 1) - - def test_priority(self): - self.par.add_observer(self, self._trigger, -1) - self.par.add_observer(self, self._trigger_priority, 0) - self.par.notify_observers(0) - self.assertEqual(self._first, self._trigger_priority, 'priority should be first') - self.assertEqual(self._second, self._trigger, 'priority should be first') - - self.par.remove_observer(self) - self._first = self._second = None - - self.par.add_observer(self, self._trigger, 1) - self.par.add_observer(self, self._trigger_priority, 0) - self.par.notify_observers(0) - self.assertEqual(self._first, self._trigger, 'priority should be second') - self.assertEqual(self._second, self._trigger_priority, 'priority should be second') - -if __name__ == "__main__": - #import sys;sys.argv = ['', 'Test.testName'] - unittest.main() diff --git a/GPy/testing/parameterized_tests.py b/GPy/testing/parameterized_tests.py deleted file mode 100644 index 762a82a1..00000000 --- a/GPy/testing/parameterized_tests.py +++ /dev/null @@ -1,264 +0,0 @@ -''' -Created on Feb 13, 2014 - -@author: maxzwiessele -''' -import unittest -import GPy -import numpy as np -from GPy.core.parameterization.parameter_core import HierarchyError -from GPy.core.parameterization.observable_array import ObsAr -from GPy.core.parameterization.transformations import NegativeLogexp, Logistic -from GPy.core.parameterization.parameterized import Parameterized -from GPy.core.parameterization.param import Param -from GPy.core.parameterization.index_operations import ParameterIndexOperations -from functools import reduce - -class ArrayCoreTest(unittest.TestCase): - def setUp(self): - self.X = np.random.normal(1,1, size=(100,10)) - self.obsX = ObsAr(self.X) - - def test_init(self): - X = ObsAr(self.X) - X2 = ObsAr(X) - self.assertIs(X, X2, "no new Observable array, when Observable is given") - - def test_slice(self): - t1 = self.X[2:78] - t2 = self.obsX[2:78] - self.assertListEqual(t1.tolist(), t2.tolist(), "Slicing should be the exact same, as in ndarray") - -class ParameterizedTest(unittest.TestCase): - 
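# Usage sketch (illustrative, not part of the patch): the Param/Parameterized
# mechanics this deleted file used to test now live in paramz, so the
# equivalent behaviour is exercised against the paramz API directly.
# Assumes paramz is installed; shapes and the constraint are arbitrary.
import numpy as np
from paramz import Parameterized, Param
from paramz.transformations import Logistic

par = Parameterized('test_model')
param = Param('param', np.random.uniform(0, 1, (10, 5)), Logistic(0, 1))
par.link_parameter(param)

param[2:5].fix()    # fixed slices keep their values through randomize()
par.randomize()
print(par)          # prints the summary table of names, shapes and constraints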
- def setUp(self): - self.rbf = GPy.kern.RBF(20) - self.white = GPy.kern.White(1) - from GPy.core.parameterization import Param - from GPy.core.parameterization.transformations import Logistic - self.param = Param('param', np.random.uniform(0,1,(10,5)), Logistic(0, 1)) - - self.test1 = GPy.core.Parameterized("test model") - self.test1.param = self.param - self.test1.kern = self.rbf+self.white - self.test1.link_parameter(self.test1.kern) - self.test1.link_parameter(self.param, 0) - - # print self.test1: - #============================================================================= - # test_model. | Value | Constraint | Prior | Tied to - # param | (25L, 2L) | {0.0,1.0} | | - # add.rbf.variance | 1.0 | 0.0,1.0 +ve | | - # add.rbf.lengthscale | 1.0 | 0.0,1.0 +ve | | - # add.white.variance | 1.0 | 0.0,1.0 +ve | | - #============================================================================= - - x = np.linspace(-2,6,4)[:,None] - y = np.sin(x) - self.testmodel = GPy.models.GPRegression(x,y) - # print self.testmodel: - #============================================================================= - # GP_regression. | Value | Constraint | Prior | Tied to - # rbf.variance | 1.0 | +ve | | - # rbf.lengthscale | 1.0 | +ve | | - # Gaussian_noise.variance | 1.0 | +ve | | - #============================================================================= - - def test_add_parameter(self): - self.assertEquals(self.rbf._parent_index_, 0) - self.assertEquals(self.white._parent_index_, 1) - self.assertEquals(self.param._parent_index_, 0) - pass - - def test_fixes(self): - self.white.fix(warning=False) - self.test1.unlink_parameter(self.param) - self.assertTrue(self.test1._has_fixes()) - from GPy.core.parameterization.transformations import FIXED, UNFIXED - self.assertListEqual(self.test1._fixes_.tolist(),[UNFIXED,UNFIXED,FIXED]) - self.test1.kern.link_parameter(self.white, 0) - self.assertListEqual(self.test1._fixes_.tolist(),[FIXED,UNFIXED,UNFIXED]) - self.test1.kern.rbf.fix() - self.assertListEqual(self.test1._fixes_.tolist(),[FIXED]*3) - self.test1.fix() - self.assertTrue(self.test1.is_fixed) - self.assertListEqual(self.test1._fixes_.tolist(),[FIXED]*self.test1.size) - - def test_remove_parameter(self): - from GPy.core.parameterization.transformations import FIXED, UNFIXED, __fixed__, Logexp - self.white.fix() - self.test1.kern.unlink_parameter(self.white) - self.assertIs(self.test1._fixes_,None) - - self.assertIsInstance(self.white.constraints, ParameterIndexOperations) - self.assertListEqual(self.white._fixes_.tolist(), [FIXED]) - self.assertIs(self.test1.constraints, self.rbf.constraints._param_index_ops) - self.assertIs(self.test1.constraints, self.param.constraints._param_index_ops) - - self.test1.link_parameter(self.white, 0) - self.assertIs(self.test1.constraints, self.white.constraints._param_index_ops) - self.assertIs(self.test1.constraints, self.rbf.constraints._param_index_ops) - self.assertIs(self.test1.constraints, self.param.constraints._param_index_ops) - self.assertListEqual(self.test1.constraints[__fixed__].tolist(), [0]) - self.assertIs(self.white._fixes_,None) - self.assertListEqual(self.test1._fixes_.tolist(),[FIXED] + [UNFIXED] * 52) - - self.test1.unlink_parameter(self.white) - self.assertIs(self.test1._fixes_,None) - self.assertListEqual(self.white._fixes_.tolist(), [FIXED]) - self.assertIs(self.test1.constraints, self.rbf.constraints._param_index_ops) - self.assertIs(self.test1.constraints, self.param.constraints._param_index_ops) - 
self.assertListEqual(self.test1.constraints[Logexp()].tolist(), list(range(self.param.size, self.param.size+self.rbf.size))) - - def test_remove_parameter_param_array_grad_array(self): - val = self.test1.kern.param_array.copy() - self.test1.kern.unlink_parameter(self.white) - self.assertListEqual(self.test1.kern.param_array.tolist(), val[:2].tolist()) - - def test_add_parameter_already_in_hirarchy(self): - self.assertRaises(HierarchyError, self.test1.link_parameter, self.white.parameters[0]) - - def test_default_constraints(self): - self.assertIs(self.rbf.variance.constraints._param_index_ops, self.rbf.constraints._param_index_ops) - self.assertIs(self.test1.constraints, self.rbf.constraints._param_index_ops) - self.assertListEqual(self.rbf.constraints.indices()[0].tolist(), list(range(2))) - from GPy.core.parameterization.transformations import Logexp - kern = self.test1.kern - self.test1.unlink_parameter(kern) - self.assertListEqual(kern.constraints[Logexp()].tolist(), list(range(3))) - - def test_constraints(self): - self.rbf.constrain(GPy.transformations.Square(), False) - self.assertListEqual(self.test1.constraints[GPy.transformations.Square()].tolist(), list(range(self.param.size, self.param.size+self.rbf.size))) - self.assertListEqual(self.test1.constraints[GPy.transformations.Logexp()].tolist(), [self.param.size+self.rbf.size]) - - self.test1.kern.unlink_parameter(self.rbf) - self.assertListEqual(self.test1.constraints[GPy.transformations.Square()].tolist(), []) - - def test_constraints_link_unlink(self): - self.test1.unlink_parameter(self.test1.kern) - self.test1.kern.rbf.unlink_parameter(self.test1.kern.rbf.lengthscale) - self.test1.kern.rbf.link_parameter(self.test1.kern.rbf.lengthscale) - self.test1.kern.rbf.unlink_parameter(self.test1.kern.rbf.lengthscale) - self.test1.link_parameter(self.test1.kern) - - def test_constraints_views(self): - self.assertEqual(self.white.constraints._offset, self.param.size+self.rbf.size) - self.assertEqual(self.rbf.constraints._offset, self.param.size) - self.assertEqual(self.param.constraints._offset, 0) - - def test_fixing_randomize(self): - self.white.fix(warning=True) - val = float(self.white.variance) - self.test1.randomize() - self.assertEqual(val, self.white.variance) - - def test_randomize(self): - ps = self.test1.param.view(np.ndarray).copy() - self.test1.param[2:5].fix() - self.test1.param.randomize() - self.assertFalse(np.all(ps==self.test1.param),str(ps)+str(self.test1.param)) - - def test_fixing_randomize_parameter_handling(self): - self.rbf.fix(warning=True) - val = float(self.rbf.variance) - self.test1.kern.randomize() - self.assertEqual(val, self.rbf.variance) - - def test_updates(self): - val = float(self.testmodel.log_likelihood()) - self.testmodel.update_model(False) - self.testmodel.kern.randomize() - self.testmodel.likelihood.randomize() - self.assertEqual(val, self.testmodel.log_likelihood()) - self.testmodel.update_model(True) - self.assertNotEqual(val, self.testmodel.log_likelihood()) - - def test_fixing_optimize(self): - self.testmodel.kern.lengthscale.fix() - val = float(self.testmodel.kern.lengthscale) - self.testmodel.randomize() - self.assertEqual(val, self.testmodel.kern.lengthscale) - - def test_add_parameter_in_hierarchy(self): - self.test1.kern.rbf.link_parameter(Param("NEW", np.random.rand(2), NegativeLogexp()), 1) - self.assertListEqual(self.test1.constraints[NegativeLogexp()].tolist(), list(range(self.param.size+1, self.param.size+1 + 2))) - 
-        self.assertListEqual(self.test1.constraints[GPy.transformations.Logistic(0,1)].tolist(), list(range(self.param.size)))
-        self.assertListEqual(self.test1.constraints[GPy.transformations.Logexp(0,1)].tolist(), np.r_[50, 53:55].tolist())
-
-    def test_regular_expression_misc(self):
-        self.testmodel.kern.lengthscale.fix()
-        val = float(self.testmodel.kern.lengthscale)
-        self.testmodel.randomize()
-        self.assertEqual(val, self.testmodel.kern.lengthscale)
-
-        variances = self.testmodel['.*var'].values()
-        self.testmodel['.*var'].fix()
-        self.testmodel.randomize()
-        np.testing.assert_equal(variances, self.testmodel['.*var'].values())
-
-    def test_fix_unfix(self):
-        fixed = self.testmodel.kern.lengthscale.fix()
-        self.assertListEqual(fixed.tolist(), [0])
-        unfixed = self.testmodel.kern.lengthscale.unfix()
-        self.testmodel.kern.lengthscale.constrain_positive()
-        self.assertListEqual(unfixed.tolist(), [0])
-
-        fixed = self.testmodel.kern.fix()
-        self.assertListEqual(fixed.tolist(), [0,1])
-        unfixed = self.testmodel.kern.unfix()
-        self.assertListEqual(unfixed.tolist(), [0,1])
-
-    def test_constraints_in_init(self):
-        class Test(Parameterized):
-            def __init__(self, name=None, parameters=[], *a, **kw):
-                super(Test, self).__init__(name=name)
-                self.x = Param('x', np.random.uniform(0,1,(3,4)))
-                self.x[0].constrain_bounded(0,1)
-                self.link_parameter(self.x)
-                self.x[1].fix()
-        t = Test()
-        c = {Logistic(0,1): np.array([0, 1, 2, 3]), 'fixed': np.array([4, 5, 6, 7])}
-        np.testing.assert_equal(t.x.constraints[Logistic(0,1)], c[Logistic(0,1)])
-        np.testing.assert_equal(t.x.constraints['fixed'], c['fixed'])
-
-    def test_parameter_modify_in_init(self):
-        class TestLikelihood(Parameterized):
-            def __init__(self, param1 = 2., param2 = 3.):
-                super(TestLikelihood, self).__init__("TestLike")
-                self.p1 = Param('param1', param1)
-                self.p2 = Param('param2', param2)
-
-                self.link_parameter(self.p1)
-                self.link_parameter(self.p2)
-
-                self.p1.fix()
-                self.p1.unfix()
-                self.p2.constrain_negative()
-                self.p1.fix()
-                self.p2.constrain_positive()
-                self.p2.fix()
-                self.p2.constrain_positive()
-
-        m = TestLikelihood()
-        print(m)
-        val = m.p1.values.copy()
-        self.assert_(m.p1.is_fixed)
-        self.assert_(m.constraints[GPy.constraints.Logexp()].tolist(), [1])
-        m.randomize()
-        self.assertEqual(m.p1, val)
-
-    def test_checkgrad(self):
-        assert(self.testmodel.kern.checkgrad())
-        assert(self.testmodel.kern.lengthscale.checkgrad())
-        assert(self.testmodel.likelihood.checkgrad())
-
-    def test_printing(self):
-        print(self.test1)
-        print(self.param)
-        print(self.test1[''])
-        print(self.testmodel.hierarchy_name(False))
-
-if __name__ == "__main__":
-    #import sys;sys.argv = ['', 'Test.test_add_parameter']
-    unittest.main()
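The test file removed above exercised the old `GPy.core.parameterization` machinery, which this refactor delegates to `paramz`, keeping only thin prior-aware subclasses in GPy. A minimal sketch of the same fixture against the new import surface (the import paths are the ones this patch introduces; the rest simply mirrors the deleted `setUp`):

```python
import numpy as np
import GPy
# Param/Parameterized are still re-exported from GPy.core.parameterization,
# while the transformations now come from paramz directly.
from GPy.core.parameterization import Param, Parameterized
from paramz.transformations import Logistic

m = Parameterized("test model")
param = Param('param', np.random.uniform(0, 1, (10, 5)), Logistic(0, 1))
kern = GPy.kern.RBF(20) + GPy.kern.White(1)
m.link_parameter(kern)
m.link_parameter(param, 0)
print(m)  # prints the usual summary of values, constraints and priors
```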
diff --git a/GPy/testing/pickle_tests.py b/GPy/testing/pickle_tests.py
index 59818de7..285eb6bb 100644
--- a/GPy/testing/pickle_tests.py
+++ b/GPy/testing/pickle_tests.py
@@ -7,23 +7,11 @@ import unittest, itertools
 #import cPickle as pickle
 import pickle
 import numpy as np
-from GPy.core.parameterization.index_operations import ParameterIndexOperations,\
-    ParameterIndexOperationsView
 import tempfile
-from GPy.core.parameterization.param import Param
-from GPy.core.parameterization.observable_array import ObsAr
-from GPy.core.parameterization.priors import Gaussian
-from GPy.kern.src.rbf import RBF
-from GPy.kern.src.linear import Linear
-from GPy.kern.src.static import Bias, White
 from GPy.examples.dimensionality_reduction import mrd_simulation
-from GPy.core.parameterization.variational import NormalPosterior
+from GPy.core.variational import NormalPosterior
 from GPy.models.gp_regression import GPRegression
 from functools import reduce
-from GPy.util.caching import Cacher
-import GPy
-from pickle import PicklingError
-import GPy

 def toy_model():
     X = np.linspace(0,1,50)[:, None]
@@ -42,89 +30,6 @@ class ListDictTestCase(unittest.TestCase):
         np.testing.assert_array_equal(a1, a2)

 class Test(ListDictTestCase):
-    def test_parameter_index_operations(self):
-        pio = ParameterIndexOperations(dict(test1=np.array([4,3,1,6,4]), test2=np.r_[2:130]))
-        piov = ParameterIndexOperationsView(pio, 20, 250)
-        #py3 fix
-        #self.assertListDictEquals(dict(piov.items()), dict(piov.copy().iteritems()))
-        self.assertListDictEquals(dict(piov.items()), dict(piov.copy().items()))
-
-        #py3 fix
-        #self.assertListDictEquals(dict(pio.iteritems()), dict(pio.copy().items()))
-        self.assertListDictEquals(dict(pio.items()), dict(pio.copy().items()))
-
-        self.assertArrayListEquals(pio.copy().indices(), pio.indices())
-        self.assertArrayListEquals(piov.copy().indices(), piov.indices())
-
-        with tempfile.TemporaryFile('w+b') as f:
-            pickle.dump(pio, f)
-            f.seek(0)
-            pio2 = pickle.load(f)
-        self.assertListDictEquals(pio._properties, pio2._properties)
-
-        f = tempfile.TemporaryFile('w+b')
-        pickle.dump(piov, f)
-        f.seek(0)
-        pio2 = GPy.load(f)
-        f.close()
-
-        #py3 fix
-        #self.assertListDictEquals(dict(piov.items()), dict(pio2.iteritems()))
-        self.assertListDictEquals(dict(piov.items()), dict(pio2.items()))
-
-    def test_param(self):
-        param = Param('test', np.arange(4*2).reshape(4,2))
-        param[0].constrain_positive()
-        param[1].fix()
-        param[2].set_prior(Gaussian(0,1))
-        pcopy = param.copy()
-        self.assertListEqual(param.tolist(), pcopy.tolist())
-        self.assertListEqual(str(param).split('\n'), str(pcopy).split('\n'))
-        self.assertIsNot(param, pcopy)
-        with tempfile.TemporaryFile('w+b') as f:
-            pickle.dump(param, f)
-            f.seek(0)
-            pcopy = pickle.load(f)
-        self.assertListEqual(param.tolist(), pcopy.tolist())
-        self.assertSequenceEqual(str(param), str(pcopy))
-
-    def test_observable_array(self):
-        obs = ObsAr(np.arange(4*2).reshape(4,2))
-        pcopy = obs.copy()
-        self.assertListEqual(obs.tolist(), pcopy.tolist())
-        with tempfile.TemporaryFile('w+b') as f:
-            pickle.dump(obs, f)
-            f.seek(0)
-            pcopy = pickle.load(f)
-        self.assertListEqual(obs.tolist(), pcopy.tolist())
-        self.assertSequenceEqual(str(obs), str(pcopy))
-
-    def test_parameterized(self):
-        par = RBF(1, active_dims=[1]) + Linear(2, active_dims=[0,2]) + Bias(3) + White(3)
-        par.gradient = 10
-        par.randomize()
-        pcopy = par.copy()
-        self.assertIsInstance(pcopy.constraints, ParameterIndexOperations)
-        self.assertIsInstance(pcopy.rbf.constraints, ParameterIndexOperationsView)
-        self.assertIs(pcopy.constraints, pcopy.rbf.constraints._param_index_ops)
-        self.assertIs(pcopy.constraints, pcopy.rbf.lengthscale.constraints._param_index_ops)
-        self.assertIs(pcopy.constraints, pcopy.linear.constraints._param_index_ops)
-        self.assertListEqual(par.param_array.tolist(), pcopy.param_array.tolist())
-        pcopy.gradient = 10 # gradient does not get copied anymore
-        self.assertListEqual(par.gradient_full.tolist(), pcopy.gradient_full.tolist())
-        self.assertSequenceEqual(str(par), str(pcopy))
-        self.assertIsNot(par.param_array, pcopy.param_array)
-        self.assertIsNot(par.gradient_full, pcopy.gradient_full)
-        with tempfile.TemporaryFile('w+b') as f:
-            par.pickle(f)
-            f.seek(0)
-            pcopy = pickle.load(f)
-        self.assertListEqual(par.param_array.tolist(), pcopy.param_array.tolist())
-        pcopy.gradient = 10
-        np.testing.assert_allclose(par.linear.gradient_full, pcopy.linear.gradient_full)
-        np.testing.assert_allclose(pcopy.linear.gradient_full, 10)
-        self.assertSequenceEqual(str(par), str(pcopy))
-
     def test_model(self):
         par = toy_model()
         pcopy = par.copy()
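The pickling tests for the index-operation internals move out with `paramz`; what survives here is round-tripping whole models. A minimal sketch of the pattern the remaining `test_model` covers (assuming, as these tests do, that models serialize cleanly through the standard `pickle` module; the data below is illustrative):

```python
import pickle
import numpy as np
from GPy.models.gp_regression import GPRegression

X = np.linspace(0, 1, 50)[:, None]
Y = np.sin(X) + np.random.randn(50, 1) * 0.05
m = GPRegression(X, Y)

m2 = pickle.loads(pickle.dumps(m))   # round-trip through a byte string
np.testing.assert_array_equal(m.param_array, m2.param_array)
```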
diff --git a/GPy/testing/rv_transformation_tests.py b/GPy/testing/rv_transformation_tests.py
index 9c510aa4..2be58e54 100644
--- a/GPy/testing/rv_transformation_tests.py
+++ b/GPy/testing/rv_transformation_tests.py
@@ -10,12 +10,12 @@ import scipy.stats as st
 import GPy


-class TestModel(GPy.core.Model):
+class TestModel(GPy.core.ProbabilisticModel):
     """
     A simple GPy model with one parameter.
     """
     def __init__(self, theta=1.):
-        GPy.core.Model.__init__(self, 'test_model')
+        GPy.core.ProbabilisticModel.__init__(self, 'test_model')
         theta = GPy.core.Param('theta', theta)
         self.link_parameter(theta)
diff --git a/GPy/util/__init__.py b/GPy/util/__init__.py
index 6919f1a8..1c504f89 100644
--- a/GPy/util/__init__.py
+++ b/GPy/util/__init__.py
@@ -11,7 +11,6 @@ from . import mocap
 from . import decorators
 from . import classification
 from . import subarray_and_sorting
-from . import caching
 from . import diag
 from . import initialization
 from . import multioutput
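The hunk above shows the only change custom models need: the base class `GPy.core.Model` is renamed to `GPy.core.ProbabilisticModel`. A minimal sketch of a user-defined model under the new name (the class, its parameter, and the toy likelihood are illustrative, not part of this patch):

```python
import GPy

class QuadraticModel(GPy.core.ProbabilisticModel):
    """Toy model with one parameter and a quadratic 'log likelihood'."""
    def __init__(self, theta=1.):
        GPy.core.ProbabilisticModel.__init__(self, 'quadratic_model')
        self.theta = GPy.core.Param('theta', theta)
        self.link_parameter(self.theta)

    def log_likelihood(self):
        return -float(self.theta) ** 2

    def parameters_changed(self):
        # gradient of the log likelihood w.r.t. theta
        self.theta.gradient = -2. * self.theta
```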
diff --git a/GPy/util/caching.py b/GPy/util/caching.py
deleted file mode 100644
index cbc1f3f1..00000000
--- a/GPy/util/caching.py
+++ /dev/null
@@ -1,197 +0,0 @@
-# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-from ..core.parameterization.observable import Observable
-import collections, weakref
-from functools import reduce
-from pickle import PickleError
-
-class Cacher(object):
-    def __init__(self, operation, limit=5, ignore_args=(), force_kwargs=()):
-        """
-        Parameters:
-        ***********
-        :param callable operation: function to cache
-        :param int limit: depth of cacher
-        :param [int] ignore_args: list of indices, pointing at arguments to ignore in *args of operation(*args). This includes self!
-        :param [str] force_kwargs: list of kwarg names (strings). If a kwarg with that name is given, the cacher will force recompute and wont cache anything.
-        :param int verbose: verbosity level. 0: no print outs, 1: casual print outs, 2: debug level print outs
-        """
-        self.limit = int(limit)
-        self.ignore_args = ignore_args
-        self.force_kwargs = force_kwargs
-        self.operation = operation
-        self.order = collections.deque()
-        self.cached_inputs = {} # point from cache_ids to a list of [ind_ids], which where used in cache cache_id
-
-        #=======================================================================
-        # point from each ind_id to [ref(obj), cache_ids]
-        # 0: a weak reference to the object itself
-        # 1: the cache_ids in which this ind_id is used (len will be how many times we have seen this ind_id)
-        self.cached_input_ids = {}
-        #=======================================================================
-
-        self.cached_outputs = {} # point from cache_ids to outputs
-        self.inputs_changed = {} # point from cache_ids to bools
-
-    def id(self, obj):
-        """returns the self.id of an object, to be used in caching individual self.ids"""
-        return hex(id(obj))
-
-    def combine_inputs(self, args, kw, ignore_args):
-        "Combines the args and kw in a unique way, such that ordering of kwargs does not lead to recompute"
-        inputs= args + tuple(c[1] for c in sorted(kw.items(), key=lambda x: x[0]))
-        # REMOVE the ignored arguments from input and PREVENT it from being checked!!!
-        return [a for i,a in enumerate(inputs) if i not in ignore_args]
-
-    def prepare_cache_id(self, combined_args_kw):
-        "get the cacheid (conc. string of argument self.ids in order)"
-        cache_id = "".join(self.id(a) for a in combined_args_kw)
-        return cache_id
-
-    def ensure_cache_length(self, cache_id):
-        "Ensures the cache is within its limits and has one place free"
-        if len(self.order) == self.limit:
-            # we have reached the limit, so lets release one element
-            cache_id = self.order.popleft()
-            combined_args_kw = self.cached_inputs[cache_id]
-            for ind in combined_args_kw:
-                if ind is not None:
-                    ind_id = self.id(ind)
-                    tmp = self.cached_input_ids.get(ind_id, None)
-                    if tmp is not None:
-                        ref, cache_ids = tmp
-                        if len(cache_ids) == 1 and ref() is not None:
-                            ref().remove_observer(self, self.on_cache_changed)
-                            del self.cached_input_ids[ind_id]
-                        else:
-                            cache_ids.remove(cache_id)
-                            self.cached_input_ids[ind_id] = [ref, cache_ids]
-            del self.cached_outputs[cache_id]
-            del self.inputs_changed[cache_id]
-            del self.cached_inputs[cache_id]
-
-    def add_to_cache(self, cache_id, inputs, output):
-        """This adds cache_id to the cache, with inputs and output"""
-        self.inputs_changed[cache_id] = False
-        self.cached_outputs[cache_id] = output
-        self.order.append(cache_id)
-        self.cached_inputs[cache_id] = inputs
-        for a in inputs:
-            if a is not None and not isinstance(a, int):
-                ind_id = self.id(a)
-                v = self.cached_input_ids.get(ind_id, [weakref.ref(a), []])
-                v[1].append(cache_id)
-                if len(v[1]) == 1:
-                    a.add_observer(self, self.on_cache_changed)
-                self.cached_input_ids[ind_id] = v
-
-    def __call__(self, *args, **kw):
-        """
-        A wrapper function for self.operation,
-        """
-        #=======================================================================
-        # !WARNING CACHE OFFSWITCH!
-        # return self.operation(*args, **kw)
-        #=======================================================================
-
-        # 1: Check whether we have forced recompute arguments:
-        if len(self.force_kwargs) != 0:
-            for k in self.force_kwargs:
-                if k in kw and kw[k] is not None:
-                    return self.operation(*args, **kw)
-
-        # 2: prepare_cache_id and get the unique self.id string for this call
-        inputs = self.combine_inputs(args, kw, self.ignore_args)
-        cache_id = self.prepare_cache_id(inputs)
-        # 2: if anything is not cachable, we will just return the operation, without caching
-        if reduce(lambda a, b: a or (not (isinstance(b, Observable) or b is None or isinstance(b,int))), inputs, False):
-#             print 'WARNING: '+self.operation.__name__ + ' not cacheable!'
-#             print [not (isinstance(b, Observable)) for b in inputs]
-            return self.operation(*args, **kw)
-        # 3&4: check whether this cache_id has been cached, then has it changed?
-        try:
-            if(self.inputs_changed[cache_id]):
-                # 4: This happens, when elements have changed for this cache self.id
-                self.inputs_changed[cache_id] = False
-                self.cached_outputs[cache_id] = self.operation(*args, **kw)
-        except KeyError:
-            # 3: This is when we never saw this chache_id:
-            self.ensure_cache_length(cache_id)
-            self.add_to_cache(cache_id, inputs, self.operation(*args, **kw))
-        except:
-            self.reset()
-            raise
-        # 5: We have seen this cache_id and it is cached:
-        return self.cached_outputs[cache_id]
-
-    def on_cache_changed(self, direct, which=None):
-        """
-        A callback funtion, which sets local flags when the elements of some cached inputs change
-
-        this function gets 'hooked up' to the inputs when we cache them, and upon their elements being changed we update here.
-        """
-        for what in [direct, which]:
-            if what is not None:
-                ind_id = self.id(what)
-                _, cache_ids = self.cached_input_ids.get(ind_id, [None, []])
-                for cache_id in cache_ids:
-                    self.inputs_changed[cache_id] = True
-
-    def reset(self):
-        """
-        Totally reset the cache
-        """
-        [a().remove_observer(self, self.on_cache_changed) if (a() is not None) else None for [a, _] in self.cached_input_ids.values()]
-        self.cached_input_ids = {}
-        self.cached_outputs = {}
-        self.inputs_changed = {}
-
-    def __deepcopy__(self, memo=None):
-        return Cacher(self.operation, self.limit, self.ignore_args, self.force_kwargs)
-
-    def __getstate__(self, memo=None):
-        raise PickleError("Trying to pickle Cacher object with function {}, pickling functions not possible.".format(str(self.operation)))
-
-    def __setstate__(self, memo=None):
-        raise PickleError("Trying to pickle Cacher object with function {}, pickling functions not possible.".format(str(self.operation)))
-
-    @property
-    def __name__(self):
-        return self.operation.__name__
-
-from functools import partial, update_wrapper
-
-class Cacher_wrap(object):
-    def __init__(self, f, limit, ignore_args, force_kwargs):
-        self.limit = limit
-        self.ignore_args = ignore_args
-        self.force_kwargs = force_kwargs
-        self.f = f
-        update_wrapper(self, self.f)
-    def __get__(self, obj, objtype=None):
-        return partial(self, obj)
-    def __call__(self, *args, **kwargs):
-        obj = args[0]
-        # import ipdb;ipdb.set_trace()
-        try:
-            caches = obj.__cachers
-        except AttributeError:
-            caches = obj.__cachers = {}
-        try:
-            cacher = caches[self.f]
-        except KeyError:
-            cacher = caches[self.f] = Cacher(self.f, self.limit, self.ignore_args, self.force_kwargs)
-        return cacher(*args, **kwargs)
-
-class Cache_this(object):
-    """
-    A decorator which can be applied to bound methods in order to cache them
-    """
-    def __init__(self, limit=5, ignore_args=(), force_kwargs=()):
-        self.limit = limit
-        self.ignore_args = ignore_args
-        self.force_args = force_kwargs
-    def __call__(self, f):
-        newf = Cacher_wrap(f, self.limit, self.ignore_args, self.force_args)
-        update_wrapper(newf, f)
-        return newf
- """ - for what in [direct, which]: - if what is not None: - ind_id = self.id(what) - _, cache_ids = self.cached_input_ids.get(ind_id, [None, []]) - for cache_id in cache_ids: - self.inputs_changed[cache_id] = True - - def reset(self): - """ - Totally reset the cache - """ - [a().remove_observer(self, self.on_cache_changed) if (a() is not None) else None for [a, _] in self.cached_input_ids.values()] - self.cached_input_ids = {} - self.cached_outputs = {} - self.inputs_changed = {} - - def __deepcopy__(self, memo=None): - return Cacher(self.operation, self.limit, self.ignore_args, self.force_kwargs) - - def __getstate__(self, memo=None): - raise PickleError("Trying to pickle Cacher object with function {}, pickling functions not possible.".format(str(self.operation))) - - def __setstate__(self, memo=None): - raise PickleError("Trying to pickle Cacher object with function {}, pickling functions not possible.".format(str(self.operation))) - - @property - def __name__(self): - return self.operation.__name__ - -from functools import partial, update_wrapper - -class Cacher_wrap(object): - def __init__(self, f, limit, ignore_args, force_kwargs): - self.limit = limit - self.ignore_args = ignore_args - self.force_kwargs = force_kwargs - self.f = f - update_wrapper(self, self.f) - def __get__(self, obj, objtype=None): - return partial(self, obj) - def __call__(self, *args, **kwargs): - obj = args[0] - # import ipdb;ipdb.set_trace() - try: - caches = obj.__cachers - except AttributeError: - caches = obj.__cachers = {} - try: - cacher = caches[self.f] - except KeyError: - cacher = caches[self.f] = Cacher(self.f, self.limit, self.ignore_args, self.force_kwargs) - return cacher(*args, **kwargs) - -class Cache_this(object): - """ - A decorator which can be applied to bound methods in order to cache them - """ - def __init__(self, limit=5, ignore_args=(), force_kwargs=()): - self.limit = limit - self.ignore_args = ignore_args - self.force_args = force_kwargs - def __call__(self, f): - newf = Cacher_wrap(f, self.limit, self.ignore_args, self.force_args) - update_wrapper(newf, f) - return newf diff --git a/GPy/util/warping_functions.py b/GPy/util/warping_functions.py index b8b9a261..1617283c 100644 --- a/GPy/util/warping_functions.py +++ b/GPy/util/warping_functions.py @@ -3,7 +3,7 @@ import numpy as np from ..core.parameterization import Parameterized, Param -from ..core.parameterization.transformations import Logexp +from paramz.transformations import Logexp class WarpingFunction(Parameterized):