diff --git a/GPy/core/parameterization/parameter_core.py b/GPy/core/parameterization/parameter_core.py
index 6add95b0..48c05a8d 100644
--- a/GPy/core/parameterization/parameter_core.py
+++ b/GPy/core/parameterization/parameter_core.py
@@ -13,11 +13,12 @@ Observable Pattern for patameterization
 
 """
 
-from transformations import Transformation,Logexp, NegativeLogexp, Logistic, __fixed__, FIXED, UNFIXED
+from .transformations import Transformation,Logexp, NegativeLogexp, Logistic, __fixed__, FIXED, UNFIXED
 import numpy as np
 import re
 import logging
-from updateable import Updateable
+from .updateable import Updateable
+from functools import reduce
 
 class HierarchyError(Exception):
     """
@@ -36,7 +37,7 @@ def adjust_name_for_printing(name):
         name = name.replace("/", "_l_").replace("@", '_at_')
         name = name.replace("(", "_of_").replace(")", "")
         if re.match(r'^[a-zA-Z_][a-zA-Z0-9-_]*$', name) is None:
-            raise NameError, "name {} converted to {} cannot be further converted to valid python variable name!".format(name2, name)
+            raise NameError("name {} converted to {} cannot be further converted to valid python variable name!".format(name2, name))
         return name
     return ''
 
@@ -65,13 +66,13 @@ class Parentable(object):
         Gets called, when the parent changed, so we can adjust our
         inner attributes according to the new parent.
         """
-        raise NotImplementedError, "shouldnt happen, Parentable objects need to be able to change their parent"
+        raise NotImplementedError("shouldnt happen, Parentable objects need to be able to change their parent")
 
     def _disconnect_parent(self, *args, **kw):
         """
         Disconnect this object from its parent
         """
-        raise NotImplementedError, "Abstract superclass"
+        raise NotImplementedError("Abstract superclass")
 
     @property
     def _highest_parent_(self):
@@ -109,7 +110,10 @@ class Pickleable(object):
                   it properly.
         :param protocol: pickling protocol to use, python-pickle for details.
         """
-        import cPickle as pickle
+        try: #Py2
+            import cPickle as pickle
+        except ImportError: #Py3
+            import pickle
         if isinstance(f, str):
             with open(f, 'wb') as f:
                 pickle.dump(self, f, protocol)
@@ -138,9 +142,9 @@ class Pickleable(object):
             which = self
         which.traverse_parents(parents.append) # collect parents
         for p in parents:
-            if not memo.has_key(id(p)):memo[id(p)] = None # set all parents to be None, so they will not be copied
-        if not memo.has_key(id(self.gradient)):memo[id(self.gradient)] = None # reset the gradient
-        if not memo.has_key(id(self._fixes_)):memo[id(self._fixes_)] = None # fixes have to be reset, as this is now highest parent
+            if not id(p) in memo :memo[id(p)] = None # set all parents to be None, so they will not be copied
+        if not id(self.gradient) in memo:memo[id(self.gradient)] = None # reset the gradient
+        if not id(self._fixes_) in memo :memo[id(self._fixes_)] = None # fixes have to be reset, as this is now highest parent
         copy = copy.deepcopy(self, memo) # and start the copy
         copy._parent_index_ = None
         copy._trigger_params_changed()
@@ -163,14 +167,16 @@ class Pickleable(object):
                        '_Cacher_wrap__cachers', # never pickle cachers
                        ]
         dc = dict()
-        for k,v in self.__dict__.iteritems():
+        #py3 fix
+        #for k,v in self.__dict__.iteritems():
+        for k,v in self.__dict__.items():
             if k not in ignore_list:
                 dc[k] = v
         return dc
 
     def __setstate__(self, state):
         self.__dict__.update(state)
-        from lists_and_dicts import ObserverList
+        from .lists_and_dicts import ObserverList
         self.observers = ObserverList()
         self._setup_observers()
         self._optimizer_copy_transformed = False
@@ -214,7 +220,7 @@ class Gradcheckable(Pickleable, Parentable):
         Perform the checkgrad on the model.
         TODO: this can be done more efficiently, when doing it inside here
         """
-        raise HierarchyError, "This parameter is not in a model with a likelihood, and, therefore, cannot be gradient checked!"
+        raise HierarchyError("This parameter is not in a model with a likelihood, and, therefore, cannot be gradient checked!")
 
 class Nameable(Gradcheckable):
     """
@@ -268,7 +274,7 @@ class Indexable(Nameable, Updateable):
     def __init__(self, name, default_constraint=None, *a, **kw):
         super(Indexable, self).__init__(name=name, *a, **kw)
         self._default_constraint_ = default_constraint
-        from index_operations import ParameterIndexOperations
+        from .index_operations import ParameterIndexOperations
         self.constraints = ParameterIndexOperations()
         self.priors = ParameterIndexOperations()
         if self._default_constraint_ is not None:
@@ -310,7 +316,7 @@ class Indexable(Nameable, Updateable):
         that is an int array, containing the indexes for the flattened
         param inside this parameterized logic.
         """
-        from param import ParamConcatenation
+        from .param import ParamConcatenation
         if isinstance(param, ParamConcatenation):
             return np.hstack((self._raveled_index_for(p) for p in param.params))
         return param._raveled_index() + self._offset_for(param)
@@ -407,7 +413,7 @@ class Indexable(Nameable, Updateable):
         repriorized = self.unset_priors()
         self._add_to_index_operations(self.priors, repriorized, prior, warning)
 
-        from domains import _REAL, _POSITIVE, _NEGATIVE
+        from .domains import _REAL, _POSITIVE, _NEGATIVE
         if prior.domain is _POSITIVE:
             self.constrain_positive(warning)
         elif prior.domain is _NEGATIVE:
@@ -424,19 +430,38 @@ class Indexable(Nameable, Updateable):
 
     def log_prior(self):
         """evaluate the prior"""
-        if self.priors.size > 0:
-            x = self.param_array
-            return reduce(lambda a, b: a + b, (p.lnpdf(x[ind]).sum() for p, ind in self.priors.iteritems()), 0)
-        return 0.
+        if self.priors.size == 0:
+            return 0.
+        x = self.param_array
+        #evaluate the prior log densities
+        log_p = reduce(lambda a, b: a + b, (p.lnpdf(x[ind]).sum() for p, ind in self.priors.items()), 0)
+
+        #account for the transformation by evaluating the log Jacobian (where things are transformed)
+        log_j = 0.
+        priored_indexes = np.hstack([i for p, i in self.priors.items()])
+        for c,j in self.constraints.items():
+            if not isinstance(c, Transformation):continue
+            for jj in j:
+                if jj in priored_indexes:
+                    log_j += c.log_jacobian(x[jj])
+        return log_p + log_j
 
     def _log_prior_gradients(self):
         """evaluate the gradients of the priors"""
-        if self.priors.size > 0:
-            x = self.param_array
-            ret = np.zeros(x.size)
-            [np.put(ret, ind, p.lnpdf_grad(x[ind])) for p, ind in self.priors.iteritems()]
-            return ret
-        return 0.
+        if self.priors.size == 0:
+            return 0.
+        x = self.param_array
+        ret = np.zeros(x.size)
+        #compute derivate of prior density
+        [np.put(ret, ind, p.lnpdf_grad(x[ind])) for p, ind in self.priors.items()]
+        #add in jacobian derivatives if transformed
+        priored_indexes = np.hstack([i for p, i in self.priors.items()])
+        for c,j in self.constraints.items():
+            if not isinstance(c, Transformation):continue
+            for jj in j:
+                if jj in priored_indexes:
+                    ret[jj] += c.log_jacobian_grad(x[jj])
+        return ret
 
     #===========================================================================
     # Tie parameters together
@@ -471,7 +496,7 @@ class Indexable(Nameable, Updateable):
             self.param_array[...] = transform.initialize(self.param_array)
         reconstrained = self.unconstrain()
         added = self._add_to_index_operations(self.constraints, reconstrained, transform, warning)
-        self.notify_observers(self, None if trigger_parent else -np.inf)
+        self.trigger_update(trigger_parent)
         return added
 
     def unconstrain(self, *transforms):
@@ -536,7 +561,7 @@ class Indexable(Nameable, Updateable):
         update the constraints and priors view, so that
         constraining is automized for the parent.
         """
-        from index_operations import ParameterIndexOperationsView
+        from .index_operations import ParameterIndexOperationsView
         #if getattr(self, "_in_init_"):
             #import ipdb;ipdb.set_trace()
             #self.constraints.update(param.constraints, start)
@@ -558,7 +583,7 @@ class Indexable(Nameable, Updateable):
         """
         if warning and reconstrained.size > 0:
             # TODO: figure out which parameters have changed and only print those
-            print "WARNING: reconstraining parameters {}".format(self.hierarchy_name() or self.name)
+            print("WARNING: reconstraining parameters {}".format(self.hierarchy_name() or self.name))
         index = self._raveled_index()
         which.add(what, index)
         return index
@@ -571,7 +596,7 @@ class Indexable(Nameable, Updateable):
         if len(transforms) == 0:
             transforms = which.properties()
         removed = np.empty((0,), dtype=int)
-        for t in transforms:
+        for t in list(transforms):
             unconstrained = which.remove(t, self._raveled_index())
             removed = np.union1d(removed, unconstrained)
             if t is __fixed__:
@@ -612,7 +637,9 @@ class OptimizationHandlable(Indexable):
 
         if not self._optimizer_copy_transformed:
             self._optimizer_copy_.flat = self.param_array.flat
-            [np.put(self._optimizer_copy_, ind, c.finv(self.param_array[ind])) for c, ind in self.constraints.iteritems() if c != __fixed__]
+            #py3 fix
+            #[np.put(self._optimizer_copy_, ind, c.finv(self.param_array[ind])) for c, ind in self.constraints.iteritems() if c != __fixed__]
+            [np.put(self._optimizer_copy_, ind, c.finv(self.param_array[ind])) for c, ind in self.constraints.items() if c != __fixed__]
             if self.has_parent() and (self.constraints[__fixed__].size != 0 or self._has_ties()):
                 fixes = np.ones(self.size).astype(bool)
                 fixes[self.constraints[__fixed__]] = FIXED
@@ -641,21 +668,25 @@ class OptimizationHandlable(Indexable):
         if f is None:
             self.param_array.flat = p
             [np.put(self.param_array, ind, c.f(self.param_array.flat[ind]))
-             for c, ind in self.constraints.iteritems() if c != __fixed__]
+             #py3 fix
+             #for c, ind in self.constraints.iteritems() if c != __fixed__]
+             for c, ind in self.constraints.items() if c != __fixed__]
         else:
             self.param_array.flat[f] = p
             [np.put(self.param_array, ind[f[ind]], c.f(self.param_array.flat[ind[f[ind]]]))
-             for c, ind in self.constraints.iteritems() if c != __fixed__]
+             #py3 fix
+             #for c, ind in self.constraints.iteritems() if c != __fixed__]
+             for c, ind in self.constraints.items() if c != __fixed__]
         #self._highest_parent_.tie.propagate_val()
 
         self._optimizer_copy_transformed = False
         self.trigger_update()
 
     def _get_params_transformed(self):
-        raise DeprecationWarning, "_get|set_params{_optimizer_copy_transformed} is deprecated, use self.optimizer array insetad!"
+        raise DeprecationWarning("_get|set_params{_optimizer_copy_transformed} is deprecated, use self.optimizer array insetad!")
 #
     def _set_params_transformed(self, p):
-        raise DeprecationWarning, "_get|set_params{_optimizer_copy_transformed} is deprecated, use self.optimizer array insetad!"
+        raise DeprecationWarning("_get|set_params{_optimizer_copy_transformed} is deprecated, use self.optimizer array insetad!")
 
     def _trigger_params_changed(self, trigger_parent=True):
         """
@@ -680,17 +711,32 @@ class OptimizationHandlable(Indexable):
         constraint to it.
         """
         self._highest_parent_.tie.collate_gradient()
-        [np.put(g, i, c.gradfactor(self.param_array[i], g[i])) for c, i in self.constraints.iteritems() if c != __fixed__]
+        #py3 fix
+        #[np.put(g, i, c.gradfactor(self.param_array[i], g[i])) for c, i in self.constraints.iteritems() if c != __fixed__]
+        [np.put(g, i, c.gradfactor(self.param_array[i], g[i])) for c, i in self.constraints.items() if c != __fixed__]
         if self._has_fixes(): return g[self._fixes_]
         return g
 
+    def _transform_gradients_non_natural(self, g):
+        """
+        Transform the gradients by multiplying the gradient factor for each
+        constraint to it.
+        """
+        self._highest_parent_.tie.collate_gradient()
+        #py3 fix
+        #[np.put(g, i, c.gradfactor_non_natural(self.param_array[i], g[i])) for c, i in self.constraints.iteritems() if c != __fixed__]
+        [np.put(g, i, c.gradfactor_non_natural(self.param_array[i], g[i])) for c, i in self.constraints.items() if c != __fixed__]
+        if self._has_fixes(): return g[self._fixes_]
+        return g
+
+
     @property
     def num_params(self):
         """
         Return the number of parameters of this parameter_handle.
         Param objects will always return 0.
         """
-        raise NotImplemented, "Abstract, please implement in respective classes"
+        raise NotImplemented("Abstract, please implement in respective classes")
 
     def parameter_names(self, add_self=False, adjust_for_printing=False, recursive=True):
         """
@@ -739,7 +785,9 @@ class OptimizationHandlable(Indexable):
         self.optimizer_array = x  # makes sure all of the tied parameters get the same init (since there's only one prior object...)
         # now draw from prior where possible
         x = self.param_array.copy()
-        [np.put(x, ind, p.rvs(ind.size)) for p, ind in self.priors.iteritems() if not p is None]
+        #Py3 fix
+        #[np.put(x, ind, p.rvs(ind.size)) for p, ind in self.priors.iteritems() if not p is None]
+        [np.put(x, ind, p.rvs(ind.size)) for p, ind in self.priors.items() if not p is None]
         unfixlist = np.ones((self.size,),dtype=np.bool)
         unfixlist[self.constraints[__fixed__]] = False
         self.param_array.flat[unfixlist] = x.view(np.ndarray).ravel()[unfixlist]
@@ -798,7 +846,7 @@ class Parameterizable(OptimizationHandlable):
     A parameterisable class.
 
     This class provides the parameters list (ArrayList) and standard parameter handling,
-    such as {add|remove}_parameter(), traverse hierarchy and param_array, gradient_array
+    such as {link|unlink}_parameter(), traverse hierarchy and param_array, gradient_array
     and the empty parameters_changed().
 
     This class is abstract and should not be instantiated.
@@ -936,7 +984,7 @@ class Parameterizable(OptimizationHandlable):
             self._add_parameter_name(param, ignore_added_names)
         # and makes sure to not delete programmatically added parameters
         for other in self.parameters[::-1]:
-            if other is not param and other.name.startswith(param.name):
+            if other is not param and other.name == param.name:
                 warn_and_retry(param, _name_digit.match(other.name))
                 return
         if pname not in dir(self):
@@ -1031,6 +1079,9 @@ class Parameterizable(OptimizationHandlable):
                     p = param_to_array(p)
                     d = f.create_dataset(n,p.shape,dtype=p.dtype)
                     d[:] = p
+                if hasattr(self, 'param_array'):
+                    d = f.create_dataset('param_array',self.param_array.shape, dtype=self.param_array.dtype)
+                    d[:] = self.param_array
                 f.close()
             except:
                 raise 'Fails to write the parameters into a HDF5 file!'
diff --git a/GPy/core/parameterization/priors.py b/GPy/core/parameterization/priors.py
index 84b6357e..83c83dfd 100644
--- a/GPy/core/parameterization/priors.py
+++ b/GPy/core/parameterization/priors.py
@@ -5,7 +5,7 @@
 import numpy as np
 from scipy.special import gammaln, digamma
 from ...util.linalg import pdinv
-from domains import _REAL, _POSITIVE
+from .domains import _REAL, _POSITIVE
 import warnings
 import weakref
 
@@ -15,8 +15,12 @@ class Prior(object):
     _instance = None
     def __new__(cls, *args, **kwargs):
         if not cls._instance or cls._instance.__class__ is not cls:
-            cls._instance = super(Prior, cls).__new__(cls, *args, **kwargs)
-        return cls._instance
+                newfunc = super(Prior, cls).__new__
+                if newfunc is object.__new__:
+                    cls._instance = newfunc(cls)  
+                else:
+                    cls._instance = newfunc(cls, *args, **kwargs)
+                return cls._instance
 
     def pdf(self, x):
         return np.exp(self.lnpdf(x))
@@ -52,7 +56,11 @@ class Gaussian(Prior):
             for instance in cls._instances:
                 if instance().mu == mu and instance().sigma == sigma:
                     return instance()
-        o = super(Prior, cls).__new__(cls, mu, sigma)
+        newfunc = super(Prior, cls).__new__
+        if newfunc is object.__new__:
+            o = newfunc(cls)  
+        else:
+            o = newfunc(cls, mu, sigma)            
         cls._instances.append(weakref.ref(o))
         return cls._instances[-1]()
 
@@ -140,7 +148,11 @@ class LogGaussian(Gaussian):
             for instance in cls._instances:
                 if instance().mu == mu and instance().sigma == sigma:
                     return instance()
-        o = super(Prior, cls).__new__(cls, mu, sigma)
+        newfunc = super(Prior, cls).__new__
+        if newfunc is object.__new__:
+            o = newfunc(cls)  
+        else:
+            o = newfunc(cls, mu, sigma)
         cls._instances.append(weakref.ref(o))
         return cls._instances[-1]()
 
@@ -258,7 +270,11 @@ class Gamma(Prior):
             for instance in cls._instances:
                 if instance().a == a and instance().b == b:
                     return instance()
-        o = super(Prior, cls).__new__(cls, a, b)
+        newfunc = super(Prior, cls).__new__
+        if newfunc is object.__new__:
+            o = newfunc(cls)  
+        else:
+            o = newfunc(cls, a, b)
         cls._instances.append(weakref.ref(o))
         return cls._instances[-1]()
 
@@ -398,7 +414,7 @@ class DGPLVM_KFDA(Prior):
     def compute_cls(self, x):
         cls = {}
         # Appending each data point to its proper class
-        for j in xrange(self.datanum):
+        for j in range(self.datanum):
             class_label = self.get_class_label(self.lbl[j])
             if class_label not in cls:
                 cls[class_label] = []
@@ -504,6 +520,219 @@ class DGPLVM(Prior):
 
     .. Note:: DGPLVM for Classification paper implementation
 
+    """
+    domain = _REAL
+    
+    def __new__(cls, sigma2, lbl, x_shape): 
+        return super(Prior, cls).__new__(cls, sigma2, lbl, x_shape)
+
+    def __init__(self, sigma2, lbl, x_shape):
+        self.sigma2 = sigma2
+        # self.x = x
+        self.lbl = lbl
+        self.classnum = lbl.shape[1]
+        self.datanum = lbl.shape[0]
+        self.x_shape = x_shape
+        self.dim = x_shape[1]
+
+    def get_class_label(self, y):
+        for idx, v in enumerate(y):
+            if v == 1:
+                return idx
+        return -1
+
+    # This function assigns each data point to its own class
+    # and returns the dictionary which contains the class name and parameters.
+    def compute_cls(self, x):
+        cls = {}
+        # Appending each data point to its proper class
+        for j in range(self.datanum):
+            class_label = self.get_class_label(self.lbl[j])
+            if class_label not in cls:
+                cls[class_label] = []
+            cls[class_label].append(x[j])
+        return cls
+
+    # This function computes mean of each class. The mean is calculated through each dimension
+    def compute_Mi(self, cls):
+        M_i = np.zeros((self.classnum, self.dim))
+        for i in cls:
+            # Mean of each class
+            class_i = cls[i]
+            M_i[i] = np.mean(class_i, axis=0)
+        return M_i
+
+    # Adding data points as tuple to the dictionary so that we can access indices
+    def compute_indices(self, x):
+        data_idx = {}
+        for j in range(self.datanum):
+            class_label = self.get_class_label(self.lbl[j])
+            if class_label not in data_idx:
+                data_idx[class_label] = []
+            t = (j, x[j])
+            data_idx[class_label].append(t)
+        return data_idx
+
+    # Adding indices to the list so we can access whole the indices
+    def compute_listIndices(self, data_idx):
+        lst_idx = []
+        lst_idx_all = []
+        for i in data_idx:
+            if len(lst_idx) == 0:
+                pass
+                #Do nothing, because it is the first time list is created so is empty
+            else:
+                lst_idx = []
+            # Here we put indices of each class in to the list called lst_idx_all
+            for m in range(len(data_idx[i])):
+                lst_idx.append(data_idx[i][m][0])
+            lst_idx_all.append(lst_idx)
+        return lst_idx_all
+
+    # This function calculates between classes variances
+    def compute_Sb(self, cls, M_i, M_0):
+        Sb = np.zeros((self.dim, self.dim))
+        for i in cls:
+            B = (M_i[i] - M_0).reshape(self.dim, 1)
+            B_trans = B.transpose()
+            Sb += (float(len(cls[i])) / self.datanum) * B.dot(B_trans)
+        return Sb
+
+    # This function calculates within classes variances
+    def compute_Sw(self, cls, M_i):
+        Sw = np.zeros((self.dim, self.dim))
+        for i in cls:
+            N_i = float(len(cls[i]))
+            W_WT = np.zeros((self.dim, self.dim))
+            for xk in cls[i]:
+                W = (xk - M_i[i])
+                W_WT += np.outer(W, W)
+            Sw += (N_i / self.datanum) * ((1. / N_i) * W_WT)
+        return Sw
+
+    # Calculating beta and Bi for Sb
+    def compute_sig_beta_Bi(self, data_idx, M_i, M_0, lst_idx_all):
+        # import pdb
+        # pdb.set_trace()
+        B_i = np.zeros((self.classnum, self.dim))
+        Sig_beta_B_i_all = np.zeros((self.datanum, self.dim))
+        for i in data_idx:
+            # pdb.set_trace()
+            # Calculating Bi
+            B_i[i] = (M_i[i] - M_0).reshape(1, self.dim)
+        for k in range(self.datanum):
+            for i in data_idx:
+                N_i = float(len(data_idx[i]))
+                if k in lst_idx_all[i]:
+                    beta = (float(1) / N_i) - (float(1) / self.datanum)
+                    Sig_beta_B_i_all[k] += float(N_i) / self.datanum * (beta * B_i[i])
+                else:
+                    beta = -(float(1) / self.datanum)
+                    Sig_beta_B_i_all[k] += float(N_i) / self.datanum * (beta * B_i[i])
+        Sig_beta_B_i_all = Sig_beta_B_i_all.transpose()
+        return Sig_beta_B_i_all
+
+
+    # Calculating W_j s separately so we can access all the W_j s anytime
+    def compute_wj(self, data_idx, M_i):
+        W_i = np.zeros((self.datanum, self.dim))
+        for i in data_idx:
+            N_i = float(len(data_idx[i]))
+            for tpl in data_idx[i]:
+                xj = tpl[1]
+                j = tpl[0]
+                W_i[j] = (xj - M_i[i])
+        return W_i
+
+    # Calculating alpha and Wj for Sw
+    def compute_sig_alpha_W(self, data_idx, lst_idx_all, W_i):
+        Sig_alpha_W_i = np.zeros((self.datanum, self.dim))
+        for i in data_idx:
+            N_i = float(len(data_idx[i]))
+            for tpl in data_idx[i]:
+                k = tpl[0]
+                for j in lst_idx_all[i]:
+                    if k == j:
+                        alpha = 1 - (float(1) / N_i)
+                        Sig_alpha_W_i[k] += (alpha * W_i[j])
+                    else:
+                        alpha = 0 - (float(1) / N_i)
+                        Sig_alpha_W_i[k] += (alpha * W_i[j])
+        Sig_alpha_W_i = (1. / self.datanum) * np.transpose(Sig_alpha_W_i)
+        return Sig_alpha_W_i
+
+    # This function calculates log of our prior
+    def lnpdf(self, x):
+        x = x.reshape(self.x_shape)
+        cls = self.compute_cls(x)
+        M_0 = np.mean(x, axis=0)
+        M_i = self.compute_Mi(cls)
+        Sb = self.compute_Sb(cls, M_i, M_0)
+        Sw = self.compute_Sw(cls, M_i)
+        # Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))
+        #Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
+        #Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
+        Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.1)[0]
+        return (-1 / self.sigma2) * np.trace(Sb_inv_N.dot(Sw))
+
+    # This function calculates derivative of the log of prior function
+    def lnpdf_grad(self, x):
+        x = x.reshape(self.x_shape)
+        cls = self.compute_cls(x)
+        M_0 = np.mean(x, axis=0)
+        M_i = self.compute_Mi(cls)
+        Sb = self.compute_Sb(cls, M_i, M_0)
+        Sw = self.compute_Sw(cls, M_i)
+        data_idx = self.compute_indices(x)
+        lst_idx_all = self.compute_listIndices(data_idx)
+        Sig_beta_B_i_all = self.compute_sig_beta_Bi(data_idx, M_i, M_0, lst_idx_all)
+        W_i = self.compute_wj(data_idx, M_i)
+        Sig_alpha_W_i = self.compute_sig_alpha_W(data_idx, lst_idx_all, W_i)
+
+        # Calculating inverse of Sb and its transpose and minus
+        # Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))
+        #Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
+        #Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
+        Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.1)[0]
+        Sb_inv_N_trans = np.transpose(Sb_inv_N)
+        Sb_inv_N_trans_minus = -1 * Sb_inv_N_trans
+        Sw_trans = np.transpose(Sw)
+
+        # Calculating DJ/DXk
+        DJ_Dxk = 2 * (
+            Sb_inv_N_trans_minus.dot(Sw_trans).dot(Sb_inv_N_trans).dot(Sig_beta_B_i_all) + Sb_inv_N_trans.dot(
+                Sig_alpha_W_i))
+        # Calculating derivative of the log of the prior
+        DPx_Dx = ((-1 / self.sigma2) * DJ_Dxk)
+        return DPx_Dx.T
+
+    # def frb(self, x):
+    #     from functools import partial
+    #     from GPy.models import GradientChecker
+    #     f = partial(self.lnpdf)
+    #     df = partial(self.lnpdf_grad)
+    #     grad = GradientChecker(f, df, x, 'X')
+    #     grad.checkgrad(verbose=1)
+
+    def rvs(self, n):
+        return np.random.rand(n)  # A WRONG implementation
+
+    def __str__(self):
+        return 'DGPLVM_prior_Raq'
+
+
+# ******************************************
+
+from .. import Parameterized
+from .. import Param
+class DGPLVM_Lamda(Prior, Parameterized):
+    """
+    Implementation of the Discriminative Gaussian Process Latent Variable model paper, by Raquel.
+
+    :param sigma2: constant
+
+    .. Note:: DGPLVM for Classification paper implementation
+
     """
     domain = _REAL
     # _instances = []
@@ -517,14 +746,18 @@ class DGPLVM(Prior):
     #     cls._instances.append(weakref.ref(o))
     #     return cls._instances[-1]()
 
-    def __init__(self, sigma2, lbl, x_shape):
+    def __init__(self, sigma2, lbl, x_shape, lamda, name='DP_prior'):
+        super(DGPLVM_Lamda, self).__init__(name=name)
         self.sigma2 = sigma2
         # self.x = x
         self.lbl = lbl
+        self.lamda = lamda
         self.classnum = lbl.shape[1]
         self.datanum = lbl.shape[0]
         self.x_shape = x_shape
         self.dim = x_shape[1]
+        self.lamda = Param('lamda', np.diag(lamda))
+        self.link_parameter(self.lamda)
 
     def get_class_label(self, y):
         for idx, v in enumerate(y):
@@ -549,7 +782,8 @@ class DGPLVM(Prior):
         M_i = np.zeros((self.classnum, self.dim))
         for i in cls:
             # Mean of each class
-            M_i[i] = np.mean(cls[i], axis=0)
+            class_i = cls[i]
+            M_i[i] = np.mean(class_i, axis=0)
         return M_i
 
     # Adding data points as tuple to the dictionary so that we can access indices
@@ -654,6 +888,13 @@ class DGPLVM(Prior):
     # This function calculates log of our prior
     def lnpdf(self, x):
         x = x.reshape(self.x_shape)
+	
+	#!!!!!!!!!!!!!!!!!!!!!!!!!!!
+	#self.lamda.values[:] = self.lamda.values/self.lamda.values.sum()	
+
+        xprime = x.dot(np.diagflat(self.lamda))
+        x = xprime
+	# print x
         cls = self.compute_cls(x)
         M_0 = np.mean(x, axis=0)
         M_i = self.compute_Mi(cls)
@@ -661,12 +902,16 @@ class DGPLVM(Prior):
         Sw = self.compute_Sw(cls, M_i)
         # Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))
         #Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
-        Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
+        #Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.5))[0]
+	Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.9)[0]
         return (-1 / self.sigma2) * np.trace(Sb_inv_N.dot(Sw))
 
     # This function calculates derivative of the log of prior function
     def lnpdf_grad(self, x):
         x = x.reshape(self.x_shape)
+        xprime = x.dot(np.diagflat(self.lamda))
+        x = xprime
+	# print x
         cls = self.compute_cls(x)
         M_0 = np.mean(x, axis=0)
         M_i = self.compute_Mi(cls)
@@ -680,8 +925,251 @@ class DGPLVM(Prior):
 
         # Calculating inverse of Sb and its transpose and minus
         # Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))
-        # Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
-        Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
+        #Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
+        #Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.5))[0]
+	Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.9)[0]
+        Sb_inv_N_trans = np.transpose(Sb_inv_N)
+        Sb_inv_N_trans_minus = -1 * Sb_inv_N_trans
+        Sw_trans = np.transpose(Sw)
+
+        # Calculating DJ/DXk
+        DJ_Dxk = 2 * (
+            Sb_inv_N_trans_minus.dot(Sw_trans).dot(Sb_inv_N_trans).dot(Sig_beta_B_i_all) + Sb_inv_N_trans.dot(
+                Sig_alpha_W_i))
+        # Calculating derivative of the log of the prior
+        DPx_Dx = ((-1 / self.sigma2) * DJ_Dxk)
+
+        DPxprim_Dx = np.diagflat(self.lamda).dot(DPx_Dx)
+
+        # Because of the GPy we need to transpose our matrix so that it gets the same shape as out matrix (denominator layout!!!)
+        DPxprim_Dx = DPxprim_Dx.T
+    
+        DPxprim_Dlamda = DPx_Dx.dot(x)
+
+    	# Because of the GPy we need to transpose our matrix so that it gets the same shape as out matrix (denominator layout!!!)
+        DPxprim_Dlamda = DPxprim_Dlamda.T
+
+        self.lamda.gradient = np.diag(DPxprim_Dlamda)
+	# print DPxprim_Dx
+        return DPxprim_Dx
+
+
+    # def frb(self, x):
+    #     from functools import partial
+    #     from GPy.models import GradientChecker
+    #     f = partial(self.lnpdf)
+    #     df = partial(self.lnpdf_grad)
+    #     grad = GradientChecker(f, df, x, 'X')
+    #     grad.checkgrad(verbose=1)
+
+    def rvs(self, n):
+        return np.random.rand(n)  # A WRONG implementation
+
+    def __str__(self):
+        return 'DGPLVM_prior_Raq_Lamda'
+
+# ******************************************
+
+class DGPLVM_T(Prior):
+    """
+    Implementation of the Discriminative Gaussian Process Latent Variable model paper, by Raquel.
+
+    :param sigma2: constant
+
+    .. Note:: DGPLVM for Classification paper implementation
+
+    """
+    domain = _REAL
+    # _instances = []
+    # def __new__(cls, mu, sigma): # Singleton:
+    #     if cls._instances:
+    #         cls._instances[:] = [instance for instance in cls._instances if instance()]
+    #         for instance in cls._instances:
+    #             if instance().mu == mu and instance().sigma == sigma:
+    #                 return instance()
+    #     o = super(Prior, cls).__new__(cls, mu, sigma)
+    #     cls._instances.append(weakref.ref(o))
+    #     return cls._instances[-1]()
+
+    def __init__(self, sigma2, lbl, x_shape, vec):
+        self.sigma2 = sigma2
+        # self.x = x
+        self.lbl = lbl
+        self.classnum = lbl.shape[1]
+        self.datanum = lbl.shape[0]
+        self.x_shape = x_shape
+        self.dim = x_shape[1]
+        self.vec = vec
+
+
+    def get_class_label(self, y):
+        for idx, v in enumerate(y):
+            if v == 1:
+                return idx
+        return -1
+
+    # This function assigns each data point to its own class
+    # and returns the dictionary which contains the class name and parameters.
+    def compute_cls(self, x):
+        cls = {}
+        # Appending each data point to its proper class
+        for j in range(self.datanum):
+            class_label = self.get_class_label(self.lbl[j])
+            if class_label not in cls:
+                cls[class_label] = []
+            cls[class_label].append(x[j])
+        return cls
+
+    # This function computes mean of each class. The mean is calculated through each dimension
+    def compute_Mi(self, cls):
+        M_i = np.zeros((self.classnum, self.dim))
+        for i in cls:
+            # Mean of each class
+	    # class_i = np.multiply(cls[i],vec)
+            class_i = cls[i]
+            M_i[i] = np.mean(class_i, axis=0)
+        return M_i
+
+    # Adding data points as tuple to the dictionary so that we can access indices
+    def compute_indices(self, x):
+        data_idx = {}
+        for j in range(self.datanum):
+            class_label = self.get_class_label(self.lbl[j])
+            if class_label not in data_idx:
+                data_idx[class_label] = []
+            t = (j, x[j])
+            data_idx[class_label].append(t)
+        return data_idx
+
+    # Adding indices to the list so we can access whole the indices
+    def compute_listIndices(self, data_idx):
+        lst_idx = []
+        lst_idx_all = []
+        for i in data_idx:
+            if len(lst_idx) == 0:
+                pass
+                #Do nothing, because it is the first time list is created so is empty
+            else:
+                lst_idx = []
+            # Here we put indices of each class in to the list called lst_idx_all
+            for m in range(len(data_idx[i])):
+                lst_idx.append(data_idx[i][m][0])
+            lst_idx_all.append(lst_idx)
+        return lst_idx_all
+
+    # This function calculates between classes variances
+    def compute_Sb(self, cls, M_i, M_0):
+        Sb = np.zeros((self.dim, self.dim))
+        for i in cls:
+            B = (M_i[i] - M_0).reshape(self.dim, 1)
+            B_trans = B.transpose()
+            Sb += (float(len(cls[i])) / self.datanum) * B.dot(B_trans)
+        return Sb
+
+    # This function calculates within classes variances
+    def compute_Sw(self, cls, M_i):
+        Sw = np.zeros((self.dim, self.dim))
+        for i in cls:
+            N_i = float(len(cls[i]))
+            W_WT = np.zeros((self.dim, self.dim))
+            for xk in cls[i]:
+                W = (xk - M_i[i])
+                W_WT += np.outer(W, W)
+            Sw += (N_i / self.datanum) * ((1. / N_i) * W_WT)
+        return Sw
+
+    # Calculating beta and Bi for Sb
+    def compute_sig_beta_Bi(self, data_idx, M_i, M_0, lst_idx_all):
+        # import pdb
+        # pdb.set_trace()
+        B_i = np.zeros((self.classnum, self.dim))
+        Sig_beta_B_i_all = np.zeros((self.datanum, self.dim))
+        for i in data_idx:
+            # pdb.set_trace()
+            # Calculating Bi
+            B_i[i] = (M_i[i] - M_0).reshape(1, self.dim)
+        for k in range(self.datanum):
+            for i in data_idx:
+                N_i = float(len(data_idx[i]))
+                if k in lst_idx_all[i]:
+                    beta = (float(1) / N_i) - (float(1) / self.datanum)
+                    Sig_beta_B_i_all[k] += float(N_i) / self.datanum * (beta * B_i[i])
+                else:
+                    beta = -(float(1) / self.datanum)
+                    Sig_beta_B_i_all[k] += float(N_i) / self.datanum * (beta * B_i[i])
+        Sig_beta_B_i_all = Sig_beta_B_i_all.transpose()
+        return Sig_beta_B_i_all
+
+
+    # Calculating W_j s separately so we can access all the W_j s anytime
+    def compute_wj(self, data_idx, M_i):
+        W_i = np.zeros((self.datanum, self.dim))
+        for i in data_idx:
+            N_i = float(len(data_idx[i]))
+            for tpl in data_idx[i]:
+                xj = tpl[1]
+                j = tpl[0]
+                W_i[j] = (xj - M_i[i])
+        return W_i
+
+    # Calculating alpha and Wj for Sw
+    def compute_sig_alpha_W(self, data_idx, lst_idx_all, W_i):
+        Sig_alpha_W_i = np.zeros((self.datanum, self.dim))
+        for i in data_idx:
+            N_i = float(len(data_idx[i]))
+            for tpl in data_idx[i]:
+                k = tpl[0]
+                for j in lst_idx_all[i]:
+                    if k == j:
+                        alpha = 1 - (float(1) / N_i)
+                        Sig_alpha_W_i[k] += (alpha * W_i[j])
+                    else:
+                        alpha = 0 - (float(1) / N_i)
+                        Sig_alpha_W_i[k] += (alpha * W_i[j])
+        Sig_alpha_W_i = (1. / self.datanum) * np.transpose(Sig_alpha_W_i)
+        return Sig_alpha_W_i
+
+    # This function calculates log of our prior
+    def lnpdf(self, x):
+        x = x.reshape(self.x_shape)
+        xprim = x.dot(self.vec)
+        x = xprim
+	# print x  
+        cls = self.compute_cls(x)
+        M_0 = np.mean(x, axis=0)
+        M_i = self.compute_Mi(cls)
+        Sb = self.compute_Sb(cls, M_i, M_0)
+        Sw = self.compute_Sw(cls, M_i)
+        # Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))
+        #Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
+	#print 'SB_inv: ', Sb_inv_N
+        #Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
+        Sb_inv_N = pdinv(Sb+np.eye(Sb.shape[0])*0.1)[0]
+        return (-1 / self.sigma2) * np.trace(Sb_inv_N.dot(Sw))
+
+    # This function calculates derivative of the log of prior function
+    def lnpdf_grad(self, x):
+        x = x.reshape(self.x_shape)
+        xprim = x.dot(self.vec)
+        x = xprim 
+	# print x       
+        cls = self.compute_cls(x)
+        M_0 = np.mean(x, axis=0)
+        M_i = self.compute_Mi(cls)
+        Sb = self.compute_Sb(cls, M_i, M_0)
+        Sw = self.compute_Sw(cls, M_i)
+        data_idx = self.compute_indices(x)
+        lst_idx_all = self.compute_listIndices(data_idx)
+        Sig_beta_B_i_all = self.compute_sig_beta_Bi(data_idx, M_i, M_0, lst_idx_all)
+        W_i = self.compute_wj(data_idx, M_i)
+        Sig_alpha_W_i = self.compute_sig_alpha_W(data_idx, lst_idx_all, W_i)
+
+        # Calculating inverse of Sb and its transpose and minus
+        # Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))
+        #Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
+	#print 'SB_inv: ',Sb_inv_N
+        #Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
+        Sb_inv_N = pdinv(Sb+np.eye(Sb.shape[0])*0.1)[0]
         Sb_inv_N_trans = np.transpose(Sb_inv_N)
         Sb_inv_N_trans_minus = -1 * Sb_inv_N_trans
         Sw_trans = np.transpose(Sw)
@@ -706,7 +1194,9 @@ class DGPLVM(Prior):
         return np.random.rand(n)  # A WRONG implementation
 
     def __str__(self):
-        return 'DGPLVM_prior'
+        return 'DGPLVM_prior_Raq_TTT'
+
+
 
 class HalfT(Prior):
     """
diff --git a/GPy/core/parameterization/transformations.py b/GPy/core/parameterization/transformations.py
index 235b9d1c..830809d6 100644
--- a/GPy/core/parameterization/transformations.py
+++ b/GPy/core/parameterization/transformations.py
@@ -3,7 +3,7 @@
 
 
 import numpy as np
-from domains import _POSITIVE,_NEGATIVE, _BOUNDED
+from .domains import _POSITIVE,_NEGATIVE, _BOUNDED
 import weakref
 
 import sys
@@ -31,6 +31,16 @@ class Transformation(object):
         raise NotImplementedError
     def finv(self, model_param):
         raise NotImplementedError
+    def log_jacobian(self, model_param):
+        """
+        compute the log of the jacobian of f, evaluated at f(x)= model_param
+        """
+        raise NotImplementedError
+    def log_jacobian_grad(self, model_param):
+        """
+        compute the drivative of the log of the jacobian of f, evaluated at f(x)= model_param
+        """
+        raise NotImplementedError
     def gradfactor(self, model_param, dL_dmodel_param):
         """ df(opt_param)_dopt_param evaluated at self.f(opt_param)=model_param, times the gradient dL_dmodel_param,
 
@@ -42,6 +52,8 @@ class Transformation(object):
             \frac{\frac{\partial L}{\partial f}\left(\left.\partial f(x)}{\partial x}\right|_{x=f^{-1}(f)\right)}
         """
         raise NotImplementedError
+    def gradfactor_non_natural(self, model_param, dL_dmodel_param):
+        return self.gradfactor(model_param, dL_dmodel_param)
     def initialize(self, f):
         """ produce a sensible initial value for f(x)"""
         raise NotImplementedError
@@ -50,7 +62,7 @@ class Transformation(object):
         import matplotlib.pyplot as plt
         from ...plotting.matplot_dep import base_plots
         x = np.linspace(-8,8)
-        base_plots.meanplot(x, self.f(x),axes=axes*args,**kw)
+        base_plots.meanplot(x, self.f(x), *args, ax=axes, **kw)
         axes = plt.gca()
         axes.set_xlabel(xlabel)
         axes.set_ylabel(ylabel)
@@ -70,15 +82,41 @@ class Logexp(Transformation):
         return np.einsum('i,i->i', df, np.where(f>_lim_val, 1., 1. - np.exp(-f)))
     def initialize(self, f):
         if np.any(f < 0.):
-            print "Warning: changing parameters to satisfy constraints"
+            print("Warning: changing parameters to satisfy constraints")
         return np.abs(f)
+    def log_jacobian(self, model_param):
+        return np.where(model_param>_lim_val, model_param, np.log(np.exp(model_param+1e-20) - 1.)) - model_param
+    def log_jacobian_grad(self, model_param):
+        return 1./(np.exp(model_param)-1.)
+    def __str__(self):
+        return '+ve'
+
+class Exponent(Transformation):
+    domain = _POSITIVE
+    def f(self, x):
+        return np.where(x<_lim_val, np.where(x>-_lim_val, np.exp(x), np.exp(-_lim_val)), np.exp(_lim_val))
+    def finv(self, x):
+        return np.log(x)
+    def gradfactor(self, f, df):
+        return np.einsum('i,i->i', df, f)
+    def initialize(self, f):
+        if np.any(f < 0.):
+            print("Warning: changing parameters to satisfy constraints")
+        return np.abs(f)
+    def log_jacobian(self, model_param):
+        return np.log(model_param)
+    def log_jacobian_grad(self, model_param):
+        return 1./model_param
     def __str__(self):
         return '+ve'
 
 
+
 class NormalTheta(Transformation):
+    "Do not use, not officially supported!"
     _instances = []
-    def __new__(cls, mu_indices, var_indices):
+    def __new__(cls, mu_indices=None, var_indices=None):
+        "Do not use, not officially supported!"
         if cls._instances:
             cls._instances[:] = [instance for instance in cls._instances if instance()]
             for instance in cls._instances:
@@ -98,6 +136,7 @@ class NormalTheta(Transformation):
         # that the values are ok
         # Before:
         theta[self.var_indices] = np.abs(-.5/theta[self.var_indices])
+        #theta[self.var_indices] = np.exp(-.5/theta[self.var_indices])
         theta[self.mu_indices] *= theta[self.var_indices]
         return theta # which is now {mu, var}
 
@@ -106,6 +145,7 @@ class NormalTheta(Transformation):
         varp = muvar[self.var_indices]
         muvar[self.mu_indices] /= varp
         muvar[self.var_indices] = -.5/varp
+        #muvar[self.var_indices] = -.5/np.log(varp)
 
         return muvar # which is now {theta1, theta2}
 
@@ -124,7 +164,7 @@ class NormalTheta(Transformation):
 
     def initialize(self, f):
         if np.any(f[self.var_indices] < 0.):
-            print "Warning: changing parameters to satisfy constraints"
+            print("Warning: changing parameters to satisfy constraints")
             f[self.var_indices] = np.abs(f[self.var_indices])
         return f
 
@@ -139,9 +179,10 @@ class NormalTheta(Transformation):
         self.var_indices = state[1]
 
 class NormalNaturalAntti(NormalTheta):
+    "Do not use, not officially supported!"
     _instances = []
-    _logexp = Logexp()
-    def __new__(cls, mu_indices, var_indices):
+    def __new__(cls, mu_indices=None, var_indices=None):
+        "Do not use, not officially supported!"
         if cls._instances:
             cls._instances[:] = [instance for instance in cls._instances if instance()]
             for instance in cls._instances:
@@ -170,7 +211,7 @@ class NormalNaturalAntti(NormalTheta):
 
     def initialize(self, f):
         if np.any(f[self.var_indices] < 0.):
-            print "Warning: changing parameters to satisfy constraints"
+            print("Warning: changing parameters to satisfy constraints")
             f[self.var_indices] = np.abs(f[self.var_indices])
         return f
 
@@ -178,8 +219,10 @@ class NormalNaturalAntti(NormalTheta):
         return "natantti"
 
 class NormalEta(Transformation):
+    "Do not use, not officially supported!"
     _instances = []
-    def __new__(cls, mu_indices, var_indices):
+    def __new__(cls, mu_indices=None, var_indices=None):
+        "Do not use, not officially supported!"
         if cls._instances:
             cls._instances[:] = [instance for instance in cls._instances if instance()]
             for instance in cls._instances:
@@ -211,7 +254,7 @@ class NormalEta(Transformation):
 
     def initialize(self, f):
         if np.any(f[self.var_indices] < 0.):
-            print "Warning: changing parameters to satisfy constraints"
+            print("Warning: changing parameters to satisfy constraints")
             f[self.var_indices] = np.abs(f[self.var_indices])
         return f
 
@@ -219,8 +262,10 @@ class NormalEta(Transformation):
         return "eta"
 
 class NormalNaturalThroughTheta(NormalTheta):
+    "Do not use, not officially supported!"
     _instances = []
-    def __new__(cls, mu_indices, var_indices):
+    def __new__(cls, mu_indices=None, var_indices=None):
+        "Do not use, not officially supported!"
         if cls._instances:
             cls._instances[:] = [instance for instance in cls._instances if instance()]
             for instance in cls._instances:
@@ -250,13 +295,28 @@ class NormalNaturalThroughTheta(NormalTheta):
         #=======================================================================
         return dmuvar # which is now the gradient multiplicator
 
+    def gradfactor_non_natural(self, muvar, dmuvar):
+        mu = muvar[self.mu_indices]
+        var = muvar[self.var_indices]
+        #=======================================================================
+        # theta gradients
+        # This works and the gradient checks!
+        dmuvar[self.mu_indices] *= var
+        dmuvar[self.var_indices] *= 2*(var)**2
+        dmuvar[self.var_indices] += 2*dmuvar[self.mu_indices]*mu
+        #=======================================================================
+
+        return dmuvar # which is now the gradient multiplicator for {theta1, theta2}
+
     def __str__(self):
         return "natgrad"
 
 
 class NormalNaturalWhooot(NormalTheta):
+    "Do not use, not officially supported!"
     _instances = []
-    def __new__(cls, mu_indices, var_indices):
+    def __new__(cls, mu_indices=None, var_indices=None):
+        "Do not use, not officially supported!"
         if cls._instances:
             cls._instances[:] = [instance for instance in cls._instances if instance()]
             for instance in cls._instances:
@@ -290,8 +350,10 @@ class NormalNaturalWhooot(NormalTheta):
         return "natgrad"
 
 class NormalNaturalThroughEta(NormalEta):
+    "Do not use, not officially supported!"
     _instances = []
-    def __new__(cls, mu_indices, var_indices):
+    def __new__(cls, mu_indices=None, var_indices=None):
+        "Do not use, not officially supported!"
         if cls._instances:
             cls._instances[:] = [instance for instance in cls._instances if instance()]
             for instance in cls._instances:
@@ -332,7 +394,7 @@ class LogexpNeg(Transformation):
         return np.einsum('i,i->i', df, np.where(f>_lim_val, -1, -1 + np.exp(-f)))
     def initialize(self, f):
         if np.any(f < 0.):
-            print "Warning: changing parameters to satisfy constraints"
+            print("Warning: changing parameters to satisfy constraints")
         return np.abs(f)
     def __str__(self):
         return '+ve'
@@ -384,27 +446,11 @@ class LogexpClipped(Logexp):
         return np.einsum('i,i->i', df, gf) # np.where(f < self.lower, 0, gf)
     def initialize(self, f):
         if np.any(f < 0.):
-            print "Warning: changing parameters to satisfy constraints"
+            print("Warning: changing parameters to satisfy constraints")
         return np.abs(f)
     def __str__(self):
         return '+ve_c'
 
-class Exponent(Transformation):
-    # TODO: can't allow this to go to zero, need to set a lower bound. Similar with negative Exponent below. See old MATLAB code.
-    domain = _POSITIVE
-    def f(self, x):
-        return np.where(x<_lim_val, np.where(x>-_lim_val, np.exp(x), np.exp(-_lim_val)), np.exp(_lim_val))
-    def finv(self, x):
-        return np.log(x)
-    def gradfactor(self, f, df):
-        return np.einsum('i,i->i', df, f)
-    def initialize(self, f):
-        if np.any(f < 0.):
-            print "Warning: changing parameters to satisfy constraints"
-        return np.abs(f)
-    def __str__(self):
-        return '+ve'
-
 class NegativeExponent(Exponent):
     domain = _NEGATIVE
     def f(self, x):
@@ -440,7 +486,11 @@ class Logistic(Transformation):
             for instance in cls._instances:
                 if instance().lower == lower and instance().upper == upper:
                     return instance()
-        o = super(Transformation, cls).__new__(cls, lower, upper, *args, **kwargs)
+        newfunc = super(Transformation, cls).__new__
+        if newfunc is object.__new__:
+            o = newfunc(cls)  
+        else:
+            o = newfunc(cls, lower, upper, *args, **kwargs)
         cls._instances.append(weakref.ref(o))
         return cls._instances[-1]()
     def __init__(self, lower, upper):
@@ -458,7 +508,7 @@ class Logistic(Transformation):
         return np.einsum('i,i->i', df, (f - self.lower) * (self.upper - f) / self.difference)
     def initialize(self, f):
         if np.any(np.logical_or(f < self.lower, f > self.upper)):
-            print "Warning: changing parameters to satisfy constraints"
+            print("Warning: changing parameters to satisfy constraints")
         #return np.where(np.logical_or(f < self.lower, f > self.upper), self.f(f * 0.), f)
         #FIXME: Max, zeros_like right?
         return np.where(np.logical_or(f < self.lower, f > self.upper), self.f(np.zeros_like(f)), f)
diff --git a/GPy/inference/mcmc/__init__.py b/GPy/inference/mcmc/__init__.py
index 956448d4..b30b6ff0 100644
--- a/GPy/inference/mcmc/__init__.py
+++ b/GPy/inference/mcmc/__init__.py
@@ -1 +1,2 @@
 from hmc import HMC
+from samplers import *
diff --git a/GPy/inference/mcmc/samplers.py b/GPy/inference/mcmc/samplers.py
index 444d99d7..c2367359 100644
--- a/GPy/inference/mcmc/samplers.py
+++ b/GPy/inference/mcmc/samplers.py
@@ -4,23 +4,18 @@
 
 import numpy as np
 from scipy import linalg, optimize
-import Tango
 import sys
-import re
-import numdifftools as ndt
-import pdb
-import cPickle
 
 
 class Metropolis_Hastings:
     def __init__(self,model,cov=None):
         """Metropolis Hastings, with tunings according to Gelman et al. """
         self.model = model
-        current = self.model._get_params_transformed()
+        current = self.model.optimizer_array
         self.D = current.size
         self.chains = []
         if cov is None:
-            self.cov = model.Laplace_covariance()
+            self.cov = np.eye(self.D)
         else:
             self.cov = cov
         self.scale = 2.4/np.sqrt(self.D)
@@ -31,20 +26,20 @@ class Metropolis_Hastings:
         if start is None:
             self.model.randomize()
         else:
-            self.model._set_params_transformed(start)
+            self.model.optimizer_array = start
 
-
-
-    def sample(self, Ntotal, Nburn, Nthin, tune=True, tune_throughout=False, tune_interval=400):
-        current = self.model._get_params_transformed()
-        fcurrent = self.model.log_likelihood() + self.model.log_prior()
+    def sample(self, Ntotal=10000, Nburn=1000, Nthin=10, tune=True, tune_throughout=False, tune_interval=400):
+        current = self.model.optimizer_array
+        fcurrent = self.model.log_likelihood() + self.model.log_prior() + \
+                   self.model._log_det_jacobian()
         accepted = np.zeros(Ntotal,dtype=np.bool)
         for it in range(Ntotal):
             print "sample %d of %d\r"%(it,Ntotal),
             sys.stdout.flush()
             prop = np.random.multivariate_normal(current, self.cov*self.scale*self.scale)
-            self.model._set_params_transformed(prop)
-            fprop = self.model.log_likelihood() + self.model.log_prior()
+            self.model.optimizer_array = prop
+            fprop = self.model.log_likelihood() + self.model.log_prior() + \
+                    self.model._log_det_jacobian()
 
             if fprop>fcurrent:#sample accepted, going 'uphill'
                 accepted[it] = True
@@ -72,10 +67,11 @@ class Metropolis_Hastings:
 
     def predict(self,function,args):
         """Make a prediction for the function, to which we will pass the additional arguments"""
-        param = self.model._get_params()
+        param = self.model.param_array
         fs = []
         for p in self.chain:
-            self.model._set_params(p)
+            self.model.param_array = p
             fs.append(function(*args))
-        self.model._set_params(param)# reset model to starting state
+        # reset model to starting state
+        self.model.param_array = param
         return fs
diff --git a/GPy/testing/rv_transformation_tests.py b/GPy/testing/rv_transformation_tests.py
new file mode 100644
index 00000000..e6b3e3e7
--- /dev/null
+++ b/GPy/testing/rv_transformation_tests.py
@@ -0,0 +1,101 @@
+# Written by Ilias Bilionis
+"""
+Test if hyperparameters in models are properly transformed.
+"""
+
+
+import unittest
+import numpy as np
+import scipy.stats as st
+import GPy
+
+
+class TestModel(GPy.core.Model):
+    """
+    A simple GPy model with one parameter.
+    """
+    def __init__(self):
+        GPy.core.Model.__init__(self, 'test_model')
+        theta = GPy.core.Param('theta', 1.)
+        self.link_parameter(theta)
+
+    def log_likelihood(self):
+        return 0.
+
+
+class RVTransformationTestCase(unittest.TestCase):
+
+    def _test_trans(self, trans):
+        m = TestModel()
+        prior = GPy.priors.LogGaussian(.5, 0.1)
+        m.theta.set_prior(prior)
+        m.theta.unconstrain()
+        m.theta.constrain(trans)
+        # The PDF of the transformed variables
+        p_phi = lambda(phi): np.exp(-m._objective_grads(phi)[0])
+        # To the empirical PDF of:
+        theta_s = prior.rvs(100000)
+        phi_s = trans.finv(theta_s)
+        # which is essentially a kernel density estimation
+        kde = st.gaussian_kde(phi_s)
+        # We will compare the PDF here:
+        phi = np.linspace(phi_s.min(), phi_s.max(), 100)
+        # The transformed PDF of phi should be this:
+        pdf_phi = np.array([p_phi(p) for p in phi])
+        # UNCOMMENT TO SEE GRAPHICAL COMPARISON
+        #import matplotlib.pyplot as plt
+        #fig, ax = plt.subplots()
+        #ax.hist(phi_s, normed=True, bins=100, alpha=0.25, label='Histogram')
+        #ax.plot(phi, kde(phi), '--', linewidth=2, label='Kernel Density Estimation')
+        #ax.plot(phi, pdf_phi, ':', linewidth=2, label='Transformed PDF')
+        #ax.set_xlabel(r'transformed $\theta$', fontsize=16)
+        #ax.set_ylabel('PDF', fontsize=16)
+        #plt.legend(loc='best')
+        #plt.show(block=True)
+        # END OF PLOT
+        # The following test cannot be very accurate
+        self.assertTrue(np.linalg.norm(pdf_phi - kde(phi)) / np.linalg.norm(kde(phi)) <= 1e-1)
+        # Check the gradients at a few random points
+        for i in xrange(10):
+            m.theta = theta_s[i]
+            self.assertTrue(m.checkgrad(verbose=True))
+
+    def test_Logexp(self):
+        self._test_trans(GPy.constraints.Logexp())
+        self._test_trans(GPy.constraints.Exponent())
+
+
+if __name__ == '__main__':
+    unittest.main()
+    quit()
+    m = TestModel()
+    prior = GPy.priors.LogGaussian(0., .9)
+    m.theta.set_prior(prior)
+
+    # The following should return the PDF in terms of the transformed quantities
+    p_phi = lambda(phi): np.exp(-m._objective_grads(phi)[0])
+
+    # Let's look at the transformation phi = log(exp(theta - 1))
+    trans = GPy.constraints.Exponent()
+    m.theta.constrain(trans)
+    # Plot the transformed probability density
+    phi = np.linspace(-8, 8, 100)
+    fig, ax = plt.subplots()
+    # Let's draw some samples of theta and transform them so that we see
+    # which one is right
+    theta_s = prior.rvs(10000)
+    # Transform it to the new variables
+    phi_s = trans.finv(theta_s)
+    # And draw their histogram
+    ax.hist(phi_s, normed=True, bins=100, alpha=0.25, label='Empirical')
+    # This is to be compared to the PDF of the model expressed in terms of these new
+    # variables
+    ax.plot(phi, [p_phi(p) for p in phi], label='Transformed PDF', linewidth=2)
+    ax.set_xlim(-3, 10)
+    ax.set_xlabel(r'transformed $\theta$', fontsize=16)
+    ax.set_ylabel('PDF', fontsize=16)
+    plt.legend(loc='best')
+    # Now let's test the gradients
+    m.checkgrad(verbose=True)
+    # And show the plot
+    plt.show(block=True)