From 06336bf0d461bd90c3ca845e24803abeb4950f18 Mon Sep 17 00:00:00 2001 From: Zhenwen Dai Date: Tue, 6 May 2014 16:30:17 +0100 Subject: [PATCH 1/8] switch psi2 statistics design --- GPy/core/sparse_gp.py | 2 +- .../var_dtc_parallel.py | 28 +-- GPy/kern/_src/add.py | 2 +- GPy/kern/_src/kern.py | 2 +- GPy/kern/_src/kernel_slice_operations.py | 4 +- GPy/kern/_src/linear.py | 2 +- GPy/kern/_src/psi_comp/ssrbf_psi_comp.py | 192 ++++++++++++++---- GPy/kern/_src/rbf.py | 120 ++++++----- GPy/kern/_src/static.py | 2 +- GPy/models/ss_gplvm.py | 11 +- GPy/util/initialization.py | 2 +- GPy/util/linalg.py | 2 +- 12 files changed, 245 insertions(+), 124 deletions(-) diff --git a/GPy/core/sparse_gp.py b/GPy/core/sparse_gp.py index 7552b8ac..333fd526 100644 --- a/GPy/core/sparse_gp.py +++ b/GPy/core/sparse_gp.py @@ -66,7 +66,7 @@ class SparseGP(GP): #gradients wrt Z self.Z.gradient = self.kern.gradients_X(dL_dKmm, self.Z) self.Z.gradient += self.kern.gradients_Z_expectations( - self.grad_dict['dL_dpsi1'], self.grad_dict['dL_dpsi2'], Z=self.Z, variational_posterior=self.X) + self.grad_dict['dL_dpsi0'], self.grad_dict['dL_dpsi1'], self.grad_dict['dL_dpsi2'], Z=self.Z, variational_posterior=self.X) else: #gradients wrt kernel self.kern.update_gradients_diag(self.grad_dict['dL_dKdiag'], self.X) diff --git a/GPy/inference/latent_function_inference/var_dtc_parallel.py b/GPy/inference/latent_function_inference/var_dtc_parallel.py index 87236e2a..7c347213 100644 --- a/GPy/inference/latent_function_inference/var_dtc_parallel.py +++ b/GPy/inference/latent_function_inference/var_dtc_parallel.py @@ -70,12 +70,13 @@ class VarDTC_minibatch(object): #see whether we've got a different noise variance for each datum beta = 1./np.fmax(likelihood.variance, 1e-6) het_noise = beta.size > 1 + if het_noise: + self.batchsize = 1 # VVT_factor is a matrix such that tdot(VVT_factor) = VVT...this is for efficiency! #self.YYTfactor = beta*self.get_YYTfactor(Y) YYT_factor = Y trYYT = self.get_trYYT(Y) - psi2_full = np.zeros((num_inducing,num_inducing)) psi1Y_full = np.zeros((output_dim,num_inducing)) # DxM psi0_full = 0 @@ -104,19 +105,18 @@ class VarDTC_minibatch(object): YRY_full += (beta_slice*np.square(Y_slice).sum(axis=-1)).sum() else: psi0_full += psi0.sum() - psi1Y_full += np.dot(Y_slice.T,psi1) # DxM - + psi1Y_full += np.dot(Y_slice.T,psi1) # DxM if uncertain_inputs: if het_noise: - psi2_full += np.einsum('n,nmo->mo',beta_slice,psi2) + psi2_full += beta_slice*psi2 else: - psi2_full += psi2.sum(axis=0) + psi2_full += psi2 else: if het_noise: - psi2_full += np.einsum('n,nm,no->mo',beta_slice,psi1,psi1) + psi2_full += beta_slice*np.outer(psi1,psi1) else: - psi2_full += tdot(psi1.T) + psi2_full += np.outer(psi1,psi1) if not het_noise: psi0_full *= beta @@ -223,7 +223,7 @@ class VarDTC_minibatch(object): psi2 = None if het_noise: - beta = beta[n_start:n_end] + beta = beta[n_start] # assuming batchsize==1 betaY = beta*Y_slice betapsi1 = np.einsum('n,nm->nm',beta,psi1) @@ -244,7 +244,7 @@ class VarDTC_minibatch(object): dL_dpsi1 = np.dot(betaY,v.T) if uncertain_inputs: - dL_dpsi2 = np.einsum('n,mo->nmo',beta * np.ones((n_end-n_start,)),dL_dpsi2R) + dL_dpsi2 = beta* dL_dpsi2R else: dL_dpsi1 += np.dot(betapsi1,dL_dpsi2R)*2. dL_dpsi2 = None @@ -262,11 +262,11 @@ class VarDTC_minibatch(object): dL_dthetaL = ((np.square(betaY)).sum(axis=-1) + np.square(beta)*(output_dim*psi0)-output_dim*beta)/2. 
- np.square(beta)*psiR- (betaY*np.dot(betapsi1,v)).sum(axis=-1)
         else:
             if uncertain_inputs:
-                psiR = np.einsum('mo,nmo->',dL_dpsi2R,psi2)
+                psiR = np.einsum('mo,mo->',dL_dpsi2R,psi2)
             else:
                 psiR = np.einsum('nm,no,mo->',psi1,psi1,dL_dpsi2R)
-            dL_dthetaL = ((np.square(betaY)).sum() + np.square(beta)*output_dim*(psi0.sum())-num_slice*output_dim*beta)/2. - np.square(beta)*psiR- (betaY*np.dot(betapsi1,v)).sum()
+            dL_dthetaL = ((np.square(betaY)).sum() + beta*beta*output_dim*(psi0.sum())-num_slice*output_dim*beta)/2. - beta*beta*psiR- (betaY*np.dot(betapsi1,v)).sum()

         if uncertain_inputs:
             grad_dict = {'dL_dpsi0':dL_dpsi0,
@@ -296,7 +296,7 @@ def update_gradients(model):
     kern_grad = model.kern.gradient.copy()

     #gradients w.r.t. Z
-    model.Z.gradient[:,model.kern.active_dims] = model.kern.gradients_X(dL_dKmm, model.Z)
+    model.Z.gradient = model.kern.gradients_X(dL_dKmm, model.Z)

     isEnd = False
     while not isEnd:
@@ -309,8 +309,8 @@ def update_gradients(model):
         kern_grad += model.kern.gradient

         #gradients w.r.t. Z
-        model.Z.gradient[:,model.kern.active_dims] += model.kern.gradients_Z_expectations(
-            grad_dict['dL_dpsi1'], grad_dict['dL_dpsi2'], Z=model.Z, variational_posterior=X_slice)
+        model.Z.gradient += model.kern.gradients_Z_expectations(
+            dL_dpsi0=grad_dict['dL_dpsi0'], dL_dpsi1=grad_dict['dL_dpsi1'], dL_dpsi2=grad_dict['dL_dpsi2'], Z=model.Z, variational_posterior=X_slice)

         #gradients w.r.t. posterior parameters of X
         X_grad = model.kern.gradients_qX_expectations(variational_posterior=X_slice, Z=model.Z, dL_dpsi0=grad_dict['dL_dpsi0'], dL_dpsi1=grad_dict['dL_dpsi1'], dL_dpsi2=grad_dict['dL_dpsi2'])
diff --git a/GPy/kern/_src/add.py b/GPy/kern/_src/add.py
index 88b8e40c..9589d6be 100644
--- a/GPy/kern/_src/add.py
+++ b/GPy/kern/_src/add.py
@@ -119,7 +119,7 @@ class Add(CombinationKernel):
             eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.psi1(Z, variational_posterior) * 2.
             p1.update_gradients_expectations(dL_dpsi0, eff_dL_dpsi1, dL_dpsi2, Z, variational_posterior)

-    def gradients_Z_expectations(self, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
+    def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
         from static import White, Bias
         target = np.zeros(Z.shape)
         for p1 in self.parts:
diff --git a/GPy/kern/_src/kern.py b/GPy/kern/_src/kern.py
index 70bd42b9..09eb6dd9 100644
--- a/GPy/kern/_src/kern.py
+++ b/GPy/kern/_src/kern.py
@@ -115,7 +115,7 @@ class Kern(Parameterized):
         """
         raise NotImplementedError

-    def gradients_Z_expectations(self, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
+    def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
         """
         Returns the derivative of the objective wrt Z, using the chain rule
         through the expectation variables.
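The contract being changed here: gradients_Z_expectations must return dL/dZ (an MxQ array) assembled by the chain rule through the psi statistics, and this patch threads the extra dL_dpsi0 argument through every overload of it. Below is a minimal, self-contained sketch of that contract, using a deliberately simple toy statistic psi1 = X Z^T (not GPy's actual kernel expectation), together with a finite-difference check:

    # Toy illustration of the gradients_Z_expectations contract (not GPy code).
    import numpy as np

    def toy_psi1(Z, X):
        # psi1[n, m] = <x_n, z_m>, a stand-in for the real expectation of k(x_n, z_m)
        return X.dot(Z.T)                              # N x M

    def toy_gradients_Z(dL_dpsi1, Z, X):
        # chain rule: dL/dZ[m, q] = sum_n dL/dpsi1[n, m] * dpsi1[n, m]/dZ[m, q]
        return dL_dpsi1.T.dot(X)                       # M x Q

    rng = np.random.RandomState(0)
    X, Z = rng.randn(5, 3), rng.randn(4, 3)
    dL_dpsi1 = rng.randn(5, 4)                         # pretend upstream gradient
    L = lambda Z_: np.sum(dL_dpsi1 * toy_psi1(Z_, X))  # linearised objective
    eps, g_fd = 1e-6, np.zeros_like(Z)
    for i in range(Z.shape[0]):
        for j in range(Z.shape[1]):
            Zp = Z.copy(); Zp[i, j] += eps
            g_fd[i, j] = (L(Zp) - L(Z)) / eps
    assert np.allclose(g_fd, toy_gradients_Z(dL_dpsi1, Z, X), atol=1e-4)

The same pattern extends to dL_dpsi0 and dL_dpsi2; the real SSRBF derivatives are the einsum expressions in psiDerivativecomputations below.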
diff --git a/GPy/kern/_src/kernel_slice_operations.py b/GPy/kern/_src/kernel_slice_operations.py index c1c8d7f1..3473ffce 100644 --- a/GPy/kern/_src/kernel_slice_operations.py +++ b/GPy/kern/_src/kernel_slice_operations.py @@ -124,9 +124,9 @@ def _slice_update_gradients_expectations(f): def _slice_gradients_Z_expectations(f): @wraps(f) - def wrap(self, dL_dpsi1, dL_dpsi2, Z, variational_posterior): + def wrap(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): with _Slice_wrap(self, Z, variational_posterior) as s: - ret = s.handle_return_array(f(self, dL_dpsi1, dL_dpsi2, s.X, s.X2)) + ret = s.handle_return_array(f(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, s.X, s.X2)) return ret return wrap diff --git a/GPy/kern/_src/linear.py b/GPy/kern/_src/linear.py index f9dacf02..007649b0 100644 --- a/GPy/kern/_src/linear.py +++ b/GPy/kern/_src/linear.py @@ -168,7 +168,7 @@ class Linear(Kern): else: self.variances.gradient += 2.*np.sum(dL_dpsi2 * self.psi2(Z, variational_posterior))/self.variances - def gradients_Z_expectations(self, dL_dpsi1, dL_dpsi2, Z, variational_posterior): + def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): if isinstance(variational_posterior, variational.SpikeAndSlabPosterior): gamma = variational_posterior.binary_prob mu = variational_posterior.mean diff --git a/GPy/kern/_src/psi_comp/ssrbf_psi_comp.py b/GPy/kern/_src/psi_comp/ssrbf_psi_comp.py index d8414cfb..5e30b5c5 100644 --- a/GPy/kern/_src/psi_comp/ssrbf_psi_comp.py +++ b/GPy/kern/_src/psi_comp/ssrbf_psi_comp.py @@ -9,12 +9,23 @@ import numpy as np from GPy.util.caching import Cache_this @Cache_this(limit=1) -def _Z_distances(Z): - Zhat = 0.5 * (Z[:, None, :] + Z[None, :, :]) # M,M,Q - Zdist = 0.5 * (Z[:, None, :] - Z[None, :, :]) # M,M,Q - return Zhat, Zdist +def psicomputations(variance, lengthscale, Z, mu, S, gamma): + """ + Z - MxQ + mu - NxQ + S - NxQ + gamma - NxQ + """ + # here are the "statistics" for psi0, psi1 and psi2 + # Produced intermediate results: + # _psi1 NxM + + psi0 = np.empty(mu.shape[0]) + psi0[:] = variance + psi1 = _psi1computations(variance, lengthscale, Z, mu, S, gamma) + psi2 = _psi2computations(variance, lengthscale, Z, mu, S, gamma) + return psi0, psi1, psi2 -@Cache_this(limit=1) def _psi1computations(variance, lengthscale, Z, mu, S, gamma): """ Z - MxQ @@ -22,15 +33,10 @@ def _psi1computations(variance, lengthscale, Z, mu, S, gamma): S - NxQ gamma - NxQ """ - # here are the "statistics" for psi1 and psi2 + # here are the "statistics" for psi1 # Produced intermediate results: # _psi1 NxM - # _dpsi1_dvariance NxM - # _dpsi1_dlengthscale NxMxQ - # _dpsi1_dZ NxMxQ - # _dpsi1_dgamma NxMxQ - # _dpsi1_dmu NxMxQ - # _dpsi1_dS NxMxQ + lengthscale2 = np.square(lengthscale) @@ -40,25 +46,15 @@ def _psi1computations(variance, lengthscale, Z, mu, S, gamma): _psi1_dist = Z[None, :, :] - mu[:, None, :] # NxMxQ _psi1_dist_sq = np.square(_psi1_dist) / (lengthscale2 * _psi1_denom) # NxMxQ _psi1_common = gamma[:,None,:] / (lengthscale2*_psi1_denom*_psi1_denom_sqrt) #Nx1xQ - _psi1_exponent1 = np.log(gamma[:,None,:]) -0.5 * (_psi1_dist_sq + np.log(_psi1_denom)) # NxMxQ - _psi1_exponent2 = np.log(1.-gamma[:,None,:]) -0.5 * (np.square(Z[None,:,:])/lengthscale2) # NxMxQ + _psi1_exponent1 = np.log(gamma[:,None,:]) - (_psi1_dist_sq + np.log(_psi1_denom))/2. # NxMxQ + _psi1_exponent2 = np.log(1.-gamma[:,None,:]) - (np.square(Z[None,:,:])/lengthscale2)/2. 
# NxMxQ _psi1_exponent_max = np.maximum(_psi1_exponent1,_psi1_exponent2) _psi1_exponent = _psi1_exponent_max+np.log(np.exp(_psi1_exponent1-_psi1_exponent_max) + np.exp(_psi1_exponent2-_psi1_exponent_max)) #NxMxQ _psi1_exp_sum = _psi1_exponent.sum(axis=-1) #NxM - _psi1_exp_dist_sq = np.exp(-0.5*_psi1_dist_sq) # NxMxQ - _psi1_exp_Z = np.exp(-0.5*np.square(Z[None,:,:])/lengthscale2) # 1xMxQ - _psi1_q = variance * np.exp(_psi1_exp_sum[:,:,None] - _psi1_exponent) # NxMxQ _psi1 = variance * np.exp(_psi1_exp_sum) # NxM - _dpsi1_dvariance = _psi1 / variance # NxM - _dpsi1_dgamma = _psi1_q * (_psi1_exp_dist_sq/_psi1_denom_sqrt-_psi1_exp_Z) # NxMxQ - _dpsi1_dmu = _psi1_q * (_psi1_exp_dist_sq * _psi1_dist * _psi1_common) # NxMxQ - _dpsi1_dS = _psi1_q * (_psi1_exp_dist_sq * _psi1_common * 0.5 * (_psi1_dist_sq - 1.)) # NxMxQ - _dpsi1_dZ = _psi1_q * (- _psi1_common * _psi1_dist * _psi1_exp_dist_sq - (1-gamma[:,None,:])/lengthscale2*Z[None,:,:]*_psi1_exp_Z) # NxMxQ - _dpsi1_dlengthscale = 2.*lengthscale*_psi1_q * (0.5*_psi1_common*(S[:,None,:]/lengthscale2+_psi1_dist_sq)*_psi1_exp_dist_sq + 0.5*(1-gamma[:,None,:])*np.square(Z[None,:,:]/lengthscale2)*_psi1_exp_Z) # NxMxQ - return _psi1, _dpsi1_dvariance, _dpsi1_dgamma, _dpsi1_dmu, _dpsi1_dS, _dpsi1_dZ, _dpsi1_dlengthscale + return _psi1 -@Cache_this(limit=1) def _psi2computations(variance, lengthscale, Z, mu, S, gamma): """ Z - MxQ @@ -66,19 +62,14 @@ def _psi2computations(variance, lengthscale, Z, mu, S, gamma): S - NxQ gamma - NxQ """ - # here are the "statistics" for psi1 and psi2 + # here are the "statistics" for psi2 # Produced intermediate results: - # _psi2 NxMxM - # _psi2_dvariance NxMxM - # _psi2_dlengthscale NxMxMxQ - # _psi2_dZ NxMxMxQ - # _psi2_dgamma NxMxMxQ - # _psi2_dmu NxMxMxQ - # _psi2_dS NxMxMxQ + # _psi2 MxM lengthscale2 = np.square(lengthscale) - _psi2_Zhat, _psi2_Zdist = _Z_distances(Z) + _psi2_Zhat = 0.5 * (Z[:, None, :] + Z[None, :, :]) # M,M,Q + _psi2_Zdist = 0.5 * (Z[:, None, :] - Z[None, :, :]) # M,M,Q _psi2_Zdist_sq = np.square(_psi2_Zdist / lengthscale) # M,M,Q _psi2_Z_sq_sum = (np.square(Z[:,None,:])+np.square(Z[None,:,:]))/lengthscale2 # MxMxQ @@ -93,15 +84,130 @@ def _psi2computations(variance, lengthscale, Z, mu, S, gamma): _psi2_exponent_max = np.maximum(_psi2_exponent1, _psi2_exponent2) _psi2_exponent = _psi2_exponent_max+np.log(np.exp(_psi2_exponent1-_psi2_exponent_max) + np.exp(_psi2_exponent2-_psi2_exponent_max)) _psi2_exp_sum = _psi2_exponent.sum(axis=-1) #NxM - _psi2_q = np.square(variance) * np.exp(_psi2_exp_sum[:,:,:,None]-_psi2_exponent) # NxMxMxQ + _psi2 = variance*variance * (np.exp(_psi2_exp_sum).sum(axis=0)) # MxM + + return _psi2 + +def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior): + ARD = (len(lengthscale)!=1) + + dvar_psi1, dl_psi1, dZ_psi1, dmu_psi1, dS_psi1, dgamma_psi1 = _psi1compDer(dL_dpsi1, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob) + dvar_psi2, dl_psi2, dZ_psi2, dmu_psi2, dS_psi2, dgamma_psi2 = _psi2compDer(dL_dpsi2, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob) + + dL_dvar = np.sum(dL_dpsi0) + dvar_psi1 + dvar_psi2 + + dL_dlengscale = dl_psi1 + dl_psi2 + if not ARD: + dL_dlengscale = dL_dlengscale.sum() + + dL_dgamma = dgamma_psi1 + dgamma_psi2 + dL_dmu = dmu_psi1 + dmu_psi2 + dL_dS = dS_psi1 + dS_psi2 + dL_dZ = dZ_psi1 + dZ_psi2 + + return dL_dvar, dL_dlengscale, dL_dZ, dL_dmu, dL_dS, 
dL_dgamma + +def _psi1compDer(dL_dpsi1, variance, lengthscale, Z, mu, S, gamma): + """ + dL_dpsi1 - NxM + Z - MxQ + mu - NxQ + S - NxQ + gamma - NxQ + """ + # here are the "statistics" for psi1 + # Produced intermediate results: dL_dparams w.r.t. psi1 + # _dL_dvariance 1 + # _dL_dlengthscale Q + # _dL_dZ MxQ + # _dL_dgamma NxQ + # _dL_dmu NxQ + # _dL_dS NxQ + + lengthscale2 = np.square(lengthscale) + + # psi1 + _psi1_denom = S / lengthscale2 + 1. # NxQ + _psi1_denom_sqrt = np.sqrt(_psi1_denom) #NxQ + _psi1_dist = Z[None, :, :] - mu[:, None, :] # NxMxQ + _psi1_dist_sq = np.square(_psi1_dist) / (lengthscale2 * _psi1_denom[:,None,:]) # NxMxQ + _psi1_common = gamma / (lengthscale2*_psi1_denom*_psi1_denom_sqrt) #NxQ + _psi1_exponent1 = np.log(gamma[:,None,:]) -0.5 * (_psi1_dist_sq + np.log(_psi1_denom[:, None,:])) # NxMxQ + _psi1_exponent2 = np.log(1.-gamma[:,None,:]) -0.5 * (np.square(Z[None,:,:])/lengthscale2) # NxMxQ + _psi1_exponent_max = np.maximum(_psi1_exponent1,_psi1_exponent2) + _psi1_exponent = _psi1_exponent_max+np.log(np.exp(_psi1_exponent1-_psi1_exponent_max) + np.exp(_psi1_exponent2-_psi1_exponent_max)) #NxMxQ + _psi1_exp_sum = _psi1_exponent.sum(axis=-1) #NxM + _psi1_exp_dist_sq = np.exp(-0.5*_psi1_dist_sq) # NxMxQ + _psi1_exp_Z = np.exp(-0.5*np.square(Z[None,:,:])/lengthscale2) # 1xMxQ + _psi1_q = variance * np.exp(_psi1_exp_sum[:,:,None] - _psi1_exponent) # NxMxQ + _psi1 = variance * np.exp(_psi1_exp_sum) # NxM + _dL_dvariance = np.einsum('nm,nm->',dL_dpsi1, _psi1)/variance # 1 + _dL_dgamma = np.einsum('nm,nmq,nmq->nq',dL_dpsi1, _psi1_q, (_psi1_exp_dist_sq/_psi1_denom_sqrt[:,None,:]-_psi1_exp_Z)) # NxQ + _dL_dmu = np.einsum('nm, nmq, nmq, nmq, nq->nq',dL_dpsi1,_psi1_q,_psi1_exp_dist_sq,_psi1_dist,_psi1_common) # NxQ + _dL_dS = np.einsum('nm,nmq,nmq,nq,nmq->nq',dL_dpsi1,_psi1_q,_psi1_exp_dist_sq,_psi1_common,(_psi1_dist_sq-1.))/2. # NxQ + _dL_dZ = np.einsum('nm,nmq,nmq->mq',dL_dpsi1,_psi1_q, (- _psi1_common[:,None,:] * _psi1_dist * _psi1_exp_dist_sq - (1-gamma[:,None,:])/lengthscale2*Z[None,:,:]*_psi1_exp_Z)) + _dL_dlengthscale = lengthscale* np.einsum('nm,nmq,nmq->q',dL_dpsi1,_psi1_q,(_psi1_common[:,None,:]*(S[:,None,:]/lengthscale2+_psi1_dist_sq)*_psi1_exp_dist_sq + (1-gamma[:,None,:])*np.square(Z[None,:,:]/lengthscale2)*_psi1_exp_Z)) + +# _dpsi1_dmu = _psi1_q * (_psi1_exp_dist_sq * _psi1_dist * _psi1_common) # NxMxQ +# _dpsi1_dS = _psi1_q * (_psi1_exp_dist_sq * _psi1_common * 0.5 * (_psi1_dist_sq - 1.)) # NxMxQ +# _dpsi1_dZ = _psi1_q * (- _psi1_common * _psi1_dist * _psi1_exp_dist_sq - (1-gamma[:,None,:])/lengthscale2*Z[None,:,:]*_psi1_exp_Z) # NxMxQ +# _dpsi1_dlengthscale = 2.*lengthscale*_psi1_q * (0.5*_psi1_common*(S[:,None,:]/lengthscale2+_psi1_dist_sq)*_psi1_exp_dist_sq + 0.5*(1-gamma[:,None,:])*np.square(Z[None,:,:]/lengthscale2)*_psi1_exp_Z) # NxMxQ + + return _dL_dvariance, _dL_dlengthscale, _dL_dZ, _dL_dmu, _dL_dS, _dL_dgamma + +def _psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S, gamma): + """ + Z - MxQ + mu - NxQ + S - NxQ + gamma - NxQ + dL_dpsi2 - MxM + """ + # here are the "statistics" for psi2 + # Produced the derivatives w.r.t. 
psi2: + # _dL_dvariance 1 + # _dL_dlengthscale Q + # _dL_dZ MxQ + # _dL_dgamma NxQ + # _dL_dmu NxQ + # _dL_dS NxQ + + lengthscale2 = np.square(lengthscale) + + _psi2_Zhat = 0.5 * (Z[:, None, :] + Z[None, :, :]) # M,M,Q + _psi2_Zdist = 0.5 * (Z[:, None, :] - Z[None, :, :]) # M,M,Q + _psi2_Zdist_sq = np.square(_psi2_Zdist / lengthscale) # M,M,Q + _psi2_Z_sq_sum = (np.square(Z[:,None,:])+np.square(Z[None,:,:]))/lengthscale2 # MxMxQ + + # psi2 + _psi2_denom = 2.*S / lengthscale2 + 1. # NxQ + _psi2_denom_sqrt = np.sqrt(_psi2_denom) + _psi2_mudist = mu[:,None,None,:]-_psi2_Zhat #N,M,M,Q + _psi2_mudist_sq = np.square(_psi2_mudist)/(lengthscale2*_psi2_denom[:,None,None,:]) + _psi2_common = gamma/(lengthscale2 * _psi2_denom * _psi2_denom_sqrt) # NxQ + _psi2_exponent1 = -_psi2_Zdist_sq -_psi2_mudist_sq -0.5*np.log(_psi2_denom[:,None,None,:])+np.log(gamma[:,None,None,:]) #N,M,M,Q + _psi2_exponent2 = np.log(1.-gamma[:,None,None,:]) - 0.5*(_psi2_Z_sq_sum) # NxMxMxQ + _psi2_exponent_max = np.maximum(_psi2_exponent1, _psi2_exponent2) + _psi2_exponent = _psi2_exponent_max+np.log(np.exp(_psi2_exponent1-_psi2_exponent_max) + np.exp(_psi2_exponent2-_psi2_exponent_max)) + _psi2_exp_sum = _psi2_exponent.sum(axis=-1) #NxM + _psi2_q = variance*variance * np.exp(_psi2_exp_sum[:,:,:,None]-_psi2_exponent) # NxMxMxQ _psi2_exp_dist_sq = np.exp(-_psi2_Zdist_sq -_psi2_mudist_sq) # NxMxMxQ _psi2_exp_Z = np.exp(-0.5*_psi2_Z_sq_sum) # MxMxQ - _psi2 = np.square(variance) * np.exp(_psi2_exp_sum) # N,M,M - _dpsi2_dvariance = 2. * _psi2/variance # NxMxM - _dpsi2_dgamma = _psi2_q * (_psi2_exp_dist_sq/_psi2_denom_sqrt - _psi2_exp_Z) # NxMxMxQ - _dpsi2_dmu = _psi2_q * (-2.*_psi2_common*_psi2_mudist * _psi2_exp_dist_sq) # NxMxMxQ - _dpsi2_dS = _psi2_q * (_psi2_common * (2.*_psi2_mudist_sq - 1.) * _psi2_exp_dist_sq) # NxMxMxQ - _dpsi2_dZ = 2.*_psi2_q * (_psi2_common*(-_psi2_Zdist*_psi2_denom+_psi2_mudist)*_psi2_exp_dist_sq - (1-gamma[:,None,None,:])*Z[:,None,:]/lengthscale2*_psi2_exp_Z) # NxMxMxQ - _dpsi2_dlengthscale = 2.*lengthscale* _psi2_q * (_psi2_common*(S[:,None,None,:]/lengthscale2+_psi2_Zdist_sq*_psi2_denom+_psi2_mudist_sq)*_psi2_exp_dist_sq+(1-gamma[:,None,None,:])*_psi2_Z_sq_sum*0.5/lengthscale2*_psi2_exp_Z) # NxMxMxQ + _psi2 = variance*variance * (np.exp(_psi2_exp_sum).sum(axis=0)) # MxM + _dL_dvariance = np.einsum('mo,mo->',dL_dpsi2,_psi2)*2./variance + _dL_dgamma = np.einsum('mo,nmoq,nmoq->nq',dL_dpsi2,_psi2_q,(_psi2_exp_dist_sq/_psi2_denom_sqrt[:,None,None,:] - _psi2_exp_Z)) + _dL_dmu = -2.*np.einsum('mo,nmoq,nq,nmoq,nmoq->nq',dL_dpsi2,_psi2_q,_psi2_common,_psi2_mudist,_psi2_exp_dist_sq) + _dL_dS = np.einsum('mo,nmoq,nq,nmoq,nmoq->nq',dL_dpsi2,_psi2_q, _psi2_common, (2.*_psi2_mudist_sq-1.), _psi2_exp_dist_sq) + _dL_dZ = 2.*np.einsum('mo,nmoq,nmoq->mq',dL_dpsi2,_psi2_q,(_psi2_common[:,None,None,:]*(-_psi2_Zdist*_psi2_denom[:,None,None,:]+_psi2_mudist)*_psi2_exp_dist_sq - (1-gamma[:,None,None,:])*Z[:,None,:]/lengthscale2*_psi2_exp_Z)) +# print _psi2_common[:,None,None,:]*(S[:,None,None,:]/lengthscale2+_psi2_Zdist_sq*_psi2_denom[:,None,None,:]+_psi2_mudist_sq)*_psi2_exp_dist_sq #+(1-gamma[:,None,None,:])*_psi2_Z_sq_sum*0.5/lengthscale2*_psi2_exp_Z) + _dL_dlengthscale = 2.*lengthscale* np.einsum('mo,nmoq,nmoq->q',dL_dpsi2,_psi2_q,(_psi2_common[:,None,None,:]*(S[:,None,None,:]/lengthscale2+_psi2_Zdist_sq*_psi2_denom[:,None,None,:]+_psi2_mudist_sq)*_psi2_exp_dist_sq+(1-gamma[:,None,None,:])*_psi2_Z_sq_sum*0.5/lengthscale2*_psi2_exp_Z)) + + +# _dpsi2_dvariance = 2. 
* _psi2/variance # NxMxM +# _dpsi2_dgamma = _psi2_q * (_psi2_exp_dist_sq/_psi2_denom_sqrt - _psi2_exp_Z) # NxMxMxQ +# _dpsi2_dmu = _psi2_q * (-2.*_psi2_common*_psi2_mudist * _psi2_exp_dist_sq) # NxMxMxQ +# _dpsi2_dS = _psi2_q * (_psi2_common * (2.*_psi2_mudist_sq - 1.) * _psi2_exp_dist_sq) # NxMxMxQ +# _dpsi2_dZ = 2.*_psi2_q * (_psi2_common*(-_psi2_Zdist*_psi2_denom+_psi2_mudist)*_psi2_exp_dist_sq - (1-gamma[:,None,None,:])*Z[:,None,:]/lengthscale2*_psi2_exp_Z) # NxMxMxQ +# _dpsi2_dlengthscale = 2.*lengthscale* _psi2_q * (_psi2_common*(S[:,None,None,:]/lengthscale2+_psi2_Zdist_sq*_psi2_denom+_psi2_mudist_sq)*_psi2_exp_dist_sq+(1-gamma[:,None,None,:])*_psi2_Z_sq_sum*0.5/lengthscale2*_psi2_exp_Z) # NxMxMxQ - return _psi2, _dpsi2_dvariance, _dpsi2_dgamma, _dpsi2_dmu, _dpsi2_dS, _dpsi2_dZ, _dpsi2_dlengthscale + return _dL_dvariance, _dL_dlengthscale, _dL_dZ, _dL_dmu, _dL_dS, _dL_dgamma diff --git a/GPy/kern/_src/rbf.py b/GPy/kern/_src/rbf.py index e0071fb9..5944e765 100644 --- a/GPy/kern/_src/rbf.py +++ b/GPy/kern/_src/rbf.py @@ -41,9 +41,11 @@ class RBF(Stationary): #---------------------------------------# def psi0(self, Z, variational_posterior): - if self.useGPU: - if isinstance(variational_posterior, variational.SpikeAndSlabPosterior): + if isinstance(variational_posterior, variational.SpikeAndSlabPosterior): + if self.useGPU: return self.psicomp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)[0] + else: + return ssrbf_psi_comp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)[0] else: return self.Kdiag(variational_posterior.mean) @@ -52,7 +54,7 @@ class RBF(Stationary): if self.useGPU: return self.psicomp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)[1] else: - psi1, _, _, _, _, _, _ = ssrbf_psi_comp._psi1computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob) + return ssrbf_psi_comp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)[1] else: _, _, _, psi1 = self._psi1computations(Z, variational_posterior) return psi1 @@ -62,7 +64,7 @@ class RBF(Stationary): if self.useGPU: return self.psicomp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)[2] else: - psi2, _, _, _, _, _, _ = ssrbf_psi_comp._psi2computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob) + return ssrbf_psi_comp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)[2] else: _, _, _, _, psi2 = self._psi2computations(Z, variational_posterior) return psi2 @@ -73,26 +75,30 @@ class RBF(Stationary): if self.useGPU: self.psicomp.update_gradients_expectations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior) else: +# dL_dvar, dL_dlengscale, dL_dZ, dL_dgamma, dL_dmu, dL_dS = ssrbf_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior) + dL_dvar, dL_dlengscale, _, _, _, _ = 
ssrbf_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior) + self.variance.gradient = dL_dvar + self.lengthscale.gradient = dL_dlengscale - _, _dpsi1_dvariance, _, _, _, _, _dpsi1_dlengthscale = ssrbf_psi_comp._psi1computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob) - _, _dpsi2_dvariance, _, _, _, _, _dpsi2_dlengthscale = ssrbf_psi_comp._psi2computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob) - - #contributions from psi0: - self.variance.gradient = np.sum(dL_dpsi0) - - #from psi1 - self.variance.gradient += np.sum(dL_dpsi1 * _dpsi1_dvariance) - if self.ARD: - self.lengthscale.gradient = (dL_dpsi1[:,:,None]*_dpsi1_dlengthscale).reshape(-1,self.input_dim).sum(axis=0) - else: - self.lengthscale.gradient = (dL_dpsi1[:,:,None]*_dpsi1_dlengthscale).sum() - - #from psi2 - self.variance.gradient += (dL_dpsi2 * _dpsi2_dvariance).sum() - if self.ARD: - self.lengthscale.gradient += (dL_dpsi2[:,:,:,None] * _dpsi2_dlengthscale).reshape(-1,self.input_dim).sum(axis=0) - else: - self.lengthscale.gradient += (dL_dpsi2[:,:,:,None] * _dpsi2_dlengthscale).sum() +# _, _dpsi1_dvariance, _, _, _, _, _dpsi1_dlengthscale = ssrbf_psi_comp._psi1computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob) +# _, _dpsi2_dvariance, _, _, _, _, _dpsi2_dlengthscale = ssrbf_psi_comp._psi2computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob) +# +# #contributions from psi0: +# self.variance.gradient = np.sum(dL_dpsi0) +# +# #from psi1 +# self.variance.gradient += np.sum(dL_dpsi1 * _dpsi1_dvariance) +# if self.ARD: +# self.lengthscale.gradient = (dL_dpsi1[:,:,None]*_dpsi1_dlengthscale).reshape(-1,self.input_dim).sum(axis=0) +# else: +# self.lengthscale.gradient = (dL_dpsi1[:,:,None]*_dpsi1_dlengthscale).sum() +# +# #from psi2 +# self.variance.gradient += (dL_dpsi2 * _dpsi2_dvariance).sum() +# if self.ARD: +# self.lengthscale.gradient += (dL_dpsi2[:,:,:,None] * _dpsi2_dlengthscale).reshape(-1,self.input_dim).sum(axis=0) +# else: +# self.lengthscale.gradient += (dL_dpsi2[:,:,:,None] * _dpsi2_dlengthscale).sum() elif isinstance(variational_posterior, variational.NormalPosterior): l2 = self.lengthscale**2 @@ -125,22 +131,25 @@ class RBF(Stationary): else: raise ValueError, "unknown distriubtion received for psi-statistics" - def gradients_Z_expectations(self, dL_dpsi1, dL_dpsi2, Z, variational_posterior): + def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): # Spike-and-Slab GPLVM if isinstance(variational_posterior, variational.SpikeAndSlabPosterior): if self.useGPU: return self.psicomp.gradients_Z_expectations(dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior) else: - _, _, _, _, _, _dpsi1_dZ, _ = ssrbf_psi_comp._psi1computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob) - _, _, _, _, _, _dpsi2_dZ, _ = ssrbf_psi_comp._psi2computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob) - - #psi1 - grad = (dL_dpsi1[:, :, None] * _dpsi1_dZ).sum(axis=0) - - #psi2 - grad += 
(dL_dpsi2[:, :, :, None] * _dpsi2_dZ).sum(axis=0).sum(axis=1) - - return grad + _, _, dL_dZ, _, _, _ = ssrbf_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior) + return dL_dZ + +# _, _, _, _, _, _dpsi1_dZ, _ = ssrbf_psi_comp._psi1computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob) +# _, _, _, _, _, _dpsi2_dZ, _ = ssrbf_psi_comp._psi2computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob) +# +# #psi1 +# grad = (dL_dpsi1[:, :, None] * _dpsi1_dZ).sum(axis=0) +# +# #psi2 +# grad += (dL_dpsi2[:, :, :, None] * _dpsi2_dZ).sum(axis=0).sum(axis=1) +# +# return grad elif isinstance(variational_posterior, variational.NormalPosterior): l2 = self.lengthscale **2 @@ -166,26 +175,29 @@ class RBF(Stationary): if isinstance(variational_posterior, variational.SpikeAndSlabPosterior): if self.useGPU: return self.psicomp.gradients_qX_expectations(dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior) - else: - ndata = variational_posterior.mean.shape[0] - - _, _, _dpsi1_dgamma, _dpsi1_dmu, _dpsi1_dS, _, _ = ssrbf_psi_comp._psi1computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob) - _, _, _dpsi2_dgamma, _dpsi2_dmu, _dpsi2_dS, _, _ = ssrbf_psi_comp._psi2computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob) - - #psi1 - grad_mu = (dL_dpsi1[:, :, None] * _dpsi1_dmu).sum(axis=1) - grad_S = (dL_dpsi1[:, :, None] * _dpsi1_dS).sum(axis=1) - grad_gamma = (dL_dpsi1[:,:,None] * _dpsi1_dgamma).sum(axis=1) - - #psi2 - grad_mu += (dL_dpsi2[:, :, :, None] * _dpsi2_dmu).reshape(ndata,-1,self.input_dim).sum(axis=1) - grad_S += (dL_dpsi2[:, :, :, None] * _dpsi2_dS).reshape(ndata,-1,self.input_dim).sum(axis=1) - grad_gamma += (dL_dpsi2[:,:,:, None] * _dpsi2_dgamma).reshape(ndata,-1,self.input_dim).sum(axis=1) - - if self.group_spike_prob: - grad_gamma[:] = grad_gamma.mean(axis=0) - - return grad_mu, grad_S, grad_gamma + else: + _, _, _, dL_dmu, dL_dS, dL_dgamma = ssrbf_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior) + return dL_dmu, dL_dS, dL_dgamma + +# ndata = variational_posterior.mean.shape[0] +# +# _, _, _dpsi1_dgamma, _dpsi1_dmu, _dpsi1_dS, _, _ = ssrbf_psi_comp._psi1computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob) +# _, _, _dpsi2_dgamma, _dpsi2_dmu, _dpsi2_dS, _, _ = ssrbf_psi_comp._psi2computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob) +# +# #psi1 +# grad_mu = (dL_dpsi1[:, :, None] * _dpsi1_dmu).sum(axis=1) +# grad_S = (dL_dpsi1[:, :, None] * _dpsi1_dS).sum(axis=1) +# grad_gamma = (dL_dpsi1[:,:,None] * _dpsi1_dgamma).sum(axis=1) +# +# #psi2 +# grad_mu += (dL_dpsi2[:, :, :, None] * _dpsi2_dmu).reshape(ndata,-1,self.input_dim).sum(axis=1) +# grad_S += (dL_dpsi2[:, :, :, None] * _dpsi2_dS).reshape(ndata,-1,self.input_dim).sum(axis=1) +# grad_gamma += (dL_dpsi2[:,:,:, None] * _dpsi2_dgamma).reshape(ndata,-1,self.input_dim).sum(axis=1) +# +# if self.group_spike_prob: +# grad_gamma[:] = grad_gamma.mean(axis=0) +# +# 
return grad_mu, grad_S, grad_gamma elif isinstance(variational_posterior, variational.NormalPosterior): diff --git a/GPy/kern/_src/static.py b/GPy/kern/_src/static.py index d853b4e0..68884937 100644 --- a/GPy/kern/_src/static.py +++ b/GPy/kern/_src/static.py @@ -25,7 +25,7 @@ class Static(Kern): def gradients_X_diag(self, dL_dKdiag, X): return np.zeros(X.shape) - def gradients_Z_expectations(self, dL_dpsi1, dL_dpsi2, Z, variational_posterior): + def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): return np.zeros(Z.shape) def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): diff --git a/GPy/models/ss_gplvm.py b/GPy/models/ss_gplvm.py index 57be302a..dba12700 100644 --- a/GPy/models/ss_gplvm.py +++ b/GPy/models/ss_gplvm.py @@ -11,7 +11,7 @@ from ..likelihoods import Gaussian from ..inference.optimization import SCG from ..util import linalg from ..core.parameterization.variational import SpikeAndSlabPrior, SpikeAndSlabPosterior -from ..inference.latent_function_inference.var_dtc_parallel import update_gradients +from ..inference.latent_function_inference.var_dtc_parallel import update_gradients, VarDTC_minibatch from ..inference.latent_function_inference.var_dtc_gpu import VarDTC_GPU @@ -41,7 +41,7 @@ class SSGPLVM(SparseGP): if X_variance is None: # The variance of the variational approximation (S) X_variance = np.random.uniform(0,.1,X.shape) - gamma = np.empty_like(X) # The posterior probabilities of the binary variable in the variational approximation + gamma = np.empty_like(X, order='F') # The posterior probabilities of the binary variable in the variational approximation gamma[:] = 0.5 + 0.01 * np.random.randn(X.shape[0], input_dim) if group_spike: @@ -60,12 +60,15 @@ class SSGPLVM(SparseGP): pi = np.empty((input_dim)) pi[:] = 0.5 self.variational_prior = SpikeAndSlabPrior(pi=pi) # the prior probability of the latent binary variable b + + X = np.asfortranarray(X) + X_variance = np.asfortranarray(X_variance) + gamma = np.asfortranarray(gamma) X = SpikeAndSlabPosterior(X, X_variance, gamma) if group_spike: kernel.group_spike_prob = True self.variational_prior.group_spike_prob = True - SparseGP.__init__(self, X, Y, Z, kernel, likelihood, inference_method, name, **kwargs) self.add_parameter(self.X, index=0) @@ -76,7 +79,7 @@ class SSGPLVM(SparseGP): X.mean.gradient, X.variance.gradient, X.binary_prob.gradient = X_grad def parameters_changed(self): - if isinstance(self.inference_method, VarDTC_GPU): + if isinstance(self.inference_method, VarDTC_GPU) or isinstance(self.inference_method, VarDTC_minibatch): update_gradients(self) return diff --git a/GPy/util/initialization.py b/GPy/util/initialization.py index 22e63b6b..dd14c9a5 100644 --- a/GPy/util/initialization.py +++ b/GPy/util/initialization.py @@ -8,7 +8,7 @@ import numpy as np from GPy.util.pca import pca def initialize_latent(init, input_dim, Y): - Xr = np.random.randn(Y.shape[0], input_dim) + Xr = np.asfortranarray(np.random.randn(Y.shape[0], input_dim)) if init == 'PCA': p = pca(Y) PC = p.project(Y, min(input_dim, Y.shape[1])) diff --git a/GPy/util/linalg.py b/GPy/util/linalg.py index b204f813..661a2b47 100644 --- a/GPy/util/linalg.py +++ b/GPy/util/linalg.py @@ -123,7 +123,7 @@ def dtrtrs(A, B, lower=1, trans=0, unitdiag=0): :returns: """ - A = force_F_ordered(A) + A = np.asfortranarray(A) #Note: B does not seem to need to be F ordered! 
    return lapack.dtrtrs(A, B, lower=lower, trans=trans, unitdiag=unitdiag)

From 8531c4bff0946d6b552cfe755f843bb07849f19f Mon Sep 17 00:00:00 2001
From: Max Zwiessele
Date: Mon, 12 May 2014 16:23:01 +0100
Subject: [PATCH 2/8] [params] indexing with boolean arrays switched off, raises proper error now

---
 GPy/core/parameterization/param.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/GPy/core/parameterization/param.py b/GPy/core/parameterization/param.py
index 3ccbd169..7dae04b8 100644
--- a/GPy/core/parameterization/param.py
+++ b/GPy/core/parameterization/param.py
@@ -137,6 +137,8 @@ class Param(OptimizationHandlable, ObsAr):
         slice_index = self._current_slice_
         def f(a):
             a, b = a
+            if isinstance(a, numpy.ndarray) and a.dtype == bool:
+                raise ValueError, "Boolean indexing not implemented, use Param[np.where(index)] to index by boolean arrays!"
             if a not in (slice(None), Ellipsis):
                 if isinstance(a, slice):
                     start, stop, step = a.indices(b)

From 02b5ee1e4604b1ce6d2c5dced4a9366aea4e5365 Mon Sep 17 00:00:00 2001
From: Max Zwiessele
Date: Thu, 15 May 2014 11:29:20 +0100
Subject: [PATCH 3/8] [parameterized] restructured a lot and finalized some stuff

---
 GPy/core/parameterization/lists_and_dicts.py |  27 +-
 GPy/core/parameterization/param.py           |  20 +-
 GPy/core/parameterization/parameter_core.py  | 351 +++++++------------
 GPy/core/parameterization/parameterized.py   | 215 ++++++++----
 GPy/core/parameterization/variational.py     |  16 +-
 GPy/kern/_src/add.py                         |   6 +-
 GPy/kern/_src/kern.py                        |   6 +-
 GPy/plotting/matplot_dep/kernel_plots.py     |   2 +-
 GPy/testing/parameterized_tests.py           |   2 +-
 GPy/testing/pickle_tests.py                  |  34 +-
 10 files changed, 354 insertions(+), 325 deletions(-)

diff --git a/GPy/core/parameterization/lists_and_dicts.py b/GPy/core/parameterization/lists_and_dicts.py
index 084ab0db..96cf363c 100644
--- a/GPy/core/parameterization/lists_and_dicts.py
+++ b/GPy/core/parameterization/lists_and_dicts.py
@@ -38,7 +38,12 @@ class ArrayList(list):
         raise ValueError, "{} is not in list".format(item)
     pass

-class ObservablesList(object):
+class ObserverList(object):
+    """
+    A list which contains the observers.
+    It only holds weak references to observers, so that unbound
+    observers don't dangle in memory.
+    """
     def __init__(self):
         self._poc = []

     def __getitem__(self, ind):
         p,o,c = self._poc[ind]
         return p, o(), c

-    def remove(self, priority, observable, callble):
+    def remove(self, priority, observer, callble):
         """
+        Remove the observer that was added with the given priority and callble.
""" self.flush() for i in range(len(self) - 1, -1, -1): p,o,c = self[i] - if priority==p and observable==o and callble==c: + if priority==p and observer==o and callble==c: del self._poc[i] def __repr__(self): return self._poc.__repr__() - - def add(self, priority, observable, callble): - if observable is not None: + def add(self, priority, observer, callble): + """ + Add an observer with priority and callble + """ + if observer is not None: ins = 0 for pr, _, _ in self: if priority > pr: break ins += 1 - self._poc.insert(ins, (priority, weakref.ref(observable), callble)) + self._poc.insert(ins, (priority, weakref.ref(observer), callble)) def __str__(self): ret = [] @@ -83,6 +91,9 @@ class ObservablesList(object): return '\n'.join(ret) def flush(self): + """ + Make sure all weak references, which point to nothing are flushed (deleted) + """ self._poc = [(p,o,c) for p,o,c in self._poc if o() is not None] def __iter__(self): @@ -95,7 +106,7 @@ class ObservablesList(object): return self._poc.__len__() def __deepcopy__(self, memo): - s = ObservablesList() + s = ObserverList() for p,o,c in self: import copy s.add(p, copy.deepcopy(o, memo), copy.deepcopy(c, memo)) diff --git a/GPy/core/parameterization/param.py b/GPy/core/parameterization/param.py index b2103f43..b0655e03 100644 --- a/GPy/core/parameterization/param.py +++ b/GPy/core/parameterization/param.py @@ -4,7 +4,7 @@ import itertools import numpy np = numpy -from parameter_core import OptimizationHandlable, adjust_name_for_printing +from parameter_core import Parameterizable, adjust_name_for_printing from observable_array import ObsAr ###### printing @@ -16,7 +16,7 @@ __precision__ = numpy.get_printoptions()['precision'] # numpy printing precision __print_threshold__ = 5 ###### -class Param(OptimizationHandlable, ObsAr): +class Param(Parameterizable, ObsAr): """ Parameter object for GPy models. @@ -42,7 +42,7 @@ class Param(OptimizationHandlable, ObsAr): """ __array_priority__ = -1 # Never give back Param _fixes_ = None - _parameters_ = [] + parameters = [] def __new__(cls, name, input_array, default_constraint=None): obj = numpy.atleast_1d(super(Param, cls).__new__(cls, input_array=input_array)) obj._current_slice_ = (slice(obj.shape[0]),) @@ -87,6 +87,9 @@ class Param(OptimizationHandlable, ObsAr): @property def param_array(self): + """ + As we are a leaf, this just returns self + """ return self @property @@ -139,6 +142,9 @@ class Param(OptimizationHandlable, ObsAr): def _raveled_index_for(self, obj): return self._raveled_index() + #=========================================================================== + # Index recreation + #=========================================================================== def _expand_index(self, slice_index=None): # this calculates the full indexing arrays from the slicing objects given by get_item for _real..._ attributes # it basically translates slices to their respective index arrays and turns negative indices around @@ -177,15 +183,17 @@ class Param(OptimizationHandlable, ObsAr): This will function will just call visit on self, as Param are leaf nodes. """ + self.__visited = True visit(self, *args, **kwargs) - + self.__visited = False + def traverse_parents(self, visit, *args, **kwargs): """ Traverse the hierarchy upwards, visiting all parents and their children, except self. See "visitor pattern" in literature. This is implemented in pre-order fashion. 
- + Example: - + parents = [] self.traverse_parents(parents.append) print parents diff --git a/GPy/core/parameterization/parameter_core.py b/GPy/core/parameterization/parameter_core.py index 8cda1216..f0fbf4ea 100644 --- a/GPy/core/parameterization/parameter_core.py +++ b/GPy/core/parameterization/parameter_core.py @@ -17,7 +17,7 @@ from transformations import Logexp, NegativeLogexp, Logistic, __fixed__, FIXED, import numpy as np import re -__updated__ = '2014-05-12' +__updated__ = '2014-05-15' class HierarchyError(Exception): """ @@ -52,13 +52,23 @@ class Observable(object): _updated = True def __init__(self, *args, **kwargs): super(Observable, self).__init__() - from lists_and_dicts import ObservablesList - self.observers = ObservablesList() + from lists_and_dicts import ObserverList + self.observers = ObserverList() def add_observer(self, observer, callble, priority=0): + """ + Add an observer `observer` with the callback `callble` + and priority `priority` to this observers list. + """ self.observers.add(priority, observer, callble) def remove_observer(self, observer, callble=None): + """ + Either (if callble is None) remove all callables, + which were added alongside observer, + or remove callable `callble` which was added alongside + the observer `observer`. + """ to_remove = [] for poc in self.observers: _, obs, clble = poc @@ -91,10 +101,6 @@ class Observable(object): break callble(self, which=which) -#=============================================================================== -# Foundation framework for parameterized and param objects: -#=============================================================================== - class Parentable(object): """ Enable an Object to have a parent. @@ -172,7 +178,11 @@ class Pickleable(object): # copy and pickling #=========================================================================== def copy(self): - """Returns a (deep) copy of the current model""" + """ + Returns a (deep) copy of the current parameter handle. + + All connections to parents of the copy will be cut. + """ #raise NotImplementedError, "Copy is not yet implemented, TODO: Observable hierarchy" import copy memo = {} @@ -196,12 +206,11 @@ class Pickleable(object): return s def __getstate__(self): - ignore_list = ([#'_parent_', '_parent_index_', - #'observers', - '_param_array_', '_gradient_array_', '_fixes_', - '_Cacher_wrap__cachers'] - #+ self.parameter_names(recursive=False) - ) + ignore_list = ['_param_array_', # parameters get set from bottom to top + '_gradient_array_', # as well as gradients + '_fixes_', # and fixes + '_Cacher_wrap__cachers', # never pickle cachers + ] dc = dict() for k,v in self.__dict__.iteritems(): if k not in ignore_list: @@ -246,7 +255,6 @@ class Gradcheckable(Pickleable, Parentable): """ raise HierarchyError, "This parameter is not in a model with a likelihood, and, therefore, cannot be gradient checked!" - class Nameable(Gradcheckable): """ Make an object nameable inside the hierarchy. @@ -285,41 +293,8 @@ class Nameable(Gradcheckable): return self._parent_.hierarchy_name() + "." + adjust(self.name) return adjust(self.name) -class Indexable(object): - """ - Enable enraveled indexes and offsets for this object. - The raveled index of an object is the index for its parameters in a flattened int array. - """ - def __init__(self, *a, **kw): - super(Indexable, self).__init__() - def _raveled_index(self): - """ - Flattened array of ints, specifying the index of this object. - This has to account for shaped parameters! 
- """ - raise NotImplementedError, "Need to be able to get the raveled Index" - - def _offset_for(self, param): - """ - Return the offset of the param inside this parameterized object. - This does not need to account for shaped parameters, as it - basically just sums up the parameter sizes which come before param. - """ - return 0 - #raise NotImplementedError, "shouldnt happen, offset required from non parameterization object?" - - def _raveled_index_for(self, param): - """ - get the raveled index for a param - that is an int array, containing the indexes for the flattened - param inside this parameterized logic. - """ - return param._raveled_index() - #raise NotImplementedError, "shouldnt happen, raveld index transformation required from non parameterization object?" - - -class Constrainable(Nameable, Indexable, Observable): +class Indexable(Nameable, Observable): """ Make an object constrainable with Priors and Transformations. TODO: Mappings!! @@ -330,7 +305,7 @@ class Constrainable(Nameable, Indexable, Observable): :func:`constrain()` and :func:`unconstrain()` are main methods here """ def __init__(self, name, default_constraint=None, *a, **kw): - super(Constrainable, self).__init__(name=name, *a, **kw) + super(Indexable, self).__init__(name=name, *a, **kw) self._default_constraint_ = default_constraint from index_operations import ParameterIndexOperations self.constraints = ParameterIndexOperations() @@ -352,6 +327,39 @@ class Constrainable(Nameable, Indexable, Observable): self._connect_fixes() self._notify_parent_change() + #=========================================================================== + # Indexable + #=========================================================================== + def _offset_for(self, param): + """ + Return the offset of the param inside this parameterized object. + This does not need to account for shaped parameters, as it + basically just sums up the parameter sizes which come before param. + """ + if param.has_parent(): + if param._parent_._get_original(param) in self.parameters: + return self._param_slices_[param._parent_._get_original(param)._parent_index_].start + return self._offset_for(param._parent_) + param._parent_._offset_for(param) + return 0 + + def _raveled_index_for(self, param): + """ + get the raveled index for a param + that is an int array, containing the indexes for the flattened + param inside this parameterized logic. + """ + from param import ParamConcatenation + if isinstance(param, ParamConcatenation): + return np.hstack((self._raveled_index_for(p) for p in param.params)) + return param._raveled_index() + self._offset_for(param) + + def _raveled_index(self): + """ + Flattened array of ints, specifying the index of this object. + This has to account for shaped parameters! 
+ """ + return np.r_[:self.size] + #=========================================================================== # Fixing Parameters: #=========================================================================== @@ -406,9 +414,24 @@ class Constrainable(Nameable, Indexable, Observable): self._fixes_ = None del self.constraints[__fixed__] + #=========================================================================== + # Convenience for fixed + #=========================================================================== def _has_fixes(self): return hasattr(self, "_fixes_") and self._fixes_ is not None and self._fixes_.size == self.size + @property + def is_fixed(self): + for p in self.parameters: + if not p.is_fixed: return False + return True + + def _get_original(self, param): + # if advanced indexing is activated it happens that the array is a copy + # you can retrieve the original param through this method, by passing + # the copy here + return self.parameters[param._parent_index_] + #=========================================================================== # Prior Operations #=========================================================================== @@ -432,8 +455,7 @@ class Constrainable(Nameable, Indexable, Observable): def unset_priors(self, *priors): """ - Un-set all priors given from this parameter handle. - + Un-set all priors given (in *priors) from this parameter handle. """ return self._remove_from_index_operations(self.priors, priors) @@ -535,7 +557,7 @@ class Constrainable(Nameable, Indexable, Observable): self.constraints = ParameterIndexOperationsView(parent.constraints, parent._offset_for(self), self.size) self.priors = ParameterIndexOperationsView(parent.priors, parent._offset_for(self), self.size) self._fixes_ = None - for p in self._parameters_: + for p in self.parameters: p._parent_changed(parent) def _add_to_index_operations(self, which, reconstrained, what, warning): @@ -563,14 +585,13 @@ class Constrainable(Nameable, Indexable, Observable): removed = np.empty((0,), dtype=int) for t in transforms: unconstrained = which.remove(t, self._raveled_index()) - print unconstrained removed = np.union1d(removed, unconstrained) if t is __fixed__: self._highest_parent_._set_unfixed(self, unconstrained) return removed -class OptimizationHandlable(Constrainable): +class OptimizationHandlable(Indexable): """ This enables optimization handles on an Object as done in GPy 0.4. @@ -580,7 +601,7 @@ class OptimizationHandlable(Constrainable): super(OptimizationHandlable, self).__init__(name, default_constraint=default_constraint, *a, **kw) def _get_params_transformed(self): - # transformed parameters (apply transformation rules) + # transformed parameters (apply un-transformation rules) p = self.param_array.copy() [np.put(p, ind, c.finv(p[ind])) for c, ind in self.constraints.iteritems() if c != __fixed__] if self.has_parent() and self.constraints[__fixed__].size != 0: @@ -592,6 +613,11 @@ class OptimizationHandlable(Constrainable): return p def _set_params_transformed(self, p): + """ + Set parameters p, but make sure they get transformed before setting. + This means, the optimizer sees p, whereas the model sees transformed(p), + such that, the parameters the model sees are in the right domain. 
+ """ if not(p is self.param_array): if self.has_parent() and self.constraints[__fixed__].size != 0: fixes = np.ones(self.size).astype(bool) @@ -604,12 +630,33 @@ class OptimizationHandlable(Constrainable): self._trigger_params_changed() def _trigger_params_changed(self, trigger_parent=True): - [p._trigger_params_changed(trigger_parent=False) for p in self._parameters_] + """ + First tell all children to update, + then update yourself. + + If trigger_parent is True, we will tell the parent, otherwise not. + """ + [p._trigger_params_changed(trigger_parent=False) for p in self.parameters] self.notify_observers(None, None if trigger_parent else -np.inf) def _size_transformed(self): + """ + As fixes are not passed to the optimiser, the size of the model for the optimiser + is the size of all parameters minus the size of the fixes. + """ return self.size - self.constraints[__fixed__].size + def _transform_gradients(self, g): + """ + Transform the gradients by multiplying the gradient factor for each + constraint to it. + """ + if self.has_parent(): + return g + [np.put(g, i, g[i] * c.gradfactor(self.param_array[i])) for c, i in self.constraints.iteritems() if c != __fixed__] + if self._has_fixes(): return g[self._fixes_] + return g + @property def num_params(self): """ @@ -628,8 +675,8 @@ class OptimizationHandlable(Constrainable): """ if adjust_for_printing: adjust = lambda x: adjust_name_for_printing(x) else: adjust = lambda x: x - if recursive: names = [xi for x in self._parameters_ for xi in x.parameter_names(add_self=True, adjust_for_printing=adjust_for_printing)] - else: names = [adjust(x.name) for x in self._parameters_] + if recursive: names = [xi for x in self.parameters for xi in x.parameter_names(add_self=True, adjust_for_printing=adjust_for_printing)] + else: names = [adjust(x.name) for x in self.parameters] if add_self: names = map(lambda x: adjust(self.name) + "." + x, names) return names @@ -651,7 +698,7 @@ class OptimizationHandlable(Constrainable): Randomize the model. Make this draw from the prior if one exists, else draw from given random generator - :param rand_gen: numpy random number generator which takes args and kwargs + :param rand_gen: np random number generator which takes args and kwargs :param flaot loc: loc parameter for random number generator :param float scale: scale parameter for random number generator :param args, kwargs: will be passed through to random number generator @@ -667,7 +714,7 @@ class OptimizationHandlable(Constrainable): # for all parameterized objects #=========================================================================== @property - def full_gradient(self): + def gradient_full(self): """ Note to users: This does not return the gradient in the right shape! Use self.gradient @@ -681,26 +728,43 @@ class OptimizationHandlable(Constrainable): return self._gradient_array_ def _propagate_param_grad(self, parray, garray): + """ + For propagating the param_array and gradient_array. + This ensures the in memory view of each subsequent array. + + 1.) connect param_array of children to self.param_array + 2.) 
tell all children to propagate further + """ pi_old_size = 0 - for pi in self._parameters_: + for pi in self.parameters: pislice = slice(pi_old_size, pi_old_size + pi.size) self.param_array[pislice] = pi.param_array.flat # , requirements=['C', 'W']).flat - self.full_gradient[pislice] = pi.full_gradient.flat # , requirements=['C', 'W']).flat + self.gradient_full[pislice] = pi.gradient_full.flat # , requirements=['C', 'W']).flat pi.param_array.data = parray[pislice].data - pi.full_gradient.data = garray[pislice].data + pi.gradient_full.data = garray[pislice].data pi._propagate_param_grad(parray[pislice], garray[pislice]) pi_old_size += pi.size class Parameterizable(OptimizationHandlable): + """ + A parameterisable class. + + This class provides the parameters list (ArrayList) and standard parameter handling, + such as {add|remove}_parameter(), traverse hierarchy and param_array, gradient_array + and the empty parameters_changed(). + + This class is abstract and should not be instantiated. + Use GPy.core.Parameterized() as node (or leaf) in the parameterized hierarchy. + Use GPy.core.Param() for a leaf in the parameterized hierarchy. + """ def __init__(self, *args, **kwargs): super(Parameterizable, self).__init__(*args, **kwargs) from GPy.core.parameterization.lists_and_dicts import ArrayList - self._parameters_ = ArrayList() + self.parameters = ArrayList() self._param_array_ = None - self.size = 0 self._added_names_ = set() self.__visited = False # for traversing in reverse order we need to know if we were here already @@ -735,7 +799,7 @@ class Parameterizable(OptimizationHandlable): if not self.__visited: visit(self, *args, **kwargs) self.__visited = True - for c in self._parameters_: + for c in self.parameters: c.traverse(visit, *args, **kwargs) self.__visited = False @@ -743,9 +807,9 @@ class Parameterizable(OptimizationHandlable): """ Traverse the hierarchy upwards, visiting all parents and their children except self. See "visitor pattern" in literature. This is implemented in pre-order fashion. 
- + Example: - + parents = [] self.traverse_parents(parents.append) print parents @@ -754,7 +818,7 @@ class Parameterizable(OptimizationHandlable): self.__visited = True self._parent_._traverse_parents(visit, *args, **kwargs) self.__visited = False - + def _traverse_parents(self, visit, *args, **kwargs): if not self.__visited: self.__visited = True @@ -779,7 +843,7 @@ class Parameterizable(OptimizationHandlable): @property def num_params(self): - return len(self._parameters_) + return len(self.parameters) def _add_parameter_name(self, param, ignore_added_names=False): pname = adjust_name_for_printing(param.name) @@ -812,132 +876,6 @@ class Parameterizable(OptimizationHandlable): self._remove_parameter_name(None, old_name) self._add_parameter_name(param) - def add_parameter(self, param, index=None, _ignore_added_names=False): - """ - :param parameters: the parameters to add - :type parameters: list of or one :py:class:`GPy.core.param.Param` - :param [index]: index of where to put parameters - - :param bool _ignore_added_names: whether the name of the parameter overrides a possibly existing field - - Add all parameters to this param class, you can insert parameters - at any given index using the :func:`list.insert` syntax - """ - if param in self._parameters_ and index is not None: - self.remove_parameter(param) - self.add_parameter(param, index) - # elif param.has_parent(): - # raise HierarchyError, "parameter {} already in another model ({}), create new object (or copy) for adding".format(param._short(), param._highest_parent_._short()) - elif param not in self._parameters_: - if param.has_parent(): - def visit(parent, self): - if parent is self: - raise HierarchyError, "You cannot add a parameter twice into the hierarchy" - param.traverse_parents(visit, self) - param._parent_.remove_parameter(param) - # make sure the size is set - if index is None: - self.constraints.update(param.constraints, self.size) - self.priors.update(param.priors, self.size) - self._parameters_.append(param) - else: - start = sum(p.size for p in self._parameters_[:index]) - self.constraints.shift_right(start, param.size) - self.priors.shift_right(start, param.size) - self.constraints.update(param.constraints, start) - self.priors.update(param.priors, start) - self._parameters_.insert(index, param) - - param.add_observer(self, self._pass_through_notify_observers, -np.inf) - - parent = self - while parent is not None: - parent.size += param.size - parent = parent._parent_ - - self._connect_parameters() - - self._highest_parent_._connect_parameters(ignore_added_names=_ignore_added_names) - self._highest_parent_._notify_parent_change() - self._highest_parent_._connect_fixes() - - else: - raise HierarchyError, """Parameter exists already and no copy made""" - - - def add_parameters(self, *parameters): - """ - convenience method for adding several - parameters without gradient specification - """ - [self.add_parameter(p) for p in parameters] - - def remove_parameter(self, param): - """ - :param param: param object to remove from being a parameter of this parameterized object. 
- """ - if not param in self._parameters_: - raise RuntimeError, "Parameter {} does not belong to this object {}, remove parameters directly from their respective parents".format(param._short(), self.name) - - start = sum([p.size for p in self._parameters_[:param._parent_index_]]) - self._remove_parameter_name(param) - self.size -= param.size - del self._parameters_[param._parent_index_] - - param._disconnect_parent() - param.remove_observer(self, self._pass_through_notify_observers) - self.constraints.shift_left(start, param.size) - - self._connect_parameters() - self._notify_parent_change() - - parent = self._parent_ - while parent is not None: - parent.size -= param.size - parent = parent._parent_ - - self._highest_parent_._connect_parameters() - self._highest_parent_._connect_fixes() - self._highest_parent_._notify_parent_change() - - def _connect_parameters(self, ignore_added_names=False): - # connect parameterlist to this parameterized object - # This just sets up the right connection for the params objects - # to be used as parameters - # it also sets the constraints for each parameter to the constraints - # of their respective parents - if not hasattr(self, "_parameters_") or len(self._parameters_) < 1: - # no parameters for this class - return - if self.param_array.size != self.size: - self.param_array = np.empty(self.size, dtype=np.float64) - if self.gradient.size != self.size: - self._gradient_array_ = np.empty(self.size, dtype=np.float64) - - old_size = 0 - self._param_slices_ = [] - for i, p in enumerate(self._parameters_): - p._parent_ = self - p._parent_index_ = i - - pslice = slice(old_size, old_size + p.size) - # first connect all children - p._propagate_param_grad(self.param_array[pslice], self.full_gradient[pslice]) - # then connect children to self - self.param_array[pslice] = p.param_array.flat # , requirements=['C', 'W']).ravel(order='C') - self.full_gradient[pslice] = p.full_gradient.flat # , requirements=['C', 'W']).ravel(order='C') - - if not p.param_array.flags['C_CONTIGUOUS']: - raise ValueError, "This should not happen! Please write an email to the developers with the code, which reproduces this error. 
All parameter arrays must be C_CONTIGUOUS" - - p.param_array.data = self.param_array[pslice].data - p.full_gradient.data = self.full_gradient[pslice].data - - self._param_slices_.append(pslice) - - self._add_parameter_name(p, ignore_added_names=ignore_added_names) - old_size += p.size - #=========================================================================== # notification system #=========================================================================== @@ -947,30 +885,13 @@ class Parameterizable(OptimizationHandlable): self.notify_observers(which=which) #=========================================================================== - # Pickling - #=========================================================================== - def __setstate__(self, state): - super(Parameterizable, self).__setstate__(state) - self._connect_parameters() - self._connect_fixes() - self._notify_parent_change() - - self.parameters_changed() - - def copy(self): - c = super(Parameterizable, self).copy() - c._connect_parameters() - c._connect_fixes() - c._notify_parent_change() - return c - #=========================================================================== # From being parentable, we have to define the parent_change notification #=========================================================================== def _notify_parent_change(self): """ Notify all parameters that the parent has changed """ - for p in self._parameters_: + for p in self.parameters: p._parent_changed(self) def parameters_changed(self): diff --git a/GPy/core/parameterization/parameterized.py b/GPy/core/parameterization/parameterized.py index 67694a1b..cfb2171d 100644 --- a/GPy/core/parameterization/parameterized.py +++ b/GPy/core/parameterization/parameterized.py @@ -3,13 +3,10 @@ import numpy; np = numpy -import cPickle import itertools from re import compile, _pattern_type from param import ParamConcatenation -from parameter_core import Pickleable, Parameterizable, adjust_name_for_printing -from transformations import __fixed__ -from lists_and_dicts import ArrayList +from parameter_core import HierarchyError, Parameterizable, adjust_name_for_printing class ParametersChangedMeta(type): def __call__(self, *args, **kw): @@ -68,8 +65,7 @@ class Parameterized(Parameterizable): def __init__(self, name=None, parameters=[], *a, **kw): super(Parameterized, self).__init__(name=name, *a, **kw) self._in_init_ = True - self._parameters_ = ArrayList() - self.size = sum(p.size for p in self._parameters_) + self.size = sum(p.size for p in self.parameters) self.add_observer(self, self._parameters_changed_notification, -100) if not self._has_fixes(): self._fixes_ = None @@ -86,7 +82,7 @@ class Parameterized(Parameterizable): iamroot=True node = pydot.Node(id(self), shape='box', label=self.name)#, color='white') G.add_node(node) - for child in self._parameters_: + for child in self.parameters: child_node = child.build_pydot(G) G.add_edge(pydot.Edge(node, child_node))#, color='white')) @@ -102,58 +98,133 @@ class Parameterized(Parameterizable): return node #=========================================================================== - # Gradient control + # Add remove parameters: #=========================================================================== - def _transform_gradients(self, g): - if self.has_parent(): - return g - [numpy.put(g, i, g[i] * c.gradfactor(self.param_array[i])) for c, i in self.constraints.iteritems() if c != __fixed__] - if self._has_fixes(): return g[self._fixes_] - return g - - - 
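
The `_connect_parameters`/`_propagate_param_grad` wiring above (and the `full_gradient` to `gradient_full` rename) depends on numpy's buffer-sharing trick: assigning to `ndarray.data` repoints a child array at a slice of its parent's buffer, so parent and children literally share memory. A minimal sketch with toy arrays, assuming the numpy of this era, where `ndarray.data` is still assignable:

    import numpy as np

    # parent holds the flat global view; child is a parameter's own view
    parent = np.zeros(5)
    child = np.empty(3)
    child.data = parent[1:4].data   # child now aliases parent[1:4]
    child[:] = [1., 2., 3.]         # writing the child updates the parent
    assert parent[1] == 1. and parent[3] == 3.
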
#=========================================================================== - # Indexable - #=========================================================================== - def _offset_for(self, param): - # get the offset in the parameterized index array for param - if param.has_parent(): - if param._parent_._get_original(param) in self._parameters_: - return self._param_slices_[param._parent_._get_original(param)._parent_index_].start - return self._offset_for(param._parent_) + param._parent_._offset_for(param) - return 0 - - def _raveled_index_for(self, param): + def add_parameter(self, param, index=None, _ignore_added_names=False): """ - get the raveled index for a param - that is an int array, containing the indexes for the flattened - param inside this parameterized logic. - """ - if isinstance(param, ParamConcatenation): - return numpy.hstack((self._raveled_index_for(p) for p in param.params)) - return param._raveled_index() + self._offset_for(param) + :param parameters: the parameters to add + :type parameters: list of or one :py:class:`GPy.core.param.Param` + :param [index]: index of where to put parameters - def _raveled_index(self): - """ - get the raveled index for this object, - this is not in the global view of things! - """ - return numpy.r_[:self.size] + :param bool _ignore_added_names: whether the name of the parameter overrides a possibly existing field - #=========================================================================== - # Convenience for fixed, tied checking of param: - #=========================================================================== - @property - def is_fixed(self): - for p in self._parameters_: - if not p.is_fixed: return False - return True + Add all parameters to this param class, you can insert parameters + at any given index using the :func:`list.insert` syntax + """ + if param in self.parameters and index is not None: + self.remove_parameter(param) + self.add_parameter(param, index) + # elif param.has_parent(): + # raise HierarchyError, "parameter {} already in another model ({}), create new object (or copy) for adding".format(param._short(), param._highest_parent_._short()) + elif param not in self.parameters: + if param.has_parent(): + def visit(parent, self): + if parent is self: + raise HierarchyError, "You cannot add a parameter twice into the hierarchy" + param.traverse_parents(visit, self) + param._parent_.remove_parameter(param) + # make sure the size is set + if index is None: + self.constraints.update(param.constraints, self.size) + self.priors.update(param.priors, self.size) + self.parameters.append(param) + else: + start = sum(p.size for p in self.parameters[:index]) + self.constraints.shift_right(start, param.size) + self.priors.shift_right(start, param.size) + self.constraints.update(param.constraints, start) + self.priors.update(param.priors, start) + self.parameters.insert(index, param) - def _get_original(self, param): - # if advanced indexing is activated it happens that the array is a copy - # you can retrieve the original param through this method, by passing - # the copy here - return self._parameters_[param._parent_index_] + param.add_observer(self, self._pass_through_notify_observers, -np.inf) + + parent = self + while parent is not None: + parent.size += param.size + parent = parent._parent_ + + self._connect_parameters() + + self._highest_parent_._connect_parameters(ignore_added_names=_ignore_added_names) + self._highest_parent_._notify_parent_change() + self._highest_parent_._connect_fixes() + + else: + raise 
HierarchyError, """Parameter exists already and no copy made""" + + + def add_parameters(self, *parameters): + """ + convenience method for adding several + parameters without gradient specification + """ + [self.add_parameter(p) for p in parameters] + + def remove_parameter(self, param): + """ + :param param: param object to remove from being a parameter of this parameterized object. + """ + if not param in self.parameters: + raise RuntimeError, "Parameter {} does not belong to this object {}, remove parameters directly from their respective parents".format(param._short(), self.name) + + start = sum([p.size for p in self.parameters[:param._parent_index_]]) + self._remove_parameter_name(param) + self.size -= param.size + del self.parameters[param._parent_index_] + + param._disconnect_parent() + param.remove_observer(self, self._pass_through_notify_observers) + self.constraints.shift_left(start, param.size) + + self._connect_parameters() + self._notify_parent_change() + + parent = self._parent_ + while parent is not None: + parent.size -= param.size + parent = parent._parent_ + + self._highest_parent_._connect_parameters() + self._highest_parent_._connect_fixes() + self._highest_parent_._notify_parent_change() + + def _connect_parameters(self, ignore_added_names=False): + # connect parameterlist to this parameterized object + # This just sets up the right connection for the params objects + # to be used as parameters + # it also sets the constraints for each parameter to the constraints + # of their respective parents + if not hasattr(self, "parameters") or len(self.parameters) < 1: + # no parameters for this class + return + if self.param_array.size != self.size: + self.param_array = np.empty(self.size, dtype=np.float64) + if self.gradient.size != self.size: + self._gradient_array_ = np.empty(self.size, dtype=np.float64) + + old_size = 0 + self._param_slices_ = [] + for i, p in enumerate(self.parameters): + p._parent_ = self + p._parent_index_ = i + + pslice = slice(old_size, old_size + p.size) + # first connect all children + p._propagate_param_grad(self.param_array[pslice], self.gradient_full[pslice]) + # then connect children to self + self.param_array[pslice] = p.param_array.flat # , requirements=['C', 'W']).ravel(order='C') + self.gradient_full[pslice] = p.gradient_full.flat # , requirements=['C', 'W']).ravel(order='C') + + if not p.param_array.flags['C_CONTIGUOUS']: + raise ValueError, "This should not happen! Please write an email to the developers with the code, which reproduces this error. 
All parameter arrays must be C_CONTIGUOUS" + + p.param_array.data = self.param_array[pslice].data + p.gradient_full.data = self.gradient_full[pslice].data + + self._param_slices_.append(pslice) + + self._add_parameter_name(p, ignore_added_names=ignore_added_names) + old_size += p.size #=========================================================================== # Get/set parameters: @@ -200,10 +271,28 @@ class Parameterized(Parameterizable): def __setattr__(self, name, val): # override the default behaviour, if setting a param, so broadcasting can by used - if hasattr(self, "_parameters_"): + if hasattr(self, "parameters"): pnames = self.parameter_names(False, adjust_for_printing=True, recursive=False) - if name in pnames: self._parameters_[pnames.index(name)][:] = val; return + if name in pnames: self.parameters[pnames.index(name)][:] = val; return object.__setattr__(self, name, val); + + #=========================================================================== + # Pickling + #=========================================================================== + def __setstate__(self, state): + super(Parameterized, self).__setstate__(state) + self._connect_parameters() + self._connect_fixes() + self._notify_parent_change() + + self.parameters_changed() + def copy(self): + c = super(Parameterized, self).copy() + c._connect_parameters() + c._connect_fixes() + c._notify_parent_change() + return c + #=========================================================================== # Printing: #=========================================================================== @@ -211,22 +300,22 @@ class Parameterized(Parameterizable): return self.hierarchy_name() @property def flattened_parameters(self): - return [xi for x in self._parameters_ for xi in x.flattened_parameters] + return [xi for x in self.parameters for xi in x.flattened_parameters] @property def _parameter_sizes_(self): - return [x.size for x in self._parameters_] + return [x.size for x in self.parameters] @property def parameter_shapes(self): - return [xi for x in self._parameters_ for xi in x.parameter_shapes] + return [xi for x in self.parameters for xi in x.parameter_shapes] @property def _constraints_str(self): - return [cs for p in self._parameters_ for cs in p._constraints_str] + return [cs for p in self.parameters for cs in p._constraints_str] @property def _priors_str(self): - return [cs for p in self._parameters_ for cs in p._priors_str] + return [cs for p in self.parameters for cs in p._priors_str] @property def _description_str(self): - return [xi for x in self._parameters_ for xi in x._description_str] + return [xi for x in self.parameters for xi in x._description_str] @property def _ties_str(self): return [','.join(x._ties_str) for x in self.flattened_parameters] @@ -246,7 +335,7 @@ class Parameterized(Parameterizable): to_print = [] for n, d, c, t, p in itertools.izip(names, desc, constrs, ts, prirs): to_print.append(format_spec.format(name=n, desc=d, const=c, t=t, pri=p)) - # to_print = [format_spec.format(p=p, const=c, t=t) if isinstance(p, Param) else p.__str__(header=False) for p, c, t in itertools.izip(self._parameters_, constrs, ts)] + # to_print = [format_spec.format(p=p, const=c, t=t) if isinstance(p, Param) else p.__str__(header=False) for p, c, t in itertools.izip(self.parameters, constrs, ts)] sep = '-' * (nl + sl + cl + + pl + tl + 8 * 2 + 3) if header: header = " {{0:<{0}s}} | {{1:^{1}s}} | {{2:^{2}s}} | {{3:^{3}s}} | {{4:^{4}s}}".format(nl, sl, cl, pl, tl).format(name, "Value", "Constraint", "Prior", "Tied to") diff 
--git a/GPy/core/parameterization/variational.py b/GPy/core/parameterization/variational.py index 044d1592..fa32f642 100644 --- a/GPy/core/parameterization/variational.py +++ b/GPy/core/parameterization/variational.py @@ -81,7 +81,7 @@ class VariationalPosterior(Parameterized): def _raveled_index(self): index = np.empty(dtype=int, shape=0) size = 0 - for p in self._parameters_: + for p in self.parameters: index = np.hstack((index, p._raveled_index()+size)) size += p._realsize_ if hasattr(p, '_realsize_') else p.size return index @@ -96,10 +96,10 @@ class VariationalPosterior(Parameterized): dc = self.__dict__.copy() dc['mean'] = self.mean[s] dc['variance'] = self.variance[s] - dc['_parameters_'] = copy.copy(self._parameters_) + dc['parameters'] = copy.copy(self.parameters) n.__dict__.update(dc) - n._parameters_[dc['mean']._parent_index_] = dc['mean'] - n._parameters_[dc['variance']._parent_index_] = dc['variance'] + n.parameters[dc['mean']._parent_index_] = dc['mean'] + n.parameters[dc['variance']._parent_index_] = dc['variance'] n._gradient_array_ = None oversize = self.size - self.mean.size - self.variance.size n.size = n.mean.size + n.variance.size + oversize @@ -150,11 +150,11 @@ class SpikeAndSlabPosterior(VariationalPosterior): dc['mean'] = self.mean[s] dc['variance'] = self.variance[s] dc['binary_prob'] = self.binary_prob[s] - dc['_parameters_'] = copy.copy(self._parameters_) + dc['parameters'] = copy.copy(self.parameters) n.__dict__.update(dc) - n._parameters_[dc['mean']._parent_index_] = dc['mean'] - n._parameters_[dc['variance']._parent_index_] = dc['variance'] - n._parameters_[dc['binary_prob']._parent_index_] = dc['binary_prob'] + n.parameters[dc['mean']._parent_index_] = dc['mean'] + n.parameters[dc['variance']._parent_index_] = dc['variance'] + n.parameters[dc['binary_prob']._parent_index_] = dc['binary_prob'] n.ndim = n.mean.ndim n.shape = n.mean.shape n.num_data = n.mean.shape[0] diff --git a/GPy/kern/_src/add.py b/GPy/kern/_src/add.py index f5b54797..6336d1b9 100644 --- a/GPy/kern/_src/add.py +++ b/GPy/kern/_src/add.py @@ -141,10 +141,10 @@ class Add(CombinationKernel): from static import White, Bias target_mu = np.zeros(variational_posterior.shape) target_S = np.zeros(variational_posterior.shape) - for p1 in self._parameters_: + for p1 in self.parameters: #compute the effective dL_dpsi1. extra terms appear becaue of the cross terms in psi2! eff_dL_dpsi1 = dL_dpsi1.copy() - for p2 in self._parameters_: + for p2 in self.parameters: if p2 is p1: continue if isinstance(p2, White): @@ -160,7 +160,7 @@ class Add(CombinationKernel): def add(self, other, name='sum'): if isinstance(other, Add): - other_params = other._parameters_[:] + other_params = other.parameters[:] for p in other_params: other.remove_parameter(p) self.add_parameters(*other_params) diff --git a/GPy/kern/_src/kern.py b/GPy/kern/_src/kern.py index b8d10f47..005faa05 100644 --- a/GPy/kern/_src/kern.py +++ b/GPy/kern/_src/kern.py @@ -183,9 +183,9 @@ class Kern(Parameterized): assert isinstance(other, Kern), "only kernels can be added to kernels..." 
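
The cross-term correction in the add.py hunk above can be sanity-checked numerically. If the psi2 cross term of two additive parts takes the symmetric rank-two form psi1_a psi1_b^T + psi1_b psi1_a^T (an assumption of this sketch; all names here are toy stand-ins), then dL/dpsi1_a gains the extra 2 * dL_dpsi2 . psi1_b the code adds:

    import numpy as np

    M = 4
    psi1_a, psi1_b = np.random.randn(M), np.random.randn(M)
    W = np.random.randn(M, M); W = 0.5 * (W + W.T)   # symmetric dL_dpsi2
    f = lambda a: np.sum(W * (np.outer(a, psi1_b) + np.outer(psi1_b, a)))
    grad = 2. * W.dot(psi1_b)                        # analytic gradient
    eps = 1e-6
    a2 = psi1_a.copy(); a2[0] += eps
    assert abs((f(a2) - f(psi1_a)) / eps - grad[0]) < 1e-4
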
from prod import Prod #kernels = [] - #if isinstance(self, Prod): kernels.extend(self._parameters_) + #if isinstance(self, Prod): kernels.extend(self.parameters) #else: kernels.append(self) - #if isinstance(other, Prod): kernels.extend(other._parameters_) + #if isinstance(other, Prod): kernels.extend(other.parameters) #else: kernels.append(other) return Prod([self, other], name) @@ -222,7 +222,7 @@ class CombinationKernel(Kern): @property def parts(self): - return self._parameters_ + return self.parameters def get_input_dim_active_dims(self, kernels, extra_dims = None): #active_dims = reduce(np.union1d, (np.r_[x.active_dims] for x in kernels), np.array([], dtype=int)) diff --git a/GPy/plotting/matplot_dep/kernel_plots.py b/GPy/plotting/matplot_dep/kernel_plots.py index 2b990611..3c6b911f 100644 --- a/GPy/plotting/matplot_dep/kernel_plots.py +++ b/GPy/plotting/matplot_dep/kernel_plots.py @@ -68,7 +68,7 @@ def plot_ARD(kernel, fignum=None, ax=None, title='', legend=False): for i in range(ard_params.shape[0]): c = Tango.nextMedium() - bars.append(plot_bars(fig, ax, x, ard_params[i,:], c, kernel._parameters_[i].name, bottom=bottom)) + bars.append(plot_bars(fig, ax, x, ard_params[i,:], c, kernel.parameters[i].name, bottom=bottom)) bottom += ard_params[i,:] ax.set_xlim(-.5, kernel.input_dim - .5) diff --git a/GPy/testing/parameterized_tests.py b/GPy/testing/parameterized_tests.py index bc989637..fa15d66d 100644 --- a/GPy/testing/parameterized_tests.py +++ b/GPy/testing/parameterized_tests.py @@ -95,7 +95,7 @@ class ParameterizedTest(unittest.TestCase): self.assertListEqual(self.test1.kern.param_array.tolist(), val[:2].tolist()) def test_add_parameter_already_in_hirarchy(self): - self.assertRaises(HierarchyError, self.test1.add_parameter, self.white._parameters_[0]) + self.assertRaises(HierarchyError, self.test1.add_parameter, self.white.parameters[0]) def test_default_constraints(self): self.assertIs(self.rbf.variance.constraints._param_index_ops, self.rbf.constraints._param_index_ops) diff --git a/GPy/testing/pickle_tests.py b/GPy/testing/pickle_tests.py index b62f5e45..94504645 100644 --- a/GPy/testing/pickle_tests.py +++ b/GPy/testing/pickle_tests.py @@ -89,28 +89,28 @@ class Test(ListDictTestCase): self.assertIs(pcopy.constraints, pcopy.rbf.lengthscale.constraints._param_index_ops) self.assertIs(pcopy.constraints, pcopy.linear.constraints._param_index_ops) self.assertListEqual(par.param_array.tolist(), pcopy.param_array.tolist()) - self.assertListEqual(par.full_gradient.tolist(), pcopy.full_gradient.tolist()) + self.assertListEqual(par.gradient_full.tolist(), pcopy.gradient_full.tolist()) self.assertSequenceEqual(str(par), str(pcopy)) self.assertIsNot(par.param_array, pcopy.param_array) - self.assertIsNot(par.full_gradient, pcopy.full_gradient) + self.assertIsNot(par.gradient_full, pcopy.gradient_full) with tempfile.TemporaryFile('w+b') as f: par.pickle(f) f.seek(0) pcopy = pickle.load(f) self.assertListEqual(par.param_array.tolist(), pcopy.param_array.tolist()) pcopy.gradient = 10 - np.testing.assert_allclose(par.linear.full_gradient, pcopy.linear.full_gradient) - np.testing.assert_allclose(pcopy.linear.full_gradient, 10) + np.testing.assert_allclose(par.linear.gradient_full, pcopy.linear.gradient_full) + np.testing.assert_allclose(pcopy.linear.gradient_full, 10) self.assertSequenceEqual(str(par), str(pcopy)) def test_model(self): par = toy_rbf_1d_50(optimize=0, plot=0) pcopy = par.copy() self.assertListEqual(par.param_array.tolist(), pcopy.param_array.tolist()) - 
self.assertListEqual(par.full_gradient.tolist(), pcopy.full_gradient.tolist()) + self.assertListEqual(par.gradient_full.tolist(), pcopy.gradient_full.tolist()) self.assertSequenceEqual(str(par), str(pcopy)) self.assertIsNot(par.param_array, pcopy.param_array) - self.assertIsNot(par.full_gradient, pcopy.full_gradient) + self.assertIsNot(par.gradient_full, pcopy.gradient_full) self.assertTrue(pcopy.checkgrad()) self.assert_(np.any(pcopy.gradient!=0.0)) with tempfile.TemporaryFile('w+b') as f: @@ -118,7 +118,7 @@ class Test(ListDictTestCase): f.seek(0) pcopy = pickle.load(f) self.assertListEqual(par.param_array.tolist(), pcopy.param_array.tolist()) - np.testing.assert_allclose(par.full_gradient, pcopy.full_gradient) + np.testing.assert_allclose(par.gradient_full, pcopy.gradient_full) self.assertSequenceEqual(str(par), str(pcopy)) self.assert_(pcopy.checkgrad()) @@ -126,10 +126,10 @@ class Test(ListDictTestCase): par = toy_rbf_1d_50(optimize=0, plot=0) pcopy = GPRegression(par.X.copy(), par.Y.copy(), kernel=par.kern.copy()) self.assertListEqual(par.param_array.tolist(), pcopy.param_array.tolist()) - self.assertListEqual(par.full_gradient.tolist(), pcopy.full_gradient.tolist()) + self.assertListEqual(par.gradient_full.tolist(), pcopy.gradient_full.tolist()) self.assertSequenceEqual(str(par), str(pcopy)) self.assertIsNot(par.param_array, pcopy.param_array) - self.assertIsNot(par.full_gradient, pcopy.full_gradient) + self.assertIsNot(par.gradient_full, pcopy.gradient_full) self.assertTrue(pcopy.checkgrad()) self.assert_(np.any(pcopy.gradient!=0.0)) pcopy.optimize('bfgs') @@ -140,7 +140,7 @@ class Test(ListDictTestCase): f.seek(0) pcopy = pickle.load(f) self.assertListEqual(par.param_array.tolist(), pcopy.param_array.tolist()) - np.testing.assert_allclose(par.full_gradient, pcopy.full_gradient) + np.testing.assert_allclose(par.gradient_full, pcopy.gradient_full) self.assertSequenceEqual(str(par), str(pcopy)) self.assert_(pcopy.checkgrad()) @@ -151,18 +151,18 @@ class Test(ListDictTestCase): par.gradient = 10 pcopy = par.copy() self.assertListEqual(par.param_array.tolist(), pcopy.param_array.tolist()) - self.assertListEqual(par.full_gradient.tolist(), pcopy.full_gradient.tolist()) + self.assertListEqual(par.gradient_full.tolist(), pcopy.gradient_full.tolist()) self.assertSequenceEqual(str(par), str(pcopy)) self.assertIsNot(par.param_array, pcopy.param_array) - self.assertIsNot(par.full_gradient, pcopy.full_gradient) + self.assertIsNot(par.gradient_full, pcopy.gradient_full) with tempfile.TemporaryFile('w+b') as f: par.pickle(f) f.seek(0) pcopy = pickle.load(f) self.assertListEqual(par.param_array.tolist(), pcopy.param_array.tolist()) pcopy.gradient = 10 - np.testing.assert_allclose(par.full_gradient, pcopy.full_gradient) - np.testing.assert_allclose(pcopy.mean.full_gradient, 10) + np.testing.assert_allclose(par.gradient_full, pcopy.gradient_full) + np.testing.assert_allclose(pcopy.mean.gradient_full, 10) self.assertSequenceEqual(str(par), str(pcopy)) def test_model_concat(self): @@ -170,10 +170,10 @@ class Test(ListDictTestCase): par.randomize() pcopy = par.copy() self.assertListEqual(par.param_array.tolist(), pcopy.param_array.tolist()) - self.assertListEqual(par.full_gradient.tolist(), pcopy.full_gradient.tolist()) + self.assertListEqual(par.gradient_full.tolist(), pcopy.gradient_full.tolist()) self.assertSequenceEqual(str(par), str(pcopy)) self.assertIsNot(par.param_array, pcopy.param_array) - self.assertIsNot(par.full_gradient, pcopy.full_gradient) + self.assertIsNot(par.gradient_full, 
pcopy.gradient_full) self.assertTrue(pcopy.checkgrad()) self.assert_(np.any(pcopy.gradient!=0.0)) with tempfile.TemporaryFile('w+b') as f: @@ -181,7 +181,7 @@ class Test(ListDictTestCase): f.seek(0) pcopy = pickle.load(f) self.assertListEqual(par.param_array.tolist(), pcopy.param_array.tolist()) - np.testing.assert_allclose(par.full_gradient, pcopy.full_gradient) + np.testing.assert_allclose(par.gradient_full, pcopy.gradient_full) self.assertSequenceEqual(str(par), str(pcopy)) self.assert_(pcopy.checkgrad()) From 58a05f37b768bd7e0ce4c861f9685a1f6c277e8e Mon Sep 17 00:00:00 2001 From: Max Zwiessele Date: Thu, 15 May 2014 14:06:00 +0100 Subject: [PATCH 4/8] [latentfunctioninference] superclass LatentFunctionInference added, which contains a call just before and just after optimization --- GPy/core/gp.py | 20 ++++++++++- GPy/core/model.py | 2 +- .../latent_function_inference/__init__.py | 35 +++++++++++++++++-- .../latent_function_inference/dtc.py | 3 +- .../exact_gaussian_inference.py | 3 +- .../expectation_propagation.py | 3 +- .../latent_function_inference/fitc.py | 3 +- .../latent_function_inference/laplace.py | 3 +- .../latent_function_inference/var_dtc.py | 5 +-- .../latent_function_inference/var_dtc_gpu.py | 3 +- .../var_dtc_parallel.py | 3 +- 11 files changed, 69 insertions(+), 14 deletions(-) diff --git a/GPy/core/gp.py b/GPy/core/gp.py index 62e16de1..29d9032a 100644 --- a/GPy/core/gp.py +++ b/GPy/core/gp.py @@ -10,7 +10,7 @@ from model import Model from parameterization import ObsAr from .. import likelihoods from ..likelihoods.gaussian import Gaussian -from ..inference.latent_function_inference import exact_gaussian_inference, expectation_propagation +from ..inference.latent_function_inference import exact_gaussian_inference, expectation_propagation, LatentFunctionInference from parameterization.variational import VariationalPosterior class GP(Model): @@ -21,6 +21,7 @@ class GP(Model): :param Y: output observations :param kernel: a GPy kernel, defaults to rbf+white :param likelihood: a GPy likelihood + :param :class:`~GPy.inference.latent_function_inference.LatentFunctionInference` inference_method: The inference method to use for this GP :rtype: model object .. Note:: Multiple independent outputs are allowed using columns of Y @@ -220,3 +221,20 @@ class GP(Model): """ return self.kern.input_sensitivity() + def optimize(self, optimizer=None, start=None, **kwargs): + """ + Optimize the model using self.log_likelihood and self.log_likelihood_gradient, as well as self.priors. + kwargs are passed to the optimizer. 
They can be: + + :param max_f_eval: maximum number of function evaluations + :type max_f_eval: int + :messages: whether to display during optimisation + :type messages: bool + :param optimizer: which optimizer to use (defaults to self.preferred optimizer) + :type optimizer: string + + TODO: valid args + """ + self.inference_method.on_optimization_start() + super(GP, self).optimize(optimizer, start, **kwargs) + self.inference_method.on_optimization_end() \ No newline at end of file diff --git a/GPy/core/model.py b/GPy/core/model.py index 38e8d4cf..71b21af6 100644 --- a/GPy/core/model.py +++ b/GPy/core/model.py @@ -220,7 +220,7 @@ class Model(Parameterized): if self.is_fixed: raise RuntimeError, "Cannot optimize, when everything is fixed" if self.size == 0: - raise RuntimeError, "Model without parameters cannot be minimized" + raise RuntimeError, "Model without parameters cannot be optimized" if optimizer is None: optimizer = self.preferred_optimizer diff --git a/GPy/inference/latent_function_inference/__init__.py b/GPy/inference/latent_function_inference/__init__.py index 68004a08..878c1e4f 100644 --- a/GPy/inference/latent_function_inference/__init__.py +++ b/GPy/inference/latent_function_inference/__init__.py @@ -25,6 +25,20 @@ etc. """ +class LatentFunctionInference(object): + def on_optimization_start(self): + """ + This function gets called, just before the optimization loop to start. + """ + pass + + def on_optimization_end(self): + """ + This function gets called, just after the optimization loop ended. + """ + pass + + from exact_gaussian_inference import ExactGaussianInference from laplace import Laplace from GPy.inference.latent_function_inference.var_dtc import VarDTC @@ -38,11 +52,26 @@ from var_dtc_gpu import VarDTC_GPU # class FullLatentFunctionData(object): # # -# class LatentFunctionInference(object): -# def inference(self, kern, X, likelihood, Y, Y_metadata=None): + +# class EMLikeLatentFunctionInference(LatentFunctionInference): +# def update_approximation(self): +# """ +# This function gets called when the +# """ +# +# def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None): # """ # Do inference on the latent functions given a covariance function `kern`, -# inputs and outputs `X` and `Y`, and a likelihood `likelihood`. +# inputs and outputs `X` and `Y`, inducing_inputs `Z`, and a likelihood `likelihood`. +# Additional metadata for the outputs `Y` can be given in `Y_metadata`. +# """ +# raise NotImplementedError, "Abstract base class for full inference" +# +# class VariationalLatentFunctionInference(LatentFunctionInference): +# def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None): +# """ +# Do inference on the latent functions given a covariance function `kern`, +# inputs and outputs `X` and `Y`, inducing_inputs `Z`, and a likelihood `likelihood`. # Additional metadata for the outputs `Y` can be given in `Y_metadata`. # """ # raise NotImplementedError, "Abstract base class for full inference" diff --git a/GPy/inference/latent_function_inference/dtc.py b/GPy/inference/latent_function_inference/dtc.py index 1a84da6b..1b6b1dbd 100644 --- a/GPy/inference/latent_function_inference/dtc.py +++ b/GPy/inference/latent_function_inference/dtc.py @@ -4,9 +4,10 @@ from posterior import Posterior from ...util.linalg import jitchol, tdot, dtrtrs, dpotri, pdinv import numpy as np +from . 
import LatentFunctionInference log_2_pi = np.log(2*np.pi) -class DTC(object): +class DTC(LatentFunctionInference): """ An object for inference when the likelihood is Gaussian, but we want to do sparse inference. diff --git a/GPy/inference/latent_function_inference/exact_gaussian_inference.py b/GPy/inference/latent_function_inference/exact_gaussian_inference.py index c0177e9f..0c02efe3 100644 --- a/GPy/inference/latent_function_inference/exact_gaussian_inference.py +++ b/GPy/inference/latent_function_inference/exact_gaussian_inference.py @@ -5,10 +5,11 @@ from posterior import Posterior from ...util.linalg import pdinv, dpotrs, tdot from ...util import diag import numpy as np +from . import LatentFunctionInference log_2_pi = np.log(2*np.pi) -class ExactGaussianInference(object): +class ExactGaussianInference(LatentFunctionInference): """ An object for inference when the likelihood is Gaussian. diff --git a/GPy/inference/latent_function_inference/expectation_propagation.py b/GPy/inference/latent_function_inference/expectation_propagation.py index 172f43fb..c2dea824 100644 --- a/GPy/inference/latent_function_inference/expectation_propagation.py +++ b/GPy/inference/latent_function_inference/expectation_propagation.py @@ -1,9 +1,10 @@ import numpy as np from ...util.linalg import pdinv,jitchol,DSYR,tdot,dtrtrs, dpotrs from posterior import Posterior +from . import LatentFunctionInference log_2_pi = np.log(2*np.pi) -class EP(object): +class EP(LatentFunctionInference): def __init__(self, epsilon=1e-6, eta=1., delta=1.): """ The expectation-propagation algorithm. diff --git a/GPy/inference/latent_function_inference/fitc.py b/GPy/inference/latent_function_inference/fitc.py index de47e5d5..a184c6c4 100644 --- a/GPy/inference/latent_function_inference/fitc.py +++ b/GPy/inference/latent_function_inference/fitc.py @@ -5,9 +5,10 @@ from posterior import Posterior from ...util.linalg import jitchol, tdot, dtrtrs, dpotri, pdinv from ...util import diag import numpy as np +from . import LatentFunctionInference log_2_pi = np.log(2*np.pi) -class FITC(object): +class FITC(LatentFunctionInference): """ An object for inference when the likelihood is Gaussian, but we want to do sparse inference. diff --git a/GPy/inference/latent_function_inference/laplace.py b/GPy/inference/latent_function_inference/laplace.py index 9ba3f83f..1c153518 100644 --- a/GPy/inference/latent_function_inference/laplace.py +++ b/GPy/inference/latent_function_inference/laplace.py @@ -16,8 +16,9 @@ from ...util.misc import param_to_array from posterior import Posterior import warnings from scipy import optimize +from . import LatentFunctionInference -class Laplace(object): +class Laplace(LatentFunctionInference): def __init__(self): """ diff --git a/GPy/inference/latent_function_inference/var_dtc.py b/GPy/inference/latent_function_inference/var_dtc.py index 0cc841ed..3043a7e8 100644 --- a/GPy/inference/latent_function_inference/var_dtc.py +++ b/GPy/inference/latent_function_inference/var_dtc.py @@ -7,9 +7,10 @@ from ...util import diag from ...core.parameterization.variational import VariationalPosterior import numpy as np from ...util.misc import param_to_array +from . import LatentFunctionInference log_2_pi = np.log(2*np.pi) -class VarDTC(object): +class VarDTC(LatentFunctionInference): """ An object for inference when the likelihood is Gaussian, but we want to do sparse inference. 
@@ -190,7 +191,7 @@ class VarDTC(object): post = Posterior(woodbury_inv=woodbury_inv, woodbury_vector=woodbury_vector, K=Kmm, mean=None, cov=None, K_chol=Lm) return post, log_marginal, grad_dict -class VarDTCMissingData(object): +class VarDTCMissingData(LatentFunctionInference): const_jitter = 1e-6 def __init__(self, limit=1, inan=None): from ...util.caching import Cacher diff --git a/GPy/inference/latent_function_inference/var_dtc_gpu.py b/GPy/inference/latent_function_inference/var_dtc_gpu.py index 9b2da1c9..d346d01f 100644 --- a/GPy/inference/latent_function_inference/var_dtc_gpu.py +++ b/GPy/inference/latent_function_inference/var_dtc_gpu.py @@ -7,6 +7,7 @@ from ...util import diag from ...core.parameterization.variational import VariationalPosterior import numpy as np from ...util.misc import param_to_array +from . import LatentFunctionInference log_2_pi = np.log(2*np.pi) from ...util import gpu_init @@ -19,7 +20,7 @@ try: except: pass -class VarDTC_GPU(object): +class VarDTC_GPU(LatentFunctionInference): """ An object for inference when the likelihood is Gaussian, but we want to do sparse inference. diff --git a/GPy/inference/latent_function_inference/var_dtc_parallel.py b/GPy/inference/latent_function_inference/var_dtc_parallel.py index 87236e2a..ffda0aba 100644 --- a/GPy/inference/latent_function_inference/var_dtc_parallel.py +++ b/GPy/inference/latent_function_inference/var_dtc_parallel.py @@ -7,9 +7,10 @@ from ...util import diag from ...core.parameterization.variational import VariationalPosterior import numpy as np from ...util.misc import param_to_array +from . import LatentFunctionInference log_2_pi = np.log(2*np.pi) -class VarDTC_minibatch(object): +class VarDTC_minibatch(LatentFunctionInference): """ An object for inference when the likelihood is Gaussian, but we want to do sparse inference. 
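
The protocol patch 4 introduces is easiest to see in isolation. A minimal sketch of the hook pattern (the class and hook names come from the patch; the toy subclass and driver below are illustrative only):

    class LatentFunctionInference(object):
        def on_optimization_start(self):
            """Called just before the optimization loop starts."""
            pass
        def on_optimization_end(self):
            """Called just after the optimization loop ends."""
            pass

    class CountingInference(LatentFunctionInference):
        def __init__(self):
            self.runs = 0
        def on_optimization_start(self):
            self.runs += 1      # e.g. reset caches here, as EP does

    inf = CountingInference()
    inf.on_optimization_start()   # GP.optimize calls this first,
    # ... optimizer iterations ...
    inf.on_optimization_end()     # ... and this once it returns
    assert inf.runs == 1
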
From 75d5209b98ff4e275eab2a690acece327f4ec4c7 Mon Sep 17 00:00:00 2001 From: Ricardo Date: Thu, 15 May 2014 14:10:24 +0100 Subject: [PATCH 5/8] minor changes --- .../expectation_propagation_dtc.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/GPy/inference/latent_function_inference/expectation_propagation_dtc.py b/GPy/inference/latent_function_inference/expectation_propagation_dtc.py index 3625a5bf..be1efd3a 100644 --- a/GPy/inference/latent_function_inference/expectation_propagation_dtc.py +++ b/GPy/inference/latent_function_inference/expectation_propagation_dtc.py @@ -5,7 +5,13 @@ from posterior import Posterior log_2_pi = np.log(2*np.pi) class EPDTC(EP): - #def __init__(self, epsilon=1e-6, eta=1., delta=1.): + def __init__(self, epsilon=1e-6, eta=1., delta=1.): + self.epsilon, self.eta, self.delta = epsilon, eta, delta + self.reset() + + def reset(self): + self.old_mutilde, self.old_vtilde = None, None + self._ep_approximation = None def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None): num_data, output_dim = X.shape @@ -20,8 +26,10 @@ class EPDTC(EP): KmmiKmn = np.dot(Kmmi,Kmn) K = np.dot(Kmn.T,KmmiKmn) - - mu, Sigma, mu_tilde, tau_tilde, Z_hat = self.expectation_propagation(Kmm, Kmn, Y, likelihood, Y_metadata) + if self._ep_approximation is None: + mu, Sigma, mu_tilde, tau_tilde, Z_hat = self._ep_approximation = self.expectation_propagation(Kmm, Kmn, Y, likelihood, Y_metadata) + else: + mu, Sigma, mu_tilde, tau_tilde, Z_hat = self._ep_approximation Wi, LW, LWi, W_logdet = pdinv(K + np.diag(1./tau_tilde)) From bd7a80dde5ce80585458bd40316badefc28f704c Mon Sep 17 00:00:00 2001 From: Ricardo Date: Thu, 15 May 2014 14:11:11 +0100 Subject: [PATCH 6/8] flags added --- .../latent_function_inference/expectation_propagation.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/GPy/inference/latent_function_inference/expectation_propagation.py b/GPy/inference/latent_function_inference/expectation_propagation.py index 172f43fb..603ff68c 100644 --- a/GPy/inference/latent_function_inference/expectation_propagation.py +++ b/GPy/inference/latent_function_inference/expectation_propagation.py @@ -21,6 +21,7 @@ class EP(object): def reset(self): self.old_mutilde, self.old_vtilde = None, None + self._ep_approximation = None def inference(self, kern, X, likelihood, Y, Y_metadata=None, Z=None): num_data, output_dim = X.shape @@ -28,7 +29,10 @@ class EP(object): K = kern.K(X) - mu, Sigma, mu_tilde, tau_tilde, Z_hat = self.expectation_propagation(K, Y, likelihood, Y_metadata) + if self._ep_approximation is None: + mu, Sigma, mu_tilde, tau_tilde, Z_hat = self._ep_approximation = self.expectation_propagation(K, Y, likelihood, Y_metadata) + else: + mu, Sigma, mu_tilde, tau_tilde, Z_hat = self._ep_approximation Wi, LW, LWi, W_logdet = pdinv(K + np.diag(1./tau_tilde)) From 3d76664af054eac7a53e1a9fffeb3cf800330f1e Mon Sep 17 00:00:00 2001 From: Ricardo Date: Thu, 15 May 2014 16:36:03 +0100 Subject: [PATCH 7/8] EP is back. 
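
Patches 5 through 7 share one idea: memoise the expensive EP site-update loop in `_ep_approximation` and let `on_optimization_start` clear it, so the sites are recomputed once per optimization run rather than once per likelihood evaluation. A toy sketch of the pattern (only the `_ep_approximation` and `on_optimization_start` names come from the diffs; the scaffolding and the placeholder `_run_sites` body are made up):

    class CachedEP(object):
        def __init__(self):
            self._ep_approximation = None
        def on_optimization_start(self):
            self._ep_approximation = None     # force a fresh EP run
        def _run_sites(self, K):
            # placeholder for the expensive site-update loop
            return ('mu', 'Sigma', 'mu_tilde', 'tau_tilde', 'Z_hat')
        def inference(self, K):
            if self._ep_approximation is None:
                self._ep_approximation = self._run_sites(K)
            return self._ep_approximation
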
--- GPy/examples/classification.py | 9 +- .../expectation_propagation.py | 10 +- .../expectation_propagation_dtc.py | 243 ++++++++++++++++-- GPy/likelihoods/bernoulli.py | 3 + 4 files changed, 238 insertions(+), 27 deletions(-) diff --git a/GPy/examples/classification.py b/GPy/examples/classification.py index 2b0a201d..ae9d8eb8 100644 --- a/GPy/examples/classification.py +++ b/GPy/examples/classification.py @@ -96,15 +96,11 @@ def toy_linear_1d_classification_laplace(seed=default_seed, optimize=True, plot= # Optimize if optimize: - #m.update_likelihood_approximation() - # Parameters optimization: try: m.optimize('scg', messages=1) except Exception as e: return m - #m.pseudo_EM() - # Plot if plot: fig, axes = pb.subplots(2, 1) @@ -133,10 +129,7 @@ def sparse_toy_linear_1d_classification(num_inducing=10, seed=default_seed, opti # Optimize if optimize: - #m.update_likelihood_approximation() - # Parameters optimization: - #m.optimize() - m.pseudo_EM() + m.optimize() # Plot if plot: diff --git a/GPy/inference/latent_function_inference/expectation_propagation.py b/GPy/inference/latent_function_inference/expectation_propagation.py index 870f2310..de8b0931 100644 --- a/GPy/inference/latent_function_inference/expectation_propagation.py +++ b/GPy/inference/latent_function_inference/expectation_propagation.py @@ -24,6 +24,13 @@ class EP(LatentFunctionInference): self.old_mutilde, self.old_vtilde = None, None self._ep_approximation = None + def on_optimization_start(self): + self._ep_approximation = None + + def on_optimization_end(self): + # TODO: update approximation in the end as well? Maybe even with a switch? + pass + def inference(self, kern, X, likelihood, Y, Y_metadata=None, Z=None): num_data, output_dim = X.shape assert output_dim ==1, "ep in 1D only (for now!)" @@ -47,8 +54,6 @@ class EP(LatentFunctionInference): return Posterior(woodbury_inv=Wi, woodbury_vector=alpha, K=K), log_marginal, {'dL_dK':dL_dK, 'dL_dthetaL':dL_dthetaL} - - def expectation_propagation(self, K, Y, likelihood, Y_metadata): num_data, data_dim = Y.shape @@ -113,4 +118,3 @@ class EP(LatentFunctionInference): mu_tilde = v_tilde/tau_tilde return mu, Sigma, mu_tilde, tau_tilde, Z_hat - diff --git a/GPy/inference/latent_function_inference/expectation_propagation_dtc.py b/GPy/inference/latent_function_inference/expectation_propagation_dtc.py index be1efd3a..85d8cc89 100644 --- a/GPy/inference/latent_function_inference/expectation_propagation_dtc.py +++ b/GPy/inference/latent_function_inference/expectation_propagation_dtc.py @@ -1,14 +1,56 @@ import numpy as np -from ...util.linalg import pdinv,jitchol,DSYR,tdot,dtrtrs, dpotrs -from expectation_propagation import EP +from ...util import diag +from ...util.linalg import mdot, jitchol, backsub_both_sides, tdot, dtrtrs, dtrtri, dpotri, dpotrs, symmetrify, DSYR +from ...util.misc import param_to_array +from ...core.parameterization.variational import VariationalPosterior +from . 
import LatentFunctionInference from posterior import Posterior log_2_pi = np.log(2*np.pi) -class EPDTC(EP): - def __init__(self, epsilon=1e-6, eta=1., delta=1.): +class EPDTC(LatentFunctionInference): + const_jitter = 1e-6 + def __init__(self, epsilon=1e-6, eta=1., delta=1., limit=1): + from ...util.caching import Cacher + self.limit = limit + self.get_trYYT = Cacher(self._get_trYYT, limit) + self.get_YYTfactor = Cacher(self._get_YYTfactor, limit) + self.epsilon, self.eta, self.delta = epsilon, eta, delta self.reset() + def set_limit(self, limit): + self.get_trYYT.limit = limit + self.get_YYTfactor.limit = limit + + def _get_trYYT(self, Y): + return param_to_array(np.sum(np.square(Y))) + + def __getstate__(self): + # has to be overridden, as Cacher objects cannot be pickled. + return self.limit + + def __setstate__(self, state): + # has to be overridden, as Cacher objects cannot be pickled. + self.limit = state + from ...util.caching import Cacher + self.get_trYYT = Cacher(self._get_trYYT, self.limit) + self.get_YYTfactor = Cacher(self._get_YYTfactor, self.limit) + + def _get_YYTfactor(self, Y): + """ + find a matrix L which satisfies LLT = YYT. + + Note that L may have fewer columns than Y. + """ + N, D = Y.shape + if (N>=D): + return param_to_array(Y) + else: + return jitchol(tdot(Y)) + + def get_VVTfactor(self, Y, prec): + return Y * prec # TODO chache this, and make it effective + def reset(self): self.old_mutilde, self.old_vtilde = None, None self._ep_approximation = None @@ -20,28 +62,131 @@ class EPDTC(EP): Kmm = kern.K(Z) Kmn = kern.K(Z,X) - Lm = jitchol(Kmm) - Lmi = dtrtrs(Lm,np.eye(Lm.shape[0]))[0] - Kmmi = np.dot(Lmi.T,Lmi) - KmmiKmn = np.dot(Kmmi,Kmn) - K = np.dot(Kmn.T,KmmiKmn) - if self._ep_approximation is None: mu, Sigma, mu_tilde, tau_tilde, Z_hat = self._ep_approximation = self.expectation_propagation(Kmm, Kmn, Y, likelihood, Y_metadata) else: mu, Sigma, mu_tilde, tau_tilde, Z_hat = self._ep_approximation - Wi, LW, LWi, W_logdet = pdinv(K + np.diag(1./tau_tilde)) - alpha, _ = dpotrs(LW, mu_tilde, lower=1) + if isinstance(X, VariationalPosterior): + uncertain_inputs = True + psi0 = kern.psi0(Z, X) + psi1 = Kmn.T#kern.psi1(Z, X) + psi2 = kern.psi2(Z, X) + else: + uncertain_inputs = False + psi0 = kern.Kdiag(X) + psi1 = Kmn.T#kern.K(X, Z) + psi2 = None - log_marginal = 0.5*(-num_data * log_2_pi - W_logdet - np.sum(alpha * mu_tilde)) # TODO: add log Z_hat?? 
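
A quick numerical check of the identity behind `_get_YYTfactor` above: when N < D, a Cholesky factor L of Y Y^T satisfies L L^T = Y Y^T while having only N columns (`np.linalg.cholesky` stands in for GPy's jittered `jitchol` in this sketch):

    import numpy as np

    N, D = 5, 20
    Y = np.random.randn(N, D)
    L = np.linalg.cholesky(Y.dot(Y.T))
    assert L.shape == (N, N) and np.allclose(L.dot(L.T), Y.dot(Y.T))
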
+ #see whether we're using variational uncertain inputs - dL_dK = 0.5 * (tdot(alpha[:,None]) - Wi) + _, output_dim = Y.shape + + #see whether we've got a different noise variance for each datum + #beta = 1./np.fmax(likelihood.gaussian_variance(Y_metadata), 1e-6) + beta = tau_tilde + VVT_factor = beta[:,None]*mu_tilde[:,None] + trYYT = self.get_trYYT(mu_tilde[:,None]) + + # do the inference: + het_noise = beta.size > 1 + num_inducing = Z.shape[0] + num_data = Y.shape[0] + # kernel computations, using BGPLVM notation + + Kmm = kern.K(Z).copy() + diag.add(Kmm, self.const_jitter) + Lm = jitchol(Kmm) + + # The rather complex computations of A + if uncertain_inputs: + if het_noise: + psi2_beta = psi2 * (beta.flatten().reshape(num_data, 1, 1)).sum(0) + else: + psi2_beta = psi2.sum(0) * beta + LmInv = dtrtri(Lm) + A = LmInv.dot(psi2_beta.dot(LmInv.T)) + else: + if het_noise: + tmp = psi1 * (np.sqrt(beta.reshape(num_data, 1))) + else: + tmp = psi1 * (np.sqrt(beta)) + tmp, _ = dtrtrs(Lm, tmp.T, lower=1) + A = tdot(tmp) #print A.sum() + + # factor B + B = np.eye(num_inducing) + A + LB = jitchol(B) + psi1Vf = np.dot(psi1.T, VVT_factor) + # back substutue C into psi1Vf + tmp, _ = dtrtrs(Lm, psi1Vf, lower=1, trans=0) + _LBi_Lmi_psi1Vf, _ = dtrtrs(LB, tmp, lower=1, trans=0) + tmp, _ = dtrtrs(LB, _LBi_Lmi_psi1Vf, lower=1, trans=1) + Cpsi1Vf, _ = dtrtrs(Lm, tmp, lower=1, trans=1) + + # data fit and derivative of L w.r.t. Kmm + delit = tdot(_LBi_Lmi_psi1Vf) + data_fit = np.trace(delit) + DBi_plus_BiPBi = backsub_both_sides(LB, output_dim * np.eye(num_inducing) + delit) + delit = -0.5 * DBi_plus_BiPBi + delit += -0.5 * B * output_dim + delit += output_dim * np.eye(num_inducing) + # Compute dL_dKmm + dL_dKmm = backsub_both_sides(Lm, delit) + + # derivatives of L w.r.t. psi + dL_dpsi0, dL_dpsi1, dL_dpsi2 = _compute_dL_dpsi(num_inducing, num_data, output_dim, beta, Lm, + VVT_factor, Cpsi1Vf, DBi_plus_BiPBi, + psi1, het_noise, uncertain_inputs) + + # log marginal likelihood + log_marginal = _compute_log_marginal_likelihood(likelihood, num_data, output_dim, beta, het_noise, + psi0, A, LB, trYYT, data_fit, VVT_factor) + + #put the gradients in the right places + dL_dR = _compute_dL_dR(likelihood, + het_noise, uncertain_inputs, LB, + _LBi_Lmi_psi1Vf, DBi_plus_BiPBi, Lm, A, + psi0, psi1, beta, + data_fit, num_data, output_dim, trYYT, mu_tilde[:,None]) + + dL_dthetaL = 0#likelihood.exact_inference_gradients(dL_dR,Y_metadata) + + if uncertain_inputs: + grad_dict = {'dL_dKmm': dL_dKmm, + 'dL_dpsi0':dL_dpsi0, + 'dL_dpsi1':dL_dpsi1, + 'dL_dpsi2':dL_dpsi2, + 'dL_dthetaL':dL_dthetaL} + else: + grad_dict = {'dL_dKmm': dL_dKmm, + 'dL_dKdiag':dL_dpsi0, + 'dL_dKnm':dL_dpsi1, + 'dL_dthetaL':dL_dthetaL} + + #get sufficient things for posterior prediction + #TODO: do we really want to do this in the loop? 
+ if VVT_factor.shape[1] == Y.shape[1]: + woodbury_vector = Cpsi1Vf # == Cpsi1V + else: + print 'foobar' + psi1V = np.dot(mu_tilde[:,None].T*beta, psi1).T + tmp, _ = dtrtrs(Lm, psi1V, lower=1, trans=0) + tmp, _ = dpotrs(LB, tmp, lower=1) + woodbury_vector, _ = dtrtrs(Lm, tmp, lower=1, trans=1) + Bi, _ = dpotri(LB, lower=1) + symmetrify(Bi) + Bi = -dpotri(LB, lower=1)[0] + diag.add(Bi, 1) + + woodbury_inv = backsub_both_sides(Lm, Bi) + + #construct a posterior object + post = Posterior(woodbury_inv=woodbury_inv, woodbury_vector=woodbury_vector, K=Kmm, mean=None, cov=None, K_chol=Lm) + return post, log_marginal, grad_dict - dL_dthetaL = np.zeros(likelihood.size)#TODO: derivatives of the likelihood parameters - return Posterior(woodbury_inv=Wi, woodbury_vector=alpha, K=K), log_marginal, {'dL_dK':dL_dK, 'dL_dthetaL':dL_dthetaL} @@ -129,3 +274,69 @@ class EPDTC(EP): mu_tilde = v_tilde/tau_tilde return mu, Sigma, mu_tilde, tau_tilde, Z_hat + +def _compute_dL_dpsi(num_inducing, num_data, output_dim, beta, Lm, VVT_factor, Cpsi1Vf, DBi_plus_BiPBi, psi1, het_noise, uncertain_inputs): + dL_dpsi0 = -0.5 * output_dim * (beta[:,None] * np.ones([num_data, 1])).flatten() + dL_dpsi1 = np.dot(VVT_factor, Cpsi1Vf.T) + dL_dpsi2_beta = 0.5 * backsub_both_sides(Lm, output_dim * np.eye(num_inducing) - DBi_plus_BiPBi) + if het_noise: + if uncertain_inputs: + dL_dpsi2 = beta[:, None, None] * dL_dpsi2_beta[None, :, :] + else: + dL_dpsi1 += 2.*np.dot(dL_dpsi2_beta, (psi1 * beta.reshape(num_data, 1)).T).T + dL_dpsi2 = None + else: + dL_dpsi2 = beta * dL_dpsi2_beta + if uncertain_inputs: + # repeat for each of the N psi_2 matrices + dL_dpsi2 = np.repeat(dL_dpsi2[None, :, :], num_data, axis=0) + else: + # subsume back into psi1 (==Kmn) + dL_dpsi1 += 2.*np.dot(psi1, dL_dpsi2) + dL_dpsi2 = None + + return dL_dpsi0, dL_dpsi1, dL_dpsi2 + + +def _compute_dL_dR(likelihood, het_noise, uncertain_inputs, LB, _LBi_Lmi_psi1Vf, DBi_plus_BiPBi, Lm, A, psi0, psi1, beta, data_fit, num_data, output_dim, trYYT, Y): + # the partial derivative vector for the likelihood + if likelihood.size == 0: + # save computation here. + dL_dR = None + elif het_noise: + if uncertain_inputs: + raise NotImplementedError, "heteroscedatic derivates with uncertain inputs not implemented" + else: + #from ...util.linalg import chol_inv + #LBi = chol_inv(LB) + LBi, _ = dtrtrs(LB,np.eye(LB.shape[0])) + + Lmi_psi1, nil = dtrtrs(Lm, psi1.T, lower=1, trans=0) + _LBi_Lmi_psi1, _ = dtrtrs(LB, Lmi_psi1, lower=1, trans=0) + + dL_dR = -0.5 * beta + 0.5 * (beta*Y)**2 + dL_dR += 0.5 * output_dim * (psi0 - np.sum(Lmi_psi1**2,0))[:,None] * beta**2 + + dL_dR += 0.5*np.sum(mdot(LBi.T,LBi,Lmi_psi1)*Lmi_psi1,0)[:,None]*beta**2 + + dL_dR += -np.dot(_LBi_Lmi_psi1Vf.T,_LBi_Lmi_psi1).T * Y * beta**2 + dL_dR += 0.5*np.dot(_LBi_Lmi_psi1Vf.T,_LBi_Lmi_psi1).T**2 * beta**2 + else: + # likelihood is not heteroscedatic + dL_dR = -0.5 * num_data * output_dim * beta + 0.5 * trYYT * beta ** 2 + dL_dR += 0.5 * output_dim * (psi0.sum() * beta ** 2 - np.trace(A) * beta) + dL_dR += beta * (0.5 * np.sum(A * DBi_plus_BiPBi) - data_fit) + return dL_dR + +def _compute_log_marginal_likelihood(likelihood, num_data, output_dim, beta, het_noise, psi0, A, LB, trYYT, data_fit,Y): + #compute log marginal likelihood + if het_noise: + lik_1 = -0.5 * num_data * output_dim * np.log(2. * np.pi) + 0.5 * np.sum(np.log(beta)) - 0.5 * np.sum(beta * np.square(Y).sum(axis=-1)) + lik_2 = -0.5 * output_dim * (np.sum(beta.flatten() * psi0) - np.trace(A)) + else: + lik_1 = -0.5 * num_data * output_dim * (np.log(2. 
* np.pi) - np.log(beta)) - 0.5 * beta * trYYT + lik_2 = -0.5 * output_dim * (np.sum(beta * psi0) - np.trace(A)) + lik_3 = -output_dim * (np.sum(np.log(np.diag(LB)))) + lik_4 = 0.5 * data_fit + log_marginal = lik_1 + lik_2 + lik_3 + lik_4 + return log_marginal diff --git a/GPy/likelihoods/bernoulli.py b/GPy/likelihoods/bernoulli.py index 7b867954..6c53958b 100644 --- a/GPy/likelihoods/bernoulli.py +++ b/GPy/likelihoods/bernoulli.py @@ -227,3 +227,6 @@ class Bernoulli(Likelihood): ns = np.ones_like(gp, dtype=int) Ysim = np.random.binomial(ns, self.gp_link.transf(gp)) return Ysim.reshape(orig_shape) + + def exact_inference_gradients(self, dL_dKdiag,Y_metadata=None): + pass From a796b7bc81cf9d2b314b8882fea66003517d7f1b Mon Sep 17 00:00:00 2001 From: Zhenwen Dai Date: Thu, 15 May 2014 17:56:07 +0100 Subject: [PATCH 8/8] fix the bug in mocap demos --- GPy/examples/dimensionality_reduction.py | 8 +++++--- GPy/plotting/matplot_dep/visualize.py | 17 ++++++++++------- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/GPy/examples/dimensionality_reduction.py b/GPy/examples/dimensionality_reduction.py index a15c2a93..484970b8 100644 --- a/GPy/examples/dimensionality_reduction.py +++ b/GPy/examples/dimensionality_reduction.py @@ -414,9 +414,10 @@ def stick(kernel=None, optimize=True, verbose=True, plot=True): ax = m.plot_latent() y = m.Y[0, :] data_show = GPy.plotting.matplot_dep.visualize.stick_show(y[None, :], connect=data['connect']) - vis = GPy.plotting.matplot_dep.visualize.lvm(m.X[:1, :].copy(), m, data_show, latent_axes=ax) + lvm_visualizer = GPy.plotting.matplot_dep.visualize.lvm(m.X[:1, :].copy(), m, data_show, latent_axes=ax) raw_input('Press enter to finish') - + lvm_visualizer.close() + data_show.close() return m def bcgplvm_linear_stick(kernel=None, optimize=True, verbose=True, plot=True): @@ -515,9 +516,10 @@ def cmu_mocap(subject='35', motion=['01'], in_place=True, optimize=True, verbose ax = m.plot_latent() y = m.Y[0, :] data_show = GPy.plotting.matplot_dep.visualize.skeleton_show(y[None, :], data['skel']) - lvm_visualizer = GPy.plotting.matplot_dep.visualize.lvm(m.X[0, :].copy(), m, data_show, ax) + lvm_visualizer = GPy.plotting.matplot_dep.visualize.lvm(m.X[0].copy(), m, data_show, latent_axes=ax) raw_input('Press enter to finish') lvm_visualizer.close() + data_show.close() return m diff --git a/GPy/plotting/matplot_dep/visualize.py b/GPy/plotting/matplot_dep/visualize.py index 6abd3872..d7884730 100644 --- a/GPy/plotting/matplot_dep/visualize.py +++ b/GPy/plotting/matplot_dep/visualize.py @@ -102,7 +102,8 @@ class lvm(matplotlib_show): vals = param_to_array(model.X.mean) else: vals = param_to_array(model.X) - + if len(vals.shape)==1: + vals = vals[None,:] matplotlib_show.__init__(self, vals, axes=latent_axes) if isinstance(latent_axes,mpl.axes.Axes): @@ -393,14 +394,13 @@ class mocap_data_show_vpython(vpython_show): def process_values(self): raise NotImplementedError, "this needs to be implemented to use the data_show class" - class mocap_data_show(matplotlib_show): """Base class for visualizing motion capture data.""" def __init__(self, vals, axes=None, connect=None): if axes==None: fig = plt.figure() - axes = fig.add_subplot(111, projection='3d') + axes = fig.add_subplot(111, projection='3d',aspect='equal') matplotlib_show.__init__(self, vals, axes) self.connect = connect @@ -445,11 +445,12 @@ class mocap_data_show(matplotlib_show): def process_values(self): raise NotImplementedError, "this needs to be implemented to use the data_show class" - def 
initialize_axes(self): + def initialize_axes(self, boundary=0.05): """Set up the axes with the right limits and scaling.""" - self.x_lim = np.array([self.vals[:, 0].min(), self.vals[:, 0].max()]) - self.y_lim = np.array([self.vals[:, 1].min(), self.vals[:, 1].max()]) - self.z_lim = np.array([self.vals[:, 2].min(), self.vals[:, 2].max()]) + bs = [(self.vals[:, i].max()-self.vals[:, i].min())*boundary for i in xrange(3)] + self.x_lim = np.array([self.vals[:, 0].min()-bs[0], self.vals[:, 0].max()+bs[0]]) + self.y_lim = np.array([self.vals[:, 1].min()-bs[1], self.vals[:, 1].max()+bs[1]]) + self.z_lim = np.array([self.vals[:, 2].min()-bs[2], self.vals[:, 2].max()+bs[2]]) def initialize_axes_modify(self): self.points_handle.remove() @@ -472,6 +473,8 @@ class mocap_data_show(matplotlib_show): class stick_show(mocap_data_show): """Show a three dimensional point cloud as a figure. Connect elements of the figure together using the matrix connect.""" def __init__(self, vals, connect=None, axes=None): + if len(vals.shape)==1: + vals = vals[None,:] mocap_data_show.__init__(self, vals, axes=axes, connect=connect) def process_values(self):