diff --git a/GPy/core/__init__.py b/GPy/core/__init__.py
index a42d76ed..25651827 100644
--- a/GPy/core/__init__.py
+++ b/GPy/core/__init__.py
@@ -4,6 +4,7 @@
 from model import *
 from parameterization.parameterized import adjust_name_for_printing, Parameterizable
 from parameterization.param import Param, ParamConcatenation
+from parameterization.observable_array import ObsAr
 
 from gp import GP
 from sparse_gp import SparseGP
diff --git a/GPy/core/gp.py b/GPy/core/gp.py
index 490bcc72..692e5d01 100644
--- a/GPy/core/gp.py
+++ b/GPy/core/gp.py
@@ -121,7 +121,7 @@ class GP(Model):
            If full_cov and self.input_dim > 1, the return shape of var is Nnew x Nnew x self.input_dim. If self.input_dim == 1, the return shape is Nnew x Nnew.
            This is to allow for different normalizations of the output dimensions.
 
-        """
+        """        
         #predict the latent function values
         mu, var = self._raw_predict(Xnew, full_cov=full_cov)
 
diff --git a/GPy/core/parameterization/lists_and_dicts.py b/GPy/core/parameterization/lists_and_dicts.py
index 6902c249..604d0a01 100644
--- a/GPy/core/parameterization/lists_and_dicts.py
+++ b/GPy/core/parameterization/lists_and_dicts.py
@@ -5,6 +5,7 @@ Created on 27 Feb 2014
 '''
 
 from collections import defaultdict
+import weakref
 
 def intarray_default_factory():
     import numpy as np
@@ -41,60 +42,66 @@ class ObservablesList(object):
     def __init__(self):
         self._poc = []
 
-    def remove(self, value):
-        return self._poc.remove(value)
-
-
-    def __delitem__(self, ind):
-        return self._poc.__delitem__(ind)
-
-
-    def __setitem__(self, ind, item):
-        return self._poc.__setitem__(ind, item)
-
-
     def __getitem__(self, ind):
-        return self._poc.__getitem__(ind)
+        p,o,c = self._poc[ind]
+        return p, o(), c
 
+    def remove(self, priority, observable, callble):
+        """
+        """
+        self.flush()
+        for i in range(len(self) - 1, -1, -1):
+            p,o,c = self[i]
+            if priority==p and observable==o and callble==c:
+                del self._poc[i]
 
     def __repr__(self):
         return self._poc.__repr__()
 
-
-    def append(self, obj):
-        return self._poc.append(obj)
-
-
-    def index(self, value):
-        return self._poc.index(value)
-
-
-    def extend(self, iterable):
-        return self._poc.extend(iterable)
-
-
+    def add(self, priority, observable, callble):
+        ins = 0
+        for pr, _, _ in self:
+            if priority > pr:
+                break
+            ins += 1
+        self._poc.insert(ins, (priority, weakref.ref(observable), callble))
+        
     def __str__(self):
-        return self._poc.__str__()
+        ret = []
+        curr_p = None
+        for p, o, c in self:
+            curr = ''
+            if curr_p != p:
+                pre = "{!s}: ".format(p)
+                curr_pre = pre
+            else: curr_pre = " "*len(pre)
+            curr_p = p
+            curr += curr_pre
+            ret.append(curr + ", ".join(map(repr, [o,c])))
+        return '\n'.join(ret)
 
+    def flush(self):
+        self._poc = [(p,o,c) for p,o,c in self._poc if o() is not None]
 
     def __iter__(self):
-        return self._poc.__iter__()
-
-
-    def insert(self, index, obj):
-        return self._poc.insert(index, obj)
-
+        self.flush()
+        for p, o, c in self._poc:
+            if o() is not None:
+                yield p, o(), c 
 
     def __len__(self):
+        self.flush()
         return self._poc.__len__()
 
     def __deepcopy__(self, memo):
+        self.flush()
         s = ObservablesList()
         import copy
         s._poc = copy.deepcopy(self._poc, memo)
         return s
 
     def __getstate__(self):
+        self.flush()
         from ...util.caching import Cacher
         obs = []
         for p, o, c in self:
@@ -106,6 +113,6 @@ class ObservablesList(object):
     def __setstate__(self, state):
         self._poc = []
         for p, o, c in state:
-            self._poc.append((p,o,getattr(o, c)))
+            self.add(p,o,getattr(o, c))
 
     pass
diff --git a/GPy/core/parameterization/observable_array.py b/GPy/core/parameterization/observable_array.py
new file mode 100644
index 00000000..cd0c85d6
--- /dev/null
+++ b/GPy/core/parameterization/observable_array.py
@@ -0,0 +1,137 @@
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+__updated__ = '2014-03-31'
+
+import numpy as np
+from parameter_core import Observable, Pickleable
+
+class ObsAr(np.ndarray, Pickleable, Observable):
+    """
+    An ndarray which reports changes to its observers.
+    The observers can add themselves with a callable, which
+    will be called every time this array changes. The callable
+    takes exactly one argument, which is this array itself.
+    """
+    __array_priority__ = -1 # Never give back ObsAr
+    def __new__(cls, input_array, *a, **kw):
+        if not isinstance(input_array, ObsAr):
+            obj = np.atleast_1d(np.require(input_array, dtype=np.float64, requirements=['W', 'C'])).view(cls)
+        else: obj = input_array
+        #cls.__name__ = "ObsAr" # because of fixed printing of `array` in np printing
+        super(ObsAr, obj).__init__(*a, **kw)
+        return obj
+
+    def __array_finalize__(self, obj):
+        # see InfoArray.__array_finalize__ for comments
+        if obj is None: return
+        self.observers = getattr(obj, 'observers', None)
+
+    def __array_wrap__(self, out_arr, context=None):
+        return out_arr.view(np.ndarray)
+
+    def copy(self):
+        memo = {}
+        memo[id(self)] = self
+        return self.__deepcopy__(memo)
+
+    def __deepcopy__(self, memo):
+        s = self.__new__(self.__class__, input_array=self.view(np.ndarray).copy())
+        memo[id(self)] = s
+        import copy
+        s.__dict__.update(copy.deepcopy(self.__dict__, memo))
+        return s
+
+    def __reduce__(self):
+        func, args, state = super(ObsAr, self).__reduce__()
+        return func, args, (state, Pickleable.__getstate__(self))
+
+    def __setstate__(self, state):
+        np.ndarray.__setstate__(self, state[0])
+        Pickleable.__setstate__(self, state[1])
+
+    def __setitem__(self, s, val):
+        super(ObsAr, self).__setitem__(s, val)
+        self.notify_observers()
+
+    def __getslice__(self, start, stop):
+        return self.__getitem__(slice(start, stop))
+
+    def __setslice__(self, start, stop, val):
+        return self.__setitem__(slice(start, stop), val)
+
+    def __ilshift__(self, *args, **kwargs):
+        r = np.ndarray.__ilshift__(self, *args, **kwargs)
+        self.notify_observers()
+        return r
+
+    def __irshift__(self, *args, **kwargs):
+        r = np.ndarray.__irshift__(self, *args, **kwargs)
+        self.notify_observers()
+        return r
+
+
+    def __ixor__(self, *args, **kwargs):
+        r = np.ndarray.__ixor__(self, *args, **kwargs)
+        self.notify_observers()
+        return r
+
+
+    def __ipow__(self, *args, **kwargs):
+        r = np.ndarray.__ipow__(self, *args, **kwargs)
+        self.notify_observers()
+        return r
+
+
+    def __ifloordiv__(self, *args, **kwargs):
+        r = np.ndarray.__ifloordiv__(self, *args, **kwargs)
+        self.notify_observers()
+        return r
+
+
+    def __isub__(self, *args, **kwargs):
+        r = np.ndarray.__isub__(self, *args, **kwargs)
+        self.notify_observers()
+        return r
+
+
+    def __ior__(self, *args, **kwargs):
+        r = np.ndarray.__ior__(self, *args, **kwargs)
+        self.notify_observers()
+        return r
+
+
+    def __itruediv__(self, *args, **kwargs):
+        r = np.ndarray.__itruediv__(self, *args, **kwargs)
+        self.notify_observers()
+        return r
+
+
+    def __idiv__(self, *args, **kwargs):
+        r = np.ndarray.__idiv__(self, *args, **kwargs)
+        self.notify_observers()
+        return r
+
+
+    def __iand__(self, *args, **kwargs):
+        r = np.ndarray.__iand__(self, *args, **kwargs)
+        self.notify_observers()
+        return r
+
+
+    def __imod__(self, *args, **kwargs):
+        r = np.ndarray.__imod__(self, *args, **kwargs)
+        self.notify_observers()
+        return r
+
+
+    def __iadd__(self, *args, **kwargs):
+        r = np.ndarray.__iadd__(self, *args, **kwargs)
+        self.notify_observers()
+        return r
+
+
+    def __imul__(self, *args, **kwargs):
+        r = np.ndarray.__imul__(self, *args, **kwargs)
+        self.notify_observers()
+        return r
diff --git a/GPy/core/parameterization/param.py b/GPy/core/parameterization/param.py
index 60bdfe9d..9c3d7bd3 100644
--- a/GPy/core/parameterization/param.py
+++ b/GPy/core/parameterization/param.py
@@ -59,7 +59,7 @@ class Param(OptimizationHandlable, ObsAr):
         import pydot
         node = pydot.Node(id(self), shape='record', label=self.name)
         G.add_node(node)
-        for o in self._observer_callables_.keys():
+        for o in self.observers.keys():
             label = o.name if hasattr(o, 'name') else str(o)
             observed_node = pydot.Node(id(o), label=label)
             G.add_node(observed_node)
@@ -324,7 +324,7 @@ class ParamConcatenation(object):
         if update:
             self.update_all_params()
     def values(self):
-        return numpy.hstack([p.param_array for p in self.params])
+        return numpy.hstack([p.param_array.flat for p in self.params])
     #===========================================================================
     # parameter operations:
     #===========================================================================
diff --git a/GPy/core/parameterization/parameter_core.py b/GPy/core/parameterization/parameter_core.py
index 2dac9bf3..43bc7177 100644
--- a/GPy/core/parameterization/parameter_core.py
+++ b/GPy/core/parameterization/parameter_core.py
@@ -44,22 +44,23 @@ class Observable(object):
     def __init__(self, *args, **kwargs):
         super(Observable, self).__init__()
         from lists_and_dicts import ObservablesList
-        self._observer_callables_ = ObservablesList()
+        self.observers = ObservablesList()
 
     def add_observer(self, observer, callble, priority=0):
-        self._insert_sorted(priority, observer, callble)
+        self.observers.add(priority, observer, callble)
 
     def remove_observer(self, observer, callble=None):
         to_remove = []
-        for p, obs, clble in self._observer_callables_:
+        for poc in self.observers:
+            _, obs, clble = poc
             if callble is not None:
                 if (obs == observer) and (callble == clble):
-                    to_remove.append((p, obs, clble))
+                    to_remove.append(poc)
             else:
                 if obs is observer:
-                    to_remove.append((p, obs, clble))
+                    to_remove.append(poc)
         for r in to_remove:
-            self._observer_callables_.remove(r)
+            self.observers.remove(*r)
 
     def notify_observers(self, which=None, min_priority=None):
         """
@@ -74,21 +75,13 @@ class Observable(object):
         if which is None:
             which = self
         if min_priority is None:
-            [callble(self, which=which) for _, _, callble in self._observer_callables_]
+            [callble(self, which=which) for _, _, callble in self.observers]
         else:
-            for p, _, callble in self._observer_callables_:
+            for p, _, callble in self.observers:
                 if p <= min_priority:
                     break
                 callble(self, which=which)
 
-    def _insert_sorted(self, p, o, c):
-        ins = 0
-        for pr, _, _ in self._observer_callables_:
-            if p > pr:
-                break
-            ins += 1
-        self._observer_callables_.insert(ins, (p, o, c))
-
 #===============================================================================
 # Foundation framework for parameterized and param objects:
 #===============================================================================
@@ -192,7 +185,7 @@ class Pickleable(object):
 
     def __getstate__(self):
         ignore_list = ([#'_parent_', '_parent_index_',
-                        #'_observer_callables_',
+                        #'observers',
                         '_param_array_', '_gradient_array_', '_fixes_',
                         '_Cacher_wrap__cachers']
                        #+ self.parameter_names(recursive=False)
diff --git a/GPy/core/parameterization/parameterized.py b/GPy/core/parameterization/parameterized.py
index 75085ca2..a794ab40 100644
--- a/GPy/core/parameterization/parameterized.py
+++ b/GPy/core/parameterization/parameterized.py
@@ -90,7 +90,7 @@ class Parameterized(Parameterizable):
             child_node = child.build_pydot(G)
             G.add_edge(pydot.Edge(node, child_node))
 
-        for o in self._observer_callables_.keys():
+        for o in self.observers.keys():
             label = o.name if hasattr(o, 'name') else str(o)
             observed_node = pydot.Node(id(o), label=label)
             G.add_node(observed_node)
diff --git a/GPy/core/parameterization/variational.py b/GPy/core/parameterization/variational.py
index f8fd165f..3730baed 100644
--- a/GPy/core/parameterization/variational.py
+++ b/GPy/core/parameterization/variational.py
@@ -40,6 +40,7 @@ class SpikeAndSlabPrior(VariationalPrior):
         self.pi = Param('pi', pi, Logistic(1e-10,1.-1e-10))
         self.variance = Param('variance',variance)
         self.add_parameters(self.pi)
+        self.group_spike_prob = False
 
     def KL_divergence(self, variational_posterior):
         mu = variational_posterior.mean
@@ -55,7 +56,11 @@ class SpikeAndSlabPrior(VariationalPrior):
         S = variational_posterior.variance
         gamma = variational_posterior.binary_prob
 
-        gamma.gradient -= np.log((1-self.pi)/self.pi*gamma/(1.-gamma))+(np.square(mu)+S-np.log(S)-1.)/2.
+        if self.group_spike_prob:
+            gamma_grad = np.log((1-self.pi)/self.pi*gamma/(1.-gamma))+(np.square(mu)+S-np.log(S)-1.)/2.
+            gamma.gradient -= gamma_grad.mean(axis=0)
+        else:
+            gamma.gradient -= np.log((1-self.pi)/self.pi*gamma/(1.-gamma))+(np.square(mu)+S-np.log(S)-1.)/2.
         mu.gradient -= gamma*mu
         S.gradient -= (1. - (1. / (S))) * gamma /2.
         self.pi.gradient = (gamma/self.pi - (1.-gamma)/(1.-self.pi)).sum(axis=0)
diff --git a/GPy/examples/dimensionality_reduction.py b/GPy/examples/dimensionality_reduction.py
index 07623d6b..c1911e75 100644
--- a/GPy/examples/dimensionality_reduction.py
+++ b/GPy/examples/dimensionality_reduction.py
@@ -409,12 +409,12 @@ def stick(kernel=None, optimize=True, verbose=True, plot=True):
     # optimize
     m = GPy.models.GPLVM(data['Y'], 2, kernel=kernel)
     if optimize: m.optimize(messages=verbose, max_f_eval=10000)
-    if plot and GPy.plotting.matplot_dep.visualize.visual_available:
+    if plot:
         plt.clf
         ax = m.plot_latent()
-        y = m.likelihood.Y[0, :]
+        y = m.Y[0, :]
         data_show = GPy.plotting.matplot_dep.visualize.stick_show(y[None, :], connect=data['connect'])
-        GPy.plotting.matplot_dep.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
+        vis = GPy.plotting.matplot_dep.visualize.lvm(m.X[0, :].copy(), m, data_show, latent_axes=ax)
         raw_input('Press enter to finish')
 
     return m
diff --git a/GPy/inference/latent_function_inference/__init__.py b/GPy/inference/latent_function_inference/__init__.py
index ee459a76..effa077c 100644
--- a/GPy/inference/latent_function_inference/__init__.py
+++ b/GPy/inference/latent_function_inference/__init__.py
@@ -32,6 +32,7 @@ from expectation_propagation import EP
 from dtc import DTC
 from fitc import FITC
 from var_dtc_parallel import VarDTC_minibatch
+from var_dtc_gpu import VarDTC_GPU
 
 # class FullLatentFunctionData(object):
 #
diff --git a/GPy/inference/latent_function_inference/exact_gaussian_inference.py b/GPy/inference/latent_function_inference/exact_gaussian_inference.py
index 074b67a6..c0177e9f 100644
--- a/GPy/inference/latent_function_inference/exact_gaussian_inference.py
+++ b/GPy/inference/latent_function_inference/exact_gaussian_inference.py
@@ -32,7 +32,7 @@ class ExactGaussianInference(object):
             return Y
         else:
             #if Y in self.cache, return self.Cache[Y], else store Y in cache and return L.
-            print "WARNING: N>D of Y, we need caching of L, such that L*L^T = Y, returning Y still!"
+            #print "WARNING: N>D of Y, we need caching of L, such that L*L^T = Y, returning Y still!"
             return Y
 
     def inference(self, kern, X, likelihood, Y, Y_metadata=None):
diff --git a/GPy/inference/latent_function_inference/var_dtc_gpu.py b/GPy/inference/latent_function_inference/var_dtc_gpu.py
new file mode 100644
index 00000000..9b2da1c9
--- /dev/null
+++ b/GPy/inference/latent_function_inference/var_dtc_gpu.py
@@ -0,0 +1,545 @@
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+from posterior import Posterior
+from ...util.linalg import jitchol, backsub_both_sides, tdot, dtrtrs
+from ...util import diag
+from ...core.parameterization.variational import VariationalPosterior
+import numpy as np
+from ...util.misc import param_to_array
+log_2_pi = np.log(2*np.pi)
+
+from ...util import gpu_init
+
+try:
+    import scikits.cuda.linalg as culinalg
+    import pycuda.gpuarray as gpuarray
+    from scikits.cuda import cublas
+    from ...util.linalg_gpu import logDiagSum, strideSum, mul_bcast, sum_axis, outer_prod, mul_bcast_first, join_prod
+except:
+    pass
+
+class VarDTC_GPU(object):
+    """
+    An object for inference when the likelihood is Gaussian, but we want to do sparse inference.
+
+    The function self.inference returns a Posterior object, which summarizes
+    the posterior.
+
+    For efficiency, we sometimes work with the cholesky of Y*Y.T. To save repeatedly recomputing this, we cache it.
+
+    """
+    const_jitter = np.float64(1e-6)
+    def __init__(self, batchsize=None, gpu_memory=4., limit=1):
+        
+        self.batchsize = batchsize
+        self.gpu_memory = gpu_memory
+                
+        self.midRes = {}
+        self.batch_pos = 0 # the starting position of the current mini-batch
+        
+        self.cublas_handle = gpu_init.cublas_handle
+        
+        # Initialize GPU caches
+        self.gpuCache = None
+        
+    def _initGPUCache(self, kern, num_inducing, input_dim, output_dim, Y):
+        ndata = Y.shape[0]
+        if self.batchsize==None:
+            self.batchsize = self._estimateBatchSize(kern, ndata, num_inducing, input_dim, output_dim)
+        if self.gpuCache == None:
+            self.gpuCache = {# inference_likelihood
+                             'Kmm_gpu'              :gpuarray.empty((num_inducing,num_inducing),np.float64,order='F'),
+                             'Lm_gpu'               :gpuarray.empty((num_inducing,num_inducing),np.float64,order='F'),
+                             'ones_gpu'             :gpuarray.empty(num_inducing, np.float64,order='F'),
+                             'LL_gpu'               :gpuarray.empty((num_inducing,num_inducing),np.float64,order='F'),
+                             'b_gpu'                :gpuarray.empty((num_inducing,output_dim),np.float64,order='F'),
+                             'v_gpu'                :gpuarray.empty((num_inducing,output_dim),np.float64,order='F'),
+                             'vvt_gpu'              :gpuarray.empty((num_inducing,num_inducing),np.float64,order='F'),
+                             'KmmInvPsi2LLInvT_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64,order='F'),
+                             'KmmInvPsi2P_gpu'      :gpuarray.empty((num_inducing,num_inducing),np.float64,order='F'),
+                             'dL_dpsi2R_gpu'        :gpuarray.empty((num_inducing,num_inducing),np.float64,order='F'),
+                             'dL_dKmm_gpu'          :gpuarray.empty((num_inducing,num_inducing),np.float64,order='F'),
+                             'psi1Y_gpu'            :gpuarray.empty((num_inducing,output_dim),np.float64,order='F'),
+                             'psi2_gpu'             :gpuarray.empty((num_inducing,num_inducing),np.float64,order='F'),
+                             'beta_gpu'             :gpuarray.empty((ndata,),np.float64,order='F'),
+                             'YT_gpu'               :gpuarray.to_gpu(np.asfortranarray(Y.T)), # DxN
+                             'betaYT_gpu'           :gpuarray.empty(Y.T.shape,np.float64,order='F'), # DxN
+                             'psi2_t_gpu'           :gpuarray.empty((num_inducing*num_inducing*self.batchsize),np.float64,order='F'),
+                             # inference_minibatch
+                             'dL_dpsi0_gpu'         :gpuarray.empty((self.batchsize,),np.float64,order='F'),
+                             'dL_dpsi1_gpu'         :gpuarray.empty((self.batchsize,num_inducing),np.float64,order='F'),
+                             'dL_dpsi2_gpu'         :gpuarray.empty((self.batchsize,num_inducing,num_inducing),np.float64,order='F'),
+                             'dL_dthetaL_gpu'       :gpuarray.empty((self.batchsize,),np.float64,order='F'),
+                             'betapsi1_gpu'         :gpuarray.empty((self.batchsize,num_inducing),np.float64,order='F'),
+                             'thetaL_t_gpu'         :gpuarray.empty((self.batchsize,),np.float64,order='F'),
+                             'betaYT2_gpu'          :gpuarray.empty((output_dim,self.batchsize),np.float64,order='F'),
+                             'psi0p_gpu'            :gpuarray.empty((self.batchsize,),np.float64,order='F'),
+                             'psi1p_gpu'            :gpuarray.empty((self.batchsize,num_inducing),np.float64,order='F'),
+                             'psi2p_gpu'            :gpuarray.empty((self.batchsize,num_inducing,num_inducing),np.float64,order='F'),
+                             }
+            self.gpuCache['ones_gpu'].fill(1.0)
+            
+            YT_gpu = self.gpuCache['YT_gpu']
+            self._trYYT = cublas.cublasDdot(self.cublas_handle, YT_gpu.size, YT_gpu.gpudata, 1, YT_gpu.gpudata, 1)
+            
+    def _estimateMemoryOccupation(self, N, M, D):
+        """
+        Estimate the best batch size.
+        N - the number of total datapoints
+        M - the number of inducing points
+        D - the number of observed (output) dimensions
+        return: the constant memory size, the memory occupation of batchsize=1
+        unit: GB
+        """
+        return (M+9.*M*M+3*M*D+N+2.*N*D)*8./1024./1024./1024., (4.+3.*M+D+3.*M*M)*8./1024./1024./1024.
+    
+    def _estimateBatchSize(self, kern, N, M, Q, D):
+        """
+        Estimate the best batch size.
+        N - the number of total datapoints
+        M - the number of inducing points
+        D - the number of observed (output) dimensions
+        return: the constant memory size, the memory occupation of batchsize=1
+        unit: GB
+        """
+        if kern.useGPU:
+            x0,x1 = kern.psicomp.estimateMemoryOccupation(N,M,Q)
+        else:
+            x0, x1 = 0.,0.
+        y0, y1 = self._estimateMemoryOccupation(N, M, D)
+        
+        opt_batchsize = min(int((self.gpu_memory-y0-x0)/(x1+y1)), N)
+        
+        return opt_batchsize
+        
+    def _get_YYTfactor(self, Y):
+        """
+        find a matrix L which satisfies LLT = YYT.
+
+        Note that L may have fewer columns than Y.
+        """
+        N, D = Y.shape
+        if (N>=D):
+            return param_to_array(Y)
+        else:
+            return jitchol(tdot(Y))
+        
+    def inference_likelihood(self, kern, X, Z, likelihood, Y):
+        """
+        The first phase of inference:
+        Compute: log-likelihood, dL_dKmm
+        
+        Cached intermediate results: Kmm, KmmInv,
+        """
+        
+        num_inducing, input_dim = Z.shape[0], Z.shape[1]
+        num_data, output_dim = Y.shape
+        
+        self._initGPUCache(kern, num_inducing, input_dim, output_dim, Y)
+
+        if isinstance(X, VariationalPosterior):
+            uncertain_inputs = True
+        else:
+            uncertain_inputs = False
+        
+        #see whether we've got a different noise variance for each datum
+        beta = 1./np.fmax(likelihood.variance, 1e-6)
+        het_noise = beta.size > 1
+        trYYT = self._trYYT
+        
+        psi1Y_gpu = self.gpuCache['psi1Y_gpu']
+        psi2_gpu = self.gpuCache['psi2_gpu']
+        beta_gpu = self.gpuCache['beta_gpu']
+        YT_gpu = self.gpuCache['YT_gpu']
+        betaYT_gpu = self.gpuCache['betaYT_gpu']
+        psi2_t_gpu = self.gpuCache['psi2_t_gpu']
+        
+        if het_noise:
+            beta_gpu.set(np.asfortranarray(beta))
+            mul_bcast(betaYT_gpu,beta_gpu,YT_gpu,beta_gpu.size)
+            YRY_full = cublas.cublasDdot(self.cublas_handle, YT_gpu.size, betaYT_gpu.gpudata, 1, YT_gpu.gpudata, 1)
+        else:
+            beta_gpu.fill(beta)
+            betaYT_gpu.fill(0.)
+            cublas.cublasDaxpy(self.cublas_handle, betaYT_gpu.size, beta, YT_gpu.gpudata, 1, betaYT_gpu.gpudata, 1)
+            YRY_full = trYYT*beta
+
+        if kern.useGPU:
+            psi1Y_gpu.fill(0.)
+            psi2_gpu.fill(0.)
+            psi0_full = 0
+            
+            for n_start in xrange(0,num_data,self.batchsize):
+                n_end = min(self.batchsize+n_start, num_data)
+                ndata = n_end - n_start
+                X_slice = X[n_start:n_end]
+                beta_gpu_slice = beta_gpu[n_start:n_end]
+                betaYT_gpu_slice = betaYT_gpu[:,n_start:n_end]
+                if ndata==self.batchsize:
+                    psi2_t_gpu_slice = psi2_t_gpu
+                else:
+                    psi2_t_gpu_slice = psi2_t_gpu[:num_inducing*num_inducing*ndata]
+                if uncertain_inputs:
+                    psi0p_gpu = kern.psi0(Z, X_slice)
+                    psi1p_gpu = kern.psi1(Z, X_slice)
+                    psi2p_gpu = kern.psi2(Z, X_slice)
+                else:
+                    psi0p_gpu = kern.Kdiag(X_slice)
+                    psi1p_gpu = kern.K(X_slice, Z)
+
+                cublas.cublasDgemm(self.cublas_handle, 'T', 'T', num_inducing, output_dim, ndata, 1.0, psi1p_gpu.gpudata, ndata, betaYT_gpu_slice.gpudata, output_dim, 1.0, psi1Y_gpu.gpudata, num_inducing)
+                
+                if het_noise:
+                    psi0_full += cublas.cublasDdot(self.cublas_handle, psi0p_gpu.size, beta_gpu_slice.gpudata, 1, psi0p_gpu.gpudata, 1)
+                else:
+                    psi0_full += gpuarray.sum(psi0p_gpu).get()
+                                    
+                if uncertain_inputs:
+                    if het_noise:
+                        mul_bcast(psi2_t_gpu_slice,beta_gpu_slice,psi2p_gpu,beta_gpu_slice.size)
+                        sum_axis(psi2_gpu,psi2_t_gpu_slice,1,ndata)
+                    else:
+                        sum_axis(psi2_gpu,psi2p_gpu,1,ndata)
+                else:
+                    if het_noise:
+                        psi1_t_gpu = psi2_t_gpu_slice[:,num_inducing*ndata]
+                        mul_bcast(psi1_t_gpu,beta_gpu_slice,psi1p_gpu,beta_gpu_slice.size)
+                        cublas.cublasDgemm(self.cublas_handle, 'T', 'N', num_inducing, num_inducing, ndata, 1.0, psi1p_gpu.gpudata, ndata, psi1_t_gpu.gpudata, ndata, 1.0, psi2_gpu.gpudata, num_inducing)
+                    else:
+                        cublas.cublasDgemm(self.cublas_handle, 'T', 'N', num_inducing, num_inducing, ndata, beta, psi1p_gpu.gpudata, ndata, psi1p_gpu.gpudata, ndata, 1.0, psi2_gpu.gpudata, num_inducing)
+                    
+            if not het_noise:
+                psi0_full *= beta
+                if uncertain_inputs:
+                    cublas.cublasDscal(self.cublas_handle, psi2_gpu.size, beta, psi2_gpu.gpudata, 1)
+            
+        else:
+            psi2_full = np.zeros((num_inducing,num_inducing),order='F')
+            psi1Y_full = np.zeros((num_inducing,output_dim),order='F') # MxD
+            psi0_full = 0
+#             YRY_full = 0
+            
+            for n_start in xrange(0,num_data,self.batchsize):
+                n_end = min(self.batchsize+n_start, num_data)                
+                Y_slice = Y[n_start:n_end]
+                X_slice = X[n_start:n_end]
+                if uncertain_inputs:
+                    psi0 = kern.psi0(Z, X_slice)
+                    psi1 = kern.psi1(Z, X_slice)
+                    psi2 = kern.psi2(Z, X_slice)
+                else:
+                    psi0 = kern.Kdiag(X_slice)
+                    psi1 = kern.K(X_slice, Z)
+                    
+                if het_noise:
+                    beta_slice = beta[n_start:n_end]
+                    psi0_full += (beta_slice*psi0).sum()
+                    psi1Y_full += np.dot(psi1.T,beta_slice[:,None]*Y_slice) # MxD
+#                     YRY_full += (beta_slice*np.square(Y_slice).sum(axis=-1)).sum()
+                else:
+                    psi0_full += psi0.sum()
+                    psi1Y_full += np.dot(psi1.T,Y_slice) # MxD
+                                    
+                if uncertain_inputs:
+                    if het_noise:
+                        psi2_full += np.einsum('n,nmo->mo',beta_slice,psi2)
+                    else:
+                        psi2_full += psi2.sum(axis=0)
+                else:
+                    if het_noise:
+                        psi2_full += np.einsum('n,nm,no->mo',beta_slice,psi1,psi1)
+                    else:
+                        psi2_full += tdot(psi1.T)
+                    
+            if not het_noise:
+                psi0_full *= beta
+                psi1Y_full *= beta
+                psi2_full *= beta
+#                 YRY_full = trYYT*beta
+            
+            psi1Y_gpu.set(psi1Y_full)
+            psi2_gpu.set(psi2_full)
+        
+        #======================================================================
+        # Compute Common Components
+        #======================================================================
+        
+        Kmm = kern.K(Z).copy()
+        Kmm_gpu = self.gpuCache['Kmm_gpu']
+        Kmm_gpu.set(np.asfortranarray(Kmm))
+        diag.add(Kmm, self.const_jitter)
+        ones_gpu = self.gpuCache['ones_gpu']
+        cublas.cublasDaxpy(self.cublas_handle, num_inducing, self.const_jitter, ones_gpu.gpudata, 1, Kmm_gpu.gpudata, num_inducing+1)
+#         assert np.allclose(Kmm, Kmm_gpu.get())
+        
+#         Lm = jitchol(Kmm)
+        #
+        Lm_gpu = self.gpuCache['Lm_gpu']
+        cublas.cublasDcopy(self.cublas_handle, Kmm_gpu.size, Kmm_gpu.gpudata, 1, Lm_gpu.gpudata, 1)
+        culinalg.cho_factor(Lm_gpu,'L')
+#         print np.abs(np.tril(Lm)-np.tril(Lm_gpu.get())).max()
+                
+#         Lambda = Kmm+psi2_full
+#         LL = jitchol(Lambda)
+        #
+        Lambda_gpu = self.gpuCache['LL_gpu']
+        cublas.cublasDcopy(self.cublas_handle, Kmm_gpu.size, Kmm_gpu.gpudata, 1, Lambda_gpu.gpudata, 1)
+        cublas.cublasDaxpy(self.cublas_handle, psi2_gpu.size, np.float64(1.0), psi2_gpu.gpudata, 1, Lambda_gpu.gpudata, 1)
+        LL_gpu = Lambda_gpu
+        culinalg.cho_factor(LL_gpu,'L')
+#         print np.abs(np.tril(LL)-np.tril(LL_gpu.get())).max()
+        
+#         b,_ = dtrtrs(LL, psi1Y_full)
+#         bbt_cpu = np.square(b).sum()
+        #
+        b_gpu = self.gpuCache['b_gpu']
+        cublas.cublasDcopy(self.cublas_handle, b_gpu.size, psi1Y_gpu.gpudata, 1, b_gpu.gpudata, 1)
+        cublas.cublasDtrsm(self.cublas_handle , 'L', 'L', 'N', 'N', num_inducing, output_dim, np.float64(1.0), LL_gpu.gpudata, num_inducing, b_gpu.gpudata, num_inducing)
+        bbt = cublas.cublasDdot(self.cublas_handle, b_gpu.size, b_gpu.gpudata, 1, b_gpu.gpudata, 1)
+#         print np.abs(bbt-bbt_cpu)
+        
+#         v,_ = dtrtrs(LL.T,b,lower=False)
+#         vvt = np.einsum('md,od->mo',v,v)
+#         LmInvPsi2LmInvT = backsub_both_sides(Lm,psi2_full,transpose='right')
+        #
+        v_gpu = self.gpuCache['v_gpu']
+        cublas.cublasDcopy(self.cublas_handle, v_gpu.size, b_gpu.gpudata, 1, v_gpu.gpudata, 1)
+        cublas.cublasDtrsm(self.cublas_handle , 'L', 'L', 'T', 'N', num_inducing, output_dim, np.float64(1.0), LL_gpu.gpudata, num_inducing, v_gpu.gpudata, num_inducing)
+        vvt_gpu = self.gpuCache['vvt_gpu']
+        cublas.cublasDgemm(self.cublas_handle, 'N', 'T', num_inducing, num_inducing, output_dim, np.float64(1.0), v_gpu.gpudata, num_inducing, v_gpu.gpudata, num_inducing, np.float64(0.), vvt_gpu.gpudata, num_inducing)
+        LmInvPsi2LmInvT_gpu = self.gpuCache['KmmInvPsi2LLInvT_gpu']
+        cublas.cublasDcopy(self.cublas_handle, psi2_gpu.size, psi2_gpu.gpudata, 1, LmInvPsi2LmInvT_gpu.gpudata, 1)
+        cublas.cublasDtrsm(self.cublas_handle , 'L', 'L', 'N', 'N', num_inducing, num_inducing, np.float64(1.0), Lm_gpu.gpudata, num_inducing, LmInvPsi2LmInvT_gpu.gpudata, num_inducing)
+        cublas.cublasDtrsm(self.cublas_handle , 'r', 'L', 'T', 'N', num_inducing, num_inducing, np.float64(1.0), Lm_gpu.gpudata, num_inducing, LmInvPsi2LmInvT_gpu.gpudata, num_inducing)
+        #tr_LmInvPsi2LmInvT = cublas.cublasDasum(self.cublas_handle, num_inducing, LmInvPsi2LmInvT_gpu.gpudata, num_inducing+1)
+        tr_LmInvPsi2LmInvT = float(strideSum(LmInvPsi2LmInvT_gpu, num_inducing+1).get())
+#         print np.abs(vvt-vvt_gpu.get()).max()
+#         print np.abs(np.trace(LmInvPsi2LmInvT)-tr_LmInvPsi2LmInvT)
+        
+#         Psi2LLInvT = dtrtrs(LL,psi2_full)[0].T
+#         LmInvPsi2LLInvT= dtrtrs(Lm,Psi2LLInvT)[0]
+#         KmmInvPsi2LLInvT = dtrtrs(Lm,LmInvPsi2LLInvT,trans=True)[0]
+#         KmmInvPsi2P = dtrtrs(LL,KmmInvPsi2LLInvT.T, trans=True)[0].T
+        #
+        KmmInvPsi2LLInvT_gpu = LmInvPsi2LmInvT_gpu # Reuse GPU memory (size:MxM)
+        cublas.cublasDcopy(self.cublas_handle, psi2_gpu.size, psi2_gpu.gpudata, 1, KmmInvPsi2LLInvT_gpu.gpudata, 1)
+        cublas.cublasDtrsm(self.cublas_handle , 'L', 'L', 'N', 'N', num_inducing, num_inducing, np.float64(1.0), Lm_gpu.gpudata, num_inducing, KmmInvPsi2LLInvT_gpu.gpudata, num_inducing)
+        cublas.cublasDtrsm(self.cublas_handle , 'r', 'L', 'T', 'N', num_inducing, num_inducing, np.float64(1.0), LL_gpu.gpudata, num_inducing, KmmInvPsi2LLInvT_gpu.gpudata, num_inducing)
+        cublas.cublasDtrsm(self.cublas_handle , 'L', 'L', 'T', 'N', num_inducing, num_inducing, np.float64(1.0), Lm_gpu.gpudata, num_inducing, KmmInvPsi2LLInvT_gpu.gpudata, num_inducing)
+        KmmInvPsi2P_gpu = self.gpuCache['KmmInvPsi2P_gpu']
+        cublas.cublasDcopy(self.cublas_handle, KmmInvPsi2LLInvT_gpu.size, KmmInvPsi2LLInvT_gpu.gpudata, 1, KmmInvPsi2P_gpu.gpudata, 1)
+        cublas.cublasDtrsm(self.cublas_handle , 'r', 'L', 'N', 'N', num_inducing, num_inducing, np.float64(1.0), LL_gpu.gpudata, num_inducing, KmmInvPsi2P_gpu.gpudata, num_inducing)
+#         print np.abs(KmmInvPsi2P-KmmInvPsi2P_gpu.get()).max()
+        
+#         dL_dpsi2R = (output_dim*KmmInvPsi2P - vvt)/2. # dL_dpsi2 with R inside psi2
+        #
+        dL_dpsi2R_gpu = self.gpuCache['dL_dpsi2R_gpu']
+        cublas.cublasDcopy(self.cublas_handle, vvt_gpu.size, vvt_gpu.gpudata, 1, dL_dpsi2R_gpu.gpudata, 1)
+        cublas.cublasDaxpy(self.cublas_handle, KmmInvPsi2P_gpu.size, np.float64(-output_dim), KmmInvPsi2P_gpu.gpudata, 1, dL_dpsi2R_gpu.gpudata, 1)
+        cublas.cublasDscal(self.cublas_handle, dL_dpsi2R_gpu.size, np.float64(-0.5), dL_dpsi2R_gpu.gpudata, 1)
+#         print np.abs(dL_dpsi2R_gpu.get()-dL_dpsi2R).max()
+                        
+        #======================================================================
+        # Compute log-likelihood
+        #======================================================================
+        if het_noise:
+            logL_R = -np.log(beta).sum()
+        else:
+            logL_R = -num_data*np.log(beta)
+#         logL_old = -(output_dim*(num_data*log_2_pi+logL_R+psi0_full-np.trace(LmInvPsi2LmInvT))+YRY_full-bbt)/2.-output_dim*(-np.log(np.diag(Lm)).sum()+np.log(np.diag(LL)).sum())
+        
+        logdetKmm = float(logDiagSum(Lm_gpu,num_inducing+1).get())
+        logdetLambda = float(logDiagSum(LL_gpu,num_inducing+1).get())
+        logL = -(output_dim*(num_data*log_2_pi+logL_R+psi0_full-tr_LmInvPsi2LmInvT)+YRY_full-bbt)/2.+output_dim*(logdetKmm-logdetLambda)
+#         print np.abs(logL_old - logL)
+
+        #======================================================================
+        # Compute dL_dKmm
+        #======================================================================
+        
+#         dL_dKmm =  -(output_dim*np.einsum('md,od->mo',KmmInvPsi2LLInvT,KmmInvPsi2LLInvT) + vvt)/2.
+        #
+        dL_dKmm_gpu = self.gpuCache['dL_dKmm_gpu']
+        cublas.cublasDgemm(self.cublas_handle, 'N', 'T', num_inducing, num_inducing, num_inducing, np.float64(1.0), KmmInvPsi2LLInvT_gpu.gpudata, num_inducing, KmmInvPsi2LLInvT_gpu.gpudata, num_inducing, np.float64(0.), dL_dKmm_gpu.gpudata, num_inducing)
+        cublas.cublasDaxpy(self.cublas_handle, dL_dKmm_gpu.size, np.float64(1./output_dim), vvt_gpu.gpudata, 1, dL_dKmm_gpu.gpudata, 1)
+        cublas.cublasDscal(self.cublas_handle, dL_dKmm_gpu.size, np.float64(-output_dim/2.), dL_dKmm_gpu.gpudata, 1)
+#         print np.abs(dL_dKmm - dL_dKmm_gpu.get()).max()
+
+        #======================================================================
+        # Compute the Posterior distribution of inducing points p(u|Y)
+        #======================================================================
+                
+        post = Posterior(woodbury_inv=KmmInvPsi2P_gpu.get(), woodbury_vector=v_gpu.get(), K=Kmm_gpu.get(), mean=None, cov=None, K_chol=Lm_gpu.get())
+
+        return logL, dL_dKmm_gpu.get(), post
+
+    def inference_minibatch(self, kern, X, Z, likelihood, Y):
+        """
+        The second phase of inference: Computing the derivatives over a minibatch of Y 
+        Compute: dL_dpsi0, dL_dpsi1, dL_dpsi2, dL_dthetaL
+        return a flag showing whether it reached the end of Y (isEnd)
+        """
+
+        num_data, output_dim = Y.shape
+        num_inducing = Z.shape[0]
+
+        if isinstance(X, VariationalPosterior):
+            uncertain_inputs = True
+        else:
+            uncertain_inputs = False
+        
+        beta = 1./np.fmax(likelihood.variance, 1e-6)
+        het_noise = beta.size > 1
+        
+        n_start = self.batch_pos
+        n_end = min(self.batchsize+n_start, num_data)
+        if n_end==num_data:
+            isEnd = True
+            self.batch_pos = 0
+        else:
+            isEnd = False
+            self.batch_pos = n_end
+        
+        nSlice = n_end-n_start
+        X_slice = X[n_start:n_end]
+        
+        if kern.useGPU:
+            if uncertain_inputs:
+                psi0p_gpu = kern.psi0(Z, X_slice)
+                psi1p_gpu = kern.psi1(Z, X_slice)
+                psi2p_gpu = kern.psi2(Z, X_slice)
+            else:
+                psi0p_gpu = kern.Kdiag(X_slice)
+                psi1p_gpu = kern.K(X_slice, Z)
+                psi2p_gpu = self.gpuCache['psi2p_gpu']
+                if psi2p_gpu.shape[0] > nSlice:
+                    psi2p_gpu = psi2p_gpu.ravel()[:nSlice*num_inducing*num_inducing].reshape(nSlice,num_inducing,num_inducing)
+        else:
+            if uncertain_inputs:
+                psi0 = kern.psi0(Z, X_slice)
+                psi1 = kern.psi1(Z, X_slice)
+                psi2 = kern.psi2(Z, X_slice)
+            else:
+                psi0 = kern.Kdiag(X_slice)
+                psi1 = kern.K(X_slice, Z)
+
+            psi0p_gpu = self.gpuCache['psi0p_gpu']
+            psi1p_gpu = self.gpuCache['psi1p_gpu']
+            psi2p_gpu = self.gpuCache['psi2p_gpu']
+            if psi0p_gpu.shape[0] > nSlice:
+                psi0p_gpu = psi0p_gpu[:nSlice]
+                psi1p_gpu = psi1p_gpu.ravel()[:nSlice*num_inducing].reshape(nSlice,num_inducing)
+                psi2p_gpu = psi2p_gpu.ravel()[:nSlice*num_inducing*num_inducing].reshape(nSlice,num_inducing,num_inducing)
+            psi0p_gpu.set(np.asfortranarray(psi0))
+            psi1p_gpu.set(np.asfortranarray(psi1))
+            if uncertain_inputs:
+                psi2p_gpu.set(np.asfortranarray(psi2))
+                
+        #======================================================================
+        # Prepare gpu memory
+        #======================================================================
+        
+        dL_dpsi2R_gpu = self.gpuCache['dL_dpsi2R_gpu']
+        v_gpu = self.gpuCache['v_gpu']        
+        betaYT_gpu = self.gpuCache['betaYT_gpu']
+        beta_gpu = self.gpuCache['beta_gpu']
+        dL_dpsi0_gpu = self.gpuCache['dL_dpsi0_gpu']
+        dL_dpsi1_gpu = self.gpuCache['dL_dpsi1_gpu']
+        dL_dpsi2_gpu = self.gpuCache['dL_dpsi2_gpu']
+        dL_dthetaL_gpu = self.gpuCache['dL_dthetaL_gpu']
+        psi2R_gpu = self.gpuCache['psi2_t_gpu'][:nSlice*num_inducing*num_inducing].reshape(nSlice,num_inducing,num_inducing)
+        betapsi1_gpu = self.gpuCache['betapsi1_gpu']
+        thetaL_t_gpu = self.gpuCache['thetaL_t_gpu']
+        betaYT2_gpu = self.gpuCache['betaYT2_gpu']
+        
+        betaYT_gpu_slice = betaYT_gpu[:,n_start:n_end]
+        beta_gpu_slice = beta_gpu[n_start:n_end]
+
+        # Adjust to the batch size
+        if dL_dpsi0_gpu.shape[0] > nSlice:
+            betaYT2_gpu = betaYT2_gpu[:,:nSlice]
+            dL_dpsi0_gpu = dL_dpsi0_gpu.ravel()[:nSlice]
+            dL_dpsi1_gpu = dL_dpsi1_gpu.ravel()[:nSlice*num_inducing].reshape(nSlice,num_inducing)
+            dL_dpsi2_gpu = dL_dpsi2_gpu.ravel()[:nSlice*num_inducing*num_inducing].reshape(nSlice,num_inducing,num_inducing)
+            dL_dthetaL_gpu = dL_dthetaL_gpu.ravel()[:nSlice]
+            psi2R_gpu = psi2R_gpu.ravel()[:nSlice*num_inducing*num_inducing].reshape(nSlice,num_inducing,num_inducing)
+            thetaL_t_gpu = thetaL_t_gpu.ravel()[:nSlice]
+            betapsi1_gpu = betapsi1_gpu.ravel()[:nSlice*num_inducing].reshape(nSlice,num_inducing)
+        
+        mul_bcast(betapsi1_gpu,beta_gpu_slice,psi1p_gpu,beta_gpu_slice.size)
+
+        #======================================================================
+        # Compute dL_dpsi
+        #======================================================================
+        
+        dL_dpsi0_gpu.fill(0.)
+        cublas.cublasDaxpy(self.cublas_handle, dL_dpsi0_gpu.size, output_dim/(-2.), beta_gpu_slice.gpudata, 1, dL_dpsi0_gpu.gpudata, 1)
+        
+        cublas.cublasDgemm(self.cublas_handle, 'T', 'T', nSlice, num_inducing, output_dim, 1.0, betaYT_gpu_slice.gpudata, output_dim, v_gpu.gpudata, num_inducing, 0., dL_dpsi1_gpu.gpudata, nSlice)
+        
+        if uncertain_inputs:
+            outer_prod(dL_dpsi2_gpu,beta_gpu_slice,dL_dpsi2R_gpu,beta_gpu_slice.size)
+        else:
+            cublas.cublasDgemm(self.cublas_handle, 'N', 'N', nSlice, num_inducing, output_dim, 1.0, betapsi1_gpu.gpudata, nSlice, dL_dpsi2R_gpu.gpudata, num_inducing, 1.0, dL_dpsi1_gpu.gpudata, nSlice)
+            
+        #======================================================================
+        # Compute dL_dthetaL
+        #======================================================================
+        
+        if not uncertain_inputs:
+            join_prod(psi2p_gpu,psi1p_gpu,psi1p_gpu,nSlice,num_inducing)
+
+        mul_bcast_first(psi2R_gpu,dL_dpsi2R_gpu,psi2p_gpu,nSlice)
+        
+
+        dL_dthetaL_gpu.fill(0.)
+        
+        cublas.cublasDcopy(self.cublas_handle, betaYT_gpu_slice.size, betaYT_gpu_slice.gpudata, 1, betaYT2_gpu.gpudata, 1)
+        mul_bcast(betaYT2_gpu,betaYT2_gpu,betaYT2_gpu,betaYT2_gpu.size)
+        cublas.cublasDscal(self.cublas_handle, betaYT2_gpu.size, 0.5, betaYT2_gpu.gpudata, 1)
+        sum_axis(dL_dthetaL_gpu, betaYT2_gpu, 1, output_dim)
+        
+        cublas.cublasDaxpy(self.cublas_handle, dL_dthetaL_gpu.size, output_dim/(-2.0), beta_gpu_slice.gpudata, 1, dL_dthetaL_gpu.gpudata, 1)
+        cublas.cublasDcopy(self.cublas_handle, beta_gpu_slice.size, beta_gpu_slice.gpudata, 1, thetaL_t_gpu.gpudata, 1)
+        mul_bcast(thetaL_t_gpu,thetaL_t_gpu,thetaL_t_gpu,thetaL_t_gpu.size)
+        mul_bcast(thetaL_t_gpu,thetaL_t_gpu,psi0p_gpu,thetaL_t_gpu.size)
+        cublas.cublasDaxpy(self.cublas_handle, dL_dthetaL_gpu.size, output_dim/2.0, thetaL_t_gpu.gpudata, 1, dL_dthetaL_gpu.gpudata, 1)
+        
+        thetaL_t_gpu.fill(0.)
+        sum_axis(thetaL_t_gpu, psi2R_gpu, nSlice, num_inducing*num_inducing)
+        mul_bcast(thetaL_t_gpu,thetaL_t_gpu,beta_gpu_slice,thetaL_t_gpu.size)
+        mul_bcast(thetaL_t_gpu,thetaL_t_gpu,beta_gpu_slice,thetaL_t_gpu.size)
+        cublas.cublasDaxpy(self.cublas_handle, dL_dthetaL_gpu.size, -1.0, thetaL_t_gpu.gpudata, 1, dL_dthetaL_gpu.gpudata, 1)
+        
+        cublas.cublasDgemm(self.cublas_handle, 'T', 'T', output_dim, nSlice, num_inducing, -1.0, v_gpu.gpudata, num_inducing, betapsi1_gpu.gpudata, nSlice, 0.0, betaYT2_gpu.gpudata, output_dim)
+        mul_bcast(betaYT2_gpu,betaYT2_gpu,betaYT_gpu_slice,betaYT2_gpu.size)
+        sum_axis(dL_dthetaL_gpu, betaYT2_gpu, 1, output_dim)
+
+        if kern.useGPU:
+            dL_dpsi0 = dL_dpsi0_gpu
+            dL_dpsi1 = dL_dpsi1_gpu
+        else:
+            dL_dpsi0 = dL_dpsi0_gpu.get()
+            dL_dpsi1 = dL_dpsi1_gpu.get()            
+        if uncertain_inputs:
+            if kern.useGPU:
+                dL_dpsi2 = dL_dpsi2_gpu
+            else:
+                dL_dpsi2 = dL_dpsi2_gpu.get()
+        if het_noise:
+            dL_dthetaL = dL_dthetaL_gpu.get()
+        else:
+            dL_dthetaL = gpuarray.sum(dL_dthetaL_gpu).get()
+        if uncertain_inputs:
+            grad_dict = {'dL_dpsi0':dL_dpsi0,
+                         'dL_dpsi1':dL_dpsi1,
+                         'dL_dpsi2':dL_dpsi2,
+                         'dL_dthetaL':dL_dthetaL}
+        else:
+            grad_dict = {'dL_dKdiag':dL_dpsi0,
+                         'dL_dKnm':dL_dpsi1,
+                         'dL_dthetaL':dL_dthetaL}
+            
+        return isEnd, (n_start,n_end), grad_dict
+    
diff --git a/GPy/inference/latent_function_inference/var_dtc_parallel.py b/GPy/inference/latent_function_inference/var_dtc_parallel.py
index bb69b88d..87236e2a 100644
--- a/GPy/inference/latent_function_inference/var_dtc_parallel.py
+++ b/GPy/inference/latent_function_inference/var_dtc_parallel.py
@@ -279,4 +279,55 @@ class VarDTC_minibatch(object):
                          'dL_dthetaL':dL_dthetaL}
             
         return isEnd, (n_start,n_end), grad_dict
+
+
+def update_gradients(model):
+    model._log_marginal_likelihood, dL_dKmm, model.posterior = model.inference_method.inference_likelihood(model.kern, model.X, model.Z, model.likelihood, model.Y)
     
+    het_noise = model.likelihood.variance.size > 1
+    
+    if het_noise:
+        dL_dthetaL = np.empty((model.Y.shape[0],))
+    else:
+        dL_dthetaL = 0
+
+    #gradients w.r.t. kernel
+    model.kern.update_gradients_full(dL_dKmm, model.Z, None)
+    kern_grad = model.kern.gradient.copy()
+            
+    #gradients w.r.t. Z
+    model.Z.gradient[:,model.kern.active_dims] = model.kern.gradients_X(dL_dKmm, model.Z)
+    
+    isEnd = False
+    while not isEnd:
+        isEnd, n_range, grad_dict = model.inference_method.inference_minibatch(model.kern, model.X, model.Z, model.likelihood, model.Y)
+        if isinstance(model.X, VariationalPosterior):
+            X_slice = model.X[n_range[0]:n_range[1]]
+            
+            #gradients w.r.t. kernel
+            model.kern.update_gradients_expectations(variational_posterior=X_slice, Z=model.Z, dL_dpsi0=grad_dict['dL_dpsi0'], dL_dpsi1=grad_dict['dL_dpsi1'], dL_dpsi2=grad_dict['dL_dpsi2'])
+            kern_grad += model.kern.gradient
+    
+            #gradients w.r.t. Z
+            model.Z.gradient[:,model.kern.active_dims] += model.kern.gradients_Z_expectations(
+                               grad_dict['dL_dpsi1'], grad_dict['dL_dpsi2'], Z=model.Z, variational_posterior=X_slice)
+        
+            #gradients w.r.t. posterior parameters of X
+            X_grad = model.kern.gradients_qX_expectations(variational_posterior=X_slice, Z=model.Z, dL_dpsi0=grad_dict['dL_dpsi0'], dL_dpsi1=grad_dict['dL_dpsi1'], dL_dpsi2=grad_dict['dL_dpsi2'])
+            model.set_X_gradients(X_slice, X_grad)
+                
+            if het_noise:
+                dL_dthetaL[n_range[0]:n_range[1]] = grad_dict['dL_dthetaL']
+            else:
+                dL_dthetaL += grad_dict['dL_dthetaL']
+    
+    # Set the gradients w.r.t. kernel
+    model.kern.gradient = kern_grad
+
+    # Update Log-likelihood
+    model._log_marginal_likelihood -= model.variational_prior.KL_divergence(model.X)
+    # update for the KL divergence
+    model.variational_prior.update_gradients_KL(model.X)
+    
+    # dL_dthetaL
+    model.likelihood.update_gradients(dL_dthetaL)
diff --git a/GPy/kern/_src/kern.py b/GPy/kern/_src/kern.py
index de99bddb..f871e676 100644
--- a/GPy/kern/_src/kern.py
+++ b/GPy/kern/_src/kern.py
@@ -13,9 +13,10 @@ class Kern(Parameterized):
     #===========================================================================
     # This adds input slice support. The rather ugly code for slicing can be
     # found in kernel_slice_operations
-    __metaclass__ = KernCallsViaSlicerMeta
+    #__metaclass__ = KernCallsViaSlicerMeta
     #===========================================================================
-    def __init__(self, input_dim, active_dims, name, *a, **kw):
+    _support_GPU=False
+    def __init__(self, input_dim, active_dims, name, useGPU=False, *a, **kw):
         """
         The base class for a kernel: a positive definite function
         which forms of a covariance function (kernel).
@@ -39,6 +40,8 @@ class Kern(Parameterized):
             active_dim_size = len(self.active_dims)
         assert active_dim_size == self.input_dim, "input_dim={} does not match len(active_dim)={}, active_dims={}".format(self.input_dim, active_dim_size, self.active_dims)
         self._sliced_X = 0
+        
+        self.useGPU = self._support_GPU and useGPU
 
     @Cache_this(limit=10)
     def _slice_X(self, X):
diff --git a/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py b/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py
new file mode 100644
index 00000000..f49dc52a
--- /dev/null
+++ b/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py
@@ -0,0 +1,535 @@
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+"""
+The package for the psi statistics computation on GPU
+"""
+
+import numpy as np
+from GPy.util.caching import Cache_this
+
+from ....util import gpu_init
+
+try:
+    import pycuda.gpuarray as gpuarray
+    from scikits.cuda import cublas
+    from pycuda.reduction import ReductionKernel    
+    from pycuda.elementwise import ElementwiseKernel
+    from ....util import linalg_gpu
+    
+    
+    # The kernel form computing psi1 het_noise
+    comp_psi1 = ElementwiseKernel(
+        "double *psi1, double var, double *l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi1denom, int N, int M, int Q",
+        "psi1[i] = comp_psi1_element(var, l, Z, mu, S, logGamma, log1Gamma, logpsi1denom, N, M, Q, i)",
+        "comp_psi1",
+        preamble="""
+        #define IDX_NMQ(n,m,q) ((q*M+m)*N+n)
+        #define IDX_NQ(n,q) (q*N+n)
+        #define IDX_MQ(m,q) (q*M+m)
+        #define LOGEXPSUM(a,b) (a>=b?a+log(1.0+exp(b-a)):b+log(1.0+exp(a-b)))
+        
+        __device__ double comp_psi1_element(double var, double *l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi1denom, int N, int M, int Q, int idx)
+        {
+            int n = idx%N;
+            int m = idx/N;
+            double psi1_exp=0;
+            for(int q=0;q<Q;q++){
+                double muZ = mu[IDX_NQ(n,q)]-Z[IDX_MQ(m,q)];
+                double exp1 = logGamma[IDX_NQ(n,q)] - (logpsi1denom[IDX_NQ(n,q)] + muZ*muZ/(S[IDX_NQ(n,q)]+l[q]) )/2.0;
+                double exp2 = log1Gamma[IDX_NQ(n,q)] - Z[IDX_MQ(m,q)]*Z[IDX_MQ(m,q)]/(l[q]*2.0);
+                psi1_exp += LOGEXPSUM(exp1,exp2);
+            }
+            return var*exp(psi1_exp);
+        }
+        """)
+        
+    # The kernel form computing psi2 het_noise
+    comp_psi2 = ElementwiseKernel(
+        "double *psi2, double var, double *l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi2denom, int N, int M, int Q",
+        "psi2[i] = comp_psi2_element(var, l, Z, mu, S, logGamma, log1Gamma, logpsi2denom, N, M, Q, i)",
+        "comp_psi2",
+        preamble="""
+        #define IDX_NMQ(n,m,q) ((q*M+m)*N+n)
+        #define IDX_NQ(n,q) (q*N+n)
+        #define IDX_MQ(m,q) (q*M+m)
+        #define LOGEXPSUM(a,b) (a>=b?a+log(1.0+exp(b-a)):b+log(1.0+exp(a-b)))
+        
+        __device__ double comp_psi2_element(double var, double *l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi2denom, int N, int M, int Q, int idx)
+        {
+            // psi2 (n,m1,m2)
+            int m2 = idx/(M*N);
+            int m1 = (idx%(M*N))/N;
+            int n = idx%N;
+
+            double psi2_exp=0;
+            for(int q=0;q<Q;q++){ 
+                double dZ = Z[IDX_MQ(m1,q)]-Z[IDX_MQ(m2,q)];
+                double muZ = mu[IDX_NQ(n,q)] - (Z[IDX_MQ(m1,q)]+Z[IDX_MQ(m2,q)])/2.0;
+                double exp1 = logGamma[IDX_NQ(n,q)] - (logpsi2denom[IDX_NQ(n,q)])/2.0 - dZ*dZ/(l[q]*4.0) - muZ*muZ/(2*S[IDX_NQ(n,q)]+l[q]);
+                double exp2 = log1Gamma[IDX_NQ(n,q)] - (Z[IDX_MQ(m1,q)]*Z[IDX_MQ(m1,q)]+Z[IDX_MQ(m2,q)]*Z[IDX_MQ(m2,q)])/(l[q]*2.0);
+                psi2_exp += LOGEXPSUM(exp1,exp2);
+            }
+            return var*var*exp(psi2_exp);
+        }
+        """) 
+    
+    # compute psidenom
+    comp_logpsidenom = ElementwiseKernel(
+        "double *out, double *S, double *l, double scale, int N",
+        "out[i] = comp_logpsidenom_element(S, l, scale, N, i)",
+        "comp_logpsidenom",
+        preamble="""        
+        __device__ double comp_logpsidenom_element(double *S, double *l, double scale, int N, int idx)
+        {
+            int q = idx/N;
+            
+            return log(scale*S[idx]/l[q]+1.0);
+        }
+        """)
+    
+    # The kernel form computing psi1 het_noise
+    comp_dpsi1_dvar = ElementwiseKernel(
+        "double *dpsi1_dvar, double *psi1_neq, double *psi1exp1, double *psi1exp2, double *l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi1denom, int N, int M, int Q",
+        "dpsi1_dvar[i] = comp_dpsi1_dvar_element(psi1_neq, psi1exp1, psi1exp2, l, Z, mu, S, logGamma, log1Gamma, logpsi1denom, N, M, Q, i)",
+        "comp_dpsi1_dvar",
+        preamble="""
+        #define IDX_NMQ(n,m,q) ((q*M+m)*N+n)
+        #define IDX_NQ(n,q) (q*N+n)
+        #define IDX_MQ(m,q) (q*M+m)
+        #define LOGEXPSUM(a,b) (a>=b?a+log(1.0+exp(b-a)):b+log(1.0+exp(a-b)))
+        
+        __device__ double comp_dpsi1_dvar_element(double *psi1_neq, double *psi1exp1, double *psi1exp2, double *l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi1denom, int N, int M, int Q, int idx)
+        {
+            int n = idx%N;
+            int m = idx/N;
+            
+            double psi1_sum = 0;
+            for(int q=0;q<Q;q++){            
+                double muZ = mu[IDX_NQ(n,q)]-Z[IDX_MQ(m,q)];
+                double exp1_e = -(muZ*muZ/(S[IDX_NQ(n,q)]+l[q]) )/2.0;
+                double exp1 = logGamma[IDX_NQ(n,q)] - (logpsi1denom[IDX_NQ(n,q)])/2.0 + exp1_e;
+                double exp2_e = - Z[IDX_MQ(m,q)]*Z[IDX_MQ(m,q)]/(l[q]*2.0);
+                double exp2 = log1Gamma[IDX_NQ(n,q)] + exp2_e;
+                double psi1_q = LOGEXPSUM(exp1,exp2);
+                psi1_neq[IDX_NMQ(n,m,q)] = -psi1_q;
+                psi1exp1[IDX_NMQ(n,m,q)] = exp(exp1_e);
+                psi1exp2[IDX_MQ(m,q)] = exp(exp2_e);
+                psi1_sum += psi1_q;
+            }
+            for(int q=0;q<Q;q++) {
+                psi1_neq[IDX_NMQ(n,m,q)] = exp(psi1_neq[IDX_NMQ(n,m,q)]+psi1_sum);
+            }
+            return exp(psi1_sum);
+        }
+        """)
+    
+    # The kernel form computing psi1 het_noise
+    comp_psi1_der = ElementwiseKernel(
+        "double *dpsi1_dl, double *dpsi1_dmu, double *dpsi1_dS, double *dpsi1_dgamma, double *dpsi1_dZ, double *psi1_neq, double *psi1exp1, double *psi1exp2, double var, double *l, double *Z, double *mu, double *S, double *gamma, int N, int M, int Q",
+        "dpsi1_dl[i] = comp_psi1_der_element(dpsi1_dmu, dpsi1_dS, dpsi1_dgamma, dpsi1_dZ, psi1_neq, psi1exp1, psi1exp2, var, l, Z, mu, S, gamma, N, M, Q, i)",
+        "comp_psi1_der",
+        preamble="""
+        #define IDX_NMQ(n,m,q) ((q*M+m)*N+n)
+        #define IDX_NQ(n,q) (q*N+n)
+        #define IDX_MQ(m,q) (q*M+m)
+        
+        __device__ double comp_psi1_der_element(double *dpsi1_dmu, double *dpsi1_dS, double *dpsi1_dgamma, double *dpsi1_dZ, double *psi1_neq, double *psi1exp1, double *psi1exp2, double var, double *l, double *Z, double *mu, double *S, double *gamma, int N, int M, int Q, int idx)
+        {
+            int q = idx/(M*N);
+            int m = (idx%(M*N))/N;
+            int n = idx%N;
+            
+            double neq = psi1_neq[IDX_NMQ(n,m,q)];
+            double gamma_c = gamma[IDX_NQ(n,q)];
+            double Z_c = Z[IDX_MQ(m,q)];
+            double S_c = S[IDX_NQ(n,q)];
+            double l_c = l[q];
+            double l_sqrt_c = sqrt(l[q]);
+            double psi1exp1_c = psi1exp1[IDX_NMQ(n,m,q)];
+            double psi1exp2_c = psi1exp2[IDX_MQ(m,q)];
+
+            double denom = S_c/l_c+1.0;
+            double denom_sqrt = sqrt(denom);
+            double Zmu = Z_c-mu[IDX_NQ(n,q)];
+            double psi1_common = gamma_c/(denom_sqrt*denom*l_c);
+            double gamma1 = 1-gamma_c;
+            
+            dpsi1_dgamma[IDX_NMQ(n,m,q)] = var*neq*(psi1exp1_c/denom_sqrt - psi1exp2_c);
+            dpsi1_dmu[IDX_NMQ(n,m,q)] = var*neq*(psi1_common*Zmu*psi1exp1_c);
+            dpsi1_dS[IDX_NMQ(n,m,q)] = var*neq*(psi1_common*(Zmu*Zmu/(S_c+l_c)-1.0)*psi1exp1_c)/2.0;
+            dpsi1_dZ[IDX_NMQ(n,m,q)] = var*neq*(-psi1_common*Zmu*psi1exp1_c-gamma1*Z_c/l_c*psi1exp2_c);
+            return var*neq*(psi1_common*(S_c/l_c+Zmu*Zmu/(S_c+l_c))*psi1exp1_c+gamma1*Z_c*Z_c/l_c*psi1exp2_c)*l_sqrt_c;    
+        }
+        """)
+    
+    # The kernel form computing psi1 het_noise
+    comp_dpsi2_dvar = ElementwiseKernel(
+        "double *dpsi2_dvar, double *psi2_neq, double *psi2exp1, double *psi2exp2, double var, double *l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi2denom, int N, int M, int Q",
+        "dpsi2_dvar[i] = comp_dpsi2_dvar_element(psi2_neq, psi2exp1, psi2exp2, var, l, Z, mu, S, logGamma, log1Gamma, logpsi2denom, N, M, Q, i)",
+        "comp_dpsi2_dvar",
+        preamble="""
+        #define IDX_NMMQ(n,m1,m2,q) (((q*M+m2)*M+m1)*N+n)
+        #define IDX_MMQ(m1,m2,q) ((q*M+m2)*M+m1)
+        #define IDX_NMQ(n,m,q) ((q*M+m)*N+n)
+        #define IDX_NQ(n,q) (q*N+n)
+        #define IDX_MQ(m,q) (q*M+m)
+        #define LOGEXPSUM(a,b) (a>=b?a+log(1.0+exp(b-a)):b+log(1.0+exp(a-b)))
+        
+        __device__ double comp_dpsi2_dvar_element(double *psi2_neq, double *psi2exp1, double *psi2exp2, double var, double *l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi2denom, int N, int M, int Q, int idx)
+        {
+            // psi2 (n,m1,m2)
+            int m2 = idx/(M*N);
+            int m1 = (idx%(M*N))/N;
+            int n = idx%N;
+
+            double psi2_sum=0;
+            for(int q=0;q<Q;q++){ 
+                double dZ = Z[IDX_MQ(m1,q)]-Z[IDX_MQ(m2,q)];
+                double muZ = mu[IDX_NQ(n,q)] - (Z[IDX_MQ(m1,q)]+Z[IDX_MQ(m2,q)])/2.0;
+                double exp1_e = - dZ*dZ/(l[q]*4.0) - muZ*muZ/(2*S[IDX_NQ(n,q)]+l[q]);
+                double exp1 = logGamma[IDX_NQ(n,q)] - (logpsi2denom[IDX_NQ(n,q)])/2.0 +exp1_e;
+                double exp2_e = - (Z[IDX_MQ(m1,q)]*Z[IDX_MQ(m1,q)]+Z[IDX_MQ(m2,q)]*Z[IDX_MQ(m2,q)])/(l[q]*2.0);
+                double exp2 = log1Gamma[IDX_NQ(n,q)] + exp2_e;
+                double psi2_q = LOGEXPSUM(exp1,exp2);
+                psi2_neq[IDX_NMMQ(n,m1,m2,q)] = -psi2_q;
+                psi2exp1[IDX_NMMQ(n,m1,m2,q)] = exp(exp1_e);
+                psi2exp2[IDX_MMQ(m1,m2,q)] = exp(exp2_e);
+                psi2_sum += psi2_q;
+            }
+            for(int q=0;q<Q;q++) {
+                psi2_neq[IDX_NMMQ(n,m1,m2,q)] = exp(psi2_neq[IDX_NMMQ(n,m1,m2,q)]+psi2_sum);
+            }
+            return 2*var*exp(psi2_sum);            
+        }
+        """)
+    
+    # The kernel form computing psi1 het_noise
+    comp_psi2_der = ElementwiseKernel(
+        "double *dpsi2_dl, double *dpsi2_dmu, double *dpsi2_dS, double *dpsi2_dgamma, double *dpsi2_dZ, double *psi2_neq, double *psi2exp1, double *psi2exp2, double var, double *l, double *Z, double *mu, double *S, double *gamma, int N, int M, int Q",
+        "dpsi2_dl[i] = comp_psi2_der_element(dpsi2_dmu, dpsi2_dS, dpsi2_dgamma, dpsi2_dZ, psi2_neq, psi2exp1, psi2exp2, var, l, Z, mu, S, gamma, N, M, Q, i)",
+        "comp_psi2_der",
+        preamble="""
+        #define IDX_NMMQ(n,m1,m2,q) (((q*M+m2)*M+m1)*N+n)
+        #define IDX_MMQ(m1,m2,q) ((q*M+m2)*M+m1)
+        #define IDX_NMQ(n,m,q) ((q*M+m)*N+n)
+        #define IDX_NQ(n,q) (q*N+n)
+        #define IDX_MQ(m,q) (q*M+m)
+        
+        __device__ double comp_psi2_der_element(double *dpsi2_dmu, double *dpsi2_dS, double *dpsi2_dgamma, double *dpsi2_dZ, double *psi2_neq, double *psi2exp1, double *psi2exp2, double var, double *l, double *Z, double *mu, double *S, double *gamma, int N, int M, int Q, int idx)
+        {
+            // dpsi2 (n,m1,m2,q)
+            int q = idx/(M*M*N);
+            int m2 = (idx%(M*M*N))/(M*N);
+            int m1 = (idx%(M*N))/N;
+            int n = idx%N;
+            
+            double neq = psi2_neq[IDX_NMMQ(n,m1,m2,q)];
+            double gamma_c = gamma[IDX_NQ(n,q)];
+            double Z1_c = Z[IDX_MQ(m1,q)];
+            double Z2_c = Z[IDX_MQ(m2,q)];
+            double S_c = S[IDX_NQ(n,q)];
+            double l_c = l[q];
+            double l_sqrt_c = sqrt(l[q]);
+            double psi2exp1_c = psi2exp1[IDX_NMMQ(n,m1,m2,q)];
+            double psi2exp2_c = psi2exp2[IDX_MMQ(m1,m2,q)];
+
+            double dZ = Z2_c - Z1_c;
+            double muZ = mu[IDX_NQ(n,q)] - (Z1_c+Z2_c)/2.0;
+            double Z2 = Z1_c*Z1_c+Z2_c*Z2_c;
+            double denom = 2.0*S_c/l_c+1.0;
+            double denom_sqrt = sqrt(denom);
+            double psi2_common = gamma_c/(denom_sqrt*denom*l_c);
+            double gamma1 = 1-gamma_c;
+            double var2 = var*var;
+            
+            dpsi2_dgamma[IDX_NMMQ(n,m1,m2,q)] = var2*neq*(psi2exp1_c/denom_sqrt - psi2exp2_c);
+            dpsi2_dmu[IDX_NMMQ(n,m1,m2,q)] = var2*neq*(-2.0*psi2_common*muZ*psi2exp1_c);
+            dpsi2_dS[IDX_NMMQ(n,m1,m2,q)] = var2*neq*(psi2_common*(2.0*muZ*muZ/(2.0*S_c+l_c)-1.0)*psi2exp1_c);
+            dpsi2_dZ[IDX_NMMQ(n,m1,m2,q)] = var2*neq*(psi2_common*(dZ*denom/-2.0+muZ)*psi2exp1_c-gamma1*Z2_c/l_c*psi2exp2_c)*2.0;
+            return var2*neq*(psi2_common*(S_c/l_c+dZ*dZ*denom/(4.0*l_c)+muZ*muZ/(2.0*S_c+l_c))*psi2exp1_c+gamma1*Z2/(2.0*l_c)*psi2exp2_c)*l_sqrt_c*2.0;    
+        }
+        """)
+        
+except:
+    pass
+
+class PSICOMP_SSRBF(object):
+    def __init__(self):
+        assert gpu_init.initSuccess, "GPU initialization failed!"
+        self.cublas_handle = gpu_init.cublas_handle
+        self.gpuCache = None
+        self.gpuCacheAll = None
+    
+    def _initGPUCache(self, N, M, Q):
+        if self.gpuCache!=None and self.gpuCache['mu_gpu'].shape[0] == N:
+            return
+        
+        if self.gpuCacheAll!=None and self.gpuCacheAll['mu_gpu'].shape[0]<N: # Too small cache -> reallocate
+            self._releaseMemory()
+            
+        if self.gpuCacheAll == None:
+            self.gpuCacheAll = {
+                             'l_gpu'                :gpuarray.empty((Q,),np.float64,order='F'),
+                             'Z_gpu'                :gpuarray.empty((M,Q),np.float64,order='F'),
+                             'mu_gpu'               :gpuarray.empty((N,Q),np.float64,order='F'),
+                             'S_gpu'                :gpuarray.empty((N,Q),np.float64,order='F'),
+                             'gamma_gpu'            :gpuarray.empty((N,Q),np.float64,order='F'),
+                             'logGamma_gpu'         :gpuarray.empty((N,Q),np.float64,order='F'),
+                             'log1Gamma_gpu'        :gpuarray.empty((N,Q),np.float64,order='F'),
+                             'logpsi1denom_gpu'      :gpuarray.empty((N,Q),np.float64,order='F'),
+                             'logpsi2denom_gpu'      :gpuarray.empty((N,Q),np.float64,order='F'),
+                             'psi0_gpu'             :gpuarray.empty((N,),np.float64,order='F'),
+                             'psi1_gpu'             :gpuarray.empty((N,M),np.float64,order='F'),
+                             'psi2_gpu'             :gpuarray.empty((N,M,M),np.float64,order='F'),
+                             # derivatives psi1
+                             'psi1_neq_gpu'         :gpuarray.empty((N,M,Q),np.float64, order='F'),
+                             'psi1exp1_gpu'         :gpuarray.empty((N,M,Q),np.float64, order='F'),
+                             'psi1exp2_gpu'         :gpuarray.empty((N,M,Q),np.float64, order='F'),
+                             'dpsi1_dvar_gpu'       :gpuarray.empty((N,M),np.float64, order='F'),
+                             'dpsi1_dl_gpu'         :gpuarray.empty((N,M,Q),np.float64, order='F'),
+                             'dpsi1_dZ_gpu'         :gpuarray.empty((N,M,Q),np.float64, order='F'),
+                             'dpsi1_dgamma_gpu'     :gpuarray.empty((N,M,Q),np.float64, order='F'),
+                             'dpsi1_dmu_gpu'        :gpuarray.empty((N,M,Q),np.float64, order='F'),
+                             'dpsi1_dS_gpu'         :gpuarray.empty((N,M,Q),np.float64, order='F'),
+                             # derivatives psi2
+                             'psi2_neq_gpu'         :gpuarray.empty((N,M,M,Q),np.float64, order='F'),
+                             'psi2exp1_gpu'         :gpuarray.empty((N,M,M,Q),np.float64, order='F'),
+                             'psi2exp2_gpu'         :gpuarray.empty((M,M,Q),np.float64, order='F'),
+                             'dpsi2_dvar_gpu'       :gpuarray.empty((N,M,M),np.float64, order='F'),
+                             'dpsi2_dl_gpu'         :gpuarray.empty((N,M,M,Q),np.float64, order='F'),
+                             'dpsi2_dZ_gpu'         :gpuarray.empty((N,M,M,Q),np.float64, order='F'),
+                             'dpsi2_dgamma_gpu'     :gpuarray.empty((N,M,M,Q),np.float64, order='F'),
+                             'dpsi2_dmu_gpu'        :gpuarray.empty((N,M,M,Q),np.float64, order='F'),
+                             'dpsi2_dS_gpu'         :gpuarray.empty((N,M,M,Q),np.float64, order='F'),
+                             # gradients
+                             'grad_l_gpu'           :gpuarray.empty((Q,),np.float64,order='F'),
+                             'grad_Z_gpu'           :gpuarray.empty((M,Q),np.float64,order='F'),
+                             'grad_mu_gpu'          :gpuarray.empty((N,Q),np.float64,order='F'),
+                             'grad_S_gpu'           :gpuarray.empty((N,Q),np.float64,order='F'),
+                             'grad_gamma_gpu'       :gpuarray.empty((N,Q),np.float64,order='F'),
+                             }
+            self.gpuCache = self.gpuCacheAll
+        elif self.gpuCacheAll['mu_gpu'].shape[0]==N:
+            self.gpuCache = self.gpuCacheAll
+        else:
+            # remap to a smaller cache
+            self.gpuCache = self.gpuCacheAll.copy()
+            Nlist=['mu_gpu','S_gpu','gamma_gpu','logGamma_gpu','log1Gamma_gpu','logpsi1denom_gpu','logpsi2denom_gpu','psi0_gpu','psi1_gpu','psi2_gpu',
+                   'psi1_neq_gpu','psi1exp1_gpu','psi1exp2_gpu','dpsi1_dvar_gpu','dpsi1_dl_gpu','dpsi1_dZ_gpu','dpsi1_dgamma_gpu','dpsi1_dmu_gpu',
+                   'dpsi1_dS_gpu','psi2_neq_gpu','psi2exp1_gpu','dpsi2_dvar_gpu','dpsi2_dl_gpu','dpsi2_dZ_gpu','dpsi2_dgamma_gpu','dpsi2_dmu_gpu','dpsi2_dS_gpu','grad_mu_gpu','grad_S_gpu','grad_gamma_gpu',]
+            oldN = self.gpuCacheAll['mu_gpu'].shape[0]
+            for v in Nlist:
+                u = self.gpuCacheAll[v]
+                self.gpuCache[v] = u.ravel()[:u.size/oldN*N].reshape(*((N,)+u.shape[1:]))
+    
+    def _releaseMemory(self):
+        if self.gpuCacheAll!=None:
+            [v.gpudata.free() for v in self.gpuCacheAll.values()]
+            self.gpuCacheAll = None
+            self.gpuCache = None
+    
+    def estimateMemoryOccupation(self, N, M, Q):
+        """
+        Estimate the best batch size.
+        N - the number of total datapoints
+        M - the number of inducing points
+        Q - the number of hidden (input) dimensions
+        return: the constant memory size, the memory occupation of batchsize=1
+        unit: GB
+        """
+        return (2.*Q+2.*M*Q+M*M*Q)*8./1024./1024./1024., (1.+2.*M+10.*Q+2.*M*M+8.*M*Q+7.*M*M*Q)*8./1024./1024./1024.
+
+    @Cache_this(limit=1,ignore_args=(0,))
+    def psicomputations(self, variance, lengthscale, Z, mu, S, gamma):
+        """Compute Psi statitsitcs"""
+        if isinstance(lengthscale, np.ndarray) and len(lengthscale)>1:
+            ARD = True
+        else:
+            ARD = False
+        
+        N = mu.shape[0]
+        M = Z.shape[0]
+        Q = mu.shape[1]
+        
+        self._initGPUCache(N,M,Q)
+        l_gpu = self.gpuCache['l_gpu']
+        Z_gpu = self.gpuCache['Z_gpu']
+        mu_gpu = self.gpuCache['mu_gpu']
+        S_gpu = self.gpuCache['S_gpu']
+        gamma_gpu = self.gpuCache['gamma_gpu']
+        logGamma_gpu = self.gpuCache['logGamma_gpu']
+        log1Gamma_gpu = self.gpuCache['log1Gamma_gpu']
+        logpsi1denom_gpu = self.gpuCache['logpsi1denom_gpu']
+        logpsi2denom_gpu = self.gpuCache['logpsi2denom_gpu']
+        psi0_gpu = self.gpuCache['psi0_gpu']
+        psi1_gpu = self.gpuCache['psi1_gpu']
+        psi2_gpu = self.gpuCache['psi2_gpu']
+
+        if ARD:
+            l_gpu.set(np.asfortranarray(lengthscale**2))
+        else:
+            l_gpu.fill(lengthscale*lengthscale)
+        Z_gpu.set(np.asfortranarray(Z))
+        mu_gpu.set(np.asfortranarray(mu))
+        S_gpu.set(np.asfortranarray(S))
+        gamma_gpu.set(np.asfortranarray(gamma))
+        linalg_gpu.log(gamma_gpu,logGamma_gpu)
+        linalg_gpu.logOne(gamma_gpu,log1Gamma_gpu)
+        comp_logpsidenom(logpsi1denom_gpu, S_gpu,l_gpu,1.0,N)
+        comp_logpsidenom(logpsi2denom_gpu, S_gpu,l_gpu,2.0,N)
+        
+        psi0_gpu.fill(variance)
+        comp_psi1(psi1_gpu, variance, l_gpu, Z_gpu, mu_gpu, S_gpu, logGamma_gpu, log1Gamma_gpu, logpsi1denom_gpu, N, M, Q)
+        comp_psi2(psi2_gpu, variance, l_gpu, Z_gpu, mu_gpu, S_gpu, logGamma_gpu, log1Gamma_gpu, logpsi2denom_gpu, N, M, Q)
+        
+        return psi0_gpu, psi1_gpu, psi2_gpu
+    
+    @Cache_this(limit=1,ignore_args=(0,))
+    def _psiDercomputations(self, variance, lengthscale, Z, mu, S, gamma):
+        """Compute the derivatives w.r.t. Psi statistics"""        
+        N, M, Q = mu.shape[0],Z.shape[0], mu.shape[1]
+        
+        self._initGPUCache(N,M,Q)
+        l_gpu = self.gpuCache['l_gpu']
+        Z_gpu = self.gpuCache['Z_gpu']
+        mu_gpu = self.gpuCache['mu_gpu']
+        S_gpu = self.gpuCache['S_gpu']
+        gamma_gpu = self.gpuCache['gamma_gpu']
+        logGamma_gpu = self.gpuCache['logGamma_gpu']
+        log1Gamma_gpu = self.gpuCache['log1Gamma_gpu']
+        logpsi1denom_gpu = self.gpuCache['logpsi1denom_gpu']
+        logpsi2denom_gpu = self.gpuCache['logpsi2denom_gpu']
+
+        psi1_neq_gpu = self.gpuCache['psi1_neq_gpu']
+        psi1exp1_gpu = self.gpuCache['psi1exp1_gpu']
+        psi1exp2_gpu = self.gpuCache['psi1exp2_gpu']
+        dpsi1_dvar_gpu = self.gpuCache['dpsi1_dvar_gpu']
+        dpsi1_dl_gpu = self.gpuCache['dpsi1_dl_gpu']
+        dpsi1_dZ_gpu = self.gpuCache['dpsi1_dZ_gpu']
+        dpsi1_dgamma_gpu = self.gpuCache['dpsi1_dgamma_gpu']
+        dpsi1_dmu_gpu = self.gpuCache['dpsi1_dmu_gpu']
+        dpsi1_dS_gpu = self.gpuCache['dpsi1_dS_gpu']
+
+        psi2_neq_gpu = self.gpuCache['psi2_neq_gpu']
+        psi2exp1_gpu = self.gpuCache['psi2exp1_gpu']
+        psi2exp2_gpu = self.gpuCache['psi2exp2_gpu']
+        dpsi2_dvar_gpu = self.gpuCache['dpsi2_dvar_gpu']
+        dpsi2_dl_gpu = self.gpuCache['dpsi2_dl_gpu']
+        dpsi2_dZ_gpu = self.gpuCache['dpsi2_dZ_gpu']
+        dpsi2_dgamma_gpu = self.gpuCache['dpsi2_dgamma_gpu']
+        dpsi2_dmu_gpu = self.gpuCache['dpsi2_dmu_gpu']
+        dpsi2_dS_gpu = self.gpuCache['dpsi2_dS_gpu']
+
+        #==========================================================================================================
+        # Assuming the l_gpu, Z_gpu, mu_gpu, S_gpu, gamma_gpu, logGamma_gpu, log1Gamma_gpu, 
+        # logpsi1denom_gpu, logpsi2denom_gpu has been synchonized.
+        #==========================================================================================================
+        
+        # psi1 derivatives
+        comp_dpsi1_dvar(dpsi1_dvar_gpu, psi1_neq_gpu, psi1exp1_gpu,psi1exp2_gpu, l_gpu, Z_gpu, mu_gpu, S_gpu, logGamma_gpu, log1Gamma_gpu, logpsi1denom_gpu, N, M, Q)
+        comp_psi1_der(dpsi1_dl_gpu,dpsi1_dmu_gpu,dpsi1_dS_gpu,dpsi1_dgamma_gpu, dpsi1_dZ_gpu, psi1_neq_gpu,psi1exp1_gpu,psi1exp2_gpu, variance, l_gpu, Z_gpu, mu_gpu, S_gpu, gamma_gpu, N, M, Q)
+
+        # psi2 derivatives
+        comp_dpsi2_dvar(dpsi2_dvar_gpu, psi2_neq_gpu, psi2exp1_gpu,psi2exp2_gpu, variance, l_gpu, Z_gpu, mu_gpu, S_gpu, logGamma_gpu, log1Gamma_gpu, logpsi2denom_gpu, N, M, Q)
+        comp_psi2_der(dpsi2_dl_gpu,dpsi2_dmu_gpu,dpsi2_dS_gpu,dpsi2_dgamma_gpu, dpsi2_dZ_gpu, psi2_neq_gpu,psi2exp1_gpu,psi2exp2_gpu, variance, l_gpu, Z_gpu, mu_gpu, S_gpu, gamma_gpu, N, M, Q)
+
+    def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior):
+        mu = variational_posterior.mean
+        S = variational_posterior.variance
+        gamma = variational_posterior.binary_prob
+        self._psiDercomputations(variance, lengthscale, Z, mu, S, gamma)
+        N, M, Q = mu.shape[0],Z.shape[0], mu.shape[1]
+        
+        if isinstance(lengthscale, np.ndarray) and len(lengthscale)>1:
+            ARD = True
+        else:
+            ARD = False
+            
+        dpsi1_dvar_gpu = self.gpuCache['dpsi1_dvar_gpu']
+        dpsi2_dvar_gpu = self.gpuCache['dpsi2_dvar_gpu']
+        dpsi1_dl_gpu = self.gpuCache['dpsi1_dl_gpu']
+        dpsi2_dl_gpu = self.gpuCache['dpsi2_dl_gpu']
+        psi1_comb_gpu = self.gpuCache['psi1_neq_gpu']
+        psi2_comb_gpu = self.gpuCache['psi2_neq_gpu']
+        grad_l_gpu = self.gpuCache['grad_l_gpu']
+        
+        # variance
+        variance.gradient = gpuarray.sum(dL_dpsi0).get() \
+                            + cublas.cublasDdot(self.cublas_handle, dL_dpsi1.size, dL_dpsi1.gpudata, 1, dpsi1_dvar_gpu.gpudata, 1) \
+                            + cublas.cublasDdot(self.cublas_handle, dL_dpsi2.size, dL_dpsi2.gpudata, 1, dpsi2_dvar_gpu.gpudata, 1)
+
+        # lengscale
+        if ARD:
+            grad_l_gpu.fill(0.)
+            linalg_gpu.mul_bcast(psi1_comb_gpu, dL_dpsi1, dpsi1_dl_gpu, dL_dpsi1.size)
+            linalg_gpu.sum_axis(grad_l_gpu, psi1_comb_gpu, 1, N*M)
+            linalg_gpu.mul_bcast(psi2_comb_gpu, dL_dpsi2, dpsi2_dl_gpu, dL_dpsi2.size)
+            linalg_gpu.sum_axis(grad_l_gpu, psi2_comb_gpu, 1, N*M*M)            
+            lengthscale.gradient = grad_l_gpu.get()
+        else:
+            linalg_gpu.mul_bcast(psi1_comb_gpu, dL_dpsi1, dpsi1_dl_gpu, dL_dpsi1.size)
+            linalg_gpu.mul_bcast(psi2_comb_gpu, dL_dpsi2, dpsi2_dl_gpu, dL_dpsi2.size)
+            lengthscale.gradient = gpuarray.sum(psi1_comb_gpu).get() + gpuarray.sum(psi2_comb_gpu).get()
+                
+    def gradients_Z_expectations(self, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior):
+        mu = variational_posterior.mean
+        S = variational_posterior.variance
+        gamma = variational_posterior.binary_prob
+        self._psiDercomputations(variance, lengthscale, Z, mu, S, gamma)
+        N, M, Q = mu.shape[0],Z.shape[0], mu.shape[1]
+
+        dpsi1_dZ_gpu = self.gpuCache['dpsi1_dZ_gpu']
+        dpsi2_dZ_gpu = self.gpuCache['dpsi2_dZ_gpu']
+        psi1_comb_gpu = self.gpuCache['psi1_neq_gpu']
+        psi2_comb_gpu = self.gpuCache['psi2_neq_gpu']
+        grad_Z_gpu = self.gpuCache['grad_Z_gpu']
+
+        grad_Z_gpu.fill(0.)
+        linalg_gpu.mul_bcast(psi1_comb_gpu, dL_dpsi1, dpsi1_dZ_gpu, dL_dpsi1.size)
+        linalg_gpu.sum_axis(grad_Z_gpu, psi1_comb_gpu, 1, N)
+        linalg_gpu.mul_bcast(psi2_comb_gpu, dL_dpsi2, dpsi2_dZ_gpu, dL_dpsi2.size)
+        linalg_gpu.sum_axis(grad_Z_gpu, psi2_comb_gpu, 1, N*M)
+        return grad_Z_gpu.get()
+        
+    def gradients_qX_expectations(self, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior):
+        mu = variational_posterior.mean
+        S = variational_posterior.variance
+        gamma = variational_posterior.binary_prob
+        self._psiDercomputations(variance, lengthscale, Z, mu, S, gamma)
+        N, M, Q = mu.shape[0],Z.shape[0], mu.shape[1]
+
+        dpsi1_dmu_gpu = self.gpuCache['dpsi1_dmu_gpu']
+        dpsi2_dmu_gpu = self.gpuCache['dpsi2_dmu_gpu']
+        dpsi1_dS_gpu = self.gpuCache['dpsi1_dS_gpu']
+        dpsi2_dS_gpu = self.gpuCache['dpsi2_dS_gpu']
+        dpsi1_dgamma_gpu = self.gpuCache['dpsi1_dgamma_gpu']
+        dpsi2_dgamma_gpu = self.gpuCache['dpsi2_dgamma_gpu']
+        psi1_comb_gpu = self.gpuCache['psi1_neq_gpu']
+        psi2_comb_gpu = self.gpuCache['psi2_neq_gpu']
+        grad_mu_gpu = self.gpuCache['grad_mu_gpu']
+        grad_S_gpu = self.gpuCache['grad_S_gpu']
+        grad_gamma_gpu = self.gpuCache['grad_gamma_gpu']
+        
+        # mu gradients
+        grad_mu_gpu.fill(0.)
+        linalg_gpu.mul_bcast(psi1_comb_gpu, dL_dpsi1, dpsi1_dmu_gpu, dL_dpsi1.size)
+        linalg_gpu.sum_axis(grad_mu_gpu, psi1_comb_gpu, N, M)
+        linalg_gpu.mul_bcast(psi2_comb_gpu, dL_dpsi2, dpsi2_dmu_gpu, dL_dpsi2.size)
+        linalg_gpu.sum_axis(grad_mu_gpu, psi2_comb_gpu, N, M*M)
+
+        # S gradients
+        grad_S_gpu.fill(0.)
+        linalg_gpu.mul_bcast(psi1_comb_gpu, dL_dpsi1, dpsi1_dS_gpu, dL_dpsi1.size)
+        linalg_gpu.sum_axis(grad_S_gpu, psi1_comb_gpu, N, M)
+        linalg_gpu.mul_bcast(psi2_comb_gpu, dL_dpsi2, dpsi2_dS_gpu, dL_dpsi2.size)
+        linalg_gpu.sum_axis(grad_S_gpu, psi2_comb_gpu, N, M*M)
+
+        # gamma gradients
+        grad_gamma_gpu.fill(0.)
+        linalg_gpu.mul_bcast(psi1_comb_gpu, dL_dpsi1, dpsi1_dgamma_gpu, dL_dpsi1.size)
+        linalg_gpu.sum_axis(grad_gamma_gpu, psi1_comb_gpu, N, M)
+        linalg_gpu.mul_bcast(psi2_comb_gpu, dL_dpsi2, dpsi2_dgamma_gpu, dL_dpsi2.size)
+        linalg_gpu.sum_axis(grad_gamma_gpu, psi2_comb_gpu, N, M*M)
+        
+        return grad_mu_gpu.get(), grad_S_gpu.get(), grad_gamma_gpu.get()
diff --git a/GPy/kern/_src/rbf.py b/GPy/kern/_src/rbf.py
index c5914d58..e0071fb9 100644
--- a/GPy/kern/_src/rbf.py
+++ b/GPy/kern/_src/rbf.py
@@ -9,6 +9,7 @@ from stationary import Stationary
 from GPy.util.caching import Cache_this
 from ...core.parameterization import variational
 from psi_comp import ssrbf_psi_comp
+from psi_comp.ssrbf_psi_gpucomp import PSICOMP_SSRBF
 
 class RBF(Stationary):
     """
@@ -19,9 +20,15 @@ class RBF(Stationary):
        k(r) = \sigma^2 \exp \\bigg(- \\frac{1}{2} r^2 \\bigg)
 
     """
-    def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False, active_dims=None, name='rbf'):
-        super(RBF, self).__init__(input_dim, variance, lengthscale, ARD, active_dims, name)
+    _support_GPU = True
+    def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False, active_dims=None, name='rbf', useGPU=False):
+        super(RBF, self).__init__(input_dim, variance, lengthscale, ARD, active_dims, name, useGPU=useGPU)
         self.weave_options = {}
+        self.group_spike_prob = False
+        
+        if self.useGPU:
+            self.psicomp = PSICOMP_SSRBF()
+            
 
     def K_of_r(self, r):
         return self.variance * np.exp(-0.5 * r**2)
@@ -34,18 +41,28 @@ class RBF(Stationary):
     #---------------------------------------#
 
     def psi0(self, Z, variational_posterior):
-        return self.Kdiag(variational_posterior.mean)
+        if self.useGPU:
+            if isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
+                return self.psicomp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)[0]
+        else:
+            return self.Kdiag(variational_posterior.mean)
 
     def psi1(self, Z, variational_posterior):
         if isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
-            psi1, _, _, _, _, _, _ = ssrbf_psi_comp._psi1computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
+            if self.useGPU:
+                return self.psicomp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)[1]
+            else:
+                psi1, _, _, _, _, _, _ = ssrbf_psi_comp._psi1computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
         else:
             _, _, _, psi1 = self._psi1computations(Z, variational_posterior)
         return psi1
 
     def psi2(self, Z, variational_posterior):
         if isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
-            psi2, _, _, _, _, _, _ = ssrbf_psi_comp._psi2computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
+            if self.useGPU:
+                return self.psicomp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)[2]
+            else:
+                psi2, _, _, _, _, _, _ = ssrbf_psi_comp._psi2computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
         else:
             _, _, _, _, psi2 = self._psi2computations(Z, variational_posterior)
         return psi2
@@ -53,26 +70,30 @@ class RBF(Stationary):
     def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
         # Spike-and-Slab GPLVM
         if isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
-            _, _dpsi1_dvariance, _, _, _, _, _dpsi1_dlengthscale = ssrbf_psi_comp._psi1computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
-            _, _dpsi2_dvariance, _, _, _, _, _dpsi2_dlengthscale = ssrbf_psi_comp._psi2computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
-
-            #contributions from psi0:
-            self.variance.gradient = np.sum(dL_dpsi0)
-
-            #from psi1
-            self.variance.gradient += np.sum(dL_dpsi1 * _dpsi1_dvariance)
-            if self.ARD:
-                self.lengthscale.gradient = (dL_dpsi1[:,:,None]*_dpsi1_dlengthscale).reshape(-1,self.input_dim).sum(axis=0)
+            if self.useGPU:
+                self.psicomp.update_gradients_expectations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior)
             else:
-                self.lengthscale.gradient = (dL_dpsi1[:,:,None]*_dpsi1_dlengthscale).sum()
-
-            #from psi2
-            self.variance.gradient += (dL_dpsi2 * _dpsi2_dvariance).sum()
-            if self.ARD:
-                self.lengthscale.gradient += (dL_dpsi2[:,:,:,None] * _dpsi2_dlengthscale).reshape(-1,self.input_dim).sum(axis=0)
-            else:
-                self.lengthscale.gradient += (dL_dpsi2[:,:,:,None] * _dpsi2_dlengthscale).sum()
-
+                
+                _, _dpsi1_dvariance, _, _, _, _, _dpsi1_dlengthscale = ssrbf_psi_comp._psi1computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
+                _, _dpsi2_dvariance, _, _, _, _, _dpsi2_dlengthscale = ssrbf_psi_comp._psi2computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
+    
+                #contributions from psi0:
+                self.variance.gradient = np.sum(dL_dpsi0)
+    
+                #from psi1
+                self.variance.gradient += np.sum(dL_dpsi1 * _dpsi1_dvariance)
+                if self.ARD:
+                    self.lengthscale.gradient = (dL_dpsi1[:,:,None]*_dpsi1_dlengthscale).reshape(-1,self.input_dim).sum(axis=0)
+                else:
+                    self.lengthscale.gradient = (dL_dpsi1[:,:,None]*_dpsi1_dlengthscale).sum()  
+    
+                #from psi2
+                self.variance.gradient += (dL_dpsi2 * _dpsi2_dvariance).sum()
+                if self.ARD:
+                    self.lengthscale.gradient += (dL_dpsi2[:,:,:,None] * _dpsi2_dlengthscale).reshape(-1,self.input_dim).sum(axis=0)
+                else:
+                    self.lengthscale.gradient += (dL_dpsi2[:,:,:,None] * _dpsi2_dlengthscale).sum()
+                    
         elif isinstance(variational_posterior, variational.NormalPosterior):
             l2 = self.lengthscale**2
             if l2.size != self.input_dim:
@@ -107,16 +128,19 @@ class RBF(Stationary):
     def gradients_Z_expectations(self, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
         # Spike-and-Slab GPLVM
         if isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
-            _, _, _, _, _, _dpsi1_dZ, _ = ssrbf_psi_comp._psi1computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
-            _, _, _, _, _, _dpsi2_dZ, _ = ssrbf_psi_comp._psi2computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
-
-            #psi1
-            grad = (dL_dpsi1[:, :, None] * _dpsi1_dZ).sum(axis=0)
-
-            #psi2
-            grad += (dL_dpsi2[:, :, :, None] * _dpsi2_dZ).sum(axis=0).sum(axis=1)
-
-            return grad
+            if self.useGPU:
+                return self.psicomp.gradients_Z_expectations(dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior)
+            else:
+                _, _, _, _, _, _dpsi1_dZ, _ = ssrbf_psi_comp._psi1computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
+                _, _, _, _, _, _dpsi2_dZ, _ = ssrbf_psi_comp._psi2computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
+    
+                #psi1
+                grad = (dL_dpsi1[:, :, None] * _dpsi1_dZ).sum(axis=0)
+    
+                #psi2
+                grad += (dL_dpsi2[:, :, :, None] * _dpsi2_dZ).sum(axis=0).sum(axis=1)
+    
+                return grad
 
         elif isinstance(variational_posterior, variational.NormalPosterior):
             l2 = self.lengthscale **2
@@ -140,22 +164,28 @@ class RBF(Stationary):
     def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
         # Spike-and-Slab GPLVM
         if isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
-            ndata = variational_posterior.mean.shape[0]
-
-            _, _, _dpsi1_dgamma, _dpsi1_dmu, _dpsi1_dS, _, _ = ssrbf_psi_comp._psi1computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
-            _, _, _dpsi2_dgamma, _dpsi2_dmu, _dpsi2_dS, _, _ = ssrbf_psi_comp._psi2computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
-
-            #psi1
-            grad_mu = (dL_dpsi1[:, :, None] * _dpsi1_dmu).sum(axis=1)
-            grad_S = (dL_dpsi1[:, :, None] * _dpsi1_dS).sum(axis=1)
-            grad_gamma = (dL_dpsi1[:,:,None] * _dpsi1_dgamma).sum(axis=1)
-
-            #psi2
-            grad_mu += (dL_dpsi2[:, :, :, None] * _dpsi2_dmu).reshape(ndata,-1,self.input_dim).sum(axis=1)
-            grad_S += (dL_dpsi2[:, :, :, None] * _dpsi2_dS).reshape(ndata,-1,self.input_dim).sum(axis=1)
-            grad_gamma += (dL_dpsi2[:,:,:, None] * _dpsi2_dgamma).reshape(ndata,-1,self.input_dim).sum(axis=1)
-
-            return grad_mu, grad_S, grad_gamma
+            if self.useGPU:
+                return self.psicomp.gradients_qX_expectations(dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior)
+            else:   
+                ndata = variational_posterior.mean.shape[0]
+    
+                _, _, _dpsi1_dgamma, _dpsi1_dmu, _dpsi1_dS, _, _ = ssrbf_psi_comp._psi1computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
+                _, _, _dpsi2_dgamma, _dpsi2_dmu, _dpsi2_dS, _, _ = ssrbf_psi_comp._psi2computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
+    
+                #psi1
+                grad_mu = (dL_dpsi1[:, :, None] * _dpsi1_dmu).sum(axis=1)
+                grad_S = (dL_dpsi1[:, :, None] * _dpsi1_dS).sum(axis=1)
+                grad_gamma = (dL_dpsi1[:,:,None] * _dpsi1_dgamma).sum(axis=1)
+    
+                #psi2
+                grad_mu += (dL_dpsi2[:, :, :, None] * _dpsi2_dmu).reshape(ndata,-1,self.input_dim).sum(axis=1)
+                grad_S += (dL_dpsi2[:, :, :, None] * _dpsi2_dS).reshape(ndata,-1,self.input_dim).sum(axis=1)
+                grad_gamma += (dL_dpsi2[:,:,:, None] * _dpsi2_dgamma).reshape(ndata,-1,self.input_dim).sum(axis=1)
+                
+                if self.group_spike_prob:
+                    grad_gamma[:] = grad_gamma.mean(axis=0)
+    
+                return grad_mu, grad_S, grad_gamma
 
         elif isinstance(variational_posterior, variational.NormalPosterior):
 
diff --git a/GPy/kern/_src/stationary.py b/GPy/kern/_src/stationary.py
index b6fea5ef..37acbf2d 100644
--- a/GPy/kern/_src/stationary.py
+++ b/GPy/kern/_src/stationary.py
@@ -41,8 +41,8 @@ class Stationary(Kern):
 
     """
 
-    def __init__(self, input_dim, variance, lengthscale, ARD, active_dims, name):
-        super(Stationary, self).__init__(input_dim, active_dims, name)
+    def __init__(self, input_dim, variance, lengthscale, ARD, active_dims, name, useGPU=False):
+        super(Stationary, self).__init__(input_dim, active_dims, name,useGPU=useGPU)
         self.ARD = ARD
         if not ARD:
             if lengthscale is None:
diff --git a/GPy/likelihoods/__init__.py b/GPy/likelihoods/__init__.py
index 5d5d692a..d7ad5753 100644
--- a/GPy/likelihoods/__init__.py
+++ b/GPy/likelihoods/__init__.py
@@ -15,8 +15,8 @@ except ImportError:
 if sympy_available:
     # These are likelihoods that rely on symbolic.
     from symbolic import Symbolic
-    from sstudent_t import SstudentT
+    #from sstudent_t import SstudentT
     from negative_binomial import Negative_binomial
-    from skew_normal import Skew_normal
-    from skew_exponential import Skew_exponential
-    from null_category import Null_category
+    #from skew_normal import Skew_normal
+    #from skew_exponential import Skew_exponential
+    #from null_category import Null_category
diff --git a/GPy/models/bayesian_gplvm.py b/GPy/models/bayesian_gplvm.py
index d623c8f1..03cd361c 100644
--- a/GPy/models/bayesian_gplvm.py
+++ b/GPy/models/bayesian_gplvm.py
@@ -7,7 +7,9 @@ from ..core import SparseGP
 from ..likelihoods import Gaussian
 from ..inference.optimization import SCG
 from ..util import linalg
-from ..core.parameterization.variational import NormalPosterior, NormalPrior,VariationalPosterior
+from ..core.parameterization.variational import NormalPosterior, NormalPrior, VariationalPosterior
+from ..inference.latent_function_inference.var_dtc_parallel import update_gradients
+from ..inference.latent_function_inference.var_dtc_gpu import VarDTC_GPU
 
 class BayesianGPLVM(SparseGP):
     """
@@ -65,6 +67,10 @@ class BayesianGPLVM(SparseGP):
         X.mean.gradient, X.variance.gradient = X_grad
 
     def parameters_changed(self):
+        if isinstance(self.inference_method, VarDTC_GPU):
+            update_gradients(self)
+            return
+    
         super(BayesianGPLVM, self).parameters_changed()
         self._log_marginal_likelihood -= self.variational_prior.KL_divergence(self.X)
 
@@ -153,57 +159,7 @@ class BayesianGPLVM(SparseGP):
         from ..plotting.matplot_dep import dim_reduction_plots
 
         return dim_reduction_plots.plot_steepest_gradient_map(self,*args,**kwargs)
-
-
-def update_gradients(model):
-    model._log_marginal_likelihood, dL_dKmm, model.posterior = model.inference_method.inference_likelihood(model.kern, model.X, model.Z, model.likelihood, model.Y)
     
-    het_noise = model.likelihood.variance.size > 1
-    
-    if het_noise:
-        dL_dthetaL = np.empty((model.Y.shape[0],))
-    else:
-        dL_dthetaL = 0
-
-    #gradients w.r.t. kernel
-    model.kern.update_gradients_full(dL_dKmm, model.Z, None)
-    kern_grad = model.kern.gradient.copy()
-            
-    #gradients w.r.t. Z
-    model.Z.gradient[:,model.kern.active_dims] = model.kern.gradients_X(dL_dKmm, model.Z)
-    
-    isEnd = False
-    while not isEnd:
-        isEnd, n_range, grad_dict = model.inference_method.inference_minibatch(model.kern, model.X, model.Z, model.likelihood, model.Y)
-        if isinstance(model.X, VariationalPosterior):
-            
-            #gradients w.r.t. kernel
-            model.kern.update_gradients_expectations(variational_posterior=model.X[n_range[0]:n_range[1]], Z=model.Z, dL_dpsi0=grad_dict['dL_dpsi0'], dL_dpsi1=grad_dict['dL_dpsi1'], dL_dpsi2=grad_dict['dL_dpsi2'])
-            kern_grad += model.kern.gradient
-    
-            #gradients w.r.t. Z
-            model.Z.gradient[:,model.kern.active_dims] += model.kern.gradients_Z_expectations(
-                               grad_dict['dL_dpsi1'], grad_dict['dL_dpsi2'], Z=model.Z, variational_posterior=model.X[n_range[0]:n_range[1]])
-        
-            #gradients w.r.t. posterior parameters of X
-            X_grad = model.kern.gradients_qX_expectations(variational_posterior=model.X[n_range[0]:n_range[1]], Z=model.Z, dL_dpsi0=grad_dict['dL_dpsi0'], dL_dpsi1=grad_dict['dL_dpsi1'], dL_dpsi2=grad_dict['dL_dpsi2'])
-            model.set_X_gradients(model.X[n_range[0]:n_range[1]], X_grad)
-                
-            if het_noise:
-                dL_dthetaL[n_range[0]:n_range[1]] = grad_dict['dL_dthetaL']
-            else:
-                dL_dthetaL += grad_dict['dL_dthetaL']
-    
-    # Set the gradients w.r.t. kernel
-    model.kern.gradient = kern_grad
-
-    # Update Log-likelihood
-    model._log_marginal_likelihood -= model.variational_prior.KL_divergence(model.X)
-    # update for the KL divergence
-    model.variational_prior.update_gradients_KL(model.X)
-    
-    # dL_dthetaL
-    model.likelihood.update_gradients(dL_dthetaL)
 
 def latent_cost_and_grad(mu_S, kern, Z, dL_dpsi0, dL_dpsi1, dL_dpsi2):
     """
diff --git a/GPy/models/ss_gplvm.py b/GPy/models/ss_gplvm.py
index 1c2ecf4c..57be302a 100644
--- a/GPy/models/ss_gplvm.py
+++ b/GPy/models/ss_gplvm.py
@@ -11,6 +11,9 @@ from ..likelihoods import Gaussian
 from ..inference.optimization import SCG
 from ..util import linalg
 from ..core.parameterization.variational import SpikeAndSlabPrior, SpikeAndSlabPosterior
+from ..inference.latent_function_inference.var_dtc_parallel import update_gradients
+from ..inference.latent_function_inference.var_dtc_gpu import VarDTC_GPU
+
 
 class SSGPLVM(SparseGP):
     """
@@ -25,11 +28,14 @@ class SSGPLVM(SparseGP):
 
     """
     def __init__(self, Y, input_dim, X=None, X_variance=None, init='PCA', num_inducing=10,
-                 Z=None, kernel=None, inference_method=None, likelihood=None, name='Spike-and-Slab GPLVM', **kwargs):
+                 Z=None, kernel=None, inference_method=None, likelihood=None, name='Spike-and-Slab GPLVM', group_spike=False, **kwargs):
 
-        if X == None: # The mean of variational approximation (mu)
+        if X == None:
             from ..util.initialization import initialize_latent
-            X = initialize_latent(init, input_dim, Y)
+            X, fracs = initialize_latent(init, input_dim, Y)
+        else:
+            fracs = np.ones(input_dim)
+
         self.init = init
 
         if X_variance is None: # The variance of the variational approximation (S)
@@ -38,6 +44,9 @@ class SSGPLVM(SparseGP):
         gamma = np.empty_like(X) # The posterior probabilities of the binary variable in the variational approximation
         gamma[:] = 0.5 + 0.01 * np.random.randn(X.shape[0], input_dim)
         
+        if group_spike:
+            gamma[:] = gamma.mean(axis=0)
+        
         if Z is None:
             Z = np.random.permutation(X.copy())[:num_inducing]
         assert Z.shape[1] == X.shape[1]
@@ -46,18 +55,31 @@ class SSGPLVM(SparseGP):
             likelihood = Gaussian()
 
         if kernel is None:
-            kernel = kern.SSRBF(input_dim)
-            
+            kernel = kern.RBF(input_dim, lengthscale=fracs, ARD=True) # + kern.white(input_dim)
+                
         pi = np.empty((input_dim))
         pi[:] = 0.5
         self.variational_prior = SpikeAndSlabPrior(pi=pi) # the prior probability of the latent binary variable b
         X = SpikeAndSlabPosterior(X, X_variance, gamma)
+        
+        if group_spike:
+            kernel.group_spike_prob = True
+            self.variational_prior.group_spike_prob = True
+            
 
         SparseGP.__init__(self, X, Y, Z, kernel, likelihood, inference_method, name, **kwargs)
         self.add_parameter(self.X, index=0)
         self.add_parameter(self.variational_prior)
+        
+    def set_X_gradients(self, X, X_grad):
+        """Set the gradients of the posterior distribution of X in its specific form."""
+        X.mean.gradient, X.variance.gradient, X.binary_prob.gradient = X_grad
 
     def parameters_changed(self):
+        if isinstance(self.inference_method, VarDTC_GPU):
+            update_gradients(self)
+            return
+        
         super(SSGPLVM, self).parameters_changed()
         self._log_marginal_likelihood -= self.variational_prior.KL_divergence(self.X)
 
diff --git a/GPy/plotting/matplot_dep/__init__.py b/GPy/plotting/matplot_dep/__init__.py
index e2706903..f493513a 100644
--- a/GPy/plotting/matplot_dep/__init__.py
+++ b/GPy/plotting/matplot_dep/__init__.py
@@ -15,3 +15,5 @@ import latent_space_visualizations
 import netpbmfile
 import inference_plots
 import maps
+import img_plots
+from ssgplvm import SSGPLVM_plot
diff --git a/GPy/plotting/matplot_dep/img_plots.py b/GPy/plotting/matplot_dep/img_plots.py
new file mode 100644
index 00000000..21dbd64f
--- /dev/null
+++ b/GPy/plotting/matplot_dep/img_plots.py
@@ -0,0 +1,56 @@
+"""
+The module contains the tools for ploting 2D image visualizations
+"""
+
+import numpy as np
+from matplotlib.cm import jet
+
+width_max = 15
+height_max = 12
+
+def _calculateFigureSize(x_size, y_size, fig_ncols, fig_nrows, pad):
+    width = (x_size*fig_ncols+pad*(fig_ncols-1))
+    height = (y_size*fig_nrows+pad*(fig_nrows-1))
+    if width > float(height)/height_max*width_max:
+        return (width_max, float(width_max)/width*height)
+    else:
+        return (float(height_max)/height*width, height_max)
+
+def plot_2D_images(figure, arr, symmetric=False, pad=None, zoom=None, mode=None, interpolation='nearest'):
+    ax = figure.add_subplot(111)
+    if len(arr.shape)==2:
+        arr = arr.reshape(*((1,)+arr.shape))
+    fig_num = arr.shape[0]
+    y_size = arr.shape[1]
+    x_size = arr.shape[2]
+    fig_ncols = int(np.ceil(np.sqrt(fig_num)))
+    fig_nrows = int(np.ceil((float)(fig_num)/fig_ncols))
+    if pad==None:
+        pad = max(int(min(y_size,x_size)/10),1)
+    
+    figsize = _calculateFigureSize(x_size, y_size, fig_ncols, fig_nrows, pad)
+    #figure.set_size_inches(figsize,forward=True)
+    #figure.subplots_adjust(left=0.05, bottom=0.05, right=0.95, top=0.95)
+    
+    if symmetric:
+        # symmetric around zero: fix zero as the middle color
+        mval = max(abs(arr.max()),abs(arr.min()))
+        arr = arr/(2.*mval)+0.5
+    else:
+        minval,maxval = arr.min(),arr.max()
+        arr = (arr-minval)/(maxval-minval)
+
+    if mode=='L':
+        arr_color = np.empty(arr.shape+(3,))
+        arr_color[:] = arr.reshape(*(arr.shape+(1,)))
+    elif mode==None or mode=='jet':
+        arr_color = jet(arr)
+    
+    buf = np.ones((y_size*fig_nrows+pad*(fig_nrows-1), x_size*fig_ncols+pad*(fig_ncols-1), 3),dtype=arr.dtype)
+    
+    for y in xrange(fig_nrows):
+        for x in xrange(fig_ncols):
+            if y*fig_ncols+x<fig_num:
+                buf[y*y_size+y*pad:(y+1)*y_size+y*pad, x*x_size+x*pad:(x+1)*x_size+x*pad] = arr_color[y*fig_ncols+x,:,:,:3]
+    img_plot = ax.imshow(buf, interpolation=interpolation)
+    ax.axis('off')
diff --git a/GPy/plotting/matplot_dep/ssgplvm.py b/GPy/plotting/matplot_dep/ssgplvm.py
new file mode 100644
index 00000000..4106e251
--- /dev/null
+++ b/GPy/plotting/matplot_dep/ssgplvm.py
@@ -0,0 +1,29 @@
+"""
+The module plotting results for SSGPLVM
+"""
+
+import pylab
+
+from ...models import SSGPLVM
+from img_plots import plot_2D_images
+from ...util.misc import param_to_array
+
+class SSGPLVM_plot(object):
+    def __init__(self,model, imgsize):
+        assert isinstance(model,SSGPLVM)
+        self.model = model
+        self.imgsize= imgsize
+        assert model.Y.shape[1] == imgsize[0]*imgsize[1]
+        
+    def plot_inducing(self):
+        fig1 = pylab.figure()
+        mean = self.model.posterior.mean
+        arr = mean.reshape(*(mean.shape[0],self.imgsize[1],self.imgsize[0]))
+        plot_2D_images(fig1, arr)
+        fig1.gca().set_title('The mean of inducing points')
+        
+        fig2 = pylab.figure()
+        covar = self.model.posterior.covariance
+        plot_2D_images(fig2, covar)
+        fig2.gca().set_title('The variance of inducing points')
+        
diff --git a/GPy/plotting/matplot_dep/variational_plots.py b/GPy/plotting/matplot_dep/variational_plots.py
index cf00d8a2..27cb4051 100644
--- a/GPy/plotting/matplot_dep/variational_plots.py
+++ b/GPy/plotting/matplot_dep/variational_plots.py
@@ -45,7 +45,7 @@ def plot(parameterized, fignum=None, ax=None, colors=None):
     fig.tight_layout(h_pad=.01) # , rect=(0, 0, 1, .95))
     return fig
 
-def plot_SpikeSlab(parameterized, fignum=None, ax=None, colors=None):
+def plot_SpikeSlab(parameterized, fignum=None, ax=None, colors=None, side_by_side=True):
     """
     Plot latent space X in 1D:
 
@@ -58,7 +58,10 @@ def plot_SpikeSlab(parameterized, fignum=None, ax=None, colors=None):
 
     """
     if ax is None:
-        fig = pb.figure(num=fignum, figsize=(8, min(12, (2 * parameterized.mean.shape[1]))))
+        if side_by_side:
+            fig = pb.figure(num=fignum, figsize=(16, min(12, (2 * parameterized.mean.shape[1]))))
+        else:
+            fig = pb.figure(num=fignum, figsize=(8, min(12, (2 * parameterized.mean.shape[1]))))
     if colors is None:
         colors = pb.gca()._get_lines.color_cycle
         pb.clf()
@@ -68,8 +71,15 @@ def plot_SpikeSlab(parameterized, fignum=None, ax=None, colors=None):
     means, variances, gamma = param_to_array(parameterized.mean, parameterized.variance, parameterized.binary_prob)
     x = np.arange(means.shape[0])
     for i in range(means.shape[1]):
+        if side_by_side:
+            sub1 = (means.shape[1],2,2*i+1)
+            sub2 = (means.shape[1],2,2*i+2)
+        else:
+            sub1 = (means.shape[1]*2,1,2*i+1)
+            sub2 = (means.shape[1]*2,1,2*i+2)
+            
         # mean and variance plot
-        a = fig.add_subplot(means.shape[1]*2, 1, 2*i + 1)
+        a = fig.add_subplot(*sub1)
         a.plot(means, c='k', alpha=.3)
         plots.extend(a.plot(x, means.T[i], c=colors.next(), label=r"$\mathbf{{X_{{{}}}}}$".format(i)))
         a.fill_between(x,
@@ -82,7 +92,7 @@ def plot_SpikeSlab(parameterized, fignum=None, ax=None, colors=None):
         if i < means.shape[1] - 1:
             a.set_xticklabels('')
         # binary prob plot
-        a = fig.add_subplot(means.shape[1]*2, 1, 2*i + 2)
+        a = fig.add_subplot(*sub2)
         a.bar(x,gamma[:,i],bottom=0.,linewidth=0,align='center')
         a.set_xlim(x.min(), x.max())
         a.set_ylim([0.,1.])
diff --git a/GPy/plotting/matplot_dep/visualize.py b/GPy/plotting/matplot_dep/visualize.py
index f8bcc9f9..cf457633 100644
--- a/GPy/plotting/matplot_dep/visualize.py
+++ b/GPy/plotting/matplot_dep/visualize.py
@@ -85,6 +85,7 @@ class vector_show(matplotlib_show):
 
 
 class lvm(matplotlib_show):
+    
     def __init__(self, vals, model, data_visualize, latent_axes=None, sense_axes=None, latent_index=[0,1]):
         """Visualize a latent variable model
 
@@ -98,7 +99,7 @@ class lvm(matplotlib_show):
                 vals = param_to_array(model.X.mean)
             else:
                 vals = param_to_array(model.X)
-        
+         
         vals = param_to_array(vals)
         matplotlib_show.__init__(self, vals, axes=latent_axes)
 
@@ -121,7 +122,7 @@ class lvm(matplotlib_show):
         self.move_on = False
         self.latent_index = latent_index
         self.latent_dim = model.input_dim
-
+ 
         # The red cross which shows current latent point.
         self.latent_values = vals
         self.latent_handle = self.latent_axes.plot([0],[0],'rx',mew=2)[0]
@@ -130,10 +131,10 @@ class lvm(matplotlib_show):
 
     def modify(self, vals):
         """When latent values are modified update the latent representation and ulso update the output visualization."""
-        self.vals = vals.copy()
+        self.vals = vals[None,:].copy()
         y = self.model.predict(self.vals)[0]
         self.data_visualize.modify(y)
-        self.latent_handle.set_data(self.vals[self.latent_index[0]], self.vals[self.latent_index[1]])
+        self.latent_handle.set_data(self.vals[:,self.latent_index[0]], self.vals[:,self.latent_index[1]])
         self.axes.figure.canvas.draw()
 
 
diff --git a/GPy/testing/pickle_tests.py b/GPy/testing/pickle_tests.py
index fc52581a..d975aaa3 100644
--- a/GPy/testing/pickle_tests.py
+++ b/GPy/testing/pickle_tests.py
@@ -191,13 +191,14 @@ class Test(ListDictTestCase):
         par.count = 0
         par.add_observer(self, self._callback, 1)
         pcopy = GPRegression(par.X.copy(), par.Y.copy(), kernel=par.kern.copy())
-        self.assertNotIn(par._observer_callables_[0], pcopy._observer_callables_)
+        self.assertNotIn(par.observers[0], pcopy.observers)
         pcopy = par.copy()
         pcopy.name = "copy"
         self.assertTrue(par.checkgrad())
         self.assertTrue(pcopy.checkgrad())
         self.assertTrue(pcopy.kern.checkgrad())
-        self.assertIn(par._observer_callables_[0], pcopy._observer_callables_)
+        import ipdb;ipdb.set_trace()
+        self.assertIn(par.observers[0], pcopy.observers)
         self.assertEqual(par.count, 3)
         self.assertEqual(pcopy.count, 6) # 3 of each call to checkgrad
 
diff --git a/GPy/util/__init__.py b/GPy/util/__init__.py
index 8aea990c..77312278 100644
--- a/GPy/util/__init__.py
+++ b/GPy/util/__init__.py
@@ -15,6 +15,7 @@ import caching
 import diag
 import initialization
 import multioutput
+import linalg_gpu
 
 try:
     import sympy
diff --git a/GPy/util/caching.py b/GPy/util/caching.py
index ced56727..bb162ee3 100644
--- a/GPy/util/caching.py
+++ b/GPy/util/caching.py
@@ -66,6 +66,7 @@ class Cacher(object):
                 #first make sure the depth limit isn't exceeded
                 if len(self.cached_inputs) == self.limit:
                     args_ = self.cached_inputs.pop(0)
+                    args_ = [a for i,a in enumerate(args_) if i not in self.ignore_args and i not in self.force_kwargs]
                     [a.remove_observer(self, self.on_cache_changed) for a in args_ if a is not None]
                     self.inputs_changed.pop(0)
                     self.cached_outputs.pop(0)
diff --git a/GPy/util/gpu_init.py b/GPy/util/gpu_init.py
new file mode 100644
index 00000000..917d8158
--- /dev/null
+++ b/GPy/util/gpu_init.py
@@ -0,0 +1,16 @@
+"""
+The package for scikits.cuda initialization
+
+Global variables: initSuccess
+providing CUBLAS handle: cublas_handle
+"""
+
+try:
+    import pycuda.autoinit
+    from scikits.cuda import cublas
+    import scikits.cuda.linalg as culinalg
+    culinalg.init()
+    cublas_handle = cublas.cublasCreate()
+    initSuccess = True
+except:
+    initSuccess = False
\ No newline at end of file
diff --git a/GPy/util/linalg_gpu.py b/GPy/util/linalg_gpu.py
new file mode 100644
index 00000000..bae6af01
--- /dev/null
+++ b/GPy/util/linalg_gpu.py
@@ -0,0 +1,59 @@
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+
+#
+# The utility functions for GPU computation
+#
+import numpy as np
+
+from ..util import gpu_init
+
+try:
+    from pycuda.reduction import ReductionKernel
+    from pycuda.elementwise import ElementwiseKernel
+    
+    # log|A| for A is a low triangle matrix
+    # logDiagSum(A, A.shape[0]+1)
+    logDiagSum = ReductionKernel(np.float64, neutral="0", reduce_expr="a+b", map_expr="i%step==0?log(x[i]):0", arguments="double *x, int step")
+    
+    strideSum = ReductionKernel(np.float64, neutral="0", reduce_expr="a+b", map_expr="i%step==0?x[i]:0", arguments="double *x, int step")
+    
+    #=======================================================================================
+    # Element-wise functions
+    #=======================================================================================
+    
+    # log(X)
+    log = ElementwiseKernel("double *in, double *out", "out[i] = log(in[i])", "log_element")
+    
+    # log(1.0-X)
+    logOne = ElementwiseKernel("double *in, double *out", "out[i] = log(1.-in[i])", "logOne_element")
+    
+    # multiplication with broadcast on the last dimension (out = shorter[:,None]*longer)
+    mul_bcast = ElementwiseKernel("double *out, double *shorter, double *longer, int shorter_size", "out[i] = longer[i]*shorter[i%shorter_size]", "mul_bcast")
+    
+    # multiplication with broadcast on the first dimension (out = shorter[None,:]*longer)
+    mul_bcast_first = ElementwiseKernel("double *out, double *shorter, double *longer, int first_dim", "out[i] = longer[i]*shorter[i/first_dim]", "mul_bcast")
+    
+    # sum through the middle dimension (size_2) of a 3D matrix (size_1, size_2, size_3) 
+    sum_axis = ElementwiseKernel("double *out, double *in, int size_1, int size_2", "out[i] += sum_axis_element(in, size_1, size_2, i)", "sum_axis",preamble="""        
+        __device__ double sum_axis_element(double *in, int size_1, int size_2, int idx)
+        {
+            int k = idx/size_1;
+            int i = idx%size_1;
+            double sum=0;
+            for(int j=0;j<size_2;j++) {
+                sum += in[(k*size_2+j)*size_1+i];
+            }
+            return sum;
+        }
+        """)
+    
+    # the outer product between two vectors (out = np.dot(v1,v2.T))
+    outer_prod = ElementwiseKernel("double *out, double *v1, double *v2, int v1_size", "out[i] = v1[i%v1_size]*v2[i/v1_size]", "outer_prod")
+
+    # the outer product between two vectors (out = np.einsum('na,nb->nab',m1,m2) a=dim1, b=dim2 )
+    join_prod = ElementwiseKernel("double *out, double *m1, double *m2, int dim1, int dim2", "out[i] = m1[(i%dim1)*dim1+(i%(dim1*dim2))/dim1]*m2[(i%dim1)*dim1+i/(dim1*dim2)]", "join_prod")
+
+except:
+    pass