diff --git a/GPy/inference/latent_function_inference/var_dtc.py b/GPy/inference/latent_function_inference/var_dtc.py index fec61204..2b7ca7ad 100644 --- a/GPy/inference/latent_function_inference/var_dtc.py +++ b/GPy/inference/latent_function_inference/var_dtc.py @@ -60,7 +60,7 @@ class VarDTC(object): _, output_dim = Y.shape #see whether we've got a different noise variance for each datum - beta = 1./np.squeeze(likelihood.variance) + beta = 1./np.fmax(1e-6, np.squeeze(likelihood.variance)) # VVT_factor is a matrix such that tdot(VVT_factor) = VVT...this is for efficiency! #self.YYTfactor = self.get_YYTfactor(Y) @@ -214,7 +214,7 @@ class VarDTCMissingData(object): psi2_all = None Ys, traces = self._Y(Y) - beta_all = 1./likelihood.variance + beta_all = 1./np.fmax(1e-6, likelihood.variance) het_noise = beta_all.size != 1 import itertools diff --git a/GPy/kern/_src/rbf.py b/GPy/kern/_src/rbf.py index d4a60077..38022bd4 100644 --- a/GPy/kern/_src/rbf.py +++ b/GPy/kern/_src/rbf.py @@ -159,7 +159,7 @@ class RBF(Stationary): grad_mu = np.sum(dL_dpsi1[:, :, None] * tmp * dist, 1) grad_S = np.sum(dL_dpsi1[:, :, None] * 0.5 * tmp * (dist_sq - 1), 1) #psi2 - denom, Zdist, Zdist_sq, mudist, mudist_sq, psi2 = self._psi2computations(Z, variational_posterior) + denom, _, _, mudist, mudist_sq, psi2 = self._psi2computations(Z, variational_posterior) tmp = psi2[:, :, :, None] / l2 / denom grad_mu += -2.*(dL_dpsi2[:, :, :, None] * tmp * mudist).sum(1).sum(1) grad_S += (dL_dpsi2[:, :, :, None] * tmp * (2.*mudist_sq - 1)).sum(1).sum(1) @@ -237,7 +237,7 @@ class RBF(Stationary): return denom, dist, dist_sq, psi1 - #@cache_this(ignore_args=(1,)) + @Cache_this(limit=1, ignore_args=(0,)) def _Z_distances(self, Z): Zhat = 0.5 * (Z[:, None, :] + Z[None, :, :]) # M,M,Q Zdist = 0.5 * (Z[:, None, :] - Z[None, :, :]) # M,M,Q diff --git a/GPy/kern/_src/stationary.py b/GPy/kern/_src/stationary.py index ae4cd879..bc51d850 100644 --- a/GPy/kern/_src/stationary.py +++ b/GPy/kern/_src/stationary.py @@ -69,18 
+69,18 @@ class Stationary(Kern): def dK_dr(self, r): raise NotImplementedError, "implement derivative of the covariance function wrt r to use this class" - #@Cache_this(limit=5, ignore_args=()) + @Cache_this(limit=5, ignore_args=()) def K(self, X, X2=None): r = self._scaled_dist(X, X2) return self.K_of_r(r) - #@Cache_this(limit=5, ignore_args=(0,)) + @Cache_this(limit=5, ignore_args=(0,)) def _dist(self, X, X2): if X2 is None: X2 = X return X[:, None, :] - X2[None, :, :] - #@Cache_this(limit=5, ignore_args=(0,)) + @Cache_this(limit=5, ignore_args=(0,)) def _unscaled_dist(self, X, X2=None): """ Compute the square distance between each row of X and X2, or between @@ -94,7 +94,7 @@ class Stationary(Kern): X2sq = np.sum(np.square(X2),1) return np.sqrt(-2.*np.dot(X, X2.T) + (X1sq[:,None] + X2sq[None,:])) - #@Cache_this(limit=5, ignore_args=()) + @Cache_this(limit=5, ignore_args=()) def _scaled_dist(self, X, X2=None): """ Efficiently compute the scaled distance, r. @@ -147,7 +147,7 @@ class Stationary(Kern): diagonal, where we return zero (the distance on the diagonal is zero). This term appears in derviatives. """ - dist = self._scaled_dist(X, X2) + dist = self._scaled_dist(X, X2).copy() if X2 is None: nondiag = util.diag.offdiag_view(dist) nondiag[:] = 1./nondiag diff --git a/GPy/util/caching.py b/GPy/util/caching.py index 76d030ca..a2017407 100644 --- a/GPy/util/caching.py +++ b/GPy/util/caching.py @@ -39,7 +39,7 @@ class Cacher(object): return self.operation(*args) # TODO: WARNING !!! Cache OFFSWITCH !!! WARNING - return self.operation(*args) + # return self.operation(*args) #if the result is cached, return the cached computation state = [all(a is b for a, b in zip(args, cached_i)) for cached_i in self.cached_inputs]