Mirror of https://github.com/SheffieldML/GPy.git (synced 2026-05-10 20:42:39 +02:00)
merge the discriminative prior to devel
Commit 2d89fa821a
72 changed files with 1681 additions and 1894 deletions
|
|
@ -93,6 +93,22 @@ class GP(Model):
|
|||
self.link_parameter(self.kern)
|
||||
self.link_parameter(self.likelihood)
|
||||
|
||||
def set_X(self,X):
|
||||
# TODO: it does not work with BGPLVM
|
||||
if isinstance(X, ObsAr):
|
||||
self.X = X
|
||||
else:
|
||||
self.X = ObsAr(X)
|
||||
|
||||
def set_Y(self,Y):
|
||||
if self.normalizer is not None:
|
||||
self.normalizer.scale_by(Y)
|
||||
self.Y_normalized = ObsAr(self.normalizer.normalize(Y))
|
||||
self.Y = Y
|
||||
else:
|
||||
self.Y = ObsAr(Y)
|
||||
self.Y_normalized = self.Y
|
||||
|
||||
def parameters_changed(self):
|
||||
self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.likelihood, self.Y_normalized, self.Y_metadata)
|
||||
self.likelihood.update_gradients(self.grad_dict['dL_dthetaL'])
|
||||
|
|
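A brief, hedged sketch of how the setters added in this hunk might be used from a model built on GP (GPRegression subclasses GP; the data shapes and the explicit parameters_changed() call are illustrative assumptions, not part of the diff):

import numpy as np
import GPy

# toy data and a plain GP regression (kernel defaults to RBF)
X = np.random.rand(20, 1)
Y = np.sin(6 * X) + 0.05 * np.random.randn(20, 1)
m = GPy.models.GPRegression(X, Y)
m.optimize()

# swap in new data of the same shape via the setters added above,
# then ask the model to recompute its posterior
X_new = np.random.rand(20, 1)
Y_new = np.sin(6 * X_new) + 0.05 * np.random.randn(20, 1)
m.set_X(X_new)
m.set_Y(Y_new)
m.parameters_changed()  # force inference to rerun with the new data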
|
|||
|
|
@ -13,11 +13,7 @@ class Mapping(Parameterized):
|
|||
def __init__(self, input_dim, output_dim, name='mapping'):
|
||||
self.input_dim = input_dim
|
||||
self.output_dim = output_dim
|
||||
|
||||
super(Mapping, self).__init__(name=name)
|
||||
# Model.__init__(self)
|
||||
# All leaf nodes should call self._set_params(self._get_params()) at
|
||||
# the end
|
||||
|
||||
def f(self, X):
|
||||
raise NotImplementedError
|
||||
|
|
@ -56,7 +52,8 @@ class Mapping(Parameterized):
|
|||
Can plot only part of the data and part of the posterior functions
|
||||
using which_data and which_functions
|
||||
|
||||
This is a convenience function: arguments are passed to GPy.plotting.matplot_dep.models_plots.plot_mapping
|
||||
This is a convenience function: arguments are passed to
|
||||
GPy.plotting.matplot_dep.models_plots.plot_mapping
|
||||
"""
|
||||
|
||||
if "matplotlib" in sys.modules:
|
||||
|
|
@ -66,7 +63,10 @@ class Mapping(Parameterized):
|
|||
raise NameError, "matplotlib package has not been imported."
|
||||
|
||||
class Bijective_mapping(Mapping):
|
||||
"""This is a mapping that is bijective, i.e. you can go from X to f and also back from f to X. The inverse mapping is called g()."""
|
||||
"""
|
||||
This is a mapping that is bijective, i.e. you can go from X to f and
|
||||
also back from f to X. The inverse mapping is called g().
|
||||
"""
|
||||
def __init__(self, input_dim, output_dim, name='bijective_mapping'):
|
||||
super(Bijective_mapping, self).__init__(name=name)
|
||||
|
||||
|
|
@ -77,7 +77,11 @@ class Bijective_mapping(Mapping):
|
|||
from model import Model
|
||||
|
||||
class Mapping_check_model(Model):
|
||||
"""This is a dummy model class used as a base class for checking that the gradients of a given mapping are implemented correctly. It enables checkgradient() to be called independently on each mapping."""
|
||||
"""
|
||||
This is a dummy model class used as a base class for checking that the
|
||||
gradients of a given mapping are implemented correctly. It enables
|
||||
checkgradient() to be called independently on each mapping.
|
||||
"""
|
||||
def __init__(self, mapping=None, dL_df=None, X=None):
|
||||
num_samples = 20
|
||||
if mapping==None:
|
||||
|
|
|
|||
|
|
@ -213,6 +213,7 @@ class Model(Parameterized):
|
|||
def optimize(self, optimizer=None, start=None, **kwargs):
|
||||
"""
|
||||
Optimize the model using self.log_likelihood and self.log_likelihood_gradient, as well as self.priors.
|
||||
|
||||
kwargs are passed to the optimizer. They can be:
|
||||
|
||||
:param max_f_eval: maximum number of function evaluations
|
||||
|
|
@ -222,7 +223,15 @@ class Model(Parameterized):
|
|||
:param optimizer: which optimizer to use (defaults to self.preferred optimizer)
|
||||
:type optimizer: string
|
||||
|
||||
TODO: valid args
|
||||
Valid optimizers are:
|
||||
- 'scg': scaled conjugate gradient method, recommended for stability.
|
||||
See also GPy.inference.optimization.scg
|
||||
- 'fmin_tnc': truncated Newton method (see scipy.optimize.fmin_tnc)
|
||||
- 'simplex': the Nelder-Mead simplex method (see scipy.optimize.fmin),
|
||||
- 'lbfgsb': the l-bfgs-b method (see scipy.optimize.fmin_l_bfgs_b),
|
||||
- 'sgd': stochastic gradient descent (see scipy.optimize.sgd). For experts only!
|
||||
|
||||
|
||||
"""
|
||||
if self.is_fixed:
|
||||
raise RuntimeError, "Cannot optimize, when everything is fixed"
|
||||
|
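A minimal, hedged usage sketch for the optimizer options documented above (the data and kernel choice are illustrative; max_iters and messages are among the kwargs forwarded to the optimizer):

import numpy as np
import GPy

X = np.linspace(0, 1, 30)[:, None]
Y = np.sin(6 * X) + 0.1 * np.random.randn(30, 1)
m = GPy.models.GPRegression(X, Y, GPy.kern.RBF(1))

# any of the optimizer names listed in the docstring can be passed as a string
m.optimize(optimizer='scg', messages=True, max_iters=200)
print(m)  # log-likelihood and fitted parameters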
|
@ -372,6 +381,16 @@ class Model(Parameterized):
|
|||
self.optimizer_array = x
|
||||
return ret
|
||||
|
||||
def _repr_html_(self):
|
||||
"""Representation of the model in html for notebook display."""
|
||||
model_details = [['<b>Model</b>', self.name + '<br>'],
|
||||
['<b>Log-likelihood</b>', '{}<br>'.format(float(self.log_likelihood()))],
|
||||
["<b>Number of Parameters</b>", '{}<br>'.format(self.size)]]
|
||||
from operator import itemgetter
|
||||
to_print = [""] + ["{}: {}".format(name, detail) for name, detail in model_details] + ["<br><b>Parameters</b>:"]
|
||||
to_print.append(super(Model, self)._repr_html_())
|
||||
return "\n".join(to_print)
|
||||
|
||||
def __str__(self):
|
||||
model_details = [['Name', self.name],
|
||||
['Log-likelihood', '{}'.format(float(self.log_likelihood()))],
|
||||
|
|
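The new _repr_html_ is picked up automatically by the notebook display machinery; a hedged sketch (model and data are illustrative):

import numpy as np
import GPy

X = np.random.rand(25, 1)
Y = np.sin(6 * X) + 0.1 * np.random.randn(25, 1)
m = GPy.models.GPRegression(X, Y)
m.optimize()
m   # in an IPython/Jupyter cell this renders the HTML table via m._repr_html_()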
|
|||
|
|
@ -49,11 +49,13 @@ class Param(Parameterizable, ObsAr):
|
|||
obj._realshape_ = obj.shape
|
||||
obj._realsize_ = obj.size
|
||||
obj._realndim_ = obj.ndim
|
||||
obj._original_ = True
|
||||
obj._original_ = obj
|
||||
return obj
|
||||
|
||||
def __init__(self, name, input_array, default_constraint=None, *a, **kw):
|
||||
self._in_init_ = True
|
||||
super(Param, self).__init__(name=name, default_constraint=default_constraint, *a, **kw)
|
||||
self._in_init_ = False
|
||||
|
||||
def build_pydot(self,G):
|
||||
import pydot
|
||||
|
|
@ -127,7 +129,7 @@ class Param(Parameterizable, ObsAr):
|
|||
try:
|
||||
new_arr._current_slice_ = s
|
||||
new_arr._gradient_array_ = self.gradient[s]
|
||||
new_arr._original_ = self.base is new_arr.base
|
||||
new_arr._original_ = self._original_
|
||||
except AttributeError: pass # returning 0d array or float, double etc
|
||||
return new_arr
|
||||
|
||||
|
|
@ -157,7 +159,7 @@ class Param(Parameterizable, ObsAr):
|
|||
return self.constraints[__fixed__].size == self.size
|
||||
|
||||
def _get_original(self, param):
|
||||
return self
|
||||
return self._original_
|
||||
|
||||
#===========================================================================
|
||||
# Pickling and copying
|
||||
|
|
@ -249,6 +251,29 @@ class Param(Parameterizable, ObsAr):
|
|||
if ind.size > 4: indstr = ','.join(map(str, ind[:2])) + "..." + ','.join(map(str, ind[-2:]))
|
||||
else: indstr = ','.join(map(str, ind))
|
||||
return name + '[' + indstr + ']'
|
||||
|
||||
def _repr_html_(self, constr_matrix=None, indices=None, prirs=None, ties=None):
|
||||
"""Representation of the parameter in html for notebook display."""
|
||||
filter_ = self._current_slice_
|
||||
vals = self.flat
|
||||
if indices is None: indices = self._indices(filter_)
|
||||
ravi = self._raveled_index(filter_)
|
||||
if constr_matrix is None: constr_matrix = self.constraints.properties_for(ravi)
|
||||
if prirs is None: prirs = self.priors.properties_for(ravi)
|
||||
if ties is None: ties = self._ties_for(ravi)
|
||||
ties = [' '.join(map(lambda x: x, t)) for t in ties]
|
||||
header_format = """
|
||||
<tr>
|
||||
<td><b>{i}</b></td>
|
||||
<td><b>{x}</b></td>
|
||||
<td><b>{c}</b></td>
|
||||
<td><b>{p}</b></td>
|
||||
<td><b>{t}</b></td>
|
||||
</tr>"""
|
||||
header = header_format.format(x=self.hierarchy_name(), c=__constraints_name__, i=__index_name__, t=__tie_name__, p=__priors_name__) # nice header for printing
|
||||
if not ties: ties = itertools.cycle([''])
|
||||
return "\n".join(['<table>'] + [header] + ["<tr><td>{i}</td><td align=\"right\">{x}</td><td>{c}</td><td>{p}</td><td>{t}</td></tr>".format(x=x, c=" ".join(map(str, c)), p=" ".join(map(str, p)), t=(t or ''), i=i) for i, x, c, t, p in itertools.izip(indices, vals, constr_matrix, ties, prirs)] + ["</table>"])
|
||||
|
||||
def __str__(self, constr_matrix=None, indices=None, prirs=None, ties=None, lc=None, lx=None, li=None, lp=None, lt=None, only_name=False):
|
||||
filter_ = self._current_slice_
|
||||
vals = self.flat
|
||||
|
|
|
|||
|
|
@ -14,12 +14,11 @@ Observable Pattern for parameterization
|
|||
"""
|
||||
|
||||
from transformations import Transformation,Logexp, NegativeLogexp, Logistic, __fixed__, FIXED, UNFIXED
|
||||
from ...util.misc import param_to_array
|
||||
import numpy as np
|
||||
import re
|
||||
import logging
|
||||
|
||||
__updated__ = '2014-05-21'
|
||||
__updated__ = '2014-10-09'
|
||||
|
||||
class HierarchyError(Exception):
|
||||
"""
|
||||
|
|
@ -92,16 +91,18 @@ class Observable(object):
|
|||
def toggle_update(self):
|
||||
self.update_model(not self.update())
|
||||
|
||||
def trigger_update(self):
|
||||
def trigger_update(self, trigger_parent=True):
|
||||
"""
|
||||
Update the model from the current state.
|
||||
Make sure that updates are on, otherwise this
|
||||
method will do nothing
|
||||
|
||||
:param bool trigger_parent: Whether to trigger the parent, after self has updated
|
||||
"""
|
||||
if not self.update_model():
|
||||
if not self.update_model() or self._in_init_:
|
||||
#print "Warning: updates are off, updating the model will do nothing"
|
||||
return
|
||||
self._trigger_params_changed()
|
||||
self._trigger_params_changed(trigger_parent)
|
||||
|
||||
def add_observer(self, observer, callble, priority=0):
|
||||
"""
|
||||
|
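One pattern the update/trigger machinery above supports (and which the randomize code later in this diff relies on) is switching updates off while several parameters are changed; a hedged sketch, assuming an existing model m:

# suspend recomputation while several parameters are changed,
# then switch updates back on to trigger a single recompute
m.update_model(False)
m.kern.lengthscale = 2.0
m.kern.variance = 0.5
m.likelihood.variance = 0.01
m.update_model(True)   # updates back on; the model recomputes once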
|
@ -635,10 +636,18 @@ class Indexable(Nameable, Observable):
|
|||
"""
|
||||
From Parentable:
|
||||
Called when the parent changed
|
||||
|
||||
update the constraints and priors view, so that
|
||||
constraining is automated for the parent.
|
||||
"""
|
||||
from index_operations import ParameterIndexOperationsView
|
||||
self.constraints = ParameterIndexOperationsView(parent.constraints, parent._offset_for(self), self.size)
|
||||
self.priors = ParameterIndexOperationsView(parent.priors, parent._offset_for(self), self.size)
|
||||
#if getattr(self, "_in_init_"):
|
||||
#import ipdb;ipdb.set_trace()
|
||||
#self.constraints.update(param.constraints, start)
|
||||
#self.priors.update(param.priors, start)
|
||||
offset = parent._offset_for(self)
|
||||
self.constraints = ParameterIndexOperationsView(parent.constraints, offset, self.size)
|
||||
self.priors = ParameterIndexOperationsView(parent.priors, offset, self.size)
|
||||
self._fixes_ = None
|
||||
for p in self.parameters:
|
||||
p._parent_changed(parent)
|
||||
|
|
@ -741,6 +750,7 @@ class OptimizationHandlable(Indexable):
|
|||
self.param_array.flat[f] = p
|
||||
[np.put(self.param_array, ind[f[ind]], c.f(self.param_array.flat[ind[f[ind]]]))
|
||||
for c, ind in self.constraints.iteritems() if c != __fixed__]
|
||||
#self._highest_parent_.tie.propagate_val()
|
||||
|
||||
self._optimizer_copy_transformed = False
|
||||
self._trigger_params_changed()
|
||||
|
|
@ -826,15 +836,16 @@ class OptimizationHandlable(Indexable):
|
|||
"""
|
||||
# first take care of all parameters (from N(0,1))
|
||||
x = rand_gen(size=self._size_transformed(), *args, **kwargs)
|
||||
updates = self.update_model()
|
||||
self.update_model(False) # Switch off the updates
|
||||
self.optimizer_array = x # makes sure all of the tied parameters get the same init (since there's only one prior object...)
|
||||
# now draw from prior where possible
|
||||
x = param_to_array(self.param_array).flat.copy()
|
||||
x = self.param_array.copy()
|
||||
[np.put(x, ind, p.rvs(ind.size)) for p, ind in self.priors.iteritems() if not p is None]
|
||||
unfixlist = np.ones((self.size,),dtype=np.bool)
|
||||
unfixlist[self.constraints[__fixed__]] = False
|
||||
self.param_array.flat[unfixlist] = x[unfixlist]
|
||||
self.update_model(True)
|
||||
self.param_array.flat[unfixlist] = x.view(np.ndarray).ravel()[unfixlist]
|
||||
self.update_model(updates)
|
||||
|
||||
#===========================================================================
|
||||
# For shared memory arrays. This does nothing in Param, but sets the memory
|
||||
|
|
|
|||
|
|
@ -149,6 +149,7 @@ class Parameterized(Parameterizable):
|
|||
self.priors.update(param.priors, start)
|
||||
self.parameters.insert(index, param)
|
||||
|
||||
self._notify_parent_change()
|
||||
param.add_observer(self, self._pass_through_notify_observers, -np.inf)
|
||||
|
||||
parent = self
|
||||
|
|
@ -356,6 +357,35 @@ class Parameterized(Parameterizable):
|
|||
@property
|
||||
def _ties_str(self):
|
||||
return [','.join(x._ties_str) for x in self.flattened_parameters]
|
||||
|
||||
def _repr_html_(self, header=True):
|
||||
"""Representation of the parameters in html for notebook display."""
|
||||
name = adjust_name_for_printing(self.name) + "."
|
||||
constrs = self._constraints_str;
|
||||
ts = self._ties_str
|
||||
prirs = self._priors_str
|
||||
desc = self._description_str; names = self.parameter_names()
|
||||
nl = max([len(str(x)) for x in names + [name]])
|
||||
sl = max([len(str(x)) for x in desc + ["Value"]])
|
||||
cl = max([len(str(x)) if x else 0 for x in constrs + ["Constraint"]])
|
||||
tl = max([len(str(x)) if x else 0 for x in ts + ["Tied to"]])
|
||||
pl = max([len(str(x)) if x else 0 for x in prirs + ["Prior"]])
|
||||
format_spec = "<tr><td>{{name:<{0}s}}</td><td align=\"right\">{{desc:>{1}s}}</td><td>{{const:^{2}s}}</td><td>{{pri:^{3}s}}</td><td>{{t:^{4}s}}</td></tr>".format(nl, sl, cl, pl, tl)
|
||||
to_print = []
|
||||
for n, d, c, t, p in itertools.izip(names, desc, constrs, ts, prirs):
|
||||
to_print.append(format_spec.format(name=n, desc=d, const=c, t=t, pri=p))
|
||||
sep = '-' * (nl + sl + cl + + pl + tl + 8 * 2 + 3)
|
||||
if header:
|
||||
header = """
|
||||
<tr>
|
||||
<td><b>{name}</b>
|
||||
<td><b>Value</b></td>
|
||||
<td><b>Constraint</b></td>
|
||||
<td><b>Prior</b></td>
|
||||
<td><b>Tied to</b></td>""".format(name=name)
|
||||
to_print.insert(0, header)
|
||||
return '<table>' + '\n'.format(sep).join(to_print) + '\n</table>'
|
||||
|
||||
def __str__(self, header=True):
|
||||
name = adjust_name_for_printing(self.name) + "."
|
||||
constrs = self._constraints_str;
|
||||
|
|
|
|||
|
|
@ -58,7 +58,7 @@ class Gaussian(Prior):
|
|||
self.constant = -0.5 * np.log(2 * np.pi * self.sigma2)
|
||||
|
||||
def __str__(self):
|
||||
return "N(" + str(np.round(self.mu)) + ', ' + str(np.round(self.sigma2)) + ')'
|
||||
return "N({:.2g}, {:.2g})".format(self.mu, self.sigma)
|
||||
|
||||
def lnpdf(self, x):
|
||||
return self.constant - 0.5 * np.square(x - self.mu) / self.sigma2
|
||||
|
|
@ -89,7 +89,7 @@ class Uniform(Prior):
|
|||
self.upper = float(upper)
|
||||
|
||||
def __str__(self):
|
||||
return "[" + str(np.round(self.lower)) + ', ' + str(np.round(self.upper)) + ']'
|
||||
return "[{:.2g}, {:.2g}]".format(self.lower, self.upper)
|
||||
|
||||
def lnpdf(self, x):
|
||||
region = (x >= self.lower) * (x <= self.upper)
|
||||
|
|
@ -132,7 +132,7 @@ class LogGaussian(Prior):
|
|||
self.constant = -0.5 * np.log(2 * np.pi * self.sigma2)
|
||||
|
||||
def __str__(self):
|
||||
return "lnN(" + str(np.round(self.mu)) + ', ' + str(np.round(self.sigma2)) + ')'
|
||||
return "lnN({:.2g}, {:.2g})".format(self.mu, self.sigma)
|
||||
|
||||
def lnpdf(self, x):
|
||||
return self.constant - 0.5 * np.square(np.log(x) - self.mu) / self.sigma2 - np.log(x)
|
||||
|
|
@ -237,7 +237,7 @@ class Gamma(Prior):
|
|||
self.constant = -gammaln(self.a) + a * np.log(b)
|
||||
|
||||
def __str__(self):
|
||||
return "Ga(" + str(np.round(self.a)) + ', ' + str(np.round(self.b)) + ')'
|
||||
return "Ga({:.2g}, {:.2g})".format(self.a, self.b)
|
||||
|
||||
def summary(self):
|
||||
ret = {"E[x]": self.a / self.b, \
|
||||
|
|
@ -272,8 +272,7 @@ class Gamma(Prior):
|
|||
b = E / V
|
||||
return Gamma(a, b)
|
||||
|
||||
|
||||
class inverse_gamma(Prior):
|
||||
class InverseGamma(Prior):
|
||||
"""
|
||||
Implementation of the inverse-Gamma probability function, coupled with random variables.
|
||||
|
||||
|
|
@ -284,7 +283,7 @@ class inverse_gamma(Prior):
|
|||
|
||||
"""
|
||||
domain = _POSITIVE
|
||||
|
||||
_instances = []
|
||||
def __new__(cls, a, b): # Singleton:
|
||||
if cls._instances:
|
||||
cls._instances[:] = [instance for instance in cls._instances if instance()]
|
||||
|
|
@ -301,7 +300,7 @@ class inverse_gamma(Prior):
|
|||
self.constant = -gammaln(self.a) + a * np.log(b)
|
||||
|
||||
def __str__(self):
|
||||
return "iGa(" + str(np.round(self.a)) + ', ' + str(np.round(self.b)) + ')'
|
||||
return "iGa({:.2g}, {:.2g})".format(self.a, self.b)
|
||||
|
||||
def lnpdf(self, x):
|
||||
return self.constant - (self.a + 1) * np.log(x) - self.b / x
|
||||
|
|
@ -312,7 +311,6 @@ class inverse_gamma(Prior):
|
|||
def rvs(self, n):
|
||||
return 1. / np.random.gamma(scale=1. / self.b, shape=self.a, size=n)
|
||||
|
||||
|
||||
class DGPLVM_KFDA(Prior):
|
||||
"""
|
||||
Implementation of the Discriminative Gaussian Process Latent Variable function using
|
||||
|
|
@ -656,3 +654,64 @@ class DGPLVM(Prior):
|
|||
def __str__(self):
|
||||
return 'DGPLVM_prior'
|
||||
|
||||
class HalfT(Prior):
|
||||
"""
|
||||
Implementation of the half Student-t probability function, coupled with random variables.
|
||||
|
||||
:param A: scale parameter
|
||||
:param nu: degrees of freedom
|
||||
|
||||
"""
|
||||
domain = _POSITIVE
|
||||
_instances = []
|
||||
def __new__(cls, A, nu): # Singleton:
|
||||
if cls._instances:
|
||||
cls._instances[:] = [instance for instance in cls._instances if instance()]
|
||||
for instance in cls._instances:
|
||||
if instance().A == A and instance().nu == nu:
|
||||
return instance()
|
||||
o = super(Prior, cls).__new__(cls, A, nu)
|
||||
cls._instances.append(weakref.ref(o))
|
||||
return cls._instances[-1]()
|
||||
def __init__(self, A, nu):
|
||||
self.A = float(A)
|
||||
self.nu = float(nu)
|
||||
self.constant = gammaln(.5*(self.nu+1.)) - gammaln(.5*self.nu) - .5*np.log(np.pi*self.A*self.nu)
|
||||
|
||||
def __str__(self):
|
||||
return "hT({:.2g}, {:.2g})".format(self.A, self.nu)
|
||||
|
||||
def lnpdf(self,theta):
|
||||
return (theta>0) * ( self.constant -.5*(self.nu+1) * np.log( 1.+ (1./self.nu) * (theta/self.A)**2 ) )
|
||||
|
||||
#theta = theta if isinstance(theta,np.ndarray) else np.array([theta])
|
||||
#lnpdfs = np.zeros_like(theta)
|
||||
#theta = np.array([theta])
|
||||
#above_zero = theta.flatten()>1e-6
|
||||
#v = self.nu
|
||||
#sigma2=self.A
|
||||
#stop
|
||||
#lnpdfs[above_zero] = (+ gammaln((v + 1) * 0.5)
|
||||
# - gammaln(v * 0.5)
|
||||
# - 0.5*np.log(sigma2 * v * np.pi)
|
||||
# - 0.5*(v + 1)*np.log(1 + (1/np.float(v))*((theta[above_zero][0]**2)/sigma2))
|
||||
#)
|
||||
#return lnpdfs
|
||||
|
||||
def lnpdf_grad(self,theta):
|
||||
theta = theta if isinstance(theta,np.ndarray) else np.array([theta])
|
||||
grad = np.zeros_like(theta)
|
||||
above_zero = theta>1e-6
|
||||
v = self.nu
|
||||
sigma2=self.A
|
||||
grad[above_zero] = -0.5*(v+1)*(2*theta[above_zero])/(v*sigma2 + theta[above_zero][0]**2)
|
||||
return grad
|
||||
|
||||
def rvs(self, n):
|
||||
#return np.random.randn(n) * self.sigma + self.mu
|
||||
from scipy.stats import t
|
||||
#[np.abs(x) for x in t.rvs(df=4,loc=0,scale=50, size=10000)])
|
||||
ret = t.rvs(self.nu,loc=0,scale=self.A, size=n)
|
||||
ret[ret<0] = 0
|
||||
return ret
|
||||
|
||||
|
|
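The __str__ changes in the hunks above only affect how these priors print; attaching one to a parameter is unchanged. A hedged sketch (the model, kernel and particular parameters are illustrative):

import numpy as np
import GPy

X = np.random.rand(50, 1)
Y = np.sin(6 * X) + 0.1 * np.random.randn(50, 1)
m = GPy.models.GPRegression(X, Y)

# Gamma prior with expectation 1.0 and variance 10.0, i.e. Ga(0.1, 0.1)
# (from_EV is the constructor shown in the Gamma hunk above)
m.kern.lengthscale.set_prior(GPy.priors.Gamma.from_EV(1., 10.))
m.likelihood.variance.set_prior(GPy.priors.Gamma.from_EV(1., 10.))
m.optimize()   # optimization now maximizes the log posterior rather than the likelihood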
|
|||
|
|
@ -9,6 +9,11 @@ from .. import likelihoods
|
|||
from parameterization.variational import VariationalPosterior
|
||||
|
||||
import logging
|
||||
from GPy.inference.latent_function_inference.posterior import Posterior
|
||||
from GPy.inference.optimization.stochastics import SparseGPStochastics,\
|
||||
SparseGPMissing
|
||||
#no stochastics.py file added! from GPy.inference.optimization.stochastics import SparseGPStochastics,\
|
||||
#SparseGPMissing
|
||||
logger = logging.getLogger("sparse gp")
|
||||
|
||||
class SparseGP(GP):
|
||||
|
|
@ -34,12 +39,13 @@ class SparseGP(GP):
|
|||
|
||||
"""
|
||||
|
||||
def __init__(self, X, Y, Z, kernel, likelihood, inference_method=None, name='sparse gp', Y_metadata=None, normalizer=False):
|
||||
|
||||
def __init__(self, X, Y, Z, kernel, likelihood, inference_method=None,
|
||||
name='sparse gp', Y_metadata=None, normalizer=False,
|
||||
missing_data=False, stochastic=False, batchsize=1):
|
||||
#pick a sensible inference method
|
||||
if inference_method is None:
|
||||
if isinstance(likelihood, likelihoods.Gaussian):
|
||||
inference_method = var_dtc.VarDTC()
|
||||
inference_method = var_dtc.VarDTC(limit=1 if not self.missing_data else Y.shape[1])
|
||||
else:
|
||||
#inference_method = ??
|
||||
raise NotImplementedError, "what to do what to do?"
|
||||
|
|
@ -49,46 +55,281 @@ class SparseGP(GP):
|
|||
self.num_inducing = Z.shape[0]
|
||||
|
||||
GP.__init__(self, X, Y, kernel, likelihood, inference_method=inference_method, name=name, Y_metadata=Y_metadata, normalizer=normalizer)
|
||||
self.missing_data = missing_data
|
||||
|
||||
if stochastic and missing_data:
|
||||
self.missing_data = True
|
||||
self.ninan = ~np.isnan(Y)
|
||||
self.stochastics = SparseGPStochastics(self, batchsize)
|
||||
elif stochastic and not missing_data:
|
||||
self.missing_data = False
|
||||
self.stochastics = SparseGPStochastics(self, batchsize)
|
||||
elif missing_data:
|
||||
self.missing_data = True
|
||||
self.ninan = ~np.isnan(Y)
|
||||
self.stochastics = SparseGPMissing(self)
|
||||
else:
|
||||
self.stochastics = False
|
||||
|
||||
logger.info("Adding Z as parameter")
|
||||
self.link_parameter(self.Z, index=0)
|
||||
if self.missing_data:
|
||||
self.Ylist = []
|
||||
overall = self.Y_normalized.shape[1]
|
||||
m_f = lambda i: "Precomputing Y for missing data: {: >7.2%}".format(float(i+1)/overall)
|
||||
message = m_f(-1)
|
||||
print message,
|
||||
for d in xrange(overall):
|
||||
self.Ylist.append(self.Y_normalized[self.ninan[:, d], d][:, None])
|
||||
print ' '*(len(message)+1) + '\r',
|
||||
message = m_f(d)
|
||||
print message,
|
||||
print ''
|
||||
|
||||
self.posterior = None
|
||||
|
||||
def has_uncertain_inputs(self):
|
||||
return isinstance(self.X, VariationalPosterior)
|
||||
|
||||
def parameters_changed(self):
|
||||
self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.Z, self.likelihood, self.Y_normalized, self.Y_metadata)
|
||||
self.likelihood.update_gradients(self.grad_dict['dL_dthetaL'])
|
||||
if isinstance(self.X, VariationalPosterior):
|
||||
def _inner_parameters_changed(self, kern, X, Z, likelihood, Y, Y_metadata, Lm=None, dL_dKmm=None, subset_indices=None):
|
||||
"""
|
||||
This is the standard part, which usually belongs in parameters_changed.
|
||||
|
||||
For automatic handling of subsampling (such as missing_data, stochastics etc.), we need to put this into an inner
|
||||
loop, in order to ensure a different handling of gradients etc of different
|
||||
subsets of data.
|
||||
|
||||
The dict in current_values will be passed around as current_values for
|
||||
the rest of the algorithm, so this is the place to store current values,
|
||||
such as subsets etc, if necessary.
|
||||
|
||||
If Lm and dL_dKmm can be precomputed (or only need to be computed once)
|
||||
pass them in here, so they will be passed to the inference_method.
|
||||
|
||||
subset_indices is a dictionary of indices. You can put the indices however you
|
||||
like into this dictionary, for internal use of the indices inside the
|
||||
algorithm.
|
||||
"""
|
||||
posterior, log_marginal_likelihood, grad_dict = self.inference_method.inference(kern, X, Z, likelihood, Y, Y_metadata, Lm=Lm, dL_dKmm=None)
|
||||
current_values = {}
|
||||
likelihood.update_gradients(grad_dict['dL_dthetaL'])
|
||||
current_values['likgrad'] = likelihood.gradient.copy()
|
||||
if subset_indices is None:
|
||||
subset_indices = {}
|
||||
if isinstance(X, VariationalPosterior):
|
||||
#gradients wrt kernel
|
||||
dL_dKmm = self.grad_dict['dL_dKmm']
|
||||
self.kern.update_gradients_full(dL_dKmm, self.Z, None)
|
||||
target = self.kern.gradient.copy()
|
||||
self.kern.update_gradients_expectations(variational_posterior=self.X,
|
||||
Z=self.Z,
|
||||
dL_dpsi0=self.grad_dict['dL_dpsi0'],
|
||||
dL_dpsi1=self.grad_dict['dL_dpsi1'],
|
||||
dL_dpsi2=self.grad_dict['dL_dpsi2'])
|
||||
self.kern.gradient += target
|
||||
dL_dKmm = grad_dict['dL_dKmm']
|
||||
kern.update_gradients_full(dL_dKmm, Z, None)
|
||||
current_values['kerngrad'] = kern.gradient.copy()
|
||||
kern.update_gradients_expectations(variational_posterior=X,
|
||||
Z=Z,
|
||||
dL_dpsi0=grad_dict['dL_dpsi0'],
|
||||
dL_dpsi1=grad_dict['dL_dpsi1'],
|
||||
dL_dpsi2=grad_dict['dL_dpsi2'])
|
||||
current_values['kerngrad'] += kern.gradient
|
||||
|
||||
#gradients wrt Z
|
||||
self.Z.gradient = self.kern.gradients_X(dL_dKmm, self.Z)
|
||||
self.Z.gradient += self.kern.gradients_Z_expectations(
|
||||
self.grad_dict['dL_dpsi0'],
|
||||
self.grad_dict['dL_dpsi1'],
|
||||
self.grad_dict['dL_dpsi2'],
|
||||
Z=self.Z,
|
||||
variational_posterior=self.X)
|
||||
current_values['Zgrad'] = kern.gradients_X(dL_dKmm, Z)
|
||||
current_values['Zgrad'] += kern.gradients_Z_expectations(
|
||||
grad_dict['dL_dpsi0'],
|
||||
grad_dict['dL_dpsi1'],
|
||||
grad_dict['dL_dpsi2'],
|
||||
Z=Z,
|
||||
variational_posterior=X)
|
||||
else:
|
||||
#gradients wrt kernel
|
||||
self.kern.update_gradients_diag(self.grad_dict['dL_dKdiag'], self.X)
|
||||
target = self.kern.gradient.copy()
|
||||
self.kern.update_gradients_full(self.grad_dict['dL_dKnm'], self.X, self.Z)
|
||||
target += self.kern.gradient
|
||||
self.kern.update_gradients_full(self.grad_dict['dL_dKmm'], self.Z, None)
|
||||
self.kern.gradient += target
|
||||
kern.update_gradients_diag(grad_dict['dL_dKdiag'], X)
|
||||
current_values['kerngrad'] = kern.gradient.copy()
|
||||
kern.update_gradients_full(grad_dict['dL_dKnm'], X, Z)
|
||||
current_values['kerngrad'] += kern.gradient
|
||||
kern.update_gradients_full(grad_dict['dL_dKmm'], Z, None)
|
||||
current_values['kerngrad'] += kern.gradient
|
||||
#gradients wrt Z
|
||||
self.Z.gradient = self.kern.gradients_X(self.grad_dict['dL_dKmm'], self.Z)
|
||||
self.Z.gradient += self.kern.gradients_X(self.grad_dict['dL_dKnm'].T, self.Z, self.X)
|
||||
current_values['Zgrad'] = kern.gradients_X(grad_dict['dL_dKmm'], Z)
|
||||
current_values['Zgrad'] += kern.gradients_X(grad_dict['dL_dKnm'].T, Z, X)
|
||||
return posterior, log_marginal_likelihood, grad_dict, current_values, subset_indices
|
||||
|
||||
def _inner_take_over_or_update(self, full_values=None, current_values=None, value_indices=None):
|
||||
"""
|
||||
This is for automatic updates of values in the inner loop of missing
|
||||
data handling. Both arguments are dictionaries and the values in
|
||||
full_values will be updated by the current_gradients.
|
||||
|
||||
If a key from current_values does not exist in full_values, it will be
|
||||
initialized to the value in current_values.
|
||||
|
||||
If indices are needed for the update, value_indices can be used for
|
||||
that. If value_indices has the same key as current_values, the update
|
||||
in full_values will be indexed by the indices in value_indices.
|
||||
|
||||
grads:
|
||||
dictionary of standing gradients (you will have to carefully make sure that
|
||||
the ordering is right!). The values in here will be updated such that
|
||||
full_values[key] += current_values[key] forall key in full_gradients.keys()
|
||||
|
||||
gradients:
|
||||
dictionary of gradients in the current set of parameters.
|
||||
|
||||
value_indices:
|
||||
dictionary holding indices for the update in full_values.
|
||||
if the key exists the update rule is:

def df(x):
|
||||
m.stochastics.do_stochastics()
|
||||
grads = m._grads(x)
|
||||
print '\r',
|
||||
message = "Lik: {: 6.4E} Grad: {: 6.4E} Dim: {} Lik: {} Len: {!s}".format(float(m.log_likelihood()), np.einsum('i,i->', grads, grads), m.stochastics.d, float(m.likelihood.variance), " ".join(["{:3.2E}".format(l) for l in m.kern.lengthscale.values]))
|
||||
print message,
|
||||
return grads
|
||||
|
||||
def grad_stop(threshold):
|
||||
def inner(args):
|
||||
g = args['gradient']
|
||||
return np.sqrt(np.einsum('i,i->',g,g)) < threshold
|
||||
return inner
|
||||
|
||||
def maxiter_stop(maxiter):
|
||||
def inner(args):
|
||||
return args['n_iter'] == maxiter
|
||||
return inner
|
||||
|
||||
def optimize(m, maxiter=1000):
|
||||
#opt = climin.RmsProp(m.optimizer_array.copy(), df, 1e-6, decay=0.9, momentum=0.9, step_adapt=1e-7)
|
||||
opt = climin.Adadelta(m.optimizer_array.copy(), df, 1e-2, decay=0.9)
|
||||
ret = opt.minimize_until((grad_stop(.1), maxiter_stop(maxiter)))
|
||||
print
|
||||
return ret
|
||||
full_values[key][value_indices[key]] += current_values[key]
|
||||
"""
|
||||
for key in current_values.keys():
|
||||
if value_indices is not None and value_indices.has_key(key):
|
||||
index = value_indices[key]
|
||||
else:
|
||||
index = slice(None)
|
||||
if full_values.has_key(key):
|
||||
full_values[key][index] += current_values[key]
|
||||
else:
|
||||
full_values[key] = current_values[key]
|
||||
|
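A small standalone sketch of the take-over-or-accumulate rule this docstring describes (dictionary contents are illustrative, not GPy API):

import numpy as np

full_values = {'kerngrad': np.zeros(3)}
current_values = {'kerngrad': np.array([1., 2., 3.]), 'Zgrad': np.array([.5, .5])}
value_indices = {}          # empty: every update uses the full slice

for key in current_values:
    index = value_indices.get(key, slice(None))
    if key in full_values:
        full_values[key][index] += current_values[key]   # accumulate into existing entry
    else:
        full_values[key] = current_values[key]           # first occurrence: take it over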
||||
def _inner_values_update(self, current_values):
|
||||
"""
|
||||
This exists if there is more to do with the current values.
|
||||
It will always be called in the inner loop, so that
|
||||
you can do additional inner updates for the inside of the missing data
|
||||
loop etc. This can also be used for stochastic updates, when only working on
|
||||
one dimension of the output.
|
||||
"""
|
||||
pass
|
||||
|
||||
def _outer_values_update(self, full_values):
|
||||
"""
|
||||
Here you put the values collected before into the right places.
|
||||
E.g. set the gradients of parameters, etc.
|
||||
"""
|
||||
self.likelihood.gradient = full_values['likgrad']
|
||||
self.kern.gradient = full_values['kerngrad']
|
||||
self.Z.gradient = full_values['Zgrad']
|
||||
|
||||
def _outer_init_full_values(self):
|
||||
"""
|
||||
If full_values has indices in value_indices, we might want to initialize
|
||||
the full_values differently, so that subsetting is possible.
|
||||
|
||||
Here you can initialize the full_values for the values needed.
|
||||
|
||||
Keep in mind that if a key does not exist in full_values when updating
|
||||
values, it will be set (so e.g. for Z there is no need to initialize Zgrad,
|
||||
as there is no subsetting needed. For X in BGPLVM on the other hand we probably need
|
||||
to initialize the gradients for the mean and the variance in order to
|
||||
have the full gradient for indexing)
|
||||
"""
|
||||
return {}
|
||||
|
||||
def _outer_loop_for_missing_data(self):
|
||||
Lm = None
|
||||
dL_dKmm = None
|
||||
|
||||
self._log_marginal_likelihood = 0
|
||||
full_values = self._outer_init_full_values()
|
||||
|
||||
if self.posterior is None:
|
||||
woodbury_inv = np.zeros((self.num_inducing, self.num_inducing, self.output_dim))
|
||||
woodbury_vector = np.zeros((self.num_inducing, self.output_dim))
|
||||
else:
|
||||
woodbury_inv = self.posterior._woodbury_inv
|
||||
woodbury_vector = self.posterior._woodbury_vector
|
||||
|
||||
if not self.stochastics:
|
||||
m_f = lambda i: "Inference with missing_data: {: >7.2%}".format(float(i+1)/self.output_dim)
|
||||
message = m_f(-1)
|
||||
print message,
|
||||
|
||||
for d in self.stochastics.d:
|
||||
ninan = self.ninan[:, d]
|
||||
|
||||
if not self.stochastics:
|
||||
print ' '*(len(message)) + '\r',
|
||||
message = m_f(d)
|
||||
print message,
|
||||
|
||||
posterior, log_marginal_likelihood, \
|
||||
grad_dict, current_values, value_indices = self._inner_parameters_changed(
|
||||
self.kern, self.X[ninan],
|
||||
self.Z, self.likelihood,
|
||||
self.Ylist[d], self.Y_metadata,
|
||||
Lm, dL_dKmm,
|
||||
subset_indices=dict(outputs=d, samples=ninan))
|
||||
|
||||
self._inner_take_over_or_update(full_values, current_values, value_indices)
|
||||
self._inner_values_update(current_values)
|
||||
|
||||
Lm = posterior.K_chol
|
||||
dL_dKmm = grad_dict['dL_dKmm']
|
||||
woodbury_inv[:, :, d] = posterior.woodbury_inv
|
||||
woodbury_vector[:, d:d+1] = posterior.woodbury_vector
|
||||
self._log_marginal_likelihood += log_marginal_likelihood
|
||||
if not self.stochastics:
|
||||
print ''
|
||||
|
||||
if self.posterior is None:
|
||||
self.posterior = Posterior(woodbury_inv=woodbury_inv, woodbury_vector=woodbury_vector,
|
||||
K=posterior._K, mean=None, cov=None, K_chol=posterior.K_chol)
|
||||
self._outer_values_update(full_values)
|
||||
|
||||
def _outer_loop_without_missing_data(self):
|
||||
self._log_marginal_likelihood = 0
|
||||
|
||||
if self.posterior is None:
|
||||
woodbury_inv = np.zeros((self.num_inducing, self.num_inducing, self.output_dim))
|
||||
woodbury_vector = np.zeros((self.num_inducing, self.output_dim))
|
||||
else:
|
||||
woodbury_inv = self.posterior._woodbury_inv
|
||||
woodbury_vector = self.posterior._woodbury_vector
|
||||
|
||||
d = self.stochastics.d
|
||||
posterior, log_marginal_likelihood, \
|
||||
grad_dict, current_values, _ = self._inner_parameters_changed(
|
||||
self.kern, self.X,
|
||||
self.Z, self.likelihood,
|
||||
self.Y_normalized[:, d], self.Y_metadata)
|
||||
self.grad_dict = grad_dict
|
||||
|
||||
self._log_marginal_likelihood += log_marginal_likelihood
|
||||
|
||||
self._outer_values_update(current_values)
|
||||
|
||||
woodbury_inv[:, :, d] = posterior.woodbury_inv[:, :, None]
|
||||
woodbury_vector[:, d] = posterior.woodbury_vector
|
||||
if self.posterior is None:
|
||||
self.posterior = Posterior(woodbury_inv=woodbury_inv, woodbury_vector=woodbury_vector,
|
||||
K=posterior._K, mean=None, cov=None, K_chol=posterior.K_chol)
|
||||
|
||||
def parameters_changed(self):
|
||||
if self.missing_data:
|
||||
self._outer_loop_for_missing_data()
|
||||
elif self.stochastics:
|
||||
self._outer_loop_without_missing_data()
|
||||
else:
|
||||
self.posterior, self._log_marginal_likelihood, self.grad_dict, full_values, _ = self._inner_parameters_changed(self.kern, self.X, self.Z, self.likelihood, self.Y_normalized, self.Y_metadata)
|
||||
self._outer_values_update(full_values)
|
||||
|
||||
def _raw_predict(self, Xnew, full_cov=False, kern=None):
|
||||
"""
|
||||
|
|
|
|||
|
|
@ -34,7 +34,8 @@ class SparseGP_MPI(SparseGP):
|
|||
|
||||
"""
|
||||
|
||||
def __init__(self, X, Y, Z, kernel, likelihood, variational_prior=None, inference_method=None, name='sparse gp mpi', Y_metadata=None, mpi_comm=None, normalizer=False):
|
||||
def __init__(self, X, Y, Z, kernel, likelihood, variational_prior=None, inference_method=None, name='sparse gp mpi', Y_metadata=None, mpi_comm=None, normalizer=False,
|
||||
missing_data=False, stochastic=False, batchsize=1):
|
||||
self._IN_OPTIMIZATION_ = False
|
||||
if mpi_comm != None:
|
||||
if inference_method is None:
|
||||
|
|
@ -42,7 +43,8 @@ class SparseGP_MPI(SparseGP):
|
|||
else:
|
||||
assert isinstance(inference_method, VarDTC_minibatch), 'inference_method has to support MPI!'
|
||||
|
||||
super(SparseGP_MPI, self).__init__(X, Y, Z, kernel, likelihood, inference_method=inference_method, name=name, Y_metadata=Y_metadata, normalizer=normalizer)
|
||||
super(SparseGP_MPI, self).__init__(X, Y, Z, kernel, likelihood, inference_method=inference_method, name=name, Y_metadata=Y_metadata, normalizer=normalizer,
|
||||
missing_data=missing_data, stochastic=stochastic, batchsize=batchsize)
|
||||
self.update_model(False)
|
||||
self.link_parameter(self.X, index=0)
|
||||
if variational_prior is not None:
|
||||
|
|
|
|||
|
|
@ -42,10 +42,10 @@ class SVIGP(GP):
|
|||
"""
|
||||
|
||||
|
||||
def __init__(self, X, likelihood, kernel, Z, q_u=None, batchsize=10, X_variance=None):
|
||||
GP.__init__(self, X, likelihood, kernel, normalize_X=False)
|
||||
def __init__(self, X, Y, kernel, Z, q_u=None, batchsize=10):
|
||||
raise NotImplementedError, "This is a work in progress, see github issue "
|
||||
GP.__init__(self, X, Y, kernel)
|
||||
self.batchsize=batchsize
|
||||
self.Y = self.likelihood.Y.copy()
|
||||
self.Z = Z
|
||||
self.num_inducing = Z.shape[0]
|
||||
|
||||
|
|
@ -57,15 +57,9 @@ class SVIGP(GP):
|
|||
self.param_steplength = 1e-5
|
||||
self.momentum = 0.9
|
||||
|
||||
if X_variance is None:
|
||||
self.has_uncertain_inputs = False
|
||||
else:
|
||||
self.has_uncertain_inputs = True
|
||||
self.X_variance = X_variance
|
||||
|
||||
|
||||
if q_u is None:
|
||||
q_u = np.hstack((np.random.randn(self.num_inducing*self.output_dim),-.5*np.eye(self.num_inducing).flatten()))
|
||||
|
||||
self.set_vb_param(q_u)
|
||||
|
||||
self._permutation = np.random.permutation(self.num_data)
|
||||
|
|
@ -76,79 +70,22 @@ class SVIGP(GP):
|
|||
self._grad_trace = []
|
||||
|
||||
#set the adaptive steplength parameters
|
||||
self.hbar_t = 0.0
|
||||
self.tau_t = 100.0
|
||||
self.gbar_t = 0.0
|
||||
self.gbar_t1 = 0.0
|
||||
self.gbar_t2 = 0.0
|
||||
self.hbar_tp = 0.0
|
||||
self.tau_tp = 10000.0
|
||||
self.gbar_tp = 0.0
|
||||
self.adapt_param_steplength = True
|
||||
self.adapt_vb_steplength = True
|
||||
self._param_steplength_trace = []
|
||||
self._vb_steplength_trace = []
|
||||
|
||||
# def _getstate(self):
|
||||
# steplength_params = [self.hbar_t, self.tau_t, self.gbar_t, self.gbar_t1, self.gbar_t2, self.hbar_tp, self.tau_tp, self.gbar_tp, self.adapt_param_steplength, self.adapt_vb_steplength, self.vb_steplength, self.param_steplength]
|
||||
# return GP._getstate(self) + \
|
||||
# [self.get_vb_param(),
|
||||
# self.Z,
|
||||
# self.num_inducing,
|
||||
# self.has_uncertain_inputs,
|
||||
# self.X_variance,
|
||||
# self.X_batch,
|
||||
# self.X_variance_batch,
|
||||
# steplength_params,
|
||||
# self.batchcounter,
|
||||
# self.batchsize,
|
||||
# self.epochs,
|
||||
# self.momentum,
|
||||
# self.data_prop,
|
||||
# self._param_trace,
|
||||
# self._param_steplength_trace,
|
||||
# self._vb_steplength_trace,
|
||||
# self._ll_trace,
|
||||
# self._grad_trace,
|
||||
# self.Y,
|
||||
# self._permutation,
|
||||
# self.iterations
|
||||
# ]
|
||||
#
|
||||
# def _setstate(self, state):
|
||||
# self.iterations = state.pop()
|
||||
# self._permutation = state.pop()
|
||||
# self.Y = state.pop()
|
||||
# self._grad_trace = state.pop()
|
||||
# self._ll_trace = state.pop()
|
||||
# self._vb_steplength_trace = state.pop()
|
||||
# self._param_steplength_trace = state.pop()
|
||||
# self._param_trace = state.pop()
|
||||
# self.data_prop = state.pop()
|
||||
# self.momentum = state.pop()
|
||||
# self.epochs = state.pop()
|
||||
# self.batchsize = state.pop()
|
||||
# self.batchcounter = state.pop()
|
||||
# steplength_params = state.pop()
|
||||
# (self.hbar_t, self.tau_t, self.gbar_t, self.gbar_t1, self.gbar_t2, self.hbar_tp, self.tau_tp, self.gbar_tp, self.adapt_param_steplength, self.adapt_vb_steplength, self.vb_steplength, self.param_steplength) = steplength_params
|
||||
# self.X_variance_batch = state.pop()
|
||||
# self.X_batch = state.pop()
|
||||
# self.X_variance = state.pop()
|
||||
# self.has_uncertain_inputs = state.pop()
|
||||
# self.num_inducing = state.pop()
|
||||
# self.Z = state.pop()
|
||||
# vb_param = state.pop()
|
||||
# GP._setstate(self, state)
|
||||
# self.set_vb_param(vb_param)
|
||||
#self.hbar_t = 0.0
|
||||
#self.tau_t = 100.0
|
||||
#self.gbar_t = 0.0
|
||||
#self.gbar_t1 = 0.0
|
||||
#self.gbar_t2 = 0.0
|
||||
#self.hbar_tp = 0.0
|
||||
#self.tau_tp = 10000.0
|
||||
#self.gbar_tp = 0.0
|
||||
#self.adapt_param_steplength = True
|
||||
#self.adapt_vb_steplength = True
|
||||
#self._param_steplength_trace = []
|
||||
#self._vb_steplength_trace = []
|
||||
|
||||
def _compute_kernel_matrices(self):
|
||||
# kernel computations, using BGPLVM notation
|
||||
self.Kmm = self.kern.K(self.Z)
|
||||
if self.has_uncertain_inputs:
|
||||
self.psi0 = self.kern.psi0(self.Z, self.X_batch, self.X_variance_batch)
|
||||
self.psi1 = self.kern.psi1(self.Z, self.X_batch, self.X_variance_batch)
|
||||
self.psi2 = self.kern.psi2(self.Z, self.X_batch, self.X_variance_batch)
|
||||
else:
|
||||
self.psi0 = self.kern.Kdiag(self.X_batch)
|
||||
self.psi1 = self.kern.K(self.X_batch, self.Z)
|
||||
self.psi2 = None
|
||||
|
|
@ -179,7 +116,7 @@ class SVIGP(GP):
|
|||
|
||||
def load_batch(self):
|
||||
"""
|
||||
load a batch of data (set self.X_batch and self.likelihood.Y from self.X, self.Y)
|
||||
load a batch of data (set self.X_batch and self.Y_batch from self.X, self.Y)
|
||||
"""
|
||||
|
||||
#if we've seen all the data, start again with them in a new random order
|
||||
|
|
@ -191,9 +128,7 @@ class SVIGP(GP):
|
|||
this_perm = self._permutation[self.batchcounter:self.batchcounter+self.batchsize]
|
||||
|
||||
self.X_batch = self.X[this_perm]
|
||||
self.likelihood.set_data(self.Y[this_perm])
|
||||
if self.has_uncertain_inputs:
|
||||
self.X_variance_batch = self.X_variance[this_perm]
|
||||
self.Y_batch = self.Y[this_perm]
|
||||
|
||||
self.batchcounter += self.batchsize
|
||||
|
||||
|
|
@ -288,7 +223,9 @@ class SVIGP(GP):
|
|||
Compute the gradients of the lower bound wrt the canonical and
|
||||
Expectation parameters of u.
|
||||
|
||||
Note that the natural gradient in either is given by the gradient in the other (See Hensman et al 2012 Fast Variational inference in the conjugate exponential Family)
|
||||
Note that the natural gradient in either is given by the gradient in
|
||||
the other (See Hensman et al 2012 Fast Variational inference in the
|
||||
conjugate exponential Family)
|
||||
"""
|
||||
|
||||
# Gradient for eta
|
||||
|
|
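A brief restatement of the identity referenced in the docstring above (Hensman et al., 2012): for an exponential-family q(u) with natural parameters $\theta$, expectation parameters $\eta$, Fisher information $G(\theta)$ and lower bound $\mathcal{L}$,

$$\tilde{\nabla}_{\theta}\mathcal{L} \;=\; G(\theta)^{-1}\nabla_{\theta}\mathcal{L} \;=\; \nabla_{\eta}\mathcal{L}, \qquad \tilde{\nabla}_{\eta}\mathcal{L} \;=\; \nabla_{\theta}\mathcal{L},$$

so computing the ordinary gradient in one parameterization yields the natural gradient in the other.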
|
|||
|
|
@ -12,7 +12,7 @@ from sympy.utilities.iterables import numbered_symbols
|
|||
import scipy
|
||||
import GPy
|
||||
#from scipy.special import gammaln, gamma, erf, erfc, erfcx, polygamma
|
||||
from GPy.util.symbolic import normcdf, normcdfln, logistic, logisticln, erfcx, erfc, gammaln
|
||||
#@NDL you removed this file! #from GPy.util.symbolic import normcdf, normcdfln, logistic, logisticln, erfcx, erfc, gammaln
|
||||
def getFromDict(dataDict, mapList):
|
||||
return reduce(lambda d, k: d[k], mapList, dataDict)
|
||||
|
||||
|
|
|
|||
|
|
@ -19,4 +19,9 @@ dir=$HOME/tmp/GPy-datasets/
|
|||
# if you have an anaconda python installation please specify it here.
|
||||
installed = False
|
||||
location = None
|
||||
MKL = False # set this to true if you have the MKL optimizations installed
|
||||
# set this to true if you have the MKL optimizations installed:
|
||||
MKL = False
|
||||
|
||||
[weave]
|
||||
#if true, try to use weave, and fall back to numpy. if false, just use numpy.
|
||||
working = True
|
||||
|
|
|
|||
|
|
@ -7,11 +7,6 @@ Gaussian Processes classification
|
|||
"""
|
||||
import GPy
|
||||
|
||||
try:
|
||||
import pylab as pb
|
||||
except:
|
||||
pass
|
||||
|
||||
default_seed = 10000
|
||||
|
||||
def oil(num_inducing=50, max_iters=100, kernel=None, optimize=True, plot=True):
|
||||
|
|
@ -19,7 +14,9 @@ def oil(num_inducing=50, max_iters=100, kernel=None, optimize=True, plot=True):
|
|||
Run a Gaussian process classification on the three phase oil data. The demonstration calls the basic GP classification model and uses EP to approximate the likelihood.
|
||||
|
||||
"""
|
||||
data = GPy.util.datasets.oil()
|
||||
try:import pods
|
||||
except ImportError:print 'pods unavailable, see https://github.com/sods/ods for example datasets'
|
||||
data = pods.datasets.oil()
|
||||
X = data['X']
|
||||
Xtest = data['Xtest']
|
||||
Y = data['Y'][:, 0:1]
|
||||
|
|
@ -54,7 +51,9 @@ def toy_linear_1d_classification(seed=default_seed, optimize=True, plot=True):
|
|||
|
||||
"""
|
||||
|
||||
data = GPy.util.datasets.toy_linear_1d_classification(seed=seed)
|
||||
try:import pods
|
||||
except ImportError:print 'pods unavailable, see https://github.com/sods/ods for example datasets'
|
||||
data = pods.datasets.toy_linear_1d_classification(seed=seed)
|
||||
Y = data['Y'][:, 0:1]
|
||||
Y[Y.flatten() == -1] = 0
|
||||
|
||||
|
|
@ -71,7 +70,8 @@ def toy_linear_1d_classification(seed=default_seed, optimize=True, plot=True):
|
|||
|
||||
# Plot
|
||||
if plot:
|
||||
fig, axes = pb.subplots(2, 1)
|
||||
from matplotlib import pyplot as plt
|
||||
fig, axes = plt.subplots(2, 1)
|
||||
m.plot_f(ax=axes[0])
|
||||
m.plot(ax=axes[1])
|
||||
|
||||
|
|
@ -87,7 +87,9 @@ def toy_linear_1d_classification_laplace(seed=default_seed, optimize=True, plot=
|
|||
|
||||
"""
|
||||
|
||||
data = GPy.util.datasets.toy_linear_1d_classification(seed=seed)
|
||||
try:import pods
|
||||
except ImportError:print 'pods unavailable, see https://github.com/sods/ods for example datasets'
|
||||
data = pods.datasets.toy_linear_1d_classification(seed=seed)
|
||||
Y = data['Y'][:, 0:1]
|
||||
Y[Y.flatten() == -1] = 0
|
||||
|
||||
|
|
@ -107,7 +109,8 @@ def toy_linear_1d_classification_laplace(seed=default_seed, optimize=True, plot=
|
|||
|
||||
# Plot
|
||||
if plot:
|
||||
fig, axes = pb.subplots(2, 1)
|
||||
from matplotlib import pyplot as plt
|
||||
fig, axes = plt.subplots(2, 1)
|
||||
m.plot_f(ax=axes[0])
|
||||
m.plot(ax=axes[1])
|
||||
|
||||
|
|
@ -123,7 +126,9 @@ def sparse_toy_linear_1d_classification(num_inducing=10, seed=default_seed, opti
|
|||
|
||||
"""
|
||||
|
||||
data = GPy.util.datasets.toy_linear_1d_classification(seed=seed)
|
||||
try:import pods
|
||||
except ImportError:print 'pods unavailable, see https://github.com/sods/ods for example datasets'
|
||||
data = pods.datasets.toy_linear_1d_classification(seed=seed)
|
||||
Y = data['Y'][:, 0:1]
|
||||
Y[Y.flatten() == -1] = 0
|
||||
|
||||
|
|
@ -137,7 +142,8 @@ def sparse_toy_linear_1d_classification(num_inducing=10, seed=default_seed, opti
|
|||
|
||||
# Plot
|
||||
if plot:
|
||||
fig, axes = pb.subplots(2, 1)
|
||||
from matplotlib import pyplot as plt
|
||||
fig, axes = plt.subplots(2, 1)
|
||||
m.plot_f(ax=axes[0])
|
||||
m.plot(ax=axes[1])
|
||||
|
||||
|
|
@ -153,7 +159,9 @@ def toy_heaviside(seed=default_seed, optimize=True, plot=True):
|
|||
|
||||
"""
|
||||
|
||||
data = GPy.util.datasets.toy_linear_1d_classification(seed=seed)
|
||||
try:import pods
|
||||
except ImportError:print 'pods unavailable, see https://github.com/sods/ods for example datasets'
|
||||
data = pods.datasets.toy_linear_1d_classification(seed=seed)
|
||||
Y = data['Y'][:, 0:1]
|
||||
Y[Y.flatten() == -1] = 0
|
||||
|
||||
|
|
@ -171,7 +179,8 @@ def toy_heaviside(seed=default_seed, optimize=True, plot=True):
|
|||
|
||||
# Plot
|
||||
if plot:
|
||||
fig, axes = pb.subplots(2, 1)
|
||||
from matplotlib import pyplot as plt
|
||||
fig, axes = plt.subplots(2, 1)
|
||||
m.plot_f(ax=axes[0])
|
||||
m.plot(ax=axes[1])
|
||||
|
||||
|
|
@ -190,7 +199,9 @@ def crescent_data(model_type='Full', num_inducing=10, seed=default_seed, kernel=
|
|||
:param kernel: kernel to use in the model
|
||||
:type kernel: a GPy kernel
|
||||
"""
|
||||
data = GPy.util.datasets.crescent_data(seed=seed)
|
||||
try:import pods
|
||||
except ImportError:print 'pods unavailable, see https://github.com/sods/ods for example datasets'
|
||||
data = pods.datasets.crescent_data(seed=seed)
|
||||
Y = data['Y']
|
||||
Y[Y.flatten()==-1] = 0
|
||||
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
import numpy as _np
|
||||
|
||||
#default_seed = _np.random.seed(123344)
|
||||
|
||||
def bgplvm_test_model(optimize=False, verbose=1, plot=False, output_dim=200, nan=False):
|
||||
|
|
@ -23,9 +24,6 @@ def bgplvm_test_model(optimize=False, verbose=1, plot=False, output_dim=200, nan
|
|||
X = _np.random.rand(num_inputs, input_dim)
|
||||
lengthscales = _np.random.rand(input_dim)
|
||||
k = GPy.kern.RBF(input_dim, .5, lengthscales, ARD=True)
|
||||
##+ GPy.kern.white(input_dim, 0.01)
|
||||
#)
|
||||
#k = GPy.kern.Linear(input_dim, ARD=1)# + GPy.kern.bias(input_dim) + GPy.kern.white(input_dim, 0.00001)
|
||||
K = k.K(X)
|
||||
Y = _np.random.multivariate_normal(_np.zeros(num_inputs), K, (output_dim,)).T
|
||||
|
||||
|
|
@ -71,7 +69,8 @@ def bgplvm_test_model(optimize=False, verbose=1, plot=False, output_dim=200, nan
|
|||
|
||||
def gplvm_oil_100(optimize=True, verbose=1, plot=True):
|
||||
import GPy
|
||||
data = GPy.util.datasets.oil_100()
|
||||
import pods
|
||||
data = pods.datasets.oil_100()
|
||||
Y = data['X']
|
||||
# create simple GP model
|
||||
kernel = GPy.kern.RBF(6, ARD=True) + GPy.kern.Bias(6)
|
||||
|
|
@ -83,8 +82,10 @@ def gplvm_oil_100(optimize=True, verbose=1, plot=True):
|
|||
|
||||
def sparse_gplvm_oil(optimize=True, verbose=0, plot=True, N=100, Q=6, num_inducing=15, max_iters=50):
|
||||
import GPy
|
||||
import pods
|
||||
|
||||
_np.random.seed(0)
|
||||
data = GPy.util.datasets.oil()
|
||||
data = pods.datasets.oil()
|
||||
Y = data['X'][:N]
|
||||
Y = Y - Y.mean(0)
|
||||
Y /= Y.std(0)
|
||||
|
|
@ -101,7 +102,7 @@ def sparse_gplvm_oil(optimize=True, verbose=0, plot=True, N=100, Q=6, num_induci
|
|||
|
||||
def swiss_roll(optimize=True, verbose=1, plot=True, N=1000, num_inducing=25, Q=4, sigma=.2):
|
||||
import GPy
|
||||
from GPy.util.datasets import swiss_roll_generated
|
||||
from pods.datasets import swiss_roll_generated
|
||||
from GPy.models import BayesianGPLVM
|
||||
|
||||
data = swiss_roll_generated(num_samples=N, sigma=sigma)
|
||||
|
|
@ -159,11 +160,11 @@ def swiss_roll(optimize=True, verbose=1, plot=True, N=1000, num_inducing=25, Q=4
|
|||
def bgplvm_oil(optimize=True, verbose=1, plot=True, N=200, Q=7, num_inducing=40, max_iters=1000, **k):
|
||||
import GPy
|
||||
from matplotlib import pyplot as plt
|
||||
from ..util.misc import param_to_array
|
||||
import numpy as np
|
||||
import pods
|
||||
|
||||
_np.random.seed(0)
|
||||
data = GPy.util.datasets.oil()
|
||||
data = pods.datasets.oil()
|
||||
|
||||
kernel = GPy.kern.RBF(Q, 1., 1./_np.random.uniform(0,1,(Q,)), ARD=True)# + GPy.kern.Bias(Q, _np.exp(-2))
|
||||
Y = data['X'][:N]
|
||||
|
|
@ -177,7 +178,7 @@ def bgplvm_oil(optimize=True, verbose=1, plot=True, N=200, Q=7, num_inducing=40,
|
|||
fig, (latent_axes, sense_axes) = plt.subplots(1, 2)
|
||||
m.plot_latent(ax=latent_axes, labels=m.data_labels)
|
||||
data_show = GPy.plotting.matplot_dep.visualize.vector_show((m.Y[0,:]))
|
||||
lvm_visualizer = GPy.plotting.matplot_dep.visualize.lvm_dimselect(param_to_array(m.X.mean)[0:1,:], # @UnusedVariable
|
||||
lvm_visualizer = GPy.plotting.matplot_dep.visualize.lvm_dimselect(m.X.mean.values[0:1,:], # @UnusedVariable
|
||||
m, data_show, latent_axes=latent_axes, sense_axes=sense_axes, labels=m.data_labels)
|
||||
raw_input('Press enter to finish')
|
||||
plt.close(fig)
|
||||
|
|
@ -186,11 +187,10 @@ def bgplvm_oil(optimize=True, verbose=1, plot=True, N=200, Q=7, num_inducing=40,
|
|||
def ssgplvm_oil(optimize=True, verbose=1, plot=True, N=200, Q=7, num_inducing=40, max_iters=1000, **k):
|
||||
import GPy
|
||||
from matplotlib import pyplot as plt
|
||||
from ..util.misc import param_to_array
|
||||
import numpy as np
|
||||
import pods
|
||||
|
||||
_np.random.seed(0)
|
||||
data = GPy.util.datasets.oil()
|
||||
data = pods.datasets.oil()
|
||||
|
||||
kernel = GPy.kern.RBF(Q, 1., 1./_np.random.uniform(0,1,(Q,)), ARD=True)# + GPy.kern.Bias(Q, _np.exp(-2))
|
||||
Y = data['X'][:N]
|
||||
|
|
@ -204,13 +204,50 @@ def ssgplvm_oil(optimize=True, verbose=1, plot=True, N=200, Q=7, num_inducing=40
|
|||
fig, (latent_axes, sense_axes) = plt.subplots(1, 2)
|
||||
m.plot_latent(ax=latent_axes, labels=m.data_labels)
|
||||
data_show = GPy.plotting.matplot_dep.visualize.vector_show((m.Y[0,:]))
|
||||
lvm_visualizer = GPy.plotting.matplot_dep.visualize.lvm_dimselect(param_to_array(m.X.mean)[0:1,:], # @UnusedVariable
|
||||
lvm_visualizer = GPy.plotting.matplot_dep.visualize.lvm_dimselect(m.X.mean.values[0:1,:], # @UnusedVariable
|
||||
m, data_show, latent_axes=latent_axes, sense_axes=sense_axes, labels=m.data_labels)
|
||||
raw_input('Press enter to finish')
|
||||
plt.close(fig)
|
||||
return m
|
||||
|
||||
def _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim=False):
|
||||
def _simulate_matern(D1, D2, D3, N, num_inducing, plot_sim=False):
|
||||
Q_signal = 4
|
||||
import GPy
|
||||
import numpy as np
|
||||
np.random.seed(3000)
|
||||
|
||||
k = GPy.kern.Matern32(Q_signal, 10., lengthscale=1+(np.random.uniform(1,6,Q_signal)), ARD=1)
|
||||
t = np.c_[[np.linspace(-1,5,N) for _ in range(Q_signal)]].T
|
||||
K = k.K(t)
|
||||
s2, s1, s3, sS = np.random.multivariate_normal(np.zeros(K.shape[0]), K, size=(4))[:,:,None]
|
||||
|
||||
Y1, Y2, Y3, S1, S2, S3 = _generate_high_dimensional_output(D1, D2, D3, s1, s2, s3, sS)
|
||||
|
||||
slist = [sS, s1, s2, s3]
|
||||
slist_names = ["sS", "s1", "s2", "s3"]
|
||||
Ylist = [Y1, Y2, Y3]
|
||||
|
||||
if plot_sim:
|
||||
from matplotlib import pyplot as plt
|
||||
import matplotlib.cm as cm
|
||||
import itertools
|
||||
fig = plt.figure("MRD Simulation Data", figsize=(8, 6))
|
||||
fig.clf()
|
||||
ax = fig.add_subplot(2, 1, 1)
|
||||
labls = slist_names
|
||||
for S, lab in itertools.izip(slist, labls):
|
||||
ax.plot(S, label=lab)
|
||||
ax.legend()
|
||||
for i, Y in enumerate(Ylist):
|
||||
ax = fig.add_subplot(2, len(Ylist), len(Ylist) + 1 + i)
|
||||
ax.imshow(Y, aspect='auto', cmap=cm.gray) # @UndefinedVariable
|
||||
ax.set_title("Y{}".format(i + 1))
|
||||
plt.draw()
|
||||
plt.tight_layout()
|
||||
|
||||
return slist, [S1, S2, S3], Ylist
|
||||
|
||||
def _simulate_sincos(D1, D2, D3, N, num_inducing, plot_sim=False):
|
||||
_np.random.seed(1234)
|
||||
|
||||
x = _np.linspace(0, 4 * _np.pi, N)[:, None]
|
||||
|
|
@ -229,34 +266,17 @@ def _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim=False):
|
|||
s3 -= s3.mean(); s3 /= s3.std(0)
|
||||
sS -= sS.mean(); sS /= sS.std(0)
|
||||
|
||||
S1 = _np.hstack([s1, sS])
|
||||
S2 = _np.hstack([s2, s3, sS])
|
||||
S3 = _np.hstack([s3, sS])
|
||||
|
||||
Y1 = S1.dot(_np.random.randn(S1.shape[1], D1))
|
||||
Y2 = S2.dot(_np.random.randn(S2.shape[1], D2))
|
||||
Y3 = S3.dot(_np.random.randn(S3.shape[1], D3))
|
||||
|
||||
Y1 += .3 * _np.random.randn(*Y1.shape)
|
||||
Y2 += .2 * _np.random.randn(*Y2.shape)
|
||||
Y3 += .25 * _np.random.randn(*Y3.shape)
|
||||
|
||||
Y1 -= Y1.mean(0)
|
||||
Y2 -= Y2.mean(0)
|
||||
Y3 -= Y3.mean(0)
|
||||
Y1 /= Y1.std(0)
|
||||
Y2 /= Y2.std(0)
|
||||
Y3 /= Y3.std(0)
|
||||
Y1, Y2, Y3, S1, S2, S3 = _generate_high_dimensional_output(D1, D2, D3, s1, s2, s3, sS)
|
||||
|
||||
slist = [sS, s1, s2, s3]
|
||||
slist_names = ["sS", "s1", "s2", "s3"]
|
||||
Ylist = [Y1, Y2, Y3]
|
||||
|
||||
if plot_sim:
|
||||
import pylab
|
||||
from matplotlib import pyplot as plt
|
||||
import matplotlib.cm as cm
|
||||
import itertools
|
||||
fig = pylab.figure("MRD Simulation Data", figsize=(8, 6))
|
||||
fig = plt.figure("MRD Simulation Data", figsize=(8, 6))
|
||||
fig.clf()
|
||||
ax = fig.add_subplot(2, 1, 1)
|
||||
labls = slist_names
|
||||
|
|
@ -267,28 +287,28 @@ def _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim=False):
|
|||
ax = fig.add_subplot(2, len(Ylist), len(Ylist) + 1 + i)
|
||||
ax.imshow(Y, aspect='auto', cmap=cm.gray) # @UndefinedVariable
|
||||
ax.set_title("Y{}".format(i + 1))
|
||||
pylab.draw()
|
||||
pylab.tight_layout()
|
||||
plt.draw()
|
||||
plt.tight_layout()
|
||||
|
||||
return slist, [S1, S2, S3], Ylist
|
||||
|
||||
# def bgplvm_simulation_matlab_compare():
|
||||
# from GPy.util.datasets import simulation_BGPLVM
|
||||
# from GPy import kern
|
||||
# from GPy.models import BayesianGPLVM
|
||||
#
|
||||
# sim_data = simulation_BGPLVM()
|
||||
# Y = sim_data['Y']
|
||||
# mu = sim_data['mu']
|
||||
# num_inducing, [_, Q] = 3, mu.shape
|
||||
#
|
||||
# k = kern.linear(Q, ARD=True) + kern.bias(Q, _np.exp(-2)) + kern.white(Q, _np.exp(-2))
|
||||
# m = BayesianGPLVM(Y, Q, init="PCA", num_inducing=num_inducing, kernel=k,
|
||||
# _debug=False)
|
||||
# m.auto_scale_factor = True
|
||||
# m['noise'] = Y.var() / 100.
|
||||
# m['linear_variance'] = .01
|
||||
# return m
|
||||
def _generate_high_dimensional_output(D1, D2, D3, s1, s2, s3, sS):
|
||||
S1 = _np.hstack([s1, sS])
|
||||
S2 = _np.hstack([s2, s3, sS])
|
||||
S3 = _np.hstack([s3, sS])
|
||||
Y1 = S1.dot(_np.random.randn(S1.shape[1], D1))
|
||||
Y2 = S2.dot(_np.random.randn(S2.shape[1], D2))
|
||||
Y3 = S3.dot(_np.random.randn(S3.shape[1], D3))
|
||||
Y1 += .3 * _np.random.randn(*Y1.shape)
|
||||
Y2 += .2 * _np.random.randn(*Y2.shape)
|
||||
Y3 += .25 * _np.random.randn(*Y3.shape)
|
||||
Y1 -= Y1.mean(0)
|
||||
Y2 -= Y2.mean(0)
|
||||
Y3 -= Y3.mean(0)
|
||||
Y1 /= Y1.std(0)
|
||||
Y2 /= Y2.std(0)
|
||||
Y3 /= Y3.std(0)
|
||||
return Y1, Y2, Y3, S1, S2, S3
|
||||
|
||||
def bgplvm_simulation(optimize=True, verbose=1,
|
||||
plot=True, plot_sim=False,
|
||||
|
|
@ -298,7 +318,7 @@ def bgplvm_simulation(optimize=True, verbose=1,
|
|||
from GPy.models import BayesianGPLVM
|
||||
|
||||
D1, D2, D3, N, num_inducing, Q = 13, 5, 8, 45, 3, 9
|
||||
_, _, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim)
|
||||
_, _, Ylist = _simulate_matern(D1, D2, D3, N, num_inducing, plot_sim)
|
||||
Y = Ylist[0]
|
||||
k = kern.Linear(Q, ARD=True)# + kern.white(Q, _np.exp(-2)) # + kern.bias(Q)
|
||||
#k = kern.RBF(Q, ARD=True, lengthscale=10.)
|
||||
|
|
@ -323,7 +343,7 @@ def ssgplvm_simulation(optimize=True, verbose=1,
|
|||
from GPy.models import SSGPLVM
|
||||
|
||||
D1, D2, D3, N, num_inducing, Q = 13, 5, 8, 45, 3, 9
|
||||
_, _, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim)
|
||||
_, _, Ylist = _simulate_matern(D1, D2, D3, N, num_inducing, plot_sim)
|
||||
Y = Ylist[0]
|
||||
k = kern.Linear(Q, ARD=True, useGPU=useGPU)# + kern.white(Q, _np.exp(-2)) # + kern.bias(Q)
|
||||
#k = kern.RBF(Q, ARD=True, lengthscale=10.)
|
||||
|
|
@ -346,23 +366,19 @@ def bgplvm_simulation_missing_data(optimize=True, verbose=1,
|
|||
):
|
||||
from GPy import kern
|
||||
from GPy.models import BayesianGPLVM
|
||||
from GPy.inference.latent_function_inference.var_dtc import VarDTCMissingData
|
||||
|
||||
D1, D2, D3, N, num_inducing, Q = 13, 5, 8, 400, 3, 4
|
||||
_, _, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim)
|
||||
_, _, Ylist = _simulate_matern(D1, D2, D3, N, num_inducing, plot_sim)
|
||||
Y = Ylist[0]
|
||||
k = kern.Linear(Q, ARD=True)# + kern.white(Q, _np.exp(-2)) # + kern.bias(Q)
|
||||
|
||||
inan = _np.random.binomial(1, .8, size=Y.shape).astype(bool) # 80% missing data
|
||||
inan = _np.random.binomial(1, .2, size=Y.shape).astype(bool) # 20% missing data
|
||||
Ymissing = Y.copy()
|
||||
Ymissing[inan] = _np.nan
|
||||
|
||||
m = BayesianGPLVM(Ymissing, Q, init="random", num_inducing=num_inducing,
|
||||
inference_method=VarDTCMissingData(inan=inan), kernel=k)
|
||||
kernel=k, missing_data=True)
|
||||
|
||||
m.X.variance[:] = _np.random.uniform(0,.01,m.X.shape)
|
||||
m.likelihood.variance = .01
|
||||
m.parameters_changed()
|
||||
m.Yreal = Y
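For readers following the API change in this hunk: the explicit VarDTCMissingData inference object is replaced by the missing_data flag on the model constructor. A minimal, self-contained sketch of that usage (the sizes and variable names below are illustrative, not taken from the demo above):

# Hedged sketch of the missing_data=True workflow; sizes are illustrative.
import numpy as np
import GPy

N, D, Q = 100, 12, 4
Y = np.random.randn(N, Q).dot(np.random.randn(Q, D))
Y[np.random.rand(N, D) < 0.2] = np.nan   # mark roughly 20% of entries as missing

m = GPy.models.BayesianGPLVM(Y, Q, init="random", num_inducing=10,
                             kernel=GPy.kern.Linear(Q, ARD=True),
                             missing_data=True)
m.optimize(messages=True)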
|
||||
|
||||
if optimize:
|
||||
|
|
@ -380,7 +396,7 @@ def mrd_simulation(optimize=True, verbose=True, plot=True, plot_sim=True, **kw):
|
|||
from GPy.models import MRD
|
||||
|
||||
D1, D2, D3, N, num_inducing, Q = 60, 20, 36, 60, 6, 5
|
||||
_, _, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim)
|
||||
_, _, Ylist = _simulate_matern(D1, D2, D3, N, num_inducing, plot_sim)
|
||||
|
||||
#Ylist = [Ylist[0]]
|
||||
k = kern.Linear(Q, ARD=True)
|
||||
|
|
@ -390,7 +406,7 @@ def mrd_simulation(optimize=True, verbose=True, plot=True, plot_sim=True, **kw):
|
|||
|
||||
if optimize:
|
||||
print "Optimizing Model:"
|
||||
m.optimize(messages=verbose, max_iters=8e3, gtol=.1)
|
||||
m.optimize(messages=verbose, max_iters=8e3)
|
||||
if plot:
|
||||
m.X.plot("MRD Latent Space 1D")
|
||||
m.plot_scales("MRD Scales")
|
||||
|
|
@ -402,7 +418,7 @@ def mrd_simulation_missing_data(optimize=True, verbose=True, plot=True, plot_sim
|
|||
from GPy.inference.latent_function_inference.var_dtc import VarDTCMissingData
|
||||
|
||||
D1, D2, D3, N, num_inducing, Q = 60, 20, 36, 60, 6, 5
|
||||
_, _, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim)
|
||||
_, _, Ylist = _simulate_matern(D1, D2, D3, N, num_inducing, plot_sim)
|
||||
|
||||
#Ylist = [Ylist[0]]
|
||||
k = kern.Linear(Q, ARD=True)
|
||||
|
|
@ -431,8 +447,9 @@ def mrd_simulation_missing_data(optimize=True, verbose=True, plot=True, plot_sim
|
|||
|
||||
def brendan_faces(optimize=True, verbose=True, plot=True):
|
||||
import GPy
|
||||
import pods
|
||||
|
||||
data = GPy.util.datasets.brendan_faces()
|
||||
data = pods.datasets.brendan_faces()
|
||||
Q = 2
|
||||
Y = data['Y']
|
||||
Yn = Y - Y.mean()
|
||||
|
|
@ -455,8 +472,9 @@ def brendan_faces(optimize=True, verbose=True, plot=True):
|
|||
|
||||
def olivetti_faces(optimize=True, verbose=True, plot=True):
|
||||
import GPy
|
||||
import pods
|
||||
|
||||
data = GPy.util.datasets.olivetti_faces()
|
||||
data = pods.datasets.olivetti_faces()
|
||||
Q = 2
|
||||
Y = data['Y']
|
||||
Yn = Y - Y.mean()
|
||||
|
|
@ -476,7 +494,9 @@ def olivetti_faces(optimize=True, verbose=True, plot=True):
|
|||
|
||||
def stick_play(range=None, frame_rate=15, optimize=False, verbose=True, plot=True):
|
||||
import GPy
|
||||
data = GPy.util.datasets.osu_run1()
|
||||
import pods
|
||||
|
||||
data = pods.datasets.osu_run1()
|
||||
# optimize
|
||||
if range == None:
|
||||
Y = data['Y'].copy()
|
||||
|
|
@ -491,8 +511,9 @@ def stick_play(range=None, frame_rate=15, optimize=False, verbose=True, plot=Tru
|
|||
def stick(kernel=None, optimize=True, verbose=True, plot=True):
|
||||
from matplotlib import pyplot as plt
|
||||
import GPy
|
||||
import pods
|
||||
|
||||
data = GPy.util.datasets.osu_run1()
|
||||
data = pods.datasets.osu_run1()
|
||||
# optimize
|
||||
m = GPy.models.GPLVM(data['Y'], 2, kernel=kernel)
|
||||
if optimize: m.optimize('bfgs', messages=verbose, max_f_eval=10000)
|
||||
|
|
@ -510,8 +531,9 @@ def stick(kernel=None, optimize=True, verbose=True, plot=True):
|
|||
def bcgplvm_linear_stick(kernel=None, optimize=True, verbose=True, plot=True):
|
||||
from matplotlib import pyplot as plt
|
||||
import GPy
|
||||
import pods
|
||||
|
||||
data = GPy.util.datasets.osu_run1()
|
||||
data = pods.datasets.osu_run1()
|
||||
# optimize
|
||||
mapping = GPy.mappings.Linear(data['Y'].shape[1], 2)
|
||||
m = GPy.models.BCGPLVM(data['Y'], 2, kernel=kernel, mapping=mapping)
|
||||
|
|
@ -529,8 +551,9 @@ def bcgplvm_linear_stick(kernel=None, optimize=True, verbose=True, plot=True):
|
|||
def bcgplvm_stick(kernel=None, optimize=True, verbose=True, plot=True):
|
||||
from matplotlib import pyplot as plt
|
||||
import GPy
|
||||
import pods
|
||||
|
||||
data = GPy.util.datasets.osu_run1()
|
||||
data = pods.datasets.osu_run1()
|
||||
# optimize
|
||||
back_kernel=GPy.kern.RBF(data['Y'].shape[1], lengthscale=5.)
|
||||
mapping = GPy.mappings.Kernel(X=data['Y'], output_dim=2, kernel=back_kernel)
|
||||
|
|
@ -542,15 +565,16 @@ def bcgplvm_stick(kernel=None, optimize=True, verbose=True, plot=True):
|
|||
y = m.likelihood.Y[0, :]
|
||||
data_show = GPy.plotting.matplot_dep.visualize.stick_show(y[None, :], connect=data['connect'])
|
||||
GPy.plotting.matplot_dep.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
|
||||
raw_input('Press enter to finish')
|
||||
#raw_input('Press enter to finish')
|
||||
|
||||
return m
|
||||
|
||||
def robot_wireless(optimize=True, verbose=True, plot=True):
|
||||
from matplotlib import pyplot as plt
|
||||
import GPy
|
||||
import pods
|
||||
|
||||
data = GPy.util.datasets.robot_wireless()
|
||||
data = pods.datasets.robot_wireless()
|
||||
# optimize
|
||||
m = GPy.models.BayesianGPLVM(data['Y'], 4, num_inducing=25)
|
||||
if optimize: m.optimize(messages=verbose, max_f_eval=10000)
|
||||
|
|
@ -564,8 +588,9 @@ def stick_bgplvm(model=None, optimize=True, verbose=True, plot=True):
|
|||
from matplotlib import pyplot as plt
|
||||
import numpy as np
|
||||
import GPy
|
||||
import pods
|
||||
|
||||
data = GPy.util.datasets.osu_run1()
|
||||
data = pods.datasets.osu_run1()
|
||||
Q = 6
|
||||
kernel = GPy.kern.RBF(Q, lengthscale=np.repeat(.5, Q), ARD=True)
|
||||
m = BayesianGPLVM(data['Y'], Q, init="PCA", num_inducing=20, kernel=kernel)
|
||||
|
|
@ -595,8 +620,9 @@ def stick_bgplvm(model=None, optimize=True, verbose=True, plot=True):
|
|||
|
||||
def cmu_mocap(subject='35', motion=['01'], in_place=True, optimize=True, verbose=True, plot=True):
|
||||
import GPy
|
||||
import pods
|
||||
|
||||
data = GPy.util.datasets.cmu_mocap(subject, motion)
|
||||
data = pods.datasets.cmu_mocap(subject, motion)
|
||||
if in_place:
|
||||
# Make figure move in place.
|
||||
data['Y'][:, 0:3] = 0.0
|
||||
|
|
|
|||
|
|
@ -13,7 +13,9 @@ import GPy
|
|||
|
||||
def olympic_marathon_men(optimize=True, plot=True):
|
||||
"""Run a standard Gaussian process regression on the Olympic marathon data."""
|
||||
data = GPy.util.datasets.olympic_marathon_men()
|
||||
try:import pods
|
||||
except ImportError:print 'pods unavailable, see https://github.com/sods/ods for example datasets'
|
||||
data = pods.datasets.olympic_marathon_men()
|
||||
|
||||
# create simple GP Model
|
||||
m = GPy.models.GPRegression(data['X'], data['Y'])
|
||||
|
|
@ -82,7 +84,9 @@ def epomeo_gpx(max_iters=200, optimize=True, plot=True):
|
|||
from the Mount Epomeo runs. Requires gpxpy to be installed on your system
|
||||
to load in the data.
|
||||
"""
|
||||
data = GPy.util.datasets.epomeo_gpx()
|
||||
try:import pods
|
||||
except ImportError:print 'pods unavailable, see https://github.com/sods/ods for example datasets'
|
||||
data = pods.datasets.epomeo_gpx()
|
||||
num_data_list = []
|
||||
for Xpart in data['X']:
|
||||
num_data_list.append(Xpart.shape[0])
|
||||
|
|
@ -107,9 +111,9 @@ def epomeo_gpx(max_iters=200, optimize=True, plot=True):
|
|||
k = k1**k2
|
||||
|
||||
m = GPy.models.SparseGPRegression(t, Y, kernel=k, Z=Z, normalize_Y=True)
|
||||
m.constrain_fixed('.*rbf_var', 1.)
|
||||
m.constrain_fixed('iip')
|
||||
m.constrain_bounded('noise_variance', 1e-3, 1e-1)
|
||||
m.constrain_fixed('.*variance', 1.)
|
||||
m.inducing_inputs.constrain_fixed()
|
||||
m.Gaussian_noise.variance.constrain_bounded(1e-3, 1e-1)
|
||||
m.optimize(max_iters=max_iters,messages=True)
|
||||
|
||||
return m
|
||||
|
|
@ -125,7 +129,9 @@ def multiple_optima(gene_number=937, resolution=80, model_restarts=10, seed=1000
|
|||
length_scales = np.linspace(0.1, 60., resolution)
|
||||
log_SNRs = np.linspace(-3., 4., resolution)
|
||||
|
||||
data = GPy.util.datasets.della_gatta_TRP63_gene_expression(data_set='della_gatta',gene_number=gene_number)
|
||||
try:import pods
|
||||
except ImportError:print 'pods unavailable, see https://github.com/sods/ods for example datasets'
|
||||
data = pods.datasets.della_gatta_TRP63_gene_expression(data_set='della_gatta',gene_number=gene_number)
|
||||
# data['Y'] = data['Y'][0::2, :]
|
||||
# data['X'] = data['X'][0::2, :]
|
||||
|
||||
|
|
@ -151,16 +157,16 @@ def multiple_optima(gene_number=937, resolution=80, model_restarts=10, seed=1000
|
|||
kern = GPy.kern.RBF(1, variance=np.random.uniform(1e-3, 1), lengthscale=np.random.uniform(5, 50))
|
||||
|
||||
m = GPy.models.GPRegression(data['X'], data['Y'], kernel=kern)
|
||||
m['noise_variance'] = np.random.uniform(1e-3, 1)
|
||||
optim_point_x[0] = m['rbf_lengthscale']
|
||||
optim_point_y[0] = np.log10(m['rbf_variance']) - np.log10(m['noise_variance']);
|
||||
m.Gaussian_noise.variance = np.random.uniform(1e-3, 1)
|
||||
optim_point_x[0] = m.rbf.lengthscale
|
||||
optim_point_y[0] = np.log10(m.rbf.variance) - np.log10(m.Gaussian_noise.variance);
|
||||
|
||||
# optimize
|
||||
if optimize:
|
||||
m.optimize('scg', xtol=1e-6, ftol=1e-6, max_iters=max_iters)
|
||||
|
||||
optim_point_x[1] = m['rbf_lengthscale']
|
||||
optim_point_y[1] = np.log10(m['rbf_variance']) - np.log10(m['noise_variance']);
|
||||
optim_point_x[1] = m.rbf.lengthscale
|
||||
optim_point_y[1] = np.log10(m.rbf.variance) - np.log10(m.Gaussian_noise.variance);
|
||||
|
||||
if plot:
|
||||
pb.arrow(optim_point_x[0], optim_point_y[0], optim_point_x[1] - optim_point_x[0], optim_point_y[1] - optim_point_y[0], label=str(i), head_length=1, head_width=0.5, fc='k', ec='k')
|
||||
|
|
@ -191,7 +197,7 @@ def _contour_data(data, length_scales, log_SNRs, kernel_call=GPy.kern.RBF):
|
|||
noise_var = total_var / (1. + SNR)
|
||||
signal_var = total_var - noise_var
|
||||
model.kern['.*variance'] = signal_var
|
||||
model['noise_variance'] = noise_var
|
||||
model.Gaussian_noise.variance = noise_var
|
||||
length_scale_lls = []
|
||||
|
||||
for length_scale in length_scales:
|
||||
|
|
@ -205,13 +211,15 @@ def _contour_data(data, length_scales, log_SNRs, kernel_call=GPy.kern.RBF):
|
|||
|
||||
def olympic_100m_men(optimize=True, plot=True):
|
||||
"""Run a standard Gaussian process regression on the Rogers and Girolami olympics data."""
|
||||
data = GPy.util.datasets.olympic_100m_men()
|
||||
try:import pods
|
||||
except ImportError:print 'pods unavailable, see https://github.com/sods/ods for example datasets'
|
||||
data = pods.datasets.olympic_100m_men()
|
||||
|
||||
# create simple GP Model
|
||||
m = GPy.models.GPRegression(data['X'], data['Y'])
|
||||
|
||||
# set the lengthscale to be something sensible (defaults to 1)
|
||||
m['rbf_lengthscale'] = 10
|
||||
m.rbf.lengthscale = 10
|
||||
|
||||
if optimize:
|
||||
m.optimize('bfgs', max_iters=200)
|
||||
|
|
@ -222,7 +230,9 @@ def olympic_100m_men(optimize=True, plot=True):
|
|||
|
||||
def toy_rbf_1d(optimize=True, plot=True):
|
||||
"""Run a simple demonstration of a standard Gaussian process fitting it to data sampled from an RBF covariance."""
|
||||
data = GPy.util.datasets.toy_rbf_1d()
|
||||
try:import pods
|
||||
except ImportError:print 'pods unavailable, see https://github.com/sods/ods for example datasets'
|
||||
data = pods.datasets.toy_rbf_1d()
|
||||
|
||||
# create simple GP Model
|
||||
m = GPy.models.GPRegression(data['X'], data['Y'])
|
||||
|
|
@ -236,7 +246,9 @@ def toy_rbf_1d(optimize=True, plot=True):
|
|||
|
||||
def toy_rbf_1d_50(optimize=True, plot=True):
|
||||
"""Run a simple demonstration of a standard Gaussian process fitting it to data sampled from an RBF covariance."""
|
||||
data = GPy.util.datasets.toy_rbf_1d_50()
|
||||
try:import pods
|
||||
except ImportError:print 'pods unavailable, see https://github.com/sods/ods for example datasets'
|
||||
data = pods.datasets.toy_rbf_1d_50()
|
||||
|
||||
# create simple GP Model
|
||||
m = GPy.models.GPRegression(data['X'], data['Y'])
|
||||
|
|
@ -303,12 +315,11 @@ def toy_ARD(max_iters=1000, kernel_type='linear', num_samples=300, D=4, optimize
|
|||
# m.set_prior('.*lengthscale',len_prior)
|
||||
|
||||
if optimize:
|
||||
m.optimize(optimizer='scg', max_iters=max_iters, messages=1)
|
||||
m.optimize(optimizer='scg', max_iters=max_iters)
|
||||
|
||||
if plot:
|
||||
m.kern.plot_ARD()
|
||||
|
||||
print m
|
||||
return m
|
||||
|
||||
def toy_ARD_sparse(max_iters=1000, kernel_type='linear', num_samples=300, D=4, optimize=True, plot=True):
|
||||
|
|
@ -343,24 +354,25 @@ def toy_ARD_sparse(max_iters=1000, kernel_type='linear', num_samples=300, D=4, o
|
|||
# m.set_prior('.*lengthscale',len_prior)
|
||||
|
||||
if optimize:
|
||||
m.optimize(optimizer='scg', max_iters=max_iters, messages=1)
|
||||
m.optimize(optimizer='scg', max_iters=max_iters)
|
||||
|
||||
if plot:
|
||||
m.kern.plot_ARD()
|
||||
|
||||
print m
|
||||
return m
|
||||
|
||||
def robot_wireless(max_iters=100, kernel=None, optimize=True, plot=True):
|
||||
"""Predict the location of a robot given wirelss signal strength readings."""
|
||||
data = GPy.util.datasets.robot_wireless()
|
||||
try:import pods
|
||||
except ImportError:print 'pods unavailable, see https://github.com/sods/ods for example datasets'
|
||||
data = pods.datasets.robot_wireless()
|
||||
|
||||
# create simple GP Model
|
||||
m = GPy.models.GPRegression(data['Y'], data['X'], kernel=kernel)
|
||||
|
||||
# optimize
|
||||
if optimize:
|
||||
m.optimize(messages=True, max_iters=max_iters)
|
||||
m.optimize(max_iters=max_iters)
|
||||
|
||||
Xpredict = m.predict(data['Ytest'])[0]
|
||||
if plot:
|
||||
|
|
@ -372,13 +384,14 @@ def robot_wireless(max_iters=100, kernel=None, optimize=True, plot=True):
|
|||
|
||||
sse = ((data['Xtest'] - Xpredict)**2).sum()
|
||||
|
||||
print m
|
||||
print('Sum of squares error on test data: ' + str(sse))
|
||||
return m
|
||||
|
||||
def silhouette(max_iters=100, optimize=True, plot=True):
|
||||
"""Predict the pose of a figure given a silhouette. This is a task from Agarwal and Triggs 2004 ICML paper."""
|
||||
data = GPy.util.datasets.silhouette()
|
||||
try:import pods
|
||||
except ImportError:print 'pods unavailable, see https://github.com/sods/ods for example datasets'
|
||||
data = pods.datasets.silhouette()
|
||||
|
||||
# create simple GP Model
|
||||
m = GPy.models.GPRegression(data['X'], data['Y'])
|
||||
|
|
@ -390,7 +403,7 @@ def silhouette(max_iters=100, optimize=True, plot=True):
|
|||
print m
|
||||
return m
|
||||
|
||||
def sparse_GP_regression_1D(num_samples=400, num_inducing=5, max_iters=100, optimize=True, plot=True, checkgrad=True):
|
||||
def sparse_GP_regression_1D(num_samples=400, num_inducing=5, max_iters=100, optimize=True, plot=True, checkgrad=False):
|
||||
"""Run a 1D example of a sparse GP regression."""
|
||||
# sample inputs and outputs
|
||||
X = np.random.uniform(-3., 3., (num_samples, 1))
|
||||
|
|
@ -401,10 +414,10 @@ def sparse_GP_regression_1D(num_samples=400, num_inducing=5, max_iters=100, opti
|
|||
m = GPy.models.SparseGPRegression(X, Y, kernel=rbf, num_inducing=num_inducing)
|
||||
|
||||
if checkgrad:
|
||||
m.checkgrad(verbose=1)
|
||||
m.checkgrad()
|
||||
|
||||
if optimize:
|
||||
m.optimize('tnc', messages=1, max_iters=max_iters)
|
||||
m.optimize('tnc', max_iters=max_iters)
|
||||
|
||||
if plot:
|
||||
m.plot()
|
||||
|
|
|
|||
|
|
@ -124,6 +124,7 @@ class vDTC(object):
|
|||
v, _ = dtrtrs(L, tmp, lower=1, trans=1)
|
||||
tmp, _ = dtrtrs(LA, Li, lower=1, trans=0)
|
||||
P = tdot(tmp.T)
|
||||
stop
|
||||
|
||||
#compute log marginal
|
||||
log_marginal = -0.5*num_data*output_dim*np.log(2*np.pi) + \
|
||||
|
|
|
|||
|
|
@ -1,7 +1,6 @@
|
|||
import numpy as np
|
||||
from ...util import diag
|
||||
from ...util.linalg import mdot, jitchol, backsub_both_sides, tdot, dtrtrs, dtrtri, dpotri, dpotrs, symmetrify, DSYR
|
||||
from ...util.misc import param_to_array
|
||||
from ...core.parameterization.variational import VariationalPosterior
|
||||
from . import LatentFunctionInference
|
||||
from posterior import Posterior
|
||||
|
|
@ -23,7 +22,7 @@ class EPDTC(LatentFunctionInference):
|
|||
self.get_YYTfactor.limit = limit
|
||||
|
||||
def _get_trYYT(self, Y):
|
||||
return param_to_array(np.sum(np.square(Y)))
|
||||
return np.sum(np.square(Y))
|
||||
|
||||
def __getstate__(self):
|
||||
# has to be overridden, as Cacher objects cannot be pickled.
|
||||
|
|
@ -44,7 +43,7 @@ class EPDTC(LatentFunctionInference):
|
|||
"""
|
||||
N, D = Y.shape
|
||||
if (N>=D):
|
||||
return param_to_array(Y)
|
||||
return Y
|
||||
else:
|
||||
return jitchol(tdot(Y))
|
||||
|
||||
|
|
|
|||
|
|
@ -12,7 +12,6 @@
|
|||
|
||||
import numpy as np
|
||||
from ...util.linalg import mdot, jitchol, dpotrs, dtrtrs, dpotri, symmetrify, pdinv
|
||||
from ...util.misc import param_to_array
|
||||
from posterior import Posterior
|
||||
import warnings
|
||||
from scipy import optimize
|
||||
|
|
@ -39,9 +38,6 @@ class Laplace(LatentFunctionInference):
|
|||
Returns a Posterior class containing essential quantities of the posterior
|
||||
"""
|
||||
|
||||
#make Y a normal array!
|
||||
Y = param_to_array(Y)
|
||||
|
||||
# Compute K
|
||||
K = kern.K(X)
|
||||
|
||||
|
|
@ -86,7 +82,7 @@ class Laplace(LatentFunctionInference):
|
|||
|
||||
#define the objective function (to be maximised)
|
||||
def obj(Ki_f, f):
|
||||
return -0.5*np.dot(Ki_f.flatten(), f.flatten()) + likelihood.logpdf(f, Y, Y_metadata=Y_metadata)
|
||||
return -0.5*np.dot(Ki_f.flatten(), f.flatten()) + np.sum(likelihood.logpdf(f, Y, Y_metadata=Y_metadata))
|
||||
|
||||
difference = np.inf
|
||||
iteration = 0
|
||||
|
|
@ -156,7 +152,7 @@ class Laplace(LatentFunctionInference):
|
|||
Ki_W_i = K - C.T.dot(C)
|
||||
|
||||
#compute the log marginal
|
||||
log_marginal = -0.5*np.dot(Ki_f.flatten(), f_hat.flatten()) + likelihood.logpdf(f_hat, Y, Y_metadata=Y_metadata) - np.sum(np.log(np.diag(L)))
|
||||
log_marginal = -0.5*np.dot(Ki_f.flatten(), f_hat.flatten()) + likelihood.logpdf(f_hat, Y, Y_metadata=Y_metadata).sum() - np.sum(np.log(np.diag(L)))
|
||||
|
||||
# Compute matrices for derivatives
|
||||
dW_df = -likelihood.d3logpdf_df3(f_hat, Y, Y_metadata=Y_metadata) # -d3lik_d3fhat
|
||||
|
|
|
|||
|
|
@ -6,7 +6,6 @@ from ...util.linalg import mdot, jitchol, backsub_both_sides, tdot, dtrtrs, dtrt
|
|||
from ...util import diag
|
||||
from ...core.parameterization.variational import VariationalPosterior
|
||||
import numpy as np
|
||||
from ...util.misc import param_to_array
|
||||
from . import LatentFunctionInference
|
||||
log_2_pi = np.log(2*np.pi)
|
||||
import logging, itertools
|
||||
|
|
@ -35,7 +34,7 @@ class VarDTC(LatentFunctionInference):
|
|||
self.get_YYTfactor.limit = limit
|
||||
|
||||
def _get_trYYT(self, Y):
|
||||
return param_to_array(np.sum(np.square(Y)))
|
||||
return np.sum(np.square(Y))
|
||||
|
||||
def __getstate__(self):
|
||||
# has to be overridden, as Cacher objects cannot be pickled.
|
||||
|
|
@ -56,14 +55,16 @@ class VarDTC(LatentFunctionInference):
|
|||
"""
|
||||
N, D = Y.shape
|
||||
if (N>=D):
|
||||
return param_to_array(Y)
|
||||
return Y.view(np.ndarray)
|
||||
else:
|
||||
return jitchol(tdot(Y))
|
||||
|
||||
def get_VVTfactor(self, Y, prec):
|
||||
return Y * prec # TODO: cache this, and make it efficient
|
||||
|
||||
def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None):
|
||||
|
||||
|
||||
def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None, Lm=None, dL_dKmm=None):
|
||||
|
||||
_, output_dim = Y.shape
|
||||
uncertain_inputs = isinstance(X, VariationalPosterior)
|
||||
|
|
@ -85,9 +86,9 @@ class VarDTC(LatentFunctionInference):
|
|||
|
||||
Kmm = kern.K(Z).copy()
|
||||
diag.add(Kmm, self.const_jitter)
|
||||
if Lm is None:
|
||||
Lm = jitchol(Kmm)
|
||||
|
||||
|
||||
# The rather complex computations of A, and the psi stats
|
||||
if uncertain_inputs:
|
||||
psi0 = kern.psi0(Z, X)
|
||||
|
|
@ -101,7 +102,6 @@ class VarDTC(LatentFunctionInference):
|
|||
else:
|
||||
psi0 = kern.Kdiag(X)
|
||||
psi1 = kern.K(X, Z)
|
||||
psi2 = None
|
||||
if het_noise:
|
||||
tmp = psi1 * (np.sqrt(beta.reshape(num_data, 1)))
|
||||
else:
|
||||
|
|
@ -123,6 +123,7 @@ class VarDTC(LatentFunctionInference):
|
|||
delit = tdot(_LBi_Lmi_psi1Vf)
|
||||
data_fit = np.trace(delit)
|
||||
DBi_plus_BiPBi = backsub_both_sides(LB, output_dim * np.eye(num_inducing) + delit)
|
||||
if dL_dKmm is None:
|
||||
delit = -0.5 * DBi_plus_BiPBi
|
||||
delit += -0.5 * B * output_dim
|
||||
delit += output_dim * np.eye(num_inducing)
|
||||
|
|
@ -209,6 +210,7 @@ class VarDTCMissingData(LatentFunctionInference):
|
|||
inan = np.isnan(Y)
|
||||
has_none = inan.any()
|
||||
self._inan = ~inan
|
||||
inan = self._inan
|
||||
else:
|
||||
inan = self._inan
|
||||
has_none = True
|
||||
|
|
@ -242,17 +244,15 @@ class VarDTCMissingData(LatentFunctionInference):
|
|||
self._subarray_indices = [[slice(None),slice(None)]]
|
||||
return [Y], [(Y**2).sum()]
|
||||
|
||||
def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None):
|
||||
def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None, Lm=None):
|
||||
if isinstance(X, VariationalPosterior):
|
||||
uncertain_inputs = True
|
||||
psi0_all = kern.psi0(Z, X)
|
||||
psi1_all = kern.psi1(Z, X)
|
||||
psi2_all = kern.psi2(Z, X)
|
||||
else:
|
||||
uncertain_inputs = False
|
||||
psi0_all = kern.Kdiag(X)
|
||||
psi1_all = kern.K(X, Z)
|
||||
psi2_all = None
|
||||
|
||||
Ys, traces = self._Y(Y)
|
||||
beta_all = 1./np.fmax(likelihood.gaussian_variance(Y_metadata), 1e-6)
|
||||
|
|
@ -274,10 +274,11 @@ class VarDTCMissingData(LatentFunctionInference):
|
|||
Kmm = kern.K(Z).copy()
|
||||
diag.add(Kmm, self.const_jitter)
|
||||
#factor Kmm
|
||||
if Lm is None:
|
||||
Lm = jitchol(Kmm)
|
||||
if uncertain_inputs: LmInv = dtrtri(Lm)
|
||||
|
||||
size = Y.shape[1]
|
||||
size = len(Ys)
|
||||
next_ten = 0
|
||||
for i, [y, v, trYYT] in enumerate(itertools.izip(Ys, self._inan.T, traces)):
|
||||
if ((i+1.)/size) >= next_ten:
|
||||
|
|
@ -285,19 +286,19 @@ class VarDTCMissingData(LatentFunctionInference):
|
|||
next_ten += .1
|
||||
if het_noise: beta = beta_all[i]
|
||||
else: beta = beta_all
|
||||
|
||||
VVT_factor = (y*beta)
|
||||
output_dim = 1#len(ind)
|
||||
|
||||
psi0 = psi0_all[v]
|
||||
psi1 = psi1_all[v, :]
|
||||
if uncertain_inputs: psi2 = psi2_all[v, :]
|
||||
if uncertain_inputs:
|
||||
psi2 = kern.psi2(Z, X[v, :])
|
||||
else: psi2 = None
|
||||
num_data = psi1.shape[0]
|
||||
|
||||
if uncertain_inputs:
|
||||
if het_noise: psi2_beta = psi2 * (beta.flatten().reshape(num_data, 1, 1)).sum(0)
|
||||
else: psi2_beta = psi2.sum(0) * beta
|
||||
else: psi2_beta = psi2 * (beta)
|
||||
A = LmInv.dot(psi2_beta.dot(LmInv.T))
|
||||
else:
|
||||
if het_noise: tmp = psi1 * (np.sqrt(beta.reshape(num_data, 1)))
|
||||
|
|
@ -332,11 +333,11 @@ class VarDTCMissingData(LatentFunctionInference):
|
|||
dL_dpsi0_all[v] += dL_dpsi0
|
||||
dL_dpsi1_all[v, :] += dL_dpsi1
|
||||
if uncertain_inputs:
|
||||
dL_dpsi2_all[v, :] += dL_dpsi2
|
||||
dL_dpsi2_all[v, :, :] += dL_dpsi2
|
||||
|
||||
# log marginal likelihood
|
||||
log_marginal += _compute_log_marginal_likelihood(likelihood, num_data, output_dim, beta, het_noise,
|
||||
psi0, A, LB, trYYT, data_fit,VVT_factor)
|
||||
psi0, A, LB, trYYT, data_fit, VVT_factor)
|
||||
|
||||
#put the gradients in the right places
|
||||
dL_dR += _compute_dL_dR(likelihood,
|
||||
|
|
@ -395,7 +396,6 @@ def _compute_dL_dpsi(num_inducing, num_data, output_dim, beta, Lm, VVT_factor, C
|
|||
# subsume back into psi1 (==Kmn)
|
||||
dL_dpsi1 += 2.*np.dot(psi1, dL_dpsi2)
|
||||
dL_dpsi2 = None
|
||||
|
||||
return dL_dpsi0, dL_dpsi1, dL_dpsi2
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -6,7 +6,6 @@ from ...util.linalg import jitchol, backsub_both_sides, tdot, dtrtrs
|
|||
from ...util import diag
|
||||
from ...core.parameterization.variational import VariationalPosterior
|
||||
import numpy as np
|
||||
from ...util.misc import param_to_array
|
||||
from . import LatentFunctionInference
|
||||
log_2_pi = np.log(2*np.pi)
|
||||
|
||||
|
|
@ -117,7 +116,7 @@ class VarDTC_GPU(LatentFunctionInference):
|
|||
"""
|
||||
N, D = Y.shape
|
||||
if (N>=D):
|
||||
return param_to_array(Y)
|
||||
return Y.view(np.ndarray)
|
||||
else:
|
||||
return jitchol(tdot(Y))
|
||||
|
||||
|
|
|
|||
|
|
@ -6,7 +6,6 @@ from ...util.linalg import jitchol, backsub_both_sides, tdot, dtrtrs, dtrtri,pdi
|
|||
from ...util import diag
|
||||
from ...core.parameterization.variational import VariationalPosterior
|
||||
import numpy as np
|
||||
from ...util.misc import param_to_array
|
||||
from . import LatentFunctionInference
|
||||
log_2_pi = np.log(2*np.pi)
|
||||
|
||||
|
|
@ -60,7 +59,7 @@ class VarDTC_minibatch(LatentFunctionInference):
|
|||
self.get_YYTfactor.limit = limit
|
||||
|
||||
def _get_trYYT(self, Y):
|
||||
return param_to_array(np.sum(np.square(Y)))
|
||||
return np.sum(np.square(Y))
|
||||
|
||||
def _get_YYTfactor(self, Y):
|
||||
"""
|
||||
|
|
@ -70,7 +69,7 @@ class VarDTC_minibatch(LatentFunctionInference):
|
|||
"""
|
||||
N, D = Y.shape
|
||||
if (N>=D):
|
||||
return param_to_array(Y)
|
||||
return Y.view(np.ndarray)
|
||||
else:
|
||||
return jitchol(tdot(Y))
|
||||
|
||||
|
|
|
|||
48
GPy/inference/optimization/stochastics.py
Normal file
|
|
@ -0,0 +1,48 @@
|
|||
'''
|
||||
Created on 9 Oct 2014
|
||||
|
||||
@author: maxz
|
||||
'''
|
||||
|
||||
class StochasticStorage(object):
|
||||
'''
|
||||
This is a container for holding the stochastic parameters,
|
||||
such as subset indices or step length and so on.
|
||||
'''
|
||||
def __init__(self, model):
|
||||
"""
|
||||
Initialize this stochastic container using the given model
|
||||
"""
|
||||
|
||||
def do_stochastics(self):
|
||||
"""
|
||||
Update the internal state to the next batch of the stochastic
|
||||
descent algorithm.
|
||||
"""
|
||||
pass
|
||||
|
||||
class SparseGPMissing(StochasticStorage):
|
||||
def __init__(self, model, batchsize=1):
|
||||
self.d = xrange(model.Y_normalized.shape[1])
|
||||
|
||||
class SparseGPStochastics(StochasticStorage):
|
||||
"""
|
||||
For the sparse gp we need to store the dimension we are in,
|
||||
and the indices corresponding to those
|
||||
"""
|
||||
def __init__(self, model, batchsize=1):
|
||||
import itertools
|
||||
self.batchsize = batchsize
|
||||
if self.batchsize == 1:
|
||||
self.dimensions = itertools.cycle(range(model.Y_normalized.shape[1]))
|
||||
else:
|
||||
import numpy as np
|
||||
self.dimensions = lambda: np.random.choice(model.Y_normalized.shape[1], size=batchsize, replace=False)
|
||||
self.d = None
|
||||
self.do_stochastics()
|
||||
|
||||
def do_stochastics(self):
|
||||
if self.batchsize == 1:
|
||||
self.d = [self.dimensions.next()]
|
||||
else:
|
||||
self.d = self.dimensions()
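A small usage sketch of the two storages above (the toy model class is a stand-in assumed only to expose Y_normalized; it is not part of GPy):

import numpy as np

class ToyModel(object):
    # stand-in for a GPy model: the storages only read Y_normalized's shape
    Y_normalized = np.zeros((20, 5))

s = SparseGPStochastics(ToyModel(), batchsize=1)
d0 = s.d              # [0]: with batchsize 1 the output dimensions are cycled in order
s.do_stochastics()
d1 = s.d              # [1]

s2 = SparseGPStochastics(ToyModel(), batchsize=2)
d_pair = s2.d         # a random pair of distinct dimensions, resampled on each call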
|
||||
|
|
@ -17,17 +17,3 @@ from _src.poly import Poly
|
|||
from _src.trunclinear import TruncLinear,TruncLinear_inf
|
||||
from _src.splitKern import SplitKern,DiffGenomeKern
|
||||
|
||||
# TODO: put this in an init file somewhere
|
||||
#I'm commenting this out because the files were not added. JH. Remember to add the files before committing
|
||||
try:
|
||||
import sympy as sym
|
||||
sympy_available=True
|
||||
except ImportError:
|
||||
sympy_available=False
|
||||
|
||||
if sympy_available:
|
||||
from _src.symbolic import Symbolic
|
||||
from _src.eq import Eq
|
||||
from _src.heat_eqinit import Heat_eqinit
|
||||
#from _src.ode1_eq_lfm import Ode1_eq_lfm
|
||||
|
||||
|
|
|
|||
|
|
@ -6,6 +6,7 @@ import numpy as np
|
|||
from scipy import weave
|
||||
from ...core.parameterization import Param
|
||||
from ...core.parameterization.transformations import Logexp
|
||||
from ...util.config import config # for assessing whether to use weave
|
||||
|
||||
class Coregionalize(Kern):
|
||||
"""
|
||||
|
|
@ -56,6 +57,27 @@ class Coregionalize(Kern):
|
|||
self.B = np.dot(self.W, self.W.T) + np.diag(self.kappa)
|
||||
|
||||
def K(self, X, X2=None):
|
||||
if config.getboolean('weave', 'working'):
|
||||
try:
|
||||
return self._K_weave(X, X2)
|
||||
except:
|
||||
print "\n Weave compilation failed. Falling back to (slower) numpy implementation\n"
|
||||
config.set('weave', 'working', 'False')
|
||||
return self._K_numpy(X, X2)
|
||||
else:
|
||||
return self._K_numpy(X, X2)
|
||||
|
||||
|
||||
def _K_numpy(self, X, X2=None):
|
||||
index = np.asarray(X, dtype=np.int)
|
||||
if X2 is None:
|
||||
return self.B[index,index.T]
|
||||
else:
|
||||
index2 = np.asarray(X2, dtype=np.int)
|
||||
return self.B[index,index2.T]
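For intuition, the numpy path is a plain table lookup into B = W W^T + diag(kappa); a standalone sketch with arbitrary sizes (no GPy objects involved):

import numpy as np

output_dim, rank = 3, 1
W = np.random.randn(output_dim, rank)
kappa = 0.5 * np.ones(output_dim)
B = W.dot(W.T) + np.diag(kappa)     # coregionalization matrix, one entry per output pair

X = np.array([[0], [2], [1]])       # integer output index of each data point
K = B[X, X.T]                       # K[i, j] == B[X[i], X[j]], here a 3x3 block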
|
||||
|
||||
def _K_weave(self, X, X2=None):
|
||||
"""compute the kernel function using scipy.weave"""
|
||||
index = np.asarray(X, dtype=np.int)
|
||||
|
||||
if X2 is None:
|
||||
|
|
@ -91,12 +113,33 @@ class Coregionalize(Kern):
|
|||
|
||||
def update_gradients_full(self, dL_dK, X, X2=None):
|
||||
index = np.asarray(X, dtype=np.int)
|
||||
dL_dK_small = np.zeros_like(self.B)
|
||||
if X2 is None:
|
||||
index2 = index
|
||||
else:
|
||||
index2 = np.asarray(X2, dtype=np.int)
|
||||
|
||||
#attempt to use weave for a nasty double indexing loop: fall back to numpy
|
||||
if config.getboolean('weave', 'working'):
|
||||
try:
|
||||
dL_dK_small = self._gradient_reduce_weave(dL_dK, index, index2)
|
||||
except:
|
||||
print "\n Weave compilation failed. Falling back to (slower) numpy implementation\n"
|
||||
config.set('weave', 'working', 'False')
|
||||
dL_dK_small = self._gradient_reduce_numpy(dL_dK, index, index2)
|
||||
else:
|
||||
dL_dK_small = self._gradient_reduce_numpy(dL_dK, index, index2)
|
||||
|
||||
|
||||
|
||||
dkappa = np.diag(dL_dK_small)
|
||||
dL_dK_small += dL_dK_small.T
|
||||
dW = (self.W[:, None, :]*dL_dK_small[:, :, None]).sum(0)
|
||||
|
||||
self.W.gradient = dW
|
||||
self.kappa.gradient = dkappa
|
||||
|
||||
def _gradient_reduce_weave(self, dL_dK, index, index2):
|
||||
dL_dK_small = np.zeros_like(self.B)
|
||||
code="""
|
||||
for(int i=0; i<num_inducing; i++){
|
||||
for(int j=0; j<N; j++){
|
||||
|
|
@ -106,13 +149,16 @@ class Coregionalize(Kern):
|
|||
"""
|
||||
N, num_inducing, output_dim = index.size, index2.size, self.output_dim
|
||||
weave.inline(code, ['N', 'num_inducing', 'output_dim', 'dL_dK', 'dL_dK_small', 'index', 'index2'])
|
||||
return dL_dK_small
|
||||
|
||||
dkappa = np.diag(dL_dK_small)
|
||||
dL_dK_small += dL_dK_small.T
|
||||
dW = (self.W[:, None, :]*dL_dK_small[:, :, None]).sum(0)
|
||||
|
||||
self.W.gradient = dW
|
||||
self.kappa.gradient = dkappa
|
||||
def _gradient_reduce_numpy(self, dL_dK, index, index2):
|
||||
index, index2 = index[:,0], index2[:,0]
|
||||
dL_dK_small = np.zeros_like(self.B)
|
||||
for i in range(self.output_dim):
|
||||
tmp1 = dL_dK[index==i]
|
||||
for j in range(self.output_dim):
|
||||
dL_dK_small[j,i] = tmp1[:,index2==j].sum()
|
||||
return dL_dK_small
|
||||
|
||||
def update_gradients_diag(self, dL_dKdiag, X):
|
||||
index = np.asarray(X, dtype=np.int).flatten()
|
||||
|
|
|
|||
|
|
@ -1,30 +0,0 @@
|
|||
try:
|
||||
import sympy as sym
|
||||
sympy_available=True
|
||||
except ImportError:
|
||||
sympy_available=False
|
||||
|
||||
import numpy as np
|
||||
from symbolic import Symbolic
|
||||
|
||||
class Eq(Symbolic):
|
||||
"""
|
||||
The exponentiated quadratic covariance as a symbolic function.
|
||||
|
||||
"""
|
||||
def __init__(self, input_dim, output_dim=1, variance=1.0, lengthscale=1.0, name='Eq'):
|
||||
|
||||
parameters = {'variance' : variance, 'lengthscale' : lengthscale}
|
||||
x = sym.symbols('x_:' + str(input_dim))
|
||||
z = sym.symbols('z_:' + str(input_dim))
|
||||
variance = sym.var('variance',positive=True)
|
||||
lengthscale = sym.var('lengthscale', positive=True)
|
||||
dist_string = ' + '.join(['(x_%i - z_%i)**2' %(i, i) for i in range(input_dim)])
|
||||
from sympy.parsing.sympy_parser import parse_expr
|
||||
dist = parse_expr(dist_string)
|
||||
|
||||
# this is the covariance function
|
||||
f = variance*sym.exp(-dist/(2*lengthscale**2))
|
||||
# extra input dim is to signify the output dimension.
|
||||
super(Eq, self).__init__(input_dim=input_dim, k=f, output_dim=output_dim, parameters=parameters, name=name)
|
||||
|
||||
|
|
@ -1,30 +0,0 @@
|
|||
try:
|
||||
import sympy as sym
|
||||
sympy_available=True
|
||||
except ImportError:
|
||||
sympy_available=False
|
||||
|
||||
import numpy as np
|
||||
from symbolic import Symbolic
|
||||
|
||||
class Heat_eqinit(Symbolic):
|
||||
"""
|
||||
A symbolic covariance based on laying down an initial condition of the heat equation with an exponentiated quadratic covariance. The covariance then has multiple outputs which are interpreted as observations of the diffused process with different diffusion coefficients (or at different times).
|
||||
|
||||
"""
|
||||
def __init__(self, input_dim, output_dim=1, param=None, name='Heat_eqinit'):
|
||||
|
||||
x = sym.symbols('x_:' + str(input_dim))
|
||||
z = sym.symbols('z_:' + str(input_dim))
|
||||
scale = sym.var('scale_i scale_j',positive=True)
|
||||
lengthscale = sym.var('lengthscale_i lengthscale_j', positive=True)
|
||||
shared_lengthscale = sym.var('shared_lengthscale', positive=True)
|
||||
dist_string = ' + '.join(['(x_%i - z_%i)**2' %(i, i) for i in range(input_dim)])
|
||||
from sympy.parsing.sympy_parser import parse_expr
|
||||
dist = parse_expr(dist_string)
|
||||
|
||||
# this is the covariance function
|
||||
f = scale_i*scale_j*sym.exp(-dist/(2*(shared_lengthscale**2 + lengthscale_i*lengthscale_j)))
|
||||
# extra input dim is to signify the output dimension.
|
||||
super(Heat_eqinit, self).__init__(input_dim=input_dim+1, k=f, output_dim=output_dim, name=name)
|
||||
|
||||
|
|
@ -6,6 +6,7 @@ import numpy as np
|
|||
from ...core.parameterization.parameterized import Parameterized
|
||||
from kernel_slice_operations import KernCallsViaSlicerMeta
|
||||
from ...util.caching import Cache_this
|
||||
from GPy.core.parameterization.observable_array import ObsAr
|
||||
|
||||
|
||||
|
||||
|
|
@ -54,6 +55,20 @@ class Kern(Parameterized):
|
|||
|
||||
self._sliced_X = 0
|
||||
self.useGPU = self._support_GPU and useGPU
|
||||
self._return_psi2_n_flag = ObsAr(np.zeros(1)).astype(bool)
|
||||
|
||||
@property
|
||||
def return_psi2_n(self):
|
||||
"""
|
||||
Flag whether to pass back psi2 as NxMxM or MxM, by summing out N.
|
||||
"""
|
||||
return self._return_psi2_n_flag[0]
|
||||
@return_psi2_n.setter
|
||||
def return_psi2_n(self, val):
|
||||
def visit(self):
|
||||
if isinstance(self, Kern):
|
||||
self._return_psi2_n_flag[0]=val
|
||||
self.traverse(visit)
|
||||
|
||||
@Cache_this(limit=20)
|
||||
def _slice_X(self, X):
|
||||
|
|
@ -163,6 +178,10 @@ class Kern(Parameterized):
|
|||
""" Here we overload the '*' operator. See self.prod for more information"""
|
||||
return self.prod(other)
|
||||
|
||||
def __imul__(self, other):
|
||||
""" Here we overload the '*' operator. See self.prod for more information"""
|
||||
return self.prod(other)
|
||||
|
||||
def __pow__(self, other):
|
||||
"""
|
||||
Shortcut for tensor `prod`.
|
||||
|
|
@ -183,7 +202,7 @@ class Kern(Parameterized):
|
|||
:type tensor: bool
|
||||
|
||||
"""
|
||||
assert isinstance(other, Kern), "only kernels can be added to kernels..."
|
||||
assert isinstance(other, Kern), "only kernels can be multiplied to kernels..."
|
||||
from prod import Prod
|
||||
#kernels = []
|
||||
#if isinstance(self, Prod): kernels.extend(self.parameters)
|
||||
|
|
|
|||
|
|
@ -3,7 +3,6 @@
|
|||
|
||||
import numpy as np
|
||||
from kern import Kern
|
||||
from ...util.misc import param_to_array
|
||||
from ...core.parameterization import Param
|
||||
from ...core.parameterization.transformations import Logexp
|
||||
class Poly(Kern):
|
||||
|
|
|
|||
|
|
@ -18,7 +18,12 @@ class Prod(CombinationKernel):
|
|||
|
||||
"""
|
||||
def __init__(self, kernels, name='mul'):
|
||||
assert len(kernels) == 2, 'only implemented for two kernels as of yet'
|
||||
for i, kern in enumerate(kernels[:]):
|
||||
if isinstance(kern, Prod):
|
||||
del kernels[i]
|
||||
for part in kern.parts[::-1]:
|
||||
kern.unlink_parameter(part)
|
||||
kernels.insert(i, part)
|
||||
super(Prod, self).__init__(kernels, name)
|
||||
|
||||
@Cache_this(limit=2, force_kwargs=['which_parts'])
|
||||
|
|
@ -37,25 +42,25 @@ class Prod(CombinationKernel):
|
|||
return reduce(np.multiply, (p.Kdiag(X) for p in which_parts))
|
||||
|
||||
def update_gradients_full(self, dL_dK, X, X2=None):
|
||||
for k1,k2 in itertools.combinations(self.parts, 2):
|
||||
k1.update_gradients_full(dL_dK*k2.K(X, X2), X, X2)
|
||||
k2.update_gradients_full(dL_dK*k1.K(X, X2), X, X2)
|
||||
k = self.K(X,X2)*dL_dK
|
||||
for p in self.parts:
|
||||
p.update_gradients_full(k/p.K(X,X2),X,X2)
|
||||
|
||||
def update_gradients_diag(self, dL_dKdiag, X):
|
||||
for k1,k2 in itertools.combinations(self.parts, 2):
|
||||
k1.update_gradients_diag(dL_dKdiag*k2.Kdiag(X), X)
|
||||
k2.update_gradients_diag(dL_dKdiag*k1.Kdiag(X), X)
|
||||
k = self.Kdiag(X)*dL_dKdiag
|
||||
for p in self.parts:
|
||||
p.update_gradients_diag(k/p.Kdiag(X),X)
|
||||
|
||||
def gradients_X(self, dL_dK, X, X2=None):
|
||||
target = np.zeros(X.shape)
|
||||
for k1,k2 in itertools.combinations(self.parts, 2):
|
||||
target += k1.gradients_X(dL_dK*k2.K(X, X2), X, X2)
|
||||
target += k2.gradients_X(dL_dK*k1.K(X, X2), X, X2)
|
||||
k = self.K(X,X2)*dL_dK
|
||||
for p in self.parts:
|
||||
target += p.gradients_X(k/p.K(X,X2),X,X2)
|
||||
return target
|
||||
|
||||
def gradients_X_diag(self, dL_dKdiag, X):
|
||||
target = np.zeros(X.shape)
|
||||
for k1,k2 in itertools.combinations(self.parts, 2):
|
||||
target += k1.gradients_X_diag(dL_dKdiag*k2.Kdiag(X), X)
|
||||
target += k2.gradients_X_diag(dL_dKdiag*k1.Kdiag(X), X)
|
||||
k = self.Kdiag(X)*dL_dKdiag
|
||||
for p in self.parts:
|
||||
target += p.gradients_X_diag(k/p.Kdiag(X),X)
|
||||
return target
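The rewritten gradient methods all rest on the same identity for a product kernel K = prod_i K_i: since dK/dtheta_j = (K / K_j) * dK_j/dtheta_j, each part simply receives dL_dK * K / K_j (valid wherever K_j is nonzero). A quick numerical check of that identity using plain numpy toy matrices, not GPy kernel objects:

import numpy as np

X = np.random.randn(5, 1)
r2 = (X - X.T) ** 2
K1 = lambda a: a * np.exp(-r2)            # toy part with parameter a (its "variance")
K2 = lambda b: np.exp(-r2 / b)            # toy part with parameter b

a, b, eps = 1.3, 0.7, 1e-6
K = K1(a) * K2(b)

dK_da_chain = (K / K1(a)) * np.exp(-r2)   # (K / K1) * dK1/da
dK_da_fd = (K1(a + eps) * K2(b) - K) / eps
assert np.allclose(dK_da_chain, dK_da_fd, atol=1e-5)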
|
||||
|
|
|
|||
|
|
@ -10,7 +10,6 @@ import sslinear_psi_comp
|
|||
import linear_psi_comp
|
||||
|
||||
class PSICOMP_RBF(Pickleable):
|
||||
|
||||
@Cache_this(limit=2, ignore_args=(0,))
|
||||
def psicomputations(self, variance, lengthscale, Z, variational_posterior):
|
||||
if isinstance(variational_posterior, variational.NormalPosterior):
|
||||
|
|
|
|||
|
|
@ -139,8 +139,7 @@ def _psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S):
|
|||
denom2 = np.square(denom)
|
||||
|
||||
_psi2 = _psi2computations(variance, lengthscale, Z, mu, S) # NxMxM
|
||||
|
||||
Lpsi2 = dL_dpsi2[None,:,:]*_psi2
|
||||
Lpsi2 = dL_dpsi2*_psi2 # dL_dpsi2 is MxM, using broadcast to multiply N out
|
||||
Lpsi2sum = np.einsum('nmo->n',Lpsi2) #N
|
||||
Lpsi2Z = np.einsum('nmo,oq->nq',Lpsi2,Z) #NxQ
|
||||
Lpsi2Z2 = np.einsum('nmo,oq,oq->nq',Lpsi2,Z,Z) #NxQ
|
||||
|
|
|
|||
|
|
@ -8,7 +8,8 @@ from ...core.parameterization.transformations import Logexp
|
|||
from ...util.linalg import tdot
|
||||
from ... import util
|
||||
import numpy as np
|
||||
from scipy import integrate
|
||||
from scipy import integrate, weave
|
||||
from ...util.config import config # for assessing whether to use weave
|
||||
from ...util.caching import Cache_this
|
||||
|
||||
class Stationary(Kern):
|
||||
|
|
@ -71,6 +72,13 @@ class Stationary(Kern):
|
|||
|
||||
@Cache_this(limit=5, ignore_args=())
|
||||
def K(self, X, X2=None):
|
||||
"""
|
||||
Kernel function applied on inputs X and X2.
|
||||
In the stationary case there is an inner function depending on the
|
||||
distances from X to X2, called r.
|
||||
|
||||
K(X, X2) = K_of_r(r), with r the lengthscale-scaled Euclidean distance between X and X2
|
||||
"""
|
||||
r = self._scaled_dist(X, X2)
|
||||
return self.K_of_r(r)
|
||||
|
||||
|
|
@ -124,10 +132,22 @@ class Stationary(Kern):
|
|||
return ret
|
||||
|
||||
def update_gradients_diag(self, dL_dKdiag, X):
|
||||
"""
|
||||
Given the derivative of the objective with respect to the diagonal of
|
||||
the covariance matrix, compute the derivative wrt the parameters of
|
||||
this kernel and store it in the <parameter>.gradient field.
|
||||
|
||||
See also update_gradients_full
|
||||
"""
|
||||
self.variance.gradient = np.sum(dL_dKdiag)
|
||||
self.lengthscale.gradient = 0.
|
||||
|
||||
def update_gradients_full(self, dL_dK, X, X2=None):
|
||||
"""
|
||||
Given the derivative of the objective wrt the covariance matrix
|
||||
(dL_dK), compute the gradient wrt the parameters of this kernel,
|
||||
and store in the parameters object as e.g. self.variance.gradient
|
||||
"""
|
||||
self.variance.gradient = np.einsum('ij,ij,i', self.K(X, X2), dL_dK, 1./self.variance)
|
||||
|
||||
#now the lengthscale gradient(s)
|
||||
|
|
@ -139,6 +159,16 @@ class Stationary(Kern):
|
|||
#self.lengthscale.gradient = -((dL_dr*rinv)[:,:,None]*x_xl3).sum(0).sum(0)/self.lengthscale**3
|
||||
tmp = dL_dr*self._inv_dist(X, X2)
|
||||
if X2 is None: X2 = X
|
||||
|
||||
|
||||
if config.getboolean('weave', 'working'):
|
||||
try:
|
||||
self.lengthscale.gradient = self.weave_lengthscale_grads(tmp, X, X2)
|
||||
except:
|
||||
print "\n Weave compilation failed. Falling back to (slower) numpy implementation\n"
|
||||
config.set('weave', 'working', 'False')
|
||||
self.lengthscale.gradient = np.array([np.einsum('ij,ij,...', tmp, np.square(X[:,q:q+1] - X2[:,q:q+1].T), -1./self.lengthscale[q]**3) for q in xrange(self.input_dim)])
|
||||
else:
|
||||
self.lengthscale.gradient = np.array([np.einsum('ij,ij,...', tmp, np.square(X[:,q:q+1] - X2[:,q:q+1].T), -1./self.lengthscale[q]**3) for q in xrange(self.input_dim)])
|
||||
else:
|
||||
r = self._scaled_dist(X, X2)
|
||||
|
|
@ -154,10 +184,43 @@ class Stationary(Kern):
|
|||
dist = self._scaled_dist(X, X2).copy()
|
||||
return 1./np.where(dist != 0., dist, np.inf)
|
||||
|
||||
def weave_lengthscale_grads(self, tmp, X, X2):
|
||||
"""Use scipy.weave to compute derivatives wrt the lengthscales"""
|
||||
N,M = tmp.shape
|
||||
Q = X.shape[1]
|
||||
if hasattr(X, 'values'):X = X.values
|
||||
if hasattr(X2, 'values'):X2 = X2.values
|
||||
grads = np.zeros(self.input_dim)
|
||||
code = """
|
||||
double gradq;
|
||||
for(int q=0; q<Q; q++){
|
||||
gradq = 0;
|
||||
for(int n=0; n<N; n++){
|
||||
for(int m=0; m<M; m++){
|
||||
gradq += tmp(n,m)*(X(n,q)-X2(m,q))*(X(n,q)-X2(m,q));
|
||||
}
|
||||
}
|
||||
grads[q] = gradq;
|
||||
}
|
||||
"""
|
||||
weave.inline(code, ['tmp', 'X', 'X2', 'grads', 'N', 'M', 'Q'], type_converters=weave.converters.blitz, support_code="#include <math.h>")
|
||||
return -grads/self.lengthscale**3
|
||||
|
||||
def gradients_X(self, dL_dK, X, X2=None):
|
||||
"""
|
||||
Given the derivative of the objective wrt K (dL_dK), compute the derivative wrt X
|
||||
"""
|
||||
if config.getboolean('weave', 'working'):
|
||||
try:
|
||||
return self.gradients_X_weave(dL_dK, X, X2)
|
||||
except:
|
||||
print "\n Weave compilation failed. Falling back to (slower) numpy implementation\n"
|
||||
config.set('weave', 'working', 'False')
|
||||
return self.gradients_X_(dL_dK, X, X2)
|
||||
else:
|
||||
return self.gradients_X_(dL_dK, X, X2)
|
||||
|
||||
def gradients_X_(self, dL_dK, X, X2=None):
|
||||
invdist = self._inv_dist(X, X2)
|
||||
dL_dr = self.dK_dr_via_X(X, X2) * dL_dK
|
||||
tmp = invdist*dL_dr
|
||||
|
|
@ -171,11 +234,40 @@ class Stationary(Kern):
|
|||
|
||||
#the lower memory way with a loop
|
||||
ret = np.empty(X.shape, dtype=np.float64)
|
||||
[np.sum(tmp*(X[:,q][:,None]-X2[:,q][None,:]), axis=1, out=ret[:,q]) for q in xrange(self.input_dim)]
|
||||
for q in xrange(self.input_dim):
|
||||
np.sum(tmp*(X[:,q][:,None]-X2[:,q][None,:]), axis=1, out=ret[:,q])
|
||||
ret /= self.lengthscale**2
|
||||
|
||||
return ret
|
||||
|
||||
def gradients_X_weave(self, dL_dK, X, X2=None):
|
||||
invdist = self._inv_dist(X, X2)
|
||||
dL_dr = self.dK_dr_via_X(X, X2) * dL_dK
|
||||
tmp = invdist*dL_dr
|
||||
if X2 is None:
|
||||
tmp = tmp + tmp.T
|
||||
X2 = X
|
||||
|
||||
ret = np.zeros(X.shape)
|
||||
code = """
|
||||
int n,q,d;
|
||||
double retnd;
|
||||
for(n=0;n<N;n++){
|
||||
for(d=0;d<D;d++){
|
||||
retnd = 0;
|
||||
for(q=0;q<Q;q++){
|
||||
retnd += tmp(n,q)*(X(n,d)-X2(q,d));
|
||||
}
|
||||
ret(n,d) = retnd;
|
||||
}
|
||||
}
|
||||
"""
|
||||
from scipy import weave
|
||||
N,D = X.shape
|
||||
Q = tmp.shape[1]
|
||||
weave.inline(code, ['ret', 'N', 'D', 'Q', 'tmp', 'X', 'X2'], type_converters=weave.converters.blitz)
|
||||
return ret/self.lengthscale**2
|
||||
|
||||
def gradients_X_diag(self, dL_dKdiag, X):
|
||||
return np.zeros(X.shape)
|
||||
|
||||
|
|
@ -309,6 +401,19 @@ class Matern52(Stationary):
|
|||
|
||||
|
||||
class ExpQuad(Stationary):
|
||||
"""
|
||||
The Exponentiated quadratic covariance function.
|
||||
|
||||
.. math::
|
||||
|
||||
k(r) = \sigma^2 (1 + \sqrt{5} r + \\frac53 r^2) \exp(- \sqrt{5} r)
|
||||
|
||||
notes::
|
||||
- Yes, this is exactly the same as the RBF covariance function, but the
|
||||
RBF implementation also has some features for doing variational kernels
|
||||
(the psi-statistics).
|
||||
|
||||
"""
|
||||
def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False, active_dims=None, name='ExpQuad'):
|
||||
super(ExpQuad, self).__init__(input_dim, variance, lengthscale, ARD, active_dims, name)
|
||||
|
||||
|
|
|
|||
|
|
@ -6,18 +6,3 @@ from poisson import Poisson
|
|||
from student_t import StudentT
|
||||
from likelihood import Likelihood
|
||||
from mixed_noise import MixedNoise
|
||||
#TODO need to fix this in a config file.
|
||||
#TODO need to add the files to the git repo!
|
||||
try:
|
||||
import sympy as sym
|
||||
sympy_available=True
|
||||
except ImportError:
|
||||
sympy_available=False
|
||||
if sympy_available:
|
||||
#These are likelihoods that rely on symbolic.
|
||||
from symbolic import Symbolic
|
||||
from sstudent_t import SstudentT
|
||||
from negative_binomial import Negative_binomial
|
||||
from skew_normal import Skew_normal
|
||||
from skew_exponential import Skew_exponential
|
||||
# from null_category import Null_category
|
||||
|
|
|
|||
|
|
@ -113,10 +113,8 @@ class Bernoulli(Likelihood):
|
|||
.. Note:
|
||||
Each y_i must be in {0, 1}
|
||||
"""
|
||||
assert np.atleast_1d(inv_link_f).shape == np.atleast_1d(y).shape
|
||||
#objective = (inv_link_f**y) * ((1.-inv_link_f)**(1.-y))
|
||||
objective = np.where(y, inv_link_f, 1.-inv_link_f)
|
||||
return np.exp(np.sum(np.log(objective)))
|
||||
return np.where(y, inv_link_f, 1.-inv_link_f)
|
||||
|
||||
def logpdf_link(self, inv_link_f, y, Y_metadata=None):
|
||||
"""
|
||||
|
|
@ -133,13 +131,9 @@ class Bernoulli(Likelihood):
|
|||
:returns: log likelihood evaluated at points inverse link of f.
|
||||
:rtype: float
|
||||
"""
|
||||
assert np.atleast_1d(inv_link_f).shape == np.atleast_1d(y).shape
|
||||
#objective = y*np.log(inv_link_f) + (1.-y)*np.log(inv_link_f)
|
||||
state = np.seterr(divide='ignore')
|
||||
# TODO check y \in {0, 1} or {-1, 1}
|
||||
objective = np.where(y==1, np.log(inv_link_f), np.log(1-inv_link_f))
|
||||
np.seterr(**state)
|
||||
return np.sum(objective)
|
||||
p = np.where(y==1, inv_link_f, 1.-inv_link_f)
|
||||
return np.log(p)
|
||||
|
||||
def dlogpdf_dlink(self, inv_link_f, y, Y_metadata=None):
|
||||
"""
|
||||
|
|
@ -156,13 +150,10 @@ class Bernoulli(Likelihood):
|
|||
:returns: gradient of log likelihood evaluated at points inverse link of f.
|
||||
:rtype: Nx1 array
|
||||
"""
|
||||
assert np.atleast_1d(inv_link_f).shape == np.atleast_1d(y).shape
|
||||
#grad = (y/inv_link_f) - (1.-y)/(1-inv_link_f)
|
||||
state = np.seterr(divide='ignore')
|
||||
# TODO check y \in {0, 1} or {-1, 1}
|
||||
grad = np.where(y, 1./inv_link_f, -1./(1-inv_link_f))
|
||||
np.seterr(**state)
|
||||
return grad
|
||||
#grad = np.where(y, 1./inv_link_f, -1./(1-inv_link_f))
|
||||
denom = np.where(y, inv_link_f, -(1-inv_link_f))
|
||||
return 1./denom
|
||||
|
||||
def d2logpdf_dlink2(self, inv_link_f, y, Y_metadata=None):
|
||||
"""
|
||||
|
|
@ -185,13 +176,12 @@ class Bernoulli(Likelihood):
|
|||
Will return diagonal of hessian, since everywhere else it is 0, as the likelihood factorizes over cases
|
||||
(the distribution for y_i depends only on inverse link of f_i not on inverse link of f_(j!=i))
|
||||
"""
|
||||
assert np.atleast_1d(inv_link_f).shape == np.atleast_1d(y).shape
|
||||
#d2logpdf_dlink2 = -y/(inv_link_f**2) - (1-y)/((1-inv_link_f)**2)
|
||||
state = np.seterr(divide='ignore')
|
||||
# TODO check y \in {0, 1} or {-1, 1}
|
||||
d2logpdf_dlink2 = np.where(y, -1./np.square(inv_link_f), -1./np.square(1.-inv_link_f))
|
||||
np.seterr(**state)
|
||||
return d2logpdf_dlink2
|
||||
#d2logpdf_dlink2 = np.where(y, -1./np.square(inv_link_f), -1./np.square(1.-inv_link_f))
|
||||
arg = np.where(y, inv_link_f, 1.-inv_link_f)
|
||||
return -1./np.square(arg)
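The np.where rewrites above avoid the log(0) and divide-by-zero guards that the removed seterr blocks handled; the same pattern in a standalone form, assuming y is coded as {0, 1}:

import numpy as np

y = np.array([[1], [0], [1]])
p = np.array([[0.9], [0.2], [0.6]])              # inverse-link values in (0, 1)

logpdf   = np.log(np.where(y, p, 1. - p))        # y*log(p) + (1-y)*log(1-p), elementwise
dlogpdf  = 1. / np.where(y, p, -(1. - p))        # y/p - (1-y)/(1-p)
d2logpdf = -1. / np.square(np.where(y, p, 1. - p))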
|
||||
|
||||
def d3logpdf_dlink3(self, inv_link_f, y, Y_metadata=None):
|
||||
"""
|
||||
|
|
|
|||
|
|
@ -5,7 +5,6 @@
|
|||
import numpy as np
|
||||
from scipy import stats,special
|
||||
import scipy as sp
|
||||
from GPy.util.univariate_Gaussian import std_norm_pdf,std_norm_cdf
|
||||
import link_functions
|
||||
from likelihood import Likelihood
|
||||
|
||||
|
|
|
|||
|
|
@ -5,7 +5,6 @@
|
|||
import numpy as np
|
||||
from scipy import stats,special
|
||||
import scipy as sp
|
||||
from ..util.univariate_Gaussian import std_norm_pdf,std_norm_cdf
|
||||
from ..core.parameterization import Param
|
||||
import link_functions
|
||||
from likelihood import Likelihood
|
||||
|
|
|
|||
|
|
@ -13,7 +13,6 @@ James 11/12/13
|
|||
|
||||
import numpy as np
|
||||
from scipy import stats, special
|
||||
from GPy.util.univariate_Gaussian import std_norm_pdf, std_norm_cdf
|
||||
import link_functions
|
||||
from likelihood import Likelihood
|
||||
from ..core.parameterization import Param
|
||||
|
|
|
|||
|
|
@ -4,7 +4,6 @@
|
|||
import numpy as np
|
||||
from scipy import stats,special
|
||||
import scipy as sp
|
||||
from ..util.univariate_Gaussian import std_norm_pdf,std_norm_cdf
|
||||
import link_functions
|
||||
from ..util.misc import chain_1, chain_2, chain_3
|
||||
from scipy.integrate import quad
|
||||
|
|
@ -132,6 +131,56 @@ class Likelihood(Parameterized):
|
|||
|
||||
return z, mean, variance
|
||||
|
||||
def variational_expectations(self, Y, m, v, gh_points=None):
|
||||
"""
|
||||
Use Gauss-Hermite Quadrature to compute
|
||||
|
||||
E_p(f) [ log p(y|f) ]
|
||||
d/dm E_p(f) [ log p(y|f) ]
|
||||
d/dv E_p(f) [ log p(y|f) ]
|
||||
|
||||
where p(f) is a Gaussian with mean m and variance v. The shapes of Y, m and v should match.
|
||||
|
||||
if no gh_points are passed, we construct them using default options
|
||||
"""
|
||||
|
||||
if gh_points is None:
|
||||
gh_x, gh_w = np.polynomial.hermite.hermgauss(12)
|
||||
else:
|
||||
gh_x, gh_w = gh_points
|
||||
|
||||
shape = m.shape
|
||||
m,v,Y = m.flatten(), v.flatten(), Y.flatten()
|
||||
|
||||
#make a grid of points
|
||||
X = gh_x[None,:]*np.sqrt(2.*v[:,None]) + m[:,None]
|
||||
|
||||
#evaluate the likelihood for the grid. The first axis indexes the data (and mu, var) and the second indexes the grid.
|
||||
# broadcast needs to be handled carefully.
|
||||
logp = self.logpdf(X,Y[:,None])
|
||||
dlogp_dx = self.dlogpdf_df(X, Y[:,None])
|
||||
d2logp_dx2 = self.d2logpdf_df2(X, Y[:,None])
|
||||
|
||||
#clipping for numerical stability
|
||||
logp = np.clip(logp,-1e6,1e6)
|
||||
dlogp_dx = np.clip(dlogp_dx,-1e6,1e6)
|
||||
d2logp_dx2 = np.clip(d2logp_dx2,-1e6,1e6)
|
||||
|
||||
#average over the grid to get derivatives of the Gaussian's parameters
|
||||
F = np.dot(logp, gh_w)
|
||||
dF_dm = np.dot(dlogp_dx, gh_w)
|
||||
dF_dv = np.dot(d2logp_dx2, gh_w)/2.
|
||||
|
||||
if np.any(np.isnan(dF_dv)) or np.any(np.isinf(dF_dv)):
|
||||
stop
|
||||
if np.any(np.isnan(dF_dm)) or np.any(np.isinf(dF_dm)):
|
||||
stop
|
||||
|
||||
return F.reshape(*shape), dF_dm.reshape(*shape), dF_dv.reshape(*shape)
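For reference, the rule applied here is the standard Gauss-Hermite identity E_{N(f|m,v)}[g(f)] ~= pi^{-1/2} * sum_i w_i g(m + sqrt(2 v) x_i); a standalone check against a closed form (this sketch is separate from the batched, clipped computation above):

import numpy as np

m, v = 0.3, 0.8
gh_x, gh_w = np.polynomial.hermite.hermgauss(12)

g = lambda f: f ** 2                      # E[f^2] = m^2 + v in closed form
approx = np.dot(g(m + np.sqrt(2. * v) * gh_x), gh_w) / np.sqrt(np.pi)
assert np.isclose(approx, m ** 2 + v)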
|
||||
|
||||
|
||||
|
||||
|
||||
def predictive_mean(self, mu, variance, Y_metadata=None):
|
||||
"""
|
||||
Quadrature calculation of the predictive mean: E(Y_star|Y) = E( E(Y_star|f_star, Y) )
|
||||
|
|
|
|||
|
|
@ -71,7 +71,6 @@ class Probit(GPTransformation):
|
|||
|
||||
g(f) = \\Phi^{-1} (mu)
|
||||
|
||||
|
||||
"""
|
||||
def transf(self,f):
|
||||
return std_norm_cdf(f)
|
||||
|
|
@ -88,6 +87,34 @@ class Probit(GPTransformation):
|
|||
f2 = f**2
|
||||
return -(1/(np.sqrt(2*np.pi)))*np.exp(-0.5*(f2))*(1-f2)
|
||||
|
||||
|
||||
class Cloglog(GPTransformation):
|
||||
"""
|
||||
Complementary log-log link
|
||||
.. math::
|
||||
|
||||
p(f) = 1 - e^{-e^f}
|
||||
|
||||
or
|
||||
|
||||
f = \log (-\log(1-p))
|
||||
|
||||
"""
|
||||
def transf(self,f):
|
||||
return 1-np.exp(-np.exp(f))
|
||||
|
||||
def dtransf_df(self,f):
|
||||
return np.exp(f-np.exp(f))
|
||||
|
||||
def d2transf_df2(self,f):
|
||||
ef = np.exp(f)
|
||||
return -np.exp(f-ef)*(ef-1.)
|
||||
|
||||
def d3transf_df3(self,f):
|
||||
ef = np.exp(f)
|
||||
return np.exp(f-ef)*(1.-3*ef + ef**2)
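A minimal finite-difference sanity check of these complementary log-log derivatives, written against plain numpy rather than the class above (step size and tolerance chosen loosely):

import numpy as np

f = np.linspace(-2., 2., 9)
eps = 1e-6

transf  = lambda f: 1. - np.exp(-np.exp(f))      # p(f) = 1 - exp(-exp(f))
dtransf = lambda f: np.exp(f - np.exp(f))

fd = (transf(f + eps) - transf(f - eps)) / (2. * eps)
assert np.allclose(fd, dtransf(f), atol=1e-5)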
|
||||
|
||||
|
||||
class Log(GPTransformation):
|
||||
"""
|
||||
.. math::
|
||||
|
|
|
|||
|
|
@ -1,6 +1,5 @@
|
|||
import numpy as np
|
||||
from scipy import stats, special
|
||||
from GPy.util.univariate_Gaussian import std_norm_pdf, std_norm_cdf
|
||||
import link_functions
|
||||
from likelihood import Likelihood
|
||||
from gaussian import Gaussian
|
||||
|
|
|
|||
|
|
@ -1,48 +0,0 @@
|
|||
# Copyright (c) 2014 The GPy authors (see AUTHORS.txt)
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
|
||||
try:
|
||||
import sympy as sym
|
||||
sympy_available=True
|
||||
from sympy.utilities.lambdify import lambdify
|
||||
from GPy.util.symbolic import gammaln, logisticln
|
||||
except ImportError:
|
||||
sympy_available=False
|
||||
|
||||
import numpy as np
|
||||
from ..util.univariate_Gaussian import std_norm_pdf, std_norm_cdf
|
||||
import link_functions
|
||||
from symbolic import Symbolic
|
||||
from scipy import stats
|
||||
|
||||
class Negative_binomial(Symbolic):
|
||||
"""
|
||||
Negative binomial
|
||||
|
||||
.. math::
|
||||
p(y_{i}|\pi(f_{i})) = \left(\frac{r}{r+f_i}\right)^r \frac{\Gamma(r+y_i)}{y!\Gamma(r)}\left(\frac{f_i}{r+f_i}\right)^{y_i}
|
||||
|
||||
.. Note::
|
||||
Y takes non zero integer values..
|
||||
link function should have a positive domain, e.g. log (default).
|
||||
|
||||
.. See also::
|
||||
symbolic.py, for the parent class
|
||||
"""
|
||||
def __init__(self, gp_link=None, dispersion=1.0):
|
||||
parameters = {'dispersion':dispersion}
|
||||
if gp_link is None:
|
||||
gp_link = link_functions.Identity()
|
||||
|
||||
dispersion = sym.Symbol('dispersion', positive=True, real=True)
|
||||
y_0 = sym.Symbol('y_0', nonnegative=True, integer=True)
|
||||
f_0 = sym.Symbol('f_0', positive=True, real=True)
|
||||
gp_link = link_functions.Log()
|
||||
log_pdf=dispersion*sym.log(dispersion) - (dispersion+y_0)*sym.log(dispersion+f_0) + gammaln(y_0+dispersion) - gammaln(y_0+1) - gammaln(dispersion) + y_0*sym.log(f_0)
|
||||
#log_pdf= -(dispersion+y)*logisticln(f-sym.log(dispersion)) + gammaln(y+dispersion) - gammaln(y+1) - gammaln(dispersion) + y*(f-sym.log(dispersion))
|
||||
super(Negative_binomial, self).__init__(log_pdf=log_pdf, parameters=parameters, gp_link=gp_link, name='Negative_binomial')
|
||||
|
||||
# TODO: Check this.
|
||||
self.log_concave = False
@ -5,7 +5,6 @@
|
|||
import sympy as sym
|
||||
from GPy.util.symbolic import gammaln, normcdfln, normcdf, IndMatrix, create_matrix
|
||||
import numpy as np
|
||||
from ..util.univariate_Gaussian import std_norm_pdf, std_norm_cdf
|
||||
import link_functions
|
||||
from symbolic import Symbolic
|
||||
from scipy import stats
|
@ -5,7 +5,6 @@ from __future__ import division
|
|||
import numpy as np
|
||||
from scipy import stats,special
|
||||
import scipy as sp
|
||||
from GPy.util.univariate_Gaussian import std_norm_pdf,std_norm_cdf
|
||||
import link_functions
|
||||
from likelihood import Likelihood
|
@ -1,45 +0,0 @@
|
|||
# Copyright (c) 2014 The GPy authors (see AUTHORS.txt)
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
|
||||
import sympy as sym
|
||||
from GPy.util.symbolic import normcdfln
|
||||
import numpy as np
|
||||
from ..util.univariate_Gaussian import std_norm_pdf, std_norm_cdf
|
||||
#from GPy.util.functions import clip_exp
|
||||
import link_functions
|
||||
from symbolic import Symbolic
|
||||
from scipy import stats
|
||||
|
||||
class Skew_exponential(Symbolic):
|
||||
"""
|
||||
Skew exponential distribution.
|
||||
|
||||
.. math::
|
||||
|
||||
.. Note::
|
||||
Y takes real values.
|
||||
link function is identity
|
||||
|
||||
.. See also::
|
||||
symbolic.py, for the parent class
|
||||
"""
|
||||
def __init__(self, gp_link=None, shape=1.0, scale=1.0):
|
||||
parameters={'scale':scale, 'shape':shape}
|
||||
if gp_link is None:
|
||||
gp_link = link_functions.Identity()
|
||||
|
||||
#func_modules = [{'exp':clip_exp}]
|
||||
|
||||
scale = sym.Symbol('scale', positive=True, real=True)
|
||||
shape = sym.Symbol('shape', real=True)
|
||||
y_0 = sym.Symbol('y_0', real=True)
|
||||
f_0 = sym.Symbol('f_0', real=True)
|
||||
log_pdf=sym.log(shape)-sym.log(scale)-((y_0-f_0)/scale) + normcdfln(shape*(y_0-f_0)/scale)
|
||||
super(Skew_exponential, self).__init__(log_pdf=log_pdf, gp_link=gp_link, name='Skew_exponential', parameters=parameters)
|
||||
|
||||
# TODO: Check this.
|
||||
self.log_concave = True
@ -1,53 +0,0 @@
|
|||
# Copyright (c) 2014 The GPy authors (see AUTHORS.txt)
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
|
||||
try:
|
||||
import sympy as sym
|
||||
sympy_available=True
|
||||
from sympy.utilities.lambdify import lambdify
|
||||
from GPy.util.symbolic import normcdfln, normcdf
|
||||
except ImportError:
|
||||
sympy_available=False
|
||||
|
||||
import numpy as np
|
||||
from ..util.univariate_Gaussian import std_norm_pdf, std_norm_cdf
|
||||
from GPy.util.functions import clip_exp
|
||||
import link_functions
|
||||
from symbolic import Symbolic
|
||||
from scipy import stats
|
||||
|
||||
class Skew_normal(Symbolic):
|
||||
"""
|
||||
Skew Normal distribution.
|
||||
|
||||
.. math::
|
||||
|
||||
.. Note::
|
||||
Y takes real values.
|
||||
link function is identity
|
||||
|
||||
.. See also::
|
||||
symbolic.py, for the parent class
|
||||
"""
|
||||
def __init__(self, gp_link=None, shape=1.0, scale=1.0):
|
||||
parameters = {'scale': scale, 'shape':shape}
|
||||
if gp_link is None:
|
||||
gp_link = link_functions.Identity()
|
||||
|
||||
# # this likelihood has severe problems with likelihoods saturating exponentials, so clip_exp is used in place of the true exp as a solution for dealing with the numerics.
|
||||
# func_modules = [{'exp':clip_exp}]
|
||||
func_modules = []
|
||||
|
||||
scale = sym.Symbol('scale', positive=True, real=True)
|
||||
shape = sym.Symbol('shape', real=True)
|
||||
y_0 = sym.Symbol('y_0', real=True)
|
||||
f_0 = sym.Symbol('f_0', real=True)
|
||||
log_pdf=-sym.log(scale)-1./2*sym.log(2*sym.pi)-1./2*((y_0-f_0)/scale)**2 + sym.log(2) + normcdfln(shape*(y_0-f_0)/scale)
|
||||
super(Skew_normal, self).__init__(log_pdf=log_pdf, parameters=parameters, gp_link=gp_link, name='Skew_normal', func_modules=func_modules)
|
||||
|
||||
self.log_concave = True
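The log_pdf above is the skew-normal density with location f_0 and the given scale and shape; a minimal scipy cross-check (sketch):

    import numpy as np
    from scipy.stats import norm, skewnorm

    shape, scale, f = 1.5, 0.7, 0.3
    y = np.linspace(-3, 3, 50)
    log_pdf = (-np.log(scale) - 0.5 * np.log(2 * np.pi) - 0.5 * ((y - f) / scale) ** 2
               + np.log(2) + norm.logcdf(shape * (y - f) / scale))
    assert np.allclose(log_pdf, skewnorm.logpdf(y, shape, loc=f, scale=scale))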
@ -1,45 +0,0 @@
|
|||
# Copyright (c) 2014 The GPy authors (see AUTHORS.txt)
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
|
||||
import sympy as sym
|
||||
from sympy.utilities.lambdify import lambdify
|
||||
from GPy.util.symbolic import gammaln
|
||||
|
||||
import numpy as np
|
||||
import link_functions
|
||||
from symbolic import Symbolic
|
||||
from scipy import stats
|
||||
|
||||
class SstudentT(Symbolic):
|
||||
"""
|
||||
Symbolic variant of the Student-t distribution.
|
||||
|
||||
.. math::
|
||||
|
||||
.. Note::
|
||||
Y takes real values.
|
||||
link function is identity
|
||||
|
||||
.. See also::
|
||||
symbolic.py, for the parent class
|
||||
"""
|
||||
def __init__(self, gp_link=None, deg_free=5.0, t_scale2=1.0):
|
||||
parameters = {'deg_free':deg_free, 't_scale2':t_scale2}
|
||||
if gp_link is None:
|
||||
gp_link = link_functions.Identity()
|
||||
|
||||
# this likelihood has severe problems with likelihoods saturating ...
|
||||
y_0 = sym.Symbol('y_0', real=True)
|
||||
f_0 = sym.Symbol('f_0', real=True)
|
||||
nu = sym.Symbol('nu', positive=True, real=True)
|
||||
t_scale2 = sym.Symbol('t_scale2', positive=True, real=True)
|
||||
log_pdf = (gammaln((nu + 1) * 0.5)
|
||||
- gammaln(nu * 0.5)
|
||||
- 0.5*sym.log(t_scale2 * nu * sym.pi)
|
||||
- 0.5*(nu + 1)*sym.log(1 + (1/nu)*(((y_0-f_0)**2)/t_scale2)))
|
||||
super(SstudentT, self).__init__(log_pdf=log_pdf, parameters=parameters, gp_link=gp_link, name='SstudentT')
|
||||
self.log_concave = False
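The log_pdf above is the standard Student-t with scale sqrt(t_scale2); a quick scipy cross-check (sketch):

    import numpy as np
    from scipy.special import gammaln
    from scipy.stats import t

    nu, s2, f = 5.0, 1.3, 0.2
    y = np.linspace(-4, 4, 50)
    log_pdf = (gammaln((nu + 1) * 0.5) - gammaln(nu * 0.5)
               - 0.5 * np.log(s2 * nu * np.pi)
               - 0.5 * (nu + 1) * np.log(1 + ((y - f) ** 2) / (nu * s2)))
    assert np.allclose(log_pdf, t.logpdf(y, df=nu, loc=f, scale=np.sqrt(s2)))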
@ -1,279 +0,0 @@
|
|||
# Copyright (c) 2014 GPy Authors
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
import sympy as sym
|
||||
import numpy as np
|
||||
from likelihood import Likelihood
|
||||
from ..core.symbolic import Symbolic_core
|
||||
|
||||
|
||||
class Symbolic(Likelihood, Symbolic_core):
|
||||
"""
|
||||
Symbolic likelihood.
|
||||
|
||||
Likelihood where the form of the likelihood is provided by a sympy expression.
|
||||
|
||||
"""
|
||||
def __init__(self, log_pdf=None, logZ=None, missing_log_pdf=None, gp_link=None, name='symbolic', log_concave=False, parameters=None, func_modules=[]):
|
||||
|
||||
if gp_link is None:
|
||||
gp_link = link_functions.Identity()
|
||||
|
||||
if log_pdf is None:
|
||||
raise ValueError, "You must provide an argument for the log pdf."
|
||||
|
||||
Likelihood.__init__(self, gp_link, name=name)
|
||||
functions = {'log_pdf':log_pdf}
|
||||
self.cacheable = ['F', 'Y']
|
||||
|
||||
self.missing_data = False
|
||||
if missing_log_pdf:
|
||||
self.missing_data = True
|
||||
functions['missing_log_pdf']=missing_log_pdf
|
||||
|
||||
self.ep_analytic = False
|
||||
if logZ:
|
||||
self.ep_analytic = True
|
||||
functions['logZ'] = logZ
|
||||
self.cacheable += ['M', 'V']
|
||||
|
||||
Symbolic_core.__init__(self, functions, cacheable=self.cacheable, derivatives = ['F', 'theta'], parameters=parameters, func_modules=func_modules)
|
||||
|
||||
# TODO: Is there an easy way to check whether the pdf is log concave?
|
||||
self.log_concave = log_concave
|
||||
|
||||
|
||||
|
||||
def _set_derivatives(self, derivatives):
|
||||
# these are arguments for computing derivatives.
|
||||
print "Whoop"
|
||||
Symbolic_core._set_derivatives(self, derivatives)
|
||||
|
||||
# add second and third derivatives for Laplace approximation.
|
||||
derivative_arguments = []
|
||||
if derivatives is not None:
|
||||
for derivative in derivatives:
|
||||
derivative_arguments += self.variables[derivative]
|
||||
exprs = ['log_pdf']
|
||||
if self.missing_data:
|
||||
exprs.append('missing_log_pdf')
|
||||
for expr in exprs:
|
||||
self.expressions[expr]['second_derivative'] = {theta.name : self.stabilize(sym.diff(self.expressions[expr]['derivative']['f_0'], theta)) for theta in derivative_arguments}
|
||||
self.expressions[expr]['third_derivative'] = {theta.name : self.stabilize(sym.diff(self.expressions[expr]['second_derivative']['f_0'], theta)) for theta in derivative_arguments}
|
||||
if self.ep_analytic:
|
||||
derivative_arguments = [M]
|
||||
# add second derivative for EP
|
||||
exprs = ['logZ']
|
||||
if self.missing_data:
|
||||
exprs.append('missing_logZ')
|
||||
for expr in exprs:
|
||||
self.expressions[expr]['second_derivative'] = {theta.name : self.stabilize(sym.diff(self.expressions[expr]['derivative'], theta)) for theta in derivative_arguments}
|
||||
|
||||
|
||||
def eval_update_cache(self, Y, **kwargs):
|
||||
# TODO: place checks for inf/nan in here
|
||||
# for all provided keywords
|
||||
Symbolic_core.eval_update_cache(self, Y=Y, **kwargs)
|
||||
# Y = np.atleast_2d(Y)
|
||||
# for variable, code in sorted(self.code['parameters_changed'].iteritems()):
|
||||
# self._set_attribute(variable, eval(code, self.namespace))
|
||||
# for i, theta in enumerate(self.variables['Y']):
|
||||
# missing = np.isnan(Y[:, i])
|
||||
# self._set_attribute('missing_' + str(i), missing)
|
||||
# self._set_attribute(theta.name, value[missing, i][:, None])
|
||||
# for variable, value in kwargs.items():
|
||||
# # update their cached values
|
||||
# if value is not None:
|
||||
# if variable == 'F' or variable == 'M' or variable == 'V' or variable == 'Y_metadata':
|
||||
# for i, theta in enumerate(self.variables[variable]):
|
||||
# self._set_attribute(theta.name, value[:, i][:, None])
|
||||
# else:
|
||||
# self._set_attribute(theta.name, value[:, i])
|
||||
# for variable, code in sorted(self.code['update_cache'].iteritems()):
|
||||
# self._set_attribute(variable, eval(code, self.namespace))
|
||||
|
||||
|
||||
def parameters_changed(self):
|
||||
pass
|
||||
|
||||
def update_gradients(self, grads):
|
||||
"""
|
||||
Pull out the gradients, be careful as the order must match the order
|
||||
in which the parameters are added
|
||||
"""
|
||||
# The way the Laplace approximation is run requires the
|
||||
# covariance function to compute the true gradient (because it
|
||||
# is dependent on the mode). This means we actually compute
|
||||
# the gradient outside this object. This function would
|
||||
# normally ask the object to update its gradients internally,
|
||||
# but here it provides them externally, because they are
|
||||
# computed in the inference code. TODO: Thought: How does this
|
||||
# effect EP? Shouldn't this be done by a separate
|
||||
# Laplace-approximation specific call?
|
||||
for theta, grad in zip(self.variables['theta'], grads):
|
||||
parameter = getattr(self, theta.name)
|
||||
setattr(parameter, 'gradient', grad)
|
||||
|
||||
def pdf_link(self, f, y, Y_metadata=None):
|
||||
"""
|
||||
Likelihood function given inverse link of f.
|
||||
|
||||
:param f: inverse link of latent variables.
|
||||
:type f: Nx1 array
|
||||
:param y: data
|
||||
:type y: Nx1 array
|
||||
:param Y_metadata: Y_metadata which is not used in student t distribution
|
||||
:returns: likelihood evaluated for this point
|
||||
:rtype: float
|
||||
"""
|
||||
return np.exp(self.logpdf_link(f, y, Y_metadata=Y_metadata))
|
||||
|
||||
def logpdf_link(self, f, y, Y_metadata=None):
|
||||
"""
|
||||
Log Likelihood Function given inverse link of latent variables.
|
||||
|
||||
:param f: latent variables (inverse link of f)
|
||||
:type f: Nx1 array
|
||||
:param y: data
|
||||
:type y: Nx1 array
|
||||
:param Y_metadata: Y_metadata
|
||||
:returns: likelihood evaluated for this point
|
||||
:rtype: float
|
||||
|
||||
"""
|
||||
assert np.atleast_1d(f).shape == np.atleast_1d(y).shape
|
||||
if self.missing_data:
|
||||
missing_flag = np.isnan(y)
|
||||
not_missing_flag = np.logical_not(missing_flag)
|
||||
ll = self.eval_function('missing_log_pdf', F=f[missing_flag]).sum()
|
||||
ll += self.eval_function('log_pdf', F=f[not_missing_flag], Y=y[not_missing_flag], Y_metadata=Y_metadata[not_missing_flag]).sum()
|
||||
else:
|
||||
ll = self.eval_function('log_pdf', F=f, Y=y, Y_metadata=Y_metadata).sum()
|
||||
|
||||
return ll
|
||||
|
||||
def dlogpdf_dlink(self, f, y, Y_metadata=None):
|
||||
"""
|
||||
Gradient of log likelihood with respect to the inverse link function.
|
||||
|
||||
:param f: latent variables (inverse link of f)
|
||||
:type f: Nx1 array
|
||||
:param y: data
|
||||
:type y: Nx1 array
|
||||
:param Y_metadata: Y_metadata
|
||||
:returns: gradient of likelihood with respect to each point.
|
||||
:rtype: Nx1 array
|
||||
|
||||
"""
|
||||
assert np.atleast_1d(f).shape == np.atleast_1d(y).shape
|
||||
self.eval_update_cache(F=f, Y=y, Y_metadata=Y_metadata)
|
||||
if self.missing_data:
|
||||
return np.where(np.isnan(y),
|
||||
eval(self.code['missing_log_pdf']['derivative']['f_0'], self.namespace),
|
||||
eval(self.code['log_pdf']['derivative']['f_0'], self.namespace))
|
||||
else:
|
||||
return np.where(np.isnan(y),
|
||||
0.,
|
||||
eval(self.code['log_pdf']['derivative']['f_0'], self.namespace))
|
||||
|
||||
def d2logpdf_dlink2(self, f, y, Y_metadata=None):
|
||||
"""
|
||||
Hessian of log likelihood given inverse link of latent variables with respect to that inverse link.
|
||||
i.e. second derivative logpdf at y given inv_link(f_i) and inv_link(f_j) w.r.t inv_link(f_i) and inv_link(f_j).
|
||||
|
||||
|
||||
:param f: inverse link of the latent variables.
|
||||
:type f: Nx1 array
|
||||
:param y: data
|
||||
:type y: Nx1 array
|
||||
:param Y_metadata: Y_metadata which is not used in student t distribution
|
||||
:returns: Diagonal of Hessian matrix (second derivative of likelihood evaluated at points f)
|
||||
:rtype: Nx1 array
|
||||
|
||||
.. Note::
|
||||
Returns diagonal of Hessian, since everywhere else it is
0, as the likelihood factorizes over cases (the
distribution for y_i depends only on link(f_i), not on
link(f_{j!=i})).
|
||||
"""
|
||||
assert np.atleast_1d(f).shape == np.atleast_1d(y).shape
|
||||
self.eval_update_cache(F=f, Y=y, Y_metadata=Y_metadata)
|
||||
if self.missing_data:
|
||||
return np.where(np.isnan(y),
|
||||
eval(self.code['missing_log_pdf']['second_derivative']['f_0'], self.namespace),
|
||||
eval(self.code['log_pdf']['second_derivative']['f_0'], self.namespace))
|
||||
else:
|
||||
return np.where(np.isnan(y),
|
||||
0.,
|
||||
eval(self.code['log_pdf']['second_derivative']['f_0'], self.namespace))
|
||||
|
||||
def d3logpdf_dlink3(self, f, y, Y_metadata=None):
|
||||
assert np.atleast_1d(f).shape == np.atleast_1d(y).shape
|
||||
self.eval_update_cache(F=f, Y=y, Y_metadata=Y_metadata)
|
||||
if self.missing_data:
|
||||
return np.where(np.isnan(y),
|
||||
eval(self.code['missing_log_pdf']['third_derivative']['f_0'], self.namespace),
|
||||
eval(self.code['log_pdf']['third_derivative']['f_0'], self.namespace))
|
||||
else:
|
||||
return np.where(np.isnan(y),
|
||||
0.,
|
||||
eval(self.code['log_pdf']['third_derivative']['f_0'], self.namespace))
|
||||
|
||||
def dlogpdf_link_dtheta(self, f, y, Y_metadata=None):
|
||||
assert np.atleast_1d(f).shape == np.atleast_1d(y).shape
|
||||
self.eval_update_cache(F=f, Y=y, Y_metadata=Y_metadata)
|
||||
g = np.zeros((np.atleast_1d(y).shape[0], len(self.variables['theta'])))
|
||||
for i, theta in enumerate(self.variables['theta']):
|
||||
if self.missing_data:
|
||||
g[:, i:i+1] = np.where(np.isnan(y),
|
||||
eval(self.code['missing_log_pdf']['derivative'][theta.name], self.namespace),
|
||||
eval(self.code['log_pdf']['derivative'][theta.name], self.namespace))
|
||||
else:
|
||||
g[:, i:i+1] = np.where(np.isnan(y),
|
||||
0.,
|
||||
eval(self.code['log_pdf']['derivative'][theta.name], self.namespace))
|
||||
return g.sum(0)
|
||||
|
||||
def dlogpdf_dlink_dtheta(self, f, y, Y_metadata=None):
|
||||
assert np.atleast_1d(f).shape == np.atleast_1d(y).shape
|
||||
self.eval_update_cache(F=f, Y=y, Y_metadata=Y_metadata)
|
||||
g = np.zeros((np.atleast_1d(y).shape[0], len(self.variables['theta'])))
|
||||
for i, theta in enumerate(self.variables['theta']):
|
||||
if self.missing_data:
|
||||
g[:, i:i+1] = np.where(np.isnan(y),
|
||||
eval(self.code['missing_log_pdf']['second_derivative'][theta.name], self.namespace),
|
||||
eval(self.code['log_pdf']['second_derivative'][theta.name], self.namespace))
|
||||
else:
|
||||
g[:, i:i+1] = np.where(np.isnan(y),
|
||||
0.,
|
||||
eval(self.code['log_pdf']['second_derivative'][theta.name], self.namespace))
|
||||
return g
|
||||
|
||||
def d2logpdf_dlink2_dtheta(self, f, y, Y_metadata=None):
|
||||
assert np.atleast_1d(f).shape == np.atleast_1d(y).shape
|
||||
self.eval_update_cache(F=f, Y=y, Y_metadata=Y_metadata)
|
||||
g = np.zeros((np.atleast_1d(y).shape[0], len(self.variables['theta'])))
|
||||
for i, theta in enumerate(self.variables['theta']):
|
||||
if self.missing_data:
|
||||
g[:, i:i+1] = np.where(np.isnan(y),
|
||||
eval(self.code['missing_log_pdf']['third_derivative'][theta.name], self.namespace),
|
||||
eval(self.code['log_pdf']['third_derivative'][theta.name], self.namespace))
|
||||
else:
|
||||
g[:, i:i+1] = np.where(np.isnan(y),
|
||||
0.,
|
||||
eval(self.code['log_pdf']['third_derivative'][theta.name], self.namespace))
|
||||
return g
|
||||
|
||||
def predictive_mean(self, mu, sigma, Y_metadata=None):
|
||||
raise NotImplementedError
|
||||
|
||||
def predictive_variance(self, mu,variance, predictive_mean=None, Y_metadata=None):
|
||||
raise NotImplementedError
|
||||
|
||||
def conditional_mean(self, gp):
|
||||
raise NotImplementedError
|
||||
|
||||
def conditional_variance(self, gp):
|
||||
raise NotImplementedError
|
||||
|
||||
def samples(self, gp, Y_metadata=None):
|
||||
raise NotImplementedError
@ -5,13 +5,3 @@ from kernel import Kernel
|
|||
from linear import Linear
|
||||
from mlp import MLP
|
||||
#from rbf import RBF
|
||||
# TODO need to fix this in a config file.
|
||||
try:
|
||||
import sympy as sym
|
||||
sympy_available=True
|
||||
except ImportError:
|
||||
sympy_available=False
|
||||
|
||||
if sympy_available:
|
||||
# These are likelihoods that rely on symbolic.
|
||||
from symbolic import Symbolic
@ -1,57 +0,0 @@
|
|||
# Copyright (c) 2014 GPy Authors
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
import sympy as sym
|
||||
from ..core.mapping import Mapping, Bijective_mapping
|
||||
from ..core.symbolic import Symbolic_core
|
||||
import numpy as np
|
||||
|
||||
class Symbolic(Mapping, Symbolic_core):
|
||||
"""
|
||||
Symbolic mapping
|
||||
|
||||
Mapping where the form of the mapping is provided by a sympy expression.
|
||||
|
||||
"""
|
||||
def __init__(self, input_dim, output_dim, f=None, name='symbolic', parameters=None, func_modules=[]):
|
||||
|
||||
|
||||
if f is None:
|
||||
raise ValueError, "You must provide an argument for the function."
|
||||
|
||||
Mapping.__init__(self, input_dim, output_dim, name=name)
|
||||
Symbolic_core.__init__(self, {'f': f}, ['X'], derivatives = ['X', 'theta'], parameters=parameters, func_modules=func_modules)
|
||||
|
||||
self._initialize_cache()
|
||||
self.parameters_changed()
|
||||
|
||||
def _initialize_cache(self):
|
||||
self._set_attribute('x_0', np.random.normal(size=(3, self.input_dim)))
|
||||
|
||||
|
||||
def parameters_changed(self):
|
||||
self.eval_parameters_changed()
|
||||
|
||||
def update_cache(self, X=None):
|
||||
self.eval_update_cache(X=X)
|
||||
|
||||
def update_gradients(self, partial, X=None):
|
||||
for name, val in self.eval_update_gradients('f', partial, X=X).iteritems():
|
||||
setattr(getattr(self, name), 'gradient', val)
|
||||
|
||||
def gradients_X(self, partial, X=None):
|
||||
return self.eval_gradients_X('f', partial, X=X)
|
||||
|
||||
def f(self, X=None):
|
||||
"""
|
||||
"""
|
||||
return self.eval_function('f', X=X)
|
||||
|
||||
|
||||
def df_dX(self, X):
|
||||
"""
|
||||
"""
|
||||
pass
|
||||
|
||||
def df_dtheta(self, X):
|
||||
pass
@ -3,16 +3,14 @@
|
|||
|
||||
import numpy as np
|
||||
from .. import kern
|
||||
from ..core import SparseGP
|
||||
from ..core.sparse_gp_mpi import SparseGP_MPI
|
||||
from ..likelihoods import Gaussian
|
||||
from ..inference.optimization import SCG
|
||||
from ..util import linalg
|
||||
from ..core.parameterization.variational import NormalPosterior, NormalPrior, VariationalPosterior
|
||||
from ..inference.latent_function_inference.var_dtc_parallel import update_gradients, VarDTC_minibatch
|
||||
from ..core.parameterization.variational import NormalPosterior, NormalPrior
|
||||
from ..inference.latent_function_inference.var_dtc_parallel import VarDTC_minibatch
|
||||
from ..inference.latent_function_inference.var_dtc_gpu import VarDTC_GPU
|
||||
import logging
|
||||
|
||||
class BayesianGPLVM(SparseGP):
|
||||
class BayesianGPLVM(SparseGP_MPI):
|
||||
"""
|
||||
Bayesian Gaussian Process Latent Variable Model
@ -25,12 +23,14 @@ class BayesianGPLVM(SparseGP):
|
|||
|
||||
"""
|
||||
def __init__(self, Y, input_dim, X=None, X_variance=None, init='PCA', num_inducing=10,
|
||||
Z=None, kernel=None, inference_method=None, likelihood=None, name='bayesian gplvm', mpi_comm=None, normalizer=None):
|
||||
Z=None, kernel=None, inference_method=None, likelihood=None,
|
||||
name='bayesian gplvm', mpi_comm=None, normalizer=None,
|
||||
missing_data=False, stochastic=False, batchsize=1):
|
||||
self.mpi_comm = mpi_comm
|
||||
self.__IN_OPTIMIZATION__ = False
|
||||
|
||||
self.logger = logging.getLogger(self.__class__.__name__)
|
||||
if X == None:
|
||||
if X is None:
|
||||
from ..util.initialization import initialize_latent
|
||||
self.logger.info("initializing latent space X with method {}".format(init))
|
||||
X, fracs = initialize_latent(init, input_dim, Y)
@ -59,35 +59,24 @@ class BayesianGPLVM(SparseGP):
|
|||
X = NormalPosterior(X, X_variance)
|
||||
|
||||
if inference_method is None:
|
||||
inan = np.isnan(Y)
|
||||
if np.any(inan):
|
||||
from ..inference.latent_function_inference.var_dtc import VarDTCMissingData
|
||||
self.logger.debug("creating inference_method with var_dtc missing data")
|
||||
inference_method = VarDTCMissingData(inan=inan)
|
||||
elif mpi_comm is not None:
|
||||
if mpi_comm is not None:
|
||||
inference_method = VarDTC_minibatch(mpi_comm=mpi_comm)
|
||||
else:
|
||||
from ..inference.latent_function_inference.var_dtc import VarDTC
|
||||
self.logger.debug("creating inference_method var_dtc")
|
||||
inference_method = VarDTC()
|
||||
inference_method = VarDTC(limit=1 if not missing_data else Y.shape[1])
|
||||
if isinstance(inference_method,VarDTC_minibatch):
|
||||
inference_method.mpi_comm = mpi_comm
|
||||
|
||||
if kernel.useGPU and isinstance(inference_method, VarDTC_GPU):
|
||||
kernel.psicomp.GPU_direct = True
|
||||
|
||||
SparseGP.__init__(self, X, Y, Z, kernel, likelihood, inference_method, name, normalizer=normalizer)
|
||||
self.logger.info("Adding X as parameter")
|
||||
self.link_parameter(self.X, index=0)
|
||||
|
||||
if mpi_comm != None:
|
||||
from ..util.mpi import divide_data
|
||||
N_start, N_end, N_list = divide_data(Y.shape[0], mpi_comm)
|
||||
self.N_range = (N_start, N_end)
|
||||
self.N_list = np.array(N_list)
|
||||
self.Y_local = self.Y[N_start:N_end]
|
||||
print 'MPI RANK: '+str(self.mpi_comm.rank)+' with datasize: '+str(self.N_range)
|
||||
mpi_comm.Bcast(self.param_array, root=0)
|
||||
super(BayesianGPLVM,self).__init__(X, Y, Z, kernel, likelihood=likelihood,
|
||||
name=name, inference_method=inference_method,
|
||||
normalizer=normalizer, mpi_comm=mpi_comm,
|
||||
variational_prior=self.variational_prior,
|
||||
missing_data=missing_data, stochastic=stochastic,
|
||||
batchsize=batchsize)
|
||||
|
||||
def set_X_gradients(self, X, X_grad):
|
||||
"""Set the gradients of the posterior distribution of X in its specific form."""
@ -97,15 +86,62 @@ class BayesianGPLVM(SparseGP):
|
|||
"""Get the gradients of the posterior distribution of X in its specific form."""
|
||||
return X.mean.gradient, X.variance.gradient
|
||||
|
||||
def _inner_parameters_changed(self, kern, X, Z, likelihood, Y, Y_metadata, Lm=None, dL_dKmm=None, subset_indices=None):
|
||||
posterior, log_marginal_likelihood, grad_dict, current_values, value_indices = super(BayesianGPLVM, self)._inner_parameters_changed(kern, X, Z, likelihood, Y, Y_metadata, Lm=Lm, dL_dKmm=dL_dKmm, subset_indices=subset_indices)
|
||||
|
||||
kl_fctr = 1.
|
||||
if self.missing_data:
|
||||
d = self.output_dim
|
||||
log_marginal_likelihood -= kl_fctr*self.variational_prior.KL_divergence(X)/d
|
||||
else:
|
||||
log_marginal_likelihood -= kl_fctr*self.variational_prior.KL_divergence(X)
|
||||
|
||||
current_values['meangrad'], current_values['vargrad'] = self.kern.gradients_qX_expectations(
|
||||
variational_posterior=X,
|
||||
Z=Z, dL_dpsi0=grad_dict['dL_dpsi0'],
|
||||
dL_dpsi1=grad_dict['dL_dpsi1'],
|
||||
dL_dpsi2=grad_dict['dL_dpsi2'])
|
||||
|
||||
# Subsetting Variational Posterior objects, makes the gradients
|
||||
# empty. We need them to be 0 though:
|
||||
X.mean.gradient[:] = 0
|
||||
X.variance.gradient[:] = 0
|
||||
|
||||
self.variational_prior.update_gradients_KL(X)
|
||||
if self.missing_data:
|
||||
current_values['meangrad'] += kl_fctr*X.mean.gradient/d
|
||||
current_values['vargrad'] += kl_fctr*X.variance.gradient/d
|
||||
else:
|
||||
current_values['meangrad'] += kl_fctr*X.mean.gradient
|
||||
current_values['vargrad'] += kl_fctr*X.variance.gradient
|
||||
|
||||
if subset_indices is not None:
|
||||
value_indices['meangrad'] = subset_indices['samples']
|
||||
value_indices['vargrad'] = subset_indices['samples']
|
||||
return posterior, log_marginal_likelihood, grad_dict, current_values, value_indices
|
||||
|
||||
def _outer_values_update(self, full_values):
|
||||
"""
|
||||
Here you put the values, which were collected before in the right places.
|
||||
E.g. set the gradients of parameters, etc.
|
||||
"""
|
||||
super(BayesianGPLVM, self)._outer_values_update(full_values)
|
||||
self.X.mean.gradient = full_values['meangrad']
|
||||
self.X.variance.gradient = full_values['vargrad']
|
||||
|
||||
def _outer_init_full_values(self):
|
||||
return dict(meangrad=np.zeros(self.X.mean.shape),
|
||||
vargrad=np.zeros(self.X.variance.shape))
|
||||
|
||||
def parameters_changed(self):
|
||||
if isinstance(self.inference_method, VarDTC_GPU) or isinstance(self.inference_method, VarDTC_minibatch):
|
||||
update_gradients(self, mpi_comm=self.mpi_comm)
|
||||
super(BayesianGPLVM,self).parameters_changed()
|
||||
if isinstance(self.inference_method, VarDTC_minibatch):
|
||||
return
|
||||
|
||||
super(BayesianGPLVM, self).parameters_changed()
|
||||
self._log_marginal_likelihood -= self.variational_prior.KL_divergence(self.X)
|
||||
#super(BayesianGPLVM, self).parameters_changed()
|
||||
#self._log_marginal_likelihood -= self.variational_prior.KL_divergence(self.X)
|
||||
|
||||
self.X.mean.gradient, self.X.variance.gradient = self.kern.gradients_qX_expectations(variational_posterior=self.X, Z=self.Z, dL_dpsi0=self.grad_dict['dL_dpsi0'], dL_dpsi1=self.grad_dict['dL_dpsi1'], dL_dpsi2=self.grad_dict['dL_dpsi2'])
|
||||
#self.X.mean.gradient, self.X.variance.gradient = self.kern.gradients_qX_expectations(variational_posterior=self.X, Z=self.Z, dL_dpsi0=self.grad_dict['dL_dpsi0'], dL_dpsi1=self.grad_dict['dL_dpsi1'], dL_dpsi2=self.grad_dict['dL_dpsi2'])
|
||||
|
||||
# This is testing code -------------------------
|
||||
# i = np.random.randint(self.X.shape[0])
@ -124,7 +160,7 @@ class BayesianGPLVM(SparseGP):
|
|||
# -----------------------------------------------
|
||||
|
||||
# update for the KL divergence
|
||||
self.variational_prior.update_gradients_KL(self.X)
|
||||
#self.variational_prior.update_gradients_KL(self.X)
|
||||
|
||||
def plot_latent(self, labels=None, which_indices=None,
|
||||
resolution=50, ax=None, marker='o', s=40,
@ -189,7 +225,7 @@ class BayesianGPLVM(SparseGP):
|
|||
"""
|
||||
dmu_dX = np.zeros_like(Xnew)
|
||||
for i in range(self.Z.shape[0]):
|
||||
dmu_dX += self.kern.gradients_X(self.Cpsi1Vf[i:i + 1, :], Xnew, self.Z[i:i + 1, :])
|
||||
dmu_dX += self.kern.gradients_X(self.grad_dict['dL_dpsi1'][i:i + 1, :], Xnew, self.Z[i:i + 1, :])
|
||||
return dmu_dX
|
||||
|
||||
def dmu_dXnew(self, Xnew):
@ -200,7 +236,7 @@ class BayesianGPLVM(SparseGP):
|
|||
ones = np.ones((1, 1))
|
||||
for i in range(self.Z.shape[0]):
|
||||
gradients_X[:, i] = self.kern.gradients_X(ones, Xnew, self.Z[i:i + 1, :]).sum(-1)
|
||||
return np.dot(gradients_X, self.Cpsi1Vf)
|
||||
return np.dot(gradients_X, self.grad_dict['dL_dpsi1'])
|
||||
|
||||
def plot_steepest_gradient_map(self, *args, ** kwargs):
|
||||
"""
@ -211,50 +247,6 @@ class BayesianGPLVM(SparseGP):
|
|||
from ..plotting.matplot_dep import dim_reduction_plots
|
||||
|
||||
return dim_reduction_plots.plot_steepest_gradient_map(self,*args,**kwargs)
|
||||
def __getstate__(self):
|
||||
dc = super(BayesianGPLVM, self).__getstate__()
|
||||
dc['mpi_comm'] = None
|
||||
if self.mpi_comm != None:
|
||||
del dc['N_range']
|
||||
del dc['N_list']
|
||||
del dc['Y_local']
|
||||
return dc
|
||||
|
||||
def __setstate__(self, state):
|
||||
return super(BayesianGPLVM, self).__setstate__(state)
|
||||
|
||||
#=====================================================
|
||||
# The MPI parallelization
|
||||
# - can move to model at some point
|
||||
#=====================================================
|
||||
|
||||
def _set_params_transformed(self, p):
|
||||
if self.mpi_comm != None:
|
||||
if self.__IN_OPTIMIZATION__ and self.mpi_comm.rank==0:
|
||||
self.mpi_comm.Bcast(np.int32(1),root=0)
|
||||
self.mpi_comm.Bcast(p, root=0)
|
||||
super(BayesianGPLVM, self)._set_params_transformed(p)
|
||||
|
||||
def optimize(self, optimizer=None, start=None, **kwargs):
|
||||
self.__IN_OPTIMIZATION__ = True
|
||||
if self.mpi_comm==None:
|
||||
super(BayesianGPLVM, self).optimize(optimizer,start,**kwargs)
|
||||
elif self.mpi_comm.rank==0:
|
||||
super(BayesianGPLVM, self).optimize(optimizer,start,**kwargs)
|
||||
self.mpi_comm.Bcast(np.int32(-1),root=0)
|
||||
elif self.mpi_comm.rank>0:
|
||||
x = self._get_params_transformed().copy()
|
||||
flag = np.empty(1,dtype=np.int32)
|
||||
while True:
|
||||
self.mpi_comm.Bcast(flag,root=0)
|
||||
if flag==1:
|
||||
self._set_params_transformed(x)
|
||||
elif flag==-1:
|
||||
break
|
||||
else:
|
||||
self.__IN_OPTIMIZATION__ = False
|
||||
raise Exception("Unrecognizable flag for synchronization!")
|
||||
self.__IN_OPTIMIZATION__ = False
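The removed optimize/_set_params_transformed pair implements a simple broadcast protocol: rank 0 runs the optimiser and broadcasts a flag of 1 before each parameter update and -1 to stop, while the other ranks spin on Bcast. A minimal mpi4py sketch of the same pattern (illustrative only, run under mpiexec):

    import numpy as np
    from mpi4py import MPI

    comm = MPI.COMM_WORLD
    flag = np.empty(1, dtype=np.int32)
    p = np.empty(3)

    if comm.rank == 0:
        for step in range(5):                                    # stand-in for optimiser steps
            comm.Bcast(np.array([1], dtype=np.int32), root=0)    # "parameters follow"
            p[:] = np.random.randn(3)
            comm.Bcast(p, root=0)
        comm.Bcast(np.array([-1], dtype=np.int32), root=0)       # "done"
    else:
        while True:
            comm.Bcast(flag, root=0)
            if flag[0] == 1:
                comm.Bcast(p, root=0)                            # receive the update
            elif flag[0] == -1:
                break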
|
||||
|
||||
|
||||
def latent_cost_and_grad(mu_S, input_dim, kern, Z, dL_dpsi0, dL_dpsi1, dL_dpsi2):
@ -7,7 +7,6 @@ from ..core import SparseGP
|
|||
from .. import likelihoods
|
||||
from .. import kern
|
||||
from ..inference.latent_function_inference import VarDTC
|
||||
from ..util.misc import param_to_array
|
||||
from ..core.parameterization.variational import NormalPosterior
|
||||
|
||||
class SparseGPRegression(SparseGP):
@ -40,7 +39,7 @@ class SparseGPRegression(SparseGP):
|
|||
# Z defaults to a subset of the data
|
||||
if Z is None:
|
||||
i = np.random.permutation(num_data)[:min(num_inducing, num_data)]
|
||||
Z = param_to_array(X)[i].copy()
|
||||
Z = X.view(np.ndarray)[i].copy()
|
||||
else:
|
||||
assert Z.shape[1] == input_dim
@ -1,7 +1,6 @@
|
|||
|
||||
import numpy as np
|
||||
from latent_space_visualizations.controllers.imshow_controller import ImshowController,ImAnnotateController
|
||||
from ...util.misc import param_to_array
|
||||
from ...core.parameterization.variational import VariationalPosterior
|
||||
from .base_plots import x_frame2D
|
||||
import itertools
@ -55,9 +54,9 @@ def plot_latent(model, labels=None, which_indices=None,
|
|||
#fetch the data points X that we'd like to plot
|
||||
X = model.X
|
||||
if isinstance(X, VariationalPosterior):
|
||||
X = param_to_array(X.mean)
|
||||
X = X.mean
|
||||
else:
|
||||
X = param_to_array(X)
|
||||
X = X
|
||||
|
||||
|
||||
if X.shape[0] > 1000:
@ -175,7 +174,7 @@ def plot_latent(model, labels=None, which_indices=None,
|
|||
ax.set_aspect('auto') # set a nice aspect ratio
|
||||
|
||||
if plot_inducing:
|
||||
Z = param_to_array(model.Z)
|
||||
Z = model.Z
|
||||
ax.plot(Z[:, input_1], Z[:, input_2], '^w')
|
||||
|
||||
ax.set_xlim((xmin, xmax))
@ -35,8 +35,7 @@ def add_bar_labels(fig, ax, bars, bottom=0):
|
|||
|
||||
|
||||
def plot_bars(fig, ax, x, ard_params, color, name, bottom=0):
|
||||
from ...util.misc import param_to_array
|
||||
return ax.bar(left=x, height=param_to_array(ard_params), width=.8,
|
||||
return ax.bar(left=x, height=ard_params.view(np.ndarray), width=.8,
|
||||
bottom=bottom, align='center',
|
||||
color=color, edgecolor='k', linewidth=1.2,
|
||||
label=name.replace("_"," "))
@ -100,9 +99,7 @@ def plot_ARD(kernel, fignum=None, ax=None, title='', legend=False, filtering=Non
|
|||
return ax
|
||||
|
||||
|
||||
def plot(kernel, x=None, plot_limits=None, which_parts='all', resolution=None, *args, **kwargs):
|
||||
if which_parts == 'all':
|
||||
which_parts = [True] * kernel.size
|
||||
def plot(kernel, x=None, plot_limits=None, resolution=None, *args, **kwargs):
|
||||
if kernel.input_dim == 1:
|
||||
if x is None:
|
||||
x = np.zeros((1, 1))
@ -133,7 +130,7 @@ def plot(kernel, x=None, plot_limits=None, which_parts='all', resolution=None, *
|
|||
assert x.size == 2, "The size of the fixed variable x is not 2"
|
||||
x = x.reshape((1, 2))
|
||||
|
||||
if plot_limits == None:
|
||||
if plot_limits is None:
|
||||
xmin, xmax = (x - 5).flatten(), (x + 5).flatten()
|
||||
elif len(plot_limits) == 2:
|
||||
xmin, xmax = plot_limits
@ -142,12 +139,10 @@ def plot(kernel, x=None, plot_limits=None, which_parts='all', resolution=None, *
|
|||
|
||||
resolution = resolution or 51
|
||||
xx, yy = np.mgrid[xmin[0]:xmax[0]:1j * resolution, xmin[1]:xmax[1]:1j * resolution]
|
||||
xg = np.linspace(xmin[0], xmax[0], resolution)
|
||||
yg = np.linspace(xmin[1], xmax[1], resolution)
|
||||
Xnew = np.vstack((xx.flatten(), yy.flatten())).T
|
||||
Kx = kernel.K(Xnew, x, which_parts)
|
||||
Kx = kernel.K(Xnew, x)
|
||||
Kx = Kx.reshape(resolution, resolution).T
|
||||
pb.contour(xg, yg, Kx, vmin=Kx.min(), vmax=Kx.max(), cmap=pb.cm.jet, *args, **kwargs) # @UndefinedVariable
|
||||
pb.contour(xx, xx, Kx, vmin=Kx.min(), vmax=Kx.max(), cmap=pb.cm.jet, *args, **kwargs) # @UndefinedVariable
|
||||
pb.xlim(xmin[0], xmax[0])
|
||||
pb.ylim(xmin[1], xmax[1])
|
||||
pb.xlabel("x1")
@ -8,7 +8,6 @@ except:
|
|||
pass
|
||||
import numpy as np
|
||||
from base_plots import gpplot, x_frame1D, x_frame2D
|
||||
from ...util.misc import param_to_array
|
||||
from ...models.gp_coregionalized_regression import GPCoregionalizedRegression
|
||||
from ...models.sparse_gp_coregionalized_regression import SparseGPCoregionalizedRegression
|
||||
from scipy import sparse
@ -67,7 +66,6 @@ def plot_fit(model, plot_limits=None, which_data_rows='all',
|
|||
X_variance = model.X.variance
|
||||
else:
|
||||
X = model.X
|
||||
#X, Y = param_to_array(X, model.Y)
|
||||
Y = model.Y
|
||||
if sparse.issparse(Y): Y = Y.todense().view(np.ndarray)
@ -1,5 +1,4 @@
|
|||
import pylab as pb, numpy as np
|
||||
from ...util.misc import param_to_array
|
||||
|
||||
def plot(parameterized, fignum=None, ax=None, colors=None):
|
||||
"""
@ -21,7 +20,7 @@ def plot(parameterized, fignum=None, ax=None, colors=None):
|
|||
else:
|
||||
colors = iter(colors)
|
||||
plots = []
|
||||
means, variances = param_to_array(parameterized.mean, parameterized.variance)
|
||||
means, variances = parameterized.mean, parameterized.variance
|
||||
x = np.arange(means.shape[0])
|
||||
for i in range(means.shape[1]):
|
||||
if ax is None:
@ -68,7 +67,7 @@ def plot_SpikeSlab(parameterized, fignum=None, ax=None, colors=None, side_by_sid
|
|||
else:
|
||||
colors = iter(colors)
|
||||
plots = []
|
||||
means, variances, gamma = param_to_array(parameterized.mean, parameterized.variance, parameterized.binary_prob)
|
||||
means, variances, gamma = parameterized.mean, parameterized.variance, parameterized.binary_prob
|
||||
x = np.arange(means.shape[0])
|
||||
for i in range(means.shape[1]):
|
||||
if side_by_side:
@ -4,7 +4,6 @@ import GPy
|
|||
import numpy as np
|
||||
import matplotlib as mpl
|
||||
import time
|
||||
from ...util.misc import param_to_array
|
||||
from GPy.core.parameterization.variational import VariationalPosterior
|
||||
try:
|
||||
import visual
@ -215,7 +215,10 @@ def check_kernel_gradient_functions(kern, X=None, X2=None, output_ind=None, verb
|
|||
if verbose:
|
||||
print("Checking gradients of Kdiag(X) wrt X.")
|
||||
try:
|
||||
result = Kern_check_dKdiag_dX(kern, X=X).checkgrad(verbose=verbose)
|
||||
testmodel = Kern_check_dKdiag_dX(kern, X=X)
|
||||
if fixed_X_dims is not None:
|
||||
testmodel.X[:,fixed_X_dims].fix()
|
||||
result = testmodel.checkgrad(verbose=verbose)
|
||||
except NotImplementedError:
|
||||
result=True
|
||||
if verbose:
@ -346,12 +349,58 @@ class KernelTestsNonContinuous(unittest.TestCase):
|
|||
kern = GPy.kern.IndependentOutputs(k, -1, name='ind_split')
|
||||
self.assertTrue(check_kernel_gradient_functions(kern, X=self.X, X2=self.X2, verbose=verbose, fixed_X_dims=-1))
|
||||
|
||||
|
||||
def test_ODE_UY(self):
|
||||
kern = GPy.kern.ODE_UY(2, active_dims=[0, self.D])
|
||||
X = self.X[self.X[:,-1]!=2]
|
||||
X2 = self.X2[self.X2[:,-1]!=2]
|
||||
self.assertTrue(check_kernel_gradient_functions(kern, X=X, X2=X2, verbose=verbose, fixed_X_dims=-1))
|
||||
|
||||
class Coregionalize_weave_test(unittest.TestCase):
|
||||
"""
|
||||
Make sure that the coregionalize kernel works with and without weave enabled
|
||||
"""
|
||||
def setUp(self):
|
||||
self.k = GPy.kern.Coregionalize(1, output_dim=12)
|
||||
self.N1, self.N2 = 100, 200
|
||||
self.X = np.random.randint(0,12,(self.N1,1))
|
||||
self.X2 = np.random.randint(0,12,(self.N2,1))
|
||||
|
||||
def test_sym(self):
|
||||
dL_dK = np.random.randn(self.N1, self.N1)
|
||||
GPy.util.config.config.set('weave', 'working', 'True')
|
||||
K_weave = self.k.K(self.X)
|
||||
self.k.update_gradients_full(dL_dK, self.X)
|
||||
grads_weave = self.k.gradient.copy()
|
||||
|
||||
GPy.util.config.config.set('weave', 'working', 'False')
|
||||
K_numpy = self.k.K(self.X)
|
||||
self.k.update_gradients_full(dL_dK, self.X)
|
||||
grads_numpy = self.k.gradient.copy()
|
||||
|
||||
self.assertTrue(np.allclose(K_numpy, K_weave))
|
||||
self.assertTrue(np.allclose(grads_numpy, grads_weave))
|
||||
|
||||
def test_nonsym(self):
|
||||
dL_dK = np.random.randn(self.N1, self.N2)
|
||||
GPy.util.config.config.set('weave', 'working', 'True')
|
||||
K_weave = self.k.K(self.X, self.X2)
|
||||
self.k.update_gradients_full(dL_dK, self.X, self.X2)
|
||||
grads_weave = self.k.gradient.copy()
|
||||
|
||||
GPy.util.config.config.set('weave', 'working', 'False')
|
||||
K_numpy = self.k.K(self.X, self.X2)
|
||||
self.k.update_gradients_full(dL_dK, self.X, self.X2)
|
||||
grads_numpy = self.k.gradient.copy()
|
||||
|
||||
self.assertTrue(np.allclose(K_numpy, K_weave))
|
||||
self.assertTrue(np.allclose(grads_numpy, grads_weave))
|
||||
|
||||
#reset the weave state for any other tests
|
||||
GPy.util.config.config.set('weave', 'working', 'False')
|
||||
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
print "Running unit tests, please be (very) patient..."
@ -352,8 +352,8 @@ class TestNoiseModels(object):
|
|||
print model
|
||||
#print model._get_params()
|
||||
np.testing.assert_almost_equal(
|
||||
model.pdf(f.copy(), Y.copy()),
|
||||
np.exp(model.logpdf(f.copy(), Y.copy()))
|
||||
model.pdf(f.copy(), Y.copy()).prod(),
|
||||
np.exp(model.logpdf(f.copy(), Y.copy()).sum())
|
||||
)
|
||||
|
||||
@with_setup(setUp, tearDown)
@ -20,7 +20,7 @@ class MiscTests(unittest.TestCase):
|
|||
m = GPy.models.GPRegression(self.X, self.Y, kernel=k)
|
||||
m.randomize()
|
||||
m.likelihood.variance = .5
|
||||
Kinv = np.linalg.pinv(k.K(self.X) + np.eye(self.N)*m.likelihood.variance)
|
||||
Kinv = np.linalg.pinv(k.K(self.X) + np.eye(self.N) * m.likelihood.variance)
|
||||
K_hat = k.K(self.X_new) - k.K(self.X_new, self.X).dot(Kinv).dot(k.K(self.X, self.X_new))
|
||||
mu_hat = k.K(self.X_new, self.X).dot(Kinv).dot(m.Y_normalized)
@ -43,7 +43,7 @@ class MiscTests(unittest.TestCase):
|
|||
Z = m.Z[:]
|
||||
X = self.X[:]
|
||||
|
||||
#Not easy to check if woodbury_inv is correct in itself as it requires a large derivation and expression
|
||||
# Not easy to check if woodbury_inv is correct in itself as it requires a large derivation and expression
|
||||
Kinv = m.posterior.woodbury_inv
|
||||
K_hat = k.K(self.X_new) - k.K(self.X_new, Z).dot(Kinv).dot(k.K(Z, self.X_new))
@ -51,13 +51,13 @@ class MiscTests(unittest.TestCase):
|
|||
self.assertEquals(mu.shape, (self.N_new, self.D))
|
||||
self.assertEquals(covar.shape, (self.N_new, self.N_new))
|
||||
np.testing.assert_almost_equal(K_hat, covar)
|
||||
#np.testing.assert_almost_equal(mu_hat, mu)
|
||||
# np.testing.assert_almost_equal(mu_hat, mu)
|
||||
|
||||
mu, var = m._raw_predict(self.X_new)
|
||||
self.assertEquals(mu.shape, (self.N_new, self.D))
|
||||
self.assertEquals(var.shape, (self.N_new, 1))
|
||||
np.testing.assert_almost_equal(np.diag(K_hat)[:, None], var)
|
||||
#np.testing.assert_almost_equal(mu_hat, mu)
|
||||
# np.testing.assert_almost_equal(mu_hat, mu)
|
||||
|
||||
def test_likelihood_replicate(self):
|
||||
m = GPy.models.GPRegression(self.X, self.Y)
@ -110,22 +110,45 @@ class MiscTests(unittest.TestCase):
|
|||
m2.kern.lengthscale = m.kern['.*lengthscale']
|
||||
np.testing.assert_equal(m.log_likelihood(), m2.log_likelihood())
|
||||
|
||||
def test_missing_data(self):
|
||||
from GPy import kern
|
||||
from GPy.models import BayesianGPLVM
|
||||
from GPy.examples.dimensionality_reduction import _simulate_matern
|
||||
|
||||
D1, D2, D3, N, num_inducing, Q = 13, 5, 8, 400, 3, 4
|
||||
_, _, Ylist = _simulate_matern(D1, D2, D3, N, num_inducing, False)
|
||||
Y = Ylist[0]
|
||||
|
||||
inan = np.random.binomial(1, .9, size=Y.shape).astype(bool) # ~90% missing data
|
||||
Ymissing = Y.copy()
|
||||
Ymissing[inan] = np.nan
|
||||
|
||||
k = kern.Linear(Q, ARD=True) + kern.White(Q, np.exp(-2)) # + kern.bias(Q)
|
||||
m = BayesianGPLVM(Ymissing, Q, init="random", num_inducing=num_inducing,
|
||||
kernel=k, missing_data=True)
|
||||
assert(m.checkgrad())
|
||||
|
||||
k = kern.RBF(Q, ARD=True) + kern.White(Q, np.exp(-2)) # + kern.bias(Q)
|
||||
m = BayesianGPLVM(Ymissing, Q, init="random", num_inducing=num_inducing,
|
||||
kernel=k, missing_data=True)
|
||||
assert(m.checkgrad())
|
||||
|
||||
def test_likelihood_replicate_kern(self):
|
||||
m = GPy.models.GPRegression(self.X, self.Y)
|
||||
m2 = GPy.models.GPRegression(self.X, self.Y)
|
||||
np.testing.assert_equal(m.log_likelihood(), m2.log_likelihood())
|
||||
m.kern.randomize()
|
||||
m2.kern[''] = m.kern[:]
|
||||
np.testing.assert_equal(m.log_likelihood(), m2.log_likelihood())
|
||||
np.testing.assert_almost_equal(m.log_likelihood(), m2.log_likelihood())
|
||||
m.kern.randomize()
|
||||
m2.kern[:] = m.kern[:]
|
||||
np.testing.assert_equal(m.log_likelihood(), m2.log_likelihood())
|
||||
np.testing.assert_almost_equal(m.log_likelihood(), m2.log_likelihood())
|
||||
m.kern.randomize()
|
||||
m2.kern[''] = m.kern['']
|
||||
np.testing.assert_equal(m.log_likelihood(), m2.log_likelihood())
|
||||
np.testing.assert_almost_equal(m.log_likelihood(), m2.log_likelihood())
|
||||
m.kern.randomize()
|
||||
m2.kern[:] = m.kern[''].values()
|
||||
np.testing.assert_equal(m.log_likelihood(), m2.log_likelihood())
|
||||
np.testing.assert_almost_equal(m.log_likelihood(), m2.log_likelihood())
|
||||
|
||||
def test_big_model(self):
|
||||
m = GPy.examples.dimensionality_reduction.mrd_simulation(optimize=0, plot=0, plot_sim=0)
@ -158,7 +181,7 @@ class MiscTests(unittest.TestCase):
|
|||
def test_model_optimize(self):
|
||||
X = np.random.uniform(-3., 3., (20, 1))
|
||||
Y = np.sin(X) + np.random.randn(20, 1) * 0.05
|
||||
m = GPy.models.GPRegression(X,Y)
|
||||
m = GPy.models.GPRegression(X, Y)
|
||||
m.optimize()
|
||||
print m
@ -219,8 +242,8 @@ class GradientTests(np.testing.TestCase):
|
|||
mlp = GPy.kern.MLP(1)
|
||||
self.check_model(mlp, model_type='GPRegression', dimension=1)
|
||||
|
||||
#TODO:
|
||||
#def test_GPRegression_poly_1d(self):
|
||||
# TODO:
|
||||
# def test_GPRegression_poly_1d(self):
|
||||
# ''' Testing the GP regression with polynomial kernel with white kernel on 1d data '''
|
||||
# mlp = GPy.kern.Poly(1, degree=5)
|
||||
# self.check_model(mlp, model_type='GPRegression', dimension=1)
@ -417,11 +440,11 @@ class GradientTests(np.testing.TestCase):
|
|||
|
||||
def test_gp_heteroscedastic_regression(self):
|
||||
num_obs = 25
|
||||
X = np.random.randint(0,140,num_obs)
|
||||
X = X[:,None]
|
||||
Y = 25. + np.sin(X/20.) * 2. + np.random.rand(num_obs)[:,None]
|
||||
X = np.random.randint(0, 140, num_obs)
|
||||
X = X[:, None]
|
||||
Y = 25. + np.sin(X / 20.) * 2. + np.random.rand(num_obs)[:, None]
|
||||
kern = GPy.kern.Bias(1) + GPy.kern.RBF(1)
|
||||
m = GPy.models.GPHeteroscedasticRegression(X,Y,kern)
|
||||
m = GPy.models.GPHeteroscedasticRegression(X, Y, kern)
|
||||
self.assertTrue(m.checkgrad())
|
||||
|
||||
def test_gp_kronecker_gaussian(self):
@ -432,12 +455,12 @@ class GradientTests(np.testing.TestCase):
|
|||
k1 = GPy.kern.RBF(1) # + GPy.kern.White(1)
|
||||
k2 = GPy.kern.RBF(1) # + GPy.kern.White(1)
|
||||
Y = np.random.randn(N1, N2)
|
||||
Y = Y-Y.mean(0)
|
||||
Y = Y/Y.std(0)
|
||||
Y = Y - Y.mean(0)
|
||||
Y = Y / Y.std(0)
|
||||
m = GPy.models.GPKroneckerGaussianRegression(X1, X2, Y, k1, k2)
|
||||
|
||||
# build the model the dumb way
|
||||
assert (N1*N2<1000), "too much data for standard GPs!"
|
||||
assert (N1 * N2 < 1000), "too much data for standard GPs!"
|
||||
yy, xx = np.meshgrid(X2, X1)
|
||||
Xgrid = np.vstack((xx.flatten(order='F'), yy.flatten(order='F'))).T
|
||||
kg = GPy.kern.RBF(1, active_dims=[0]) * GPy.kern.RBF(1, active_dims=[1])
@ -458,11 +481,11 @@ class GradientTests(np.testing.TestCase):
|
|||
|
||||
def test_gp_VGPC(self):
|
||||
num_obs = 25
|
||||
X = np.random.randint(0,140,num_obs)
|
||||
X = X[:,None]
|
||||
Y = 25. + np.sin(X/20.) * 2. + np.random.rand(num_obs)[:,None]
|
||||
X = np.random.randint(0, 140, num_obs)
|
||||
X = X[:, None]
|
||||
Y = 25. + np.sin(X / 20.) * 2. + np.random.rand(num_obs)[:, None]
|
||||
kern = GPy.kern.Bias(1) + GPy.kern.RBF(1)
|
||||
m = GPy.models.GPVariationalGaussianApproximation(X,Y,kern)
|
||||
m = GPy.models.GPVariationalGaussianApproximation(X, Y, kern)
|
||||
self.assertTrue(m.checkgrad())
@ -8,7 +8,9 @@ import GPy
|
|||
import numpy as np
|
||||
from GPy.core.parameterization.parameter_core import HierarchyError
|
||||
from GPy.core.parameterization.observable_array import ObsAr
|
||||
from GPy.core.parameterization.transformations import NegativeLogexp
|
||||
from GPy.core.parameterization.transformations import NegativeLogexp, Logistic
|
||||
from GPy.core.parameterization.parameterized import Parameterized
|
||||
from GPy.core.parameterization.param import Param
|
||||
|
||||
class ArrayCoreTest(unittest.TestCase):
|
||||
def setUp(self):
@ -32,7 +34,7 @@ class ParameterizedTest(unittest.TestCase):
|
|||
self.white = GPy.kern.White(1)
|
||||
from GPy.core.parameterization import Param
|
||||
from GPy.core.parameterization.transformations import Logistic
|
||||
self.param = Param('param', np.random.uniform(0,1,(25,2)), Logistic(0, 1))
|
||||
self.param = Param('param', np.random.uniform(0,1,(10,5)), Logistic(0, 1))
|
||||
|
||||
self.test1 = GPy.core.Parameterized("test model")
|
||||
self.test1.param = self.param
@ -153,12 +155,14 @@ class ParameterizedTest(unittest.TestCase):
|
|||
self.test1.kern.randomize()
|
||||
self.assertEqual(val, self.rbf.variance)
|
||||
|
||||
# def test_updates(self):
|
||||
# # WHAT DO YOU WANT TO TEST HERE?
|
||||
# self.test1.update_model(False)
|
||||
# val = float(self.rbf.variance)
|
||||
# self.test1.kern.randomize()
|
||||
# self.assertEqual(val, self.rbf.variance,str(self.test1))
|
||||
def test_updates(self):
|
||||
val = float(self.testmodel.log_likelihood())
|
||||
self.testmodel.update_model(False)
|
||||
self.testmodel.kern.randomize()
|
||||
self.testmodel.likelihood.randomize()
|
||||
self.assertEqual(val, self.testmodel.log_likelihood())
|
||||
self.testmodel.update_model(True)
|
||||
self.assertNotEqual(val, self.testmodel.log_likelihood())
|
||||
|
||||
def test_fixing_optimize(self):
|
||||
self.testmodel.kern.lengthscale.fix()
@ -196,6 +200,20 @@ class ParameterizedTest(unittest.TestCase):
|
|||
unfixed = self.testmodel.kern.unfix()
|
||||
self.assertListEqual(unfixed.tolist(), [0,1])
|
||||
|
||||
def test_constraints_in_init(self):
|
||||
class Test(Parameterized):
|
||||
def __init__(self, name=None, parameters=[], *a, **kw):
|
||||
super(Test, self).__init__(name=name)
|
||||
self.x = Param('x', np.random.uniform(0,1,(3,4)))
|
||||
self.x[0].constrain_bounded(0,1)
|
||||
self.link_parameter(self.x)
|
||||
self.x[1].fix()
|
||||
t = Test()
|
||||
c = {Logistic(0,1): np.array([0, 1, 2, 3]), 'fixed': np.array([4, 5, 6, 7])}
|
||||
np.testing.assert_equal(t.x.constraints[Logistic(0,1)], c[Logistic(0,1)])
|
||||
np.testing.assert_equal(t.x.constraints['fixed'], c['fixed'])
|
||||
|
||||
|
||||
def test_printing(self):
|
||||
print self.test1
|
||||
print self.param
@ -18,12 +18,3 @@ import multioutput
|
|||
import linalg_gpu
|
||||
import mpi
|
||||
|
||||
try:
|
||||
import sympy
|
||||
_sympy_available = True
|
||||
del sympy
|
||||
except ImportError as e:
|
||||
_sympy_available = False
|
||||
|
||||
if _sympy_available:
|
||||
import symbolic
@ -188,4 +188,6 @@ class Cache_this(object):
|
|||
self.ignore_args = ignore_args
|
||||
self.force_args = force_kwargs
|
||||
def __call__(self, f):
|
||||
return Cacher_wrap(f, self.limit, self.ignore_args, self.force_args)
|
||||
newf = Cacher_wrap(f, self.limit, self.ignore_args, self.force_args)
|
||||
update_wrapper(newf, f)
|
||||
return newf
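The added update_wrapper call copies the wrapped function's __name__, __doc__ and __module__ onto the cacher object, so decorated methods keep readable introspection. A small illustration of the effect (Cacher_demo is a stand-in, not the real Cacher_wrap):

    from functools import update_wrapper

    class Cacher_demo(object):
        """Stand-in for Cacher_wrap: just forwards the call."""
        def __init__(self, f):
            self.f = f
            update_wrapper(self, f)   # copy __name__, __doc__, __module__ from f
        def __call__(self, *args, **kw):
            return self.f(*args, **kw)

    def K(X):
        "compute a kernel matrix"
        return X

    wrapped = Cacher_demo(K)
    assert wrapped.__name__ == 'K' and wrapped.__doc__ == "compute a kernel matrix"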
@ -523,6 +523,23 @@
|
|||
"http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/singlecell/"
|
||||
]
|
||||
},
|
||||
"singlecell_islam": {
|
||||
"citation": "Single-Cell RNA-Seq Reveals Dynamic, Random Monoallelic Gene Expression in Mammalian Cells Qiaolin Deng, Daniel Ramskoeld, Bjoern Reinius, and Rickard Sandberg Science 10 January 2014: 343 (6167), 193-196. [DOI:10.1126/science.1245316]",
|
||||
"details" : "92 single cells (48 mouse ES cells, 44 mouse embryonic fibroblasts and 4 negative controls) were analyzed by single-cell tagged reverse transcription (STRT)",
|
||||
"files" : [["GSE29087_L139_expression_tab.txt.gz"], ["GSE29087_family.soft.gz"]],
|
||||
"license" : "Gene Expression Omnibus: http://www.ncbi.nlm.nih.gov/geo/info/disclaimer.html",
|
||||
"size" : 1159449,
|
||||
"urls" : ["ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE29nnn/GSE29087/suppl/", "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE29nnn/GSE29087/soft/"]
|
||||
},
|
||||
"singlecell_deng": {
|
||||
"citation": "Deng Q, Ramsköld D, Reinius B, Sandberg R. Single-cell RNA-seq reveals dynamic, random monoallelic gene expression in mammalian cells. Science 2014 Jan 10;343(6167):193-6. PMID: 24408435",
|
||||
"details" : "First generation mouse strain crosses were used to study monoallelic expression on the single cell level",
|
||||
"files" : [["?acc=GSE45719&format=file"], ["GSE45719_series_matrix.txt.gz"]],
|
||||
"license" : "Gene Expression Omnibus: http://www.ncbi.nlm.nih.gov/geo/info/disclaimer.html",
|
||||
"size" : 1159449,
|
||||
"save_names": [["GSE45719_Raw.tar"], [null]],
|
||||
"urls" : ["http://www.ncbi.nlm.nih.gov/geo/download/", "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE45nnn/GSE45719/matrix/"]
|
||||
},
|
||||
"sod1_mouse": {
|
||||
"citation": "Transcriptomic indices of fast and slow disease progression in two mouse models of amyotrophic lateral sclerosis' Nardo G1, Iennaco R, Fusi N, Heath PR, Marino M, Trolese MC, Ferraiuolo L, Lawrence N, Shaw PJ, Bendotti C Brain. 2013 Nov;136(Pt 11):3305-32. doi: 10.1093/brain/awt250. Epub 2013 Sep 24.",
|
||||
"details": "Gene expression data from two separate strains of mice: C57 and 129Sv in wild type and SOD1 mutant strains.",
@ -82,20 +82,32 @@ def prompt_user(prompt):
|
|||
|
||||
def data_available(dataset_name=None):
|
||||
"""Check if the data set is available on the local machine already."""
|
||||
for file_list in data_resources[dataset_name]['files']:
|
||||
for file in file_list:
|
||||
if not os.path.exists(os.path.join(data_path, dataset_name, file)):
|
||||
from itertools import izip_longest
|
||||
dr = data_resources[dataset_name]
|
||||
zip_urls = (dr['files'], )
|
||||
if dr.has_key('save_names'): zip_urls += (dr['save_names'], )
|
||||
else: zip_urls += ([],)
|
||||
|
||||
for file_list, save_list in izip_longest(*zip_urls, fillvalue=[]):
|
||||
for f, s in izip_longest(file_list, save_list, fillvalue=None):
|
||||
if s is not None: f=s # If there is a save_name given, use that one
|
||||
if not os.path.exists(os.path.join(data_path, dataset_name, f)):
|
||||
return False
|
||||
return True
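The izip_longest/fillvalue pattern above (and in download_data below) pairs each file list with an optional, possibly shorter, save-name list and falls back to the file name when no save name is given; a small standalone illustration with made-up values:

    from itertools import izip_longest   # Python 2, as in this module

    files      = [['raw.tar', 'matrix.txt.gz'], ['family.soft.gz']]
    save_names = [['data.tar', None]]     # second group has no save names at all

    for file_list, save_list in izip_longest(files, save_names, fillvalue=[]):
        for f, s in izip_longest(file_list, save_list, fillvalue=None):
            target = s if s is not None else f
            print f, '->', target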
|
||||
|
||||
def download_url(url, store_directory, save_name = None, messages = True, suffix=''):
|
||||
def download_url(url, store_directory, save_name=None, messages=True, suffix=''):
|
||||
"""Download a file from a url and save it to disk."""
|
||||
i = url.rfind('/')
|
||||
file = url[i+1:]
|
||||
print file
|
||||
dir_name = os.path.join(data_path, store_directory)
|
||||
save_name = os.path.join(dir_name, file)
|
||||
print "Downloading ", url, "->", os.path.join(store_directory, file)
|
||||
|
||||
if save_name is None: save_name = os.path.join(dir_name, file)
|
||||
else: save_name = os.path.join(dir_name, save_name)
|
||||
|
||||
if suffix is None: suffix=''
|
||||
|
||||
print "Downloading ", url, "->", save_name
|
||||
if not os.path.exists(dir_name):
|
||||
os.makedirs(dir_name)
|
||||
try:
@ -178,19 +190,24 @@ def authorize_download(dataset_name=None):
|
|||
|
||||
def download_data(dataset_name=None):
|
||||
"""Check with the user that the are happy with terms and conditions for the data set, then download it."""
|
||||
import itertools
|
||||
|
||||
dr = data_resources[dataset_name]
|
||||
if not authorize_download(dataset_name):
|
||||
raise Exception("Permission to download data set denied.")
|
||||
|
||||
if dr.has_key('suffices'):
|
||||
for url, files, suffices in zip(dr['urls'], dr['files'], dr['suffices']):
|
||||
for file, suffix in zip(files, suffices):
|
||||
download_url(os.path.join(url,file), dataset_name, dataset_name, suffix=suffix)
|
||||
else:
|
||||
for url, files in zip(dr['urls'], dr['files']):
|
||||
for file in files:
|
||||
download_url(os.path.join(url,file), dataset_name, dataset_name)
|
||||
zip_urls = (dr['urls'], dr['files'])
|
||||
|
||||
if dr.has_key('save_names'): zip_urls += (dr['save_names'], )
|
||||
else: zip_urls += ([],)
|
||||
|
||||
if dr.has_key('suffices'): zip_urls += (dr['suffices'], )
|
||||
else: zip_urls += ([],)
|
||||
|
||||
for url, files, save_names, suffices in itertools.izip_longest(*zip_urls, fillvalue=[]):
|
||||
for f, save_name, suffix in itertools.izip_longest(files, save_names, suffices, fillvalue=None):
|
||||
download_url(os.path.join(url,f), dataset_name, save_name, suffix=suffix)
|
||||
|
||||
return True
|
||||
|
||||
def data_details_return(data, data_set):
@ -895,6 +912,143 @@ def singlecell(data_set='singlecell'):
|
|||
'genes': genes, 'labels':labels,
|
||||
}, data_set)
|
||||
|
||||
def singlecell_rna_seq_islam(dataset='singlecell_islam'):
|
||||
if not data_available(dataset):
|
||||
download_data(dataset)
|
||||
|
||||
from pandas import read_csv, DataFrame, concat
|
||||
dir_path = os.path.join(data_path, dataset)
|
||||
filename = os.path.join(dir_path, 'GSE29087_L139_expression_tab.txt.gz')
|
||||
data = read_csv(filename, sep='\t', skiprows=6, compression='gzip', header=None)
|
||||
header1 = read_csv(filename, sep='\t', header=None, skiprows=5, nrows=1, compression='gzip')
|
||||
header2 = read_csv(filename, sep='\t', header=None, skiprows=3, nrows=1, compression='gzip')
|
||||
data.columns = np.concatenate((header1.ix[0, :], header2.ix[0, 7:]))
|
||||
Y = data.set_index("Feature").ix[8:, 6:-4].T.astype(float)
|
||||
|
||||
# read the info .soft
|
||||
filename = os.path.join(dir_path, 'GSE29087_family.soft.gz')
|
||||
info = read_csv(filename, sep='\t', skiprows=0, compression='gzip', header=None)
|
||||
# split at ' = '
|
||||
info = DataFrame(info.ix[:,0].str.split(' = ').tolist())
|
||||
# only take samples:
|
||||
info = info[info[0].str.contains("!Sample")]
|
||||
info[0] = info[0].apply(lambda row: row[len("!Sample_"):])
|
||||
|
||||
groups = info.groupby(0).groups
|
||||
# remove 'GGG' from barcodes
|
||||
barcode = info[1][groups['barcode']].apply(lambda row: row[:-3])
|
||||
|
||||
title = info[1][groups['title']]
|
||||
title.index = barcode
|
||||
title.name = 'title'
|
||||
geo_accession = info[1][groups['geo_accession']]
|
||||
geo_accession.index = barcode
|
||||
geo_accession.name = 'geo_accession'
|
||||
case_id = info[1][groups['source_name_ch1']]
|
||||
case_id.index = barcode
|
||||
case_id.name = 'source_name_ch1'
|
||||
|
||||
info = concat([title, geo_accession, case_id], axis=1)
|
||||
labels = info.join(Y).source_name_ch1[:-4]
|
||||
labels[labels=='Embryonic stem cell'] = "ES"
|
||||
labels[labels=='Embryonic fibroblast'] = "MEF"
|
||||
|
||||
return data_details_return({'Y': Y,
|
||||
'info': '92 single cells (48 mouse ES cells, 44 mouse embryonic fibroblasts and 4 negative controls) were analyzed by single-cell tagged reverse transcription (STRT)',
|
||||
'genes': Y.columns,
|
||||
'labels': labels,
|
||||
'datadf': data,
|
||||
'infodf': info}, dataset)
|
||||
|
||||
def singlecell_rna_seq_deng(dataset='singlecell_deng'):
|
||||
if not data_available(dataset):
|
||||
download_data(dataset)
|
||||
|
||||
from pandas import read_csv, isnull
|
||||
dir_path = os.path.join(data_path, dataset)
|
||||
|
||||
# read the info .soft
|
||||
filename = os.path.join(dir_path, 'GSE45719_series_matrix.txt.gz')
|
||||
info = read_csv(filename, sep='\t', skiprows=0, compression='gzip', header=None, nrows=29, index_col=0)
|
||||
summary = info.loc['!Series_summary'][1]
|
||||
design = info.loc['!Series_overall_design']
|
||||
|
||||
# only take samples:
|
||||
sample_info = read_csv(filename, sep='\t', skiprows=30, compression='gzip', header=0, index_col=0).T
|
||||
sample_info.columns = sample_info.columns.to_series().apply(lambda row: row[len("!Sample_"):])
|
||||
sample_info.columns.name = sample_info.columns.name[len("!Sample_"):]
|
||||
sample_info = sample_info[['geo_accession', 'characteristics_ch1', 'description']]
|
||||
sample_info = sample_info.iloc[:, np.r_[0:4, 5:sample_info.shape[1]]]
|
||||
c = sample_info.columns.to_series()
|
||||
c[1:4] = ['strain', 'cross', 'developmental_stage']
|
||||
sample_info.columns = c
|
||||
|
||||
# get the labels right:
|
||||
rep = re.compile('\(.*\)')
|
||||
def filter_dev_stage(row):
|
||||
if isnull(row):
|
||||
row = "2-cell stage embryo"
|
||||
if row.startswith("developmental stage: "):
|
||||
row = row[len("developmental stage: "):]
|
||||
if row == 'adult':
|
||||
row += " liver"
|
||||
row = row.replace(' stage ', ' ')
|
||||
row = rep.sub(' ', row)
|
||||
row = row.strip(' ')
|
||||
return row
|
||||
labels = sample_info.developmental_stage.apply(filter_dev_stage)
|
||||
|
||||
# Extract the tar file
|
||||
filename = os.path.join(dir_path, 'GSE45719_Raw.tar')
|
||||
with tarfile.open(filename, 'r') as files:
|
||||
print "Extracting Archive {}...".format(files.name)
|
||||
data = None
|
||||
gene_info = None
|
||||
message = ''
|
||||
members = files.getmembers()
|
||||
overall = len(members)
|
||||
for i, file_info in enumerate(members):
|
||||
f = files.extractfile(file_info)
|
||||
inner = read_csv(f, sep='\t', header=0, compression='gzip', index_col=0)
|
||||
print ' '*(len(message)+1) + '\r',
|
||||
message = "{: >7.2%}: Extracting: {}".format(float(i+1)/overall, file_info.name[:20]+"...txt.gz")
|
||||
print message,
|
||||
if data is None:
|
||||
data = inner.RPKM.to_frame()
|
||||
data.columns = [file_info.name[:-18]]
|
||||
gene_info = inner.Refseq_IDs.to_frame()
|
||||
gene_info.columns = [file_info.name[:-18]]
|
||||
else:
|
||||
data[file_info.name[:-18]] = inner.RPKM
|
||||
gene_info[file_info.name[:-18]] = inner.Refseq_IDs
|
||||
|
||||
# Strip GSM number off data index
|
||||
rep = re.compile('GSM\d+_')
|
||||
data.columns = data.columns.to_series().apply(lambda row: row[rep.match(row).end():])
|
||||
data = data.T
|
||||
|
||||
# make sure the same index gets used
|
||||
sample_info.index = data.index
|
||||
|
||||
# get the labels from the description
|
||||
#rep = re.compile('fibroblast|\d+-cell|embryo|liver|early blastocyst|mid blastocyst|late blastocyst|blastomere|zygote', re.IGNORECASE)
|
||||
|
||||
sys.stdout.write(' '*len(message) + '\r')
|
||||
sys.stdout.flush()
|
||||
print
|
||||
print "Read Archive {}".format(files.name)
|
||||
|
||||
return data_details_return({'Y': data,
|
||||
'series_info': info,
|
||||
'sample_info': sample_info,
|
||||
'gene_info': gene_info,
|
||||
'summary': summary,
|
||||
'design': design,
|
||||
'genes': data.columns,
|
||||
'labels': labels,
|
||||
}, dataset)
|
||||
|
||||
|
||||
def swiss_roll_1000():
|
||||
return swiss_roll(num_samples=1000)
|
||||
|
||||
|
|
|
|||
File diff suppressed because one or more lines are too long
|
|
@ -13,7 +13,7 @@ def initialize_latent(init, input_dim, Y):
|
|||
p = pca(Y)
|
||||
PC = p.project(Y, min(input_dim, Y.shape[1]))
|
||||
Xr[:PC.shape[0], :PC.shape[1]] = PC
|
||||
var = p.fracs[:input_dim]
|
||||
var = .1*p.fracs[:input_dim]
|
||||
else:
|
||||
var = Xr.var(0)
|
||||
|
||||
|
|
|
|||
|
|
@ -10,11 +10,10 @@ from scipy import linalg, weave
|
|||
import types
|
||||
import ctypes
|
||||
from ctypes import byref, c_char, c_int, c_double # TODO
|
||||
# import scipy.lib.lapack
|
||||
import scipy
|
||||
import warnings
|
||||
import os
|
||||
from config import *
|
||||
from config import config
|
||||
import logging
|
||||
|
||||
_scipyversion = np.float64((scipy.__version__).split('.')[:2])
|
||||
|
|
@ -521,6 +520,27 @@ def symmetrify(A, upper=False):
|
|||
Take the square matrix A and make it symmetrical by copting elements from the lower half to the upper
|
||||
|
||||
works IN PLACE.
|
||||
|
||||
note: tries to use weave, falls back to a slower numpy version
|
||||
"""
|
||||
if config.getboolean('weave', 'working'):
|
||||
try:
|
||||
symmetrify_weave(A, upper)
|
||||
except:
|
||||
print "\n Weave compilation failed. Falling back to (slower) numpy implementation\n"
|
||||
config.set('weave', 'working', 'False')
|
||||
symmetrify_numpy(A, upper)
|
||||
else:
|
||||
symmetrify_numpy(A, upper)
|
||||
|
||||
|
||||
def symmetrify_weave(A, upper=False):
|
||||
"""
|
||||
Take the square matrix A and make it symmetrical by copting elements from the lower half to the upper
|
||||
|
||||
works IN PLACE.
|
||||
|
||||
|
||||
"""
|
||||
N, M = A.shape
|
||||
assert N == M
|
||||
|
|
@ -563,10 +583,15 @@ def symmetrify(A, upper=False):
|
|||
A += np.tril(tmp, -1).T
|
||||
|
||||
|
||||
def symmetrify_murray(A):
|
||||
A += A.T
|
||||
nn = A.shape[0]
|
||||
A[[range(nn), range(nn)]] /= 2.0
|
||||
def symmetrify_numpy(A, upper=False):
|
||||
"""
|
||||
Force a matrix to be symmetric
|
||||
"""
|
||||
triu = np.triu_indices_from(A,k=1)
|
||||
if upper:
|
||||
A.T[triu] = A[triu]
|
||||
else:
|
||||
A[triu] = A.T[triu]
|
||||
|
||||
def cholupdate(L, x):
|
||||
"""
|
||||
|
|
|
|||
|
|
@ -2,7 +2,6 @@
|
|||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
import numpy as np
|
||||
from scipy import weave
|
||||
from config import *
|
||||
|
||||
def chain_1(df_dg, dg_dx):
|
||||
|
|
@ -84,96 +83,6 @@ def kmm_init(X, m = 10):
|
|||
inducing = np.array(inducing)
|
||||
return X[inducing]
|
||||
|
||||
def fast_array_equal(A, B):
|
||||
|
||||
|
||||
if config.getboolean('parallel', 'openmp'):
|
||||
pragma_string = '#pragma omp parallel for private(i, j)'
|
||||
else:
|
||||
pragma_string = ''
|
||||
|
||||
code2="""
|
||||
int i, j;
|
||||
return_val = 1;
|
||||
|
||||
%s
|
||||
for(i=0;i<N;i++){
|
||||
for(j=0;j<D;j++){
|
||||
if(A(i, j) != B(i, j)){
|
||||
return_val = 0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
""" % pragma_string
|
||||
|
||||
if config.getboolean('parallel', 'openmp'):
|
||||
pragma_string = '#pragma omp parallel for private(i, j, z)'
|
||||
else:
|
||||
pragma_string = ''
|
||||
|
||||
code3="""
|
||||
int i, j, z;
|
||||
return_val = 1;
|
||||
|
||||
%s
|
||||
for(i=0;i<N;i++){
|
||||
for(j=0;j<D;j++){
|
||||
for(z=0;z<Q;z++){
|
||||
if(A(i, j, z) != B(i, j, z)){
|
||||
return_val = 0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
""" % pragma_string
|
||||
|
||||
if config.getboolean('parallel', 'openmp'):
|
||||
header_string = '#include <omp.h>'
|
||||
else:
|
||||
header_string = ''
|
||||
|
||||
support_code = """
|
||||
%s
|
||||
#include <math.h>
|
||||
""" % header_string
|
||||
|
||||
|
||||
weave_options_openmp = {'headers' : ['<omp.h>'],
|
||||
'extra_compile_args': ['-fopenmp -O3'],
|
||||
'extra_link_args' : ['-lgomp'],
|
||||
'libraries': ['gomp']}
|
||||
weave_options_noopenmp = {'extra_compile_args': ['-O3']}
|
||||
|
||||
if config.getboolean('parallel', 'openmp'):
|
||||
weave_options = weave_options_openmp
|
||||
else:
|
||||
weave_options = weave_options_noopenmp
|
||||
|
||||
value = False
|
||||
|
||||
|
||||
if (A == None) and (B == None):
|
||||
return True
|
||||
elif ((A == None) and (B != None)) or ((A != None) and (B == None)):
|
||||
return False
|
||||
elif A.shape == B.shape:
|
||||
if A.ndim == 2:
|
||||
N, D = [int(i) for i in A.shape]
|
||||
value = weave.inline(code2, support_code=support_code,
|
||||
arg_names=['A', 'B', 'N', 'D'],
|
||||
type_converters=weave.converters.blitz, **weave_options)
|
||||
elif A.ndim == 3:
|
||||
N, D, Q = [int(i) for i in A.shape]
|
||||
value = weave.inline(code3, support_code=support_code,
|
||||
arg_names=['A', 'B', 'N', 'D', 'Q'],
|
||||
type_converters=weave.converters.blitz, **weave_options)
|
||||
else:
|
||||
value = np.array_equal(A,B)
|
||||
|
||||
return value
|
||||
|
||||
### make a parameter to its corresponding array:
|
||||
def param_to_array(*param):
|
||||
"""
|
||||
|
|
@ -181,6 +90,8 @@ Convert an arbitrary number of parameters to :class:ndarray class objects. This
|
|||
converting parameter objects to numpy arrays, when using scipy.weave.inline routine.
|
||||
In scipy.weave.blitz there is no automatic array detection (even when the array inherits
|
||||
from :class:ndarray)"""
|
||||
import warnings
|
||||
warnings.warn("Please use param.values, as this function will be deprecated in the next release.", DeprecationWarning)
|
||||
assert len(param) > 0, "At least one parameter needed"
|
||||
if len(param) == 1:
|
||||
return param[0].view(np.ndarray)
|
||||
|
|
|
|||
|
|
@ -11,14 +11,16 @@ try:
|
|||
except:
|
||||
pass
|
||||
from numpy.linalg.linalg import LinAlgError
|
||||
from operator import setitem
|
||||
import itertools
|
||||
|
||||
class pca(object):
|
||||
"""
|
||||
pca module with automatic primal/dual determination.
|
||||
"""
|
||||
def __init__(self, X):
|
||||
self.mu = X.mean(0)
|
||||
self.sigma = X.std(0)
|
||||
self.mu = None
|
||||
self.sigma = None
|
||||
|
||||
X = self.center(X)
|
||||
|
||||
|
|
@ -39,6 +41,13 @@ class pca(object):
|
|||
"""
|
||||
Center `X` in pca space.
|
||||
"""
|
||||
X = X.copy()
|
||||
inan = numpy.isnan(X)
|
||||
if self.mu is None:
|
||||
X_ = numpy.ma.masked_array(X, inan)
|
||||
self.mu = X_.mean(0).base
|
||||
self.sigma = X_.std(0).base
|
||||
reduce(lambda y,x: setitem(x[0], x[1], x[2]), itertools.izip(X.T, inan.T, self.mu), None)
|
||||
X = X - self.mu
|
||||
X = X / numpy.where(self.sigma == 0, 1e-30, self.sigma)
|
||||
return X
|
||||
|
|
|
|||
|
|
@ -1,367 +0,0 @@
|
|||
import sys
|
||||
import numpy as np
|
||||
import sympy as sym
|
||||
from sympy import Function, S, oo, I, cos, sin, asin, log, erf, pi, exp, sqrt, sign, gamma, polygamma
|
||||
from sympy.matrices import Matrix
|
||||
########################################
|
||||
## Try to do some matrix functions: problem, you can't do derivatives
|
||||
## with respect to matrix functions :-(
|
||||
|
||||
class GPySymMatrix(Matrix):
|
||||
def __init__(self, indices):
|
||||
Matrix.__init__(self)
|
||||
def atoms(self):
|
||||
return [e2 for e in self for e2 in e.atoms()]
|
||||
|
||||
class selector(Function):
|
||||
"""A function that returns an element of a Matrix depending on input indices."""
|
||||
nargs = 3
|
||||
def fdiff(self, argindex=1):
|
||||
return selector(*self.args)
|
||||
@classmethod
|
||||
def eval(cls, X, i, j):
|
||||
if i.is_Number and j.is_Number:
|
||||
return X[i, j]
|
||||
|
||||
##################################################
|
||||
class logistic(Function):
|
||||
"""The logistic function as a symbolic function."""
|
||||
nargs = 1
|
||||
def fdiff(self, argindex=1):
|
||||
x = self.args[0]
|
||||
return logistic(x)*(1-logistic(x))
|
||||
|
||||
@classmethod
|
||||
def eval(cls, x):
|
||||
if x.is_Number:
|
||||
return 1/(1+exp(-x))
|
||||
|
||||
class logisticln(Function):
|
||||
"""The log logistic, which can often be computed with more precision than the simply taking log(logistic(x)) when x is small or large."""
|
||||
nargs = 1
|
||||
|
||||
def fdiff(self, argindex=1):
|
||||
x = self.args[0]
|
||||
return 1-logistic(x)
|
||||
|
||||
@classmethod
|
||||
def eval(cls, x):
|
||||
if x.is_Number:
|
||||
return -np.log(1+exp(-x))
|
||||
|
||||
class erfc(Function):
|
||||
"""The complementary error function, erfc(x) = 1-erf(x). Used as a helper function, particularly for erfcx, the scaled complementary error function. and the normal distributions cdf."""
|
||||
nargs = 1
|
||||
|
||||
@classmethod
|
||||
def eval(cls, arg):
|
||||
return 1-erf(arg)
|
||||
|
||||
class erfcx(Function):
|
||||
nargs = 1
|
||||
def fdiff(self, argindex=1):
|
||||
x = self.args[0]
|
||||
return x*erfcx(x)-2/sqrt(pi)
|
||||
|
||||
@classmethod
|
||||
def eval(cls, x):
|
||||
if x.is_Number:
|
||||
return exp(x**2)*erfc(x)
|
||||
|
||||
class gammaln(Function):
|
||||
"""The log of the gamma function, which is often needed instead of log(gamma(x)) for better accuracy for large x."""
|
||||
nargs = 1
|
||||
|
||||
def fdiff(self, argindex=1):
|
||||
x=self.args[0]
|
||||
return polygamma(0, x)
|
||||
|
||||
@classmethod
|
||||
def eval(cls, x):
|
||||
if x.is_Number:
|
||||
return log(gamma(x))
|
||||
|
||||
|
||||
class normcdfln(Function):
|
||||
"""The log of the normal cdf. Can often be computed with better accuracy than log(normcdf(x)), particulary when x is either small or large."""
|
||||
nargs = 1
|
||||
|
||||
def fdiff(self, argindex=1):
|
||||
x = self.args[0]
|
||||
#return -erfcx(-x/sqrt(2))/sqrt(2*pi)
|
||||
#return exp(-normcdfln(x) - 0.5*x*x)/sqrt(2*pi)
|
||||
return sqrt(2/pi)*1/erfcx(-x/sqrt(2))
|
||||
|
||||
@classmethod
|
||||
def eval(cls, x):
|
||||
if x.is_Number:
|
||||
return log(normcdf(x))
|
||||
|
||||
def _eval_is_real(self):
|
||||
return self.args[0].is_real
|
||||
|
||||
class normcdf(Function):
|
||||
"""The cumulative distribution function of the standard normal. Provided as a convenient helper function. It is computed throught -0.5*erfc(-x/sqrt(2))."""
|
||||
nargs = 1
|
||||
def fdiff(self, argindex=1):
|
||||
x = self.args[0]
|
||||
return gaussian(x)
|
||||
|
||||
@classmethod
|
||||
def eval(cls, x):
|
||||
if x.is_Number:
|
||||
return 0.5*(erfc(-x/sqrt(2)))
|
||||
|
||||
def _eval_is_real(self):
|
||||
return self.args[0].is_real
|
||||
|
||||
class normalln(Function):
|
||||
"""The log of the standard normal distribution."""
|
||||
nargs = 1
|
||||
def fdiff(self, argindex=1):
|
||||
x = self.args[0]
|
||||
return -x
|
||||
|
||||
@classmethod
|
||||
def eval(cls, x):
|
||||
if x.is_Number:
|
||||
return 0.5*sqrt(2*pi) - 0.5*x*x
|
||||
|
||||
|
||||
def _eval_is_real(self):
|
||||
return True
|
||||
|
||||
|
||||
class normal(Function):
|
||||
"""The standard normal distribution. Provided as a convenience function."""
|
||||
nargs = 1
|
||||
@classmethod
|
||||
def eval(cls, x):
|
||||
return 1/sqrt(2*pi)*exp(-0.5*x*x)
|
||||
|
||||
def _eval_is_real(self):
|
||||
return True
|
||||
|
||||
class differfln(Function):
|
||||
nargs = 2
|
||||
|
||||
def fdiff(self, argindex=2):
|
||||
if argindex == 2:
|
||||
x0, x1 = self.args
|
||||
return -2/(sqrt(pi)*(erfcx(x1)-exp(x1**2-x0**2)*erfcx(x0)))
|
||||
elif argindex == 1:
|
||||
x0, x1 = self.args
|
||||
return 2/(sqrt(pi)*(exp(x0**2-x1**2)*erfcx(x1)-erfcx(x0)))
|
||||
else:
|
||||
raise ArgumentIndexError(self, argindex)
|
||||
|
||||
@classmethod
|
||||
def eval(cls, x0, x1):
|
||||
if x0.is_Number and x1.is_Number:
|
||||
return log(erfc(x1)-erfc(x0))
|
||||
|
||||
class dh_dd_i(Function):
|
||||
nargs = 5
|
||||
@classmethod
|
||||
def eval(cls, t, tprime, d_i, d_j, l):
|
||||
if (t.is_Number
|
||||
and tprime.is_Number
|
||||
and d_i.is_Number
|
||||
and d_j.is_Number
|
||||
and l.is_Number):
|
||||
|
||||
diff_t = (t-tprime)
|
||||
l2 = l*l
|
||||
h = h(t, tprime, d_i, d_j, l)
|
||||
half_l_di = 0.5*l*d_i
|
||||
arg_1 = half_l_di + tprime/l
|
||||
arg_2 = half_l_di - (t-tprime)/l
|
||||
ln_part_1 = ln_diff_erf(arg_1, arg_2)
|
||||
arg_1 = half_l_di
|
||||
arg_2 = half_l_di - t/l
|
||||
sign_val = sign(t/l)
|
||||
ln_part_2 = ln_diff_erf(half_l_di, half_l_di - t/l)
|
||||
|
||||
base = ((0.5*d_i*l2*(d_i+d_j)-1)*h
|
||||
+ (-diff_t*sign_val*exp(half_l_di*half_l_di
|
||||
-d_i*diff_t
|
||||
+ln_part_1)
|
||||
+t*sign_val*exp(half_l_di*half_l_di
|
||||
-d_i*t-d_j*tprime
|
||||
+ln_part_2))
|
||||
+ l/sqrt(pi)*(-exp(-diff_t*diff_t/l2)
|
||||
+exp(-tprime*tprime/l2-d_i*t)
|
||||
+exp(-t*t/l2-d_j*tprime)
|
||||
-exp(-(d_i*t + d_j*tprime))))
|
||||
return base/(d_i+d_j)
|
||||
|
||||
class dh_dd_j(Function):
|
||||
nargs = 5
|
||||
@classmethod
|
||||
def eval(cls, t, tprime, d_i, d_j, l):
|
||||
if (t.is_Number
|
||||
and tprime.is_Number
|
||||
and d_i.is_Number
|
||||
and d_j.is_Number
|
||||
and l.is_Number):
|
||||
diff_t = (t-tprime)
|
||||
l2 = l*l
|
||||
half_l_di = 0.5*l*d_i
|
||||
h = h(t, tprime, d_i, d_j, l)
|
||||
arg_1 = half_l_di + tprime/l
|
||||
arg_2 = half_l_di - (t-tprime)/l
|
||||
ln_part_1 = ln_diff_erf(arg_1, arg_2)
|
||||
arg_1 = half_l_di
|
||||
arg_2 = half_l_di - t/l
|
||||
sign_val = sign(t/l)
|
||||
ln_part_2 = ln_diff_erf(half_l_di, half_l_di - t/l)
|
||||
sign_val = sign(t/l)
|
||||
base = tprime*sign_val*exp(half_l_di*half_l_di-(d_i*t+d_j*tprime)+ln_part_2)-h
|
||||
return base/(d_i+d_j)
|
||||
|
||||
class dh_dl(Function):
|
||||
nargs = 5
|
||||
@classmethod
|
||||
def eval(cls, t, tprime, d_i, d_j, l):
|
||||
if (t.is_Number
|
||||
and tprime.is_Number
|
||||
and d_i.is_Number
|
||||
and d_j.is_Number
|
||||
and l.is_Number):
|
||||
|
||||
diff_t = (t-tprime)
|
||||
l2 = l*l
|
||||
h = h(t, tprime, d_i, d_j, l)
|
||||
return 0.5*d_i*d_i*l*h + 2./(sqrt(pi)*(d_i+d_j))*((-diff_t/l2-d_i/2.)*exp(-diff_t*diff_t/l2)+(-tprime/l2+d_i/2.)*exp(-tprime*tprime/l2-d_i*t)-(-t/l2-d_i/2.)*exp(-t*t/l2-d_j*tprime)-d_i/2.*exp(-(d_i*t+d_j*tprime)))
|
||||
|
||||
class dh_dt(Function):
|
||||
nargs = 5
|
||||
@classmethod
|
||||
def eval(cls, t, tprime, d_i, d_j, l):
|
||||
if (t.is_Number
|
||||
and tprime.is_Number
|
||||
and d_i.is_Number
|
||||
and d_j.is_Number
|
||||
and l.is_Number):
|
||||
if (t is S.NaN
|
||||
or tprime is S.NaN
|
||||
or d_i is S.NaN
|
||||
or d_j is S.NaN
|
||||
or l is S.NaN):
|
||||
return S.NaN
|
||||
else:
|
||||
half_l_di = 0.5*l*d_i
|
||||
arg_1 = half_l_di + tprime/l
|
||||
arg_2 = half_l_di - (t-tprime)/l
|
||||
ln_part_1 = ln_diff_erf(arg_1, arg_2)
|
||||
arg_1 = half_l_di
|
||||
arg_2 = half_l_di - t/l
|
||||
sign_val = sign(t/l)
|
||||
ln_part_2 = ln_diff_erf(half_l_di, half_l_di - t/l)
|
||||
|
||||
|
||||
return (sign_val*exp(half_l_di*half_l_di
|
||||
- d_i*(t-tprime)
|
||||
+ ln_part_1
|
||||
- log(d_i + d_j))
|
||||
- sign_val*exp(half_l_di*half_l_di
|
||||
- d_i*t - d_j*tprime
|
||||
+ ln_part_2
|
||||
- log(d_i + d_j))).diff(t)
|
||||
|
||||
class dh_dtprime(Function):
|
||||
nargs = 5
|
||||
@classmethod
|
||||
def eval(cls, t, tprime, d_i, d_j, l):
|
||||
if (t.is_Number
|
||||
and tprime.is_Number
|
||||
and d_i.is_Number
|
||||
and d_j.is_Number
|
||||
and l.is_Number):
|
||||
if (t is S.NaN
|
||||
or tprime is S.NaN
|
||||
or d_i is S.NaN
|
||||
or d_j is S.NaN
|
||||
or l is S.NaN):
|
||||
return S.NaN
|
||||
else:
|
||||
half_l_di = 0.5*l*d_i
|
||||
arg_1 = half_l_di + tprime/l
|
||||
arg_2 = half_l_di - (t-tprime)/l
|
||||
ln_part_1 = ln_diff_erf(arg_1, arg_2)
|
||||
arg_1 = half_l_di
|
||||
arg_2 = half_l_di - t/l
|
||||
sign_val = sign(t/l)
|
||||
ln_part_2 = ln_diff_erf(half_l_di, half_l_di - t/l)
|
||||
|
||||
|
||||
return (sign_val*exp(half_l_di*half_l_di
|
||||
- d_i*(t-tprime)
|
||||
+ ln_part_1
|
||||
- log(d_i + d_j))
|
||||
- sign_val*exp(half_l_di*half_l_di
|
||||
- d_i*t - d_j*tprime
|
||||
+ ln_part_2
|
||||
- log(d_i + d_j))).diff(tprime)
|
||||
|
||||
|
||||
class h(Function):
|
||||
nargs = 5
|
||||
def fdiff(self, argindex=5):
|
||||
t, tprime, d_i, d_j, l = self.args
|
||||
if argindex == 1:
|
||||
return dh_dt(t, tprime, d_i, d_j, l)
|
||||
elif argindex == 2:
|
||||
return dh_dtprime(t, tprime, d_i, d_j, l)
|
||||
elif argindex == 3:
|
||||
return dh_dd_i(t, tprime, d_i, d_j, l)
|
||||
elif argindex == 4:
|
||||
return dh_dd_j(t, tprime, d_i, d_j, l)
|
||||
elif argindex == 5:
|
||||
return dh_dl(t, tprime, d_i, d_j, l)
|
||||
|
||||
|
||||
@classmethod
|
||||
def eval(cls, t, tprime, d_i, d_j, l):
|
||||
if (t.is_Number
|
||||
and tprime.is_Number
|
||||
and d_i.is_Number
|
||||
and d_j.is_Number
|
||||
and l.is_Number):
|
||||
if (t is S.NaN
|
||||
or tprime is S.NaN
|
||||
or d_i is S.NaN
|
||||
or d_j is S.NaN
|
||||
or l is S.NaN):
|
||||
return S.NaN
|
||||
else:
|
||||
half_l_di = 0.5*l*d_i
|
||||
arg_1 = half_l_di + tprime/l
|
||||
arg_2 = half_l_di - (t-tprime)/l
|
||||
ln_part_1 = ln_diff_erf(arg_1, arg_2)
|
||||
arg_1 = half_l_di
|
||||
arg_2 = half_l_di - t/l
|
||||
sign_val = sign(t/l)
|
||||
ln_part_2 = ln_diff_erf(half_l_di, half_l_di - t/l)
|
||||
|
||||
|
||||
return (sign_val*exp(half_l_di*half_l_di
|
||||
- d_i*(t-tprime)
|
||||
+ ln_part_1
|
||||
- log(d_i + d_j))
|
||||
- sign_val*exp(half_l_di*half_l_di
|
||||
- d_i*t - d_j*tprime
|
||||
+ ln_part_2
|
||||
- log(d_i + d_j)))
|
||||
|
||||
|
||||
# return (exp((d_j/2.*l)**2)/(d_i+d_j)
|
||||
# *(exp(-d_j*(tprime - t))
|
||||
# *(erf((tprime-t)/l - d_j/2.*l)
|
||||
# + erf(t/l + d_j/2.*l))
|
||||
# - exp(-(d_j*tprime + d_i))
|
||||
# *(erf(tprime/l - d_j/2.*l)
|
||||
# + erf(d_j/2.*l))))
|
||||
|
||||
|
||||
|
||||
|
|
@ -12,6 +12,39 @@ def std_norm_cdf(x):
|
|||
"""
|
||||
Cumulative standard Gaussian distribution
|
||||
Based on Abramowitz, M. and Stegun, I. (1970)
|
||||
"""
|
||||
x_shape = np.asarray(x).shape
|
||||
|
||||
if len(x_shape) == 0 or x_shape[0] == 1:
|
||||
sign = np.sign(x)
|
||||
x *= sign
|
||||
x /= np.sqrt(2.)
|
||||
t = 1.0/(1.0 + 0.3275911*x)
|
||||
erf = 1. - np.exp(-x**2)*t*(0.254829592 + t*(-0.284496736 + t*(1.421413741 + t*(-1.453152027 + t*(1.061405429)))))
|
||||
cdf_x = 0.5*(1.0 + sign*erf)
|
||||
return cdf_x
|
||||
else:
|
||||
x = np.atleast_1d(x).copy()
|
||||
cdf_x = np.zeros_like(x)
|
||||
sign = np.ones_like(x)
|
||||
neg_x_ind = x<0
|
||||
sign[neg_x_ind] = -1.0
|
||||
x[neg_x_ind] = -x[neg_x_ind]
|
||||
x /= np.sqrt(2.)
|
||||
t = 1.0/(1.0 + 0.3275911*x)
|
||||
erf = 1. - np.exp(-x**2)*t*(0.254829592 + t*(-0.284496736 + t*(1.421413741 + t*(-1.453152027 + t*(1.061405429)))))
|
||||
cdf_x = 0.5*(1.0 + sign*erf)
|
||||
cdf_x = cdf_x.reshape(x_shape)
|
||||
return cdf_x
|
||||
|
||||
def std_norm_cdf_weave(x):
|
||||
"""
|
||||
Cumulative standard Gaussian distribution
|
||||
Based on Abramowitz, M. and Stegun, I. (1970)
|
||||
|
||||
A weave implementation of std_norm_cdf, which is faster. this is unused,
|
||||
because of the difficulties of a weave dependency. (see github issue #94)
|
||||
|
||||
"""
|
||||
#Generalize for many x
|
||||
x = np.asarray(x).copy()
|
||||
|
|
@ -40,37 +73,6 @@ def std_norm_cdf(x):
|
|||
weave.inline(code, arg_names=['x', 'cdf_x', 'N'], support_code=support_code)
|
||||
return cdf_x
|
||||
|
||||
def std_norm_cdf_np(x):
|
||||
"""
|
||||
Cumulative standard Gaussian distribution
|
||||
Based on Abramowitz, M. and Stegun, I. (1970)
|
||||
Around 3 times slower when x is a scalar otherwise quite a lot slower
|
||||
"""
|
||||
x_shape = np.asarray(x).shape
|
||||
|
||||
if len(x_shape) == 0 or x_shape[0] == 1:
|
||||
sign = np.sign(x)
|
||||
x *= sign
|
||||
x /= np.sqrt(2.)
|
||||
t = 1.0/(1.0 + 0.3275911*x)
|
||||
erf = 1. - np.exp(-x**2)*t*(0.254829592 + t*(-0.284496736 + t*(1.421413741 + t*(-1.453152027 + t*(1.061405429)))))
|
||||
cdf_x = 0.5*(1.0 + sign*erf)
|
||||
return cdf_x
|
||||
else:
|
||||
x = np.atleast_1d(x).copy()
|
||||
cdf_x = np.zeros_like(x)
|
||||
sign = np.ones_like(x)
|
||||
neg_x_ind = x<0
|
||||
sign[neg_x_ind] = -1.0
|
||||
x[neg_x_ind] = -x[neg_x_ind]
|
||||
x /= np.sqrt(2.)
|
||||
t = 1.0/(1.0 + 0.3275911*x)
|
||||
erf = 1. - np.exp(-x**2)*t*(0.254829592 + t*(-0.284496736 + t*(1.421413741 + t*(-1.453152027 + t*(1.061405429)))))
|
||||
cdf_x = 0.5*(1.0 + sign*erf)
|
||||
cdf_x = cdf_x.reshape(x_shape)
|
||||
return cdf_x
|
||||
|
||||
|
||||
def inv_std_norm_cdf(x):
|
||||
"""
|
||||
Inverse cumulative standard Gaussian distribution
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue