Mirror of https://github.com/SheffieldML/GPy.git (synced 2026-05-10 12:32:40 +02:00)

Merge branch 'devel' of github.com:SheffieldML/GPy into devel
Commit 65b6e54d5c
42 changed files with 10282 additions and 7712 deletions
@ -60,9 +60,11 @@ class GP(Model):
|
|||
self.normalizer.scale_by(Y)
|
||||
self.Y_normalized = ObsAr(self.normalizer.normalize(Y))
|
||||
self.Y = Y
|
||||
else:
|
||||
elif isinstance(Y, np.ndarray):
|
||||
self.Y = ObsAr(Y)
|
||||
self.Y_normalized = self.Y
|
||||
else:
|
||||
self.Y = Y
|
||||
|
||||
if Y.shape[0] != self.num_data:
|
||||
#There can be cases where we want inputs than outputs, for example if we have multiple latent
|
||||
|
|
@@ -473,16 +475,16 @@ class GP(Model):
self.inference_method.on_optimization_end()
raise

def infer_newX(self, Y_new, optimize=True, ):
def infer_newX(self, Y_new, optimize=True):
"""
Infer the distribution of X for the new observed data *Y_new*.
Infer X for the new observed data *Y_new*.

:param Y_new: the new observed data for inference
:type Y_new: numpy.ndarray
:param optimize: whether to optimize the location of new X (True by default)
:type optimize: boolean
:return: a tuple containing the posterior estimation of X and the model that optimizes X
:rtype: (:class:`~GPy.core.parameterization.variational.VariationalPosterior` or numpy.ndarray, :class:`~GPy.core.model.Model`)
:rtype: (:class:`~GPy.core.parameterization.variational.VariationalPosterior` and numpy.ndarray, :class:`~GPy.core.model.Model`)
"""
from ..inference.latent_function_inference.inferenceX import infer_newX
return infer_newX(self, Y_new, optimize=optimize)
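A minimal usage sketch of the new signature (made-up data; assumes a small Bayesian GPLVM exposes this method as shown above):

import numpy as np
import GPy

np.random.seed(0)
Y = np.random.randn(40, 5)
m = GPy.models.BayesianGPLVM(Y, 2)      # tiny model, just for illustration
m.optimize(max_iters=50)

Y_new = np.random.randn(5, 5)           # new observations with the same output dimension
X_new, inference_model = m.infer_newX(Y_new, optimize=True)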
|
||||
|
|
|
|||
|
|
@ -257,7 +257,7 @@ class Model(Parameterized):
|
|||
optimizer = optimization.get_optimizer(optimizer)
|
||||
opt = optimizer(start, model=self, max_iters=max_iters, **kwargs)
|
||||
|
||||
with VerboseOptimization(self, opt, maxiters=max_iters, verbose=messages, ipython_notebook=ipython_notebook) as vo:
|
||||
with VerboseOptimization(self, opt, maxiters=max_iters, verbose=messages, ipython_notebook=ipython_notebook, clear_after_finish=clear_after_finish) as vo:
|
||||
opt.run(f_fp=self._objective_grads, f=self._objective, fp=self._grads)
|
||||
vo.finish(opt)
|
||||
|
||||
|
|
|
|||
|
|
@ -197,9 +197,10 @@ class Parameterized(Parameterizable):
|
|||
raise RuntimeError("{} does not seem to be a parameter, remove parameters directly from their respective parents".format(str(param)))
|
||||
|
||||
start = sum([p.size for p in self.parameters[:param._parent_index_]])
|
||||
self._remove_parameter_name(param)
|
||||
self.size -= param.size
|
||||
del self.parameters[param._parent_index_]
|
||||
self._remove_parameter_name(param)
|
||||
|
||||
|
||||
param._disconnect_parent()
|
||||
param.remove_observer(self, self._pass_through_notify_observers)
|
||||
|
|
|
|||
|
|
@ -522,16 +522,9 @@ class DGPLVM(Prior):
|
|||
|
||||
"""
|
||||
domain = _REAL
|
||||
# _instances = []
|
||||
# def __new__(cls, mu, sigma): # Singleton:
|
||||
# if cls._instances:
|
||||
# cls._instances[:] = [instance for instance in cls._instances if instance()]
|
||||
# for instance in cls._instances:
|
||||
# if instance().mu == mu and instance().sigma == sigma:
|
||||
# return instance()
|
||||
# o = super(Prior, cls).__new__(cls, mu, sigma)
|
||||
# cls._instances.append(weakref.ref(o))
|
||||
# return cls._instances[-1]()
|
||||
|
||||
def __new__(cls, sigma2, lbl, x_shape):
|
||||
return super(Prior, cls).__new__(cls, sigma2, lbl, x_shape)
|
||||
|
||||
def __init__(self, sigma2, lbl, x_shape):
|
||||
self.sigma2 = sigma2
|
||||
|
|
@ -843,7 +836,7 @@ class DGPLVM_Lamda(Prior, Parameterized):
|
|||
|
||||
# Calculating beta and Bi for Sb
|
||||
def compute_sig_beta_Bi(self, data_idx, M_i, M_0, lst_idx_all):
|
||||
import pdb
|
||||
# import pdb
|
||||
# pdb.set_trace()
|
||||
B_i = np.zeros((self.classnum, self.dim))
|
||||
Sig_beta_B_i_all = np.zeros((self.datanum, self.dim))
|
||||
|
|
@ -909,8 +902,8 @@ class DGPLVM_Lamda(Prior, Parameterized):
|
|||
Sw = self.compute_Sw(cls, M_i)
|
||||
# Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))
|
||||
#Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
|
||||
#Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
|
||||
Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.1)[0]
|
||||
#Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.5))[0]
|
||||
Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.9)[0]
|
||||
return (-1 / self.sigma2) * np.trace(Sb_inv_N.dot(Sw))
|
||||
|
||||
# This function calculates derivative of the log of prior function
|
||||
|
|
@ -933,8 +926,8 @@ class DGPLVM_Lamda(Prior, Parameterized):
|
|||
# Calculating inverse of Sb and its transpose and minus
|
||||
# Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))
|
||||
#Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
|
||||
#Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
|
||||
Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.1)[0]
|
||||
#Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.5))[0]
|
||||
Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.9)[0]
|
||||
Sb_inv_N_trans = np.transpose(Sb_inv_N)
|
||||
Sb_inv_N_trans_minus = -1 * Sb_inv_N_trans
|
||||
Sw_trans = np.transpose(Sw)
|
||||
|
|
|
|||
|
|
@@ -161,6 +161,16 @@ class NormalPosterior(VariationalPosterior):
from ...plotting.matplot_dep import variational_plots
return variational_plots.plot(self, *args, **kwargs)

def KL(self, other):
"""Compute the KL divergence to another NormalPosterior object. This only holds if the two NormalPosterior objects have the same shape, as we use computational tricks for the multivariate normal KL divergence.
"""
return .5*(
np.sum(self.variance/other.variance)
+ ((other.mean-self.mean)**2/other.variance).sum()
- self.num_data * self.input_dim
+ np.sum(np.log(other.variance)) - np.sum(np.log(self.variance))
)
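The closed form above is the KL divergence between two fully factorized Gaussians, summed over all entries; a standalone numpy sketch of the same expression (hypothetical helper, not part of the diff):

import numpy as np

def diag_gaussian_kl(mean_q, var_q, mean_p, var_p):
    # KL(q || p) for element-wise independent Gaussians, summed over all entries
    return .5 * (np.sum(var_q / var_p)
                 + np.sum((mean_p - mean_q) ** 2 / var_p)
                 - mean_q.size
                 + np.sum(np.log(var_p)) - np.sum(np.log(var_q)))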
|
||||
|
||||
class SpikeAndSlabPosterior(VariationalPosterior):
|
||||
'''
|
||||
The SpikeAndSlab distribution for variational approximations.
|
||||
|
|
|
|||
|
|
@ -34,7 +34,7 @@ class SparseGP_MPI(SparseGP):
|
|||
|
||||
"""
|
||||
|
||||
def __init__(self, X, Y, Z, kernel, likelihood, variational_prior=None, inference_method=None, name='sparse gp mpi', Y_metadata=None, mpi_comm=None, normalizer=False):
|
||||
def __init__(self, X, Y, Z, kernel, likelihood, variational_prior=None, inference_method=None, name='sparse gp', Y_metadata=None, mpi_comm=None, normalizer=False):
|
||||
self._IN_OPTIMIZATION_ = False
|
||||
if mpi_comm != None:
|
||||
if inference_method is None:
|
||||
|
|
|
|||
|
|
@ -5,7 +5,7 @@ import numpy as np
|
|||
from ..util import choleskies
|
||||
from .sparse_gp import SparseGP
|
||||
from .parameterization.param import Param
|
||||
from ..inference.latent_function_inference import SVGP as svgp_inf
|
||||
from ..inference.latent_function_inference.svgp import SVGP as svgp_inf
|
||||
|
||||
|
||||
class SVGP(SparseGP):
|
||||
|
|
|
|||
|
|
@ -69,7 +69,7 @@ from .expectation_propagation_dtc import EPDTC
|
|||
from .dtc import DTC
|
||||
from .fitc import FITC
|
||||
from .var_dtc_parallel import VarDTC_minibatch
|
||||
from .svgp import SVGP
|
||||
#from .svgp import SVGP
|
||||
|
||||
# class FullLatentFunctionData(object):
|
||||
#
|
||||
|
|
|
|||
|
|
@ -27,12 +27,19 @@ def infer_newX(model, Y_new, optimize=True, init='L2'):
|
|||
|
||||
class InferenceX(Model):
|
||||
"""
|
||||
The class for inference of new X with given new Y. (do_test_latent)
|
||||
The model class for inference of new X with given new Y. (replacing the "do_test_latent" in Bayesian GPLVM)
|
||||
It is a tiny inference model created from the original GP model. The kernel, likelihood (only Gaussian is supported at the moment)
|
||||
and posterior distribution are taken from the original model.
|
||||
For Regression models and GPLVM, a point estimate of the latent variable X will be inferred.
|
||||
For Bayesian GPLVM, the variational posterior of X will be inferred.
|
||||
X is inferred through a gradient optimization of the inference model.
|
||||
|
||||
:param model: the GPy model used in inference
|
||||
:type model: GPy.core.Model
|
||||
:param Y: the new observed data for inference
|
||||
:type Y: numpy.ndarray
|
||||
:param init: the distance metric of Y for initializing X with the nearest neighbour.
|
||||
:type init: 'L2', 'NCC' and 'rand'
|
||||
"""
|
||||
def __init__(self, model, Y, name='inferenceX', init='L2'):
|
||||
if np.isnan(Y).any() or getattr(model, 'missing_data', False):
|
||||
|
|
@ -59,7 +66,7 @@ class InferenceX(Model):
|
|||
from ...models.ss_mrd import IBPPrior_SSMRD
|
||||
if isinstance(model.variational_prior, IBPPrior) or isinstance(model.variational_prior, IBPPrior_SSMRD):
|
||||
from ...core.parameterization.variational import SpikeAndSlabPrior
|
||||
self.variational_prior = SpikeAndSlabPrior(pi=05,learnPi=False, group_spike=False)
|
||||
self.variational_prior = SpikeAndSlabPrior(pi=0.5, learnPi=False, group_spike=False)
|
||||
else:
|
||||
self.variational_prior = model.variational_prior.copy()
|
||||
else:
|
||||
|
|
|
|||
|
|
@ -5,6 +5,10 @@ class StochasticStorage(object):
|
|||
'''
|
||||
This is a container for holding the stochastic parameters,
|
||||
such as subset indices or step length and so on.
|
||||
|
||||
self.d has to be a list of lists:
|
||||
[dimension indices, nan indices for those dimensions]
|
||||
so that the minibatches can be used as efficiently as possible.
|
||||
'''
|
||||
def __init__(self, model):
|
||||
"""
|
||||
|
|
@ -28,9 +32,23 @@ class SparseGPMissing(StochasticStorage):
|
|||
"""
|
||||
Here we want to loop over all dimensions every time.
|
||||
Thus, we can just make sure the loop goes over self.d every
|
||||
time.
|
||||
time. We will try to group together batches which look the same,
which speeds up calculations significantly.
|
||||
"""
|
||||
self.d = range(model.Y_normalized.shape[1])
|
||||
import numpy as np
|
||||
self.Y = model.Y_normalized
|
||||
bdict = {}
|
||||
for d in range(self.Y.shape[1]):
|
||||
inan = np.isnan(self.Y[:, d])
|
||||
arr_str = np.array2string(inan,
|
||||
np.inf, 0,
|
||||
True, '',
|
||||
formatter={'bool':lambda x: '1' if x else '0'})
|
||||
try:
|
||||
bdict[arr_str][0].append(d)
|
||||
except:
|
||||
bdict[arr_str] = [[d], ~inan]
|
||||
self.d = bdict.values()
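The loop above groups output dimensions by their missing-data pattern so that columns sharing a pattern can be processed as one batch; a rough equivalent (hypothetical helper that keys on the mask's bytes instead of np.array2string):

import numpy as np

def group_by_nan_pattern(Y):
    groups = {}
    for d in range(Y.shape[1]):
        inan = np.isnan(Y[:, d])
        key = inan.tobytes()               # hashable fingerprint of the pattern
        if key in groups:
            groups[key][0].append(d)
        else:
            groups[key] = [[d], ~inan]     # [dimension indices, valid-row mask]
    return list(groups.values())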
|
||||
|
||||
class SparseGPStochastics(StochasticStorage):
|
||||
"""
|
||||
|
|
@ -40,16 +58,29 @@ class SparseGPStochastics(StochasticStorage):
|
|||
def __init__(self, model, batchsize=1):
|
||||
self.batchsize = batchsize
|
||||
self.output_dim = model.Y.shape[1]
|
||||
self.Y = model.Y_normalized
|
||||
self.reset()
|
||||
self.do_stochastics()
|
||||
|
||||
def do_stochastics(self):
|
||||
if self.batchsize == 1:
|
||||
self.current_dim = (self.current_dim+1)%self.output_dim
|
||||
self.d = [self.current_dim]
|
||||
self.d = [[[self.current_dim], np.isnan(self.Y[:, self.d])]]
|
||||
else:
|
||||
import numpy as np
|
||||
self.d = np.random.choice(self.output_dim, size=self.batchsize, replace=False)
|
||||
bdict = {}
|
||||
for d in self.d:
|
||||
inan = np.isnan(self.Y[:, d])
|
||||
arr_str = int(np.array2string(inan,
|
||||
np.inf, 0,
|
||||
True, '',
|
||||
formatter={'bool':lambda x: '1' if x else '0'}), 2)
|
||||
try:
|
||||
bdict[arr_str][0].append(d)
|
||||
except:
|
||||
bdict[arr_str] = [[d], ~inan]
|
||||
self.d = bdict.values()
|
||||
|
||||
def reset(self):
|
||||
self.current_dim = -1
|
||||
|
|
|
|||
|
|
@ -6,6 +6,7 @@ from ._src.brownian import Brownian
|
|||
from ._src.stationary import Exponential, OU, Matern32, Matern52, ExpQuad, RatQuad, Cosine
|
||||
from ._src.mlp import MLP
|
||||
from ._src.periodic import PeriodicExponential, PeriodicMatern32, PeriodicMatern52
|
||||
from ._src.standard_periodic import StdPeriodic
|
||||
from ._src.independent_outputs import IndependentOutputs, Hierarchical
|
||||
from ._src.coregionalize import Coregionalize
|
||||
from ._src.ODE_UY import ODE_UY
|
||||
|
|
@ -17,7 +18,7 @@ from ._src.eq_ode2 import EQ_ODE2
|
|||
from ._src.trunclinear import TruncLinear,TruncLinear_inf
|
||||
from ._src.splitKern import SplitKern,DEtime
|
||||
from ._src.splitKern import DEtime as DiffGenomeKern
|
||||
|
||||
|
||||
from ._src.spline import Spline
|
||||
from ._src.eq_ode2 import EQ_ODE2
|
||||
from ._src.basis_funcs import LinearSlopeBasisFuncKernel, BasisFuncKernel, ChangePointBasisFuncKernel, DomainKernel
|
||||
|
||||
|
|
|
|||
File diff suppressed because it is too large
|
|
@ -1,12 +1,14 @@
|
|||
#cython: boundscheck=True
|
||||
#cython: wraparound=True
|
||||
#cython: boundscheck=False
|
||||
#cython: wraparound=False
|
||||
#cython: nonecheck=False
|
||||
import cython
|
||||
import numpy as np
|
||||
cimport numpy as np
|
||||
|
||||
def K_symmetric(np.ndarray[double, ndim=2] B, np.ndarray[np.int64_t, ndim=1] X):
|
||||
cdef int N = X.size
|
||||
cdef np.ndarray[np.double_t, ndim=2] K = np.empty((N, N))
|
||||
cdef np.ndarray[np.double_t, ndim=2, mode='c'] K = np.empty((N, N))
|
||||
with nogil:
|
||||
for n in range(N):
|
||||
for m in range(N):
|
||||
K[n, m] = B[X[n], X[m]]
|
||||
|
|
@ -15,16 +17,18 @@ def K_symmetric(np.ndarray[double, ndim=2] B, np.ndarray[np.int64_t, ndim=1] X):
|
|||
def K_asymmetric(np.ndarray[double, ndim=2] B, np.ndarray[np.int64_t, ndim=1] X, np.ndarray[np.int64_t, ndim=1] X2):
|
||||
cdef int N = X.size
|
||||
cdef int M = X2.size
|
||||
cdef np.ndarray[np.double_t, ndim=2] K = np.empty((N, M))
|
||||
cdef np.ndarray[np.double_t, ndim=2, mode='c'] K = np.empty((N, M))
|
||||
with nogil:
|
||||
for n in range(N):
|
||||
for m in range(M):
|
||||
K[n, m] = B[X[n], X2[m]]
|
||||
return K
|
||||
|
||||
def gradient_reduce(int D, np.ndarray[double, ndim=2] dL_dK, np.ndarray[np.int64_t, ndim=1] index, np.ndarray[np.int64_t, ndim=1] index2):
|
||||
cdef np.ndarray[np.double_t, ndim=2] dL_dK_small = np.zeros((D, D))
|
||||
cdef np.ndarray[np.double_t, ndim=2, mode='c'] dL_dK_small = np.zeros((D, D))
|
||||
cdef int N = index.size
|
||||
cdef int M = index2.size
|
||||
with nogil:
|
||||
for i in range(N):
|
||||
for j in range(M):
|
||||
dL_dK_small[index2[j],index[i]] += dL_dK[i,j];
|
||||
|
|
|
|||
|
|
@ -55,16 +55,15 @@ def __psi2computations(variance, lengthscale, Z, mu, S):
|
|||
# Produced intermediate results:
|
||||
# _psi2 MxM
|
||||
|
||||
N,M,Q = mu.shape[0], Z.shape[0], mu.shape[1]
|
||||
lengthscale2 = np.square(lengthscale)
|
||||
|
||||
_psi2_logdenom = np.log(2.*S/lengthscale2+1.).sum(axis=-1)/(-2.) # N
|
||||
_psi2_exp1 = (np.square(Z[:,None,:]-Z[None,:,:])/lengthscale2).sum(axis=-1)/(-4.) #MxM
|
||||
Z_hat = (Z[:,None,:]+Z[None,:,:])/2. #MxMxQ
|
||||
denom = 1./(2.*S+lengthscale2)
|
||||
_psi2_exp2 = -(np.square(mu)*denom).sum(axis=-1)[:,None,None]+2.*np.einsum('nq,moq,nq->nmo',mu,Z_hat,denom)-np.einsum('moq,nq->nmo',np.square(Z_hat),denom)
|
||||
_psi2_exp2 = -(np.square(mu)*denom).sum(axis=-1)[:,None,None]+(2*(mu*denom).dot(Z_hat.reshape(M*M,Q).T) - denom.dot(np.square(Z_hat).reshape(M*M,Q).T)).reshape(N,M,M)
|
||||
_psi2 = variance*variance*np.exp(_psi2_logdenom[:,None,None]+_psi2_exp1[None,:,:]+_psi2_exp2)
|
||||
|
||||
|
||||
return _psi2
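The replaced einsum calls compute the same tensor as the reshaped dot products; a small check of the identity being exploited (random shapes, hypothetical names):

import numpy as np

N, M, Q = 4, 3, 2
mu, denom = np.random.randn(N, Q), np.random.rand(N, Q)
Z_hat = np.random.randn(M, M, Q)

a = np.einsum('nq,moq,nq->nmo', mu, Z_hat, denom)
b = (mu * denom).dot(Z_hat.reshape(M * M, Q).T).reshape(N, M, M)
assert np.allclose(a, b)   # the dot/reshape form avoids the einsum overhead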
|
||||
|
||||
def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior):
|
||||
|
|
@ -157,5 +156,5 @@ def _psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S):
|
|||
|
||||
return _dL_dvar, _dL_dl, _dL_dZ, _dL_dmu, _dL_dS
|
||||
|
||||
_psi1computations = Cacher(__psi1computations, limit=1)
|
||||
_psi2computations = Cacher(__psi2computations, limit=1)
|
||||
_psi1computations = Cacher(__psi1computations, limit=5)
|
||||
_psi2computations = Cacher(__psi2computations, limit=5)
|
||||
|
|
|
|||
GPy/kern/_src/spline.py (new file, 52 lines)
|
|
@ -0,0 +1,52 @@
|
|||
# Copyright (c) 2015, Thomas Hornung
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
import numpy as np
|
||||
from .kern import Kern
|
||||
from ...core.parameterization import Param
|
||||
from ...core.parameterization.transformations import Logexp
|
||||
class Spline(Kern):
|
||||
"""
|
||||
Linear spline kernel. You need to specify 2 parameters: the variance and c.
|
||||
The variance is defined in powers of 10. Thus specifying -2 means 10^-2.
|
||||
The parameter c controls the stiffness of the spline fit. A very stiff
spline is equivalent to linear regression.
|
||||
See https://www.youtube.com/watch?v=50Vgw11qn0o starting at minute 1:17:28
|
||||
Lit: Wahba, 1990
|
||||
"""
|
||||
|
||||
def __init__(self, input_dim, variance=1., c=1., active_dims=None, name='spline'):
|
||||
super(Spline, self).__init__(input_dim, active_dims, name)
|
||||
self.variance = Param('variance', variance, Logexp())
|
||||
self.c = Param('c', c)
|
||||
self.link_parameters(self.variance,self.c)
|
||||
|
||||
|
||||
def K(self, X, X2=None):
|
||||
if X2 is None: X2=X
|
||||
term1 = (X+8.)*(X2.T+8.)/16.
|
||||
term2 = abs((X-X2.T)/16.)**3
|
||||
term3 = ((X+8.)/16.)**3 + ((X2.T+8.)/16.)**3
|
||||
return (self.variance**2 * (1. + (1.+self.c) * term1 + self.c/3. * (term2 - term3)))
|
||||
|
||||
def Kdiag(self, X):
|
||||
term1 = np.square(X+8.,X+8.)/16.
|
||||
term3 = 2. * ((X+8.)/16.)**3
|
||||
return (self.variance**2 * (1. + (1.+self.c) * term1 - self.c/3. * term3))[:,0]
|
||||
|
||||
def update_gradients_full(self, dL_dK, X, X2=None):
|
||||
if X2 is None: X2=X
|
||||
term1 = (X+8.)*(X2.T+8.)/16.
|
||||
term2 = abs((X-X2.T)/16.)**3
|
||||
term3 = ((X+8.)/16.)**3 + ((X2.T+8.)/16.)**3
|
||||
self.variance.gradient = np.sum(dL_dK * (2*self.variance * (1. + (1.+self.c) * term1 + self.c/3. * ( term2 - term3))))
|
||||
self.c.gradient = np.sum(dL_dK * (self.variance**2* (term1 + 1./3.*(term2 - term3))))
|
||||
|
||||
def update_gradients_diag(self, dL_dKdiag, X):
|
||||
raise NotImplementedError
|
||||
|
||||
def gradients_X(self, dL_dK, X, X2=None):
|
||||
raise NotImplementedError
|
||||
|
||||
def gradients_X_diag(self, dL_dKdiag, X):
|
||||
raise NotImplementedError
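A minimal usage sketch for the new kernel (made-up data; assumes it is exposed as GPy.kern.Spline via the updated kern/__init__.py above):

import numpy as np
import GPy

X = np.linspace(-8., 8., 30)[:, None]
Y = np.sin(X) + 0.1 * np.random.randn(*X.shape)

k = GPy.kern.Spline(input_dim=1, variance=1., c=1.)
m = GPy.models.GPRegression(X, Y, kernel=k)
m.optimize()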
|
||||
GPy/kern/_src/standard_periodic.py (new file, 166 lines)
|
|
@ -0,0 +1,166 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright (c) 2014, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
"""
|
||||
The standard periodic kernel, which is mentioned in:
|
||||
|
||||
[1] Gaussian Processes for Machine Learning, C. E. Rasmussen, C. K. I. Williams.
|
||||
The MIT Press, 2005.
|
||||
|
||||
|
||||
[2] Introduction to Gaussian processes. D. J. C. MacKay. In C. M. Bishop, editor,
|
||||
Neural Networks and Machine Learning, pages 133-165. Springer, 1998.
|
||||
"""
|
||||
|
||||
from .kern import Kern
|
||||
from ...core.parameterization import Param
|
||||
from ...core.parameterization.transformations import Logexp
|
||||
|
||||
import numpy as np
|
||||
|
||||
class StdPeriodic(Kern):
|
||||
"""
|
||||
Standard periodic kernel
|
||||
|
||||
.. math::
|
||||
|
||||
k(x, y) = \theta_1 \exp \left[ -\frac{1}{2} \sum_{i=1}^{input\_dim} \left( \frac{\sin\left( \frac{\pi}{\lambda_i} (x_i - y_i) \right)}{l_i} \right)^2 \right]
|
||||
|
||||
:param input_dim: the number of input dimensions
|
||||
:type input_dim: int
|
||||
:param variance: the variance :math:`\theta_1` in the formula above
|
||||
:type variance: float
|
||||
:param wavelength: the vector of wavelengths :math:`\lambda_i`. If None then 1.0 is assumed.
|
||||
:type wavelength: array or list of the appropriate size (or float if there is only one wavelength parameter)
|
||||
:param lengthscale: the vector of lengthscales :math:`l_i`. If None then 1.0 is assumed.
|
||||
:type lengthscale: array or list of the appropriate size (or float if there is only one lengthscale parameter)
|
||||
:param ARD1: Auto Relevance Determination with respect to wavelength.
|
||||
If equal to "False" a single wavelength parameter :math:`\lambda_i` is
shared across all dimensions, otherwise there is one wavelength
parameter per dimension.
|
||||
:type ARD1: Boolean
|
||||
:param ARD2: Auto Relevance Determination with respect to lengthscale.
|
||||
If equal to "False" a single lengthscale parameter :math:`l_i` is
shared across all dimensions, otherwise there is one lengthscale
parameter per dimension.
|
||||
:type ARD2: Boolean
|
||||
:param active_dims: indices of dimensions which are used in the computation of the kernel
|
||||
:type active_dims: array or list of the appropriate size
|
||||
:param name: Name of the kernel for output
|
||||
:type name: String
:param useGPU: whether or not to use the GPU
:type useGPU: Boolean
|
||||
"""
|
||||
|
||||
def __init__(self, input_dim, variance=1., wavelength=None, lengthscale=None, ARD1=False, ARD2=False, active_dims=None, name='std_periodic',useGPU=False):
|
||||
super(StdPeriodic, self).__init__(input_dim, active_dims, name, useGPU=useGPU)
|
||||
self.input_dim = input_dim
|
||||
self.ARD1 = ARD1 # correspond to wavelengths
|
||||
self.ARD2 = ARD2 # correspond to lengthscales
|
||||
|
||||
self.name = name
|
||||
|
||||
if self.ARD1 == False:
|
||||
if wavelength is not None:
|
||||
wavelength = np.asarray(wavelength)
|
||||
assert wavelength.size == 1, "Only one wavelength needed for non-ARD kernel"
|
||||
else:
|
||||
wavelength = np.ones(1)
|
||||
else:
|
||||
if wavelength is not None:
|
||||
wavelength = np.asarray(wavelength)
|
||||
assert wavelength.size == input_dim, "bad number of wavelengths"
|
||||
else:
|
||||
wavelength = np.ones(input_dim)
|
||||
|
||||
if self.ARD2 == False:
|
||||
if lengthscale is not None:
|
||||
lengthscale = np.asarray(lengthscale)
|
||||
assert lengthscale.size == 1, "Only one lengthscale needed for non-ARD kernel"
|
||||
else:
|
||||
lengthscale = np.ones(1)
|
||||
else:
|
||||
if lengthscale is not None:
|
||||
lengthscale = np.asarray(lengthscale)
|
||||
assert lengthscale.size == input_dim, "bad number of lengthscales"
|
||||
else:
|
||||
lengthscale = np.ones(input_dim)
|
||||
|
||||
self.variance = Param('variance', variance, Logexp())
|
||||
assert self.variance.size==1, "Variance size must be one"
|
||||
self.wavelengths = Param('wavelengths', wavelength, Logexp())
|
||||
self.lengthscales = Param('lengthscales', lengthscale, Logexp())
|
||||
|
||||
self.link_parameters(self.variance, self.wavelengths, self.lengthscales)
|
||||
|
||||
def parameters_changed(self):
|
||||
"""
|
||||
This function acts as a callback for each optimization iteration.
If an optimization step was successful and the parameters changed,
this callback will be called so that any precomputations for the
kernel can be updated.
|
||||
"""
|
||||
|
||||
pass
|
||||
|
||||
|
||||
def K(self, X, X2=None):
|
||||
"""Compute the covariance matrix between X and X2."""
|
||||
if X2 is None:
|
||||
X2 = X
|
||||
|
||||
base = np.pi * (X[:, None, :] - X2[None, :, :]) / self.wavelengths
|
||||
exp_dist = np.exp( -0.5* np.sum( np.square( np.sin( base ) / self.lengthscales ), axis = -1 ) )
|
||||
|
||||
return self.variance * exp_dist
|
||||
|
||||
|
||||
def Kdiag(self, X):
|
||||
"""Compute the diagonal of the covariance matrix associated to X."""
|
||||
ret = np.empty(X.shape[0])
|
||||
ret[:] = self.variance
|
||||
return ret
|
||||
|
||||
def update_gradients_full(self, dL_dK, X, X2=None):
|
||||
"""derivative of the covariance matrix with respect to the parameters."""
|
||||
if X2 is None:
|
||||
X2 = X
|
||||
|
||||
base = np.pi * (X[:, None, :] - X2[None, :, :]) / self.wavelengths
|
||||
|
||||
sin_base = np.sin( base )
|
||||
exp_dist = np.exp( -0.5* np.sum( np.square( sin_base / self.lengthscales ), axis = -1 ) )
|
||||
|
||||
dwl = self.variance * (1.0/np.square(self.lengthscales)) * sin_base*np.cos(base) * (base / self.wavelengths)
|
||||
|
||||
dl = self.variance * np.square( sin_base) / np.power( self.lengthscales, 3)
|
||||
|
||||
self.variance.gradient = np.sum(exp_dist * dL_dK)
|
||||
#target[0] += np.sum( exp_dist * dL_dK)
|
||||
|
||||
if self.ARD1: # different wavelengths
|
||||
self.wavelengths.gradient = (dwl * exp_dist[:,:,None] * dL_dK[:, :, None]).sum(0).sum(0)
|
||||
else: # same wavelengths
|
||||
self.wavelengths.gradient = np.sum(dwl.sum(-1) * exp_dist * dL_dK)
|
||||
|
||||
if self.ARD2: # different lengthscales
|
||||
self.lengthscales.gradient = (dl * exp_dist[:,:,None] * dL_dK[:, :, None]).sum(0).sum(0)
|
||||
else: # same lengthscales
|
||||
self.lengthscales.gradient = np.sum(dl.sum(-1) * exp_dist * dL_dK)
|
||||
|
||||
def update_gradients_diag(self, dL_dKdiag, X):
|
||||
"""derivative of the diagonal of the covariance matrix with respect to the parameters."""
|
||||
self.variance.gradient = np.sum(dL_dKdiag)
|
||||
self.wavelengths.gradient = 0
|
||||
self.lengthscales.gradient = 0
|
||||
|
||||
# def gradients_X(self, dL_dK, X, X2=None):
|
||||
# """derivative of the covariance matrix with respect to X."""
|
||||
#
|
||||
# raise NotImplemented("Periodic kernel: dK_dX not implemented")
|
||||
#
|
||||
# def gradients_X_diag(self, dL_dKdiag, X):
|
||||
#
|
||||
# raise NotImplemented("Periodic kernel: dKdiag_dX not implemented")
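A minimal usage sketch for the standard periodic kernel (made-up data; constructor arguments follow the __init__ shown above, and the class is assumed to be exposed as GPy.kern.StdPeriodic via the updated kern/__init__.py):

import numpy as np
import GPy

X = np.linspace(0., 10., 50)[:, None]
Y = np.sin(2 * np.pi * X / 3.) + 0.05 * np.random.randn(*X.shape)

k = GPy.kern.StdPeriodic(input_dim=1, variance=1., wavelength=3., lengthscale=1.)
m = GPy.models.GPRegression(X, Y, kernel=k)
m.optimize()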
|
||||
File diff suppressed because it is too large
|
|
@ -4,14 +4,15 @@
|
|||
import numpy as np
|
||||
cimport numpy as np
|
||||
from cython.parallel import prange
|
||||
cimport cython
|
||||
|
||||
ctypedef np.float64_t DTYPE_t
|
||||
|
||||
cdef extern from "stationary_utils.h":
|
||||
void _grad_X "_grad_X" (int N, int D, int M, double* X, double* X2, double* tmp, double* grad)
|
||||
void _grad_X "_grad_X" (int N, int D, int M, double* X, double* X2, double* tmp, double* grad) nogil
|
||||
|
||||
cdef extern from "stationary_utils.h":
|
||||
void _lengthscale_grads "_lengthscale_grads" (int N, int M, int Q, double* tmp, double* X, double* X2, double* grad)
|
||||
void _lengthscale_grads "_lengthscale_grads" (int N, int M, int Q, double* tmp, double* X, double* X2, double* grad) nogil
|
||||
|
||||
def grad_X(int N, int D, int M,
|
||||
np.ndarray[DTYPE_t, ndim=2] _X,
|
||||
|
|
@ -22,8 +23,10 @@ def grad_X(int N, int D, int M,
|
|||
cdef double *X2 = <double*> _X2.data
|
||||
cdef double *tmp = <double*> _tmp.data
|
||||
cdef double *grad = <double*> _grad.data
|
||||
with nogil:
|
||||
_grad_X(N, D, M, X, X2, tmp, grad) # return nothing, work in place.
|
||||
|
||||
@cython.cdivision(True)
|
||||
def grad_X_cython(int N, int D, int M, double[:,:] X, double[:,:] X2, double[:,:] tmp, double[:,:] grad):
|
||||
cdef int n,d,nd,m
|
||||
for nd in prange(N * D, nogil=True):
|
||||
|
|
@ -33,8 +36,6 @@ def grad_X_cython(int N, int D, int M, double[:,:] X, double[:,:] X2, double[:,:
|
|||
for m in range(M):
|
||||
grad[n,d] += tmp[n, m] * (X[n, d] - X2[m, d])
|
||||
|
||||
|
||||
|
||||
def lengthscale_grads_in_c(int N, int M, int Q,
|
||||
np.ndarray[DTYPE_t, ndim=2] _tmp,
|
||||
np.ndarray[DTYPE_t, ndim=2] _X,
|
||||
|
|
@ -44,16 +45,16 @@ def lengthscale_grads_in_c(int N, int M, int Q,
|
|||
cdef double *X = <double*> _X.data
|
||||
cdef double *X2 = <double*> _X2.data
|
||||
cdef double *grad = <double*> _grad.data
|
||||
with nogil:
|
||||
_lengthscale_grads(N, M, Q, tmp, X, X2, grad) # return nothing, work in place.
|
||||
|
||||
def lengthscale_grads(int N, int M, int Q, double[:,:] tmp, double[:,:] X, double[:,:] X2, double[:] grad):
|
||||
cdef int q, n, m
|
||||
cdef double gradq, dist
|
||||
with nogil:
|
||||
for q in range(Q):
|
||||
grad[q] = 0.0
|
||||
for n in range(N):
|
||||
for m in range(M):
|
||||
dist = X[n,q] - X2[m,q]
|
||||
grad[q] += tmp[n, m] * dist * dist
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -232,6 +232,17 @@ class Bernoulli(Likelihood):
|
|||
np.seterr(**state)
|
||||
return d3logpdf_dlink3
|
||||
|
||||
def predictive_quantiles(self, mu, var, quantiles, Y_metadata=None):
|
||||
"""
|
||||
Get the "quantiles" of the binary labels (Bernoulli draws). all the
|
||||
quantiles must be either 0 or 1, since those are the only values the
|
||||
draw can take!
|
||||
"""
|
||||
p = self.predictive_mean(mu, var)
|
||||
return [np.asarray(p>(q/100.), dtype=np.int32) for q in quantiles]
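Each requested quantile is therefore just a threshold on the predictive success probability; a tiny illustration with made-up numbers:

import numpy as np

p = np.array([0.1, 0.6, 0.9])             # predictive P(y = 1) for three points
quantiles = [2.5, 50., 97.5]              # requested quantiles in percent
qs = [np.asarray(p > (q / 100.), dtype=np.int32) for q in quantiles]
# e.g. the 50% entry is simply (p > 0.5) -> array([0, 1, 1], dtype=int32)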
|
||||
|
||||
|
||||
|
||||
def samples(self, gp, Y_metadata=None):
|
||||
"""
|
||||
Returns a set of samples of observations based on a given value of the latent variable.
|
||||
|
|
|
|||
|
|
@ -607,7 +607,7 @@ class Likelihood(Parameterized):
|
|||
pred_mean = self.predictive_mean(mu, var, Y_metadata=Y_metadata)
|
||||
pred_var = self.predictive_variance(mu, var, pred_mean, Y_metadata=Y_metadata)
|
||||
except NotImplementedError:
|
||||
print "Finding predictive mean and variance via sampling rather than quadrature"
|
||||
print("Finding predictive mean and variance via sampling rather than quadrature")
|
||||
Nf_samp = 300
|
||||
Ny_samp = 1
|
||||
s = np.random.randn(mu.shape[0], Nf_samp)*np.sqrt(var) + mu
|
||||
|
|
|
|||
|
|
@ -137,7 +137,7 @@ class Poisson(Likelihood):
|
|||
"""
|
||||
return self.gp_link.transf(gp)
|
||||
|
||||
def samples(self, gp, Y_metadata=None):
|
||||
def samples(self, gp, Y_metadata=None, samples=1):
|
||||
"""
|
||||
Returns a set of samples of observations based on a given value of the latent variable.
|
||||
|
||||
|
|
@ -145,5 +145,5 @@ class Poisson(Likelihood):
|
|||
"""
|
||||
orig_shape = gp.shape
|
||||
gp = gp.flatten()
|
||||
Ysim = np.random.poisson(self.gp_link.transf(gp))
|
||||
return Ysim.reshape(orig_shape)
|
||||
Ysim = np.random.poisson(self.gp_link.transf(gp), [samples, gp.size]).T
|
||||
return Ysim.reshape(orig_shape+(samples,))
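The new signature draws several Poisson samples per latent value and stacks them along a trailing axis; a standalone sketch of the same reshaping (using the identity link in place of self.gp_link.transf, purely for illustration):

import numpy as np

gp = np.random.rand(4, 2) * 3.            # latent values of arbitrary shape
n_samples = 5
orig_shape = gp.shape

rate = gp.flatten()                        # identity link, illustrative only
Ysim = np.random.poisson(rate, [n_samples, rate.size]).T
Ysim = Ysim.reshape(orig_shape + (n_samples,))   # -> shape (4, 2, 5)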
|
||||
|
|
|
|||
|
|
@ -1,11 +1,11 @@
|
|||
# Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
|
||||
# Copyright (c) 2015 James Hensman
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
|
||||
import numpy as np
|
||||
from ..core import GP
|
||||
from ..models import GPLVM
|
||||
from ..mappings import *
|
||||
from . import GPLVM
|
||||
from .. import mappings
|
||||
|
||||
|
||||
class BCGPLVM(GPLVM):
|
||||
|
|
@ -16,33 +16,31 @@ class BCGPLVM(GPLVM):
|
|||
:type Y: np.ndarray
|
||||
:param input_dim: latent dimensionality
|
||||
:type input_dim: int
|
||||
:param init: initialisation method for the latent space
|
||||
:type init: 'PCA'|'random'
|
||||
:param mapping: mapping for back constraint
|
||||
:type mapping: GPy.core.Mapping object
|
||||
|
||||
"""
|
||||
def __init__(self, Y, input_dim, init='PCA', X=None, kernel=None, normalize_Y=False, mapping=None):
|
||||
def __init__(self, Y, input_dim, kernel=None, mapping=None):
|
||||
|
||||
|
||||
if mapping is None:
|
||||
mapping = Kernel(X=Y, output_dim=input_dim)
|
||||
mapping = mappings.MLP(input_dim=Y.shape[1],
|
||||
output_dim=input_dim,
|
||||
hidden_dim=10)
|
||||
else:
|
||||
assert mapping.input_dim==Y.shape[1], "mapping input dim does not work for Y dimension"
|
||||
assert mapping.output_dim==input_dim, "mapping output dim does not work for self.input_dim"
|
||||
GPLVM.__init__(self, Y, input_dim, X=mapping.f(Y), kernel=kernel, name="bcgplvm")
|
||||
self.unlink_parameter(self.X)
|
||||
self.mapping = mapping
|
||||
GPLVM.__init__(self, Y, input_dim, init, X, kernel, normalize_Y)
|
||||
self.X = self.mapping.f(self.likelihood.Y)
|
||||
self.link_parameter(self.mapping)
|
||||
|
||||
def _get_param_names(self):
|
||||
return self.mapping._get_param_names() + GP._get_param_names(self)
|
||||
self.X = self.mapping.f(self.Y)
|
||||
|
||||
def _get_params(self):
|
||||
return np.hstack((self.mapping._get_params(), GP._get_params(self)))
|
||||
def parameters_changed(self):
|
||||
self.X = self.mapping.f(self.Y)
|
||||
GP.parameters_changed(self)
|
||||
Xgradient = self.kern.gradients_X(self.grad_dict['dL_dK'], self.X, None)
|
||||
self.mapping.update_gradients(Xgradient, self.Y)
|
||||
|
||||
def _set_params(self, x):
|
||||
self.mapping._set_params(x[:self.mapping.num_params])
|
||||
self.X = self.mapping.f(self.likelihood.Y)
|
||||
GP._set_params(self, x[self.mapping.num_params:])
|
||||
|
||||
def _log_likelihood_gradients(self):
|
||||
dL_df = self.kern.gradients_X(self.dL_dK, self.X)
|
||||
dL_dtheta = self.mapping.df_dtheta(dL_df, self.likelihood.Y)
|
||||
return np.hstack((dL_dtheta.flatten(), GP._log_likelihood_gradients(self)))
|
||||
|
||||
|
|
|
|||
|
|
@ -58,12 +58,15 @@ class GPLVM(GP):
|
|||
return target
|
||||
|
||||
def plot(self):
|
||||
assert self.likelihood.Y.shape[1] == 2
|
||||
pb.scatter(self.likelihood.Y[:, 0], self.likelihood.Y[:, 1], 40, self.X[:, 0].copy(), linewidth=0, cmap=pb.cm.jet) # @UndefinedVariable
|
||||
assert self.Y.shape[1] == 2, "too high dimensional to plot. Try plot_latent"
|
||||
from matplotlib import pyplot as plt
|
||||
plt.scatter(self.Y[:, 0],
|
||||
self.Y[:, 1],
|
||||
40, self.X[:, 0].copy(),
|
||||
linewidth=0, cmap=plt.cm.jet)
|
||||
Xnew = np.linspace(self.X.min(), self.X.max(), 200)[:, None]
|
||||
mu, _ = self.predict(Xnew)
|
||||
import pylab as pb
|
||||
pb.plot(mu[:, 0], mu[:, 1], 'k', linewidth=1.5)
|
||||
plt.plot(mu[:, 0], mu[:, 1], 'k', linewidth=1.5)
|
||||
|
||||
def plot_latent(self, labels=None, which_indices=None,
|
||||
resolution=50, ax=None, marker='o', s=40,
|
||||
|
|
|
|||
|
|
@ -63,33 +63,18 @@ class SparseGPMiniBatch(SparseGP):
|
|||
|
||||
if stochastic and missing_data:
|
||||
self.missing_data = True
|
||||
self.ninan = ~np.isnan(Y)
|
||||
self.stochastics = SparseGPStochastics(self, batchsize)
|
||||
elif stochastic and not missing_data:
|
||||
self.missing_data = False
|
||||
self.stochastics = SparseGPStochastics(self, batchsize)
|
||||
elif missing_data:
|
||||
self.missing_data = True
|
||||
self.ninan = ~np.isnan(Y)
|
||||
self.stochastics = SparseGPMissing(self)
|
||||
else:
|
||||
self.stochastics = False
|
||||
|
||||
logger.info("Adding Z as parameter")
|
||||
self.link_parameter(self.Z, index=0)
|
||||
if self.missing_data:
|
||||
self.Ylist = []
|
||||
overall = self.Y_normalized.shape[1]
|
||||
m_f = lambda i: "Precomputing Y for missing data: {: >7.2%}".format(float(i+1)/overall)
|
||||
message = m_f(-1)
|
||||
print(message, end=' ')
|
||||
for d in range(overall):
|
||||
self.Ylist.append(self.Y_normalized[self.ninan[:, d], d][:, None])
|
||||
print(' '*(len(message)+1) + '\r', end=' ')
|
||||
message = m_f(d)
|
||||
print(message, end=' ')
|
||||
print('')
|
||||
|
||||
self.posterior = None
|
||||
|
||||
def has_uncertain_inputs(self):
|
||||
|
|
@ -245,8 +230,7 @@ class SparseGPMiniBatch(SparseGP):
|
|||
message = m_f(-1)
|
||||
print(message, end=' ')
|
||||
|
||||
for d in self.stochastics.d:
|
||||
ninan = self.ninan[:, d]
|
||||
for d, ninan in self.stochastics.d:
|
||||
|
||||
if not self.stochastics:
|
||||
print(' '*(len(message)) + '\r', end=' ')
|
||||
|
|
@ -257,7 +241,7 @@ class SparseGPMiniBatch(SparseGP):
|
|||
grad_dict, current_values, value_indices = self._inner_parameters_changed(
|
||||
self.kern, self.X[ninan],
|
||||
self.Z, self.likelihood,
|
||||
self.Ylist[d], self.Y_metadata,
|
||||
self.Y_normalized[ninan][:, d], self.Y_metadata,
|
||||
Lm, dL_dKmm,
|
||||
subset_indices=dict(outputs=d, samples=ninan))
|
||||
|
||||
|
|
@ -266,8 +250,8 @@ class SparseGPMiniBatch(SparseGP):
|
|||
|
||||
Lm = posterior.K_chol
|
||||
dL_dKmm = grad_dict['dL_dKmm']
|
||||
woodbury_inv[:, :, d] = posterior.woodbury_inv
|
||||
woodbury_vector[:, d:d+1] = posterior.woodbury_vector
|
||||
woodbury_inv[:, :, d] = posterior.woodbury_inv[:,:,None]
|
||||
woodbury_vector[:, d] = posterior.woodbury_vector
|
||||
self._log_marginal_likelihood += log_marginal_likelihood
|
||||
if not self.stochastics:
|
||||
print('')
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@
|
|||
|
||||
|
||||
try:
|
||||
import Tango
|
||||
#import Tango
|
||||
import pylab as pb
|
||||
except:
|
||||
pass
|
||||
|
|
@ -17,11 +17,11 @@ def ax_default(fignum, ax):
|
|||
fig = ax.figure
|
||||
return fig, ax
|
||||
|
||||
def meanplot(x, mu, color=Tango.colorsHex['darkBlue'], ax=None, fignum=None, linewidth=2,**kw):
|
||||
def meanplot(x, mu, color='#3300FF', ax=None, fignum=None, linewidth=2,**kw):
|
||||
_, axes = ax_default(fignum, ax)
|
||||
return axes.plot(x,mu,color=color,linewidth=linewidth,**kw)
|
||||
|
||||
def gpplot(x, mu, lower, upper, edgecol=Tango.colorsHex['darkBlue'], fillcol=Tango.colorsHex['lightBlue'], ax=None, fignum=None, **kwargs):
|
||||
def gpplot(x, mu, lower, upper, edgecol='#3300FF', fillcol='#33CCFF', ax=None, fignum=None, **kwargs):
|
||||
_, axes = ax_default(fignum, ax)
|
||||
|
||||
mu = mu.flatten()
|
||||
|
|
|
|||
|
|
@ -1,17 +1,14 @@
|
|||
# Copyright (c) 2012-2015, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
try:
|
||||
import Tango
|
||||
import pylab as pb
|
||||
except:
|
||||
pass
|
||||
import numpy as np
|
||||
from . import Tango
|
||||
from base_plots import gpplot, x_frame1D, x_frame2D
|
||||
from ...models.gp_coregionalized_regression import GPCoregionalizedRegression
|
||||
from ...models.sparse_gp_coregionalized_regression import SparseGPCoregionalizedRegression
|
||||
from scipy import sparse
|
||||
from ...core.parameterization.variational import VariationalPosterior
|
||||
from matplotlib import pyplot as plt
|
||||
|
||||
def plot_fit(model, plot_limits=None, which_data_rows='all',
|
||||
which_data_ycols='all', fixed_inputs=[],
|
||||
|
|
@ -64,7 +61,7 @@ def plot_fit(model, plot_limits=None, which_data_rows='all',
|
|||
#if len(which_data_ycols)==0:
|
||||
#raise ValueError('No data selected for plotting')
|
||||
if ax is None:
|
||||
fig = pb.figure(num=fignum)
|
||||
fig = plt.figure(num=fignum)
|
||||
ax = fig.add_subplot(111)
|
||||
|
||||
if hasattr(model, 'has_uncertain_inputs') and model.has_uncertain_inputs():
|
||||
|
|
@ -126,7 +123,7 @@ def plot_fit(model, plot_limits=None, which_data_rows='all',
|
|||
print Ysim.shape
|
||||
print Xnew.shape
|
||||
for yi in Ysim.T:
|
||||
plots['posterior_samples'] = ax.plot(Xnew, yi[:,None], Tango.colorsHex['darkBlue'], linewidth=0.25)
|
||||
plots['posterior_samples'] = ax.plot(Xnew, yi[:,None], '#3300FF', linewidth=0.25)
|
||||
#ax.plot(Xnew, yi[:,None], marker='x', linestyle='--',color=Tango.colorsHex['darkBlue']) #TODO apply this line for discrete outputs.
|
||||
|
||||
if samples_f: #NOTE not tested with fixed_inputs
|
||||
|
|
@ -197,8 +194,8 @@ def plot_fit(model, plot_limits=None, which_data_rows='all',
|
|||
m, v = model.predict(Xgrid, full_cov=False, Y_metadata=Y_metadata, **predict_kw)
|
||||
for d in which_data_ycols:
|
||||
m_d = m[:,d].reshape(resolution, resolution).T
|
||||
plots['contour'] = ax.contour(x, y, m_d, levels, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet)
|
||||
if not plot_raw: plots['dataplot'] = ax.scatter(X[which_data_rows, free_dims[0]], X[which_data_rows, free_dims[1]], 40, Y[which_data_rows, d], cmap=pb.cm.jet, vmin=m.min(), vmax=m.max(), linewidth=0.)
|
||||
plots['contour'] = ax.contour(x, y, m_d, levels, vmin=m.min(), vmax=m.max(), cmap=plt.cm.jet)
|
||||
if not plot_raw: plots['dataplot'] = ax.scatter(X[which_data_rows, free_dims[0]], X[which_data_rows, free_dims[1]], 40, Y[which_data_rows, d], cmap=plt.cm.jet, vmin=m.min(), vmax=m.max(), linewidth=0.)
|
||||
|
||||
#set the limits of the plot to some sensible values
|
||||
ax.set_xlim(xmin[0], xmax[0])
|
||||
|
|
|
|||
|
|
@ -22,7 +22,7 @@ def plot(parameterized, fignum=None, ax=None, colors=None, figsize=(12, 6)):
|
|||
lines = []
|
||||
fills = []
|
||||
bg_lines = []
|
||||
means, variances = parameterized.mean, parameterized.variance
|
||||
means, variances = parameterized.mean.values, parameterized.variance.values
|
||||
x = np.arange(means.shape[0])
|
||||
for i in range(means.shape[1]):
|
||||
if ax is None:
|
||||
|
|
@ -43,7 +43,7 @@ def plot(parameterized, fignum=None, ax=None, colors=None, figsize=(12, 6)):
|
|||
if i < means.shape[1] - 1:
|
||||
a.set_xticklabels('')
|
||||
pb.draw()
|
||||
fig.tight_layout(h_pad=.01) # , rect=(0, 0, 1, .95))
|
||||
a.figure.tight_layout(h_pad=.01) # , rect=(0, 0, 1, .95))
|
||||
return dict(lines=lines, fills=fills, bg_lines=bg_lines)
|
||||
|
||||
def plot_SpikeSlab(parameterized, fignum=None, ax=None, colors=None, side_by_side=True):
|
||||
|
|
|
|||
|
|
@ -51,11 +51,16 @@ class test_stationary(np.testing.TestCase):
|
|||
|
||||
class test_choleskies_backprop(np.testing.TestCase):
|
||||
def setUp(self):
|
||||
self.dL, self.L = np.random.randn(2, 100, 100)
|
||||
a =np.random.randn(10,12)
|
||||
A = a.dot(a.T)
|
||||
self.L = GPy.util.linalg.jitchol(A)
|
||||
self.dL = np.random.randn(10,10)
|
||||
def test(self):
|
||||
r1 = GPy.util.choleskies._backprop_gradient_pure(self.dL, self.L)
|
||||
r2 = GPy.util.choleskies.choleskies_cython.backprop_gradient(self.dL, self.L)
|
||||
r3 = GPy.util.choleskies.choleskies_cython.backprop_gradient_par_c(self.dL, self.L)
|
||||
np.testing.assert_allclose(r1, r2)
|
||||
np.testing.assert_allclose(r1, r3)
|
||||
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -13,6 +13,7 @@ import GPy
|
|||
class InferenceXTestCase(unittest.TestCase):
|
||||
|
||||
def genData(self):
|
||||
np.random.seed(1)
|
||||
D1,D2,N = 12,12,50
|
||||
|
||||
x = np.linspace(0, 4 * np.pi, N)[:, None]
|
||||
|
|
|
|||
|
|
@ -313,6 +313,11 @@ class KernelGradientTestsContinuous(unittest.TestCase):
|
|||
k.randomize()
|
||||
self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))
|
||||
|
||||
def test_standard_periodic(self):
|
||||
k = GPy.kern.StdPeriodic(self.D, self.D-1)
|
||||
k.randomize()
|
||||
self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))
|
||||
|
||||
class KernelTestsMiscellaneous(unittest.TestCase):
|
||||
def setUp(self):
|
||||
N, D = 100, 10
|
||||
|
|
|
|||
|
|
@ -7,6 +7,7 @@ _lim_val_exp = np.log(_lim_val)
|
|||
_lim_val_square = np.sqrt(_lim_val)
|
||||
_lim_val_cube = cbrt(_lim_val)
|
||||
from GPy.likelihoods.link_functions import Identity, Probit, Cloglog, Log, Log_ex_1, Reciprocal, Heaviside
|
||||
#np.seterr(over='raise')
|
||||
|
||||
class LinkFunctionTests(np.testing.TestCase):
|
||||
def setUp(self):
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
import numpy as np
|
||||
import scipy as sp
|
||||
import GPy
|
||||
import warnings
|
||||
|
||||
class MiscTests(np.testing.TestCase):
|
||||
"""
|
||||
|
|
@ -11,8 +12,12 @@ class MiscTests(np.testing.TestCase):
|
|||
self._lim_val_exp = np.log(self._lim_val)
|
||||
|
||||
def test_safe_exp_upper(self):
|
||||
assert np.exp(self._lim_val_exp + 1) == np.inf
|
||||
assert GPy.util.misc.safe_exp(self._lim_val_exp + 1) < np.inf
|
||||
with warnings.catch_warnings(record=True) as w:
|
||||
assert np.isfinite(np.exp(self._lim_val_exp))
|
||||
assert np.isinf(np.exp(self._lim_val_exp + 1))
|
||||
assert np.isfinite(GPy.util.misc.safe_exp(self._lim_val_exp + 1))
|
||||
|
||||
assert len(w)==1 # should have one overflow warning
|
||||
|
||||
def test_safe_exp_lower(self):
|
||||
assert GPy.util.misc.safe_exp(1e-10) < np.inf
|
||||
|
|
|
|||
|
|
@ -361,14 +361,12 @@ class GradientTests(np.testing.TestCase):
|
|||
rbflin = GPy.kern.RBF(2) + GPy.kern.Linear(2)
|
||||
self.check_model(rbflin, model_type='SparseGPRegression', dimension=2)
|
||||
|
||||
# @unittest.expectedFailure
|
||||
def test_SparseGPRegression_rbf_linear_white_kern_2D_uncertain_inputs(self):
|
||||
''' Testing the sparse GP regression with rbf, linear kernel on 2d data with uncertain inputs'''
|
||||
rbflin = GPy.kern.RBF(2) + GPy.kern.Linear(2)
|
||||
raise unittest.SkipTest("This is not implemented yet!")
|
||||
self.check_model(rbflin, model_type='SparseGPRegression', dimension=2, uncertain_inputs=1)
|
||||
|
||||
# @unittest.expectedFailure
|
||||
def test_SparseGPRegression_rbf_linear_white_kern_1D_uncertain_inputs(self):
|
||||
''' Testing the sparse GP regression with rbf, linear kernel on 1d data with uncertain inputs'''
|
||||
rbflin = GPy.kern.RBF(1) + GPy.kern.Linear(1)
|
||||
|
|
@ -385,6 +383,16 @@ class GradientTests(np.testing.TestCase):
|
|||
m = GPy.models.GPLVM(Y, input_dim, kernel=k)
|
||||
self.assertTrue(m.checkgrad())
|
||||
|
||||
def test_BCGPLVM_rbf_bias_white_kern_2D(self):
|
||||
""" Testing GPLVM with rbf + bias kernel """
|
||||
N, input_dim, D = 50, 1, 2
|
||||
X = np.random.rand(N, input_dim)
|
||||
k = GPy.kern.RBF(input_dim, 0.5, 0.9 * np.ones((1,))) + GPy.kern.Bias(input_dim, 0.1) + GPy.kern.White(input_dim, 0.05)
|
||||
K = k.K(X)
|
||||
Y = np.random.multivariate_normal(np.zeros(N), K, input_dim).T
|
||||
m = GPy.models.BCGPLVM(Y, input_dim, kernel=k)
|
||||
self.assertTrue(m.checkgrad())
|
||||
|
||||
def test_GPLVM_rbf_linear_white_kern_2D(self):
|
||||
""" Testing GPLVM with rbf + bias kernel """
|
||||
N, input_dim, D = 50, 1, 2
|
||||
|
|
@ -410,23 +418,8 @@ class GradientTests(np.testing.TestCase):
|
|||
Z = np.linspace(0, 15, 4)[:, None]
|
||||
kernel = GPy.kern.RBF(1)
|
||||
m = GPy.models.SparseGPClassification(X, Y, kernel=kernel, Z=Z)
|
||||
# distribution = GPy.likelihoods.likelihood_functions.Bernoulli()
|
||||
# likelihood = GPy.likelihoods.EP(Y, distribution)
|
||||
# m = GPy.core.SparseGP(X, likelihood, kernel, Z)
|
||||
# m.ensure_default_constraints()
|
||||
self.assertTrue(m.checkgrad())
|
||||
|
||||
@unittest.expectedFailure
|
||||
def test_generalized_FITC(self):
|
||||
N = 20
|
||||
X = np.hstack([np.random.rand(N / 2) + 1, np.random.rand(N / 2) - 1])[:, None]
|
||||
k = GPy.kern.RBF(1) + GPy.kern.White(1)
|
||||
Y = np.hstack([np.ones(N / 2), np.zeros(N / 2)])[:, None]
|
||||
m = GPy.models.FITCClassification(X, Y, kernel=k)
|
||||
m.update_likelihood_approximation()
|
||||
self.assertTrue(m.checkgrad())
|
||||
|
||||
@unittest.expectedFailure
|
||||
def test_multioutput_regression_1D(self):
|
||||
X1 = np.random.rand(50, 1) * 8
|
||||
X2 = np.random.rand(30, 1) * 5
|
||||
|
|
@ -436,12 +429,11 @@ class GradientTests(np.testing.TestCase):
|
|||
Y = np.vstack((Y1, Y2))
|
||||
|
||||
k1 = GPy.kern.RBF(1)
|
||||
m = GPy.models.GPMultioutputRegression(X_list=[X1, X2], Y_list=[Y1, Y2], kernel_list=[k1])
|
||||
import ipdb;ipdb.set_trace()
|
||||
m.constrain_fixed('.*rbf_var', 1.)
|
||||
m = GPy.models.GPCoregionalizedRegression(X_list=[X1, X2], Y_list=[Y1, Y2], kernel=k1)
|
||||
#import ipdb;ipdb.set_trace()
|
||||
#m.constrain_fixed('.*rbf_var', 1.)
|
||||
self.assertTrue(m.checkgrad())
|
||||
|
||||
@unittest.expectedFailure
|
||||
def test_multioutput_sparse_regression_1D(self):
|
||||
X1 = np.random.rand(500, 1) * 8
|
||||
X2 = np.random.rand(300, 1) * 5
|
||||
|
|
@ -451,8 +443,7 @@ class GradientTests(np.testing.TestCase):
|
|||
Y = np.vstack((Y1, Y2))
|
||||
|
||||
k1 = GPy.kern.RBF(1)
|
||||
m = GPy.models.SparseGPMultioutputRegression(X_list=[X1, X2], Y_list=[Y1, Y2], kernel_list=[k1])
|
||||
m.constrain_fixed('.*rbf_var', 1.)
|
||||
m = GPy.models.SparseGPCoregionalizedRegression(X_list=[X1, X2], Y_list=[Y1, Y2], kernel=k1)
|
||||
self.assertTrue(m.checkgrad())
|
||||
|
||||
def test_gp_heteroscedastic_regression(self):
|
||||
|
|
|
|||
File diff suppressed because it is too large
|
|
@ -7,8 +7,9 @@
|
|||
import numpy as np
|
||||
from cython.parallel import prange, parallel
|
||||
cimport numpy as np
|
||||
cimport scipy.linalg.cython_blas as cblas
|
||||
|
||||
def flat_to_triang(np.ndarray[double, ndim=2] flat, int M):
|
||||
def flat_to_triang(double[:, :] flat, int M):
|
||||
"""take a matrix N x D and return a D X M x M array where
|
||||
|
||||
N = M(M+1)/2
|
||||
|
|
@ -18,8 +19,9 @@ def flat_to_triang(np.ndarray[double, ndim=2] flat, int M):
|
|||
cdef int D = flat.shape[1]
|
||||
cdef int N = flat.shape[0]
|
||||
cdef int count = 0
|
||||
cdef np.ndarray[double, ndim=3] ret = np.zeros((D, M, M))
|
||||
cdef double[:, :, ::1] ret = np.zeros((D, M, M))
|
||||
cdef int d, m, mm
|
||||
with nogil:
|
||||
for d in range(D):
|
||||
count = 0
|
||||
for m in range(M):
|
||||
|
|
@ -28,13 +30,14 @@ def flat_to_triang(np.ndarray[double, ndim=2] flat, int M):
|
|||
count += 1
|
||||
return ret
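The packing order used above (row by row within each lower triangle) matches numpy's tril_indices, so an equivalent pure-numpy sketch (hypothetical helper) is:

import numpy as np

def flat_to_triang_np(flat, M):
    # flat is (N, D) with N = M*(M+1)/2; returns D stacked lower-triangular (M, M) matrices
    N, D = flat.shape
    assert N == M * (M + 1) // 2
    ret = np.zeros((D, M, M))
    rows, cols = np.tril_indices(M)
    ret[:, rows, cols] = flat.T
    return ret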
|
||||
|
||||
def triang_to_flat(np.ndarray[double, ndim=3] L):
|
||||
def triang_to_flat(double[:, :, :] L):
|
||||
cdef int D = L.shape[0]
|
||||
cdef int M = L.shape[1]
|
||||
cdef int N = M*(M+1)/2
|
||||
cdef int count = 0
|
||||
cdef np.ndarray[double, ndim=2] flat = np.empty((N, D))
|
||||
cdef double[:, ::1] flat = np.empty((N, D))
|
||||
cdef int d, m, mm
|
||||
with nogil:
|
||||
for d in range(D):
|
||||
count = 0
|
||||
for m in range(M):
|
||||
|
|
@ -43,11 +46,11 @@ def triang_to_flat(np.ndarray[double, ndim=3] L):
|
|||
count += 1
|
||||
return flat
|
||||
|
||||
|
||||
def backprop_gradient(np.ndarray[double, ndim=2] dL, np.ndarray[double, ndim=2] L):
|
||||
cdef np.ndarray[double, ndim=2] dL_dK = np.tril(dL).copy()
|
||||
def backprop_gradient(double[:, :] dL, double[:, :] L):
|
||||
cdef double[:, ::1] dL_dK = np.tril(dL)
|
||||
cdef int N = L.shape[0]
|
||||
cdef int k, j, i
|
||||
with nogil:
|
||||
for k in range(N - 1, -1, -1):
|
||||
for j in range(k + 1, N):
|
||||
for i in range(j, N):
|
||||
|
|
@ -60,11 +63,12 @@ def backprop_gradient(np.ndarray[double, ndim=2] dL, np.ndarray[double, ndim=2]
|
|||
return dL_dK
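For reference, the same backward recurrence through the Cholesky factor can be written as a plain (slow) Python sketch, in the spirit of the pure-Python fallback that the tests compare against; dL is assumed to hold the gradient with respect to the lower-triangular factor L:

import numpy as np

def backprop_gradient_pure(dL, L):
    # turn df/dL (gradient w.r.t. the Cholesky factor) into df/dK
    dL_dK = np.tril(dL).copy()
    N = L.shape[0]
    for k in range(N - 1, -1, -1):
        for j in range(k + 1, N):
            for i in range(j, N):
                dL_dK[i, k] -= dL_dK[i, j] * L[j, k]
                dL_dK[j, k] -= dL_dK[i, j] * L[i, k]
        for j in range(k + 1, N):
            dL_dK[j, k] /= L[k, k]
            dL_dK[k, k] -= L[j, k] * dL_dK[j, k]
        dL_dK[k, k] /= (2. * L[k, k])
    return dL_dK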
|
||||
|
||||
def backprop_gradient_par(double[:,:] dL, double[:,:] L):
|
||||
cdef double[:,:] dL_dK = np.tril(dL).copy()
|
||||
cdef double[:,::1] dL_dK = np.tril(dL)
|
||||
cdef int N = L.shape[0]
|
||||
cdef int k, j, i
|
||||
with nogil:
|
||||
for k in range(N - 1, -1, -1):
|
||||
with nogil, parallel():
|
||||
with parallel():
|
||||
for i in prange(k + 1, N):
|
||||
for j in range(k+1, i+1):
|
||||
dL_dK[i, k] -= dL_dK[i, j] * L[j, k]
|
||||
|
|
@ -76,32 +80,35 @@ def backprop_gradient_par(double[:,:] dL, double[:,:] L):
|
|||
dL_dK[k, k] /= (2. * L[k, k])
|
||||
return dL_dK
|
||||
|
||||
#here's a pure C version...
|
||||
cdef extern from "cholesky_backprop.h" nogil:
|
||||
void chol_backprop(int N, double* dL, double* L)
|
||||
cdef void chol_backprop(int N, double[:, ::1] dL, double[:, ::1] L) nogil:
|
||||
cdef int i, k, n
|
||||
|
||||
def backprop_gradient_par_c(np.ndarray[double, ndim=2] dL, np.ndarray[double, ndim=2] L):
|
||||
cdef np.ndarray[double, ndim=2] dL_dK = np.tril(dL) # makes a copy, c-contig
|
||||
# DSYMV required constant arguments
|
||||
cdef double alpha=-1, beta=1
|
||||
cdef int incx=N
|
||||
|
||||
# DSCAL required arguments
|
||||
cdef double scale
|
||||
|
||||
dL[N - 1, N - 1] /= (2. * L[N - 1, N - 1])
|
||||
for k in range(N-2, -1, -1):
|
||||
n = N-k-1
|
||||
cblas.dsymv(uplo='u', n=&n, alpha=&alpha, a=&dL[k + 1, k + 1], lda=&N, x=&L[k + 1, k], incx=&incx,
|
||||
beta=&beta, y=&dL[k + 1, k], incy=&N)
|
||||
|
||||
for i in xrange(0, N - k - 1):
|
||||
dL[k + 1 + i, k] -= dL[k + i+ 1, k + i + 1] * L[k + 1 + i, k]
|
||||
|
||||
scale = 1.0 / L[k, k]
|
||||
cblas.dscal(&n, &scale , &dL[k + 1, k], &N)
|
||||
#
|
||||
dL[k, k] -= cblas.ddot(&n, &dL[k + 1, k], &N, &L[k+1, k], &incx)
|
||||
dL[k, k] /= (2.0 * L[k, k])
|
||||
|
||||
def backprop_gradient_par_c(double[:, :] dL, double[:, :] L):
|
||||
cdef double[:, ::1] dL_dK = np.tril(dL) # makes a copy, c-contig
|
||||
cdef double[:, ::1] L_cont = np.ascontiguousarray(L)
|
||||
cdef int N = L.shape[0]
|
||||
with nogil:
|
||||
chol_backprop(N, <double*> dL_dK.data, <double*> L.data)
|
||||
return dL_dK
|
||||
|
||||
cdef extern from "cholesky_backprop.h" nogil:
|
||||
void old_chol_backprop(int N, double* dL, double* L)
|
||||
|
||||
def backprop_gradient_par_c_old(np.ndarray[double, ndim=2] dL, np.ndarray[double, ndim=2] L):
|
||||
cdef np.ndarray[double, ndim=2] dL_dK = np.tril(dL) # makes a copy, c-contig
|
||||
cdef int N = L.shape[0]
|
||||
with nogil:
|
||||
old_chol_backprop(N, <double*> dL_dK.data, <double*> L.data)
|
||||
return dL_dK
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
chol_backprop(N, dL_dK, L_cont)
|
||||
return np.asarray(dL_dK)
|
||||
|
|
|
|||
|
|
@ -1,51 +0,0 @@
|
|||
#include <cblas.h>
|
||||
void chol_backprop(int N, double* dL, double* L){
|
||||
//at the input to this fn, dL is df_dL. after this fn is complet, dL is df_dK
|
||||
int i,k;
|
||||
|
||||
dL[N*N - 1] /= (2. * L[N*N - 1]);
|
||||
for(k=N-2;k>(-1);k--){
|
||||
cblas_dsymv(CblasRowMajor, CblasLower,
|
||||
N-k-1, -1,
|
||||
&dL[(N*(k+1) + k+1)],N,
|
||||
&L[k*N+k+1],1,
|
||||
1, &dL[N*(k+1)+k], N);
|
||||
for(i=0;i<(N-k-1); i++){
|
||||
dL[N*(k+1+i)+k] -= dL[N*(k+1)+k+i*(N+1)+1] * L[k*N+k+1+i];
|
||||
}
|
||||
|
||||
cblas_dscal(N-k-1, 1.0/L[k*N+k], &dL[(k+1)*N+k], N);
|
||||
dL[k*N + k] -= cblas_ddot(N-k-1, &dL[(k+1)*N+k], N, &L[k*N+k+1], 1);
|
||||
dL[k*N + k] /= (2.0 * L[k*N + k]);
|
||||
}
|
||||
}
|
||||
|
||||
double mydot(int n, double* a, int stride_a, double* b, int stride_b){
|
||||
double ret = 0;
|
||||
for(int i=0; i<n; i++){
|
||||
ret += a[i*stride_a]*b[i*stride_b];
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
void old_chol_backprop(int N, double* dL, double* U){
|
||||
//at the input to this fn, dL is df_dL. after this fn is complet, dL is df_dK
|
||||
int iN, kN,i,j,k;
|
||||
dL[N*N-1] /= (2. * U[N*N-1]);
|
||||
for(k=N-2;k>(-1);k--){
|
||||
kN = k*N;
|
||||
#pragma omp parallel for private(i,iN)
|
||||
for(i=k+1; i<N; i++){
|
||||
iN = i*N;
|
||||
dL[iN+k] -= mydot(i-k, &dL[iN+k+1], 1, &U[kN+k+1], 1);
|
||||
dL[iN+k] -= mydot(N-i, &dL[iN+i], N, &U[kN+i], 1);
|
||||
|
||||
}
|
||||
for(i=(k + 1); i<N; i++){
|
||||
iN = i*N;
|
||||
dL[iN + k] /= U[kN + k];
|
||||
dL[kN + k] -= U[kN + i] * dL[iN + k];
|
||||
}
|
||||
dL[kN + k] /= (2. * U[kN + k]);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -1,5 +0,0 @@
|
|||
#include <cblas.h>
|
||||
|
||||
void dsymv(int N, double*A, double*b, double*y);
|
||||
double mydot(int n, double* a, int stride_a, double* b, int stride_b);
|
||||
void chol_backprop(int N, double* dL, double* L);
|
||||
File diff suppressed because it is too large
|
|
@ -1,3 +1,4 @@
|
|||
from libc.math cimport sqrt
|
||||
cimport numpy as np
|
||||
from cpython cimport bool
|
||||
import cython
|
||||
|
|
@ -19,16 +20,18 @@ def symmetrify(np.ndarray[double, ndim=2] A, bool upper):
|
|||
@cython.boundscheck(False)
|
||||
@cython.wraparound(False)
|
||||
@cython.nonecheck(False)
|
||||
@cython.cdivision(True)
|
||||
def cholupdate(np.ndarray[double, ndim=1] x, np.ndarray[double, ndim=2] L, int N):
|
||||
cdef double r
|
||||
cdef double c
|
||||
cdef double s
|
||||
cdef double r, c, s
|
||||
cdef int j, i
|
||||
|
||||
with nogil:
|
||||
for j in xrange(N):
|
||||
r = np.sqrt(L[j,j]*L[j,j] + x[j]*x[j])
|
||||
r = sqrt(L[j, j] * L[j, j] + x[j] * x[j])
|
||||
c = r / L[j, j]
|
||||
s = x[j] / L[j, j]
|
||||
L[j, j] = r
|
||||
for i in xrange(j):
|
||||
L[i, j] = (L[i, j] + s * x[i]) / c
|
||||
x[i] = c*x[i] - s*L[i,j];
|
||||
r = np.sqrt(L[j,j])
|
||||
x[i] = c * x[i] - s * L[i, j]
|
||||
r = sqrt(L[j, j])
|
||||
|
|
|
|||
|
|
@ -8,6 +8,7 @@ from scipy.special import ndtr as std_norm_cdf
|
|||
#define a standard normal pdf
|
||||
_sqrt_2pi = np.sqrt(2*np.pi)
|
||||
def std_norm_pdf(x):
|
||||
x = np.clip(x,-1e150,1e150)
|
||||
return np.exp(-np.square(x)/2)/_sqrt_2pi
|
||||
|
||||
def inv_std_norm_cdf(x):
|
||||
|
|
|
|||
setup.py (9 changed lines)
|
@ -2,6 +2,7 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
import os
|
||||
import sys
|
||||
from setuptools import setup, Extension
|
||||
import numpy as np
|
||||
|
||||
|
|
@ -20,10 +21,10 @@ ext_mods = [Extension(name='GPy.kern._src.stationary_cython',
|
|||
extra_compile_args=compile_flags,
|
||||
extra_link_args = ['-lgomp']),
|
||||
Extension(name='GPy.util.choleskies_cython',
|
||||
sources=['GPy/util/choleskies_cython.c', 'GPy/util/cholesky_backprop.c'],
|
||||
sources=['GPy/util/choleskies_cython.c'],
|
||||
include_dirs=[np.get_include()],
|
||||
extra_link_args = ['-lgomp', '-lblas'],
|
||||
extra_compile_args=compile_flags+['-std=c99']),
|
||||
extra_link_args = ['-lgomp'],
|
||||
extra_compile_args=compile_flags),
|
||||
Extension(name='GPy.util.linalg_cython',
|
||||
sources=['GPy/util/linalg_cython.c'],
|
||||
include_dirs=[np.get_include()],
|
||||
|
|
@ -63,7 +64,7 @@ setup(name = 'GPy',
|
|||
py_modules = ['GPy.__init__'],
|
||||
test_suite = 'GPy.testing',
|
||||
long_description=read('README.md'),
|
||||
install_requires=['numpy>=1.7', 'scipy>=0.12'],
|
||||
install_requires=['numpy>=1.7', 'scipy>=0.16'],
|
||||
extras_require = {'docs':['matplotlib >=1.3','Sphinx','IPython']},
|
||||
classifiers=['License :: OSI Approved :: BSD License',
|
||||
'Natural Language :: English',
|
||||
|
|
|
|||