Merge branch 'params' of github.com:SheffieldML/GPy into params

2026-06-20 15:38:08 +02:00 · 2014-03-25 17:00:00 +00:00 · 2014-03-25 17:00:00 +00:00 · 81040f4566
commit 81040f4566
parent 17c2799b1e 1294c24a28
19 changed files with 293 additions and 139 deletions
--- a/GPy/core/model.py
+++ b/GPy/core/model.py
@ -24,7 +24,6 @@ class Model(Parameterized):

    def log_likelihood(self):
        raise NotImplementedError, "this needs to be implemented to use the model class"
-
    def _log_likelihood_gradients(self):
        return self.gradient

@ -148,7 +147,60 @@ class Model(Parameterized):
        """
        return self.kern.input_sensitivity()

-    def objective_function(self, x):
+    def objective_function(self):
+        """
+        The objective function for the given algorithm.
+
+        This function is the true objective, which wants to be minimized. 
+        Note that all parameters are already set and in place, so you just need 
+        to return the objective function here.
+
+        For probabilistic models this is the negative log_likelihood
+        (including the MAP prior), so we return it here. If your model is not 
+        probabilistic, just return your objective here!
+        """
+        return -float(self.log_likelihood()) - self.log_prior()
+
+    def objective_function_gradients(self):
+        """
+        The gradients for the objective function for the given algorithm.
+
+        You can find the gradient for the parameters in self.gradient at all times.
+        This is the place, where gradients get stored for parameters.
+
+        This function is the true objective, which wants to be minimized. 
+        Note that all parameters are already set and in place, so you just need 
+        to return the gradient here.
+
+        For probabilistic models this is the gradient of the negative log_likelihood
+        (including the MAP prior), so we return it here. If your model is not 
+        probabilistic, just return your gradient here!
+        """
+        return -(self._log_likelihood_gradients() + self._log_prior_gradients())
+
+    def _grads(self, x):
+        """
+        Gets the gradients from the likelihood and the priors.
+
+        Failures are handled robustly. The algorithm will try several times to
+        return the gradients, and will raise the original exception if
+        the objective cannot be computed.
+
+        :param x: the parameters of the model.
+        :type x: np.array
+        """
+        try:
+            self._set_params_transformed(x)
+            obj_grads = self._transform_gradients(self.objective_function_gradients())
+            self._fail_count = 0
+        except (LinAlgError, ZeroDivisionError, ValueError):
+            if self._fail_count >= self._allowed_failures:
+                raise
+            self._fail_count += 1
+            obj_grads = np.clip(self._transform_gradients(self.objective_function_gradients()), -1e100, 1e100)
+        return obj_grads
+
+    def _objective(self, x):
        """
        The objective function passed to the optimizer. It combines
        the likelihood and the priors.
@ -162,55 +214,26 @@ class Model(Parameterized):
        """
        try:
            self._set_params_transformed(x)
+            obj = self.objective_function()
            self._fail_count = 0
-        except (LinAlgError, ZeroDivisionError, ValueError) as e:
+        except (LinAlgError, ZeroDivisionError, ValueError):
            if self._fail_count >= self._allowed_failures:
-                raise e
+                raise
            self._fail_count += 1
            return np.inf
-        return -float(self.log_likelihood()) - self.log_prior()
+        return obj

-    def objective_function_gradients(self, x):
-        """
-        Gets the gradients from the likelihood and the priors.
-
-        Failures are handled robustly. The algorithm will try several times to
-        return the gradients, and will raise the original exception if
-        the objective cannot be computed.
-
-        :param x: the parameters of the model.
-        :type x: np.array
-        """
+    def _objective_grads(self, x):
        try:
            self._set_params_transformed(x)
-            obj_grads = -self._transform_gradients(self._log_likelihood_gradients() + self._log_prior_gradients())
+            obj_f, obj_grads = self.objective_function(), self._transform_gradients(self.objective_function_gradients())
            self._fail_count = 0
-        except (LinAlgError, ZeroDivisionError, ValueError) as e:
+        except (LinAlgError, ZeroDivisionError, ValueError):
            if self._fail_count >= self._allowed_failures:
-                raise e
-            self._fail_count += 1
-            obj_grads = np.clip(-self._transform_gradients(self._log_likelihood_gradients() + self._log_prior_gradients()), -1e100, 1e100)
-        return obj_grads
-
-    def objective_and_gradients(self, x):
-        """
-        Compute the objective function of the model and the gradient of the model at the point given by x.
-
-        :param x: the point at which gradients are to be computed.
-        :type x: np.array
-        """
-
-        try:
-            self._set_params_transformed(x)
-            obj_f = -float(self.log_likelihood()) - self.log_prior()
-            obj_grads = -self._transform_gradients(self._log_likelihood_gradients() + self._log_prior_gradients())
-            self._fail_count = 0
-        except (LinAlgError, ZeroDivisionError, ValueError) as e:
-            if self._fail_count >= self._allowed_failures:
-                raise e
+                raise
            self._fail_count += 1
            obj_f = np.inf
-            obj_grads = np.clip(-self._transform_gradients(self._log_likelihood_gradients() + self._log_prior_gradients()), -1e100, 1e100)
+            obj_grads = np.clip(self._transform_gradients(self.objective_function_gradients()), -1e100, 1e100)
        return obj_f, obj_grads

    def optimize(self, optimizer=None, start=None, **kwargs):
@ -241,7 +264,7 @@ class Model(Parameterized):
        optimizer = optimization.get_optimizer(optimizer)
        opt = optimizer(start, model=self, **kwargs)

-        opt.run(f_fp=self.objective_and_gradients, f=self.objective_function, fp=self.objective_function_gradients)
+        opt.run(f_fp=self._objective_grads, f=self._objective, fp=self._grads)

        self.optimization_runs.append(opt)

@ -292,9 +315,9 @@ class Model(Parameterized):
            dx[transformed_index] = step * np.sign(np.random.uniform(-1, 1, transformed_index.size))

            # evaulate around the point x
-            f1 = self.objective_function(x + dx)
-            f2 = self.objective_function(x - dx)
-            gradient = self.objective_function_gradients(x)
+            f1 = self._objective(x + dx)
+            f2 = self._objective(x - dx)
+            gradient = self._grads(x)

            dx = dx[transformed_index]
            gradient = gradient[transformed_index]
@ -337,15 +360,15 @@ class Model(Parameterized):
                    print "No free parameters to check"
                    return

-            gradient = self.objective_function_gradients(x).copy()
+            gradient = self._grads(x).copy()
            np.where(gradient == 0, 1e-312, gradient)
            ret = True
            for nind, xind in itertools.izip(param_index, transformed_index):
                xx = x.copy()
                xx[xind] += step
-                f1 = self.objective_function(xx)
+                f1 = self._objective(xx)
                xx[xind] -= 2.*step
-                f2 = self.objective_function(xx)
+                f2 = self._objective(xx)
                numerical_gradient = (f1 - f2) / (2 * step)
                if np.all(gradient[xind]==0): ratio = (f1-f2) == gradient[xind]
                else: ratio = (f1 - f2) / (2 * step * gradient[xind])
--- a/GPy/core/parameterization/array_core.py
+++ b/GPy/core/parameterization/array_core.py
@ -1,7 +1,7 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

-__updated__ = '2014-03-21'
+__updated__ = '2014-03-24'

 import numpy as np
 from parameter_core import Observable
@ -38,24 +38,9 @@ class ObsAr(np.ndarray, Observable):
        np.ndarray.__setstate__(self, state[0])
        Observable._setstate(self, state[1])

-    def _s_not_empty(self, s):
-        # this checks whether there is something picked by this slice.
-        return True
-        # TODO:  disarmed, for performance increase,
-        if not isinstance(s, (list,tuple,np.ndarray)):
-            return True
-        if isinstance(s, (list,tuple)):
-            return len(s)!=0
-        if isinstance(s, np.ndarray):
-            if s.dtype is bool:
-                return np.all(s)
-            else:
-                return s.size != 0
-
    def __setitem__(self, s, val):
-        if self._s_not_empty(s):
-            super(ObsAr, self).__setitem__(s, val)
-            self.notify_observers()
+        super(ObsAr, self).__setitem__(s, val)
+        self.notify_observers()

    def __getslice__(self, start, stop):
        return self.__getitem__(slice(start, stop))
--- a/GPy/core/parameterization/index_operations.py
+++ b/GPy/core/parameterization/index_operations.py
@ -183,7 +183,7 @@ class ParameterIndexOperationsView(object):


    def remove(self, prop, indices):
-        removed = self._param_index_ops.remove(prop, indices+self._offset)
+        removed = self._param_index_ops.remove(prop, numpy.array(indices)+self._offset)
        if removed.size > 0:
            return removed - self._size + 1
        return removed
--- a/GPy/core/parameterization/param.py
+++ b/GPy/core/parameterization/param.py
@ -282,8 +282,8 @@ class Param(OptimizationHandlable, ObsAr):
        if isinstance(slice_index, (tuple, list)):
            clean_curr_slice = [s for s in slice_index if numpy.any(s != Ellipsis)]
            for i in range(self._realndim_-len(clean_curr_slice)):
-                i+=len(clean_curr_slice)
-                clean_curr_slice += range(self._realshape_[i])
+                i+=1
+                clean_curr_slice += [range(self._realshape_[i])]
            if (all(isinstance(n, (numpy.ndarray, list, tuple)) for n in clean_curr_slice)
                and len(set(map(len, clean_curr_slice))) <= 1):
                return numpy.fromiter(itertools.izip(*clean_curr_slice),
--- a/GPy/core/parameterization/parameterized.py
+++ b/GPy/core/parameterization/parameterized.py
@ -63,7 +63,7 @@ class Parameterized(Parameterizable, Pickleable):
    # Metaclass for parameters changed after init. 
    # This makes sure, that parameters changed will always be called after __init__
    # **Never** call parameters_changed() yourself 
-    __metaclass__ = ParametersChangedMeta    
+    __metaclass__ = ParametersChangedMeta
    #===========================================================================
    def __init__(self, name=None, parameters=[], *a, **kw):
        super(Parameterized, self).__init__(name=name, *a, **kw)
--- a/GPy/examples/dimensionality_reduction.py
+++ b/GPy/examples/dimensionality_reduction.py
@ -277,7 +277,9 @@ def bgplvm_simulation(optimize=True, verbose=1,
    k = kern.Linear(Q, ARD=True)# + kern.white(Q, _np.exp(-2)) # + kern.bias(Q)
    #k = kern.RBF(Q, ARD=True, lengthscale=10.)
    m = BayesianGPLVM(Y, Q, init="PCA", num_inducing=num_inducing, kernel=k)
-    
+    m.X.variance[:] = _np.random.uniform(0,.01,m.X.shape)
+    m.likelihood.variance = .1
+
    if optimize:
        print "Optimizing model:"
        m.optimize('bfgs', messages=verbose, max_iters=max_iters,
@ -299,15 +301,16 @@ def bgplvm_simulation_missing_data(optimize=True, verbose=1,
    _, _, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim)
    Y = Ylist[0]
    k = kern.Linear(Q, ARD=True)# + kern.white(Q, _np.exp(-2)) # + kern.bias(Q)
-    
+
    inan = _np.random.binomial(1, .6, size=Y.shape).astype(bool)
    m = BayesianGPLVM(Y.copy(), Q, init="random", num_inducing=num_inducing, kernel=k)
    m.inference_method = VarDTCMissingData()
    m.Y[inan] = _np.nan
-    m.X.variance *= .1
+    m.X.variance[:] = _np.random.uniform(0,.01,m.X.shape)
+    m.likelihood.variance = .01
    m.parameters_changed()
    m.Yreal = Y
-    
+
    if optimize:
        print "Optimizing model:"
        m.optimize('bfgs', messages=verbose, max_iters=max_iters,
@ -325,11 +328,11 @@ def mrd_simulation(optimize=True, verbose=True, plot=True, plot_sim=True, **kw):

    D1, D2, D3, N, num_inducing, Q = 60, 20, 36, 60, 6, 5
    _, _, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim)
-    
+
    #Ylist = [Ylist[0]]
-    k = [kern.Linear(Q, ARD=True) + kern.White(Q, 1e-4) for _ in range(len(Ylist))]
+    k = [kern.Linear(Q, ARD=True) for _ in range(len(Ylist))]
    m = MRD(Ylist, input_dim=Q, num_inducing=num_inducing, kernel=k, initx="", initz='permute', **kw)
-    
+
    m['.*noise'] = [Y.var()/500. for Y in Ylist]
    #for i, Y in enumerate(Ylist):
    #    m['.*Y_{}.*Gaussian.*noise'.format(i)] = Y.var(1) / 500.
--- a/GPy/inference/latent_function_inference/var_dtc.py
+++ b/GPy/inference/latent_function_inference/var_dtc.py
@ -179,6 +179,7 @@ class VarDTC(object):
        return post, log_marginal, grad_dict

 class VarDTCMissingData(object):
+    const_jitter = 1e-6
    def __init__(self, limit=1):
        from ...util.caching import Cacher
        self._Y = Cacher(self._subarray_computations, limit)
@ -250,7 +251,7 @@ class VarDTCMissingData(object):

        for y, trYYT, [v, ind] in itertools.izip(Ys, traces, self._subarray_indices):
            if het_noise: beta = beta_all[ind]
-            else: beta = beta_all[0]
+            else: beta = beta_all

            VVT_factor = (beta*y)
            VVT_factor_all[v, ind].flat = VVT_factor.flat
@ -311,7 +312,7 @@ class VarDTCMissingData(object):
                het_noise, uncertain_inputs, LB,
                _LBi_Lmi_psi1Vf, DBi_plus_BiPBi, Lm, A,
                psi0, psi1, beta,
-                data_fit, num_data, output_dim, trYYT)
+                data_fit, num_data, output_dim, trYYT, Y)

            if full_VVT_factor: woodbury_vector[:, ind] = Cpsi1Vf
            else:
--- a/GPy/kern/_src/linear.py
+++ b/GPy/kern/_src/linear.py
@ -121,7 +121,7 @@ class Linear(Kern):
            gamma = variational_posterior.binary_prob
            mu = variational_posterior.mean
            return np.einsum('nq,q,mq,nq->nm',gamma,self.variances,Z,mu)
-#            return (self.variances*gamma*mu).sum(axis=1)       
+#            return (self.variances*gamma*mu).sum(axis=1)
        else:
            return self.K(variational_posterior.mean, Z) #the variance, it does nothing

@ -177,7 +177,7 @@ class Linear(Kern):

            grad =  np.einsum('nm,nq,q,nq->mq',dL_dpsi1,gamma, self.variances,mu) +\
                 np.einsum('nmo,noq->mq',dL_dpsi2,_dpsi2_dZ)
-            
+
            return grad
        else:
            #psi1
@ -191,15 +191,15 @@ class Linear(Kern):
            gamma = variational_posterior.binary_prob
            mu = variational_posterior.mean
            S = variational_posterior.variance
-            mu2S = np.square(mu)+S            
+            mu2S = np.square(mu)+S
            _, _dpsi2_dgamma, _dpsi2_dmu, _dpsi2_dS, _ = linear_psi_comp._psi2computations(self.variances, Z, mu, S, gamma)
-            
+
            grad_gamma = np.einsum('n,q,nq->nq',dL_dpsi0,self.variances,mu2S) + np.einsum('nm,q,mq,nq->nq',dL_dpsi1,self.variances,Z,mu) +\
                 np.einsum('nmo,nmoq->nq',dL_dpsi2,_dpsi2_dgamma)
            grad_mu = np.einsum('n,nq,q,nq->nq',dL_dpsi0,gamma,2.*self.variances,mu) + np.einsum('nm,nq,q,mq->nq',dL_dpsi1,gamma,self.variances,Z) +\
                 np.einsum('nmo,nmoq->nq',dL_dpsi2,_dpsi2_dmu)
            grad_S = np.einsum('n,nq,q->nq',dL_dpsi0,gamma,self.variances) + np.einsum('nmo,nmoq->nq',dL_dpsi2,_dpsi2_dS)
-            
+
            return grad_mu, grad_S, grad_gamma
        else:
            grad_mu, grad_S = np.zeros(variational_posterior.mean.shape), np.zeros(variational_posterior.mean.shape)
@ -210,7 +210,7 @@ class Linear(Kern):
            grad_mu += (dL_dpsi1[:, :, None] * (Z * self.variances)).sum(1)
            # psi2
            self._weave_dpsi2_dmuS(dL_dpsi2, Z, variational_posterior, grad_mu, grad_S)
-    
+
            return grad_mu, grad_S

    #--------------------------------------------------#
--- a/GPy/kern/_src/prod.py
+++ b/GPy/kern/_src/prod.py
@ -58,8 +58,6 @@ class Prod(CombinationKernel):
    def gradients_X_diag(self, dL_dKdiag, X):
        target = np.zeros(X.shape)
        for k1,k2 in itertools.combinations(self.parts, 2):
-            target += k1.gradients_X(dL_dKdiag*k2.Kdiag(X), X)
-            target += k2.gradients_X(dL_dKdiag*k1.Kdiag(X), X)
+            target += k1.gradients_X_diag(dL_dKdiag*k2.Kdiag(X), X)
+            target += k2.gradients_X_diag(dL_dKdiag*k1.Kdiag(X), X)
        return target
-
-
--- a/GPy/models/bayesian_gplvm.py
+++ b/GPy/models/bayesian_gplvm.py
@ -26,7 +26,10 @@ class BayesianGPLVM(SparseGP):
                 Z=None, kernel=None, inference_method=None, likelihood=None, name='bayesian gplvm', **kwargs):
        if X == None:
            from ..util.initialization import initialize_latent
-            X = initialize_latent(init, input_dim, Y)
+            X, fracs = initialize_latent(init, input_dim, Y)
+        else:
+            fracs = np.ones(input_dim)
+
        self.init = init

        if X_variance is None:
@ -38,7 +41,7 @@ class BayesianGPLVM(SparseGP):
        assert Z.shape[1] == X.shape[1]

        if kernel is None:
-            kernel = kern.RBF(input_dim) # + kern.white(input_dim)
+            kernel = kern.RBF(input_dim, lengthscale=fracs, ARD=True) # + kern.white(input_dim)

        if likelihood is None:
            likelihood = Gaussian()
@ -47,6 +50,14 @@ class BayesianGPLVM(SparseGP):
        self.variational_prior = NormalPrior()
        X = NormalPosterior(X, X_variance)

+        if inference_method is None:
+            if np.any(np.isnan(Y)):
+                from ..inference.latent_function_inference.var_dtc import VarDTCMissingData
+                inference_method = VarDTCMissingData()
+            else:
+                from ..inference.latent_function_inference.var_dtc import VarDTC
+                inference_method = VarDTC()
+
        SparseGP.__init__(self, X, Y, Z, kernel, likelihood, inference_method, name, **kwargs)
        self.add_parameter(self.X, index=0)

--- a/GPy/models/gp_classification.py
+++ b/GPy/models/gp_classification.py
@ -2,10 +2,10 @@
 # Copyright (c) 2013, the GPy Authors (see AUTHORS.txt)
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

-import numpy as np
 from ..core import GP
 from .. import likelihoods
 from .. import kern
+from ..inference.latent_function_inference.expectation_propagation import EP

 class GPClassification(GP):
    """
@ -27,4 +27,4 @@ class GPClassification(GP):

        likelihood = likelihoods.Bernoulli()

-        GP.__init__(self, X=X, Y=Y,  kernel=kernel, likelihood=likelihood, name='gp_classification')
+        GP.__init__(self, X=X, Y=Y,  kernel=kernel, likelihood=likelihood, inference_method=EP(), name='gp_classification')
--- a/GPy/models/gplvm.py
+++ b/GPy/models/gplvm.py
@ -5,7 +5,6 @@
 import numpy as np
 import pylab as pb
 from .. import kern
-from ..util.linalg import PCA
 from ..core import GP, Param
 from ..likelihoods import Gaussian
 from .. import util
@ -29,9 +28,11 @@ class GPLVM(GP):
        """
        if X is None:
            from ..util.initialization import initialize_latent
-            X = initialize_latent(init, input_dim, Y)
+            X, fracs = initialize_latent(init, input_dim, Y)
+        else:
+            fracs = np.ones(input_dim)
        if kernel is None:
-            kernel = kern.RBF(input_dim, ARD=input_dim > 1) + kern.Bias(input_dim, np.exp(-2))
+            kernel = kern.RBF(input_dim, lengthscale=fracs, ARD=input_dim > 1) + kern.Bias(input_dim, np.exp(-2))

        likelihood = Gaussian()

--- a/GPy/models/mrd.py
+++ b/GPy/models/mrd.py
@ -6,12 +6,12 @@ import itertools
 import pylab

 from ..core import Model
-from ..util.linalg import PCA
 from ..kern import Kern
 from ..core.parameterization.variational import NormalPosterior, NormalPrior
 from ..core.parameterization import Param, Parameterized
 from ..inference.latent_function_inference.var_dtc import VarDTCMissingData, VarDTC
 from ..likelihoods import Gaussian
+from GPy.util.initialization import initialize_latent

 class MRD(Model):
    """
@ -51,27 +51,28 @@ class MRD(Model):
                 inference_method=None, likelihood=None, name='mrd', Ynames=None):
        super(MRD, self).__init__(name)

+        self.input_dim = input_dim
+        self.num_inducing = num_inducing
+
+        self.Ylist = Ylist
+        self._in_init_ = True
+        X, fracs = self._init_X(initx, Ylist)
+        self.Z = Param('inducing inputs', self._init_Z(initz, X))
+        self.num_inducing = self.Z.shape[0] # ensure M==N if M>N
+
        # sort out the kernels
        if kernel is None:
            from ..kern import RBF
-            self.kern = [RBF(input_dim, ARD=1, name='rbf'.format(i)) for i in range(len(Ylist))]
+            self.kern = [RBF(input_dim, ARD=1, lengthscale=fracs[i], name='rbf'.format(i)) for i in range(len(Ylist))]
        elif isinstance(kernel, Kern):
            self.kern = [kernel.copy(name='{}'.format(kernel.name, i)) for i in range(len(Ylist))]
        else:
            assert len(kernel) == len(Ylist), "need one kernel per output"
            assert all([isinstance(k, Kern) for k in kernel]), "invalid kernel object detected!"
            self.kern = kernel
-        self.input_dim = input_dim
-        self.num_inducing = num_inducing
-
-        self.Ylist = Ylist
-        self._in_init_ = True
-        X = self._init_X(initx, Ylist)
-        self.Z = Param('inducing inputs', self._init_Z(initz, X))
-        self.num_inducing = self.Z.shape[0] # ensure M==N if M>N

        if X_variance is None:
-            X_variance = np.random.uniform(0, .2, X.shape)
+            X_variance = np.random.uniform(0, .1, X.shape)

        self.variational_prior = NormalPrior()
        self.X = NormalPosterior(X, X_variance)
@ -108,8 +109,7 @@ class MRD(Model):
        self._log_marginal_likelihood = 0
        self.posteriors = []
        self.Z.gradient = 0.
-        self.X.mean.gradient = 0.
-        self.X.variance.gradient = 0.
+        self.X.gradient = 0.

        for y, k, l, i in itertools.izip(self.Ylist, self.kern, self.likelihood, self.inference_method):
            posterior, lml, grad_dict = i.inference(k, self.X, self.Z, l, y)
@ -147,14 +147,20 @@ class MRD(Model):
        if Ylist is None:
            Ylist = self.Ylist
        if init in "PCA_concat":
-            X = PCA(np.hstack(Ylist), self.input_dim)[0]
+            X, fracs = initialize_latent('PCA', self.input_dim, np.hstack(Ylist))
+            fracs = [fracs]*self.input_dim
        elif init in "PCA_single":
            X = np.zeros((Ylist[0].shape[0], self.input_dim))
+            fracs = []
            for qs, Y in itertools.izip(np.array_split(np.arange(self.input_dim), len(Ylist)), Ylist):
-                X[:, qs] = PCA(Y, len(qs))[0]
+                x,frcs = initialize_latent('PCA', len(qs), Y)
+                X[:, qs] = x
+                fracs.append(frcs)
        else: # init == 'random':
            X = np.random.randn(Ylist[0].shape[0], self.input_dim)
-        return X
+            fracs = X.var(0)
+            fracs = [fracs]*self.input_dim
+        return X, fracs

    def _init_Z(self, init="permute", X=None):
        if X is None:
--- a/GPy/testing/index_operations_tests.py
+++ b/GPy/testing/index_operations_tests.py
@ -33,6 +33,8 @@ class Test(unittest.TestCase):
        self.assertListEqual(self.param_index[one].tolist(), [3])
        self.assertListEqual(self.param_index.remove('not in there', [2,3,4]).tolist(), [])

+        self.assertListEqual(self.view.remove('not in there', [2,3,4]).tolist(), [])
+
    def test_shift_left(self):
        self.view.shift_left(0, 2)
        self.assertListEqual(self.param_index[three].tolist(), [2,5])
@ -82,6 +84,10 @@ class Test(unittest.TestCase):
        self.assertEqual(self.param_index.size, 6)
        self.assertEqual(self.view.size, 5)

+    def test_print(self):
+        print self.param_index
+        print self.view
+
 if __name__ == "__main__":
    #import sys;sys.argv = ['', 'Test.test_index_view']
    unittest.main()
--- a/GPy/testing/kernel_tests.py
+++ b/GPy/testing/kernel_tests.py
@ -8,6 +8,7 @@ import sys

 verbose = 0

+np.random.seed(50)


 class Kern_check_model(GPy.core.Model):
@ -243,6 +244,17 @@ class KernelGradientTestsContinuous(unittest.TestCase):
        k.randomize()
        self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))

+    def test_Prod2(self):
+        k = (GPy.kern.RBF(2, active_dims=[0,4]) * GPy.kern.Linear(self.D))
+        k.randomize()
+        self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))
+
+    def test_Prod3(self):
+        k = GPy.kern.Matern32(2, active_dims=[2,3]) * (GPy.kern.RBF(2, active_dims=[0,4]) + GPy.kern.Linear(self.D))
+        k = (GPy.kern.RBF(2, active_dims=[0,4]) * GPy.kern.Linear(self.D))
+        k.randomize()
+        self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))
+
    def test_Add(self):
        k = GPy.kern.Matern32(2, active_dims=[2,3]) + GPy.kern.RBF(2, active_dims=[0,4]) + GPy.kern.Linear(self.D)
        k += GPy.kern.Matern32(2, active_dims=[2,3]) + GPy.kern.RBF(2, active_dims=[0,4]) + GPy.kern.Linear(self.D)
--- a/GPy/testing/model_tests.py
+++ b/GPy/testing/model_tests.py
@ -130,7 +130,14 @@ class MiscTests(unittest.TestCase):
        m2.kern[:] = m.kern[''].values()
        np.testing.assert_equal(m.log_likelihood(), m2.log_likelihood())

-class GradientTests(unittest.TestCase):
+    def test_model_optimize(self):
+        X = np.random.uniform(-3., 3., (20, 1))
+        Y = np.sin(X) + np.random.randn(20, 1) * 0.05
+        m = GPy.models.GPRegression(X,Y)
+        m.optimize()
+        print m
+
+class GradientTests(np.testing.TestCase):
    def setUp(self):
        ######################################
        # # 1 dimensional example
--- a/GPy/util/initialization.py
+++ b/GPy/util/initialization.py
@ -5,13 +5,15 @@ Created on 24 Feb 2014
 '''

 import numpy as np
-from linalg import PCA
+from GPy.util.pca import pca

 def initialize_latent(init, input_dim, Y):
    Xr = np.random.randn(Y.shape[0], input_dim)
    if init == 'PCA':
-        PC = PCA(Y, input_dim)[0]
+        p = pca(Y)
+        PC = p.project(Y, min(input_dim, Y.shape[1]))
        Xr[:PC.shape[0], :PC.shape[1]] = PC
    else:
-        pass
-    return Xr
+        var = Xr.var(0)
+        return Xr, var/var.max()
+    return Xr, p.fracs[:input_dim]
--- a/GPy/util/linalg.py
+++ b/GPy/util/linalg.py
@ -580,26 +580,3 @@ def backsub_both_sides(L, X, transpose='left'):
        tmp, _ = dtrtrs(L, X, lower=1, trans=0)
        return dtrtrs(L, tmp.T, lower=1, trans=0)[0].T

-def PCA(Y, input_dim):
-    """
-    Principal component analysis: maximum likelihood solution by SVD
-
-    :param Y: NxD np.array of data
-    :param input_dim: int, dimension of projection
-
-
-    :rval X: - Nxinput_dim np.array of dimensionality reduced data
-    :rval W: - input_dimxD mapping from X to Y
-
-    """
-    if not np.allclose(Y.mean(axis=0), 0.0):
-        print "Y is not zero mean, centering it locally (GPy.util.linalg.PCA)"
-
-        # Y -= Y.mean(axis=0)
-
-    Z = linalg.svd(Y - Y.mean(axis=0), full_matrices=False)
-    [X, W] = [Z[0][:, 0:input_dim], np.dot(np.diag(Z[1]), Z[2]).T[:, 0:input_dim]]
-    v = X.std(axis=0)
-    X /= v;
-    W *= v;
-    return X, W.T
--- a/GPy/util/pca.py
+++ b/GPy/util/pca.py
@ -0,0 +1,122 @@
+'''
+Created on 10 Sep 2012
+
+@author: Max Zwiessele
+@copyright: Max Zwiessele 2012
+'''
+import numpy
+import pylab
+import matplotlib
+from numpy.linalg.linalg import LinAlgError
+
+class pca(object):
+    """
+    pca module with automatic primal/dual determination.
+    """
+    def __init__(self, X):
+        self.mu = X.mean(0)
+        self.sigma = X.std(0)
+
+        X = self.center(X)
+
+        # self.X = input
+        if X.shape[0] >= X.shape[1]:
+            # print "N >= D: using primal"
+            self.eigvals, self.eigvectors = self._primal_eig(X)
+        else:
+            # print "N < D: using dual"
+            self.eigvals, self.eigvectors = self._dual_eig(X)
+        self.sort = numpy.argsort(self.eigvals)[::-1]
+        self.eigvals = self.eigvals[self.sort]
+        self.eigvectors = self.eigvectors[:, self.sort]
+        self.fracs = self.eigvals / self.eigvals.sum()
+        self.Q = self.eigvals.shape[0]
+
+    def center(self, X):
+        """
+        Center `X` in pca space.
+        """
+        X = X - self.mu
+        X = X / numpy.where(self.sigma == 0, 1e-30, self.sigma)
+        return X
+
+    def _primal_eig(self, X):
+        return numpy.linalg.eigh(numpy.einsum('ji,jk->ik',X,X))
+
+    def _dual_eig(self, X):
+        dual_eigvals, dual_eigvects = numpy.linalg.eigh(numpy.einsum('ij,kj->ik',X,X))
+        relevant_dimensions = numpy.argsort(numpy.abs(dual_eigvals))[-X.shape[1]:]
+        eigvals = dual_eigvals[relevant_dimensions]
+        eigvects = dual_eigvects[:, relevant_dimensions]
+        eigvects = (1. / numpy.sqrt(X.shape[0] * numpy.abs(eigvals))) * X.T.dot(eigvects)
+        eigvects /= numpy.sqrt(numpy.diag(eigvects.T.dot(eigvects)))
+        return eigvals, eigvects
+
+    def project(self, X, Q=None):
+        """
+        Project X into pca space, defined by the Q highest eigenvalues.
+        Y = X dot V
+        """
+        if Q is None:
+            Q = self.Q
+        if Q > X.shape[1]:
+            raise IndexError("requested dimension larger then input dimension")
+        X = self.center(X)
+        return X.dot(self.eigvectors[:, :Q])
+
+    def plot_fracs(self, Q=None, ax=None, fignum=None):
+        """
+        Plot fractions of Eigenvalues sorted in descending order.
+        """
+        if ax is None:
+            fig = pylab.figure(fignum)
+            ax = fig.add_subplot(111)
+        if Q is None:
+            Q = self.Q
+        ticks = numpy.arange(Q)
+        bar = ax.bar(ticks - .4, self.fracs[:Q])
+        ax.set_xticks(ticks, map(lambda x: r"${}$".format(x), ticks + 1))
+        ax.set_ylabel("Eigenvalue fraction")
+        ax.set_xlabel("PC")
+        ax.set_ylim(0, ax.get_ylim()[1])
+        ax.set_xlim(ticks.min() - .5, ticks.max() + .5)
+        try:
+            pylab.tight_layout()
+        except:
+            pass
+        return bar
+
+    def plot_2d(self, X, labels=None, s=20, marker='o',
+                dimensions=(0, 1), ax=None, colors=None,
+                fignum=None, cmap=matplotlib.cm.jet, # @UndefinedVariable
+                ** kwargs):
+        """
+        Plot dimensions `dimensions` with given labels against each other in 
+        PC space. Labels can be any sequence of labels of dimensions X.shape[0].
+        Labels can be drawn with a subsequent call to legend()
+        """
+        if ax is None:
+            fig = pylab.figure(fignum)
+            ax = fig.add_subplot(111)
+        if labels is None:
+            labels = numpy.zeros(X.shape[0])
+        ulabels = []
+        for lab in labels:
+            if not lab in ulabels:
+                ulabels.append(lab)
+        nlabels = len(ulabels)
+        if colors is None:
+            colors = [cmap(float(i) / nlabels) for i in range(nlabels)]
+        X_ = self.project(X, self.Q)[:,dimensions]
+        kwargs.update(dict(s=s))
+        plots = list()
+        for i, l in enumerate(ulabels):
+            kwargs.update(dict(color=colors[i], marker=marker[i % len(marker)]))
+            plots.append(ax.scatter(*X_[labels == l, :].T, label=str(l), **kwargs))
+        ax.set_xlabel(r"PC$_1$")
+        ax.set_ylabel(r"PC$_2$")
+        try:
+            pylab.tight_layout()
+        except:
+            pass
+        return plots