diff --git a/GPy/core/model.py b/GPy/core/model.py index 94202396..e3a9bb68 100644 --- a/GPy/core/model.py +++ b/GPy/core/model.py @@ -244,6 +244,12 @@ class model(parameterised): LL_gradients = self._transform_gradients(self._log_likelihood_gradients()) prior_gradients = self._transform_gradients(self._log_prior_gradients()) obj_grads = -LL_gradients - prior_gradients + print self + print self._get_params() + print -obj_grads + self.plot() + if isinstance(self.likelihood, likelihoods.Laplace): + import ipdb; ipdb.set_trace() ### XXX BREAKPOINT return obj_f, obj_grads def optimize(self, optimizer=None, start=None, **kwargs): diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py index 279bc597..2b93122c 100644 --- a/GPy/examples/laplace_approximations.py +++ b/GPy/examples/laplace_approximations.py @@ -85,10 +85,60 @@ def v_fail_test(): import ipdb; ipdb.set_trace() ### XXX BREAKPOINT print(m) +def student_t_obj_plane(): + plt.close('all') + X = np.linspace(0, 1, 50)[:, None] + real_std = 0.002 + noise = np.random.randn(*X.shape)*real_std + Y = np.sin(X*2*np.pi) + noise + deg_free = 1000 + + kernelgp = GPy.kern.rbf(X.shape[1]) # + GPy.kern.white(X.shape[1]) + mgp = GPy.models.GP_regression(X, Y, kernel=kernelgp) + mgp.ensure_default_constraints() + mgp['noise'] = real_std**2 + print "Gaussian" + print mgp + + kernelst = kernelgp.copy() + t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=(real_std**2)) + stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm') + m = GPy.models.GP(X, stu_t_likelihood, kernelst) + m.ensure_default_constraints() + m.constrain_fixed('t_no', real_std**2) + vs = 10 + ls = 10 + objs_t = np.zeros((vs, ls)) + objs_g = np.zeros((vs, ls)) + rbf_vs = np.linspace(1e-6, 8, vs) + rbf_ls = np.linspace(1e-2, 8, ls) + for v_id, rbf_v in enumerate(rbf_vs): + for l_id, rbf_l in enumerate(rbf_ls): + m['rbf_v'] = rbf_v + m['rbf_l'] = rbf_l + mgp['rbf_v'] = rbf_v + mgp['rbf_l'] = rbf_l + objs_t[v_id, l_id] = m.log_likelihood() + objs_g[v_id, l_id] = mgp.log_likelihood() + plt.figure() + plt.subplot(211) + plt.title('Student t') + plt.imshow(objs_t, interpolation='none') + plt.xlabel('variance') + plt.ylabel('lengthscale') + plt.subplot(212) + plt.title('Gaussian') + plt.imshow(objs_g, interpolation='none') + plt.xlabel('variance') + plt.ylabel('lengthscale') + plt.show() + import ipdb; ipdb.set_trace() ### XXX BREAKPOINT + return objs_t + def student_t_f_check(): plt.close('all') X = np.linspace(0, 1, 50)[:, None] - real_std = 0.001 + real_std = 0.2 noise = np.random.randn(*X.shape)*real_std Y = np.sin(X*2*np.pi) + noise deg_free = 1000 @@ -98,17 +148,26 @@ def student_t_f_check(): mgp.ensure_default_constraints() mgp.randomize() mgp.optimize() + print "Gaussian" print mgp import ipdb; ipdb.set_trace() ### XXX BREAKPOINT kernelst = kernelgp.copy() - t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=1e-5) + #kernelst += GPy.kern.bias(X.shape[1]) + t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=0.05) stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm') m = GPy.models.GP(X, stu_t_likelihood, kernelst) - m['rbf_v'] = mgp._get_params()[0] - m['rbf_l'] = mgp._get_params()[1] + 1 + #m['rbf_v'] = mgp._get_params()[0] + #m['rbf_l'] = mgp._get_params()[1] + 1 m.ensure_default_constraints() + #m.constrain_fixed('rbf_v', mgp._get_params()[0]) + #m.constrain_fixed('rbf_l', mgp._get_params()[1]) + #m.constrain_bounded('t_no', 2*real_std**2, 1e3) + #m.constrain_positive('bias') m.constrain_positive('t_no') + m.randomize() + m['t_no'] = 0.3 + m.likelihood.X = X print m plt.figure() plt.subplot(511) @@ -143,7 +202,8 @@ def student_t_fix_optimise_check(): Y = np.sin(X*2*np.pi) + noise X_full = X Y_full = np.sin(X_full) - #Y = Y/Y.max() + Y = Y/Y.max() + Y_full = Y_full/Y_full.max() deg_free = 1000 #GP @@ -219,7 +279,7 @@ def student_t_fix_optimise_check(): plt.subplot(121) mrbf.plot() plt.title('Student t fixed noise') - #mrbf.optimize() + mrbf.optimize() print "After optimize" print mrbf plt.subplot(122) diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py index 5343f5dc..8b39f222 100644 --- a/GPy/likelihoods/Laplace.py +++ b/GPy/likelihoods/Laplace.py @@ -156,17 +156,23 @@ class Laplace(likelihood): Y_tilde = Wi*self.Ki_f + self.f_hat self.Wi_K_i = self.W_12*self.Bi*self.W_12.T #same as rasms R + #self.Wi_K_i[self.Wi_K_i< 1e-6] = 1e-6 + self.ln_det_K_Wi__Bi = self.ln_I_KW_det + pddet(self.Sigma_tilde + self.K) self.lik = self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data) self.y_Wi_Ki_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde) - Z_tilde = (#+ self.NORMAL_CONST + self.aA = 0.5*self.ln_det_K_Wi__Bi + self.bB = - 0.5*self.f_Ki_f + self.cC = 0.5*self.y_Wi_Ki_i_y + Z_tilde = (+ 100*self.NORMAL_CONST + self.lik + 0.5*self.ln_det_K_Wi__Bi - 0.5*self.f_Ki_f + 0.5*self.y_Wi_Ki_i_y ) - #print "Ztilde: {}".format(Z_tilde) + print "Ztilde: {} lik: {} a: {} b: {} c: {}".format(Z_tilde, self.lik, self.aA, self.bB, self.cC) + print self.likelihood_function._get_params() #Convert to float as its (1, 1) and Z must be a scalar self.Z = np.float64(Z_tilde) @@ -198,7 +204,7 @@ class Laplace(likelihood): self.W = -self.likelihood_function.d2lik_d2f(self.data, self.f_hat, extra_data=self.extra_data) if not self.likelihood_function.log_concave: - self.W[self.W < 0] = 1e-10 # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur + self.W[self.W < 0] = 1e-6 # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur #If the likelihood is non-log-concave. We wan't to say that there is a negative variance #To cause the posterior to become less certain than the prior and likelihood, #This is a property only held by non-log-concave likelihoods @@ -280,7 +286,7 @@ class Laplace(likelihood): f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess, disp=False) return f_hat[:, None] - def rasm_mode(self, K, MAX_ITER=40, MAX_RESTART=10): + def rasm_mode(self, K, MAX_ITER=100, MAX_RESTART=10): """ Rasmussens numerically stable mode finding For nomenclature see Rasmussen & Williams 2006 @@ -290,15 +296,19 @@ class Laplace(likelihood): :MAX_RESTART: Maximum number of restarts (reducing step_size) before forcing finish of optimisation :returns: f_mode """ - old_a = np.zeros((self.N, 1)) - #old_a = None - #if self.old_a is None: - #old_a = np.zeros((self.N, 1)) - #else: - #old_a = self.old_a + self.old_before_s = self.likelihood_function._get_params() + print "before: ", self.old_before_s + #if self.old_before_s < 1e-5: + #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT + + #old_a = np.zeros((self.N, 1)) + if self.old_a is None: + old_a = np.zeros((self.N, 1)) + f = np.dot(K, old_a) + else: + old_a = self.old_a.copy() + f = self.f_hat.copy() - f = np.dot(self.K, old_a) - self.f = f new_obj = -np.inf old_obj = np.inf @@ -306,18 +316,20 @@ class Laplace(likelihood): return -0.5*np.dot(a.T, f) + self.likelihood_function.link_function(self.data, f, extra_data=self.extra_data) difference = np.inf - epsilon = 1e-10 + epsilon = 1e-4 step_size = 1 rs = 0 i = 0 - while difference > epsilon and i < MAX_ITER and rs < MAX_RESTART: + + while difference > epsilon and i < MAX_ITER:# and rs < MAX_RESTART: W = -self.likelihood_function.d2lik_d2f(self.data, f, extra_data=self.extra_data) + #W = np.maximum(W, 0) if not self.likelihood_function.log_concave: - W[W < 0] = 1e-10 # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur + W[W < 0] = 1e-6 # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur # If the likelihood is non-log-concave. We wan't to say that there is a negative variance # To cause the posterior to become less certain than the prior and likelihood, # This is a property only held by non-log-concave likelihoods - B, L, W_12 = self._compute_B_statistics(K, W) + B, L, W_12 = self._compute_B_statistics(K, W.copy()) W_f = W*f grad = self.likelihood_function.dlik_df(self.data, f, extra_data=self.extra_data) @@ -328,54 +340,105 @@ class Laplace(likelihood): full_step_a = b - W_12*solve_L da = full_step_a - old_a - f_old = f.copy() - - f_old = self.f.copy() - def inner_obj(step_size, old_a, da, K): - a = old_a + step_size*da - f = np.dot(K, a) - self.a = a # This is nasty, need to set something within an optimization though - self.f = f - return -obj(a, f) - - from functools import partial - i_o = partial(inner_obj, old_a=old_a, da=da, K=self.K) - new_obj = sp.optimize.brent(i_o, tol=1e-6, maxiter=10) - - #update_passed = False - #while not update_passed: + #f_old = f.copy() + #def inner_obj(step_size, old_a, da, K): #a = old_a + step_size*da #f = np.dot(K, a) + #self.a = a.copy() # This is nasty, need to set something within an optimization though + #self.f = f.copy() + #return -obj(a, f) - #old_obj = new_obj - #new_obj = obj(a, f) - #difference = new_obj - old_obj - #print "difference: ",difference - #if difference < 0: - ##print grad - ##print "Objective function rose", np.float(difference) - ##If the objective function isn't rising, restart optimization - #step_size *= 0.8 - ##print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size) - ##objective function isn't increasing, try reducing step size - ##f = f_old #it's actually faster not to go back to old location and just zigzag across the mode - ##old_obj = tmp_old_obj - #old_obj = new_obj - #rs += 1 - #else: - #update_passed = True + #from functools import partial + #i_o = partial(inner_obj, old_a=old_a, da=da, K=K) + ##new_obj = sp.optimize.brent(i_o, tol=1e-4, maxiter=20) + #new_obj = sp.optimize.minimize_scalar(i_o, method='brent', tol=1e-4, options={'maxiter':20, 'disp':True}).fun + #f = self.f.copy() + #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT - f = self.f - difference = new_obj - old_obj - difference = np.abs(np.sum(f - f_old)) #+ abs(difference) - old_a = self.a #a + f_old = f.copy() + update_passed = False + while not update_passed: + a = old_a + step_size*da + f = np.dot(K, a) + + old_obj = new_obj + new_obj = obj(a, f) + difference = new_obj - old_obj + print "difference: ",difference + if difference < 0: + #print "Objective function rose", np.float(difference) + #If the objective function isn't rising, restart optimization + step_size *= 0.8 + #print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size) + #objective function isn't increasing, try reducing step size + f = f_old.copy() #it's actually faster not to go back to old location and just zigzag across the mode + old_obj = new_obj + rs += 1 + else: + update_passed = True + + #difference = abs(new_obj - old_obj) + #old_obj = new_obj.copy() + difference = np.abs(np.sum(f - f_old)) + #old_a = self.a.copy() #a + old_a = a.copy() i += 1 + #print "a max: {} a min: {} a var: {}".format(np.max(self.a), np.min(self.a), np.var(self.a)) - self.old_a = old_a + self.old_a = old_a.copy() #print "Positive difference obj: ", np.float(difference) #print "Iterations: {}, Step size reductions: {}, Final_difference: {}, step_size: {}".format(i, rs, difference, step_size) print "Iterations: {}, Final_difference: {}".format(i, difference) - #self.a = a + if difference > 1e-4: + print "FAIL FAIL FAIL FAIL FAIL FAIL" + import ipdb; ipdb.set_trace() ### XXX BREAKPOINT + if hasattr(self, 'X'): + import pylab as pb + pb.figure() + pb.subplot(311) + pb.title('old f_hat') + pb.plot(self.X, self.f_hat) + pb.subplot(312) + pb.title('old ff') + pb.plot(self.X, self.old_ff) + pb.subplot(313) + pb.title('new f_hat') + pb.plot(self.X, f) + + pb.figure() + pb.subplot(121) + pb.title('old K') + pb.imshow(np.diagflat(self.old_K), interpolation='none') + pb.colorbar() + pb.subplot(122) + pb.title('new K') + pb.imshow(np.diagflat(K), interpolation='none') + pb.colorbar() + + pb.figure() + pb.subplot(121) + pb.title('old W') + pb.imshow(np.diagflat(self.old_W), interpolation='none') + pb.colorbar() + pb.subplot(122) + pb.title('new W') + pb.imshow(np.diagflat(W), interpolation='none') + pb.colorbar() + + import ipdb; ipdb.set_trace() ### XXX BREAKPOINT + pb.close('all') + + #FIXME: DELETE THESE + self.old_W = W.copy() + self.old_grad = grad.copy() + self.old_B = B.copy() + self.old_W_12 = W_12.copy() + self.old_ff = f.copy() + self.old_K = self.K.copy() + self.old_s = self.likelihood_function._get_params() + print "after: ", self.old_s + #print "FINAL a max: {} a min: {} a var: {}".format(np.max(self.a), np.min(self.a), np.var(self.a)) + self.a = a #self.B, self.B_chol, self.W_12 = B, L, W_12 #self.Bi, _, _, B_det = pdinv(self.B) return f diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py index 595fa63c..62e09a1a 100644 --- a/GPy/likelihoods/likelihood_functions.py +++ b/GPy/likelihoods/likelihood_functions.py @@ -193,11 +193,16 @@ class student_t(likelihood_function): """ assert y.shape == f.shape e = y - f + #A = gammaln((self.v + 1) * 0.5) + #B = - gammaln(self.v * 0.5) + #C = - 0.5*np.log(self.sigma2 * self.v * np.pi) + #D = + (-(self.v + 1)*0.5)*np.log(1 + ((e**2)/self.sigma2)/np.float(self.v)) objective = (+ gammaln((self.v + 1) * 0.5) - gammaln(self.v * 0.5) - 0.5*np.log(self.sigma2 * self.v * np.pi) + (-(self.v + 1)*0.5)*np.log(1 + ((e**2)/self.sigma2)/np.float(self.v)) ) + #print "C: {} D: {} obj: {}".format(C, np.sum(D), objective.sum()) return np.sum(objective) def dlik_df(self, y, f, extra_data=None): @@ -459,147 +464,153 @@ class weibull_survival(likelihood_function): hess = (y**self.shape)*np.exp(f) return np.squeeze(hess) -#class gaussian(likelihood_function): - #""" - #Gaussian likelihood - this is a test class for approximation schemes - #""" - #def __init__(self, variance): - #self._set_params(np.asarray(variance)) +class gaussian(likelihood_function): + """ + Gaussian likelihood - this is a test class for approximation schemes + """ + def __init__(self, variance): + self._set_params(np.asarray(variance)) - #def _get_params(self): - #return np.asarray(self.sigma2) + def _get_params(self): + return np.asarray(self._variance) - #def _get_param_names(self): - #return ["noise_variance"] + def _get_param_names(self): + return ["noise_variance"] - #def _set_params(self, x): - #self.variance = float(x) + def _set_params(self, x): + self._variance = float(x) + self.covariance_matrix = np.eye(self.N) * self._variance + self.Ki, _, _, self.ln_K = pdinv(self.covariance_matrix) # THIS MAY BE WRONG - #def link_function(self, y, f, extra_data=None): - #"""link_function $\ln p(y|f)$ - #$$\ln p(y_{i}|f_{i}) = \ln $$ + def link_function(self, y, f, extra_data=None): + """link_function $\ln p(y|f)$ + $$\ln p(y_{i}|f_{i}) = \ln $$ - #:y: data - #:f: latent variables f - #:extra_data: extra_data which is not used in student t distribution - #:returns: float(likelihood evaluated for this point) + :y: data + :f: latent variables f + :extra_data: extra_data which is not used in student t distribution + :returns: float(likelihood evaluated for this point) - #""" - #assert y.shape == f.shape - #e = y - f - #objective = -0.5*self.D* - #return np.sum(objective) + """ + assert y.shape == f.shape + e = y - f + eeT = np.dot(e, e.T) + objective = (- 0.5*self.D*np.log(2*np.pi) + - 0.5*self.ln_K + - 0.5*np.sum(np.multiply(self.Ki, eeT)) + ) + return np.sum(objective) - #def dlik_df(self, y, f, extra_data=None): - #""" - #Gradient of the link function at y, given f w.r.t f + def dlik_df(self, y, f, extra_data=None): + """ + Gradient of the link function at y, given f w.r.t f - #$$\frac{dp(y_{i}|f_{i})}{df} = \frac{(v+1)(y_{i}-f_{i})}{(y_{i}-f_{i})^{2} + \sigma^{2}v}$$ + $$\frac{dp(y_{i}|f_{i})}{df} = \frac{(v+1)(y_{i}-f_{i})}{(y_{i}-f_{i})^{2} + \sigma^{2}v}$$ - #:y: data - #:f: latent variables f - #:extra_data: extra_data which is not used in student t distribution - #:returns: gradient of likelihood evaluated at points + :y: data + :f: latent variables f + :extra_data: extra_data which is not used in student t distribution + :returns: gradient of likelihood evaluated at points - #""" - #assert y.shape == f.shape - #e = y - f - #grad = ((self.v + 1) * e) / (self.v * self.sigma2 + (e**2)) - #return grad + """ + assert y.shape == f.shape + e = y - f + grad = ((self.v + 1) * e) / (self.v * self.sigma2 + (e**2)) + return grad - #def d2lik_d2f(self, y, f, extra_data=None): - #""" - #Hessian at this point (if we are only looking at the link function not the prior) the hessian will be 0 unless i == j - #i.e. second derivative link_function at y given f f_j w.r.t f and f_j + def d2lik_d2f(self, y, f, extra_data=None): + """ + Hessian at this point (if we are only looking at the link function not the prior) the hessian will be 0 unless i == j + i.e. second derivative link_function at y given f f_j w.r.t f and f_j - #Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases - #(the distribution for y_{i} depends only on f_{i} not on f_{j!=i} + Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases + (the distribution for y_{i} depends only on f_{i} not on f_{j!=i} - #$$\frac{d^{2}p(y_{i}|f_{i})}{d^{3}f} = \frac{(v+1)((y_{i}-f_{i})^{2} - \sigma^{2}v)}{((y_{i}-f_{i})^{2} + \sigma^{2}v)^{2}}$$ + $$\frac{d^{2}p(y_{i}|f_{i})}{d^{3}f} = \frac{(v+1)((y_{i}-f_{i})^{2} - \sigma^{2}v)}{((y_{i}-f_{i})^{2} + \sigma^{2}v)^{2}}$$ - #:y: data - #:f: latent variables f - #:extra_data: extra_data which is not used in student t distribution - #:returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points) - #""" - #assert y.shape == f.shape - #e = y - f - #hess = ((self.v + 1)*(e**2 - self.v*self.sigma2)) / (((self.sigma2*self.v) + e**2)**2) - #return hess + :y: data + :f: latent variables f + :extra_data: extra_data which is not used in student t distribution + :returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points) + """ + assert y.shape == f.shape + e = y - f + hess = ((self.v + 1)*(e**2 - self.v*self.sigma2)) / (((self.sigma2*self.v) + e**2)**2) + return hess - #def d3lik_d3f(self, y, f, extra_data=None): - #""" - #Third order derivative link_function (log-likelihood ) at y given f f_j w.r.t f and f_j + def d3lik_d3f(self, y, f, extra_data=None): + """ + Third order derivative link_function (log-likelihood ) at y given f f_j w.r.t f and f_j - #$$\frac{d^{3}p(y_{i}|f_{i})}{d^{3}f} = \frac{-2(v+1)((y_{i} - f_{i})^3 - 3(y_{i} - f_{i}) \sigma^{2} v))}{((y_{i} - f_{i}) + \sigma^{2} v)^3}$$ - #""" - #assert y.shape == f.shape - #e = y - f - #d3lik_d3f = ( -(2*(self.v + 1)*(-e)*(e**2 - 3*self.v*self.sigma2)) / - #((e**2 + self.sigma2*self.v)**3) - #) - #return d3lik_d3f + $$\frac{d^{3}p(y_{i}|f_{i})}{d^{3}f} = \frac{-2(v+1)((y_{i} - f_{i})^3 - 3(y_{i} - f_{i}) \sigma^{2} v))}{((y_{i} - f_{i}) + \sigma^{2} v)^3}$$ + """ + assert y.shape == f.shape + e = y - f + d3lik_d3f = ( -(2*(self.v + 1)*(-e)*(e**2 - 3*self.v*self.sigma2)) / + ((e**2 + self.sigma2*self.v)**3) + ) + return d3lik_d3f - #def lik_dstd(self, y, f, extra_data=None): - #""" - #Gradient of the likelihood (lik) w.r.t sigma parameter (standard deviation) + def lik_dstd(self, y, f, extra_data=None): + """ + Gradient of the likelihood (lik) w.r.t sigma parameter (standard deviation) - #Terms relavent to derivatives wrt sigma are: - #-log(sqrt(v*pi)*s) -(1/2)*(v + 1)*log(1 + (1/v)*((y-f)/(s))^2)) + Terms relavent to derivatives wrt sigma are: + -log(sqrt(v*pi)*s) -(1/2)*(v + 1)*log(1 + (1/v)*((y-f)/(s))^2)) - #$$\frac{dp(y_{i}|f_{i})}{d\sigma} = -\frac{1}{\sigma} + \frac{(1+v)(y_{i}-f_{i})^2}{\sigma^3 v(1 + \frac{1}{v}(\frac{(y_{i} - f_{i})}{\sigma^2})^2)}$$ - #""" - #assert y.shape == f.shape - #e = y - f - #sigma = np.sqrt(self.sigma2) - ##dlik_dsigma = ( - (1/sigma) + - ##((1+self.v)*(e**2))/((sigma*self.sigma2)*self.v*(1 + ((e**2) / (self.sigma2*self.v)) ) ) - ##) - ##dlik_dsigma = ( - 1 + - ##((1+self.v)*(e**2))/((self.sigma2*self.sigma2)*self.v*(1 + ((e**2) / (self.sigma2*self.v)) ) ) - ##) - ##dlik_dsigma = (((self.v + 1)*(e**2))/((e**2) + self.v*(self.sigma**2))) - 1 - #dlik_dsigma = (self.v*((e**2)-self.sigma2))/(sigma*((e**2)+self.sigma2*self.v)) - #return dlik_dsigma + $$\frac{dp(y_{i}|f_{i})}{d\sigma} = -\frac{1}{\sigma} + \frac{(1+v)(y_{i}-f_{i})^2}{\sigma^3 v(1 + \frac{1}{v}(\frac{(y_{i} - f_{i})}{\sigma^2})^2)}$$ + """ + assert y.shape == f.shape + e = y - f + sigma = np.sqrt(self.sigma2) + #dlik_dsigma = ( - (1/sigma) + + #((1+self.v)*(e**2))/((sigma*self.sigma2)*self.v*(1 + ((e**2) / (self.sigma2*self.v)) ) ) + #) + #dlik_dsigma = ( - 1 + + #((1+self.v)*(e**2))/((self.sigma2*self.sigma2)*self.v*(1 + ((e**2) / (self.sigma2*self.v)) ) ) + #) + #dlik_dsigma = (((self.v + 1)*(e**2))/((e**2) + self.v*(self.sigma**2))) - 1 + dlik_dsigma = (self.v*((e**2)-self.sigma2))/(sigma*((e**2)+self.sigma2*self.v)) + return dlik_dsigma - #def dlik_df_dstd(self, y, f, extra_data=None): - #""" - #Gradient of the dlik_df w.r.t sigma parameter (standard deviation) + def dlik_df_dstd(self, y, f, extra_data=None): + """ + Gradient of the dlik_df w.r.t sigma parameter (standard deviation) - #$$\frac{d}{d\sigma}(\frac{dp(y_{i}|f_{i})}{df}) = \frac{-2\sigma v(v + 1)(y_{i}-f_{i})}{(y_{i}-f_{i})^2 + \sigma^2 v)^2}$$ - #""" - #assert y.shape == f.shape - #e = y - f - #sigma = np.sqrt(self.sigma2) - #dlik_grad_dsigma = ((-2*sigma*self.v*(self.v + 1)*e) #2 might not want to be here? - #/ ((self.v*self.sigma2 + e**2)**2) - #) - #return dlik_grad_dsigma + $$\frac{d}{d\sigma}(\frac{dp(y_{i}|f_{i})}{df}) = \frac{-2\sigma v(v + 1)(y_{i}-f_{i})}{(y_{i}-f_{i})^2 + \sigma^2 v)^2}$$ + """ + assert y.shape == f.shape + e = y - f + sigma = np.sqrt(self.sigma2) + dlik_grad_dsigma = ((-2*sigma*self.v*(self.v + 1)*e) #2 might not want to be here? + / ((self.v*self.sigma2 + e**2)**2) + ) + return dlik_grad_dsigma - #def d2lik_d2f_dstd(self, y, f, extra_data=None): - #""" - #Gradient of the hessian (d2lik_d2f) w.r.t sigma parameter (standard deviation) + def d2lik_d2f_dstd(self, y, f, extra_data=None): + """ + Gradient of the hessian (d2lik_d2f) w.r.t sigma parameter (standard deviation) - #$$\frac{d}{d\sigma}(\frac{d^{2}p(y_{i}|f_{i})}{d^{2}f}) = \frac{2\sigma v(v + 1)(\sigma^2 v - 3(y-f)^2)}{((y-f)^2 + \sigma^2 v)^3}$$ - #""" - #assert y.shape == f.shape - #e = y - f - #sigma = np.sqrt(self.sigma2) - #dlik_hess_dsigma = ( (2*sigma*self.v*(self.v + 1)*(self.sigma2*self.v - 3*(e**2))) / - #((e**2 + self.sigma2*self.v)**3) - #) - ##dlik_hess_dsigma = ( 2*(self.v + 1)*self.v*(self.sigma**2)*((e**2) + (self.v*(self.sigma**2)) - 4*(e**2)) - ##/ ((e**2 + (self.sigma**2)*self.v)**3) ) - #return dlik_hess_dsigma + $$\frac{d}{d\sigma}(\frac{d^{2}p(y_{i}|f_{i})}{d^{2}f}) = \frac{2\sigma v(v + 1)(\sigma^2 v - 3(y-f)^2)}{((y-f)^2 + \sigma^2 v)^3}$$ + """ + assert y.shape == f.shape + e = y - f + sigma = np.sqrt(self.sigma2) + dlik_hess_dsigma = ( (2*sigma*self.v*(self.v + 1)*(self.sigma2*self.v - 3*(e**2))) / + ((e**2 + self.sigma2*self.v)**3) + ) + #dlik_hess_dsigma = ( 2*(self.v + 1)*self.v*(self.sigma**2)*((e**2) + (self.v*(self.sigma**2)) - 4*(e**2)) + #/ ((e**2 + (self.sigma**2)*self.v)**3) ) + return dlik_hess_dsigma - #def _gradients(self, y, f, extra_data=None): - ##must be listed in same order as 'get_param_names' - #derivs = ([self.lik_dstd(y, f, extra_data=extra_data)], - #[self.dlik_df_dstd(y, f, extra_data=extra_data)], - #[self.d2lik_d2f_dstd(y, f, extra_data=extra_data)] - #) # lists as we might learn many parameters - ## ensure we have gradients for every parameter we want to optimize - #assert len(derivs[0]) == len(self._get_param_names()) - #assert len(derivs[1]) == len(self._get_param_names()) - #assert len(derivs[2]) == len(self._get_param_names()) - #return derivs + def _gradients(self, y, f, extra_data=None): + #must be listed in same order as 'get_param_names' + derivs = ([self.lik_dstd(y, f, extra_data=extra_data)], + [self.dlik_df_dstd(y, f, extra_data=extra_data)], + [self.d2lik_d2f_dstd(y, f, extra_data=extra_data)] + ) # lists as we might learn many parameters + # ensure we have gradients for every parameter we want to optimize + assert len(derivs[0]) == len(self._get_param_names()) + assert len(derivs[1]) == len(self._get_param_names()) + assert len(derivs[2]) == len(self._get_param_names()) + return derivs