diff --git a/GPy/examples/dimensionality_reduction.py b/GPy/examples/dimensionality_reduction.py
index 97601b0f..f1bb5d0d 100644
--- a/GPy/examples/dimensionality_reduction.py
+++ b/GPy/examples/dimensionality_reduction.py
@@ -207,7 +207,7 @@ def bgplvm_simulation(burnin='scg', plot_sim=False,
                       max_burnin=100, true_X=False,
                       do_opt=True, max_f_eval=1000):
-    D1, D2, D3, N, M, Q = 10, 8, 8, 250, 10, 6
+    D1, D2, D3, N, M, Q = 15, 8, 8, 350, 3, 6
     slist, Slist, Ylist = _simulate_sincos(D1, D2, D3, N, M, Q, plot_sim)
 
     from GPy.models import mrd
 
diff --git a/GPy/inference/SCG.py b/GPy/inference/SCG.py
index ca42acfe..7bfa16ce 100644
--- a/GPy/inference/SCG.py
+++ b/GPy/inference/SCG.py
@@ -1,6 +1,6 @@
-#Copyright I. Nabney, N.Lawrence and James Hensman (1996 - 2012)
+# Copyright I. Nabney, N.Lawrence and James Hensman (1996 - 2012)
 
-#Scaled Conjuagte Gradients, originally in Matlab as part of the Netlab toolbox by I. Nabney, converted to python N. Lawrence and given a pythonic interface by James Hensman
+# Scaled Conjugate Gradients, originally in Matlab as part of the Netlab toolbox by I. Nabney, converted to python N. Lawrence and given a pythonic interface by James Hensman
 
 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT
 # HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
@@ -25,7 +25,7 @@ import numpy as np
 import sys
 
 
-def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=500, display=True, xtol=1e-6, ftol=1e-6):
+def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=500, display=True, xtol=None, ftol=None, gtol=None):
     """
     Optimisation through Scaled Conjugate Gradients (SCG)
 
@@ -38,19 +38,24 @@ def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=500, display=True, xto
     flog : a list of all the objective values
     """
-
+    if xtol is None:
+        xtol = 1e-6
+    if ftol is None:
+        ftol = 1e-6
+    if gtol is None:
+        gtol = 1e-5
     sigma0 = 1.0e-4
-    fold = f(x, *optargs) # Initial function value.
+    fold = f(x, *optargs)  # Initial function value.
     function_eval = 1
     fnow = fold
-    gradnew = gradf(x, *optargs) # Initial gradient.
+    gradnew = gradf(x, *optargs)  # Initial gradient.
     gradold = gradnew.copy()
-    d = -gradnew # Initial search direction.
-    success = True # Force calculation of directional derivs.
-    nsuccess = 0 # nsuccess counts number of successes.
-    beta = 1.0 # Initial scale parameter.
-    betamin = 1.0e-15 # Lower bound on scale.
-    betamax = 1.0e100 # Upper bound on scale.
+    d = -gradnew  # Initial search direction.
+    success = True  # Force calculation of directional derivs.
+    nsuccess = 0  # nsuccess counts number of successes.
+    beta = 1.0  # Initial scale parameter.
+    betamin = 1.0e-15  # Lower bound on scale.
+    betamax = 1.0e100  # Upper bound on scale.
     status = "Not converged"
 
     flog = [fold]
@@ -67,21 +72,21 @@ def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=500, display=True, xto
                 d = -gradnew
                 mu = np.dot(d, gradnew)
             kappa = np.dot(d, d)
-            sigma = sigma0/np.sqrt(kappa)
-            xplus = x + sigma*d
+            sigma = sigma0 / np.sqrt(kappa)
+            xplus = x + sigma * d
             gplus = gradf(xplus, *optargs)
-            theta = np.dot(d, (gplus - gradnew))/sigma
+            theta = np.dot(d, (gplus - gradnew)) / sigma
 
         # Increase effective curvature and evaluate step size alpha.
-        delta = theta + beta*kappa
+        delta = theta + beta * kappa
         if delta <= 0:
-            delta = beta*kappa
-            beta = beta - theta/kappa
+            delta = beta * kappa
+            beta = beta - theta / kappa
 
-        alpha = - mu/delta
+        alpha = -mu / delta
 
         # Calculate the comparison ratio.
-        xnew = x + alpha*d
+        xnew = x + alpha * d
         fnew = f(xnew, *optargs)
         function_eval += 1
 
@@ -89,8 +94,8 @@ def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=500, display=True, xto
             status = "Maximum number of function evaluations exceeded"
             return x, flog, function_eval, status
 
-        Delta = 2.*(fnew - fold)/(alpha*mu)
-        if Delta >= 0.:
+        Delta = 2.*(fnew - fold) / (alpha * mu)
+        if Delta >= 0.:
             success = True
             nsuccess += 1
             x = xnew
@@ -100,19 +105,19 @@ def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=500, display=True, xto
             fnow = fold
 
         # Store relevant variables
-        flog.append(fnow) # Current function value
+        flog.append(fnow)  # Current function value
 
        iteration += 1
 
         if display:
             print '\r',
-            print 'Iteration: {0:>5g} Objective:{1:> 12e} Scale:{2:> 12e}'.format(iteration, fnow, beta),
+            print 'i: {0:>5g} f:{1:> 12e} b:{2:> 12e} |g|:{3:> 12e}'.format(iteration, fnow, beta, np.dot(gradnew, gradnew)),
             # print 'Iteration:', iteration, ' Objective:', fnow, ' Scale:', beta, '\r',
             sys.stdout.flush()
 
         if success:
             # Test for termination
-            if (np.max(np.abs(alpha*d)) < xtol) or (np.abs(fnew-fold) < ftol):
-                status='converged'
+            if (np.dot(gradnew, gradnew)) <= gtol or (np.max(np.abs(alpha * d)) < xtol) or (np.abs(fnew - fold) < ftol):
+                status = 'converged'
                 return x, flog, function_eval, status
             else:
@@ -121,14 +126,15 @@ def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=500, display=True, xto
                 gradold = gradnew
                 gradnew = gradf(x, *optargs)
                 # If the gradient is zero then we are done.
-                if np.dot(gradnew,gradnew) == 0:
+                if (np.dot(gradnew, gradnew)) <= gtol:
+                    status = 'converged'
                     return x, flog, function_eval, status
 
         # Adjust beta according to comparison ratio.
         if Delta < 0.25:
-            beta = min(4.0*beta, betamax)
+            beta = min(4.0 * beta, betamax)
         if Delta > 0.75:
-            beta = max(0.5*beta, betamin)
+            beta = max(0.5 * beta, betamin)
 
         # Update search direction using Polak-Ribiere formula, or re-start
         # in direction of negative gradient after nparams steps.
@@ -136,8 +142,8 @@ def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=500, display=True, xto
             d = -gradnew
             nsuccess = 0
         elif success:
-            gamma = np.dot(gradold - gradnew,gradnew)/(mu)
-            d = gamma*d - gradnew
+            gamma = np.dot(gradold - gradnew, gradnew) / (mu)
+            d = gamma * d - gradnew
 
     # If we get here, then we haven't terminated in the given number of
    # iterations.
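
A minimal usage sketch of the reworked SCG termination (illustration only; the import path for SCG is an assumption, and rosen/rosen_der are the SciPy helpers already used in the test changes further down). With xtol, ftol and gtol left as None they default to 1e-6, 1e-6 and 1e-5, and the optimiser now also reports 'converged' once the squared gradient norm np.dot(gradnew, gradnew) drops to gtol or below:

# Sketch only: exercising the new gtol-based stopping criterion of SCG.
# The import path below is an assumption; adjust it to your checkout.
import numpy as np
from scipy.optimize.optimize import rosen, rosen_der
from GPy.inference.SCG import SCG

x0 = np.zeros(2)
# gtol is compared against the *squared* gradient norm, np.dot(grad, grad).
x_opt, flog, n_evals, status = SCG(rosen, rosen_der, x0, maxiters=500,
                                   display=False, xtol=1e-6, ftol=1e-6, gtol=1e-5)
print status, n_evals, flog[-1]
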
diff --git a/GPy/inference/conjugate_gradient_descent.py b/GPy/inference/conjugate_gradient_descent.py
index c9981fe8..2fcd9ba0 100644
--- a/GPy/inference/conjugate_gradient_descent.py
+++ b/GPy/inference/conjugate_gradient_descent.py
@@ -12,6 +12,7 @@ from scipy.optimize.linesearch import line_search_wolfe1, line_search_wolfe2
 from threading import Thread
 import numpy
 import sys
+import time
 
 RUNNING = "running"
 CONVERGED = "converged"
@@ -71,7 +72,10 @@ class _Async_Optimization(Thread):
 
     def callback_return(self, *a):
         self.callback(*a)
-        self.callback(self.SENTINEL)
+        if self.outq is not None:
+            self.outq.put(self.SENTINEL)
+        if self.messages:
+            print ""
         self.runsignal.clear()
 
     def run(self, *args, **kwargs):
@@ -105,7 +109,7 @@ class _CGDAsync(_Async_Optimization):
                 status = MAX_F_EVAL
 
             gi = -self.df(xi, *a, **kw)
-            if numpy.dot(gi.T, gi) < self.gtol:
+            if numpy.dot(gi.T, gi) <= self.gtol:
                 status = CONVERGED
                 break
             if numpy.isnan(numpy.dot(gi.T, gi)):
@@ -123,10 +127,8 @@ class _CGDAsync(_Async_Optimization):
                                    xi, si, gi,
                                    fi, fi_old)
 
-            if alphai is not None and fi2 < fi:
-                fi, fi_old = fi2, fi_old2
-            else:
-                alphai, _, _, fi, fi_old, gfi = \
+            if alphai is None:
+                alphai, _, _, fi2, fi_old2, gfi = \
                     line_search_wolfe2(self.f, self.df,
                                        xi, si, gi,
                                        fi, fi_old)
@@ -134,11 +136,14 @@ class _CGDAsync(_Async_Optimization):
                 # This line search also failed to find a better solution.
                 status = LINE_SEARCH
                 break
+            if fi2 < fi:
+                fi, fi_old = fi2, fi_old2
             if gfi is not None:
                 gi = gfi
             if numpy.isnan(fi) or fi_old < fi:
                 gi, ur, si = self.reset(xi, *a, **kw)
+
             else:
                 xi += numpy.dot(alphai, si)
 
             if self.messages:
@@ -164,6 +169,7 @@ class Async_Optimize(object):
         try:
             for ret in iter(lambda: q.get(timeout=1), self.SENTINEL):
                 self.callback(*ret)
+            self.runsignal.clear()
         except Empty:
             pass
 
@@ -193,13 +199,21 @@ class Async_Optimize(object):
         while self.runsignal.is_set():
             try:
                 p.join(1)
-                # c.join(1)
+                c.join(1)
             except KeyboardInterrupt:
                 # print "^C"
                 self.runsignal.clear()
                 p.join()
                 c.join()
         if c and c.is_alive():
+#             self.runsignal.set()
+#             while self.runsignal.is_set():
+#                 try:
+#                     c.join(.1)
+#                 except KeyboardInterrupt:
+#                     # print "^C"
+#                     self.runsignal.clear()
+#                     c.join()
             print "WARNING: callback still running, optimisation done!"
 
         return p.result
diff --git a/GPy/inference/gradient_descent_update_rules.py b/GPy/inference/gradient_descent_update_rules.py
index b3ccb2ce..1c14ed63 100644
--- a/GPy/inference/gradient_descent_update_rules.py
+++ b/GPy/inference/gradient_descent_update_rules.py
@@ -41,3 +41,13 @@ class FletcherReeves(GDUpdateRule):
         if tmp:
             return tmp / numpy.dot(self.gradold.T, self.gradnatold)
         return tmp
+
+class PolakRibiere(GDUpdateRule):
+    '''
+    Polak-Ribiere update rule for gamma
+    '''
+    def _gamma(self, *a, **kw):
+        tmp = numpy.dot((self.grad - self.gradold).T, self.gradnat)
+        if tmp:
+            return tmp / numpy.dot(self.gradold.T, self.gradnatold)
+        return tmp
diff --git a/GPy/inference/optimization.py b/GPy/inference/optimization.py
index 75cd94ba..e208392b 100644
--- a/GPy/inference/optimization.py
+++ b/GPy/inference/optimization.py
@@ -29,7 +29,8 @@ class Optimizer():
 
     :rtype: optimizer object.
""" - def __init__(self, x_init, messages=False, model = None, max_f_eval=1e4, max_iters = 1e3, ftol=None, gtol=None, xtol=None): + def __init__(self, x_init, messages=False, model=None, max_f_eval=1e4, max_iters=1e3, + ftol=None, gtol=None, xtol=None): self.opt_name = None self.x_init = x_init self.messages = messages @@ -205,7 +206,11 @@ class opt_SCG(Optimizer): def opt(self, f_fp = None, f = None, fp = None): assert not f is None assert not fp is None - opt_result = SCG(f,fp,self.x_init, display=self.messages, maxiters=self.max_iters, max_f_eval=self.max_f_eval, xtol=self.xtol, ftol=self.ftol) + opt_result = SCG(f, fp, self.x_init, display=self.messages, + maxiters=self.max_iters, + max_f_eval=self.max_f_eval, + xtol=self.xtol, ftol=self.ftol, + gtol=self.gtol) self.x_opt = opt_result[0] self.trace = opt_result[1] self.f_opt = self.trace[-1] diff --git a/GPy/testing/cgd_tests.py b/GPy/testing/cgd_tests.py index 79e5e08b..19c1c21b 100644 --- a/GPy/testing/cgd_tests.py +++ b/GPy/testing/cgd_tests.py @@ -9,6 +9,7 @@ from GPy.inference.conjugate_gradient_descent import CGD, RUNNING import pylab import time from scipy.optimize.optimize import rosen, rosen_der +from GPy.inference.gradient_descent_update_rules import PolakRibiere class Test(unittest.TestCase): @@ -71,10 +72,12 @@ if __name__ == "__main__": # df = lambda x: numpy.dot(A, x) - b f = rosen df = rosen_der - x0 = numpy.random.randn(N) * .5 + x0 = (numpy.random.randn(N) * .5) + numpy.ones(N) + print x0 opt = CGD() + pylab.ion() fig = pylab.figure("cgd optimize") if fig.axes: ax = fig.axes[0] @@ -83,13 +86,13 @@ if __name__ == "__main__": ax = fig.add_subplot(111, projection='3d') interpolation = 40 - x, y = numpy.linspace(-1, 1, interpolation)[:, None], numpy.linspace(-1, 1, interpolation)[:, None] + x, y = numpy.linspace(.5, 1.5, interpolation)[:, None], numpy.linspace(.5, 1.5, interpolation)[:, None] X, Y = numpy.meshgrid(x, y) fXY = numpy.array([f(numpy.array([x, y])) for x, y in zip(X.flatten(), Y.flatten())]).reshape(interpolation, interpolation) ax.plot_wireframe(X, Y, fXY) xopts = [x0.copy()] - optplts, = ax.plot3D([x0[0]], [x0[1]], zs=f(x0), marker='o', color='r') + optplts, = ax.plot3D([x0[0]], [x0[1]], zs=f(x0), marker='', color='r') raw_input("enter to start optimize") res = [0] @@ -102,11 +105,7 @@ if __name__ == "__main__": if r[-1] != RUNNING: res[0] = r - p, c = opt.opt_async(f, df, x0.copy(), callback, messages=True, maxiter=1000, - report_every=20, gtol=1e-12) - - - pylab.ion() - pylab.show() + res[0] = opt.opt(f, df, x0.copy(), callback, messages=True, maxiter=1000, + report_every=7, gtol=1e-12, update_rule=PolakRibiere) pass